diff --git "a/CompeteSMoE/competesmoe_versions/Full_down_router_competesmoev6/trainer_state.json" "b/CompeteSMoE/competesmoe_versions/Full_down_router_competesmoev6/trainer_state.json" new file mode 100644--- /dev/null +++ "b/CompeteSMoE/competesmoe_versions/Full_down_router_competesmoev6/trainer_state.json" @@ -0,0 +1,249523 @@ +{ + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 0.999969938373666, + "eval_steps": 500, + "global_step": 16632, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "auxiliary_loss_clip": 0.05028385, + "auxiliary_loss_mlp": 0.02214695, + "balance_loss_clip": 2.43604374, + "balance_loss_mlp": 1.76990139, + "epoch": 6.012325266796934e-05, + "flos": 24456507091200.0, + "grad_norm": 54.579032998233, + "language_loss": 2.8565619, + "learning_rate": 0.0, + "loss": 1.94862103, + "num_input_tokens_seen": 19155, + "step": 1, + "time_per_iteration": 18.035600662231445 + }, + { + "auxiliary_loss_clip": 0.03380201, + "auxiliary_loss_mlp": 0.01459532, + "balance_loss_clip": 1.62761772, + "balance_loss_mlp": 1.18926096, + "epoch": 0.00012024650533593868, + "flos": 20225931246720.0, + "grad_norm": 35.21412936449433, + "language_loss": 1.82461584, + "learning_rate": 4.4628432569317594e-07, + "loss": 1.87301314, + "num_input_tokens_seen": 36175, + "step": 2, + "time_per_iteration": 2.5662355422973633 + }, + { + "auxiliary_loss_clip": 0.03320054, + "auxiliary_loss_mlp": 0.01438878, + "balance_loss_clip": 1.62571526, + "balance_loss_mlp": 1.18672669, + "epoch": 0.000180369758003908, + "flos": 22309935454080.0, + "grad_norm": 33.07312380609338, + "language_loss": 1.57280946, + "learning_rate": 7.073439208833112e-07, + "loss": 1.62039876, + "num_input_tokens_seen": 54870, + "step": 3, + "time_per_iteration": 2.5861754417419434 + }, + { + "auxiliary_loss_clip": 0.03362624, + "auxiliary_loss_mlp": 0.01451741, + "balance_loss_clip": 1.62419748, + "balance_loss_mlp": 1.15533876, + "epoch": 0.00024049301067187735, + "flos": 22414650577920.0, + "grad_norm": 51.0278670667761, + "language_loss": 1.67538047, + "learning_rate": 8.925686513863519e-07, + "loss": 1.72352409, + "num_input_tokens_seen": 74575, + "step": 4, + "time_per_iteration": 2.5887081623077393 + }, + { + "auxiliary_loss_clip": 0.03401604, + "auxiliary_loss_mlp": 0.01505435, + "balance_loss_clip": 1.62511694, + "balance_loss_mlp": 1.21723461, + "epoch": 0.0003006162633398467, + "flos": 21396978449280.0, + "grad_norm": 56.494256380537756, + "language_loss": 1.91558707, + "learning_rate": 1.0362401141348472e-06, + "loss": 1.96465743, + "num_input_tokens_seen": 92580, + "step": 5, + "time_per_iteration": 2.8718974590301514 + }, + { + "auxiliary_loss_clip": 0.03371054, + "auxiliary_loss_mlp": 0.01515083, + "balance_loss_clip": 1.61601067, + "balance_loss_mlp": 1.22058868, + "epoch": 0.000360739516007816, + "flos": 21652375127040.0, + "grad_norm": 33.38726329039865, + "language_loss": 1.60911179, + "learning_rate": 1.153628246576487e-06, + "loss": 1.65797305, + "num_input_tokens_seen": 109705, + "step": 6, + "time_per_iteration": 2.8143558502197266 + }, + { + "auxiliary_loss_clip": 0.03354096, + "auxiliary_loss_mlp": 0.01489314, + "balance_loss_clip": 1.61568749, + "balance_loss_mlp": 1.20550084, + "epoch": 0.0004208627686757854, + "flos": 27159742897920.0, + "grad_norm": 24.715622605875115, + "language_loss": 1.53694797, + "learning_rate": 1.2528784983718962e-06, + "loss": 1.58538198, + "num_input_tokens_seen": 129425, + "step": 7, + "time_per_iteration": 2.9598090648651123 + }, + { + "auxiliary_loss_clip": 0.03321157, + "auxiliary_loss_mlp": 0.01443421, + "balance_loss_clip": 1.61228418, + "balance_loss_mlp": 1.16532981, + "epoch": 0.0004809860213437547, + "flos": 31319096135040.0, + "grad_norm": 32.064708496332884, + "language_loss": 1.43556428, + "learning_rate": 1.338852977079528e-06, + "loss": 1.48321009, + "num_input_tokens_seen": 149210, + "step": 8, + "time_per_iteration": 2.91843581199646 + }, + { + "auxiliary_loss_clip": 0.03369751, + "auxiliary_loss_mlp": 0.01495648, + "balance_loss_clip": 1.61183226, + "balance_loss_mlp": 1.21107185, + "epoch": 0.000541109274011724, + "flos": 32160411463680.0, + "grad_norm": 32.34547664059949, + "language_loss": 1.50276577, + "learning_rate": 1.4146878417666224e-06, + "loss": 1.55141973, + "num_input_tokens_seen": 169055, + "step": 9, + "time_per_iteration": 2.881974220275879 + }, + { + "auxiliary_loss_clip": 0.03309229, + "auxiliary_loss_mlp": 0.01473798, + "balance_loss_clip": 1.61573124, + "balance_loss_mlp": 1.20467091, + "epoch": 0.0006012325266796934, + "flos": 18916808163840.0, + "grad_norm": 23.490577888559578, + "language_loss": 1.44717276, + "learning_rate": 1.4825244398280232e-06, + "loss": 1.4950031, + "num_input_tokens_seen": 188045, + "step": 10, + "time_per_iteration": 2.9729621410369873 + }, + { + "auxiliary_loss_clip": 0.03366292, + "auxiliary_loss_mlp": 0.01492574, + "balance_loss_clip": 1.62111163, + "balance_loss_mlp": 1.2184881, + "epoch": 0.0006613557793476627, + "flos": 20774861867520.0, + "grad_norm": 18.3055383030311, + "language_loss": 1.45190001, + "learning_rate": 1.5438901072051983e-06, + "loss": 1.50048852, + "num_input_tokens_seen": 207035, + "step": 11, + "time_per_iteration": 2.91629695892334 + }, + { + "auxiliary_loss_clip": 0.03294057, + "auxiliary_loss_mlp": 0.01450647, + "balance_loss_clip": 1.60773635, + "balance_loss_mlp": 1.1742723, + "epoch": 0.000721479032015632, + "flos": 16581680997120.0, + "grad_norm": 16.757672593147248, + "language_loss": 1.45338738, + "learning_rate": 1.5999125722696629e-06, + "loss": 1.50083447, + "num_input_tokens_seen": 223225, + "step": 12, + "time_per_iteration": 2.792154550552368 + }, + { + "auxiliary_loss_clip": 0.03322826, + "auxiliary_loss_mlp": 0.01405545, + "balance_loss_clip": 1.6177063, + "balance_loss_mlp": 1.14709914, + "epoch": 0.0007816022846836014, + "flos": 23805471144960.0, + "grad_norm": 12.293210444119456, + "language_loss": 1.24085629, + "learning_rate": 1.6514482443788434e-06, + "loss": 1.28813994, + "num_input_tokens_seen": 242570, + "step": 13, + "time_per_iteration": 2.872943162918091 + }, + { + "auxiliary_loss_clip": 0.03288513, + "auxiliary_loss_mlp": 0.01471187, + "balance_loss_clip": 1.61289716, + "balance_loss_mlp": 1.20301366, + "epoch": 0.0008417255373515708, + "flos": 19172204841600.0, + "grad_norm": 12.054831013507977, + "language_loss": 1.20836496, + "learning_rate": 1.6991628240650723e-06, + "loss": 1.25596201, + "num_input_tokens_seen": 261215, + "step": 14, + "time_per_iteration": 2.8786022663116455 + }, + { + "auxiliary_loss_clip": 0.03275525, + "auxiliary_loss_mlp": 0.01431547, + "balance_loss_clip": 1.61807978, + "balance_loss_mlp": 1.16852391, + "epoch": 0.00090184879001954, + "flos": 26395564026240.0, + "grad_norm": 6.220541998698913, + "language_loss": 1.12955081, + "learning_rate": 1.7435840350181584e-06, + "loss": 1.17662168, + "num_input_tokens_seen": 280035, + "step": 15, + "time_per_iteration": 3.0011115074157715 + }, + { + "auxiliary_loss_clip": 0.03242502, + "auxiliary_loss_mlp": 0.01412917, + "balance_loss_clip": 1.60312343, + "balance_loss_mlp": 1.16248202, + "epoch": 0.0009619720426875094, + "flos": 24679500785280.0, + "grad_norm": 4.443250570681593, + "language_loss": 1.1119585, + "learning_rate": 1.7851373027727038e-06, + "loss": 1.15851259, + "num_input_tokens_seen": 300265, + "step": 16, + "time_per_iteration": 2.877013921737671 + }, + { + "auxiliary_loss_clip": 0.03228723, + "auxiliary_loss_mlp": 0.01419891, + "balance_loss_clip": 1.60942066, + "balance_loss_mlp": 1.17918348, + "epoch": 0.0010220952953554788, + "flos": 18624531196800.0, + "grad_norm": 5.5236890719937355, + "language_loss": 1.12934434, + "learning_rate": 1.8241705979033208e-06, + "loss": 1.17583048, + "num_input_tokens_seen": 317375, + "step": 17, + "time_per_iteration": 2.7876977920532227 + }, + { + "auxiliary_loss_clip": 0.03163271, + "auxiliary_loss_mlp": 0.01378582, + "balance_loss_clip": 1.60663605, + "balance_loss_mlp": 1.14702964, + "epoch": 0.001082218548023448, + "flos": 26142537646080.0, + "grad_norm": 5.160414613524022, + "language_loss": 1.08265221, + "learning_rate": 1.860972167459798e-06, + "loss": 1.12807083, + "num_input_tokens_seen": 337975, + "step": 18, + "time_per_iteration": 2.8423023223876953 + }, + { + "auxiliary_loss_clip": 0.03191147, + "auxiliary_loss_mlp": 0.01402093, + "balance_loss_clip": 1.60588551, + "balance_loss_mlp": 1.13582683, + "epoch": 0.0011423418006914173, + "flos": 19609776322560.0, + "grad_norm": 6.5480999886093585, + "language_loss": 1.0245893, + "learning_rate": 1.89578346593066e-06, + "loss": 1.07052171, + "num_input_tokens_seen": 356635, + "step": 19, + "time_per_iteration": 2.8313870429992676 + }, + { + "auxiliary_loss_clip": 0.03134577, + "auxiliary_loss_mlp": 0.0134412, + "balance_loss_clip": 1.607674, + "balance_loss_mlp": 1.12305856, + "epoch": 0.0012024650533593868, + "flos": 17895365107200.0, + "grad_norm": 4.094925488985278, + "language_loss": 1.16673374, + "learning_rate": 1.928808765521199e-06, + "loss": 1.21152067, + "num_input_tokens_seen": 375625, + "step": 20, + "time_per_iteration": 2.7212107181549072 + }, + { + "auxiliary_loss_clip": 0.03123479, + "auxiliary_loss_mlp": 0.01381299, + "balance_loss_clip": 1.58928347, + "balance_loss_mlp": 1.13162732, + "epoch": 0.001262588306027356, + "flos": 21252043071360.0, + "grad_norm": 6.517918008613153, + "language_loss": 1.06296873, + "learning_rate": 1.9602224192552076e-06, + "loss": 1.10801649, + "num_input_tokens_seen": 394350, + "step": 21, + "time_per_iteration": 2.8778836727142334 + }, + { + "auxiliary_loss_clip": 0.03019175, + "auxiliary_loss_mlp": 0.01383504, + "balance_loss_clip": 1.57023442, + "balance_loss_mlp": 1.14851832, + "epoch": 0.0013227115586953253, + "flos": 26104077158400.0, + "grad_norm": 3.4895649914613167, + "language_loss": 1.06249619, + "learning_rate": 1.9901744328983746e-06, + "loss": 1.10652304, + "num_input_tokens_seen": 413255, + "step": 22, + "time_per_iteration": 2.776231288909912 + }, + { + "auxiliary_loss_clip": 0.02972367, + "auxiliary_loss_mlp": 0.01336222, + "balance_loss_clip": 1.57249165, + "balance_loss_mlp": 1.12536454, + "epoch": 0.0013828348113632948, + "flos": 23951376190080.0, + "grad_norm": 3.69446799868023, + "language_loss": 0.91967744, + "learning_rate": 2.018794797290208e-06, + "loss": 0.96276331, + "num_input_tokens_seen": 433065, + "step": 23, + "time_per_iteration": 2.8462836742401123 + }, + { + "auxiliary_loss_clip": 0.02939683, + "auxiliary_loss_mlp": 0.01365785, + "balance_loss_clip": 1.56479359, + "balance_loss_mlp": 1.14472294, + "epoch": 0.001442958064031264, + "flos": 15959851724160.0, + "grad_norm": 2.8163431863153843, + "language_loss": 1.08278155, + "learning_rate": 2.046196897962839e-06, + "loss": 1.12583625, + "num_input_tokens_seen": 451175, + "step": 24, + "time_per_iteration": 3.0190494060516357 + }, + { + "auxiliary_loss_clip": 0.02829784, + "auxiliary_loss_mlp": 0.0133076, + "balance_loss_clip": 1.55754042, + "balance_loss_mlp": 1.11923528, + "epoch": 0.0015030813166992333, + "flos": 18108350801280.0, + "grad_norm": 3.667091322280896, + "language_loss": 1.0150789, + "learning_rate": 2.0724802282696944e-06, + "loss": 1.05668437, + "num_input_tokens_seen": 468775, + "step": 25, + "time_per_iteration": 3.003577709197998 + }, + { + "auxiliary_loss_clip": 0.02824128, + "auxiliary_loss_mlp": 0.01311373, + "balance_loss_clip": 1.56074798, + "balance_loss_mlp": 1.1017555, + "epoch": 0.0015632045693672028, + "flos": 22234558763520.0, + "grad_norm": 3.369458862319891, + "language_loss": 1.06796145, + "learning_rate": 2.0977325700720194e-06, + "loss": 1.10931659, + "num_input_tokens_seen": 488530, + "step": 26, + "time_per_iteration": 3.134169340133667 + }, + { + "auxiliary_loss_clip": 0.0276693, + "auxiliary_loss_mlp": 0.01326068, + "balance_loss_clip": 1.55108893, + "balance_loss_mlp": 1.1251291, + "epoch": 0.001623327822035172, + "flos": 23991955580160.0, + "grad_norm": 3.1879973773912087, + "language_loss": 0.95542645, + "learning_rate": 2.122031762649933e-06, + "loss": 0.99635649, + "num_input_tokens_seen": 510495, + "step": 27, + "time_per_iteration": 2.9047770500183105 + }, + { + "auxiliary_loss_clip": 0.02742654, + "auxiliary_loss_mlp": 0.01314455, + "balance_loss_clip": 1.55494928, + "balance_loss_mlp": 1.13258958, + "epoch": 0.0016834510747031415, + "flos": 19677647070720.0, + "grad_norm": 2.501353560296928, + "language_loss": 1.06375134, + "learning_rate": 2.1454471497582483e-06, + "loss": 1.10432243, + "num_input_tokens_seen": 528605, + "step": 28, + "time_per_iteration": 2.879300117492676 + }, + { + "auxiliary_loss_clip": 0.02709382, + "auxiliary_loss_mlp": 0.01318747, + "balance_loss_clip": 1.53994918, + "balance_loss_mlp": 1.13163662, + "epoch": 0.0017435743273711108, + "flos": 20923819568640.0, + "grad_norm": 2.4127617949198013, + "language_loss": 1.02602446, + "learning_rate": 2.1680407726407727e-06, + "loss": 1.06630576, + "num_input_tokens_seen": 548515, + "step": 29, + "time_per_iteration": 2.767143726348877 + }, + { + "auxiliary_loss_clip": 0.02701146, + "auxiliary_loss_mlp": 0.01313319, + "balance_loss_clip": 1.53629136, + "balance_loss_mlp": 1.12658989, + "epoch": 0.00180369758003908, + "flos": 19528976678400.0, + "grad_norm": 2.98959821960668, + "language_loss": 1.19348812, + "learning_rate": 2.189868360711334e-06, + "loss": 1.2336328, + "num_input_tokens_seen": 564025, + "step": 30, + "time_per_iteration": 4.448051452636719 + }, + { + "auxiliary_loss_clip": 0.02619858, + "auxiliary_loss_mlp": 0.01339864, + "balance_loss_clip": 1.52242923, + "balance_loss_mlp": 1.15809405, + "epoch": 0.0018638208327070496, + "flos": 27453169100160.0, + "grad_norm": 4.124880001439743, + "language_loss": 1.02572525, + "learning_rate": 2.2109801597326265e-06, + "loss": 1.0653224, + "num_input_tokens_seen": 583345, + "step": 31, + "time_per_iteration": 4.4202470779418945 + }, + { + "auxiliary_loss_clip": 0.0259117, + "auxiliary_loss_mlp": 0.01331067, + "balance_loss_clip": 1.52402616, + "balance_loss_mlp": 1.15101361, + "epoch": 0.0019239440853750188, + "flos": 13589460380160.0, + "grad_norm": 2.6207761110099206, + "language_loss": 0.9551003, + "learning_rate": 2.2314216284658796e-06, + "loss": 0.99432272, + "num_input_tokens_seen": 600010, + "step": 32, + "time_per_iteration": 2.865131378173828 + }, + { + "auxiliary_loss_clip": 0.02572601, + "auxiliary_loss_mlp": 0.01304391, + "balance_loss_clip": 1.5191946, + "balance_loss_mlp": 1.13749862, + "epoch": 0.001984067338042988, + "flos": 11253866336640.0, + "grad_norm": 3.3657011745004186, + "language_loss": 0.95194066, + "learning_rate": 2.2512340280885094e-06, + "loss": 0.99071062, + "num_input_tokens_seen": 616295, + "step": 33, + "time_per_iteration": 2.813169479370117 + }, + { + "auxiliary_loss_clip": 0.02424758, + "auxiliary_loss_mlp": 0.01300168, + "balance_loss_clip": 1.48389781, + "balance_loss_mlp": 1.14395654, + "epoch": 0.0020441905907109576, + "flos": 22386245898240.0, + "grad_norm": 3.320242202666024, + "language_loss": 0.91629934, + "learning_rate": 2.270454923596497e-06, + "loss": 0.95354867, + "num_input_tokens_seen": 637640, + "step": 34, + "time_per_iteration": 2.9272937774658203 + }, + { + "auxiliary_loss_clip": 0.02378378, + "auxiliary_loss_mlp": 0.01271545, + "balance_loss_clip": 1.44982636, + "balance_loss_mlp": 1.11743128, + "epoch": 0.0021043138433789266, + "flos": 49778580337920.0, + "grad_norm": 2.4990824036335813, + "language_loss": 0.76609683, + "learning_rate": 2.2891186125067434e-06, + "loss": 0.80259603, + "num_input_tokens_seen": 659710, + "step": 35, + "time_per_iteration": 3.014483690261841 + }, + { + "auxiliary_loss_clip": 0.02350422, + "auxiliary_loss_mlp": 0.01275567, + "balance_loss_clip": 1.46367788, + "balance_loss_mlp": 1.13242102, + "epoch": 0.002164437096046896, + "flos": 20557961591040.0, + "grad_norm": 2.320290551115586, + "language_loss": 0.8886227, + "learning_rate": 2.307256493152974e-06, + "loss": 0.92488259, + "num_input_tokens_seen": 679670, + "step": 36, + "time_per_iteration": 2.797126054763794 + }, + { + "auxiliary_loss_clip": 0.02292911, + "auxiliary_loss_mlp": 0.01336446, + "balance_loss_clip": 1.45009267, + "balance_loss_mlp": 1.19062936, + "epoch": 0.0022245603487148656, + "flos": 26542295084160.0, + "grad_norm": 2.58314501217778, + "language_loss": 0.93055344, + "learning_rate": 2.3248973825097614e-06, + "loss": 0.96684706, + "num_input_tokens_seen": 700170, + "step": 37, + "time_per_iteration": 2.7755305767059326 + }, + { + "auxiliary_loss_clip": 0.02252867, + "auxiliary_loss_mlp": 0.01277558, + "balance_loss_clip": 1.44473004, + "balance_loss_mlp": 1.15596497, + "epoch": 0.0022846836013828346, + "flos": 20338188226560.0, + "grad_norm": 2.1014821420998997, + "language_loss": 1.0399754, + "learning_rate": 2.3420677916238357e-06, + "loss": 1.07527971, + "num_input_tokens_seen": 718545, + "step": 38, + "time_per_iteration": 2.885153293609619 + }, + { + "auxiliary_loss_clip": 0.02217424, + "auxiliary_loss_mlp": 0.01256727, + "balance_loss_clip": 1.43618071, + "balance_loss_mlp": 1.13456178, + "epoch": 0.002344806854050804, + "flos": 26247575992320.0, + "grad_norm": 2.3664237661391, + "language_loss": 0.85355413, + "learning_rate": 2.358792165262154e-06, + "loss": 0.88829571, + "num_input_tokens_seen": 739865, + "step": 39, + "time_per_iteration": 2.8219757080078125 + }, + { + "auxiliary_loss_clip": 0.02195557, + "auxiliary_loss_mlp": 0.01247565, + "balance_loss_clip": 1.42803979, + "balance_loss_mlp": 1.12063134, + "epoch": 0.0024049301067187736, + "flos": 11801539981440.0, + "grad_norm": 3.1419566740780622, + "language_loss": 0.90141582, + "learning_rate": 2.3750930912143747e-06, + "loss": 0.9358471, + "num_input_tokens_seen": 755770, + "step": 40, + "time_per_iteration": 2.7997214794158936 + }, + { + "auxiliary_loss_clip": 0.02146639, + "auxiliary_loss_mlp": 0.0127052, + "balance_loss_clip": 1.41779709, + "balance_loss_mlp": 1.15865445, + "epoch": 0.0024650533593867426, + "flos": 20631506688000.0, + "grad_norm": 2.874930847474739, + "language_loss": 0.93253231, + "learning_rate": 2.3909914837471044e-06, + "loss": 0.96670389, + "num_input_tokens_seen": 773440, + "step": 41, + "time_per_iteration": 2.8831541538238525 + }, + { + "auxiliary_loss_clip": 0.02109751, + "auxiliary_loss_mlp": 0.01250979, + "balance_loss_clip": 1.40949345, + "balance_loss_mlp": 1.14879322, + "epoch": 0.002525176612054712, + "flos": 18406122549120.0, + "grad_norm": 4.230989876539809, + "language_loss": 0.97364104, + "learning_rate": 2.4065067449483835e-06, + "loss": 1.00724828, + "num_input_tokens_seen": 790455, + "step": 42, + "time_per_iteration": 2.7395670413970947 + }, + { + "auxiliary_loss_clip": 0.02071758, + "auxiliary_loss_mlp": 0.01297584, + "balance_loss_clip": 1.41077375, + "balance_loss_mlp": 1.19148815, + "epoch": 0.0025852998647226816, + "flos": 28184023128960.0, + "grad_norm": 2.575943479542531, + "language_loss": 0.9741261, + "learning_rate": 2.4216569070848724e-06, + "loss": 1.00781941, + "num_input_tokens_seen": 810645, + "step": 43, + "time_per_iteration": 2.8662261962890625 + }, + { + "auxiliary_loss_clip": 0.02089905, + "auxiliary_loss_mlp": 0.01308673, + "balance_loss_clip": 1.40828407, + "balance_loss_mlp": 1.19756985, + "epoch": 0.0026454231173906506, + "flos": 14283110897280.0, + "grad_norm": 2.0720286644542307, + "language_loss": 0.93441468, + "learning_rate": 2.4364587585915504e-06, + "loss": 0.96840042, + "num_input_tokens_seen": 827470, + "step": 44, + "time_per_iteration": 2.729267120361328 + }, + { + "auxiliary_loss_clip": 0.02046648, + "auxiliary_loss_mlp": 0.01267982, + "balance_loss_clip": 1.4026444, + "balance_loss_mlp": 1.17056417, + "epoch": 0.00270554637005862, + "flos": 22419211605120.0, + "grad_norm": 2.656888875906477, + "language_loss": 0.98507571, + "learning_rate": 2.450927955901469e-06, + "loss": 1.01822209, + "num_input_tokens_seen": 847285, + "step": 45, + "time_per_iteration": 2.7877213954925537 + }, + { + "auxiliary_loss_clip": 0.02023739, + "auxiliary_loss_mlp": 0.01224452, + "balance_loss_clip": 1.39009094, + "balance_loss_mlp": 1.13757277, + "epoch": 0.0027656696227265896, + "flos": 23985778440960.0, + "grad_norm": 1.78507114724886, + "language_loss": 1.02581716, + "learning_rate": 2.465079122983384e-06, + "loss": 1.05829906, + "num_input_tokens_seen": 867545, + "step": 46, + "time_per_iteration": 2.77933931350708 + }, + { + "auxiliary_loss_clip": 0.01993203, + "auxiliary_loss_mlp": 0.01265624, + "balance_loss_clip": 1.3825016, + "balance_loss_mlp": 1.17569292, + "epoch": 0.0028257928753945586, + "flos": 37669503087360.0, + "grad_norm": 2.0262440413306293, + "language_loss": 0.8793546, + "learning_rate": 2.4789259401737868e-06, + "loss": 0.9119429, + "num_input_tokens_seen": 889915, + "step": 47, + "time_per_iteration": 2.888049364089966 + }, + { + "auxiliary_loss_clip": 0.01955489, + "auxiliary_loss_mlp": 0.01248931, + "balance_loss_clip": 1.372365, + "balance_loss_mlp": 1.16300535, + "epoch": 0.002885916128062528, + "flos": 22454547609600.0, + "grad_norm": 2.2398179365500535, + "language_loss": 0.87828577, + "learning_rate": 2.492481223656015e-06, + "loss": 0.91033, + "num_input_tokens_seen": 908975, + "step": 48, + "time_per_iteration": 2.7992591857910156 + }, + { + "auxiliary_loss_clip": 0.019543, + "auxiliary_loss_mlp": 0.01234609, + "balance_loss_clip": 1.36162627, + "balance_loss_mlp": 1.14415312, + "epoch": 0.0029460393807304976, + "flos": 27012796358400.0, + "grad_norm": 2.924602528821633, + "language_loss": 0.89672893, + "learning_rate": 2.5057569967437924e-06, + "loss": 0.92861807, + "num_input_tokens_seen": 929810, + "step": 49, + "time_per_iteration": 2.8268253803253174 + }, + { + "auxiliary_loss_clip": 0.01944916, + "auxiliary_loss_mlp": 0.01229119, + "balance_loss_clip": 1.35492158, + "balance_loss_mlp": 1.14486229, + "epoch": 0.0030061626333984666, + "flos": 15851832549120.0, + "grad_norm": 3.2362658734718805, + "language_loss": 0.90814078, + "learning_rate": 2.51876455396287e-06, + "loss": 0.93988109, + "num_input_tokens_seen": 948650, + "step": 50, + "time_per_iteration": 2.854123115539551 + }, + { + "auxiliary_loss_clip": 0.01944041, + "auxiliary_loss_mlp": 0.01191311, + "balance_loss_clip": 1.36035919, + "balance_loss_mlp": 1.11058283, + "epoch": 0.003066285886066436, + "flos": 31827052316160.0, + "grad_norm": 4.38961791751761, + "language_loss": 0.86762989, + "learning_rate": 2.5315145187866316e-06, + "loss": 0.89898336, + "num_input_tokens_seen": 966455, + "step": 51, + "time_per_iteration": 2.802963972091675 + }, + { + "auxiliary_loss_clip": 0.01900418, + "auxiliary_loss_mlp": 0.01198986, + "balance_loss_clip": 1.34975755, + "balance_loss_mlp": 1.12035561, + "epoch": 0.0031264091387344056, + "flos": 41427482774400.0, + "grad_norm": 2.103026245487531, + "language_loss": 0.95127207, + "learning_rate": 2.5440168957651953e-06, + "loss": 0.98226607, + "num_input_tokens_seen": 988110, + "step": 52, + "time_per_iteration": 2.9115259647369385 + }, + { + "auxiliary_loss_clip": 0.0189669, + "auxiliary_loss_mlp": 0.01230008, + "balance_loss_clip": 1.34679425, + "balance_loss_mlp": 1.1513294, + "epoch": 0.0031865323914023747, + "flos": 23440941970560.0, + "grad_norm": 1.9698501726981694, + "language_loss": 0.92147863, + "learning_rate": 2.5562811176888872e-06, + "loss": 0.9527455, + "num_input_tokens_seen": 1008550, + "step": 53, + "time_per_iteration": 2.7163705825805664 + }, + { + "auxiliary_loss_clip": 0.0188504, + "auxiliary_loss_mlp": 0.0118832, + "balance_loss_clip": 1.34806657, + "balance_loss_mlp": 1.10811591, + "epoch": 0.003246655644070344, + "flos": 14429195510400.0, + "grad_norm": 2.4614573929306998, + "language_loss": 0.827874, + "learning_rate": 2.5683160883431093e-06, + "loss": 0.85860765, + "num_input_tokens_seen": 1026840, + "step": 54, + "time_per_iteration": 2.709979295730591 + }, + { + "auxiliary_loss_clip": 0.01881672, + "auxiliary_loss_mlp": 0.01199788, + "balance_loss_clip": 1.33730745, + "balance_loss_mlp": 1.12115788, + "epoch": 0.0033067788967383136, + "flos": 35918247496320.0, + "grad_norm": 2.4087240697602637, + "language_loss": 0.81154013, + "learning_rate": 2.580130221340046e-06, + "loss": 0.84235477, + "num_input_tokens_seen": 1048875, + "step": 55, + "time_per_iteration": 2.826672077178955 + }, + { + "auxiliary_loss_clip": 0.01866829, + "auxiliary_loss_mlp": 0.01192998, + "balance_loss_clip": 1.33000195, + "balance_loss_mlp": 1.1139859, + "epoch": 0.003366902149406283, + "flos": 22958732862720.0, + "grad_norm": 3.526326345746903, + "language_loss": 0.86888397, + "learning_rate": 2.5917314754514246e-06, + "loss": 0.89948225, + "num_input_tokens_seen": 1066435, + "step": 56, + "time_per_iteration": 2.7303783893585205 + }, + { + "auxiliary_loss_clip": 0.01865308, + "auxiliary_loss_mlp": 0.01157825, + "balance_loss_clip": 1.32176983, + "balance_loss_mlp": 1.08424926, + "epoch": 0.003427025402074252, + "flos": 26582838560640.0, + "grad_norm": 1.8350274002212101, + "language_loss": 0.92618436, + "learning_rate": 2.6031273868139713e-06, + "loss": 0.95641565, + "num_input_tokens_seen": 1090330, + "step": 57, + "time_per_iteration": 2.7964468002319336 + }, + { + "auxiliary_loss_clip": 0.01835324, + "auxiliary_loss_mlp": 0.01203225, + "balance_loss_clip": 1.32898617, + "balance_loss_mlp": 1.1306982, + "epoch": 0.0034871486547422216, + "flos": 23951196622080.0, + "grad_norm": 2.1798220167833287, + "language_loss": 0.99393106, + "learning_rate": 2.614325098333948e-06, + "loss": 1.02431655, + "num_input_tokens_seen": 1109840, + "step": 58, + "time_per_iteration": 2.8645172119140625 + }, + { + "auxiliary_loss_clip": 0.01817505, + "auxiliary_loss_mlp": 0.01187415, + "balance_loss_clip": 1.31544495, + "balance_loss_mlp": 1.11612749, + "epoch": 0.003547271907410191, + "flos": 21214983214080.0, + "grad_norm": 4.672292019252427, + "language_loss": 0.88116634, + "learning_rate": 2.625331386578098e-06, + "loss": 0.91121554, + "num_input_tokens_seen": 1128415, + "step": 59, + "time_per_iteration": 2.8390605449676514 + }, + { + "auxiliary_loss_clip": 0.01838762, + "auxiliary_loss_mlp": 0.01157678, + "balance_loss_clip": 1.32348323, + "balance_loss_mlp": 1.08453119, + "epoch": 0.00360739516007816, + "flos": 16504903676160.0, + "grad_norm": 1.9939379984837533, + "language_loss": 0.93340272, + "learning_rate": 2.63615268640451e-06, + "loss": 0.9633671, + "num_input_tokens_seen": 1146515, + "step": 60, + "time_per_iteration": 2.855483055114746 + }, + { + "auxiliary_loss_clip": 0.01815625, + "auxiliary_loss_mlp": 0.0116698, + "balance_loss_clip": 1.30655861, + "balance_loss_mlp": 1.09869695, + "epoch": 0.0036675184127461296, + "flos": 19464805031040.0, + "grad_norm": 3.0883069415476525, + "language_loss": 0.89831531, + "learning_rate": 2.6467951135575943e-06, + "loss": 0.9281413, + "num_input_tokens_seen": 1166330, + "step": 61, + "time_per_iteration": 2.7396180629730225 + }, + { + "auxiliary_loss_clip": 0.0179987, + "auxiliary_loss_mlp": 0.01142216, + "balance_loss_clip": 1.30384493, + "balance_loss_mlp": 1.07503009, + "epoch": 0.003727641665414099, + "flos": 20957323979520.0, + "grad_norm": 2.6906828171922554, + "language_loss": 0.88430774, + "learning_rate": 2.657264485425803e-06, + "loss": 0.91372865, + "num_input_tokens_seen": 1186010, + "step": 62, + "time_per_iteration": 2.7519121170043945 + }, + { + "auxiliary_loss_clip": 0.01781165, + "auxiliary_loss_mlp": 0.01158283, + "balance_loss_clip": 1.29463029, + "balance_loss_mlp": 1.08794999, + "epoch": 0.003787764918082068, + "flos": 18406050721920.0, + "grad_norm": 2.446021761493366, + "language_loss": 0.96075886, + "learning_rate": 2.6675663401385186e-06, + "loss": 0.99015337, + "num_input_tokens_seen": 1204985, + "step": 63, + "time_per_iteration": 2.862579107284546 + }, + { + "auxiliary_loss_clip": 0.0179106, + "auxiliary_loss_mlp": 0.01168183, + "balance_loss_clip": 1.30312669, + "balance_loss_mlp": 1.10085392, + "epoch": 0.0038478881707500376, + "flos": 12459243962880.0, + "grad_norm": 3.6468804111593642, + "language_loss": 0.98715389, + "learning_rate": 2.677705954159056e-06, + "loss": 1.0167464, + "num_input_tokens_seen": 1223545, + "step": 64, + "time_per_iteration": 2.867640256881714 + }, + { + "auxiliary_loss_clip": 0.01799522, + "auxiliary_loss_mlp": 0.0114778, + "balance_loss_clip": 1.30354464, + "balance_loss_mlp": 1.07983065, + "epoch": 0.003908011423418007, + "flos": 13553334276480.0, + "grad_norm": 3.4300256347515736, + "language_loss": 0.8528145, + "learning_rate": 2.6876883585136904e-06, + "loss": 0.8822875, + "num_input_tokens_seen": 1241175, + "step": 65, + "time_per_iteration": 2.8761467933654785 + }, + { + "auxiliary_loss_clip": 0.01777672, + "auxiliary_loss_mlp": 0.01155388, + "balance_loss_clip": 1.29055953, + "balance_loss_mlp": 1.08748686, + "epoch": 0.003968134676085976, + "flos": 18333475292160.0, + "grad_norm": 5.75463808610966, + "language_loss": 0.85388923, + "learning_rate": 2.697518353781685e-06, + "loss": 0.88321984, + "num_input_tokens_seen": 1259315, + "step": 66, + "time_per_iteration": 2.834174633026123 + }, + { + "auxiliary_loss_clip": 0.0178151, + "auxiliary_loss_mlp": 0.01153154, + "balance_loss_clip": 1.28879797, + "balance_loss_mlp": 1.07767057, + "epoch": 0.004028257928753946, + "flos": 20485242506880.0, + "grad_norm": 5.265965581404753, + "language_loss": 0.96429205, + "learning_rate": 2.7072005239581103e-06, + "loss": 0.99363869, + "num_input_tokens_seen": 1277055, + "step": 67, + "time_per_iteration": 2.8506240844726562 + }, + { + "auxiliary_loss_clip": 0.01755096, + "auxiliary_loss_mlp": 0.01154259, + "balance_loss_clip": 1.28370035, + "balance_loss_mlp": 1.08149326, + "epoch": 0.004088381181421915, + "flos": 18843837684480.0, + "grad_norm": 2.245782165627833, + "language_loss": 0.94481671, + "learning_rate": 2.7167392492896727e-06, + "loss": 0.97391021, + "num_input_tokens_seen": 1294355, + "step": 68, + "time_per_iteration": 2.809539318084717 + }, + { + "auxiliary_loss_clip": 0.01750025, + "auxiliary_loss_mlp": 0.01155985, + "balance_loss_clip": 1.28142524, + "balance_loss_mlp": 1.08593798, + "epoch": 0.004148504434089885, + "flos": 19427817000960.0, + "grad_norm": 2.2004242152490905, + "language_loss": 0.95795888, + "learning_rate": 2.7261387181735195e-06, + "loss": 0.98701906, + "num_input_tokens_seen": 1313525, + "step": 69, + "time_per_iteration": 2.831289052963257 + }, + { + "auxiliary_loss_clip": 0.01744477, + "auxiliary_loss_mlp": 0.01160181, + "balance_loss_clip": 1.28414929, + "balance_loss_mlp": 1.09475923, + "epoch": 0.004208627686757853, + "flos": 20811023884800.0, + "grad_norm": 3.529723674892965, + "language_loss": 0.98140854, + "learning_rate": 2.7354029381999196e-06, + "loss": 1.01045513, + "num_input_tokens_seen": 1330505, + "step": 70, + "time_per_iteration": 2.807192802429199 + }, + { + "auxiliary_loss_clip": 0.01751045, + "auxiliary_loss_mlp": 0.01146105, + "balance_loss_clip": 1.27434683, + "balance_loss_mlp": 1.07691598, + "epoch": 0.004268750939425823, + "flos": 19098623831040.0, + "grad_norm": 3.245116624781828, + "language_loss": 0.9394775, + "learning_rate": 2.7445357464116983e-06, + "loss": 0.968449, + "num_input_tokens_seen": 1349615, + "step": 71, + "time_per_iteration": 2.935589551925659 + }, + { + "auxiliary_loss_clip": 0.01831483, + "auxiliary_loss_mlp": 0.0131551, + "balance_loss_clip": 1.43444347, + "balance_loss_mlp": 1.2762183, + "epoch": 0.004328874192093792, + "flos": 52439635514880.0, + "grad_norm": 2.4346589848100075, + "language_loss": 0.65763354, + "learning_rate": 2.75354081884615e-06, + "loss": 0.68910348, + "num_input_tokens_seen": 1410275, + "step": 72, + "time_per_iteration": 3.4172465801239014 + }, + { + "auxiliary_loss_clip": 0.01814489, + "auxiliary_loss_mlp": 0.01281499, + "balance_loss_clip": 1.42715561, + "balance_loss_mlp": 1.24239826, + "epoch": 0.004388997444761762, + "flos": 66473239564800.0, + "grad_norm": 2.269512625553817, + "language_loss": 0.63767815, + "learning_rate": 2.7624216794188286e-06, + "loss": 0.66863805, + "num_input_tokens_seen": 1473020, + "step": 73, + "time_per_iteration": 3.307532548904419 + }, + { + "auxiliary_loss_clip": 0.01727923, + "auxiliary_loss_mlp": 0.01141928, + "balance_loss_clip": 1.26715207, + "balance_loss_mlp": 1.07388353, + "epoch": 0.004449120697429731, + "flos": 18952970181120.0, + "grad_norm": 2.185813929534106, + "language_loss": 0.85865861, + "learning_rate": 2.771181708202938e-06, + "loss": 0.88735718, + "num_input_tokens_seen": 1490385, + "step": 74, + "time_per_iteration": 2.7185122966766357 + }, + { + "auxiliary_loss_clip": 0.01732357, + "auxiliary_loss_mlp": 0.01161629, + "balance_loss_clip": 1.26728857, + "balance_loss_mlp": 1.09172451, + "epoch": 0.004509243950097701, + "flos": 21105491581440.0, + "grad_norm": 2.387045088925537, + "language_loss": 0.97117877, + "learning_rate": 2.779824149153005e-06, + "loss": 1.00011861, + "num_input_tokens_seen": 1509725, + "step": 75, + "time_per_iteration": 2.694956064224243 + }, + { + "auxiliary_loss_clip": 0.01710554, + "auxiliary_loss_mlp": 0.01145272, + "balance_loss_clip": 1.26305294, + "balance_loss_mlp": 1.0783242, + "epoch": 0.004569367202765669, + "flos": 20698730991360.0, + "grad_norm": 2.0886523345345416, + "language_loss": 0.87707686, + "learning_rate": 2.788352117317012e-06, + "loss": 0.90563518, + "num_input_tokens_seen": 1527245, + "step": 76, + "time_per_iteration": 2.7865233421325684 + }, + { + "auxiliary_loss_clip": 0.01712433, + "auxiliary_loss_mlp": 0.01148356, + "balance_loss_clip": 1.261976, + "balance_loss_mlp": 1.07802224, + "epoch": 0.004629490455433639, + "flos": 28658474899200.0, + "grad_norm": 2.017281454462382, + "language_loss": 0.91796356, + "learning_rate": 2.796768605577095e-06, + "loss": 0.94657141, + "num_input_tokens_seen": 1548930, + "step": 77, + "time_per_iteration": 5.94242787361145 + }, + { + "auxiliary_loss_clip": 0.01703409, + "auxiliary_loss_mlp": 0.01168176, + "balance_loss_clip": 1.26292789, + "balance_loss_mlp": 1.09674621, + "epoch": 0.004689613708101608, + "flos": 11072409805440.0, + "grad_norm": 2.113983086620846, + "language_loss": 0.9243902, + "learning_rate": 2.80507649095533e-06, + "loss": 0.95310611, + "num_input_tokens_seen": 1565695, + "step": 78, + "time_per_iteration": 2.6969692707061768 + }, + { + "auxiliary_loss_clip": 0.01697964, + "auxiliary_loss_mlp": 0.01154486, + "balance_loss_clip": 1.25696027, + "balance_loss_mlp": 1.08544016, + "epoch": 0.004749736960769578, + "flos": 21799106184960.0, + "grad_norm": 2.595929528370548, + "language_loss": 0.82421374, + "learning_rate": 2.813278540517843e-06, + "loss": 0.85273826, + "num_input_tokens_seen": 1582625, + "step": 79, + "time_per_iteration": 4.327785968780518 + }, + { + "auxiliary_loss_clip": 0.01712627, + "auxiliary_loss_mlp": 0.01134659, + "balance_loss_clip": 1.26035261, + "balance_loss_mlp": 1.06389606, + "epoch": 0.004809860213437547, + "flos": 19792597570560.0, + "grad_norm": 1.892073448669268, + "language_loss": 0.91287601, + "learning_rate": 2.8213774169075505e-06, + "loss": 0.94134885, + "num_input_tokens_seen": 1601725, + "step": 80, + "time_per_iteration": 2.826565742492676 + }, + { + "auxiliary_loss_clip": 0.01681276, + "auxiliary_loss_mlp": 0.01147338, + "balance_loss_clip": 1.25303006, + "balance_loss_mlp": 1.07619381, + "epoch": 0.004869983466105517, + "flos": 26574327037440.0, + "grad_norm": 2.133280597832094, + "language_loss": 0.94978434, + "learning_rate": 2.829375683533245e-06, + "loss": 0.9780705, + "num_input_tokens_seen": 1622420, + "step": 81, + "time_per_iteration": 2.9407167434692383 + }, + { + "auxiliary_loss_clip": 0.01695569, + "auxiliary_loss_mlp": 0.01148997, + "balance_loss_clip": 1.25687361, + "balance_loss_mlp": 1.08176279, + "epoch": 0.004930106718773485, + "flos": 12823378087680.0, + "grad_norm": 3.3517585718845675, + "language_loss": 0.95972067, + "learning_rate": 2.8372758094402803e-06, + "loss": 0.98816633, + "num_input_tokens_seen": 1640715, + "step": 82, + "time_per_iteration": 2.82920241355896 + }, + { + "auxiliary_loss_clip": 0.01678807, + "auxiliary_loss_mlp": 0.01158965, + "balance_loss_clip": 1.2453779, + "balance_loss_mlp": 1.08825028, + "epoch": 0.004990229971441455, + "flos": 25774919902080.0, + "grad_norm": 1.9240928445058207, + "language_loss": 0.86549926, + "learning_rate": 2.84508017388607e-06, + "loss": 0.89387703, + "num_input_tokens_seen": 1662210, + "step": 83, + "time_per_iteration": 2.869555950164795 + }, + { + "auxiliary_loss_clip": 0.01670546, + "auxiliary_loss_mlp": 0.0115761, + "balance_loss_clip": 1.2459296, + "balance_loss_mlp": 1.08670449, + "epoch": 0.005050353224109424, + "flos": 17457254922240.0, + "grad_norm": 2.8475640832256612, + "language_loss": 0.91685319, + "learning_rate": 2.852791070641559e-06, + "loss": 0.94513476, + "num_input_tokens_seen": 1681070, + "step": 84, + "time_per_iteration": 2.8550877571105957 + }, + { + "auxiliary_loss_clip": 0.01653687, + "auxiliary_loss_mlp": 0.01173016, + "balance_loss_clip": 1.34870887, + "balance_loss_mlp": 1.13505936, + "epoch": 0.005110476476777394, + "flos": 69805460367360.0, + "grad_norm": 1.393202874877644, + "language_loss": 0.62589455, + "learning_rate": 2.8604107120381682e-06, + "loss": 0.65416157, + "num_input_tokens_seen": 1747140, + "step": 85, + "time_per_iteration": 3.2999770641326904 + }, + { + "auxiliary_loss_clip": 0.01658053, + "auxiliary_loss_mlp": 0.01126807, + "balance_loss_clip": 1.23556495, + "balance_loss_mlp": 1.05528188, + "epoch": 0.005170599729445363, + "flos": 24790105739520.0, + "grad_norm": 1.6699261697646595, + "language_loss": 0.90804088, + "learning_rate": 2.8679412327780482e-06, + "loss": 0.93588948, + "num_input_tokens_seen": 1767475, + "step": 86, + "time_per_iteration": 2.795461654663086 + }, + { + "auxiliary_loss_clip": 0.01661974, + "auxiliary_loss_mlp": 0.01160392, + "balance_loss_clip": 1.2415961, + "balance_loss_mlp": 1.08796048, + "epoch": 0.005230722982113333, + "flos": 23258048895360.0, + "grad_norm": 2.3977681762661436, + "language_loss": 0.82184505, + "learning_rate": 2.8753846935240833e-06, + "loss": 0.85006875, + "num_input_tokens_seen": 1784980, + "step": 87, + "time_per_iteration": 2.774489164352417 + }, + { + "auxiliary_loss_clip": 0.01650605, + "auxiliary_loss_mlp": 0.01153431, + "balance_loss_clip": 1.23951626, + "balance_loss_mlp": 1.08386052, + "epoch": 0.005290846234781301, + "flos": 16727909264640.0, + "grad_norm": 2.01901411778523, + "language_loss": 0.95756078, + "learning_rate": 2.8827430842847267e-06, + "loss": 0.98560113, + "num_input_tokens_seen": 1803030, + "step": 88, + "time_per_iteration": 2.7408664226531982 + }, + { + "auxiliary_loss_clip": 0.01667033, + "auxiliary_loss_mlp": 0.01150165, + "balance_loss_clip": 1.23940039, + "balance_loss_mlp": 1.08197689, + "epoch": 0.005350969487449271, + "flos": 20886077352960.0, + "grad_norm": 2.1039011967703627, + "language_loss": 0.86048251, + "learning_rate": 2.8900183276075957e-06, + "loss": 0.88865447, + "num_input_tokens_seen": 1822865, + "step": 89, + "time_per_iteration": 2.722506284713745 + }, + { + "auxiliary_loss_clip": 0.01653725, + "auxiliary_loss_mlp": 0.01133211, + "balance_loss_clip": 1.23324633, + "balance_loss_mlp": 1.06597674, + "epoch": 0.00541109274011724, + "flos": 26209977431040.0, + "grad_norm": 3.2206303030981256, + "language_loss": 0.91390312, + "learning_rate": 2.8972122815946455e-06, + "loss": 0.94177246, + "num_input_tokens_seen": 1842435, + "step": 90, + "time_per_iteration": 2.9260141849517822 + }, + { + "auxiliary_loss_clip": 0.0163462, + "auxiliary_loss_mlp": 0.01135252, + "balance_loss_clip": 1.22855091, + "balance_loss_mlp": 1.06587243, + "epoch": 0.00547121599278521, + "flos": 21178569801600.0, + "grad_norm": 3.568256963335304, + "language_loss": 0.85891831, + "learning_rate": 2.90432674275074e-06, + "loss": 0.88661695, + "num_input_tokens_seen": 1860065, + "step": 91, + "time_per_iteration": 2.7664718627929688 + }, + { + "auxiliary_loss_clip": 0.01633567, + "auxiliary_loss_mlp": 0.01138968, + "balance_loss_clip": 1.22489619, + "balance_loss_mlp": 1.07211494, + "epoch": 0.005531339245453179, + "flos": 19718801078400.0, + "grad_norm": 2.2924989918847976, + "language_loss": 0.87128413, + "learning_rate": 2.91136344867656e-06, + "loss": 0.89900941, + "num_input_tokens_seen": 1878135, + "step": 92, + "time_per_iteration": 2.7021617889404297 + }, + { + "auxiliary_loss_clip": 0.01625716, + "auxiliary_loss_mlp": 0.01176677, + "balance_loss_clip": 1.2160331, + "balance_loss_mlp": 1.10815501, + "epoch": 0.005591462498121149, + "flos": 17636089760640.0, + "grad_norm": 2.8931069120785464, + "language_loss": 0.9220686, + "learning_rate": 2.918324080615938e-06, + "loss": 0.95009255, + "num_input_tokens_seen": 1894895, + "step": 93, + "time_per_iteration": 2.701667547225952 + }, + { + "auxiliary_loss_clip": 0.01638372, + "auxiliary_loss_mlp": 0.01148984, + "balance_loss_clip": 1.2225101, + "balance_loss_mlp": 1.07736325, + "epoch": 0.005651585750789117, + "flos": 20011221699840.0, + "grad_norm": 3.4183193060189696, + "language_loss": 0.87358642, + "learning_rate": 2.925210265866963e-06, + "loss": 0.90146005, + "num_input_tokens_seen": 1913220, + "step": 94, + "time_per_iteration": 2.7908637523651123 + }, + { + "auxiliary_loss_clip": 0.0157256, + "auxiliary_loss_mlp": 0.01043273, + "balance_loss_clip": 1.30530524, + "balance_loss_mlp": 1.00531697, + "epoch": 0.005711709003457087, + "flos": 59812957981440.0, + "grad_norm": 1.349962305942747, + "language_loss": 0.68113959, + "learning_rate": 2.932023580065507e-06, + "loss": 0.70729792, + "num_input_tokens_seen": 1970970, + "step": 95, + "time_per_iteration": 3.146547794342041 + }, + { + "auxiliary_loss_clip": 0.01616812, + "auxiliary_loss_mlp": 0.01150359, + "balance_loss_clip": 1.21152472, + "balance_loss_mlp": 1.08231401, + "epoch": 0.005771832256125056, + "flos": 15559591495680.0, + "grad_norm": 4.976417769864593, + "language_loss": 0.90011764, + "learning_rate": 2.9387655493491906e-06, + "loss": 0.92778933, + "num_input_tokens_seen": 1988930, + "step": 96, + "time_per_iteration": 2.84930682182312 + }, + { + "auxiliary_loss_clip": 0.01609206, + "auxiliary_loss_mlp": 0.01143161, + "balance_loss_clip": 1.21448207, + "balance_loss_mlp": 1.07974148, + "epoch": 0.005831955508793026, + "flos": 22528380015360.0, + "grad_norm": 3.2672403014298763, + "language_loss": 0.89606208, + "learning_rate": 2.9454376524092147e-06, + "loss": 0.92358577, + "num_input_tokens_seen": 2006285, + "step": 97, + "time_per_iteration": 2.790029525756836 + }, + { + "auxiliary_loss_clip": 0.01596608, + "auxiliary_loss_mlp": 0.01140574, + "balance_loss_clip": 1.20652533, + "balance_loss_mlp": 1.07105148, + "epoch": 0.005892078761460995, + "flos": 22049834094720.0, + "grad_norm": 2.312490727355087, + "language_loss": 0.76467001, + "learning_rate": 2.952041322436969e-06, + "loss": 0.79204178, + "num_input_tokens_seen": 2024905, + "step": 98, + "time_per_iteration": 2.7208268642425537 + }, + { + "auxiliary_loss_clip": 0.01540873, + "auxiliary_loss_mlp": 0.01041309, + "balance_loss_clip": 1.28620684, + "balance_loss_mlp": 1.00430667, + "epoch": 0.005952202014128965, + "flos": 68539143317760.0, + "grad_norm": 1.0386788469726733, + "language_loss": 0.65474904, + "learning_rate": 2.9585779489718204e-06, + "loss": 0.68057096, + "num_input_tokens_seen": 2086220, + "step": 99, + "time_per_iteration": 3.2485756874084473 + }, + { + "auxiliary_loss_clip": 0.01597873, + "auxiliary_loss_mlp": 0.01146173, + "balance_loss_clip": 1.2060529, + "balance_loss_mlp": 1.07464743, + "epoch": 0.006012325266796933, + "flos": 22960887678720.0, + "grad_norm": 2.082786143771389, + "language_loss": 0.90867281, + "learning_rate": 2.9650488796560464e-06, + "loss": 0.9361133, + "num_input_tokens_seen": 2103365, + "step": 100, + "time_per_iteration": 2.709456443786621 + }, + { + "auxiliary_loss_clip": 0.01608672, + "auxiliary_loss_mlp": 0.01149658, + "balance_loss_clip": 1.20864964, + "balance_loss_mlp": 1.08266211, + "epoch": 0.006072448519464903, + "flos": 17347942857600.0, + "grad_norm": 2.559403570398529, + "language_loss": 0.90997583, + "learning_rate": 2.971455421902446e-06, + "loss": 0.93755907, + "num_input_tokens_seen": 2121995, + "step": 101, + "time_per_iteration": 2.747548818588257 + }, + { + "auxiliary_loss_clip": 0.01597708, + "auxiliary_loss_mlp": 0.01150368, + "balance_loss_clip": 1.20893741, + "balance_loss_mlp": 1.07922399, + "epoch": 0.006132571772132872, + "flos": 24681116897280.0, + "grad_norm": 2.310700794360503, + "language_loss": 0.90773195, + "learning_rate": 2.9777988444798075e-06, + "loss": 0.93521273, + "num_input_tokens_seen": 2141815, + "step": 102, + "time_per_iteration": 2.8696160316467285 + }, + { + "auxiliary_loss_clip": 0.01591399, + "auxiliary_loss_mlp": 0.0113395, + "balance_loss_clip": 1.20608044, + "balance_loss_mlp": 1.06890976, + "epoch": 0.006192695024800842, + "flos": 21465675210240.0, + "grad_norm": 2.62441094854212, + "language_loss": 0.87722194, + "learning_rate": 2.9840803790210285e-06, + "loss": 0.90447545, + "num_input_tokens_seen": 2161125, + "step": 103, + "time_per_iteration": 2.794735908508301 + }, + { + "auxiliary_loss_clip": 0.01590421, + "auxiliary_loss_mlp": 0.01138273, + "balance_loss_clip": 1.20713234, + "balance_loss_mlp": 1.07065749, + "epoch": 0.006252818277468811, + "flos": 17420410546560.0, + "grad_norm": 1.9639995624160855, + "language_loss": 0.93914419, + "learning_rate": 2.990301221458371e-06, + "loss": 0.96643114, + "num_input_tokens_seen": 2179510, + "step": 104, + "time_per_iteration": 2.6781342029571533 + }, + { + "auxiliary_loss_clip": 0.01582335, + "auxiliary_loss_mlp": 0.01148093, + "balance_loss_clip": 1.19798875, + "balance_loss_mlp": 1.08300531, + "epoch": 0.006312941530136781, + "flos": 19099557584640.0, + "grad_norm": 3.311662504206375, + "language_loss": 0.96300036, + "learning_rate": 2.9964625333900544e-06, + "loss": 0.99030465, + "num_input_tokens_seen": 2197870, + "step": 105, + "time_per_iteration": 2.74914288520813 + }, + { + "auxiliary_loss_clip": 0.01581, + "auxiliary_loss_mlp": 0.01158024, + "balance_loss_clip": 1.19786668, + "balance_loss_mlp": 1.08616519, + "epoch": 0.006373064782804749, + "flos": 24060831909120.0, + "grad_norm": 2.4024757024410306, + "language_loss": 0.86978102, + "learning_rate": 3.002565443382063e-06, + "loss": 0.89717132, + "num_input_tokens_seen": 2217495, + "step": 106, + "time_per_iteration": 2.7273354530334473 + }, + { + "auxiliary_loss_clip": 0.01563661, + "auxiliary_loss_mlp": 0.01140898, + "balance_loss_clip": 1.18389714, + "balance_loss_mlp": 1.07213783, + "epoch": 0.006433188035472719, + "flos": 18332433797760.0, + "grad_norm": 6.323285569360312, + "language_loss": 0.83376288, + "learning_rate": 3.008611048208843e-06, + "loss": 0.86080843, + "num_input_tokens_seen": 2236520, + "step": 107, + "time_per_iteration": 2.7020444869995117 + }, + { + "auxiliary_loss_clip": 0.01472641, + "auxiliary_loss_mlp": 0.01034771, + "balance_loss_clip": 1.24029088, + "balance_loss_mlp": 1.00139225, + "epoch": 0.006493311288140688, + "flos": 62562387594240.0, + "grad_norm": 0.9889903989851356, + "language_loss": 0.64767432, + "learning_rate": 3.014600414036285e-06, + "loss": 0.67274845, + "num_input_tokens_seen": 2300140, + "step": 108, + "time_per_iteration": 3.3199644088745117 + }, + { + "auxiliary_loss_clip": 0.0155572, + "auxiliary_loss_mlp": 0.01136413, + "balance_loss_clip": 1.18634558, + "balance_loss_mlp": 1.06655681, + "epoch": 0.006553434540808658, + "flos": 19500141035520.0, + "grad_norm": 2.3638972005894425, + "language_loss": 0.97801107, + "learning_rate": 3.0205345775501937e-06, + "loss": 1.0049324, + "num_input_tokens_seen": 2317320, + "step": 109, + "time_per_iteration": 2.8643527030944824 + }, + { + "auxiliary_loss_clip": 0.01554713, + "auxiliary_loss_mlp": 0.01144521, + "balance_loss_clip": 1.18849874, + "balance_loss_mlp": 1.07719159, + "epoch": 0.006613557793476627, + "flos": 21105132445440.0, + "grad_norm": 1.7984709001463148, + "language_loss": 0.84074354, + "learning_rate": 3.0264145470332218e-06, + "loss": 0.86773586, + "num_input_tokens_seen": 2337820, + "step": 110, + "time_per_iteration": 2.7061426639556885 + }, + { + "auxiliary_loss_clip": 0.01549098, + "auxiliary_loss_mlp": 0.01153652, + "balance_loss_clip": 1.18168998, + "balance_loss_mlp": 1.08651328, + "epoch": 0.006673681046144597, + "flos": 26030747543040.0, + "grad_norm": 2.3981404454478654, + "language_loss": 0.8297447, + "learning_rate": 3.032241303393073e-06, + "loss": 0.85677224, + "num_input_tokens_seen": 2358560, + "step": 111, + "time_per_iteration": 2.795861005783081 + }, + { + "auxiliary_loss_clip": 0.01550659, + "auxiliary_loss_mlp": 0.01132112, + "balance_loss_clip": 1.18534184, + "balance_loss_mlp": 1.06735742, + "epoch": 0.006733804298812566, + "flos": 23147767163520.0, + "grad_norm": 3.2735284001385345, + "language_loss": 0.94107652, + "learning_rate": 3.0380158011446e-06, + "loss": 0.96790421, + "num_input_tokens_seen": 2379005, + "step": 112, + "time_per_iteration": 2.808974266052246 + }, + { + "auxiliary_loss_clip": 0.01554576, + "auxiliary_loss_mlp": 0.01140283, + "balance_loss_clip": 1.18165553, + "balance_loss_mlp": 1.0750041, + "epoch": 0.006793927551480535, + "flos": 11764444210560.0, + "grad_norm": 2.374979431552639, + "language_loss": 0.79605687, + "learning_rate": 3.0437389693482466e-06, + "loss": 0.8230055, + "num_input_tokens_seen": 2395610, + "step": 113, + "time_per_iteration": 2.773986339569092 + }, + { + "auxiliary_loss_clip": 0.01542124, + "auxiliary_loss_mlp": 0.01136502, + "balance_loss_clip": 1.17764425, + "balance_loss_mlp": 1.06907749, + "epoch": 0.006854050804148504, + "flos": 19171953446400.0, + "grad_norm": 1.9756672432270663, + "language_loss": 0.93478584, + "learning_rate": 3.0494117125071475e-06, + "loss": 0.96157205, + "num_input_tokens_seen": 2415005, + "step": 114, + "time_per_iteration": 2.94524884223938 + }, + { + "auxiliary_loss_clip": 0.01550237, + "auxiliary_loss_mlp": 0.01138177, + "balance_loss_clip": 1.17810607, + "balance_loss_mlp": 1.0773325, + "epoch": 0.006914174056816474, + "flos": 21981891519360.0, + "grad_norm": 2.0481031120744877, + "language_loss": 0.94655228, + "learning_rate": 3.055034911425055e-06, + "loss": 0.97343647, + "num_input_tokens_seen": 2433965, + "step": 115, + "time_per_iteration": 3.0120561122894287 + }, + { + "auxiliary_loss_clip": 0.01544604, + "auxiliary_loss_mlp": 0.01120826, + "balance_loss_clip": 1.17515922, + "balance_loss_mlp": 1.05325842, + "epoch": 0.006974297309484443, + "flos": 16289152634880.0, + "grad_norm": 3.6055966249175326, + "language_loss": 0.81858426, + "learning_rate": 3.0606094240271244e-06, + "loss": 0.84523857, + "num_input_tokens_seen": 2451605, + "step": 116, + "time_per_iteration": 2.9391119480133057 + }, + { + "auxiliary_loss_clip": 0.0153605, + "auxiliary_loss_mlp": 0.01127165, + "balance_loss_clip": 1.17563701, + "balance_loss_mlp": 1.06160033, + "epoch": 0.007034420562152413, + "flos": 26104005331200.0, + "grad_norm": 2.146328017805021, + "language_loss": 0.87926286, + "learning_rate": 3.0661360861454656e-06, + "loss": 0.90589499, + "num_input_tokens_seen": 2472035, + "step": 117, + "time_per_iteration": 2.8999104499816895 + }, + { + "auxiliary_loss_clip": 0.01533352, + "auxiliary_loss_mlp": 0.0114874, + "balance_loss_clip": 1.17352557, + "balance_loss_mlp": 1.08245945, + "epoch": 0.007094543814820382, + "flos": 14204609723520.0, + "grad_norm": 4.069462947183592, + "language_loss": 0.84538341, + "learning_rate": 3.071615712271274e-06, + "loss": 0.87220442, + "num_input_tokens_seen": 2489285, + "step": 118, + "time_per_iteration": 2.8751063346862793 + }, + { + "auxiliary_loss_clip": 0.01544736, + "auxiliary_loss_mlp": 0.01162473, + "balance_loss_clip": 1.17469645, + "balance_loss_mlp": 1.09705102, + "epoch": 0.007154667067488351, + "flos": 14976007228800.0, + "grad_norm": 2.4774258348711404, + "language_loss": 0.99610603, + "learning_rate": 3.0770490962752172e-06, + "loss": 1.0231781, + "num_input_tokens_seen": 2506460, + "step": 119, + "time_per_iteration": 2.8432440757751465 + }, + { + "auxiliary_loss_clip": 0.01546008, + "auxiliary_loss_mlp": 0.01119792, + "balance_loss_clip": 1.17091203, + "balance_loss_mlp": 1.05618179, + "epoch": 0.00721479032015632, + "flos": 20193288762240.0, + "grad_norm": 2.080006398694266, + "language_loss": 0.89270306, + "learning_rate": 3.082437012097686e-06, + "loss": 0.91936105, + "num_input_tokens_seen": 2525565, + "step": 120, + "time_per_iteration": 2.8756470680236816 + }, + { + "auxiliary_loss_clip": 0.0153223, + "auxiliary_loss_mlp": 0.01130928, + "balance_loss_clip": 1.17162943, + "balance_loss_mlp": 1.06598258, + "epoch": 0.00727491357282429, + "flos": 23147228459520.0, + "grad_norm": 1.9686957696542076, + "language_loss": 0.93276542, + "learning_rate": 3.0877802144103967e-06, + "loss": 0.95939708, + "num_input_tokens_seen": 2546605, + "step": 121, + "time_per_iteration": 2.9329121112823486 + }, + { + "auxiliary_loss_clip": 0.01533366, + "auxiliary_loss_mlp": 0.01149515, + "balance_loss_clip": 1.17206633, + "balance_loss_mlp": 1.08518982, + "epoch": 0.007335036825492259, + "flos": 15521669712000.0, + "grad_norm": 2.196680715313533, + "language_loss": 0.90296304, + "learning_rate": 3.09307943925077e-06, + "loss": 0.92979181, + "num_input_tokens_seen": 2560730, + "step": 122, + "time_per_iteration": 2.8350698947906494 + }, + { + "auxiliary_loss_clip": 0.01528744, + "auxiliary_loss_mlp": 0.01145117, + "balance_loss_clip": 1.16633379, + "balance_loss_mlp": 1.07669079, + "epoch": 0.007395160078160229, + "flos": 24243365848320.0, + "grad_norm": 3.0728752259851366, + "language_loss": 0.92374444, + "learning_rate": 3.0983354046304154e-06, + "loss": 0.95048302, + "num_input_tokens_seen": 2579550, + "step": 123, + "time_per_iteration": 2.7025771141052246 + }, + { + "auxiliary_loss_clip": 0.01525392, + "auxiliary_loss_mlp": 0.01128554, + "balance_loss_clip": 1.16049099, + "balance_loss_mlp": 1.06518197, + "epoch": 0.007455283330828198, + "flos": 31759792099200.0, + "grad_norm": 2.272928865345462, + "language_loss": 0.71204025, + "learning_rate": 3.103548811118979e-06, + "loss": 0.73857975, + "num_input_tokens_seen": 2600390, + "step": 124, + "time_per_iteration": 2.754821300506592 + }, + { + "auxiliary_loss_clip": 0.01514502, + "auxiliary_loss_mlp": 0.01125458, + "balance_loss_clip": 1.16254711, + "balance_loss_mlp": 1.06098938, + "epoch": 0.007515406583496167, + "flos": 26615157822720.0, + "grad_norm": 2.0551267346343884, + "language_loss": 0.88146126, + "learning_rate": 3.108720342404542e-06, + "loss": 0.90786088, + "num_input_tokens_seen": 2620770, + "step": 125, + "time_per_iteration": 5.952621698379517 + }, + { + "auxiliary_loss_clip": 0.01526406, + "auxiliary_loss_mlp": 0.01141002, + "balance_loss_clip": 1.16089618, + "balance_loss_mlp": 1.07686782, + "epoch": 0.007575529836164136, + "flos": 18223696350720.0, + "grad_norm": 2.795774328186644, + "language_loss": 0.82182956, + "learning_rate": 3.1138506658316945e-06, + "loss": 0.84850359, + "num_input_tokens_seen": 2639900, + "step": 126, + "time_per_iteration": 2.688478946685791 + }, + { + "auxiliary_loss_clip": 0.0152078, + "auxiliary_loss_mlp": 0.01140828, + "balance_loss_clip": 1.16015112, + "balance_loss_mlp": 1.07812417, + "epoch": 0.007635653088832106, + "flos": 21580410228480.0, + "grad_norm": 3.1218483804105244, + "language_loss": 0.67558527, + "learning_rate": 3.1189404329183404e-06, + "loss": 0.70220137, + "num_input_tokens_seen": 2657450, + "step": 127, + "time_per_iteration": 4.304124355316162 + }, + { + "auxiliary_loss_clip": 0.01509996, + "auxiliary_loss_mlp": 0.01131892, + "balance_loss_clip": 1.16231656, + "balance_loss_mlp": 1.06680369, + "epoch": 0.007695776341500075, + "flos": 25375054723200.0, + "grad_norm": 3.5627730879289343, + "language_loss": 0.88199258, + "learning_rate": 3.1239902798522317e-06, + "loss": 0.90841144, + "num_input_tokens_seen": 2678150, + "step": 128, + "time_per_iteration": 2.789644479751587 + }, + { + "auxiliary_loss_clip": 0.01512675, + "auxiliary_loss_mlp": 0.01141235, + "balance_loss_clip": 1.15778089, + "balance_loss_mlp": 1.07700503, + "epoch": 0.007755899594168045, + "flos": 22343906741760.0, + "grad_norm": 1.8153657413573407, + "language_loss": 0.84418726, + "learning_rate": 3.129000827968184e-06, + "loss": 0.87072641, + "num_input_tokens_seen": 2698290, + "step": 129, + "time_per_iteration": 2.685485363006592 + }, + { + "auxiliary_loss_clip": 0.01505093, + "auxiliary_loss_mlp": 0.01136298, + "balance_loss_clip": 1.15645301, + "balance_loss_mlp": 1.07144856, + "epoch": 0.007816022846836013, + "flos": 22638230784000.0, + "grad_norm": 2.4536870435473164, + "language_loss": 0.97419649, + "learning_rate": 3.133972684206866e-06, + "loss": 1.00061047, + "num_input_tokens_seen": 2717630, + "step": 130, + "time_per_iteration": 2.7843704223632812 + }, + { + "auxiliary_loss_clip": 0.01498673, + "auxiliary_loss_mlp": 0.01135997, + "balance_loss_clip": 1.15308833, + "balance_loss_mlp": 1.07076597, + "epoch": 0.007876146099503984, + "flos": 18182901479040.0, + "grad_norm": 2.3657127318271556, + "language_loss": 0.82547629, + "learning_rate": 3.138906441556014e-06, + "loss": 0.85182297, + "num_input_tokens_seen": 2735835, + "step": 131, + "time_per_iteration": 2.6518681049346924 + }, + { + "auxiliary_loss_clip": 0.01508605, + "auxiliary_loss_mlp": 0.01129359, + "balance_loss_clip": 1.1555779, + "balance_loss_mlp": 1.06737018, + "epoch": 0.007936269352171952, + "flos": 27119486730240.0, + "grad_norm": 3.6233140083614415, + "language_loss": 0.82419318, + "learning_rate": 3.143802679474861e-06, + "loss": 0.85057282, + "num_input_tokens_seen": 2756335, + "step": 132, + "time_per_iteration": 2.7956013679504395 + }, + { + "auxiliary_loss_clip": 0.01497931, + "auxiliary_loss_mlp": 0.01129155, + "balance_loss_clip": 1.14956093, + "balance_loss_mlp": 1.06649852, + "epoch": 0.007996392604839923, + "flos": 19026335710080.0, + "grad_norm": 2.1650704831091883, + "language_loss": 0.95536363, + "learning_rate": 3.1486619643025565e-06, + "loss": 0.98163438, + "num_input_tokens_seen": 2775090, + "step": 133, + "time_per_iteration": 2.7174947261810303 + }, + { + "auxiliary_loss_clip": 0.01494167, + "auxiliary_loss_mlp": 0.01127478, + "balance_loss_clip": 1.15804648, + "balance_loss_mlp": 1.06639481, + "epoch": 0.008056515857507891, + "flos": 25484151306240.0, + "grad_norm": 1.7022419639765296, + "language_loss": 0.73419607, + "learning_rate": 3.153484849651286e-06, + "loss": 0.76041257, + "num_input_tokens_seen": 2795320, + "step": 134, + "time_per_iteration": 2.817107677459717 + }, + { + "auxiliary_loss_clip": 0.01490514, + "auxiliary_loss_mlp": 0.0113063, + "balance_loss_clip": 1.14698088, + "balance_loss_mlp": 1.06506538, + "epoch": 0.00811663911017586, + "flos": 20557566541440.0, + "grad_norm": 3.690071999093916, + "language_loss": 0.8869617, + "learning_rate": 3.1582718767847806e-06, + "loss": 0.9131732, + "num_input_tokens_seen": 2812815, + "step": 135, + "time_per_iteration": 2.821535110473633 + }, + { + "auxiliary_loss_clip": 0.01493721, + "auxiliary_loss_mlp": 0.0113481, + "balance_loss_clip": 1.15080249, + "balance_loss_mlp": 1.06910169, + "epoch": 0.00817676236284383, + "flos": 18799738761600.0, + "grad_norm": 2.031261233074354, + "language_loss": 0.89196932, + "learning_rate": 3.1630235749828485e-06, + "loss": 0.91825467, + "num_input_tokens_seen": 2830445, + "step": 136, + "time_per_iteration": 2.7443923950195312 + }, + { + "auxiliary_loss_clip": 0.01491308, + "auxiliary_loss_mlp": 0.01109462, + "balance_loss_clip": 1.14595985, + "balance_loss_mlp": 1.04828429, + "epoch": 0.008236885615511799, + "flos": 23873593288320.0, + "grad_norm": 2.2667306707226165, + "language_loss": 0.84321928, + "learning_rate": 3.1677404618925676e-06, + "loss": 0.86922705, + "num_input_tokens_seen": 2846965, + "step": 137, + "time_per_iteration": 2.664860725402832 + }, + { + "auxiliary_loss_clip": 0.01486724, + "auxiliary_loss_mlp": 0.01122438, + "balance_loss_clip": 1.14510012, + "balance_loss_mlp": 1.06149793, + "epoch": 0.00829700886817977, + "flos": 24643626076800.0, + "grad_norm": 1.8310011491654261, + "language_loss": 0.90017003, + "learning_rate": 3.1724230438666953e-06, + "loss": 0.92626166, + "num_input_tokens_seen": 2867520, + "step": 138, + "time_per_iteration": 2.7025387287139893 + }, + { + "auxiliary_loss_clip": 0.0147677, + "auxiliary_loss_mlp": 0.01122791, + "balance_loss_clip": 1.14385688, + "balance_loss_mlp": 1.05722618, + "epoch": 0.008357132120847738, + "flos": 25262007644160.0, + "grad_norm": 2.4503352460698995, + "language_loss": 0.91379052, + "learning_rate": 3.177071816289865e-06, + "loss": 0.9397862, + "num_input_tokens_seen": 2885675, + "step": 139, + "time_per_iteration": 2.7894792556762695 + }, + { + "auxiliary_loss_clip": 0.01492907, + "auxiliary_loss_mlp": 0.01124831, + "balance_loss_clip": 1.15004158, + "balance_loss_mlp": 1.06083894, + "epoch": 0.008417255373515706, + "flos": 27344898529920.0, + "grad_norm": 2.3169147183954637, + "language_loss": 0.85615921, + "learning_rate": 3.181687263893095e-06, + "loss": 0.88233662, + "num_input_tokens_seen": 2905960, + "step": 140, + "time_per_iteration": 2.7585158348083496 + }, + { + "auxiliary_loss_clip": 0.01478674, + "auxiliary_loss_mlp": 0.01122111, + "balance_loss_clip": 1.14419079, + "balance_loss_mlp": 1.06050372, + "epoch": 0.008477378626183677, + "flos": 17639070589440.0, + "grad_norm": 4.683187943091712, + "language_loss": 0.84420407, + "learning_rate": 3.186269861057098e-06, + "loss": 0.87021196, + "num_input_tokens_seen": 2922780, + "step": 141, + "time_per_iteration": 2.876380681991577 + }, + { + "auxiliary_loss_clip": 0.01482854, + "auxiliary_loss_mlp": 0.01133971, + "balance_loss_clip": 1.14164472, + "balance_loss_mlp": 1.07088494, + "epoch": 0.008537501878851645, + "flos": 13881342297600.0, + "grad_norm": 2.1416185270606976, + "language_loss": 0.81209761, + "learning_rate": 3.1908200721048745e-06, + "loss": 0.8382659, + "num_input_tokens_seen": 2938765, + "step": 142, + "time_per_iteration": 2.6631314754486084 + }, + { + "auxiliary_loss_clip": 0.01383418, + "auxiliary_loss_mlp": 0.01029349, + "balance_loss_clip": 1.18478155, + "balance_loss_mlp": 1.00226426, + "epoch": 0.008597625131519616, + "flos": 71248101281280.0, + "grad_norm": 1.0373877089226184, + "language_loss": 0.6689688, + "learning_rate": 3.195338351584042e-06, + "loss": 0.69309646, + "num_input_tokens_seen": 3006665, + "step": 143, + "time_per_iteration": 3.3560028076171875 + }, + { + "auxiliary_loss_clip": 0.01473737, + "auxiliary_loss_mlp": 0.01125837, + "balance_loss_clip": 1.14183974, + "balance_loss_mlp": 1.06322885, + "epoch": 0.008657748384187584, + "flos": 17602836744960.0, + "grad_norm": 2.4099925217493836, + "language_loss": 0.84046304, + "learning_rate": 3.1998251445393258e-06, + "loss": 0.86645877, + "num_input_tokens_seen": 3024335, + "step": 144, + "time_per_iteration": 2.764874219894409 + }, + { + "auxiliary_loss_clip": 0.01462237, + "auxiliary_loss_mlp": 0.01114125, + "balance_loss_clip": 1.13629282, + "balance_loss_mlp": 1.04922712, + "epoch": 0.008717871636855555, + "flos": 19715317459200.0, + "grad_norm": 2.158358784100212, + "language_loss": 0.88483638, + "learning_rate": 3.204280886775619e-06, + "loss": 0.91059995, + "num_input_tokens_seen": 3043300, + "step": 145, + "time_per_iteration": 2.7723591327667236 + }, + { + "auxiliary_loss_clip": 0.0147812, + "auxiliary_loss_mlp": 0.01126722, + "balance_loss_clip": 1.13795018, + "balance_loss_mlp": 1.06201482, + "epoch": 0.008777994889523523, + "flos": 24717422568960.0, + "grad_norm": 1.955078435744052, + "language_loss": 0.86146319, + "learning_rate": 3.208706005112005e-06, + "loss": 0.88751161, + "num_input_tokens_seen": 3064610, + "step": 146, + "time_per_iteration": 2.8320155143737793 + }, + { + "auxiliary_loss_clip": 0.013632, + "auxiliary_loss_mlp": 0.01026397, + "balance_loss_clip": 1.17151141, + "balance_loss_mlp": 0.99969399, + "epoch": 0.008838118142191492, + "flos": 70132067758080.0, + "grad_norm": 0.8586992379835717, + "language_loss": 0.60076892, + "learning_rate": 3.213100917627104e-06, + "loss": 0.6246649, + "num_input_tokens_seen": 3130385, + "step": 147, + "time_per_iteration": 3.3305439949035645 + }, + { + "auxiliary_loss_clip": 0.01468911, + "auxiliary_loss_mlp": 0.01126434, + "balance_loss_clip": 1.14190173, + "balance_loss_mlp": 1.06630492, + "epoch": 0.008898241394859462, + "flos": 20044797937920.0, + "grad_norm": 1.92421634853192, + "language_loss": 0.84639341, + "learning_rate": 3.2174660338961135e-06, + "loss": 0.87234688, + "num_input_tokens_seen": 3149760, + "step": 148, + "time_per_iteration": 2.659133195877075 + }, + { + "auxiliary_loss_clip": 0.01472287, + "auxiliary_loss_mlp": 0.01145842, + "balance_loss_clip": 1.14295268, + "balance_loss_mlp": 1.07975221, + "epoch": 0.008958364647527431, + "flos": 10743611685120.0, + "grad_norm": 2.3230602014027992, + "language_loss": 0.88818997, + "learning_rate": 3.2218017552198588e-06, + "loss": 0.91437125, + "num_input_tokens_seen": 3164500, + "step": 149, + "time_per_iteration": 2.6913979053497314 + }, + { + "auxiliary_loss_clip": 0.01469293, + "auxiliary_loss_mlp": 0.01114821, + "balance_loss_clip": 1.13759732, + "balance_loss_mlp": 1.05512142, + "epoch": 0.009018487900195401, + "flos": 29127467802240.0, + "grad_norm": 2.4679627529723316, + "language_loss": 0.93029606, + "learning_rate": 3.226108474846181e-06, + "loss": 0.9561373, + "num_input_tokens_seen": 3182455, + "step": 150, + "time_per_iteration": 2.781553030014038 + }, + { + "auxiliary_loss_clip": 0.01456298, + "auxiliary_loss_mlp": 0.01114435, + "balance_loss_clip": 1.13300478, + "balance_loss_mlp": 1.05606997, + "epoch": 0.00907861115286337, + "flos": 32963661354240.0, + "grad_norm": 2.003023573715677, + "language_loss": 0.74187261, + "learning_rate": 3.2303865781839817e-06, + "loss": 0.76757991, + "num_input_tokens_seen": 3203995, + "step": 151, + "time_per_iteration": 2.7865850925445557 + }, + { + "auxiliary_loss_clip": 0.01468891, + "auxiliary_loss_mlp": 0.01126339, + "balance_loss_clip": 1.13775742, + "balance_loss_mlp": 1.06592393, + "epoch": 0.009138734405531338, + "flos": 21762441377280.0, + "grad_norm": 2.9660917861167646, + "language_loss": 0.88154984, + "learning_rate": 3.234636443010188e-06, + "loss": 0.90750217, + "num_input_tokens_seen": 3222575, + "step": 152, + "time_per_iteration": 2.7397165298461914 + }, + { + "auxiliary_loss_clip": 0.01468164, + "auxiliary_loss_mlp": 0.01123133, + "balance_loss_clip": 1.14200783, + "balance_loss_mlp": 1.06181204, + "epoch": 0.009198857658199309, + "flos": 20842517134080.0, + "grad_norm": 6.52069065604263, + "language_loss": 0.84205478, + "learning_rate": 3.238858439669943e-06, + "loss": 0.86796772, + "num_input_tokens_seen": 3240180, + "step": 153, + "time_per_iteration": 2.675072193145752 + }, + { + "auxiliary_loss_clip": 0.01459607, + "auxiliary_loss_mlp": 0.0113581, + "balance_loss_clip": 1.13542438, + "balance_loss_mlp": 1.07367778, + "epoch": 0.009258980910867277, + "flos": 24827381078400.0, + "grad_norm": 1.9207176126764394, + "language_loss": 0.896797, + "learning_rate": 3.2430529312702712e-06, + "loss": 0.92275113, + "num_input_tokens_seen": 3259800, + "step": 154, + "time_per_iteration": 2.785554885864258 + }, + { + "auxiliary_loss_clip": 0.01462856, + "auxiliary_loss_mlp": 0.01156879, + "balance_loss_clip": 1.13728702, + "balance_loss_mlp": 1.09512842, + "epoch": 0.009319104163535248, + "flos": 28767786963840.0, + "grad_norm": 2.0237024569128113, + "language_loss": 0.89765888, + "learning_rate": 3.2472202738674737e-06, + "loss": 0.92385626, + "num_input_tokens_seen": 3280400, + "step": 155, + "time_per_iteration": 2.7373528480529785 + }, + { + "auxiliary_loss_clip": 0.01466265, + "auxiliary_loss_mlp": 0.0112068, + "balance_loss_clip": 1.13478065, + "balance_loss_mlp": 1.061028, + "epoch": 0.009379227416203216, + "flos": 16582004219520.0, + "grad_norm": 2.3058376779776633, + "language_loss": 0.86560762, + "learning_rate": 3.2513608166485063e-06, + "loss": 0.89147699, + "num_input_tokens_seen": 3297600, + "step": 156, + "time_per_iteration": 2.6945650577545166 + }, + { + "auxiliary_loss_clip": 0.01465342, + "auxiliary_loss_mlp": 0.01121025, + "balance_loss_clip": 1.13920736, + "balance_loss_mlp": 1.06103861, + "epoch": 0.009439350668871187, + "flos": 18329919845760.0, + "grad_norm": 2.7930875289959283, + "language_loss": 0.99719423, + "learning_rate": 3.2554749021065498e-06, + "loss": 1.02305794, + "num_input_tokens_seen": 3313635, + "step": 157, + "time_per_iteration": 2.8970866203308105 + }, + { + "auxiliary_loss_clip": 0.01448917, + "auxiliary_loss_mlp": 0.01142835, + "balance_loss_clip": 1.13382959, + "balance_loss_mlp": 1.08213365, + "epoch": 0.009499473921539155, + "flos": 24349912565760.0, + "grad_norm": 2.0448042341630925, + "language_loss": 0.88434863, + "learning_rate": 3.2595628662110186e-06, + "loss": 0.91026616, + "num_input_tokens_seen": 3333735, + "step": 158, + "time_per_iteration": 2.877938985824585 + }, + { + "auxiliary_loss_clip": 0.01455366, + "auxiliary_loss_mlp": 0.01127452, + "balance_loss_clip": 1.13262129, + "balance_loss_mlp": 1.06594014, + "epoch": 0.009559597174207124, + "flos": 16399326625920.0, + "grad_norm": 2.3180502300727324, + "language_loss": 0.86527151, + "learning_rate": 3.2636250385721982e-06, + "loss": 0.89109969, + "num_input_tokens_seen": 3348800, + "step": 159, + "time_per_iteration": 2.8386785984039307 + }, + { + "auxiliary_loss_clip": 0.01443803, + "auxiliary_loss_mlp": 0.0113849, + "balance_loss_clip": 1.12760043, + "balance_loss_mlp": 1.07702577, + "epoch": 0.009619720426875094, + "flos": 22856890826880.0, + "grad_norm": 1.823003653790601, + "language_loss": 0.86545295, + "learning_rate": 3.2676617426007263e-06, + "loss": 0.89127588, + "num_input_tokens_seen": 3368595, + "step": 160, + "time_per_iteration": 2.8097658157348633 + }, + { + "auxiliary_loss_clip": 0.0145166, + "auxiliary_loss_mlp": 0.01122577, + "balance_loss_clip": 1.13363206, + "balance_loss_mlp": 1.06578565, + "epoch": 0.009679843679543063, + "flos": 19135001329920.0, + "grad_norm": 2.1646347778997144, + "language_loss": 0.91326618, + "learning_rate": 3.2716732956621042e-06, + "loss": 0.93900859, + "num_input_tokens_seen": 3384975, + "step": 161, + "time_per_iteration": 2.7702269554138184 + }, + { + "auxiliary_loss_clip": 0.01456122, + "auxiliary_loss_mlp": 0.01113951, + "balance_loss_clip": 1.13284218, + "balance_loss_mlp": 1.05668271, + "epoch": 0.009739966932211033, + "flos": 20302995876480.0, + "grad_norm": 1.9132416672901613, + "language_loss": 0.91767257, + "learning_rate": 3.2756600092264203e-06, + "loss": 0.94337332, + "num_input_tokens_seen": 3404755, + "step": 162, + "time_per_iteration": 2.743861675262451 + }, + { + "auxiliary_loss_clip": 0.0131272, + "auxiliary_loss_mlp": 0.01027304, + "balance_loss_clip": 1.13849413, + "balance_loss_mlp": 1.00288987, + "epoch": 0.009800090184879002, + "flos": 67034234177280.0, + "grad_norm": 1.1765878067545124, + "language_loss": 0.72342646, + "learning_rate": 3.279622189013474e-06, + "loss": 0.74682665, + "num_input_tokens_seen": 3467210, + "step": 163, + "time_per_iteration": 3.2559006214141846 + }, + { + "auxiliary_loss_clip": 0.01440989, + "auxiliary_loss_mlp": 0.01116027, + "balance_loss_clip": 1.12976515, + "balance_loss_mlp": 1.05790067, + "epoch": 0.00986021343754697, + "flos": 17164690646400.0, + "grad_norm": 2.3706007413140076, + "language_loss": 0.84454668, + "learning_rate": 3.283560135133457e-06, + "loss": 0.87011683, + "num_input_tokens_seen": 3483220, + "step": 164, + "time_per_iteration": 2.6766648292541504 + }, + { + "auxiliary_loss_clip": 0.01432556, + "auxiliary_loss_mlp": 0.01104052, + "balance_loss_clip": 1.12256455, + "balance_loss_mlp": 1.04678416, + "epoch": 0.00992033669021494, + "flos": 17749424148480.0, + "grad_norm": 1.9780359896353146, + "language_loss": 0.89003873, + "learning_rate": 3.2874741422233565e-06, + "loss": 0.91540486, + "num_input_tokens_seen": 3501465, + "step": 165, + "time_per_iteration": 2.7192647457122803 + }, + { + "auxiliary_loss_clip": 0.01435332, + "auxiliary_loss_mlp": 0.01130001, + "balance_loss_clip": 1.12274897, + "balance_loss_mlp": 1.0682503, + "epoch": 0.00998045994288291, + "flos": 25297164080640.0, + "grad_norm": 1.8771989627881724, + "language_loss": 0.79965043, + "learning_rate": 3.2913644995792465e-06, + "loss": 0.82530373, + "num_input_tokens_seen": 3520480, + "step": 166, + "time_per_iteration": 2.8459370136260986 + }, + { + "auxiliary_loss_clip": 0.01440091, + "auxiliary_loss_mlp": 0.01128303, + "balance_loss_clip": 1.12644553, + "balance_loss_mlp": 1.06721973, + "epoch": 0.01004058319555088, + "flos": 32298954220800.0, + "grad_norm": 2.8422101564792497, + "language_loss": 0.91533291, + "learning_rate": 3.2952314912845914e-06, + "loss": 0.94101691, + "num_input_tokens_seen": 3539570, + "step": 167, + "time_per_iteration": 2.794205904006958 + }, + { + "auxiliary_loss_clip": 0.01430535, + "auxiliary_loss_mlp": 0.01136586, + "balance_loss_clip": 1.1240859, + "balance_loss_mlp": 1.07893634, + "epoch": 0.010100706448218848, + "flos": 11319941404800.0, + "grad_norm": 3.243291952047374, + "language_loss": 0.90696269, + "learning_rate": 3.299075396334735e-06, + "loss": 0.93263388, + "num_input_tokens_seen": 3555465, + "step": 168, + "time_per_iteration": 2.6639392375946045 + }, + { + "auxiliary_loss_clip": 0.01425785, + "auxiliary_loss_mlp": 0.01107908, + "balance_loss_clip": 1.11994684, + "balance_loss_mlp": 1.04854202, + "epoch": 0.010160829700886819, + "flos": 29719491765120.0, + "grad_norm": 1.6006937285753302, + "language_loss": 0.86931497, + "learning_rate": 3.3028964887576868e-06, + "loss": 0.89465189, + "num_input_tokens_seen": 3578970, + "step": 169, + "time_per_iteration": 2.8036038875579834 + }, + { + "auxiliary_loss_clip": 0.01426488, + "auxiliary_loss_mlp": 0.01116346, + "balance_loss_clip": 1.12163413, + "balance_loss_mlp": 1.05688441, + "epoch": 0.010220952953554787, + "flos": 20412343854720.0, + "grad_norm": 2.143902460110142, + "language_loss": 0.84927499, + "learning_rate": 3.306695037731344e-06, + "loss": 0.87470335, + "num_input_tokens_seen": 3597275, + "step": 170, + "time_per_iteration": 2.681755542755127 + }, + { + "auxiliary_loss_clip": 0.01437496, + "auxiliary_loss_mlp": 0.01134278, + "balance_loss_clip": 1.12283516, + "balance_loss_mlp": 1.07453036, + "epoch": 0.010281076206222756, + "flos": 31285124847360.0, + "grad_norm": 2.839091728958882, + "language_loss": 0.89918816, + "learning_rate": 3.3104713076972827e-06, + "loss": 0.9249059, + "num_input_tokens_seen": 3618905, + "step": 171, + "time_per_iteration": 2.7579524517059326 + }, + { + "auxiliary_loss_clip": 0.0143029, + "auxiliary_loss_mlp": 0.011089, + "balance_loss_clip": 1.12487042, + "balance_loss_mlp": 1.05215645, + "epoch": 0.010341199458890726, + "flos": 21982286568960.0, + "grad_norm": 1.8676106103996866, + "language_loss": 0.88979775, + "learning_rate": 3.314225558471224e-06, + "loss": 0.91518962, + "num_input_tokens_seen": 3639610, + "step": 172, + "time_per_iteration": 2.70339035987854 + }, + { + "auxiliary_loss_clip": 0.014182, + "auxiliary_loss_mlp": 0.01128189, + "balance_loss_clip": 1.11734104, + "balance_loss_mlp": 1.0706346, + "epoch": 0.010401322711558695, + "flos": 30810529422720.0, + "grad_norm": 1.9821452294498765, + "language_loss": 0.81135118, + "learning_rate": 3.317958045350308e-06, + "loss": 0.83681512, + "num_input_tokens_seen": 3664030, + "step": 173, + "time_per_iteration": 5.874817371368408 + }, + { + "auxiliary_loss_clip": 0.01431038, + "auxiliary_loss_mlp": 0.0111238, + "balance_loss_clip": 1.12190771, + "balance_loss_mlp": 1.0573771, + "epoch": 0.010461445964226665, + "flos": 24715124098560.0, + "grad_norm": 2.6905270078960952, + "language_loss": 0.82731295, + "learning_rate": 3.3216690192172596e-06, + "loss": 0.85274708, + "num_input_tokens_seen": 3683615, + "step": 174, + "time_per_iteration": 2.7947936058044434 + }, + { + "auxiliary_loss_clip": 0.01425451, + "auxiliary_loss_mlp": 0.01121426, + "balance_loss_clip": 1.11961019, + "balance_loss_mlp": 1.06406271, + "epoch": 0.010521569216894634, + "flos": 27710361457920.0, + "grad_norm": 2.619277932061431, + "language_loss": 0.72858572, + "learning_rate": 3.325358726641591e-06, + "loss": 0.75405455, + "num_input_tokens_seen": 3704540, + "step": 175, + "time_per_iteration": 4.3025219440460205 + }, + { + "auxiliary_loss_clip": 0.01427866, + "auxiliary_loss_mlp": 0.01128832, + "balance_loss_clip": 1.1204958, + "balance_loss_mlp": 1.06941807, + "epoch": 0.010581692469562603, + "flos": 12458346122880.0, + "grad_norm": 2.6158591870373313, + "language_loss": 0.97700357, + "learning_rate": 3.329027409977902e-06, + "loss": 1.00257063, + "num_input_tokens_seen": 3721320, + "step": 176, + "time_per_iteration": 2.6233396530151367 + }, + { + "auxiliary_loss_clip": 0.01413282, + "auxiliary_loss_mlp": 0.01134274, + "balance_loss_clip": 1.11640882, + "balance_loss_mlp": 1.0787704, + "epoch": 0.010641815722230573, + "flos": 19427601519360.0, + "grad_norm": 2.521015541668131, + "language_loss": 0.77030516, + "learning_rate": 3.3326753074614087e-06, + "loss": 0.79578066, + "num_input_tokens_seen": 3739385, + "step": 177, + "time_per_iteration": 2.662210464477539 + }, + { + "auxiliary_loss_clip": 0.01422905, + "auxiliary_loss_mlp": 0.0110263, + "balance_loss_clip": 1.11550498, + "balance_loss_mlp": 1.04614866, + "epoch": 0.010701938974898541, + "flos": 18332577452160.0, + "grad_norm": 3.352593735031459, + "language_loss": 0.76782846, + "learning_rate": 3.3363026533007716e-06, + "loss": 0.79308379, + "num_input_tokens_seen": 3756360, + "step": 178, + "time_per_iteration": 2.6979384422302246 + }, + { + "auxiliary_loss_clip": 0.01428891, + "auxiliary_loss_mlp": 0.01108547, + "balance_loss_clip": 1.11969638, + "balance_loss_mlp": 1.05027723, + "epoch": 0.010762062227566512, + "flos": 19203985399680.0, + "grad_norm": 5.981378470930928, + "language_loss": 0.84275579, + "learning_rate": 3.3399096777683303e-06, + "loss": 0.86813021, + "num_input_tokens_seen": 3773930, + "step": 179, + "time_per_iteration": 2.726335287094116 + }, + { + "auxiliary_loss_clip": 0.01420853, + "auxiliary_loss_mlp": 0.01111656, + "balance_loss_clip": 1.11366248, + "balance_loss_mlp": 1.05300498, + "epoch": 0.01082218548023448, + "flos": 31425427370880.0, + "grad_norm": 2.0571431568130203, + "language_loss": 0.83626461, + "learning_rate": 3.3434966072878213e-06, + "loss": 0.86158979, + "num_input_tokens_seen": 3793630, + "step": 180, + "time_per_iteration": 2.754516363143921 + }, + { + "auxiliary_loss_clip": 0.01419584, + "auxiliary_loss_mlp": 0.01121646, + "balance_loss_clip": 1.11538315, + "balance_loss_mlp": 1.06344831, + "epoch": 0.01088230873290245, + "flos": 25046436170880.0, + "grad_norm": 2.4223994333391494, + "language_loss": 0.77770269, + "learning_rate": 3.3470636645196674e-06, + "loss": 0.80311489, + "num_input_tokens_seen": 3813610, + "step": 181, + "time_per_iteration": 2.706646680831909 + }, + { + "auxiliary_loss_clip": 0.01415103, + "auxiliary_loss_mlp": 0.01129977, + "balance_loss_clip": 1.11239874, + "balance_loss_mlp": 1.07294738, + "epoch": 0.01094243198557042, + "flos": 22893411980160.0, + "grad_norm": 3.8413061030147246, + "language_loss": 0.76372874, + "learning_rate": 3.3506110684439156e-06, + "loss": 0.78917956, + "num_input_tokens_seen": 3831390, + "step": 182, + "time_per_iteration": 2.6801114082336426 + }, + { + "auxiliary_loss_clip": 0.01413494, + "auxiliary_loss_mlp": 0.01129088, + "balance_loss_clip": 1.11145806, + "balance_loss_mlp": 1.07105732, + "epoch": 0.011002555238238388, + "flos": 17165049782400.0, + "grad_norm": 2.444535764394348, + "language_loss": 0.87720621, + "learning_rate": 3.3541390344409054e-06, + "loss": 0.902632, + "num_input_tokens_seen": 3849705, + "step": 183, + "time_per_iteration": 2.8071675300598145 + }, + { + "auxiliary_loss_clip": 0.01414117, + "auxiliary_loss_mlp": 0.01112239, + "balance_loss_clip": 1.11344194, + "balance_loss_mlp": 1.05978727, + "epoch": 0.011062678490906358, + "flos": 22310150935680.0, + "grad_norm": 2.456092335631355, + "language_loss": 0.86662853, + "learning_rate": 3.357647774369736e-06, + "loss": 0.89189208, + "num_input_tokens_seen": 3869230, + "step": 184, + "time_per_iteration": 2.813713312149048 + }, + { + "auxiliary_loss_clip": 0.01409928, + "auxiliary_loss_mlp": 0.0111294, + "balance_loss_clip": 1.1139127, + "balance_loss_mlp": 1.05598128, + "epoch": 0.011122801743574327, + "flos": 24388373053440.0, + "grad_norm": 2.5333933635664088, + "language_loss": 0.83633077, + "learning_rate": 3.3611374966446085e-06, + "loss": 0.86155951, + "num_input_tokens_seen": 3889735, + "step": 185, + "time_per_iteration": 2.783606767654419 + }, + { + "auxiliary_loss_clip": 0.014202, + "auxiliary_loss_mlp": 0.01112357, + "balance_loss_clip": 1.11148572, + "balance_loss_mlp": 1.05237091, + "epoch": 0.011182924996242297, + "flos": 18150258994560.0, + "grad_norm": 5.3746385183992, + "language_loss": 0.71176469, + "learning_rate": 3.3646084063091142e-06, + "loss": 0.73709035, + "num_input_tokens_seen": 3908855, + "step": 186, + "time_per_iteration": 2.6163811683654785 + }, + { + "auxiliary_loss_clip": 0.01416906, + "auxiliary_loss_mlp": 0.01105532, + "balance_loss_clip": 1.11317015, + "balance_loss_mlp": 1.05136347, + "epoch": 0.011243048248910266, + "flos": 15486800584320.0, + "grad_norm": 2.735522954756486, + "language_loss": 1.02108753, + "learning_rate": 3.3680607051085194e-06, + "loss": 1.04631197, + "num_input_tokens_seen": 3923865, + "step": 187, + "time_per_iteration": 2.622969388961792 + }, + { + "auxiliary_loss_clip": 0.01404811, + "auxiliary_loss_mlp": 0.01111149, + "balance_loss_clip": 1.11065328, + "balance_loss_mlp": 1.05426252, + "epoch": 0.011303171501578235, + "flos": 40916868986880.0, + "grad_norm": 1.677051298861494, + "language_loss": 0.74953824, + "learning_rate": 3.371494591560139e-06, + "loss": 0.77469784, + "num_input_tokens_seen": 3946870, + "step": 188, + "time_per_iteration": 2.856904983520508 + }, + { + "auxiliary_loss_clip": 0.0130787, + "auxiliary_loss_mlp": 0.01103872, + "balance_loss_clip": 1.12064946, + "balance_loss_mlp": 1.08022082, + "epoch": 0.011363294754246205, + "flos": 66302697790080.0, + "grad_norm": 0.7773408136432085, + "language_loss": 0.56186926, + "learning_rate": 3.3749102610218297e-06, + "loss": 0.58598673, + "num_input_tokens_seen": 4010005, + "step": 189, + "time_per_iteration": 3.3736276626586914 + }, + { + "auxiliary_loss_clip": 0.01404354, + "auxiliary_loss_mlp": 0.01123381, + "balance_loss_clip": 1.10829449, + "balance_loss_mlp": 1.06649435, + "epoch": 0.011423418006914174, + "flos": 24900279730560.0, + "grad_norm": 2.287074480313007, + "language_loss": 0.95035338, + "learning_rate": 3.3783079057586833e-06, + "loss": 0.9756307, + "num_input_tokens_seen": 4029035, + "step": 190, + "time_per_iteration": 2.676772117614746 + }, + { + "auxiliary_loss_clip": 0.01404429, + "auxiliary_loss_mlp": 0.01106916, + "balance_loss_clip": 1.10978556, + "balance_loss_mlp": 1.05291414, + "epoch": 0.011483541259582144, + "flos": 19791879298560.0, + "grad_norm": 4.881378874755272, + "language_loss": 0.84609079, + "learning_rate": 3.3816877150079665e-06, + "loss": 0.87120432, + "num_input_tokens_seen": 4046995, + "step": 191, + "time_per_iteration": 2.6798179149627686 + }, + { + "auxiliary_loss_clip": 0.01405535, + "auxiliary_loss_mlp": 0.01122805, + "balance_loss_clip": 1.10772681, + "balance_loss_mlp": 1.06901777, + "epoch": 0.011543664512250112, + "flos": 26176939896960.0, + "grad_norm": 2.083813367331014, + "language_loss": 0.91668558, + "learning_rate": 3.385049875042367e-06, + "loss": 0.94196904, + "num_input_tokens_seen": 4065865, + "step": 192, + "time_per_iteration": 2.7140610218048096 + }, + { + "auxiliary_loss_clip": 0.01401826, + "auxiliary_loss_mlp": 0.01120785, + "balance_loss_clip": 1.10925984, + "balance_loss_mlp": 1.06180036, + "epoch": 0.011603787764918083, + "flos": 23768985905280.0, + "grad_norm": 2.3815652395730047, + "language_loss": 0.86949253, + "learning_rate": 3.3883945692315938e-06, + "loss": 0.89471865, + "num_input_tokens_seen": 4085305, + "step": 193, + "time_per_iteration": 2.674741744995117 + }, + { + "auxiliary_loss_clip": 0.0140435, + "auxiliary_loss_mlp": 0.01103137, + "balance_loss_clip": 1.10723114, + "balance_loss_mlp": 1.04975474, + "epoch": 0.011663911017586051, + "flos": 25954688494080.0, + "grad_norm": 2.620593063126082, + "language_loss": 0.92194819, + "learning_rate": 3.3917219781023906e-06, + "loss": 0.94702303, + "num_input_tokens_seen": 4105185, + "step": 194, + "time_per_iteration": 2.772975206375122 + }, + { + "auxiliary_loss_clip": 0.01408718, + "auxiliary_loss_mlp": 0.01108389, + "balance_loss_clip": 1.11203361, + "balance_loss_mlp": 1.05402923, + "epoch": 0.01172403427025402, + "flos": 17895149625600.0, + "grad_norm": 2.6696368851427876, + "language_loss": 0.89810824, + "learning_rate": 3.3950322793970014e-06, + "loss": 0.92327929, + "num_input_tokens_seen": 4123160, + "step": 195, + "time_per_iteration": 2.5723025798797607 + }, + { + "auxiliary_loss_clip": 0.01405175, + "auxiliary_loss_mlp": 0.01125569, + "balance_loss_clip": 1.11255264, + "balance_loss_mlp": 1.06882513, + "epoch": 0.01178415752292199, + "flos": 17894539094400.0, + "grad_norm": 2.5705329587217074, + "language_loss": 0.85707134, + "learning_rate": 3.3983256481301445e-06, + "loss": 0.88237882, + "num_input_tokens_seen": 4140425, + "step": 196, + "time_per_iteration": 2.6537108421325684 + }, + { + "auxiliary_loss_clip": 0.01401677, + "auxiliary_loss_mlp": 0.01112212, + "balance_loss_clip": 1.10833681, + "balance_loss_mlp": 1.05632687, + "epoch": 0.011844280775589959, + "flos": 22893555634560.0, + "grad_norm": 2.457279310649977, + "language_loss": 0.93237966, + "learning_rate": 3.4016022566445335e-06, + "loss": 0.95751858, + "num_input_tokens_seen": 4159555, + "step": 197, + "time_per_iteration": 2.7015581130981445 + }, + { + "auxiliary_loss_clip": 0.01399671, + "auxiliary_loss_mlp": 0.01121913, + "balance_loss_clip": 1.11041856, + "balance_loss_mlp": 1.06702876, + "epoch": 0.01190440402825793, + "flos": 26980333441920.0, + "grad_norm": 2.3975730490711626, + "language_loss": 0.78779101, + "learning_rate": 3.4048622746649966e-06, + "loss": 0.81300688, + "num_input_tokens_seen": 4180480, + "step": 198, + "time_per_iteration": 2.7012104988098145 + }, + { + "auxiliary_loss_clip": 0.01396871, + "auxiliary_loss_mlp": 0.01124797, + "balance_loss_clip": 1.11142921, + "balance_loss_mlp": 1.07143903, + "epoch": 0.011964527280925898, + "flos": 20521584092160.0, + "grad_norm": 1.9945018822893117, + "language_loss": 0.88292295, + "learning_rate": 3.4081058693512278e-06, + "loss": 0.90813959, + "num_input_tokens_seen": 4198835, + "step": 199, + "time_per_iteration": 2.6237311363220215 + }, + { + "auxiliary_loss_clip": 0.01409655, + "auxiliary_loss_mlp": 0.01134804, + "balance_loss_clip": 1.11574399, + "balance_loss_mlp": 1.07600951, + "epoch": 0.012024650533593867, + "flos": 27745984771200.0, + "grad_norm": 2.449378322458857, + "language_loss": 0.81414747, + "learning_rate": 3.411333205349222e-06, + "loss": 0.83959198, + "num_input_tokens_seen": 4219335, + "step": 200, + "time_per_iteration": 2.6472690105438232 + }, + { + "auxiliary_loss_clip": 0.01406121, + "auxiliary_loss_mlp": 0.01107184, + "balance_loss_clip": 1.11064541, + "balance_loss_mlp": 1.05201387, + "epoch": 0.012084773786261837, + "flos": 10452017076480.0, + "grad_norm": 2.295938684884742, + "language_loss": 0.87582088, + "learning_rate": 3.4145444448414217e-06, + "loss": 0.90095395, + "num_input_tokens_seen": 4236940, + "step": 201, + "time_per_iteration": 2.6147453784942627 + }, + { + "auxiliary_loss_clip": 0.01399954, + "auxiliary_loss_mlp": 0.01108211, + "balance_loss_clip": 1.11002731, + "balance_loss_mlp": 1.05220628, + "epoch": 0.012144897038929806, + "flos": 23105751229440.0, + "grad_norm": 1.8844238677057512, + "language_loss": 0.840693, + "learning_rate": 3.4177397475956223e-06, + "loss": 0.86577463, + "num_input_tokens_seen": 4256755, + "step": 202, + "time_per_iteration": 2.691176414489746 + }, + { + "auxiliary_loss_clip": 0.01390215, + "auxiliary_loss_mlp": 0.01109102, + "balance_loss_clip": 1.10327077, + "balance_loss_mlp": 1.05521953, + "epoch": 0.012205020291597776, + "flos": 21033203460480.0, + "grad_norm": 4.082316262052686, + "language_loss": 0.90008092, + "learning_rate": 3.4209192710126685e-06, + "loss": 0.9250741, + "num_input_tokens_seen": 4276505, + "step": 203, + "time_per_iteration": 2.6729233264923096 + }, + { + "auxiliary_loss_clip": 0.01329958, + "auxiliary_loss_mlp": 0.01123006, + "balance_loss_clip": 1.1587553, + "balance_loss_mlp": 1.10154808, + "epoch": 0.012265143544265745, + "flos": 68447785075200.0, + "grad_norm": 1.0332042968615376, + "language_loss": 0.61213887, + "learning_rate": 3.4240831701729837e-06, + "loss": 0.63666844, + "num_input_tokens_seen": 4330965, + "step": 204, + "time_per_iteration": 3.1664140224456787 + }, + { + "auxiliary_loss_clip": 0.01397758, + "auxiliary_loss_mlp": 0.01109795, + "balance_loss_clip": 1.10480165, + "balance_loss_mlp": 1.05567384, + "epoch": 0.012325266796933715, + "flos": 17019252478080.0, + "grad_norm": 2.65871662316326, + "language_loss": 0.91437382, + "learning_rate": 3.4272315978819516e-06, + "loss": 0.93944931, + "num_input_tokens_seen": 4348200, + "step": 205, + "time_per_iteration": 2.7193832397460938 + }, + { + "auxiliary_loss_clip": 0.01407667, + "auxiliary_loss_mlp": 0.0112178, + "balance_loss_clip": 1.11054134, + "balance_loss_mlp": 1.06575131, + "epoch": 0.012385390049601683, + "flos": 20190056538240.0, + "grad_norm": 2.8555605863689926, + "language_loss": 0.89046174, + "learning_rate": 3.4303647047142043e-06, + "loss": 0.91575623, + "num_input_tokens_seen": 4365460, + "step": 206, + "time_per_iteration": 3.0111751556396484 + }, + { + "auxiliary_loss_clip": 0.01396499, + "auxiliary_loss_mlp": 0.0110115, + "balance_loss_clip": 1.10390389, + "balance_loss_mlp": 1.04848289, + "epoch": 0.012445513302269652, + "flos": 16253134272000.0, + "grad_norm": 6.751993597505026, + "language_loss": 0.9553166, + "learning_rate": 3.43348263905683e-06, + "loss": 0.9802931, + "num_input_tokens_seen": 4383650, + "step": 207, + "time_per_iteration": 2.7638027667999268 + }, + { + "auxiliary_loss_clip": 0.01394689, + "auxiliary_loss_mlp": 0.01121156, + "balance_loss_clip": 1.10831583, + "balance_loss_mlp": 1.06748772, + "epoch": 0.012505636554937622, + "flos": 23769380954880.0, + "grad_norm": 2.7653240837414748, + "language_loss": 0.75963068, + "learning_rate": 3.436585547151547e-06, + "loss": 0.7847892, + "num_input_tokens_seen": 4403765, + "step": 208, + "time_per_iteration": 2.8403544425964355 + }, + { + "auxiliary_loss_clip": 0.0138365, + "auxiliary_loss_mlp": 0.01114046, + "balance_loss_clip": 1.10270083, + "balance_loss_mlp": 1.06085467, + "epoch": 0.012565759807605591, + "flos": 30591546157440.0, + "grad_norm": 2.549116165782525, + "language_loss": 0.98745322, + "learning_rate": 3.4396735731358586e-06, + "loss": 1.01243019, + "num_input_tokens_seen": 4421935, + "step": 209, + "time_per_iteration": 2.7109599113464355 + }, + { + "auxiliary_loss_clip": 0.01390948, + "auxiliary_loss_mlp": 0.01118971, + "balance_loss_clip": 1.10572588, + "balance_loss_mlp": 1.06470692, + "epoch": 0.012625883060273561, + "flos": 40113511355520.0, + "grad_norm": 2.8471392299542155, + "language_loss": 0.85392559, + "learning_rate": 3.4427468590832302e-06, + "loss": 0.87902474, + "num_input_tokens_seen": 4441470, + "step": 210, + "time_per_iteration": 2.8976383209228516 + }, + { + "auxiliary_loss_clip": 0.01388039, + "auxiliary_loss_mlp": 0.01117999, + "balance_loss_clip": 1.10358953, + "balance_loss_mlp": 1.06704903, + "epoch": 0.01268600631294153, + "flos": 27089178629760.0, + "grad_norm": 2.1186699313032076, + "language_loss": 0.9700526, + "learning_rate": 3.445805545042314e-06, + "loss": 0.99511302, + "num_input_tokens_seen": 4459950, + "step": 211, + "time_per_iteration": 2.7972559928894043 + }, + { + "auxiliary_loss_clip": 0.01396615, + "auxiliary_loss_mlp": 0.01122136, + "balance_loss_clip": 1.10839701, + "balance_loss_mlp": 1.06813431, + "epoch": 0.012746129565609499, + "flos": 16982767238400.0, + "grad_norm": 4.163302508669801, + "language_loss": 0.95254338, + "learning_rate": 3.448849769075239e-06, + "loss": 0.97773087, + "num_input_tokens_seen": 4478390, + "step": 212, + "time_per_iteration": 2.776641368865967 + }, + { + "auxiliary_loss_clip": 0.01383872, + "auxiliary_loss_mlp": 0.01123837, + "balance_loss_clip": 1.10451484, + "balance_loss_mlp": 1.07028842, + "epoch": 0.012806252818277469, + "flos": 46533476995200.0, + "grad_norm": 2.579101667844445, + "language_loss": 0.76093161, + "learning_rate": 3.4518796672950093e-06, + "loss": 0.78600872, + "num_input_tokens_seen": 4501665, + "step": 213, + "time_per_iteration": 2.9591715335845947 + }, + { + "auxiliary_loss_clip": 0.01389851, + "auxiliary_loss_mlp": 0.011169, + "balance_loss_clip": 1.10408318, + "balance_loss_mlp": 1.06504369, + "epoch": 0.012866376070945438, + "flos": 14388616120320.0, + "grad_norm": 2.769847718384219, + "language_loss": 0.86801964, + "learning_rate": 3.4548953739020187e-06, + "loss": 0.89308715, + "num_input_tokens_seen": 4519055, + "step": 214, + "time_per_iteration": 2.7854056358337402 + }, + { + "auxiliary_loss_clip": 0.0138374, + "auxiliary_loss_mlp": 0.01130705, + "balance_loss_clip": 1.10682404, + "balance_loss_mlp": 1.07574964, + "epoch": 0.012926499323613408, + "flos": 26140813793280.0, + "grad_norm": 2.1755305594582204, + "language_loss": 0.77636886, + "learning_rate": 3.4578970212197196e-06, + "loss": 0.80151325, + "num_input_tokens_seen": 4540870, + "step": 215, + "time_per_iteration": 2.8000433444976807 + }, + { + "auxiliary_loss_clip": 0.01394076, + "auxiliary_loss_mlp": 0.01110401, + "balance_loss_clip": 1.10618079, + "balance_loss_mlp": 1.05814004, + "epoch": 0.012986622576281377, + "flos": 30117202128000.0, + "grad_norm": 2.3908922671138413, + "language_loss": 0.90400618, + "learning_rate": 3.460884739729461e-06, + "loss": 0.92905092, + "num_input_tokens_seen": 4560395, + "step": 216, + "time_per_iteration": 2.801731824874878 + }, + { + "auxiliary_loss_clip": 0.01384703, + "auxiliary_loss_mlp": 0.01107129, + "balance_loss_clip": 1.09945202, + "balance_loss_mlp": 1.05491483, + "epoch": 0.013046745828949347, + "flos": 13954025468160.0, + "grad_norm": 2.4405509137373955, + "language_loss": 0.93448114, + "learning_rate": 3.463858658104523e-06, + "loss": 0.9593994, + "num_input_tokens_seen": 4575785, + "step": 217, + "time_per_iteration": 2.7048463821411133 + }, + { + "auxiliary_loss_clip": 0.01381422, + "auxiliary_loss_mlp": 0.01109423, + "balance_loss_clip": 1.1000756, + "balance_loss_mlp": 1.0542531, + "epoch": 0.013106869081617315, + "flos": 17347835116800.0, + "grad_norm": 2.1991453389197764, + "language_loss": 0.93463945, + "learning_rate": 3.4668189032433696e-06, + "loss": 0.95954794, + "num_input_tokens_seen": 4594985, + "step": 218, + "time_per_iteration": 2.7718052864074707 + }, + { + "auxiliary_loss_clip": 0.01377174, + "auxiliary_loss_mlp": 0.01117923, + "balance_loss_clip": 1.09926534, + "balance_loss_mlp": 1.0661149, + "epoch": 0.013166992334285284, + "flos": 25884914325120.0, + "grad_norm": 2.044489378574315, + "language_loss": 0.86255801, + "learning_rate": 3.46976560030214e-06, + "loss": 0.88750899, + "num_input_tokens_seen": 4616125, + "step": 219, + "time_per_iteration": 2.7873387336730957 + }, + { + "auxiliary_loss_clip": 0.01381256, + "auxiliary_loss_mlp": 0.01103662, + "balance_loss_clip": 1.10089993, + "balance_loss_mlp": 1.05197263, + "epoch": 0.013227115586953254, + "flos": 31175956437120.0, + "grad_norm": 1.6714964981298814, + "language_loss": 0.87326306, + "learning_rate": 3.4726988727263976e-06, + "loss": 0.89811224, + "num_input_tokens_seen": 4637795, + "step": 220, + "time_per_iteration": 4.28123927116394 + }, + { + "auxiliary_loss_clip": 0.01375753, + "auxiliary_loss_mlp": 0.01115117, + "balance_loss_clip": 1.09861588, + "balance_loss_mlp": 1.06726623, + "epoch": 0.013287238839621223, + "flos": 20409470766720.0, + "grad_norm": 1.996106585465144, + "language_loss": 0.86340046, + "learning_rate": 3.475618842282164e-06, + "loss": 0.88830918, + "num_input_tokens_seen": 4656835, + "step": 221, + "time_per_iteration": 4.233871936798096 + }, + { + "auxiliary_loss_clip": 0.01380026, + "auxiliary_loss_mlp": 0.01114108, + "balance_loss_clip": 1.09735501, + "balance_loss_mlp": 1.06175172, + "epoch": 0.013347362092289193, + "flos": 14137134024960.0, + "grad_norm": 2.1139699006985917, + "language_loss": 0.92205459, + "learning_rate": 3.4785256290862486e-06, + "loss": 0.94699591, + "num_input_tokens_seen": 4673015, + "step": 222, + "time_per_iteration": 2.699824333190918 + }, + { + "auxiliary_loss_clip": 0.01375479, + "auxiliary_loss_mlp": 0.01109664, + "balance_loss_clip": 1.09925139, + "balance_loss_mlp": 1.05401671, + "epoch": 0.013407485344957162, + "flos": 21797705554560.0, + "grad_norm": 2.919475834409171, + "language_loss": 0.95718354, + "learning_rate": 3.481419351635897e-06, + "loss": 0.98203498, + "num_input_tokens_seen": 4692355, + "step": 223, + "time_per_iteration": 4.417291164398193 + }, + { + "auxiliary_loss_clip": 0.01375127, + "auxiliary_loss_mlp": 0.01109786, + "balance_loss_clip": 1.09935689, + "balance_loss_mlp": 1.05866885, + "epoch": 0.013467608597625132, + "flos": 18621622195200.0, + "grad_norm": 3.19787346297187, + "language_loss": 0.87815559, + "learning_rate": 3.484300126837776e-06, + "loss": 0.90300477, + "num_input_tokens_seen": 4710080, + "step": 224, + "time_per_iteration": 2.6278574466705322 + }, + { + "auxiliary_loss_clip": 0.0137567, + "auxiliary_loss_mlp": 0.01105977, + "balance_loss_clip": 1.0984571, + "balance_loss_mlp": 1.05118871, + "epoch": 0.013527731850293101, + "flos": 18552314903040.0, + "grad_norm": 2.0180756479623962, + "language_loss": 0.893327, + "learning_rate": 3.487168070036317e-06, + "loss": 0.91814351, + "num_input_tokens_seen": 4728980, + "step": 225, + "time_per_iteration": 2.6404550075531006 + }, + { + "auxiliary_loss_clip": 0.01371936, + "auxiliary_loss_mlp": 0.01119511, + "balance_loss_clip": 1.0975461, + "balance_loss_mlp": 1.06601012, + "epoch": 0.01358785510296107, + "flos": 19165381257600.0, + "grad_norm": 3.351896294186231, + "language_loss": 0.99065119, + "learning_rate": 3.4900232950414224e-06, + "loss": 1.01556575, + "num_input_tokens_seen": 4747020, + "step": 226, + "time_per_iteration": 2.746309995651245 + }, + { + "auxiliary_loss_clip": 0.0137815, + "auxiliary_loss_mlp": 0.01116441, + "balance_loss_clip": 1.10079217, + "balance_loss_mlp": 1.06167614, + "epoch": 0.01364797835562904, + "flos": 23329941966720.0, + "grad_norm": 2.5420060055977456, + "language_loss": 0.91040832, + "learning_rate": 3.4928659141555727e-06, + "loss": 0.93535423, + "num_input_tokens_seen": 4765000, + "step": 227, + "time_per_iteration": 2.6502671241760254 + }, + { + "auxiliary_loss_clip": 0.01256599, + "auxiliary_loss_mlp": 0.01061414, + "balance_loss_clip": 1.10722363, + "balance_loss_mlp": 1.04262698, + "epoch": 0.013708101608297009, + "flos": 70993746097920.0, + "grad_norm": 0.9448336461586809, + "language_loss": 0.57632434, + "learning_rate": 3.4956960382003234e-06, + "loss": 0.59950453, + "num_input_tokens_seen": 4833210, + "step": 228, + "time_per_iteration": 3.284337043762207 + }, + { + "auxiliary_loss_clip": 0.01367104, + "auxiliary_loss_mlp": 0.0111237, + "balance_loss_clip": 1.09625006, + "balance_loss_mlp": 1.06206357, + "epoch": 0.013768224860964979, + "flos": 16325170997760.0, + "grad_norm": 4.726262672713423, + "language_loss": 0.87692118, + "learning_rate": 3.4985137765422354e-06, + "loss": 0.90171587, + "num_input_tokens_seen": 4850120, + "step": 229, + "time_per_iteration": 2.7367987632751465 + }, + { + "auxiliary_loss_clip": 0.01376927, + "auxiliary_loss_mlp": 0.01102878, + "balance_loss_clip": 1.09788978, + "balance_loss_mlp": 1.05137992, + "epoch": 0.013828348113632948, + "flos": 20193037367040.0, + "grad_norm": 3.3767975002857615, + "language_loss": 0.83971554, + "learning_rate": 3.501319237118231e-06, + "loss": 0.86451364, + "num_input_tokens_seen": 4866215, + "step": 230, + "time_per_iteration": 2.8419981002807617 + }, + { + "auxiliary_loss_clip": 0.01373342, + "auxiliary_loss_mlp": 0.0112007, + "balance_loss_clip": 1.0981102, + "balance_loss_mlp": 1.06895292, + "epoch": 0.013888471366300916, + "flos": 20741070147840.0, + "grad_norm": 3.4655070632861746, + "language_loss": 0.90509701, + "learning_rate": 3.5041125264604056e-06, + "loss": 0.93003112, + "num_input_tokens_seen": 4885630, + "step": 231, + "time_per_iteration": 2.6490917205810547 + }, + { + "auxiliary_loss_clip": 0.01375213, + "auxiliary_loss_mlp": 0.01110283, + "balance_loss_clip": 1.10066676, + "balance_loss_mlp": 1.05978644, + "epoch": 0.013948594618968886, + "flos": 22090628966400.0, + "grad_norm": 2.6967742992775814, + "language_loss": 0.8380698, + "learning_rate": 3.5068937497203002e-06, + "loss": 0.86292475, + "num_input_tokens_seen": 4905570, + "step": 232, + "time_per_iteration": 2.7096335887908936 + }, + { + "auxiliary_loss_clip": 0.01379098, + "auxiliary_loss_mlp": 0.01105075, + "balance_loss_clip": 1.09303188, + "balance_loss_mlp": 1.05166948, + "epoch": 0.014008717871636855, + "flos": 19063108258560.0, + "grad_norm": 6.0185411195112035, + "language_loss": 0.74352396, + "learning_rate": 3.509663010692652e-06, + "loss": 0.76836562, + "num_input_tokens_seen": 4923535, + "step": 233, + "time_per_iteration": 2.813244104385376 + }, + { + "auxiliary_loss_clip": 0.01380638, + "auxiliary_loss_mlp": 0.0112494, + "balance_loss_clip": 1.09952664, + "balance_loss_mlp": 1.07110488, + "epoch": 0.014068841124304825, + "flos": 14530822064640.0, + "grad_norm": 2.440766351954348, + "language_loss": 0.85784793, + "learning_rate": 3.512420411838642e-06, + "loss": 0.8829037, + "num_input_tokens_seen": 4939200, + "step": 234, + "time_per_iteration": 2.627843141555786 + }, + { + "auxiliary_loss_clip": 0.01373807, + "auxiliary_loss_mlp": 0.01113669, + "balance_loss_clip": 1.09972477, + "balance_loss_mlp": 1.06372046, + "epoch": 0.014128964376972794, + "flos": 18077396256000.0, + "grad_norm": 2.3905184693551895, + "language_loss": 0.89276969, + "learning_rate": 3.515166054308634e-06, + "loss": 0.91764438, + "num_input_tokens_seen": 4956620, + "step": 235, + "time_per_iteration": 2.8379976749420166 + }, + { + "auxiliary_loss_clip": 0.01372784, + "auxiliary_loss_mlp": 0.01114221, + "balance_loss_clip": 1.10048819, + "balance_loss_mlp": 1.06272244, + "epoch": 0.014189087629640764, + "flos": 25334331678720.0, + "grad_norm": 2.192164413401438, + "language_loss": 0.85465103, + "learning_rate": 3.5179000379644498e-06, + "loss": 0.87952107, + "num_input_tokens_seen": 4975650, + "step": 236, + "time_per_iteration": 2.6618595123291016 + }, + { + "auxiliary_loss_clip": 0.01366843, + "auxiliary_loss_mlp": 0.01107483, + "balance_loss_clip": 1.09211957, + "balance_loss_mlp": 1.05591297, + "epoch": 0.014249210882308733, + "flos": 36139744713600.0, + "grad_norm": 3.236001491687545, + "language_loss": 0.82511795, + "learning_rate": 3.520622461401154e-06, + "loss": 0.84986126, + "num_input_tokens_seen": 4997415, + "step": 237, + "time_per_iteration": 2.759021759033203 + }, + { + "auxiliary_loss_clip": 0.01366546, + "auxiliary_loss_mlp": 0.01121161, + "balance_loss_clip": 1.09617758, + "balance_loss_mlp": 1.06763554, + "epoch": 0.014309334134976702, + "flos": 12932977461120.0, + "grad_norm": 2.019746277273406, + "language_loss": 0.77129132, + "learning_rate": 3.5233334219683935e-06, + "loss": 0.79616833, + "num_input_tokens_seen": 5013905, + "step": 238, + "time_per_iteration": 2.723698377609253 + }, + { + "auxiliary_loss_clip": 0.01363555, + "auxiliary_loss_mlp": 0.01128043, + "balance_loss_clip": 1.09777069, + "balance_loss_mlp": 1.07962036, + "epoch": 0.014369457387644672, + "flos": 20777519473920.0, + "grad_norm": 2.955397374045151, + "language_loss": 0.86862135, + "learning_rate": 3.526033015791284e-06, + "loss": 0.89353728, + "num_input_tokens_seen": 5033645, + "step": 239, + "time_per_iteration": 2.6798317432403564 + }, + { + "auxiliary_loss_clip": 0.01348428, + "auxiliary_loss_mlp": 0.01106209, + "balance_loss_clip": 1.08805919, + "balance_loss_mlp": 1.05742824, + "epoch": 0.01442958064031264, + "flos": 25848536826240.0, + "grad_norm": 2.3556555829141983, + "language_loss": 0.93252432, + "learning_rate": 3.528721337790862e-06, + "loss": 0.95707071, + "num_input_tokens_seen": 5052875, + "step": 240, + "time_per_iteration": 2.689141035079956 + }, + { + "auxiliary_loss_clip": 0.01356755, + "auxiliary_loss_mlp": 0.0111023, + "balance_loss_clip": 1.09333742, + "balance_loss_mlp": 1.06292772, + "epoch": 0.014489703892980611, + "flos": 28219718269440.0, + "grad_norm": 2.6663127518675203, + "language_loss": 0.85197282, + "learning_rate": 3.531398481704111e-06, + "loss": 0.8766427, + "num_input_tokens_seen": 5075005, + "step": 241, + "time_per_iteration": 2.7327721118927 + }, + { + "auxiliary_loss_clip": 0.01355377, + "auxiliary_loss_mlp": 0.01131404, + "balance_loss_clip": 1.0982579, + "balance_loss_mlp": 1.08069217, + "epoch": 0.01454982714564858, + "flos": 22490925108480.0, + "grad_norm": 1.9653640894801685, + "language_loss": 0.8850379, + "learning_rate": 3.534064540103573e-06, + "loss": 0.90990573, + "num_input_tokens_seen": 5091875, + "step": 242, + "time_per_iteration": 2.6245076656341553 + }, + { + "auxiliary_loss_clip": 0.01355914, + "auxiliary_loss_mlp": 0.01104456, + "balance_loss_clip": 1.0918498, + "balance_loss_mlp": 1.0522666, + "epoch": 0.014609950398316548, + "flos": 21653201139840.0, + "grad_norm": 2.2389674631677488, + "language_loss": 0.86789858, + "learning_rate": 3.536719604416555e-06, + "loss": 0.89250231, + "num_input_tokens_seen": 5111290, + "step": 243, + "time_per_iteration": 3.0442543029785156 + }, + { + "auxiliary_loss_clip": 0.01362501, + "auxiliary_loss_mlp": 0.0111445, + "balance_loss_clip": 1.09421897, + "balance_loss_mlp": 1.06319046, + "epoch": 0.014670073650984519, + "flos": 21869993675520.0, + "grad_norm": 1.6919519797543334, + "language_loss": 0.84256965, + "learning_rate": 3.5393637649439464e-06, + "loss": 0.86733913, + "num_input_tokens_seen": 5132265, + "step": 244, + "time_per_iteration": 3.109563112258911 + }, + { + "auxiliary_loss_clip": 0.01368899, + "auxiliary_loss_mlp": 0.01109947, + "balance_loss_clip": 1.09585011, + "balance_loss_mlp": 1.05716062, + "epoch": 0.014730196903652487, + "flos": 23183713699200.0, + "grad_norm": 2.382737976072448, + "language_loss": 0.78703934, + "learning_rate": 3.54199711087864e-06, + "loss": 0.81182778, + "num_input_tokens_seen": 5148575, + "step": 245, + "time_per_iteration": 2.7565553188323975 + }, + { + "auxiliary_loss_clip": 0.01368768, + "auxiliary_loss_mlp": 0.01105532, + "balance_loss_clip": 1.09117758, + "balance_loss_mlp": 1.04988539, + "epoch": 0.014790320156320457, + "flos": 23222605150080.0, + "grad_norm": 2.2372684444509305, + "language_loss": 0.84208959, + "learning_rate": 3.5446197303235913e-06, + "loss": 0.86683261, + "num_input_tokens_seen": 5170415, + "step": 246, + "time_per_iteration": 2.6803231239318848 + }, + { + "auxiliary_loss_clip": 0.01359341, + "auxiliary_loss_mlp": 0.01097245, + "balance_loss_clip": 1.08871603, + "balance_loss_mlp": 1.04534173, + "epoch": 0.014850443408988426, + "flos": 15815490963840.0, + "grad_norm": 2.015549589985396, + "language_loss": 0.89846432, + "learning_rate": 3.5472317103095034e-06, + "loss": 0.92303014, + "num_input_tokens_seen": 5188565, + "step": 247, + "time_per_iteration": 2.6498398780822754 + }, + { + "auxiliary_loss_clip": 0.01359776, + "auxiliary_loss_mlp": 0.01099236, + "balance_loss_clip": 1.08465827, + "balance_loss_mlp": 1.04919171, + "epoch": 0.014910566661656396, + "flos": 22781657790720.0, + "grad_norm": 2.684585315860511, + "language_loss": 0.78128064, + "learning_rate": 3.549833136812155e-06, + "loss": 0.80587077, + "num_input_tokens_seen": 5207810, + "step": 248, + "time_per_iteration": 2.7270867824554443 + }, + { + "auxiliary_loss_clip": 0.01359669, + "auxiliary_loss_mlp": 0.01107409, + "balance_loss_clip": 1.0932709, + "balance_loss_mlp": 1.05648255, + "epoch": 0.014970689914324365, + "flos": 26865023806080.0, + "grad_norm": 2.0255889671503575, + "language_loss": 0.835899, + "learning_rate": 3.552424094769381e-06, + "loss": 0.86056983, + "num_input_tokens_seen": 5226210, + "step": 249, + "time_per_iteration": 2.7166404724121094 + }, + { + "auxiliary_loss_clip": 0.01352764, + "auxiliary_loss_mlp": 0.01102159, + "balance_loss_clip": 1.08731318, + "balance_loss_mlp": 1.05316401, + "epoch": 0.015030813166992334, + "flos": 13985662371840.0, + "grad_norm": 2.0593726107335306, + "language_loss": 0.93482637, + "learning_rate": 3.5550046680977174e-06, + "loss": 0.95937556, + "num_input_tokens_seen": 5241660, + "step": 250, + "time_per_iteration": 2.5755040645599365 + }, + { + "auxiliary_loss_clip": 0.0136231, + "auxiliary_loss_mlp": 0.01115377, + "balance_loss_clip": 1.09246659, + "balance_loss_mlp": 1.06364024, + "epoch": 0.015090936419660304, + "flos": 24717817618560.0, + "grad_norm": 2.566309005571831, + "language_loss": 0.96913564, + "learning_rate": 3.5575749397087034e-06, + "loss": 0.99391252, + "num_input_tokens_seen": 5261090, + "step": 251, + "time_per_iteration": 2.7066009044647217 + }, + { + "auxiliary_loss_clip": 0.0135536, + "auxiliary_loss_mlp": 0.01107393, + "balance_loss_clip": 1.08731556, + "balance_loss_mlp": 1.05784941, + "epoch": 0.015151059672328273, + "flos": 25738793798400.0, + "grad_norm": 2.1237969522058293, + "language_loss": 0.8425591, + "learning_rate": 3.5601349915248707e-06, + "loss": 0.86718661, + "num_input_tokens_seen": 5279175, + "step": 252, + "time_per_iteration": 2.7610058784484863 + }, + { + "auxiliary_loss_clip": 0.01348823, + "auxiliary_loss_mlp": 0.01112016, + "balance_loss_clip": 1.08797956, + "balance_loss_mlp": 1.0616858, + "epoch": 0.015211182924996243, + "flos": 21871214737920.0, + "grad_norm": 3.1835550034005555, + "language_loss": 0.98309648, + "learning_rate": 3.5626849044954064e-06, + "loss": 1.00770485, + "num_input_tokens_seen": 5296975, + "step": 253, + "time_per_iteration": 2.7185440063476562 + }, + { + "auxiliary_loss_clip": 0.01230967, + "auxiliary_loss_mlp": 0.01084656, + "balance_loss_clip": 1.08867383, + "balance_loss_mlp": 1.06529605, + "epoch": 0.015271306177664212, + "flos": 66895080888960.0, + "grad_norm": 0.8555015576691021, + "language_loss": 0.55615073, + "learning_rate": 3.5652247586115167e-06, + "loss": 0.5793069, + "num_input_tokens_seen": 5358375, + "step": 254, + "time_per_iteration": 3.240044593811035 + }, + { + "auxiliary_loss_clip": 0.01352662, + "auxiliary_loss_mlp": 0.0111837, + "balance_loss_clip": 1.08459532, + "balance_loss_mlp": 1.06772995, + "epoch": 0.01533142943033218, + "flos": 26834069260800.0, + "grad_norm": 2.374801070051703, + "language_loss": 0.90209663, + "learning_rate": 3.567754632921479e-06, + "loss": 0.92680693, + "num_input_tokens_seen": 5377255, + "step": 255, + "time_per_iteration": 2.6596426963806152 + }, + { + "auxiliary_loss_clip": 0.01349407, + "auxiliary_loss_mlp": 0.01128074, + "balance_loss_clip": 1.0860821, + "balance_loss_mlp": 1.07690895, + "epoch": 0.01539155268300015, + "flos": 20813753318400.0, + "grad_norm": 3.313127954287162, + "language_loss": 0.85470039, + "learning_rate": 3.5702746055454075e-06, + "loss": 0.87947518, + "num_input_tokens_seen": 5395320, + "step": 256, + "time_per_iteration": 2.6669342517852783 + }, + { + "auxiliary_loss_clip": 0.01356575, + "auxiliary_loss_mlp": 0.01113731, + "balance_loss_clip": 1.08668303, + "balance_loss_mlp": 1.06285238, + "epoch": 0.01545167593566812, + "flos": 15961862885760.0, + "grad_norm": 2.6238271972611495, + "language_loss": 0.71437883, + "learning_rate": 3.5727847536897254e-06, + "loss": 0.73908186, + "num_input_tokens_seen": 5411970, + "step": 257, + "time_per_iteration": 2.6034793853759766 + }, + { + "auxiliary_loss_clip": 0.01349556, + "auxiliary_loss_mlp": 0.01111035, + "balance_loss_clip": 1.08529055, + "balance_loss_mlp": 1.06046653, + "epoch": 0.01551179918833609, + "flos": 22601745544320.0, + "grad_norm": 2.1224909188844534, + "language_loss": 0.94703108, + "learning_rate": 3.5752851536613596e-06, + "loss": 0.97163701, + "num_input_tokens_seen": 5430245, + "step": 258, + "time_per_iteration": 2.7196216583251953 + }, + { + "auxiliary_loss_clip": 0.01350648, + "auxiliary_loss_mlp": 0.0112104, + "balance_loss_clip": 1.08394241, + "balance_loss_mlp": 1.07149649, + "epoch": 0.015571922441004058, + "flos": 22816706486400.0, + "grad_norm": 2.4479288495867113, + "language_loss": 0.93062371, + "learning_rate": 3.577775880881658e-06, + "loss": 0.95534062, + "num_input_tokens_seen": 5448905, + "step": 259, + "time_per_iteration": 2.64868426322937 + }, + { + "auxiliary_loss_clip": 0.01343305, + "auxiliary_loss_mlp": 0.01111799, + "balance_loss_clip": 1.08665538, + "balance_loss_mlp": 1.06404328, + "epoch": 0.015632045693672027, + "flos": 18947439486720.0, + "grad_norm": 2.7573815957824173, + "language_loss": 0.97274762, + "learning_rate": 3.5802570099000424e-06, + "loss": 0.99729872, + "num_input_tokens_seen": 5466405, + "step": 260, + "time_per_iteration": 2.642025947570801 + }, + { + "auxiliary_loss_clip": 0.01357979, + "auxiliary_loss_mlp": 0.01115992, + "balance_loss_clip": 1.08755088, + "balance_loss_mlp": 1.06718814, + "epoch": 0.015692168946339995, + "flos": 29971728046080.0, + "grad_norm": 2.213114312844813, + "language_loss": 0.87909847, + "learning_rate": 3.5827286144073947e-06, + "loss": 0.90383822, + "num_input_tokens_seen": 5487055, + "step": 261, + "time_per_iteration": 2.7102127075195312 + }, + { + "auxiliary_loss_clip": 0.01348953, + "auxiliary_loss_mlp": 0.01107825, + "balance_loss_clip": 1.08362424, + "balance_loss_mlp": 1.05730438, + "epoch": 0.015752292199007967, + "flos": 19392085946880.0, + "grad_norm": 3.275050286770421, + "language_loss": 0.67214727, + "learning_rate": 3.5851907672491904e-06, + "loss": 0.696715, + "num_input_tokens_seen": 5506600, + "step": 262, + "time_per_iteration": 2.709921360015869 + }, + { + "auxiliary_loss_clip": 0.01345513, + "auxiliary_loss_mlp": 0.01122124, + "balance_loss_clip": 1.0846417, + "balance_loss_mlp": 1.07153141, + "epoch": 0.015812415451675936, + "flos": 20339804338560.0, + "grad_norm": 2.35481442502325, + "language_loss": 0.68349051, + "learning_rate": 3.587643540438383e-06, + "loss": 0.70816684, + "num_input_tokens_seen": 5524350, + "step": 263, + "time_per_iteration": 2.6273205280303955 + }, + { + "auxiliary_loss_clip": 0.01347597, + "auxiliary_loss_mlp": 0.01110381, + "balance_loss_clip": 1.08077097, + "balance_loss_mlp": 1.05957437, + "epoch": 0.015872538704343905, + "flos": 17525412979200.0, + "grad_norm": 2.6765854315758477, + "language_loss": 0.85356057, + "learning_rate": 3.590087005168037e-06, + "loss": 0.87814033, + "num_input_tokens_seen": 5542145, + "step": 264, + "time_per_iteration": 2.685412645339966 + }, + { + "auxiliary_loss_clip": 0.01351687, + "auxiliary_loss_mlp": 0.0110105, + "balance_loss_clip": 1.08469236, + "balance_loss_mlp": 1.05365229, + "epoch": 0.015932661957011873, + "flos": 15260490944640.0, + "grad_norm": 2.2944504205749503, + "language_loss": 1.04210973, + "learning_rate": 3.5925212318237344e-06, + "loss": 1.06663716, + "num_input_tokens_seen": 5557920, + "step": 265, + "time_per_iteration": 2.69449520111084 + }, + { + "auxiliary_loss_clip": 0.01356568, + "auxiliary_loss_mlp": 0.01118616, + "balance_loss_clip": 1.08836865, + "balance_loss_mlp": 1.06516218, + "epoch": 0.015992785209679845, + "flos": 20302528999680.0, + "grad_norm": 2.4731050491462483, + "language_loss": 0.75150025, + "learning_rate": 3.5949462899957323e-06, + "loss": 0.77625203, + "num_input_tokens_seen": 5576290, + "step": 266, + "time_per_iteration": 2.668816566467285 + }, + { + "auxiliary_loss_clip": 0.01340946, + "auxiliary_loss_mlp": 0.0110246, + "balance_loss_clip": 1.0846976, + "balance_loss_mlp": 1.05246401, + "epoch": 0.016052908462347814, + "flos": 23362368969600.0, + "grad_norm": 1.8381746832120822, + "language_loss": 0.90668917, + "learning_rate": 3.5973622484909068e-06, + "loss": 0.93112326, + "num_input_tokens_seen": 5595205, + "step": 267, + "time_per_iteration": 2.6592581272125244 + }, + { + "auxiliary_loss_clip": 0.01350437, + "auxiliary_loss_mlp": 0.01124146, + "balance_loss_clip": 1.08441842, + "balance_loss_mlp": 1.07522273, + "epoch": 0.016113031715015783, + "flos": 21286588976640.0, + "grad_norm": 2.8182528398379763, + "language_loss": 0.85820997, + "learning_rate": 3.599769175344462e-06, + "loss": 0.88295585, + "num_input_tokens_seen": 5612645, + "step": 268, + "time_per_iteration": 4.321256637573242 + }, + { + "auxiliary_loss_clip": 0.01342576, + "auxiliary_loss_mlp": 0.01096155, + "balance_loss_clip": 1.08563972, + "balance_loss_mlp": 1.04789853, + "epoch": 0.01617315496768375, + "flos": 18914689261440.0, + "grad_norm": 4.497423811483033, + "language_loss": 0.88334453, + "learning_rate": 3.602167137831432e-06, + "loss": 0.90773177, + "num_input_tokens_seen": 5628345, + "step": 269, + "time_per_iteration": 4.199312925338745 + }, + { + "auxiliary_loss_clip": 0.01348034, + "auxiliary_loss_mlp": 0.01104511, + "balance_loss_clip": 1.08264267, + "balance_loss_mlp": 1.05117667, + "epoch": 0.01623327822035172, + "flos": 16546488647040.0, + "grad_norm": 13.80351748596323, + "language_loss": 0.97330916, + "learning_rate": 3.6045562024779565e-06, + "loss": 0.99783456, + "num_input_tokens_seen": 5645940, + "step": 270, + "time_per_iteration": 2.618669033050537 + }, + { + "auxiliary_loss_clip": 0.01347947, + "auxiliary_loss_mlp": 0.01120558, + "balance_loss_clip": 1.08745313, + "balance_loss_mlp": 1.07120585, + "epoch": 0.016293401473019692, + "flos": 23513481486720.0, + "grad_norm": 2.0651586741038033, + "language_loss": 0.86291456, + "learning_rate": 3.606936435072361e-06, + "loss": 0.88759971, + "num_input_tokens_seen": 5665690, + "step": 271, + "time_per_iteration": 4.2279582023620605 + }, + { + "auxiliary_loss_clip": 0.0134523, + "auxiliary_loss_mlp": 0.01103544, + "balance_loss_clip": 1.07956553, + "balance_loss_mlp": 1.05469179, + "epoch": 0.01635352472568766, + "flos": 29016072748800.0, + "grad_norm": 2.569732634129511, + "language_loss": 0.81386071, + "learning_rate": 3.609307900676025e-06, + "loss": 0.83834851, + "num_input_tokens_seen": 5683190, + "step": 272, + "time_per_iteration": 2.6775057315826416 + }, + { + "auxiliary_loss_clip": 0.01337817, + "auxiliary_loss_mlp": 0.01118338, + "balance_loss_clip": 1.08088934, + "balance_loss_mlp": 1.07024884, + "epoch": 0.01641364797835563, + "flos": 13370513028480.0, + "grad_norm": 2.4253143134518393, + "language_loss": 0.81378174, + "learning_rate": 3.611670663634051e-06, + "loss": 0.83834326, + "num_input_tokens_seen": 5699780, + "step": 273, + "time_per_iteration": 2.759084463119507 + }, + { + "auxiliary_loss_clip": 0.01337734, + "auxiliary_loss_mlp": 0.01106381, + "balance_loss_clip": 1.07698154, + "balance_loss_mlp": 1.05698013, + "epoch": 0.016473771231023598, + "flos": 18878239935360.0, + "grad_norm": 2.2397820643471635, + "language_loss": 0.91556299, + "learning_rate": 3.614024787585744e-06, + "loss": 0.94000411, + "num_input_tokens_seen": 5716980, + "step": 274, + "time_per_iteration": 2.5743818283081055 + }, + { + "auxiliary_loss_clip": 0.01336702, + "auxiliary_loss_mlp": 0.01110564, + "balance_loss_clip": 1.07990205, + "balance_loss_mlp": 1.06123519, + "epoch": 0.016533894483691566, + "flos": 22601637803520.0, + "grad_norm": 4.074291276087454, + "language_loss": 0.88154364, + "learning_rate": 3.6163703354748927e-06, + "loss": 0.90601623, + "num_input_tokens_seen": 5737780, + "step": 275, + "time_per_iteration": 2.648561954498291 + }, + { + "auxiliary_loss_clip": 0.01337679, + "auxiliary_loss_mlp": 0.01102686, + "balance_loss_clip": 1.07896304, + "balance_loss_mlp": 1.05233216, + "epoch": 0.01659401773635954, + "flos": 21507188353920.0, + "grad_norm": 2.778862596434413, + "language_loss": 0.80831546, + "learning_rate": 3.6187073695598707e-06, + "loss": 0.83271915, + "num_input_tokens_seen": 5758330, + "step": 276, + "time_per_iteration": 2.615337371826172 + }, + { + "auxiliary_loss_clip": 0.01330446, + "auxiliary_loss_mlp": 0.01099122, + "balance_loss_clip": 1.07934141, + "balance_loss_mlp": 1.05394185, + "epoch": 0.016654140989027507, + "flos": 32850973411200.0, + "grad_norm": 2.3565966501595317, + "language_loss": 0.8107357, + "learning_rate": 3.621035951423551e-06, + "loss": 0.83503139, + "num_input_tokens_seen": 5778340, + "step": 277, + "time_per_iteration": 2.704707384109497 + }, + { + "auxiliary_loss_clip": 0.01328739, + "auxiliary_loss_mlp": 0.01097325, + "balance_loss_clip": 1.07253933, + "balance_loss_mlp": 1.04842567, + "epoch": 0.016714264241695476, + "flos": 12306228024960.0, + "grad_norm": 2.278403980825578, + "language_loss": 0.80672061, + "learning_rate": 3.623356141983041e-06, + "loss": 0.83098125, + "num_input_tokens_seen": 5794295, + "step": 278, + "time_per_iteration": 2.5711722373962402 + }, + { + "auxiliary_loss_clip": 0.01332855, + "auxiliary_loss_mlp": 0.01105304, + "balance_loss_clip": 1.07711482, + "balance_loss_mlp": 1.05802584, + "epoch": 0.016774387494363444, + "flos": 27123796362240.0, + "grad_norm": 1.6730910273299624, + "language_loss": 0.90579838, + "learning_rate": 3.6256680014992486e-06, + "loss": 0.93017995, + "num_input_tokens_seen": 5814405, + "step": 279, + "time_per_iteration": 2.647176504135132 + }, + { + "auxiliary_loss_clip": 0.01336444, + "auxiliary_loss_mlp": 0.01118781, + "balance_loss_clip": 1.07660341, + "balance_loss_mlp": 1.06940424, + "epoch": 0.016834510747031413, + "flos": 20191493082240.0, + "grad_norm": 2.983434125222615, + "language_loss": 0.94109571, + "learning_rate": 3.6279715895862713e-06, + "loss": 0.96564794, + "num_input_tokens_seen": 5832795, + "step": 280, + "time_per_iteration": 2.6599597930908203 + }, + { + "auxiliary_loss_clip": 0.01338025, + "auxiliary_loss_mlp": 0.01112251, + "balance_loss_clip": 1.07633305, + "balance_loss_mlp": 1.06277943, + "epoch": 0.016894633999699385, + "flos": 27274262434560.0, + "grad_norm": 2.923231508499632, + "language_loss": 0.74043828, + "learning_rate": 3.6302669652206183e-06, + "loss": 0.76494104, + "num_input_tokens_seen": 5855750, + "step": 281, + "time_per_iteration": 2.707334041595459 + }, + { + "auxiliary_loss_clip": 0.01331803, + "auxiliary_loss_mlp": 0.01119378, + "balance_loss_clip": 1.07748449, + "balance_loss_mlp": 1.07257664, + "epoch": 0.016954757252367354, + "flos": 14902964922240.0, + "grad_norm": 3.4716590017830815, + "language_loss": 0.8037743, + "learning_rate": 3.632554186750274e-06, + "loss": 0.82828605, + "num_input_tokens_seen": 5872610, + "step": 282, + "time_per_iteration": 2.7092578411102295 + }, + { + "auxiliary_loss_clip": 0.01338262, + "auxiliary_loss_mlp": 0.01119228, + "balance_loss_clip": 1.07964015, + "balance_loss_mlp": 1.06992292, + "epoch": 0.017014880505035322, + "flos": 21358805270400.0, + "grad_norm": 2.1549541142250437, + "language_loss": 0.7771889, + "learning_rate": 3.6348333119035937e-06, + "loss": 0.80176377, + "num_input_tokens_seen": 5892985, + "step": 283, + "time_per_iteration": 2.6203463077545166 + }, + { + "auxiliary_loss_clip": 0.01338469, + "auxiliary_loss_mlp": 0.01095293, + "balance_loss_clip": 1.08097184, + "balance_loss_mlp": 1.04930258, + "epoch": 0.01707500375770329, + "flos": 35333154858240.0, + "grad_norm": 3.515547642871152, + "language_loss": 0.84393585, + "learning_rate": 3.6371043977980503e-06, + "loss": 0.8682735, + "num_input_tokens_seen": 5914060, + "step": 284, + "time_per_iteration": 2.7212212085723877 + }, + { + "auxiliary_loss_clip": 0.01326431, + "auxiliary_loss_mlp": 0.0110234, + "balance_loss_clip": 1.07471085, + "balance_loss_mlp": 1.05384588, + "epoch": 0.01713512701037126, + "flos": 23582070506880.0, + "grad_norm": 2.48288568465737, + "language_loss": 0.9726184, + "learning_rate": 3.639367500948819e-06, + "loss": 0.99690616, + "num_input_tokens_seen": 5932860, + "step": 285, + "time_per_iteration": 2.6124978065490723 + }, + { + "auxiliary_loss_clip": 0.01330479, + "auxiliary_loss_mlp": 0.01093845, + "balance_loss_clip": 1.07736433, + "balance_loss_mlp": 1.04880786, + "epoch": 0.01719525026303923, + "flos": 27634661544960.0, + "grad_norm": 3.7129888542036236, + "language_loss": 0.93716776, + "learning_rate": 3.6416226772772178e-06, + "loss": 0.96141094, + "num_input_tokens_seen": 5952725, + "step": 286, + "time_per_iteration": 2.784734010696411 + }, + { + "auxiliary_loss_clip": 0.01324431, + "auxiliary_loss_mlp": 0.01090526, + "balance_loss_clip": 1.07395172, + "balance_loss_mlp": 1.04343772, + "epoch": 0.0172553735157072, + "flos": 26979722910720.0, + "grad_norm": 1.701008847608052, + "language_loss": 0.92258352, + "learning_rate": 3.643869982119001e-06, + "loss": 0.94673312, + "num_input_tokens_seen": 5970560, + "step": 287, + "time_per_iteration": 2.6322476863861084 + }, + { + "auxiliary_loss_clip": 0.0132601, + "auxiliary_loss_mlp": 0.01089932, + "balance_loss_clip": 1.07207084, + "balance_loss_mlp": 1.04332089, + "epoch": 0.01731549676837517, + "flos": 14056621689600.0, + "grad_norm": 2.618443952822227, + "language_loss": 1.01585138, + "learning_rate": 3.646109470232502e-06, + "loss": 1.04001069, + "num_input_tokens_seen": 5982980, + "step": 288, + "time_per_iteration": 2.5362799167633057 + }, + { + "auxiliary_loss_clip": 0.01210401, + "auxiliary_loss_mlp": 0.01031023, + "balance_loss_clip": 1.07221532, + "balance_loss_mlp": 1.01433325, + "epoch": 0.017375620021043137, + "flos": 66510694471680.0, + "grad_norm": 1.4394439674383683, + "language_loss": 0.63870847, + "learning_rate": 3.6483411958066417e-06, + "loss": 0.66112274, + "num_input_tokens_seen": 6049445, + "step": 289, + "time_per_iteration": 3.264655113220215 + }, + { + "auxiliary_loss_clip": 0.01329668, + "auxiliary_loss_mlp": 0.01111463, + "balance_loss_clip": 1.07741642, + "balance_loss_mlp": 1.06673574, + "epoch": 0.01743574327371111, + "flos": 15225154940160.0, + "grad_norm": 2.3489241047261493, + "language_loss": 0.8827278, + "learning_rate": 3.6505652124687957e-06, + "loss": 0.90713912, + "num_input_tokens_seen": 6064150, + "step": 290, + "time_per_iteration": 2.5733680725097656 + }, + { + "auxiliary_loss_clip": 0.01325611, + "auxiliary_loss_mlp": 0.01090202, + "balance_loss_clip": 1.07546568, + "balance_loss_mlp": 1.04382992, + "epoch": 0.017495866526379078, + "flos": 25373869574400.0, + "grad_norm": 1.854547897862497, + "language_loss": 0.84709382, + "learning_rate": 3.6527815732925258e-06, + "loss": 0.87125194, + "num_input_tokens_seen": 6083920, + "step": 291, + "time_per_iteration": 2.6280152797698975 + }, + { + "auxiliary_loss_clip": 0.01333427, + "auxiliary_loss_mlp": 0.01108849, + "balance_loss_clip": 1.08316517, + "balance_loss_mlp": 1.05913842, + "epoch": 0.017555989779047047, + "flos": 26359473836160.0, + "grad_norm": 1.9015769863011813, + "language_loss": 0.72604382, + "learning_rate": 3.6549903308051806e-06, + "loss": 0.75046659, + "num_input_tokens_seen": 6105460, + "step": 292, + "time_per_iteration": 2.6762425899505615 + }, + { + "auxiliary_loss_clip": 0.01322629, + "auxiliary_loss_mlp": 0.011042, + "balance_loss_clip": 1.07431889, + "balance_loss_mlp": 1.05730343, + "epoch": 0.017616113031715015, + "flos": 22338807010560.0, + "grad_norm": 2.958120957948009, + "language_loss": 0.87187028, + "learning_rate": 3.6571915369953646e-06, + "loss": 0.89613855, + "num_input_tokens_seen": 6122890, + "step": 293, + "time_per_iteration": 2.680638074874878 + }, + { + "auxiliary_loss_clip": 0.01323137, + "auxiliary_loss_mlp": 0.01105499, + "balance_loss_clip": 1.07446837, + "balance_loss_mlp": 1.05946016, + "epoch": 0.017676236284382984, + "flos": 20156911263360.0, + "grad_norm": 2.0174327466020108, + "language_loss": 0.80922306, + "learning_rate": 3.6593852433202797e-06, + "loss": 0.83350945, + "num_input_tokens_seen": 6142890, + "step": 294, + "time_per_iteration": 2.734192371368408 + }, + { + "auxiliary_loss_clip": 0.01321942, + "auxiliary_loss_mlp": 0.01110938, + "balance_loss_clip": 1.07123852, + "balance_loss_mlp": 1.06447053, + "epoch": 0.017736359537050956, + "flos": 25223331674880.0, + "grad_norm": 1.918751698027929, + "language_loss": 0.84236372, + "learning_rate": 3.6615715007129453e-06, + "loss": 0.86669254, + "num_input_tokens_seen": 6162030, + "step": 295, + "time_per_iteration": 2.697453260421753 + }, + { + "auxiliary_loss_clip": 0.01327006, + "auxiliary_loss_mlp": 0.01106026, + "balance_loss_clip": 1.08101177, + "balance_loss_mlp": 1.06022596, + "epoch": 0.017796482789718925, + "flos": 20338798757760.0, + "grad_norm": 6.282143309537842, + "language_loss": 0.84657049, + "learning_rate": 3.6637503595892897e-06, + "loss": 0.87090087, + "num_input_tokens_seen": 6180540, + "step": 296, + "time_per_iteration": 2.615553379058838 + }, + { + "auxiliary_loss_clip": 0.01327645, + "auxiliary_loss_mlp": 0.01100373, + "balance_loss_clip": 1.07743204, + "balance_loss_mlp": 1.05483556, + "epoch": 0.017856606042386893, + "flos": 22379206832640.0, + "grad_norm": 2.11506774639899, + "language_loss": 0.87730408, + "learning_rate": 3.665921869855132e-06, + "loss": 0.90158427, + "num_input_tokens_seen": 6199425, + "step": 297, + "time_per_iteration": 2.57332444190979 + }, + { + "auxiliary_loss_clip": 0.01327036, + "auxiliary_loss_mlp": 0.01099264, + "balance_loss_clip": 1.07569122, + "balance_loss_mlp": 1.05439401, + "epoch": 0.017916729295054862, + "flos": 20230061310720.0, + "grad_norm": 3.8489969085192848, + "language_loss": 0.88623232, + "learning_rate": 3.6680860809130346e-06, + "loss": 0.91049528, + "num_input_tokens_seen": 6219170, + "step": 298, + "time_per_iteration": 2.589998483657837 + }, + { + "auxiliary_loss_clip": 0.01320378, + "auxiliary_loss_mlp": 0.01116275, + "balance_loss_clip": 1.07594109, + "balance_loss_mlp": 1.06866288, + "epoch": 0.01797685254772283, + "flos": 19390972625280.0, + "grad_norm": 1.8580744591898186, + "language_loss": 0.88573742, + "learning_rate": 3.6702430416690516e-06, + "loss": 0.91010398, + "num_input_tokens_seen": 6237930, + "step": 299, + "time_per_iteration": 2.6367874145507812 + }, + { + "auxiliary_loss_clip": 0.01326683, + "auxiliary_loss_mlp": 0.01098735, + "balance_loss_clip": 1.07623005, + "balance_loss_mlp": 1.05209982, + "epoch": 0.018036975800390802, + "flos": 24426007528320.0, + "grad_norm": 5.242743153055466, + "language_loss": 0.64609963, + "learning_rate": 3.672392800539357e-06, + "loss": 0.67035377, + "num_input_tokens_seen": 6257170, + "step": 300, + "time_per_iteration": 2.611854076385498 + }, + { + "auxiliary_loss_clip": 0.01327604, + "auxiliary_loss_mlp": 0.0110722, + "balance_loss_clip": 1.07970035, + "balance_loss_mlp": 1.0608716, + "epoch": 0.01809709905305877, + "flos": 15778933896960.0, + "grad_norm": 2.090592353158092, + "language_loss": 0.88402581, + "learning_rate": 3.6745354054567686e-06, + "loss": 0.90837401, + "num_input_tokens_seen": 6274780, + "step": 301, + "time_per_iteration": 2.5728893280029297 + }, + { + "auxiliary_loss_clip": 0.01195509, + "auxiliary_loss_mlp": 0.01016896, + "balance_loss_clip": 1.06018794, + "balance_loss_mlp": 1.00077868, + "epoch": 0.01815722230572674, + "flos": 67348382526720.0, + "grad_norm": 0.835647416945689, + "language_loss": 0.62215596, + "learning_rate": 3.676670903877158e-06, + "loss": 0.64428002, + "num_input_tokens_seen": 6340435, + "step": 302, + "time_per_iteration": 3.2994627952575684 + }, + { + "auxiliary_loss_clip": 0.01316766, + "auxiliary_loss_mlp": 0.01103983, + "balance_loss_clip": 1.07215607, + "balance_loss_mlp": 1.05744398, + "epoch": 0.01821734555839471, + "flos": 15485615435520.0, + "grad_norm": 2.207343695057352, + "language_loss": 0.89533734, + "learning_rate": 3.6787993427857567e-06, + "loss": 0.91954482, + "num_input_tokens_seen": 6358160, + "step": 303, + "time_per_iteration": 2.6820528507232666 + }, + { + "auxiliary_loss_clip": 0.01324696, + "auxiliary_loss_mlp": 0.01118815, + "balance_loss_clip": 1.07680213, + "balance_loss_mlp": 1.07110751, + "epoch": 0.018277468811062677, + "flos": 24097424889600.0, + "grad_norm": 1.8023888646649528, + "language_loss": 0.80464816, + "learning_rate": 3.680920768703364e-06, + "loss": 0.82908332, + "num_input_tokens_seen": 6378485, + "step": 304, + "time_per_iteration": 2.6672680377960205 + }, + { + "auxiliary_loss_clip": 0.01317348, + "auxiliary_loss_mlp": 0.0109718, + "balance_loss_clip": 1.07885289, + "balance_loss_mlp": 1.05261946, + "epoch": 0.01833759206373065, + "flos": 20959335141120.0, + "grad_norm": 1.7001497876372458, + "language_loss": 0.830176, + "learning_rate": 3.6830352276924415e-06, + "loss": 0.85432124, + "num_input_tokens_seen": 6397845, + "step": 305, + "time_per_iteration": 2.6906235218048096 + }, + { + "auxiliary_loss_clip": 0.0132097, + "auxiliary_loss_mlp": 0.0109217, + "balance_loss_clip": 1.07342863, + "balance_loss_mlp": 1.04830098, + "epoch": 0.018397715316398618, + "flos": 19390757143680.0, + "grad_norm": 1.9447067658497708, + "language_loss": 0.90652823, + "learning_rate": 3.685142765363119e-06, + "loss": 0.93065965, + "num_input_tokens_seen": 6416475, + "step": 306, + "time_per_iteration": 2.733896493911743 + }, + { + "auxiliary_loss_clip": 0.01311716, + "auxiliary_loss_mlp": 0.01090613, + "balance_loss_clip": 1.06926739, + "balance_loss_mlp": 1.04719687, + "epoch": 0.018457838569066586, + "flos": 29132531619840.0, + "grad_norm": 2.0292513132920327, + "language_loss": 0.86621094, + "learning_rate": 3.687243426879095e-06, + "loss": 0.89023423, + "num_input_tokens_seen": 6437520, + "step": 307, + "time_per_iteration": 2.79573130607605 + }, + { + "auxiliary_loss_clip": 0.01314152, + "auxiliary_loss_mlp": 0.01100158, + "balance_loss_clip": 1.07573533, + "balance_loss_mlp": 1.05204546, + "epoch": 0.018517961821734555, + "flos": 19208654167680.0, + "grad_norm": 2.2998391330004484, + "language_loss": 0.71648097, + "learning_rate": 3.6893372569634466e-06, + "loss": 0.74062407, + "num_input_tokens_seen": 6455680, + "step": 308, + "time_per_iteration": 2.6803479194641113 + }, + { + "auxiliary_loss_clip": 0.0131814, + "auxiliary_loss_mlp": 0.01100015, + "balance_loss_clip": 1.07103491, + "balance_loss_mlp": 1.055884, + "epoch": 0.018578085074402523, + "flos": 19863018184320.0, + "grad_norm": 1.9627986765623684, + "language_loss": 0.91721022, + "learning_rate": 3.6914242999043395e-06, + "loss": 0.94139183, + "num_input_tokens_seen": 6474880, + "step": 309, + "time_per_iteration": 2.6205942630767822 + }, + { + "auxiliary_loss_clip": 0.01328211, + "auxiliary_loss_mlp": 0.01096567, + "balance_loss_clip": 1.07380366, + "balance_loss_mlp": 1.05000353, + "epoch": 0.018638208327070496, + "flos": 29606947476480.0, + "grad_norm": 6.892621376926494, + "language_loss": 0.72681284, + "learning_rate": 3.69350459956065e-06, + "loss": 0.75106061, + "num_input_tokens_seen": 6495945, + "step": 310, + "time_per_iteration": 2.722083806991577 + }, + { + "auxiliary_loss_clip": 0.01317905, + "auxiliary_loss_mlp": 0.01109981, + "balance_loss_clip": 1.07726264, + "balance_loss_mlp": 1.06556404, + "epoch": 0.018698331579738464, + "flos": 45731555907840.0, + "grad_norm": 3.7718819751532915, + "language_loss": 0.7384423, + "learning_rate": 3.695578199367497e-06, + "loss": 0.76272112, + "num_input_tokens_seen": 6519930, + "step": 311, + "time_per_iteration": 2.825033187866211 + }, + { + "auxiliary_loss_clip": 0.01329951, + "auxiliary_loss_mlp": 0.01111212, + "balance_loss_clip": 1.07628894, + "balance_loss_mlp": 1.06629419, + "epoch": 0.018758454832406433, + "flos": 20483662308480.0, + "grad_norm": 2.770967336772884, + "language_loss": 0.91862637, + "learning_rate": 3.6976451423416825e-06, + "loss": 0.94303799, + "num_input_tokens_seen": 6535070, + "step": 312, + "time_per_iteration": 2.621713399887085 + }, + { + "auxiliary_loss_clip": 0.01325157, + "auxiliary_loss_mlp": 0.01104382, + "balance_loss_clip": 1.07615411, + "balance_loss_mlp": 1.05767632, + "epoch": 0.0188185780850744, + "flos": 15777784661760.0, + "grad_norm": 2.9635820196734968, + "language_loss": 0.89915395, + "learning_rate": 3.699705471087043e-06, + "loss": 0.9234494, + "num_input_tokens_seen": 6554135, + "step": 313, + "time_per_iteration": 2.655832290649414 + }, + { + "auxiliary_loss_clip": 0.01330298, + "auxiliary_loss_mlp": 0.01100304, + "balance_loss_clip": 1.07590652, + "balance_loss_mlp": 1.05290627, + "epoch": 0.018878701337742373, + "flos": 22455732758400.0, + "grad_norm": 2.496751868295619, + "language_loss": 0.73230743, + "learning_rate": 3.7017592277997256e-06, + "loss": 0.75661349, + "num_input_tokens_seen": 6572275, + "step": 314, + "time_per_iteration": 2.7271478176116943 + }, + { + "auxiliary_loss_clip": 0.01316125, + "auxiliary_loss_mlp": 0.0109918, + "balance_loss_clip": 1.07276177, + "balance_loss_mlp": 1.05504847, + "epoch": 0.018938824590410342, + "flos": 30993530238720.0, + "grad_norm": 4.902696728495509, + "language_loss": 0.89791477, + "learning_rate": 3.7038064542733654e-06, + "loss": 0.92206776, + "num_input_tokens_seen": 6594520, + "step": 315, + "time_per_iteration": 2.8392581939697266 + }, + { + "auxiliary_loss_clip": 0.01318996, + "auxiliary_loss_mlp": 0.01090912, + "balance_loss_clip": 1.07489038, + "balance_loss_mlp": 1.0459938, + "epoch": 0.01899894784307831, + "flos": 23258910821760.0, + "grad_norm": 1.7913693176265926, + "language_loss": 0.80753106, + "learning_rate": 3.7058471919041945e-06, + "loss": 0.83163017, + "num_input_tokens_seen": 6614245, + "step": 316, + "time_per_iteration": 5.947572946548462 + }, + { + "auxiliary_loss_clip": 0.01311956, + "auxiliary_loss_mlp": 0.01089642, + "balance_loss_clip": 1.07137465, + "balance_loss_mlp": 1.0456543, + "epoch": 0.01905907109574628, + "flos": 17457901367040.0, + "grad_norm": 2.387485177514069, + "language_loss": 0.90268195, + "learning_rate": 3.7078814816960605e-06, + "loss": 0.92669797, + "num_input_tokens_seen": 6632015, + "step": 317, + "time_per_iteration": 2.6020121574401855 + }, + { + "auxiliary_loss_clip": 0.01309204, + "auxiliary_loss_mlp": 0.01094151, + "balance_loss_clip": 1.06959438, + "balance_loss_mlp": 1.04866087, + "epoch": 0.019119194348414248, + "flos": 14970225139200.0, + "grad_norm": 2.699920016946436, + "language_loss": 0.90908796, + "learning_rate": 3.709909364265374e-06, + "loss": 0.93312144, + "num_input_tokens_seen": 6649015, + "step": 318, + "time_per_iteration": 2.6226727962493896 + }, + { + "auxiliary_loss_clip": 0.0131517, + "auxiliary_loss_mlp": 0.0108873, + "balance_loss_clip": 1.07329845, + "balance_loss_mlp": 1.04650569, + "epoch": 0.01917931760108222, + "flos": 25482822503040.0, + "grad_norm": 3.7235091594600527, + "language_loss": 0.94196522, + "learning_rate": 3.7119308798459706e-06, + "loss": 0.96600425, + "num_input_tokens_seen": 6669225, + "step": 319, + "time_per_iteration": 4.313559532165527 + }, + { + "auxiliary_loss_clip": 0.01182996, + "auxiliary_loss_mlp": 0.01042961, + "balance_loss_clip": 1.0530417, + "balance_loss_mlp": 1.02722502, + "epoch": 0.01923944085375019, + "flos": 71556967353600.0, + "grad_norm": 0.9160463550530726, + "language_loss": 0.59825242, + "learning_rate": 3.7139460682939026e-06, + "loss": 0.62051201, + "num_input_tokens_seen": 6725775, + "step": 320, + "time_per_iteration": 3.075559616088867 + }, + { + "auxiliary_loss_clip": 0.01309616, + "auxiliary_loss_mlp": 0.01098729, + "balance_loss_clip": 1.07021952, + "balance_loss_mlp": 1.05552816, + "epoch": 0.019299564106418157, + "flos": 19682495406720.0, + "grad_norm": 2.408603693581503, + "language_loss": 0.89825904, + "learning_rate": 3.715954969092154e-06, + "loss": 0.92234254, + "num_input_tokens_seen": 6744170, + "step": 321, + "time_per_iteration": 2.592599868774414 + }, + { + "auxiliary_loss_clip": 0.01321683, + "auxiliary_loss_mlp": 0.0111733, + "balance_loss_clip": 1.07551682, + "balance_loss_mlp": 1.07179189, + "epoch": 0.019359687359086126, + "flos": 24387151991040.0, + "grad_norm": 2.267877552506471, + "language_loss": 0.82988816, + "learning_rate": 3.7179576213552805e-06, + "loss": 0.85427827, + "num_input_tokens_seen": 6764565, + "step": 322, + "time_per_iteration": 2.612514019012451 + }, + { + "auxiliary_loss_clip": 0.01322628, + "auxiliary_loss_mlp": 0.01090342, + "balance_loss_clip": 1.07440495, + "balance_loss_mlp": 1.04764152, + "epoch": 0.019419810611754094, + "flos": 23951376190080.0, + "grad_norm": 2.0747323980059176, + "language_loss": 0.72968364, + "learning_rate": 3.719954063833981e-06, + "loss": 0.75381339, + "num_input_tokens_seen": 6785310, + "step": 323, + "time_per_iteration": 2.8516433238983154 + }, + { + "auxiliary_loss_clip": 0.01309889, + "auxiliary_loss_mlp": 0.01087354, + "balance_loss_clip": 1.06763327, + "balance_loss_mlp": 1.04350924, + "epoch": 0.019479933864422067, + "flos": 22160223567360.0, + "grad_norm": 5.098830481531631, + "language_loss": 0.92534518, + "learning_rate": 3.721944334919596e-06, + "loss": 0.94931757, + "num_input_tokens_seen": 6803290, + "step": 324, + "time_per_iteration": 2.7112083435058594 + }, + { + "auxiliary_loss_clip": 0.01319865, + "auxiliary_loss_mlp": 0.01087463, + "balance_loss_clip": 1.07595992, + "balance_loss_mlp": 1.04593039, + "epoch": 0.019540057117090035, + "flos": 22236821320320.0, + "grad_norm": 2.694279329385173, + "language_loss": 0.6499871, + "learning_rate": 3.7239284726485375e-06, + "loss": 0.67406034, + "num_input_tokens_seen": 6822570, + "step": 325, + "time_per_iteration": 2.6996335983276367 + }, + { + "auxiliary_loss_clip": 0.01317039, + "auxiliary_loss_mlp": 0.01105231, + "balance_loss_clip": 1.07923234, + "balance_loss_mlp": 1.0612433, + "epoch": 0.019600180369758004, + "flos": 23076771932160.0, + "grad_norm": 2.199573917496143, + "language_loss": 0.76425928, + "learning_rate": 3.72590651470665e-06, + "loss": 0.78848195, + "num_input_tokens_seen": 6841910, + "step": 326, + "time_per_iteration": 2.669921636581421 + }, + { + "auxiliary_loss_clip": 0.01310872, + "auxiliary_loss_mlp": 0.01102827, + "balance_loss_clip": 1.07537532, + "balance_loss_mlp": 1.05857611, + "epoch": 0.019660303622425972, + "flos": 25410857604480.0, + "grad_norm": 2.002106210486272, + "language_loss": 0.79746711, + "learning_rate": 3.727878498433505e-06, + "loss": 0.82160407, + "num_input_tokens_seen": 6862480, + "step": 327, + "time_per_iteration": 2.6755526065826416 + }, + { + "auxiliary_loss_clip": 0.01318067, + "auxiliary_loss_mlp": 0.01107503, + "balance_loss_clip": 1.07665229, + "balance_loss_mlp": 1.06413484, + "epoch": 0.01972042687509394, + "flos": 23657519024640.0, + "grad_norm": 2.2750901933211853, + "language_loss": 0.80801213, + "learning_rate": 3.7298444608266328e-06, + "loss": 0.83226776, + "num_input_tokens_seen": 6882015, + "step": 328, + "time_per_iteration": 2.637685537338257 + }, + { + "auxiliary_loss_clip": 0.01315295, + "auxiliary_loss_mlp": 0.0109303, + "balance_loss_clip": 1.07037783, + "balance_loss_mlp": 1.04866028, + "epoch": 0.019780550127761913, + "flos": 18223480869120.0, + "grad_norm": 2.405541058492227, + "language_loss": 0.93632352, + "learning_rate": 3.731804438545683e-06, + "loss": 0.96040684, + "num_input_tokens_seen": 6899785, + "step": 329, + "time_per_iteration": 2.6796083450317383 + }, + { + "auxiliary_loss_clip": 0.01322292, + "auxiliary_loss_mlp": 0.01105864, + "balance_loss_clip": 1.07484818, + "balance_loss_mlp": 1.06147075, + "epoch": 0.01984067338042988, + "flos": 22418780641920.0, + "grad_norm": 2.5592272082806193, + "language_loss": 0.74555469, + "learning_rate": 3.7337584679165324e-06, + "loss": 0.76983619, + "num_input_tokens_seen": 6918575, + "step": 330, + "time_per_iteration": 2.601409912109375 + }, + { + "auxiliary_loss_clip": 0.01317948, + "auxiliary_loss_mlp": 0.01114301, + "balance_loss_clip": 1.07344222, + "balance_loss_mlp": 1.07036066, + "epoch": 0.01990079663309785, + "flos": 17055199013760.0, + "grad_norm": 3.488543119611403, + "language_loss": 0.93571568, + "learning_rate": 3.7357065849353186e-06, + "loss": 0.96003819, + "num_input_tokens_seen": 6936965, + "step": 331, + "time_per_iteration": 2.5878171920776367 + }, + { + "auxiliary_loss_clip": 0.01305167, + "auxiliary_loss_mlp": 0.01085034, + "balance_loss_clip": 1.07153869, + "balance_loss_mlp": 1.04354918, + "epoch": 0.01996091988576582, + "flos": 15961791058560.0, + "grad_norm": 2.2258832873825414, + "language_loss": 0.92571288, + "learning_rate": 3.737648825272422e-06, + "loss": 0.94961488, + "num_input_tokens_seen": 6953475, + "step": 332, + "time_per_iteration": 2.623152732849121 + }, + { + "auxiliary_loss_clip": 0.01312942, + "auxiliary_loss_mlp": 0.01084395, + "balance_loss_clip": 1.07562757, + "balance_loss_mlp": 1.04007244, + "epoch": 0.02002104313843379, + "flos": 23586451966080.0, + "grad_norm": 2.4592631722403384, + "language_loss": 0.75769603, + "learning_rate": 3.739585224276384e-06, + "loss": 0.78166932, + "num_input_tokens_seen": 6971630, + "step": 333, + "time_per_iteration": 2.706662893295288 + }, + { + "auxiliary_loss_clip": 0.01314002, + "auxiliary_loss_mlp": 0.01086515, + "balance_loss_clip": 1.07386065, + "balance_loss_mlp": 1.04383802, + "epoch": 0.02008116639110176, + "flos": 34094883352320.0, + "grad_norm": 2.3417128756234136, + "language_loss": 0.79108953, + "learning_rate": 3.7415158169777673e-06, + "loss": 0.81509471, + "num_input_tokens_seen": 6992775, + "step": 334, + "time_per_iteration": 2.75630784034729 + }, + { + "auxiliary_loss_clip": 0.01312504, + "auxiliary_loss_mlp": 0.0109658, + "balance_loss_clip": 1.068084, + "balance_loss_mlp": 1.05089891, + "epoch": 0.020141289643769728, + "flos": 19683716469120.0, + "grad_norm": 1.8747627860696265, + "language_loss": 0.8305167, + "learning_rate": 3.7434406380929575e-06, + "loss": 0.85460746, + "num_input_tokens_seen": 7011425, + "step": 335, + "time_per_iteration": 2.6342434883117676 + }, + { + "auxiliary_loss_clip": 0.01307987, + "auxiliary_loss_mlp": 0.01083047, + "balance_loss_clip": 1.07032216, + "balance_loss_mlp": 1.04022694, + "epoch": 0.020201412896437697, + "flos": 20740567357440.0, + "grad_norm": 2.2709156376507775, + "language_loss": 0.92203796, + "learning_rate": 3.745359722027911e-06, + "loss": 0.9459483, + "num_input_tokens_seen": 7029450, + "step": 336, + "time_per_iteration": 2.564739227294922 + }, + { + "auxiliary_loss_clip": 0.01310607, + "auxiliary_loss_mlp": 0.01085267, + "balance_loss_clip": 1.06968856, + "balance_loss_mlp": 1.04321027, + "epoch": 0.020261536149105665, + "flos": 20266510636800.0, + "grad_norm": 1.8504961472514045, + "language_loss": 0.88593662, + "learning_rate": 3.7472731028818428e-06, + "loss": 0.9098953, + "num_input_tokens_seen": 7047555, + "step": 337, + "time_per_iteration": 2.6106183528900146 + }, + { + "auxiliary_loss_clip": 0.01300345, + "auxiliary_loss_mlp": 0.01101179, + "balance_loss_clip": 1.06726968, + "balance_loss_mlp": 1.05692863, + "epoch": 0.020321659401773638, + "flos": 25848752307840.0, + "grad_norm": 1.6841193181051137, + "language_loss": 0.89965743, + "learning_rate": 3.7491808144508626e-06, + "loss": 0.92367268, + "num_input_tokens_seen": 7068185, + "step": 338, + "time_per_iteration": 2.758965015411377 + }, + { + "auxiliary_loss_clip": 0.01311398, + "auxiliary_loss_mlp": 0.0109478, + "balance_loss_clip": 1.0703671, + "balance_loss_mlp": 1.05074358, + "epoch": 0.020381782654441606, + "flos": 17495033051520.0, + "grad_norm": 2.0035498286103075, + "language_loss": 0.8488968, + "learning_rate": 3.7510828902315576e-06, + "loss": 0.8729586, + "num_input_tokens_seen": 7085955, + "step": 339, + "time_per_iteration": 2.7501235008239746 + }, + { + "auxiliary_loss_clip": 0.0131498, + "auxiliary_loss_mlp": 0.01097194, + "balance_loss_clip": 1.07303965, + "balance_loss_mlp": 1.05280054, + "epoch": 0.020441905907109575, + "flos": 24243940465920.0, + "grad_norm": 1.7341163741764758, + "language_loss": 0.88889241, + "learning_rate": 3.75297936342452e-06, + "loss": 0.91301405, + "num_input_tokens_seen": 7106345, + "step": 340, + "time_per_iteration": 2.699678421020508 + }, + { + "auxiliary_loss_clip": 0.01311438, + "auxiliary_loss_mlp": 0.01086236, + "balance_loss_clip": 1.07075751, + "balance_loss_mlp": 1.0402689, + "epoch": 0.020502029159777543, + "flos": 22233301787520.0, + "grad_norm": 2.8186226531661305, + "language_loss": 0.88255984, + "learning_rate": 3.7548702669378253e-06, + "loss": 0.90653658, + "num_input_tokens_seen": 7125070, + "step": 341, + "time_per_iteration": 2.7570688724517822 + }, + { + "auxiliary_loss_clip": 0.01314414, + "auxiliary_loss_mlp": 0.01101818, + "balance_loss_clip": 1.06906819, + "balance_loss_mlp": 1.05699539, + "epoch": 0.020562152412445512, + "flos": 23987861429760.0, + "grad_norm": 2.5005463449075482, + "language_loss": 0.80585676, + "learning_rate": 3.756755633390458e-06, + "loss": 0.83001912, + "num_input_tokens_seen": 7144675, + "step": 342, + "time_per_iteration": 2.8302412033081055 + }, + { + "auxiliary_loss_clip": 0.01301924, + "auxiliary_loss_mlp": 0.01097845, + "balance_loss_clip": 1.06851542, + "balance_loss_mlp": 1.05130625, + "epoch": 0.020622275665113484, + "flos": 26975305537920.0, + "grad_norm": 4.9980377015595145, + "language_loss": 0.89652902, + "learning_rate": 3.7586354951156886e-06, + "loss": 0.92052674, + "num_input_tokens_seen": 7165505, + "step": 343, + "time_per_iteration": 2.683971405029297 + }, + { + "auxiliary_loss_clip": 0.01313123, + "auxiliary_loss_mlp": 0.0109678, + "balance_loss_clip": 1.07488203, + "balance_loss_mlp": 1.05429387, + "epoch": 0.020682398917781453, + "flos": 22600704049920.0, + "grad_norm": 1.7915404953307703, + "language_loss": 0.78191179, + "learning_rate": 3.7605098841644e-06, + "loss": 0.80601084, + "num_input_tokens_seen": 7184605, + "step": 344, + "time_per_iteration": 2.6882548332214355 + }, + { + "auxiliary_loss_clip": 0.01297658, + "auxiliary_loss_mlp": 0.01100522, + "balance_loss_clip": 1.06794882, + "balance_loss_mlp": 1.05605698, + "epoch": 0.02074252217044942, + "flos": 15013605790080.0, + "grad_norm": 1.7163901208343126, + "language_loss": 0.74991989, + "learning_rate": 3.7623788323083666e-06, + "loss": 0.77390176, + "num_input_tokens_seen": 7203065, + "step": 345, + "time_per_iteration": 2.739362955093384 + }, + { + "auxiliary_loss_clip": 0.01304919, + "auxiliary_loss_mlp": 0.0110156, + "balance_loss_clip": 1.07263803, + "balance_loss_mlp": 1.05769134, + "epoch": 0.02080264542311739, + "flos": 25337958952320.0, + "grad_norm": 2.1309637624511715, + "language_loss": 0.90373492, + "learning_rate": 3.7642423710434837e-06, + "loss": 0.9277997, + "num_input_tokens_seen": 7222995, + "step": 346, + "time_per_iteration": 2.722952365875244 + }, + { + "auxiliary_loss_clip": 0.01302163, + "auxiliary_loss_mlp": 0.01099355, + "balance_loss_clip": 1.0699147, + "balance_loss_mlp": 1.05846584, + "epoch": 0.02086276867578536, + "flos": 24388804016640.0, + "grad_norm": 2.1190506123154003, + "language_loss": 0.7902199, + "learning_rate": 3.7661005315929563e-06, + "loss": 0.81423509, + "num_input_tokens_seen": 7244625, + "step": 347, + "time_per_iteration": 2.7350802421569824 + }, + { + "auxiliary_loss_clip": 0.01305433, + "auxiliary_loss_mlp": 0.01100287, + "balance_loss_clip": 1.07301259, + "balance_loss_mlp": 1.05565476, + "epoch": 0.02092289192845333, + "flos": 24462205459200.0, + "grad_norm": 2.3506766368487058, + "language_loss": 0.71267927, + "learning_rate": 3.7679533449104354e-06, + "loss": 0.73673654, + "num_input_tokens_seen": 7263255, + "step": 348, + "time_per_iteration": 2.8480424880981445 + }, + { + "auxiliary_loss_clip": 0.01307189, + "auxiliary_loss_mlp": 0.0110182, + "balance_loss_clip": 1.06959784, + "balance_loss_mlp": 1.05771232, + "epoch": 0.0209830151811213, + "flos": 17451185523840.0, + "grad_norm": 2.2792925418061083, + "language_loss": 0.77124065, + "learning_rate": 3.7698008416831116e-06, + "loss": 0.79533076, + "num_input_tokens_seen": 7279275, + "step": 349, + "time_per_iteration": 2.6479313373565674 + }, + { + "auxiliary_loss_clip": 0.01293989, + "auxiliary_loss_mlp": 0.01096996, + "balance_loss_clip": 1.06953895, + "balance_loss_mlp": 1.05505776, + "epoch": 0.021043138433789268, + "flos": 24573995562240.0, + "grad_norm": 1.70067047384129, + "language_loss": 0.85254037, + "learning_rate": 3.7716430523347664e-06, + "loss": 0.87645024, + "num_input_tokens_seen": 7300180, + "step": 350, + "time_per_iteration": 2.658762216567993 + }, + { + "auxiliary_loss_clip": 0.0129932, + "auxiliary_loss_mlp": 0.01088945, + "balance_loss_clip": 1.07189035, + "balance_loss_mlp": 1.0489862, + "epoch": 0.021103261686457236, + "flos": 24454053072000.0, + "grad_norm": 2.4390269340724786, + "language_loss": 0.79755116, + "learning_rate": 3.773480007028776e-06, + "loss": 0.82143378, + "num_input_tokens_seen": 7317430, + "step": 351, + "time_per_iteration": 2.580385684967041 + }, + { + "auxiliary_loss_clip": 0.01308119, + "auxiliary_loss_mlp": 0.01104817, + "balance_loss_clip": 1.07321239, + "balance_loss_mlp": 1.05997014, + "epoch": 0.021163384939125205, + "flos": 14683083816960.0, + "grad_norm": 2.172952856678639, + "language_loss": 0.87370217, + "learning_rate": 3.775311735671078e-06, + "loss": 0.8978315, + "num_input_tokens_seen": 7334875, + "step": 352, + "time_per_iteration": 2.616832733154297 + }, + { + "auxiliary_loss_clip": 0.01299318, + "auxiliary_loss_mlp": 0.01102464, + "balance_loss_clip": 1.07080793, + "balance_loss_mlp": 1.05847621, + "epoch": 0.021223508191793177, + "flos": 24493195918080.0, + "grad_norm": 1.9457384683681536, + "language_loss": 0.82615346, + "learning_rate": 3.7771382679130878e-06, + "loss": 0.85017133, + "num_input_tokens_seen": 7355185, + "step": 353, + "time_per_iteration": 2.748600482940674 + }, + { + "auxiliary_loss_clip": 0.01297315, + "auxiliary_loss_mlp": 0.01094707, + "balance_loss_clip": 1.07013369, + "balance_loss_mlp": 1.05319786, + "epoch": 0.021283631444461146, + "flos": 24126978804480.0, + "grad_norm": 2.5180100505141625, + "language_loss": 0.81057906, + "learning_rate": 3.7789596331545845e-06, + "loss": 0.83449936, + "num_input_tokens_seen": 7374425, + "step": 354, + "time_per_iteration": 2.617891550064087 + }, + { + "auxiliary_loss_clip": 0.01303463, + "auxiliary_loss_mlp": 0.01092684, + "balance_loss_clip": 1.06824756, + "balance_loss_mlp": 1.04850507, + "epoch": 0.021343754697129114, + "flos": 25192233475200.0, + "grad_norm": 2.1555047170472976, + "language_loss": 0.81335324, + "learning_rate": 3.780775860546545e-06, + "loss": 0.83731472, + "num_input_tokens_seen": 7394175, + "step": 355, + "time_per_iteration": 2.667787551879883 + }, + { + "auxiliary_loss_clip": 0.01300736, + "auxiliary_loss_mlp": 0.01087329, + "balance_loss_clip": 1.06816483, + "balance_loss_mlp": 1.04489088, + "epoch": 0.021403877949797083, + "flos": 17274182279040.0, + "grad_norm": 2.594113686079276, + "language_loss": 0.89154005, + "learning_rate": 3.7825869789939474e-06, + "loss": 0.91542071, + "num_input_tokens_seen": 7412645, + "step": 356, + "time_per_iteration": 2.6630425453186035 + }, + { + "auxiliary_loss_clip": 0.012977, + "auxiliary_loss_mlp": 0.0108548, + "balance_loss_clip": 1.070575, + "balance_loss_mlp": 1.04173064, + "epoch": 0.021464001202465055, + "flos": 30917435276160.0, + "grad_norm": 2.640336103117537, + "language_loss": 0.80379832, + "learning_rate": 3.784393017158528e-06, + "loss": 0.82763016, + "num_input_tokens_seen": 7432275, + "step": 357, + "time_per_iteration": 2.683546781539917 + }, + { + "auxiliary_loss_clip": 0.01299558, + "auxiliary_loss_mlp": 0.01080292, + "balance_loss_clip": 1.06778634, + "balance_loss_mlp": 1.04002333, + "epoch": 0.021524124455133024, + "flos": 18186385098240.0, + "grad_norm": 2.61944007869126, + "language_loss": 0.768062, + "learning_rate": 3.786194003461506e-06, + "loss": 0.79186058, + "num_input_tokens_seen": 7450245, + "step": 358, + "time_per_iteration": 2.6840367317199707 + }, + { + "auxiliary_loss_clip": 0.01296497, + "auxiliary_loss_mlp": 0.01089744, + "balance_loss_clip": 1.06552207, + "balance_loss_mlp": 1.0454222, + "epoch": 0.021584247707800992, + "flos": 13805786039040.0, + "grad_norm": 2.156950270151829, + "language_loss": 0.88814342, + "learning_rate": 3.787989966086264e-06, + "loss": 0.9120059, + "num_input_tokens_seen": 7466845, + "step": 359, + "time_per_iteration": 2.633556604385376 + }, + { + "auxiliary_loss_clip": 0.01305793, + "auxiliary_loss_mlp": 0.01090247, + "balance_loss_clip": 1.0720228, + "balance_loss_mlp": 1.04976344, + "epoch": 0.02164437096046896, + "flos": 23294713703040.0, + "grad_norm": 6.261986455911618, + "language_loss": 0.75898075, + "learning_rate": 3.789780932980997e-06, + "loss": 0.7829411, + "num_input_tokens_seen": 7485450, + "step": 360, + "time_per_iteration": 2.7315478324890137 + }, + { + "auxiliary_loss_clip": 0.01175605, + "auxiliary_loss_mlp": 0.01046139, + "balance_loss_clip": 1.04925585, + "balance_loss_mlp": 1.03250146, + "epoch": 0.02170449421313693, + "flos": 68899578341760.0, + "grad_norm": 0.8490232535008863, + "language_loss": 0.64951217, + "learning_rate": 3.79156693186132e-06, + "loss": 0.67172956, + "num_input_tokens_seen": 7553780, + "step": 361, + "time_per_iteration": 3.45383620262146 + }, + { + "auxiliary_loss_clip": 0.01294355, + "auxiliary_loss_mlp": 0.01085435, + "balance_loss_clip": 1.06445932, + "balance_loss_mlp": 1.04406893, + "epoch": 0.0217646174658049, + "flos": 25228539146880.0, + "grad_norm": 2.7510362811811433, + "language_loss": 0.78556657, + "learning_rate": 3.7933479902128433e-06, + "loss": 0.80936444, + "num_input_tokens_seen": 7574155, + "step": 362, + "time_per_iteration": 2.782212018966675 + }, + { + "auxiliary_loss_clip": 0.01299738, + "auxiliary_loss_mlp": 0.01090989, + "balance_loss_clip": 1.06747866, + "balance_loss_mlp": 1.04938543, + "epoch": 0.02182474071847287, + "flos": 22893124671360.0, + "grad_norm": 2.0018428265331276, + "language_loss": 0.92572492, + "learning_rate": 3.7951241352937077e-06, + "loss": 0.94963217, + "num_input_tokens_seen": 7592320, + "step": 363, + "time_per_iteration": 2.7133262157440186 + }, + { + "auxiliary_loss_clip": 0.01295299, + "auxiliary_loss_mlp": 0.01097001, + "balance_loss_clip": 1.06739652, + "balance_loss_mlp": 1.05630314, + "epoch": 0.02188486397114084, + "flos": 23658991482240.0, + "grad_norm": 2.296739970115719, + "language_loss": 0.90028799, + "learning_rate": 3.7968953941370915e-06, + "loss": 0.92421097, + "num_input_tokens_seen": 7611185, + "step": 364, + "time_per_iteration": 4.459094047546387 + }, + { + "auxiliary_loss_clip": 0.01301506, + "auxiliary_loss_mlp": 0.01093769, + "balance_loss_clip": 1.0711689, + "balance_loss_mlp": 1.05013835, + "epoch": 0.021944987223808807, + "flos": 21543637680000.0, + "grad_norm": 15.367405632209453, + "language_loss": 0.79420435, + "learning_rate": 3.798661793553676e-06, + "loss": 0.81815708, + "num_input_tokens_seen": 7631970, + "step": 365, + "time_per_iteration": 4.529667854309082 + }, + { + "auxiliary_loss_clip": 0.01295527, + "auxiliary_loss_mlp": 0.01093516, + "balance_loss_clip": 1.06835067, + "balance_loss_mlp": 1.04988503, + "epoch": 0.022005110476476776, + "flos": 16070887641600.0, + "grad_norm": 1.86225669064033, + "language_loss": 0.84457648, + "learning_rate": 3.8004233601340808e-06, + "loss": 0.86846697, + "num_input_tokens_seen": 7649745, + "step": 366, + "time_per_iteration": 2.696357011795044 + }, + { + "auxiliary_loss_clip": 0.01302108, + "auxiliary_loss_mlp": 0.0108653, + "balance_loss_clip": 1.06902373, + "balance_loss_mlp": 1.04669034, + "epoch": 0.022065233729144748, + "flos": 21433715084160.0, + "grad_norm": 2.0940782725455245, + "language_loss": 0.87151986, + "learning_rate": 3.8021801202512694e-06, + "loss": 0.89540625, + "num_input_tokens_seen": 7668830, + "step": 367, + "time_per_iteration": 4.286772727966309 + }, + { + "auxiliary_loss_clip": 0.01303444, + "auxiliary_loss_mlp": 0.01094569, + "balance_loss_clip": 1.06755888, + "balance_loss_mlp": 1.05101001, + "epoch": 0.022125356981812717, + "flos": 21543709507200.0, + "grad_norm": 2.797576783212061, + "language_loss": 0.84795946, + "learning_rate": 3.803932100062912e-06, + "loss": 0.8719396, + "num_input_tokens_seen": 7687240, + "step": 368, + "time_per_iteration": 2.7957396507263184 + }, + { + "auxiliary_loss_clip": 0.01301369, + "auxiliary_loss_mlp": 0.01084358, + "balance_loss_clip": 1.06529951, + "balance_loss_mlp": 1.04339731, + "epoch": 0.022185480234480685, + "flos": 20704153944960.0, + "grad_norm": 6.93914034863794, + "language_loss": 0.75931191, + "learning_rate": 3.8056793255137264e-06, + "loss": 0.78316915, + "num_input_tokens_seen": 7704440, + "step": 369, + "time_per_iteration": 2.796330451965332 + }, + { + "auxiliary_loss_clip": 0.01295475, + "auxiliary_loss_mlp": 0.01099441, + "balance_loss_clip": 1.06765366, + "balance_loss_mlp": 1.0579567, + "epoch": 0.022245603487148654, + "flos": 25193203142400.0, + "grad_norm": 2.247576176027636, + "language_loss": 0.82951784, + "learning_rate": 3.8074218223377844e-06, + "loss": 0.85346699, + "num_input_tokens_seen": 7727160, + "step": 370, + "time_per_iteration": 3.0052850246429443 + }, + { + "auxiliary_loss_clip": 0.01294954, + "auxiliary_loss_mlp": 0.01094714, + "balance_loss_clip": 1.06688535, + "balance_loss_mlp": 1.05284762, + "epoch": 0.022305726739816623, + "flos": 21395936954880.0, + "grad_norm": 1.9443790796035025, + "language_loss": 0.81997102, + "learning_rate": 3.8091596160607834e-06, + "loss": 0.84386772, + "num_input_tokens_seen": 7747730, + "step": 371, + "time_per_iteration": 2.9274587631225586 + }, + { + "auxiliary_loss_clip": 0.0130277, + "auxiliary_loss_mlp": 0.01095864, + "balance_loss_clip": 1.07239568, + "balance_loss_mlp": 1.05240047, + "epoch": 0.022365849992484595, + "flos": 22492146170880.0, + "grad_norm": 2.3220099816115627, + "language_loss": 0.83525956, + "learning_rate": 3.8108927320022896e-06, + "loss": 0.8592459, + "num_input_tokens_seen": 7766765, + "step": 372, + "time_per_iteration": 2.8399455547332764 + }, + { + "auxiliary_loss_clip": 0.01294306, + "auxiliary_loss_mlp": 0.01091786, + "balance_loss_clip": 1.06761026, + "balance_loss_mlp": 1.04908562, + "epoch": 0.022425973245152563, + "flos": 17856581397120.0, + "grad_norm": 3.377178606094479, + "language_loss": 0.78984863, + "learning_rate": 3.8126211952779548e-06, + "loss": 0.81370962, + "num_input_tokens_seen": 7784010, + "step": 373, + "time_per_iteration": 2.8585195541381836 + }, + { + "auxiliary_loss_clip": 0.01298627, + "auxiliary_loss_mlp": 0.01091532, + "balance_loss_clip": 1.06935954, + "balance_loss_mlp": 1.04806805, + "epoch": 0.022486096497820532, + "flos": 15483029656320.0, + "grad_norm": 6.105370253059748, + "language_loss": 0.78001535, + "learning_rate": 3.8143450308016952e-06, + "loss": 0.80391693, + "num_input_tokens_seen": 7801305, + "step": 374, + "time_per_iteration": 2.8877956867218018 + }, + { + "auxiliary_loss_clip": 0.0129149, + "auxiliary_loss_mlp": 0.01080839, + "balance_loss_clip": 1.06156659, + "balance_loss_mlp": 1.03761339, + "epoch": 0.0225462197504885, + "flos": 27784157950080.0, + "grad_norm": 1.567695270098774, + "language_loss": 0.86145973, + "learning_rate": 3.8160642632878525e-06, + "loss": 0.88518304, + "num_input_tokens_seen": 7823965, + "step": 375, + "time_per_iteration": 2.716864824295044 + }, + { + "auxiliary_loss_clip": 0.01297777, + "auxiliary_loss_mlp": 0.01099021, + "balance_loss_clip": 1.06972289, + "balance_loss_mlp": 1.05510473, + "epoch": 0.02260634300315647, + "flos": 19975490645760.0, + "grad_norm": 2.2727879958604755, + "language_loss": 0.88903362, + "learning_rate": 3.817778917253314e-06, + "loss": 0.91300154, + "num_input_tokens_seen": 7842115, + "step": 376, + "time_per_iteration": 2.8215136528015137 + }, + { + "auxiliary_loss_clip": 0.01296, + "auxiliary_loss_mlp": 0.01084793, + "balance_loss_clip": 1.06421912, + "balance_loss_mlp": 1.04421377, + "epoch": 0.02266646625582444, + "flos": 16028189349120.0, + "grad_norm": 2.5396547922574455, + "language_loss": 0.75023592, + "learning_rate": 3.8194890170196155e-06, + "loss": 0.77404386, + "num_input_tokens_seen": 7857830, + "step": 377, + "time_per_iteration": 2.62929630279541 + }, + { + "auxiliary_loss_clip": 0.01286931, + "auxiliary_loss_mlp": 0.0109206, + "balance_loss_clip": 1.06656134, + "balance_loss_mlp": 1.0498594, + "epoch": 0.02272658950849241, + "flos": 20404622430720.0, + "grad_norm": 2.467405269359806, + "language_loss": 0.99266046, + "learning_rate": 3.8211945867150055e-06, + "loss": 1.01645041, + "num_input_tokens_seen": 7875840, + "step": 378, + "time_per_iteration": 2.5993268489837646 + }, + { + "auxiliary_loss_clip": 0.01161384, + "auxiliary_loss_mlp": 0.01110819, + "balance_loss_clip": 1.03589296, + "balance_loss_mlp": 1.09680033, + "epoch": 0.02278671276116038, + "flos": 69847332647040.0, + "grad_norm": 1.57223667334614, + "language_loss": 0.75391978, + "learning_rate": 3.822895650276492e-06, + "loss": 0.77664185, + "num_input_tokens_seen": 7940190, + "step": 379, + "time_per_iteration": 3.191347122192383 + }, + { + "auxiliary_loss_clip": 0.01296952, + "auxiliary_loss_mlp": 0.01089422, + "balance_loss_clip": 1.06409311, + "balance_loss_mlp": 1.04948735, + "epoch": 0.022846836013828347, + "flos": 38508771340800.0, + "grad_norm": 2.275107416372842, + "language_loss": 0.78321254, + "learning_rate": 3.824592231451859e-06, + "loss": 0.80707628, + "num_input_tokens_seen": 7960840, + "step": 380, + "time_per_iteration": 2.7336556911468506 + }, + { + "auxiliary_loss_clip": 0.01292266, + "auxiliary_loss_mlp": 0.01087908, + "balance_loss_clip": 1.06703997, + "balance_loss_mlp": 1.04799628, + "epoch": 0.02290695926649632, + "flos": 20959478795520.0, + "grad_norm": 3.26540369094989, + "language_loss": 0.96584326, + "learning_rate": 3.826284353801652e-06, + "loss": 0.98964494, + "num_input_tokens_seen": 7975500, + "step": 381, + "time_per_iteration": 2.6317484378814697 + }, + { + "auxiliary_loss_clip": 0.01302846, + "auxiliary_loss_mlp": 0.0109073, + "balance_loss_clip": 1.06872249, + "balance_loss_mlp": 1.04910243, + "epoch": 0.022967082519164288, + "flos": 24022407335040.0, + "grad_norm": 6.60045344225757, + "language_loss": 0.87966365, + "learning_rate": 3.827972040701142e-06, + "loss": 0.90359938, + "num_input_tokens_seen": 7993880, + "step": 382, + "time_per_iteration": 2.6930646896362305 + }, + { + "auxiliary_loss_clip": 0.01295114, + "auxiliary_loss_mlp": 0.01100354, + "balance_loss_clip": 1.06971836, + "balance_loss_mlp": 1.05939329, + "epoch": 0.023027205771832256, + "flos": 20997149184000.0, + "grad_norm": 2.995165809942644, + "language_loss": 0.84936696, + "learning_rate": 3.829655315342268e-06, + "loss": 0.87332165, + "num_input_tokens_seen": 8012730, + "step": 383, + "time_per_iteration": 2.687861442565918 + }, + { + "auxiliary_loss_clip": 0.01290759, + "auxiliary_loss_mlp": 0.01110421, + "balance_loss_clip": 1.06803846, + "balance_loss_mlp": 1.07000864, + "epoch": 0.023087329024500225, + "flos": 21360816432000.0, + "grad_norm": 2.0418962557468254, + "language_loss": 0.83414495, + "learning_rate": 3.831334200735543e-06, + "loss": 0.8581568, + "num_input_tokens_seen": 8031275, + "step": 384, + "time_per_iteration": 2.615386486053467 + }, + { + "auxiliary_loss_clip": 0.01290476, + "auxiliary_loss_mlp": 0.01095551, + "balance_loss_clip": 1.0703789, + "balance_loss_mlp": 1.05745149, + "epoch": 0.023147452277168194, + "flos": 21872435800320.0, + "grad_norm": 2.012332624927178, + "language_loss": 0.89376462, + "learning_rate": 3.8330087197119426e-06, + "loss": 0.91762495, + "num_input_tokens_seen": 8051600, + "step": 385, + "time_per_iteration": 2.6058952808380127 + }, + { + "auxiliary_loss_clip": 0.01295083, + "auxiliary_loss_mlp": 0.0111555, + "balance_loss_clip": 1.06889415, + "balance_loss_mlp": 1.07540083, + "epoch": 0.023207575529836166, + "flos": 18916700423040.0, + "grad_norm": 1.7767332845513801, + "language_loss": 0.70041311, + "learning_rate": 3.83467889492477e-06, + "loss": 0.72451943, + "num_input_tokens_seen": 8070600, + "step": 386, + "time_per_iteration": 2.615854024887085 + }, + { + "auxiliary_loss_clip": 0.01294057, + "auxiliary_loss_mlp": 0.010912, + "balance_loss_clip": 1.06945038, + "balance_loss_mlp": 1.05243373, + "epoch": 0.023267698782504134, + "flos": 25046005207680.0, + "grad_norm": 1.9177775775898815, + "language_loss": 0.87978959, + "learning_rate": 3.836344748851495e-06, + "loss": 0.90364212, + "num_input_tokens_seen": 8090680, + "step": 387, + "time_per_iteration": 2.7657928466796875 + }, + { + "auxiliary_loss_clip": 0.01295435, + "auxiliary_loss_mlp": 0.0108442, + "balance_loss_clip": 1.06871641, + "balance_loss_mlp": 1.0437454, + "epoch": 0.023327822035172103, + "flos": 28879217930880.0, + "grad_norm": 1.8599814347199846, + "language_loss": 0.83250463, + "learning_rate": 3.838006303795566e-06, + "loss": 0.85630322, + "num_input_tokens_seen": 8114610, + "step": 388, + "time_per_iteration": 2.7093822956085205 + }, + { + "auxiliary_loss_clip": 0.01292068, + "auxiliary_loss_mlp": 0.01093168, + "balance_loss_clip": 1.06714916, + "balance_loss_mlp": 1.05533123, + "epoch": 0.02338794528784007, + "flos": 27121533805440.0, + "grad_norm": 3.2165272627592842, + "language_loss": 0.93790698, + "learning_rate": 3.839663581888206e-06, + "loss": 0.96175933, + "num_input_tokens_seen": 8133975, + "step": 389, + "time_per_iteration": 2.80572247505188 + }, + { + "auxiliary_loss_clip": 0.01286094, + "auxiliary_loss_mlp": 0.01083202, + "balance_loss_clip": 1.06756377, + "balance_loss_mlp": 1.04357684, + "epoch": 0.02344806854050804, + "flos": 21322355944320.0, + "grad_norm": 2.9137949801801737, + "language_loss": 0.88002884, + "learning_rate": 3.841316605090178e-06, + "loss": 0.90372181, + "num_input_tokens_seen": 8153570, + "step": 390, + "time_per_iteration": 2.726670265197754 + }, + { + "auxiliary_loss_clip": 0.01291486, + "auxiliary_loss_mlp": 0.01093219, + "balance_loss_clip": 1.06926823, + "balance_loss_mlp": 1.05576301, + "epoch": 0.023508191793176012, + "flos": 24789997998720.0, + "grad_norm": 2.6096591715083526, + "language_loss": 0.89894378, + "learning_rate": 3.842965395193529e-06, + "loss": 0.92279088, + "num_input_tokens_seen": 8170075, + "step": 391, + "time_per_iteration": 2.757099151611328 + }, + { + "auxiliary_loss_clip": 0.0128491, + "auxiliary_loss_mlp": 0.01073706, + "balance_loss_clip": 1.06496632, + "balance_loss_mlp": 1.03493881, + "epoch": 0.02356831504584398, + "flos": 25995375624960.0, + "grad_norm": 2.0246562580748386, + "language_loss": 0.86011469, + "learning_rate": 3.84460997382332e-06, + "loss": 0.88370085, + "num_input_tokens_seen": 8190420, + "step": 392, + "time_per_iteration": 2.81728196144104 + }, + { + "auxiliary_loss_clip": 0.01285852, + "auxiliary_loss_mlp": 0.01089977, + "balance_loss_clip": 1.06701362, + "balance_loss_mlp": 1.05137694, + "epoch": 0.02362843829851195, + "flos": 19062461813760.0, + "grad_norm": 1.8856371316193319, + "language_loss": 0.89022529, + "learning_rate": 3.8462503624393256e-06, + "loss": 0.91398364, + "num_input_tokens_seen": 8208790, + "step": 393, + "time_per_iteration": 2.6411609649658203 + }, + { + "auxiliary_loss_clip": 0.01294926, + "auxiliary_loss_mlp": 0.0110406, + "balance_loss_clip": 1.07115412, + "balance_loss_mlp": 1.0630753, + "epoch": 0.023688561551179918, + "flos": 16071031296000.0, + "grad_norm": 1.9189846739878667, + "language_loss": 0.81498533, + "learning_rate": 3.84788658233771e-06, + "loss": 0.83897519, + "num_input_tokens_seen": 8226885, + "step": 394, + "time_per_iteration": 2.568938970565796 + }, + { + "auxiliary_loss_clip": 0.01283672, + "auxiliary_loss_mlp": 0.01087493, + "balance_loss_clip": 1.06387186, + "balance_loss_mlp": 1.04762983, + "epoch": 0.023748684803847887, + "flos": 21724375939200.0, + "grad_norm": 2.1997954948440315, + "language_loss": 0.85819942, + "learning_rate": 3.84951865465269e-06, + "loss": 0.8819111, + "num_input_tokens_seen": 8246825, + "step": 395, + "time_per_iteration": 2.618933916091919 + }, + { + "auxiliary_loss_clip": 0.01168636, + "auxiliary_loss_mlp": 0.01031348, + "balance_loss_clip": 1.03311694, + "balance_loss_mlp": 1.02066696, + "epoch": 0.02380880805651586, + "flos": 61926192881280.0, + "grad_norm": 0.9214875639236436, + "language_loss": 0.63788414, + "learning_rate": 3.851146600358172e-06, + "loss": 0.65988404, + "num_input_tokens_seen": 8302835, + "step": 396, + "time_per_iteration": 3.0150108337402344 + }, + { + "auxiliary_loss_clip": 0.0128282, + "auxiliary_loss_mlp": 0.01071943, + "balance_loss_clip": 1.0638597, + "balance_loss_mlp": 1.034392, + "epoch": 0.023868931309183827, + "flos": 20266331068800.0, + "grad_norm": 2.551121262634904, + "language_loss": 0.8379885, + "learning_rate": 3.852770440269372e-06, + "loss": 0.86153615, + "num_input_tokens_seen": 8320745, + "step": 397, + "time_per_iteration": 2.609043598175049 + }, + { + "auxiliary_loss_clip": 0.01287306, + "auxiliary_loss_mlp": 0.01094404, + "balance_loss_clip": 1.06699145, + "balance_loss_mlp": 1.0548265, + "epoch": 0.023929054561851796, + "flos": 21139103733120.0, + "grad_norm": 2.2629304694087633, + "language_loss": 0.84344143, + "learning_rate": 3.854390195044404e-06, + "loss": 0.86725855, + "num_input_tokens_seen": 8339540, + "step": 398, + "time_per_iteration": 2.627077102661133 + }, + { + "auxiliary_loss_clip": 0.01286937, + "auxiliary_loss_mlp": 0.01080542, + "balance_loss_clip": 1.06362832, + "balance_loss_mlp": 1.04051149, + "epoch": 0.023989177814519765, + "flos": 13698521049600.0, + "grad_norm": 3.128937367311238, + "language_loss": 0.85761976, + "learning_rate": 3.856005885185868e-06, + "loss": 0.88129455, + "num_input_tokens_seen": 8354890, + "step": 399, + "time_per_iteration": 2.5514142513275146 + }, + { + "auxiliary_loss_clip": 0.01283325, + "auxiliary_loss_mlp": 0.01091024, + "balance_loss_clip": 1.06652737, + "balance_loss_mlp": 1.05261469, + "epoch": 0.024049301067187733, + "flos": 26322018929280.0, + "grad_norm": 1.9516028420666534, + "language_loss": 0.86096334, + "learning_rate": 3.857617531042398e-06, + "loss": 0.88470685, + "num_input_tokens_seen": 8375845, + "step": 400, + "time_per_iteration": 2.638235569000244 + }, + { + "auxiliary_loss_clip": 0.01290087, + "auxiliary_loss_mlp": 0.01080525, + "balance_loss_clip": 1.06936026, + "balance_loss_mlp": 1.04242563, + "epoch": 0.024109424319855705, + "flos": 24425432910720.0, + "grad_norm": 2.1144893453415112, + "language_loss": 0.79228091, + "learning_rate": 3.8592251528102065e-06, + "loss": 0.81598705, + "num_input_tokens_seen": 8395240, + "step": 401, + "time_per_iteration": 2.63185977935791 + }, + { + "auxiliary_loss_clip": 0.01282999, + "auxiliary_loss_mlp": 0.01094544, + "balance_loss_clip": 1.06476784, + "balance_loss_mlp": 1.05668366, + "epoch": 0.024169547572523674, + "flos": 29604397610880.0, + "grad_norm": 2.0888875385353534, + "language_loss": 0.78417987, + "learning_rate": 3.8608287705345976e-06, + "loss": 0.80795527, + "num_input_tokens_seen": 8416950, + "step": 402, + "time_per_iteration": 2.641836166381836 + }, + { + "auxiliary_loss_clip": 0.01285508, + "auxiliary_loss_mlp": 0.01078352, + "balance_loss_clip": 1.06397271, + "balance_loss_mlp": 1.03891802, + "epoch": 0.024229670825191642, + "flos": 22601458235520.0, + "grad_norm": 2.511299645043428, + "language_loss": 0.94815576, + "learning_rate": 3.86242840411147e-06, + "loss": 0.97179443, + "num_input_tokens_seen": 8433660, + "step": 403, + "time_per_iteration": 2.563660144805908 + }, + { + "auxiliary_loss_clip": 0.01289267, + "auxiliary_loss_mlp": 0.01089441, + "balance_loss_clip": 1.06333733, + "balance_loss_mlp": 1.04914832, + "epoch": 0.02428979407785961, + "flos": 18150258994560.0, + "grad_norm": 2.4012791828273845, + "language_loss": 0.99626637, + "learning_rate": 3.864024073288798e-06, + "loss": 1.02005339, + "num_input_tokens_seen": 8450180, + "step": 404, + "time_per_iteration": 2.6296467781066895 + }, + { + "auxiliary_loss_clip": 0.01289852, + "auxiliary_loss_mlp": 0.01095351, + "balance_loss_clip": 1.06710887, + "balance_loss_mlp": 1.05694175, + "epoch": 0.024349917330527583, + "flos": 15304984917120.0, + "grad_norm": 4.146622133267797, + "language_loss": 0.87888384, + "learning_rate": 3.865615797668091e-06, + "loss": 0.90273583, + "num_input_tokens_seen": 8467775, + "step": 405, + "time_per_iteration": 2.6490395069122314 + }, + { + "auxiliary_loss_clip": 0.01298206, + "auxiliary_loss_mlp": 0.01096478, + "balance_loss_clip": 1.07179189, + "balance_loss_mlp": 1.0573535, + "epoch": 0.024410040583195552, + "flos": 20773892200320.0, + "grad_norm": 2.0806664623535376, + "language_loss": 0.93538582, + "learning_rate": 3.867203596705844e-06, + "loss": 0.9593327, + "num_input_tokens_seen": 8486765, + "step": 406, + "time_per_iteration": 2.5728957653045654 + }, + { + "auxiliary_loss_clip": 0.01288539, + "auxiliary_loss_mlp": 0.01087093, + "balance_loss_clip": 1.06852853, + "balance_loss_mlp": 1.04725337, + "epoch": 0.02447016383586352, + "flos": 21798854789760.0, + "grad_norm": 3.086397344109466, + "language_loss": 0.87053931, + "learning_rate": 3.86878748971496e-06, + "loss": 0.89429557, + "num_input_tokens_seen": 8506515, + "step": 407, + "time_per_iteration": 2.585289478302002 + }, + { + "auxiliary_loss_clip": 0.01286159, + "auxiliary_loss_mlp": 0.01082294, + "balance_loss_clip": 1.06981182, + "balance_loss_mlp": 1.04386127, + "epoch": 0.02453028708853149, + "flos": 33948116380800.0, + "grad_norm": 2.556678086810295, + "language_loss": 0.73854989, + "learning_rate": 3.8703674958661596e-06, + "loss": 0.76223439, + "num_input_tokens_seen": 8528035, + "step": 408, + "time_per_iteration": 2.7603776454925537 + }, + { + "auxiliary_loss_clip": 0.01290256, + "auxiliary_loss_mlp": 0.01092577, + "balance_loss_clip": 1.06855774, + "balance_loss_mlp": 1.05340505, + "epoch": 0.024590410341199458, + "flos": 21793000872960.0, + "grad_norm": 2.7367272947398877, + "language_loss": 0.92554295, + "learning_rate": 3.871943634189376e-06, + "loss": 0.94937134, + "num_input_tokens_seen": 8546455, + "step": 409, + "time_per_iteration": 2.614166259765625 + }, + { + "auxiliary_loss_clip": 0.01288445, + "auxiliary_loss_mlp": 0.01074578, + "balance_loss_clip": 1.06853747, + "balance_loss_mlp": 1.03886294, + "epoch": 0.02465053359386743, + "flos": 35114782124160.0, + "grad_norm": 2.1915713409337862, + "language_loss": 0.82994759, + "learning_rate": 3.873515923575128e-06, + "loss": 0.85357785, + "num_input_tokens_seen": 8568450, + "step": 410, + "time_per_iteration": 2.7084197998046875 + }, + { + "auxiliary_loss_clip": 0.01288786, + "auxiliary_loss_mlp": 0.0108802, + "balance_loss_clip": 1.06769872, + "balance_loss_mlp": 1.05044508, + "epoch": 0.0247106568465354, + "flos": 27451409333760.0, + "grad_norm": 2.4796605956245803, + "language_loss": 0.77910626, + "learning_rate": 3.875084382775879e-06, + "loss": 0.80287427, + "num_input_tokens_seen": 8589340, + "step": 411, + "time_per_iteration": 2.7223010063171387 + }, + { + "auxiliary_loss_clip": 0.01289678, + "auxiliary_loss_mlp": 0.01102063, + "balance_loss_clip": 1.06623173, + "balance_loss_mlp": 1.06270027, + "epoch": 0.024770780099203367, + "flos": 20703794808960.0, + "grad_norm": 2.6749698867643183, + "language_loss": 0.86623955, + "learning_rate": 3.87664903040738e-06, + "loss": 0.89015704, + "num_input_tokens_seen": 8607150, + "step": 412, + "time_per_iteration": 4.545894384384155 + }, + { + "auxiliary_loss_clip": 0.01160752, + "auxiliary_loss_mlp": 0.01018148, + "balance_loss_clip": 1.03933477, + "balance_loss_mlp": 1.00832558, + "epoch": 0.024830903351871336, + "flos": 69551859369600.0, + "grad_norm": 0.841261276165467, + "language_loss": 0.58559453, + "learning_rate": 3.878209884949994e-06, + "loss": 0.60738355, + "num_input_tokens_seen": 8669865, + "step": 413, + "time_per_iteration": 4.823506832122803 + }, + { + "auxiliary_loss_clip": 0.01281805, + "auxiliary_loss_mlp": 0.01093359, + "balance_loss_clip": 1.06394625, + "balance_loss_mlp": 1.05239916, + "epoch": 0.024891026604539304, + "flos": 32270477713920.0, + "grad_norm": 1.7987144730033375, + "language_loss": 0.80483919, + "learning_rate": 3.879766964750006e-06, + "loss": 0.82859081, + "num_input_tokens_seen": 8690235, + "step": 414, + "time_per_iteration": 2.707127094268799 + }, + { + "auxiliary_loss_clip": 0.01277483, + "auxiliary_loss_mlp": 0.01097447, + "balance_loss_clip": 1.06311381, + "balance_loss_mlp": 1.060063, + "epoch": 0.024951149857207276, + "flos": 18840282238080.0, + "grad_norm": 2.2661588115702878, + "language_loss": 0.80323255, + "learning_rate": 3.881320288020917e-06, + "loss": 0.8269819, + "num_input_tokens_seen": 8706295, + "step": 415, + "time_per_iteration": 4.168542146682739 + }, + { + "auxiliary_loss_clip": 0.01294565, + "auxiliary_loss_mlp": 0.01086196, + "balance_loss_clip": 1.06959462, + "balance_loss_mlp": 1.04816854, + "epoch": 0.025011273109875245, + "flos": 15377201210880.0, + "grad_norm": 2.8980934037151216, + "language_loss": 0.95788372, + "learning_rate": 3.882869872844723e-06, + "loss": 0.98169124, + "num_input_tokens_seen": 8724200, + "step": 416, + "time_per_iteration": 2.7519373893737793 + }, + { + "auxiliary_loss_clip": 0.01284768, + "auxiliary_loss_mlp": 0.01075341, + "balance_loss_clip": 1.06503212, + "balance_loss_mlp": 1.03531027, + "epoch": 0.025071396362543213, + "flos": 18915515274240.0, + "grad_norm": 2.58406194393968, + "language_loss": 0.77581054, + "learning_rate": 3.884415737173176e-06, + "loss": 0.79941165, + "num_input_tokens_seen": 8744170, + "step": 417, + "time_per_iteration": 2.587305784225464 + }, + { + "auxiliary_loss_clip": 0.01281056, + "auxiliary_loss_mlp": 0.01088907, + "balance_loss_clip": 1.06784272, + "balance_loss_mlp": 1.05002129, + "epoch": 0.025131519615211182, + "flos": 25337958952320.0, + "grad_norm": 1.7265059536141512, + "language_loss": 0.77118337, + "learning_rate": 3.8859578988290344e-06, + "loss": 0.79488301, + "num_input_tokens_seen": 8765120, + "step": 418, + "time_per_iteration": 2.6562187671661377 + }, + { + "auxiliary_loss_clip": 0.01292119, + "auxiliary_loss_mlp": 0.01076854, + "balance_loss_clip": 1.07067776, + "balance_loss_mlp": 1.03999424, + "epoch": 0.02519164286787915, + "flos": 18953149749120.0, + "grad_norm": 2.38411297772616, + "language_loss": 0.81395245, + "learning_rate": 3.887496375507294e-06, + "loss": 0.83764219, + "num_input_tokens_seen": 8783500, + "step": 419, + "time_per_iteration": 2.624119758605957 + }, + { + "auxiliary_loss_clip": 0.01283716, + "auxiliary_loss_mlp": 0.01088071, + "balance_loss_clip": 1.06792259, + "balance_loss_mlp": 1.04811239, + "epoch": 0.025251766120547123, + "flos": 17421092904960.0, + "grad_norm": 1.9999238932675196, + "language_loss": 0.73624635, + "learning_rate": 3.8890311847764065e-06, + "loss": 0.75996423, + "num_input_tokens_seen": 8801175, + "step": 420, + "time_per_iteration": 2.5732195377349854 + }, + { + "auxiliary_loss_clip": 0.01283694, + "auxiliary_loss_mlp": 0.01101034, + "balance_loss_clip": 1.06456566, + "balance_loss_mlp": 1.0632925, + "epoch": 0.02531188937321509, + "flos": 25045430590080.0, + "grad_norm": 1.8127847142707836, + "language_loss": 0.78971708, + "learning_rate": 3.890562344079484e-06, + "loss": 0.8135643, + "num_input_tokens_seen": 8820215, + "step": 421, + "time_per_iteration": 2.815981149673462 + }, + { + "auxiliary_loss_clip": 0.01282077, + "auxiliary_loss_mlp": 0.01090509, + "balance_loss_clip": 1.0676775, + "balance_loss_mlp": 1.05112231, + "epoch": 0.02537201262588306, + "flos": 30592228515840.0, + "grad_norm": 2.128356081564463, + "language_loss": 0.81831658, + "learning_rate": 3.89208987073549e-06, + "loss": 0.84204245, + "num_input_tokens_seen": 8839660, + "step": 422, + "time_per_iteration": 2.6935267448425293 + }, + { + "auxiliary_loss_clip": 0.01284309, + "auxiliary_loss_mlp": 0.0107699, + "balance_loss_clip": 1.06364703, + "balance_loss_mlp": 1.04141784, + "epoch": 0.02543213587855103, + "flos": 26065365275520.0, + "grad_norm": 1.9070402823279282, + "language_loss": 0.8364023, + "learning_rate": 3.893613781940409e-06, + "loss": 0.86001527, + "num_input_tokens_seen": 8859280, + "step": 423, + "time_per_iteration": 2.614481210708618 + }, + { + "auxiliary_loss_clip": 0.0127854, + "auxiliary_loss_mlp": 0.01080357, + "balance_loss_clip": 1.06300664, + "balance_loss_mlp": 1.0431875, + "epoch": 0.025492259131218997, + "flos": 36022818965760.0, + "grad_norm": 1.948159690824482, + "language_loss": 0.74321783, + "learning_rate": 3.895134094768415e-06, + "loss": 0.76680672, + "num_input_tokens_seen": 8880560, + "step": 424, + "time_per_iteration": 2.684600591659546 + }, + { + "auxiliary_loss_clip": 0.01290895, + "auxiliary_loss_mlp": 0.01099163, + "balance_loss_clip": 1.06966925, + "balance_loss_mlp": 1.06185031, + "epoch": 0.02555238238388697, + "flos": 18588045957120.0, + "grad_norm": 2.9924014081502275, + "language_loss": 0.8311106, + "learning_rate": 3.896650826173015e-06, + "loss": 0.85501117, + "num_input_tokens_seen": 8899155, + "step": 425, + "time_per_iteration": 2.7444896697998047 + }, + { + "auxiliary_loss_clip": 0.01284318, + "auxiliary_loss_mlp": 0.01091003, + "balance_loss_clip": 1.06110048, + "balance_loss_mlp": 1.05078197, + "epoch": 0.025612505636554938, + "flos": 24243186280320.0, + "grad_norm": 2.5460500684478182, + "language_loss": 0.85483402, + "learning_rate": 3.898163992988186e-06, + "loss": 0.87858725, + "num_input_tokens_seen": 8917890, + "step": 426, + "time_per_iteration": 2.644582748413086 + }, + { + "auxiliary_loss_clip": 0.01145296, + "auxiliary_loss_mlp": 0.01022425, + "balance_loss_clip": 1.03069139, + "balance_loss_mlp": 1.00993204, + "epoch": 0.025672628889222907, + "flos": 60586941265920.0, + "grad_norm": 0.9067461981570925, + "language_loss": 0.57272518, + "learning_rate": 3.899673611929491e-06, + "loss": 0.59440237, + "num_input_tokens_seen": 8978260, + "step": 427, + "time_per_iteration": 3.2782859802246094 + }, + { + "auxiliary_loss_clip": 0.01286647, + "auxiliary_loss_mlp": 0.0109863, + "balance_loss_clip": 1.07172513, + "balance_loss_mlp": 1.06095946, + "epoch": 0.025732752141890875, + "flos": 19573255169280.0, + "grad_norm": 2.400478247027811, + "language_loss": 0.88253653, + "learning_rate": 3.901179699595194e-06, + "loss": 0.90638924, + "num_input_tokens_seen": 8994460, + "step": 428, + "time_per_iteration": 2.657355308532715 + }, + { + "auxiliary_loss_clip": 0.01278203, + "auxiliary_loss_mlp": 0.01080088, + "balance_loss_clip": 1.06365395, + "balance_loss_mlp": 1.03962827, + "epoch": 0.025792875394558847, + "flos": 31284262920960.0, + "grad_norm": 1.5862657513355183, + "language_loss": 0.8559823, + "learning_rate": 3.902682272467353e-06, + "loss": 0.87956524, + "num_input_tokens_seen": 9016670, + "step": 429, + "time_per_iteration": 2.6539883613586426 + }, + { + "auxiliary_loss_clip": 0.01282717, + "auxiliary_loss_mlp": 0.01086668, + "balance_loss_clip": 1.06234837, + "balance_loss_mlp": 1.04680419, + "epoch": 0.025852998647226816, + "flos": 32379610210560.0, + "grad_norm": 2.190995900416133, + "language_loss": 0.88192785, + "learning_rate": 3.904181346912895e-06, + "loss": 0.90562171, + "num_input_tokens_seen": 9039720, + "step": 430, + "time_per_iteration": 2.8757901191711426 + }, + { + "auxiliary_loss_clip": 0.01285211, + "auxiliary_loss_mlp": 0.01081202, + "balance_loss_clip": 1.07042861, + "balance_loss_mlp": 1.04446197, + "epoch": 0.025913121899894784, + "flos": 20193288762240.0, + "grad_norm": 2.4105853106198114, + "language_loss": 0.84258723, + "learning_rate": 3.905676939184698e-06, + "loss": 0.86625135, + "num_input_tokens_seen": 9059850, + "step": 431, + "time_per_iteration": 2.7435572147369385 + }, + { + "auxiliary_loss_clip": 0.01281731, + "auxiliary_loss_mlp": 0.01076112, + "balance_loss_clip": 1.06641686, + "balance_loss_mlp": 1.04061151, + "epoch": 0.025973245152562753, + "flos": 14720430983040.0, + "grad_norm": 13.816318277652899, + "language_loss": 0.8648147, + "learning_rate": 3.907169065422638e-06, + "loss": 0.88839316, + "num_input_tokens_seen": 9077590, + "step": 432, + "time_per_iteration": 2.8101842403411865 + }, + { + "auxiliary_loss_clip": 0.01283317, + "auxiliary_loss_mlp": 0.01076467, + "balance_loss_clip": 1.06615996, + "balance_loss_mlp": 1.03986931, + "epoch": 0.02603336840523072, + "flos": 30992991534720.0, + "grad_norm": 2.3708340326532413, + "language_loss": 0.76264548, + "learning_rate": 3.908657741654636e-06, + "loss": 0.78624332, + "num_input_tokens_seen": 9099880, + "step": 433, + "time_per_iteration": 2.8997673988342285 + }, + { + "auxiliary_loss_clip": 0.01281073, + "auxiliary_loss_mlp": 0.01088841, + "balance_loss_clip": 1.06351352, + "balance_loss_mlp": 1.04928732, + "epoch": 0.026093491657898694, + "flos": 17674262939520.0, + "grad_norm": 2.393676945557173, + "language_loss": 0.89538157, + "learning_rate": 3.910142983797699e-06, + "loss": 0.91908062, + "num_input_tokens_seen": 9118620, + "step": 434, + "time_per_iteration": 2.862962007522583 + }, + { + "auxiliary_loss_clip": 0.01283725, + "auxiliary_loss_mlp": 0.01099874, + "balance_loss_clip": 1.06957507, + "balance_loss_mlp": 1.06065369, + "epoch": 0.026153614910566662, + "flos": 17857874286720.0, + "grad_norm": 2.5607855211541475, + "language_loss": 0.8036716, + "learning_rate": 3.9116248076589305e-06, + "loss": 0.82750762, + "num_input_tokens_seen": 9135655, + "step": 435, + "time_per_iteration": 2.6986472606658936 + }, + { + "auxiliary_loss_clip": 0.01278423, + "auxiliary_loss_mlp": 0.01087979, + "balance_loss_clip": 1.06300139, + "balance_loss_mlp": 1.04947424, + "epoch": 0.02621373816323463, + "flos": 20011113959040.0, + "grad_norm": 3.611921510499358, + "language_loss": 0.86733818, + "learning_rate": 3.913103228936546e-06, + "loss": 0.89100218, + "num_input_tokens_seen": 9153520, + "step": 436, + "time_per_iteration": 2.8138527870178223 + }, + { + "auxiliary_loss_clip": 0.01283155, + "auxiliary_loss_mlp": 0.01095543, + "balance_loss_clip": 1.06730223, + "balance_loss_mlp": 1.05799246, + "epoch": 0.0262738614159026, + "flos": 19281193683840.0, + "grad_norm": 2.2385476826659434, + "language_loss": 0.74791932, + "learning_rate": 3.914578263220868e-06, + "loss": 0.77170634, + "num_input_tokens_seen": 9170750, + "step": 437, + "time_per_iteration": 2.8078222274780273 + }, + { + "auxiliary_loss_clip": 0.01278221, + "auxiliary_loss_mlp": 0.01096247, + "balance_loss_clip": 1.06561255, + "balance_loss_mlp": 1.05628812, + "epoch": 0.026333984668570568, + "flos": 18807208790400.0, + "grad_norm": 4.011039691969745, + "language_loss": 0.91364765, + "learning_rate": 3.916049925995316e-06, + "loss": 0.93739235, + "num_input_tokens_seen": 9188430, + "step": 438, + "time_per_iteration": 2.738203763961792 + }, + { + "auxiliary_loss_clip": 0.01135259, + "auxiliary_loss_mlp": 0.01036418, + "balance_loss_clip": 1.02547336, + "balance_loss_mlp": 1.02497375, + "epoch": 0.02639410792123854, + "flos": 64572020691840.0, + "grad_norm": 0.8893110843183661, + "language_loss": 0.62608665, + "learning_rate": 3.917518232637377e-06, + "loss": 0.64780343, + "num_input_tokens_seen": 9255835, + "step": 439, + "time_per_iteration": 3.2276647090911865 + }, + { + "auxiliary_loss_clip": 0.01286979, + "auxiliary_loss_mlp": 0.01095267, + "balance_loss_clip": 1.06982672, + "balance_loss_mlp": 1.05645299, + "epoch": 0.02645423117390651, + "flos": 28473462921600.0, + "grad_norm": 1.7862139301484332, + "language_loss": 0.76001132, + "learning_rate": 3.918983198419573e-06, + "loss": 0.7838338, + "num_input_tokens_seen": 9276835, + "step": 440, + "time_per_iteration": 2.754429578781128 + }, + { + "auxiliary_loss_clip": 0.01279161, + "auxiliary_loss_mlp": 0.01076587, + "balance_loss_clip": 1.06586504, + "balance_loss_mlp": 1.03858268, + "epoch": 0.026514354426574478, + "flos": 18551237495040.0, + "grad_norm": 3.3834723943560587, + "language_loss": 0.83166015, + "learning_rate": 3.920444838510415e-06, + "loss": 0.85521764, + "num_input_tokens_seen": 9295075, + "step": 441, + "time_per_iteration": 2.736642599105835 + }, + { + "auxiliary_loss_clip": 0.01283195, + "auxiliary_loss_mlp": 0.01089702, + "balance_loss_clip": 1.06424522, + "balance_loss_mlp": 1.04974341, + "epoch": 0.026574477679242446, + "flos": 20667812359680.0, + "grad_norm": 2.0235620507561767, + "language_loss": 0.78444421, + "learning_rate": 3.92190316797534e-06, + "loss": 0.80817318, + "num_input_tokens_seen": 9314205, + "step": 442, + "time_per_iteration": 2.638272285461426 + }, + { + "auxiliary_loss_clip": 0.01132537, + "auxiliary_loss_mlp": 0.01011278, + "balance_loss_clip": 1.02320981, + "balance_loss_mlp": 1.00069237, + "epoch": 0.026634600931910415, + "flos": 57956125340160.0, + "grad_norm": 0.9608707022631889, + "language_loss": 0.64485157, + "learning_rate": 3.92335820177765e-06, + "loss": 0.66628969, + "num_input_tokens_seen": 9367395, + "step": 443, + "time_per_iteration": 3.0053956508636475 + }, + { + "auxiliary_loss_clip": 0.01280444, + "auxiliary_loss_mlp": 0.01088329, + "balance_loss_clip": 1.06766605, + "balance_loss_mlp": 1.05080199, + "epoch": 0.026694724184578387, + "flos": 15815131827840.0, + "grad_norm": 2.0110782329160237, + "language_loss": 0.82476938, + "learning_rate": 3.924809954779425e-06, + "loss": 0.8484571, + "num_input_tokens_seen": 9385185, + "step": 444, + "time_per_iteration": 2.694033622741699 + }, + { + "auxiliary_loss_clip": 0.01283199, + "auxiliary_loss_mlp": 0.01084706, + "balance_loss_clip": 1.06597662, + "balance_loss_mlp": 1.0440079, + "epoch": 0.026754847437246355, + "flos": 23440259612160.0, + "grad_norm": 2.567615202221613, + "language_loss": 0.95764697, + "learning_rate": 3.9262584417424425e-06, + "loss": 0.98132604, + "num_input_tokens_seen": 9403225, + "step": 445, + "time_per_iteration": 2.655694007873535 + }, + { + "auxiliary_loss_clip": 0.01280279, + "auxiliary_loss_mlp": 0.01097564, + "balance_loss_clip": 1.06684279, + "balance_loss_mlp": 1.05786753, + "epoch": 0.026814970689914324, + "flos": 17341801632000.0, + "grad_norm": 4.245890387255104, + "language_loss": 0.91438961, + "learning_rate": 3.9277036773290725e-06, + "loss": 0.93816805, + "num_input_tokens_seen": 9420540, + "step": 446, + "time_per_iteration": 2.59576153755188 + }, + { + "auxiliary_loss_clip": 0.01278052, + "auxiliary_loss_mlp": 0.01083557, + "balance_loss_clip": 1.06712914, + "balance_loss_mlp": 1.04531443, + "epoch": 0.026875093942582293, + "flos": 17894718662400.0, + "grad_norm": 2.289675733209683, + "language_loss": 0.80086648, + "learning_rate": 3.92914567610317e-06, + "loss": 0.82448256, + "num_input_tokens_seen": 9438840, + "step": 447, + "time_per_iteration": 2.633049488067627 + }, + { + "auxiliary_loss_clip": 0.01275234, + "auxiliary_loss_mlp": 0.01073763, + "balance_loss_clip": 1.06348968, + "balance_loss_mlp": 1.03831017, + "epoch": 0.026935217195250265, + "flos": 21723980889600.0, + "grad_norm": 7.170857506455217, + "language_loss": 0.86774552, + "learning_rate": 3.930584452530952e-06, + "loss": 0.89123553, + "num_input_tokens_seen": 9457215, + "step": 448, + "time_per_iteration": 2.6964170932769775 + }, + { + "auxiliary_loss_clip": 0.01269457, + "auxiliary_loss_mlp": 0.01091229, + "balance_loss_clip": 1.06269264, + "balance_loss_mlp": 1.05632436, + "epoch": 0.026995340447918233, + "flos": 23622685810560.0, + "grad_norm": 1.9942662554582777, + "language_loss": 0.88357753, + "learning_rate": 3.9320200209818755e-06, + "loss": 0.90718436, + "num_input_tokens_seen": 9475615, + "step": 449, + "time_per_iteration": 2.62984299659729 + }, + { + "auxiliary_loss_clip": 0.01281563, + "auxiliary_loss_mlp": 0.0108841, + "balance_loss_clip": 1.06473255, + "balance_loss_mlp": 1.04971492, + "epoch": 0.027055463700586202, + "flos": 17931275729280.0, + "grad_norm": 2.277517411607817, + "language_loss": 0.80505192, + "learning_rate": 3.933452395729493e-06, + "loss": 0.82875168, + "num_input_tokens_seen": 9493975, + "step": 450, + "time_per_iteration": 2.5953311920166016 + }, + { + "auxiliary_loss_clip": 0.01275693, + "auxiliary_loss_mlp": 0.01078431, + "balance_loss_clip": 1.06863284, + "balance_loss_mlp": 1.04138112, + "epoch": 0.02711558695325417, + "flos": 25118903859840.0, + "grad_norm": 1.564862876982506, + "language_loss": 0.81655622, + "learning_rate": 3.934881590952304e-06, + "loss": 0.84009749, + "num_input_tokens_seen": 9514810, + "step": 451, + "time_per_iteration": 2.7317698001861572 + }, + { + "auxiliary_loss_clip": 0.01275748, + "auxiliary_loss_mlp": 0.01088494, + "balance_loss_clip": 1.06913185, + "balance_loss_mlp": 1.04970276, + "epoch": 0.02717571020592214, + "flos": 24239559006720.0, + "grad_norm": 1.7426127878597277, + "language_loss": 0.77263248, + "learning_rate": 3.936307620734599e-06, + "loss": 0.7962749, + "num_input_tokens_seen": 9533635, + "step": 452, + "time_per_iteration": 2.720168352127075 + }, + { + "auxiliary_loss_clip": 0.01275236, + "auxiliary_loss_mlp": 0.01084523, + "balance_loss_clip": 1.06638658, + "balance_loss_mlp": 1.04654253, + "epoch": 0.02723583345859011, + "flos": 25118939773440.0, + "grad_norm": 1.8071995569295172, + "language_loss": 0.73014039, + "learning_rate": 3.937730499067294e-06, + "loss": 0.75373793, + "num_input_tokens_seen": 9555420, + "step": 453, + "time_per_iteration": 2.6846446990966797 + }, + { + "auxiliary_loss_clip": 0.01270443, + "auxiliary_loss_mlp": 0.01083362, + "balance_loss_clip": 1.06390619, + "balance_loss_mlp": 1.04662228, + "epoch": 0.02729595671125808, + "flos": 42741597847680.0, + "grad_norm": 1.8739457553121481, + "language_loss": 0.82189643, + "learning_rate": 3.939150239848748e-06, + "loss": 0.84543443, + "num_input_tokens_seen": 9578950, + "step": 454, + "time_per_iteration": 2.798614978790283 + }, + { + "auxiliary_loss_clip": 0.0127505, + "auxiliary_loss_mlp": 0.01079711, + "balance_loss_clip": 1.06691146, + "balance_loss_mlp": 1.04566467, + "epoch": 0.02735607996392605, + "flos": 21430985650560.0, + "grad_norm": 2.04723265596025, + "language_loss": 0.75219887, + "learning_rate": 3.9405668568855866e-06, + "loss": 0.77574646, + "num_input_tokens_seen": 9598160, + "step": 455, + "time_per_iteration": 2.657407283782959 + }, + { + "auxiliary_loss_clip": 0.0127481, + "auxiliary_loss_mlp": 0.01088756, + "balance_loss_clip": 1.0638144, + "balance_loss_mlp": 1.05287433, + "epoch": 0.027416203216594017, + "flos": 20851280052480.0, + "grad_norm": 2.9991489435344283, + "language_loss": 0.81008023, + "learning_rate": 3.941980363893499e-06, + "loss": 0.83371592, + "num_input_tokens_seen": 9616010, + "step": 456, + "time_per_iteration": 2.742621421813965 + }, + { + "auxiliary_loss_clip": 0.01267993, + "auxiliary_loss_mlp": 0.01076843, + "balance_loss_clip": 1.06247675, + "balance_loss_mlp": 1.03924465, + "epoch": 0.027476326469261986, + "flos": 13224500242560.0, + "grad_norm": 1.814794105849659, + "language_loss": 0.81642759, + "learning_rate": 3.9433907744980384e-06, + "loss": 0.839876, + "num_input_tokens_seen": 9634000, + "step": 457, + "time_per_iteration": 2.615199327468872 + }, + { + "auxiliary_loss_clip": 0.0127448, + "auxiliary_loss_mlp": 0.01080659, + "balance_loss_clip": 1.06267738, + "balance_loss_mlp": 1.04377532, + "epoch": 0.027536449721929958, + "flos": 24024526237440.0, + "grad_norm": 1.9658374961635177, + "language_loss": 0.93994415, + "learning_rate": 3.944798102235412e-06, + "loss": 0.96349549, + "num_input_tokens_seen": 9653455, + "step": 458, + "time_per_iteration": 2.7362289428710938 + }, + { + "auxiliary_loss_clip": 0.01270526, + "auxiliary_loss_mlp": 0.01096062, + "balance_loss_clip": 1.06231904, + "balance_loss_mlp": 1.06051421, + "epoch": 0.027596572974597926, + "flos": 13006055681280.0, + "grad_norm": 2.246368490996435, + "language_loss": 0.78660858, + "learning_rate": 3.9462023605532545e-06, + "loss": 0.81027436, + "num_input_tokens_seen": 9669650, + "step": 459, + "time_per_iteration": 2.5900771617889404 + }, + { + "auxiliary_loss_clip": 0.01276518, + "auxiliary_loss_mlp": 0.01089443, + "balance_loss_clip": 1.06842303, + "balance_loss_mlp": 1.04900682, + "epoch": 0.027656696227265895, + "flos": 26143076350080.0, + "grad_norm": 2.5565023850354875, + "language_loss": 0.83485758, + "learning_rate": 3.947603562811407e-06, + "loss": 0.85851723, + "num_input_tokens_seen": 9691415, + "step": 460, + "time_per_iteration": 5.861920118331909 + }, + { + "auxiliary_loss_clip": 0.01151325, + "auxiliary_loss_mlp": 0.01016937, + "balance_loss_clip": 1.03433943, + "balance_loss_mlp": 1.00797236, + "epoch": 0.027716819479933864, + "flos": 60697222997760.0, + "grad_norm": 1.5986115412229147, + "language_loss": 0.73674506, + "learning_rate": 3.949001722282675e-06, + "loss": 0.75842768, + "num_input_tokens_seen": 9755605, + "step": 461, + "time_per_iteration": 4.725020885467529 + }, + { + "auxiliary_loss_clip": 0.01270655, + "auxiliary_loss_mlp": 0.01083532, + "balance_loss_clip": 1.0681541, + "balance_loss_mlp": 1.04953337, + "epoch": 0.027776942732601832, + "flos": 31211938886400.0, + "grad_norm": 260.7543556214416, + "language_loss": 0.81368768, + "learning_rate": 3.950396852153582e-06, + "loss": 0.83722961, + "num_input_tokens_seen": 9776270, + "step": 462, + "time_per_iteration": 2.774601936340332 + }, + { + "auxiliary_loss_clip": 0.01271733, + "auxiliary_loss_mlp": 0.01077119, + "balance_loss_clip": 1.06532252, + "balance_loss_mlp": 1.04335856, + "epoch": 0.027837065985269804, + "flos": 22674644196480.0, + "grad_norm": 2.3572661668731225, + "language_loss": 0.90454018, + "learning_rate": 3.951788965525118e-06, + "loss": 0.9280287, + "num_input_tokens_seen": 9794465, + "step": 463, + "time_per_iteration": 4.202226638793945 + }, + { + "auxiliary_loss_clip": 0.01154375, + "auxiliary_loss_mlp": 0.01008948, + "balance_loss_clip": 1.03854811, + "balance_loss_mlp": 0.99993604, + "epoch": 0.027897189237937773, + "flos": 62182487399040.0, + "grad_norm": 0.8877255674017761, + "language_loss": 0.59106553, + "learning_rate": 3.953178075413476e-06, + "loss": 0.61269879, + "num_input_tokens_seen": 9849685, + "step": 464, + "time_per_iteration": 3.1065964698791504 + }, + { + "auxiliary_loss_clip": 0.01281097, + "auxiliary_loss_mlp": 0.01097273, + "balance_loss_clip": 1.0689702, + "balance_loss_mlp": 1.05976963, + "epoch": 0.02795731249060574, + "flos": 24493160004480.0, + "grad_norm": 2.896965894700881, + "language_loss": 0.81411994, + "learning_rate": 3.954564194750784e-06, + "loss": 0.83790368, + "num_input_tokens_seen": 9869505, + "step": 465, + "time_per_iteration": 2.7152411937713623 + }, + { + "auxiliary_loss_clip": 0.01271244, + "auxiliary_loss_mlp": 0.01085676, + "balance_loss_clip": 1.06413603, + "balance_loss_mlp": 1.04895973, + "epoch": 0.02801743574327371, + "flos": 23733003456000.0, + "grad_norm": 2.1722927451178253, + "language_loss": 0.78715897, + "learning_rate": 3.955947336385828e-06, + "loss": 0.81072813, + "num_input_tokens_seen": 9890950, + "step": 466, + "time_per_iteration": 2.694983959197998 + }, + { + "auxiliary_loss_clip": 0.0126972, + "auxiliary_loss_mlp": 0.01086171, + "balance_loss_clip": 1.06548572, + "balance_loss_mlp": 1.0503608, + "epoch": 0.02807755899594168, + "flos": 20629100476800.0, + "grad_norm": 1.7985450996859609, + "language_loss": 0.87600017, + "learning_rate": 3.957327513084761e-06, + "loss": 0.89955908, + "num_input_tokens_seen": 9911265, + "step": 467, + "time_per_iteration": 2.618959903717041 + }, + { + "auxiliary_loss_clip": 0.01275413, + "auxiliary_loss_mlp": 0.01101326, + "balance_loss_clip": 1.0659641, + "balance_loss_mlp": 1.0627737, + "epoch": 0.02813768224860965, + "flos": 19244564789760.0, + "grad_norm": 2.1421734622112467, + "language_loss": 0.86494744, + "learning_rate": 3.958704737531818e-06, + "loss": 0.88871485, + "num_input_tokens_seen": 9929025, + "step": 468, + "time_per_iteration": 2.6226987838745117 + }, + { + "auxiliary_loss_clip": 0.01270026, + "auxiliary_loss_mlp": 0.01077445, + "balance_loss_clip": 1.06231594, + "balance_loss_mlp": 1.03913105, + "epoch": 0.02819780550127762, + "flos": 20813968800000.0, + "grad_norm": 2.5718302365339136, + "language_loss": 0.91677552, + "learning_rate": 3.9600790223300065e-06, + "loss": 0.94025022, + "num_input_tokens_seen": 9945190, + "step": 469, + "time_per_iteration": 2.6723246574401855 + }, + { + "auxiliary_loss_clip": 0.0126856, + "auxiliary_loss_mlp": 0.01088807, + "balance_loss_clip": 1.06509233, + "balance_loss_mlp": 1.05094647, + "epoch": 0.028257928753945588, + "flos": 19974125928960.0, + "grad_norm": 3.0321034646196217, + "language_loss": 0.81761837, + "learning_rate": 3.96145038000181e-06, + "loss": 0.84119207, + "num_input_tokens_seen": 9962820, + "step": 470, + "time_per_iteration": 2.609459638595581 + }, + { + "auxiliary_loss_clip": 0.01271183, + "auxiliary_loss_mlp": 0.01081264, + "balance_loss_clip": 1.06192613, + "balance_loss_mlp": 1.04268754, + "epoch": 0.028318052006613557, + "flos": 20484488321280.0, + "grad_norm": 1.7662029803655868, + "language_loss": 0.93235213, + "learning_rate": 3.962818822989861e-06, + "loss": 0.95587665, + "num_input_tokens_seen": 9982595, + "step": 471, + "time_per_iteration": 2.670048236846924 + }, + { + "auxiliary_loss_clip": 0.01266473, + "auxiliary_loss_mlp": 0.0109075, + "balance_loss_clip": 1.06195545, + "balance_loss_mlp": 1.05372357, + "epoch": 0.02837817525928153, + "flos": 28514832410880.0, + "grad_norm": 1.924905588944489, + "language_loss": 0.76255387, + "learning_rate": 3.964184363657625e-06, + "loss": 0.78612614, + "num_input_tokens_seen": 10004645, + "step": 472, + "time_per_iteration": 2.626009464263916 + }, + { + "auxiliary_loss_clip": 0.01271718, + "auxiliary_loss_mlp": 0.01077018, + "balance_loss_clip": 1.06066525, + "balance_loss_mlp": 1.04080188, + "epoch": 0.028438298511949497, + "flos": 18551668458240.0, + "grad_norm": 1.8187583886357026, + "language_loss": 0.93360382, + "learning_rate": 3.965547014290071e-06, + "loss": 0.95709115, + "num_input_tokens_seen": 10022555, + "step": 473, + "time_per_iteration": 2.64597749710083 + }, + { + "auxiliary_loss_clip": 0.01277787, + "auxiliary_loss_mlp": 0.01108142, + "balance_loss_clip": 1.06571722, + "balance_loss_mlp": 1.07242668, + "epoch": 0.028498421764617466, + "flos": 16910227722240.0, + "grad_norm": 5.0714105513935275, + "language_loss": 0.88334572, + "learning_rate": 3.96690678709433e-06, + "loss": 0.90720499, + "num_input_tokens_seen": 10041025, + "step": 474, + "time_per_iteration": 2.6212172508239746 + }, + { + "auxiliary_loss_clip": 0.01269024, + "auxiliary_loss_mlp": 0.01085573, + "balance_loss_clip": 1.06431746, + "balance_loss_mlp": 1.04797482, + "epoch": 0.028558545017285435, + "flos": 27778699082880.0, + "grad_norm": 2.4306488360725282, + "language_loss": 0.78951001, + "learning_rate": 3.968263694200355e-06, + "loss": 0.81305599, + "num_input_tokens_seen": 10060775, + "step": 475, + "time_per_iteration": 2.687919855117798 + }, + { + "auxiliary_loss_clip": 0.01143236, + "auxiliary_loss_mlp": 0.01042137, + "balance_loss_clip": 1.0339787, + "balance_loss_mlp": 1.03269613, + "epoch": 0.028618668269953403, + "flos": 65654367258240.0, + "grad_norm": 0.9254283621412792, + "language_loss": 0.66993898, + "learning_rate": 3.969617747661569e-06, + "loss": 0.69179273, + "num_input_tokens_seen": 10120225, + "step": 476, + "time_per_iteration": 3.104609966278076 + }, + { + "auxiliary_loss_clip": 0.01270351, + "auxiliary_loss_mlp": 0.01081805, + "balance_loss_clip": 1.06341588, + "balance_loss_mlp": 1.04361022, + "epoch": 0.028678791522621375, + "flos": 21937074324480.0, + "grad_norm": 2.1237611969336387, + "language_loss": 0.83930516, + "learning_rate": 3.970968959455509e-06, + "loss": 0.8628267, + "num_input_tokens_seen": 10137880, + "step": 477, + "time_per_iteration": 2.6136069297790527 + }, + { + "auxiliary_loss_clip": 0.01276043, + "auxiliary_loss_mlp": 0.01085272, + "balance_loss_clip": 1.06727588, + "balance_loss_mlp": 1.04824555, + "epoch": 0.028738914775289344, + "flos": 24572128055040.0, + "grad_norm": 2.1144590238497205, + "language_loss": 0.82247663, + "learning_rate": 3.97231734148446e-06, + "loss": 0.84608972, + "num_input_tokens_seen": 10156930, + "step": 478, + "time_per_iteration": 2.616790294647217 + }, + { + "auxiliary_loss_clip": 0.01268871, + "auxiliary_loss_mlp": 0.01079391, + "balance_loss_clip": 1.06317449, + "balance_loss_mlp": 1.04305577, + "epoch": 0.028799038027957313, + "flos": 23257977068160.0, + "grad_norm": 1.655588896604934, + "language_loss": 0.8116762, + "learning_rate": 3.973662905576082e-06, + "loss": 0.83515882, + "num_input_tokens_seen": 10176295, + "step": 479, + "time_per_iteration": 2.575855255126953 + }, + { + "auxiliary_loss_clip": 0.01265895, + "auxiliary_loss_mlp": 0.0108464, + "balance_loss_clip": 1.06137037, + "balance_loss_mlp": 1.04556346, + "epoch": 0.02885916128062528, + "flos": 22164102236160.0, + "grad_norm": 2.386208864486978, + "language_loss": 0.73494834, + "learning_rate": 3.975005663484038e-06, + "loss": 0.75845367, + "num_input_tokens_seen": 10195790, + "step": 480, + "time_per_iteration": 2.647947311401367 + }, + { + "auxiliary_loss_clip": 0.01264398, + "auxiliary_loss_mlp": 0.01074923, + "balance_loss_clip": 1.06303096, + "balance_loss_mlp": 1.0417347, + "epoch": 0.02891928453329325, + "flos": 22932842135040.0, + "grad_norm": 1.7032782450041153, + "language_loss": 0.87748379, + "learning_rate": 3.976345626888605e-06, + "loss": 0.900877, + "num_input_tokens_seen": 10218405, + "step": 481, + "time_per_iteration": 2.7394602298736572 + }, + { + "auxiliary_loss_clip": 0.01137168, + "auxiliary_loss_mlp": 0.01018552, + "balance_loss_clip": 1.02862203, + "balance_loss_mlp": 1.00973034, + "epoch": 0.028979407785961222, + "flos": 57432941792640.0, + "grad_norm": 0.8283484721366327, + "language_loss": 0.66087043, + "learning_rate": 3.9776828073972864e-06, + "loss": 0.68242764, + "num_input_tokens_seen": 10271005, + "step": 482, + "time_per_iteration": 2.9428391456604004 + }, + { + "auxiliary_loss_clip": 0.01280916, + "auxiliary_loss_mlp": 0.01081355, + "balance_loss_clip": 1.06767929, + "balance_loss_mlp": 1.04580653, + "epoch": 0.02903953103862919, + "flos": 16722737706240.0, + "grad_norm": 2.5338679230832617, + "language_loss": 0.78922659, + "learning_rate": 3.979017216545415e-06, + "loss": 0.81284928, + "num_input_tokens_seen": 10288405, + "step": 483, + "time_per_iteration": 2.619147777557373 + }, + { + "auxiliary_loss_clip": 0.01277068, + "auxiliary_loss_mlp": 0.01098387, + "balance_loss_clip": 1.06808281, + "balance_loss_mlp": 1.06064487, + "epoch": 0.02909965429129716, + "flos": 16763640318720.0, + "grad_norm": 2.0019116602087204, + "language_loss": 0.75944865, + "learning_rate": 3.980348865796749e-06, + "loss": 0.78320318, + "num_input_tokens_seen": 10306875, + "step": 484, + "time_per_iteration": 2.6591296195983887 + }, + { + "auxiliary_loss_clip": 0.01271841, + "auxiliary_loss_mlp": 0.01082957, + "balance_loss_clip": 1.06526279, + "balance_loss_mlp": 1.04810011, + "epoch": 0.029159777543965128, + "flos": 19785343023360.0, + "grad_norm": 2.781026216149872, + "language_loss": 0.84114796, + "learning_rate": 3.9816777665440615e-06, + "loss": 0.86469597, + "num_input_tokens_seen": 10323965, + "step": 485, + "time_per_iteration": 2.6842541694641113 + }, + { + "auxiliary_loss_clip": 0.01275578, + "auxiliary_loss_mlp": 0.01085018, + "balance_loss_clip": 1.06959307, + "balance_loss_mlp": 1.04880202, + "epoch": 0.029219900796633096, + "flos": 19642670202240.0, + "grad_norm": 2.199672628314003, + "language_loss": 0.84416658, + "learning_rate": 3.983003930109732e-06, + "loss": 0.86777252, + "num_input_tokens_seen": 10342620, + "step": 486, + "time_per_iteration": 2.609041690826416 + }, + { + "auxiliary_loss_clip": 0.01270479, + "auxiliary_loss_mlp": 0.01096639, + "balance_loss_clip": 1.06490135, + "balance_loss_mlp": 1.06016064, + "epoch": 0.02928002404930107, + "flos": 25885704424320.0, + "grad_norm": 2.3726514224549073, + "language_loss": 0.88979071, + "learning_rate": 3.984327367746315e-06, + "loss": 0.91346186, + "num_input_tokens_seen": 10364610, + "step": 487, + "time_per_iteration": 2.688138484954834 + }, + { + "auxiliary_loss_clip": 0.01273475, + "auxiliary_loss_mlp": 0.01067963, + "balance_loss_clip": 1.06727755, + "balance_loss_mlp": 1.03444099, + "epoch": 0.029340147301969037, + "flos": 20660234590080.0, + "grad_norm": 3.872678802639815, + "language_loss": 0.88720596, + "learning_rate": 3.985648090637122e-06, + "loss": 0.91062027, + "num_input_tokens_seen": 10380910, + "step": 488, + "time_per_iteration": 2.6452624797821045 + }, + { + "auxiliary_loss_clip": 0.01267843, + "auxiliary_loss_mlp": 0.01080727, + "balance_loss_clip": 1.06400836, + "balance_loss_mlp": 1.04534614, + "epoch": 0.029400270554637006, + "flos": 24428018689920.0, + "grad_norm": 2.1516107078532207, + "language_loss": 0.89237493, + "learning_rate": 3.986966109896785e-06, + "loss": 0.91586065, + "num_input_tokens_seen": 10400665, + "step": 489, + "time_per_iteration": 2.768216609954834 + }, + { + "auxiliary_loss_clip": 0.0126052, + "auxiliary_loss_mlp": 0.01076143, + "balance_loss_clip": 1.0583322, + "balance_loss_mlp": 1.04071391, + "epoch": 0.029460393807304974, + "flos": 20120892900480.0, + "grad_norm": 1.8856935895877105, + "language_loss": 0.88565737, + "learning_rate": 3.988281436571815e-06, + "loss": 0.909024, + "num_input_tokens_seen": 10420150, + "step": 490, + "time_per_iteration": 2.7038731575012207 + }, + { + "auxiliary_loss_clip": 0.01269209, + "auxiliary_loss_mlp": 0.0108229, + "balance_loss_clip": 1.06337965, + "balance_loss_mlp": 1.04712319, + "epoch": 0.029520517059972943, + "flos": 17675914965120.0, + "grad_norm": 2.9469159354824987, + "language_loss": 0.91676539, + "learning_rate": 3.989594081641164e-06, + "loss": 0.94028038, + "num_input_tokens_seen": 10438210, + "step": 491, + "time_per_iteration": 2.6024913787841797 + }, + { + "auxiliary_loss_clip": 0.01258523, + "auxiliary_loss_mlp": 0.01075265, + "balance_loss_clip": 1.06132269, + "balance_loss_mlp": 1.0421015, + "epoch": 0.029580640312640915, + "flos": 18953185662720.0, + "grad_norm": 2.0912920046687535, + "language_loss": 0.85439479, + "learning_rate": 3.9909040560167675e-06, + "loss": 0.87773263, + "num_input_tokens_seen": 10455125, + "step": 492, + "time_per_iteration": 2.6153955459594727 + }, + { + "auxiliary_loss_clip": 0.01270715, + "auxiliary_loss_mlp": 0.01097918, + "balance_loss_clip": 1.06691837, + "balance_loss_mlp": 1.0617975, + "epoch": 0.029640763565308884, + "flos": 18726121837440.0, + "grad_norm": 6.598729091385164, + "language_loss": 0.8431592, + "learning_rate": 3.992211370544093e-06, + "loss": 0.86684549, + "num_input_tokens_seen": 10470990, + "step": 493, + "time_per_iteration": 2.775865316390991 + }, + { + "auxiliary_loss_clip": 0.01265148, + "auxiliary_loss_mlp": 0.01080246, + "balance_loss_clip": 1.06185246, + "balance_loss_mlp": 1.04589009, + "epoch": 0.029700886817976852, + "flos": 20595308757120.0, + "grad_norm": 1.6817069811359948, + "language_loss": 0.8670215, + "learning_rate": 3.99351603600268e-06, + "loss": 0.89047539, + "num_input_tokens_seen": 10490685, + "step": 494, + "time_per_iteration": 2.575991630554199 + }, + { + "auxiliary_loss_clip": 0.01269799, + "auxiliary_loss_mlp": 0.01081617, + "balance_loss_clip": 1.06418931, + "balance_loss_mlp": 1.04921615, + "epoch": 0.02976101007064482, + "flos": 22236857233920.0, + "grad_norm": 2.441634530040442, + "language_loss": 0.86705279, + "learning_rate": 3.994818063106668e-06, + "loss": 0.89056695, + "num_input_tokens_seen": 10509435, + "step": 495, + "time_per_iteration": 2.595148801803589 + }, + { + "auxiliary_loss_clip": 0.01257314, + "auxiliary_loss_mlp": 0.01078513, + "balance_loss_clip": 1.06103146, + "balance_loss_mlp": 1.04520547, + "epoch": 0.029821133323312793, + "flos": 23732644320000.0, + "grad_norm": 2.0519455958038506, + "language_loss": 0.6225782, + "learning_rate": 3.99611746250533e-06, + "loss": 0.64593643, + "num_input_tokens_seen": 10530050, + "step": 496, + "time_per_iteration": 2.650268793106079 + }, + { + "auxiliary_loss_clip": 0.01264118, + "auxiliary_loss_mlp": 0.01086023, + "balance_loss_clip": 1.06672692, + "balance_loss_mlp": 1.05282354, + "epoch": 0.02988125657598076, + "flos": 22419498913920.0, + "grad_norm": 1.7997462864686877, + "language_loss": 0.88830537, + "learning_rate": 3.997414244783595e-06, + "loss": 0.91180676, + "num_input_tokens_seen": 10551370, + "step": 497, + "time_per_iteration": 2.7747745513916016 + }, + { + "auxiliary_loss_clip": 0.01267715, + "auxiliary_loss_mlp": 0.0108308, + "balance_loss_clip": 1.06499696, + "balance_loss_mlp": 1.04872417, + "epoch": 0.02994137982864873, + "flos": 13845108453120.0, + "grad_norm": 3.0819365764708326, + "language_loss": 0.85022694, + "learning_rate": 3.998708420462557e-06, + "loss": 0.87373489, + "num_input_tokens_seen": 10569225, + "step": 498, + "time_per_iteration": 2.6005992889404297 + }, + { + "auxiliary_loss_clip": 0.01265875, + "auxiliary_loss_mlp": 0.01080946, + "balance_loss_clip": 1.06412971, + "balance_loss_mlp": 1.04802036, + "epoch": 0.0300015030813167, + "flos": 23908354675200.0, + "grad_norm": 3.8803754633516108, + "language_loss": 0.78637695, + "learning_rate": 4e-06, + "loss": 0.80984515, + "num_input_tokens_seen": 10586170, + "step": 499, + "time_per_iteration": 2.6115381717681885 + }, + { + "auxiliary_loss_clip": 0.01266412, + "auxiliary_loss_mlp": 0.01081511, + "balance_loss_clip": 1.06572294, + "balance_loss_mlp": 1.04782271, + "epoch": 0.030061626333984667, + "flos": 22016796560640.0, + "grad_norm": 1.8372929393533073, + "language_loss": 0.82642961, + "learning_rate": 3.9999999620799e-06, + "loss": 0.84990883, + "num_input_tokens_seen": 10606205, + "step": 500, + "time_per_iteration": 2.617170572280884 + }, + { + "auxiliary_loss_clip": 0.01257615, + "auxiliary_loss_mlp": 0.01089749, + "balance_loss_clip": 1.06004298, + "balance_loss_mlp": 1.05286539, + "epoch": 0.03012174958665264, + "flos": 23039747988480.0, + "grad_norm": 2.9246416863410993, + "language_loss": 0.88214296, + "learning_rate": 3.9999998483196e-06, + "loss": 0.90561658, + "num_input_tokens_seen": 10625995, + "step": 501, + "time_per_iteration": 2.747514009475708 + }, + { + "auxiliary_loss_clip": 0.0126562, + "auxiliary_loss_mlp": 0.01071334, + "balance_loss_clip": 1.06306875, + "balance_loss_mlp": 1.03886092, + "epoch": 0.030181872839320608, + "flos": 18953257489920.0, + "grad_norm": 2.350459058611249, + "language_loss": 0.86726499, + "learning_rate": 3.9999996587191065e-06, + "loss": 0.89063442, + "num_input_tokens_seen": 10644105, + "step": 502, + "time_per_iteration": 2.6408181190490723 + }, + { + "auxiliary_loss_clip": 0.01260782, + "auxiliary_loss_mlp": 0.01081778, + "balance_loss_clip": 1.06371033, + "balance_loss_mlp": 1.0472554, + "epoch": 0.030241996091988577, + "flos": 16728017005440.0, + "grad_norm": 3.019779047351164, + "language_loss": 0.84699571, + "learning_rate": 3.999999393278425e-06, + "loss": 0.87042129, + "num_input_tokens_seen": 10661090, + "step": 503, + "time_per_iteration": 2.5664048194885254 + }, + { + "auxiliary_loss_clip": 0.01255215, + "auxiliary_loss_mlp": 0.01087116, + "balance_loss_clip": 1.06140924, + "balance_loss_mlp": 1.0529983, + "epoch": 0.030302119344656545, + "flos": 28621271387520.0, + "grad_norm": 1.888607836924632, + "language_loss": 0.8818984, + "learning_rate": 3.999999051997567e-06, + "loss": 0.90532172, + "num_input_tokens_seen": 10682380, + "step": 504, + "time_per_iteration": 2.7439093589782715 + }, + { + "auxiliary_loss_clip": 0.01257296, + "auxiliary_loss_mlp": 0.01088926, + "balance_loss_clip": 1.06043839, + "balance_loss_mlp": 1.05461764, + "epoch": 0.030362242597324514, + "flos": 15669334523520.0, + "grad_norm": 2.0596188094002317, + "language_loss": 0.77990383, + "learning_rate": 3.9999986348765425e-06, + "loss": 0.80336607, + "num_input_tokens_seen": 10699925, + "step": 505, + "time_per_iteration": 2.6781909465789795 + }, + { + "auxiliary_loss_clip": 0.0114636, + "auxiliary_loss_mlp": 0.01028015, + "balance_loss_clip": 1.03736544, + "balance_loss_mlp": 1.01924157, + "epoch": 0.030422365849992486, + "flos": 72125973676800.0, + "grad_norm": 0.8448125607231747, + "language_loss": 0.55021358, + "learning_rate": 3.999998141915371e-06, + "loss": 0.57195735, + "num_input_tokens_seen": 10766525, + "step": 506, + "time_per_iteration": 3.3490796089172363 + }, + { + "auxiliary_loss_clip": 0.01256736, + "auxiliary_loss_mlp": 0.01087151, + "balance_loss_clip": 1.05920482, + "balance_loss_mlp": 1.0530808, + "epoch": 0.030482489102660455, + "flos": 19427817000960.0, + "grad_norm": 2.5439549611735126, + "language_loss": 0.83408535, + "learning_rate": 3.999997573114069e-06, + "loss": 0.85752416, + "num_input_tokens_seen": 10786725, + "step": 507, + "time_per_iteration": 2.6582911014556885 + }, + { + "auxiliary_loss_clip": 0.01261982, + "auxiliary_loss_mlp": 0.0107766, + "balance_loss_clip": 1.0611614, + "balance_loss_mlp": 1.04323208, + "epoch": 0.030542612355328423, + "flos": 20375822701440.0, + "grad_norm": 2.5096530321145822, + "language_loss": 0.88513291, + "learning_rate": 3.999996928472659e-06, + "loss": 0.9085294, + "num_input_tokens_seen": 10805390, + "step": 508, + "time_per_iteration": 7.456804513931274 + }, + { + "auxiliary_loss_clip": 0.01265093, + "auxiliary_loss_mlp": 0.01066849, + "balance_loss_clip": 1.06251884, + "balance_loss_mlp": 1.0327071, + "epoch": 0.030602735607996392, + "flos": 34677354297600.0, + "grad_norm": 1.9812925741505962, + "language_loss": 0.71851432, + "learning_rate": 3.999996207991165e-06, + "loss": 0.74183375, + "num_input_tokens_seen": 10828030, + "step": 509, + "time_per_iteration": 2.7643797397613525 + }, + { + "auxiliary_loss_clip": 0.01256798, + "auxiliary_loss_mlp": 0.01068198, + "balance_loss_clip": 1.0620178, + "balance_loss_mlp": 1.03596425, + "epoch": 0.03066285886066436, + "flos": 23658668259840.0, + "grad_norm": 3.590747780856493, + "language_loss": 0.82181257, + "learning_rate": 3.999995411669614e-06, + "loss": 0.84506255, + "num_input_tokens_seen": 10845240, + "step": 510, + "time_per_iteration": 2.6370575428009033 + }, + { + "auxiliary_loss_clip": 0.01261166, + "auxiliary_loss_mlp": 0.0108299, + "balance_loss_clip": 1.06508172, + "balance_loss_mlp": 1.0484674, + "epoch": 0.030722982113332332, + "flos": 23002975440000.0, + "grad_norm": 4.086955779693833, + "language_loss": 0.83524418, + "learning_rate": 3.999994539508036e-06, + "loss": 0.85868573, + "num_input_tokens_seen": 10864325, + "step": 511, + "time_per_iteration": 4.2285473346710205 + }, + { + "auxiliary_loss_clip": 0.01261085, + "auxiliary_loss_mlp": 0.0107364, + "balance_loss_clip": 1.06124783, + "balance_loss_mlp": 1.04033279, + "epoch": 0.0307831053660003, + "flos": 24750855152640.0, + "grad_norm": 2.598307631316001, + "language_loss": 0.82324827, + "learning_rate": 3.9999935915064655e-06, + "loss": 0.84659553, + "num_input_tokens_seen": 10883860, + "step": 512, + "time_per_iteration": 2.6438117027282715 + }, + { + "auxiliary_loss_clip": 0.01257891, + "auxiliary_loss_mlp": 0.01087825, + "balance_loss_clip": 1.05993259, + "balance_loss_mlp": 1.0531826, + "epoch": 0.03084322861866827, + "flos": 26140885620480.0, + "grad_norm": 2.4080240000921216, + "language_loss": 0.87167287, + "learning_rate": 3.9999925676649374e-06, + "loss": 0.89512992, + "num_input_tokens_seen": 10904555, + "step": 513, + "time_per_iteration": 2.719515323638916 + }, + { + "auxiliary_loss_clip": 0.0126547, + "auxiliary_loss_mlp": 0.01078613, + "balance_loss_clip": 1.06358361, + "balance_loss_mlp": 1.04394734, + "epoch": 0.03090335187133624, + "flos": 18771298168320.0, + "grad_norm": 1.757883635330411, + "language_loss": 0.79034066, + "learning_rate": 3.999991467983491e-06, + "loss": 0.8137815, + "num_input_tokens_seen": 10923700, + "step": 514, + "time_per_iteration": 2.6607022285461426 + }, + { + "auxiliary_loss_clip": 0.01260577, + "auxiliary_loss_mlp": 0.01064204, + "balance_loss_clip": 1.06452596, + "balance_loss_mlp": 1.03196955, + "epoch": 0.030963475124004207, + "flos": 23221886878080.0, + "grad_norm": 2.983782296415325, + "language_loss": 0.77030945, + "learning_rate": 3.999990292462167e-06, + "loss": 0.79355723, + "num_input_tokens_seen": 10942730, + "step": 515, + "time_per_iteration": 2.611295700073242 + }, + { + "auxiliary_loss_clip": 0.01255772, + "auxiliary_loss_mlp": 0.01071931, + "balance_loss_clip": 1.05823731, + "balance_loss_mlp": 1.03752685, + "epoch": 0.03102359837667218, + "flos": 42525595411200.0, + "grad_norm": 1.85397087483449, + "language_loss": 0.8284896, + "learning_rate": 3.999989041101011e-06, + "loss": 0.85176665, + "num_input_tokens_seen": 10967120, + "step": 516, + "time_per_iteration": 2.8387768268585205 + }, + { + "auxiliary_loss_clip": 0.01255589, + "auxiliary_loss_mlp": 0.01076042, + "balance_loss_clip": 1.06149435, + "balance_loss_mlp": 1.04132879, + "epoch": 0.031083721629340148, + "flos": 21176953689600.0, + "grad_norm": 1.8481819046643853, + "language_loss": 0.78938633, + "learning_rate": 3.999987713900071e-06, + "loss": 0.81270266, + "num_input_tokens_seen": 10986775, + "step": 517, + "time_per_iteration": 2.6865460872650146 + }, + { + "auxiliary_loss_clip": 0.01253816, + "auxiliary_loss_mlp": 0.01075541, + "balance_loss_clip": 1.06294477, + "balance_loss_mlp": 1.04235291, + "epoch": 0.031143844882008116, + "flos": 29716187713920.0, + "grad_norm": 1.6256200589750278, + "language_loss": 0.90711063, + "learning_rate": 3.999986310859396e-06, + "loss": 0.93040419, + "num_input_tokens_seen": 11011360, + "step": 518, + "time_per_iteration": 2.750323534011841 + }, + { + "auxiliary_loss_clip": 0.01265989, + "auxiliary_loss_mlp": 0.01100491, + "balance_loss_clip": 1.06969905, + "balance_loss_mlp": 1.06389427, + "epoch": 0.031203968134676085, + "flos": 23112467072640.0, + "grad_norm": 2.250014307590285, + "language_loss": 0.86358941, + "learning_rate": 3.999984831979039e-06, + "loss": 0.88725412, + "num_input_tokens_seen": 11030150, + "step": 519, + "time_per_iteration": 2.622795581817627 + }, + { + "auxiliary_loss_clip": 0.01260733, + "auxiliary_loss_mlp": 0.01090918, + "balance_loss_clip": 1.05995798, + "balance_loss_mlp": 1.0569911, + "epoch": 0.03126409138734405, + "flos": 20954379064320.0, + "grad_norm": 2.6591500238252994, + "language_loss": 0.87205797, + "learning_rate": 3.999983277259057e-06, + "loss": 0.89557451, + "num_input_tokens_seen": 11049145, + "step": 520, + "time_per_iteration": 2.7533626556396484 + }, + { + "auxiliary_loss_clip": 0.01263484, + "auxiliary_loss_mlp": 0.01094127, + "balance_loss_clip": 1.0618248, + "balance_loss_mlp": 1.05841172, + "epoch": 0.031324214640012026, + "flos": 21650112570240.0, + "grad_norm": 1.8864090535543063, + "language_loss": 0.89320678, + "learning_rate": 3.999981646699509e-06, + "loss": 0.91678286, + "num_input_tokens_seen": 11068835, + "step": 521, + "time_per_iteration": 2.7114946842193604 + }, + { + "auxiliary_loss_clip": 0.01255944, + "auxiliary_loss_mlp": 0.01083814, + "balance_loss_clip": 1.06130397, + "balance_loss_mlp": 1.04817045, + "epoch": 0.03138433789267999, + "flos": 23441337020160.0, + "grad_norm": 2.0796522329265157, + "language_loss": 0.71023804, + "learning_rate": 3.999979940300456e-06, + "loss": 0.73363566, + "num_input_tokens_seen": 11088980, + "step": 522, + "time_per_iteration": 2.603396415710449 + }, + { + "auxiliary_loss_clip": 0.01261048, + "auxiliary_loss_mlp": 0.01082436, + "balance_loss_clip": 1.0613215, + "balance_loss_mlp": 1.04955769, + "epoch": 0.03144446114534796, + "flos": 18982164960000.0, + "grad_norm": 5.28170221339422, + "language_loss": 0.84958905, + "learning_rate": 3.999978158061963e-06, + "loss": 0.87302387, + "num_input_tokens_seen": 11104300, + "step": 523, + "time_per_iteration": 2.5611178874969482 + }, + { + "auxiliary_loss_clip": 0.01263659, + "auxiliary_loss_mlp": 0.01078615, + "balance_loss_clip": 1.06097221, + "balance_loss_mlp": 1.04321003, + "epoch": 0.031504584398015935, + "flos": 22637692080000.0, + "grad_norm": 2.080982571250834, + "language_loss": 0.90400314, + "learning_rate": 3.999976299984099e-06, + "loss": 0.92742586, + "num_input_tokens_seen": 11123335, + "step": 524, + "time_per_iteration": 2.5871100425720215 + }, + { + "auxiliary_loss_clip": 0.01269199, + "auxiliary_loss_mlp": 0.01086307, + "balance_loss_clip": 1.06604052, + "balance_loss_mlp": 1.05035353, + "epoch": 0.0315647076506839, + "flos": 25297056339840.0, + "grad_norm": 2.238371658633788, + "language_loss": 0.80086881, + "learning_rate": 3.999974366066933e-06, + "loss": 0.82442385, + "num_input_tokens_seen": 11140880, + "step": 525, + "time_per_iteration": 2.848668336868286 + }, + { + "auxiliary_loss_clip": 0.01260025, + "auxiliary_loss_mlp": 0.01083921, + "balance_loss_clip": 1.05950069, + "balance_loss_mlp": 1.04977918, + "epoch": 0.03162483090335187, + "flos": 16982839065600.0, + "grad_norm": 2.3120860856113983, + "language_loss": 0.80670166, + "learning_rate": 3.999972356310538e-06, + "loss": 0.83014113, + "num_input_tokens_seen": 11158710, + "step": 526, + "time_per_iteration": 2.8382463455200195 + }, + { + "auxiliary_loss_clip": 0.01269961, + "auxiliary_loss_mlp": 0.01074893, + "balance_loss_clip": 1.06629133, + "balance_loss_mlp": 1.03841507, + "epoch": 0.03168495415601984, + "flos": 18734489706240.0, + "grad_norm": 2.188543694358384, + "language_loss": 0.81371057, + "learning_rate": 3.999970270714991e-06, + "loss": 0.83715904, + "num_input_tokens_seen": 11177550, + "step": 527, + "time_per_iteration": 2.7326009273529053 + }, + { + "auxiliary_loss_clip": 0.0125636, + "auxiliary_loss_mlp": 0.01087782, + "balance_loss_clip": 1.05917442, + "balance_loss_mlp": 1.05325913, + "epoch": 0.03174507740868781, + "flos": 21214875473280.0, + "grad_norm": 2.145726393653074, + "language_loss": 0.94128525, + "learning_rate": 3.999968109280371e-06, + "loss": 0.96472663, + "num_input_tokens_seen": 11196230, + "step": 528, + "time_per_iteration": 2.7258129119873047 + }, + { + "auxiliary_loss_clip": 0.01258047, + "auxiliary_loss_mlp": 0.01075161, + "balance_loss_clip": 1.05988955, + "balance_loss_mlp": 1.04099584, + "epoch": 0.03180520066135578, + "flos": 24787663614720.0, + "grad_norm": 3.065834472901689, + "language_loss": 0.84181619, + "learning_rate": 3.99996587200676e-06, + "loss": 0.86514831, + "num_input_tokens_seen": 11214935, + "step": 529, + "time_per_iteration": 2.766818046569824 + }, + { + "auxiliary_loss_clip": 0.01261816, + "auxiliary_loss_mlp": 0.01086224, + "balance_loss_clip": 1.0673213, + "balance_loss_mlp": 1.05289352, + "epoch": 0.03186532391402375, + "flos": 24864261367680.0, + "grad_norm": 2.0661572222799434, + "language_loss": 0.90775239, + "learning_rate": 3.999963558894243e-06, + "loss": 0.93123281, + "num_input_tokens_seen": 11235310, + "step": 530, + "time_per_iteration": 2.9131886959075928 + }, + { + "auxiliary_loss_clip": 0.01254771, + "auxiliary_loss_mlp": 0.01078423, + "balance_loss_clip": 1.05629182, + "balance_loss_mlp": 1.04280329, + "epoch": 0.03192544716669172, + "flos": 21215055041280.0, + "grad_norm": 2.0778350436201234, + "language_loss": 0.76517683, + "learning_rate": 3.999961169942907e-06, + "loss": 0.78850877, + "num_input_tokens_seen": 11254425, + "step": 531, + "time_per_iteration": 2.612107515335083 + }, + { + "auxiliary_loss_clip": 0.01254115, + "auxiliary_loss_mlp": 0.01067109, + "balance_loss_clip": 1.05854118, + "balance_loss_mlp": 1.03191841, + "epoch": 0.03198557041935969, + "flos": 24353216616960.0, + "grad_norm": 2.2365509882060035, + "language_loss": 0.90497434, + "learning_rate": 3.999958705152843e-06, + "loss": 0.92818666, + "num_input_tokens_seen": 11274595, + "step": 532, + "time_per_iteration": 2.6974568367004395 + }, + { + "auxiliary_loss_clip": 0.01139935, + "auxiliary_loss_mlp": 0.01037876, + "balance_loss_clip": 1.03502345, + "balance_loss_mlp": 1.02838707, + "epoch": 0.032045693672027656, + "flos": 61827367587840.0, + "grad_norm": 0.7386699747196634, + "language_loss": 0.57938039, + "learning_rate": 3.9999561645241445e-06, + "loss": 0.6011585, + "num_input_tokens_seen": 11336705, + "step": 533, + "time_per_iteration": 3.2149643898010254 + }, + { + "auxiliary_loss_clip": 0.01252211, + "auxiliary_loss_mlp": 0.01085955, + "balance_loss_clip": 1.0578208, + "balance_loss_mlp": 1.05260015, + "epoch": 0.03210581692469563, + "flos": 28401174800640.0, + "grad_norm": 1.6177739787148424, + "language_loss": 0.86478806, + "learning_rate": 3.999953548056907e-06, + "loss": 0.88816965, + "num_input_tokens_seen": 11356820, + "step": 534, + "time_per_iteration": 2.624685764312744 + }, + { + "auxiliary_loss_clip": 0.01255481, + "auxiliary_loss_mlp": 0.01070161, + "balance_loss_clip": 1.06137478, + "balance_loss_mlp": 1.03637695, + "epoch": 0.03216594017736359, + "flos": 24717709877760.0, + "grad_norm": 4.272797049761065, + "language_loss": 0.77505982, + "learning_rate": 3.999950855751232e-06, + "loss": 0.7983163, + "num_input_tokens_seen": 11376645, + "step": 535, + "time_per_iteration": 2.6080353260040283 + }, + { + "auxiliary_loss_clip": 0.01255872, + "auxiliary_loss_mlp": 0.01091452, + "balance_loss_clip": 1.06047976, + "balance_loss_mlp": 1.05750108, + "epoch": 0.032226063430031565, + "flos": 31175453646720.0, + "grad_norm": 2.1001559114899186, + "language_loss": 0.80605215, + "learning_rate": 3.999948087607219e-06, + "loss": 0.82952535, + "num_input_tokens_seen": 11397310, + "step": 536, + "time_per_iteration": 2.6674888134002686 + }, + { + "auxiliary_loss_clip": 0.01258376, + "auxiliary_loss_mlp": 0.01081197, + "balance_loss_clip": 1.06302178, + "balance_loss_mlp": 1.04576802, + "epoch": 0.03228618668269954, + "flos": 32198225506560.0, + "grad_norm": 1.9622747037341854, + "language_loss": 0.70301485, + "learning_rate": 3.999945243624975e-06, + "loss": 0.72641057, + "num_input_tokens_seen": 11418475, + "step": 537, + "time_per_iteration": 2.6841211318969727 + }, + { + "auxiliary_loss_clip": 0.01259296, + "auxiliary_loss_mlp": 0.0108956, + "balance_loss_clip": 1.06755197, + "balance_loss_mlp": 1.05553746, + "epoch": 0.0323463099353675, + "flos": 22670154996480.0, + "grad_norm": 2.1826055095997754, + "language_loss": 0.82972717, + "learning_rate": 3.999942323804607e-06, + "loss": 0.85321569, + "num_input_tokens_seen": 11436630, + "step": 538, + "time_per_iteration": 2.6633834838867188 + }, + { + "auxiliary_loss_clip": 0.01263799, + "auxiliary_loss_mlp": 0.01093554, + "balance_loss_clip": 1.06230807, + "balance_loss_mlp": 1.05948424, + "epoch": 0.032406433188035474, + "flos": 26905172232960.0, + "grad_norm": 3.982285534016788, + "language_loss": 0.79411012, + "learning_rate": 3.999939328146225e-06, + "loss": 0.81768364, + "num_input_tokens_seen": 11457275, + "step": 539, + "time_per_iteration": 2.659013509750366 + }, + { + "auxiliary_loss_clip": 0.01257388, + "auxiliary_loss_mlp": 0.01077979, + "balance_loss_clip": 1.06210756, + "balance_loss_mlp": 1.04197776, + "epoch": 0.03246655644070344, + "flos": 31503928544640.0, + "grad_norm": 2.3504152712631745, + "language_loss": 0.77540243, + "learning_rate": 3.999936256649943e-06, + "loss": 0.79875612, + "num_input_tokens_seen": 11476925, + "step": 540, + "time_per_iteration": 2.8029325008392334 + }, + { + "auxiliary_loss_clip": 0.01268047, + "auxiliary_loss_mlp": 0.0108878, + "balance_loss_clip": 1.06736505, + "balance_loss_mlp": 1.05475807, + "epoch": 0.03252667969337141, + "flos": 23218331431680.0, + "grad_norm": 2.1327550728603324, + "language_loss": 0.85208362, + "learning_rate": 3.999933109315878e-06, + "loss": 0.8756519, + "num_input_tokens_seen": 11496830, + "step": 541, + "time_per_iteration": 2.5932223796844482 + }, + { + "auxiliary_loss_clip": 0.01254979, + "auxiliary_loss_mlp": 0.01093292, + "balance_loss_clip": 1.06497478, + "balance_loss_mlp": 1.05736279, + "epoch": 0.032586802946039384, + "flos": 14757454926720.0, + "grad_norm": 2.3929831328445132, + "language_loss": 0.89024442, + "learning_rate": 3.9999298861441496e-06, + "loss": 0.91372716, + "num_input_tokens_seen": 11515605, + "step": 542, + "time_per_iteration": 2.6444177627563477 + }, + { + "auxiliary_loss_clip": 0.01256937, + "auxiliary_loss_mlp": 0.01090539, + "balance_loss_clip": 1.0621438, + "balance_loss_mlp": 1.05575383, + "epoch": 0.03264692619870735, + "flos": 24280677100800.0, + "grad_norm": 2.0426247327618356, + "language_loss": 0.70716631, + "learning_rate": 3.999926587134879e-06, + "loss": 0.73064101, + "num_input_tokens_seen": 11536230, + "step": 543, + "time_per_iteration": 2.806098699569702 + }, + { + "auxiliary_loss_clip": 0.0125486, + "auxiliary_loss_mlp": 0.01093661, + "balance_loss_clip": 1.05710375, + "balance_loss_mlp": 1.05878067, + "epoch": 0.03270704945137532, + "flos": 22893160584960.0, + "grad_norm": 3.498732661234056, + "language_loss": 0.92100644, + "learning_rate": 3.999923212288192e-06, + "loss": 0.94449162, + "num_input_tokens_seen": 11554715, + "step": 544, + "time_per_iteration": 2.752643585205078 + }, + { + "auxiliary_loss_clip": 0.01260325, + "auxiliary_loss_mlp": 0.01089521, + "balance_loss_clip": 1.0640341, + "balance_loss_mlp": 1.05731058, + "epoch": 0.032767172704043286, + "flos": 18041018757120.0, + "grad_norm": 4.6087839393544625, + "language_loss": 0.66295922, + "learning_rate": 3.999919761604216e-06, + "loss": 0.68645769, + "num_input_tokens_seen": 11571370, + "step": 545, + "time_per_iteration": 2.78071665763855 + }, + { + "auxiliary_loss_clip": 0.01257147, + "auxiliary_loss_mlp": 0.0107566, + "balance_loss_clip": 1.06036496, + "balance_loss_mlp": 1.04161406, + "epoch": 0.03282729595671126, + "flos": 22528739151360.0, + "grad_norm": 2.4149116260882155, + "language_loss": 0.92327714, + "learning_rate": 3.999916235083083e-06, + "loss": 0.94660521, + "num_input_tokens_seen": 11588560, + "step": 546, + "time_per_iteration": 2.7103841304779053 + }, + { + "auxiliary_loss_clip": 0.01253579, + "auxiliary_loss_mlp": 0.01077157, + "balance_loss_clip": 1.05663276, + "balance_loss_mlp": 1.04106092, + "epoch": 0.03288741920937923, + "flos": 20410620001920.0, + "grad_norm": 2.2285091138737245, + "language_loss": 0.82076216, + "learning_rate": 3.999912632724925e-06, + "loss": 0.84406948, + "num_input_tokens_seen": 11605685, + "step": 547, + "time_per_iteration": 2.6100785732269287 + }, + { + "auxiliary_loss_clip": 0.01254929, + "auxiliary_loss_mlp": 0.01076751, + "balance_loss_clip": 1.06050026, + "balance_loss_mlp": 1.04153681, + "epoch": 0.032947542462047195, + "flos": 20777986350720.0, + "grad_norm": 2.8665367200476832, + "language_loss": 0.81150007, + "learning_rate": 3.999908954529881e-06, + "loss": 0.83481693, + "num_input_tokens_seen": 11626290, + "step": 548, + "time_per_iteration": 2.677849054336548 + }, + { + "auxiliary_loss_clip": 0.0125723, + "auxiliary_loss_mlp": 0.01084291, + "balance_loss_clip": 1.06200218, + "balance_loss_mlp": 1.04836178, + "epoch": 0.03300766571471517, + "flos": 19901263190400.0, + "grad_norm": 2.6664984956800493, + "language_loss": 0.67648894, + "learning_rate": 3.999905200498087e-06, + "loss": 0.69990414, + "num_input_tokens_seen": 11643950, + "step": 549, + "time_per_iteration": 2.5970118045806885 + }, + { + "auxiliary_loss_clip": 0.01249581, + "auxiliary_loss_mlp": 0.01078145, + "balance_loss_clip": 1.06021476, + "balance_loss_mlp": 1.04359806, + "epoch": 0.03306778896738313, + "flos": 17967760968960.0, + "grad_norm": 2.388956086442457, + "language_loss": 0.86354113, + "learning_rate": 3.999901370629689e-06, + "loss": 0.88681835, + "num_input_tokens_seen": 11662560, + "step": 550, + "time_per_iteration": 2.556403636932373 + }, + { + "auxiliary_loss_clip": 0.01256949, + "auxiliary_loss_mlp": 0.01099175, + "balance_loss_clip": 1.06446409, + "balance_loss_mlp": 1.06450927, + "epoch": 0.033127912220051105, + "flos": 21653380707840.0, + "grad_norm": 1.931024764331954, + "language_loss": 0.81168532, + "learning_rate": 3.99989746492483e-06, + "loss": 0.83524656, + "num_input_tokens_seen": 11682265, + "step": 551, + "time_per_iteration": 2.6169543266296387 + }, + { + "auxiliary_loss_clip": 0.01263928, + "auxiliary_loss_mlp": 0.01085301, + "balance_loss_clip": 1.06546593, + "balance_loss_mlp": 1.05082583, + "epoch": 0.03318803547271908, + "flos": 30188376927360.0, + "grad_norm": 2.758387801451639, + "language_loss": 0.86402822, + "learning_rate": 3.999893483383658e-06, + "loss": 0.88752043, + "num_input_tokens_seen": 11699300, + "step": 552, + "time_per_iteration": 2.717876434326172 + }, + { + "auxiliary_loss_clip": 0.01260847, + "auxiliary_loss_mlp": 0.01083599, + "balance_loss_clip": 1.06493962, + "balance_loss_mlp": 1.04659605, + "epoch": 0.03324815872538704, + "flos": 20376038183040.0, + "grad_norm": 2.701259784744368, + "language_loss": 0.92938083, + "learning_rate": 3.999889426006326e-06, + "loss": 0.95282531, + "num_input_tokens_seen": 11716955, + "step": 553, + "time_per_iteration": 2.8162076473236084 + }, + { + "auxiliary_loss_clip": 0.01254749, + "auxiliary_loss_mlp": 0.01079737, + "balance_loss_clip": 1.06077123, + "balance_loss_mlp": 1.04309201, + "epoch": 0.033308281978055014, + "flos": 24494560634880.0, + "grad_norm": 2.3923281610243783, + "language_loss": 0.78738242, + "learning_rate": 3.999885292792986e-06, + "loss": 0.8107273, + "num_input_tokens_seen": 11736130, + "step": 554, + "time_per_iteration": 2.708688974380493 + }, + { + "auxiliary_loss_clip": 0.01251089, + "auxiliary_loss_mlp": 0.01088966, + "balance_loss_clip": 1.06093216, + "balance_loss_mlp": 1.05279756, + "epoch": 0.03336840523072298, + "flos": 23400326666880.0, + "grad_norm": 2.224174786923328, + "language_loss": 0.82053566, + "learning_rate": 3.999881083743795e-06, + "loss": 0.8439362, + "num_input_tokens_seen": 11754425, + "step": 555, + "time_per_iteration": 2.632736921310425 + }, + { + "auxiliary_loss_clip": 0.01254451, + "auxiliary_loss_mlp": 0.01086277, + "balance_loss_clip": 1.05963016, + "balance_loss_mlp": 1.05113399, + "epoch": 0.03342852848339095, + "flos": 30550571717760.0, + "grad_norm": 2.236312930426662, + "language_loss": 0.88633543, + "learning_rate": 3.999876798858914e-06, + "loss": 0.90974277, + "num_input_tokens_seen": 11772845, + "step": 556, + "time_per_iteration": 5.9376220703125 + }, + { + "auxiliary_loss_clip": 0.01253968, + "auxiliary_loss_mlp": 0.01086327, + "balance_loss_clip": 1.06142902, + "balance_loss_mlp": 1.05011141, + "epoch": 0.03348865173605892, + "flos": 22893304239360.0, + "grad_norm": 4.4547118751442, + "language_loss": 0.83601987, + "learning_rate": 3.999872438138503e-06, + "loss": 0.8594228, + "num_input_tokens_seen": 11792850, + "step": 557, + "time_per_iteration": 2.766984701156616 + }, + { + "auxiliary_loss_clip": 0.0125905, + "auxiliary_loss_mlp": 0.01071212, + "balance_loss_clip": 1.06438184, + "balance_loss_mlp": 1.03790474, + "epoch": 0.03354877498872689, + "flos": 17676022705920.0, + "grad_norm": 2.549003392282023, + "language_loss": 0.9406684, + "learning_rate": 3.999868001582729e-06, + "loss": 0.96397102, + "num_input_tokens_seen": 11809670, + "step": 558, + "time_per_iteration": 2.663086175918579 + }, + { + "auxiliary_loss_clip": 0.01248044, + "auxiliary_loss_mlp": 0.01075128, + "balance_loss_clip": 1.05661702, + "balance_loss_mlp": 1.04053354, + "epoch": 0.03360889824139486, + "flos": 21652985658240.0, + "grad_norm": 2.493371693107777, + "language_loss": 0.7745387, + "learning_rate": 3.99986348919176e-06, + "loss": 0.79777044, + "num_input_tokens_seen": 11829665, + "step": 559, + "time_per_iteration": 4.4272520542144775 + }, + { + "auxiliary_loss_clip": 0.01252268, + "auxiliary_loss_mlp": 0.01081456, + "balance_loss_clip": 1.05971992, + "balance_loss_mlp": 1.04793429, + "epoch": 0.033669021494062826, + "flos": 21795730306560.0, + "grad_norm": 2.1202315480214273, + "language_loss": 0.87552077, + "learning_rate": 3.9998589009657675e-06, + "loss": 0.89885795, + "num_input_tokens_seen": 11848190, + "step": 560, + "time_per_iteration": 2.7010109424591064 + }, + { + "auxiliary_loss_clip": 0.01247953, + "auxiliary_loss_mlp": 0.01071358, + "balance_loss_clip": 1.05814028, + "balance_loss_mlp": 1.03964806, + "epoch": 0.0337291447467308, + "flos": 21866222747520.0, + "grad_norm": 2.721838229757338, + "language_loss": 0.81358856, + "learning_rate": 3.999854236904925e-06, + "loss": 0.83678162, + "num_input_tokens_seen": 11864795, + "step": 561, + "time_per_iteration": 2.626387596130371 + }, + { + "auxiliary_loss_clip": 0.01248502, + "auxiliary_loss_mlp": 0.01075928, + "balance_loss_clip": 1.05919456, + "balance_loss_mlp": 1.0427165, + "epoch": 0.03378926799939877, + "flos": 24245951627520.0, + "grad_norm": 1.6181321432720952, + "language_loss": 0.82075983, + "learning_rate": 3.999849497009409e-06, + "loss": 0.84400415, + "num_input_tokens_seen": 11885275, + "step": 562, + "time_per_iteration": 2.697252035140991 + }, + { + "auxiliary_loss_clip": 0.0125415, + "auxiliary_loss_mlp": 0.01081561, + "balance_loss_clip": 1.06051755, + "balance_loss_mlp": 1.04713297, + "epoch": 0.033849391252066735, + "flos": 16507812677760.0, + "grad_norm": 2.0621570373328266, + "language_loss": 0.84138584, + "learning_rate": 3.999844681279401e-06, + "loss": 0.86474293, + "num_input_tokens_seen": 11903595, + "step": 563, + "time_per_iteration": 2.6713900566101074 + }, + { + "auxiliary_loss_clip": 0.0125279, + "auxiliary_loss_mlp": 0.01083375, + "balance_loss_clip": 1.06190944, + "balance_loss_mlp": 1.04913831, + "epoch": 0.03390951450473471, + "flos": 15669298609920.0, + "grad_norm": 2.1342190206442613, + "language_loss": 0.93947858, + "learning_rate": 3.99983978971508e-06, + "loss": 0.9628402, + "num_input_tokens_seen": 11917815, + "step": 564, + "time_per_iteration": 2.691270351409912 + }, + { + "auxiliary_loss_clip": 0.01251095, + "auxiliary_loss_mlp": 0.01075622, + "balance_loss_clip": 1.05725074, + "balance_loss_mlp": 1.04000223, + "epoch": 0.03396963775740267, + "flos": 22674787850880.0, + "grad_norm": 2.7687914413925805, + "language_loss": 0.94144189, + "learning_rate": 3.999834822316635e-06, + "loss": 0.96470904, + "num_input_tokens_seen": 11936305, + "step": 565, + "time_per_iteration": 2.6650760173797607 + }, + { + "auxiliary_loss_clip": 0.01136977, + "auxiliary_loss_mlp": 0.01078846, + "balance_loss_clip": 1.03154767, + "balance_loss_mlp": 1.06854653, + "epoch": 0.034029761010070644, + "flos": 64392683063040.0, + "grad_norm": 1.11469742003701, + "language_loss": 0.54905689, + "learning_rate": 3.9998297790842535e-06, + "loss": 0.57121515, + "num_input_tokens_seen": 11998940, + "step": 566, + "time_per_iteration": 3.208740472793579 + }, + { + "auxiliary_loss_clip": 0.0125368, + "auxiliary_loss_mlp": 0.01075876, + "balance_loss_clip": 1.06086552, + "balance_loss_mlp": 1.03980339, + "epoch": 0.034089884262738616, + "flos": 25004204755200.0, + "grad_norm": 13.597638719146769, + "language_loss": 0.76824033, + "learning_rate": 3.999824660018126e-06, + "loss": 0.79153597, + "num_input_tokens_seen": 12018860, + "step": 567, + "time_per_iteration": 2.6048452854156494 + }, + { + "auxiliary_loss_clip": 0.01245571, + "auxiliary_loss_mlp": 0.01084357, + "balance_loss_clip": 1.05865431, + "balance_loss_mlp": 1.05133557, + "epoch": 0.03415000751540658, + "flos": 28439096584320.0, + "grad_norm": 3.3182045187689906, + "language_loss": 0.80580199, + "learning_rate": 3.999819465118447e-06, + "loss": 0.8291012, + "num_input_tokens_seen": 12039675, + "step": 568, + "time_per_iteration": 2.621033191680908 + }, + { + "auxiliary_loss_clip": 0.01248578, + "auxiliary_loss_mlp": 0.01085472, + "balance_loss_clip": 1.06139922, + "balance_loss_mlp": 1.0512594, + "epoch": 0.034210130768074554, + "flos": 21468727866240.0, + "grad_norm": 1.762926175294898, + "language_loss": 0.86777806, + "learning_rate": 3.999814194385413e-06, + "loss": 0.89111853, + "num_input_tokens_seen": 12057680, + "step": 569, + "time_per_iteration": 2.7055587768554688 + }, + { + "auxiliary_loss_clip": 0.01250278, + "auxiliary_loss_mlp": 0.01081908, + "balance_loss_clip": 1.06032133, + "balance_loss_mlp": 1.04843402, + "epoch": 0.03427025402074252, + "flos": 18697501676160.0, + "grad_norm": 2.316703689117639, + "language_loss": 0.9615494, + "learning_rate": 3.9998088478192255e-06, + "loss": 0.98487121, + "num_input_tokens_seen": 12076135, + "step": 570, + "time_per_iteration": 2.681990385055542 + }, + { + "auxiliary_loss_clip": 0.01248653, + "auxiliary_loss_mlp": 0.01079602, + "balance_loss_clip": 1.05511987, + "balance_loss_mlp": 1.04341054, + "epoch": 0.03433037727341049, + "flos": 20849987162880.0, + "grad_norm": 2.716689492730247, + "language_loss": 0.79659712, + "learning_rate": 3.9998034254200846e-06, + "loss": 0.81987965, + "num_input_tokens_seen": 12094785, + "step": 571, + "time_per_iteration": 2.6095619201660156 + }, + { + "auxiliary_loss_clip": 0.0125313, + "auxiliary_loss_mlp": 0.0109055, + "balance_loss_clip": 1.06262159, + "balance_loss_mlp": 1.05497837, + "epoch": 0.03439050052607846, + "flos": 25410282986880.0, + "grad_norm": 2.488187891949155, + "language_loss": 0.80722642, + "learning_rate": 3.999797927188199e-06, + "loss": 0.8306632, + "num_input_tokens_seen": 12114590, + "step": 572, + "time_per_iteration": 2.6674177646636963 + }, + { + "auxiliary_loss_clip": 0.01256878, + "auxiliary_loss_mlp": 0.0108407, + "balance_loss_clip": 1.06249881, + "balance_loss_mlp": 1.04969049, + "epoch": 0.03445062377874643, + "flos": 17640147997440.0, + "grad_norm": 2.0583698531688164, + "language_loss": 0.84747624, + "learning_rate": 3.999792353123774e-06, + "loss": 0.87088573, + "num_input_tokens_seen": 12132390, + "step": 573, + "time_per_iteration": 2.5825624465942383 + }, + { + "auxiliary_loss_clip": 0.01252781, + "auxiliary_loss_mlp": 0.01081263, + "balance_loss_clip": 1.05900979, + "balance_loss_mlp": 1.04790807, + "epoch": 0.0345107470314144, + "flos": 16764502245120.0, + "grad_norm": 2.1502006816339136, + "language_loss": 0.77034801, + "learning_rate": 3.999786703227023e-06, + "loss": 0.79368848, + "num_input_tokens_seen": 12149035, + "step": 574, + "time_per_iteration": 2.581970453262329 + }, + { + "auxiliary_loss_clip": 0.01250711, + "auxiliary_loss_mlp": 0.01078168, + "balance_loss_clip": 1.06007504, + "balance_loss_mlp": 1.0456717, + "epoch": 0.03457087028408237, + "flos": 14684448533760.0, + "grad_norm": 3.688870648152385, + "language_loss": 0.83610314, + "learning_rate": 3.9997809774981606e-06, + "loss": 0.85939193, + "num_input_tokens_seen": 12167530, + "step": 575, + "time_per_iteration": 2.607205629348755 + }, + { + "auxiliary_loss_clip": 0.01245052, + "auxiliary_loss_mlp": 0.01078168, + "balance_loss_clip": 1.06004918, + "balance_loss_mlp": 1.04488432, + "epoch": 0.03463099353675034, + "flos": 20011293527040.0, + "grad_norm": 2.349222969048849, + "language_loss": 0.83810997, + "learning_rate": 3.9997751759374025e-06, + "loss": 0.86134207, + "num_input_tokens_seen": 12186340, + "step": 576, + "time_per_iteration": 2.6626126766204834 + }, + { + "auxiliary_loss_clip": 0.01252579, + "auxiliary_loss_mlp": 0.01077725, + "balance_loss_clip": 1.06566572, + "balance_loss_mlp": 1.04549134, + "epoch": 0.03469111678941831, + "flos": 25301150490240.0, + "grad_norm": 4.675709308428823, + "language_loss": 0.86260736, + "learning_rate": 3.99976929854497e-06, + "loss": 0.88591045, + "num_input_tokens_seen": 12204090, + "step": 577, + "time_per_iteration": 2.630817413330078 + }, + { + "auxiliary_loss_clip": 0.01248819, + "auxiliary_loss_mlp": 0.01081486, + "balance_loss_clip": 1.06362963, + "balance_loss_mlp": 1.04808331, + "epoch": 0.034751240042086275, + "flos": 23259413612160.0, + "grad_norm": 1.936668033846405, + "language_loss": 0.71790785, + "learning_rate": 3.9997633453210845e-06, + "loss": 0.74121082, + "num_input_tokens_seen": 12224850, + "step": 578, + "time_per_iteration": 2.7321059703826904 + }, + { + "auxiliary_loss_clip": 0.01246555, + "auxiliary_loss_mlp": 0.01080036, + "balance_loss_clip": 1.0585742, + "balance_loss_mlp": 1.04529834, + "epoch": 0.03481136329475425, + "flos": 23769237300480.0, + "grad_norm": 1.7461103550149655, + "language_loss": 0.77555066, + "learning_rate": 3.999757316265973e-06, + "loss": 0.79881656, + "num_input_tokens_seen": 12244935, + "step": 579, + "time_per_iteration": 2.595163345336914 + }, + { + "auxiliary_loss_clip": 0.01245497, + "auxiliary_loss_mlp": 0.01083975, + "balance_loss_clip": 1.05922246, + "balance_loss_mlp": 1.04926169, + "epoch": 0.03487148654742222, + "flos": 20157521794560.0, + "grad_norm": 1.808563231419863, + "language_loss": 0.86391246, + "learning_rate": 3.999751211379863e-06, + "loss": 0.88720715, + "num_input_tokens_seen": 12262140, + "step": 580, + "time_per_iteration": 2.584106683731079 + }, + { + "auxiliary_loss_clip": 0.01250073, + "auxiliary_loss_mlp": 0.01072062, + "balance_loss_clip": 1.05962658, + "balance_loss_mlp": 1.04140139, + "epoch": 0.034931609800090184, + "flos": 15669585918720.0, + "grad_norm": 3.1397340560776152, + "language_loss": 0.82330686, + "learning_rate": 3.999745030662987e-06, + "loss": 0.84652823, + "num_input_tokens_seen": 12280930, + "step": 581, + "time_per_iteration": 2.53130841255188 + }, + { + "auxiliary_loss_clip": 0.01248746, + "auxiliary_loss_mlp": 0.01071325, + "balance_loss_clip": 1.06215155, + "balance_loss_mlp": 1.03966284, + "epoch": 0.034991733052758156, + "flos": 16362374509440.0, + "grad_norm": 2.2050147366959374, + "language_loss": 0.76828647, + "learning_rate": 3.99973877411558e-06, + "loss": 0.79148722, + "num_input_tokens_seen": 12299125, + "step": 582, + "time_per_iteration": 2.6087658405303955 + }, + { + "auxiliary_loss_clip": 0.01246071, + "auxiliary_loss_mlp": 0.01084473, + "balance_loss_clip": 1.06166267, + "balance_loss_mlp": 1.05040264, + "epoch": 0.03505185630542612, + "flos": 19387309438080.0, + "grad_norm": 1.8967628500197962, + "language_loss": 0.87837505, + "learning_rate": 3.999732441737877e-06, + "loss": 0.90168047, + "num_input_tokens_seen": 12316905, + "step": 583, + "time_per_iteration": 2.5392889976501465 + }, + { + "auxiliary_loss_clip": 0.01252015, + "auxiliary_loss_mlp": 0.01088438, + "balance_loss_clip": 1.06103122, + "balance_loss_mlp": 1.05508327, + "epoch": 0.03511197955809409, + "flos": 21323828401920.0, + "grad_norm": 2.3818980771526608, + "language_loss": 0.80990762, + "learning_rate": 3.99972603353012e-06, + "loss": 0.83331209, + "num_input_tokens_seen": 12335070, + "step": 584, + "time_per_iteration": 2.5545201301574707 + }, + { + "auxiliary_loss_clip": 0.01246892, + "auxiliary_loss_mlp": 0.01071832, + "balance_loss_clip": 1.05753112, + "balance_loss_mlp": 1.0385251, + "epoch": 0.035172102810762065, + "flos": 14136595320960.0, + "grad_norm": 4.5550791883312645, + "language_loss": 0.92734623, + "learning_rate": 3.999719549492551e-06, + "loss": 0.95053339, + "num_input_tokens_seen": 12350315, + "step": 585, + "time_per_iteration": 2.5190470218658447 + }, + { + "auxiliary_loss_clip": 0.01249019, + "auxiliary_loss_mlp": 0.01076947, + "balance_loss_clip": 1.06067228, + "balance_loss_mlp": 1.04333019, + "epoch": 0.03523222606343003, + "flos": 20296890564480.0, + "grad_norm": 2.3524656791551486, + "language_loss": 0.878281, + "learning_rate": 3.9997129896254165e-06, + "loss": 0.90154058, + "num_input_tokens_seen": 12366030, + "step": 586, + "time_per_iteration": 2.6771011352539062 + }, + { + "auxiliary_loss_clip": 0.01252684, + "auxiliary_loss_mlp": 0.01081377, + "balance_loss_clip": 1.0622611, + "balance_loss_mlp": 1.04854643, + "epoch": 0.035292349316098, + "flos": 20375822701440.0, + "grad_norm": 1.9694906682298325, + "language_loss": 0.764274, + "learning_rate": 3.999706353928965e-06, + "loss": 0.78761458, + "num_input_tokens_seen": 12384895, + "step": 587, + "time_per_iteration": 2.563396453857422 + }, + { + "auxiliary_loss_clip": 0.01251893, + "auxiliary_loss_mlp": 0.01066226, + "balance_loss_clip": 1.0605588, + "balance_loss_mlp": 1.03234649, + "epoch": 0.03535247256876597, + "flos": 21468871520640.0, + "grad_norm": 1.8490680599872793, + "language_loss": 0.78673279, + "learning_rate": 3.999699642403449e-06, + "loss": 0.80991393, + "num_input_tokens_seen": 12404980, + "step": 588, + "time_per_iteration": 2.6023266315460205 + }, + { + "auxiliary_loss_clip": 0.01251451, + "auxiliary_loss_mlp": 0.01079375, + "balance_loss_clip": 1.05954707, + "balance_loss_mlp": 1.0439465, + "epoch": 0.03541259582143394, + "flos": 23623044946560.0, + "grad_norm": 2.166420724435565, + "language_loss": 0.93504101, + "learning_rate": 3.99969285504912e-06, + "loss": 0.95834923, + "num_input_tokens_seen": 12423835, + "step": 589, + "time_per_iteration": 2.5752885341644287 + }, + { + "auxiliary_loss_clip": 0.01252727, + "auxiliary_loss_mlp": 0.01077538, + "balance_loss_clip": 1.06029153, + "balance_loss_mlp": 1.04532719, + "epoch": 0.03547271907410191, + "flos": 33726367768320.0, + "grad_norm": 2.330878387506576, + "language_loss": 0.83885616, + "learning_rate": 3.99968599186624e-06, + "loss": 0.86215883, + "num_input_tokens_seen": 12443135, + "step": 590, + "time_per_iteration": 2.741983652114868 + }, + { + "auxiliary_loss_clip": 0.01243323, + "auxiliary_loss_mlp": 0.0106941, + "balance_loss_clip": 1.0596137, + "balance_loss_mlp": 1.03874969, + "epoch": 0.03553284232676988, + "flos": 21142695093120.0, + "grad_norm": 2.4448697309813157, + "language_loss": 0.87233829, + "learning_rate": 3.999679052855065e-06, + "loss": 0.89546567, + "num_input_tokens_seen": 12462895, + "step": 591, + "time_per_iteration": 2.5839085578918457 + }, + { + "auxiliary_loss_clip": 0.01248842, + "auxiliary_loss_mlp": 0.01083146, + "balance_loss_clip": 1.05826521, + "balance_loss_mlp": 1.04876602, + "epoch": 0.03559296557943785, + "flos": 20046593617920.0, + "grad_norm": 2.029373871431861, + "language_loss": 0.83186811, + "learning_rate": 3.999672038015861e-06, + "loss": 0.85518801, + "num_input_tokens_seen": 12481515, + "step": 592, + "time_per_iteration": 2.6124300956726074 + }, + { + "auxiliary_loss_clip": 0.01123191, + "auxiliary_loss_mlp": 0.01069405, + "balance_loss_clip": 1.02628231, + "balance_loss_mlp": 1.06063128, + "epoch": 0.035653088832105814, + "flos": 60334597244160.0, + "grad_norm": 0.9031692370044492, + "language_loss": 0.59789014, + "learning_rate": 3.999664947348893e-06, + "loss": 0.61981606, + "num_input_tokens_seen": 12548220, + "step": 593, + "time_per_iteration": 3.259474515914917 + }, + { + "auxiliary_loss_clip": 0.01247044, + "auxiliary_loss_mlp": 0.01079658, + "balance_loss_clip": 1.06234026, + "balance_loss_mlp": 1.04582691, + "epoch": 0.035713212084773786, + "flos": 20113135562880.0, + "grad_norm": 1.7597498985141962, + "language_loss": 0.87286496, + "learning_rate": 3.999657780854429e-06, + "loss": 0.89613199, + "num_input_tokens_seen": 12566105, + "step": 594, + "time_per_iteration": 2.7053511142730713 + }, + { + "auxiliary_loss_clip": 0.01245585, + "auxiliary_loss_mlp": 0.01079544, + "balance_loss_clip": 1.05741596, + "balance_loss_mlp": 1.04752493, + "epoch": 0.03577333533744176, + "flos": 26285785084800.0, + "grad_norm": 2.1656823850072016, + "language_loss": 0.83422405, + "learning_rate": 3.999650538532742e-06, + "loss": 0.85747534, + "num_input_tokens_seen": 12586680, + "step": 595, + "time_per_iteration": 2.6115334033966064 + }, + { + "auxiliary_loss_clip": 0.01243635, + "auxiliary_loss_mlp": 0.01091202, + "balance_loss_clip": 1.05989099, + "balance_loss_mlp": 1.05830026, + "epoch": 0.035833458590109724, + "flos": 10889732211840.0, + "grad_norm": 3.060292292655428, + "language_loss": 0.95950586, + "learning_rate": 3.999643220384106e-06, + "loss": 0.98285419, + "num_input_tokens_seen": 12601605, + "step": 596, + "time_per_iteration": 2.5505964756011963 + }, + { + "auxiliary_loss_clip": 0.01246993, + "auxiliary_loss_mlp": 0.01079135, + "balance_loss_clip": 1.06097651, + "balance_loss_mlp": 1.04830754, + "epoch": 0.035893581842777696, + "flos": 22090198003200.0, + "grad_norm": 3.1070388779020996, + "language_loss": 0.82406604, + "learning_rate": 3.999635826408799e-06, + "loss": 0.84732723, + "num_input_tokens_seen": 12620365, + "step": 597, + "time_per_iteration": 2.672292947769165 + }, + { + "auxiliary_loss_clip": 0.01245781, + "auxiliary_loss_mlp": 0.01084121, + "balance_loss_clip": 1.06311882, + "balance_loss_mlp": 1.05124307, + "epoch": 0.03595370509544566, + "flos": 23038347358080.0, + "grad_norm": 1.6278609146655656, + "language_loss": 0.81446671, + "learning_rate": 3.999628356607101e-06, + "loss": 0.83776575, + "num_input_tokens_seen": 12641140, + "step": 598, + "time_per_iteration": 2.5849454402923584 + }, + { + "auxiliary_loss_clip": 0.01240058, + "auxiliary_loss_mlp": 0.01080486, + "balance_loss_clip": 1.06073332, + "balance_loss_mlp": 1.04658318, + "epoch": 0.03601382834811363, + "flos": 20777734955520.0, + "grad_norm": 1.8022550123699685, + "language_loss": 0.8145656, + "learning_rate": 3.999620810979295e-06, + "loss": 0.83777106, + "num_input_tokens_seen": 12661080, + "step": 599, + "time_per_iteration": 2.5546071529388428 + }, + { + "auxiliary_loss_clip": 0.01247646, + "auxiliary_loss_mlp": 0.0108472, + "balance_loss_clip": 1.05873156, + "balance_loss_mlp": 1.05413103, + "epoch": 0.036073951600781605, + "flos": 23951627585280.0, + "grad_norm": 2.0953143884810244, + "language_loss": 0.85865664, + "learning_rate": 3.999613189525668e-06, + "loss": 0.8819803, + "num_input_tokens_seen": 12678270, + "step": 600, + "time_per_iteration": 2.5811116695404053 + }, + { + "auxiliary_loss_clip": 0.01239115, + "auxiliary_loss_mlp": 0.01090144, + "balance_loss_clip": 1.05515051, + "balance_loss_mlp": 1.05764723, + "epoch": 0.03613407485344957, + "flos": 18912283050240.0, + "grad_norm": 2.057587588073147, + "language_loss": 0.82317412, + "learning_rate": 3.999605492246508e-06, + "loss": 0.84646672, + "num_input_tokens_seen": 12697295, + "step": 601, + "time_per_iteration": 2.614609718322754 + }, + { + "auxiliary_loss_clip": 0.01235625, + "auxiliary_loss_mlp": 0.01073719, + "balance_loss_clip": 1.05515659, + "balance_loss_mlp": 1.0416044, + "epoch": 0.03619419810611754, + "flos": 23038526926080.0, + "grad_norm": 2.162337830937685, + "language_loss": 0.75347906, + "learning_rate": 3.999597719142107e-06, + "loss": 0.77657247, + "num_input_tokens_seen": 12716165, + "step": 602, + "time_per_iteration": 2.6118245124816895 + }, + { + "auxiliary_loss_clip": 0.01235173, + "auxiliary_loss_mlp": 0.01068401, + "balance_loss_clip": 1.05551791, + "balance_loss_mlp": 1.03735876, + "epoch": 0.03625432135878551, + "flos": 29457774293760.0, + "grad_norm": 3.2548938512042507, + "language_loss": 0.80047071, + "learning_rate": 3.999589870212761e-06, + "loss": 0.82350647, + "num_input_tokens_seen": 12735475, + "step": 603, + "time_per_iteration": 2.743528366088867 + }, + { + "auxiliary_loss_clip": 0.01242545, + "auxiliary_loss_mlp": 0.01073468, + "balance_loss_clip": 1.06141686, + "balance_loss_mlp": 1.04273605, + "epoch": 0.03631444461145348, + "flos": 23508525409920.0, + "grad_norm": 2.1923158537746703, + "language_loss": 0.87067586, + "learning_rate": 3.9995819454587664e-06, + "loss": 0.89383602, + "num_input_tokens_seen": 12754540, + "step": 604, + "time_per_iteration": 4.2345194816589355 + }, + { + "auxiliary_loss_clip": 0.0124155, + "auxiliary_loss_mlp": 0.01065556, + "balance_loss_clip": 1.05991781, + "balance_loss_mlp": 1.03301191, + "epoch": 0.03637456786412145, + "flos": 16618130323200.0, + "grad_norm": 2.809302579439046, + "language_loss": 0.805632, + "learning_rate": 3.999573944880424e-06, + "loss": 0.82870305, + "num_input_tokens_seen": 12773050, + "step": 605, + "time_per_iteration": 4.322201490402222 + }, + { + "auxiliary_loss_clip": 0.01240262, + "auxiliary_loss_mlp": 0.01072641, + "balance_loss_clip": 1.05769682, + "balance_loss_mlp": 1.04250479, + "epoch": 0.03643469111678942, + "flos": 15851832549120.0, + "grad_norm": 2.3547742116643784, + "language_loss": 0.85805702, + "learning_rate": 3.9995658684780375e-06, + "loss": 0.88118601, + "num_input_tokens_seen": 12791240, + "step": 606, + "time_per_iteration": 4.22185754776001 + }, + { + "auxiliary_loss_clip": 0.01242116, + "auxiliary_loss_mlp": 0.01073324, + "balance_loss_clip": 1.05815411, + "balance_loss_mlp": 1.04130483, + "epoch": 0.03649481436945739, + "flos": 23620387340160.0, + "grad_norm": 4.4009349096059465, + "language_loss": 0.82413459, + "learning_rate": 3.999557716251912e-06, + "loss": 0.84728897, + "num_input_tokens_seen": 12812245, + "step": 607, + "time_per_iteration": 2.60573148727417 + }, + { + "auxiliary_loss_clip": 0.01240905, + "auxiliary_loss_mlp": 0.01068331, + "balance_loss_clip": 1.06053531, + "balance_loss_mlp": 1.03776526, + "epoch": 0.036554937622125354, + "flos": 21755581879680.0, + "grad_norm": 2.2881437675405385, + "language_loss": 0.83684462, + "learning_rate": 3.999549488202358e-06, + "loss": 0.85993695, + "num_input_tokens_seen": 12831085, + "step": 608, + "time_per_iteration": 2.565384864807129 + }, + { + "auxiliary_loss_clip": 0.01242336, + "auxiliary_loss_mlp": 0.01067543, + "balance_loss_clip": 1.05947769, + "balance_loss_mlp": 1.03468907, + "epoch": 0.036615060874793326, + "flos": 17819772935040.0, + "grad_norm": 2.0094048871765593, + "language_loss": 0.81989074, + "learning_rate": 3.999541184329688e-06, + "loss": 0.84298944, + "num_input_tokens_seen": 12849115, + "step": 609, + "time_per_iteration": 2.5522828102111816 + }, + { + "auxiliary_loss_clip": 0.0124813, + "auxiliary_loss_mlp": 0.01085739, + "balance_loss_clip": 1.06504869, + "balance_loss_mlp": 1.05581713, + "epoch": 0.0366751841274613, + "flos": 26753808320640.0, + "grad_norm": 2.1596436756754223, + "language_loss": 0.79381204, + "learning_rate": 3.999532804634215e-06, + "loss": 0.81715071, + "num_input_tokens_seen": 12868005, + "step": 610, + "time_per_iteration": 2.7076809406280518 + }, + { + "auxiliary_loss_clip": 0.01246072, + "auxiliary_loss_mlp": 0.01084186, + "balance_loss_clip": 1.06064785, + "balance_loss_mlp": 1.05226183, + "epoch": 0.03673530738012926, + "flos": 22196960202240.0, + "grad_norm": 2.4979041036112934, + "language_loss": 0.87386942, + "learning_rate": 3.9995243491162575e-06, + "loss": 0.89717197, + "num_input_tokens_seen": 12886890, + "step": 611, + "time_per_iteration": 2.6205599308013916 + }, + { + "auxiliary_loss_clip": 0.0124093, + "auxiliary_loss_mlp": 0.01090837, + "balance_loss_clip": 1.06162715, + "balance_loss_mlp": 1.05953288, + "epoch": 0.036795430632797235, + "flos": 24681655601280.0, + "grad_norm": 2.3284133115527466, + "language_loss": 0.73012614, + "learning_rate": 3.999515817776136e-06, + "loss": 0.75344384, + "num_input_tokens_seen": 12906130, + "step": 612, + "time_per_iteration": 2.5947494506835938 + }, + { + "auxiliary_loss_clip": 0.0124129, + "auxiliary_loss_mlp": 0.01072126, + "balance_loss_clip": 1.05793309, + "balance_loss_mlp": 1.04020214, + "epoch": 0.0368555538854652, + "flos": 17748921358080.0, + "grad_norm": 3.114475513427696, + "language_loss": 0.78996533, + "learning_rate": 3.999507210614175e-06, + "loss": 0.8130995, + "num_input_tokens_seen": 12925260, + "step": 613, + "time_per_iteration": 2.7475688457489014 + }, + { + "auxiliary_loss_clip": 0.01234904, + "auxiliary_loss_mlp": 0.0107814, + "balance_loss_clip": 1.05573201, + "balance_loss_mlp": 1.04710984, + "epoch": 0.03691567713813317, + "flos": 20594554571520.0, + "grad_norm": 1.9300035561479885, + "language_loss": 0.93692857, + "learning_rate": 3.9994985276307e-06, + "loss": 0.96005899, + "num_input_tokens_seen": 12944590, + "step": 614, + "time_per_iteration": 2.576913356781006 + }, + { + "auxiliary_loss_clip": 0.01245171, + "auxiliary_loss_mlp": 0.01079546, + "balance_loss_clip": 1.06199241, + "balance_loss_mlp": 1.04578543, + "epoch": 0.036975800390801145, + "flos": 33650380546560.0, + "grad_norm": 2.956925238899606, + "language_loss": 0.72892499, + "learning_rate": 3.999489768826041e-06, + "loss": 0.75217217, + "num_input_tokens_seen": 12964785, + "step": 615, + "time_per_iteration": 2.7495410442352295 + }, + { + "auxiliary_loss_clip": 0.01240983, + "auxiliary_loss_mlp": 0.01075905, + "balance_loss_clip": 1.05687857, + "balance_loss_mlp": 1.04443371, + "epoch": 0.03703592364346911, + "flos": 28293694329600.0, + "grad_norm": 2.017153479635017, + "language_loss": 0.81607139, + "learning_rate": 3.999480934200528e-06, + "loss": 0.83924025, + "num_input_tokens_seen": 12986705, + "step": 616, + "time_per_iteration": 2.8537774085998535 + }, + { + "auxiliary_loss_clip": 0.0123969, + "auxiliary_loss_mlp": 0.0107363, + "balance_loss_clip": 1.05817473, + "balance_loss_mlp": 1.04351807, + "epoch": 0.03709604689613708, + "flos": 31504215853440.0, + "grad_norm": 1.9874440184388134, + "language_loss": 0.68208897, + "learning_rate": 3.999472023754499e-06, + "loss": 0.70522213, + "num_input_tokens_seen": 13010560, + "step": 617, + "time_per_iteration": 2.7263524532318115 + }, + { + "auxiliary_loss_clip": 0.01246341, + "auxiliary_loss_mlp": 0.01071919, + "balance_loss_clip": 1.06353366, + "balance_loss_mlp": 1.03823054, + "epoch": 0.03715617014880505, + "flos": 19609381272960.0, + "grad_norm": 2.3285213566221863, + "language_loss": 0.80242884, + "learning_rate": 3.99946303748829e-06, + "loss": 0.82561147, + "num_input_tokens_seen": 13028935, + "step": 618, + "time_per_iteration": 2.6238229274749756 + }, + { + "auxiliary_loss_clip": 0.0124668, + "auxiliary_loss_mlp": 0.0107454, + "balance_loss_clip": 1.05919385, + "balance_loss_mlp": 1.04058957, + "epoch": 0.03721629340147302, + "flos": 15924192497280.0, + "grad_norm": 2.140988142212869, + "language_loss": 0.91164839, + "learning_rate": 3.999453975402242e-06, + "loss": 0.93486071, + "num_input_tokens_seen": 13046000, + "step": 619, + "time_per_iteration": 2.6630566120147705 + }, + { + "auxiliary_loss_clip": 0.01242391, + "auxiliary_loss_mlp": 0.01081596, + "balance_loss_clip": 1.06282973, + "balance_loss_mlp": 1.05024374, + "epoch": 0.03727641665414099, + "flos": 21104090951040.0, + "grad_norm": 7.336051870283171, + "language_loss": 0.94279408, + "learning_rate": 3.9994448374967e-06, + "loss": 0.96603394, + "num_input_tokens_seen": 13062995, + "step": 620, + "time_per_iteration": 2.6290173530578613 + }, + { + "auxiliary_loss_clip": 0.01240061, + "auxiliary_loss_mlp": 0.01086111, + "balance_loss_clip": 1.05870128, + "balance_loss_mlp": 1.0524224, + "epoch": 0.037336539906808956, + "flos": 24131683486080.0, + "grad_norm": 1.7407265678851849, + "language_loss": 0.77280378, + "learning_rate": 3.999435623772008e-06, + "loss": 0.79606545, + "num_input_tokens_seen": 13084120, + "step": 621, + "time_per_iteration": 2.6605758666992188 + }, + { + "auxiliary_loss_clip": 0.01239254, + "auxiliary_loss_mlp": 0.01066297, + "balance_loss_clip": 1.06145549, + "balance_loss_mlp": 1.0337764, + "epoch": 0.03739666315947693, + "flos": 22346384780160.0, + "grad_norm": 2.5501539901805175, + "language_loss": 0.86522955, + "learning_rate": 3.999426334228518e-06, + "loss": 0.88828504, + "num_input_tokens_seen": 13100035, + "step": 622, + "time_per_iteration": 2.8715460300445557 + }, + { + "auxiliary_loss_clip": 0.01238531, + "auxiliary_loss_mlp": 0.01067663, + "balance_loss_clip": 1.05832243, + "balance_loss_mlp": 1.03561974, + "epoch": 0.0374567864121449, + "flos": 20449511452800.0, + "grad_norm": 2.0839513452456147, + "language_loss": 0.90031511, + "learning_rate": 3.999416968866581e-06, + "loss": 0.92337704, + "num_input_tokens_seen": 13118070, + "step": 623, + "time_per_iteration": 2.7463178634643555 + }, + { + "auxiliary_loss_clip": 0.01242982, + "auxiliary_loss_mlp": 0.01082718, + "balance_loss_clip": 1.06284046, + "balance_loss_mlp": 1.05060256, + "epoch": 0.037516909664812866, + "flos": 19208043636480.0, + "grad_norm": 1.8405319013920525, + "language_loss": 0.84212214, + "learning_rate": 3.999407527686551e-06, + "loss": 0.86537921, + "num_input_tokens_seen": 13136355, + "step": 624, + "time_per_iteration": 2.732814311981201 + }, + { + "auxiliary_loss_clip": 0.01243143, + "auxiliary_loss_mlp": 0.01071282, + "balance_loss_clip": 1.05852556, + "balance_loss_mlp": 1.03897679, + "epoch": 0.03757703291748084, + "flos": 35005218664320.0, + "grad_norm": 2.7184855488724224, + "language_loss": 0.66660923, + "learning_rate": 3.999398010688788e-06, + "loss": 0.68975347, + "num_input_tokens_seen": 13155435, + "step": 625, + "time_per_iteration": 2.6779985427856445 + }, + { + "auxiliary_loss_clip": 0.012361, + "auxiliary_loss_mlp": 0.01076307, + "balance_loss_clip": 1.0588057, + "balance_loss_mlp": 1.04249978, + "epoch": 0.0376371561701488, + "flos": 25483899911040.0, + "grad_norm": 1.8135715074889822, + "language_loss": 0.77481806, + "learning_rate": 3.999388417873652e-06, + "loss": 0.7979421, + "num_input_tokens_seen": 13174295, + "step": 626, + "time_per_iteration": 2.628462314605713 + }, + { + "auxiliary_loss_clip": 0.01238652, + "auxiliary_loss_mlp": 0.01075224, + "balance_loss_clip": 1.05930161, + "balance_loss_mlp": 1.04368174, + "epoch": 0.037697279422816775, + "flos": 18185630912640.0, + "grad_norm": 1.9785244519134, + "language_loss": 0.81622225, + "learning_rate": 3.999378749241506e-06, + "loss": 0.83936101, + "num_input_tokens_seen": 13192500, + "step": 627, + "time_per_iteration": 2.5633106231689453 + }, + { + "auxiliary_loss_clip": 0.01242724, + "auxiliary_loss_mlp": 0.01081877, + "balance_loss_clip": 1.06110311, + "balance_loss_mlp": 1.05009568, + "epoch": 0.03775740267548475, + "flos": 24644272521600.0, + "grad_norm": 1.6965979459156717, + "language_loss": 0.88483578, + "learning_rate": 3.999369004792719e-06, + "loss": 0.90808177, + "num_input_tokens_seen": 13213470, + "step": 628, + "time_per_iteration": 2.6713485717773438 + }, + { + "auxiliary_loss_clip": 0.01237054, + "auxiliary_loss_mlp": 0.01072591, + "balance_loss_clip": 1.05556428, + "balance_loss_mlp": 1.04142964, + "epoch": 0.03781752592815271, + "flos": 21288205088640.0, + "grad_norm": 2.8528190008355994, + "language_loss": 0.80208611, + "learning_rate": 3.999359184527658e-06, + "loss": 0.82518256, + "num_input_tokens_seen": 13232365, + "step": 629, + "time_per_iteration": 2.5639846324920654 + }, + { + "auxiliary_loss_clip": 0.01237495, + "auxiliary_loss_mlp": 0.01069552, + "balance_loss_clip": 1.05760837, + "balance_loss_mlp": 1.04015541, + "epoch": 0.037877649180820684, + "flos": 22089623385600.0, + "grad_norm": 1.6920947681628529, + "language_loss": 0.76868737, + "learning_rate": 3.999349288446696e-06, + "loss": 0.79175782, + "num_input_tokens_seen": 13251920, + "step": 630, + "time_per_iteration": 2.6219000816345215 + }, + { + "auxiliary_loss_clip": 0.01240772, + "auxiliary_loss_mlp": 0.01072834, + "balance_loss_clip": 1.05842972, + "balance_loss_mlp": 1.04174423, + "epoch": 0.03793777243348865, + "flos": 14501339976960.0, + "grad_norm": 2.825097705280994, + "language_loss": 0.91638589, + "learning_rate": 3.99933931655021e-06, + "loss": 0.93952191, + "num_input_tokens_seen": 13267440, + "step": 631, + "time_per_iteration": 2.667356252670288 + }, + { + "auxiliary_loss_clip": 0.01233484, + "auxiliary_loss_mlp": 0.01087116, + "balance_loss_clip": 1.05612576, + "balance_loss_mlp": 1.05316496, + "epoch": 0.03799789568615662, + "flos": 21908418249600.0, + "grad_norm": 1.5958063740254642, + "language_loss": 0.92314994, + "learning_rate": 3.999329268838575e-06, + "loss": 0.94635594, + "num_input_tokens_seen": 13287850, + "step": 632, + "time_per_iteration": 2.776552438735962 + }, + { + "auxiliary_loss_clip": 0.01235715, + "auxiliary_loss_mlp": 0.0106471, + "balance_loss_clip": 1.05850482, + "balance_loss_mlp": 1.03414512, + "epoch": 0.03805801893882459, + "flos": 24827021942400.0, + "grad_norm": 1.920365128667671, + "language_loss": 0.83229387, + "learning_rate": 3.999319145312175e-06, + "loss": 0.85529816, + "num_input_tokens_seen": 13307760, + "step": 633, + "time_per_iteration": 2.7731728553771973 + }, + { + "auxiliary_loss_clip": 0.01236276, + "auxiliary_loss_mlp": 0.01072164, + "balance_loss_clip": 1.05710554, + "balance_loss_mlp": 1.04181337, + "epoch": 0.03811814219149256, + "flos": 30482952364800.0, + "grad_norm": 1.7453018705786252, + "language_loss": 0.69734657, + "learning_rate": 3.999308945971392e-06, + "loss": 0.72043091, + "num_input_tokens_seen": 13331230, + "step": 634, + "time_per_iteration": 2.7435989379882812 + }, + { + "auxiliary_loss_clip": 0.01113707, + "auxiliary_loss_mlp": 0.01024466, + "balance_loss_clip": 1.02340961, + "balance_loss_mlp": 1.01717055, + "epoch": 0.03817826544416053, + "flos": 66992577379200.0, + "grad_norm": 0.8841809578224902, + "language_loss": 0.61612862, + "learning_rate": 3.999298670816614e-06, + "loss": 0.63751036, + "num_input_tokens_seen": 13394760, + "step": 635, + "time_per_iteration": 3.1693272590637207 + }, + { + "auxiliary_loss_clip": 0.01232744, + "auxiliary_loss_mlp": 0.01065555, + "balance_loss_clip": 1.05646658, + "balance_loss_mlp": 1.03539467, + "epoch": 0.038238388696828496, + "flos": 20485350247680.0, + "grad_norm": 2.14722379746137, + "language_loss": 0.83918953, + "learning_rate": 3.9992883198482294e-06, + "loss": 0.86217248, + "num_input_tokens_seen": 13412775, + "step": 636, + "time_per_iteration": 2.796236753463745 + }, + { + "auxiliary_loss_clip": 0.01237115, + "auxiliary_loss_mlp": 0.01085346, + "balance_loss_clip": 1.05926824, + "balance_loss_mlp": 1.0547328, + "epoch": 0.03829851194949647, + "flos": 17965893461760.0, + "grad_norm": 2.441918120359117, + "language_loss": 0.79111993, + "learning_rate": 3.999277893066632e-06, + "loss": 0.81434453, + "num_input_tokens_seen": 13427835, + "step": 637, + "time_per_iteration": 2.7708706855773926 + }, + { + "auxiliary_loss_clip": 0.01236097, + "auxiliary_loss_mlp": 0.01081459, + "balance_loss_clip": 1.05508518, + "balance_loss_mlp": 1.05013108, + "epoch": 0.03835863520216444, + "flos": 22456522857600.0, + "grad_norm": 13.258061539439254, + "language_loss": 0.84164381, + "learning_rate": 3.999267390472215e-06, + "loss": 0.86481941, + "num_input_tokens_seen": 13447295, + "step": 638, + "time_per_iteration": 2.720252513885498 + }, + { + "auxiliary_loss_clip": 0.01242199, + "auxiliary_loss_mlp": 0.01066568, + "balance_loss_clip": 1.05799365, + "balance_loss_mlp": 1.03517973, + "epoch": 0.038418758454832405, + "flos": 22164425458560.0, + "grad_norm": 4.315492733461109, + "language_loss": 0.69938356, + "learning_rate": 3.999256812065381e-06, + "loss": 0.72247124, + "num_input_tokens_seen": 13468455, + "step": 639, + "time_per_iteration": 2.7296063899993896 + }, + { + "auxiliary_loss_clip": 0.01237899, + "auxiliary_loss_mlp": 0.01078713, + "balance_loss_clip": 1.05806434, + "balance_loss_mlp": 1.04659855, + "epoch": 0.03847888170750038, + "flos": 22747435107840.0, + "grad_norm": 2.8107157105971003, + "language_loss": 0.85380292, + "learning_rate": 3.999246157846526e-06, + "loss": 0.87696898, + "num_input_tokens_seen": 13489085, + "step": 640, + "time_per_iteration": 2.559969425201416 + }, + { + "auxiliary_loss_clip": 0.01240263, + "auxiliary_loss_mlp": 0.0107703, + "balance_loss_clip": 1.05879045, + "balance_loss_mlp": 1.04427147, + "epoch": 0.03853900496016834, + "flos": 22711201263360.0, + "grad_norm": 2.498006174545127, + "language_loss": 0.81858492, + "learning_rate": 3.9992354278160574e-06, + "loss": 0.84175783, + "num_input_tokens_seen": 13509120, + "step": 641, + "time_per_iteration": 2.540372371673584 + }, + { + "auxiliary_loss_clip": 0.01101297, + "auxiliary_loss_mlp": 0.01006807, + "balance_loss_clip": 1.01353264, + "balance_loss_mlp": 0.99889106, + "epoch": 0.038599128212836314, + "flos": 70399136355840.0, + "grad_norm": 0.9057273146408735, + "language_loss": 0.65470946, + "learning_rate": 3.999224621974381e-06, + "loss": 0.67579043, + "num_input_tokens_seen": 13562005, + "step": 642, + "time_per_iteration": 3.131199359893799 + }, + { + "auxiliary_loss_clip": 0.0123464, + "auxiliary_loss_mlp": 0.01063711, + "balance_loss_clip": 1.05635905, + "balance_loss_mlp": 1.03388512, + "epoch": 0.03865925146550429, + "flos": 23295144666240.0, + "grad_norm": 1.9033471107340223, + "language_loss": 0.79783821, + "learning_rate": 3.999213740321906e-06, + "loss": 0.8208217, + "num_input_tokens_seen": 13582185, + "step": 643, + "time_per_iteration": 2.6253128051757812 + }, + { + "auxiliary_loss_clip": 0.01232348, + "auxiliary_loss_mlp": 0.01074825, + "balance_loss_clip": 1.05418229, + "balance_loss_mlp": 1.0446887, + "epoch": 0.03871937471817225, + "flos": 21430446946560.0, + "grad_norm": 1.8644553141859908, + "language_loss": 0.83103991, + "learning_rate": 3.999202782859046e-06, + "loss": 0.85411167, + "num_input_tokens_seen": 13599555, + "step": 644, + "time_per_iteration": 2.633126735687256 + }, + { + "auxiliary_loss_clip": 0.01237706, + "auxiliary_loss_mlp": 0.01067278, + "balance_loss_clip": 1.05755007, + "balance_loss_mlp": 1.03554416, + "epoch": 0.038779497970840224, + "flos": 34277309550720.0, + "grad_norm": 2.067179211457706, + "language_loss": 0.82138914, + "learning_rate": 3.9991917495862165e-06, + "loss": 0.84443897, + "num_input_tokens_seen": 13621160, + "step": 645, + "time_per_iteration": 2.749969959259033 + }, + { + "auxiliary_loss_clip": 0.0124018, + "auxiliary_loss_mlp": 0.01073063, + "balance_loss_clip": 1.05777395, + "balance_loss_mlp": 1.04137754, + "epoch": 0.03883962122350819, + "flos": 22748189293440.0, + "grad_norm": 2.6200371166797285, + "language_loss": 0.81684661, + "learning_rate": 3.9991806405038345e-06, + "loss": 0.83997911, + "num_input_tokens_seen": 13641915, + "step": 646, + "time_per_iteration": 2.713759660720825 + }, + { + "auxiliary_loss_clip": 0.01240172, + "auxiliary_loss_mlp": 0.0108376, + "balance_loss_clip": 1.06274331, + "balance_loss_mlp": 1.05202651, + "epoch": 0.03889974447617616, + "flos": 21945837242880.0, + "grad_norm": 1.9793813762268158, + "language_loss": 0.81946051, + "learning_rate": 3.999169455612323e-06, + "loss": 0.84269983, + "num_input_tokens_seen": 13661410, + "step": 647, + "time_per_iteration": 2.7308292388916016 + }, + { + "auxiliary_loss_clip": 0.01235249, + "auxiliary_loss_mlp": 0.01067615, + "balance_loss_clip": 1.05782819, + "balance_loss_mlp": 1.03776455, + "epoch": 0.03895986772884413, + "flos": 31504826384640.0, + "grad_norm": 2.4650064954169135, + "language_loss": 0.84459877, + "learning_rate": 3.999158194912106e-06, + "loss": 0.86762738, + "num_input_tokens_seen": 13681705, + "step": 648, + "time_per_iteration": 2.8621022701263428 + }, + { + "auxiliary_loss_clip": 0.01234821, + "auxiliary_loss_mlp": 0.01073367, + "balance_loss_clip": 1.0575248, + "balance_loss_mlp": 1.04277813, + "epoch": 0.0390199909815121, + "flos": 19901011795200.0, + "grad_norm": 2.041006483813791, + "language_loss": 0.84354198, + "learning_rate": 3.9991468584036086e-06, + "loss": 0.86662388, + "num_input_tokens_seen": 13700400, + "step": 649, + "time_per_iteration": 2.634155750274658 + }, + { + "auxiliary_loss_clip": 0.01235297, + "auxiliary_loss_mlp": 0.01070433, + "balance_loss_clip": 1.05599952, + "balance_loss_mlp": 1.03927124, + "epoch": 0.03908011423418007, + "flos": 21612478095360.0, + "grad_norm": 2.03795485080332, + "language_loss": 0.79865301, + "learning_rate": 3.999135446087263e-06, + "loss": 0.82171029, + "num_input_tokens_seen": 13720145, + "step": 650, + "time_per_iteration": 2.608920097351074 + }, + { + "auxiliary_loss_clip": 0.01229682, + "auxiliary_loss_mlp": 0.01073465, + "balance_loss_clip": 1.05291367, + "balance_loss_mlp": 1.04273283, + "epoch": 0.039140237486848035, + "flos": 18661411486080.0, + "grad_norm": 2.3538478054811054, + "language_loss": 0.78486502, + "learning_rate": 3.9991239579635e-06, + "loss": 0.80789649, + "num_input_tokens_seen": 13737500, + "step": 651, + "time_per_iteration": 4.195107936859131 + }, + { + "auxiliary_loss_clip": 0.01231645, + "auxiliary_loss_mlp": 0.01076986, + "balance_loss_clip": 1.0549804, + "balance_loss_mlp": 1.04348814, + "epoch": 0.03920036073951601, + "flos": 18661124177280.0, + "grad_norm": 2.808418787522272, + "language_loss": 0.87873149, + "learning_rate": 3.999112394032757e-06, + "loss": 0.9018178, + "num_input_tokens_seen": 13754750, + "step": 652, + "time_per_iteration": 5.720244884490967 + }, + { + "auxiliary_loss_clip": 0.01223841, + "auxiliary_loss_mlp": 0.01067589, + "balance_loss_clip": 1.05225372, + "balance_loss_mlp": 1.03771484, + "epoch": 0.03926048399218398, + "flos": 31354468053120.0, + "grad_norm": 2.980097376666324, + "language_loss": 0.79159325, + "learning_rate": 3.999100754295471e-06, + "loss": 0.81450754, + "num_input_tokens_seen": 13771990, + "step": 653, + "time_per_iteration": 2.597721815109253 + }, + { + "auxiliary_loss_clip": 0.01239421, + "auxiliary_loss_mlp": 0.0106915, + "balance_loss_clip": 1.05771518, + "balance_loss_mlp": 1.03834629, + "epoch": 0.039320607244851945, + "flos": 29603499770880.0, + "grad_norm": 2.6172771478499155, + "language_loss": 0.85733443, + "learning_rate": 3.999089038752085e-06, + "loss": 0.88042009, + "num_input_tokens_seen": 13792750, + "step": 654, + "time_per_iteration": 4.29282546043396 + }, + { + "auxiliary_loss_clip": 0.01096164, + "auxiliary_loss_mlp": 0.01012721, + "balance_loss_clip": 1.00999856, + "balance_loss_mlp": 1.00504351, + "epoch": 0.03938073049751992, + "flos": 66534609951360.0, + "grad_norm": 0.8255940424884866, + "language_loss": 0.49936402, + "learning_rate": 3.999077247403041e-06, + "loss": 0.52045286, + "num_input_tokens_seen": 13858570, + "step": 655, + "time_per_iteration": 3.17952036857605 + }, + { + "auxiliary_loss_clip": 0.01228927, + "auxiliary_loss_mlp": 0.0107125, + "balance_loss_clip": 1.05684924, + "balance_loss_mlp": 1.04171014, + "epoch": 0.03944085375018788, + "flos": 23367827836800.0, + "grad_norm": 2.0983961625612593, + "language_loss": 0.80581987, + "learning_rate": 3.9990653802487886e-06, + "loss": 0.82882166, + "num_input_tokens_seen": 13876335, + "step": 656, + "time_per_iteration": 2.5430734157562256 + }, + { + "auxiliary_loss_clip": 0.0124156, + "auxiliary_loss_mlp": 0.01091384, + "balance_loss_clip": 1.05931842, + "balance_loss_mlp": 1.05614603, + "epoch": 0.039500977002855854, + "flos": 18548292579840.0, + "grad_norm": 2.380294541122905, + "language_loss": 0.76607656, + "learning_rate": 3.999053437289776e-06, + "loss": 0.78940594, + "num_input_tokens_seen": 13892640, + "step": 657, + "time_per_iteration": 2.5206258296966553 + }, + { + "auxiliary_loss_clip": 0.0123651, + "auxiliary_loss_mlp": 0.01071047, + "balance_loss_clip": 1.05764699, + "balance_loss_mlp": 1.04000473, + "epoch": 0.039561100255523826, + "flos": 25338174433920.0, + "grad_norm": 2.0958369364799236, + "language_loss": 0.81869376, + "learning_rate": 3.999041418526457e-06, + "loss": 0.84176934, + "num_input_tokens_seen": 13910085, + "step": 658, + "time_per_iteration": 2.5661957263946533 + }, + { + "auxiliary_loss_clip": 0.01231123, + "auxiliary_loss_mlp": 0.01075731, + "balance_loss_clip": 1.055655, + "balance_loss_mlp": 1.04187548, + "epoch": 0.03962122350819179, + "flos": 18219889509120.0, + "grad_norm": 2.0216454246280207, + "language_loss": 0.9107976, + "learning_rate": 3.999029323959287e-06, + "loss": 0.93386614, + "num_input_tokens_seen": 13928800, + "step": 659, + "time_per_iteration": 2.69765305519104 + }, + { + "auxiliary_loss_clip": 0.01237406, + "auxiliary_loss_mlp": 0.01071066, + "balance_loss_clip": 1.05730808, + "balance_loss_mlp": 1.04035759, + "epoch": 0.03968134676085976, + "flos": 20522230536960.0, + "grad_norm": 2.912535972706843, + "language_loss": 0.79281735, + "learning_rate": 3.999017153588724e-06, + "loss": 0.81590205, + "num_input_tokens_seen": 13948325, + "step": 660, + "time_per_iteration": 2.689612865447998 + }, + { + "auxiliary_loss_clip": 0.01233129, + "auxiliary_loss_mlp": 0.0107517, + "balance_loss_clip": 1.05883622, + "balance_loss_mlp": 1.04298353, + "epoch": 0.03974147001352773, + "flos": 22422587483520.0, + "grad_norm": 1.5866468795398576, + "language_loss": 0.8148821, + "learning_rate": 3.999004907415231e-06, + "loss": 0.83796513, + "num_input_tokens_seen": 13969090, + "step": 661, + "time_per_iteration": 2.5947940349578857 + }, + { + "auxiliary_loss_clip": 0.01094603, + "auxiliary_loss_mlp": 0.01006791, + "balance_loss_clip": 1.00874376, + "balance_loss_mlp": 0.99963874, + "epoch": 0.0398015932661957, + "flos": 71128769322240.0, + "grad_norm": 0.9172554384695869, + "language_loss": 0.69358134, + "learning_rate": 3.998992585439272e-06, + "loss": 0.71459532, + "num_input_tokens_seen": 14037555, + "step": 662, + "time_per_iteration": 3.314939022064209 + }, + { + "auxiliary_loss_clip": 0.01235907, + "auxiliary_loss_mlp": 0.01076325, + "balance_loss_clip": 1.0595274, + "balance_loss_mlp": 1.04516387, + "epoch": 0.03986171651886367, + "flos": 16800951571200.0, + "grad_norm": 1.8255133122005778, + "language_loss": 0.82833862, + "learning_rate": 3.998980187661314e-06, + "loss": 0.85146093, + "num_input_tokens_seen": 14055765, + "step": 663, + "time_per_iteration": 2.534083366394043 + }, + { + "auxiliary_loss_clip": 0.01241423, + "auxiliary_loss_mlp": 0.01065844, + "balance_loss_clip": 1.06081796, + "balance_loss_mlp": 1.03353834, + "epoch": 0.03992183977153164, + "flos": 24535068197760.0, + "grad_norm": 2.960340789013061, + "language_loss": 0.8716495, + "learning_rate": 3.998967714081826e-06, + "loss": 0.89472222, + "num_input_tokens_seen": 14074195, + "step": 664, + "time_per_iteration": 2.588332414627075 + }, + { + "auxiliary_loss_clip": 0.01228359, + "auxiliary_loss_mlp": 0.01066694, + "balance_loss_clip": 1.05604386, + "balance_loss_mlp": 1.03503239, + "epoch": 0.03998196302419961, + "flos": 15595897167360.0, + "grad_norm": 1.9743112500005, + "language_loss": 0.84970391, + "learning_rate": 3.998955164701281e-06, + "loss": 0.87265444, + "num_input_tokens_seen": 14090215, + "step": 665, + "time_per_iteration": 2.5642096996307373 + }, + { + "auxiliary_loss_clip": 0.01242345, + "auxiliary_loss_mlp": 0.01084605, + "balance_loss_clip": 1.05965233, + "balance_loss_mlp": 1.05110717, + "epoch": 0.04004208627686758, + "flos": 25305065072640.0, + "grad_norm": 2.155856941836293, + "language_loss": 0.81795913, + "learning_rate": 3.998942539520158e-06, + "loss": 0.84122866, + "num_input_tokens_seen": 14112150, + "step": 666, + "time_per_iteration": 2.625605583190918 + }, + { + "auxiliary_loss_clip": 0.01231922, + "auxiliary_loss_mlp": 0.01074597, + "balance_loss_clip": 1.05667949, + "balance_loss_mlp": 1.04128981, + "epoch": 0.04010220952953555, + "flos": 23475847011840.0, + "grad_norm": 2.0808224732990013, + "language_loss": 0.87119949, + "learning_rate": 3.998929838538932e-06, + "loss": 0.8942647, + "num_input_tokens_seen": 14131475, + "step": 667, + "time_per_iteration": 2.7758469581604004 + }, + { + "auxiliary_loss_clip": 0.0123251, + "auxiliary_loss_mlp": 0.01070062, + "balance_loss_clip": 1.06041336, + "balance_loss_mlp": 1.03949666, + "epoch": 0.04016233278220352, + "flos": 18617025254400.0, + "grad_norm": 2.197007118041273, + "language_loss": 0.80406237, + "learning_rate": 3.998917061758087e-06, + "loss": 0.82708812, + "num_input_tokens_seen": 14146165, + "step": 668, + "time_per_iteration": 2.550760269165039 + }, + { + "auxiliary_loss_clip": 0.01092004, + "auxiliary_loss_mlp": 0.01010348, + "balance_loss_clip": 1.00895429, + "balance_loss_mlp": 1.00348175, + "epoch": 0.040222456034871484, + "flos": 70906194696960.0, + "grad_norm": 0.7874591881705072, + "language_loss": 0.6011858, + "learning_rate": 3.998904209178107e-06, + "loss": 0.62220931, + "num_input_tokens_seen": 14215005, + "step": 669, + "time_per_iteration": 3.315673351287842 + }, + { + "auxiliary_loss_clip": 0.01230693, + "auxiliary_loss_mlp": 0.01071921, + "balance_loss_clip": 1.05483627, + "balance_loss_mlp": 1.04128408, + "epoch": 0.040282579287539456, + "flos": 23764712186880.0, + "grad_norm": 2.2948514351372347, + "language_loss": 0.86176455, + "learning_rate": 3.9988912807994785e-06, + "loss": 0.88479066, + "num_input_tokens_seen": 14235510, + "step": 670, + "time_per_iteration": 2.7299177646636963 + }, + { + "auxiliary_loss_clip": 0.01232187, + "auxiliary_loss_mlp": 0.01079842, + "balance_loss_clip": 1.05943179, + "balance_loss_mlp": 1.04949152, + "epoch": 0.04034270254020743, + "flos": 18478518410880.0, + "grad_norm": 1.7500789558600836, + "language_loss": 0.75397748, + "learning_rate": 3.998878276622692e-06, + "loss": 0.77709776, + "num_input_tokens_seen": 14254565, + "step": 671, + "time_per_iteration": 2.7307324409484863 + }, + { + "auxiliary_loss_clip": 0.01238953, + "auxiliary_loss_mlp": 0.01077013, + "balance_loss_clip": 1.06033492, + "balance_loss_mlp": 1.04577994, + "epoch": 0.040402825792875394, + "flos": 17201858244480.0, + "grad_norm": 2.6164075493941925, + "language_loss": 0.92735744, + "learning_rate": 3.998865196648242e-06, + "loss": 0.95051706, + "num_input_tokens_seen": 14271885, + "step": 672, + "time_per_iteration": 2.613999843597412 + }, + { + "auxiliary_loss_clip": 0.01232152, + "auxiliary_loss_mlp": 0.01078646, + "balance_loss_clip": 1.05810082, + "balance_loss_mlp": 1.04543471, + "epoch": 0.040462949045543366, + "flos": 19172168928000.0, + "grad_norm": 2.0261565041320853, + "language_loss": 0.90291858, + "learning_rate": 3.998852040876622e-06, + "loss": 0.92602652, + "num_input_tokens_seen": 14289670, + "step": 673, + "time_per_iteration": 2.594834566116333 + }, + { + "auxiliary_loss_clip": 0.01228311, + "auxiliary_loss_mlp": 0.01081206, + "balance_loss_clip": 1.0553335, + "balance_loss_mlp": 1.04866147, + "epoch": 0.04052307229821133, + "flos": 24019821555840.0, + "grad_norm": 2.2884134879733895, + "language_loss": 0.75051808, + "learning_rate": 3.998838809308334e-06, + "loss": 0.77361321, + "num_input_tokens_seen": 14309285, + "step": 674, + "time_per_iteration": 2.584967613220215 + }, + { + "auxiliary_loss_clip": 0.01240648, + "auxiliary_loss_mlp": 0.01068431, + "balance_loss_clip": 1.05970716, + "balance_loss_mlp": 1.03653049, + "epoch": 0.0405831955508793, + "flos": 16436601964800.0, + "grad_norm": 3.1021197596904884, + "language_loss": 0.77994668, + "learning_rate": 3.9988255019438766e-06, + "loss": 0.80303752, + "num_input_tokens_seen": 14328300, + "step": 675, + "time_per_iteration": 2.6375744342803955 + }, + { + "auxiliary_loss_clip": 0.01231016, + "auxiliary_loss_mlp": 0.01084447, + "balance_loss_clip": 1.05571401, + "balance_loss_mlp": 1.05106878, + "epoch": 0.040643318803547275, + "flos": 24279922915200.0, + "grad_norm": 1.7727708393833361, + "language_loss": 0.76585054, + "learning_rate": 3.998812118783757e-06, + "loss": 0.78900516, + "num_input_tokens_seen": 14346395, + "step": 676, + "time_per_iteration": 2.5850417613983154 + }, + { + "auxiliary_loss_clip": 0.01237427, + "auxiliary_loss_mlp": 0.01082926, + "balance_loss_clip": 1.05994105, + "balance_loss_mlp": 1.04985702, + "epoch": 0.04070344205621524, + "flos": 17712076982400.0, + "grad_norm": 2.263561118647764, + "language_loss": 0.85253179, + "learning_rate": 3.9987986598284804e-06, + "loss": 0.87573534, + "num_input_tokens_seen": 14364605, + "step": 677, + "time_per_iteration": 2.6804704666137695 + }, + { + "auxiliary_loss_clip": 0.01229322, + "auxiliary_loss_mlp": 0.01067344, + "balance_loss_clip": 1.05600333, + "balance_loss_mlp": 1.03568161, + "epoch": 0.04076356530888321, + "flos": 26177658168960.0, + "grad_norm": 1.8012292395897125, + "language_loss": 0.76444447, + "learning_rate": 3.998785125078559e-06, + "loss": 0.78741115, + "num_input_tokens_seen": 14385265, + "step": 678, + "time_per_iteration": 2.642399549484253 + }, + { + "auxiliary_loss_clip": 0.01232368, + "auxiliary_loss_mlp": 0.01077345, + "balance_loss_clip": 1.05679822, + "balance_loss_mlp": 1.04687464, + "epoch": 0.04082368856155118, + "flos": 35773455772800.0, + "grad_norm": 1.7700823970690254, + "language_loss": 0.81785494, + "learning_rate": 3.998771514534505e-06, + "loss": 0.84095204, + "num_input_tokens_seen": 14406090, + "step": 679, + "time_per_iteration": 2.7300901412963867 + }, + { + "auxiliary_loss_clip": 0.01238605, + "auxiliary_loss_mlp": 0.0106679, + "balance_loss_clip": 1.06304073, + "balance_loss_mlp": 1.03610587, + "epoch": 0.04088381181421915, + "flos": 28146640049280.0, + "grad_norm": 2.0341830358737227, + "language_loss": 0.76114625, + "learning_rate": 3.998757828196835e-06, + "loss": 0.78420025, + "num_input_tokens_seen": 14425130, + "step": 680, + "time_per_iteration": 2.6402032375335693 + }, + { + "auxiliary_loss_clip": 0.01236456, + "auxiliary_loss_mlp": 0.01068679, + "balance_loss_clip": 1.05653501, + "balance_loss_mlp": 1.03382206, + "epoch": 0.04094393506688712, + "flos": 27597673514880.0, + "grad_norm": 5.304768027975369, + "language_loss": 0.83349347, + "learning_rate": 3.9987440660660685e-06, + "loss": 0.85654485, + "num_input_tokens_seen": 14447355, + "step": 681, + "time_per_iteration": 2.6540348529815674 + }, + { + "auxiliary_loss_clip": 0.01232479, + "auxiliary_loss_mlp": 0.01069492, + "balance_loss_clip": 1.05532742, + "balance_loss_mlp": 1.0368166, + "epoch": 0.04100405831955509, + "flos": 23112036109440.0, + "grad_norm": 2.4755722767212975, + "language_loss": 0.71195203, + "learning_rate": 3.998730228142726e-06, + "loss": 0.73497176, + "num_input_tokens_seen": 14466790, + "step": 682, + "time_per_iteration": 2.5648200511932373 + }, + { + "auxiliary_loss_clip": 0.01231809, + "auxiliary_loss_mlp": 0.01076677, + "balance_loss_clip": 1.05558395, + "balance_loss_mlp": 1.04577827, + "epoch": 0.04106418157222306, + "flos": 20156731695360.0, + "grad_norm": 1.6326955559581173, + "language_loss": 0.726008, + "learning_rate": 3.998716314427333e-06, + "loss": 0.74909282, + "num_input_tokens_seen": 14485195, + "step": 683, + "time_per_iteration": 2.5963311195373535 + }, + { + "auxiliary_loss_clip": 0.01232309, + "auxiliary_loss_mlp": 0.01083986, + "balance_loss_clip": 1.06267881, + "balance_loss_mlp": 1.05249107, + "epoch": 0.041124304824891024, + "flos": 17420697855360.0, + "grad_norm": 2.238075207973956, + "language_loss": 0.8139559, + "learning_rate": 3.998702324920417e-06, + "loss": 0.8371188, + "num_input_tokens_seen": 14503370, + "step": 684, + "time_per_iteration": 2.5961713790893555 + }, + { + "auxiliary_loss_clip": 0.01233864, + "auxiliary_loss_mlp": 0.01074855, + "balance_loss_clip": 1.06038749, + "balance_loss_mlp": 1.04152441, + "epoch": 0.041184428077558996, + "flos": 25780163287680.0, + "grad_norm": 1.5333448504297333, + "language_loss": 0.90695715, + "learning_rate": 3.9986882596225085e-06, + "loss": 0.93004441, + "num_input_tokens_seen": 14526415, + "step": 685, + "time_per_iteration": 2.666438341140747 + }, + { + "auxiliary_loss_clip": 0.0123245, + "auxiliary_loss_mlp": 0.01071699, + "balance_loss_clip": 1.05714965, + "balance_loss_mlp": 1.03975105, + "epoch": 0.04124455133022697, + "flos": 22964766347520.0, + "grad_norm": 1.9307408152737702, + "language_loss": 0.88044751, + "learning_rate": 3.998674118534141e-06, + "loss": 0.90348899, + "num_input_tokens_seen": 14546595, + "step": 686, + "time_per_iteration": 2.660414934158325 + }, + { + "auxiliary_loss_clip": 0.01238116, + "auxiliary_loss_mlp": 0.01075711, + "balance_loss_clip": 1.05871665, + "balance_loss_mlp": 1.04421639, + "epoch": 0.04130467458289493, + "flos": 21289067015040.0, + "grad_norm": 2.2635909268004966, + "language_loss": 0.71368885, + "learning_rate": 3.998659901655851e-06, + "loss": 0.73682702, + "num_input_tokens_seen": 14566590, + "step": 687, + "time_per_iteration": 2.637375831604004 + }, + { + "auxiliary_loss_clip": 0.01231247, + "auxiliary_loss_mlp": 0.01075986, + "balance_loss_clip": 1.06158161, + "balance_loss_mlp": 1.04666066, + "epoch": 0.041364797835562905, + "flos": 19974233669760.0, + "grad_norm": 1.4963219296996242, + "language_loss": 0.86056709, + "learning_rate": 3.998645608988177e-06, + "loss": 0.88363945, + "num_input_tokens_seen": 14585965, + "step": 688, + "time_per_iteration": 2.6089484691619873 + }, + { + "auxiliary_loss_clip": 0.01232354, + "auxiliary_loss_mlp": 0.01084679, + "balance_loss_clip": 1.059834, + "balance_loss_mlp": 1.05377984, + "epoch": 0.04142492108823087, + "flos": 21906227520000.0, + "grad_norm": 1.922151689904278, + "language_loss": 0.83179617, + "learning_rate": 3.998631240531661e-06, + "loss": 0.85496652, + "num_input_tokens_seen": 14606015, + "step": 689, + "time_per_iteration": 2.5736730098724365 + }, + { + "auxiliary_loss_clip": 0.01230248, + "auxiliary_loss_mlp": 0.01084683, + "balance_loss_clip": 1.05636716, + "balance_loss_mlp": 1.05366504, + "epoch": 0.04148504434089884, + "flos": 27639617621760.0, + "grad_norm": 1.9104996774053407, + "language_loss": 0.68092895, + "learning_rate": 3.998616796286848e-06, + "loss": 0.70407826, + "num_input_tokens_seen": 14629955, + "step": 690, + "time_per_iteration": 2.680795669555664 + }, + { + "auxiliary_loss_clip": 0.01225469, + "auxiliary_loss_mlp": 0.01070765, + "balance_loss_clip": 1.0546422, + "balance_loss_mlp": 1.03996181, + "epoch": 0.041545167593566815, + "flos": 20518387781760.0, + "grad_norm": 1.8075153228155456, + "language_loss": 0.75183624, + "learning_rate": 3.998602276254286e-06, + "loss": 0.77479863, + "num_input_tokens_seen": 14648000, + "step": 691, + "time_per_iteration": 2.6809425354003906 + }, + { + "auxiliary_loss_clip": 0.01228105, + "auxiliary_loss_mlp": 0.01079921, + "balance_loss_clip": 1.0572958, + "balance_loss_mlp": 1.04895055, + "epoch": 0.04160529084623478, + "flos": 11868907939200.0, + "grad_norm": 2.2319060921561844, + "language_loss": 0.84640431, + "learning_rate": 3.998587680434526e-06, + "loss": 0.86948454, + "num_input_tokens_seen": 14662235, + "step": 692, + "time_per_iteration": 2.5624444484710693 + }, + { + "auxiliary_loss_clip": 0.01231921, + "auxiliary_loss_mlp": 0.0107921, + "balance_loss_clip": 1.0560956, + "balance_loss_mlp": 1.04637969, + "epoch": 0.04166541409890275, + "flos": 14828306503680.0, + "grad_norm": 2.7653031012022695, + "language_loss": 0.89198673, + "learning_rate": 3.99857300882812e-06, + "loss": 0.91509807, + "num_input_tokens_seen": 14676065, + "step": 693, + "time_per_iteration": 2.5405685901641846 + }, + { + "auxiliary_loss_clip": 0.01233652, + "auxiliary_loss_mlp": 0.01069042, + "balance_loss_clip": 1.06009698, + "balance_loss_mlp": 1.03878713, + "epoch": 0.04172553735157072, + "flos": 25808137004160.0, + "grad_norm": 2.694396499350872, + "language_loss": 0.81817269, + "learning_rate": 3.998558261435626e-06, + "loss": 0.84119964, + "num_input_tokens_seen": 14694955, + "step": 694, + "time_per_iteration": 2.6840598583221436 + }, + { + "auxiliary_loss_clip": 0.01233058, + "auxiliary_loss_mlp": 0.01073504, + "balance_loss_clip": 1.05553126, + "balance_loss_mlp": 1.0426054, + "epoch": 0.04178566060423869, + "flos": 24279815174400.0, + "grad_norm": 2.0432563462553603, + "language_loss": 0.83578789, + "learning_rate": 3.9985434382576015e-06, + "loss": 0.85885346, + "num_input_tokens_seen": 14715510, + "step": 695, + "time_per_iteration": 2.6009654998779297 + }, + { + "auxiliary_loss_clip": 0.01229649, + "auxiliary_loss_mlp": 0.01079393, + "balance_loss_clip": 1.05616379, + "balance_loss_mlp": 1.04713535, + "epoch": 0.04184578385690666, + "flos": 18222008411520.0, + "grad_norm": 3.208231787002776, + "language_loss": 0.84620714, + "learning_rate": 3.99852853929461e-06, + "loss": 0.86929756, + "num_input_tokens_seen": 14731755, + "step": 696, + "time_per_iteration": 2.571491241455078 + }, + { + "auxiliary_loss_clip": 0.01228834, + "auxiliary_loss_mlp": 0.01078653, + "balance_loss_clip": 1.05562663, + "balance_loss_mlp": 1.04691982, + "epoch": 0.041905907109574626, + "flos": 22776342577920.0, + "grad_norm": 2.248426241663397, + "language_loss": 0.92462194, + "learning_rate": 3.998513564547216e-06, + "loss": 0.94769681, + "num_input_tokens_seen": 14750810, + "step": 697, + "time_per_iteration": 2.6231753826141357 + }, + { + "auxiliary_loss_clip": 0.0122341, + "auxiliary_loss_mlp": 0.010691, + "balance_loss_clip": 1.05419993, + "balance_loss_mlp": 1.03951192, + "epoch": 0.0419660303622426, + "flos": 20156947176960.0, + "grad_norm": 2.646917390208478, + "language_loss": 0.83966684, + "learning_rate": 3.998498514015987e-06, + "loss": 0.86259192, + "num_input_tokens_seen": 14768435, + "step": 698, + "time_per_iteration": 2.5466198921203613 + }, + { + "auxiliary_loss_clip": 0.01228049, + "auxiliary_loss_mlp": 0.01083963, + "balance_loss_clip": 1.05524015, + "balance_loss_mlp": 1.0514189, + "epoch": 0.042026153614910564, + "flos": 23076376882560.0, + "grad_norm": 2.298676806683977, + "language_loss": 0.91296732, + "learning_rate": 3.998483387701495e-06, + "loss": 0.93608749, + "num_input_tokens_seen": 14786690, + "step": 699, + "time_per_iteration": 4.24056339263916 + }, + { + "auxiliary_loss_clip": 0.01105663, + "auxiliary_loss_mlp": 0.01011523, + "balance_loss_clip": 1.02230763, + "balance_loss_mlp": 1.00553834, + "epoch": 0.042086276867578536, + "flos": 64495243370880.0, + "grad_norm": 0.9138663784488092, + "language_loss": 0.67926437, + "learning_rate": 3.998468185604312e-06, + "loss": 0.70043623, + "num_input_tokens_seen": 14853840, + "step": 700, + "time_per_iteration": 4.811952829360962 + }, + { + "auxiliary_loss_clip": 0.01233102, + "auxiliary_loss_mlp": 0.01078188, + "balance_loss_clip": 1.05729687, + "balance_loss_mlp": 1.04483366, + "epoch": 0.04214640012024651, + "flos": 15487016065920.0, + "grad_norm": 4.047294567476363, + "language_loss": 0.88751191, + "learning_rate": 3.998452907725016e-06, + "loss": 0.91062474, + "num_input_tokens_seen": 14869580, + "step": 701, + "time_per_iteration": 2.776735782623291 + }, + { + "auxiliary_loss_clip": 0.01228832, + "auxiliary_loss_mlp": 0.01077506, + "balance_loss_clip": 1.0591538, + "balance_loss_mlp": 1.04605818, + "epoch": 0.04220652337291447, + "flos": 23877040993920.0, + "grad_norm": 2.1224832476596402, + "language_loss": 0.67181242, + "learning_rate": 3.998437554064184e-06, + "loss": 0.69487584, + "num_input_tokens_seen": 14891065, + "step": 702, + "time_per_iteration": 4.4943695068359375 + }, + { + "auxiliary_loss_clip": 0.01095554, + "auxiliary_loss_mlp": 0.01006904, + "balance_loss_clip": 1.01374626, + "balance_loss_mlp": 1.00084829, + "epoch": 0.042266646625582445, + "flos": 63795451628160.0, + "grad_norm": 0.8449380406500417, + "language_loss": 0.60788643, + "learning_rate": 3.9984221246224006e-06, + "loss": 0.62891102, + "num_input_tokens_seen": 14954815, + "step": 703, + "time_per_iteration": 3.3073859214782715 + }, + { + "auxiliary_loss_clip": 0.01092579, + "auxiliary_loss_mlp": 0.01007229, + "balance_loss_clip": 1.01067007, + "balance_loss_mlp": 1.00107765, + "epoch": 0.04232676987825041, + "flos": 50018863345920.0, + "grad_norm": 1.0167952992695186, + "language_loss": 0.57784152, + "learning_rate": 3.9984066194002494e-06, + "loss": 0.59883964, + "num_input_tokens_seen": 15003050, + "step": 704, + "time_per_iteration": 3.108564615249634 + }, + { + "auxiliary_loss_clip": 0.01231122, + "auxiliary_loss_mlp": 0.01069426, + "balance_loss_clip": 1.05760741, + "balance_loss_mlp": 1.03769207, + "epoch": 0.04238689313091838, + "flos": 21616105368960.0, + "grad_norm": 2.1456630832628845, + "language_loss": 0.8729046, + "learning_rate": 3.998391038398319e-06, + "loss": 0.89591008, + "num_input_tokens_seen": 15021990, + "step": 705, + "time_per_iteration": 2.85418438911438 + }, + { + "auxiliary_loss_clip": 0.01218071, + "auxiliary_loss_mlp": 0.0107144, + "balance_loss_clip": 1.05307651, + "balance_loss_mlp": 1.04163814, + "epoch": 0.042447016383586354, + "flos": 19135109070720.0, + "grad_norm": 3.273971195764619, + "language_loss": 0.71569383, + "learning_rate": 3.998375381617201e-06, + "loss": 0.73858893, + "num_input_tokens_seen": 15040700, + "step": 706, + "time_per_iteration": 2.70827317237854 + }, + { + "auxiliary_loss_clip": 0.01222297, + "auxiliary_loss_mlp": 0.01071221, + "balance_loss_clip": 1.05464816, + "balance_loss_mlp": 1.03898692, + "epoch": 0.04250713963625432, + "flos": 24426007528320.0, + "grad_norm": 2.4813391459766967, + "language_loss": 0.93622792, + "learning_rate": 3.9983596490574875e-06, + "loss": 0.95916307, + "num_input_tokens_seen": 15056725, + "step": 707, + "time_per_iteration": 2.8246443271636963 + }, + { + "auxiliary_loss_clip": 0.0122755, + "auxiliary_loss_mlp": 0.01068996, + "balance_loss_clip": 1.05375588, + "balance_loss_mlp": 1.03771567, + "epoch": 0.04256726288892229, + "flos": 30367391333760.0, + "grad_norm": 1.8928336086802167, + "language_loss": 0.81266576, + "learning_rate": 3.998343840719776e-06, + "loss": 0.83563125, + "num_input_tokens_seen": 15077550, + "step": 708, + "time_per_iteration": 2.854233503341675 + }, + { + "auxiliary_loss_clip": 0.01231245, + "auxiliary_loss_mlp": 0.01076124, + "balance_loss_clip": 1.05625784, + "balance_loss_mlp": 1.04353213, + "epoch": 0.04262738614159026, + "flos": 16362661818240.0, + "grad_norm": 2.516237445138053, + "language_loss": 0.82510257, + "learning_rate": 3.998327956604666e-06, + "loss": 0.8481763, + "num_input_tokens_seen": 15094955, + "step": 709, + "time_per_iteration": 2.547232151031494 + }, + { + "auxiliary_loss_clip": 0.01234777, + "auxiliary_loss_mlp": 0.01069601, + "balance_loss_clip": 1.05959582, + "balance_loss_mlp": 1.03898799, + "epoch": 0.04268750939425823, + "flos": 20412379768320.0, + "grad_norm": 12.454316394232693, + "language_loss": 0.85372686, + "learning_rate": 3.99831199671276e-06, + "loss": 0.87677062, + "num_input_tokens_seen": 15113395, + "step": 710, + "time_per_iteration": 2.5471456050872803 + }, + { + "auxiliary_loss_clip": 0.01233654, + "auxiliary_loss_mlp": 0.01076994, + "balance_loss_clip": 1.06163049, + "balance_loss_mlp": 1.04645216, + "epoch": 0.0427476326469262, + "flos": 20302959962880.0, + "grad_norm": 4.729217236102158, + "language_loss": 0.84860396, + "learning_rate": 3.998295961044662e-06, + "loss": 0.87171042, + "num_input_tokens_seen": 15132920, + "step": 711, + "time_per_iteration": 2.525848627090454 + }, + { + "auxiliary_loss_clip": 0.01224189, + "auxiliary_loss_mlp": 0.01073897, + "balance_loss_clip": 1.0537895, + "balance_loss_mlp": 1.04273546, + "epoch": 0.042807755899594166, + "flos": 21650794928640.0, + "grad_norm": 1.644836680346159, + "language_loss": 0.85297048, + "learning_rate": 3.9982798496009804e-06, + "loss": 0.87595129, + "num_input_tokens_seen": 15153115, + "step": 712, + "time_per_iteration": 2.7155439853668213 + }, + { + "auxiliary_loss_clip": 0.01231483, + "auxiliary_loss_mlp": 0.01073087, + "balance_loss_clip": 1.05479693, + "balance_loss_mlp": 1.04329681, + "epoch": 0.04286787915226214, + "flos": 21435007973760.0, + "grad_norm": 2.5915408789303744, + "language_loss": 0.9125694, + "learning_rate": 3.998263662382328e-06, + "loss": 0.93561506, + "num_input_tokens_seen": 15172770, + "step": 713, + "time_per_iteration": 2.7128608226776123 + }, + { + "auxiliary_loss_clip": 0.01094664, + "auxiliary_loss_mlp": 0.01018795, + "balance_loss_clip": 1.01494825, + "balance_loss_mlp": 1.01254833, + "epoch": 0.04292800240493011, + "flos": 66397970615040.0, + "grad_norm": 0.8757620219817711, + "language_loss": 0.63757044, + "learning_rate": 3.9982473993893165e-06, + "loss": 0.65870506, + "num_input_tokens_seen": 15240055, + "step": 714, + "time_per_iteration": 3.2828190326690674 + }, + { + "auxiliary_loss_clip": 0.01227148, + "auxiliary_loss_mlp": 0.0108217, + "balance_loss_clip": 1.0596447, + "balance_loss_mlp": 1.05258203, + "epoch": 0.042988125657598075, + "flos": 31650264552960.0, + "grad_norm": 1.994695443080399, + "language_loss": 0.75106514, + "learning_rate": 3.998231060622563e-06, + "loss": 0.77415836, + "num_input_tokens_seen": 15261585, + "step": 715, + "time_per_iteration": 2.651054859161377 + }, + { + "auxiliary_loss_clip": 0.01229295, + "auxiliary_loss_mlp": 0.01075518, + "balance_loss_clip": 1.05885959, + "balance_loss_mlp": 1.04373693, + "epoch": 0.04304824891026605, + "flos": 33248468292480.0, + "grad_norm": 2.0744305362838436, + "language_loss": 0.72839189, + "learning_rate": 3.998214646082688e-06, + "loss": 0.75143999, + "num_input_tokens_seen": 15281160, + "step": 716, + "time_per_iteration": 2.7589786052703857 + }, + { + "auxiliary_loss_clip": 0.0108787, + "auxiliary_loss_mlp": 0.01010072, + "balance_loss_clip": 1.0098846, + "balance_loss_mlp": 1.00382543, + "epoch": 0.04310837216293401, + "flos": 64064782782720.0, + "grad_norm": 1.2121069308638377, + "language_loss": 0.65541947, + "learning_rate": 3.998198155770314e-06, + "loss": 0.67639887, + "num_input_tokens_seen": 15344505, + "step": 717, + "time_per_iteration": 3.215055227279663 + }, + { + "auxiliary_loss_clip": 0.01086522, + "auxiliary_loss_mlp": 0.01009436, + "balance_loss_clip": 1.00906026, + "balance_loss_mlp": 1.00295091, + "epoch": 0.043168495415601985, + "flos": 61343757849600.0, + "grad_norm": 0.9827932588509711, + "language_loss": 0.58789933, + "learning_rate": 3.998181589686065e-06, + "loss": 0.60885888, + "num_input_tokens_seen": 15404050, + "step": 718, + "time_per_iteration": 2.960855722427368 + }, + { + "auxiliary_loss_clip": 0.01224923, + "auxiliary_loss_mlp": 0.01074565, + "balance_loss_clip": 1.0594492, + "balance_loss_mlp": 1.04142499, + "epoch": 0.04322861866826996, + "flos": 20704261685760.0, + "grad_norm": 2.0421995582205543, + "language_loss": 0.91318548, + "learning_rate": 3.99816494783057e-06, + "loss": 0.93618029, + "num_input_tokens_seen": 15424190, + "step": 719, + "time_per_iteration": 2.578029155731201 + }, + { + "auxiliary_loss_clip": 0.01223093, + "auxiliary_loss_mlp": 0.01072291, + "balance_loss_clip": 1.05371261, + "balance_loss_mlp": 1.04244065, + "epoch": 0.04328874192093792, + "flos": 30373352991360.0, + "grad_norm": 1.7770590587157362, + "language_loss": 0.66513205, + "learning_rate": 3.99814823020446e-06, + "loss": 0.68808579, + "num_input_tokens_seen": 15446500, + "step": 720, + "time_per_iteration": 2.6373331546783447 + }, + { + "auxiliary_loss_clip": 0.01223101, + "auxiliary_loss_mlp": 0.01072946, + "balance_loss_clip": 1.05582929, + "balance_loss_mlp": 1.04199898, + "epoch": 0.043348865173605894, + "flos": 21944795748480.0, + "grad_norm": 2.2259925260808777, + "language_loss": 0.77468729, + "learning_rate": 3.9981314368083684e-06, + "loss": 0.79764771, + "num_input_tokens_seen": 15465830, + "step": 721, + "time_per_iteration": 2.670539379119873 + }, + { + "auxiliary_loss_clip": 0.0122854, + "auxiliary_loss_mlp": 0.01082326, + "balance_loss_clip": 1.05945635, + "balance_loss_mlp": 1.051952, + "epoch": 0.04340898842627386, + "flos": 15264225959040.0, + "grad_norm": 2.7150315815709187, + "language_loss": 0.88049459, + "learning_rate": 3.998114567642933e-06, + "loss": 0.90360332, + "num_input_tokens_seen": 15479985, + "step": 722, + "time_per_iteration": 2.58138370513916 + }, + { + "auxiliary_loss_clip": 0.01232636, + "auxiliary_loss_mlp": 0.01075414, + "balance_loss_clip": 1.05939603, + "balance_loss_mlp": 1.04534972, + "epoch": 0.04346911167894183, + "flos": 27965434913280.0, + "grad_norm": 1.758610667921224, + "language_loss": 0.84111041, + "learning_rate": 3.998097622708792e-06, + "loss": 0.86419094, + "num_input_tokens_seen": 15501545, + "step": 723, + "time_per_iteration": 2.679837465286255 + }, + { + "auxiliary_loss_clip": 0.01234523, + "auxiliary_loss_mlp": 0.01074543, + "balance_loss_clip": 1.06331956, + "balance_loss_mlp": 1.04400122, + "epoch": 0.0435292349316098, + "flos": 29242202820480.0, + "grad_norm": 1.8848495113287762, + "language_loss": 0.82768893, + "learning_rate": 3.99808060200659e-06, + "loss": 0.85077959, + "num_input_tokens_seen": 15521725, + "step": 724, + "time_per_iteration": 2.6232171058654785 + }, + { + "auxiliary_loss_clip": 0.01227465, + "auxiliary_loss_mlp": 0.01078944, + "balance_loss_clip": 1.05857658, + "balance_loss_mlp": 1.04756832, + "epoch": 0.04358935818427777, + "flos": 20558356640640.0, + "grad_norm": 1.9585058308572127, + "language_loss": 0.79896867, + "learning_rate": 3.998063505536971e-06, + "loss": 0.82203281, + "num_input_tokens_seen": 15540910, + "step": 725, + "time_per_iteration": 2.601607322692871 + }, + { + "auxiliary_loss_clip": 0.01237636, + "auxiliary_loss_mlp": 0.01069391, + "balance_loss_clip": 1.06061625, + "balance_loss_mlp": 1.03734803, + "epoch": 0.04364948143694574, + "flos": 14464926564480.0, + "grad_norm": 2.1858439033248405, + "language_loss": 0.87088579, + "learning_rate": 3.998046333300584e-06, + "loss": 0.89395607, + "num_input_tokens_seen": 15558640, + "step": 726, + "time_per_iteration": 2.563006639480591 + }, + { + "auxiliary_loss_clip": 0.01092893, + "auxiliary_loss_mlp": 0.01029564, + "balance_loss_clip": 1.01761222, + "balance_loss_mlp": 1.02384233, + "epoch": 0.043709604689613706, + "flos": 50067268922880.0, + "grad_norm": 0.9105509879675263, + "language_loss": 0.55872536, + "learning_rate": 3.998029085298079e-06, + "loss": 0.57994992, + "num_input_tokens_seen": 15612975, + "step": 727, + "time_per_iteration": 3.300549030303955 + }, + { + "auxiliary_loss_clip": 0.01229272, + "auxiliary_loss_mlp": 0.01071514, + "balance_loss_clip": 1.05855417, + "balance_loss_mlp": 1.04028141, + "epoch": 0.04376972794228168, + "flos": 13991588115840.0, + "grad_norm": 2.2069790930108595, + "language_loss": 0.82247663, + "learning_rate": 3.998011761530112e-06, + "loss": 0.8454845, + "num_input_tokens_seen": 15631070, + "step": 728, + "time_per_iteration": 2.5753016471862793 + }, + { + "auxiliary_loss_clip": 0.01223925, + "auxiliary_loss_mlp": 0.01066848, + "balance_loss_clip": 1.05746889, + "balance_loss_mlp": 1.03768897, + "epoch": 0.04382985119494965, + "flos": 22009901149440.0, + "grad_norm": 2.1527165860339794, + "language_loss": 0.77027893, + "learning_rate": 3.997994361997338e-06, + "loss": 0.79318666, + "num_input_tokens_seen": 15647825, + "step": 729, + "time_per_iteration": 2.5779478549957275 + }, + { + "auxiliary_loss_clip": 0.01228262, + "auxiliary_loss_mlp": 0.01070885, + "balance_loss_clip": 1.05550122, + "balance_loss_mlp": 1.04041493, + "epoch": 0.043889974447617615, + "flos": 24206521472640.0, + "grad_norm": 2.2137853340891422, + "language_loss": 0.95085013, + "learning_rate": 3.997976886700417e-06, + "loss": 0.97384155, + "num_input_tokens_seen": 15668260, + "step": 730, + "time_per_iteration": 2.581484317779541 + }, + { + "auxiliary_loss_clip": 0.01224375, + "auxiliary_loss_mlp": 0.01064235, + "balance_loss_clip": 1.05378437, + "balance_loss_mlp": 1.03271604, + "epoch": 0.04395009770028559, + "flos": 17274541415040.0, + "grad_norm": 2.1412241302641735, + "language_loss": 0.88622618, + "learning_rate": 3.997959335640013e-06, + "loss": 0.90911222, + "num_input_tokens_seen": 15685630, + "step": 731, + "time_per_iteration": 2.6281213760375977 + }, + { + "auxiliary_loss_clip": 0.01225669, + "auxiliary_loss_mlp": 0.01069124, + "balance_loss_clip": 1.05659413, + "balance_loss_mlp": 1.04112196, + "epoch": 0.04401022095295355, + "flos": 12310286261760.0, + "grad_norm": 3.6056457002272446, + "language_loss": 0.88709021, + "learning_rate": 3.997941708816791e-06, + "loss": 0.91003817, + "num_input_tokens_seen": 15698645, + "step": 732, + "time_per_iteration": 2.5138914585113525 + }, + { + "auxiliary_loss_clip": 0.01226525, + "auxiliary_loss_mlp": 0.01079215, + "balance_loss_clip": 1.05725074, + "balance_loss_mlp": 1.04853046, + "epoch": 0.044070344205621524, + "flos": 20959658363520.0, + "grad_norm": 2.1559843978477145, + "language_loss": 0.86021078, + "learning_rate": 3.997924006231419e-06, + "loss": 0.88326818, + "num_input_tokens_seen": 15716775, + "step": 733, + "time_per_iteration": 2.7106125354766846 + }, + { + "auxiliary_loss_clip": 0.01230055, + "auxiliary_loss_mlp": 0.01081277, + "balance_loss_clip": 1.05855608, + "balance_loss_mlp": 1.04839873, + "epoch": 0.044130467458289496, + "flos": 13845288021120.0, + "grad_norm": 2.5605068574916983, + "language_loss": 0.9116447, + "learning_rate": 3.9979062278845685e-06, + "loss": 0.93475807, + "num_input_tokens_seen": 15733320, + "step": 734, + "time_per_iteration": 2.5778093338012695 + }, + { + "auxiliary_loss_clip": 0.01223229, + "auxiliary_loss_mlp": 0.0106918, + "balance_loss_clip": 1.05831099, + "balance_loss_mlp": 1.03992653, + "epoch": 0.04419059071095746, + "flos": 28655063107200.0, + "grad_norm": 3.1402930679379075, + "language_loss": 0.78146839, + "learning_rate": 3.9978883737769125e-06, + "loss": 0.80439246, + "num_input_tokens_seen": 15752705, + "step": 735, + "time_per_iteration": 2.669970989227295 + }, + { + "auxiliary_loss_clip": 0.01218598, + "auxiliary_loss_mlp": 0.01064087, + "balance_loss_clip": 1.05332637, + "balance_loss_mlp": 1.03547645, + "epoch": 0.04425071396362543, + "flos": 28183304856960.0, + "grad_norm": 2.4004207079693685, + "language_loss": 0.8821196, + "learning_rate": 3.9978704439091305e-06, + "loss": 0.90494645, + "num_input_tokens_seen": 15772800, + "step": 736, + "time_per_iteration": 2.660985231399536 + }, + { + "auxiliary_loss_clip": 0.01224575, + "auxiliary_loss_mlp": 0.01077517, + "balance_loss_clip": 1.06106138, + "balance_loss_mlp": 1.04795313, + "epoch": 0.0443108372162934, + "flos": 23658452778240.0, + "grad_norm": 1.6885361130834633, + "language_loss": 0.84380347, + "learning_rate": 3.997852438281901e-06, + "loss": 0.86682439, + "num_input_tokens_seen": 15793665, + "step": 737, + "time_per_iteration": 2.7081258296966553 + }, + { + "auxiliary_loss_clip": 0.0122791, + "auxiliary_loss_mlp": 0.01073754, + "balance_loss_clip": 1.05892372, + "balance_loss_mlp": 1.04051864, + "epoch": 0.04437096046896137, + "flos": 33979861025280.0, + "grad_norm": 1.944354497807343, + "language_loss": 0.84904975, + "learning_rate": 3.997834356895906e-06, + "loss": 0.87206644, + "num_input_tokens_seen": 15813175, + "step": 738, + "time_per_iteration": 2.6419122219085693 + }, + { + "auxiliary_loss_clip": 0.01086981, + "auxiliary_loss_mlp": 0.0101882, + "balance_loss_clip": 1.01261103, + "balance_loss_mlp": 1.01355076, + "epoch": 0.04443108372162934, + "flos": 67397506375680.0, + "grad_norm": 1.1742906695634687, + "language_loss": 0.59160984, + "learning_rate": 3.9978161997518324e-06, + "loss": 0.6126678, + "num_input_tokens_seen": 15872050, + "step": 739, + "time_per_iteration": 3.1104483604431152 + }, + { + "auxiliary_loss_clip": 0.01224065, + "auxiliary_loss_mlp": 0.01067969, + "balance_loss_clip": 1.05841923, + "balance_loss_mlp": 1.03821468, + "epoch": 0.04449120697429731, + "flos": 29752672953600.0, + "grad_norm": 2.7297088734343475, + "language_loss": 0.91519815, + "learning_rate": 3.997797966850369e-06, + "loss": 0.93811852, + "num_input_tokens_seen": 15891085, + "step": 740, + "time_per_iteration": 2.7313599586486816 + }, + { + "auxiliary_loss_clip": 0.01229841, + "auxiliary_loss_mlp": 0.01067486, + "balance_loss_clip": 1.06099546, + "balance_loss_mlp": 1.03849435, + "epoch": 0.04455133022696528, + "flos": 36502119072000.0, + "grad_norm": 2.0190329926601756, + "language_loss": 0.72224152, + "learning_rate": 3.997779658192205e-06, + "loss": 0.74521482, + "num_input_tokens_seen": 15914225, + "step": 741, + "time_per_iteration": 2.684311866760254 + }, + { + "auxiliary_loss_clip": 0.01220566, + "auxiliary_loss_mlp": 0.01076538, + "balance_loss_clip": 1.05524278, + "balance_loss_mlp": 1.04742765, + "epoch": 0.044611453479633245, + "flos": 28803661672320.0, + "grad_norm": 1.7602651498571436, + "language_loss": 0.88506305, + "learning_rate": 3.997761273778037e-06, + "loss": 0.90803409, + "num_input_tokens_seen": 15934540, + "step": 742, + "time_per_iteration": 2.6436922550201416 + }, + { + "auxiliary_loss_clip": 0.01221202, + "auxiliary_loss_mlp": 0.01059812, + "balance_loss_clip": 1.0561465, + "balance_loss_mlp": 1.02883005, + "epoch": 0.04467157673230122, + "flos": 20010970304640.0, + "grad_norm": 2.450165841937834, + "language_loss": 0.84104776, + "learning_rate": 3.997742813608561e-06, + "loss": 0.86385787, + "num_input_tokens_seen": 15952560, + "step": 743, + "time_per_iteration": 2.5945041179656982 + }, + { + "auxiliary_loss_clip": 0.01227272, + "auxiliary_loss_mlp": 0.01068291, + "balance_loss_clip": 1.05791283, + "balance_loss_mlp": 1.03906095, + "epoch": 0.04473169998496919, + "flos": 18004964480640.0, + "grad_norm": 2.1031554005853463, + "language_loss": 0.8003279, + "learning_rate": 3.997724277684479e-06, + "loss": 0.82328355, + "num_input_tokens_seen": 15970620, + "step": 744, + "time_per_iteration": 2.521329402923584 + }, + { + "auxiliary_loss_clip": 0.01220501, + "auxiliary_loss_mlp": 0.01071299, + "balance_loss_clip": 1.05575752, + "balance_loss_mlp": 1.04255772, + "epoch": 0.044791823237637154, + "flos": 20631722169600.0, + "grad_norm": 6.043009716076407, + "language_loss": 0.85333025, + "learning_rate": 3.99770566600649e-06, + "loss": 0.87624824, + "num_input_tokens_seen": 15987325, + "step": 745, + "time_per_iteration": 2.7512145042419434 + }, + { + "auxiliary_loss_clip": 0.0121973, + "auxiliary_loss_mlp": 0.0106214, + "balance_loss_clip": 1.05431402, + "balance_loss_mlp": 1.03231382, + "epoch": 0.04485194649030513, + "flos": 31176171918720.0, + "grad_norm": 1.9096896913973602, + "language_loss": 0.68898392, + "learning_rate": 3.997686978575302e-06, + "loss": 0.7118026, + "num_input_tokens_seen": 16008310, + "step": 746, + "time_per_iteration": 2.774487257003784 + }, + { + "auxiliary_loss_clip": 0.0122761, + "auxiliary_loss_mlp": 0.01080168, + "balance_loss_clip": 1.06166697, + "balance_loss_mlp": 1.04950774, + "epoch": 0.04491206974297309, + "flos": 26143291831680.0, + "grad_norm": 2.825521403882454, + "language_loss": 0.69170547, + "learning_rate": 3.997668215391625e-06, + "loss": 0.71478331, + "num_input_tokens_seen": 16029620, + "step": 747, + "time_per_iteration": 4.498899221420288 + }, + { + "auxiliary_loss_clip": 0.01229495, + "auxiliary_loss_mlp": 0.01082603, + "balance_loss_clip": 1.05969191, + "balance_loss_mlp": 1.05208588, + "epoch": 0.044972192995641064, + "flos": 20667668705280.0, + "grad_norm": 3.8957744916158483, + "language_loss": 0.66604781, + "learning_rate": 3.997649376456168e-06, + "loss": 0.68916881, + "num_input_tokens_seen": 16049065, + "step": 748, + "time_per_iteration": 4.221456527709961 + }, + { + "auxiliary_loss_clip": 0.01229131, + "auxiliary_loss_mlp": 0.01081511, + "balance_loss_clip": 1.0639534, + "balance_loss_mlp": 1.05153012, + "epoch": 0.045032316248309036, + "flos": 16106834177280.0, + "grad_norm": 2.2866997806195424, + "language_loss": 0.76567501, + "learning_rate": 3.997630461769647e-06, + "loss": 0.78878146, + "num_input_tokens_seen": 16066765, + "step": 749, + "time_per_iteration": 4.199462175369263 + }, + { + "auxiliary_loss_clip": 0.0122814, + "auxiliary_loss_mlp": 0.01077132, + "balance_loss_clip": 1.06070077, + "balance_loss_mlp": 1.04725802, + "epoch": 0.045092439500977, + "flos": 17858843953920.0, + "grad_norm": 3.0649278153963047, + "language_loss": 0.88953304, + "learning_rate": 3.997611471332778e-06, + "loss": 0.91258579, + "num_input_tokens_seen": 16085980, + "step": 750, + "time_per_iteration": 2.707911968231201 + }, + { + "auxiliary_loss_clip": 0.01223724, + "auxiliary_loss_mlp": 0.0107243, + "balance_loss_clip": 1.05465925, + "balance_loss_mlp": 1.03967094, + "epoch": 0.04515256275364497, + "flos": 24462815990400.0, + "grad_norm": 1.8243404307458593, + "language_loss": 0.74258852, + "learning_rate": 3.9975924051462825e-06, + "loss": 0.76555014, + "num_input_tokens_seen": 16106260, + "step": 751, + "time_per_iteration": 2.698618173599243 + }, + { + "auxiliary_loss_clip": 0.01221587, + "auxiliary_loss_mlp": 0.01072647, + "balance_loss_clip": 1.05558872, + "balance_loss_mlp": 1.04313052, + "epoch": 0.04521268600631294, + "flos": 20916385453440.0, + "grad_norm": 2.21553795524734, + "language_loss": 0.69325662, + "learning_rate": 3.997573263210883e-06, + "loss": 0.71619898, + "num_input_tokens_seen": 16123475, + "step": 752, + "time_per_iteration": 2.556725025177002 + }, + { + "auxiliary_loss_clip": 0.01221036, + "auxiliary_loss_mlp": 0.01061141, + "balance_loss_clip": 1.0553062, + "balance_loss_mlp": 1.03193498, + "epoch": 0.04527280925898091, + "flos": 13371374954880.0, + "grad_norm": 3.57080056758631, + "language_loss": 0.92598307, + "learning_rate": 3.997554045527305e-06, + "loss": 0.94880486, + "num_input_tokens_seen": 16138335, + "step": 753, + "time_per_iteration": 2.5786778926849365 + }, + { + "auxiliary_loss_clip": 0.01225968, + "auxiliary_loss_mlp": 0.01083531, + "balance_loss_clip": 1.05835366, + "balance_loss_mlp": 1.05313301, + "epoch": 0.04533293251164888, + "flos": 23254565276160.0, + "grad_norm": 2.111409938954196, + "language_loss": 0.90943038, + "learning_rate": 3.997534752096277e-06, + "loss": 0.9325254, + "num_input_tokens_seen": 16157110, + "step": 754, + "time_per_iteration": 2.6670725345611572 + }, + { + "auxiliary_loss_clip": 0.01214643, + "auxiliary_loss_mlp": 0.01074911, + "balance_loss_clip": 1.05596292, + "balance_loss_mlp": 1.04358315, + "epoch": 0.04539305576431685, + "flos": 12422004537600.0, + "grad_norm": 2.0708986640414504, + "language_loss": 0.78867912, + "learning_rate": 3.997515382918531e-06, + "loss": 0.8115747, + "num_input_tokens_seen": 16174155, + "step": 755, + "time_per_iteration": 2.6617043018341064 + }, + { + "auxiliary_loss_clip": 0.01226515, + "auxiliary_loss_mlp": 0.01083392, + "balance_loss_clip": 1.05875802, + "balance_loss_mlp": 1.05401909, + "epoch": 0.04545317901698482, + "flos": 16070995382400.0, + "grad_norm": 2.9827964459438197, + "language_loss": 0.79072207, + "learning_rate": 3.9974959379948015e-06, + "loss": 0.81382114, + "num_input_tokens_seen": 16192240, + "step": 756, + "time_per_iteration": 2.6786365509033203 + }, + { + "auxiliary_loss_clip": 0.01087771, + "auxiliary_loss_mlp": 0.01012726, + "balance_loss_clip": 1.0148834, + "balance_loss_mlp": 1.00788581, + "epoch": 0.045513302269652785, + "flos": 66396139021440.0, + "grad_norm": 0.8087404193152781, + "language_loss": 0.62688541, + "learning_rate": 3.997476417325827e-06, + "loss": 0.64789033, + "num_input_tokens_seen": 16255775, + "step": 757, + "time_per_iteration": 3.2638063430786133 + }, + { + "auxiliary_loss_clip": 0.01221383, + "auxiliary_loss_mlp": 0.01070563, + "balance_loss_clip": 1.05657399, + "balance_loss_mlp": 1.04210758, + "epoch": 0.04557342552232076, + "flos": 21471169991040.0, + "grad_norm": 1.4985290265210922, + "language_loss": 0.84321779, + "learning_rate": 3.997456820912346e-06, + "loss": 0.86613727, + "num_input_tokens_seen": 16277015, + "step": 758, + "time_per_iteration": 2.6442883014678955 + }, + { + "auxiliary_loss_clip": 0.01215335, + "auxiliary_loss_mlp": 0.01067109, + "balance_loss_clip": 1.05120242, + "balance_loss_mlp": 1.03886819, + "epoch": 0.04563354877498873, + "flos": 23732680233600.0, + "grad_norm": 1.8086611785518862, + "language_loss": 0.88411224, + "learning_rate": 3.997437148755101e-06, + "loss": 0.90693665, + "num_input_tokens_seen": 16296005, + "step": 759, + "time_per_iteration": 2.631491184234619 + }, + { + "auxiliary_loss_clip": 0.01224936, + "auxiliary_loss_mlp": 0.01072669, + "balance_loss_clip": 1.05921936, + "balance_loss_mlp": 1.04224706, + "epoch": 0.045693672027656694, + "flos": 25735741142400.0, + "grad_norm": 1.9966260034834837, + "language_loss": 0.74118954, + "learning_rate": 3.9974174008548405e-06, + "loss": 0.76416558, + "num_input_tokens_seen": 16315300, + "step": 760, + "time_per_iteration": 2.760021448135376 + }, + { + "auxiliary_loss_clip": 0.01225332, + "auxiliary_loss_mlp": 0.01075665, + "balance_loss_clip": 1.0618813, + "balance_loss_mlp": 1.04706645, + "epoch": 0.045753795280324666, + "flos": 19719016560000.0, + "grad_norm": 2.4567411434262585, + "language_loss": 0.8195048, + "learning_rate": 3.9973975772123105e-06, + "loss": 0.84251475, + "num_input_tokens_seen": 16333820, + "step": 761, + "time_per_iteration": 2.790785551071167 + }, + { + "auxiliary_loss_clip": 0.0121612, + "auxiliary_loss_mlp": 0.01075769, + "balance_loss_clip": 1.05494714, + "balance_loss_mlp": 1.04643154, + "epoch": 0.04581391853299264, + "flos": 23255786338560.0, + "grad_norm": 1.6783021081714637, + "language_loss": 0.79556394, + "learning_rate": 3.997377677828266e-06, + "loss": 0.81848282, + "num_input_tokens_seen": 16355290, + "step": 762, + "time_per_iteration": 2.6188952922821045 + }, + { + "auxiliary_loss_clip": 0.01078812, + "auxiliary_loss_mlp": 0.01004166, + "balance_loss_clip": 1.00718617, + "balance_loss_mlp": 0.99961245, + "epoch": 0.0458740417856606, + "flos": 64231155601920.0, + "grad_norm": 1.010704402562237, + "language_loss": 0.5868777, + "learning_rate": 3.9973577027034585e-06, + "loss": 0.6077075, + "num_input_tokens_seen": 16415995, + "step": 763, + "time_per_iteration": 3.319258689880371 + }, + { + "auxiliary_loss_clip": 0.01221711, + "auxiliary_loss_mlp": 0.0107366, + "balance_loss_clip": 1.05572629, + "balance_loss_mlp": 1.04500198, + "epoch": 0.045934165038328575, + "flos": 20770121272320.0, + "grad_norm": 2.410076440559906, + "language_loss": 0.87853664, + "learning_rate": 3.9973376518386475e-06, + "loss": 0.90149033, + "num_input_tokens_seen": 16433120, + "step": 764, + "time_per_iteration": 2.791978120803833 + }, + { + "auxiliary_loss_clip": 0.01222308, + "auxiliary_loss_mlp": 0.01078292, + "balance_loss_clip": 1.05832624, + "balance_loss_mlp": 1.04953837, + "epoch": 0.04599428829099654, + "flos": 30262891691520.0, + "grad_norm": 2.334438317098767, + "language_loss": 0.85722953, + "learning_rate": 3.997317525234592e-06, + "loss": 0.88023549, + "num_input_tokens_seen": 16453360, + "step": 765, + "time_per_iteration": 2.679691791534424 + }, + { + "auxiliary_loss_clip": 0.01224942, + "auxiliary_loss_mlp": 0.01075582, + "balance_loss_clip": 1.05891478, + "balance_loss_mlp": 1.0441587, + "epoch": 0.04605441154366451, + "flos": 23038921975680.0, + "grad_norm": 2.4665875125784646, + "language_loss": 0.88218737, + "learning_rate": 3.997297322892056e-06, + "loss": 0.90519261, + "num_input_tokens_seen": 16471160, + "step": 766, + "time_per_iteration": 2.5915868282318115 + }, + { + "auxiliary_loss_clip": 0.01219553, + "auxiliary_loss_mlp": 0.01073097, + "balance_loss_clip": 1.05574799, + "balance_loss_mlp": 1.04501152, + "epoch": 0.046114534796332485, + "flos": 22017407091840.0, + "grad_norm": 2.4438010404237382, + "language_loss": 0.84037167, + "learning_rate": 3.997277044811806e-06, + "loss": 0.86329812, + "num_input_tokens_seen": 16488940, + "step": 767, + "time_per_iteration": 2.584214687347412 + }, + { + "auxiliary_loss_clip": 0.01220807, + "auxiliary_loss_mlp": 0.0106106, + "balance_loss_clip": 1.05914187, + "balance_loss_mlp": 1.03199708, + "epoch": 0.04617465804900045, + "flos": 29862380067840.0, + "grad_norm": 2.765077748867124, + "language_loss": 0.86822236, + "learning_rate": 3.99725669099461e-06, + "loss": 0.89104104, + "num_input_tokens_seen": 16509505, + "step": 768, + "time_per_iteration": 2.673823118209839 + }, + { + "auxiliary_loss_clip": 0.01217122, + "auxiliary_loss_mlp": 0.01072581, + "balance_loss_clip": 1.0523566, + "balance_loss_mlp": 1.04445922, + "epoch": 0.04623478130166842, + "flos": 25630056351360.0, + "grad_norm": 4.414071466816872, + "language_loss": 0.7520172, + "learning_rate": 3.9972362614412395e-06, + "loss": 0.77491426, + "num_input_tokens_seen": 16528840, + "step": 769, + "time_per_iteration": 2.596041440963745 + }, + { + "auxiliary_loss_clip": 0.01217386, + "auxiliary_loss_mlp": 0.01063869, + "balance_loss_clip": 1.05758023, + "balance_loss_mlp": 1.03708279, + "epoch": 0.04629490455433639, + "flos": 20449080489600.0, + "grad_norm": 2.00944484634459, + "language_loss": 0.86598516, + "learning_rate": 3.997215756152471e-06, + "loss": 0.88879776, + "num_input_tokens_seen": 16548335, + "step": 770, + "time_per_iteration": 2.5861148834228516 + }, + { + "auxiliary_loss_clip": 0.0122327, + "auxiliary_loss_mlp": 0.01070165, + "balance_loss_clip": 1.05499506, + "balance_loss_mlp": 1.04126871, + "epoch": 0.04635502780700436, + "flos": 23148736830720.0, + "grad_norm": 2.0247557592116756, + "language_loss": 0.87453276, + "learning_rate": 3.99719517512908e-06, + "loss": 0.89746714, + "num_input_tokens_seen": 16567725, + "step": 771, + "time_per_iteration": 2.6070799827575684 + }, + { + "auxiliary_loss_clip": 0.01223262, + "auxiliary_loss_mlp": 0.01082436, + "balance_loss_clip": 1.05402339, + "balance_loss_mlp": 1.05239487, + "epoch": 0.04641515105967233, + "flos": 23292020183040.0, + "grad_norm": 2.0116108771986845, + "language_loss": 0.83784211, + "learning_rate": 3.997174518371848e-06, + "loss": 0.86089909, + "num_input_tokens_seen": 16588175, + "step": 772, + "time_per_iteration": 2.6010799407958984 + }, + { + "auxiliary_loss_clip": 0.01219811, + "auxiliary_loss_mlp": 0.0106663, + "balance_loss_clip": 1.0581485, + "balance_loss_mlp": 1.03925943, + "epoch": 0.046475274312340296, + "flos": 25115204759040.0, + "grad_norm": 2.1110857249584454, + "language_loss": 0.73931146, + "learning_rate": 3.997153785881557e-06, + "loss": 0.76217586, + "num_input_tokens_seen": 16607735, + "step": 773, + "time_per_iteration": 2.623885154724121 + }, + { + "auxiliary_loss_clip": 0.01214783, + "auxiliary_loss_mlp": 0.01073037, + "balance_loss_clip": 1.05638194, + "balance_loss_mlp": 1.04318666, + "epoch": 0.04653539756500827, + "flos": 25264916645760.0, + "grad_norm": 2.0534262490470305, + "language_loss": 0.78725278, + "learning_rate": 3.997132977658996e-06, + "loss": 0.81013107, + "num_input_tokens_seen": 16627225, + "step": 774, + "time_per_iteration": 2.865659713745117 + }, + { + "auxiliary_loss_clip": 0.01213129, + "auxiliary_loss_mlp": 0.01066959, + "balance_loss_clip": 1.05424857, + "balance_loss_mlp": 1.03990996, + "epoch": 0.046595520817676234, + "flos": 35404150089600.0, + "grad_norm": 2.95044694952362, + "language_loss": 0.732391, + "learning_rate": 3.997112093704952e-06, + "loss": 0.7551918, + "num_input_tokens_seen": 16647785, + "step": 775, + "time_per_iteration": 2.6368823051452637 + }, + { + "auxiliary_loss_clip": 0.01218777, + "auxiliary_loss_mlp": 0.01058419, + "balance_loss_clip": 1.05620766, + "balance_loss_mlp": 1.02947521, + "epoch": 0.046655644070344206, + "flos": 18112516778880.0, + "grad_norm": 1.6677439970812271, + "language_loss": 0.77214617, + "learning_rate": 3.997091134020217e-06, + "loss": 0.79491812, + "num_input_tokens_seen": 16667555, + "step": 776, + "time_per_iteration": 2.5409693717956543 + }, + { + "auxiliary_loss_clip": 0.01211169, + "auxiliary_loss_mlp": 0.01063346, + "balance_loss_clip": 1.05224395, + "balance_loss_mlp": 1.03639317, + "epoch": 0.04671576732301218, + "flos": 29205286617600.0, + "grad_norm": 2.1670330294977878, + "language_loss": 0.71246982, + "learning_rate": 3.997070098605585e-06, + "loss": 0.73521501, + "num_input_tokens_seen": 16686875, + "step": 777, + "time_per_iteration": 2.5971662998199463 + }, + { + "auxiliary_loss_clip": 0.01216529, + "auxiliary_loss_mlp": 0.01076772, + "balance_loss_clip": 1.05555725, + "balance_loss_mlp": 1.04706478, + "epoch": 0.04677589057568014, + "flos": 30478319510400.0, + "grad_norm": 1.8185198431644762, + "language_loss": 0.76767671, + "learning_rate": 3.997048987461856e-06, + "loss": 0.79060972, + "num_input_tokens_seen": 16706420, + "step": 778, + "time_per_iteration": 2.6778624057769775 + }, + { + "auxiliary_loss_clip": 0.01214291, + "auxiliary_loss_mlp": 0.01068616, + "balance_loss_clip": 1.05462074, + "balance_loss_mlp": 1.03937364, + "epoch": 0.046836013828348115, + "flos": 20557674282240.0, + "grad_norm": 2.797950847685566, + "language_loss": 0.79160368, + "learning_rate": 3.997027800589829e-06, + "loss": 0.81443274, + "num_input_tokens_seen": 16726390, + "step": 779, + "time_per_iteration": 2.549663543701172 + }, + { + "auxiliary_loss_clip": 0.01207307, + "auxiliary_loss_mlp": 0.01071823, + "balance_loss_clip": 1.05255592, + "balance_loss_mlp": 1.04440439, + "epoch": 0.04689613708101608, + "flos": 25447378757760.0, + "grad_norm": 1.6817943654140255, + "language_loss": 0.77615714, + "learning_rate": 3.997006537990308e-06, + "loss": 0.79894847, + "num_input_tokens_seen": 16748965, + "step": 780, + "time_per_iteration": 2.615443706512451 + }, + { + "auxiliary_loss_clip": 0.01213562, + "auxiliary_loss_mlp": 0.01070831, + "balance_loss_clip": 1.05559683, + "balance_loss_mlp": 1.04443824, + "epoch": 0.04695626033368405, + "flos": 23001395241600.0, + "grad_norm": 1.8022054442518833, + "language_loss": 0.76478279, + "learning_rate": 3.996985199664099e-06, + "loss": 0.78762668, + "num_input_tokens_seen": 16768620, + "step": 781, + "time_per_iteration": 2.6395726203918457 + }, + { + "auxiliary_loss_clip": 0.01223888, + "auxiliary_loss_mlp": 0.01074396, + "balance_loss_clip": 1.05877674, + "balance_loss_mlp": 1.04477215, + "epoch": 0.047016383586352024, + "flos": 29133357632640.0, + "grad_norm": 2.4121167668280443, + "language_loss": 0.73727506, + "learning_rate": 3.99696378561201e-06, + "loss": 0.7602579, + "num_input_tokens_seen": 16789755, + "step": 782, + "time_per_iteration": 2.6816394329071045 + }, + { + "auxiliary_loss_clip": 0.01217583, + "auxiliary_loss_mlp": 0.01068925, + "balance_loss_clip": 1.05807388, + "balance_loss_mlp": 1.04155493, + "epoch": 0.04707650683901999, + "flos": 14976330451200.0, + "grad_norm": 2.310930325104245, + "language_loss": 0.80398589, + "learning_rate": 3.996942295834855e-06, + "loss": 0.82685095, + "num_input_tokens_seen": 16807585, + "step": 783, + "time_per_iteration": 2.588198184967041 + }, + { + "auxiliary_loss_clip": 0.01209064, + "auxiliary_loss_mlp": 0.0106384, + "balance_loss_clip": 1.0548811, + "balance_loss_mlp": 1.03713715, + "epoch": 0.04713663009168796, + "flos": 21651118151040.0, + "grad_norm": 2.651737032487811, + "language_loss": 0.81711274, + "learning_rate": 3.996920730333448e-06, + "loss": 0.83984184, + "num_input_tokens_seen": 16827220, + "step": 784, + "time_per_iteration": 2.7356221675872803 + }, + { + "auxiliary_loss_clip": 0.0121531, + "auxiliary_loss_mlp": 0.01075444, + "balance_loss_clip": 1.05334759, + "balance_loss_mlp": 1.04833567, + "epoch": 0.04719675334435593, + "flos": 21325408600320.0, + "grad_norm": 2.23292544158374, + "language_loss": 0.80372351, + "learning_rate": 3.996899089108607e-06, + "loss": 0.82663107, + "num_input_tokens_seen": 16846230, + "step": 785, + "time_per_iteration": 2.72523832321167 + }, + { + "auxiliary_loss_clip": 0.01219469, + "auxiliary_loss_mlp": 0.01068251, + "balance_loss_clip": 1.06141424, + "balance_loss_mlp": 1.04237044, + "epoch": 0.0472568765970239, + "flos": 17931383470080.0, + "grad_norm": 5.13320079055433, + "language_loss": 0.89526916, + "learning_rate": 3.996877372161152e-06, + "loss": 0.91814631, + "num_input_tokens_seen": 16865325, + "step": 786, + "time_per_iteration": 2.893051862716675 + }, + { + "auxiliary_loss_clip": 0.01213306, + "auxiliary_loss_mlp": 0.01068624, + "balance_loss_clip": 1.04808021, + "balance_loss_mlp": 1.03858376, + "epoch": 0.04731699984969187, + "flos": 18077324428800.0, + "grad_norm": 3.558801258998313, + "language_loss": 0.76622844, + "learning_rate": 3.9968555794919065e-06, + "loss": 0.78904778, + "num_input_tokens_seen": 16882930, + "step": 787, + "time_per_iteration": 2.5354130268096924 + }, + { + "auxiliary_loss_clip": 0.01223383, + "auxiliary_loss_mlp": 0.01071535, + "balance_loss_clip": 1.06057882, + "balance_loss_mlp": 1.04228091, + "epoch": 0.047377123102359836, + "flos": 23185078416000.0, + "grad_norm": 6.919329025184245, + "language_loss": 0.80991066, + "learning_rate": 3.996833711101698e-06, + "loss": 0.83285981, + "num_input_tokens_seen": 16900710, + "step": 788, + "time_per_iteration": 2.672304391860962 + }, + { + "auxiliary_loss_clip": 0.01213328, + "auxiliary_loss_mlp": 0.0107951, + "balance_loss_clip": 1.05696201, + "balance_loss_mlp": 1.04877758, + "epoch": 0.04743724635502781, + "flos": 22747794243840.0, + "grad_norm": 2.706482110134917, + "language_loss": 0.84445721, + "learning_rate": 3.996811766991355e-06, + "loss": 0.86738551, + "num_input_tokens_seen": 16919210, + "step": 789, + "time_per_iteration": 2.601733684539795 + }, + { + "auxiliary_loss_clip": 0.01216904, + "auxiliary_loss_mlp": 0.01075671, + "balance_loss_clip": 1.05744743, + "balance_loss_mlp": 1.04753792, + "epoch": 0.04749736960769577, + "flos": 17238702620160.0, + "grad_norm": 1.8338829587760241, + "language_loss": 0.82003343, + "learning_rate": 3.996789747161709e-06, + "loss": 0.84295923, + "num_input_tokens_seen": 16937125, + "step": 790, + "time_per_iteration": 2.5930819511413574 + }, + { + "auxiliary_loss_clip": 0.01212285, + "auxiliary_loss_mlp": 0.01069052, + "balance_loss_clip": 1.05291879, + "balance_loss_mlp": 1.03929758, + "epoch": 0.047557492860363745, + "flos": 40479261592320.0, + "grad_norm": 2.387751970592113, + "language_loss": 0.8822248, + "learning_rate": 3.996767651613597e-06, + "loss": 0.90503824, + "num_input_tokens_seen": 16958610, + "step": 791, + "time_per_iteration": 2.8101847171783447 + }, + { + "auxiliary_loss_clip": 0.01216565, + "auxiliary_loss_mlp": 0.01068923, + "balance_loss_clip": 1.05640411, + "balance_loss_mlp": 1.03955007, + "epoch": 0.04761761611303172, + "flos": 18698004466560.0, + "grad_norm": 1.9856901553952309, + "language_loss": 0.90381223, + "learning_rate": 3.996745480347854e-06, + "loss": 0.92666709, + "num_input_tokens_seen": 16977300, + "step": 792, + "time_per_iteration": 2.590496063232422 + }, + { + "auxiliary_loss_clip": 0.01215238, + "auxiliary_loss_mlp": 0.01072081, + "balance_loss_clip": 1.0537008, + "balance_loss_mlp": 1.04516363, + "epoch": 0.04767773936569968, + "flos": 20921987975040.0, + "grad_norm": 1.936105140735928, + "language_loss": 0.73372513, + "learning_rate": 3.996723233365324e-06, + "loss": 0.75659835, + "num_input_tokens_seen": 16994950, + "step": 793, + "time_per_iteration": 2.672757148742676 + }, + { + "auxiliary_loss_clip": 0.01219081, + "auxiliary_loss_mlp": 0.0106927, + "balance_loss_clip": 1.05575657, + "balance_loss_mlp": 1.03984928, + "epoch": 0.047737862618367655, + "flos": 23732680233600.0, + "grad_norm": 1.9418183495046901, + "language_loss": 0.86372185, + "learning_rate": 3.996700910666847e-06, + "loss": 0.88660538, + "num_input_tokens_seen": 17014760, + "step": 794, + "time_per_iteration": 2.634957790374756 + }, + { + "auxiliary_loss_clip": 0.01218774, + "auxiliary_loss_mlp": 0.0107664, + "balance_loss_clip": 1.05584049, + "balance_loss_mlp": 1.04714775, + "epoch": 0.04779798587103562, + "flos": 23695764030720.0, + "grad_norm": 3.0244094054834396, + "language_loss": 0.69488811, + "learning_rate": 3.996678512253272e-06, + "loss": 0.71784228, + "num_input_tokens_seen": 17032715, + "step": 795, + "time_per_iteration": 4.160778522491455 + }, + { + "auxiliary_loss_clip": 0.01213596, + "auxiliary_loss_mlp": 0.01076686, + "balance_loss_clip": 1.05454969, + "balance_loss_mlp": 1.04666948, + "epoch": 0.04785810912370359, + "flos": 23183641872000.0, + "grad_norm": 1.8184842150640765, + "language_loss": 0.80918312, + "learning_rate": 3.996656038125449e-06, + "loss": 0.83208591, + "num_input_tokens_seen": 17052215, + "step": 796, + "time_per_iteration": 4.165215969085693 + }, + { + "auxiliary_loss_clip": 0.012152, + "auxiliary_loss_mlp": 0.0106457, + "balance_loss_clip": 1.05459964, + "balance_loss_mlp": 1.03570962, + "epoch": 0.047918232376371564, + "flos": 18040623707520.0, + "grad_norm": 2.14053819783508, + "language_loss": 0.81173545, + "learning_rate": 3.996633488284228e-06, + "loss": 0.83453315, + "num_input_tokens_seen": 17069225, + "step": 797, + "time_per_iteration": 4.175282716751099 + }, + { + "auxiliary_loss_clip": 0.01079235, + "auxiliary_loss_mlp": 0.01011163, + "balance_loss_clip": 1.0080297, + "balance_loss_mlp": 1.00594139, + "epoch": 0.04797835562903953, + "flos": 62442588758400.0, + "grad_norm": 0.9186517671566723, + "language_loss": 0.64414394, + "learning_rate": 3.996610862730465e-06, + "loss": 0.665048, + "num_input_tokens_seen": 17126680, + "step": 798, + "time_per_iteration": 3.0698280334472656 + }, + { + "auxiliary_loss_clip": 0.0121976, + "auxiliary_loss_mlp": 0.01071754, + "balance_loss_clip": 1.0532527, + "balance_loss_mlp": 1.04389524, + "epoch": 0.0480384788817075, + "flos": 21507296094720.0, + "grad_norm": 1.9587857161949922, + "language_loss": 0.91242468, + "learning_rate": 3.996588161465018e-06, + "loss": 0.93533981, + "num_input_tokens_seen": 17144835, + "step": 799, + "time_per_iteration": 2.5495405197143555 + }, + { + "auxiliary_loss_clip": 0.01216516, + "auxiliary_loss_mlp": 0.01073701, + "balance_loss_clip": 1.05936933, + "balance_loss_mlp": 1.04408956, + "epoch": 0.048098602134375466, + "flos": 21726710323200.0, + "grad_norm": 3.0702345965243945, + "language_loss": 0.86895394, + "learning_rate": 3.996565384488748e-06, + "loss": 0.89185607, + "num_input_tokens_seen": 17165030, + "step": 800, + "time_per_iteration": 2.5901947021484375 + }, + { + "auxiliary_loss_clip": 0.01217389, + "auxiliary_loss_mlp": 0.01068221, + "balance_loss_clip": 1.05485666, + "balance_loss_mlp": 1.04101753, + "epoch": 0.04815872538704344, + "flos": 22931082368640.0, + "grad_norm": 2.8601791660186056, + "language_loss": 0.83504075, + "learning_rate": 3.996542531802518e-06, + "loss": 0.8578968, + "num_input_tokens_seen": 17184895, + "step": 801, + "time_per_iteration": 2.70853590965271 + }, + { + "auxiliary_loss_clip": 0.01215726, + "auxiliary_loss_mlp": 0.01075439, + "balance_loss_clip": 1.05518246, + "balance_loss_mlp": 1.04686451, + "epoch": 0.04821884863971141, + "flos": 43174716042240.0, + "grad_norm": 1.7841452912372096, + "language_loss": 0.79720497, + "learning_rate": 3.996519603407196e-06, + "loss": 0.82011658, + "num_input_tokens_seen": 17208225, + "step": 802, + "time_per_iteration": 2.8269050121307373 + }, + { + "auxiliary_loss_clip": 0.01217757, + "auxiliary_loss_mlp": 0.01069739, + "balance_loss_clip": 1.05768275, + "balance_loss_mlp": 1.04153466, + "epoch": 0.048278971892379376, + "flos": 18620006083200.0, + "grad_norm": 1.9124398027135934, + "language_loss": 0.86333549, + "learning_rate": 3.996496599303649e-06, + "loss": 0.88621044, + "num_input_tokens_seen": 17226305, + "step": 803, + "time_per_iteration": 2.5360331535339355 + }, + { + "auxiliary_loss_clip": 0.0120973, + "auxiliary_loss_mlp": 0.01061535, + "balance_loss_clip": 1.05521274, + "balance_loss_mlp": 1.03378296, + "epoch": 0.04833909514504735, + "flos": 20230061310720.0, + "grad_norm": 2.2488007308325777, + "language_loss": 0.85303998, + "learning_rate": 3.996473519492753e-06, + "loss": 0.87575269, + "num_input_tokens_seen": 17244545, + "step": 804, + "time_per_iteration": 2.565702438354492 + }, + { + "auxiliary_loss_clip": 0.0121472, + "auxiliary_loss_mlp": 0.01068417, + "balance_loss_clip": 1.055475, + "balance_loss_mlp": 1.04018795, + "epoch": 0.04839921839771532, + "flos": 24645170361600.0, + "grad_norm": 2.014228470997478, + "language_loss": 0.86216295, + "learning_rate": 3.99645036397538e-06, + "loss": 0.88499433, + "num_input_tokens_seen": 17265730, + "step": 805, + "time_per_iteration": 2.58823561668396 + }, + { + "auxiliary_loss_clip": 0.0120865, + "auxiliary_loss_mlp": 0.01067374, + "balance_loss_clip": 1.05224895, + "balance_loss_mlp": 1.04027724, + "epoch": 0.048459341650383285, + "flos": 24827452905600.0, + "grad_norm": 1.9551116929667653, + "language_loss": 0.6823343, + "learning_rate": 3.9964271327524085e-06, + "loss": 0.70509446, + "num_input_tokens_seen": 17284820, + "step": 806, + "time_per_iteration": 2.7188003063201904 + }, + { + "auxiliary_loss_clip": 0.012092, + "auxiliary_loss_mlp": 0.01061897, + "balance_loss_clip": 1.05476415, + "balance_loss_mlp": 1.03490841, + "epoch": 0.04851946490305126, + "flos": 22163204396160.0, + "grad_norm": 2.0921764687109645, + "language_loss": 0.76879835, + "learning_rate": 3.9964038258247214e-06, + "loss": 0.79150933, + "num_input_tokens_seen": 17305085, + "step": 807, + "time_per_iteration": 2.673222780227661 + }, + { + "auxiliary_loss_clip": 0.01206155, + "auxiliary_loss_mlp": 0.01064621, + "balance_loss_clip": 1.04984915, + "balance_loss_mlp": 1.03772748, + "epoch": 0.04857958815571922, + "flos": 19792022952960.0, + "grad_norm": 2.2762301206573072, + "language_loss": 0.86858881, + "learning_rate": 3.9963804431932005e-06, + "loss": 0.89129663, + "num_input_tokens_seen": 17322715, + "step": 808, + "time_per_iteration": 2.582641839981079 + }, + { + "auxiliary_loss_clip": 0.01216068, + "auxiliary_loss_mlp": 0.01067175, + "balance_loss_clip": 1.05549383, + "balance_loss_mlp": 1.03923249, + "epoch": 0.048639711408387194, + "flos": 18697968552960.0, + "grad_norm": 2.140111354875045, + "language_loss": 0.89919293, + "learning_rate": 3.996356984858732e-06, + "loss": 0.92202532, + "num_input_tokens_seen": 17341455, + "step": 809, + "time_per_iteration": 2.609889030456543 + }, + { + "auxiliary_loss_clip": 0.01212654, + "auxiliary_loss_mlp": 0.01070877, + "balance_loss_clip": 1.05677009, + "balance_loss_mlp": 1.04329169, + "epoch": 0.048699834661055166, + "flos": 24863507182080.0, + "grad_norm": 2.0209031920096314, + "language_loss": 0.85077977, + "learning_rate": 3.996333450822208e-06, + "loss": 0.87361515, + "num_input_tokens_seen": 17360765, + "step": 810, + "time_per_iteration": 2.684054374694824 + }, + { + "auxiliary_loss_clip": 0.01213684, + "auxiliary_loss_mlp": 0.01066842, + "balance_loss_clip": 1.05445361, + "balance_loss_mlp": 1.03913784, + "epoch": 0.04875995791372313, + "flos": 20704010290560.0, + "grad_norm": 1.9483764831830932, + "language_loss": 0.80766594, + "learning_rate": 3.99630984108452e-06, + "loss": 0.83047122, + "num_input_tokens_seen": 17380625, + "step": 811, + "time_per_iteration": 2.7634057998657227 + }, + { + "auxiliary_loss_clip": 0.01204151, + "auxiliary_loss_mlp": 0.01071793, + "balance_loss_clip": 1.05158567, + "balance_loss_mlp": 1.04516125, + "epoch": 0.048820081166391104, + "flos": 18588297352320.0, + "grad_norm": 1.9374542705021478, + "language_loss": 0.74692822, + "learning_rate": 3.9962861556465615e-06, + "loss": 0.76968765, + "num_input_tokens_seen": 17399355, + "step": 812, + "time_per_iteration": 2.655829668045044 + }, + { + "auxiliary_loss_clip": 0.01208694, + "auxiliary_loss_mlp": 0.01074596, + "balance_loss_clip": 1.05600071, + "balance_loss_mlp": 1.04748726, + "epoch": 0.04888020441905907, + "flos": 22707322594560.0, + "grad_norm": 1.8159952906259882, + "language_loss": 0.90170538, + "learning_rate": 3.996262394509233e-06, + "loss": 0.92453837, + "num_input_tokens_seen": 17418240, + "step": 813, + "time_per_iteration": 2.8502368927001953 + }, + { + "auxiliary_loss_clip": 0.01206702, + "auxiliary_loss_mlp": 0.01057713, + "balance_loss_clip": 1.05386686, + "balance_loss_mlp": 1.03203583, + "epoch": 0.04894032767172704, + "flos": 22784351310720.0, + "grad_norm": 2.1654006842179583, + "language_loss": 0.74759626, + "learning_rate": 3.9962385576734335e-06, + "loss": 0.77024043, + "num_input_tokens_seen": 17436250, + "step": 814, + "time_per_iteration": 2.882859945297241 + }, + { + "auxiliary_loss_clip": 0.01210095, + "auxiliary_loss_mlp": 0.01073045, + "balance_loss_clip": 1.05348909, + "balance_loss_mlp": 1.04488802, + "epoch": 0.04900045092439501, + "flos": 25516147345920.0, + "grad_norm": 2.148604190783859, + "language_loss": 0.83466631, + "learning_rate": 3.9962146451400675e-06, + "loss": 0.85749769, + "num_input_tokens_seen": 17455750, + "step": 815, + "time_per_iteration": 2.6943161487579346 + }, + { + "auxiliary_loss_clip": 0.01212596, + "auxiliary_loss_mlp": 0.0106205, + "balance_loss_clip": 1.05520833, + "balance_loss_mlp": 1.034536, + "epoch": 0.04906057417706298, + "flos": 25958136199680.0, + "grad_norm": 2.3524004956881543, + "language_loss": 0.90915799, + "learning_rate": 3.996190656910043e-06, + "loss": 0.93190438, + "num_input_tokens_seen": 17474995, + "step": 816, + "time_per_iteration": 2.6603341102600098 + }, + { + "auxiliary_loss_clip": 0.01213768, + "auxiliary_loss_mlp": 0.01059405, + "balance_loss_clip": 1.05560088, + "balance_loss_mlp": 1.03216529, + "epoch": 0.04912069742973095, + "flos": 18624638937600.0, + "grad_norm": 2.3072729955530167, + "language_loss": 0.80193591, + "learning_rate": 3.996166592984268e-06, + "loss": 0.82466757, + "num_input_tokens_seen": 17493395, + "step": 817, + "time_per_iteration": 2.6415107250213623 + }, + { + "auxiliary_loss_clip": 0.01209629, + "auxiliary_loss_mlp": 0.01078024, + "balance_loss_clip": 1.05611193, + "balance_loss_mlp": 1.05090356, + "epoch": 0.049180820682398915, + "flos": 23699786353920.0, + "grad_norm": 1.715085905484113, + "language_loss": 0.84781814, + "learning_rate": 3.996142453363656e-06, + "loss": 0.87069476, + "num_input_tokens_seen": 17514565, + "step": 818, + "time_per_iteration": 2.8388895988464355 + }, + { + "auxiliary_loss_clip": 0.01215519, + "auxiliary_loss_mlp": 0.01066979, + "balance_loss_clip": 1.0551784, + "balance_loss_mlp": 1.03854764, + "epoch": 0.04924094393506689, + "flos": 22420396753920.0, + "grad_norm": 2.141545674082967, + "language_loss": 0.75650984, + "learning_rate": 3.996118238049124e-06, + "loss": 0.77933484, + "num_input_tokens_seen": 17534590, + "step": 819, + "time_per_iteration": 2.795238971710205 + }, + { + "auxiliary_loss_clip": 0.01212908, + "auxiliary_loss_mlp": 0.01065129, + "balance_loss_clip": 1.05751038, + "balance_loss_mlp": 1.04004741, + "epoch": 0.04930106718773486, + "flos": 15738246766080.0, + "grad_norm": 2.509584145808956, + "language_loss": 0.84624887, + "learning_rate": 3.996093947041586e-06, + "loss": 0.86902928, + "num_input_tokens_seen": 17551900, + "step": 820, + "time_per_iteration": 2.512408971786499 + }, + { + "auxiliary_loss_clip": 0.01211925, + "auxiliary_loss_mlp": 0.0106688, + "balance_loss_clip": 1.05450141, + "balance_loss_mlp": 1.03981972, + "epoch": 0.049361190440402825, + "flos": 26250628648320.0, + "grad_norm": 1.8104175793244677, + "language_loss": 0.90448511, + "learning_rate": 3.996069580341966e-06, + "loss": 0.92727309, + "num_input_tokens_seen": 17571485, + "step": 821, + "time_per_iteration": 2.6171164512634277 + }, + { + "auxiliary_loss_clip": 0.0120853, + "auxiliary_loss_mlp": 0.01075098, + "balance_loss_clip": 1.05389059, + "balance_loss_mlp": 1.04919338, + "epoch": 0.0494213136930708, + "flos": 21252366293760.0, + "grad_norm": 2.0492241980101045, + "language_loss": 0.89618599, + "learning_rate": 3.996045137951188e-06, + "loss": 0.91902226, + "num_input_tokens_seen": 17591410, + "step": 822, + "time_per_iteration": 2.6115095615386963 + }, + { + "auxiliary_loss_clip": 0.01210895, + "auxiliary_loss_mlp": 0.01063974, + "balance_loss_clip": 1.05685341, + "balance_loss_mlp": 1.03454173, + "epoch": 0.04948143694573876, + "flos": 27965506740480.0, + "grad_norm": 1.7998673420181461, + "language_loss": 0.66950309, + "learning_rate": 3.996020619870178e-06, + "loss": 0.69225174, + "num_input_tokens_seen": 17612010, + "step": 823, + "time_per_iteration": 2.5932204723358154 + }, + { + "auxiliary_loss_clip": 0.010772, + "auxiliary_loss_mlp": 0.01009928, + "balance_loss_clip": 1.00782776, + "balance_loss_mlp": 1.00475454, + "epoch": 0.049541560198406734, + "flos": 66180995533440.0, + "grad_norm": 1.3307818457871912, + "language_loss": 0.62313652, + "learning_rate": 3.995996026099866e-06, + "loss": 0.6440078, + "num_input_tokens_seen": 17673430, + "step": 824, + "time_per_iteration": 3.2050116062164307 + }, + { + "auxiliary_loss_clip": 0.01213297, + "auxiliary_loss_mlp": 0.01071849, + "balance_loss_clip": 1.05460823, + "balance_loss_mlp": 1.04270244, + "epoch": 0.049601683451074706, + "flos": 22892693708160.0, + "grad_norm": 1.8103779694516102, + "language_loss": 0.90601355, + "learning_rate": 3.995971356641185e-06, + "loss": 0.92886496, + "num_input_tokens_seen": 17689545, + "step": 825, + "time_per_iteration": 2.6450541019439697 + }, + { + "auxiliary_loss_clip": 0.01212694, + "auxiliary_loss_mlp": 0.01070894, + "balance_loss_clip": 1.05645943, + "balance_loss_mlp": 1.04223585, + "epoch": 0.04966180670374267, + "flos": 21433643256960.0, + "grad_norm": 2.811813891767578, + "language_loss": 0.67362618, + "learning_rate": 3.9959466114950695e-06, + "loss": 0.69646204, + "num_input_tokens_seen": 17705965, + "step": 826, + "time_per_iteration": 2.720360040664673 + }, + { + "auxiliary_loss_clip": 0.0121441, + "auxiliary_loss_mlp": 0.01068833, + "balance_loss_clip": 1.05786347, + "balance_loss_mlp": 1.04114103, + "epoch": 0.04972192995641064, + "flos": 23107367341440.0, + "grad_norm": 1.87846324042641, + "language_loss": 0.78370333, + "learning_rate": 3.995921790662459e-06, + "loss": 0.80653572, + "num_input_tokens_seen": 17724580, + "step": 827, + "time_per_iteration": 2.631547212600708 + }, + { + "auxiliary_loss_clip": 0.01216173, + "auxiliary_loss_mlp": 0.01080878, + "balance_loss_clip": 1.05675626, + "balance_loss_mlp": 1.0521729, + "epoch": 0.04978205320907861, + "flos": 40406147458560.0, + "grad_norm": 1.9876662593386498, + "language_loss": 0.78798878, + "learning_rate": 3.995896894144294e-06, + "loss": 0.81095934, + "num_input_tokens_seen": 17747755, + "step": 828, + "time_per_iteration": 2.788661003112793 + }, + { + "auxiliary_loss_clip": 0.01203264, + "auxiliary_loss_mlp": 0.01061781, + "balance_loss_clip": 1.05151343, + "balance_loss_mlp": 1.03523314, + "epoch": 0.04984217646174658, + "flos": 25228539146880.0, + "grad_norm": 2.7616994207845065, + "language_loss": 0.83811277, + "learning_rate": 3.995871921941519e-06, + "loss": 0.86076325, + "num_input_tokens_seen": 17768550, + "step": 829, + "time_per_iteration": 2.618488073348999 + }, + { + "auxiliary_loss_clip": 0.0121, + "auxiliary_loss_mlp": 0.01079888, + "balance_loss_clip": 1.05320036, + "balance_loss_mlp": 1.04920363, + "epoch": 0.04990229971441455, + "flos": 15959636242560.0, + "grad_norm": 3.1475516642437804, + "language_loss": 0.75439447, + "learning_rate": 3.99584687405508e-06, + "loss": 0.77729332, + "num_input_tokens_seen": 17786080, + "step": 830, + "time_per_iteration": 2.7773218154907227 + }, + { + "auxiliary_loss_clip": 0.01211715, + "auxiliary_loss_mlp": 0.01073022, + "balance_loss_clip": 1.05459404, + "balance_loss_mlp": 1.04425669, + "epoch": 0.04996242296708252, + "flos": 18405116968320.0, + "grad_norm": 2.363032818564249, + "language_loss": 0.7941103, + "learning_rate": 3.995821750485929e-06, + "loss": 0.81695771, + "num_input_tokens_seen": 17803635, + "step": 831, + "time_per_iteration": 2.67832612991333 + }, + { + "auxiliary_loss_clip": 0.01153484, + "auxiliary_loss_mlp": 0.01077677, + "balance_loss_clip": 1.04933023, + "balance_loss_mlp": 1.05024683, + "epoch": 0.05002254621975049, + "flos": 17858053854720.0, + "grad_norm": 3.466103080211867, + "language_loss": 0.91867781, + "learning_rate": 3.995796551235016e-06, + "loss": 0.94098938, + "num_input_tokens_seen": 17822190, + "step": 832, + "time_per_iteration": 2.82501220703125 + }, + { + "auxiliary_loss_clip": 0.01176303, + "auxiliary_loss_mlp": 0.01080087, + "balance_loss_clip": 1.05093312, + "balance_loss_mlp": 1.05356348, + "epoch": 0.050082669472418455, + "flos": 45660273367680.0, + "grad_norm": 1.9492947451589453, + "language_loss": 0.83439112, + "learning_rate": 3.9957712763032974e-06, + "loss": 0.85695505, + "num_input_tokens_seen": 17846915, + "step": 833, + "time_per_iteration": 2.9548141956329346 + }, + { + "auxiliary_loss_clip": 0.01186007, + "auxiliary_loss_mlp": 0.01065656, + "balance_loss_clip": 1.05247509, + "balance_loss_mlp": 1.037117, + "epoch": 0.05014279272508643, + "flos": 37962067363200.0, + "grad_norm": 2.0908159790115284, + "language_loss": 0.82036215, + "learning_rate": 3.995745925691733e-06, + "loss": 0.84287876, + "num_input_tokens_seen": 17867270, + "step": 834, + "time_per_iteration": 2.8405065536499023 + }, + { + "auxiliary_loss_clip": 0.01202529, + "auxiliary_loss_mlp": 0.01068381, + "balance_loss_clip": 1.05473483, + "balance_loss_mlp": 1.03881693, + "epoch": 0.0502029159777544, + "flos": 20996179516800.0, + "grad_norm": 2.0868988327074374, + "language_loss": 0.91937673, + "learning_rate": 3.995720499401282e-06, + "loss": 0.9420858, + "num_input_tokens_seen": 17884880, + "step": 835, + "time_per_iteration": 2.6293511390686035 + }, + { + "auxiliary_loss_clip": 0.0121303, + "auxiliary_loss_mlp": 0.01074558, + "balance_loss_clip": 1.05255139, + "balance_loss_mlp": 1.04530454, + "epoch": 0.050263039230422364, + "flos": 15888066393600.0, + "grad_norm": 1.91125367169868, + "language_loss": 0.76136231, + "learning_rate": 3.995694997432911e-06, + "loss": 0.78423822, + "num_input_tokens_seen": 17903695, + "step": 836, + "time_per_iteration": 2.562558889389038 + }, + { + "auxiliary_loss_clip": 0.01195777, + "auxiliary_loss_mlp": 0.01076871, + "balance_loss_clip": 1.05399215, + "balance_loss_mlp": 1.04948854, + "epoch": 0.050323162483090336, + "flos": 23732752060800.0, + "grad_norm": 2.0114597967733587, + "language_loss": 0.83628428, + "learning_rate": 3.9956694197875855e-06, + "loss": 0.8590107, + "num_input_tokens_seen": 17920745, + "step": 837, + "time_per_iteration": 2.742666721343994 + }, + { + "auxiliary_loss_clip": 0.01182214, + "auxiliary_loss_mlp": 0.00749988, + "balance_loss_clip": 1.05473316, + "balance_loss_mlp": 1.00067532, + "epoch": 0.0503832857357583, + "flos": 20266223328000.0, + "grad_norm": 2.2960106650816225, + "language_loss": 0.73292971, + "learning_rate": 3.995643766466275e-06, + "loss": 0.75225174, + "num_input_tokens_seen": 17938220, + "step": 838, + "time_per_iteration": 2.763394832611084 + }, + { + "auxiliary_loss_clip": 0.01169716, + "auxiliary_loss_mlp": 0.01070984, + "balance_loss_clip": 1.04619789, + "balance_loss_mlp": 1.04307699, + "epoch": 0.05044340898842627, + "flos": 17785011548160.0, + "grad_norm": 1.657946639806262, + "language_loss": 0.83570671, + "learning_rate": 3.995618037469953e-06, + "loss": 0.85811371, + "num_input_tokens_seen": 17957325, + "step": 839, + "time_per_iteration": 2.6983482837677 + }, + { + "auxiliary_loss_clip": 0.01206103, + "auxiliary_loss_mlp": 0.01077001, + "balance_loss_clip": 1.05277824, + "balance_loss_mlp": 1.04948723, + "epoch": 0.050503532241094246, + "flos": 22966526113920.0, + "grad_norm": 2.590748531973028, + "language_loss": 0.8563484, + "learning_rate": 3.995592232799595e-06, + "loss": 0.87917948, + "num_input_tokens_seen": 17975875, + "step": 840, + "time_per_iteration": 2.565044641494751 + }, + { + "auxiliary_loss_clip": 0.01171265, + "auxiliary_loss_mlp": 0.01066778, + "balance_loss_clip": 1.04783535, + "balance_loss_mlp": 1.03676069, + "epoch": 0.05056365549376221, + "flos": 22776989022720.0, + "grad_norm": 3.9851461309507044, + "language_loss": 0.94292223, + "learning_rate": 3.99556635245618e-06, + "loss": 0.96530265, + "num_input_tokens_seen": 17994340, + "step": 841, + "time_per_iteration": 2.6908726692199707 + }, + { + "auxiliary_loss_clip": 0.01211951, + "auxiliary_loss_mlp": 0.01071519, + "balance_loss_clip": 1.05511761, + "balance_loss_mlp": 1.04190791, + "epoch": 0.05062377874643018, + "flos": 30916968399360.0, + "grad_norm": 3.8299262351527417, + "language_loss": 0.77752399, + "learning_rate": 3.995540396440688e-06, + "loss": 0.80035871, + "num_input_tokens_seen": 18015260, + "step": 842, + "time_per_iteration": 2.833820343017578 + }, + { + "auxiliary_loss_clip": 0.01200728, + "auxiliary_loss_mlp": 0.01071981, + "balance_loss_clip": 1.05608642, + "balance_loss_mlp": 1.0429889, + "epoch": 0.05068390199909815, + "flos": 19647159402240.0, + "grad_norm": 2.4632095941419716, + "language_loss": 0.783198, + "learning_rate": 3.995514364754105e-06, + "loss": 0.80592513, + "num_input_tokens_seen": 18033960, + "step": 843, + "time_per_iteration": 5.680008888244629 + }, + { + "auxiliary_loss_clip": 0.0120157, + "auxiliary_loss_mlp": 0.01062875, + "balance_loss_clip": 1.05601716, + "balance_loss_mlp": 1.03585005, + "epoch": 0.05074402525176612, + "flos": 37962103276800.0, + "grad_norm": 2.3552634756715154, + "language_loss": 0.83382571, + "learning_rate": 3.995488257397417e-06, + "loss": 0.85647017, + "num_input_tokens_seen": 18056700, + "step": 844, + "time_per_iteration": 2.6983189582824707 + }, + { + "auxiliary_loss_clip": 0.01201481, + "auxiliary_loss_mlp": 0.01066298, + "balance_loss_clip": 1.05439162, + "balance_loss_mlp": 1.03868926, + "epoch": 0.05080414850443409, + "flos": 22054610603520.0, + "grad_norm": 3.7617264895751763, + "language_loss": 0.76928413, + "learning_rate": 3.995462074371614e-06, + "loss": 0.79196191, + "num_input_tokens_seen": 18075815, + "step": 845, + "time_per_iteration": 4.450125217437744 + }, + { + "auxiliary_loss_clip": 0.0119299, + "auxiliary_loss_mlp": 0.01068972, + "balance_loss_clip": 1.05227637, + "balance_loss_mlp": 1.04054046, + "epoch": 0.05086427175710206, + "flos": 20225787592320.0, + "grad_norm": 1.7156935924071408, + "language_loss": 0.87492347, + "learning_rate": 3.99543581567769e-06, + "loss": 0.89754307, + "num_input_tokens_seen": 18095095, + "step": 846, + "time_per_iteration": 2.621060609817505 + }, + { + "auxiliary_loss_clip": 0.01189168, + "auxiliary_loss_mlp": 0.01071517, + "balance_loss_clip": 1.05184066, + "balance_loss_mlp": 1.04368186, + "epoch": 0.05092439500977003, + "flos": 15159223526400.0, + "grad_norm": 1.8171822762578396, + "language_loss": 0.87607276, + "learning_rate": 3.9954094813166394e-06, + "loss": 0.89867961, + "num_input_tokens_seen": 18112675, + "step": 847, + "time_per_iteration": 2.6391348838806152 + }, + { + "auxiliary_loss_clip": 0.01158544, + "auxiliary_loss_mlp": 0.01071315, + "balance_loss_clip": 1.05241895, + "balance_loss_mlp": 1.04229891, + "epoch": 0.050984518262437994, + "flos": 22055149307520.0, + "grad_norm": 2.23413625589459, + "language_loss": 0.819399, + "learning_rate": 3.995383071289462e-06, + "loss": 0.84169757, + "num_input_tokens_seen": 18130745, + "step": 848, + "time_per_iteration": 2.7050540447235107 + }, + { + "auxiliary_loss_clip": 0.01211107, + "auxiliary_loss_mlp": 0.01073143, + "balance_loss_clip": 1.05746639, + "balance_loss_mlp": 1.04582024, + "epoch": 0.05104464151510597, + "flos": 30225329043840.0, + "grad_norm": 1.9885401012017514, + "language_loss": 0.87022346, + "learning_rate": 3.995356585597158e-06, + "loss": 0.89306593, + "num_input_tokens_seen": 18152410, + "step": 849, + "time_per_iteration": 2.6627254486083984 + }, + { + "auxiliary_loss_clip": 0.01206386, + "auxiliary_loss_mlp": 0.01063738, + "balance_loss_clip": 1.0533067, + "balance_loss_mlp": 1.03629565, + "epoch": 0.05110476476777394, + "flos": 18332900674560.0, + "grad_norm": 2.197352945423765, + "language_loss": 0.83090079, + "learning_rate": 3.995330024240732e-06, + "loss": 0.85360199, + "num_input_tokens_seen": 18170870, + "step": 850, + "time_per_iteration": 2.5753889083862305 + }, + { + "auxiliary_loss_clip": 0.01196029, + "auxiliary_loss_mlp": 0.01064008, + "balance_loss_clip": 1.05232096, + "balance_loss_mlp": 1.03678107, + "epoch": 0.051164888020441904, + "flos": 37998732170880.0, + "grad_norm": 2.0161150392056113, + "language_loss": 0.65034211, + "learning_rate": 3.995303387221192e-06, + "loss": 0.67294252, + "num_input_tokens_seen": 18191555, + "step": 851, + "time_per_iteration": 2.86295485496521 + }, + { + "auxiliary_loss_clip": 0.01193717, + "auxiliary_loss_mlp": 0.01079201, + "balance_loss_clip": 1.05127347, + "balance_loss_mlp": 1.04873085, + "epoch": 0.051225011273109876, + "flos": 23038634666880.0, + "grad_norm": 3.0084184815667925, + "language_loss": 0.83553326, + "learning_rate": 3.995276674539547e-06, + "loss": 0.85826242, + "num_input_tokens_seen": 18208620, + "step": 852, + "time_per_iteration": 2.6711206436157227 + }, + { + "auxiliary_loss_clip": 0.0118114, + "auxiliary_loss_mlp": 0.01074762, + "balance_loss_clip": 1.05208087, + "balance_loss_mlp": 1.04532981, + "epoch": 0.05128513452577785, + "flos": 18259822454400.0, + "grad_norm": 1.9708463995864245, + "language_loss": 0.8055988, + "learning_rate": 3.995249886196811e-06, + "loss": 0.82815778, + "num_input_tokens_seen": 18226370, + "step": 853, + "time_per_iteration": 2.776932954788208 + }, + { + "auxiliary_loss_clip": 0.01204751, + "auxiliary_loss_mlp": 0.01069694, + "balance_loss_clip": 1.05269372, + "balance_loss_mlp": 1.04166806, + "epoch": 0.05134525777844581, + "flos": 27198957571200.0, + "grad_norm": 1.8943133493757678, + "language_loss": 0.75756276, + "learning_rate": 3.995223022193999e-06, + "loss": 0.78030729, + "num_input_tokens_seen": 18247075, + "step": 854, + "time_per_iteration": 2.7284657955169678 + }, + { + "auxiliary_loss_clip": 0.01183199, + "auxiliary_loss_mlp": 0.01066608, + "balance_loss_clip": 1.0504272, + "balance_loss_mlp": 1.03811646, + "epoch": 0.051405381031113785, + "flos": 28362247436160.0, + "grad_norm": 2.947951252894466, + "language_loss": 0.81744599, + "learning_rate": 3.99519608253213e-06, + "loss": 0.83994406, + "num_input_tokens_seen": 18265680, + "step": 855, + "time_per_iteration": 2.6918699741363525 + }, + { + "auxiliary_loss_clip": 0.01060514, + "auxiliary_loss_mlp": 0.00749979, + "balance_loss_clip": 1.02094507, + "balance_loss_mlp": 1.00103557, + "epoch": 0.05146550428378175, + "flos": 65618169327360.0, + "grad_norm": 0.9849519171623474, + "language_loss": 0.65641665, + "learning_rate": 3.995169067212227e-06, + "loss": 0.67452157, + "num_input_tokens_seen": 18327015, + "step": 856, + "time_per_iteration": 3.197787284851074 + }, + { + "auxiliary_loss_clip": 0.01175685, + "auxiliary_loss_mlp": 0.01056674, + "balance_loss_clip": 1.04931235, + "balance_loss_mlp": 1.02883828, + "epoch": 0.05152562753644972, + "flos": 22054861998720.0, + "grad_norm": 1.825398747319256, + "language_loss": 0.77032697, + "learning_rate": 3.9951419762353116e-06, + "loss": 0.79265058, + "num_input_tokens_seen": 18345235, + "step": 857, + "time_per_iteration": 2.7281460762023926 + }, + { + "auxiliary_loss_clip": 0.01159704, + "auxiliary_loss_mlp": 0.01059353, + "balance_loss_clip": 1.04530334, + "balance_loss_mlp": 1.0306952, + "epoch": 0.051585750789117694, + "flos": 18509544783360.0, + "grad_norm": 2.0215561475296666, + "language_loss": 0.89243221, + "learning_rate": 3.995114809602412e-06, + "loss": 0.91462284, + "num_input_tokens_seen": 18362350, + "step": 858, + "time_per_iteration": 2.647146701812744 + }, + { + "auxiliary_loss_clip": 0.01182235, + "auxiliary_loss_mlp": 0.01062766, + "balance_loss_clip": 1.05259514, + "balance_loss_mlp": 1.03410792, + "epoch": 0.05164587404178566, + "flos": 23730238108800.0, + "grad_norm": 3.0762197456625406, + "language_loss": 0.75246382, + "learning_rate": 3.9950875673145605e-06, + "loss": 0.77491385, + "num_input_tokens_seen": 18383390, + "step": 859, + "time_per_iteration": 2.699126720428467 + }, + { + "auxiliary_loss_clip": 0.01166922, + "auxiliary_loss_mlp": 0.01078421, + "balance_loss_clip": 1.04925549, + "balance_loss_mlp": 1.04766524, + "epoch": 0.05170599729445363, + "flos": 16252882876800.0, + "grad_norm": 2.1207780279817894, + "language_loss": 0.90692717, + "learning_rate": 3.995060249372788e-06, + "loss": 0.92938066, + "num_input_tokens_seen": 18399220, + "step": 860, + "time_per_iteration": 2.696225643157959 + }, + { + "auxiliary_loss_clip": 0.01208634, + "auxiliary_loss_mlp": 0.01066389, + "balance_loss_clip": 1.05568826, + "balance_loss_mlp": 1.0396626, + "epoch": 0.0517661205471216, + "flos": 23985922095360.0, + "grad_norm": 2.1600796920374323, + "language_loss": 0.82210088, + "learning_rate": 3.99503285577813e-06, + "loss": 0.84485102, + "num_input_tokens_seen": 18419005, + "step": 861, + "time_per_iteration": 2.656764030456543 + }, + { + "auxiliary_loss_clip": 0.01184038, + "auxiliary_loss_mlp": 0.01061312, + "balance_loss_clip": 1.05160749, + "balance_loss_mlp": 1.03351283, + "epoch": 0.05182624379978957, + "flos": 29277718392960.0, + "grad_norm": 2.4499781808385603, + "language_loss": 0.78481442, + "learning_rate": 3.995005386531627e-06, + "loss": 0.80726796, + "num_input_tokens_seen": 18440550, + "step": 862, + "time_per_iteration": 2.719419240951538 + }, + { + "auxiliary_loss_clip": 0.01170754, + "auxiliary_loss_mlp": 0.0106945, + "balance_loss_clip": 1.0495919, + "balance_loss_mlp": 1.04252052, + "epoch": 0.05188636705245754, + "flos": 24170826332160.0, + "grad_norm": 1.985082059847255, + "language_loss": 0.89125741, + "learning_rate": 3.9949778416343195e-06, + "loss": 0.91365945, + "num_input_tokens_seen": 18461950, + "step": 863, + "time_per_iteration": 2.6609532833099365 + }, + { + "auxiliary_loss_clip": 0.01186309, + "auxiliary_loss_mlp": 0.01065006, + "balance_loss_clip": 1.05467498, + "balance_loss_mlp": 1.03584766, + "epoch": 0.051946490305125506, + "flos": 26760703731840.0, + "grad_norm": 2.329656288631359, + "language_loss": 0.76055783, + "learning_rate": 3.9949502210872525e-06, + "loss": 0.78307104, + "num_input_tokens_seen": 18480555, + "step": 864, + "time_per_iteration": 2.67069935798645 + }, + { + "auxiliary_loss_clip": 0.01164621, + "auxiliary_loss_mlp": 0.01072185, + "balance_loss_clip": 1.04785347, + "balance_loss_mlp": 1.04417038, + "epoch": 0.05200661355779348, + "flos": 21502519585920.0, + "grad_norm": 1.9925041864503563, + "language_loss": 0.79186392, + "learning_rate": 3.994922524891474e-06, + "loss": 0.81423199, + "num_input_tokens_seen": 18499645, + "step": 865, + "time_per_iteration": 2.640662670135498 + }, + { + "auxiliary_loss_clip": 0.01194172, + "auxiliary_loss_mlp": 0.01065901, + "balance_loss_clip": 1.05190408, + "balance_loss_mlp": 1.0379703, + "epoch": 0.05206673681046144, + "flos": 18114492026880.0, + "grad_norm": 2.240198637145871, + "language_loss": 0.85854059, + "learning_rate": 3.994894753048032e-06, + "loss": 0.8811413, + "num_input_tokens_seen": 18516810, + "step": 866, + "time_per_iteration": 2.583238363265991 + }, + { + "auxiliary_loss_clip": 0.01172625, + "auxiliary_loss_mlp": 0.01074836, + "balance_loss_clip": 1.05608439, + "balance_loss_mlp": 1.04633331, + "epoch": 0.052126860063129415, + "flos": 17524191916800.0, + "grad_norm": 2.289188328144818, + "language_loss": 0.87373942, + "learning_rate": 3.9948669055579815e-06, + "loss": 0.89621407, + "num_input_tokens_seen": 18532510, + "step": 867, + "time_per_iteration": 2.6799845695495605 + }, + { + "auxiliary_loss_clip": 0.01148854, + "auxiliary_loss_mlp": 0.01070333, + "balance_loss_clip": 1.04971123, + "balance_loss_mlp": 1.04476273, + "epoch": 0.05218698331579739, + "flos": 32598054771840.0, + "grad_norm": 1.9522212902647633, + "language_loss": 0.63984841, + "learning_rate": 3.9948389824223785e-06, + "loss": 0.66204023, + "num_input_tokens_seen": 18557380, + "step": 868, + "time_per_iteration": 2.752331256866455 + }, + { + "auxiliary_loss_clip": 0.0120914, + "auxiliary_loss_mlp": 0.01074212, + "balance_loss_clip": 1.05396652, + "balance_loss_mlp": 1.0445056, + "epoch": 0.05224710656846535, + "flos": 22127293774080.0, + "grad_norm": 1.92584094111543, + "language_loss": 0.82911688, + "learning_rate": 3.994810983642281e-06, + "loss": 0.85195041, + "num_input_tokens_seen": 18575720, + "step": 869, + "time_per_iteration": 2.5515215396881104 + }, + { + "auxiliary_loss_clip": 0.01198028, + "auxiliary_loss_mlp": 0.010612, + "balance_loss_clip": 1.05414748, + "balance_loss_mlp": 1.03361452, + "epoch": 0.052307229821133325, + "flos": 11145092976000.0, + "grad_norm": 2.062711020430892, + "language_loss": 0.87488127, + "learning_rate": 3.994782909218751e-06, + "loss": 0.89747357, + "num_input_tokens_seen": 18592185, + "step": 870, + "time_per_iteration": 2.681389331817627 + }, + { + "auxiliary_loss_clip": 0.01210225, + "auxiliary_loss_mlp": 0.01069112, + "balance_loss_clip": 1.05603743, + "balance_loss_mlp": 1.042063, + "epoch": 0.05236735307380129, + "flos": 19128070005120.0, + "grad_norm": 1.9663001397858204, + "language_loss": 0.80861628, + "learning_rate": 3.994754759152854e-06, + "loss": 0.83140969, + "num_input_tokens_seen": 18609560, + "step": 871, + "time_per_iteration": 2.6975274085998535 + }, + { + "auxiliary_loss_clip": 0.01178858, + "auxiliary_loss_mlp": 0.01070021, + "balance_loss_clip": 1.05600047, + "balance_loss_mlp": 1.0436523, + "epoch": 0.05242747632646926, + "flos": 20960663944320.0, + "grad_norm": 1.704548182861968, + "language_loss": 0.81324244, + "learning_rate": 3.994726533445656e-06, + "loss": 0.83573115, + "num_input_tokens_seen": 18629405, + "step": 872, + "time_per_iteration": 2.7636947631835938 + }, + { + "auxiliary_loss_clip": 0.01063297, + "auxiliary_loss_mlp": 0.01008957, + "balance_loss_clip": 1.02614355, + "balance_loss_mlp": 1.00270998, + "epoch": 0.052487599579137234, + "flos": 65020542842880.0, + "grad_norm": 0.8846697803769138, + "language_loss": 0.61674559, + "learning_rate": 3.9946982320982274e-06, + "loss": 0.63746822, + "num_input_tokens_seen": 18681480, + "step": 873, + "time_per_iteration": 3.131986141204834 + }, + { + "auxiliary_loss_clip": 0.01178506, + "auxiliary_loss_mlp": 0.01061765, + "balance_loss_clip": 1.05093098, + "balance_loss_mlp": 1.03384662, + "epoch": 0.0525477228318052, + "flos": 23288859786240.0, + "grad_norm": 1.989913323951253, + "language_loss": 0.88948631, + "learning_rate": 3.994669855111643e-06, + "loss": 0.91188896, + "num_input_tokens_seen": 18700390, + "step": 874, + "time_per_iteration": 2.836238145828247 + }, + { + "auxiliary_loss_clip": 0.01178465, + "auxiliary_loss_mlp": 0.01062095, + "balance_loss_clip": 1.04977727, + "balance_loss_mlp": 1.03517795, + "epoch": 0.05260784608447317, + "flos": 32230221546240.0, + "grad_norm": 1.838129243777448, + "language_loss": 0.74561733, + "learning_rate": 3.994641402486977e-06, + "loss": 0.76802289, + "num_input_tokens_seen": 18721280, + "step": 875, + "time_per_iteration": 2.863738775253296 + }, + { + "auxiliary_loss_clip": 0.01187843, + "auxiliary_loss_mlp": 0.01058273, + "balance_loss_clip": 1.05228555, + "balance_loss_mlp": 1.02990103, + "epoch": 0.052667969337141136, + "flos": 24463211040000.0, + "grad_norm": 1.9879906711031814, + "language_loss": 0.92849219, + "learning_rate": 3.99461287422531e-06, + "loss": 0.95095336, + "num_input_tokens_seen": 18741545, + "step": 876, + "time_per_iteration": 2.674699068069458 + }, + { + "auxiliary_loss_clip": 0.01084439, + "auxiliary_loss_mlp": 0.0102325, + "balance_loss_clip": 1.01305556, + "balance_loss_mlp": 1.01800442, + "epoch": 0.05272809258980911, + "flos": 57784329567360.0, + "grad_norm": 0.8263489865976652, + "language_loss": 0.62913245, + "learning_rate": 3.994584270327722e-06, + "loss": 0.65020937, + "num_input_tokens_seen": 18801400, + "step": 877, + "time_per_iteration": 3.110445737838745 + }, + { + "auxiliary_loss_clip": 0.01181141, + "auxiliary_loss_mlp": 0.01068887, + "balance_loss_clip": 1.05042076, + "balance_loss_mlp": 1.03989506, + "epoch": 0.05278821584247708, + "flos": 17420805596160.0, + "grad_norm": 2.560300622199653, + "language_loss": 0.85570186, + "learning_rate": 3.994555590795299e-06, + "loss": 0.8782022, + "num_input_tokens_seen": 18819670, + "step": 878, + "time_per_iteration": 2.6504433155059814 + }, + { + "auxiliary_loss_clip": 0.0120613, + "auxiliary_loss_mlp": 0.01062177, + "balance_loss_clip": 1.05298352, + "balance_loss_mlp": 1.03542638, + "epoch": 0.052848339095145046, + "flos": 26137258346880.0, + "grad_norm": 3.0181000905020587, + "language_loss": 0.82990158, + "learning_rate": 3.9945268356291275e-06, + "loss": 0.85258466, + "num_input_tokens_seen": 18840580, + "step": 879, + "time_per_iteration": 2.7387568950653076 + }, + { + "auxiliary_loss_clip": 0.01174331, + "auxiliary_loss_mlp": 0.01065503, + "balance_loss_clip": 1.05190516, + "balance_loss_mlp": 1.03763175, + "epoch": 0.05290846234781302, + "flos": 16472081623680.0, + "grad_norm": 2.4317219954544824, + "language_loss": 0.8440066, + "learning_rate": 3.9944980048302985e-06, + "loss": 0.86640495, + "num_input_tokens_seen": 18859295, + "step": 880, + "time_per_iteration": 2.6068363189697266 + }, + { + "auxiliary_loss_clip": 0.01166672, + "auxiliary_loss_mlp": 0.01065107, + "balance_loss_clip": 1.05219388, + "balance_loss_mlp": 1.03793919, + "epoch": 0.05296858560048098, + "flos": 19865173000320.0, + "grad_norm": 2.576356113888802, + "language_loss": 0.87298685, + "learning_rate": 3.994469098399906e-06, + "loss": 0.89530462, + "num_input_tokens_seen": 18877485, + "step": 881, + "time_per_iteration": 2.7348520755767822 + }, + { + "auxiliary_loss_clip": 0.01189156, + "auxiliary_loss_mlp": 0.0106349, + "balance_loss_clip": 1.05056965, + "balance_loss_mlp": 1.03523779, + "epoch": 0.053028708853148955, + "flos": 24388588535040.0, + "grad_norm": 2.313229181478497, + "language_loss": 0.87931299, + "learning_rate": 3.994440116339046e-06, + "loss": 0.90183949, + "num_input_tokens_seen": 18898275, + "step": 882, + "time_per_iteration": 2.6392524242401123 + }, + { + "auxiliary_loss_clip": 0.01207741, + "auxiliary_loss_mlp": 0.01056394, + "balance_loss_clip": 1.05420077, + "balance_loss_mlp": 1.02878559, + "epoch": 0.05308883210581693, + "flos": 36393166143360.0, + "grad_norm": 2.311071447687122, + "language_loss": 0.69376314, + "learning_rate": 3.994411058648816e-06, + "loss": 0.71640444, + "num_input_tokens_seen": 18920665, + "step": 883, + "time_per_iteration": 2.7092652320861816 + }, + { + "auxiliary_loss_clip": 0.01149249, + "auxiliary_loss_mlp": 0.01064608, + "balance_loss_clip": 1.04924393, + "balance_loss_mlp": 1.03790522, + "epoch": 0.05314895535848489, + "flos": 22855095146880.0, + "grad_norm": 2.3635916098812864, + "language_loss": 0.76341856, + "learning_rate": 3.994381925330319e-06, + "loss": 0.78555709, + "num_input_tokens_seen": 18939835, + "step": 884, + "time_per_iteration": 2.6772003173828125 + }, + { + "auxiliary_loss_clip": 0.01168213, + "auxiliary_loss_mlp": 0.01072108, + "balance_loss_clip": 1.06367683, + "balance_loss_mlp": 1.04554844, + "epoch": 0.053209078611152864, + "flos": 12860330204160.0, + "grad_norm": 1.9338633665008016, + "language_loss": 0.8595103, + "learning_rate": 3.994352716384659e-06, + "loss": 0.88191354, + "num_input_tokens_seen": 18958405, + "step": 885, + "time_per_iteration": 2.698594808578491 + }, + { + "auxiliary_loss_clip": 0.01166378, + "auxiliary_loss_mlp": 0.01068384, + "balance_loss_clip": 1.04877996, + "balance_loss_mlp": 1.04091811, + "epoch": 0.05326920186382083, + "flos": 12164596698240.0, + "grad_norm": 2.556858285483537, + "language_loss": 0.85961831, + "learning_rate": 3.994323431812945e-06, + "loss": 0.88196588, + "num_input_tokens_seen": 18975445, + "step": 886, + "time_per_iteration": 2.585512161254883 + }, + { + "auxiliary_loss_clip": 0.01160569, + "auxiliary_loss_mlp": 0.01067185, + "balance_loss_clip": 1.04923296, + "balance_loss_mlp": 1.0392189, + "epoch": 0.0533293251164888, + "flos": 22704485420160.0, + "grad_norm": 2.231256253847238, + "language_loss": 0.89743894, + "learning_rate": 3.994294071616286e-06, + "loss": 0.91971642, + "num_input_tokens_seen": 18991930, + "step": 887, + "time_per_iteration": 2.705929756164551 + }, + { + "auxiliary_loss_clip": 0.01119621, + "auxiliary_loss_mlp": 0.010748, + "balance_loss_clip": 1.0392375, + "balance_loss_mlp": 1.04489028, + "epoch": 0.053389448369156774, + "flos": 26940939200640.0, + "grad_norm": 1.965795234649119, + "language_loss": 0.74965537, + "learning_rate": 3.994264635795796e-06, + "loss": 0.77159959, + "num_input_tokens_seen": 19009790, + "step": 888, + "time_per_iteration": 2.7559494972229004 + }, + { + "auxiliary_loss_clip": 0.01146905, + "auxiliary_loss_mlp": 0.0107648, + "balance_loss_clip": 1.04846716, + "balance_loss_mlp": 1.04703522, + "epoch": 0.05344957162182474, + "flos": 25556331686400.0, + "grad_norm": 1.84669198434875, + "language_loss": 0.88239801, + "learning_rate": 3.994235124352592e-06, + "loss": 0.90463191, + "num_input_tokens_seen": 19030170, + "step": 889, + "time_per_iteration": 2.8038828372955322 + }, + { + "auxiliary_loss_clip": 0.01198155, + "auxiliary_loss_mlp": 0.01052527, + "balance_loss_clip": 1.05078387, + "balance_loss_mlp": 1.02577651, + "epoch": 0.05350969487449271, + "flos": 19719591177600.0, + "grad_norm": 2.743535176134688, + "language_loss": 0.88443762, + "learning_rate": 3.994205537287791e-06, + "loss": 0.90694445, + "num_input_tokens_seen": 19048075, + "step": 890, + "time_per_iteration": 5.740016460418701 + }, + { + "auxiliary_loss_clip": 0.01176147, + "auxiliary_loss_mlp": 0.01070429, + "balance_loss_clip": 1.04892075, + "balance_loss_mlp": 1.04503751, + "epoch": 0.053569818127160676, + "flos": 27016351804800.0, + "grad_norm": 3.3316836101485774, + "language_loss": 0.93602014, + "learning_rate": 3.994175874602517e-06, + "loss": 0.95848596, + "num_input_tokens_seen": 19067465, + "step": 891, + "time_per_iteration": 4.21731162071228 + }, + { + "auxiliary_loss_clip": 0.01168664, + "auxiliary_loss_mlp": 0.01070519, + "balance_loss_clip": 1.04776669, + "balance_loss_mlp": 1.04116976, + "epoch": 0.05362994137982865, + "flos": 13188338225280.0, + "grad_norm": 2.237027010411287, + "language_loss": 0.72117269, + "learning_rate": 3.994146136297893e-06, + "loss": 0.74356455, + "num_input_tokens_seen": 19085505, + "step": 892, + "time_per_iteration": 2.7167696952819824 + }, + { + "auxiliary_loss_clip": 0.01177786, + "auxiliary_loss_mlp": 0.00749977, + "balance_loss_clip": 1.05078435, + "balance_loss_mlp": 1.00064969, + "epoch": 0.05369006463249662, + "flos": 28658008022400.0, + "grad_norm": 7.804519338156062, + "language_loss": 0.82237291, + "learning_rate": 3.994116322375049e-06, + "loss": 0.84165055, + "num_input_tokens_seen": 19104360, + "step": 893, + "time_per_iteration": 4.295018434524536 + }, + { + "auxiliary_loss_clip": 0.01177347, + "auxiliary_loss_mlp": 0.0106384, + "balance_loss_clip": 1.04990327, + "balance_loss_mlp": 1.03793573, + "epoch": 0.053750187885164585, + "flos": 28913153304960.0, + "grad_norm": 1.9711065372116687, + "language_loss": 0.81433403, + "learning_rate": 3.994086432835114e-06, + "loss": 0.83674586, + "num_input_tokens_seen": 19124680, + "step": 894, + "time_per_iteration": 2.706514835357666 + }, + { + "auxiliary_loss_clip": 0.01180662, + "auxiliary_loss_mlp": 0.01063268, + "balance_loss_clip": 1.04954338, + "balance_loss_mlp": 1.0372088, + "epoch": 0.05381031113783256, + "flos": 15158828476800.0, + "grad_norm": 2.234528839194162, + "language_loss": 0.75598741, + "learning_rate": 3.994056467679221e-06, + "loss": 0.77842671, + "num_input_tokens_seen": 19142895, + "step": 895, + "time_per_iteration": 2.644099712371826 + }, + { + "auxiliary_loss_clip": 0.01183897, + "auxiliary_loss_mlp": 0.01059403, + "balance_loss_clip": 1.05658841, + "balance_loss_mlp": 1.03292644, + "epoch": 0.05387043439050053, + "flos": 21835232288640.0, + "grad_norm": 2.1565861506580832, + "language_loss": 0.86877966, + "learning_rate": 3.9940264269085065e-06, + "loss": 0.8912127, + "num_input_tokens_seen": 19163125, + "step": 896, + "time_per_iteration": 2.7547383308410645 + }, + { + "auxiliary_loss_clip": 0.01206093, + "auxiliary_loss_mlp": 0.00749953, + "balance_loss_clip": 1.05383325, + "balance_loss_mlp": 1.0006175, + "epoch": 0.053930557643168495, + "flos": 17310308382720.0, + "grad_norm": 2.179854853726447, + "language_loss": 0.88134789, + "learning_rate": 3.9939963105241115e-06, + "loss": 0.90090835, + "num_input_tokens_seen": 19179385, + "step": 897, + "time_per_iteration": 2.5651485919952393 + }, + { + "auxiliary_loss_clip": 0.01187045, + "auxiliary_loss_mlp": 0.01067303, + "balance_loss_clip": 1.05295837, + "balance_loss_mlp": 1.03833568, + "epoch": 0.05399068089583647, + "flos": 17348481561600.0, + "grad_norm": 1.6887488479998822, + "language_loss": 0.90279853, + "learning_rate": 3.993966118527175e-06, + "loss": 0.92534208, + "num_input_tokens_seen": 19198725, + "step": 898, + "time_per_iteration": 2.5967392921447754 + }, + { + "auxiliary_loss_clip": 0.01182381, + "auxiliary_loss_mlp": 0.01077061, + "balance_loss_clip": 1.05030608, + "balance_loss_mlp": 1.05022717, + "epoch": 0.05405080414850443, + "flos": 17486952491520.0, + "grad_norm": 3.143618036130061, + "language_loss": 0.92502725, + "learning_rate": 3.993935850918845e-06, + "loss": 0.9476217, + "num_input_tokens_seen": 19212380, + "step": 899, + "time_per_iteration": 2.6180779933929443 + }, + { + "auxiliary_loss_clip": 0.01167861, + "auxiliary_loss_mlp": 0.01068713, + "balance_loss_clip": 1.04932451, + "balance_loss_mlp": 1.04196262, + "epoch": 0.054110927401172404, + "flos": 24496787278080.0, + "grad_norm": 2.30814556081372, + "language_loss": 0.75486112, + "learning_rate": 3.9939055077002665e-06, + "loss": 0.77722681, + "num_input_tokens_seen": 19232235, + "step": 900, + "time_per_iteration": 2.650266170501709 + }, + { + "auxiliary_loss_clip": 0.01196124, + "auxiliary_loss_mlp": 0.01066515, + "balance_loss_clip": 1.05164218, + "balance_loss_mlp": 1.04049122, + "epoch": 0.054171050653840376, + "flos": 22930040874240.0, + "grad_norm": 3.0705326880325066, + "language_loss": 0.73825085, + "learning_rate": 3.993875088872592e-06, + "loss": 0.76087719, + "num_input_tokens_seen": 19251460, + "step": 901, + "time_per_iteration": 2.6603035926818848 + }, + { + "auxiliary_loss_clip": 0.01160489, + "auxiliary_loss_mlp": 0.01072475, + "balance_loss_clip": 1.04934621, + "balance_loss_mlp": 1.04727399, + "epoch": 0.05423117390650834, + "flos": 12933192942720.0, + "grad_norm": 2.2879333714037773, + "language_loss": 0.84702289, + "learning_rate": 3.9938445944369745e-06, + "loss": 0.86935252, + "num_input_tokens_seen": 19269060, + "step": 902, + "time_per_iteration": 2.7822725772857666 + }, + { + "auxiliary_loss_clip": 0.01135348, + "auxiliary_loss_mlp": 0.01067255, + "balance_loss_clip": 1.04084849, + "balance_loss_mlp": 1.03962278, + "epoch": 0.05429129715917631, + "flos": 19901335017600.0, + "grad_norm": 3.5079998553713283, + "language_loss": 0.86898875, + "learning_rate": 3.993814024394569e-06, + "loss": 0.89101475, + "num_input_tokens_seen": 19288620, + "step": 903, + "time_per_iteration": 2.9181230068206787 + }, + { + "auxiliary_loss_clip": 0.01190227, + "auxiliary_loss_mlp": 0.0106368, + "balance_loss_clip": 1.0524497, + "balance_loss_mlp": 1.03838348, + "epoch": 0.05435142041184428, + "flos": 16908611610240.0, + "grad_norm": 2.0891315042294827, + "language_loss": 0.75170672, + "learning_rate": 3.993783378746537e-06, + "loss": 0.7742458, + "num_input_tokens_seen": 19306615, + "step": 904, + "time_per_iteration": 2.679715156555176 + }, + { + "auxiliary_loss_clip": 0.0118675, + "auxiliary_loss_mlp": 0.01075373, + "balance_loss_clip": 1.05172443, + "balance_loss_mlp": 1.04955196, + "epoch": 0.05441154366451225, + "flos": 23948323534080.0, + "grad_norm": 2.4898089229227978, + "language_loss": 0.8608917, + "learning_rate": 3.993752657494039e-06, + "loss": 0.88351297, + "num_input_tokens_seen": 19321680, + "step": 905, + "time_per_iteration": 2.731687068939209 + }, + { + "auxiliary_loss_clip": 0.01180088, + "auxiliary_loss_mlp": 0.01070962, + "balance_loss_clip": 1.05673969, + "balance_loss_mlp": 1.045928, + "epoch": 0.05447166691718022, + "flos": 19975382904960.0, + "grad_norm": 1.8258284252507313, + "language_loss": 0.74520886, + "learning_rate": 3.993721860638241e-06, + "loss": 0.76771939, + "num_input_tokens_seen": 19339760, + "step": 906, + "time_per_iteration": 2.751469135284424 + }, + { + "auxiliary_loss_clip": 0.01176965, + "auxiliary_loss_mlp": 0.01069851, + "balance_loss_clip": 1.05203164, + "balance_loss_mlp": 1.04293406, + "epoch": 0.05453179016984819, + "flos": 24936513575040.0, + "grad_norm": 2.4877090854538344, + "language_loss": 0.87730432, + "learning_rate": 3.993690988180309e-06, + "loss": 0.89977252, + "num_input_tokens_seen": 19359585, + "step": 907, + "time_per_iteration": 2.7179009914398193 + }, + { + "auxiliary_loss_clip": 0.0118795, + "auxiliary_loss_mlp": 0.01071109, + "balance_loss_clip": 1.05302691, + "balance_loss_mlp": 1.04459631, + "epoch": 0.05459191342251616, + "flos": 18115102558080.0, + "grad_norm": 1.7858292210823976, + "language_loss": 0.87049055, + "learning_rate": 3.9936600401214165e-06, + "loss": 0.89308113, + "num_input_tokens_seen": 19378590, + "step": 908, + "time_per_iteration": 2.7247514724731445 + }, + { + "auxiliary_loss_clip": 0.01183163, + "auxiliary_loss_mlp": 0.01073736, + "balance_loss_clip": 1.05333924, + "balance_loss_mlp": 1.04629445, + "epoch": 0.054652036675184125, + "flos": 19208295031680.0, + "grad_norm": 2.3014002824919157, + "language_loss": 0.89397359, + "learning_rate": 3.9936290164627345e-06, + "loss": 0.91654259, + "num_input_tokens_seen": 19397910, + "step": 909, + "time_per_iteration": 2.615906238555908 + }, + { + "auxiliary_loss_clip": 0.01184501, + "auxiliary_loss_mlp": 0.01073366, + "balance_loss_clip": 1.05429769, + "balance_loss_mlp": 1.04623377, + "epoch": 0.0547121599278521, + "flos": 16325745615360.0, + "grad_norm": 3.8283408612898646, + "language_loss": 0.71179622, + "learning_rate": 3.99359791720544e-06, + "loss": 0.73437488, + "num_input_tokens_seen": 19415950, + "step": 910, + "time_per_iteration": 2.6664700508117676 + }, + { + "auxiliary_loss_clip": 0.01169635, + "auxiliary_loss_mlp": 0.0105635, + "balance_loss_clip": 1.04942036, + "balance_loss_mlp": 1.03120899, + "epoch": 0.05477228318052007, + "flos": 20339014239360.0, + "grad_norm": 2.695009819639257, + "language_loss": 0.83328074, + "learning_rate": 3.993566742350714e-06, + "loss": 0.85554063, + "num_input_tokens_seen": 19435275, + "step": 911, + "time_per_iteration": 2.6523358821868896 + }, + { + "auxiliary_loss_clip": 0.01168475, + "auxiliary_loss_mlp": 0.01078012, + "balance_loss_clip": 1.04793501, + "balance_loss_mlp": 1.05011737, + "epoch": 0.054832406433188034, + "flos": 21973092687360.0, + "grad_norm": 2.5831015804312627, + "language_loss": 0.76017785, + "learning_rate": 3.993535491899736e-06, + "loss": 0.78264272, + "num_input_tokens_seen": 19452090, + "step": 912, + "time_per_iteration": 2.667454242706299 + }, + { + "auxiliary_loss_clip": 0.01173389, + "auxiliary_loss_mlp": 0.01054272, + "balance_loss_clip": 1.0521096, + "balance_loss_mlp": 1.02779567, + "epoch": 0.054892529685856006, + "flos": 16398931576320.0, + "grad_norm": 2.122536660857909, + "language_loss": 0.82743263, + "learning_rate": 3.993504165853694e-06, + "loss": 0.84970927, + "num_input_tokens_seen": 19470865, + "step": 913, + "time_per_iteration": 2.685760259628296 + }, + { + "auxiliary_loss_clip": 0.01175395, + "auxiliary_loss_mlp": 0.01059868, + "balance_loss_clip": 1.05166268, + "balance_loss_mlp": 1.03405929, + "epoch": 0.05495265293852397, + "flos": 23912341084800.0, + "grad_norm": 2.0466529874685686, + "language_loss": 0.83643961, + "learning_rate": 3.993472764213772e-06, + "loss": 0.8587923, + "num_input_tokens_seen": 19492145, + "step": 914, + "time_per_iteration": 2.652545213699341 + }, + { + "auxiliary_loss_clip": 0.01192264, + "auxiliary_loss_mlp": 0.00749957, + "balance_loss_clip": 1.05438316, + "balance_loss_mlp": 1.00067151, + "epoch": 0.055012776191191944, + "flos": 23586954756480.0, + "grad_norm": 2.492326672806583, + "language_loss": 0.90253502, + "learning_rate": 3.9934412869811655e-06, + "loss": 0.92195719, + "num_input_tokens_seen": 19511015, + "step": 915, + "time_per_iteration": 2.6517984867095947 + }, + { + "auxiliary_loss_clip": 0.01191572, + "auxiliary_loss_mlp": 0.01060298, + "balance_loss_clip": 1.05883956, + "balance_loss_mlp": 1.03524029, + "epoch": 0.055072899443859916, + "flos": 17528501548800.0, + "grad_norm": 1.6477821857679051, + "language_loss": 0.89808297, + "learning_rate": 3.993409734157064e-06, + "loss": 0.92060173, + "num_input_tokens_seen": 19529040, + "step": 916, + "time_per_iteration": 2.5786356925964355 + }, + { + "auxiliary_loss_clip": 0.01157346, + "auxiliary_loss_mlp": 0.01070154, + "balance_loss_clip": 1.05079091, + "balance_loss_mlp": 1.04379678, + "epoch": 0.05513302269652788, + "flos": 21687172427520.0, + "grad_norm": 1.9309347678108353, + "language_loss": 0.80237621, + "learning_rate": 3.993378105742666e-06, + "loss": 0.82465124, + "num_input_tokens_seen": 19549540, + "step": 917, + "time_per_iteration": 2.64534592628479 + }, + { + "auxiliary_loss_clip": 0.0111864, + "auxiliary_loss_mlp": 0.01067377, + "balance_loss_clip": 1.04594433, + "balance_loss_mlp": 1.03994656, + "epoch": 0.05519314594919585, + "flos": 21613340021760.0, + "grad_norm": 2.299520165219871, + "language_loss": 0.79698479, + "learning_rate": 3.9933464017391705e-06, + "loss": 0.81884491, + "num_input_tokens_seen": 19567570, + "step": 918, + "time_per_iteration": 2.7866454124450684 + }, + { + "auxiliary_loss_clip": 0.01183948, + "auxiliary_loss_mlp": 0.0105478, + "balance_loss_clip": 1.05019188, + "balance_loss_mlp": 1.02920985, + "epoch": 0.05525326920186382, + "flos": 21798567480960.0, + "grad_norm": 2.9103943088834945, + "language_loss": 0.88852262, + "learning_rate": 3.99331462214778e-06, + "loss": 0.91090989, + "num_input_tokens_seen": 19585330, + "step": 919, + "time_per_iteration": 2.5758862495422363 + }, + { + "auxiliary_loss_clip": 0.01196814, + "auxiliary_loss_mlp": 0.01066981, + "balance_loss_clip": 1.05079794, + "balance_loss_mlp": 1.04020619, + "epoch": 0.05531339245453179, + "flos": 28439635288320.0, + "grad_norm": 2.2024840128968446, + "language_loss": 0.87527883, + "learning_rate": 3.993282766969699e-06, + "loss": 0.89791679, + "num_input_tokens_seen": 19604970, + "step": 920, + "time_per_iteration": 2.7523016929626465 + }, + { + "auxiliary_loss_clip": 0.01175698, + "auxiliary_loss_mlp": 0.01061127, + "balance_loss_clip": 1.05379391, + "balance_loss_mlp": 1.03574717, + "epoch": 0.05537351570719976, + "flos": 37375143131520.0, + "grad_norm": 2.138998759901821, + "language_loss": 0.65714073, + "learning_rate": 3.993250836206136e-06, + "loss": 0.67950904, + "num_input_tokens_seen": 19626235, + "step": 921, + "time_per_iteration": 2.8403429985046387 + }, + { + "auxiliary_loss_clip": 0.01194738, + "auxiliary_loss_mlp": 0.01066255, + "balance_loss_clip": 1.054649, + "balance_loss_mlp": 1.03713226, + "epoch": 0.05543363895986773, + "flos": 20084479488000.0, + "grad_norm": 2.5141442168327557, + "language_loss": 0.7191869, + "learning_rate": 3.993218829858301e-06, + "loss": 0.74179685, + "num_input_tokens_seen": 19644305, + "step": 922, + "time_per_iteration": 2.7340755462646484 + }, + { + "auxiliary_loss_clip": 0.01165455, + "auxiliary_loss_mlp": 0.01070975, + "balance_loss_clip": 1.0470264, + "balance_loss_mlp": 1.04371214, + "epoch": 0.0554937622125357, + "flos": 24533200690560.0, + "grad_norm": 2.91093818944902, + "language_loss": 0.82486093, + "learning_rate": 3.993186747927408e-06, + "loss": 0.84722519, + "num_input_tokens_seen": 19662130, + "step": 923, + "time_per_iteration": 2.7577288150787354 + }, + { + "auxiliary_loss_clip": 0.01187687, + "auxiliary_loss_mlp": 0.01067432, + "balance_loss_clip": 1.05014014, + "balance_loss_mlp": 1.04082417, + "epoch": 0.055553885465203665, + "flos": 14320063013760.0, + "grad_norm": 1.9274480291058704, + "language_loss": 0.78534889, + "learning_rate": 3.993154590414675e-06, + "loss": 0.80790007, + "num_input_tokens_seen": 19680715, + "step": 924, + "time_per_iteration": 2.646690607070923 + }, + { + "auxiliary_loss_clip": 0.0115165, + "auxiliary_loss_mlp": 0.01058064, + "balance_loss_clip": 1.04765832, + "balance_loss_mlp": 1.03124201, + "epoch": 0.05561400871787164, + "flos": 27381132374400.0, + "grad_norm": 1.9876302598414433, + "language_loss": 1.01965773, + "learning_rate": 3.993122357321319e-06, + "loss": 1.04175496, + "num_input_tokens_seen": 19700535, + "step": 925, + "time_per_iteration": 2.823664903640747 + }, + { + "auxiliary_loss_clip": 0.01142416, + "auxiliary_loss_mlp": 0.01052818, + "balance_loss_clip": 1.04611731, + "balance_loss_mlp": 1.02603126, + "epoch": 0.05567413197053961, + "flos": 23221096778880.0, + "grad_norm": 1.8729217401200255, + "language_loss": 0.80897796, + "learning_rate": 3.993090048648564e-06, + "loss": 0.83093035, + "num_input_tokens_seen": 19718825, + "step": 926, + "time_per_iteration": 2.741086006164551 + }, + { + "auxiliary_loss_clip": 0.01193816, + "auxiliary_loss_mlp": 0.01069231, + "balance_loss_clip": 1.05324268, + "balance_loss_mlp": 1.04152632, + "epoch": 0.055734255223207574, + "flos": 25264952559360.0, + "grad_norm": 2.7184977619990427, + "language_loss": 0.73415995, + "learning_rate": 3.993057664397634e-06, + "loss": 0.7567904, + "num_input_tokens_seen": 19739080, + "step": 927, + "time_per_iteration": 2.686833381652832 + }, + { + "auxiliary_loss_clip": 0.01079679, + "auxiliary_loss_mlp": 0.01014649, + "balance_loss_clip": 1.01607656, + "balance_loss_mlp": 1.01021433, + "epoch": 0.055794378475875546, + "flos": 66503116702080.0, + "grad_norm": 0.8097469869716882, + "language_loss": 0.59911382, + "learning_rate": 3.9930252045697585e-06, + "loss": 0.62005711, + "num_input_tokens_seen": 19802960, + "step": 928, + "time_per_iteration": 3.292363166809082 + }, + { + "auxiliary_loss_clip": 0.01191503, + "auxiliary_loss_mlp": 0.01065433, + "balance_loss_clip": 1.0544914, + "balance_loss_mlp": 1.03861129, + "epoch": 0.05585450172854351, + "flos": 25337635729920.0, + "grad_norm": 2.08468809475699, + "language_loss": 0.95108032, + "learning_rate": 3.992992669166168e-06, + "loss": 0.97364974, + "num_input_tokens_seen": 19822765, + "step": 929, + "time_per_iteration": 2.749879837036133 + }, + { + "auxiliary_loss_clip": 0.01159811, + "auxiliary_loss_mlp": 0.01068331, + "balance_loss_clip": 1.04946005, + "balance_loss_mlp": 1.04005432, + "epoch": 0.05591462498121148, + "flos": 33911738881920.0, + "grad_norm": 1.9415353592566966, + "language_loss": 0.71682298, + "learning_rate": 3.992960058188094e-06, + "loss": 0.73910439, + "num_input_tokens_seen": 19843590, + "step": 930, + "time_per_iteration": 2.897690534591675 + }, + { + "auxiliary_loss_clip": 0.01178545, + "auxiliary_loss_mlp": 0.0106626, + "balance_loss_clip": 1.05464041, + "balance_loss_mlp": 1.03900886, + "epoch": 0.055974748233879455, + "flos": 17930880679680.0, + "grad_norm": 2.660800816685655, + "language_loss": 0.85475433, + "learning_rate": 3.992927371636776e-06, + "loss": 0.87720239, + "num_input_tokens_seen": 19860230, + "step": 931, + "time_per_iteration": 2.8638908863067627 + }, + { + "auxiliary_loss_clip": 0.01187787, + "auxiliary_loss_mlp": 0.00750012, + "balance_loss_clip": 1.0526315, + "balance_loss_mlp": 1.00068736, + "epoch": 0.05603487148654742, + "flos": 24021976371840.0, + "grad_norm": 2.191910316174497, + "language_loss": 0.8367703, + "learning_rate": 3.9928946095134525e-06, + "loss": 0.8561483, + "num_input_tokens_seen": 19880795, + "step": 932, + "time_per_iteration": 2.6748125553131104 + }, + { + "auxiliary_loss_clip": 0.01190128, + "auxiliary_loss_mlp": 0.0107451, + "balance_loss_clip": 1.05579638, + "balance_loss_mlp": 1.04613817, + "epoch": 0.05609499473921539, + "flos": 17307758517120.0, + "grad_norm": 2.0878546927145263, + "language_loss": 0.73390484, + "learning_rate": 3.992861771819365e-06, + "loss": 0.75655121, + "num_input_tokens_seen": 19897960, + "step": 933, + "time_per_iteration": 2.6496293544769287 + }, + { + "auxiliary_loss_clip": 0.01135139, + "auxiliary_loss_mlp": 0.01072564, + "balance_loss_clip": 1.04411685, + "balance_loss_mlp": 1.04495537, + "epoch": 0.05615511799188336, + "flos": 20994742972800.0, + "grad_norm": 2.6167014071645176, + "language_loss": 0.86686826, + "learning_rate": 3.99282885855576e-06, + "loss": 0.88894528, + "num_input_tokens_seen": 19913315, + "step": 934, + "time_per_iteration": 2.7217743396759033 + }, + { + "auxiliary_loss_clip": 0.01146221, + "auxiliary_loss_mlp": 0.0107075, + "balance_loss_clip": 1.0483954, + "balance_loss_mlp": 1.04432154, + "epoch": 0.05621524124455133, + "flos": 17273535834240.0, + "grad_norm": 2.3818892725825473, + "language_loss": 0.80399263, + "learning_rate": 3.992795869723885e-06, + "loss": 0.82616234, + "num_input_tokens_seen": 19928790, + "step": 935, + "time_per_iteration": 2.720475912094116 + }, + { + "auxiliary_loss_clip": 0.01074515, + "auxiliary_loss_mlp": 0.01006086, + "balance_loss_clip": 1.01139712, + "balance_loss_mlp": 1.00186634, + "epoch": 0.0562753644972193, + "flos": 58719370458240.0, + "grad_norm": 0.8118448557553322, + "language_loss": 0.69154716, + "learning_rate": 3.99276280532499e-06, + "loss": 0.71235317, + "num_input_tokens_seen": 19988785, + "step": 936, + "time_per_iteration": 3.010758876800537 + }, + { + "auxiliary_loss_clip": 0.01199188, + "auxiliary_loss_mlp": 0.01070159, + "balance_loss_clip": 1.05284977, + "balance_loss_mlp": 1.04448128, + "epoch": 0.05633548774988727, + "flos": 17457039440640.0, + "grad_norm": 2.018635801974308, + "language_loss": 0.75708801, + "learning_rate": 3.992729665360331e-06, + "loss": 0.77978146, + "num_input_tokens_seen": 20007685, + "step": 937, + "time_per_iteration": 4.149141788482666 + }, + { + "auxiliary_loss_clip": 0.01062219, + "auxiliary_loss_mlp": 0.01003337, + "balance_loss_clip": 1.00999749, + "balance_loss_mlp": 0.99904585, + "epoch": 0.05639561100255524, + "flos": 70654928083200.0, + "grad_norm": 0.8596029614236209, + "language_loss": 0.64302993, + "learning_rate": 3.992696449831162e-06, + "loss": 0.66368544, + "num_input_tokens_seen": 20072750, + "step": 938, + "time_per_iteration": 6.23375940322876 + }, + { + "auxiliary_loss_clip": 0.01150873, + "auxiliary_loss_mlp": 0.01069239, + "balance_loss_clip": 1.04722977, + "balance_loss_mlp": 1.04109383, + "epoch": 0.056455734255223204, + "flos": 20485996692480.0, + "grad_norm": 2.868806766240339, + "language_loss": 0.78936249, + "learning_rate": 3.992663158738745e-06, + "loss": 0.81156361, + "num_input_tokens_seen": 20089070, + "step": 939, + "time_per_iteration": 4.308915376663208 + }, + { + "auxiliary_loss_clip": 0.01153102, + "auxiliary_loss_mlp": 0.01065067, + "balance_loss_clip": 1.04505217, + "balance_loss_mlp": 1.03910339, + "epoch": 0.056515857507891176, + "flos": 22053569109120.0, + "grad_norm": 1.925225240783135, + "language_loss": 0.74263966, + "learning_rate": 3.992629792084341e-06, + "loss": 0.76482129, + "num_input_tokens_seen": 20108790, + "step": 940, + "time_per_iteration": 2.699058771133423 + }, + { + "auxiliary_loss_clip": 0.01189809, + "auxiliary_loss_mlp": 0.01064687, + "balance_loss_clip": 1.05625534, + "balance_loss_mlp": 1.03729272, + "epoch": 0.05657598076055915, + "flos": 24025316336640.0, + "grad_norm": 1.8036386241958104, + "language_loss": 0.7045238, + "learning_rate": 3.992596349869216e-06, + "loss": 0.72706866, + "num_input_tokens_seen": 20128455, + "step": 941, + "time_per_iteration": 2.705524206161499 + }, + { + "auxiliary_loss_clip": 0.0111614, + "auxiliary_loss_mlp": 0.01065683, + "balance_loss_clip": 1.04569077, + "balance_loss_mlp": 1.03843188, + "epoch": 0.05663610401322711, + "flos": 20480609652480.0, + "grad_norm": 1.895914251695046, + "language_loss": 0.80903006, + "learning_rate": 3.992562832094637e-06, + "loss": 0.83084828, + "num_input_tokens_seen": 20145775, + "step": 942, + "time_per_iteration": 2.675563335418701 + }, + { + "auxiliary_loss_clip": 0.01166432, + "auxiliary_loss_mlp": 0.01060961, + "balance_loss_clip": 1.04554725, + "balance_loss_mlp": 1.03534245, + "epoch": 0.056696227265895086, + "flos": 21069042255360.0, + "grad_norm": 2.2127054565146635, + "language_loss": 0.88717085, + "learning_rate": 3.9925292387618755e-06, + "loss": 0.90944481, + "num_input_tokens_seen": 20164315, + "step": 943, + "time_per_iteration": 2.6149630546569824 + }, + { + "auxiliary_loss_clip": 0.01185091, + "auxiliary_loss_mlp": 0.01061179, + "balance_loss_clip": 1.05321431, + "balance_loss_mlp": 1.0354656, + "epoch": 0.05675635051856306, + "flos": 17821317219840.0, + "grad_norm": 2.4832109458737404, + "language_loss": 0.75048441, + "learning_rate": 3.992495569872206e-06, + "loss": 0.77294719, + "num_input_tokens_seen": 20182760, + "step": 944, + "time_per_iteration": 2.618140697479248 + }, + { + "auxiliary_loss_clip": 0.01189655, + "auxiliary_loss_mlp": 0.01066233, + "balance_loss_clip": 1.05220056, + "balance_loss_mlp": 1.04153299, + "epoch": 0.05681647377123102, + "flos": 23114945111040.0, + "grad_norm": 1.8730148471968968, + "language_loss": 0.79471993, + "learning_rate": 3.992461825426906e-06, + "loss": 0.8172788, + "num_input_tokens_seen": 20203830, + "step": 945, + "time_per_iteration": 2.6691863536834717 + }, + { + "auxiliary_loss_clip": 0.011826, + "auxiliary_loss_mlp": 0.01063434, + "balance_loss_clip": 1.05178905, + "balance_loss_mlp": 1.03733897, + "epoch": 0.056876597023898995, + "flos": 16070528505600.0, + "grad_norm": 2.5206614653524273, + "language_loss": 0.82745904, + "learning_rate": 3.992428005427252e-06, + "loss": 0.84991932, + "num_input_tokens_seen": 20220365, + "step": 946, + "time_per_iteration": 2.6906678676605225 + }, + { + "auxiliary_loss_clip": 0.01201244, + "auxiliary_loss_mlp": 0.01064949, + "balance_loss_clip": 1.05339909, + "balance_loss_mlp": 1.03793657, + "epoch": 0.05693672027656696, + "flos": 16835641130880.0, + "grad_norm": 1.8558649381345582, + "language_loss": 0.79065007, + "learning_rate": 3.992394109874529e-06, + "loss": 0.81331193, + "num_input_tokens_seen": 20238640, + "step": 947, + "time_per_iteration": 2.5473217964172363 + }, + { + "auxiliary_loss_clip": 0.01156396, + "auxiliary_loss_mlp": 0.01066911, + "balance_loss_clip": 1.05091846, + "balance_loss_mlp": 1.04076886, + "epoch": 0.05699684352923493, + "flos": 21389113370880.0, + "grad_norm": 4.944750610730303, + "language_loss": 0.85574841, + "learning_rate": 3.9923601387700225e-06, + "loss": 0.87798154, + "num_input_tokens_seen": 20251025, + "step": 948, + "time_per_iteration": 2.6942801475524902 + }, + { + "auxiliary_loss_clip": 0.01198847, + "auxiliary_loss_mlp": 0.0107237, + "balance_loss_clip": 1.05352998, + "balance_loss_mlp": 1.04426086, + "epoch": 0.057056966781902904, + "flos": 15560309767680.0, + "grad_norm": 1.843494252416337, + "language_loss": 0.87586224, + "learning_rate": 3.992326092115019e-06, + "loss": 0.89857435, + "num_input_tokens_seen": 20269775, + "step": 949, + "time_per_iteration": 2.6262786388397217 + }, + { + "auxiliary_loss_clip": 0.01179927, + "auxiliary_loss_mlp": 0.01069218, + "balance_loss_clip": 1.0524168, + "balance_loss_mlp": 1.04473257, + "epoch": 0.05711709003457087, + "flos": 19937856170880.0, + "grad_norm": 1.954690860463464, + "language_loss": 0.7875101, + "learning_rate": 3.992291969910811e-06, + "loss": 0.81000161, + "num_input_tokens_seen": 20287715, + "step": 950, + "time_per_iteration": 2.6725282669067383 + }, + { + "auxiliary_loss_clip": 0.01164015, + "auxiliary_loss_mlp": 0.01071355, + "balance_loss_clip": 1.04819703, + "balance_loss_mlp": 1.04529595, + "epoch": 0.05717721328723884, + "flos": 30332701774080.0, + "grad_norm": 1.980596984876685, + "language_loss": 0.82086647, + "learning_rate": 3.992257772158691e-06, + "loss": 0.84322017, + "num_input_tokens_seen": 20307070, + "step": 951, + "time_per_iteration": 2.712832450866699 + }, + { + "auxiliary_loss_clip": 0.01142666, + "auxiliary_loss_mlp": 0.0106417, + "balance_loss_clip": 1.04264462, + "balance_loss_mlp": 1.03622699, + "epoch": 0.05723733653990681, + "flos": 23654358627840.0, + "grad_norm": 2.8045528238342015, + "language_loss": 0.87047243, + "learning_rate": 3.992223498859958e-06, + "loss": 0.89254081, + "num_input_tokens_seen": 20324945, + "step": 952, + "time_per_iteration": 2.707472562789917 + }, + { + "auxiliary_loss_clip": 0.01174046, + "auxiliary_loss_mlp": 0.01062248, + "balance_loss_clip": 1.04636562, + "balance_loss_mlp": 1.033566, + "epoch": 0.05729745979257478, + "flos": 22055759838720.0, + "grad_norm": 3.313069317216928, + "language_loss": 0.79346693, + "learning_rate": 3.9921891500159084e-06, + "loss": 0.81582987, + "num_input_tokens_seen": 20346135, + "step": 953, + "time_per_iteration": 2.6789016723632812 + }, + { + "auxiliary_loss_clip": 0.01167926, + "auxiliary_loss_mlp": 0.01065418, + "balance_loss_clip": 1.05118394, + "balance_loss_mlp": 1.03884649, + "epoch": 0.05735758304524275, + "flos": 19604353368960.0, + "grad_norm": 2.171145450319032, + "language_loss": 0.86913413, + "learning_rate": 3.992154725627848e-06, + "loss": 0.89146757, + "num_input_tokens_seen": 20364450, + "step": 954, + "time_per_iteration": 2.6377928256988525 + }, + { + "auxiliary_loss_clip": 0.01194581, + "auxiliary_loss_mlp": 0.01063603, + "balance_loss_clip": 1.05492711, + "balance_loss_mlp": 1.03744853, + "epoch": 0.057417706297910716, + "flos": 19099018880640.0, + "grad_norm": 2.7510171401725287, + "language_loss": 0.87985855, + "learning_rate": 3.9921202256970804e-06, + "loss": 0.90244037, + "num_input_tokens_seen": 20383500, + "step": 955, + "time_per_iteration": 2.5901918411254883 + }, + { + "auxiliary_loss_clip": 0.01157859, + "auxiliary_loss_mlp": 0.0106529, + "balance_loss_clip": 1.04598117, + "balance_loss_mlp": 1.03900468, + "epoch": 0.05747782955057869, + "flos": 16654507822080.0, + "grad_norm": 2.6193879780355536, + "language_loss": 0.89431006, + "learning_rate": 3.992085650224914e-06, + "loss": 0.91654152, + "num_input_tokens_seen": 20400295, + "step": 956, + "time_per_iteration": 2.66497802734375 + }, + { + "auxiliary_loss_clip": 0.01150625, + "auxiliary_loss_mlp": 0.01058804, + "balance_loss_clip": 1.05017257, + "balance_loss_mlp": 1.03281605, + "epoch": 0.05753795280324665, + "flos": 14502058248960.0, + "grad_norm": 1.7800354786811083, + "language_loss": 0.7528621, + "learning_rate": 3.99205099921266e-06, + "loss": 0.77495635, + "num_input_tokens_seen": 20419085, + "step": 957, + "time_per_iteration": 2.7661237716674805 + }, + { + "auxiliary_loss_clip": 0.01153959, + "auxiliary_loss_mlp": 0.01074527, + "balance_loss_clip": 1.04609942, + "balance_loss_mlp": 1.04515398, + "epoch": 0.057598076055914625, + "flos": 18076318848000.0, + "grad_norm": 2.622638372927634, + "language_loss": 0.80114961, + "learning_rate": 3.992016272661633e-06, + "loss": 0.82343441, + "num_input_tokens_seen": 20437465, + "step": 958, + "time_per_iteration": 2.6583616733551025 + }, + { + "auxiliary_loss_clip": 0.01169661, + "auxiliary_loss_mlp": 0.01059213, + "balance_loss_clip": 1.04876697, + "balance_loss_mlp": 1.03452432, + "epoch": 0.0576581993085826, + "flos": 22124600254080.0, + "grad_norm": 3.511726241893174, + "language_loss": 0.88188195, + "learning_rate": 3.99198147057315e-06, + "loss": 0.90417075, + "num_input_tokens_seen": 20456235, + "step": 959, + "time_per_iteration": 2.6417362689971924 + }, + { + "auxiliary_loss_clip": 0.01154532, + "auxiliary_loss_mlp": 0.01065614, + "balance_loss_clip": 1.04871011, + "balance_loss_mlp": 1.03917289, + "epoch": 0.05771832256125056, + "flos": 33181746779520.0, + "grad_norm": 2.2676500532645583, + "language_loss": 0.78607994, + "learning_rate": 3.991946592948529e-06, + "loss": 0.80828142, + "num_input_tokens_seen": 20476825, + "step": 960, + "time_per_iteration": 2.7629995346069336 + }, + { + "auxiliary_loss_clip": 0.01119265, + "auxiliary_loss_mlp": 0.0106766, + "balance_loss_clip": 1.04879498, + "balance_loss_mlp": 1.03899038, + "epoch": 0.057778445813918534, + "flos": 24170143973760.0, + "grad_norm": 2.128275051925485, + "language_loss": 0.93086511, + "learning_rate": 3.991911639789094e-06, + "loss": 0.95273435, + "num_input_tokens_seen": 20496965, + "step": 961, + "time_per_iteration": 2.7583069801330566 + }, + { + "auxiliary_loss_clip": 0.01159685, + "auxiliary_loss_mlp": 0.01072521, + "balance_loss_clip": 1.04652822, + "balance_loss_mlp": 1.04468596, + "epoch": 0.0578385690665865, + "flos": 29643037666560.0, + "grad_norm": 2.721822615214077, + "language_loss": 0.6811192, + "learning_rate": 3.991876611096169e-06, + "loss": 0.70344126, + "num_input_tokens_seen": 20518035, + "step": 962, + "time_per_iteration": 2.740294933319092 + }, + { + "auxiliary_loss_clip": 0.01136094, + "auxiliary_loss_mlp": 0.01069496, + "balance_loss_clip": 1.04527438, + "balance_loss_mlp": 1.04287696, + "epoch": 0.05789869231925447, + "flos": 20885430908160.0, + "grad_norm": 2.7450112661489547, + "language_loss": 0.8848455, + "learning_rate": 3.991841506871084e-06, + "loss": 0.90690136, + "num_input_tokens_seen": 20534740, + "step": 963, + "time_per_iteration": 2.7357065677642822 + }, + { + "auxiliary_loss_clip": 0.01161162, + "auxiliary_loss_mlp": 0.01059133, + "balance_loss_clip": 1.05247998, + "balance_loss_mlp": 1.03251386, + "epoch": 0.057958815571922444, + "flos": 26031106679040.0, + "grad_norm": 2.2240022136990185, + "language_loss": 0.85175061, + "learning_rate": 3.99180632711517e-06, + "loss": 0.87395358, + "num_input_tokens_seen": 20553485, + "step": 964, + "time_per_iteration": 2.8102104663848877 + }, + { + "auxiliary_loss_clip": 0.01172346, + "auxiliary_loss_mlp": 0.01068755, + "balance_loss_clip": 1.05171943, + "balance_loss_mlp": 1.04171848, + "epoch": 0.05801893882459041, + "flos": 18077683564800.0, + "grad_norm": 3.228402949060512, + "language_loss": 0.78033763, + "learning_rate": 3.99177107182976e-06, + "loss": 0.80274862, + "num_input_tokens_seen": 20572155, + "step": 965, + "time_per_iteration": 2.654900312423706 + }, + { + "auxiliary_loss_clip": 0.01131865, + "auxiliary_loss_mlp": 0.01066731, + "balance_loss_clip": 1.04479003, + "balance_loss_mlp": 1.04066026, + "epoch": 0.05807906207725838, + "flos": 17748885444480.0, + "grad_norm": 1.9126794489640693, + "language_loss": 0.8131547, + "learning_rate": 3.99173574101619e-06, + "loss": 0.83514059, + "num_input_tokens_seen": 20590395, + "step": 966, + "time_per_iteration": 2.828982353210449 + }, + { + "auxiliary_loss_clip": 0.01182976, + "auxiliary_loss_mlp": 0.01063145, + "balance_loss_clip": 1.05135846, + "balance_loss_mlp": 1.03819466, + "epoch": 0.058139185329926346, + "flos": 18040372312320.0, + "grad_norm": 2.0114526259123053, + "language_loss": 0.76241773, + "learning_rate": 3.9917003346758035e-06, + "loss": 0.78487897, + "num_input_tokens_seen": 20608435, + "step": 967, + "time_per_iteration": 2.568788528442383 + }, + { + "auxiliary_loss_clip": 0.01067439, + "auxiliary_loss_mlp": 0.0103191, + "balance_loss_clip": 1.01473606, + "balance_loss_mlp": 1.02673662, + "epoch": 0.05819930858259432, + "flos": 62363297485440.0, + "grad_norm": 0.799338554797054, + "language_loss": 0.57359433, + "learning_rate": 3.991664852809939e-06, + "loss": 0.5945878, + "num_input_tokens_seen": 20668575, + "step": 968, + "time_per_iteration": 3.186244010925293 + }, + { + "auxiliary_loss_clip": 0.01173443, + "auxiliary_loss_mlp": 0.01059112, + "balance_loss_clip": 1.05246902, + "balance_loss_mlp": 1.03088307, + "epoch": 0.05825943183526229, + "flos": 19135360465920.0, + "grad_norm": 2.1220170606143873, + "language_loss": 0.82212079, + "learning_rate": 3.991629295419945e-06, + "loss": 0.8444463, + "num_input_tokens_seen": 20687355, + "step": 969, + "time_per_iteration": 2.755262613296509 + }, + { + "auxiliary_loss_clip": 0.01187934, + "auxiliary_loss_mlp": 0.00750087, + "balance_loss_clip": 1.05462384, + "balance_loss_mlp": 1.00083232, + "epoch": 0.058319555087930255, + "flos": 29022465369600.0, + "grad_norm": 2.074503574607013, + "language_loss": 0.77808619, + "learning_rate": 3.991593662507167e-06, + "loss": 0.7974664, + "num_input_tokens_seen": 20705710, + "step": 970, + "time_per_iteration": 2.680332660675049 + }, + { + "auxiliary_loss_clip": 0.01165115, + "auxiliary_loss_mlp": 0.01061823, + "balance_loss_clip": 1.05081868, + "balance_loss_mlp": 1.03419018, + "epoch": 0.05837967834059823, + "flos": 18879999701760.0, + "grad_norm": 2.887394318097223, + "language_loss": 0.92051113, + "learning_rate": 3.991557954072958e-06, + "loss": 0.94278055, + "num_input_tokens_seen": 20722405, + "step": 971, + "time_per_iteration": 2.6465518474578857 + }, + { + "auxiliary_loss_clip": 0.01162256, + "auxiliary_loss_mlp": 0.01062622, + "balance_loss_clip": 1.04660165, + "balance_loss_mlp": 1.03624129, + "epoch": 0.05843980159326619, + "flos": 25703062744320.0, + "grad_norm": 1.6946261921859649, + "language_loss": 0.85983658, + "learning_rate": 3.991522170118673e-06, + "loss": 0.88208538, + "num_input_tokens_seen": 20741480, + "step": 972, + "time_per_iteration": 2.665067434310913 + }, + { + "auxiliary_loss_clip": 0.01154562, + "auxiliary_loss_mlp": 0.01066763, + "balance_loss_clip": 1.04985738, + "balance_loss_mlp": 1.04156196, + "epoch": 0.058499924845934165, + "flos": 25552129795200.0, + "grad_norm": 3.2209825218563584, + "language_loss": 0.87802458, + "learning_rate": 3.991486310645667e-06, + "loss": 0.9002378, + "num_input_tokens_seen": 20759685, + "step": 973, + "time_per_iteration": 2.759666681289673 + }, + { + "auxiliary_loss_clip": 0.01189776, + "auxiliary_loss_mlp": 0.00750035, + "balance_loss_clip": 1.05385208, + "balance_loss_mlp": 1.00081468, + "epoch": 0.05856004809860214, + "flos": 16436171001600.0, + "grad_norm": 1.907436627573284, + "language_loss": 0.75169182, + "learning_rate": 3.991450375655301e-06, + "loss": 0.77108991, + "num_input_tokens_seen": 20778180, + "step": 974, + "time_per_iteration": 2.5923147201538086 + }, + { + "auxiliary_loss_clip": 0.0118146, + "auxiliary_loss_mlp": 0.00750024, + "balance_loss_clip": 1.05221903, + "balance_loss_mlp": 1.00080538, + "epoch": 0.0586201713512701, + "flos": 39458824116480.0, + "grad_norm": 1.6116604312663567, + "language_loss": 0.76724088, + "learning_rate": 3.991414365148936e-06, + "loss": 0.78655577, + "num_input_tokens_seen": 20802705, + "step": 975, + "time_per_iteration": 2.9065628051757812 + }, + { + "auxiliary_loss_clip": 0.01198861, + "auxiliary_loss_mlp": 0.01069403, + "balance_loss_clip": 1.05375934, + "balance_loss_mlp": 1.04321241, + "epoch": 0.058680294603938074, + "flos": 23365170230400.0, + "grad_norm": 2.042300012226092, + "language_loss": 0.76858246, + "learning_rate": 3.99137827912794e-06, + "loss": 0.79126507, + "num_input_tokens_seen": 20822540, + "step": 976, + "time_per_iteration": 2.570943593978882 + }, + { + "auxiliary_loss_clip": 0.01153062, + "auxiliary_loss_mlp": 0.01063394, + "balance_loss_clip": 1.04479718, + "balance_loss_mlp": 1.03630996, + "epoch": 0.05874041785660604, + "flos": 32232017226240.0, + "grad_norm": 1.8382669838808783, + "language_loss": 0.87692088, + "learning_rate": 3.991342117593679e-06, + "loss": 0.8990854, + "num_input_tokens_seen": 20844175, + "step": 977, + "time_per_iteration": 2.734365940093994 + }, + { + "auxiliary_loss_clip": 0.01175458, + "auxiliary_loss_mlp": 0.01066611, + "balance_loss_clip": 1.05727005, + "balance_loss_mlp": 1.03897882, + "epoch": 0.05880054110927401, + "flos": 22310043194880.0, + "grad_norm": 1.4933857533376194, + "language_loss": 0.79621887, + "learning_rate": 3.991305880547527e-06, + "loss": 0.81863952, + "num_input_tokens_seen": 20864730, + "step": 978, + "time_per_iteration": 2.639549493789673 + }, + { + "auxiliary_loss_clip": 0.01120055, + "auxiliary_loss_mlp": 0.01071649, + "balance_loss_clip": 1.05145121, + "balance_loss_mlp": 1.04258573, + "epoch": 0.05886066436194198, + "flos": 27380450016000.0, + "grad_norm": 1.9333833931931128, + "language_loss": 0.80630553, + "learning_rate": 3.991269567990855e-06, + "loss": 0.82822251, + "num_input_tokens_seen": 20885200, + "step": 979, + "time_per_iteration": 2.8436896800994873 + }, + { + "auxiliary_loss_clip": 0.01051387, + "auxiliary_loss_mlp": 0.01017658, + "balance_loss_clip": 1.01241016, + "balance_loss_mlp": 1.01222229, + "epoch": 0.05892078761460995, + "flos": 59584493525760.0, + "grad_norm": 0.931911039962927, + "language_loss": 0.58973324, + "learning_rate": 3.9912331799250415e-06, + "loss": 0.61042368, + "num_input_tokens_seen": 20940325, + "step": 980, + "time_per_iteration": 3.076179027557373 + }, + { + "auxiliary_loss_clip": 0.01195283, + "auxiliary_loss_mlp": 0.0106914, + "balance_loss_clip": 1.05521202, + "balance_loss_mlp": 1.04099488, + "epoch": 0.05898091086727792, + "flos": 15414081500160.0, + "grad_norm": 2.1258451006585957, + "language_loss": 0.86484063, + "learning_rate": 3.9911967163514665e-06, + "loss": 0.88748485, + "num_input_tokens_seen": 20958220, + "step": 981, + "time_per_iteration": 2.5902554988861084 + }, + { + "auxiliary_loss_clip": 0.01175993, + "auxiliary_loss_mlp": 0.01063279, + "balance_loss_clip": 1.05278134, + "balance_loss_mlp": 1.03758907, + "epoch": 0.059041034119945886, + "flos": 23655328295040.0, + "grad_norm": 2.0815650902032803, + "language_loss": 0.79531735, + "learning_rate": 3.991160177271513e-06, + "loss": 0.81771004, + "num_input_tokens_seen": 20978920, + "step": 982, + "time_per_iteration": 2.7235631942749023 + }, + { + "auxiliary_loss_clip": 0.01172445, + "auxiliary_loss_mlp": 0.01064862, + "balance_loss_clip": 1.05098248, + "balance_loss_mlp": 1.03824282, + "epoch": 0.05910115737261386, + "flos": 24754087376640.0, + "grad_norm": 2.4320494669823187, + "language_loss": 0.84050739, + "learning_rate": 3.9911235626865654e-06, + "loss": 0.86288047, + "num_input_tokens_seen": 20999490, + "step": 983, + "time_per_iteration": 2.644615411758423 + }, + { + "auxiliary_loss_clip": 0.01178349, + "auxiliary_loss_mlp": 0.01073165, + "balance_loss_clip": 1.0499661, + "balance_loss_mlp": 1.04740417, + "epoch": 0.05916128062528183, + "flos": 11728749070080.0, + "grad_norm": 1.9035668538560666, + "language_loss": 0.84530807, + "learning_rate": 3.9910868725980125e-06, + "loss": 0.86782318, + "num_input_tokens_seen": 21017865, + "step": 984, + "time_per_iteration": 2.572120428085327 + }, + { + "auxiliary_loss_clip": 0.01169058, + "auxiliary_loss_mlp": 0.01059326, + "balance_loss_clip": 1.04996872, + "balance_loss_mlp": 1.03449488, + "epoch": 0.059221403877949795, + "flos": 21902995296000.0, + "grad_norm": 2.38268376814202, + "language_loss": 0.7738741, + "learning_rate": 3.9910501070072465e-06, + "loss": 0.79615796, + "num_input_tokens_seen": 21035900, + "step": 985, + "time_per_iteration": 5.9587414264678955 + }, + { + "auxiliary_loss_clip": 0.01123617, + "auxiliary_loss_mlp": 0.01073552, + "balance_loss_clip": 1.04581511, + "balance_loss_mlp": 1.04696798, + "epoch": 0.05928152713061777, + "flos": 20514580940160.0, + "grad_norm": 2.049208419173488, + "language_loss": 0.90879542, + "learning_rate": 3.991013265915661e-06, + "loss": 0.93076718, + "num_input_tokens_seen": 21053235, + "step": 986, + "time_per_iteration": 4.316771984100342 + }, + { + "auxiliary_loss_clip": 0.01180948, + "auxiliary_loss_mlp": 0.01066179, + "balance_loss_clip": 1.04788852, + "balance_loss_mlp": 1.0374496, + "epoch": 0.05934165038328574, + "flos": 24495135252480.0, + "grad_norm": 3.364507623847524, + "language_loss": 0.75612438, + "learning_rate": 3.9909763493246525e-06, + "loss": 0.77859557, + "num_input_tokens_seen": 21073090, + "step": 987, + "time_per_iteration": 2.6478376388549805 + }, + { + "auxiliary_loss_clip": 0.01184175, + "auxiliary_loss_mlp": 0.01064566, + "balance_loss_clip": 1.05243468, + "balance_loss_mlp": 1.03787518, + "epoch": 0.059401773635953704, + "flos": 38728041914880.0, + "grad_norm": 2.3875301386453556, + "language_loss": 0.7130878, + "learning_rate": 3.990939357235621e-06, + "loss": 0.7355752, + "num_input_tokens_seen": 21094895, + "step": 988, + "time_per_iteration": 2.7409989833831787 + }, + { + "auxiliary_loss_clip": 0.01037066, + "auxiliary_loss_mlp": 0.01009305, + "balance_loss_clip": 1.01279545, + "balance_loss_mlp": 1.00465572, + "epoch": 0.059461896888621676, + "flos": 58023565125120.0, + "grad_norm": 0.9397959789511368, + "language_loss": 0.71199965, + "learning_rate": 3.99090228964997e-06, + "loss": 0.73246336, + "num_input_tokens_seen": 21147555, + "step": 989, + "time_per_iteration": 3.123626232147217 + }, + { + "auxiliary_loss_clip": 0.01153261, + "auxiliary_loss_mlp": 0.01075477, + "balance_loss_clip": 1.04901803, + "balance_loss_mlp": 1.04605639, + "epoch": 0.05952202014128964, + "flos": 22127760650880.0, + "grad_norm": 1.9722164779620428, + "language_loss": 0.7822262, + "learning_rate": 3.990865146569105e-06, + "loss": 0.80451357, + "num_input_tokens_seen": 21167845, + "step": 990, + "time_per_iteration": 2.7436270713806152 + }, + { + "auxiliary_loss_clip": 0.01182969, + "auxiliary_loss_mlp": 0.01062501, + "balance_loss_clip": 1.05426633, + "balance_loss_mlp": 1.03437972, + "epoch": 0.059582143393957614, + "flos": 20445776438400.0, + "grad_norm": 2.323776214405011, + "language_loss": 0.86359119, + "learning_rate": 3.990827927994434e-06, + "loss": 0.88604593, + "num_input_tokens_seen": 21185085, + "step": 991, + "time_per_iteration": 2.628704071044922 + }, + { + "auxiliary_loss_clip": 0.01197903, + "auxiliary_loss_mlp": 0.010706, + "balance_loss_clip": 1.05271268, + "balance_loss_mlp": 1.04375386, + "epoch": 0.059642266646625586, + "flos": 20594877793920.0, + "grad_norm": 2.261377566587132, + "language_loss": 0.76934826, + "learning_rate": 3.9907906339273674e-06, + "loss": 0.79203326, + "num_input_tokens_seen": 21204230, + "step": 992, + "time_per_iteration": 2.5800321102142334 + }, + { + "auxiliary_loss_clip": 0.01125987, + "auxiliary_loss_mlp": 0.01065054, + "balance_loss_clip": 1.04805291, + "balance_loss_mlp": 1.03975797, + "epoch": 0.05970238989929355, + "flos": 19352655792000.0, + "grad_norm": 2.3454227447451155, + "language_loss": 0.74933851, + "learning_rate": 3.9907532643693215e-06, + "loss": 0.77124894, + "num_input_tokens_seen": 21222655, + "step": 993, + "time_per_iteration": 2.7602620124816895 + }, + { + "auxiliary_loss_clip": 0.0113021, + "auxiliary_loss_mlp": 0.010741, + "balance_loss_clip": 1.04476023, + "balance_loss_mlp": 1.04670548, + "epoch": 0.05976251315196152, + "flos": 30264040926720.0, + "grad_norm": 1.8999608327980924, + "language_loss": 0.78822756, + "learning_rate": 3.990715819321712e-06, + "loss": 0.81027067, + "num_input_tokens_seen": 21242310, + "step": 994, + "time_per_iteration": 2.725144863128662 + }, + { + "auxiliary_loss_clip": 0.01194648, + "auxiliary_loss_mlp": 0.01085241, + "balance_loss_clip": 1.05296183, + "balance_loss_mlp": 1.05858564, + "epoch": 0.05982263640462949, + "flos": 23185150243200.0, + "grad_norm": 2.4084300595816797, + "language_loss": 0.7978524, + "learning_rate": 3.99067829878596e-06, + "loss": 0.82065129, + "num_input_tokens_seen": 21261410, + "step": 995, + "time_per_iteration": 2.6232845783233643 + }, + { + "auxiliary_loss_clip": 0.01155633, + "auxiliary_loss_mlp": 0.01066534, + "balance_loss_clip": 1.05267978, + "balance_loss_mlp": 1.03837657, + "epoch": 0.05988275965729746, + "flos": 27850879463040.0, + "grad_norm": 2.1622957645792824, + "language_loss": 0.87247413, + "learning_rate": 3.990640702763487e-06, + "loss": 0.89469582, + "num_input_tokens_seen": 21280080, + "step": 996, + "time_per_iteration": 2.7926814556121826 + }, + { + "auxiliary_loss_clip": 0.01148119, + "auxiliary_loss_mlp": 0.01079859, + "balance_loss_clip": 1.04645586, + "balance_loss_mlp": 1.04865003, + "epoch": 0.05994288290996543, + "flos": 24680003575680.0, + "grad_norm": 2.7599853696652277, + "language_loss": 0.87839609, + "learning_rate": 3.990603031255718e-06, + "loss": 0.90067589, + "num_input_tokens_seen": 21296765, + "step": 997, + "time_per_iteration": 2.732853889465332 + }, + { + "auxiliary_loss_clip": 0.01047306, + "auxiliary_loss_mlp": 0.01009956, + "balance_loss_clip": 1.0097822, + "balance_loss_mlp": 1.00478256, + "epoch": 0.0600030061626334, + "flos": 69929568835200.0, + "grad_norm": 1.0138500899591556, + "language_loss": 0.75415385, + "learning_rate": 3.990565284264083e-06, + "loss": 0.77472651, + "num_input_tokens_seen": 21363345, + "step": 998, + "time_per_iteration": 3.253094434738159 + }, + { + "auxiliary_loss_clip": 0.01147358, + "auxiliary_loss_mlp": 0.01071462, + "balance_loss_clip": 1.05041134, + "balance_loss_mlp": 1.04374588, + "epoch": 0.06006312941530137, + "flos": 26540140268160.0, + "grad_norm": 1.8221490242697314, + "language_loss": 0.7589736, + "learning_rate": 3.990527461790013e-06, + "loss": 0.78116179, + "num_input_tokens_seen": 21385290, + "step": 999, + "time_per_iteration": 2.7128965854644775 + }, + { + "auxiliary_loss_clip": 0.0117169, + "auxiliary_loss_mlp": 0.01062321, + "balance_loss_clip": 1.04588568, + "balance_loss_mlp": 1.03546333, + "epoch": 0.060123252667969335, + "flos": 27344000689920.0, + "grad_norm": 2.1384249398574786, + "language_loss": 0.828228, + "learning_rate": 3.990489563834943e-06, + "loss": 0.85056812, + "num_input_tokens_seen": 21407625, + "step": 1000, + "time_per_iteration": 2.644680976867676 + }, + { + "auxiliary_loss_clip": 0.01158011, + "auxiliary_loss_mlp": 0.01063126, + "balance_loss_clip": 1.04753101, + "balance_loss_mlp": 1.036304, + "epoch": 0.06018337592063731, + "flos": 27016710940800.0, + "grad_norm": 2.859845329551807, + "language_loss": 0.8608886, + "learning_rate": 3.990451590400309e-06, + "loss": 0.88309997, + "num_input_tokens_seen": 21426835, + "step": 1001, + "time_per_iteration": 2.6477978229522705 + }, + { + "auxiliary_loss_clip": 0.0117091, + "auxiliary_loss_mlp": 0.0106125, + "balance_loss_clip": 1.05092907, + "balance_loss_mlp": 1.03527439, + "epoch": 0.06024349917330528, + "flos": 25592960580480.0, + "grad_norm": 2.085858356198947, + "language_loss": 0.74378383, + "learning_rate": 3.990413541487551e-06, + "loss": 0.76610547, + "num_input_tokens_seen": 21444920, + "step": 1002, + "time_per_iteration": 2.779524564743042 + }, + { + "auxiliary_loss_clip": 0.01192595, + "auxiliary_loss_mlp": 0.01066383, + "balance_loss_clip": 1.05163884, + "balance_loss_mlp": 1.03941774, + "epoch": 0.060303622425973244, + "flos": 26133271937280.0, + "grad_norm": 4.225269746357485, + "language_loss": 0.75851381, + "learning_rate": 3.990375417098112e-06, + "loss": 0.78110349, + "num_input_tokens_seen": 21463555, + "step": 1003, + "time_per_iteration": 2.5676159858703613 + }, + { + "auxiliary_loss_clip": 0.01163266, + "auxiliary_loss_mlp": 0.01064651, + "balance_loss_clip": 1.0491302, + "balance_loss_mlp": 1.0380199, + "epoch": 0.060363745678641216, + "flos": 20377187418240.0, + "grad_norm": 2.852341125759798, + "language_loss": 0.70274782, + "learning_rate": 3.990337217233437e-06, + "loss": 0.72502697, + "num_input_tokens_seen": 21481990, + "step": 1004, + "time_per_iteration": 2.6219658851623535 + }, + { + "auxiliary_loss_clip": 0.01191002, + "auxiliary_loss_mlp": 0.01076971, + "balance_loss_clip": 1.05544317, + "balance_loss_mlp": 1.05020893, + "epoch": 0.06042386893130918, + "flos": 17749172753280.0, + "grad_norm": 2.4230004062469406, + "language_loss": 0.83618528, + "learning_rate": 3.990298941894976e-06, + "loss": 0.85886502, + "num_input_tokens_seen": 21500385, + "step": 1005, + "time_per_iteration": 2.540661573410034 + }, + { + "auxiliary_loss_clip": 0.01058183, + "auxiliary_loss_mlp": 0.01008242, + "balance_loss_clip": 1.01002884, + "balance_loss_mlp": 1.0042361, + "epoch": 0.06048399218397715, + "flos": 68538496872960.0, + "grad_norm": 0.9381390903804737, + "language_loss": 0.59004778, + "learning_rate": 3.9902605910841794e-06, + "loss": 0.61071199, + "num_input_tokens_seen": 21561040, + "step": 1006, + "time_per_iteration": 3.346466541290283 + }, + { + "auxiliary_loss_clip": 0.01154004, + "auxiliary_loss_mlp": 0.01055628, + "balance_loss_clip": 1.04162312, + "balance_loss_mlp": 1.02886569, + "epoch": 0.060544115436645125, + "flos": 23258515772160.0, + "grad_norm": 2.015305292680391, + "language_loss": 0.74514401, + "learning_rate": 3.990222164802503e-06, + "loss": 0.76724029, + "num_input_tokens_seen": 21580655, + "step": 1007, + "time_per_iteration": 2.662600040435791 + }, + { + "auxiliary_loss_clip": 0.01168701, + "auxiliary_loss_mlp": 0.01060296, + "balance_loss_clip": 1.04813802, + "balance_loss_mlp": 1.03347373, + "epoch": 0.06060423868931309, + "flos": 23878441624320.0, + "grad_norm": 1.7371223146376926, + "language_loss": 0.80347592, + "learning_rate": 3.9901836630514006e-06, + "loss": 0.82576591, + "num_input_tokens_seen": 21599650, + "step": 1008, + "time_per_iteration": 2.7110917568206787 + }, + { + "auxiliary_loss_clip": 0.01152834, + "auxiliary_loss_mlp": 0.01060093, + "balance_loss_clip": 1.0504663, + "balance_loss_mlp": 1.0333426, + "epoch": 0.06066436194198106, + "flos": 18728061171840.0, + "grad_norm": 2.3583541822332568, + "language_loss": 0.77904725, + "learning_rate": 3.990145085832335e-06, + "loss": 0.80117655, + "num_input_tokens_seen": 21617550, + "step": 1009, + "time_per_iteration": 2.7092983722686768 + }, + { + "auxiliary_loss_clip": 0.01168516, + "auxiliary_loss_mlp": 0.01060908, + "balance_loss_clip": 1.05049133, + "balance_loss_mlp": 1.03508711, + "epoch": 0.06072448519464903, + "flos": 24640465680000.0, + "grad_norm": 1.7887436365121454, + "language_loss": 0.92954594, + "learning_rate": 3.990106433146769e-06, + "loss": 0.95184016, + "num_input_tokens_seen": 21635865, + "step": 1010, + "time_per_iteration": 2.708012342453003 + }, + { + "auxiliary_loss_clip": 0.01126371, + "auxiliary_loss_mlp": 0.0075011, + "balance_loss_clip": 1.04560328, + "balance_loss_mlp": 1.00095308, + "epoch": 0.060784608447317, + "flos": 17378825575680.0, + "grad_norm": 2.499080096572675, + "language_loss": 0.7169013, + "learning_rate": 3.9900677049961665e-06, + "loss": 0.7356661, + "num_input_tokens_seen": 21653945, + "step": 1011, + "time_per_iteration": 2.79933500289917 + }, + { + "auxiliary_loss_clip": 0.0116736, + "auxiliary_loss_mlp": 0.01069751, + "balance_loss_clip": 1.04773879, + "balance_loss_mlp": 1.04126036, + "epoch": 0.06084473169998497, + "flos": 23692208584320.0, + "grad_norm": 2.526221028125656, + "language_loss": 0.87488914, + "learning_rate": 3.990028901381999e-06, + "loss": 0.89726025, + "num_input_tokens_seen": 21671230, + "step": 1012, + "time_per_iteration": 2.6506075859069824 + }, + { + "auxiliary_loss_clip": 0.01157114, + "auxiliary_loss_mlp": 0.01062188, + "balance_loss_clip": 1.0447892, + "balance_loss_mlp": 1.03646278, + "epoch": 0.06090485495265294, + "flos": 23546339452800.0, + "grad_norm": 1.8771981640943072, + "language_loss": 0.76868671, + "learning_rate": 3.989990022305734e-06, + "loss": 0.79087973, + "num_input_tokens_seen": 21691155, + "step": 1013, + "time_per_iteration": 2.627894878387451 + }, + { + "auxiliary_loss_clip": 0.0118346, + "auxiliary_loss_mlp": 0.0075008, + "balance_loss_clip": 1.05215287, + "balance_loss_mlp": 1.00099742, + "epoch": 0.06096497820532091, + "flos": 20339301548160.0, + "grad_norm": 2.4440745515499853, + "language_loss": 0.85736853, + "learning_rate": 3.98995106776885e-06, + "loss": 0.87670398, + "num_input_tokens_seen": 21707405, + "step": 1014, + "time_per_iteration": 2.579472780227661 + }, + { + "auxiliary_loss_clip": 0.01190034, + "auxiliary_loss_mlp": 0.01071947, + "balance_loss_clip": 1.05200672, + "balance_loss_mlp": 1.04255033, + "epoch": 0.061025101457988874, + "flos": 26939035779840.0, + "grad_norm": 4.93359287127707, + "language_loss": 0.73374963, + "learning_rate": 3.98991203777282e-06, + "loss": 0.75636947, + "num_input_tokens_seen": 21728090, + "step": 1015, + "time_per_iteration": 2.6523404121398926 + }, + { + "auxiliary_loss_clip": 0.01161264, + "auxiliary_loss_mlp": 0.01074516, + "balance_loss_clip": 1.04682612, + "balance_loss_mlp": 1.04806328, + "epoch": 0.061085224710656846, + "flos": 25375054723200.0, + "grad_norm": 1.7441702689833654, + "language_loss": 0.79051042, + "learning_rate": 3.9898729323191275e-06, + "loss": 0.81286824, + "num_input_tokens_seen": 21747950, + "step": 1016, + "time_per_iteration": 2.6737003326416016 + }, + { + "auxiliary_loss_clip": 0.01145955, + "auxiliary_loss_mlp": 0.01064474, + "balance_loss_clip": 1.04757452, + "balance_loss_mlp": 1.03814125, + "epoch": 0.06114534796332482, + "flos": 24824759385600.0, + "grad_norm": 3.9847200695217553, + "language_loss": 0.75992548, + "learning_rate": 3.989833751409254e-06, + "loss": 0.78202975, + "num_input_tokens_seen": 21767900, + "step": 1017, + "time_per_iteration": 2.706134080886841 + }, + { + "auxiliary_loss_clip": 0.01157247, + "auxiliary_loss_mlp": 0.01075554, + "balance_loss_clip": 1.05042005, + "balance_loss_mlp": 1.04885125, + "epoch": 0.061205471215992784, + "flos": 20631434860800.0, + "grad_norm": 2.2359475844756673, + "language_loss": 0.85907984, + "learning_rate": 3.989794495044685e-06, + "loss": 0.88140786, + "num_input_tokens_seen": 21787375, + "step": 1018, + "time_per_iteration": 2.647860527038574 + }, + { + "auxiliary_loss_clip": 0.01151584, + "auxiliary_loss_mlp": 0.01073538, + "balance_loss_clip": 1.05128384, + "balance_loss_mlp": 1.04504681, + "epoch": 0.061265594468660756, + "flos": 16508351381760.0, + "grad_norm": 2.4787757510851556, + "language_loss": 0.77517748, + "learning_rate": 3.989755163226909e-06, + "loss": 0.79742873, + "num_input_tokens_seen": 21806275, + "step": 1019, + "time_per_iteration": 2.6640512943267822 + }, + { + "auxiliary_loss_clip": 0.01138221, + "auxiliary_loss_mlp": 0.01067259, + "balance_loss_clip": 1.04441488, + "balance_loss_mlp": 1.03937626, + "epoch": 0.06132571772132872, + "flos": 26246211275520.0, + "grad_norm": 2.260952461468038, + "language_loss": 0.84265095, + "learning_rate": 3.989715755957418e-06, + "loss": 0.86470574, + "num_input_tokens_seen": 21826430, + "step": 1020, + "time_per_iteration": 2.697364330291748 + }, + { + "auxiliary_loss_clip": 0.0117257, + "auxiliary_loss_mlp": 0.01065173, + "balance_loss_clip": 1.05211139, + "balance_loss_mlp": 1.03820801, + "epoch": 0.06138584097399669, + "flos": 37414788768000.0, + "grad_norm": 1.9527565102127098, + "language_loss": 0.79375434, + "learning_rate": 3.989676273237705e-06, + "loss": 0.81613171, + "num_input_tokens_seen": 21847800, + "step": 1021, + "time_per_iteration": 2.768798828125 + }, + { + "auxiliary_loss_clip": 0.01160658, + "auxiliary_loss_mlp": 0.01066193, + "balance_loss_clip": 1.04646349, + "balance_loss_mlp": 1.04242301, + "epoch": 0.061445964226664665, + "flos": 17420661941760.0, + "grad_norm": 2.0470870236680567, + "language_loss": 0.87587893, + "learning_rate": 3.9896367150692705e-06, + "loss": 0.89814746, + "num_input_tokens_seen": 21863385, + "step": 1022, + "time_per_iteration": 2.7744064331054688 + }, + { + "auxiliary_loss_clip": 0.01171589, + "auxiliary_loss_mlp": 0.01069726, + "balance_loss_clip": 1.05372262, + "balance_loss_mlp": 1.04313052, + "epoch": 0.06150608747933263, + "flos": 22600021691520.0, + "grad_norm": 1.8897407400511932, + "language_loss": 0.83098191, + "learning_rate": 3.989597081453611e-06, + "loss": 0.85339504, + "num_input_tokens_seen": 21881880, + "step": 1023, + "time_per_iteration": 2.643617630004883 + }, + { + "auxiliary_loss_clip": 0.0106733, + "auxiliary_loss_mlp": 0.0103984, + "balance_loss_clip": 1.01074958, + "balance_loss_mlp": 1.03554881, + "epoch": 0.0615662107320006, + "flos": 56741482005120.0, + "grad_norm": 0.8991012564752618, + "language_loss": 0.65025032, + "learning_rate": 3.989557372392231e-06, + "loss": 0.67132205, + "num_input_tokens_seen": 21940550, + "step": 1024, + "time_per_iteration": 3.140080451965332 + }, + { + "auxiliary_loss_clip": 0.01133831, + "auxiliary_loss_mlp": 0.0107157, + "balance_loss_clip": 1.04646039, + "balance_loss_mlp": 1.04343641, + "epoch": 0.06162633398466857, + "flos": 22564793427840.0, + "grad_norm": 2.002769335728369, + "language_loss": 0.88232291, + "learning_rate": 3.989517587886636e-06, + "loss": 0.90437692, + "num_input_tokens_seen": 21958390, + "step": 1025, + "time_per_iteration": 2.7375810146331787 + }, + { + "auxiliary_loss_clip": 0.01173031, + "auxiliary_loss_mlp": 0.01064548, + "balance_loss_clip": 1.05153465, + "balance_loss_mlp": 1.03879857, + "epoch": 0.06168645723733654, + "flos": 25593104234880.0, + "grad_norm": 2.9233601590995164, + "language_loss": 0.84619474, + "learning_rate": 3.989477727938335e-06, + "loss": 0.86857057, + "num_input_tokens_seen": 21978625, + "step": 1026, + "time_per_iteration": 2.615612030029297 + }, + { + "auxiliary_loss_clip": 0.01128945, + "auxiliary_loss_mlp": 0.01074202, + "balance_loss_clip": 1.04490912, + "balance_loss_mlp": 1.04678404, + "epoch": 0.06174658049000451, + "flos": 15997917162240.0, + "grad_norm": 1.9756280001274285, + "language_loss": 0.82507914, + "learning_rate": 3.989437792548839e-06, + "loss": 0.84711057, + "num_input_tokens_seen": 21996035, + "step": 1027, + "time_per_iteration": 2.5938587188720703 + }, + { + "auxiliary_loss_clip": 0.01145461, + "auxiliary_loss_mlp": 0.01064081, + "balance_loss_clip": 1.05497277, + "balance_loss_mlp": 1.03768814, + "epoch": 0.06180670374267248, + "flos": 11285970117120.0, + "grad_norm": 4.195138898123038, + "language_loss": 0.83950281, + "learning_rate": 3.989397781719663e-06, + "loss": 0.86159825, + "num_input_tokens_seen": 22011625, + "step": 1028, + "time_per_iteration": 2.770787477493286 + }, + { + "auxiliary_loss_clip": 0.0104107, + "auxiliary_loss_mlp": 0.01007663, + "balance_loss_clip": 1.00763059, + "balance_loss_mlp": 1.00363374, + "epoch": 0.06186682699534045, + "flos": 65130142216320.0, + "grad_norm": 0.9396187595799308, + "language_loss": 0.60484022, + "learning_rate": 3.989357695452323e-06, + "loss": 0.62532759, + "num_input_tokens_seen": 22066035, + "step": 1029, + "time_per_iteration": 3.1021361351013184 + }, + { + "auxiliary_loss_clip": 0.01149175, + "auxiliary_loss_mlp": 0.01065584, + "balance_loss_clip": 1.04531085, + "balance_loss_mlp": 1.03841579, + "epoch": 0.061926950248008414, + "flos": 21105742976640.0, + "grad_norm": 1.851410603603841, + "language_loss": 0.82491606, + "learning_rate": 3.98931753374834e-06, + "loss": 0.84706366, + "num_input_tokens_seen": 22085015, + "step": 1030, + "time_per_iteration": 2.7044198513031006 + }, + { + "auxiliary_loss_clip": 0.01201282, + "auxiliary_loss_mlp": 0.0108116, + "balance_loss_clip": 1.05681062, + "balance_loss_mlp": 1.05306196, + "epoch": 0.061987073500676386, + "flos": 17748454481280.0, + "grad_norm": 2.3135341612725755, + "language_loss": 0.79822773, + "learning_rate": 3.989277296609237e-06, + "loss": 0.82105219, + "num_input_tokens_seen": 22102775, + "step": 1031, + "time_per_iteration": 4.158156871795654 + }, + { + "auxiliary_loss_clip": 0.01166845, + "auxiliary_loss_mlp": 0.01084154, + "balance_loss_clip": 1.05287278, + "balance_loss_mlp": 1.05623579, + "epoch": 0.06204719675334436, + "flos": 21836237869440.0, + "grad_norm": 1.5878133244904022, + "language_loss": 0.77368069, + "learning_rate": 3.98923698403654e-06, + "loss": 0.79619062, + "num_input_tokens_seen": 22121680, + "step": 1032, + "time_per_iteration": 4.310216188430786 + }, + { + "auxiliary_loss_clip": 0.01173225, + "auxiliary_loss_mlp": 0.01071955, + "balance_loss_clip": 1.04768419, + "balance_loss_mlp": 1.04466772, + "epoch": 0.06210732000601232, + "flos": 19353697286400.0, + "grad_norm": 7.5232197020271165, + "language_loss": 0.892856, + "learning_rate": 3.989196596031776e-06, + "loss": 0.91530788, + "num_input_tokens_seen": 22138155, + "step": 1033, + "time_per_iteration": 4.233097076416016 + }, + { + "auxiliary_loss_clip": 0.01185474, + "auxiliary_loss_mlp": 0.01062756, + "balance_loss_clip": 1.05227983, + "balance_loss_mlp": 1.03726947, + "epoch": 0.062167443258680295, + "flos": 24749382695040.0, + "grad_norm": 2.337408468947209, + "language_loss": 0.84708059, + "learning_rate": 3.989156132596479e-06, + "loss": 0.86956286, + "num_input_tokens_seen": 22157420, + "step": 1034, + "time_per_iteration": 4.226016283035278 + }, + { + "auxiliary_loss_clip": 0.01162823, + "auxiliary_loss_mlp": 0.01065354, + "balance_loss_clip": 1.05016935, + "balance_loss_mlp": 1.0378406, + "epoch": 0.06222756651134827, + "flos": 34458478773120.0, + "grad_norm": 1.8989719976638335, + "language_loss": 0.80540907, + "learning_rate": 3.989115593732182e-06, + "loss": 0.82769084, + "num_input_tokens_seen": 22178620, + "step": 1035, + "time_per_iteration": 2.703613519668579 + }, + { + "auxiliary_loss_clip": 0.01143028, + "auxiliary_loss_mlp": 0.01080347, + "balance_loss_clip": 1.05336833, + "balance_loss_mlp": 1.05087829, + "epoch": 0.06228768976401623, + "flos": 25666469763840.0, + "grad_norm": 2.3312238755859203, + "language_loss": 0.78381944, + "learning_rate": 3.989074979440421e-06, + "loss": 0.80605322, + "num_input_tokens_seen": 22197125, + "step": 1036, + "time_per_iteration": 2.7313320636749268 + }, + { + "auxiliary_loss_clip": 0.0117374, + "auxiliary_loss_mlp": 0.01072812, + "balance_loss_clip": 1.05135977, + "balance_loss_mlp": 1.04712236, + "epoch": 0.062347813016684205, + "flos": 25295619795840.0, + "grad_norm": 2.1616972096056215, + "language_loss": 0.86686188, + "learning_rate": 3.989034289722739e-06, + "loss": 0.88932741, + "num_input_tokens_seen": 22217575, + "step": 1037, + "time_per_iteration": 2.64101243019104 + }, + { + "auxiliary_loss_clip": 0.01176599, + "auxiliary_loss_mlp": 0.01063718, + "balance_loss_clip": 1.05059075, + "balance_loss_mlp": 1.03613329, + "epoch": 0.06240793626935217, + "flos": 26907039740160.0, + "grad_norm": 2.3772751433916897, + "language_loss": 0.80264115, + "learning_rate": 3.988993524580676e-06, + "loss": 0.82504439, + "num_input_tokens_seen": 22236840, + "step": 1038, + "time_per_iteration": 2.805361032485962 + }, + { + "auxiliary_loss_clip": 0.01117852, + "auxiliary_loss_mlp": 0.0107445, + "balance_loss_clip": 1.04357004, + "balance_loss_mlp": 1.04636383, + "epoch": 0.06246805952202014, + "flos": 21615782146560.0, + "grad_norm": 1.9022063258179347, + "language_loss": 0.85506606, + "learning_rate": 3.98895268401578e-06, + "loss": 0.87698901, + "num_input_tokens_seen": 22256465, + "step": 1039, + "time_per_iteration": 2.6698129177093506 + }, + { + "auxiliary_loss_clip": 0.01168569, + "auxiliary_loss_mlp": 0.01072929, + "balance_loss_clip": 1.04936039, + "balance_loss_mlp": 1.04579747, + "epoch": 0.0625281827746881, + "flos": 19311896833920.0, + "grad_norm": 1.842248750709184, + "language_loss": 0.81208402, + "learning_rate": 3.9889117680296e-06, + "loss": 0.834499, + "num_input_tokens_seen": 22274025, + "step": 1040, + "time_per_iteration": 2.687743902206421 + }, + { + "auxiliary_loss_clip": 0.01196013, + "auxiliary_loss_mlp": 0.0106724, + "balance_loss_clip": 1.05745864, + "balance_loss_mlp": 1.0408709, + "epoch": 0.06258830602735609, + "flos": 27745769289600.0, + "grad_norm": 2.3938458871746944, + "language_loss": 0.69833678, + "learning_rate": 3.988870776623685e-06, + "loss": 0.72096932, + "num_input_tokens_seen": 22292245, + "step": 1041, + "time_per_iteration": 2.6313252449035645 + }, + { + "auxiliary_loss_clip": 0.01192318, + "auxiliary_loss_mlp": 0.01062156, + "balance_loss_clip": 1.05099368, + "balance_loss_mlp": 1.03508329, + "epoch": 0.06264842928002405, + "flos": 23222605150080.0, + "grad_norm": 2.425356976323965, + "language_loss": 0.81536537, + "learning_rate": 3.9888297097995905e-06, + "loss": 0.83791018, + "num_input_tokens_seen": 22311455, + "step": 1042, + "time_per_iteration": 2.5585007667541504 + }, + { + "auxiliary_loss_clip": 0.01190031, + "auxiliary_loss_mlp": 0.01054712, + "balance_loss_clip": 1.0515095, + "balance_loss_mlp": 1.03001165, + "epoch": 0.06270855253269202, + "flos": 38399495189760.0, + "grad_norm": 1.7810333521900208, + "language_loss": 0.76196569, + "learning_rate": 3.988788567558874e-06, + "loss": 0.7844131, + "num_input_tokens_seen": 22333750, + "step": 1043, + "time_per_iteration": 2.7359559535980225 + }, + { + "auxiliary_loss_clip": 0.01169731, + "auxiliary_loss_mlp": 0.01065198, + "balance_loss_clip": 1.05085731, + "balance_loss_mlp": 1.04016399, + "epoch": 0.06276867578535998, + "flos": 22453542028800.0, + "grad_norm": 1.9733562769343442, + "language_loss": 0.92248261, + "learning_rate": 3.988747349903097e-06, + "loss": 0.94483185, + "num_input_tokens_seen": 22351940, + "step": 1044, + "time_per_iteration": 2.6253442764282227 + }, + { + "auxiliary_loss_clip": 0.01172992, + "auxiliary_loss_mlp": 0.01070344, + "balance_loss_clip": 1.04883313, + "balance_loss_mlp": 1.04498863, + "epoch": 0.06282879903802796, + "flos": 22930435923840.0, + "grad_norm": 2.194405047519767, + "language_loss": 0.85858071, + "learning_rate": 3.988706056833821e-06, + "loss": 0.88101399, + "num_input_tokens_seen": 22372085, + "step": 1045, + "time_per_iteration": 2.6325366497039795 + }, + { + "auxiliary_loss_clip": 0.01159169, + "auxiliary_loss_mlp": 0.01068741, + "balance_loss_clip": 1.04788017, + "balance_loss_mlp": 1.04368329, + "epoch": 0.06288892229069593, + "flos": 34819237019520.0, + "grad_norm": 2.164427473465319, + "language_loss": 0.78549749, + "learning_rate": 3.9886646883526125e-06, + "loss": 0.80777657, + "num_input_tokens_seen": 22392020, + "step": 1046, + "time_per_iteration": 2.7786378860473633 + }, + { + "auxiliary_loss_clip": 0.01174091, + "auxiliary_loss_mlp": 0.0107661, + "balance_loss_clip": 1.04994202, + "balance_loss_mlp": 1.05161214, + "epoch": 0.06294904554336389, + "flos": 19427134642560.0, + "grad_norm": 2.451832923282723, + "language_loss": 0.77406198, + "learning_rate": 3.988623244461039e-06, + "loss": 0.79656905, + "num_input_tokens_seen": 22411180, + "step": 1047, + "time_per_iteration": 2.604234457015991 + }, + { + "auxiliary_loss_clip": 0.01178017, + "auxiliary_loss_mlp": 0.01064036, + "balance_loss_clip": 1.05143213, + "balance_loss_mlp": 1.03908598, + "epoch": 0.06300916879603187, + "flos": 40661867358720.0, + "grad_norm": 4.023196543685692, + "language_loss": 0.76685995, + "learning_rate": 3.988581725160672e-06, + "loss": 0.78928053, + "num_input_tokens_seen": 22435105, + "step": 1048, + "time_per_iteration": 2.798414945602417 + }, + { + "auxiliary_loss_clip": 0.01161469, + "auxiliary_loss_mlp": 0.01069588, + "balance_loss_clip": 1.05033374, + "balance_loss_mlp": 1.04349303, + "epoch": 0.06306929204869983, + "flos": 23804142341760.0, + "grad_norm": 2.6936137717219575, + "language_loss": 0.77424055, + "learning_rate": 3.988540130453087e-06, + "loss": 0.79655111, + "num_input_tokens_seen": 22452710, + "step": 1049, + "time_per_iteration": 2.6778340339660645 + }, + { + "auxiliary_loss_clip": 0.01175971, + "auxiliary_loss_mlp": 0.01066946, + "balance_loss_clip": 1.05107903, + "balance_loss_mlp": 1.04162598, + "epoch": 0.0631294153013678, + "flos": 18915802583040.0, + "grad_norm": 2.865665734841617, + "language_loss": 0.82960296, + "learning_rate": 3.988498460339862e-06, + "loss": 0.85203212, + "num_input_tokens_seen": 22470175, + "step": 1050, + "time_per_iteration": 2.675727367401123 + }, + { + "auxiliary_loss_clip": 0.01188787, + "auxiliary_loss_mlp": 0.01067717, + "balance_loss_clip": 1.05349731, + "balance_loss_mlp": 1.04383993, + "epoch": 0.06318953855403578, + "flos": 24280174310400.0, + "grad_norm": 1.849751628672901, + "language_loss": 0.77006012, + "learning_rate": 3.988456714822575e-06, + "loss": 0.79262519, + "num_input_tokens_seen": 22490020, + "step": 1051, + "time_per_iteration": 2.642860174179077 + }, + { + "auxiliary_loss_clip": 0.01165312, + "auxiliary_loss_mlp": 0.01078434, + "balance_loss_clip": 1.04944825, + "balance_loss_mlp": 1.0528872, + "epoch": 0.06324966180670374, + "flos": 22528918719360.0, + "grad_norm": 1.9888927866555717, + "language_loss": 0.80022657, + "learning_rate": 3.98841489390281e-06, + "loss": 0.82266402, + "num_input_tokens_seen": 22509685, + "step": 1052, + "time_per_iteration": 2.648695230484009 + }, + { + "auxiliary_loss_clip": 0.01189542, + "auxiliary_loss_mlp": 0.01067058, + "balance_loss_clip": 1.05167711, + "balance_loss_mlp": 1.04205966, + "epoch": 0.06330978505937171, + "flos": 15778107884160.0, + "grad_norm": 2.8732287135613976, + "language_loss": 0.78126889, + "learning_rate": 3.988372997582155e-06, + "loss": 0.80383492, + "num_input_tokens_seen": 22527905, + "step": 1053, + "time_per_iteration": 2.532982349395752 + }, + { + "auxiliary_loss_clip": 0.01172905, + "auxiliary_loss_mlp": 0.00749962, + "balance_loss_clip": 1.05379868, + "balance_loss_mlp": 1.00101745, + "epoch": 0.06336990831203967, + "flos": 21471098163840.0, + "grad_norm": 1.902724812628741, + "language_loss": 0.84987414, + "learning_rate": 3.988331025862195e-06, + "loss": 0.86910284, + "num_input_tokens_seen": 22546335, + "step": 1054, + "time_per_iteration": 2.629561185836792 + }, + { + "auxiliary_loss_clip": 0.01144174, + "auxiliary_loss_mlp": 0.01068277, + "balance_loss_clip": 1.04542816, + "balance_loss_mlp": 1.04271817, + "epoch": 0.06343003156470765, + "flos": 18478877546880.0, + "grad_norm": 2.2712947329461834, + "language_loss": 0.86035597, + "learning_rate": 3.9882889787445225e-06, + "loss": 0.8824805, + "num_input_tokens_seen": 22563885, + "step": 1055, + "time_per_iteration": 2.60131573677063 + }, + { + "auxiliary_loss_clip": 0.01139602, + "auxiliary_loss_mlp": 0.01076896, + "balance_loss_clip": 1.04750764, + "balance_loss_mlp": 1.05151629, + "epoch": 0.06349015481737562, + "flos": 25154886309120.0, + "grad_norm": 4.101171661279208, + "language_loss": 0.81280994, + "learning_rate": 3.988246856230734e-06, + "loss": 0.83497494, + "num_input_tokens_seen": 22583035, + "step": 1056, + "time_per_iteration": 2.7352304458618164 + }, + { + "auxiliary_loss_clip": 0.01118287, + "auxiliary_loss_mlp": 0.01062199, + "balance_loss_clip": 1.03951895, + "balance_loss_mlp": 1.03460169, + "epoch": 0.06355027807004358, + "flos": 26871775562880.0, + "grad_norm": 2.7017696765422303, + "language_loss": 0.81194639, + "learning_rate": 3.988204658322426e-06, + "loss": 0.83375132, + "num_input_tokens_seen": 22605055, + "step": 1057, + "time_per_iteration": 2.823761463165283 + }, + { + "auxiliary_loss_clip": 0.01105446, + "auxiliary_loss_mlp": 0.01062212, + "balance_loss_clip": 1.0392729, + "balance_loss_mlp": 1.03963351, + "epoch": 0.06361040132271156, + "flos": 21396691140480.0, + "grad_norm": 1.8172920108802426, + "language_loss": 0.83568531, + "learning_rate": 3.988162385021196e-06, + "loss": 0.85736191, + "num_input_tokens_seen": 22623760, + "step": 1058, + "time_per_iteration": 2.7093358039855957 + }, + { + "auxiliary_loss_clip": 0.0115833, + "auxiliary_loss_mlp": 0.01064677, + "balance_loss_clip": 1.04738069, + "balance_loss_mlp": 1.03793871, + "epoch": 0.06367052457537953, + "flos": 25733765894400.0, + "grad_norm": 2.20304244287923, + "language_loss": 0.87431771, + "learning_rate": 3.988120036328651e-06, + "loss": 0.89654773, + "num_input_tokens_seen": 22643000, + "step": 1059, + "time_per_iteration": 2.7660157680511475 + }, + { + "auxiliary_loss_clip": 0.01144866, + "auxiliary_loss_mlp": 0.01068457, + "balance_loss_clip": 1.0495255, + "balance_loss_mlp": 1.04214692, + "epoch": 0.0637306478280475, + "flos": 17631420992640.0, + "grad_norm": 2.7457787776137335, + "language_loss": 0.91905665, + "learning_rate": 3.988077612246394e-06, + "loss": 0.94118989, + "num_input_tokens_seen": 22660460, + "step": 1060, + "time_per_iteration": 2.617842197418213 + }, + { + "auxiliary_loss_clip": 0.01153229, + "auxiliary_loss_mlp": 0.01067206, + "balance_loss_clip": 1.04906702, + "balance_loss_mlp": 1.04077673, + "epoch": 0.06379077108071547, + "flos": 13662610427520.0, + "grad_norm": 1.8581058928508507, + "language_loss": 0.86872917, + "learning_rate": 3.988035112776035e-06, + "loss": 0.89093351, + "num_input_tokens_seen": 22679270, + "step": 1061, + "time_per_iteration": 2.6352434158325195 + }, + { + "auxiliary_loss_clip": 0.01169938, + "auxiliary_loss_mlp": 0.01061456, + "balance_loss_clip": 1.04796493, + "balance_loss_mlp": 1.03449047, + "epoch": 0.06385089433338344, + "flos": 28478849961600.0, + "grad_norm": 2.77544904384013, + "language_loss": 0.77340114, + "learning_rate": 3.987992537919185e-06, + "loss": 0.79571503, + "num_input_tokens_seen": 22699330, + "step": 1062, + "time_per_iteration": 2.711946487426758 + }, + { + "auxiliary_loss_clip": 0.01151927, + "auxiliary_loss_mlp": 0.01067613, + "balance_loss_clip": 1.04633498, + "balance_loss_mlp": 1.04259086, + "epoch": 0.0639110175860514, + "flos": 24311057028480.0, + "grad_norm": 15.20195359161396, + "language_loss": 0.86918902, + "learning_rate": 3.987949887677459e-06, + "loss": 0.89138436, + "num_input_tokens_seen": 22717945, + "step": 1063, + "time_per_iteration": 2.7242465019226074 + }, + { + "auxiliary_loss_clip": 0.01188025, + "auxiliary_loss_mlp": 0.01064934, + "balance_loss_clip": 1.04918313, + "balance_loss_mlp": 1.03907728, + "epoch": 0.06397114083871938, + "flos": 22090772620800.0, + "grad_norm": 2.4592525046290716, + "language_loss": 0.80315417, + "learning_rate": 3.9879071620524744e-06, + "loss": 0.82568377, + "num_input_tokens_seen": 22736790, + "step": 1064, + "time_per_iteration": 2.588895797729492 + }, + { + "auxiliary_loss_clip": 0.01173048, + "auxiliary_loss_mlp": 0.01075028, + "balance_loss_clip": 1.04651439, + "balance_loss_mlp": 1.04844379, + "epoch": 0.06403126409138735, + "flos": 19572824206080.0, + "grad_norm": 2.351693393528435, + "language_loss": 0.84534287, + "learning_rate": 3.987864361045851e-06, + "loss": 0.8678236, + "num_input_tokens_seen": 22754745, + "step": 1065, + "time_per_iteration": 2.704458713531494 + }, + { + "auxiliary_loss_clip": 0.01145588, + "auxiliary_loss_mlp": 0.01060546, + "balance_loss_clip": 1.04866302, + "balance_loss_mlp": 1.03634644, + "epoch": 0.06409138734405531, + "flos": 40807413267840.0, + "grad_norm": 11.030434486836828, + "language_loss": 0.68326461, + "learning_rate": 3.987821484659211e-06, + "loss": 0.70532596, + "num_input_tokens_seen": 22776780, + "step": 1066, + "time_per_iteration": 2.7930400371551514 + }, + { + "auxiliary_loss_clip": 0.01188556, + "auxiliary_loss_mlp": 0.01074203, + "balance_loss_clip": 1.05261016, + "balance_loss_mlp": 1.04801273, + "epoch": 0.06415151059672328, + "flos": 20441610460800.0, + "grad_norm": 2.76481316534003, + "language_loss": 0.90051508, + "learning_rate": 3.987778532894181e-06, + "loss": 0.92314267, + "num_input_tokens_seen": 22793915, + "step": 1067, + "time_per_iteration": 2.6562297344207764 + }, + { + "auxiliary_loss_clip": 0.01164008, + "auxiliary_loss_mlp": 0.01067557, + "balance_loss_clip": 1.04866695, + "balance_loss_mlp": 1.04384637, + "epoch": 0.06421163384939126, + "flos": 18072045129600.0, + "grad_norm": 2.0057375447838135, + "language_loss": 0.83504707, + "learning_rate": 3.987735505752391e-06, + "loss": 0.85736269, + "num_input_tokens_seen": 22812670, + "step": 1068, + "time_per_iteration": 2.7465975284576416 + }, + { + "auxiliary_loss_clip": 0.01149684, + "auxiliary_loss_mlp": 0.01070673, + "balance_loss_clip": 1.0461818, + "balance_loss_mlp": 1.0463779, + "epoch": 0.06427175710205922, + "flos": 25119442563840.0, + "grad_norm": 3.7430474898141064, + "language_loss": 0.89662135, + "learning_rate": 3.987692403235471e-06, + "loss": 0.91882491, + "num_input_tokens_seen": 22832440, + "step": 1069, + "time_per_iteration": 2.7222652435302734 + }, + { + "auxiliary_loss_clip": 0.01156637, + "auxiliary_loss_mlp": 0.01074713, + "balance_loss_clip": 1.04680967, + "balance_loss_mlp": 1.04871345, + "epoch": 0.06433188035472719, + "flos": 17380549428480.0, + "grad_norm": 3.079454399872106, + "language_loss": 0.95586324, + "learning_rate": 3.987649225345056e-06, + "loss": 0.97817671, + "num_input_tokens_seen": 22845495, + "step": 1070, + "time_per_iteration": 2.661424398422241 + }, + { + "auxiliary_loss_clip": 0.01114205, + "auxiliary_loss_mlp": 0.01055232, + "balance_loss_clip": 1.04517603, + "balance_loss_mlp": 1.0285058, + "epoch": 0.06439200360739517, + "flos": 23546267625600.0, + "grad_norm": 2.575502677619022, + "language_loss": 0.88118124, + "learning_rate": 3.987605972082782e-06, + "loss": 0.90287566, + "num_input_tokens_seen": 22865390, + "step": 1071, + "time_per_iteration": 2.771836280822754 + }, + { + "auxiliary_loss_clip": 0.01122304, + "auxiliary_loss_mlp": 0.0105846, + "balance_loss_clip": 1.03979301, + "balance_loss_mlp": 1.03349733, + "epoch": 0.06445212686006313, + "flos": 21979772616960.0, + "grad_norm": 1.94131556194255, + "language_loss": 0.76357311, + "learning_rate": 3.987562643450292e-06, + "loss": 0.78538072, + "num_input_tokens_seen": 22885495, + "step": 1072, + "time_per_iteration": 2.754162311553955 + }, + { + "auxiliary_loss_clip": 0.01144313, + "auxiliary_loss_mlp": 0.0107586, + "balance_loss_clip": 1.04714584, + "balance_loss_mlp": 1.04753613, + "epoch": 0.0645122501127311, + "flos": 25921291824000.0, + "grad_norm": 1.9254433164556317, + "language_loss": 0.81028795, + "learning_rate": 3.987519239449226e-06, + "loss": 0.83248961, + "num_input_tokens_seen": 22904845, + "step": 1073, + "time_per_iteration": 2.741945505142212 + }, + { + "auxiliary_loss_clip": 0.01169772, + "auxiliary_loss_mlp": 0.01062747, + "balance_loss_clip": 1.04829645, + "balance_loss_mlp": 1.037498, + "epoch": 0.06457237336539907, + "flos": 25626034028160.0, + "grad_norm": 1.711730111122821, + "language_loss": 0.80399013, + "learning_rate": 3.987475760081233e-06, + "loss": 0.8263154, + "num_input_tokens_seen": 22925940, + "step": 1074, + "time_per_iteration": 2.7234997749328613 + }, + { + "auxiliary_loss_clip": 0.01144553, + "auxiliary_loss_mlp": 0.01063394, + "balance_loss_clip": 1.04582477, + "balance_loss_mlp": 1.03652382, + "epoch": 0.06463249661806704, + "flos": 19463979018240.0, + "grad_norm": 1.744913363969633, + "language_loss": 0.78949785, + "learning_rate": 3.987432205347958e-06, + "loss": 0.81157732, + "num_input_tokens_seen": 22944375, + "step": 1075, + "time_per_iteration": 2.800011157989502 + }, + { + "auxiliary_loss_clip": 0.01142109, + "auxiliary_loss_mlp": 0.01066765, + "balance_loss_clip": 1.04565001, + "balance_loss_mlp": 1.04274464, + "epoch": 0.064692619870735, + "flos": 24498044254080.0, + "grad_norm": 2.814147173291461, + "language_loss": 0.87799424, + "learning_rate": 3.987388575251055e-06, + "loss": 0.90008301, + "num_input_tokens_seen": 22959145, + "step": 1076, + "time_per_iteration": 2.724477767944336 + }, + { + "auxiliary_loss_clip": 0.01159464, + "auxiliary_loss_mlp": 0.0105739, + "balance_loss_clip": 1.04385424, + "balance_loss_mlp": 1.03227317, + "epoch": 0.06475274312340297, + "flos": 17018677860480.0, + "grad_norm": 1.705145673070331, + "language_loss": 0.80746979, + "learning_rate": 3.98734486979218e-06, + "loss": 0.82963836, + "num_input_tokens_seen": 22978100, + "step": 1077, + "time_per_iteration": 2.7519376277923584 + }, + { + "auxiliary_loss_clip": 0.01172223, + "auxiliary_loss_mlp": 0.01070851, + "balance_loss_clip": 1.04836726, + "balance_loss_mlp": 1.0445416, + "epoch": 0.06481286637607095, + "flos": 24572379450240.0, + "grad_norm": 3.0652674563586664, + "language_loss": 0.91538262, + "learning_rate": 3.987301088972986e-06, + "loss": 0.93781334, + "num_input_tokens_seen": 22997285, + "step": 1078, + "time_per_iteration": 2.649470329284668 + }, + { + "auxiliary_loss_clip": 0.01197107, + "auxiliary_loss_mlp": 0.01065761, + "balance_loss_clip": 1.05271792, + "balance_loss_mlp": 1.03945112, + "epoch": 0.06487298962873891, + "flos": 21105635235840.0, + "grad_norm": 3.55690942700065, + "language_loss": 0.78814816, + "learning_rate": 3.987257232795137e-06, + "loss": 0.81077683, + "num_input_tokens_seen": 23016285, + "step": 1079, + "time_per_iteration": 4.123103141784668 + }, + { + "auxiliary_loss_clip": 0.011314, + "auxiliary_loss_mlp": 0.01063671, + "balance_loss_clip": 1.04409349, + "balance_loss_mlp": 1.03746915, + "epoch": 0.06493311288140688, + "flos": 24608182331520.0, + "grad_norm": 2.7735095756880637, + "language_loss": 0.69266504, + "learning_rate": 3.987213301260294e-06, + "loss": 0.7146157, + "num_input_tokens_seen": 23036420, + "step": 1080, + "time_per_iteration": 6.038952589035034 + }, + { + "auxiliary_loss_clip": 0.01150489, + "auxiliary_loss_mlp": 0.01067881, + "balance_loss_clip": 1.05073619, + "balance_loss_mlp": 1.03916311, + "epoch": 0.06499323613407486, + "flos": 25337994865920.0, + "grad_norm": 1.80595295983134, + "language_loss": 0.71841288, + "learning_rate": 3.987169294370123e-06, + "loss": 0.74059653, + "num_input_tokens_seen": 23056945, + "step": 1081, + "time_per_iteration": 2.737905502319336 + }, + { + "auxiliary_loss_clip": 0.01104064, + "auxiliary_loss_mlp": 0.01064502, + "balance_loss_clip": 1.03754795, + "balance_loss_mlp": 1.03682208, + "epoch": 0.06505335938674282, + "flos": 20375714960640.0, + "grad_norm": 2.7049559054648395, + "language_loss": 0.84301627, + "learning_rate": 3.987125212126294e-06, + "loss": 0.86470193, + "num_input_tokens_seen": 23074940, + "step": 1082, + "time_per_iteration": 2.648505687713623 + }, + { + "auxiliary_loss_clip": 0.01172557, + "auxiliary_loss_mlp": 0.0106754, + "balance_loss_clip": 1.04799461, + "balance_loss_mlp": 1.04065859, + "epoch": 0.06511348263941079, + "flos": 25337923038720.0, + "grad_norm": 2.3725476347728947, + "language_loss": 0.82490671, + "learning_rate": 3.987081054530478e-06, + "loss": 0.84730768, + "num_input_tokens_seen": 23093420, + "step": 1083, + "time_per_iteration": 2.626478910446167 + }, + { + "auxiliary_loss_clip": 0.01138425, + "auxiliary_loss_mlp": 0.01063606, + "balance_loss_clip": 1.04523659, + "balance_loss_mlp": 1.03615236, + "epoch": 0.06517360589207877, + "flos": 20332801186560.0, + "grad_norm": 2.6548721937377024, + "language_loss": 0.79365855, + "learning_rate": 3.987036821584348e-06, + "loss": 0.81567889, + "num_input_tokens_seen": 23111550, + "step": 1084, + "time_per_iteration": 2.633540391921997 + }, + { + "auxiliary_loss_clip": 0.01139855, + "auxiliary_loss_mlp": 0.01061762, + "balance_loss_clip": 1.04400873, + "balance_loss_mlp": 1.03542864, + "epoch": 0.06523372914474673, + "flos": 31681650061440.0, + "grad_norm": 3.072130119266088, + "language_loss": 0.66385347, + "learning_rate": 3.986992513289584e-06, + "loss": 0.68586963, + "num_input_tokens_seen": 23130335, + "step": 1085, + "time_per_iteration": 2.770937204360962 + }, + { + "auxiliary_loss_clip": 0.01140213, + "auxiliary_loss_mlp": 0.01071347, + "balance_loss_clip": 1.04427278, + "balance_loss_mlp": 1.04564571, + "epoch": 0.0652938523974147, + "flos": 20778165918720.0, + "grad_norm": 2.068166282893522, + "language_loss": 0.7634117, + "learning_rate": 3.9869481296478645e-06, + "loss": 0.78552735, + "num_input_tokens_seen": 23152380, + "step": 1086, + "time_per_iteration": 2.728756904602051 + }, + { + "auxiliary_loss_clip": 0.0117168, + "auxiliary_loss_mlp": 0.01062686, + "balance_loss_clip": 1.04669893, + "balance_loss_mlp": 1.03605437, + "epoch": 0.06535397565008266, + "flos": 16690993061760.0, + "grad_norm": 2.1642051847962325, + "language_loss": 0.85286832, + "learning_rate": 3.986903670660872e-06, + "loss": 0.87521195, + "num_input_tokens_seen": 23171630, + "step": 1087, + "time_per_iteration": 2.669414520263672 + }, + { + "auxiliary_loss_clip": 0.01160865, + "auxiliary_loss_mlp": 0.01063729, + "balance_loss_clip": 1.04828978, + "balance_loss_mlp": 1.0374192, + "epoch": 0.06541409890275064, + "flos": 26868220116480.0, + "grad_norm": 1.796209292378719, + "language_loss": 0.77644306, + "learning_rate": 3.9868591363302945e-06, + "loss": 0.79868901, + "num_input_tokens_seen": 23192520, + "step": 1088, + "time_per_iteration": 2.796234607696533 + }, + { + "auxiliary_loss_clip": 0.01166928, + "auxiliary_loss_mlp": 0.01066696, + "balance_loss_clip": 1.04957795, + "balance_loss_mlp": 1.04129195, + "epoch": 0.06547422215541861, + "flos": 20521620005760.0, + "grad_norm": 2.100648511377207, + "language_loss": 0.71223706, + "learning_rate": 3.9868145266578186e-06, + "loss": 0.73457325, + "num_input_tokens_seen": 23210710, + "step": 1089, + "time_per_iteration": 2.673576593399048 + }, + { + "auxiliary_loss_clip": 0.01142119, + "auxiliary_loss_mlp": 0.00750189, + "balance_loss_clip": 1.04666579, + "balance_loss_mlp": 1.00119865, + "epoch": 0.06553434540808657, + "flos": 22016616992640.0, + "grad_norm": 1.653068309127836, + "language_loss": 0.85756618, + "learning_rate": 3.9867698416451366e-06, + "loss": 0.87648928, + "num_input_tokens_seen": 23230305, + "step": 1090, + "time_per_iteration": 2.8053059577941895 + }, + { + "auxiliary_loss_clip": 0.01188752, + "auxiliary_loss_mlp": 0.0106049, + "balance_loss_clip": 1.05188239, + "balance_loss_mlp": 1.0346936, + "epoch": 0.06559446866075455, + "flos": 24608649208320.0, + "grad_norm": 1.9977885354001763, + "language_loss": 0.71811724, + "learning_rate": 3.9867250812939434e-06, + "loss": 0.74060965, + "num_input_tokens_seen": 23249015, + "step": 1091, + "time_per_iteration": 2.595785140991211 + }, + { + "auxiliary_loss_clip": 0.01099127, + "auxiliary_loss_mlp": 0.01063714, + "balance_loss_clip": 1.04263842, + "balance_loss_mlp": 1.03732061, + "epoch": 0.06565459191342252, + "flos": 24274679529600.0, + "grad_norm": 2.6186023521192165, + "language_loss": 0.82532442, + "learning_rate": 3.986680245605936e-06, + "loss": 0.8469528, + "num_input_tokens_seen": 23265105, + "step": 1092, + "time_per_iteration": 3.0514519214630127 + }, + { + "auxiliary_loss_clip": 0.01190818, + "auxiliary_loss_mlp": 0.0106712, + "balance_loss_clip": 1.05019939, + "balance_loss_mlp": 1.03960645, + "epoch": 0.06571471516609048, + "flos": 24787124910720.0, + "grad_norm": 1.897273052948154, + "language_loss": 0.71580327, + "learning_rate": 3.986635334582814e-06, + "loss": 0.73838258, + "num_input_tokens_seen": 23283950, + "step": 1093, + "time_per_iteration": 2.831453800201416 + }, + { + "auxiliary_loss_clip": 0.01163764, + "auxiliary_loss_mlp": 0.010685, + "balance_loss_clip": 1.05048108, + "balance_loss_mlp": 1.0410347, + "epoch": 0.06577483841875846, + "flos": 26214071581440.0, + "grad_norm": 1.762382863195975, + "language_loss": 0.87929606, + "learning_rate": 3.986590348226282e-06, + "loss": 0.90161872, + "num_input_tokens_seen": 23305005, + "step": 1094, + "time_per_iteration": 2.71897292137146 + }, + { + "auxiliary_loss_clip": 0.01166048, + "auxiliary_loss_mlp": 0.01066947, + "balance_loss_clip": 1.04897439, + "balance_loss_mlp": 1.03870654, + "epoch": 0.06583496167142643, + "flos": 25080802508160.0, + "grad_norm": 1.6787178127677846, + "language_loss": 0.81456739, + "learning_rate": 3.986545286538044e-06, + "loss": 0.83689737, + "num_input_tokens_seen": 23323220, + "step": 1095, + "time_per_iteration": 2.692582607269287 + }, + { + "auxiliary_loss_clip": 0.01147493, + "auxiliary_loss_mlp": 0.01060238, + "balance_loss_clip": 1.05186927, + "balance_loss_mlp": 1.03535855, + "epoch": 0.06589508492409439, + "flos": 25629804956160.0, + "grad_norm": 3.43720167433506, + "language_loss": 0.69808376, + "learning_rate": 3.986500149519811e-06, + "loss": 0.72016108, + "num_input_tokens_seen": 23342235, + "step": 1096, + "time_per_iteration": 2.6942174434661865 + }, + { + "auxiliary_loss_clip": 0.01177295, + "auxiliary_loss_mlp": 0.01074218, + "balance_loss_clip": 1.05219507, + "balance_loss_mlp": 1.04829025, + "epoch": 0.06595520817676236, + "flos": 23621249266560.0, + "grad_norm": 1.9236330416609584, + "language_loss": 0.77644819, + "learning_rate": 3.986454937173292e-06, + "loss": 0.79896331, + "num_input_tokens_seen": 23363680, + "step": 1097, + "time_per_iteration": 2.6775128841400146 + }, + { + "auxiliary_loss_clip": 0.01188364, + "auxiliary_loss_mlp": 0.01062008, + "balance_loss_clip": 1.05114436, + "balance_loss_mlp": 1.03643799, + "epoch": 0.06601533142943034, + "flos": 33801708545280.0, + "grad_norm": 3.4883016589507436, + "language_loss": 0.78098387, + "learning_rate": 3.986409649500203e-06, + "loss": 0.80348754, + "num_input_tokens_seen": 23385590, + "step": 1098, + "time_per_iteration": 2.6712281703948975 + }, + { + "auxiliary_loss_clip": 0.0116965, + "auxiliary_loss_mlp": 0.01076562, + "balance_loss_clip": 1.04953337, + "balance_loss_mlp": 1.04969263, + "epoch": 0.0660754546820983, + "flos": 20259184262400.0, + "grad_norm": 2.021384073864871, + "language_loss": 0.82064164, + "learning_rate": 3.986364286502261e-06, + "loss": 0.84310377, + "num_input_tokens_seen": 23402945, + "step": 1099, + "time_per_iteration": 2.6197142601013184 + }, + { + "auxiliary_loss_clip": 0.0115011, + "auxiliary_loss_mlp": 0.01057409, + "balance_loss_clip": 1.04385114, + "balance_loss_mlp": 1.03096807, + "epoch": 0.06613557793476627, + "flos": 19354164163200.0, + "grad_norm": 2.0160668264249857, + "language_loss": 0.82651931, + "learning_rate": 3.986318848181186e-06, + "loss": 0.84859443, + "num_input_tokens_seen": 23421410, + "step": 1100, + "time_per_iteration": 2.5771257877349854 + }, + { + "auxiliary_loss_clip": 0.01170835, + "auxiliary_loss_mlp": 0.01067381, + "balance_loss_clip": 1.05767322, + "balance_loss_mlp": 1.04169106, + "epoch": 0.06619570118743424, + "flos": 13772568936960.0, + "grad_norm": 2.258364032174232, + "language_loss": 0.73275721, + "learning_rate": 3.986273334538702e-06, + "loss": 0.75513935, + "num_input_tokens_seen": 23438870, + "step": 1101, + "time_per_iteration": 2.7263834476470947 + }, + { + "auxiliary_loss_clip": 0.01174652, + "auxiliary_loss_mlp": 0.01061266, + "balance_loss_clip": 1.04821014, + "balance_loss_mlp": 1.03579068, + "epoch": 0.06625582444010221, + "flos": 17857874286720.0, + "grad_norm": 3.0452738226865197, + "language_loss": 0.86137784, + "learning_rate": 3.986227745576533e-06, + "loss": 0.88373697, + "num_input_tokens_seen": 23456975, + "step": 1102, + "time_per_iteration": 2.5492281913757324 + }, + { + "auxiliary_loss_clip": 0.01163418, + "auxiliary_loss_mlp": 0.01065766, + "balance_loss_clip": 1.05024076, + "balance_loss_mlp": 1.0396595, + "epoch": 0.06631594769277017, + "flos": 11838707579520.0, + "grad_norm": 2.057783633537324, + "language_loss": 0.81172359, + "learning_rate": 3.98618208129641e-06, + "loss": 0.83401543, + "num_input_tokens_seen": 23473440, + "step": 1103, + "time_per_iteration": 2.719926118850708 + }, + { + "auxiliary_loss_clip": 0.01174172, + "auxiliary_loss_mlp": 0.00750074, + "balance_loss_clip": 1.05096829, + "balance_loss_mlp": 1.00113904, + "epoch": 0.06637607094543815, + "flos": 19793351756160.0, + "grad_norm": 1.810007270579655, + "language_loss": 0.82102644, + "learning_rate": 3.986136341700063e-06, + "loss": 0.84026891, + "num_input_tokens_seen": 23493880, + "step": 1104, + "time_per_iteration": 2.6811788082122803 + }, + { + "auxiliary_loss_clip": 0.01141586, + "auxiliary_loss_mlp": 0.01050445, + "balance_loss_clip": 1.04347944, + "balance_loss_mlp": 1.024755, + "epoch": 0.06643619419810612, + "flos": 25485659677440.0, + "grad_norm": 1.688042406353061, + "language_loss": 0.80388278, + "learning_rate": 3.986090526789227e-06, + "loss": 0.8258031, + "num_input_tokens_seen": 23514920, + "step": 1105, + "time_per_iteration": 2.7193045616149902 + }, + { + "auxiliary_loss_clip": 0.01155138, + "auxiliary_loss_mlp": 0.01063731, + "balance_loss_clip": 1.0487299, + "balance_loss_mlp": 1.04041362, + "epoch": 0.06649631745077408, + "flos": 16946533393920.0, + "grad_norm": 1.9009025173710377, + "language_loss": 0.96769869, + "learning_rate": 3.986044636565639e-06, + "loss": 0.98988736, + "num_input_tokens_seen": 23531635, + "step": 1106, + "time_per_iteration": 2.652068614959717 + }, + { + "auxiliary_loss_clip": 0.01178055, + "auxiliary_loss_mlp": 0.01059345, + "balance_loss_clip": 1.05010319, + "balance_loss_mlp": 1.03282106, + "epoch": 0.06655644070344206, + "flos": 17858592558720.0, + "grad_norm": 1.9803498888204225, + "language_loss": 0.82369053, + "learning_rate": 3.985998671031039e-06, + "loss": 0.84606457, + "num_input_tokens_seen": 23551020, + "step": 1107, + "time_per_iteration": 2.6310088634490967 + }, + { + "auxiliary_loss_clip": 0.01051224, + "auxiliary_loss_mlp": 0.0100519, + "balance_loss_clip": 1.00776887, + "balance_loss_mlp": 1.0018996, + "epoch": 0.06661656395611003, + "flos": 61419350021760.0, + "grad_norm": 0.7977620691463387, + "language_loss": 0.56724632, + "learning_rate": 3.9859526301871705e-06, + "loss": 0.5878104, + "num_input_tokens_seen": 23610675, + "step": 1108, + "time_per_iteration": 3.1329071521759033 + }, + { + "auxiliary_loss_clip": 0.01154255, + "auxiliary_loss_mlp": 0.01061511, + "balance_loss_clip": 1.04210818, + "balance_loss_mlp": 1.0350585, + "epoch": 0.066676687208778, + "flos": 20662856282880.0, + "grad_norm": 2.9508537735852918, + "language_loss": 0.72944367, + "learning_rate": 3.9859065140357795e-06, + "loss": 0.75160134, + "num_input_tokens_seen": 23628710, + "step": 1109, + "time_per_iteration": 2.8411240577697754 + }, + { + "auxiliary_loss_clip": 0.01129232, + "auxiliary_loss_mlp": 0.01067514, + "balance_loss_clip": 1.04546738, + "balance_loss_mlp": 1.04133546, + "epoch": 0.06673681046144596, + "flos": 20923280864640.0, + "grad_norm": 1.6694449999591172, + "language_loss": 0.78093112, + "learning_rate": 3.985860322578614e-06, + "loss": 0.80289853, + "num_input_tokens_seen": 23649160, + "step": 1110, + "time_per_iteration": 2.795570135116577 + }, + { + "auxiliary_loss_clip": 0.01126703, + "auxiliary_loss_mlp": 0.01061083, + "balance_loss_clip": 1.04358542, + "balance_loss_mlp": 1.03606105, + "epoch": 0.06679693371411394, + "flos": 31065818359680.0, + "grad_norm": 1.819864677709477, + "language_loss": 0.71385098, + "learning_rate": 3.985814055817427e-06, + "loss": 0.73572886, + "num_input_tokens_seen": 23671995, + "step": 1111, + "time_per_iteration": 2.791295051574707 + }, + { + "auxiliary_loss_clip": 0.01147202, + "auxiliary_loss_mlp": 0.01068611, + "balance_loss_clip": 1.04600167, + "balance_loss_mlp": 1.04317141, + "epoch": 0.0668570569667819, + "flos": 21726135705600.0, + "grad_norm": 2.0927828218195943, + "language_loss": 0.7846579, + "learning_rate": 3.985767713753971e-06, + "loss": 0.80681604, + "num_input_tokens_seen": 23690705, + "step": 1112, + "time_per_iteration": 2.708803176879883 + }, + { + "auxiliary_loss_clip": 0.01123452, + "auxiliary_loss_mlp": 0.01060813, + "balance_loss_clip": 1.04272401, + "balance_loss_mlp": 1.03625643, + "epoch": 0.06691718021944987, + "flos": 22747255539840.0, + "grad_norm": 1.9240146312191644, + "language_loss": 0.78979254, + "learning_rate": 3.985721296390005e-06, + "loss": 0.81163514, + "num_input_tokens_seen": 23709990, + "step": 1113, + "time_per_iteration": 2.8085057735443115 + }, + { + "auxiliary_loss_clip": 0.0111166, + "auxiliary_loss_mlp": 0.01055872, + "balance_loss_clip": 1.03728771, + "balance_loss_mlp": 1.03161252, + "epoch": 0.06697730347211785, + "flos": 16545626720640.0, + "grad_norm": 1.9607137684861602, + "language_loss": 0.82322413, + "learning_rate": 3.985674803727289e-06, + "loss": 0.84489942, + "num_input_tokens_seen": 23728485, + "step": 1114, + "time_per_iteration": 2.7851362228393555 + }, + { + "auxiliary_loss_clip": 0.01023274, + "auxiliary_loss_mlp": 0.01005465, + "balance_loss_clip": 1.01069856, + "balance_loss_mlp": 1.00203216, + "epoch": 0.06703742672478581, + "flos": 59782326658560.0, + "grad_norm": 0.8417090220823316, + "language_loss": 0.58154339, + "learning_rate": 3.985628235767584e-06, + "loss": 0.60183078, + "num_input_tokens_seen": 23786650, + "step": 1115, + "time_per_iteration": 3.532572031021118 + }, + { + "auxiliary_loss_clip": 0.01161908, + "auxiliary_loss_mlp": 0.010636, + "balance_loss_clip": 1.05192173, + "balance_loss_mlp": 1.0368855, + "epoch": 0.06709754997745378, + "flos": 16800197385600.0, + "grad_norm": 3.5332189897813944, + "language_loss": 0.91302788, + "learning_rate": 3.985581592512658e-06, + "loss": 0.93528295, + "num_input_tokens_seen": 23802555, + "step": 1116, + "time_per_iteration": 2.8721187114715576 + }, + { + "auxiliary_loss_clip": 0.01146756, + "auxiliary_loss_mlp": 0.00750111, + "balance_loss_clip": 1.04905415, + "balance_loss_mlp": 1.00114775, + "epoch": 0.06715767323012176, + "flos": 22123917895680.0, + "grad_norm": 1.8232335402386757, + "language_loss": 0.86952102, + "learning_rate": 3.985534873964279e-06, + "loss": 0.88848972, + "num_input_tokens_seen": 23822945, + "step": 1117, + "time_per_iteration": 2.748264789581299 + }, + { + "auxiliary_loss_clip": 0.01046102, + "auxiliary_loss_mlp": 0.01006315, + "balance_loss_clip": 1.00576019, + "balance_loss_mlp": 1.00278592, + "epoch": 0.06721779648278972, + "flos": 66618100137600.0, + "grad_norm": 0.8603449326825978, + "language_loss": 0.59740472, + "learning_rate": 3.985488080124218e-06, + "loss": 0.61792898, + "num_input_tokens_seen": 23874075, + "step": 1118, + "time_per_iteration": 3.2174503803253174 + }, + { + "auxiliary_loss_clip": 0.01151174, + "auxiliary_loss_mlp": 0.01056975, + "balance_loss_clip": 1.04292083, + "balance_loss_mlp": 1.03223968, + "epoch": 0.06727791973545769, + "flos": 22382474970240.0, + "grad_norm": 3.6702463640476, + "language_loss": 0.82659376, + "learning_rate": 3.985441210994251e-06, + "loss": 0.84867525, + "num_input_tokens_seen": 23889720, + "step": 1119, + "time_per_iteration": 2.598008632659912 + }, + { + "auxiliary_loss_clip": 0.01158681, + "auxiliary_loss_mlp": 0.01063216, + "balance_loss_clip": 1.04822469, + "balance_loss_mlp": 1.0401485, + "epoch": 0.06733804298812565, + "flos": 24280210224000.0, + "grad_norm": 4.425135710162499, + "language_loss": 0.84874892, + "learning_rate": 3.9853942665761545e-06, + "loss": 0.87096786, + "num_input_tokens_seen": 23909385, + "step": 1120, + "time_per_iteration": 2.708371877670288 + }, + { + "auxiliary_loss_clip": 0.01194368, + "auxiliary_loss_mlp": 0.01068091, + "balance_loss_clip": 1.05732465, + "balance_loss_mlp": 1.04249692, + "epoch": 0.06739816624079363, + "flos": 15918230839680.0, + "grad_norm": 1.8840520596428396, + "language_loss": 0.78539371, + "learning_rate": 3.985347246871708e-06, + "loss": 0.80801827, + "num_input_tokens_seen": 23926830, + "step": 1121, + "time_per_iteration": 2.5164737701416016 + }, + { + "auxiliary_loss_clip": 0.01042408, + "auxiliary_loss_mlp": 0.01007861, + "balance_loss_clip": 1.0046345, + "balance_loss_mlp": 1.00471401, + "epoch": 0.0674582894934616, + "flos": 71398567353600.0, + "grad_norm": 0.758737737206602, + "language_loss": 0.58363283, + "learning_rate": 3.985300151882694e-06, + "loss": 0.60413551, + "num_input_tokens_seen": 23992640, + "step": 1122, + "time_per_iteration": 3.3538413047790527 + }, + { + "auxiliary_loss_clip": 0.01127255, + "auxiliary_loss_mlp": 0.01065652, + "balance_loss_clip": 1.04939699, + "balance_loss_mlp": 1.04074931, + "epoch": 0.06751841274612956, + "flos": 25264952559360.0, + "grad_norm": 1.895040259476689, + "language_loss": 0.713916, + "learning_rate": 3.985252981610901e-06, + "loss": 0.73584509, + "num_input_tokens_seen": 24011135, + "step": 1123, + "time_per_iteration": 2.8050596714019775 + }, + { + "auxiliary_loss_clip": 0.01120711, + "auxiliary_loss_mlp": 0.01063026, + "balance_loss_clip": 1.04463935, + "balance_loss_mlp": 1.03529799, + "epoch": 0.06757853599879754, + "flos": 23802741711360.0, + "grad_norm": 2.319817317856014, + "language_loss": 0.78964627, + "learning_rate": 3.985205736058114e-06, + "loss": 0.81148362, + "num_input_tokens_seen": 24030695, + "step": 1124, + "time_per_iteration": 2.7649309635162354 + }, + { + "auxiliary_loss_clip": 0.01166426, + "auxiliary_loss_mlp": 0.01060591, + "balance_loss_clip": 1.04689777, + "balance_loss_mlp": 1.036654, + "epoch": 0.0676386592514655, + "flos": 21033742164480.0, + "grad_norm": 2.2764034957143378, + "language_loss": 0.71231866, + "learning_rate": 3.985158415226128e-06, + "loss": 0.7345888, + "num_input_tokens_seen": 24050680, + "step": 1125, + "time_per_iteration": 4.290425777435303 + }, + { + "auxiliary_loss_clip": 0.01148595, + "auxiliary_loss_mlp": 0.01072653, + "balance_loss_clip": 1.05057526, + "balance_loss_mlp": 1.04468632, + "epoch": 0.06769878250413347, + "flos": 25556331686400.0, + "grad_norm": 2.74901359076824, + "language_loss": 0.81253666, + "learning_rate": 3.985111019116736e-06, + "loss": 0.8347491, + "num_input_tokens_seen": 24067205, + "step": 1126, + "time_per_iteration": 4.376361846923828 + }, + { + "auxiliary_loss_clip": 0.01036396, + "auxiliary_loss_mlp": 0.01003747, + "balance_loss_clip": 1.00664973, + "balance_loss_mlp": 1.0005765, + "epoch": 0.06775890575680145, + "flos": 70655251305600.0, + "grad_norm": 0.783278387009044, + "language_loss": 0.59745264, + "learning_rate": 3.985063547731735e-06, + "loss": 0.61785406, + "num_input_tokens_seen": 24131320, + "step": 1127, + "time_per_iteration": 6.409546136856079 + }, + { + "auxiliary_loss_clip": 0.01183556, + "auxiliary_loss_mlp": 0.01058367, + "balance_loss_clip": 1.05218911, + "balance_loss_mlp": 1.03361869, + "epoch": 0.06781902900946941, + "flos": 24235500769920.0, + "grad_norm": 2.2604490951514515, + "language_loss": 0.81471658, + "learning_rate": 3.985016001072925e-06, + "loss": 0.83713579, + "num_input_tokens_seen": 24149930, + "step": 1128, + "time_per_iteration": 2.7699291706085205 + }, + { + "auxiliary_loss_clip": 0.01129641, + "auxiliary_loss_mlp": 0.01053045, + "balance_loss_clip": 1.04481292, + "balance_loss_mlp": 1.02726007, + "epoch": 0.06787915226213738, + "flos": 22417523665920.0, + "grad_norm": 2.3763223543894867, + "language_loss": 0.7579031, + "learning_rate": 3.984968379142109e-06, + "loss": 0.7797299, + "num_input_tokens_seen": 24169590, + "step": 1129, + "time_per_iteration": 2.8068654537200928 + }, + { + "auxiliary_loss_clip": 0.01088913, + "auxiliary_loss_mlp": 0.01061322, + "balance_loss_clip": 1.03760469, + "balance_loss_mlp": 1.0357039, + "epoch": 0.06793927551480534, + "flos": 37706922080640.0, + "grad_norm": 2.511512561330808, + "language_loss": 0.71810544, + "learning_rate": 3.984920681941094e-06, + "loss": 0.73960781, + "num_input_tokens_seen": 24189965, + "step": 1130, + "time_per_iteration": 3.121631383895874 + }, + { + "auxiliary_loss_clip": 0.01137582, + "auxiliary_loss_mlp": 0.0106379, + "balance_loss_clip": 1.0474, + "balance_loss_mlp": 1.03812408, + "epoch": 0.06799939876747332, + "flos": 20631398947200.0, + "grad_norm": 2.1829839228415864, + "language_loss": 0.80643773, + "learning_rate": 3.984872909471688e-06, + "loss": 0.82845145, + "num_input_tokens_seen": 24208045, + "step": 1131, + "time_per_iteration": 3.194866895675659 + }, + { + "auxiliary_loss_clip": 0.01170567, + "auxiliary_loss_mlp": 0.01071712, + "balance_loss_clip": 1.04939151, + "balance_loss_mlp": 1.0471431, + "epoch": 0.06805952202014129, + "flos": 14864755829760.0, + "grad_norm": 2.154857224662258, + "language_loss": 0.80237246, + "learning_rate": 3.984825061735701e-06, + "loss": 0.82479531, + "num_input_tokens_seen": 24223805, + "step": 1132, + "time_per_iteration": 2.625993251800537 + }, + { + "auxiliary_loss_clip": 0.01147205, + "auxiliary_loss_mlp": 0.01066397, + "balance_loss_clip": 1.04592001, + "balance_loss_mlp": 1.04186416, + "epoch": 0.06811964527280925, + "flos": 48909434947200.0, + "grad_norm": 1.4772128161532934, + "language_loss": 0.63564211, + "learning_rate": 3.9847771387349495e-06, + "loss": 0.65777814, + "num_input_tokens_seen": 24249475, + "step": 1133, + "time_per_iteration": 2.9563755989074707 + }, + { + "auxiliary_loss_clip": 0.01097629, + "auxiliary_loss_mlp": 0.01061183, + "balance_loss_clip": 1.03750634, + "balance_loss_mlp": 1.03269243, + "epoch": 0.06817976852547723, + "flos": 15377273038080.0, + "grad_norm": 1.9851342334833013, + "language_loss": 0.74672759, + "learning_rate": 3.9847291404712506e-06, + "loss": 0.76831567, + "num_input_tokens_seen": 24267980, + "step": 1134, + "time_per_iteration": 2.8028295040130615 + }, + { + "auxiliary_loss_clip": 0.01144344, + "auxiliary_loss_mlp": 0.00750047, + "balance_loss_clip": 1.04713035, + "balance_loss_mlp": 1.00104845, + "epoch": 0.0682398917781452, + "flos": 20155690200960.0, + "grad_norm": 1.8533994546306107, + "language_loss": 0.87339562, + "learning_rate": 3.984681066946423e-06, + "loss": 0.89233953, + "num_input_tokens_seen": 24286805, + "step": 1135, + "time_per_iteration": 2.7393252849578857 + }, + { + "auxiliary_loss_clip": 0.01151952, + "auxiliary_loss_mlp": 0.00749939, + "balance_loss_clip": 1.04422712, + "balance_loss_mlp": 1.00092673, + "epoch": 0.06830001503081316, + "flos": 23440618748160.0, + "grad_norm": 2.627579165285243, + "language_loss": 0.77996176, + "learning_rate": 3.984632918162291e-06, + "loss": 0.79898065, + "num_input_tokens_seen": 24305855, + "step": 1136, + "time_per_iteration": 2.6411290168762207 + }, + { + "auxiliary_loss_clip": 0.01165454, + "auxiliary_loss_mlp": 0.01071032, + "balance_loss_clip": 1.0510397, + "balance_loss_mlp": 1.04525876, + "epoch": 0.06836013828348114, + "flos": 34349813153280.0, + "grad_norm": 2.29983566670342, + "language_loss": 0.84430182, + "learning_rate": 3.984584694120679e-06, + "loss": 0.86666656, + "num_input_tokens_seen": 24326535, + "step": 1137, + "time_per_iteration": 2.763791799545288 + }, + { + "auxiliary_loss_clip": 0.01122789, + "auxiliary_loss_mlp": 0.01062204, + "balance_loss_clip": 1.04043043, + "balance_loss_mlp": 1.03670514, + "epoch": 0.06842026153614911, + "flos": 23148844571520.0, + "grad_norm": 2.265406285851197, + "language_loss": 0.7867915, + "learning_rate": 3.984536394823418e-06, + "loss": 0.80864143, + "num_input_tokens_seen": 24345810, + "step": 1138, + "time_per_iteration": 2.7212166786193848 + }, + { + "auxiliary_loss_clip": 0.01185586, + "auxiliary_loss_mlp": 0.01060532, + "balance_loss_clip": 1.05091679, + "balance_loss_mlp": 1.03525996, + "epoch": 0.06848038478881707, + "flos": 24608972430720.0, + "grad_norm": 2.2978293758503874, + "language_loss": 0.85408813, + "learning_rate": 3.984488020272336e-06, + "loss": 0.87654924, + "num_input_tokens_seen": 24366095, + "step": 1139, + "time_per_iteration": 2.702258825302124 + }, + { + "auxiliary_loss_clip": 0.01124631, + "auxiliary_loss_mlp": 0.01060483, + "balance_loss_clip": 1.04204071, + "balance_loss_mlp": 1.03392351, + "epoch": 0.06854050804148504, + "flos": 40880994278400.0, + "grad_norm": 1.6794230746784617, + "language_loss": 0.74962342, + "learning_rate": 3.984439570469271e-06, + "loss": 0.77147454, + "num_input_tokens_seen": 24388665, + "step": 1140, + "time_per_iteration": 2.8252861499786377 + }, + { + "auxiliary_loss_clip": 0.01161017, + "auxiliary_loss_mlp": 0.00749898, + "balance_loss_clip": 1.04748392, + "balance_loss_mlp": 1.00089574, + "epoch": 0.06860063129415302, + "flos": 31686354743040.0, + "grad_norm": 1.9922037352370774, + "language_loss": 0.68357742, + "learning_rate": 3.9843910454160574e-06, + "loss": 0.70268661, + "num_input_tokens_seen": 24407705, + "step": 1141, + "time_per_iteration": 2.779557466506958 + }, + { + "auxiliary_loss_clip": 0.01171133, + "auxiliary_loss_mlp": 0.01068597, + "balance_loss_clip": 1.04743862, + "balance_loss_mlp": 1.04265738, + "epoch": 0.06866075454682098, + "flos": 26542007775360.0, + "grad_norm": 2.3184477540890445, + "language_loss": 0.79349613, + "learning_rate": 3.984342445114538e-06, + "loss": 0.81589341, + "num_input_tokens_seen": 24428390, + "step": 1142, + "time_per_iteration": 2.637511968612671 + }, + { + "auxiliary_loss_clip": 0.01163763, + "auxiliary_loss_mlp": 0.01063357, + "balance_loss_clip": 1.04974198, + "balance_loss_mlp": 1.03795362, + "epoch": 0.06872087779948895, + "flos": 29789768724480.0, + "grad_norm": 1.6929893744704865, + "language_loss": 0.68747234, + "learning_rate": 3.984293769566553e-06, + "loss": 0.7097435, + "num_input_tokens_seen": 24450810, + "step": 1143, + "time_per_iteration": 2.7947301864624023 + }, + { + "auxiliary_loss_clip": 0.01148149, + "auxiliary_loss_mlp": 0.01060482, + "balance_loss_clip": 1.04534566, + "balance_loss_mlp": 1.03698599, + "epoch": 0.06878100105215693, + "flos": 26941118768640.0, + "grad_norm": 1.710369541314542, + "language_loss": 0.7438705, + "learning_rate": 3.98424501877395e-06, + "loss": 0.76595688, + "num_input_tokens_seen": 24469965, + "step": 1144, + "time_per_iteration": 2.597541332244873 + }, + { + "auxiliary_loss_clip": 0.01161992, + "auxiliary_loss_mlp": 0.01069643, + "balance_loss_clip": 1.04619765, + "balance_loss_mlp": 1.04401302, + "epoch": 0.06884112430482489, + "flos": 10670748946560.0, + "grad_norm": 2.3726011914502747, + "language_loss": 0.91545546, + "learning_rate": 3.984196192738577e-06, + "loss": 0.9377718, + "num_input_tokens_seen": 24486370, + "step": 1145, + "time_per_iteration": 2.667846918106079 + }, + { + "auxiliary_loss_clip": 0.01186666, + "auxiliary_loss_mlp": 0.01067394, + "balance_loss_clip": 1.04899621, + "balance_loss_mlp": 1.04083455, + "epoch": 0.06890124755749286, + "flos": 20193647898240.0, + "grad_norm": 3.4490364898200534, + "language_loss": 0.8229655, + "learning_rate": 3.984147291462285e-06, + "loss": 0.84550613, + "num_input_tokens_seen": 24503780, + "step": 1146, + "time_per_iteration": 2.6422457695007324 + }, + { + "auxiliary_loss_clip": 0.01180231, + "auxiliary_loss_mlp": 0.01065428, + "balance_loss_clip": 1.05014801, + "balance_loss_mlp": 1.04187202, + "epoch": 0.06896137081016084, + "flos": 20449224144000.0, + "grad_norm": 2.1975926317659, + "language_loss": 0.85083532, + "learning_rate": 3.98409831494693e-06, + "loss": 0.87329185, + "num_input_tokens_seen": 24522320, + "step": 1147, + "time_per_iteration": 2.670198678970337 + }, + { + "auxiliary_loss_clip": 0.01135698, + "auxiliary_loss_mlp": 0.01071724, + "balance_loss_clip": 1.04677773, + "balance_loss_mlp": 1.04615331, + "epoch": 0.0690214940628288, + "flos": 18368703555840.0, + "grad_norm": 2.0387547069529184, + "language_loss": 0.85959876, + "learning_rate": 3.984049263194367e-06, + "loss": 0.88167298, + "num_input_tokens_seen": 24540445, + "step": 1148, + "time_per_iteration": 2.8269283771514893 + }, + { + "auxiliary_loss_clip": 0.01142656, + "auxiliary_loss_mlp": 0.01061053, + "balance_loss_clip": 1.04297233, + "balance_loss_mlp": 1.03579295, + "epoch": 0.06908161731549677, + "flos": 20558033418240.0, + "grad_norm": 6.988872488111155, + "language_loss": 0.69939274, + "learning_rate": 3.9840001362064575e-06, + "loss": 0.72142982, + "num_input_tokens_seen": 24557105, + "step": 1149, + "time_per_iteration": 2.6960554122924805 + }, + { + "auxiliary_loss_clip": 0.01184399, + "auxiliary_loss_mlp": 0.01053643, + "balance_loss_clip": 1.04897344, + "balance_loss_mlp": 1.02887082, + "epoch": 0.06914174056816474, + "flos": 27563666313600.0, + "grad_norm": 2.079215771758705, + "language_loss": 0.83672118, + "learning_rate": 3.983950933985064e-06, + "loss": 0.85910159, + "num_input_tokens_seen": 24578240, + "step": 1150, + "time_per_iteration": 2.5994343757629395 + }, + { + "auxiliary_loss_clip": 0.01161132, + "auxiliary_loss_mlp": 0.01058127, + "balance_loss_clip": 1.05131519, + "balance_loss_mlp": 1.0328424, + "epoch": 0.06920186382083271, + "flos": 15304015249920.0, + "grad_norm": 3.498069284839412, + "language_loss": 0.81305867, + "learning_rate": 3.983901656532052e-06, + "loss": 0.83525121, + "num_input_tokens_seen": 24593585, + "step": 1151, + "time_per_iteration": 2.6794798374176025 + }, + { + "auxiliary_loss_clip": 0.01180976, + "auxiliary_loss_mlp": 0.01057044, + "balance_loss_clip": 1.05147743, + "balance_loss_mlp": 1.03334546, + "epoch": 0.06926198707350067, + "flos": 25191227894400.0, + "grad_norm": 1.8789144369960948, + "language_loss": 0.85714, + "learning_rate": 3.983852303849291e-06, + "loss": 0.87952024, + "num_input_tokens_seen": 24613110, + "step": 1152, + "time_per_iteration": 2.7225661277770996 + }, + { + "auxiliary_loss_clip": 0.01164369, + "auxiliary_loss_mlp": 0.01063593, + "balance_loss_clip": 1.04895329, + "balance_loss_mlp": 1.0401926, + "epoch": 0.06932211032616864, + "flos": 13256137146240.0, + "grad_norm": 2.174106360654755, + "language_loss": 0.90766817, + "learning_rate": 3.983802875938651e-06, + "loss": 0.92994773, + "num_input_tokens_seen": 24628795, + "step": 1153, + "time_per_iteration": 2.7396230697631836 + }, + { + "auxiliary_loss_clip": 0.01141057, + "auxiliary_loss_mlp": 0.01054153, + "balance_loss_clip": 1.04508615, + "balance_loss_mlp": 1.02972746, + "epoch": 0.06938223357883662, + "flos": 24827381078400.0, + "grad_norm": 2.0062766420343197, + "language_loss": 0.82014358, + "learning_rate": 3.983753372802008e-06, + "loss": 0.84209567, + "num_input_tokens_seen": 24645480, + "step": 1154, + "time_per_iteration": 2.7576379776000977 + }, + { + "auxiliary_loss_clip": 0.01167173, + "auxiliary_loss_mlp": 0.01064043, + "balance_loss_clip": 1.05870116, + "balance_loss_mlp": 1.03998613, + "epoch": 0.06944235683150458, + "flos": 27267977554560.0, + "grad_norm": 1.9345155848689402, + "language_loss": 0.75347579, + "learning_rate": 3.983703794441237e-06, + "loss": 0.77578795, + "num_input_tokens_seen": 24664630, + "step": 1155, + "time_per_iteration": 2.836543321609497 + }, + { + "auxiliary_loss_clip": 0.01150024, + "auxiliary_loss_mlp": 0.00749956, + "balance_loss_clip": 1.04238892, + "balance_loss_mlp": 1.00095725, + "epoch": 0.06950248008417255, + "flos": 25808065176960.0, + "grad_norm": 1.8507268887516914, + "language_loss": 0.71009928, + "learning_rate": 3.98365414085822e-06, + "loss": 0.72909909, + "num_input_tokens_seen": 24684210, + "step": 1156, + "time_per_iteration": 2.716240406036377 + }, + { + "auxiliary_loss_clip": 0.01146915, + "auxiliary_loss_mlp": 0.00749924, + "balance_loss_clip": 1.04542077, + "balance_loss_mlp": 1.00084877, + "epoch": 0.06956260333684053, + "flos": 22271546793600.0, + "grad_norm": 1.8496301720437365, + "language_loss": 0.75395578, + "learning_rate": 3.98360441205484e-06, + "loss": 0.77292418, + "num_input_tokens_seen": 24702490, + "step": 1157, + "time_per_iteration": 2.696436643600464 + }, + { + "auxiliary_loss_clip": 0.01153294, + "auxiliary_loss_mlp": 0.01060221, + "balance_loss_clip": 1.04515207, + "balance_loss_mlp": 1.03561604, + "epoch": 0.0696227265895085, + "flos": 29681390413440.0, + "grad_norm": 1.974965240232157, + "language_loss": 0.71783209, + "learning_rate": 3.983554608032982e-06, + "loss": 0.73996723, + "num_input_tokens_seen": 24724340, + "step": 1158, + "time_per_iteration": 2.7730653285980225 + }, + { + "auxiliary_loss_clip": 0.0118275, + "auxiliary_loss_mlp": 0.01062756, + "balance_loss_clip": 1.04866028, + "balance_loss_mlp": 1.03774595, + "epoch": 0.06968284984217646, + "flos": 25523545547520.0, + "grad_norm": 1.8832143790350775, + "language_loss": 0.7997824, + "learning_rate": 3.983504728794533e-06, + "loss": 0.82223743, + "num_input_tokens_seen": 24745550, + "step": 1159, + "time_per_iteration": 2.7396373748779297 + }, + { + "auxiliary_loss_clip": 0.01183513, + "auxiliary_loss_mlp": 0.01065442, + "balance_loss_clip": 1.05077624, + "balance_loss_mlp": 1.03807199, + "epoch": 0.06974297309484444, + "flos": 20698192287360.0, + "grad_norm": 7.2219111798870514, + "language_loss": 0.80720294, + "learning_rate": 3.983454774341387e-06, + "loss": 0.82969248, + "num_input_tokens_seen": 24762575, + "step": 1160, + "time_per_iteration": 2.670172691345215 + }, + { + "auxiliary_loss_clip": 0.01161025, + "auxiliary_loss_mlp": 0.01060126, + "balance_loss_clip": 1.04383779, + "balance_loss_mlp": 1.03515124, + "epoch": 0.0698030963475124, + "flos": 26505199313280.0, + "grad_norm": 1.6700864389034629, + "language_loss": 0.76103485, + "learning_rate": 3.983404744675437e-06, + "loss": 0.78324634, + "num_input_tokens_seen": 24782605, + "step": 1161, + "time_per_iteration": 2.6751596927642822 + }, + { + "auxiliary_loss_clip": 0.01148681, + "auxiliary_loss_mlp": 0.01068402, + "balance_loss_clip": 1.04355633, + "balance_loss_mlp": 1.04347587, + "epoch": 0.06986321960018037, + "flos": 23040430346880.0, + "grad_norm": 1.8770500158609043, + "language_loss": 0.83000457, + "learning_rate": 3.9833546397985794e-06, + "loss": 0.85217547, + "num_input_tokens_seen": 24802910, + "step": 1162, + "time_per_iteration": 2.6737890243530273 + }, + { + "auxiliary_loss_clip": 0.0115099, + "auxiliary_loss_mlp": 0.01056767, + "balance_loss_clip": 1.04340827, + "balance_loss_mlp": 1.03132749, + "epoch": 0.06992334285284833, + "flos": 28584822061440.0, + "grad_norm": 1.800626566040732, + "language_loss": 0.79508722, + "learning_rate": 3.983304459712716e-06, + "loss": 0.81716472, + "num_input_tokens_seen": 24823305, + "step": 1163, + "time_per_iteration": 2.6798741817474365 + }, + { + "auxiliary_loss_clip": 0.01162831, + "auxiliary_loss_mlp": 0.01062343, + "balance_loss_clip": 1.04602563, + "balance_loss_mlp": 1.03685641, + "epoch": 0.06998346610551631, + "flos": 20595344670720.0, + "grad_norm": 2.1193828588078003, + "language_loss": 0.79146469, + "learning_rate": 3.983254204419749e-06, + "loss": 0.81371641, + "num_input_tokens_seen": 24842155, + "step": 1164, + "time_per_iteration": 2.595832109451294 + }, + { + "auxiliary_loss_clip": 0.01113704, + "auxiliary_loss_mlp": 0.01068064, + "balance_loss_clip": 1.04034483, + "balance_loss_mlp": 1.04140902, + "epoch": 0.07004358935818428, + "flos": 22528810978560.0, + "grad_norm": 2.1271298512499364, + "language_loss": 0.72904301, + "learning_rate": 3.983203873921583e-06, + "loss": 0.75086069, + "num_input_tokens_seen": 24862080, + "step": 1165, + "time_per_iteration": 2.7622063159942627 + }, + { + "auxiliary_loss_clip": 0.01153668, + "auxiliary_loss_mlp": 0.01058647, + "balance_loss_clip": 1.04497778, + "balance_loss_mlp": 1.03387558, + "epoch": 0.07010371261085224, + "flos": 28949997680640.0, + "grad_norm": 2.6893841035445827, + "language_loss": 0.81379628, + "learning_rate": 3.983153468220128e-06, + "loss": 0.83591944, + "num_input_tokens_seen": 24886165, + "step": 1166, + "time_per_iteration": 2.7433083057403564 + }, + { + "auxiliary_loss_clip": 0.01140091, + "auxiliary_loss_mlp": 0.01046083, + "balance_loss_clip": 1.04190338, + "balance_loss_mlp": 1.0211798, + "epoch": 0.07016383586352022, + "flos": 23659171050240.0, + "grad_norm": 2.5344704842432493, + "language_loss": 0.84889603, + "learning_rate": 3.983102987317295e-06, + "loss": 0.87075776, + "num_input_tokens_seen": 24905775, + "step": 1167, + "time_per_iteration": 2.6479573249816895 + }, + { + "auxiliary_loss_clip": 0.01165596, + "auxiliary_loss_mlp": 0.01062111, + "balance_loss_clip": 1.04761219, + "balance_loss_mlp": 1.03701794, + "epoch": 0.07022395911618819, + "flos": 19792130693760.0, + "grad_norm": 2.094661719027138, + "language_loss": 0.8963232, + "learning_rate": 3.983052431214997e-06, + "loss": 0.91860032, + "num_input_tokens_seen": 24924295, + "step": 1168, + "time_per_iteration": 2.6372263431549072 + }, + { + "auxiliary_loss_clip": 0.01156987, + "auxiliary_loss_mlp": 0.01071053, + "balance_loss_clip": 1.04464602, + "balance_loss_mlp": 1.0427053, + "epoch": 0.07028408236885615, + "flos": 21689147675520.0, + "grad_norm": 2.339077958260483, + "language_loss": 0.8885234, + "learning_rate": 3.983001799915153e-06, + "loss": 0.91080379, + "num_input_tokens_seen": 24943210, + "step": 1169, + "time_per_iteration": 2.795948028564453 + }, + { + "auxiliary_loss_clip": 0.01180164, + "auxiliary_loss_mlp": 0.01071422, + "balance_loss_clip": 1.04848206, + "balance_loss_mlp": 1.04600632, + "epoch": 0.07034420562152413, + "flos": 25630271832960.0, + "grad_norm": 4.880397743750786, + "language_loss": 0.839522, + "learning_rate": 3.982951093419681e-06, + "loss": 0.86203778, + "num_input_tokens_seen": 24960360, + "step": 1170, + "time_per_iteration": 2.6120059490203857 + }, + { + "auxiliary_loss_clip": 0.01146398, + "auxiliary_loss_mlp": 0.00749932, + "balance_loss_clip": 1.04580235, + "balance_loss_mlp": 1.0009408, + "epoch": 0.0704043288741921, + "flos": 20810449267200.0, + "grad_norm": 1.850191097316382, + "language_loss": 0.75407112, + "learning_rate": 3.982900311730506e-06, + "loss": 0.77303445, + "num_input_tokens_seen": 24978290, + "step": 1171, + "time_per_iteration": 2.748812675476074 + }, + { + "auxiliary_loss_clip": 0.01153892, + "auxiliary_loss_mlp": 0.01055512, + "balance_loss_clip": 1.04811132, + "balance_loss_mlp": 1.03156304, + "epoch": 0.07046445212686006, + "flos": 25593176062080.0, + "grad_norm": 1.792473353249838, + "language_loss": 0.8921814, + "learning_rate": 3.9828494548495514e-06, + "loss": 0.91427541, + "num_input_tokens_seen": 24997055, + "step": 1172, + "time_per_iteration": 2.7337491512298584 + }, + { + "auxiliary_loss_clip": 0.01161939, + "auxiliary_loss_mlp": 0.0105322, + "balance_loss_clip": 1.04736972, + "balance_loss_mlp": 1.02726865, + "epoch": 0.07052457537952803, + "flos": 25556978131200.0, + "grad_norm": 2.2588888818633275, + "language_loss": 0.81881285, + "learning_rate": 3.982798522778748e-06, + "loss": 0.84096444, + "num_input_tokens_seen": 25017490, + "step": 1173, + "time_per_iteration": 4.467356443405151 + }, + { + "auxiliary_loss_clip": 0.01171185, + "auxiliary_loss_mlp": 0.01058488, + "balance_loss_clip": 1.04881716, + "balance_loss_mlp": 1.03288209, + "epoch": 0.070584698632196, + "flos": 17968515154560.0, + "grad_norm": 21.766122966167718, + "language_loss": 0.82142347, + "learning_rate": 3.9827475155200245e-06, + "loss": 0.8437202, + "num_input_tokens_seen": 25035660, + "step": 1174, + "time_per_iteration": 4.3033607006073 + }, + { + "auxiliary_loss_clip": 0.01148944, + "auxiliary_loss_mlp": 0.01057529, + "balance_loss_clip": 1.04292727, + "balance_loss_mlp": 1.03288853, + "epoch": 0.07064482188486397, + "flos": 25370888745600.0, + "grad_norm": 1.9498701588813496, + "language_loss": 0.85370797, + "learning_rate": 3.982696433075317e-06, + "loss": 0.87577271, + "num_input_tokens_seen": 25054785, + "step": 1175, + "time_per_iteration": 4.3357672691345215 + }, + { + "auxiliary_loss_clip": 0.01166274, + "auxiliary_loss_mlp": 0.01063819, + "balance_loss_clip": 1.04793477, + "balance_loss_mlp": 1.04013193, + "epoch": 0.07070494513753194, + "flos": 24899848767360.0, + "grad_norm": 2.105038315468637, + "language_loss": 0.83414012, + "learning_rate": 3.982645275446563e-06, + "loss": 0.85644102, + "num_input_tokens_seen": 25075180, + "step": 1176, + "time_per_iteration": 2.634709596633911 + }, + { + "auxiliary_loss_clip": 0.01121891, + "auxiliary_loss_mlp": 0.01061932, + "balance_loss_clip": 1.04107106, + "balance_loss_mlp": 1.0363853, + "epoch": 0.07076506839019991, + "flos": 22338447874560.0, + "grad_norm": 2.4867322334428086, + "language_loss": 0.7464366, + "learning_rate": 3.982594042635701e-06, + "loss": 0.76827478, + "num_input_tokens_seen": 25093035, + "step": 1177, + "time_per_iteration": 2.661773204803467 + }, + { + "auxiliary_loss_clip": 0.01156508, + "auxiliary_loss_mlp": 0.01060229, + "balance_loss_clip": 1.04642463, + "balance_loss_mlp": 1.03359795, + "epoch": 0.07082519164286788, + "flos": 18660800954880.0, + "grad_norm": 1.7567229666188136, + "language_loss": 0.85813379, + "learning_rate": 3.982542734644673e-06, + "loss": 0.88030112, + "num_input_tokens_seen": 25112520, + "step": 1178, + "time_per_iteration": 2.7389655113220215 + }, + { + "auxiliary_loss_clip": 0.01039352, + "auxiliary_loss_mlp": 0.01014026, + "balance_loss_clip": 1.00847018, + "balance_loss_mlp": 1.0108552, + "epoch": 0.07088531489553584, + "flos": 63654107610240.0, + "grad_norm": 0.8466718540600141, + "language_loss": 0.63257611, + "learning_rate": 3.982491351475427e-06, + "loss": 0.65310979, + "num_input_tokens_seen": 25177760, + "step": 1179, + "time_per_iteration": 3.293729066848755 + }, + { + "auxiliary_loss_clip": 0.01180354, + "auxiliary_loss_mlp": 0.01062089, + "balance_loss_clip": 1.05422878, + "balance_loss_mlp": 1.03793716, + "epoch": 0.07094543814820382, + "flos": 21572688804480.0, + "grad_norm": 2.6481077514210867, + "language_loss": 0.83999646, + "learning_rate": 3.98243989312991e-06, + "loss": 0.86242086, + "num_input_tokens_seen": 25195260, + "step": 1180, + "time_per_iteration": 2.63338041305542 + }, + { + "auxiliary_loss_clip": 0.01149034, + "auxiliary_loss_mlp": 0.01066706, + "balance_loss_clip": 1.04582214, + "balance_loss_mlp": 1.04155338, + "epoch": 0.07100556140087179, + "flos": 22089946608000.0, + "grad_norm": 2.3226804989146936, + "language_loss": 0.88643503, + "learning_rate": 3.982388359610074e-06, + "loss": 0.90859246, + "num_input_tokens_seen": 25212740, + "step": 1181, + "time_per_iteration": 2.6330769062042236 + }, + { + "auxiliary_loss_clip": 0.0115755, + "auxiliary_loss_mlp": 0.01065472, + "balance_loss_clip": 1.05387878, + "balance_loss_mlp": 1.04111767, + "epoch": 0.07106568465353975, + "flos": 47922286400640.0, + "grad_norm": 2.0724627725230307, + "language_loss": 0.836869, + "learning_rate": 3.9823367509178725e-06, + "loss": 0.85909921, + "num_input_tokens_seen": 25236420, + "step": 1182, + "time_per_iteration": 2.953556537628174 + }, + { + "auxiliary_loss_clip": 0.01163627, + "auxiliary_loss_mlp": 0.01060063, + "balance_loss_clip": 1.04902768, + "balance_loss_mlp": 1.03430212, + "epoch": 0.07112580790620772, + "flos": 23440798316160.0, + "grad_norm": 3.114821781367385, + "language_loss": 0.79530913, + "learning_rate": 3.982285067055262e-06, + "loss": 0.81754601, + "num_input_tokens_seen": 25255120, + "step": 1183, + "time_per_iteration": 2.632697343826294 + }, + { + "auxiliary_loss_clip": 0.01181119, + "auxiliary_loss_mlp": 0.01062725, + "balance_loss_clip": 1.04615438, + "balance_loss_mlp": 1.03653479, + "epoch": 0.0711859311588757, + "flos": 31868888682240.0, + "grad_norm": 2.715469267243898, + "language_loss": 0.78957963, + "learning_rate": 3.982233308024204e-06, + "loss": 0.81201804, + "num_input_tokens_seen": 25275150, + "step": 1184, + "time_per_iteration": 2.6912198066711426 + }, + { + "auxiliary_loss_clip": 0.01126013, + "auxiliary_loss_mlp": 0.0106299, + "balance_loss_clip": 1.04547274, + "balance_loss_mlp": 1.03832507, + "epoch": 0.07124605441154366, + "flos": 19610315026560.0, + "grad_norm": 3.2823526551871303, + "language_loss": 0.77075809, + "learning_rate": 3.98218147382666e-06, + "loss": 0.79264814, + "num_input_tokens_seen": 25293680, + "step": 1185, + "time_per_iteration": 2.708838939666748 + }, + { + "auxiliary_loss_clip": 0.0117934, + "auxiliary_loss_mlp": 0.01064521, + "balance_loss_clip": 1.04931617, + "balance_loss_mlp": 1.03917682, + "epoch": 0.07130617766421163, + "flos": 14684448533760.0, + "grad_norm": 3.7862511357238873, + "language_loss": 0.65395445, + "learning_rate": 3.982129564464596e-06, + "loss": 0.67639303, + "num_input_tokens_seen": 25310050, + "step": 1186, + "time_per_iteration": 2.6471469402313232 + }, + { + "auxiliary_loss_clip": 0.01164648, + "auxiliary_loss_mlp": 0.01055255, + "balance_loss_clip": 1.048262, + "balance_loss_mlp": 1.03014958, + "epoch": 0.07136630091687961, + "flos": 26067915141120.0, + "grad_norm": 1.8211129838524498, + "language_loss": 0.69740427, + "learning_rate": 3.98207757993998e-06, + "loss": 0.7196033, + "num_input_tokens_seen": 25331020, + "step": 1187, + "time_per_iteration": 2.6433606147766113 + }, + { + "auxiliary_loss_clip": 0.01124592, + "auxiliary_loss_mlp": 0.01056938, + "balance_loss_clip": 1.04344201, + "balance_loss_mlp": 1.03348923, + "epoch": 0.07142642416954757, + "flos": 15669190869120.0, + "grad_norm": 2.5680637090619727, + "language_loss": 0.78835154, + "learning_rate": 3.9820255202547845e-06, + "loss": 0.81016684, + "num_input_tokens_seen": 25347875, + "step": 1188, + "time_per_iteration": 2.7950761318206787 + }, + { + "auxiliary_loss_clip": 0.0117636, + "auxiliary_loss_mlp": 0.01058164, + "balance_loss_clip": 1.0481925, + "balance_loss_mlp": 1.03379774, + "epoch": 0.07148654742221554, + "flos": 19755322231680.0, + "grad_norm": 1.8569363083776482, + "language_loss": 0.85157812, + "learning_rate": 3.981973385410981e-06, + "loss": 0.8739233, + "num_input_tokens_seen": 25366715, + "step": 1189, + "time_per_iteration": 2.5512635707855225 + }, + { + "auxiliary_loss_clip": 0.01142896, + "auxiliary_loss_mlp": 0.00749864, + "balance_loss_clip": 1.04807949, + "balance_loss_mlp": 1.0007987, + "epoch": 0.07154667067488352, + "flos": 23471824688640.0, + "grad_norm": 1.6666514820816238, + "language_loss": 0.7662434, + "learning_rate": 3.9819211754105494e-06, + "loss": 0.78517097, + "num_input_tokens_seen": 25385450, + "step": 1190, + "time_per_iteration": 2.6885218620300293 + }, + { + "auxiliary_loss_clip": 0.01179116, + "auxiliary_loss_mlp": 0.01070851, + "balance_loss_clip": 1.04774582, + "balance_loss_mlp": 1.04389727, + "epoch": 0.07160679392755148, + "flos": 18332936588160.0, + "grad_norm": 2.4354925368037885, + "language_loss": 0.75781149, + "learning_rate": 3.981868890255468e-06, + "loss": 0.78031117, + "num_input_tokens_seen": 25403940, + "step": 1191, + "time_per_iteration": 2.546879529953003 + }, + { + "auxiliary_loss_clip": 0.01128958, + "auxiliary_loss_mlp": 0.01061419, + "balance_loss_clip": 1.04157233, + "balance_loss_mlp": 1.03499055, + "epoch": 0.07166691718021945, + "flos": 17747017937280.0, + "grad_norm": 2.8638212292162364, + "language_loss": 0.73286092, + "learning_rate": 3.981816529947719e-06, + "loss": 0.75476474, + "num_input_tokens_seen": 25420410, + "step": 1192, + "time_per_iteration": 2.749547243118286 + }, + { + "auxiliary_loss_clip": 0.01172602, + "auxiliary_loss_mlp": 0.01050976, + "balance_loss_clip": 1.043805, + "balance_loss_mlp": 1.0276351, + "epoch": 0.07172704043288743, + "flos": 22451925916800.0, + "grad_norm": 1.9323130724786022, + "language_loss": 0.78208542, + "learning_rate": 3.9817640944892896e-06, + "loss": 0.80432129, + "num_input_tokens_seen": 25439415, + "step": 1193, + "time_per_iteration": 2.567195415496826 + }, + { + "auxiliary_loss_clip": 0.01158773, + "auxiliary_loss_mlp": 0.01061322, + "balance_loss_clip": 1.04955912, + "balance_loss_mlp": 1.0352273, + "epoch": 0.07178716368555539, + "flos": 23222210100480.0, + "grad_norm": 1.9302375122157251, + "language_loss": 0.85816306, + "learning_rate": 3.981711583882166e-06, + "loss": 0.88036406, + "num_input_tokens_seen": 25458715, + "step": 1194, + "time_per_iteration": 2.644129991531372 + }, + { + "auxiliary_loss_clip": 0.01152617, + "auxiliary_loss_mlp": 0.01059436, + "balance_loss_clip": 1.04416442, + "balance_loss_mlp": 1.03466463, + "epoch": 0.07184728693822336, + "flos": 25150828072320.0, + "grad_norm": 1.8239920234779115, + "language_loss": 0.81522506, + "learning_rate": 3.981658998128341e-06, + "loss": 0.8373456, + "num_input_tokens_seen": 25477985, + "step": 1195, + "time_per_iteration": 2.6373963356018066 + }, + { + "auxiliary_loss_clip": 0.01133719, + "auxiliary_loss_mlp": 0.01055379, + "balance_loss_clip": 1.04438198, + "balance_loss_mlp": 1.03232408, + "epoch": 0.07190741019089132, + "flos": 22711237176960.0, + "grad_norm": 2.329849456290162, + "language_loss": 0.79819018, + "learning_rate": 3.981606337229808e-06, + "loss": 0.82008117, + "num_input_tokens_seen": 25497110, + "step": 1196, + "time_per_iteration": 2.738826036453247 + }, + { + "auxiliary_loss_clip": 0.01142171, + "auxiliary_loss_mlp": 0.00749891, + "balance_loss_clip": 1.04491365, + "balance_loss_mlp": 1.00073576, + "epoch": 0.0719675334435593, + "flos": 29349791032320.0, + "grad_norm": 2.2748520805115744, + "language_loss": 0.70950043, + "learning_rate": 3.9815536011885655e-06, + "loss": 0.72842103, + "num_input_tokens_seen": 25516555, + "step": 1197, + "time_per_iteration": 2.8700523376464844 + }, + { + "auxiliary_loss_clip": 0.01133647, + "auxiliary_loss_mlp": 0.01050869, + "balance_loss_clip": 1.04964912, + "balance_loss_mlp": 1.02591884, + "epoch": 0.07202765669622727, + "flos": 17639788861440.0, + "grad_norm": 1.9551082923819008, + "language_loss": 0.85806125, + "learning_rate": 3.98150079000661e-06, + "loss": 0.87990642, + "num_input_tokens_seen": 25533895, + "step": 1198, + "time_per_iteration": 2.7483088970184326 + }, + { + "auxiliary_loss_clip": 0.01134769, + "auxiliary_loss_mlp": 0.01066471, + "balance_loss_clip": 1.04744577, + "balance_loss_mlp": 1.04100835, + "epoch": 0.07208777994889523, + "flos": 21434038306560.0, + "grad_norm": 2.096773790109715, + "language_loss": 0.83896279, + "learning_rate": 3.981447903685947e-06, + "loss": 0.86097527, + "num_input_tokens_seen": 25554195, + "step": 1199, + "time_per_iteration": 2.724682092666626 + }, + { + "auxiliary_loss_clip": 0.01185361, + "auxiliary_loss_mlp": 0.01057978, + "balance_loss_clip": 1.05448413, + "balance_loss_mlp": 1.03462446, + "epoch": 0.07214790320156321, + "flos": 26940867373440.0, + "grad_norm": 2.755060683106642, + "language_loss": 0.76711154, + "learning_rate": 3.981394942228581e-06, + "loss": 0.78954494, + "num_input_tokens_seen": 25574155, + "step": 1200, + "time_per_iteration": 2.6846518516540527 + }, + { + "auxiliary_loss_clip": 0.01170324, + "auxiliary_loss_mlp": 0.01070567, + "balance_loss_clip": 1.05567229, + "balance_loss_mlp": 1.04511571, + "epoch": 0.07220802645423118, + "flos": 23879949995520.0, + "grad_norm": 1.876430452777867, + "language_loss": 0.82327521, + "learning_rate": 3.98134190563652e-06, + "loss": 0.84568411, + "num_input_tokens_seen": 25592735, + "step": 1201, + "time_per_iteration": 2.6853229999542236 + }, + { + "auxiliary_loss_clip": 0.01168426, + "auxiliary_loss_mlp": 0.01059526, + "balance_loss_clip": 1.04732049, + "balance_loss_mlp": 1.03333545, + "epoch": 0.07226814970689914, + "flos": 19243631036160.0, + "grad_norm": 2.4877771089282357, + "language_loss": 0.68239117, + "learning_rate": 3.981288793911775e-06, + "loss": 0.70467067, + "num_input_tokens_seen": 25611510, + "step": 1202, + "time_per_iteration": 2.6779532432556152 + }, + { + "auxiliary_loss_clip": 0.01155984, + "auxiliary_loss_mlp": 0.00749773, + "balance_loss_clip": 1.04851675, + "balance_loss_mlp": 1.000705, + "epoch": 0.07232827295956712, + "flos": 19172025273600.0, + "grad_norm": 2.3748884898880123, + "language_loss": 0.87635636, + "learning_rate": 3.98123560705636e-06, + "loss": 0.89541399, + "num_input_tokens_seen": 25629560, + "step": 1203, + "time_per_iteration": 2.651028633117676 + }, + { + "auxiliary_loss_clip": 0.0112503, + "auxiliary_loss_mlp": 0.01062269, + "balance_loss_clip": 1.04139924, + "balance_loss_mlp": 1.03711581, + "epoch": 0.07238839621223508, + "flos": 17639752947840.0, + "grad_norm": 1.8980200065603747, + "language_loss": 0.78495228, + "learning_rate": 3.981182345072293e-06, + "loss": 0.80682528, + "num_input_tokens_seen": 25648330, + "step": 1204, + "time_per_iteration": 2.721325159072876 + }, + { + "auxiliary_loss_clip": 0.01167014, + "auxiliary_loss_mlp": 0.01063356, + "balance_loss_clip": 1.04917812, + "balance_loss_mlp": 1.039729, + "epoch": 0.07244851946490305, + "flos": 28292401440000.0, + "grad_norm": 1.7222582402061148, + "language_loss": 0.82124019, + "learning_rate": 3.981129007961593e-06, + "loss": 0.84354389, + "num_input_tokens_seen": 25669470, + "step": 1205, + "time_per_iteration": 2.6646907329559326 + }, + { + "auxiliary_loss_clip": 0.01158564, + "auxiliary_loss_mlp": 0.00749769, + "balance_loss_clip": 1.05320692, + "balance_loss_mlp": 1.00067616, + "epoch": 0.07250864271757101, + "flos": 22564829341440.0, + "grad_norm": 1.8305562178654393, + "language_loss": 0.76733756, + "learning_rate": 3.981075595726283e-06, + "loss": 0.78642088, + "num_input_tokens_seen": 25690470, + "step": 1206, + "time_per_iteration": 2.6364972591400146 + }, + { + "auxiliary_loss_clip": 0.01168146, + "auxiliary_loss_mlp": 0.01059496, + "balance_loss_clip": 1.05316734, + "balance_loss_mlp": 1.03404498, + "epoch": 0.072568765970239, + "flos": 21762405463680.0, + "grad_norm": 1.8733732586963274, + "language_loss": 0.77204174, + "learning_rate": 3.981022108368387e-06, + "loss": 0.7943182, + "num_input_tokens_seen": 25709205, + "step": 1207, + "time_per_iteration": 2.6395833492279053 + }, + { + "auxiliary_loss_clip": 0.01154969, + "auxiliary_loss_mlp": 0.01052069, + "balance_loss_clip": 1.04517341, + "balance_loss_mlp": 1.02956176, + "epoch": 0.07262888922290696, + "flos": 25519702792320.0, + "grad_norm": 3.438901351250978, + "language_loss": 0.79794919, + "learning_rate": 3.9809685458899345e-06, + "loss": 0.82001954, + "num_input_tokens_seen": 25728485, + "step": 1208, + "time_per_iteration": 2.771872043609619 + }, + { + "auxiliary_loss_clip": 0.01153883, + "auxiliary_loss_mlp": 0.0105534, + "balance_loss_clip": 1.04491735, + "balance_loss_mlp": 1.03214228, + "epoch": 0.07268901247557492, + "flos": 21246548290560.0, + "grad_norm": 1.84229089051384, + "language_loss": 0.78637171, + "learning_rate": 3.980914908292955e-06, + "loss": 0.80846399, + "num_input_tokens_seen": 25747730, + "step": 1209, + "time_per_iteration": 2.663569211959839 + }, + { + "auxiliary_loss_clip": 0.01161774, + "auxiliary_loss_mlp": 0.01059907, + "balance_loss_clip": 1.04547584, + "balance_loss_mlp": 1.03669739, + "epoch": 0.0727491357282429, + "flos": 25479302970240.0, + "grad_norm": 3.113429008342111, + "language_loss": 0.81107926, + "learning_rate": 3.980861195579486e-06, + "loss": 0.83329612, + "num_input_tokens_seen": 25768050, + "step": 1210, + "time_per_iteration": 2.6492912769317627 + }, + { + "auxiliary_loss_clip": 0.01147021, + "auxiliary_loss_mlp": 0.01061024, + "balance_loss_clip": 1.04689085, + "balance_loss_mlp": 1.03720593, + "epoch": 0.07280925898091087, + "flos": 24462169545600.0, + "grad_norm": 1.8054517220482915, + "language_loss": 0.84378707, + "learning_rate": 3.98080740775156e-06, + "loss": 0.8658675, + "num_input_tokens_seen": 25787985, + "step": 1211, + "time_per_iteration": 2.7824442386627197 + }, + { + "auxiliary_loss_clip": 0.01126179, + "auxiliary_loss_mlp": 0.01051469, + "balance_loss_clip": 1.03904247, + "balance_loss_mlp": 1.02813995, + "epoch": 0.07286938223357883, + "flos": 18288191220480.0, + "grad_norm": 2.351694621402233, + "language_loss": 0.90703845, + "learning_rate": 3.98075354481122e-06, + "loss": 0.92881495, + "num_input_tokens_seen": 25803620, + "step": 1212, + "time_per_iteration": 2.5900912284851074 + }, + { + "auxiliary_loss_clip": 0.01171978, + "auxiliary_loss_mlp": 0.01058475, + "balance_loss_clip": 1.04767525, + "balance_loss_mlp": 1.03485942, + "epoch": 0.07292950548624681, + "flos": 21214803646080.0, + "grad_norm": 1.7698551922569785, + "language_loss": 0.72601175, + "learning_rate": 3.9806996067605055e-06, + "loss": 0.74831629, + "num_input_tokens_seen": 25823315, + "step": 1213, + "time_per_iteration": 2.5538299083709717 + }, + { + "auxiliary_loss_clip": 0.01125585, + "auxiliary_loss_mlp": 0.0105244, + "balance_loss_clip": 1.04150176, + "balance_loss_mlp": 1.02858615, + "epoch": 0.07298962873891478, + "flos": 24642009964800.0, + "grad_norm": 1.932327189470942, + "language_loss": 0.84499836, + "learning_rate": 3.980645593601465e-06, + "loss": 0.86677861, + "num_input_tokens_seen": 25842605, + "step": 1214, + "time_per_iteration": 2.7276084423065186 + }, + { + "auxiliary_loss_clip": 0.01179474, + "auxiliary_loss_mlp": 0.01058269, + "balance_loss_clip": 1.05037856, + "balance_loss_mlp": 1.03364086, + "epoch": 0.07304975199158274, + "flos": 27052765217280.0, + "grad_norm": 2.1411417597482796, + "language_loss": 0.84394622, + "learning_rate": 3.980591505336144e-06, + "loss": 0.86632371, + "num_input_tokens_seen": 25863030, + "step": 1215, + "time_per_iteration": 2.5801870822906494 + }, + { + "auxiliary_loss_clip": 0.01110951, + "auxiliary_loss_mlp": 0.01059748, + "balance_loss_clip": 1.03731823, + "balance_loss_mlp": 1.03559589, + "epoch": 0.07310987524425071, + "flos": 33549544091520.0, + "grad_norm": 6.558877126042088, + "language_loss": 0.81181133, + "learning_rate": 3.980537341966595e-06, + "loss": 0.83351839, + "num_input_tokens_seen": 25888015, + "step": 1216, + "time_per_iteration": 2.7592015266418457 + }, + { + "auxiliary_loss_clip": 0.01133097, + "auxiliary_loss_mlp": 0.01052864, + "balance_loss_clip": 1.04189634, + "balance_loss_mlp": 1.02974927, + "epoch": 0.07316999849691869, + "flos": 28110944908800.0, + "grad_norm": 2.0131551468629763, + "language_loss": 0.75793457, + "learning_rate": 3.980483103494872e-06, + "loss": 0.77979422, + "num_input_tokens_seen": 25908660, + "step": 1217, + "time_per_iteration": 2.772559404373169 + }, + { + "auxiliary_loss_clip": 0.01139195, + "auxiliary_loss_mlp": 0.0105559, + "balance_loss_clip": 1.04538286, + "balance_loss_mlp": 1.03404891, + "epoch": 0.07323012174958665, + "flos": 14392602529920.0, + "grad_norm": 2.1707794687513964, + "language_loss": 0.86036801, + "learning_rate": 3.98042878992303e-06, + "loss": 0.88231587, + "num_input_tokens_seen": 25927215, + "step": 1218, + "time_per_iteration": 2.7202651500701904 + }, + { + "auxiliary_loss_clip": 0.01161521, + "auxiliary_loss_mlp": 0.01064179, + "balance_loss_clip": 1.04419339, + "balance_loss_mlp": 1.04143381, + "epoch": 0.07329024500225462, + "flos": 21616428591360.0, + "grad_norm": 2.0439175738011715, + "language_loss": 0.86783504, + "learning_rate": 3.9803744012531305e-06, + "loss": 0.89009202, + "num_input_tokens_seen": 25945500, + "step": 1219, + "time_per_iteration": 2.694411277770996 + }, + { + "auxiliary_loss_clip": 0.01173151, + "auxiliary_loss_mlp": 0.01055165, + "balance_loss_clip": 1.04657793, + "balance_loss_mlp": 1.0333612, + "epoch": 0.0733503682549226, + "flos": 13224141106560.0, + "grad_norm": 2.3239590968525268, + "language_loss": 0.8486141, + "learning_rate": 3.980319937487235e-06, + "loss": 0.87089729, + "num_input_tokens_seen": 25963105, + "step": 1220, + "time_per_iteration": 4.1401636600494385 + }, + { + "auxiliary_loss_clip": 0.01129813, + "auxiliary_loss_mlp": 0.01063197, + "balance_loss_clip": 1.04193521, + "balance_loss_mlp": 1.03972459, + "epoch": 0.07341049150759056, + "flos": 20886975192960.0, + "grad_norm": 2.2885770496464493, + "language_loss": 0.7697475, + "learning_rate": 3.98026539862741e-06, + "loss": 0.79167765, + "num_input_tokens_seen": 25981690, + "step": 1221, + "time_per_iteration": 4.210748672485352 + }, + { + "auxiliary_loss_clip": 0.01131606, + "auxiliary_loss_mlp": 0.01061252, + "balance_loss_clip": 1.04592514, + "balance_loss_mlp": 1.03787494, + "epoch": 0.07347061476025853, + "flos": 15413614623360.0, + "grad_norm": 1.915085011345699, + "language_loss": 0.91804314, + "learning_rate": 3.980210784675722e-06, + "loss": 0.93997169, + "num_input_tokens_seen": 25999890, + "step": 1222, + "time_per_iteration": 4.253572940826416 + }, + { + "auxiliary_loss_clip": 0.01108177, + "auxiliary_loss_mlp": 0.01062003, + "balance_loss_clip": 1.0422039, + "balance_loss_mlp": 1.03897154, + "epoch": 0.0735307380129265, + "flos": 11108859131520.0, + "grad_norm": 2.6839099235116515, + "language_loss": 0.90940773, + "learning_rate": 3.980156095634242e-06, + "loss": 0.93110955, + "num_input_tokens_seen": 26016445, + "step": 1223, + "time_per_iteration": 2.7060630321502686 + }, + { + "auxiliary_loss_clip": 0.01174874, + "auxiliary_loss_mlp": 0.01073547, + "balance_loss_clip": 1.04943514, + "balance_loss_mlp": 1.05062294, + "epoch": 0.07359086126559447, + "flos": 23732392924800.0, + "grad_norm": 2.2976135046814816, + "language_loss": 0.82132912, + "learning_rate": 3.980101331505045e-06, + "loss": 0.8438133, + "num_input_tokens_seen": 26036080, + "step": 1224, + "time_per_iteration": 2.612847089767456 + }, + { + "auxiliary_loss_clip": 0.01171773, + "auxiliary_loss_mlp": 0.01061451, + "balance_loss_clip": 1.04588366, + "balance_loss_mlp": 1.03667974, + "epoch": 0.07365098451826244, + "flos": 20993270515200.0, + "grad_norm": 2.0232959625441427, + "language_loss": 0.83372021, + "learning_rate": 3.9800464922902076e-06, + "loss": 0.85605252, + "num_input_tokens_seen": 26055805, + "step": 1225, + "time_per_iteration": 2.68446946144104 + }, + { + "auxiliary_loss_clip": 0.01142527, + "auxiliary_loss_mlp": 0.01054398, + "balance_loss_clip": 1.04473543, + "balance_loss_mlp": 1.03125978, + "epoch": 0.0737111077709304, + "flos": 19933582452480.0, + "grad_norm": 1.8762273418176243, + "language_loss": 0.90150052, + "learning_rate": 3.979991577991808e-06, + "loss": 0.92346978, + "num_input_tokens_seen": 26073905, + "step": 1226, + "time_per_iteration": 2.6290204524993896 + }, + { + "auxiliary_loss_clip": 0.01182521, + "auxiliary_loss_mlp": 0.01051001, + "balance_loss_clip": 1.04700863, + "balance_loss_mlp": 1.02690911, + "epoch": 0.07377123102359838, + "flos": 16581537342720.0, + "grad_norm": 5.825436415259781, + "language_loss": 0.76457775, + "learning_rate": 3.97993658861193e-06, + "loss": 0.78691298, + "num_input_tokens_seen": 26091700, + "step": 1227, + "time_per_iteration": 2.6177070140838623 + }, + { + "auxiliary_loss_clip": 0.01161706, + "auxiliary_loss_mlp": 0.01051285, + "balance_loss_clip": 1.05006063, + "balance_loss_mlp": 1.02838469, + "epoch": 0.07383135427626634, + "flos": 28328563457280.0, + "grad_norm": 1.6873132540798903, + "language_loss": 0.85594285, + "learning_rate": 3.9798815241526575e-06, + "loss": 0.8780728, + "num_input_tokens_seen": 26114105, + "step": 1228, + "time_per_iteration": 2.770914077758789 + }, + { + "auxiliary_loss_clip": 0.01162178, + "auxiliary_loss_mlp": 0.01058255, + "balance_loss_clip": 1.04583049, + "balance_loss_mlp": 1.03529525, + "epoch": 0.07389147752893431, + "flos": 20047168235520.0, + "grad_norm": 2.1758324110121716, + "language_loss": 0.79590642, + "learning_rate": 3.97982638461608e-06, + "loss": 0.8181107, + "num_input_tokens_seen": 26131165, + "step": 1229, + "time_per_iteration": 2.618776321411133 + }, + { + "auxiliary_loss_clip": 0.01162507, + "auxiliary_loss_mlp": 0.00749753, + "balance_loss_clip": 1.04673803, + "balance_loss_mlp": 1.00063217, + "epoch": 0.07395160078160229, + "flos": 18114132890880.0, + "grad_norm": 1.930194032429918, + "language_loss": 0.78209734, + "learning_rate": 3.979771170004287e-06, + "loss": 0.80121988, + "num_input_tokens_seen": 26150040, + "step": 1230, + "time_per_iteration": 2.6165688037872314 + }, + { + "auxiliary_loss_clip": 0.01174377, + "auxiliary_loss_mlp": 0.01053015, + "balance_loss_clip": 1.0491941, + "balance_loss_mlp": 1.02774251, + "epoch": 0.07401172403427025, + "flos": 23586918842880.0, + "grad_norm": 2.3623246840730463, + "language_loss": 0.8142274, + "learning_rate": 3.979715880319372e-06, + "loss": 0.83650136, + "num_input_tokens_seen": 26169380, + "step": 1231, + "time_per_iteration": 2.5730345249176025 + }, + { + "auxiliary_loss_clip": 0.01148335, + "auxiliary_loss_mlp": 0.01063612, + "balance_loss_clip": 1.0437386, + "balance_loss_mlp": 1.03955531, + "epoch": 0.07407184728693822, + "flos": 26359904799360.0, + "grad_norm": 2.0553261670380234, + "language_loss": 0.95140821, + "learning_rate": 3.979660515563434e-06, + "loss": 0.97352767, + "num_input_tokens_seen": 26189420, + "step": 1232, + "time_per_iteration": 2.679325819015503 + }, + { + "auxiliary_loss_clip": 0.01158317, + "auxiliary_loss_mlp": 0.01063625, + "balance_loss_clip": 1.04809213, + "balance_loss_mlp": 1.04178619, + "epoch": 0.0741319705396062, + "flos": 22200443821440.0, + "grad_norm": 1.8406836654414134, + "language_loss": 0.80632085, + "learning_rate": 3.979605075738569e-06, + "loss": 0.82854033, + "num_input_tokens_seen": 26209300, + "step": 1233, + "time_per_iteration": 2.6496341228485107 + }, + { + "auxiliary_loss_clip": 0.01179704, + "auxiliary_loss_mlp": 0.01062988, + "balance_loss_clip": 1.04860187, + "balance_loss_mlp": 1.03690529, + "epoch": 0.07419209379227416, + "flos": 39200482523520.0, + "grad_norm": 2.1331327313920507, + "language_loss": 0.70519954, + "learning_rate": 3.979549560846883e-06, + "loss": 0.72762644, + "num_input_tokens_seen": 26228110, + "step": 1234, + "time_per_iteration": 2.7426185607910156 + }, + { + "auxiliary_loss_clip": 0.01134927, + "auxiliary_loss_mlp": 0.01067628, + "balance_loss_clip": 1.04328775, + "balance_loss_mlp": 1.04234385, + "epoch": 0.07425221704494213, + "flos": 22781657790720.0, + "grad_norm": 1.9025918917153402, + "language_loss": 0.77405417, + "learning_rate": 3.979493970890478e-06, + "loss": 0.7960797, + "num_input_tokens_seen": 26247020, + "step": 1235, + "time_per_iteration": 2.637022018432617 + }, + { + "auxiliary_loss_clip": 0.01174015, + "auxiliary_loss_mlp": 0.01055441, + "balance_loss_clip": 1.04916477, + "balance_loss_mlp": 1.03248167, + "epoch": 0.0743123402976101, + "flos": 22272983337600.0, + "grad_norm": 2.1432830264131377, + "language_loss": 0.82656389, + "learning_rate": 3.979438305871464e-06, + "loss": 0.84885848, + "num_input_tokens_seen": 26265750, + "step": 1236, + "time_per_iteration": 2.6072473526000977 + }, + { + "auxiliary_loss_clip": 0.01131504, + "auxiliary_loss_mlp": 0.00749833, + "balance_loss_clip": 1.04761255, + "balance_loss_mlp": 1.0007087, + "epoch": 0.07437246355027807, + "flos": 29315029645440.0, + "grad_norm": 1.8779707793014029, + "language_loss": 0.75888824, + "learning_rate": 3.979382565791951e-06, + "loss": 0.77770162, + "num_input_tokens_seen": 26287905, + "step": 1237, + "time_per_iteration": 2.847212076187134 + }, + { + "auxiliary_loss_clip": 0.01109322, + "auxiliary_loss_mlp": 0.00749792, + "balance_loss_clip": 1.04079998, + "balance_loss_mlp": 1.00063407, + "epoch": 0.07443258680294604, + "flos": 31944732249600.0, + "grad_norm": 2.3678909465377416, + "language_loss": 0.77321702, + "learning_rate": 3.979326750654053e-06, + "loss": 0.79180813, + "num_input_tokens_seen": 26311795, + "step": 1238, + "time_per_iteration": 2.9567246437072754 + }, + { + "auxiliary_loss_clip": 0.01152358, + "auxiliary_loss_mlp": 0.01057234, + "balance_loss_clip": 1.04550862, + "balance_loss_mlp": 1.03361917, + "epoch": 0.074492710055614, + "flos": 22675290641280.0, + "grad_norm": 2.9016599118231765, + "language_loss": 0.86794102, + "learning_rate": 3.9792708604598854e-06, + "loss": 0.89003694, + "num_input_tokens_seen": 26330330, + "step": 1239, + "time_per_iteration": 2.8091821670532227 + }, + { + "auxiliary_loss_clip": 0.01129355, + "auxiliary_loss_mlp": 0.01055477, + "balance_loss_clip": 1.04153752, + "balance_loss_mlp": 1.02936983, + "epoch": 0.07455283330828198, + "flos": 21284901037440.0, + "grad_norm": 2.2193537827133754, + "language_loss": 0.89051163, + "learning_rate": 3.979214895211569e-06, + "loss": 0.91235995, + "num_input_tokens_seen": 26348865, + "step": 1240, + "time_per_iteration": 2.6555542945861816 + }, + { + "auxiliary_loss_clip": 0.01153263, + "auxiliary_loss_mlp": 0.0106464, + "balance_loss_clip": 1.05003929, + "balance_loss_mlp": 1.03901005, + "epoch": 0.07461295656094995, + "flos": 24388408967040.0, + "grad_norm": 2.009146985192484, + "language_loss": 0.88874632, + "learning_rate": 3.979158854911225e-06, + "loss": 0.91092539, + "num_input_tokens_seen": 26368210, + "step": 1241, + "time_per_iteration": 2.78259015083313 + }, + { + "auxiliary_loss_clip": 0.01038074, + "auxiliary_loss_mlp": 0.01014133, + "balance_loss_clip": 1.00930011, + "balance_loss_mlp": 1.01029444, + "epoch": 0.07467307981361791, + "flos": 62109660574080.0, + "grad_norm": 0.9008829103111013, + "language_loss": 0.6312834, + "learning_rate": 3.979102739560979e-06, + "loss": 0.65180546, + "num_input_tokens_seen": 26424890, + "step": 1242, + "time_per_iteration": 3.3791582584381104 + }, + { + "auxiliary_loss_clip": 0.01134022, + "auxiliary_loss_mlp": 0.01060906, + "balance_loss_clip": 1.04293704, + "balance_loss_mlp": 1.03370297, + "epoch": 0.07473320306628589, + "flos": 24863148046080.0, + "grad_norm": 2.7828363773911073, + "language_loss": 0.63186812, + "learning_rate": 3.9790465491629595e-06, + "loss": 0.65381742, + "num_input_tokens_seen": 26446405, + "step": 1243, + "time_per_iteration": 2.719144821166992 + }, + { + "auxiliary_loss_clip": 0.01159266, + "auxiliary_loss_mlp": 0.01052664, + "balance_loss_clip": 1.0455929, + "balance_loss_mlp": 1.02817822, + "epoch": 0.07479332631895386, + "flos": 24897442556160.0, + "grad_norm": 2.485673587848739, + "language_loss": 0.76299012, + "learning_rate": 3.978990283719296e-06, + "loss": 0.7851094, + "num_input_tokens_seen": 26466070, + "step": 1244, + "time_per_iteration": 2.638777017593384 + }, + { + "auxiliary_loss_clip": 0.01158878, + "auxiliary_loss_mlp": 0.00749834, + "balance_loss_clip": 1.04976749, + "balance_loss_mlp": 1.00068617, + "epoch": 0.07485344957162182, + "flos": 17815247821440.0, + "grad_norm": 4.137091350525871, + "language_loss": 0.69414109, + "learning_rate": 3.978933943232123e-06, + "loss": 0.71322823, + "num_input_tokens_seen": 26479350, + "step": 1245, + "time_per_iteration": 2.63925838470459 + }, + { + "auxiliary_loss_clip": 0.01176711, + "auxiliary_loss_mlp": 0.01056989, + "balance_loss_clip": 1.04939461, + "balance_loss_mlp": 1.03230083, + "epoch": 0.0749135728242898, + "flos": 25010202326400.0, + "grad_norm": 1.8837727445826458, + "language_loss": 0.88989717, + "learning_rate": 3.978877527703576e-06, + "loss": 0.91223419, + "num_input_tokens_seen": 26498255, + "step": 1246, + "time_per_iteration": 2.5982561111450195 + }, + { + "auxiliary_loss_clip": 0.01186905, + "auxiliary_loss_mlp": 0.01069815, + "balance_loss_clip": 1.05152702, + "balance_loss_mlp": 1.04261148, + "epoch": 0.07497369607695777, + "flos": 17822071405440.0, + "grad_norm": 2.181045372487348, + "language_loss": 0.87663078, + "learning_rate": 3.9788210371357945e-06, + "loss": 0.899198, + "num_input_tokens_seen": 26515375, + "step": 1247, + "time_per_iteration": 2.5173821449279785 + }, + { + "auxiliary_loss_clip": 0.01153033, + "auxiliary_loss_mlp": 0.01065956, + "balance_loss_clip": 1.04570282, + "balance_loss_mlp": 1.04006386, + "epoch": 0.07503381932962573, + "flos": 15121086261120.0, + "grad_norm": 2.0997876529833293, + "language_loss": 0.64616209, + "learning_rate": 3.978764471530921e-06, + "loss": 0.66835195, + "num_input_tokens_seen": 26533595, + "step": 1248, + "time_per_iteration": 2.5874340534210205 + }, + { + "auxiliary_loss_clip": 0.0115412, + "auxiliary_loss_mlp": 0.00749762, + "balance_loss_clip": 1.04873109, + "balance_loss_mlp": 1.00069189, + "epoch": 0.0750939425822937, + "flos": 12816734071680.0, + "grad_norm": 1.9907553217290526, + "language_loss": 0.74395466, + "learning_rate": 3.978707830891102e-06, + "loss": 0.76299351, + "num_input_tokens_seen": 26549405, + "step": 1249, + "time_per_iteration": 2.596221923828125 + }, + { + "auxiliary_loss_clip": 0.01136068, + "auxiliary_loss_mlp": 0.01077005, + "balance_loss_clip": 1.04523134, + "balance_loss_mlp": 1.05063629, + "epoch": 0.07515406583496168, + "flos": 24206844695040.0, + "grad_norm": 2.8701543653404364, + "language_loss": 0.82275879, + "learning_rate": 3.978651115218482e-06, + "loss": 0.84488952, + "num_input_tokens_seen": 26567200, + "step": 1250, + "time_per_iteration": 2.622568130493164 + }, + { + "auxiliary_loss_clip": 0.01120346, + "auxiliary_loss_mlp": 0.01061964, + "balance_loss_clip": 1.04693961, + "balance_loss_mlp": 1.03646541, + "epoch": 0.07521418908762964, + "flos": 26688164215680.0, + "grad_norm": 2.0357878481468554, + "language_loss": 0.66670841, + "learning_rate": 3.978594324515215e-06, + "loss": 0.68853152, + "num_input_tokens_seen": 26586190, + "step": 1251, + "time_per_iteration": 2.847625255584717 + }, + { + "auxiliary_loss_clip": 0.0103149, + "auxiliary_loss_mlp": 0.01022101, + "balance_loss_clip": 1.01471257, + "balance_loss_mlp": 1.01809525, + "epoch": 0.0752743123402976, + "flos": 59095140589440.0, + "grad_norm": 0.9101776411896332, + "language_loss": 0.70375848, + "learning_rate": 3.9785374587834515e-06, + "loss": 0.7242943, + "num_input_tokens_seen": 26650710, + "step": 1252, + "time_per_iteration": 3.363680124282837 + }, + { + "auxiliary_loss_clip": 0.01174439, + "auxiliary_loss_mlp": 0.01066333, + "balance_loss_clip": 1.04805708, + "balance_loss_mlp": 1.0419904, + "epoch": 0.07533443559296558, + "flos": 23477032160640.0, + "grad_norm": 2.238047514575092, + "language_loss": 0.7966097, + "learning_rate": 3.97848051802535e-06, + "loss": 0.81901741, + "num_input_tokens_seen": 26669000, + "step": 1253, + "time_per_iteration": 2.5851786136627197 + }, + { + "auxiliary_loss_clip": 0.01134339, + "auxiliary_loss_mlp": 0.01067909, + "balance_loss_clip": 1.04568481, + "balance_loss_mlp": 1.04297042, + "epoch": 0.07539455884563355, + "flos": 20879110114560.0, + "grad_norm": 2.854332956002259, + "language_loss": 0.93273056, + "learning_rate": 3.978423502243069e-06, + "loss": 0.9547531, + "num_input_tokens_seen": 26683075, + "step": 1254, + "time_per_iteration": 2.629021167755127 + }, + { + "auxiliary_loss_clip": 0.0115137, + "auxiliary_loss_mlp": 0.01063646, + "balance_loss_clip": 1.0503372, + "balance_loss_mlp": 1.04001856, + "epoch": 0.07545468209830151, + "flos": 27672906551040.0, + "grad_norm": 1.9337940106915423, + "language_loss": 0.87842321, + "learning_rate": 3.97836641143877e-06, + "loss": 0.90057337, + "num_input_tokens_seen": 26701875, + "step": 1255, + "time_per_iteration": 2.7648367881774902 + }, + { + "auxiliary_loss_clip": 0.01169811, + "auxiliary_loss_mlp": 0.01064493, + "balance_loss_clip": 1.04610872, + "balance_loss_mlp": 1.03844559, + "epoch": 0.0755148053509695, + "flos": 14136990370560.0, + "grad_norm": 2.4464917170616767, + "language_loss": 0.79610157, + "learning_rate": 3.978309245614618e-06, + "loss": 0.81844461, + "num_input_tokens_seen": 26719050, + "step": 1256, + "time_per_iteration": 2.509350538253784 + }, + { + "auxiliary_loss_clip": 0.01030544, + "auxiliary_loss_mlp": 0.01009321, + "balance_loss_clip": 1.00798631, + "balance_loss_mlp": 1.00579262, + "epoch": 0.07557492860363746, + "flos": 58235257929600.0, + "grad_norm": 0.7750162513482644, + "language_loss": 0.58084565, + "learning_rate": 3.9782520047727825e-06, + "loss": 0.60124433, + "num_input_tokens_seen": 26780650, + "step": 1257, + "time_per_iteration": 3.297630786895752 + }, + { + "auxiliary_loss_clip": 0.01124748, + "auxiliary_loss_mlp": 0.01062306, + "balance_loss_clip": 1.04745722, + "balance_loss_mlp": 1.0383451, + "epoch": 0.07563505185630542, + "flos": 24644380262400.0, + "grad_norm": 2.3576559716208383, + "language_loss": 0.90187883, + "learning_rate": 3.978194688915432e-06, + "loss": 0.92374945, + "num_input_tokens_seen": 26798725, + "step": 1258, + "time_per_iteration": 2.7357215881347656 + }, + { + "auxiliary_loss_clip": 0.0113659, + "auxiliary_loss_mlp": 0.01068738, + "balance_loss_clip": 1.04753995, + "balance_loss_mlp": 1.04449069, + "epoch": 0.07569517510897339, + "flos": 15522998515200.0, + "grad_norm": 2.2941432784592046, + "language_loss": 0.8094629, + "learning_rate": 3.978137298044741e-06, + "loss": 0.83151615, + "num_input_tokens_seen": 26817005, + "step": 1259, + "time_per_iteration": 2.6456029415130615 + }, + { + "auxiliary_loss_clip": 0.01165297, + "auxiliary_loss_mlp": 0.01062912, + "balance_loss_clip": 1.04914999, + "balance_loss_mlp": 1.039428, + "epoch": 0.07575529836164137, + "flos": 22928532503040.0, + "grad_norm": 3.2551100713854373, + "language_loss": 0.76169169, + "learning_rate": 3.978079832162885e-06, + "loss": 0.78397381, + "num_input_tokens_seen": 26836655, + "step": 1260, + "time_per_iteration": 2.644340991973877 + }, + { + "auxiliary_loss_clip": 0.01126453, + "auxiliary_loss_mlp": 0.01070186, + "balance_loss_clip": 1.04123235, + "balance_loss_mlp": 1.04504442, + "epoch": 0.07581542161430933, + "flos": 19500428344320.0, + "grad_norm": 2.6695701729582226, + "language_loss": 0.85014993, + "learning_rate": 3.978022291272044e-06, + "loss": 0.87211633, + "num_input_tokens_seen": 26854925, + "step": 1261, + "time_per_iteration": 2.6933441162109375 + }, + { + "auxiliary_loss_clip": 0.01181153, + "auxiliary_loss_mlp": 0.01069539, + "balance_loss_clip": 1.05346596, + "balance_loss_mlp": 1.04557836, + "epoch": 0.0758755448669773, + "flos": 24973465691520.0, + "grad_norm": 1.8094087717533363, + "language_loss": 0.82356668, + "learning_rate": 3.977964675374399e-06, + "loss": 0.84607357, + "num_input_tokens_seen": 26876170, + "step": 1262, + "time_per_iteration": 2.6005218029022217 + }, + { + "auxiliary_loss_clip": 0.01174058, + "auxiliary_loss_mlp": 0.01063711, + "balance_loss_clip": 1.04787576, + "balance_loss_mlp": 1.03948724, + "epoch": 0.07593566811964528, + "flos": 22747973811840.0, + "grad_norm": 2.9708698033475844, + "language_loss": 0.82475138, + "learning_rate": 3.977906984472136e-06, + "loss": 0.84712911, + "num_input_tokens_seen": 26895005, + "step": 1263, + "time_per_iteration": 2.587228298187256 + }, + { + "auxiliary_loss_clip": 0.01134083, + "auxiliary_loss_mlp": 0.01065618, + "balance_loss_clip": 1.05044508, + "balance_loss_mlp": 1.04188347, + "epoch": 0.07599579137231324, + "flos": 23112395245440.0, + "grad_norm": 4.159672953243429, + "language_loss": 0.76081574, + "learning_rate": 3.977849218567442e-06, + "loss": 0.78281283, + "num_input_tokens_seen": 26913930, + "step": 1264, + "time_per_iteration": 2.7247347831726074 + }, + { + "auxiliary_loss_clip": 0.01150759, + "auxiliary_loss_mlp": 0.01067707, + "balance_loss_clip": 1.04814434, + "balance_loss_mlp": 1.04346013, + "epoch": 0.07605591462498121, + "flos": 14502058248960.0, + "grad_norm": 2.2279918215594163, + "language_loss": 0.80966103, + "learning_rate": 3.977791377662507e-06, + "loss": 0.83184576, + "num_input_tokens_seen": 26931485, + "step": 1265, + "time_per_iteration": 2.660639524459839 + }, + { + "auxiliary_loss_clip": 0.01107289, + "auxiliary_loss_mlp": 0.01066536, + "balance_loss_clip": 1.0408659, + "balance_loss_mlp": 1.0418005, + "epoch": 0.07611603787764919, + "flos": 23514199758720.0, + "grad_norm": 2.5732382106706857, + "language_loss": 0.65365636, + "learning_rate": 3.977733461759524e-06, + "loss": 0.67539454, + "num_input_tokens_seen": 26951670, + "step": 1266, + "time_per_iteration": 4.282203435897827 + }, + { + "auxiliary_loss_clip": 0.01133371, + "auxiliary_loss_mlp": 0.01068683, + "balance_loss_clip": 1.0447011, + "balance_loss_mlp": 1.04500782, + "epoch": 0.07617616113031715, + "flos": 21507188353920.0, + "grad_norm": 1.9491464644150305, + "language_loss": 0.79909933, + "learning_rate": 3.977675470860691e-06, + "loss": 0.8211199, + "num_input_tokens_seen": 26970335, + "step": 1267, + "time_per_iteration": 2.675906181335449 + }, + { + "auxiliary_loss_clip": 0.0115354, + "auxiliary_loss_mlp": 0.01058787, + "balance_loss_clip": 1.04873109, + "balance_loss_mlp": 1.03549349, + "epoch": 0.07623628438298512, + "flos": 14573161221120.0, + "grad_norm": 2.572086189680429, + "language_loss": 0.7273438, + "learning_rate": 3.977617404968205e-06, + "loss": 0.74946707, + "num_input_tokens_seen": 26986025, + "step": 1268, + "time_per_iteration": 2.6229724884033203 + }, + { + "auxiliary_loss_clip": 0.01162981, + "auxiliary_loss_mlp": 0.01064359, + "balance_loss_clip": 1.04767203, + "balance_loss_mlp": 1.03995681, + "epoch": 0.07629640763565308, + "flos": 14720395069440.0, + "grad_norm": 1.960816118321841, + "language_loss": 0.822685, + "learning_rate": 3.977559264084269e-06, + "loss": 0.84495842, + "num_input_tokens_seen": 27004045, + "step": 1269, + "time_per_iteration": 5.855030298233032 + }, + { + "auxiliary_loss_clip": 0.01163639, + "auxiliary_loss_mlp": 0.01056758, + "balance_loss_clip": 1.04849052, + "balance_loss_mlp": 1.03228474, + "epoch": 0.07635653088832106, + "flos": 14902929008640.0, + "grad_norm": 2.450986645802303, + "language_loss": 0.88750952, + "learning_rate": 3.977501048211088e-06, + "loss": 0.90971351, + "num_input_tokens_seen": 27022070, + "step": 1270, + "time_per_iteration": 2.589167833328247 + }, + { + "auxiliary_loss_clip": 0.01163366, + "auxiliary_loss_mlp": 0.01060059, + "balance_loss_clip": 1.04859734, + "balance_loss_mlp": 1.03577578, + "epoch": 0.07641665414098903, + "flos": 26651571235200.0, + "grad_norm": 2.104535841978225, + "language_loss": 0.70866394, + "learning_rate": 3.977442757350869e-06, + "loss": 0.73089814, + "num_input_tokens_seen": 27041755, + "step": 1271, + "time_per_iteration": 2.6932480335235596 + }, + { + "auxiliary_loss_clip": 0.01130116, + "auxiliary_loss_mlp": 0.01070024, + "balance_loss_clip": 1.04733682, + "balance_loss_mlp": 1.04638433, + "epoch": 0.07647677739365699, + "flos": 25192808092800.0, + "grad_norm": 1.838609648922848, + "language_loss": 0.82911694, + "learning_rate": 3.977384391505823e-06, + "loss": 0.85111833, + "num_input_tokens_seen": 27061540, + "step": 1272, + "time_per_iteration": 2.67775559425354 + }, + { + "auxiliary_loss_clip": 0.01140615, + "auxiliary_loss_mlp": 0.00749833, + "balance_loss_clip": 1.0428592, + "balance_loss_mlp": 1.00071549, + "epoch": 0.07653690064632497, + "flos": 20558141159040.0, + "grad_norm": 2.104554651778794, + "language_loss": 0.80005676, + "learning_rate": 3.977325950678162e-06, + "loss": 0.8189612, + "num_input_tokens_seen": 27081395, + "step": 1273, + "time_per_iteration": 2.6324994564056396 + }, + { + "auxiliary_loss_clip": 0.01150332, + "auxiliary_loss_mlp": 0.01061174, + "balance_loss_clip": 1.04827702, + "balance_loss_mlp": 1.03770149, + "epoch": 0.07659702389899294, + "flos": 22269320150400.0, + "grad_norm": 1.7893921821227803, + "language_loss": 0.81456256, + "learning_rate": 3.977267434870103e-06, + "loss": 0.83667755, + "num_input_tokens_seen": 27101175, + "step": 1274, + "time_per_iteration": 2.6457667350769043 + }, + { + "auxiliary_loss_clip": 0.01153258, + "auxiliary_loss_mlp": 0.01065621, + "balance_loss_clip": 1.04781747, + "balance_loss_mlp": 1.04055142, + "epoch": 0.0766571471516609, + "flos": 32636120209920.0, + "grad_norm": 2.0525765520348362, + "language_loss": 0.73082662, + "learning_rate": 3.977208844083865e-06, + "loss": 0.7530154, + "num_input_tokens_seen": 27124505, + "step": 1275, + "time_per_iteration": 2.7893543243408203 + }, + { + "auxiliary_loss_clip": 0.01173532, + "auxiliary_loss_mlp": 0.0106289, + "balance_loss_clip": 1.04846072, + "balance_loss_mlp": 1.03720045, + "epoch": 0.07671727040432888, + "flos": 15267386355840.0, + "grad_norm": 2.2681406590317827, + "language_loss": 0.79310805, + "learning_rate": 3.9771501783216685e-06, + "loss": 0.81547225, + "num_input_tokens_seen": 27140960, + "step": 1276, + "time_per_iteration": 2.5831329822540283 + }, + { + "auxiliary_loss_clip": 0.01160805, + "auxiliary_loss_mlp": 0.01057948, + "balance_loss_clip": 1.04757428, + "balance_loss_mlp": 1.03457141, + "epoch": 0.07677739365699685, + "flos": 28184094956160.0, + "grad_norm": 2.268345181829906, + "language_loss": 0.59395683, + "learning_rate": 3.97709143758574e-06, + "loss": 0.6161443, + "num_input_tokens_seen": 27160985, + "step": 1277, + "time_per_iteration": 2.63771390914917 + }, + { + "auxiliary_loss_clip": 0.01168239, + "auxiliary_loss_mlp": 0.01060964, + "balance_loss_clip": 1.04827118, + "balance_loss_mlp": 1.03670549, + "epoch": 0.07683751690966481, + "flos": 18296128126080.0, + "grad_norm": 3.0117059903545615, + "language_loss": 0.74708366, + "learning_rate": 3.977032621878305e-06, + "loss": 0.76937568, + "num_input_tokens_seen": 27178390, + "step": 1278, + "time_per_iteration": 2.543775796890259 + }, + { + "auxiliary_loss_clip": 0.01130359, + "auxiliary_loss_mlp": 0.01060884, + "balance_loss_clip": 1.04655623, + "balance_loss_mlp": 1.03645813, + "epoch": 0.07689764016233278, + "flos": 21981101420160.0, + "grad_norm": 2.8107387000845416, + "language_loss": 0.88107646, + "learning_rate": 3.976973731201596e-06, + "loss": 0.90298891, + "num_input_tokens_seen": 27197505, + "step": 1279, + "time_per_iteration": 2.6485626697540283 + }, + { + "auxiliary_loss_clip": 0.01132303, + "auxiliary_loss_mlp": 0.01061238, + "balance_loss_clip": 1.04238248, + "balance_loss_mlp": 1.03650188, + "epoch": 0.07695776341500075, + "flos": 22235995307520.0, + "grad_norm": 3.2401227716355345, + "language_loss": 0.82591212, + "learning_rate": 3.976914765557845e-06, + "loss": 0.84784752, + "num_input_tokens_seen": 27214260, + "step": 1280, + "time_per_iteration": 2.652863025665283 + }, + { + "auxiliary_loss_clip": 0.0115988, + "auxiliary_loss_mlp": 0.0106129, + "balance_loss_clip": 1.04735947, + "balance_loss_mlp": 1.03719807, + "epoch": 0.07701788666766872, + "flos": 16143750380160.0, + "grad_norm": 2.0368807123810924, + "language_loss": 0.76032013, + "learning_rate": 3.9768557249492875e-06, + "loss": 0.7825318, + "num_input_tokens_seen": 27232525, + "step": 1281, + "time_per_iteration": 2.5618090629577637 + }, + { + "auxiliary_loss_clip": 0.0114135, + "auxiliary_loss_mlp": 0.01061283, + "balance_loss_clip": 1.04433823, + "balance_loss_mlp": 1.03615379, + "epoch": 0.07707800992033668, + "flos": 19463045264640.0, + "grad_norm": 1.8730359882665908, + "language_loss": 0.75554454, + "learning_rate": 3.9767966093781634e-06, + "loss": 0.7775709, + "num_input_tokens_seen": 27249800, + "step": 1282, + "time_per_iteration": 2.8883190155029297 + }, + { + "auxiliary_loss_clip": 0.01174555, + "auxiliary_loss_mlp": 0.01069277, + "balance_loss_clip": 1.04943633, + "balance_loss_mlp": 1.04457712, + "epoch": 0.07713813317300466, + "flos": 18990281433600.0, + "grad_norm": 1.895003914596204, + "language_loss": 0.84015512, + "learning_rate": 3.976737418846713e-06, + "loss": 0.86259341, + "num_input_tokens_seen": 27268895, + "step": 1283, + "time_per_iteration": 2.706550359725952 + }, + { + "auxiliary_loss_clip": 0.01162556, + "auxiliary_loss_mlp": 0.01065582, + "balance_loss_clip": 1.04879797, + "balance_loss_mlp": 1.03854561, + "epoch": 0.07719825642567263, + "flos": 18113953322880.0, + "grad_norm": 2.0694781904871413, + "language_loss": 0.74586844, + "learning_rate": 3.976678153357181e-06, + "loss": 0.76814979, + "num_input_tokens_seen": 27288180, + "step": 1284, + "time_per_iteration": 2.5737650394439697 + }, + { + "auxiliary_loss_clip": 0.01142212, + "auxiliary_loss_mlp": 0.01066259, + "balance_loss_clip": 1.04427004, + "balance_loss_mlp": 1.0422622, + "epoch": 0.0772583796783406, + "flos": 42194426993280.0, + "grad_norm": 1.713239686050339, + "language_loss": 0.76160163, + "learning_rate": 3.976618812911817e-06, + "loss": 0.78368634, + "num_input_tokens_seen": 27311815, + "step": 1285, + "time_per_iteration": 2.761702537536621 + }, + { + "auxiliary_loss_clip": 0.01178971, + "auxiliary_loss_mlp": 0.01068592, + "balance_loss_clip": 1.05235064, + "balance_loss_mlp": 1.04470229, + "epoch": 0.07731850293100857, + "flos": 24753692327040.0, + "grad_norm": 1.8276179279219487, + "language_loss": 0.84090257, + "learning_rate": 3.9765593975128685e-06, + "loss": 0.86337817, + "num_input_tokens_seen": 27331890, + "step": 1286, + "time_per_iteration": 2.5990772247314453 + }, + { + "auxiliary_loss_clip": 0.01144953, + "auxiliary_loss_mlp": 0.01059582, + "balance_loss_clip": 1.04488456, + "balance_loss_mlp": 1.03534651, + "epoch": 0.07737862618367654, + "flos": 17565884628480.0, + "grad_norm": 2.2607461789060816, + "language_loss": 0.76734388, + "learning_rate": 3.97649990716259e-06, + "loss": 0.78938919, + "num_input_tokens_seen": 27348320, + "step": 1287, + "time_per_iteration": 2.6885712146759033 + }, + { + "auxiliary_loss_clip": 0.01144125, + "auxiliary_loss_mlp": 0.01061871, + "balance_loss_clip": 1.04410553, + "balance_loss_mlp": 1.03690851, + "epoch": 0.0774387494363445, + "flos": 25627147349760.0, + "grad_norm": 2.438893237547737, + "language_loss": 0.84691858, + "learning_rate": 3.976440341863237e-06, + "loss": 0.86897856, + "num_input_tokens_seen": 27367670, + "step": 1288, + "time_per_iteration": 2.7226622104644775 + }, + { + "auxiliary_loss_clip": 0.01172548, + "auxiliary_loss_mlp": 0.01057545, + "balance_loss_clip": 1.04551196, + "balance_loss_mlp": 1.03352392, + "epoch": 0.07749887268901248, + "flos": 12239865648000.0, + "grad_norm": 2.088563798873869, + "language_loss": 0.85177457, + "learning_rate": 3.976380701617068e-06, + "loss": 0.87407547, + "num_input_tokens_seen": 27385485, + "step": 1289, + "time_per_iteration": 2.5327162742614746 + }, + { + "auxiliary_loss_clip": 0.01171265, + "auxiliary_loss_mlp": 0.0104823, + "balance_loss_clip": 1.04625988, + "balance_loss_mlp": 1.02406621, + "epoch": 0.07755899594168045, + "flos": 25081736261760.0, + "grad_norm": 1.9652649145608487, + "language_loss": 0.85344875, + "learning_rate": 3.976320986426344e-06, + "loss": 0.87564373, + "num_input_tokens_seen": 27405110, + "step": 1290, + "time_per_iteration": 2.5621657371520996 + }, + { + "auxiliary_loss_clip": 0.01140368, + "auxiliary_loss_mlp": 0.0106028, + "balance_loss_clip": 1.04558682, + "balance_loss_mlp": 1.03438771, + "epoch": 0.07761911919434841, + "flos": 14246410176000.0, + "grad_norm": 2.116067895526994, + "language_loss": 0.90538245, + "learning_rate": 3.9762611962933315e-06, + "loss": 0.92738897, + "num_input_tokens_seen": 27422855, + "step": 1291, + "time_per_iteration": 2.5832250118255615 + }, + { + "auxiliary_loss_clip": 0.01038789, + "auxiliary_loss_mlp": 0.01050092, + "balance_loss_clip": 1.01208925, + "balance_loss_mlp": 1.04558623, + "epoch": 0.07767924244701638, + "flos": 67237202954880.0, + "grad_norm": 0.8974041080954163, + "language_loss": 0.65015912, + "learning_rate": 3.9762013312202955e-06, + "loss": 0.67104793, + "num_input_tokens_seen": 27487190, + "step": 1292, + "time_per_iteration": 3.3025403022766113 + }, + { + "auxiliary_loss_clip": 0.01159243, + "auxiliary_loss_mlp": 0.01058737, + "balance_loss_clip": 1.04628086, + "balance_loss_mlp": 1.03500223, + "epoch": 0.07773936569968436, + "flos": 28550635292160.0, + "grad_norm": 1.8942939330962858, + "language_loss": 0.87576246, + "learning_rate": 3.9761413912095075e-06, + "loss": 0.8979423, + "num_input_tokens_seen": 27510465, + "step": 1293, + "time_per_iteration": 2.6584537029266357 + }, + { + "auxiliary_loss_clip": 0.01078071, + "auxiliary_loss_mlp": 0.01068965, + "balance_loss_clip": 1.03827143, + "balance_loss_mlp": 1.04165459, + "epoch": 0.07779948895235232, + "flos": 27490264871040.0, + "grad_norm": 4.452717683460257, + "language_loss": 0.84886825, + "learning_rate": 3.976081376263239e-06, + "loss": 0.87033862, + "num_input_tokens_seen": 27528645, + "step": 1294, + "time_per_iteration": 2.776477813720703 + }, + { + "auxiliary_loss_clip": 0.01118909, + "auxiliary_loss_mlp": 0.01058974, + "balance_loss_clip": 1.04376674, + "balance_loss_mlp": 1.03376079, + "epoch": 0.07785961220502029, + "flos": 18223301301120.0, + "grad_norm": 2.171551953499598, + "language_loss": 0.79181927, + "learning_rate": 3.976021286383768e-06, + "loss": 0.81359816, + "num_input_tokens_seen": 27546165, + "step": 1295, + "time_per_iteration": 2.751335859298706 + }, + { + "auxiliary_loss_clip": 0.01131694, + "auxiliary_loss_mlp": 0.01060382, + "balance_loss_clip": 1.05040073, + "balance_loss_mlp": 1.03497899, + "epoch": 0.07791973545768827, + "flos": 24608218245120.0, + "grad_norm": 2.3700307147444493, + "language_loss": 0.8846131, + "learning_rate": 3.975961121573371e-06, + "loss": 0.90653384, + "num_input_tokens_seen": 27566520, + "step": 1296, + "time_per_iteration": 2.6788229942321777 + }, + { + "auxiliary_loss_clip": 0.01176333, + "auxiliary_loss_mlp": 0.01067235, + "balance_loss_clip": 1.05058801, + "balance_loss_mlp": 1.04183173, + "epoch": 0.07797985871035623, + "flos": 14282069402880.0, + "grad_norm": 2.894453424049952, + "language_loss": 0.96197915, + "learning_rate": 3.9759008818343305e-06, + "loss": 0.98441488, + "num_input_tokens_seen": 27581960, + "step": 1297, + "time_per_iteration": 2.644681930541992 + }, + { + "auxiliary_loss_clip": 0.01137685, + "auxiliary_loss_mlp": 0.01070295, + "balance_loss_clip": 1.0442152, + "balance_loss_mlp": 1.04689431, + "epoch": 0.0780399819630242, + "flos": 26610453141120.0, + "grad_norm": 2.0600421475841877, + "language_loss": 0.76536596, + "learning_rate": 3.97584056716893e-06, + "loss": 0.78744572, + "num_input_tokens_seen": 27601415, + "step": 1298, + "time_per_iteration": 2.787630081176758 + }, + { + "auxiliary_loss_clip": 0.01121382, + "auxiliary_loss_mlp": 0.00749931, + "balance_loss_clip": 1.05033231, + "balance_loss_mlp": 1.00096011, + "epoch": 0.07810010521569218, + "flos": 21834514016640.0, + "grad_norm": 1.5786391577675238, + "language_loss": 0.80758184, + "learning_rate": 3.9757801775794575e-06, + "loss": 0.8262949, + "num_input_tokens_seen": 27621490, + "step": 1299, + "time_per_iteration": 2.810046911239624 + }, + { + "auxiliary_loss_clip": 0.01132659, + "auxiliary_loss_mlp": 0.01059005, + "balance_loss_clip": 1.0465759, + "balance_loss_mlp": 1.03497219, + "epoch": 0.07816022846836014, + "flos": 25081233471360.0, + "grad_norm": 3.7592588983385036, + "language_loss": 0.86819577, + "learning_rate": 3.975719713068202e-06, + "loss": 0.8901124, + "num_input_tokens_seen": 27640600, + "step": 1300, + "time_per_iteration": 2.7985405921936035 + }, + { + "auxiliary_loss_clip": 0.0117309, + "auxiliary_loss_mlp": 0.01055013, + "balance_loss_clip": 1.04927528, + "balance_loss_mlp": 1.0315764, + "epoch": 0.0782203517210281, + "flos": 40917515431680.0, + "grad_norm": 2.146173062081435, + "language_loss": 0.71643996, + "learning_rate": 3.975659173637458e-06, + "loss": 0.73872095, + "num_input_tokens_seen": 27663070, + "step": 1301, + "time_per_iteration": 2.7765979766845703 + }, + { + "auxiliary_loss_clip": 0.01163046, + "auxiliary_loss_mlp": 0.01073904, + "balance_loss_clip": 1.05021811, + "balance_loss_mlp": 1.04942989, + "epoch": 0.07828047497369607, + "flos": 41172014269440.0, + "grad_norm": 1.6440015169085276, + "language_loss": 0.70732015, + "learning_rate": 3.97559855928952e-06, + "loss": 0.7296896, + "num_input_tokens_seen": 27686425, + "step": 1302, + "time_per_iteration": 2.857335090637207 + }, + { + "auxiliary_loss_clip": 0.01131, + "auxiliary_loss_mlp": 0.00749996, + "balance_loss_clip": 1.04811406, + "balance_loss_mlp": 1.00106299, + "epoch": 0.07834059822636405, + "flos": 23508130360320.0, + "grad_norm": 2.827947575840931, + "language_loss": 0.81818473, + "learning_rate": 3.9755378700266864e-06, + "loss": 0.83699465, + "num_input_tokens_seen": 27704900, + "step": 1303, + "time_per_iteration": 2.6833250522613525 + }, + { + "auxiliary_loss_clip": 0.01158567, + "auxiliary_loss_mlp": 0.01069566, + "balance_loss_clip": 1.04875278, + "balance_loss_mlp": 1.045259, + "epoch": 0.07840072147903202, + "flos": 20193899293440.0, + "grad_norm": 1.647100080761981, + "language_loss": 0.74796021, + "learning_rate": 3.9754771058512585e-06, + "loss": 0.7702415, + "num_input_tokens_seen": 27724890, + "step": 1304, + "time_per_iteration": 2.6344807147979736 + }, + { + "auxiliary_loss_clip": 0.01175189, + "auxiliary_loss_mlp": 0.01072362, + "balance_loss_clip": 1.05196357, + "balance_loss_mlp": 1.04760265, + "epoch": 0.07846084473169998, + "flos": 21360816432000.0, + "grad_norm": 1.720104286079477, + "language_loss": 0.76194465, + "learning_rate": 3.975416266765542e-06, + "loss": 0.78442025, + "num_input_tokens_seen": 27743115, + "step": 1305, + "time_per_iteration": 2.5933690071105957 + }, + { + "auxiliary_loss_clip": 0.0111114, + "auxiliary_loss_mlp": 0.01067424, + "balance_loss_clip": 1.04259825, + "balance_loss_mlp": 1.04297388, + "epoch": 0.07852096798436796, + "flos": 25410965345280.0, + "grad_norm": 1.785160406147557, + "language_loss": 0.84852111, + "learning_rate": 3.975355352771841e-06, + "loss": 0.87030673, + "num_input_tokens_seen": 27763570, + "step": 1306, + "time_per_iteration": 2.7405660152435303 + }, + { + "auxiliary_loss_clip": 0.01161877, + "auxiliary_loss_mlp": 0.01046876, + "balance_loss_clip": 1.05022943, + "balance_loss_mlp": 1.0246911, + "epoch": 0.07858109123703592, + "flos": 24571481610240.0, + "grad_norm": 2.662280127754673, + "language_loss": 0.90542698, + "learning_rate": 3.975294363872468e-06, + "loss": 0.92751449, + "num_input_tokens_seen": 27780030, + "step": 1307, + "time_per_iteration": 2.6534476280212402 + }, + { + "auxiliary_loss_clip": 0.01111369, + "auxiliary_loss_mlp": 0.01055366, + "balance_loss_clip": 1.04025388, + "balance_loss_mlp": 1.03061807, + "epoch": 0.07864121448970389, + "flos": 20698874645760.0, + "grad_norm": 1.815061135343664, + "language_loss": 0.83352864, + "learning_rate": 3.975233300069735e-06, + "loss": 0.85519594, + "num_input_tokens_seen": 27796225, + "step": 1308, + "time_per_iteration": 2.6572961807250977 + }, + { + "auxiliary_loss_clip": 0.01109992, + "auxiliary_loss_mlp": 0.01057816, + "balance_loss_clip": 1.03911173, + "balance_loss_mlp": 1.03426027, + "epoch": 0.07870133774237187, + "flos": 22966526113920.0, + "grad_norm": 1.4823159659897258, + "language_loss": 0.77239877, + "learning_rate": 3.975172161365958e-06, + "loss": 0.79407686, + "num_input_tokens_seen": 27815975, + "step": 1309, + "time_per_iteration": 2.648916721343994 + }, + { + "auxiliary_loss_clip": 0.01163059, + "auxiliary_loss_mlp": 0.01063946, + "balance_loss_clip": 1.04654479, + "balance_loss_mlp": 1.03822064, + "epoch": 0.07876146099503983, + "flos": 18842832103680.0, + "grad_norm": 2.035277487399298, + "language_loss": 0.80214757, + "learning_rate": 3.975110947763453e-06, + "loss": 0.82441759, + "num_input_tokens_seen": 27832255, + "step": 1310, + "time_per_iteration": 2.6354479789733887 + }, + { + "auxiliary_loss_clip": 0.01139133, + "auxiliary_loss_mlp": 0.0074995, + "balance_loss_clip": 1.04607224, + "balance_loss_mlp": 1.00112057, + "epoch": 0.0788215842477078, + "flos": 23805794367360.0, + "grad_norm": 2.158290480540742, + "language_loss": 0.73825455, + "learning_rate": 3.9750496592645435e-06, + "loss": 0.7571454, + "num_input_tokens_seen": 27852180, + "step": 1311, + "time_per_iteration": 2.693199634552002 + }, + { + "auxiliary_loss_clip": 0.01154045, + "auxiliary_loss_mlp": 0.01080757, + "balance_loss_clip": 1.04833937, + "balance_loss_mlp": 1.05559182, + "epoch": 0.07888170750037576, + "flos": 21579907438080.0, + "grad_norm": 1.9389599046926378, + "language_loss": 0.85562629, + "learning_rate": 3.974988295871553e-06, + "loss": 0.87797427, + "num_input_tokens_seen": 27871435, + "step": 1312, + "time_per_iteration": 2.637793779373169 + }, + { + "auxiliary_loss_clip": 0.01143915, + "auxiliary_loss_mlp": 0.01067149, + "balance_loss_clip": 1.04841423, + "balance_loss_mlp": 1.04411745, + "epoch": 0.07894183075304374, + "flos": 19864849777920.0, + "grad_norm": 1.679046318311751, + "language_loss": 0.82118011, + "learning_rate": 3.9749268575868085e-06, + "loss": 0.84329081, + "num_input_tokens_seen": 27890625, + "step": 1313, + "time_per_iteration": 2.6744537353515625 + }, + { + "auxiliary_loss_clip": 0.01145594, + "auxiliary_loss_mlp": 0.0074996, + "balance_loss_clip": 1.04495335, + "balance_loss_mlp": 1.00108302, + "epoch": 0.07900195400571171, + "flos": 16143463071360.0, + "grad_norm": 2.8826193483399494, + "language_loss": 0.7329421, + "learning_rate": 3.97486534441264e-06, + "loss": 0.75189769, + "num_input_tokens_seen": 27906530, + "step": 1314, + "time_per_iteration": 4.397356986999512 + }, + { + "auxiliary_loss_clip": 0.01115646, + "auxiliary_loss_mlp": 0.00749881, + "balance_loss_clip": 1.03999949, + "balance_loss_mlp": 1.00093925, + "epoch": 0.07906207725837967, + "flos": 23730417676800.0, + "grad_norm": 1.7218220910609174, + "language_loss": 0.79847312, + "learning_rate": 3.974803756351379e-06, + "loss": 0.81712842, + "num_input_tokens_seen": 27926725, + "step": 1315, + "time_per_iteration": 2.804863929748535 + }, + { + "auxiliary_loss_clip": 0.0115415, + "auxiliary_loss_mlp": 0.01065608, + "balance_loss_clip": 1.04417372, + "balance_loss_mlp": 1.03951359, + "epoch": 0.07912220051104765, + "flos": 24315905364480.0, + "grad_norm": 1.7562355357104174, + "language_loss": 0.73951137, + "learning_rate": 3.974742093405362e-06, + "loss": 0.76170892, + "num_input_tokens_seen": 27947875, + "step": 1316, + "time_per_iteration": 5.926944017410278 + }, + { + "auxiliary_loss_clip": 0.0112861, + "auxiliary_loss_mlp": 0.01067049, + "balance_loss_clip": 1.04445601, + "balance_loss_mlp": 1.04162157, + "epoch": 0.07918232376371562, + "flos": 18880035615360.0, + "grad_norm": 3.763905631447755, + "language_loss": 0.65780008, + "learning_rate": 3.974680355576927e-06, + "loss": 0.6797567, + "num_input_tokens_seen": 27965040, + "step": 1317, + "time_per_iteration": 2.694969415664673 + }, + { + "auxiliary_loss_clip": 0.0114109, + "auxiliary_loss_mlp": 0.010681, + "balance_loss_clip": 1.04678345, + "balance_loss_mlp": 1.04297042, + "epoch": 0.07924244701638358, + "flos": 27376284038400.0, + "grad_norm": 2.381136826114846, + "language_loss": 0.73143017, + "learning_rate": 3.974618542868415e-06, + "loss": 0.7535221, + "num_input_tokens_seen": 27985330, + "step": 1318, + "time_per_iteration": 2.810049295425415 + }, + { + "auxiliary_loss_clip": 0.01109909, + "auxiliary_loss_mlp": 0.01063212, + "balance_loss_clip": 1.04523993, + "balance_loss_mlp": 1.03985906, + "epoch": 0.07930257026905156, + "flos": 25120340403840.0, + "grad_norm": 1.6378876242809142, + "language_loss": 0.90567815, + "learning_rate": 3.97455665528217e-06, + "loss": 0.92740941, + "num_input_tokens_seen": 28007615, + "step": 1319, + "time_per_iteration": 2.7628939151763916 + }, + { + "auxiliary_loss_clip": 0.01140188, + "auxiliary_loss_mlp": 0.0105681, + "balance_loss_clip": 1.04208231, + "balance_loss_mlp": 1.03253937, + "epoch": 0.07936269352171953, + "flos": 21834478103040.0, + "grad_norm": 2.7375648830477224, + "language_loss": 0.80270493, + "learning_rate": 3.974494692820539e-06, + "loss": 0.8246749, + "num_input_tokens_seen": 28027765, + "step": 1320, + "time_per_iteration": 2.777306079864502 + }, + { + "auxiliary_loss_clip": 0.01147012, + "auxiliary_loss_mlp": 0.01060673, + "balance_loss_clip": 1.04691648, + "balance_loss_mlp": 1.0368073, + "epoch": 0.07942281677438749, + "flos": 16939889377920.0, + "grad_norm": 1.923671823760741, + "language_loss": 0.69128782, + "learning_rate": 3.974432655485872e-06, + "loss": 0.71336466, + "num_input_tokens_seen": 28044225, + "step": 1321, + "time_per_iteration": 2.6277711391448975 + }, + { + "auxiliary_loss_clip": 0.01154603, + "auxiliary_loss_mlp": 0.01063537, + "balance_loss_clip": 1.0463953, + "balance_loss_mlp": 1.03969479, + "epoch": 0.07948294002705546, + "flos": 18986941468800.0, + "grad_norm": 2.0906711348061946, + "language_loss": 0.83977425, + "learning_rate": 3.9743705432805195e-06, + "loss": 0.86195564, + "num_input_tokens_seen": 28062915, + "step": 1322, + "time_per_iteration": 2.5708200931549072 + }, + { + "auxiliary_loss_clip": 0.01168632, + "auxiliary_loss_mlp": 0.01060263, + "balance_loss_clip": 1.04547083, + "balance_loss_mlp": 1.03670692, + "epoch": 0.07954306327972344, + "flos": 21653452535040.0, + "grad_norm": 2.196864881990737, + "language_loss": 0.90356362, + "learning_rate": 3.974308356206838e-06, + "loss": 0.92585254, + "num_input_tokens_seen": 28082175, + "step": 1323, + "time_per_iteration": 2.565192937850952 + }, + { + "auxiliary_loss_clip": 0.0112725, + "auxiliary_loss_mlp": 0.01060826, + "balance_loss_clip": 1.0471108, + "balance_loss_mlp": 1.03620934, + "epoch": 0.0796031865323914, + "flos": 23220270766080.0, + "grad_norm": 1.66561396715647, + "language_loss": 0.82428783, + "learning_rate": 3.974246094267187e-06, + "loss": 0.84616864, + "num_input_tokens_seen": 28102645, + "step": 1324, + "time_per_iteration": 2.769230842590332 + }, + { + "auxiliary_loss_clip": 0.01146014, + "auxiliary_loss_mlp": 0.01051668, + "balance_loss_clip": 1.04591322, + "balance_loss_mlp": 1.02732599, + "epoch": 0.07966330978505937, + "flos": 23294534135040.0, + "grad_norm": 2.624653206673735, + "language_loss": 0.79094839, + "learning_rate": 3.974183757463925e-06, + "loss": 0.81292522, + "num_input_tokens_seen": 28122805, + "step": 1325, + "time_per_iteration": 2.675572633743286 + }, + { + "auxiliary_loss_clip": 0.01076975, + "auxiliary_loss_mlp": 0.0074995, + "balance_loss_clip": 1.03602183, + "balance_loss_mlp": 1.00101542, + "epoch": 0.07972343303772735, + "flos": 18363783392640.0, + "grad_norm": 2.0778485878242416, + "language_loss": 0.88457876, + "learning_rate": 3.974121345799418e-06, + "loss": 0.90284795, + "num_input_tokens_seen": 28140530, + "step": 1326, + "time_per_iteration": 2.7273266315460205 + }, + { + "auxiliary_loss_clip": 0.01165104, + "auxiliary_loss_mlp": 0.01058849, + "balance_loss_clip": 1.04573929, + "balance_loss_mlp": 1.03405356, + "epoch": 0.07978355629039531, + "flos": 21762513204480.0, + "grad_norm": 2.0741502753913994, + "language_loss": 0.83223379, + "learning_rate": 3.974058859276032e-06, + "loss": 0.85447329, + "num_input_tokens_seen": 28159640, + "step": 1327, + "time_per_iteration": 2.551405906677246 + }, + { + "auxiliary_loss_clip": 0.01172544, + "auxiliary_loss_mlp": 0.01056653, + "balance_loss_clip": 1.04869413, + "balance_loss_mlp": 1.03220272, + "epoch": 0.07984367954306328, + "flos": 18551309322240.0, + "grad_norm": 2.60301045906905, + "language_loss": 0.78779501, + "learning_rate": 3.9739962978961354e-06, + "loss": 0.81008697, + "num_input_tokens_seen": 28177050, + "step": 1328, + "time_per_iteration": 2.644451141357422 + }, + { + "auxiliary_loss_clip": 0.01164381, + "auxiliary_loss_mlp": 0.01052994, + "balance_loss_clip": 1.04884481, + "balance_loss_mlp": 1.02772188, + "epoch": 0.07990380279573125, + "flos": 16904050583040.0, + "grad_norm": 4.4721401333260715, + "language_loss": 0.74210048, + "learning_rate": 3.973933661662101e-06, + "loss": 0.76427424, + "num_input_tokens_seen": 28193245, + "step": 1329, + "time_per_iteration": 2.570915699005127 + }, + { + "auxiliary_loss_clip": 0.0114283, + "auxiliary_loss_mlp": 0.01063953, + "balance_loss_clip": 1.04702187, + "balance_loss_mlp": 1.03959823, + "epoch": 0.07996392604839922, + "flos": 24098358643200.0, + "grad_norm": 1.6495387564587567, + "language_loss": 0.81535959, + "learning_rate": 3.973870950576305e-06, + "loss": 0.83742738, + "num_input_tokens_seen": 28213570, + "step": 1330, + "time_per_iteration": 2.6530117988586426 + }, + { + "auxiliary_loss_clip": 0.01170171, + "auxiliary_loss_mlp": 0.00749916, + "balance_loss_clip": 1.04675496, + "balance_loss_mlp": 1.00095534, + "epoch": 0.08002404930106718, + "flos": 14278729438080.0, + "grad_norm": 1.9244785832942697, + "language_loss": 0.88637489, + "learning_rate": 3.9738081646411255e-06, + "loss": 0.90557575, + "num_input_tokens_seen": 28229980, + "step": 1331, + "time_per_iteration": 2.6192798614501953 + }, + { + "auxiliary_loss_clip": 0.0116204, + "auxiliary_loss_mlp": 0.00749949, + "balance_loss_clip": 1.04485774, + "balance_loss_mlp": 1.00091898, + "epoch": 0.08008417255373516, + "flos": 40406219285760.0, + "grad_norm": 1.8789914237140204, + "language_loss": 0.7345773, + "learning_rate": 3.973745303858942e-06, + "loss": 0.75369722, + "num_input_tokens_seen": 28253840, + "step": 1332, + "time_per_iteration": 2.876214027404785 + }, + { + "auxiliary_loss_clip": 0.01144424, + "auxiliary_loss_mlp": 0.01058305, + "balance_loss_clip": 1.04491985, + "balance_loss_mlp": 1.03455818, + "epoch": 0.08014429580640313, + "flos": 18478913460480.0, + "grad_norm": 1.7178016666526066, + "language_loss": 0.82830215, + "learning_rate": 3.973682368232138e-06, + "loss": 0.8503294, + "num_input_tokens_seen": 28271675, + "step": 1333, + "time_per_iteration": 2.5989181995391846 + }, + { + "auxiliary_loss_clip": 0.01118691, + "auxiliary_loss_mlp": 0.01053785, + "balance_loss_clip": 1.04077339, + "balance_loss_mlp": 1.03005052, + "epoch": 0.0802044190590711, + "flos": 22053461368320.0, + "grad_norm": 3.0967671877106473, + "language_loss": 0.74495798, + "learning_rate": 3.9736193577631015e-06, + "loss": 0.76668274, + "num_input_tokens_seen": 28291850, + "step": 1334, + "time_per_iteration": 2.739820957183838 + }, + { + "auxiliary_loss_clip": 0.01144188, + "auxiliary_loss_mlp": 0.0106272, + "balance_loss_clip": 1.04920864, + "balance_loss_mlp": 1.03941417, + "epoch": 0.08026454231173906, + "flos": 24572128055040.0, + "grad_norm": 1.8297035027412012, + "language_loss": 0.80048066, + "learning_rate": 3.973556272454221e-06, + "loss": 0.8225497, + "num_input_tokens_seen": 28310780, + "step": 1335, + "time_per_iteration": 2.6822640895843506 + }, + { + "auxiliary_loss_clip": 0.01028648, + "auxiliary_loss_mlp": 0.01045996, + "balance_loss_clip": 1.01287842, + "balance_loss_mlp": 1.04230011, + "epoch": 0.08032466556440704, + "flos": 52581841459200.0, + "grad_norm": 0.7560056047160851, + "language_loss": 0.56032443, + "learning_rate": 3.973493112307889e-06, + "loss": 0.5810709, + "num_input_tokens_seen": 28369985, + "step": 1336, + "time_per_iteration": 3.296168088912964 + }, + { + "auxiliary_loss_clip": 0.01143159, + "auxiliary_loss_mlp": 0.01059809, + "balance_loss_clip": 1.04431093, + "balance_loss_mlp": 1.03683734, + "epoch": 0.080384788817075, + "flos": 23842602829440.0, + "grad_norm": 1.977495752231207, + "language_loss": 0.67409295, + "learning_rate": 3.9734298773265005e-06, + "loss": 0.69612265, + "num_input_tokens_seen": 28388670, + "step": 1337, + "time_per_iteration": 2.669912576675415 + }, + { + "auxiliary_loss_clip": 0.01134346, + "auxiliary_loss_mlp": 0.01073882, + "balance_loss_clip": 1.04484367, + "balance_loss_mlp": 1.04947972, + "epoch": 0.08044491206974297, + "flos": 25300719527040.0, + "grad_norm": 1.8184018060227314, + "language_loss": 0.86514652, + "learning_rate": 3.973366567512453e-06, + "loss": 0.88722879, + "num_input_tokens_seen": 28411845, + "step": 1338, + "time_per_iteration": 2.7524750232696533 + }, + { + "auxiliary_loss_clip": 0.01095478, + "auxiliary_loss_mlp": 0.01071195, + "balance_loss_clip": 1.03512096, + "balance_loss_mlp": 1.04508817, + "epoch": 0.08050503532241095, + "flos": 22376549226240.0, + "grad_norm": 3.596983579457287, + "language_loss": 0.87590837, + "learning_rate": 3.973303182868147e-06, + "loss": 0.89757514, + "num_input_tokens_seen": 28427875, + "step": 1339, + "time_per_iteration": 2.66068959236145 + }, + { + "auxiliary_loss_clip": 0.01156289, + "auxiliary_loss_mlp": 0.01055215, + "balance_loss_clip": 1.04772103, + "balance_loss_mlp": 1.03215957, + "epoch": 0.08056515857507891, + "flos": 18369421827840.0, + "grad_norm": 2.297506544972218, + "language_loss": 0.89296663, + "learning_rate": 3.973239723395988e-06, + "loss": 0.91508168, + "num_input_tokens_seen": 28446615, + "step": 1340, + "time_per_iteration": 2.6196274757385254 + }, + { + "auxiliary_loss_clip": 0.01044075, + "auxiliary_loss_mlp": 0.01007142, + "balance_loss_clip": 1.01013994, + "balance_loss_mlp": 1.00370836, + "epoch": 0.08062528182774688, + "flos": 51348130980480.0, + "grad_norm": 0.8810492022223408, + "language_loss": 0.64768934, + "learning_rate": 3.97317618909838e-06, + "loss": 0.66820157, + "num_input_tokens_seen": 28505290, + "step": 1341, + "time_per_iteration": 3.1180505752563477 + }, + { + "auxiliary_loss_clip": 0.011627, + "auxiliary_loss_mlp": 0.01062138, + "balance_loss_clip": 1.04466951, + "balance_loss_mlp": 1.03598356, + "epoch": 0.08068540508041486, + "flos": 17599712261760.0, + "grad_norm": 2.2220236343702053, + "language_loss": 0.89610541, + "learning_rate": 3.973112579977733e-06, + "loss": 0.9183538, + "num_input_tokens_seen": 28522735, + "step": 1342, + "time_per_iteration": 2.5823607444763184 + }, + { + "auxiliary_loss_clip": 0.01141612, + "auxiliary_loss_mlp": 0.01061146, + "balance_loss_clip": 1.04861116, + "balance_loss_mlp": 1.03607607, + "epoch": 0.08074552833308282, + "flos": 10561185486720.0, + "grad_norm": 2.6556448318429764, + "language_loss": 0.76451886, + "learning_rate": 3.973048896036459e-06, + "loss": 0.78654647, + "num_input_tokens_seen": 28539460, + "step": 1343, + "time_per_iteration": 2.728511095046997 + }, + { + "auxiliary_loss_clip": 0.01032358, + "auxiliary_loss_mlp": 0.01007713, + "balance_loss_clip": 1.01074433, + "balance_loss_mlp": 1.00408888, + "epoch": 0.08080565158575079, + "flos": 60840254954880.0, + "grad_norm": 0.8001985527119377, + "language_loss": 0.57448757, + "learning_rate": 3.972985137276974e-06, + "loss": 0.59488833, + "num_input_tokens_seen": 28599855, + "step": 1344, + "time_per_iteration": 3.150578260421753 + }, + { + "auxiliary_loss_clip": 0.01117728, + "auxiliary_loss_mlp": 0.01069592, + "balance_loss_clip": 1.04251695, + "balance_loss_mlp": 1.04355717, + "epoch": 0.08086577483841875, + "flos": 18332361970560.0, + "grad_norm": 2.2434049111766674, + "language_loss": 0.86411369, + "learning_rate": 3.972921303701695e-06, + "loss": 0.88598692, + "num_input_tokens_seen": 28617585, + "step": 1345, + "time_per_iteration": 2.7347867488861084 + }, + { + "auxiliary_loss_clip": 0.01170392, + "auxiliary_loss_mlp": 0.01063438, + "balance_loss_clip": 1.04881787, + "balance_loss_mlp": 1.04055023, + "epoch": 0.08092589809108673, + "flos": 21543601766400.0, + "grad_norm": 2.081833336512495, + "language_loss": 0.87598473, + "learning_rate": 3.972857395313042e-06, + "loss": 0.89832306, + "num_input_tokens_seen": 28636355, + "step": 1346, + "time_per_iteration": 2.589998722076416 + }, + { + "auxiliary_loss_clip": 0.01159026, + "auxiliary_loss_mlp": 0.01063452, + "balance_loss_clip": 1.04741728, + "balance_loss_mlp": 1.03952646, + "epoch": 0.0809860213437547, + "flos": 22128012046080.0, + "grad_norm": 1.5363128804304111, + "language_loss": 0.92854381, + "learning_rate": 3.972793412113439e-06, + "loss": 0.95076859, + "num_input_tokens_seen": 28656260, + "step": 1347, + "time_per_iteration": 2.5950443744659424 + }, + { + "auxiliary_loss_clip": 0.01154759, + "auxiliary_loss_mlp": 0.01071868, + "balance_loss_clip": 1.04741573, + "balance_loss_mlp": 1.04543924, + "epoch": 0.08104614459642266, + "flos": 21725489260800.0, + "grad_norm": 1.6740088836327514, + "language_loss": 0.89086598, + "learning_rate": 3.972729354105312e-06, + "loss": 0.91313231, + "num_input_tokens_seen": 28675865, + "step": 1348, + "time_per_iteration": 2.5992207527160645 + }, + { + "auxiliary_loss_clip": 0.0111271, + "auxiliary_loss_mlp": 0.01064335, + "balance_loss_clip": 1.05015779, + "balance_loss_mlp": 1.040851, + "epoch": 0.08110626784909064, + "flos": 23951878980480.0, + "grad_norm": 1.700941587662581, + "language_loss": 0.7658301, + "learning_rate": 3.97266522129109e-06, + "loss": 0.78760058, + "num_input_tokens_seen": 28696255, + "step": 1349, + "time_per_iteration": 2.7610177993774414 + }, + { + "auxiliary_loss_clip": 0.0117125, + "auxiliary_loss_mlp": 0.01063337, + "balance_loss_clip": 1.04829407, + "balance_loss_mlp": 1.03855383, + "epoch": 0.0811663911017586, + "flos": 19025689265280.0, + "grad_norm": 4.347763841786505, + "language_loss": 0.88636184, + "learning_rate": 3.972601013673205e-06, + "loss": 0.90870774, + "num_input_tokens_seen": 28713905, + "step": 1350, + "time_per_iteration": 2.5962905883789062 + }, + { + "auxiliary_loss_clip": 0.01128364, + "auxiliary_loss_mlp": 0.00749957, + "balance_loss_clip": 1.04273498, + "balance_loss_mlp": 1.00105023, + "epoch": 0.08122651435442657, + "flos": 15341290588800.0, + "grad_norm": 2.06838600894228, + "language_loss": 0.82317948, + "learning_rate": 3.972536731254092e-06, + "loss": 0.8419627, + "num_input_tokens_seen": 28732075, + "step": 1351, + "time_per_iteration": 2.697115659713745 + }, + { + "auxiliary_loss_clip": 0.01168684, + "auxiliary_loss_mlp": 0.01060608, + "balance_loss_clip": 1.04576409, + "balance_loss_mlp": 1.03503776, + "epoch": 0.08128663760709455, + "flos": 23221563655680.0, + "grad_norm": 2.0098718730161043, + "language_loss": 0.75491256, + "learning_rate": 3.972472374036189e-06, + "loss": 0.77720547, + "num_input_tokens_seen": 28751150, + "step": 1352, + "time_per_iteration": 2.558637857437134 + }, + { + "auxiliary_loss_clip": 0.01163372, + "auxiliary_loss_mlp": 0.00749869, + "balance_loss_clip": 1.05002141, + "balance_loss_mlp": 1.00090671, + "epoch": 0.08134676085976252, + "flos": 22965628273920.0, + "grad_norm": 1.8088584832831511, + "language_loss": 0.82943368, + "learning_rate": 3.972407942021935e-06, + "loss": 0.84856606, + "num_input_tokens_seen": 28773360, + "step": 1353, + "time_per_iteration": 2.666177272796631 + }, + { + "auxiliary_loss_clip": 0.01041214, + "auxiliary_loss_mlp": 0.01024326, + "balance_loss_clip": 1.00946236, + "balance_loss_mlp": 1.02082086, + "epoch": 0.08140688411243048, + "flos": 64322115816960.0, + "grad_norm": 0.8542879079405756, + "language_loss": 0.59757781, + "learning_rate": 3.972343435213775e-06, + "loss": 0.6182332, + "num_input_tokens_seen": 28833390, + "step": 1354, + "time_per_iteration": 3.2171895503997803 + }, + { + "auxiliary_loss_clip": 0.01120462, + "auxiliary_loss_mlp": 0.01062558, + "balance_loss_clip": 1.04244494, + "balance_loss_mlp": 1.0390141, + "epoch": 0.08146700736509845, + "flos": 22491858862080.0, + "grad_norm": 1.7689409221821137, + "language_loss": 0.82853907, + "learning_rate": 3.972278853614154e-06, + "loss": 0.85036927, + "num_input_tokens_seen": 28852430, + "step": 1355, + "time_per_iteration": 2.7436771392822266 + }, + { + "auxiliary_loss_clip": 0.0115213, + "auxiliary_loss_mlp": 0.01058436, + "balance_loss_clip": 1.04518569, + "balance_loss_mlp": 1.03298497, + "epoch": 0.08152713061776642, + "flos": 20447823513600.0, + "grad_norm": 1.9884365495048626, + "language_loss": 0.7115314, + "learning_rate": 3.972214197225521e-06, + "loss": 0.73363698, + "num_input_tokens_seen": 28870685, + "step": 1356, + "time_per_iteration": 2.642871141433716 + }, + { + "auxiliary_loss_clip": 0.01160902, + "auxiliary_loss_mlp": 0.01060467, + "balance_loss_clip": 1.0458889, + "balance_loss_mlp": 1.03520644, + "epoch": 0.08158725387043439, + "flos": 23550218121600.0, + "grad_norm": 1.9200872158437676, + "language_loss": 0.70633805, + "learning_rate": 3.972149466050329e-06, + "loss": 0.72855169, + "num_input_tokens_seen": 28889860, + "step": 1357, + "time_per_iteration": 2.6228086948394775 + }, + { + "auxiliary_loss_clip": 0.01157502, + "auxiliary_loss_mlp": 0.01056194, + "balance_loss_clip": 1.04710138, + "balance_loss_mlp": 1.03255427, + "epoch": 0.08164737712310235, + "flos": 22017335264640.0, + "grad_norm": 2.9100753841407276, + "language_loss": 0.84225023, + "learning_rate": 3.97208466009103e-06, + "loss": 0.86438727, + "num_input_tokens_seen": 28905865, + "step": 1358, + "time_per_iteration": 2.6154325008392334 + }, + { + "auxiliary_loss_clip": 0.01146167, + "auxiliary_loss_mlp": 0.01056731, + "balance_loss_clip": 1.04689193, + "balance_loss_mlp": 1.03088677, + "epoch": 0.08170750037577033, + "flos": 23367827836800.0, + "grad_norm": 5.511260723886638, + "language_loss": 1.02570176, + "learning_rate": 3.972019779350084e-06, + "loss": 1.0477308, + "num_input_tokens_seen": 28925250, + "step": 1359, + "time_per_iteration": 2.6732265949249268 + }, + { + "auxiliary_loss_clip": 0.01095184, + "auxiliary_loss_mlp": 0.01061489, + "balance_loss_clip": 1.03826272, + "balance_loss_mlp": 1.03647864, + "epoch": 0.0817676236284383, + "flos": 28397978490240.0, + "grad_norm": 2.2590610066651045, + "language_loss": 0.83577943, + "learning_rate": 3.971954823829951e-06, + "loss": 0.85734618, + "num_input_tokens_seen": 28943445, + "step": 1360, + "time_per_iteration": 2.792299270629883 + }, + { + "auxiliary_loss_clip": 0.01174869, + "auxiliary_loss_mlp": 0.01067415, + "balance_loss_clip": 1.05063283, + "balance_loss_mlp": 1.0427382, + "epoch": 0.08182774688110626, + "flos": 19208905562880.0, + "grad_norm": 2.0255643095758886, + "language_loss": 0.72307217, + "learning_rate": 3.971889793533093e-06, + "loss": 0.74549508, + "num_input_tokens_seen": 28962695, + "step": 1361, + "time_per_iteration": 4.238358736038208 + }, + { + "auxiliary_loss_clip": 0.01136206, + "auxiliary_loss_mlp": 0.01060269, + "balance_loss_clip": 1.03946257, + "balance_loss_mlp": 1.03444815, + "epoch": 0.08188787013377424, + "flos": 22784099915520.0, + "grad_norm": 2.106502917426344, + "language_loss": 0.7655164, + "learning_rate": 3.971824688461976e-06, + "loss": 0.78748119, + "num_input_tokens_seen": 28982120, + "step": 1362, + "time_per_iteration": 2.6746175289154053 + }, + { + "auxiliary_loss_clip": 0.01166881, + "auxiliary_loss_mlp": 0.01054924, + "balance_loss_clip": 1.04823518, + "balance_loss_mlp": 1.03120112, + "epoch": 0.08194799338644221, + "flos": 16468095214080.0, + "grad_norm": 2.072028100639055, + "language_loss": 0.72597134, + "learning_rate": 3.971759508619069e-06, + "loss": 0.74818945, + "num_input_tokens_seen": 28998100, + "step": 1363, + "time_per_iteration": 5.857705593109131 + }, + { + "auxiliary_loss_clip": 0.01171012, + "auxiliary_loss_mlp": 0.01066149, + "balance_loss_clip": 1.05101395, + "balance_loss_mlp": 1.03989935, + "epoch": 0.08200811663911017, + "flos": 23913633974400.0, + "grad_norm": 1.7871026171631703, + "language_loss": 0.77248943, + "learning_rate": 3.971694254006844e-06, + "loss": 0.79486102, + "num_input_tokens_seen": 29017095, + "step": 1364, + "time_per_iteration": 2.592571258544922 + }, + { + "auxiliary_loss_clip": 0.01094638, + "auxiliary_loss_mlp": 0.01069828, + "balance_loss_clip": 1.03842664, + "balance_loss_mlp": 1.04389966, + "epoch": 0.08206823989177814, + "flos": 17896550256000.0, + "grad_norm": 2.081438927769404, + "language_loss": 0.82064199, + "learning_rate": 3.971628924627776e-06, + "loss": 0.84228659, + "num_input_tokens_seen": 29037240, + "step": 1365, + "time_per_iteration": 2.7549171447753906 + }, + { + "auxiliary_loss_clip": 0.01160824, + "auxiliary_loss_mlp": 0.01068067, + "balance_loss_clip": 1.05113876, + "balance_loss_mlp": 1.0440824, + "epoch": 0.08212836314444612, + "flos": 22088186841600.0, + "grad_norm": 1.6232231531963888, + "language_loss": 0.82034826, + "learning_rate": 3.97156352048434e-06, + "loss": 0.84263718, + "num_input_tokens_seen": 29056250, + "step": 1366, + "time_per_iteration": 2.6617724895477295 + }, + { + "auxiliary_loss_clip": 0.01121102, + "auxiliary_loss_mlp": 0.0106716, + "balance_loss_clip": 1.04131198, + "balance_loss_mlp": 1.0434252, + "epoch": 0.08218848639711408, + "flos": 17597485618560.0, + "grad_norm": 3.789252561933645, + "language_loss": 0.81897998, + "learning_rate": 3.97149804157902e-06, + "loss": 0.84086263, + "num_input_tokens_seen": 29073380, + "step": 1367, + "time_per_iteration": 2.6125216484069824 + }, + { + "auxiliary_loss_clip": 0.0117091, + "auxiliary_loss_mlp": 0.01061943, + "balance_loss_clip": 1.04749811, + "balance_loss_mlp": 1.03762472, + "epoch": 0.08224860964978205, + "flos": 17857838373120.0, + "grad_norm": 2.3657656066177526, + "language_loss": 0.83935547, + "learning_rate": 3.9714324879142946e-06, + "loss": 0.86168402, + "num_input_tokens_seen": 29091330, + "step": 1368, + "time_per_iteration": 2.55129075050354 + }, + { + "auxiliary_loss_clip": 0.01127026, + "auxiliary_loss_mlp": 0.0105308, + "balance_loss_clip": 1.04586518, + "balance_loss_mlp": 1.03007245, + "epoch": 0.08230873290245003, + "flos": 25227533566080.0, + "grad_norm": 1.7215121411726588, + "language_loss": 0.81378204, + "learning_rate": 3.971366859492653e-06, + "loss": 0.83558309, + "num_input_tokens_seen": 29110375, + "step": 1369, + "time_per_iteration": 2.7637791633605957 + }, + { + "auxiliary_loss_clip": 0.01103973, + "auxiliary_loss_mlp": 0.00749938, + "balance_loss_clip": 1.04224956, + "balance_loss_mlp": 1.00107598, + "epoch": 0.08236885615511799, + "flos": 31759935753600.0, + "grad_norm": 2.285210813785828, + "language_loss": 0.74720341, + "learning_rate": 3.971301156316582e-06, + "loss": 0.76574254, + "num_input_tokens_seen": 29129395, + "step": 1370, + "time_per_iteration": 2.7603654861450195 + }, + { + "auxiliary_loss_clip": 0.01122508, + "auxiliary_loss_mlp": 0.01063904, + "balance_loss_clip": 1.04650211, + "balance_loss_mlp": 1.03902483, + "epoch": 0.08242897940778596, + "flos": 23185832601600.0, + "grad_norm": 1.6565583158614974, + "language_loss": 0.74209428, + "learning_rate": 3.971235378388573e-06, + "loss": 0.76395845, + "num_input_tokens_seen": 29148650, + "step": 1371, + "time_per_iteration": 2.6889235973358154 + }, + { + "auxiliary_loss_clip": 0.01064136, + "auxiliary_loss_mlp": 0.01065283, + "balance_loss_clip": 1.03763616, + "balance_loss_mlp": 1.03909254, + "epoch": 0.08248910266045394, + "flos": 34491480393600.0, + "grad_norm": 2.001999842806191, + "language_loss": 0.71014321, + "learning_rate": 3.971169525711122e-06, + "loss": 0.73143744, + "num_input_tokens_seen": 29170785, + "step": 1372, + "time_per_iteration": 3.0065996646881104 + }, + { + "auxiliary_loss_clip": 0.01121025, + "auxiliary_loss_mlp": 0.01058278, + "balance_loss_clip": 1.03911376, + "balance_loss_mlp": 1.03368545, + "epoch": 0.0825492259131219, + "flos": 13436228960640.0, + "grad_norm": 2.477904280775311, + "language_loss": 0.87909305, + "learning_rate": 3.9711035982867246e-06, + "loss": 0.90088612, + "num_input_tokens_seen": 29185210, + "step": 1373, + "time_per_iteration": 2.8349761962890625 + }, + { + "auxiliary_loss_clip": 0.01116751, + "auxiliary_loss_mlp": 0.01064873, + "balance_loss_clip": 1.04198992, + "balance_loss_mlp": 1.04027987, + "epoch": 0.08260934916578987, + "flos": 25812446636160.0, + "grad_norm": 2.345961100630552, + "language_loss": 0.81852901, + "learning_rate": 3.971037596117882e-06, + "loss": 0.8403452, + "num_input_tokens_seen": 29205210, + "step": 1374, + "time_per_iteration": 2.787200927734375 + }, + { + "auxiliary_loss_clip": 0.01029992, + "auxiliary_loss_mlp": 0.01025239, + "balance_loss_clip": 1.03304553, + "balance_loss_mlp": 1.01977968, + "epoch": 0.08266947241845783, + "flos": 63460009491840.0, + "grad_norm": 0.8503636511092834, + "language_loss": 0.60646224, + "learning_rate": 3.970971519207095e-06, + "loss": 0.62701464, + "num_input_tokens_seen": 29265350, + "step": 1375, + "time_per_iteration": 3.352938175201416 + }, + { + "auxiliary_loss_clip": 0.0103068, + "auxiliary_loss_mlp": 0.01040227, + "balance_loss_clip": 1.00848472, + "balance_loss_mlp": 1.0364598, + "epoch": 0.08272959567112581, + "flos": 69993704568960.0, + "grad_norm": 0.9101057231369265, + "language_loss": 0.62166846, + "learning_rate": 3.970905367556871e-06, + "loss": 0.6423775, + "num_input_tokens_seen": 29321475, + "step": 1376, + "time_per_iteration": 3.1927740573883057 + }, + { + "auxiliary_loss_clip": 0.01124517, + "auxiliary_loss_mlp": 0.01069456, + "balance_loss_clip": 1.04679537, + "balance_loss_mlp": 1.04539931, + "epoch": 0.08278971892379378, + "flos": 20413205781120.0, + "grad_norm": 2.3817852727262956, + "language_loss": 0.82997227, + "learning_rate": 3.970839141169718e-06, + "loss": 0.8519119, + "num_input_tokens_seen": 29341405, + "step": 1377, + "time_per_iteration": 2.8208391666412354 + }, + { + "auxiliary_loss_clip": 0.01144122, + "auxiliary_loss_mlp": 0.01058019, + "balance_loss_clip": 1.04762065, + "balance_loss_mlp": 1.03322363, + "epoch": 0.08284984217646174, + "flos": 26250233598720.0, + "grad_norm": 1.80227140750556, + "language_loss": 0.84665054, + "learning_rate": 3.970772840048147e-06, + "loss": 0.86867195, + "num_input_tokens_seen": 29361955, + "step": 1378, + "time_per_iteration": 2.788015365600586 + }, + { + "auxiliary_loss_clip": 0.0115551, + "auxiliary_loss_mlp": 0.01058896, + "balance_loss_clip": 1.04714847, + "balance_loss_mlp": 1.03454137, + "epoch": 0.08290996542912972, + "flos": 27194683852800.0, + "grad_norm": 3.2851322497417716, + "language_loss": 0.87713909, + "learning_rate": 3.970706464194672e-06, + "loss": 0.89928317, + "num_input_tokens_seen": 29382395, + "step": 1379, + "time_per_iteration": 2.930013656616211 + }, + { + "auxiliary_loss_clip": 0.01119598, + "auxiliary_loss_mlp": 0.01061035, + "balance_loss_clip": 1.0413065, + "balance_loss_mlp": 1.03763461, + "epoch": 0.08297008868179769, + "flos": 38618191146240.0, + "grad_norm": 2.612837128939636, + "language_loss": 0.78245401, + "learning_rate": 3.970640013611812e-06, + "loss": 0.80426037, + "num_input_tokens_seen": 29404460, + "step": 1380, + "time_per_iteration": 2.963831901550293 + }, + { + "auxiliary_loss_clip": 0.01156909, + "auxiliary_loss_mlp": 0.01058261, + "balance_loss_clip": 1.05242836, + "balance_loss_mlp": 1.03338218, + "epoch": 0.08303021193446565, + "flos": 19974736460160.0, + "grad_norm": 2.557768419964124, + "language_loss": 0.86491543, + "learning_rate": 3.970573488302083e-06, + "loss": 0.8870672, + "num_input_tokens_seen": 29422675, + "step": 1381, + "time_per_iteration": 2.7325525283813477 + }, + { + "auxiliary_loss_clip": 0.01166507, + "auxiliary_loss_mlp": 0.0075003, + "balance_loss_clip": 1.05153465, + "balance_loss_mlp": 1.00112283, + "epoch": 0.08309033518713363, + "flos": 13662646341120.0, + "grad_norm": 2.9562336480073306, + "language_loss": 0.88272095, + "learning_rate": 3.970506888268011e-06, + "loss": 0.90188634, + "num_input_tokens_seen": 29439840, + "step": 1382, + "time_per_iteration": 2.6631972789764404 + }, + { + "auxiliary_loss_clip": 0.01126654, + "auxiliary_loss_mlp": 0.01057407, + "balance_loss_clip": 1.04625773, + "balance_loss_mlp": 1.03491199, + "epoch": 0.0831504584398016, + "flos": 17968551068160.0, + "grad_norm": 2.3549463276276184, + "language_loss": 0.77193505, + "learning_rate": 3.970440213512121e-06, + "loss": 0.79377568, + "num_input_tokens_seen": 29457360, + "step": 1383, + "time_per_iteration": 2.811278820037842 + }, + { + "auxiliary_loss_clip": 0.01159957, + "auxiliary_loss_mlp": 0.01061161, + "balance_loss_clip": 1.04908442, + "balance_loss_mlp": 1.03739107, + "epoch": 0.08321058169246956, + "flos": 22601386408320.0, + "grad_norm": 1.9916126369404545, + "language_loss": 0.82755446, + "learning_rate": 3.97037346403694e-06, + "loss": 0.84976566, + "num_input_tokens_seen": 29477040, + "step": 1384, + "time_per_iteration": 2.827012300491333 + }, + { + "auxiliary_loss_clip": 0.01110449, + "auxiliary_loss_mlp": 0.01060294, + "balance_loss_clip": 1.0419569, + "balance_loss_mlp": 1.03248262, + "epoch": 0.08327070494513754, + "flos": 22850426378880.0, + "grad_norm": 2.7673255091043485, + "language_loss": 0.84617686, + "learning_rate": 3.970306639845e-06, + "loss": 0.86788434, + "num_input_tokens_seen": 29492010, + "step": 1385, + "time_per_iteration": 2.705166816711426 + }, + { + "auxiliary_loss_clip": 0.01125467, + "auxiliary_loss_mlp": 0.0106147, + "balance_loss_clip": 1.04742146, + "balance_loss_mlp": 1.03722262, + "epoch": 0.0833308281978055, + "flos": 22782986593920.0, + "grad_norm": 2.0208495313226433, + "language_loss": 0.68720722, + "learning_rate": 3.970239740938835e-06, + "loss": 0.70907658, + "num_input_tokens_seen": 29511850, + "step": 1386, + "time_per_iteration": 2.7096757888793945 + }, + { + "auxiliary_loss_clip": 0.01139755, + "auxiliary_loss_mlp": 0.01058438, + "balance_loss_clip": 1.04232228, + "balance_loss_mlp": 1.03421474, + "epoch": 0.08339095145047347, + "flos": 20812604083200.0, + "grad_norm": 1.6213855080867667, + "language_loss": 0.82102507, + "learning_rate": 3.97017276732098e-06, + "loss": 0.84300697, + "num_input_tokens_seen": 29531415, + "step": 1387, + "time_per_iteration": 2.6502721309661865 + }, + { + "auxiliary_loss_clip": 0.01151311, + "auxiliary_loss_mlp": 0.0106808, + "balance_loss_clip": 1.04898858, + "balance_loss_mlp": 1.04229474, + "epoch": 0.08345107470314143, + "flos": 18515326872960.0, + "grad_norm": 2.1239894982144936, + "language_loss": 0.76837742, + "learning_rate": 3.970105718993978e-06, + "loss": 0.79057133, + "num_input_tokens_seen": 29549525, + "step": 1388, + "time_per_iteration": 2.6703381538391113 + }, + { + "auxiliary_loss_clip": 0.01111491, + "auxiliary_loss_mlp": 0.01065445, + "balance_loss_clip": 1.04859734, + "balance_loss_mlp": 1.04018462, + "epoch": 0.08351119795580941, + "flos": 18807567926400.0, + "grad_norm": 2.4278321245846275, + "language_loss": 0.79572648, + "learning_rate": 3.970038595960369e-06, + "loss": 0.81749582, + "num_input_tokens_seen": 29568705, + "step": 1389, + "time_per_iteration": 2.720834493637085 + }, + { + "auxiliary_loss_clip": 0.01143636, + "auxiliary_loss_mlp": 0.01062508, + "balance_loss_clip": 1.04628801, + "balance_loss_mlp": 1.0387857, + "epoch": 0.08357132120847738, + "flos": 18441817689600.0, + "grad_norm": 2.8904753538539567, + "language_loss": 0.87287098, + "learning_rate": 3.969971398222699e-06, + "loss": 0.89493239, + "num_input_tokens_seen": 29585855, + "step": 1390, + "time_per_iteration": 2.6527082920074463 + }, + { + "auxiliary_loss_clip": 0.01125976, + "auxiliary_loss_mlp": 0.01060965, + "balance_loss_clip": 1.0410558, + "balance_loss_mlp": 1.03572893, + "epoch": 0.08363144446114534, + "flos": 25922333318400.0, + "grad_norm": 9.082239936075528, + "language_loss": 0.86764085, + "learning_rate": 3.969904125783517e-06, + "loss": 0.88951027, + "num_input_tokens_seen": 29607280, + "step": 1391, + "time_per_iteration": 2.6869516372680664 + }, + { + "auxiliary_loss_clip": 0.01123972, + "auxiliary_loss_mlp": 0.01071947, + "balance_loss_clip": 1.04358387, + "balance_loss_mlp": 1.0478071, + "epoch": 0.08369156771381332, + "flos": 18041306065920.0, + "grad_norm": 2.2824169290099303, + "language_loss": 0.876293, + "learning_rate": 3.969836778645371e-06, + "loss": 0.89825213, + "num_input_tokens_seen": 29624130, + "step": 1392, + "time_per_iteration": 2.721726417541504 + }, + { + "auxiliary_loss_clip": 0.01154783, + "auxiliary_loss_mlp": 0.0106721, + "balance_loss_clip": 1.04590547, + "balance_loss_mlp": 1.04307055, + "epoch": 0.08375169096648129, + "flos": 22675111073280.0, + "grad_norm": 2.2587808477161073, + "language_loss": 0.80196017, + "learning_rate": 3.969769356810819e-06, + "loss": 0.82418013, + "num_input_tokens_seen": 29643210, + "step": 1393, + "time_per_iteration": 2.639988660812378 + }, + { + "auxiliary_loss_clip": 0.01169734, + "auxiliary_loss_mlp": 0.01056408, + "balance_loss_clip": 1.05071354, + "balance_loss_mlp": 1.03322136, + "epoch": 0.08381181421914925, + "flos": 26103215232000.0, + "grad_norm": 2.266560601892035, + "language_loss": 0.84640229, + "learning_rate": 3.969701860282415e-06, + "loss": 0.86866379, + "num_input_tokens_seen": 29663920, + "step": 1394, + "time_per_iteration": 2.655113935470581 + }, + { + "auxiliary_loss_clip": 0.01127659, + "auxiliary_loss_mlp": 0.01061824, + "balance_loss_clip": 1.05154538, + "balance_loss_mlp": 1.03667128, + "epoch": 0.08387193747181723, + "flos": 20629782835200.0, + "grad_norm": 1.9301640702232858, + "language_loss": 0.82917702, + "learning_rate": 3.969634289062719e-06, + "loss": 0.85107183, + "num_input_tokens_seen": 29683825, + "step": 1395, + "time_per_iteration": 2.742238759994507 + }, + { + "auxiliary_loss_clip": 0.01159169, + "auxiliary_loss_mlp": 0.00749934, + "balance_loss_clip": 1.04829788, + "balance_loss_mlp": 1.00096774, + "epoch": 0.0839320607244852, + "flos": 13443196199040.0, + "grad_norm": 3.334046307398827, + "language_loss": 0.83022773, + "learning_rate": 3.969566643154293e-06, + "loss": 0.84931874, + "num_input_tokens_seen": 29698775, + "step": 1396, + "time_per_iteration": 2.6493775844573975 + }, + { + "auxiliary_loss_clip": 0.01155867, + "auxiliary_loss_mlp": 0.01060137, + "balance_loss_clip": 1.04907155, + "balance_loss_mlp": 1.03439963, + "epoch": 0.08399218397715316, + "flos": 23477247642240.0, + "grad_norm": 2.2789415169015195, + "language_loss": 0.76758003, + "learning_rate": 3.969498922559703e-06, + "loss": 0.78974009, + "num_input_tokens_seen": 29719430, + "step": 1397, + "time_per_iteration": 2.6254489421844482 + }, + { + "auxiliary_loss_clip": 0.01134188, + "auxiliary_loss_mlp": 0.01051737, + "balance_loss_clip": 1.04958844, + "balance_loss_mlp": 1.02596402, + "epoch": 0.08405230722982113, + "flos": 25920717206400.0, + "grad_norm": 2.8209954610725796, + "language_loss": 0.78590345, + "learning_rate": 3.969431127281516e-06, + "loss": 0.80776274, + "num_input_tokens_seen": 29739685, + "step": 1398, + "time_per_iteration": 2.7296667098999023 + }, + { + "auxiliary_loss_clip": 0.01159726, + "auxiliary_loss_mlp": 0.0105332, + "balance_loss_clip": 1.0452981, + "balance_loss_mlp": 1.03034842, + "epoch": 0.0841124304824891, + "flos": 17967437746560.0, + "grad_norm": 2.1518413109019194, + "language_loss": 0.94889522, + "learning_rate": 3.969363257322304e-06, + "loss": 0.97102571, + "num_input_tokens_seen": 29756165, + "step": 1399, + "time_per_iteration": 2.584588050842285 + }, + { + "auxiliary_loss_clip": 0.01136927, + "auxiliary_loss_mlp": 0.01057683, + "balance_loss_clip": 1.04231429, + "balance_loss_mlp": 1.03273249, + "epoch": 0.08417255373515707, + "flos": 25629661301760.0, + "grad_norm": 1.8375443925298427, + "language_loss": 0.81856686, + "learning_rate": 3.96929531268464e-06, + "loss": 0.84051299, + "num_input_tokens_seen": 29776425, + "step": 1400, + "time_per_iteration": 2.7384729385375977 + }, + { + "auxiliary_loss_clip": 0.01138872, + "auxiliary_loss_mlp": 0.01063631, + "balance_loss_clip": 1.0451467, + "balance_loss_mlp": 1.03939581, + "epoch": 0.08423267698782504, + "flos": 26249730808320.0, + "grad_norm": 1.8261478204595951, + "language_loss": 0.8683604, + "learning_rate": 3.969227293371099e-06, + "loss": 0.89038545, + "num_input_tokens_seen": 29796440, + "step": 1401, + "time_per_iteration": 2.7121200561523438 + }, + { + "auxiliary_loss_clip": 0.01167483, + "auxiliary_loss_mlp": 0.01062577, + "balance_loss_clip": 1.04573679, + "balance_loss_mlp": 1.03651786, + "epoch": 0.08429280024049302, + "flos": 20119707751680.0, + "grad_norm": 2.4580785085505408, + "language_loss": 0.87823111, + "learning_rate": 3.969159199384263e-06, + "loss": 0.90053171, + "num_input_tokens_seen": 29814755, + "step": 1402, + "time_per_iteration": 2.5587005615234375 + }, + { + "auxiliary_loss_clip": 0.01109645, + "auxiliary_loss_mlp": 0.00749974, + "balance_loss_clip": 1.03792334, + "balance_loss_mlp": 1.00102425, + "epoch": 0.08435292349316098, + "flos": 42924526836480.0, + "grad_norm": 2.446137547103018, + "language_loss": 0.88660884, + "learning_rate": 3.9690910307267125e-06, + "loss": 0.90520507, + "num_input_tokens_seen": 29834785, + "step": 1403, + "time_per_iteration": 2.8423964977264404 + }, + { + "auxiliary_loss_clip": 0.01139114, + "auxiliary_loss_mlp": 0.01051672, + "balance_loss_clip": 1.0446322, + "balance_loss_mlp": 1.02710271, + "epoch": 0.08441304674582895, + "flos": 22857285876480.0, + "grad_norm": 2.048206671558645, + "language_loss": 0.80142254, + "learning_rate": 3.969022787401033e-06, + "loss": 0.8233304, + "num_input_tokens_seen": 29854695, + "step": 1404, + "time_per_iteration": 2.6360270977020264 + }, + { + "auxiliary_loss_clip": 0.01152046, + "auxiliary_loss_mlp": 0.01066978, + "balance_loss_clip": 1.04819846, + "balance_loss_mlp": 1.0414319, + "epoch": 0.08447316999849692, + "flos": 18697501676160.0, + "grad_norm": 2.351654886308059, + "language_loss": 0.83418381, + "learning_rate": 3.968954469409811e-06, + "loss": 0.85637403, + "num_input_tokens_seen": 29872180, + "step": 1405, + "time_per_iteration": 2.5882818698883057 + }, + { + "auxiliary_loss_clip": 0.01148535, + "auxiliary_loss_mlp": 0.01056806, + "balance_loss_clip": 1.04168344, + "balance_loss_mlp": 1.03317904, + "epoch": 0.08453329325116489, + "flos": 25483971738240.0, + "grad_norm": 1.828658998432534, + "language_loss": 0.79903072, + "learning_rate": 3.968886076755639e-06, + "loss": 0.82108414, + "num_input_tokens_seen": 29893205, + "step": 1406, + "time_per_iteration": 2.613632917404175 + }, + { + "auxiliary_loss_clip": 0.01138978, + "auxiliary_loss_mlp": 0.01067017, + "balance_loss_clip": 1.0444845, + "balance_loss_mlp": 1.04255509, + "epoch": 0.08459341650383286, + "flos": 20920048640640.0, + "grad_norm": 1.8330392149698511, + "language_loss": 0.79294133, + "learning_rate": 3.96881760944111e-06, + "loss": 0.81500131, + "num_input_tokens_seen": 29911970, + "step": 1407, + "time_per_iteration": 2.6343843936920166 + }, + { + "auxiliary_loss_clip": 0.01155834, + "auxiliary_loss_mlp": 0.0105535, + "balance_loss_clip": 1.04603326, + "balance_loss_mlp": 1.03191328, + "epoch": 0.08465353975650082, + "flos": 13043079624960.0, + "grad_norm": 2.08611637018983, + "language_loss": 0.91503167, + "learning_rate": 3.968749067468819e-06, + "loss": 0.93714356, + "num_input_tokens_seen": 29929925, + "step": 1408, + "time_per_iteration": 4.262296438217163 + }, + { + "auxiliary_loss_clip": 0.01031106, + "auxiliary_loss_mlp": 0.01011414, + "balance_loss_clip": 1.00898361, + "balance_loss_mlp": 1.00833797, + "epoch": 0.0847136630091688, + "flos": 60877422552960.0, + "grad_norm": 0.886341748493254, + "language_loss": 0.61761785, + "learning_rate": 3.968680450841368e-06, + "loss": 0.63804305, + "num_input_tokens_seen": 29985950, + "step": 1409, + "time_per_iteration": 3.2593743801116943 + }, + { + "auxiliary_loss_clip": 0.01158113, + "auxiliary_loss_mlp": 0.01058607, + "balance_loss_clip": 1.04551578, + "balance_loss_mlp": 1.03549266, + "epoch": 0.08477378626183676, + "flos": 22046530043520.0, + "grad_norm": 1.897012093825748, + "language_loss": 0.86555982, + "learning_rate": 3.968611759561355e-06, + "loss": 0.88772702, + "num_input_tokens_seen": 30004330, + "step": 1410, + "time_per_iteration": 4.3945910930633545 + }, + { + "auxiliary_loss_clip": 0.01148777, + "auxiliary_loss_mlp": 0.01050972, + "balance_loss_clip": 1.04415369, + "balance_loss_mlp": 1.02491331, + "epoch": 0.08483390951450473, + "flos": 16690059308160.0, + "grad_norm": 2.199089072652248, + "language_loss": 0.74245602, + "learning_rate": 3.968542993631388e-06, + "loss": 0.76445347, + "num_input_tokens_seen": 30022555, + "step": 1411, + "time_per_iteration": 4.316800594329834 + }, + { + "auxiliary_loss_clip": 0.01052184, + "auxiliary_loss_mlp": 0.01003596, + "balance_loss_clip": 1.00804842, + "balance_loss_mlp": 1.00058007, + "epoch": 0.08489403276717271, + "flos": 51584640082560.0, + "grad_norm": 0.9098244327295381, + "language_loss": 0.56803483, + "learning_rate": 3.968474153054073e-06, + "loss": 0.58859259, + "num_input_tokens_seen": 30077220, + "step": 1412, + "time_per_iteration": 3.111741781234741 + }, + { + "auxiliary_loss_clip": 0.01123817, + "auxiliary_loss_mlp": 0.01056804, + "balance_loss_clip": 1.0406847, + "balance_loss_mlp": 1.0326407, + "epoch": 0.08495415601984067, + "flos": 17092330698240.0, + "grad_norm": 2.4695363276017845, + "language_loss": 0.89000863, + "learning_rate": 3.96840523783202e-06, + "loss": 0.91181481, + "num_input_tokens_seen": 30094600, + "step": 1413, + "time_per_iteration": 2.6788628101348877 + }, + { + "auxiliary_loss_clip": 0.01138322, + "auxiliary_loss_mlp": 0.01056444, + "balance_loss_clip": 1.04657269, + "balance_loss_mlp": 1.03191018, + "epoch": 0.08501427927250864, + "flos": 23148413608320.0, + "grad_norm": 2.15655096019245, + "language_loss": 0.88031, + "learning_rate": 3.968336247967844e-06, + "loss": 0.90225768, + "num_input_tokens_seen": 30114475, + "step": 1414, + "time_per_iteration": 2.6882874965667725 + }, + { + "auxiliary_loss_clip": 0.01138105, + "auxiliary_loss_mlp": 0.0105922, + "balance_loss_clip": 1.04382336, + "balance_loss_mlp": 1.03651047, + "epoch": 0.08507440252517662, + "flos": 19063467394560.0, + "grad_norm": 1.620553294482893, + "language_loss": 0.77354074, + "learning_rate": 3.96826718346416e-06, + "loss": 0.79551399, + "num_input_tokens_seen": 30133350, + "step": 1415, + "time_per_iteration": 2.5994198322296143 + }, + { + "auxiliary_loss_clip": 0.01151464, + "auxiliary_loss_mlp": 0.01058101, + "balance_loss_clip": 1.04599953, + "balance_loss_mlp": 1.03653574, + "epoch": 0.08513452577784458, + "flos": 60182296600320.0, + "grad_norm": 2.2316666565156877, + "language_loss": 0.70680815, + "learning_rate": 3.968198044323587e-06, + "loss": 0.72890383, + "num_input_tokens_seen": 30159005, + "step": 1416, + "time_per_iteration": 2.942575216293335 + }, + { + "auxiliary_loss_clip": 0.01145132, + "auxiliary_loss_mlp": 0.01064318, + "balance_loss_clip": 1.04731321, + "balance_loss_mlp": 1.03872347, + "epoch": 0.08519464903051255, + "flos": 27308485117440.0, + "grad_norm": 2.1068116083484076, + "language_loss": 0.74796396, + "learning_rate": 3.968128830548748e-06, + "loss": 0.77005845, + "num_input_tokens_seen": 30179450, + "step": 1417, + "time_per_iteration": 2.6614885330200195 + }, + { + "auxiliary_loss_clip": 0.01139418, + "auxiliary_loss_mlp": 0.01053663, + "balance_loss_clip": 1.04403341, + "balance_loss_mlp": 1.02957129, + "epoch": 0.08525477228318051, + "flos": 20266438809600.0, + "grad_norm": 3.4021224810802435, + "language_loss": 0.82075131, + "learning_rate": 3.968059542142265e-06, + "loss": 0.84268212, + "num_input_tokens_seen": 30197235, + "step": 1418, + "time_per_iteration": 2.646566390991211 + }, + { + "auxiliary_loss_clip": 0.01019966, + "auxiliary_loss_mlp": 0.01017763, + "balance_loss_clip": 1.01296079, + "balance_loss_mlp": 1.01461625, + "epoch": 0.08531489553584849, + "flos": 67615017183360.0, + "grad_norm": 0.8796831362094885, + "language_loss": 0.56587267, + "learning_rate": 3.9679901791067685e-06, + "loss": 0.58624995, + "num_input_tokens_seen": 30257410, + "step": 1419, + "time_per_iteration": 3.172421455383301 + }, + { + "auxiliary_loss_clip": 0.01159659, + "auxiliary_loss_mlp": 0.01060502, + "balance_loss_clip": 1.04345679, + "balance_loss_mlp": 1.03668404, + "epoch": 0.08537501878851646, + "flos": 27526965592320.0, + "grad_norm": 2.031909573023981, + "language_loss": 0.70070976, + "learning_rate": 3.967920741444886e-06, + "loss": 0.72291136, + "num_input_tokens_seen": 30277865, + "step": 1420, + "time_per_iteration": 2.792550802230835 + }, + { + "auxiliary_loss_clip": 0.0111762, + "auxiliary_loss_mlp": 0.01050461, + "balance_loss_clip": 1.03849602, + "balance_loss_mlp": 1.02646446, + "epoch": 0.08543514204118442, + "flos": 22784243569920.0, + "grad_norm": 1.6393484752349667, + "language_loss": 0.87955946, + "learning_rate": 3.967851229159252e-06, + "loss": 0.90124029, + "num_input_tokens_seen": 30298545, + "step": 1421, + "time_per_iteration": 2.6924591064453125 + }, + { + "auxiliary_loss_clip": 0.01049733, + "auxiliary_loss_mlp": 0.01002674, + "balance_loss_clip": 1.00629425, + "balance_loss_mlp": 0.99933642, + "epoch": 0.0854952652938524, + "flos": 60990721027200.0, + "grad_norm": 0.7967240401538714, + "language_loss": 0.63488084, + "learning_rate": 3.967781642252502e-06, + "loss": 0.65540493, + "num_input_tokens_seen": 30361725, + "step": 1422, + "time_per_iteration": 3.194446086883545 + }, + { + "auxiliary_loss_clip": 0.01122175, + "auxiliary_loss_mlp": 0.010594, + "balance_loss_clip": 1.04486823, + "balance_loss_mlp": 1.03638029, + "epoch": 0.08555538854652037, + "flos": 28038046256640.0, + "grad_norm": 1.9728000147563642, + "language_loss": 0.8295486, + "learning_rate": 3.967711980727276e-06, + "loss": 0.85136425, + "num_input_tokens_seen": 30382180, + "step": 1423, + "time_per_iteration": 2.914706230163574 + }, + { + "auxiliary_loss_clip": 0.01115689, + "auxiliary_loss_mlp": 0.01058001, + "balance_loss_clip": 1.03972912, + "balance_loss_mlp": 1.03492165, + "epoch": 0.08561551179918833, + "flos": 23509279595520.0, + "grad_norm": 1.701317422602443, + "language_loss": 0.75025612, + "learning_rate": 3.967642244586213e-06, + "loss": 0.77199304, + "num_input_tokens_seen": 30402980, + "step": 1424, + "time_per_iteration": 2.775325298309326 + }, + { + "auxiliary_loss_clip": 0.01120212, + "auxiliary_loss_mlp": 0.01059906, + "balance_loss_clip": 1.05004787, + "balance_loss_mlp": 1.03621924, + "epoch": 0.08567563505185631, + "flos": 17926930183680.0, + "grad_norm": 1.7771285713678764, + "language_loss": 0.75746489, + "learning_rate": 3.96757243383196e-06, + "loss": 0.779266, + "num_input_tokens_seen": 30420800, + "step": 1425, + "time_per_iteration": 2.9178171157836914 + }, + { + "auxiliary_loss_clip": 0.0115939, + "auxiliary_loss_mlp": 0.01054844, + "balance_loss_clip": 1.04601467, + "balance_loss_mlp": 1.03213513, + "epoch": 0.08573575830452428, + "flos": 19719519350400.0, + "grad_norm": 2.081379408023289, + "language_loss": 0.93026495, + "learning_rate": 3.9675025484671624e-06, + "loss": 0.95240736, + "num_input_tokens_seen": 30439620, + "step": 1426, + "time_per_iteration": 2.784634828567505 + }, + { + "auxiliary_loss_clip": 0.01107677, + "auxiliary_loss_mlp": 0.01065371, + "balance_loss_clip": 1.04103279, + "balance_loss_mlp": 1.03949118, + "epoch": 0.08579588155719224, + "flos": 17931563038080.0, + "grad_norm": 1.9910534929732115, + "language_loss": 0.75348979, + "learning_rate": 3.967432588494471e-06, + "loss": 0.77522027, + "num_input_tokens_seen": 30457300, + "step": 1427, + "time_per_iteration": 2.9182839393615723 + }, + { + "auxiliary_loss_clip": 0.01160142, + "auxiliary_loss_mlp": 0.0106009, + "balance_loss_clip": 1.04720652, + "balance_loss_mlp": 1.03760731, + "epoch": 0.08585600480986022, + "flos": 16033324993920.0, + "grad_norm": 2.799394989551892, + "language_loss": 0.8186202, + "learning_rate": 3.96736255391654e-06, + "loss": 0.84082258, + "num_input_tokens_seen": 30471580, + "step": 1428, + "time_per_iteration": 2.611084222793579 + }, + { + "auxiliary_loss_clip": 0.0114082, + "auxiliary_loss_mlp": 0.01067275, + "balance_loss_clip": 1.04533792, + "balance_loss_mlp": 1.0428369, + "epoch": 0.08591612806252819, + "flos": 28657433404800.0, + "grad_norm": 2.0935594979730445, + "language_loss": 0.80177033, + "learning_rate": 3.967292444736023e-06, + "loss": 0.82385129, + "num_input_tokens_seen": 30492720, + "step": 1429, + "time_per_iteration": 2.7187106609344482 + }, + { + "auxiliary_loss_clip": 0.01140919, + "auxiliary_loss_mlp": 0.01060602, + "balance_loss_clip": 1.04833412, + "balance_loss_mlp": 1.03759432, + "epoch": 0.08597625131519615, + "flos": 20959119659520.0, + "grad_norm": 1.970447748130672, + "language_loss": 0.88113308, + "learning_rate": 3.967222260955578e-06, + "loss": 0.90314823, + "num_input_tokens_seen": 30509535, + "step": 1430, + "time_per_iteration": 2.6149227619171143 + }, + { + "auxiliary_loss_clip": 0.01110713, + "auxiliary_loss_mlp": 0.01070716, + "balance_loss_clip": 1.0444479, + "balance_loss_mlp": 1.04676652, + "epoch": 0.08603637456786412, + "flos": 23256360956160.0, + "grad_norm": 1.6164168776285914, + "language_loss": 0.81909227, + "learning_rate": 3.96715200257787e-06, + "loss": 0.84090656, + "num_input_tokens_seen": 30529490, + "step": 1431, + "time_per_iteration": 2.6792867183685303 + }, + { + "auxiliary_loss_clip": 0.01127734, + "auxiliary_loss_mlp": 0.01061226, + "balance_loss_clip": 1.04815102, + "balance_loss_mlp": 1.03693104, + "epoch": 0.0860964978205321, + "flos": 28694170039680.0, + "grad_norm": 1.6912011895661867, + "language_loss": 0.7798537, + "learning_rate": 3.967081669605559e-06, + "loss": 0.80174327, + "num_input_tokens_seen": 30550205, + "step": 1432, + "time_per_iteration": 2.743518352508545 + }, + { + "auxiliary_loss_clip": 0.01135381, + "auxiliary_loss_mlp": 0.01064153, + "balance_loss_clip": 1.04077053, + "balance_loss_mlp": 1.0391314, + "epoch": 0.08615662107320006, + "flos": 19318397195520.0, + "grad_norm": 1.9701339669960647, + "language_loss": 0.73054785, + "learning_rate": 3.967011262041315e-06, + "loss": 0.75254321, + "num_input_tokens_seen": 30568830, + "step": 1433, + "time_per_iteration": 2.718247890472412 + }, + { + "auxiliary_loss_clip": 0.01118726, + "auxiliary_loss_mlp": 0.00749967, + "balance_loss_clip": 1.0417428, + "balance_loss_mlp": 1.00112498, + "epoch": 0.08621674432586802, + "flos": 15851688894720.0, + "grad_norm": 2.7603260600672503, + "language_loss": 0.8605144, + "learning_rate": 3.9669407798878065e-06, + "loss": 0.87920129, + "num_input_tokens_seen": 30585730, + "step": 1434, + "time_per_iteration": 2.7045376300811768 + }, + { + "auxiliary_loss_clip": 0.01137975, + "auxiliary_loss_mlp": 0.01059637, + "balance_loss_clip": 1.04248869, + "balance_loss_mlp": 1.03593826, + "epoch": 0.086276867578536, + "flos": 14100648785280.0, + "grad_norm": 2.529392799521887, + "language_loss": 0.79329538, + "learning_rate": 3.966870223147707e-06, + "loss": 0.8152715, + "num_input_tokens_seen": 30603180, + "step": 1435, + "time_per_iteration": 2.650176763534546 + }, + { + "auxiliary_loss_clip": 0.01015839, + "auxiliary_loss_mlp": 0.01027603, + "balance_loss_clip": 1.00481653, + "balance_loss_mlp": 1.02431321, + "epoch": 0.08633699083120397, + "flos": 70184857772160.0, + "grad_norm": 0.8891290488585697, + "language_loss": 0.57945561, + "learning_rate": 3.96679959182369e-06, + "loss": 0.59988999, + "num_input_tokens_seen": 30668895, + "step": 1436, + "time_per_iteration": 3.420365571975708 + }, + { + "auxiliary_loss_clip": 0.01135943, + "auxiliary_loss_mlp": 0.01059355, + "balance_loss_clip": 1.04082513, + "balance_loss_mlp": 1.03459537, + "epoch": 0.08639711408387193, + "flos": 30298874140800.0, + "grad_norm": 3.0811688816437512, + "language_loss": 0.69150591, + "learning_rate": 3.966728885918437e-06, + "loss": 0.7134589, + "num_input_tokens_seen": 30688955, + "step": 1437, + "time_per_iteration": 2.7230868339538574 + }, + { + "auxiliary_loss_clip": 0.01084633, + "auxiliary_loss_mlp": 0.0105655, + "balance_loss_clip": 1.03841329, + "balance_loss_mlp": 1.03324437, + "epoch": 0.08645723733653991, + "flos": 20297680663680.0, + "grad_norm": 2.0246059519441832, + "language_loss": 0.72881722, + "learning_rate": 3.966658105434627e-06, + "loss": 0.750229, + "num_input_tokens_seen": 30706095, + "step": 1438, + "time_per_iteration": 2.7951667308807373 + }, + { + "auxiliary_loss_clip": 0.01145452, + "auxiliary_loss_mlp": 0.01052378, + "balance_loss_clip": 1.04518163, + "balance_loss_mlp": 1.02879882, + "epoch": 0.08651736058920788, + "flos": 32890583134080.0, + "grad_norm": 1.5562047171195605, + "language_loss": 0.64375693, + "learning_rate": 3.966587250374945e-06, + "loss": 0.66573524, + "num_input_tokens_seen": 30729025, + "step": 1439, + "time_per_iteration": 2.7528984546661377 + }, + { + "auxiliary_loss_clip": 0.01122809, + "auxiliary_loss_mlp": 0.01058617, + "balance_loss_clip": 1.04400694, + "balance_loss_mlp": 1.03411937, + "epoch": 0.08657748384187584, + "flos": 22637368857600.0, + "grad_norm": 1.9954333948437821, + "language_loss": 0.87278759, + "learning_rate": 3.966516320742077e-06, + "loss": 0.89460188, + "num_input_tokens_seen": 30746155, + "step": 1440, + "time_per_iteration": 2.882694721221924 + }, + { + "auxiliary_loss_clip": 0.01124756, + "auxiliary_loss_mlp": 0.00750042, + "balance_loss_clip": 1.04349709, + "balance_loss_mlp": 1.00123811, + "epoch": 0.08663760709454381, + "flos": 23658380951040.0, + "grad_norm": 2.8272600947052813, + "language_loss": 0.83364308, + "learning_rate": 3.9664453165387124e-06, + "loss": 0.85239106, + "num_input_tokens_seen": 30761410, + "step": 1441, + "time_per_iteration": 2.857685089111328 + }, + { + "auxiliary_loss_clip": 0.01049231, + "auxiliary_loss_mlp": 0.0101317, + "balance_loss_clip": 1.00641155, + "balance_loss_mlp": 1.01024902, + "epoch": 0.08669773034721179, + "flos": 62686564911360.0, + "grad_norm": 0.84886823431073, + "language_loss": 0.6045599, + "learning_rate": 3.966374237767545e-06, + "loss": 0.62518394, + "num_input_tokens_seen": 30823010, + "step": 1442, + "time_per_iteration": 3.254646062850952 + }, + { + "auxiliary_loss_clip": 0.01137009, + "auxiliary_loss_mlp": 0.01055703, + "balance_loss_clip": 1.04445314, + "balance_loss_mlp": 1.03170657, + "epoch": 0.08675785359987975, + "flos": 20667489137280.0, + "grad_norm": 2.0956649707266113, + "language_loss": 0.79151547, + "learning_rate": 3.96630308443127e-06, + "loss": 0.81344259, + "num_input_tokens_seen": 30841980, + "step": 1443, + "time_per_iteration": 2.754532814025879 + }, + { + "auxiliary_loss_clip": 0.01146836, + "auxiliary_loss_mlp": 0.0105243, + "balance_loss_clip": 1.04207361, + "balance_loss_mlp": 1.02910042, + "epoch": 0.08681797685254772, + "flos": 26941118768640.0, + "grad_norm": 1.9324854693018754, + "language_loss": 0.82258558, + "learning_rate": 3.966231856532584e-06, + "loss": 0.84457827, + "num_input_tokens_seen": 30863280, + "step": 1444, + "time_per_iteration": 2.713210344314575 + }, + { + "auxiliary_loss_clip": 0.01163706, + "auxiliary_loss_mlp": 0.0105273, + "balance_loss_clip": 1.04558551, + "balance_loss_mlp": 1.02956724, + "epoch": 0.0868781001052157, + "flos": 17712831168000.0, + "grad_norm": 2.381336733065158, + "language_loss": 0.87417132, + "learning_rate": 3.966160554074189e-06, + "loss": 0.8963356, + "num_input_tokens_seen": 30881710, + "step": 1445, + "time_per_iteration": 2.5504486560821533 + }, + { + "auxiliary_loss_clip": 0.0115024, + "auxiliary_loss_mlp": 0.01056805, + "balance_loss_clip": 1.04821014, + "balance_loss_mlp": 1.0345006, + "epoch": 0.08693822335788366, + "flos": 19896522595200.0, + "grad_norm": 1.8134911235191398, + "language_loss": 0.82104373, + "learning_rate": 3.96608917705879e-06, + "loss": 0.8431142, + "num_input_tokens_seen": 30900225, + "step": 1446, + "time_per_iteration": 2.7651593685150146 + }, + { + "auxiliary_loss_clip": 0.01034436, + "auxiliary_loss_mlp": 0.01016093, + "balance_loss_clip": 1.00624526, + "balance_loss_mlp": 1.01322031, + "epoch": 0.08699834661055163, + "flos": 67023747406080.0, + "grad_norm": 0.731684386138108, + "language_loss": 0.5479157, + "learning_rate": 3.966017725489091e-06, + "loss": 0.56842101, + "num_input_tokens_seen": 30959580, + "step": 1447, + "time_per_iteration": 3.2805349826812744 + }, + { + "auxiliary_loss_clip": 0.01117624, + "auxiliary_loss_mlp": 0.01055116, + "balance_loss_clip": 1.04168689, + "balance_loss_mlp": 1.03311038, + "epoch": 0.0870584698632196, + "flos": 13480507451520.0, + "grad_norm": 2.342744207762734, + "language_loss": 0.83965909, + "learning_rate": 3.965946199367804e-06, + "loss": 0.86138654, + "num_input_tokens_seen": 30976775, + "step": 1448, + "time_per_iteration": 2.7726683616638184 + }, + { + "auxiliary_loss_clip": 0.01162117, + "auxiliary_loss_mlp": 0.01056143, + "balance_loss_clip": 1.04563355, + "balance_loss_mlp": 1.03354084, + "epoch": 0.08711859311588757, + "flos": 16107013745280.0, + "grad_norm": 3.027327774690064, + "language_loss": 0.80329424, + "learning_rate": 3.965874598697638e-06, + "loss": 0.82547688, + "num_input_tokens_seen": 30990495, + "step": 1449, + "time_per_iteration": 2.903334856033325 + }, + { + "auxiliary_loss_clip": 0.01113214, + "auxiliary_loss_mlp": 0.01056329, + "balance_loss_clip": 1.04619181, + "balance_loss_mlp": 1.03297639, + "epoch": 0.08717871636855554, + "flos": 38472357928320.0, + "grad_norm": 1.5822152434278904, + "language_loss": 0.70772707, + "learning_rate": 3.965802923481313e-06, + "loss": 0.72942251, + "num_input_tokens_seen": 31014080, + "step": 1450, + "time_per_iteration": 2.88375186920166 + }, + { + "auxiliary_loss_clip": 0.01095211, + "auxiliary_loss_mlp": 0.01059703, + "balance_loss_clip": 1.04340124, + "balance_loss_mlp": 1.03642166, + "epoch": 0.0872388396212235, + "flos": 17600574188160.0, + "grad_norm": 1.762066925861579, + "language_loss": 0.83541143, + "learning_rate": 3.965731173721542e-06, + "loss": 0.85696054, + "num_input_tokens_seen": 31031210, + "step": 1451, + "time_per_iteration": 2.6922197341918945 + }, + { + "auxiliary_loss_clip": 0.0110596, + "auxiliary_loss_mlp": 0.00750016, + "balance_loss_clip": 1.04110193, + "balance_loss_mlp": 1.00128937, + "epoch": 0.08729896287389148, + "flos": 25259385951360.0, + "grad_norm": 2.2839411372883798, + "language_loss": 0.7494002, + "learning_rate": 3.965659349421049e-06, + "loss": 0.76795995, + "num_input_tokens_seen": 31049710, + "step": 1452, + "time_per_iteration": 2.9885966777801514 + }, + { + "auxiliary_loss_clip": 0.01121411, + "auxiliary_loss_mlp": 0.010676, + "balance_loss_clip": 1.04249203, + "balance_loss_mlp": 1.04355526, + "epoch": 0.08735908612655945, + "flos": 15632454234240.0, + "grad_norm": 3.256068031701224, + "language_loss": 0.79903471, + "learning_rate": 3.965587450582556e-06, + "loss": 0.82092482, + "num_input_tokens_seen": 31066160, + "step": 1453, + "time_per_iteration": 2.898988723754883 + }, + { + "auxiliary_loss_clip": 0.01138889, + "auxiliary_loss_mlp": 0.0106221, + "balance_loss_clip": 1.047364, + "balance_loss_mlp": 1.03844011, + "epoch": 0.08741920937922741, + "flos": 20339660684160.0, + "grad_norm": 1.9219678622633571, + "language_loss": 0.71088403, + "learning_rate": 3.96551547720879e-06, + "loss": 0.73289508, + "num_input_tokens_seen": 31085270, + "step": 1454, + "time_per_iteration": 2.840202569961548 + }, + { + "auxiliary_loss_clip": 0.01039422, + "auxiliary_loss_mlp": 0.01024687, + "balance_loss_clip": 1.00667477, + "balance_loss_mlp": 1.02139711, + "epoch": 0.08747933263189539, + "flos": 62819795433600.0, + "grad_norm": 0.7965558129635844, + "language_loss": 0.58595347, + "learning_rate": 3.96544342930248e-06, + "loss": 0.60659456, + "num_input_tokens_seen": 31148445, + "step": 1455, + "time_per_iteration": 4.870271682739258 + }, + { + "auxiliary_loss_clip": 0.01159036, + "auxiliary_loss_mlp": 0.01059079, + "balance_loss_clip": 1.04555392, + "balance_loss_mlp": 1.03586864, + "epoch": 0.08753945588456336, + "flos": 33035877648000.0, + "grad_norm": 2.2677660419450687, + "language_loss": 0.7720499, + "learning_rate": 3.965371306866359e-06, + "loss": 0.79423106, + "num_input_tokens_seen": 31168770, + "step": 1456, + "time_per_iteration": 2.838653087615967 + }, + { + "auxiliary_loss_clip": 0.01091648, + "auxiliary_loss_mlp": 0.01054389, + "balance_loss_clip": 1.03759789, + "balance_loss_mlp": 1.03108406, + "epoch": 0.08759957913723132, + "flos": 35547182046720.0, + "grad_norm": 2.0347526756833387, + "language_loss": 0.72200012, + "learning_rate": 3.96529910990316e-06, + "loss": 0.74346042, + "num_input_tokens_seen": 31189270, + "step": 1457, + "time_per_iteration": 4.445201396942139 + }, + { + "auxiliary_loss_clip": 0.01142116, + "auxiliary_loss_mlp": 0.01048447, + "balance_loss_clip": 1.04231739, + "balance_loss_mlp": 1.0264051, + "epoch": 0.0876597023898993, + "flos": 23911120022400.0, + "grad_norm": 1.5306362896260202, + "language_loss": 0.86607027, + "learning_rate": 3.965226838415622e-06, + "loss": 0.88797581, + "num_input_tokens_seen": 31210385, + "step": 1458, + "time_per_iteration": 4.311153888702393 + }, + { + "auxiliary_loss_clip": 0.01137659, + "auxiliary_loss_mlp": 0.01062303, + "balance_loss_clip": 1.04599667, + "balance_loss_mlp": 1.03927171, + "epoch": 0.08771982564256726, + "flos": 18114025150080.0, + "grad_norm": 1.5525247222832195, + "language_loss": 0.80718756, + "learning_rate": 3.965154492406486e-06, + "loss": 0.82918715, + "num_input_tokens_seen": 31229745, + "step": 1459, + "time_per_iteration": 4.337510347366333 + }, + { + "auxiliary_loss_clip": 0.01105735, + "auxiliary_loss_mlp": 0.01056083, + "balance_loss_clip": 1.04374814, + "balance_loss_mlp": 1.03202677, + "epoch": 0.08777994889523523, + "flos": 17712005155200.0, + "grad_norm": 3.1577651494210732, + "language_loss": 0.84370309, + "learning_rate": 3.9650820718784945e-06, + "loss": 0.86532128, + "num_input_tokens_seen": 31248280, + "step": 1460, + "time_per_iteration": 2.704598903656006 + }, + { + "auxiliary_loss_clip": 0.01143515, + "auxiliary_loss_mlp": 0.0105761, + "balance_loss_clip": 1.04238987, + "balance_loss_mlp": 1.03571081, + "epoch": 0.0878400721479032, + "flos": 12819930382080.0, + "grad_norm": 4.3445463577238215, + "language_loss": 0.80606705, + "learning_rate": 3.965009576834394e-06, + "loss": 0.82807833, + "num_input_tokens_seen": 31262190, + "step": 1461, + "time_per_iteration": 2.5618467330932617 + }, + { + "auxiliary_loss_clip": 0.01136252, + "auxiliary_loss_mlp": 0.01060061, + "balance_loss_clip": 1.04592693, + "balance_loss_mlp": 1.03751826, + "epoch": 0.08790019540057117, + "flos": 26392690938240.0, + "grad_norm": 1.639155136703702, + "language_loss": 0.76136887, + "learning_rate": 3.964937007276932e-06, + "loss": 0.78333199, + "num_input_tokens_seen": 31283690, + "step": 1462, + "time_per_iteration": 2.665557861328125 + }, + { + "auxiliary_loss_clip": 0.01141587, + "auxiliary_loss_mlp": 0.01058821, + "balance_loss_clip": 1.04539108, + "balance_loss_mlp": 1.03431177, + "epoch": 0.08796031865323914, + "flos": 19134031662720.0, + "grad_norm": 2.052915297780736, + "language_loss": 0.74337232, + "learning_rate": 3.9648643632088634e-06, + "loss": 0.76537639, + "num_input_tokens_seen": 31302505, + "step": 1463, + "time_per_iteration": 2.6081061363220215 + }, + { + "auxiliary_loss_clip": 0.01157749, + "auxiliary_loss_mlp": 0.01062167, + "balance_loss_clip": 1.04750299, + "balance_loss_mlp": 1.03726387, + "epoch": 0.0880204419059071, + "flos": 26064287867520.0, + "grad_norm": 2.678664586597327, + "language_loss": 0.83442187, + "learning_rate": 3.964791644632941e-06, + "loss": 0.85662103, + "num_input_tokens_seen": 31323070, + "step": 1464, + "time_per_iteration": 2.7300353050231934 + }, + { + "auxiliary_loss_clip": 0.01136637, + "auxiliary_loss_mlp": 0.01068097, + "balance_loss_clip": 1.04231942, + "balance_loss_mlp": 1.04507804, + "epoch": 0.08808056515857508, + "flos": 22377842115840.0, + "grad_norm": 2.224629946095085, + "language_loss": 0.78244877, + "learning_rate": 3.964718851551923e-06, + "loss": 0.80449617, + "num_input_tokens_seen": 31341880, + "step": 1465, + "time_per_iteration": 2.630331039428711 + }, + { + "auxiliary_loss_clip": 0.01167675, + "auxiliary_loss_mlp": 0.01058559, + "balance_loss_clip": 1.04853249, + "balance_loss_mlp": 1.03625488, + "epoch": 0.08814068841124305, + "flos": 23185293897600.0, + "grad_norm": 2.004165409149157, + "language_loss": 0.85107648, + "learning_rate": 3.9646459839685675e-06, + "loss": 0.87333882, + "num_input_tokens_seen": 31361995, + "step": 1466, + "time_per_iteration": 2.616316318511963 + }, + { + "auxiliary_loss_clip": 0.01081479, + "auxiliary_loss_mlp": 0.00750138, + "balance_loss_clip": 1.03698754, + "balance_loss_mlp": 1.00145459, + "epoch": 0.08820081166391101, + "flos": 25155281358720.0, + "grad_norm": 6.292755219981434, + "language_loss": 0.83901954, + "learning_rate": 3.964573041885641e-06, + "loss": 0.85733569, + "num_input_tokens_seen": 31381515, + "step": 1467, + "time_per_iteration": 2.719799518585205 + }, + { + "auxiliary_loss_clip": 0.0114706, + "auxiliary_loss_mlp": 0.01055402, + "balance_loss_clip": 1.04602242, + "balance_loss_mlp": 1.03121448, + "epoch": 0.08826093491657899, + "flos": 22231685675520.0, + "grad_norm": 1.7305500827906979, + "language_loss": 0.76001924, + "learning_rate": 3.964500025305907e-06, + "loss": 0.78204387, + "num_input_tokens_seen": 31400345, + "step": 1468, + "time_per_iteration": 2.627115249633789 + }, + { + "auxiliary_loss_clip": 0.01145475, + "auxiliary_loss_mlp": 0.01054638, + "balance_loss_clip": 1.04598165, + "balance_loss_mlp": 1.0326314, + "epoch": 0.08832105816924696, + "flos": 22126826897280.0, + "grad_norm": 1.6584025051636482, + "language_loss": 0.804708, + "learning_rate": 3.9644269342321355e-06, + "loss": 0.82670915, + "num_input_tokens_seen": 31419620, + "step": 1469, + "time_per_iteration": 2.5912137031555176 + }, + { + "auxiliary_loss_clip": 0.01162133, + "auxiliary_loss_mlp": 0.0105662, + "balance_loss_clip": 1.04548538, + "balance_loss_mlp": 1.03323162, + "epoch": 0.08838118142191492, + "flos": 17566495159680.0, + "grad_norm": 2.61391675903299, + "language_loss": 0.77769953, + "learning_rate": 3.9643537686670974e-06, + "loss": 0.79988706, + "num_input_tokens_seen": 31437970, + "step": 1470, + "time_per_iteration": 2.5716004371643066 + }, + { + "auxiliary_loss_clip": 0.01156923, + "auxiliary_loss_mlp": 0.01062153, + "balance_loss_clip": 1.04573834, + "balance_loss_mlp": 1.03814435, + "epoch": 0.0884413046745829, + "flos": 20777196251520.0, + "grad_norm": 2.894524325458132, + "language_loss": 0.8444038, + "learning_rate": 3.964280528613569e-06, + "loss": 0.86659455, + "num_input_tokens_seen": 31457040, + "step": 1471, + "time_per_iteration": 2.643655776977539 + }, + { + "auxiliary_loss_clip": 0.01118475, + "auxiliary_loss_mlp": 0.01053191, + "balance_loss_clip": 1.04293394, + "balance_loss_mlp": 1.03198385, + "epoch": 0.08850142792725087, + "flos": 22125462180480.0, + "grad_norm": 2.1736914584755076, + "language_loss": 0.83643043, + "learning_rate": 3.964207214074324e-06, + "loss": 0.85814708, + "num_input_tokens_seen": 31477520, + "step": 1472, + "time_per_iteration": 2.7915642261505127 + }, + { + "auxiliary_loss_clip": 0.01139409, + "auxiliary_loss_mlp": 0.01056232, + "balance_loss_clip": 1.04603636, + "balance_loss_mlp": 1.03233051, + "epoch": 0.08856155117991883, + "flos": 22418744728320.0, + "grad_norm": 2.2540477777790855, + "language_loss": 0.83038032, + "learning_rate": 3.964133825052146e-06, + "loss": 0.85233676, + "num_input_tokens_seen": 31495575, + "step": 1473, + "time_per_iteration": 2.7035741806030273 + }, + { + "auxiliary_loss_clip": 0.01095492, + "auxiliary_loss_mlp": 0.01060469, + "balance_loss_clip": 1.04079628, + "balance_loss_mlp": 1.03861856, + "epoch": 0.0886216744325868, + "flos": 29937002572800.0, + "grad_norm": 1.5880627176716344, + "language_loss": 0.78837651, + "learning_rate": 3.964060361549816e-06, + "loss": 0.80993617, + "num_input_tokens_seen": 31520020, + "step": 1474, + "time_per_iteration": 2.870394468307495 + }, + { + "auxiliary_loss_clip": 0.01107231, + "auxiliary_loss_mlp": 0.01067773, + "balance_loss_clip": 1.04187286, + "balance_loss_mlp": 1.04253626, + "epoch": 0.08868179768525478, + "flos": 23982833525760.0, + "grad_norm": 1.9147715534408556, + "language_loss": 0.79162377, + "learning_rate": 3.963986823570121e-06, + "loss": 0.8133738, + "num_input_tokens_seen": 31539265, + "step": 1475, + "time_per_iteration": 2.7466118335723877 + }, + { + "auxiliary_loss_clip": 0.01159026, + "auxiliary_loss_mlp": 0.01055394, + "balance_loss_clip": 1.04616749, + "balance_loss_mlp": 1.03229165, + "epoch": 0.08874192093792274, + "flos": 43177553216640.0, + "grad_norm": 1.6040090887845109, + "language_loss": 0.7429226, + "learning_rate": 3.963913211115848e-06, + "loss": 0.76506674, + "num_input_tokens_seen": 31563425, + "step": 1476, + "time_per_iteration": 2.842087984085083 + }, + { + "auxiliary_loss_clip": 0.01143078, + "auxiliary_loss_mlp": 0.01066443, + "balance_loss_clip": 1.04825342, + "balance_loss_mlp": 1.04224372, + "epoch": 0.0888020441905907, + "flos": 32852445868800.0, + "grad_norm": 1.4837204786756375, + "language_loss": 0.74765295, + "learning_rate": 3.9638395241897895e-06, + "loss": 0.76974821, + "num_input_tokens_seen": 31584525, + "step": 1477, + "time_per_iteration": 2.7606663703918457 + }, + { + "auxiliary_loss_clip": 0.01159268, + "auxiliary_loss_mlp": 0.01054748, + "balance_loss_clip": 1.04707372, + "balance_loss_mlp": 1.03089452, + "epoch": 0.08886216744325869, + "flos": 23149347361920.0, + "grad_norm": 1.76659869097056, + "language_loss": 0.87061, + "learning_rate": 3.963765762794739e-06, + "loss": 0.89275014, + "num_input_tokens_seen": 31603325, + "step": 1478, + "time_per_iteration": 2.6212587356567383 + }, + { + "auxiliary_loss_clip": 0.01140268, + "auxiliary_loss_mlp": 0.01059316, + "balance_loss_clip": 1.04320967, + "balance_loss_mlp": 1.03704798, + "epoch": 0.08892229069592665, + "flos": 23331593992320.0, + "grad_norm": 1.5696712467232925, + "language_loss": 0.77715838, + "learning_rate": 3.963691926933495e-06, + "loss": 0.79915422, + "num_input_tokens_seen": 31624820, + "step": 1479, + "time_per_iteration": 2.670985698699951 + }, + { + "auxiliary_loss_clip": 0.01127292, + "auxiliary_loss_mlp": 0.01056734, + "balance_loss_clip": 1.04210317, + "balance_loss_mlp": 1.03267717, + "epoch": 0.08898241394859462, + "flos": 26213784272640.0, + "grad_norm": 2.9730804303860707, + "language_loss": 0.77983856, + "learning_rate": 3.9636180166088555e-06, + "loss": 0.80167878, + "num_input_tokens_seen": 31646080, + "step": 1480, + "time_per_iteration": 2.730189800262451 + }, + { + "auxiliary_loss_clip": 0.01146726, + "auxiliary_loss_mlp": 0.01063308, + "balance_loss_clip": 1.04404736, + "balance_loss_mlp": 1.03910875, + "epoch": 0.0890425372012626, + "flos": 23550613171200.0, + "grad_norm": 2.0518196592403997, + "language_loss": 0.6660918, + "learning_rate": 3.963544031823624e-06, + "loss": 0.68819213, + "num_input_tokens_seen": 31665770, + "step": 1481, + "time_per_iteration": 2.6255557537078857 + }, + { + "auxiliary_loss_clip": 0.01105504, + "auxiliary_loss_mlp": 0.01052635, + "balance_loss_clip": 1.04111767, + "balance_loss_mlp": 1.03006852, + "epoch": 0.08910266045393056, + "flos": 23002795872000.0, + "grad_norm": 2.291059845914, + "language_loss": 0.96637696, + "learning_rate": 3.9634699725806065e-06, + "loss": 0.98795831, + "num_input_tokens_seen": 31683805, + "step": 1482, + "time_per_iteration": 2.7154886722564697 + }, + { + "auxiliary_loss_clip": 0.01125652, + "auxiliary_loss_mlp": 0.01058969, + "balance_loss_clip": 1.04749, + "balance_loss_mlp": 1.03482914, + "epoch": 0.08916278370659853, + "flos": 31936508035200.0, + "grad_norm": 1.9010116319179133, + "language_loss": 0.79007769, + "learning_rate": 3.96339583888261e-06, + "loss": 0.81192386, + "num_input_tokens_seen": 31704630, + "step": 1483, + "time_per_iteration": 2.714120626449585 + }, + { + "auxiliary_loss_clip": 0.01134249, + "auxiliary_loss_mlp": 0.01081966, + "balance_loss_clip": 1.04382861, + "balance_loss_mlp": 1.05732584, + "epoch": 0.08922290695926649, + "flos": 17530404969600.0, + "grad_norm": 2.2621183242371665, + "language_loss": 0.85681164, + "learning_rate": 3.963321630732448e-06, + "loss": 0.87897384, + "num_input_tokens_seen": 31723255, + "step": 1484, + "time_per_iteration": 2.597705602645874 + }, + { + "auxiliary_loss_clip": 0.01167604, + "auxiliary_loss_mlp": 0.01068157, + "balance_loss_clip": 1.05051148, + "balance_loss_mlp": 1.04396963, + "epoch": 0.08928303021193447, + "flos": 32125075459200.0, + "grad_norm": 1.8309646159997044, + "language_loss": 0.8032068, + "learning_rate": 3.963247348132932e-06, + "loss": 0.82556438, + "num_input_tokens_seen": 31747045, + "step": 1485, + "time_per_iteration": 2.7138636112213135 + }, + { + "auxiliary_loss_clip": 0.0115013, + "auxiliary_loss_mlp": 0.01066921, + "balance_loss_clip": 1.04847956, + "balance_loss_mlp": 1.04275715, + "epoch": 0.08934315346460243, + "flos": 22125210785280.0, + "grad_norm": 2.507667299445726, + "language_loss": 0.82938534, + "learning_rate": 3.96317299108688e-06, + "loss": 0.85155588, + "num_input_tokens_seen": 31766615, + "step": 1486, + "time_per_iteration": 2.664694309234619 + }, + { + "auxiliary_loss_clip": 0.01123216, + "auxiliary_loss_mlp": 0.01063578, + "balance_loss_clip": 1.04927027, + "balance_loss_mlp": 1.03928304, + "epoch": 0.0894032767172704, + "flos": 22565583527040.0, + "grad_norm": 1.9183450362950003, + "language_loss": 0.76452935, + "learning_rate": 3.963098559597111e-06, + "loss": 0.78639734, + "num_input_tokens_seen": 31785855, + "step": 1487, + "time_per_iteration": 2.786066770553589 + }, + { + "auxiliary_loss_clip": 0.01118808, + "auxiliary_loss_mlp": 0.01061955, + "balance_loss_clip": 1.03800464, + "balance_loss_mlp": 1.03714716, + "epoch": 0.08946339996993838, + "flos": 20193396503040.0, + "grad_norm": 2.060299613529554, + "language_loss": 0.82705069, + "learning_rate": 3.963024053666449e-06, + "loss": 0.8488583, + "num_input_tokens_seen": 31804210, + "step": 1488, + "time_per_iteration": 2.6299643516540527 + }, + { + "auxiliary_loss_clip": 0.01136353, + "auxiliary_loss_mlp": 0.01051312, + "balance_loss_clip": 1.04239678, + "balance_loss_mlp": 1.02929354, + "epoch": 0.08952352322260634, + "flos": 48360181104000.0, + "grad_norm": 2.9655332154068677, + "language_loss": 0.7145505, + "learning_rate": 3.962949473297718e-06, + "loss": 0.73642713, + "num_input_tokens_seen": 31826150, + "step": 1489, + "time_per_iteration": 2.8699090480804443 + }, + { + "auxiliary_loss_clip": 0.01117994, + "auxiliary_loss_mlp": 0.0105318, + "balance_loss_clip": 1.03958058, + "balance_loss_mlp": 1.02996981, + "epoch": 0.08958364647527431, + "flos": 31793081028480.0, + "grad_norm": 1.8050488844094785, + "language_loss": 0.89937252, + "learning_rate": 3.962874818493745e-06, + "loss": 0.92108428, + "num_input_tokens_seen": 31848060, + "step": 1490, + "time_per_iteration": 2.7622263431549072 + }, + { + "auxiliary_loss_clip": 0.01153794, + "auxiliary_loss_mlp": 0.01067186, + "balance_loss_clip": 1.04428124, + "balance_loss_mlp": 1.04421413, + "epoch": 0.08964376972794229, + "flos": 23368186972800.0, + "grad_norm": 2.3510398416760863, + "language_loss": 0.73236179, + "learning_rate": 3.9628000892573635e-06, + "loss": 0.75457162, + "num_input_tokens_seen": 31870040, + "step": 1491, + "time_per_iteration": 2.67681622505188 + }, + { + "auxiliary_loss_clip": 0.01159779, + "auxiliary_loss_mlp": 0.00749922, + "balance_loss_clip": 1.04842138, + "balance_loss_mlp": 1.00123119, + "epoch": 0.08970389298061025, + "flos": 23294785530240.0, + "grad_norm": 1.7157274014430333, + "language_loss": 0.77191126, + "learning_rate": 3.9627252855914055e-06, + "loss": 0.79100823, + "num_input_tokens_seen": 31890400, + "step": 1492, + "time_per_iteration": 2.59633469581604 + }, + { + "auxiliary_loss_clip": 0.01158258, + "auxiliary_loss_mlp": 0.01060151, + "balance_loss_clip": 1.04816246, + "balance_loss_mlp": 1.03674984, + "epoch": 0.08976401623327822, + "flos": 33761703772800.0, + "grad_norm": 2.2607856650793776, + "language_loss": 0.70948327, + "learning_rate": 3.962650407498707e-06, + "loss": 0.73166734, + "num_input_tokens_seen": 31913435, + "step": 1493, + "time_per_iteration": 2.709929943084717 + }, + { + "auxiliary_loss_clip": 0.01159192, + "auxiliary_loss_mlp": 0.01058577, + "balance_loss_clip": 1.04744673, + "balance_loss_mlp": 1.03511631, + "epoch": 0.08982413948594618, + "flos": 23911335504000.0, + "grad_norm": 2.5325600945829887, + "language_loss": 0.86638355, + "learning_rate": 3.962575454982109e-06, + "loss": 0.88856125, + "num_input_tokens_seen": 31932435, + "step": 1494, + "time_per_iteration": 2.579838991165161 + }, + { + "auxiliary_loss_clip": 0.01056068, + "auxiliary_loss_mlp": 0.01070682, + "balance_loss_clip": 1.03958654, + "balance_loss_mlp": 1.04623199, + "epoch": 0.08988426273861416, + "flos": 16837544551680.0, + "grad_norm": 3.8837511962337166, + "language_loss": 0.82894576, + "learning_rate": 3.962500428044454e-06, + "loss": 0.85021329, + "num_input_tokens_seen": 31950125, + "step": 1495, + "time_per_iteration": 2.899381160736084 + }, + { + "auxiliary_loss_clip": 0.01136987, + "auxiliary_loss_mlp": 0.0106291, + "balance_loss_clip": 1.04642248, + "balance_loss_mlp": 1.03978372, + "epoch": 0.08994438599128213, + "flos": 14793365548800.0, + "grad_norm": 2.4551438679363584, + "language_loss": 0.70074904, + "learning_rate": 3.962425326688585e-06, + "loss": 0.72274804, + "num_input_tokens_seen": 31968050, + "step": 1496, + "time_per_iteration": 2.79228138923645 + }, + { + "auxiliary_loss_clip": 0.01133357, + "auxiliary_loss_mlp": 0.01051306, + "balance_loss_clip": 1.04286194, + "balance_loss_mlp": 1.03024137, + "epoch": 0.09000450924395009, + "flos": 17384320356480.0, + "grad_norm": 1.6466223924697079, + "language_loss": 0.79933548, + "learning_rate": 3.962350150917351e-06, + "loss": 0.82118207, + "num_input_tokens_seen": 31985675, + "step": 1497, + "time_per_iteration": 2.6906044483184814 + }, + { + "auxiliary_loss_clip": 0.01097059, + "auxiliary_loss_mlp": 0.01057072, + "balance_loss_clip": 1.04139113, + "balance_loss_mlp": 1.0331825, + "epoch": 0.09006463249661807, + "flos": 24280317964800.0, + "grad_norm": 1.9903964037733146, + "language_loss": 0.82424974, + "learning_rate": 3.9622749007336035e-06, + "loss": 0.84579104, + "num_input_tokens_seen": 32005180, + "step": 1498, + "time_per_iteration": 2.88464093208313 + }, + { + "auxiliary_loss_clip": 0.01139216, + "auxiliary_loss_mlp": 0.0106628, + "balance_loss_clip": 1.0454843, + "balance_loss_mlp": 1.04403567, + "epoch": 0.09012475574928604, + "flos": 13661928069120.0, + "grad_norm": 2.4867119248989997, + "language_loss": 0.78671646, + "learning_rate": 3.962199576140195e-06, + "loss": 0.80877137, + "num_input_tokens_seen": 32022970, + "step": 1499, + "time_per_iteration": 2.758702039718628 + }, + { + "auxiliary_loss_clip": 0.01127401, + "auxiliary_loss_mlp": 0.00749882, + "balance_loss_clip": 1.04372954, + "balance_loss_mlp": 1.00118899, + "epoch": 0.090184879001954, + "flos": 23327751237120.0, + "grad_norm": 1.6725436220903864, + "language_loss": 0.93015456, + "learning_rate": 3.962124177139981e-06, + "loss": 0.9489274, + "num_input_tokens_seen": 32043055, + "step": 1500, + "time_per_iteration": 2.6577348709106445 + }, + { + "auxiliary_loss_clip": 0.01118327, + "auxiliary_loss_mlp": 0.0105186, + "balance_loss_clip": 1.04208124, + "balance_loss_mlp": 1.0275774, + "epoch": 0.09024500225462198, + "flos": 23002688131200.0, + "grad_norm": 2.391462321651423, + "language_loss": 0.74040568, + "learning_rate": 3.962048703735822e-06, + "loss": 0.76210761, + "num_input_tokens_seen": 32061900, + "step": 1501, + "time_per_iteration": 2.761110305786133 + }, + { + "auxiliary_loss_clip": 0.01041476, + "auxiliary_loss_mlp": 0.01033095, + "balance_loss_clip": 1.03017282, + "balance_loss_mlp": 1.029423, + "epoch": 0.09030512550728995, + "flos": 62189203242240.0, + "grad_norm": 0.7431681613083785, + "language_loss": 0.58298659, + "learning_rate": 3.96197315593058e-06, + "loss": 0.60373235, + "num_input_tokens_seen": 32122745, + "step": 1502, + "time_per_iteration": 4.87851357460022 + }, + { + "auxiliary_loss_clip": 0.01110542, + "auxiliary_loss_mlp": 0.01061372, + "balance_loss_clip": 1.03777468, + "balance_loss_mlp": 1.0392344, + "epoch": 0.09036524875995791, + "flos": 38800689171840.0, + "grad_norm": 2.4392934373883253, + "language_loss": 0.6944117, + "learning_rate": 3.961897533727119e-06, + "loss": 0.71613079, + "num_input_tokens_seen": 32145125, + "step": 1503, + "time_per_iteration": 2.8112595081329346 + }, + { + "auxiliary_loss_clip": 0.01108035, + "auxiliary_loss_mlp": 0.0106658, + "balance_loss_clip": 1.04196775, + "balance_loss_mlp": 1.04375172, + "epoch": 0.09042537201262588, + "flos": 21690081429120.0, + "grad_norm": 2.214145528048002, + "language_loss": 0.85866404, + "learning_rate": 3.961821837128306e-06, + "loss": 0.88041013, + "num_input_tokens_seen": 32166255, + "step": 1504, + "time_per_iteration": 2.8287787437438965 + }, + { + "auxiliary_loss_clip": 0.01113364, + "auxiliary_loss_mlp": 0.01078353, + "balance_loss_clip": 1.04400754, + "balance_loss_mlp": 1.05109, + "epoch": 0.09048549526529386, + "flos": 22267021680000.0, + "grad_norm": 2.165348549970753, + "language_loss": 0.72223866, + "learning_rate": 3.961746066137014e-06, + "loss": 0.74415588, + "num_input_tokens_seen": 32184010, + "step": 1505, + "time_per_iteration": 5.871511459350586 + }, + { + "auxiliary_loss_clip": 0.01103713, + "auxiliary_loss_mlp": 0.01061213, + "balance_loss_clip": 1.04036701, + "balance_loss_mlp": 1.03776455, + "epoch": 0.09054561851796182, + "flos": 14610939350400.0, + "grad_norm": 2.2338846716860843, + "language_loss": 0.8101238, + "learning_rate": 3.961670220756114e-06, + "loss": 0.83177304, + "num_input_tokens_seen": 32201635, + "step": 1506, + "time_per_iteration": 4.307340860366821 + }, + { + "auxiliary_loss_clip": 0.0111168, + "auxiliary_loss_mlp": 0.01068034, + "balance_loss_clip": 1.04189038, + "balance_loss_mlp": 1.04638577, + "epoch": 0.09060574177062979, + "flos": 27636169916160.0, + "grad_norm": 1.6923397249301266, + "language_loss": 0.76138949, + "learning_rate": 3.961594300988482e-06, + "loss": 0.78318667, + "num_input_tokens_seen": 32221940, + "step": 1507, + "time_per_iteration": 2.8643553256988525 + }, + { + "auxiliary_loss_clip": 0.0103078, + "auxiliary_loss_mlp": 0.01021322, + "balance_loss_clip": 1.01168251, + "balance_loss_mlp": 1.01765049, + "epoch": 0.09066586502329776, + "flos": 66085797513600.0, + "grad_norm": 0.7399368072797915, + "language_loss": 0.57661092, + "learning_rate": 3.961518306836998e-06, + "loss": 0.59713185, + "num_input_tokens_seen": 32276495, + "step": 1508, + "time_per_iteration": 3.2601447105407715 + }, + { + "auxiliary_loss_clip": 0.01134319, + "auxiliary_loss_mlp": 0.01060777, + "balance_loss_clip": 1.04297066, + "balance_loss_mlp": 1.03776979, + "epoch": 0.09072598827596573, + "flos": 18916449027840.0, + "grad_norm": 1.8496276511057541, + "language_loss": 0.85081685, + "learning_rate": 3.961442238304543e-06, + "loss": 0.87276781, + "num_input_tokens_seen": 32294130, + "step": 1509, + "time_per_iteration": 2.684018850326538 + }, + { + "auxiliary_loss_clip": 0.01139883, + "auxiliary_loss_mlp": 0.01071727, + "balance_loss_clip": 1.04719996, + "balance_loss_mlp": 1.04702699, + "epoch": 0.0907861115286337, + "flos": 24821742643200.0, + "grad_norm": 2.3512660255003865, + "language_loss": 0.84189963, + "learning_rate": 3.961366095394002e-06, + "loss": 0.86401582, + "num_input_tokens_seen": 32313555, + "step": 1510, + "time_per_iteration": 2.767056703567505 + }, + { + "auxiliary_loss_clip": 0.01118307, + "auxiliary_loss_mlp": 0.0106097, + "balance_loss_clip": 1.04244351, + "balance_loss_mlp": 1.03702104, + "epoch": 0.09084623478130167, + "flos": 21652842003840.0, + "grad_norm": 1.942226952751169, + "language_loss": 0.8521449, + "learning_rate": 3.961289878108262e-06, + "loss": 0.87393761, + "num_input_tokens_seen": 32331430, + "step": 1511, + "time_per_iteration": 2.78056001663208 + }, + { + "auxiliary_loss_clip": 0.01114963, + "auxiliary_loss_mlp": 0.01055978, + "balance_loss_clip": 1.0420692, + "balance_loss_mlp": 1.03337622, + "epoch": 0.09090635803396964, + "flos": 27639258485760.0, + "grad_norm": 1.6415349876097691, + "language_loss": 0.84791803, + "learning_rate": 3.9612135864502135e-06, + "loss": 0.86962748, + "num_input_tokens_seen": 32353705, + "step": 1512, + "time_per_iteration": 2.7602312564849854 + }, + { + "auxiliary_loss_clip": 0.01128333, + "auxiliary_loss_mlp": 0.01058746, + "balance_loss_clip": 1.04851305, + "balance_loss_mlp": 1.03690696, + "epoch": 0.0909664812866376, + "flos": 17669127294720.0, + "grad_norm": 2.7003107996558198, + "language_loss": 0.86953157, + "learning_rate": 3.961137220422749e-06, + "loss": 0.89140236, + "num_input_tokens_seen": 32370520, + "step": 1513, + "time_per_iteration": 2.798321008682251 + }, + { + "auxiliary_loss_clip": 0.0114616, + "auxiliary_loss_mlp": 0.01056038, + "balance_loss_clip": 1.04990172, + "balance_loss_mlp": 1.03386521, + "epoch": 0.09102660453930557, + "flos": 23951448017280.0, + "grad_norm": 1.8595786628523308, + "language_loss": 0.86280489, + "learning_rate": 3.961060780028764e-06, + "loss": 0.8848269, + "num_input_tokens_seen": 32389105, + "step": 1514, + "time_per_iteration": 2.657824754714966 + }, + { + "auxiliary_loss_clip": 0.01107807, + "auxiliary_loss_mlp": 0.01065473, + "balance_loss_clip": 1.04646051, + "balance_loss_mlp": 1.04445672, + "epoch": 0.09108672779197355, + "flos": 25812949426560.0, + "grad_norm": 1.8610061928005657, + "language_loss": 0.90142155, + "learning_rate": 3.960984265271159e-06, + "loss": 0.92315435, + "num_input_tokens_seen": 32408065, + "step": 1515, + "time_per_iteration": 2.816152811050415 + }, + { + "auxiliary_loss_clip": 0.01134043, + "auxiliary_loss_mlp": 0.01051785, + "balance_loss_clip": 1.04427743, + "balance_loss_mlp": 1.02808607, + "epoch": 0.09114685104464151, + "flos": 29639482220160.0, + "grad_norm": 1.8444556272321038, + "language_loss": 0.85459125, + "learning_rate": 3.9609076761528335e-06, + "loss": 0.87644947, + "num_input_tokens_seen": 32427225, + "step": 1516, + "time_per_iteration": 2.7346813678741455 + }, + { + "auxiliary_loss_clip": 0.01126474, + "auxiliary_loss_mlp": 0.01058401, + "balance_loss_clip": 1.04128742, + "balance_loss_mlp": 1.03472602, + "epoch": 0.09120697429730948, + "flos": 33729635905920.0, + "grad_norm": 1.7029758142237343, + "language_loss": 0.81029141, + "learning_rate": 3.960831012676692e-06, + "loss": 0.83214021, + "num_input_tokens_seen": 32450510, + "step": 1517, + "time_per_iteration": 2.791921377182007 + }, + { + "auxiliary_loss_clip": 0.01149328, + "auxiliary_loss_mlp": 0.0107204, + "balance_loss_clip": 1.04705691, + "balance_loss_mlp": 1.04967618, + "epoch": 0.09126709754997746, + "flos": 18401381953920.0, + "grad_norm": 1.6413104284648068, + "language_loss": 0.77878559, + "learning_rate": 3.960754274845642e-06, + "loss": 0.80099922, + "num_input_tokens_seen": 32468425, + "step": 1518, + "time_per_iteration": 2.6458208560943604 + }, + { + "auxiliary_loss_clip": 0.01131474, + "auxiliary_loss_mlp": 0.01060126, + "balance_loss_clip": 1.04200792, + "balance_loss_mlp": 1.03788185, + "epoch": 0.09132722080264542, + "flos": 22091957769600.0, + "grad_norm": 1.704480906322897, + "language_loss": 0.86072731, + "learning_rate": 3.960677462662594e-06, + "loss": 0.8826434, + "num_input_tokens_seen": 32487510, + "step": 1519, + "time_per_iteration": 2.8091793060302734 + }, + { + "auxiliary_loss_clip": 0.01137121, + "auxiliary_loss_mlp": 0.01057979, + "balance_loss_clip": 1.04544306, + "balance_loss_mlp": 1.03369617, + "epoch": 0.09138734405531339, + "flos": 21033131633280.0, + "grad_norm": 6.549669009729155, + "language_loss": 0.7316168, + "learning_rate": 3.96060057613046e-06, + "loss": 0.75356781, + "num_input_tokens_seen": 32507250, + "step": 1520, + "time_per_iteration": 2.6726176738739014 + }, + { + "auxiliary_loss_clip": 0.01138606, + "auxiliary_loss_mlp": 0.01062312, + "balance_loss_clip": 1.04952097, + "balance_loss_mlp": 1.03812456, + "epoch": 0.09144746730798137, + "flos": 20083940784000.0, + "grad_norm": 3.065657481388865, + "language_loss": 0.85048807, + "learning_rate": 3.960523615252156e-06, + "loss": 0.87249726, + "num_input_tokens_seen": 32526045, + "step": 1521, + "time_per_iteration": 2.6885528564453125 + }, + { + "auxiliary_loss_clip": 0.01085329, + "auxiliary_loss_mlp": 0.01060963, + "balance_loss_clip": 1.04248285, + "balance_loss_mlp": 1.03713298, + "epoch": 0.09150759056064933, + "flos": 22778210085120.0, + "grad_norm": 3.482954185760705, + "language_loss": 0.84312046, + "learning_rate": 3.960446580030599e-06, + "loss": 0.86458337, + "num_input_tokens_seen": 32546575, + "step": 1522, + "time_per_iteration": 2.753850221633911 + }, + { + "auxiliary_loss_clip": 0.0115054, + "auxiliary_loss_mlp": 0.01061269, + "balance_loss_clip": 1.04393864, + "balance_loss_mlp": 1.03768897, + "epoch": 0.0915677138133173, + "flos": 27564205017600.0, + "grad_norm": 1.6573943127452342, + "language_loss": 0.81069636, + "learning_rate": 3.960369470468711e-06, + "loss": 0.83281434, + "num_input_tokens_seen": 32568795, + "step": 1523, + "time_per_iteration": 2.624990940093994 + }, + { + "auxiliary_loss_clip": 0.01122588, + "auxiliary_loss_mlp": 0.00750078, + "balance_loss_clip": 1.04244661, + "balance_loss_mlp": 1.00145006, + "epoch": 0.09162783706598528, + "flos": 17674765729920.0, + "grad_norm": 2.3132138866266923, + "language_loss": 0.74970418, + "learning_rate": 3.960292286569418e-06, + "loss": 0.76843083, + "num_input_tokens_seen": 32587010, + "step": 1524, + "time_per_iteration": 2.5784778594970703 + }, + { + "auxiliary_loss_clip": 0.01110517, + "auxiliary_loss_mlp": 0.01062676, + "balance_loss_clip": 1.04494691, + "balance_loss_mlp": 1.03848839, + "epoch": 0.09168796031865324, + "flos": 18478195188480.0, + "grad_norm": 2.5083868881136384, + "language_loss": 0.86423182, + "learning_rate": 3.960215028335644e-06, + "loss": 0.88596374, + "num_input_tokens_seen": 32602375, + "step": 1525, + "time_per_iteration": 2.6246418952941895 + }, + { + "auxiliary_loss_clip": 0.01136757, + "auxiliary_loss_mlp": 0.01050058, + "balance_loss_clip": 1.04663193, + "balance_loss_mlp": 1.0269196, + "epoch": 0.0917480835713212, + "flos": 29387605075200.0, + "grad_norm": 2.1105355422098366, + "language_loss": 0.75331914, + "learning_rate": 3.96013769577032e-06, + "loss": 0.77518725, + "num_input_tokens_seen": 32621460, + "step": 1526, + "time_per_iteration": 2.7465152740478516 + }, + { + "auxiliary_loss_clip": 0.01156029, + "auxiliary_loss_mlp": 0.01053215, + "balance_loss_clip": 1.04652119, + "balance_loss_mlp": 1.03058958, + "epoch": 0.09180820682398917, + "flos": 19829262378240.0, + "grad_norm": 2.0896977605173346, + "language_loss": 0.77041203, + "learning_rate": 3.960060288876378e-06, + "loss": 0.79250449, + "num_input_tokens_seen": 32640440, + "step": 1527, + "time_per_iteration": 2.588829517364502 + }, + { + "auxiliary_loss_clip": 0.01144134, + "auxiliary_loss_mlp": 0.01057347, + "balance_loss_clip": 1.04363596, + "balance_loss_mlp": 1.03356469, + "epoch": 0.09186833007665715, + "flos": 23841848643840.0, + "grad_norm": 1.8701553730443337, + "language_loss": 0.78661311, + "learning_rate": 3.959982807656753e-06, + "loss": 0.8086279, + "num_input_tokens_seen": 32660020, + "step": 1528, + "time_per_iteration": 2.5827393531799316 + }, + { + "auxiliary_loss_clip": 0.0111269, + "auxiliary_loss_mlp": 0.01052652, + "balance_loss_clip": 1.04054415, + "balance_loss_mlp": 1.02927554, + "epoch": 0.09192845332932512, + "flos": 12932726065920.0, + "grad_norm": 4.232116146378557, + "language_loss": 0.76608992, + "learning_rate": 3.959905252114384e-06, + "loss": 0.78774333, + "num_input_tokens_seen": 32678170, + "step": 1529, + "time_per_iteration": 2.6144444942474365 + }, + { + "auxiliary_loss_clip": 0.01158079, + "auxiliary_loss_mlp": 0.00750047, + "balance_loss_clip": 1.04448819, + "balance_loss_mlp": 1.00140595, + "epoch": 0.09198857658199308, + "flos": 24568177559040.0, + "grad_norm": 2.1868843642903664, + "language_loss": 0.82967329, + "learning_rate": 3.959827622252211e-06, + "loss": 0.84875453, + "num_input_tokens_seen": 32697540, + "step": 1530, + "time_per_iteration": 2.589353561401367 + }, + { + "auxiliary_loss_clip": 0.01095528, + "auxiliary_loss_mlp": 0.01065957, + "balance_loss_clip": 1.0377574, + "balance_loss_mlp": 1.04253221, + "epoch": 0.09204869983466106, + "flos": 20266941600000.0, + "grad_norm": 2.693850614705187, + "language_loss": 0.84371936, + "learning_rate": 3.959749918073179e-06, + "loss": 0.86533427, + "num_input_tokens_seen": 32716805, + "step": 1531, + "time_per_iteration": 2.7625393867492676 + }, + { + "auxiliary_loss_clip": 0.01102256, + "auxiliary_loss_mlp": 0.01055131, + "balance_loss_clip": 1.03931308, + "balance_loss_mlp": 1.03083599, + "epoch": 0.09210882308732903, + "flos": 20885646389760.0, + "grad_norm": 1.9467276827618087, + "language_loss": 0.81071305, + "learning_rate": 3.959672139580233e-06, + "loss": 0.83228689, + "num_input_tokens_seen": 32736385, + "step": 1532, + "time_per_iteration": 2.6664206981658936 + }, + { + "auxiliary_loss_clip": 0.01127105, + "auxiliary_loss_mlp": 0.01057638, + "balance_loss_clip": 1.04319715, + "balance_loss_mlp": 1.0341301, + "epoch": 0.09216894633999699, + "flos": 30956326727040.0, + "grad_norm": 2.0961016946695294, + "language_loss": 0.83724654, + "learning_rate": 3.9595942867763235e-06, + "loss": 0.8590939, + "num_input_tokens_seen": 32757140, + "step": 1533, + "time_per_iteration": 2.7621281147003174 + }, + { + "auxiliary_loss_clip": 0.01119322, + "auxiliary_loss_mlp": 0.01055353, + "balance_loss_clip": 1.04476511, + "balance_loss_mlp": 1.03178573, + "epoch": 0.09222906959266497, + "flos": 13151565676800.0, + "grad_norm": 5.716639466972139, + "language_loss": 0.89678848, + "learning_rate": 3.959516359664402e-06, + "loss": 0.91853529, + "num_input_tokens_seen": 32774860, + "step": 1534, + "time_per_iteration": 2.7473666667938232 + }, + { + "auxiliary_loss_clip": 0.01120613, + "auxiliary_loss_mlp": 0.01067822, + "balance_loss_clip": 1.04015064, + "balance_loss_mlp": 1.04154837, + "epoch": 0.09228919284533293, + "flos": 25994477784960.0, + "grad_norm": 2.1461958701797936, + "language_loss": 0.754942, + "learning_rate": 3.959438358247424e-06, + "loss": 0.77682638, + "num_input_tokens_seen": 32795250, + "step": 1535, + "time_per_iteration": 2.720867156982422 + }, + { + "auxiliary_loss_clip": 0.01138538, + "auxiliary_loss_mlp": 0.01048034, + "balance_loss_clip": 1.04171014, + "balance_loss_mlp": 1.02646947, + "epoch": 0.0923493160980009, + "flos": 18660800954880.0, + "grad_norm": 1.8040139853633161, + "language_loss": 0.81412899, + "learning_rate": 3.959360282528346e-06, + "loss": 0.83599478, + "num_input_tokens_seen": 32813805, + "step": 1536, + "time_per_iteration": 2.7329459190368652 + }, + { + "auxiliary_loss_clip": 0.01151237, + "auxiliary_loss_mlp": 0.01059338, + "balance_loss_clip": 1.04278243, + "balance_loss_mlp": 1.03678346, + "epoch": 0.09240943935066886, + "flos": 21140576190720.0, + "grad_norm": 1.8416526602749352, + "language_loss": 0.88967824, + "learning_rate": 3.959282132510131e-06, + "loss": 0.91178399, + "num_input_tokens_seen": 32830960, + "step": 1537, + "time_per_iteration": 2.573112726211548 + }, + { + "auxiliary_loss_clip": 0.01125194, + "auxiliary_loss_mlp": 0.01065188, + "balance_loss_clip": 1.03938293, + "balance_loss_mlp": 1.04063058, + "epoch": 0.09246956260333684, + "flos": 20592435669120.0, + "grad_norm": 3.130507097967696, + "language_loss": 0.8036226, + "learning_rate": 3.959203908195741e-06, + "loss": 0.82552642, + "num_input_tokens_seen": 32848275, + "step": 1538, + "time_per_iteration": 2.8388831615448 + }, + { + "auxiliary_loss_clip": 0.01028268, + "auxiliary_loss_mlp": 0.01009393, + "balance_loss_clip": 1.01098371, + "balance_loss_mlp": 1.00665116, + "epoch": 0.09252968585600481, + "flos": 67558710614400.0, + "grad_norm": 0.7475611639106441, + "language_loss": 0.57416999, + "learning_rate": 3.959125609588142e-06, + "loss": 0.59454668, + "num_input_tokens_seen": 32917730, + "step": 1539, + "time_per_iteration": 3.3949549198150635 + }, + { + "auxiliary_loss_clip": 0.01130669, + "auxiliary_loss_mlp": 0.01053584, + "balance_loss_clip": 1.04454255, + "balance_loss_mlp": 1.03014791, + "epoch": 0.09258980910867277, + "flos": 17383853479680.0, + "grad_norm": 2.4960764669109876, + "language_loss": 0.67607915, + "learning_rate": 3.959047236690304e-06, + "loss": 0.69792169, + "num_input_tokens_seen": 32934910, + "step": 1540, + "time_per_iteration": 2.7561724185943604 + }, + { + "auxiliary_loss_clip": 0.01115711, + "auxiliary_loss_mlp": 0.01052653, + "balance_loss_clip": 1.04203391, + "balance_loss_mlp": 1.02842927, + "epoch": 0.09264993236134075, + "flos": 19865927185920.0, + "grad_norm": 1.756898786293511, + "language_loss": 0.83728093, + "learning_rate": 3.958968789505198e-06, + "loss": 0.85896456, + "num_input_tokens_seen": 32953840, + "step": 1541, + "time_per_iteration": 2.6421868801116943 + }, + { + "auxiliary_loss_clip": 0.01046495, + "auxiliary_loss_mlp": 0.01010254, + "balance_loss_clip": 1.00764716, + "balance_loss_mlp": 1.00735748, + "epoch": 0.09271005561400872, + "flos": 62284401262080.0, + "grad_norm": 0.8915483975050223, + "language_loss": 0.61908478, + "learning_rate": 3.9588902680358e-06, + "loss": 0.63965231, + "num_input_tokens_seen": 33011410, + "step": 1542, + "time_per_iteration": 3.1721434593200684 + }, + { + "auxiliary_loss_clip": 0.01134374, + "auxiliary_loss_mlp": 0.01060927, + "balance_loss_clip": 1.04548645, + "balance_loss_mlp": 1.03883743, + "epoch": 0.09277017886667668, + "flos": 23329870139520.0, + "grad_norm": 1.6003626370940391, + "language_loss": 0.83135206, + "learning_rate": 3.958811672285086e-06, + "loss": 0.85330504, + "num_input_tokens_seen": 33031675, + "step": 1543, + "time_per_iteration": 2.770681381225586 + }, + { + "auxiliary_loss_clip": 0.01102727, + "auxiliary_loss_mlp": 0.01064179, + "balance_loss_clip": 1.03823709, + "balance_loss_mlp": 1.04173207, + "epoch": 0.09283030211934466, + "flos": 54745169875200.0, + "grad_norm": 1.7198490006411533, + "language_loss": 0.72421658, + "learning_rate": 3.958733002256038e-06, + "loss": 0.74588561, + "num_input_tokens_seen": 33056355, + "step": 1544, + "time_per_iteration": 2.961297035217285 + }, + { + "auxiliary_loss_clip": 0.01132269, + "auxiliary_loss_mlp": 0.01061304, + "balance_loss_clip": 1.04156542, + "balance_loss_mlp": 1.03627002, + "epoch": 0.09289042537201263, + "flos": 30334784762880.0, + "grad_norm": 2.169208079455215, + "language_loss": 0.77047193, + "learning_rate": 3.958654257951637e-06, + "loss": 0.79240763, + "num_input_tokens_seen": 33079520, + "step": 1545, + "time_per_iteration": 2.7501213550567627 + }, + { + "auxiliary_loss_clip": 0.0111342, + "auxiliary_loss_mlp": 0.01063428, + "balance_loss_clip": 1.04360008, + "balance_loss_mlp": 1.04044414, + "epoch": 0.09295054862468059, + "flos": 17746838369280.0, + "grad_norm": 2.5270419321591477, + "language_loss": 0.74379486, + "learning_rate": 3.9585754393748706e-06, + "loss": 0.76556337, + "num_input_tokens_seen": 33096135, + "step": 1546, + "time_per_iteration": 2.6700832843780518 + }, + { + "auxiliary_loss_clip": 0.01129209, + "auxiliary_loss_mlp": 0.01054848, + "balance_loss_clip": 1.04237056, + "balance_loss_mlp": 1.03098273, + "epoch": 0.09301067187734856, + "flos": 23658021815040.0, + "grad_norm": 1.9024920281405677, + "language_loss": 0.84523964, + "learning_rate": 3.9584965465287275e-06, + "loss": 0.86708021, + "num_input_tokens_seen": 33115245, + "step": 1547, + "time_per_iteration": 2.6344308853149414 + }, + { + "auxiliary_loss_clip": 0.01108073, + "auxiliary_loss_mlp": 0.01060062, + "balance_loss_clip": 1.03809118, + "balance_loss_mlp": 1.03684044, + "epoch": 0.09307079513001654, + "flos": 27527719777920.0, + "grad_norm": 2.6454227786275406, + "language_loss": 0.67424631, + "learning_rate": 3.958417579416199e-06, + "loss": 0.69592762, + "num_input_tokens_seen": 33136640, + "step": 1548, + "time_per_iteration": 2.7340173721313477 + }, + { + "auxiliary_loss_clip": 0.01090104, + "auxiliary_loss_mlp": 0.0106095, + "balance_loss_clip": 1.03720474, + "balance_loss_mlp": 1.03744197, + "epoch": 0.0931309183826845, + "flos": 20627340710400.0, + "grad_norm": 1.7806829265297575, + "language_loss": 0.83405757, + "learning_rate": 3.9583385380402795e-06, + "loss": 0.85556811, + "num_input_tokens_seen": 33155060, + "step": 1549, + "time_per_iteration": 4.363773345947266 + }, + { + "auxiliary_loss_clip": 0.01146222, + "auxiliary_loss_mlp": 0.01051586, + "balance_loss_clip": 1.04737234, + "balance_loss_mlp": 1.0293653, + "epoch": 0.09319104163535247, + "flos": 29020921084800.0, + "grad_norm": 1.8772948060258057, + "language_loss": 0.75526136, + "learning_rate": 3.958259422403966e-06, + "loss": 0.77723944, + "num_input_tokens_seen": 33175420, + "step": 1550, + "time_per_iteration": 2.6837222576141357 + }, + { + "auxiliary_loss_clip": 0.01116412, + "auxiliary_loss_mlp": 0.01070427, + "balance_loss_clip": 1.04211664, + "balance_loss_mlp": 1.04416502, + "epoch": 0.09325116488802045, + "flos": 25301545539840.0, + "grad_norm": 2.200293645995628, + "language_loss": 0.83532608, + "learning_rate": 3.95818023251026e-06, + "loss": 0.85719448, + "num_input_tokens_seen": 33194120, + "step": 1551, + "time_per_iteration": 2.7955007553100586 + }, + { + "auxiliary_loss_clip": 0.01032297, + "auxiliary_loss_mlp": 0.00749693, + "balance_loss_clip": 1.00572228, + "balance_loss_mlp": 1.00127017, + "epoch": 0.09331128814068841, + "flos": 61536203942400.0, + "grad_norm": 0.7471836345480405, + "language_loss": 0.61854482, + "learning_rate": 3.958100968362163e-06, + "loss": 0.63636482, + "num_input_tokens_seen": 33261080, + "step": 1552, + "time_per_iteration": 6.4891767501831055 + }, + { + "auxiliary_loss_clip": 0.0105013, + "auxiliary_loss_mlp": 0.01004231, + "balance_loss_clip": 1.02171969, + "balance_loss_mlp": 1.00089288, + "epoch": 0.09337141139335638, + "flos": 53293700171520.0, + "grad_norm": 0.8342805560351552, + "language_loss": 0.58930397, + "learning_rate": 3.958021629962681e-06, + "loss": 0.60984761, + "num_input_tokens_seen": 33330235, + "step": 1553, + "time_per_iteration": 4.9007978439331055 + }, + { + "auxiliary_loss_clip": 0.01119635, + "auxiliary_loss_mlp": 0.0106094, + "balance_loss_clip": 1.04357696, + "balance_loss_mlp": 1.03620374, + "epoch": 0.09343153464602436, + "flos": 23476852592640.0, + "grad_norm": 3.5591602271347123, + "language_loss": 0.87383223, + "learning_rate": 3.957942217314823e-06, + "loss": 0.89563799, + "num_input_tokens_seen": 33349035, + "step": 1554, + "time_per_iteration": 2.8188986778259277 + }, + { + "auxiliary_loss_clip": 0.01119574, + "auxiliary_loss_mlp": 0.01054506, + "balance_loss_clip": 1.04419482, + "balance_loss_mlp": 1.03171372, + "epoch": 0.09349165789869232, + "flos": 19353481804800.0, + "grad_norm": 1.970870353228134, + "language_loss": 0.81414109, + "learning_rate": 3.957862730421599e-06, + "loss": 0.83588183, + "num_input_tokens_seen": 33368060, + "step": 1555, + "time_per_iteration": 2.763803720474243 + }, + { + "auxiliary_loss_clip": 0.01033182, + "auxiliary_loss_mlp": 0.01014999, + "balance_loss_clip": 1.00694323, + "balance_loss_mlp": 1.01203048, + "epoch": 0.09355178115136029, + "flos": 67502580635520.0, + "grad_norm": 0.8823295901290594, + "language_loss": 0.59638399, + "learning_rate": 3.957783169286024e-06, + "loss": 0.61686581, + "num_input_tokens_seen": 33430825, + "step": 1556, + "time_per_iteration": 3.1801366806030273 + }, + { + "auxiliary_loss_clip": 0.0114053, + "auxiliary_loss_mlp": 0.01058927, + "balance_loss_clip": 1.04506314, + "balance_loss_mlp": 1.03612232, + "epoch": 0.09361190440402825, + "flos": 37341638720640.0, + "grad_norm": 1.6870101506637738, + "language_loss": 0.84571308, + "learning_rate": 3.9577035339111155e-06, + "loss": 0.86770767, + "num_input_tokens_seen": 33454855, + "step": 1557, + "time_per_iteration": 2.9179327487945557 + }, + { + "auxiliary_loss_clip": 0.01088423, + "auxiliary_loss_mlp": 0.01064599, + "balance_loss_clip": 1.04572511, + "balance_loss_mlp": 1.03867078, + "epoch": 0.09367202765669623, + "flos": 24899705112960.0, + "grad_norm": 1.5743924117677799, + "language_loss": 0.77792907, + "learning_rate": 3.957623824299893e-06, + "loss": 0.79945928, + "num_input_tokens_seen": 33476000, + "step": 1558, + "time_per_iteration": 2.850191354751587 + }, + { + "auxiliary_loss_clip": 0.01135002, + "auxiliary_loss_mlp": 0.01052373, + "balance_loss_clip": 1.04563904, + "balance_loss_mlp": 1.02825689, + "epoch": 0.0937321509093642, + "flos": 15705568368000.0, + "grad_norm": 1.8974728427145728, + "language_loss": 0.79912394, + "learning_rate": 3.957544040455379e-06, + "loss": 0.82099771, + "num_input_tokens_seen": 33493845, + "step": 1559, + "time_per_iteration": 2.6407358646392822 + }, + { + "auxiliary_loss_clip": 0.01110902, + "auxiliary_loss_mlp": 0.01060538, + "balance_loss_clip": 1.04200602, + "balance_loss_mlp": 1.03761446, + "epoch": 0.09379227416203216, + "flos": 20483698222080.0, + "grad_norm": 6.56297694937907, + "language_loss": 0.76715314, + "learning_rate": 3.957464182380599e-06, + "loss": 0.78886759, + "num_input_tokens_seen": 33510850, + "step": 1560, + "time_per_iteration": 2.6857962608337402 + }, + { + "auxiliary_loss_clip": 0.01105201, + "auxiliary_loss_mlp": 0.01055566, + "balance_loss_clip": 1.04078519, + "balance_loss_mlp": 1.03153324, + "epoch": 0.09385239741470014, + "flos": 24352498344960.0, + "grad_norm": 1.668845928554493, + "language_loss": 0.80648309, + "learning_rate": 3.95738425007858e-06, + "loss": 0.82809079, + "num_input_tokens_seen": 33530430, + "step": 1561, + "time_per_iteration": 2.68920636177063 + }, + { + "auxiliary_loss_clip": 0.01145105, + "auxiliary_loss_mlp": 0.01049103, + "balance_loss_clip": 1.04463172, + "balance_loss_mlp": 1.026227, + "epoch": 0.0939125206673681, + "flos": 33291489807360.0, + "grad_norm": 2.145810603735929, + "language_loss": 0.61570162, + "learning_rate": 3.957304243552354e-06, + "loss": 0.63764369, + "num_input_tokens_seen": 33551975, + "step": 1562, + "time_per_iteration": 2.71158504486084 + }, + { + "auxiliary_loss_clip": 0.01133421, + "auxiliary_loss_mlp": 0.0105936, + "balance_loss_clip": 1.04674578, + "balance_loss_mlp": 1.03686488, + "epoch": 0.09397264392003607, + "flos": 19244923925760.0, + "grad_norm": 3.5167321914674297, + "language_loss": 0.84969455, + "learning_rate": 3.957224162804956e-06, + "loss": 0.87162232, + "num_input_tokens_seen": 33569850, + "step": 1563, + "time_per_iteration": 2.668423891067505 + }, + { + "auxiliary_loss_clip": 0.01135693, + "auxiliary_loss_mlp": 0.01052368, + "balance_loss_clip": 1.0483216, + "balance_loss_mlp": 1.0308032, + "epoch": 0.09403276717270405, + "flos": 19317930318720.0, + "grad_norm": 1.780169521008424, + "language_loss": 0.76186085, + "learning_rate": 3.9571440078394205e-06, + "loss": 0.78374147, + "num_input_tokens_seen": 33590510, + "step": 1564, + "time_per_iteration": 2.6413283348083496 + }, + { + "auxiliary_loss_clip": 0.01113686, + "auxiliary_loss_mlp": 0.01061829, + "balance_loss_clip": 1.04096293, + "balance_loss_mlp": 1.03839266, + "epoch": 0.09409289042537201, + "flos": 23583471137280.0, + "grad_norm": 1.975132293698884, + "language_loss": 0.80103219, + "learning_rate": 3.9570637786587895e-06, + "loss": 0.82278734, + "num_input_tokens_seen": 33608810, + "step": 1565, + "time_per_iteration": 2.6556448936462402 + }, + { + "auxiliary_loss_clip": 0.01127058, + "auxiliary_loss_mlp": 0.01072398, + "balance_loss_clip": 1.04155517, + "balance_loss_mlp": 1.04904485, + "epoch": 0.09415301367803998, + "flos": 20078446003200.0, + "grad_norm": 3.219817586991556, + "language_loss": 0.75221437, + "learning_rate": 3.956983475266103e-06, + "loss": 0.7742089, + "num_input_tokens_seen": 33627265, + "step": 1566, + "time_per_iteration": 2.615556240081787 + }, + { + "auxiliary_loss_clip": 0.01117032, + "auxiliary_loss_mlp": 0.00749891, + "balance_loss_clip": 1.04173231, + "balance_loss_mlp": 1.00119483, + "epoch": 0.09421313693070796, + "flos": 21062075016960.0, + "grad_norm": 2.0110227293636527, + "language_loss": 0.7833457, + "learning_rate": 3.956903097664407e-06, + "loss": 0.80201495, + "num_input_tokens_seen": 33644810, + "step": 1567, + "time_per_iteration": 2.808257579803467 + }, + { + "auxiliary_loss_clip": 0.0113274, + "auxiliary_loss_mlp": 0.01057684, + "balance_loss_clip": 1.0467906, + "balance_loss_mlp": 1.03520107, + "epoch": 0.09427326018337592, + "flos": 24316156759680.0, + "grad_norm": 1.8996807808012395, + "language_loss": 0.82527077, + "learning_rate": 3.956822645856749e-06, + "loss": 0.847175, + "num_input_tokens_seen": 33665665, + "step": 1568, + "time_per_iteration": 2.7746524810791016 + }, + { + "auxiliary_loss_clip": 0.01156684, + "auxiliary_loss_mlp": 0.01050135, + "balance_loss_clip": 1.04605126, + "balance_loss_mlp": 1.02647161, + "epoch": 0.09433338343604389, + "flos": 20263888944000.0, + "grad_norm": 1.886947706355857, + "language_loss": 0.76775104, + "learning_rate": 3.9567421198461814e-06, + "loss": 0.78981918, + "num_input_tokens_seen": 33684760, + "step": 1569, + "time_per_iteration": 2.5825235843658447 + }, + { + "auxiliary_loss_clip": 0.01085645, + "auxiliary_loss_mlp": 0.01058498, + "balance_loss_clip": 1.03752136, + "balance_loss_mlp": 1.03375041, + "epoch": 0.09439350668871185, + "flos": 12742973493120.0, + "grad_norm": 2.1855955691033366, + "language_loss": 0.85960519, + "learning_rate": 3.956661519635756e-06, + "loss": 0.88104653, + "num_input_tokens_seen": 33700750, + "step": 1570, + "time_per_iteration": 2.7252986431121826 + }, + { + "auxiliary_loss_clip": 0.01095782, + "auxiliary_loss_mlp": 0.01059438, + "balance_loss_clip": 1.04335988, + "balance_loss_mlp": 1.03447545, + "epoch": 0.09445362994137983, + "flos": 25962266263680.0, + "grad_norm": 1.712573132544271, + "language_loss": 0.75901568, + "learning_rate": 3.95658084522853e-06, + "loss": 0.78056788, + "num_input_tokens_seen": 33724430, + "step": 1571, + "time_per_iteration": 2.8157958984375 + }, + { + "auxiliary_loss_clip": 0.01097568, + "auxiliary_loss_mlp": 0.01056219, + "balance_loss_clip": 1.03889179, + "balance_loss_mlp": 1.03348553, + "epoch": 0.0945137531940478, + "flos": 19715353372800.0, + "grad_norm": 1.5362182377650382, + "language_loss": 0.79188466, + "learning_rate": 3.956500096627561e-06, + "loss": 0.81342256, + "num_input_tokens_seen": 33743455, + "step": 1572, + "time_per_iteration": 2.7300851345062256 + }, + { + "auxiliary_loss_clip": 0.01114244, + "auxiliary_loss_mlp": 0.01064159, + "balance_loss_clip": 1.04464436, + "balance_loss_mlp": 1.04030478, + "epoch": 0.09457387644671576, + "flos": 23617047375360.0, + "grad_norm": 1.6718153091212344, + "language_loss": 0.87850505, + "learning_rate": 3.956419273835913e-06, + "loss": 0.90028906, + "num_input_tokens_seen": 33763435, + "step": 1573, + "time_per_iteration": 2.7053332328796387 + }, + { + "auxiliary_loss_clip": 0.01130396, + "auxiliary_loss_mlp": 0.01067882, + "balance_loss_clip": 1.0438087, + "balance_loss_mlp": 1.04172802, + "epoch": 0.09463399969938374, + "flos": 26907291135360.0, + "grad_norm": 1.9991107278832472, + "language_loss": 0.81290019, + "learning_rate": 3.95633837685665e-06, + "loss": 0.83488292, + "num_input_tokens_seen": 33784325, + "step": 1574, + "time_per_iteration": 2.7778546810150146 + }, + { + "auxiliary_loss_clip": 0.01126554, + "auxiliary_loss_mlp": 0.0105589, + "balance_loss_clip": 1.0437088, + "balance_loss_mlp": 1.03413391, + "epoch": 0.0946941229520517, + "flos": 23659566099840.0, + "grad_norm": 1.8454999392345162, + "language_loss": 0.81216824, + "learning_rate": 3.95625740569284e-06, + "loss": 0.83399272, + "num_input_tokens_seen": 33802510, + "step": 1575, + "time_per_iteration": 2.69111704826355 + }, + { + "auxiliary_loss_clip": 0.01155049, + "auxiliary_loss_mlp": 0.01067515, + "balance_loss_clip": 1.0470829, + "balance_loss_mlp": 1.04432917, + "epoch": 0.09475424620471967, + "flos": 24134053783680.0, + "grad_norm": 1.864899699235447, + "language_loss": 0.86721885, + "learning_rate": 3.956176360347553e-06, + "loss": 0.88944453, + "num_input_tokens_seen": 33819980, + "step": 1576, + "time_per_iteration": 2.6540944576263428 + }, + { + "auxiliary_loss_clip": 0.01022111, + "auxiliary_loss_mlp": 0.01011035, + "balance_loss_clip": 1.0062089, + "balance_loss_mlp": 1.00781679, + "epoch": 0.09481436945738765, + "flos": 68426168065920.0, + "grad_norm": 0.9909579169503554, + "language_loss": 0.65859401, + "learning_rate": 3.956095240823862e-06, + "loss": 0.67892545, + "num_input_tokens_seen": 33878925, + "step": 1577, + "time_per_iteration": 3.189540147781372 + }, + { + "auxiliary_loss_clip": 0.01117015, + "auxiliary_loss_mlp": 0.01047673, + "balance_loss_clip": 1.04227972, + "balance_loss_mlp": 1.02596498, + "epoch": 0.09487449271005562, + "flos": 16654076858880.0, + "grad_norm": 2.660934001686351, + "language_loss": 0.79607213, + "learning_rate": 3.956014047124844e-06, + "loss": 0.81771898, + "num_input_tokens_seen": 33897600, + "step": 1578, + "time_per_iteration": 2.7585158348083496 + }, + { + "auxiliary_loss_clip": 0.01154383, + "auxiliary_loss_mlp": 0.01065324, + "balance_loss_clip": 1.04700565, + "balance_loss_mlp": 1.04148257, + "epoch": 0.09493461596272358, + "flos": 24275685110400.0, + "grad_norm": 1.596370954339298, + "language_loss": 0.782341, + "learning_rate": 3.955932779253578e-06, + "loss": 0.80453807, + "num_input_tokens_seen": 33917365, + "step": 1579, + "time_per_iteration": 2.6050212383270264 + }, + { + "auxiliary_loss_clip": 0.01091839, + "auxiliary_loss_mlp": 0.01063711, + "balance_loss_clip": 1.03824139, + "balance_loss_mlp": 1.03834331, + "epoch": 0.09499473921539155, + "flos": 21870173243520.0, + "grad_norm": 1.7984198805100813, + "language_loss": 0.73501235, + "learning_rate": 3.955851437213144e-06, + "loss": 0.75656784, + "num_input_tokens_seen": 33936680, + "step": 1580, + "time_per_iteration": 2.790818214416504 + }, + { + "auxiliary_loss_clip": 0.01128362, + "auxiliary_loss_mlp": 0.01059621, + "balance_loss_clip": 1.04458308, + "balance_loss_mlp": 1.03724504, + "epoch": 0.09505486246805953, + "flos": 33547137880320.0, + "grad_norm": 1.8138452403653036, + "language_loss": 0.77525884, + "learning_rate": 3.955770021006627e-06, + "loss": 0.79713869, + "num_input_tokens_seen": 33960685, + "step": 1581, + "time_per_iteration": 2.780409097671509 + }, + { + "auxiliary_loss_clip": 0.01119957, + "auxiliary_loss_mlp": 0.01063203, + "balance_loss_clip": 1.04764998, + "balance_loss_mlp": 1.04014766, + "epoch": 0.09511498572072749, + "flos": 21215342350080.0, + "grad_norm": 1.9131512950385003, + "language_loss": 0.86977333, + "learning_rate": 3.955688530637116e-06, + "loss": 0.8916049, + "num_input_tokens_seen": 33980015, + "step": 1582, + "time_per_iteration": 2.814631938934326 + }, + { + "auxiliary_loss_clip": 0.0114248, + "auxiliary_loss_mlp": 0.01063069, + "balance_loss_clip": 1.04455447, + "balance_loss_mlp": 1.0389533, + "epoch": 0.09517510897339546, + "flos": 14611262572800.0, + "grad_norm": 1.80974622372045, + "language_loss": 0.6713295, + "learning_rate": 3.955606966107699e-06, + "loss": 0.693385, + "num_input_tokens_seen": 33997705, + "step": 1583, + "time_per_iteration": 2.6778109073638916 + }, + { + "auxiliary_loss_clip": 0.01146291, + "auxiliary_loss_mlp": 0.01057088, + "balance_loss_clip": 1.04707813, + "balance_loss_mlp": 1.03266191, + "epoch": 0.09523523222606343, + "flos": 27817339138560.0, + "grad_norm": 1.8071491956039292, + "language_loss": 0.70339316, + "learning_rate": 3.95552532742147e-06, + "loss": 0.72542697, + "num_input_tokens_seen": 34017465, + "step": 1584, + "time_per_iteration": 2.691439628601074 + }, + { + "auxiliary_loss_clip": 0.01105223, + "auxiliary_loss_mlp": 0.01056961, + "balance_loss_clip": 1.03984022, + "balance_loss_mlp": 1.03528881, + "epoch": 0.0952953554787314, + "flos": 20706272847360.0, + "grad_norm": 1.4494956380579191, + "language_loss": 0.80825412, + "learning_rate": 3.955443614581525e-06, + "loss": 0.82987595, + "num_input_tokens_seen": 34038550, + "step": 1585, + "time_per_iteration": 2.8562369346618652 + }, + { + "auxiliary_loss_clip": 0.01125656, + "auxiliary_loss_mlp": 0.01059881, + "balance_loss_clip": 1.0427165, + "balance_loss_mlp": 1.03426313, + "epoch": 0.09535547873139937, + "flos": 24787627701120.0, + "grad_norm": 1.6997199900099984, + "language_loss": 0.71674848, + "learning_rate": 3.955361827590961e-06, + "loss": 0.73860389, + "num_input_tokens_seen": 34058665, + "step": 1586, + "time_per_iteration": 2.8581149578094482 + }, + { + "auxiliary_loss_clip": 0.01013345, + "auxiliary_loss_mlp": 0.01006838, + "balance_loss_clip": 1.00859094, + "balance_loss_mlp": 1.00364304, + "epoch": 0.09541560198406734, + "flos": 71912194905600.0, + "grad_norm": 0.8120871285878475, + "language_loss": 0.55431318, + "learning_rate": 3.955279966452883e-06, + "loss": 0.57451504, + "num_input_tokens_seen": 34109655, + "step": 1587, + "time_per_iteration": 3.168105363845825 + }, + { + "auxiliary_loss_clip": 0.01093199, + "auxiliary_loss_mlp": 0.01061271, + "balance_loss_clip": 1.03909302, + "balance_loss_mlp": 1.03701174, + "epoch": 0.09547572523673531, + "flos": 28982604251520.0, + "grad_norm": 1.8133595356945693, + "language_loss": 0.80983591, + "learning_rate": 3.955198031170391e-06, + "loss": 0.83138061, + "num_input_tokens_seen": 34131115, + "step": 1588, + "time_per_iteration": 2.9449217319488525 + }, + { + "auxiliary_loss_clip": 0.01093055, + "auxiliary_loss_mlp": 0.01064424, + "balance_loss_clip": 1.03632188, + "balance_loss_mlp": 1.04057074, + "epoch": 0.09553584848940327, + "flos": 24133910129280.0, + "grad_norm": 1.4711995078794022, + "language_loss": 0.81465173, + "learning_rate": 3.955116021746594e-06, + "loss": 0.83622652, + "num_input_tokens_seen": 34151925, + "step": 1589, + "time_per_iteration": 2.854912519454956 + }, + { + "auxiliary_loss_clip": 0.01098199, + "auxiliary_loss_mlp": 0.00749815, + "balance_loss_clip": 1.04421866, + "balance_loss_mlp": 1.00094175, + "epoch": 0.09559597174207124, + "flos": 42851376789120.0, + "grad_norm": 2.3162082899171144, + "language_loss": 0.65042865, + "learning_rate": 3.955033938184601e-06, + "loss": 0.66890883, + "num_input_tokens_seen": 34175395, + "step": 1590, + "time_per_iteration": 3.0688703060150146 + }, + { + "auxiliary_loss_clip": 0.01113179, + "auxiliary_loss_mlp": 0.0105761, + "balance_loss_clip": 1.04059613, + "balance_loss_mlp": 1.03462648, + "epoch": 0.09565609499473922, + "flos": 32670845683200.0, + "grad_norm": 2.0338726966762226, + "language_loss": 0.83400893, + "learning_rate": 3.954951780487526e-06, + "loss": 0.85571671, + "num_input_tokens_seen": 34197760, + "step": 1591, + "time_per_iteration": 2.9580347537994385 + }, + { + "auxiliary_loss_clip": 0.01131188, + "auxiliary_loss_mlp": 0.01060242, + "balance_loss_clip": 1.04185033, + "balance_loss_mlp": 1.03712785, + "epoch": 0.09571621824740718, + "flos": 18478410670080.0, + "grad_norm": 3.252765766438778, + "language_loss": 0.74628651, + "learning_rate": 3.9548695486584835e-06, + "loss": 0.76820081, + "num_input_tokens_seen": 34215330, + "step": 1592, + "time_per_iteration": 2.839777946472168 + }, + { + "auxiliary_loss_clip": 0.01136206, + "auxiliary_loss_mlp": 0.01057122, + "balance_loss_clip": 1.04052162, + "balance_loss_mlp": 1.03426945, + "epoch": 0.09577634150007515, + "flos": 29387497334400.0, + "grad_norm": 1.8827133595914272, + "language_loss": 0.74629211, + "learning_rate": 3.954787242700592e-06, + "loss": 0.76822531, + "num_input_tokens_seen": 34237745, + "step": 1593, + "time_per_iteration": 2.796797037124634 + }, + { + "auxiliary_loss_clip": 0.01146424, + "auxiliary_loss_mlp": 0.01058379, + "balance_loss_clip": 1.04649127, + "balance_loss_mlp": 1.03532398, + "epoch": 0.09583646475274313, + "flos": 22747830157440.0, + "grad_norm": 2.0694314794930033, + "language_loss": 0.7041297, + "learning_rate": 3.954704862616971e-06, + "loss": 0.72617769, + "num_input_tokens_seen": 34256565, + "step": 1594, + "time_per_iteration": 2.713806390762329 + }, + { + "auxiliary_loss_clip": 0.01140752, + "auxiliary_loss_mlp": 0.01059003, + "balance_loss_clip": 1.04248881, + "balance_loss_mlp": 1.0362941, + "epoch": 0.0958965880054111, + "flos": 23218367345280.0, + "grad_norm": 2.199051490931693, + "language_loss": 0.82766056, + "learning_rate": 3.954622408410747e-06, + "loss": 0.84965813, + "num_input_tokens_seen": 34275970, + "step": 1595, + "time_per_iteration": 2.758241891860962 + }, + { + "auxiliary_loss_clip": 0.0111681, + "auxiliary_loss_mlp": 0.01062254, + "balance_loss_clip": 1.0395155, + "balance_loss_mlp": 1.03705335, + "epoch": 0.09595671125807906, + "flos": 21324438933120.0, + "grad_norm": 1.86065306768715, + "language_loss": 0.8439554, + "learning_rate": 3.954539880085045e-06, + "loss": 0.86574602, + "num_input_tokens_seen": 34295490, + "step": 1596, + "time_per_iteration": 2.718188762664795 + }, + { + "auxiliary_loss_clip": 0.01132184, + "auxiliary_loss_mlp": 0.01054198, + "balance_loss_clip": 1.04475248, + "balance_loss_mlp": 1.02971268, + "epoch": 0.09601683451074704, + "flos": 39603472185600.0, + "grad_norm": 1.6291592129264263, + "language_loss": 0.68885618, + "learning_rate": 3.9544572776429945e-06, + "loss": 0.71072, + "num_input_tokens_seen": 34319990, + "step": 1597, + "time_per_iteration": 4.420611143112183 + }, + { + "auxiliary_loss_clip": 0.01130949, + "auxiliary_loss_mlp": 0.00749799, + "balance_loss_clip": 1.03957129, + "balance_loss_mlp": 1.00100112, + "epoch": 0.096076957763415, + "flos": 23732716147200.0, + "grad_norm": 1.9996082729246782, + "language_loss": 0.74474424, + "learning_rate": 3.954374601087729e-06, + "loss": 0.76355171, + "num_input_tokens_seen": 34339225, + "step": 1598, + "time_per_iteration": 2.670170307159424 + }, + { + "auxiliary_loss_clip": 0.01146083, + "auxiliary_loss_mlp": 0.01051303, + "balance_loss_clip": 1.04592204, + "balance_loss_mlp": 1.02761579, + "epoch": 0.09613708101608297, + "flos": 34678108483200.0, + "grad_norm": 1.7877524584172713, + "language_loss": 0.69154608, + "learning_rate": 3.954291850422382e-06, + "loss": 0.71351999, + "num_input_tokens_seen": 34361020, + "step": 1599, + "time_per_iteration": 4.272948503494263 + }, + { + "auxiliary_loss_clip": 0.01127911, + "auxiliary_loss_mlp": 0.01058334, + "balance_loss_clip": 1.04790115, + "balance_loss_mlp": 1.03527904, + "epoch": 0.09619720426875093, + "flos": 20740028653440.0, + "grad_norm": 2.0548971137703154, + "language_loss": 0.8395822, + "learning_rate": 3.954209025650093e-06, + "loss": 0.86144465, + "num_input_tokens_seen": 34378630, + "step": 1600, + "time_per_iteration": 5.797153472900391 + }, + { + "auxiliary_loss_clip": 0.01115033, + "auxiliary_loss_mlp": 0.01054136, + "balance_loss_clip": 1.03985476, + "balance_loss_mlp": 1.03155756, + "epoch": 0.09625732752141891, + "flos": 13042720488960.0, + "grad_norm": 1.9708631721644163, + "language_loss": 0.80027694, + "learning_rate": 3.954126126774001e-06, + "loss": 0.82196862, + "num_input_tokens_seen": 34397110, + "step": 1601, + "time_per_iteration": 2.6258766651153564 + }, + { + "auxiliary_loss_clip": 0.01146188, + "auxiliary_loss_mlp": 0.01054219, + "balance_loss_clip": 1.04466307, + "balance_loss_mlp": 1.03119957, + "epoch": 0.09631745077408688, + "flos": 22273629782400.0, + "grad_norm": 2.1512214814415955, + "language_loss": 0.82874334, + "learning_rate": 3.954043153797251e-06, + "loss": 0.85074741, + "num_input_tokens_seen": 34414165, + "step": 1602, + "time_per_iteration": 2.590738534927368 + }, + { + "auxiliary_loss_clip": 0.01098523, + "auxiliary_loss_mlp": 0.01054989, + "balance_loss_clip": 1.0383842, + "balance_loss_mlp": 1.0306108, + "epoch": 0.09637757402675484, + "flos": 24754266944640.0, + "grad_norm": 1.912719891833225, + "language_loss": 0.6278255, + "learning_rate": 3.953960106722989e-06, + "loss": 0.6493606, + "num_input_tokens_seen": 34434445, + "step": 1603, + "time_per_iteration": 2.7232303619384766 + }, + { + "auxiliary_loss_clip": 0.01155762, + "auxiliary_loss_mlp": 0.01058849, + "balance_loss_clip": 1.04604769, + "balance_loss_mlp": 1.03441131, + "epoch": 0.09643769727942282, + "flos": 22525758322560.0, + "grad_norm": 2.1781682152481032, + "language_loss": 0.71054244, + "learning_rate": 3.953876985554364e-06, + "loss": 0.73268855, + "num_input_tokens_seen": 34453095, + "step": 1604, + "time_per_iteration": 2.75280499458313 + }, + { + "auxiliary_loss_clip": 0.01140788, + "auxiliary_loss_mlp": 0.01057046, + "balance_loss_clip": 1.04399788, + "balance_loss_mlp": 1.03571928, + "epoch": 0.09649782053209079, + "flos": 30921026636160.0, + "grad_norm": 2.0691292191473782, + "language_loss": 0.79671395, + "learning_rate": 3.953793790294527e-06, + "loss": 0.81869233, + "num_input_tokens_seen": 34473680, + "step": 1605, + "time_per_iteration": 2.8916592597961426 + }, + { + "auxiliary_loss_clip": 0.01128909, + "auxiliary_loss_mlp": 0.01048148, + "balance_loss_clip": 1.04186988, + "balance_loss_mlp": 1.02562904, + "epoch": 0.09655794378475875, + "flos": 25337635729920.0, + "grad_norm": 2.3099857976076565, + "language_loss": 0.75157261, + "learning_rate": 3.953710520946634e-06, + "loss": 0.77334321, + "num_input_tokens_seen": 34492610, + "step": 1606, + "time_per_iteration": 2.8950397968292236 + }, + { + "auxiliary_loss_clip": 0.01140167, + "auxiliary_loss_mlp": 0.01054659, + "balance_loss_clip": 1.04518819, + "balance_loss_mlp": 1.03268909, + "epoch": 0.09661806703742673, + "flos": 22346061557760.0, + "grad_norm": 1.774137629663868, + "language_loss": 0.75826389, + "learning_rate": 3.953627177513843e-06, + "loss": 0.78021216, + "num_input_tokens_seen": 34511855, + "step": 1607, + "time_per_iteration": 2.800731897354126 + }, + { + "auxiliary_loss_clip": 0.01102329, + "auxiliary_loss_mlp": 0.01048633, + "balance_loss_clip": 1.03686535, + "balance_loss_mlp": 1.02723455, + "epoch": 0.0966781902900947, + "flos": 17457578144640.0, + "grad_norm": 1.7100893694930437, + "language_loss": 0.86496454, + "learning_rate": 3.953543759999312e-06, + "loss": 0.88647413, + "num_input_tokens_seen": 34528905, + "step": 1608, + "time_per_iteration": 2.6420915126800537 + }, + { + "auxiliary_loss_clip": 0.01090225, + "auxiliary_loss_mlp": 0.01058374, + "balance_loss_clip": 1.0416584, + "balance_loss_mlp": 1.03507996, + "epoch": 0.09673831354276266, + "flos": 36903995412480.0, + "grad_norm": 3.2277277423071697, + "language_loss": 0.71403116, + "learning_rate": 3.953460268406207e-06, + "loss": 0.73551714, + "num_input_tokens_seen": 34548480, + "step": 1609, + "time_per_iteration": 3.114224433898926 + }, + { + "auxiliary_loss_clip": 0.01119689, + "auxiliary_loss_mlp": 0.01060159, + "balance_loss_clip": 1.04326153, + "balance_loss_mlp": 1.03829587, + "epoch": 0.09679843679543064, + "flos": 20701388597760.0, + "grad_norm": 2.3323658886657186, + "language_loss": 0.84335899, + "learning_rate": 3.953376702737693e-06, + "loss": 0.86515749, + "num_input_tokens_seen": 34565410, + "step": 1610, + "time_per_iteration": 2.7691457271575928 + }, + { + "auxiliary_loss_clip": 0.01129272, + "auxiliary_loss_mlp": 0.01055956, + "balance_loss_clip": 1.04622841, + "balance_loss_mlp": 1.03251886, + "epoch": 0.0968585600480986, + "flos": 23514415240320.0, + "grad_norm": 2.093140902515936, + "language_loss": 0.66666538, + "learning_rate": 3.953293062996939e-06, + "loss": 0.68851769, + "num_input_tokens_seen": 34584840, + "step": 1611, + "time_per_iteration": 2.862318992614746 + }, + { + "auxiliary_loss_clip": 0.01084655, + "auxiliary_loss_mlp": 0.01053773, + "balance_loss_clip": 1.03632677, + "balance_loss_mlp": 1.03180289, + "epoch": 0.09691868330076657, + "flos": 20121072468480.0, + "grad_norm": 1.6624686620309064, + "language_loss": 0.81015348, + "learning_rate": 3.953209349187115e-06, + "loss": 0.83153772, + "num_input_tokens_seen": 34603360, + "step": 1612, + "time_per_iteration": 2.8718974590301514 + }, + { + "auxiliary_loss_clip": 0.01147784, + "auxiliary_loss_mlp": 0.01067729, + "balance_loss_clip": 1.04684043, + "balance_loss_mlp": 1.0456636, + "epoch": 0.09697880655343454, + "flos": 16544692967040.0, + "grad_norm": 2.200088397821874, + "language_loss": 0.81126034, + "learning_rate": 3.953125561311398e-06, + "loss": 0.83341551, + "num_input_tokens_seen": 34620760, + "step": 1613, + "time_per_iteration": 2.6688828468322754 + }, + { + "auxiliary_loss_clip": 0.01113125, + "auxiliary_loss_mlp": 0.01055932, + "balance_loss_clip": 1.04242969, + "balance_loss_mlp": 1.0331986, + "epoch": 0.09703892980610251, + "flos": 26104184899200.0, + "grad_norm": 5.0372573198441675, + "language_loss": 0.84362686, + "learning_rate": 3.953041699372964e-06, + "loss": 0.86531746, + "num_input_tokens_seen": 34640695, + "step": 1614, + "time_per_iteration": 2.736595869064331 + }, + { + "auxiliary_loss_clip": 0.01033561, + "auxiliary_loss_mlp": 0.00749539, + "balance_loss_clip": 1.00651336, + "balance_loss_mlp": 1.00105464, + "epoch": 0.09709905305877048, + "flos": 60443622000000.0, + "grad_norm": 0.7021876337335939, + "language_loss": 0.54653996, + "learning_rate": 3.952957763374992e-06, + "loss": 0.56437099, + "num_input_tokens_seen": 34702395, + "step": 1615, + "time_per_iteration": 3.185386896133423 + }, + { + "auxiliary_loss_clip": 0.00998529, + "auxiliary_loss_mlp": 0.01019604, + "balance_loss_clip": 1.00729752, + "balance_loss_mlp": 1.01674306, + "epoch": 0.09715917631143844, + "flos": 57639932893440.0, + "grad_norm": 0.7766464209911559, + "language_loss": 0.58270776, + "learning_rate": 3.952873753320666e-06, + "loss": 0.60288906, + "num_input_tokens_seen": 34768910, + "step": 1616, + "time_per_iteration": 3.410944700241089 + }, + { + "auxiliary_loss_clip": 0.01115122, + "auxiliary_loss_mlp": 0.01062275, + "balance_loss_clip": 1.04097259, + "balance_loss_mlp": 1.03829014, + "epoch": 0.09721929956410642, + "flos": 20558212986240.0, + "grad_norm": 3.0839144041600903, + "language_loss": 0.6912384, + "learning_rate": 3.952789669213172e-06, + "loss": 0.71301234, + "num_input_tokens_seen": 34787680, + "step": 1617, + "time_per_iteration": 2.8693785667419434 + }, + { + "auxiliary_loss_clip": 0.01115533, + "auxiliary_loss_mlp": 0.01059821, + "balance_loss_clip": 1.03971565, + "balance_loss_mlp": 1.03408325, + "epoch": 0.09727942281677439, + "flos": 27344359825920.0, + "grad_norm": 1.808805719952025, + "language_loss": 0.81139052, + "learning_rate": 3.952705511055698e-06, + "loss": 0.83314407, + "num_input_tokens_seen": 34808330, + "step": 1618, + "time_per_iteration": 2.753112316131592 + }, + { + "auxiliary_loss_clip": 0.01131144, + "auxiliary_loss_mlp": 0.01050935, + "balance_loss_clip": 1.04382849, + "balance_loss_mlp": 1.03045475, + "epoch": 0.09733954606944235, + "flos": 24900028335360.0, + "grad_norm": 1.7469550409460672, + "language_loss": 0.92935574, + "learning_rate": 3.952621278851435e-06, + "loss": 0.95117658, + "num_input_tokens_seen": 34830020, + "step": 1619, + "time_per_iteration": 2.6950724124908447 + }, + { + "auxiliary_loss_clip": 0.01138299, + "auxiliary_loss_mlp": 0.01055476, + "balance_loss_clip": 1.04592597, + "balance_loss_mlp": 1.03361344, + "epoch": 0.09739966932211033, + "flos": 31503928544640.0, + "grad_norm": 1.8767696123289688, + "language_loss": 0.88648319, + "learning_rate": 3.9525369726035784e-06, + "loss": 0.90842092, + "num_input_tokens_seen": 34850330, + "step": 1620, + "time_per_iteration": 2.6489109992980957 + }, + { + "auxiliary_loss_clip": 0.01128322, + "auxiliary_loss_mlp": 0.01068985, + "balance_loss_clip": 1.04808855, + "balance_loss_mlp": 1.04447532, + "epoch": 0.0974597925747783, + "flos": 23878764846720.0, + "grad_norm": 2.431707152926359, + "language_loss": 0.77348578, + "learning_rate": 3.952452592315324e-06, + "loss": 0.79545879, + "num_input_tokens_seen": 34871640, + "step": 1621, + "time_per_iteration": 2.7100675106048584 + }, + { + "auxiliary_loss_clip": 0.01091538, + "auxiliary_loss_mlp": 0.01067172, + "balance_loss_clip": 1.03638494, + "balance_loss_mlp": 1.04359198, + "epoch": 0.09751991582744626, + "flos": 17019575700480.0, + "grad_norm": 2.863229956502501, + "language_loss": 0.77751631, + "learning_rate": 3.952368137989871e-06, + "loss": 0.79910338, + "num_input_tokens_seen": 34888100, + "step": 1622, + "time_per_iteration": 2.682060956954956 + }, + { + "auxiliary_loss_clip": 0.01115737, + "auxiliary_loss_mlp": 0.01060145, + "balance_loss_clip": 1.04358804, + "balance_loss_mlp": 1.03720868, + "epoch": 0.09758003908011423, + "flos": 28402826826240.0, + "grad_norm": 1.8067651500965702, + "language_loss": 0.85722268, + "learning_rate": 3.9522836096304225e-06, + "loss": 0.87898153, + "num_input_tokens_seen": 34910485, + "step": 1623, + "time_per_iteration": 2.729506731033325 + }, + { + "auxiliary_loss_clip": 0.01140732, + "auxiliary_loss_mlp": 0.01063931, + "balance_loss_clip": 1.04472315, + "balance_loss_mlp": 1.04156756, + "epoch": 0.09764016233278221, + "flos": 18144297336960.0, + "grad_norm": 2.073541566105492, + "language_loss": 0.80081427, + "learning_rate": 3.952199007240184e-06, + "loss": 0.8228609, + "num_input_tokens_seen": 34928615, + "step": 1624, + "time_per_iteration": 2.641291379928589 + }, + { + "auxiliary_loss_clip": 0.01138653, + "auxiliary_loss_mlp": 0.01048036, + "balance_loss_clip": 1.04151011, + "balance_loss_mlp": 1.0270915, + "epoch": 0.09770028558545017, + "flos": 15265842071040.0, + "grad_norm": 2.16716672145727, + "language_loss": 0.85857683, + "learning_rate": 3.952114330822364e-06, + "loss": 0.88044369, + "num_input_tokens_seen": 34946045, + "step": 1625, + "time_per_iteration": 2.579291343688965 + }, + { + "auxiliary_loss_clip": 0.0114062, + "auxiliary_loss_mlp": 0.01060898, + "balance_loss_clip": 1.04195809, + "balance_loss_mlp": 1.03910685, + "epoch": 0.09776040883811814, + "flos": 23472435219840.0, + "grad_norm": 1.987293806239168, + "language_loss": 0.85462838, + "learning_rate": 3.952029580380172e-06, + "loss": 0.87664354, + "num_input_tokens_seen": 34962865, + "step": 1626, + "time_per_iteration": 2.594695806503296 + }, + { + "auxiliary_loss_clip": 0.01126185, + "auxiliary_loss_mlp": 0.00749666, + "balance_loss_clip": 1.04180431, + "balance_loss_mlp": 1.00087595, + "epoch": 0.09782053209078612, + "flos": 24499480798080.0, + "grad_norm": 1.9978584899209897, + "language_loss": 0.8324616, + "learning_rate": 3.9519447559168234e-06, + "loss": 0.85122013, + "num_input_tokens_seen": 34983505, + "step": 1627, + "time_per_iteration": 2.629995346069336 + }, + { + "auxiliary_loss_clip": 0.01135352, + "auxiliary_loss_mlp": 0.01053818, + "balance_loss_clip": 1.04435182, + "balance_loss_mlp": 1.03245592, + "epoch": 0.09788065534345408, + "flos": 21580158833280.0, + "grad_norm": 2.1039485031710154, + "language_loss": 0.84148717, + "learning_rate": 3.951859857435534e-06, + "loss": 0.86337894, + "num_input_tokens_seen": 35001825, + "step": 1628, + "time_per_iteration": 2.581037759780884 + }, + { + "auxiliary_loss_clip": 0.01131909, + "auxiliary_loss_mlp": 0.01056552, + "balance_loss_clip": 1.0396148, + "balance_loss_mlp": 1.03485644, + "epoch": 0.09794077859612205, + "flos": 23842459175040.0, + "grad_norm": 1.7123605682148058, + "language_loss": 0.76074964, + "learning_rate": 3.951774884939523e-06, + "loss": 0.78263426, + "num_input_tokens_seen": 35023075, + "step": 1629, + "time_per_iteration": 2.691695213317871 + }, + { + "auxiliary_loss_clip": 0.01104639, + "auxiliary_loss_mlp": 0.01056541, + "balance_loss_clip": 1.049106, + "balance_loss_mlp": 1.0335815, + "epoch": 0.09800090184879003, + "flos": 23659889322240.0, + "grad_norm": 1.7492527357608132, + "language_loss": 0.78052878, + "learning_rate": 3.951689838432013e-06, + "loss": 0.80214059, + "num_input_tokens_seen": 35043480, + "step": 1630, + "time_per_iteration": 2.76007080078125 + }, + { + "auxiliary_loss_clip": 0.01131016, + "auxiliary_loss_mlp": 0.01053314, + "balance_loss_clip": 1.04494643, + "balance_loss_mlp": 1.02971065, + "epoch": 0.09806102510145799, + "flos": 17055773631360.0, + "grad_norm": 1.790510521785142, + "language_loss": 0.86672699, + "learning_rate": 3.951604717916228e-06, + "loss": 0.88857031, + "num_input_tokens_seen": 35061490, + "step": 1631, + "time_per_iteration": 2.7948055267333984 + }, + { + "auxiliary_loss_clip": 0.0113115, + "auxiliary_loss_mlp": 0.01050309, + "balance_loss_clip": 1.044415, + "balance_loss_mlp": 1.02918541, + "epoch": 0.09812114835412596, + "flos": 23878477537920.0, + "grad_norm": 2.2480356172220572, + "language_loss": 0.83205533, + "learning_rate": 3.9515195233953975e-06, + "loss": 0.85386992, + "num_input_tokens_seen": 35079670, + "step": 1632, + "time_per_iteration": 2.693571090698242 + }, + { + "auxiliary_loss_clip": 0.01113859, + "auxiliary_loss_mlp": 0.01057407, + "balance_loss_clip": 1.04204845, + "balance_loss_mlp": 1.03635454, + "epoch": 0.09818127160679392, + "flos": 20595488325120.0, + "grad_norm": 1.656549933595322, + "language_loss": 0.78934979, + "learning_rate": 3.951434254872751e-06, + "loss": 0.81106246, + "num_input_tokens_seen": 35099205, + "step": 1633, + "time_per_iteration": 2.6445062160491943 + }, + { + "auxiliary_loss_clip": 0.01129049, + "auxiliary_loss_mlp": 0.0105643, + "balance_loss_clip": 1.03900313, + "balance_loss_mlp": 1.03476977, + "epoch": 0.0982413948594619, + "flos": 15487339288320.0, + "grad_norm": 2.148308451073305, + "language_loss": 0.72900742, + "learning_rate": 3.951348912351521e-06, + "loss": 0.75086224, + "num_input_tokens_seen": 35115270, + "step": 1634, + "time_per_iteration": 2.660386085510254 + }, + { + "auxiliary_loss_clip": 0.01124552, + "auxiliary_loss_mlp": 0.01065396, + "balance_loss_clip": 1.04036546, + "balance_loss_mlp": 1.04172146, + "epoch": 0.09830151811212987, + "flos": 24207958016640.0, + "grad_norm": 2.8605585959543016, + "language_loss": 0.73287505, + "learning_rate": 3.951263495834947e-06, + "loss": 0.75477457, + "num_input_tokens_seen": 35134065, + "step": 1635, + "time_per_iteration": 2.765482187271118 + }, + { + "auxiliary_loss_clip": 0.01115, + "auxiliary_loss_mlp": 0.01062909, + "balance_loss_clip": 1.04103732, + "balance_loss_mlp": 1.03855479, + "epoch": 0.09836164136479783, + "flos": 20594590485120.0, + "grad_norm": 6.449020115700862, + "language_loss": 0.78166443, + "learning_rate": 3.951178005326264e-06, + "loss": 0.80344355, + "num_input_tokens_seen": 35154870, + "step": 1636, + "time_per_iteration": 2.7239267826080322 + }, + { + "auxiliary_loss_clip": 0.01122158, + "auxiliary_loss_mlp": 0.01056403, + "balance_loss_clip": 1.03964639, + "balance_loss_mlp": 1.03429019, + "epoch": 0.09842176461746581, + "flos": 19934157070080.0, + "grad_norm": 1.9746439959525621, + "language_loss": 0.69848901, + "learning_rate": 3.951092440828715e-06, + "loss": 0.72027463, + "num_input_tokens_seen": 35171850, + "step": 1637, + "time_per_iteration": 2.7168707847595215 + }, + { + "auxiliary_loss_clip": 0.01150243, + "auxiliary_loss_mlp": 0.01057265, + "balance_loss_clip": 1.04332745, + "balance_loss_mlp": 1.03515208, + "epoch": 0.09848188787013377, + "flos": 21214659991680.0, + "grad_norm": 2.659198478231009, + "language_loss": 0.77279258, + "learning_rate": 3.951006802345545e-06, + "loss": 0.79486769, + "num_input_tokens_seen": 35188795, + "step": 1638, + "time_per_iteration": 2.6437342166900635 + }, + { + "auxiliary_loss_clip": 0.01115135, + "auxiliary_loss_mlp": 0.01051271, + "balance_loss_clip": 1.04555941, + "balance_loss_mlp": 1.02919328, + "epoch": 0.09854201112280174, + "flos": 30154226071680.0, + "grad_norm": 1.4529361071814142, + "language_loss": 0.72516423, + "learning_rate": 3.950921089880003e-06, + "loss": 0.74682832, + "num_input_tokens_seen": 35212100, + "step": 1639, + "time_per_iteration": 2.7446932792663574 + }, + { + "auxiliary_loss_clip": 0.01135698, + "auxiliary_loss_mlp": 0.01048311, + "balance_loss_clip": 1.04075241, + "balance_loss_mlp": 1.02654374, + "epoch": 0.09860213437546972, + "flos": 21795730306560.0, + "grad_norm": 1.8331377969924934, + "language_loss": 0.88499105, + "learning_rate": 3.950835303435337e-06, + "loss": 0.90683115, + "num_input_tokens_seen": 35230390, + "step": 1640, + "time_per_iteration": 2.7501401901245117 + }, + { + "auxiliary_loss_clip": 0.01135561, + "auxiliary_loss_mlp": 0.01043986, + "balance_loss_clip": 1.04181743, + "balance_loss_mlp": 1.02223015, + "epoch": 0.09866225762813768, + "flos": 21835555511040.0, + "grad_norm": 1.9790478130035711, + "language_loss": 0.8063339, + "learning_rate": 3.950749443014801e-06, + "loss": 0.82812929, + "num_input_tokens_seen": 35250405, + "step": 1641, + "time_per_iteration": 2.677825450897217 + }, + { + "auxiliary_loss_clip": 0.01137635, + "auxiliary_loss_mlp": 0.01060005, + "balance_loss_clip": 1.0420022, + "balance_loss_mlp": 1.03615165, + "epoch": 0.09872238088080565, + "flos": 17599855916160.0, + "grad_norm": 2.9364998669860043, + "language_loss": 0.85794747, + "learning_rate": 3.95066350862165e-06, + "loss": 0.87992388, + "num_input_tokens_seen": 35262820, + "step": 1642, + "time_per_iteration": 2.544754981994629 + }, + { + "auxiliary_loss_clip": 0.01112014, + "auxiliary_loss_mlp": 0.01056759, + "balance_loss_clip": 1.04090428, + "balance_loss_mlp": 1.03456175, + "epoch": 0.09878250413347361, + "flos": 27636134002560.0, + "grad_norm": 1.5850891243891225, + "language_loss": 0.81067836, + "learning_rate": 3.950577500259144e-06, + "loss": 0.83236605, + "num_input_tokens_seen": 35284490, + "step": 1643, + "time_per_iteration": 2.7197835445404053 + }, + { + "auxiliary_loss_clip": 0.01133752, + "auxiliary_loss_mlp": 0.01071123, + "balance_loss_clip": 1.04054403, + "balance_loss_mlp": 1.04855633, + "epoch": 0.0988426273861416, + "flos": 16544728880640.0, + "grad_norm": 1.8067690947837038, + "language_loss": 0.82526094, + "learning_rate": 3.950491417930543e-06, + "loss": 0.84730965, + "num_input_tokens_seen": 35302815, + "step": 1644, + "time_per_iteration": 4.19950008392334 + }, + { + "auxiliary_loss_clip": 0.01120435, + "auxiliary_loss_mlp": 0.00749632, + "balance_loss_clip": 1.03831434, + "balance_loss_mlp": 1.00083423, + "epoch": 0.09890275063880956, + "flos": 21215270522880.0, + "grad_norm": 1.6501209128998064, + "language_loss": 0.68224788, + "learning_rate": 3.9504052616391124e-06, + "loss": 0.70094854, + "num_input_tokens_seen": 35321175, + "step": 1645, + "time_per_iteration": 2.5611841678619385 + }, + { + "auxiliary_loss_clip": 0.01020439, + "auxiliary_loss_mlp": 0.01005352, + "balance_loss_clip": 1.0053314, + "balance_loss_mlp": 1.00203824, + "epoch": 0.09896287389147752, + "flos": 59379372910080.0, + "grad_norm": 0.8794274544655396, + "language_loss": 0.60856104, + "learning_rate": 3.950319031388119e-06, + "loss": 0.62881899, + "num_input_tokens_seen": 35381740, + "step": 1646, + "time_per_iteration": 3.092470407485962 + }, + { + "auxiliary_loss_clip": 0.01113018, + "auxiliary_loss_mlp": 0.01055938, + "balance_loss_clip": 1.04340088, + "balance_loss_mlp": 1.03138125, + "epoch": 0.0990229971441455, + "flos": 29642678530560.0, + "grad_norm": 1.7223422235912564, + "language_loss": 0.7306422, + "learning_rate": 3.950232727180833e-06, + "loss": 0.75233179, + "num_input_tokens_seen": 35403760, + "step": 1647, + "time_per_iteration": 6.0305867195129395 + }, + { + "auxiliary_loss_clip": 0.01121732, + "auxiliary_loss_mlp": 0.01064877, + "balance_loss_clip": 1.04373622, + "balance_loss_mlp": 1.04380083, + "epoch": 0.09908312039681347, + "flos": 21834873152640.0, + "grad_norm": 2.8884285650813943, + "language_loss": 0.83926809, + "learning_rate": 3.950146349020525e-06, + "loss": 0.86113423, + "num_input_tokens_seen": 35424050, + "step": 1648, + "time_per_iteration": 4.534884452819824 + }, + { + "auxiliary_loss_clip": 0.01029187, + "auxiliary_loss_mlp": 0.0100425, + "balance_loss_clip": 1.00502479, + "balance_loss_mlp": 1.00131726, + "epoch": 0.09914324364948143, + "flos": 57564304807680.0, + "grad_norm": 0.7764029328070517, + "language_loss": 0.55716681, + "learning_rate": 3.950059896910473e-06, + "loss": 0.57750118, + "num_input_tokens_seen": 35481690, + "step": 1649, + "time_per_iteration": 3.148236036300659 + }, + { + "auxiliary_loss_clip": 0.01131918, + "auxiliary_loss_mlp": 0.01045873, + "balance_loss_clip": 1.03867531, + "balance_loss_mlp": 1.02497566, + "epoch": 0.09920336690214941, + "flos": 34123934476800.0, + "grad_norm": 2.5238827978191667, + "language_loss": 0.90521467, + "learning_rate": 3.949973370853954e-06, + "loss": 0.92699254, + "num_input_tokens_seen": 35498635, + "step": 1650, + "time_per_iteration": 2.7521588802337646 + }, + { + "auxiliary_loss_clip": 0.01016531, + "auxiliary_loss_mlp": 0.00749518, + "balance_loss_clip": 1.02158117, + "balance_loss_mlp": 1.00100064, + "epoch": 0.09926349015481738, + "flos": 71216428464000.0, + "grad_norm": 0.8005654441166745, + "language_loss": 0.6374855, + "learning_rate": 3.94988677085425e-06, + "loss": 0.655146, + "num_input_tokens_seen": 35565720, + "step": 1651, + "time_per_iteration": 3.5496206283569336 + }, + { + "auxiliary_loss_clip": 0.01131871, + "auxiliary_loss_mlp": 0.01057285, + "balance_loss_clip": 1.04165196, + "balance_loss_mlp": 1.03484941, + "epoch": 0.09932361340748534, + "flos": 23148700917120.0, + "grad_norm": 2.1391377318753686, + "language_loss": 0.88283682, + "learning_rate": 3.949800096914643e-06, + "loss": 0.90472841, + "num_input_tokens_seen": 35586000, + "step": 1652, + "time_per_iteration": 2.6714837551116943 + }, + { + "auxiliary_loss_clip": 0.0112961, + "auxiliary_loss_mlp": 0.01053855, + "balance_loss_clip": 1.04505634, + "balance_loss_mlp": 1.0322547, + "epoch": 0.09938373666015332, + "flos": 19828651847040.0, + "grad_norm": 1.938779158558018, + "language_loss": 0.82147598, + "learning_rate": 3.949713349038422e-06, + "loss": 0.84331059, + "num_input_tokens_seen": 35604355, + "step": 1653, + "time_per_iteration": 2.7552411556243896 + }, + { + "auxiliary_loss_clip": 0.01137128, + "auxiliary_loss_mlp": 0.00749636, + "balance_loss_clip": 1.04260612, + "balance_loss_mlp": 1.00088, + "epoch": 0.09944385991282129, + "flos": 22090664880000.0, + "grad_norm": 1.7482635438996588, + "language_loss": 0.79335207, + "learning_rate": 3.949626527228875e-06, + "loss": 0.81221974, + "num_input_tokens_seen": 35625495, + "step": 1654, + "time_per_iteration": 2.7216739654541016 + }, + { + "auxiliary_loss_clip": 0.01151153, + "auxiliary_loss_mlp": 0.01057631, + "balance_loss_clip": 1.04805923, + "balance_loss_mlp": 1.03685308, + "epoch": 0.09950398316548925, + "flos": 19828867328640.0, + "grad_norm": 1.7780104212305656, + "language_loss": 0.81423861, + "learning_rate": 3.949539631489295e-06, + "loss": 0.83632648, + "num_input_tokens_seen": 35645030, + "step": 1655, + "time_per_iteration": 2.6220693588256836 + }, + { + "auxiliary_loss_clip": 0.01144538, + "auxiliary_loss_mlp": 0.01052525, + "balance_loss_clip": 1.04159999, + "balance_loss_mlp": 1.03063869, + "epoch": 0.09956410641815722, + "flos": 25003701964800.0, + "grad_norm": 1.9275199895242336, + "language_loss": 0.80988026, + "learning_rate": 3.9494526618229765e-06, + "loss": 0.83185083, + "num_input_tokens_seen": 35664305, + "step": 1656, + "time_per_iteration": 2.616800308227539 + }, + { + "auxiliary_loss_clip": 0.01138756, + "auxiliary_loss_mlp": 0.01063709, + "balance_loss_clip": 1.04631031, + "balance_loss_mlp": 1.04142869, + "epoch": 0.0996242296708252, + "flos": 19317714837120.0, + "grad_norm": 1.626369152596914, + "language_loss": 0.88776612, + "learning_rate": 3.949365618233217e-06, + "loss": 0.90979075, + "num_input_tokens_seen": 35684060, + "step": 1657, + "time_per_iteration": 2.7071452140808105 + }, + { + "auxiliary_loss_clip": 0.0112921, + "auxiliary_loss_mlp": 0.0106219, + "balance_loss_clip": 1.04159284, + "balance_loss_mlp": 1.03810978, + "epoch": 0.09968435292349316, + "flos": 21871609787520.0, + "grad_norm": 2.223883508401783, + "language_loss": 0.84997141, + "learning_rate": 3.9492785007233195e-06, + "loss": 0.87188542, + "num_input_tokens_seen": 35703250, + "step": 1658, + "time_per_iteration": 2.6827056407928467 + }, + { + "auxiliary_loss_clip": 0.0103948, + "auxiliary_loss_mlp": 0.01007784, + "balance_loss_clip": 1.0054884, + "balance_loss_mlp": 1.00506639, + "epoch": 0.09974447617616113, + "flos": 65384533313280.0, + "grad_norm": 0.9138241381354066, + "language_loss": 0.60851586, + "learning_rate": 3.949191309296585e-06, + "loss": 0.6289885, + "num_input_tokens_seen": 35762165, + "step": 1659, + "time_per_iteration": 3.1514980792999268 + }, + { + "auxiliary_loss_clip": 0.01119331, + "auxiliary_loss_mlp": 0.0105842, + "balance_loss_clip": 1.03996134, + "balance_loss_mlp": 1.03509033, + "epoch": 0.0998045994288291, + "flos": 23659817495040.0, + "grad_norm": 2.0343304002774696, + "language_loss": 0.85259974, + "learning_rate": 3.949104043956321e-06, + "loss": 0.87437725, + "num_input_tokens_seen": 35781520, + "step": 1660, + "time_per_iteration": 2.724780559539795 + }, + { + "auxiliary_loss_clip": 0.01113167, + "auxiliary_loss_mlp": 0.01063356, + "balance_loss_clip": 1.0416553, + "balance_loss_mlp": 1.03932381, + "epoch": 0.09986472268149707, + "flos": 19609704495360.0, + "grad_norm": 2.1032802883330324, + "language_loss": 0.8003664, + "learning_rate": 3.949016704705836e-06, + "loss": 0.82213163, + "num_input_tokens_seen": 35799565, + "step": 1661, + "time_per_iteration": 2.7084543704986572 + }, + { + "auxiliary_loss_clip": 0.01137828, + "auxiliary_loss_mlp": 0.01058329, + "balance_loss_clip": 1.0438149, + "balance_loss_mlp": 1.0343914, + "epoch": 0.09992484593416504, + "flos": 26213317395840.0, + "grad_norm": 2.10125505166436, + "language_loss": 0.83725375, + "learning_rate": 3.948929291548443e-06, + "loss": 0.85921526, + "num_input_tokens_seen": 35821085, + "step": 1662, + "time_per_iteration": 2.7144813537597656 + }, + { + "auxiliary_loss_clip": 0.01119144, + "auxiliary_loss_mlp": 0.0106598, + "balance_loss_clip": 1.03997278, + "balance_loss_mlp": 1.04071975, + "epoch": 0.09998496918683301, + "flos": 17493632421120.0, + "grad_norm": 1.939572377834587, + "language_loss": 0.89081967, + "learning_rate": 3.9488418044874546e-06, + "loss": 0.91267091, + "num_input_tokens_seen": 35839840, + "step": 1663, + "time_per_iteration": 2.5962531566619873 + }, + { + "auxiliary_loss_clip": 0.01144501, + "auxiliary_loss_mlp": 0.01056516, + "balance_loss_clip": 1.04616022, + "balance_loss_mlp": 1.03230476, + "epoch": 0.10004509243950098, + "flos": 22784925928320.0, + "grad_norm": 1.7062837868677942, + "language_loss": 0.70343781, + "learning_rate": 3.948754243526191e-06, + "loss": 0.72544801, + "num_input_tokens_seen": 35861545, + "step": 1664, + "time_per_iteration": 2.616893768310547 + }, + { + "auxiliary_loss_clip": 0.01113798, + "auxiliary_loss_mlp": 0.01054666, + "balance_loss_clip": 1.04454279, + "balance_loss_mlp": 1.03094387, + "epoch": 0.10010521569216894, + "flos": 16253385667200.0, + "grad_norm": 2.731660568486488, + "language_loss": 0.78656423, + "learning_rate": 3.94866660866797e-06, + "loss": 0.80824888, + "num_input_tokens_seen": 35878295, + "step": 1665, + "time_per_iteration": 2.6239781379699707 + }, + { + "auxiliary_loss_clip": 0.01143861, + "auxiliary_loss_mlp": 0.01066629, + "balance_loss_clip": 1.05042839, + "balance_loss_mlp": 1.04381299, + "epoch": 0.10016533894483691, + "flos": 23402589223680.0, + "grad_norm": 2.2386957216193033, + "language_loss": 0.69870502, + "learning_rate": 3.9485788999161165e-06, + "loss": 0.72080994, + "num_input_tokens_seen": 35898990, + "step": 1666, + "time_per_iteration": 2.6037240028381348 + }, + { + "auxiliary_loss_clip": 0.01083725, + "auxiliary_loss_mlp": 0.01069112, + "balance_loss_clip": 1.04406452, + "balance_loss_mlp": 1.0433743, + "epoch": 0.10022546219750489, + "flos": 19354164163200.0, + "grad_norm": 2.18011276714654, + "language_loss": 0.78806269, + "learning_rate": 3.948491117273956e-06, + "loss": 0.80959105, + "num_input_tokens_seen": 35916225, + "step": 1667, + "time_per_iteration": 2.8048529624938965 + }, + { + "auxiliary_loss_clip": 0.01115303, + "auxiliary_loss_mlp": 0.01058778, + "balance_loss_clip": 1.04063058, + "balance_loss_mlp": 1.03399408, + "epoch": 0.10028558545017285, + "flos": 27085766837760.0, + "grad_norm": 2.10942879753086, + "language_loss": 0.77117872, + "learning_rate": 3.948403260744817e-06, + "loss": 0.79291952, + "num_input_tokens_seen": 35934630, + "step": 1668, + "time_per_iteration": 2.7591323852539062 + }, + { + "auxiliary_loss_clip": 0.01151547, + "auxiliary_loss_mlp": 0.01054063, + "balance_loss_clip": 1.04516006, + "balance_loss_mlp": 1.02975631, + "epoch": 0.10034570870284082, + "flos": 25847136195840.0, + "grad_norm": 1.7089442843158424, + "language_loss": 0.77782249, + "learning_rate": 3.948315330332031e-06, + "loss": 0.7998786, + "num_input_tokens_seen": 35953855, + "step": 1669, + "time_per_iteration": 2.6022439002990723 + }, + { + "auxiliary_loss_clip": 0.01158099, + "auxiliary_loss_mlp": 0.010674, + "balance_loss_clip": 1.04716229, + "balance_loss_mlp": 1.04233027, + "epoch": 0.1004058319555088, + "flos": 26249587153920.0, + "grad_norm": 2.35599271286955, + "language_loss": 0.85384601, + "learning_rate": 3.948227326038933e-06, + "loss": 0.87610102, + "num_input_tokens_seen": 35974555, + "step": 1670, + "time_per_iteration": 2.718351364135742 + }, + { + "auxiliary_loss_clip": 0.01144526, + "auxiliary_loss_mlp": 0.01057145, + "balance_loss_clip": 1.04281712, + "balance_loss_mlp": 1.03476906, + "epoch": 0.10046595520817676, + "flos": 25374480105600.0, + "grad_norm": 1.5299951658838857, + "language_loss": 0.76795167, + "learning_rate": 3.9481392478688586e-06, + "loss": 0.78996837, + "num_input_tokens_seen": 35996830, + "step": 1671, + "time_per_iteration": 2.679985761642456 + }, + { + "auxiliary_loss_clip": 0.01030286, + "auxiliary_loss_mlp": 0.01002897, + "balance_loss_clip": 1.00511003, + "balance_loss_mlp": 1.00001228, + "epoch": 0.10052607846084473, + "flos": 67461821677440.0, + "grad_norm": 0.7702269438435534, + "language_loss": 0.60762882, + "learning_rate": 3.948051095825149e-06, + "loss": 0.62796062, + "num_input_tokens_seen": 36054465, + "step": 1672, + "time_per_iteration": 3.225782632827759 + }, + { + "auxiliary_loss_clip": 0.01112513, + "auxiliary_loss_mlp": 0.01061499, + "balance_loss_clip": 1.04011345, + "balance_loss_mlp": 1.0375973, + "epoch": 0.10058620171351271, + "flos": 21360493209600.0, + "grad_norm": 6.643402385918192, + "language_loss": 0.77313566, + "learning_rate": 3.947962869911147e-06, + "loss": 0.79487568, + "num_input_tokens_seen": 36073480, + "step": 1673, + "time_per_iteration": 2.732595205307007 + }, + { + "auxiliary_loss_clip": 0.0109777, + "auxiliary_loss_mlp": 0.01058755, + "balance_loss_clip": 1.03819656, + "balance_loss_mlp": 1.0343771, + "epoch": 0.10064632496618067, + "flos": 16800125558400.0, + "grad_norm": 2.3682511047293264, + "language_loss": 0.73533785, + "learning_rate": 3.947874570130197e-06, + "loss": 0.75690311, + "num_input_tokens_seen": 36091830, + "step": 1674, + "time_per_iteration": 2.686027765274048 + }, + { + "auxiliary_loss_clip": 0.01139491, + "auxiliary_loss_mlp": 0.00749682, + "balance_loss_clip": 1.04254889, + "balance_loss_mlp": 1.00080812, + "epoch": 0.10070644821884864, + "flos": 23624445576960.0, + "grad_norm": 1.7711053095559748, + "language_loss": 0.79274035, + "learning_rate": 3.947786196485649e-06, + "loss": 0.81163204, + "num_input_tokens_seen": 36111400, + "step": 1675, + "time_per_iteration": 2.653374433517456 + }, + { + "auxiliary_loss_clip": 0.01147472, + "auxiliary_loss_mlp": 0.01065185, + "balance_loss_clip": 1.04316139, + "balance_loss_mlp": 1.0432868, + "epoch": 0.1007665714715166, + "flos": 24462564595200.0, + "grad_norm": 2.4102227206848053, + "language_loss": 0.81637061, + "learning_rate": 3.947697748980853e-06, + "loss": 0.83849722, + "num_input_tokens_seen": 36129345, + "step": 1676, + "time_per_iteration": 2.5967812538146973 + }, + { + "auxiliary_loss_clip": 0.0114165, + "auxiliary_loss_mlp": 0.0106385, + "balance_loss_clip": 1.04593611, + "balance_loss_mlp": 1.04137933, + "epoch": 0.10082669472418458, + "flos": 16799119977600.0, + "grad_norm": 2.273504457645246, + "language_loss": 0.86315584, + "learning_rate": 3.947609227619163e-06, + "loss": 0.88521087, + "num_input_tokens_seen": 36146255, + "step": 1677, + "time_per_iteration": 2.6352851390838623 + }, + { + "auxiliary_loss_clip": 0.0112702, + "auxiliary_loss_mlp": 0.0105549, + "balance_loss_clip": 1.042696, + "balance_loss_mlp": 1.03318596, + "epoch": 0.10088681797685255, + "flos": 13553513844480.0, + "grad_norm": 2.2844510483132994, + "language_loss": 0.86396331, + "learning_rate": 3.947520632403936e-06, + "loss": 0.88578844, + "num_input_tokens_seen": 36164050, + "step": 1678, + "time_per_iteration": 2.797865629196167 + }, + { + "auxiliary_loss_clip": 0.01123034, + "auxiliary_loss_mlp": 0.01053218, + "balance_loss_clip": 1.04307055, + "balance_loss_mlp": 1.03041327, + "epoch": 0.10094694122952051, + "flos": 25265706744960.0, + "grad_norm": 1.919862105943344, + "language_loss": 0.89717948, + "learning_rate": 3.947431963338532e-06, + "loss": 0.91894197, + "num_input_tokens_seen": 36183530, + "step": 1679, + "time_per_iteration": 2.6834194660186768 + }, + { + "auxiliary_loss_clip": 0.01038602, + "auxiliary_loss_mlp": 0.01003484, + "balance_loss_clip": 1.00447655, + "balance_loss_mlp": 1.0007422, + "epoch": 0.10100706448218849, + "flos": 69854299885440.0, + "grad_norm": 0.7820039857135154, + "language_loss": 0.52967548, + "learning_rate": 3.947343220426312e-06, + "loss": 0.55009627, + "num_input_tokens_seen": 36248550, + "step": 1680, + "time_per_iteration": 3.1852478981018066 + }, + { + "auxiliary_loss_clip": 0.01150163, + "auxiliary_loss_mlp": 0.00749596, + "balance_loss_clip": 1.04652071, + "balance_loss_mlp": 1.00067592, + "epoch": 0.10106718773485646, + "flos": 20007163463040.0, + "grad_norm": 1.6093639040691143, + "language_loss": 0.77170545, + "learning_rate": 3.947254403670641e-06, + "loss": 0.79070306, + "num_input_tokens_seen": 36266065, + "step": 1681, + "time_per_iteration": 2.6181397438049316 + }, + { + "auxiliary_loss_clip": 0.0111728, + "auxiliary_loss_mlp": 0.01060362, + "balance_loss_clip": 1.03981924, + "balance_loss_mlp": 1.03381371, + "epoch": 0.10112731098752442, + "flos": 13479825093120.0, + "grad_norm": 2.1373222800574423, + "language_loss": 0.938869, + "learning_rate": 3.947165513074889e-06, + "loss": 0.9606455, + "num_input_tokens_seen": 36280960, + "step": 1682, + "time_per_iteration": 2.628868579864502 + }, + { + "auxiliary_loss_clip": 0.01135466, + "auxiliary_loss_mlp": 0.01052129, + "balance_loss_clip": 1.04129553, + "balance_loss_mlp": 1.03009868, + "epoch": 0.1011874342401924, + "flos": 18515901490560.0, + "grad_norm": 1.8836909592917381, + "language_loss": 0.87823749, + "learning_rate": 3.947076548642425e-06, + "loss": 0.90011346, + "num_input_tokens_seen": 36299010, + "step": 1683, + "time_per_iteration": 2.5889644622802734 + }, + { + "auxiliary_loss_clip": 0.01090893, + "auxiliary_loss_mlp": 0.01060193, + "balance_loss_clip": 1.03805137, + "balance_loss_mlp": 1.03664923, + "epoch": 0.10124755749286037, + "flos": 20702861055360.0, + "grad_norm": 1.6749178988985007, + "language_loss": 0.74700755, + "learning_rate": 3.946987510376624e-06, + "loss": 0.76851845, + "num_input_tokens_seen": 36318400, + "step": 1684, + "time_per_iteration": 2.7436013221740723 + }, + { + "auxiliary_loss_clip": 0.01027455, + "auxiliary_loss_mlp": 0.01011333, + "balance_loss_clip": 1.01265097, + "balance_loss_mlp": 1.00868702, + "epoch": 0.10130768074552833, + "flos": 56109456247680.0, + "grad_norm": 0.7629072506175254, + "language_loss": 0.61096394, + "learning_rate": 3.9468983982808615e-06, + "loss": 0.63135183, + "num_input_tokens_seen": 36381815, + "step": 1685, + "time_per_iteration": 3.284489154815674 + }, + { + "auxiliary_loss_clip": 0.01122541, + "auxiliary_loss_mlp": 0.01057597, + "balance_loss_clip": 1.04061973, + "balance_loss_mlp": 1.03425622, + "epoch": 0.1013678039981963, + "flos": 33402346156800.0, + "grad_norm": 3.466512152030363, + "language_loss": 0.62171084, + "learning_rate": 3.946809212358516e-06, + "loss": 0.64351225, + "num_input_tokens_seen": 36404320, + "step": 1686, + "time_per_iteration": 2.714216947555542 + }, + { + "auxiliary_loss_clip": 0.01110401, + "auxiliary_loss_mlp": 0.01057148, + "balance_loss_clip": 1.04424, + "balance_loss_mlp": 1.03341317, + "epoch": 0.10142792725086427, + "flos": 31905338008320.0, + "grad_norm": 1.998481355656295, + "language_loss": 0.81346989, + "learning_rate": 3.946719952612972e-06, + "loss": 0.83514535, + "num_input_tokens_seen": 36427510, + "step": 1687, + "time_per_iteration": 2.84625244140625 + }, + { + "auxiliary_loss_clip": 0.01137989, + "auxiliary_loss_mlp": 0.01056161, + "balance_loss_clip": 1.04455936, + "balance_loss_mlp": 1.03344011, + "epoch": 0.10148805050353224, + "flos": 28475905046400.0, + "grad_norm": 1.8275144156196035, + "language_loss": 0.7173394, + "learning_rate": 3.94663061904761e-06, + "loss": 0.73928088, + "num_input_tokens_seen": 36448230, + "step": 1688, + "time_per_iteration": 2.6805434226989746 + }, + { + "auxiliary_loss_clip": 0.0111582, + "auxiliary_loss_mlp": 0.01059616, + "balance_loss_clip": 1.04084885, + "balance_loss_mlp": 1.03650153, + "epoch": 0.1015481737562002, + "flos": 25148888737920.0, + "grad_norm": 2.074252463897756, + "language_loss": 0.86416411, + "learning_rate": 3.94654121166582e-06, + "loss": 0.88591844, + "num_input_tokens_seen": 36464395, + "step": 1689, + "time_per_iteration": 2.6300137042999268 + }, + { + "auxiliary_loss_clip": 0.01137098, + "auxiliary_loss_mlp": 0.01049254, + "balance_loss_clip": 1.04130185, + "balance_loss_mlp": 1.0279634, + "epoch": 0.10160829700886818, + "flos": 30882781630080.0, + "grad_norm": 1.8903196419903396, + "language_loss": 0.87829638, + "learning_rate": 3.946451730470993e-06, + "loss": 0.9001599, + "num_input_tokens_seen": 36486475, + "step": 1690, + "time_per_iteration": 2.658613681793213 + }, + { + "auxiliary_loss_clip": 0.01126757, + "auxiliary_loss_mlp": 0.01056251, + "balance_loss_clip": 1.04269898, + "balance_loss_mlp": 1.03264737, + "epoch": 0.10166842026153615, + "flos": 20412020632320.0, + "grad_norm": 2.0444327999249317, + "language_loss": 0.83538437, + "learning_rate": 3.946362175466521e-06, + "loss": 0.85721439, + "num_input_tokens_seen": 36505310, + "step": 1691, + "time_per_iteration": 2.605757713317871 + }, + { + "auxiliary_loss_clip": 0.01129071, + "auxiliary_loss_mlp": 0.01055721, + "balance_loss_clip": 1.04316664, + "balance_loss_mlp": 1.0328927, + "epoch": 0.10172854351420411, + "flos": 33476968661760.0, + "grad_norm": 1.6552311026497266, + "language_loss": 0.66817594, + "learning_rate": 3.946272546655801e-06, + "loss": 0.6900239, + "num_input_tokens_seen": 36529820, + "step": 1692, + "time_per_iteration": 4.416239500045776 + }, + { + "auxiliary_loss_clip": 0.01108945, + "auxiliary_loss_mlp": 0.01076018, + "balance_loss_clip": 1.03879333, + "balance_loss_mlp": 1.05235505, + "epoch": 0.1017886667668721, + "flos": 23550325862400.0, + "grad_norm": 2.0706088701441168, + "language_loss": 0.75825977, + "learning_rate": 3.94618284404223e-06, + "loss": 0.78010947, + "num_input_tokens_seen": 36549000, + "step": 1693, + "time_per_iteration": 2.736626386642456 + }, + { + "auxiliary_loss_clip": 0.01090571, + "auxiliary_loss_mlp": 0.01053838, + "balance_loss_clip": 1.03715432, + "balance_loss_mlp": 1.02886331, + "epoch": 0.10184879001954006, + "flos": 23296078419840.0, + "grad_norm": 1.6528679404160915, + "language_loss": 0.87141025, + "learning_rate": 3.9460930676292105e-06, + "loss": 0.89285439, + "num_input_tokens_seen": 36567515, + "step": 1694, + "time_per_iteration": 4.507707357406616 + }, + { + "auxiliary_loss_clip": 0.01094286, + "auxiliary_loss_mlp": 0.01058838, + "balance_loss_clip": 1.03770804, + "balance_loss_mlp": 1.034531, + "epoch": 0.10190891327220802, + "flos": 18333116156160.0, + "grad_norm": 1.8911537428156218, + "language_loss": 0.79470098, + "learning_rate": 3.946003217420147e-06, + "loss": 0.8162322, + "num_input_tokens_seen": 36586190, + "step": 1695, + "time_per_iteration": 5.885903596878052 + }, + { + "auxiliary_loss_clip": 0.01095126, + "auxiliary_loss_mlp": 0.01065065, + "balance_loss_clip": 1.03712893, + "balance_loss_mlp": 1.04164064, + "epoch": 0.10196903652487599, + "flos": 26465374108800.0, + "grad_norm": 1.7106481753126301, + "language_loss": 0.86492324, + "learning_rate": 3.945913293418447e-06, + "loss": 0.88652515, + "num_input_tokens_seen": 36607495, + "step": 1696, + "time_per_iteration": 2.8203537464141846 + }, + { + "auxiliary_loss_clip": 0.01128388, + "auxiliary_loss_mlp": 0.01060199, + "balance_loss_clip": 1.04144907, + "balance_loss_mlp": 1.03758454, + "epoch": 0.10202915977754397, + "flos": 21869526798720.0, + "grad_norm": 1.7789972373832839, + "language_loss": 0.82131177, + "learning_rate": 3.945823295627519e-06, + "loss": 0.84319758, + "num_input_tokens_seen": 36628555, + "step": 1697, + "time_per_iteration": 2.6840426921844482 + }, + { + "auxiliary_loss_clip": 0.0114834, + "auxiliary_loss_mlp": 0.01053472, + "balance_loss_clip": 1.04263926, + "balance_loss_mlp": 1.03035784, + "epoch": 0.10208928303021193, + "flos": 22309755886080.0, + "grad_norm": 3.529483295316622, + "language_loss": 0.80766964, + "learning_rate": 3.9457332240507775e-06, + "loss": 0.82968783, + "num_input_tokens_seen": 36646250, + "step": 1698, + "time_per_iteration": 2.640146255493164 + }, + { + "auxiliary_loss_clip": 0.01111531, + "auxiliary_loss_mlp": 0.01051493, + "balance_loss_clip": 1.04253376, + "balance_loss_mlp": 1.02966547, + "epoch": 0.1021494062828799, + "flos": 22125569921280.0, + "grad_norm": 2.543262603228418, + "language_loss": 0.76284468, + "learning_rate": 3.945643078691637e-06, + "loss": 0.78447497, + "num_input_tokens_seen": 36666675, + "step": 1699, + "time_per_iteration": 2.6648449897766113 + }, + { + "auxiliary_loss_clip": 0.0112544, + "auxiliary_loss_mlp": 0.01056016, + "balance_loss_clip": 1.04362202, + "balance_loss_mlp": 1.03347373, + "epoch": 0.10220952953554788, + "flos": 19646728439040.0, + "grad_norm": 2.11531519465129, + "language_loss": 0.80208385, + "learning_rate": 3.945552859553516e-06, + "loss": 0.82389838, + "num_input_tokens_seen": 36685225, + "step": 1700, + "time_per_iteration": 2.6166367530822754 + }, + { + "auxiliary_loss_clip": 0.01136575, + "auxiliary_loss_mlp": 0.01052934, + "balance_loss_clip": 1.04303598, + "balance_loss_mlp": 1.030761, + "epoch": 0.10226965278821584, + "flos": 29787290686080.0, + "grad_norm": 1.8574113864527388, + "language_loss": 0.76990652, + "learning_rate": 3.945462566639836e-06, + "loss": 0.79180163, + "num_input_tokens_seen": 36705985, + "step": 1701, + "time_per_iteration": 2.6643495559692383 + }, + { + "auxiliary_loss_clip": 0.01143574, + "auxiliary_loss_mlp": 0.01053714, + "balance_loss_clip": 1.04612851, + "balance_loss_mlp": 1.03094554, + "epoch": 0.10232977604088381, + "flos": 27016818681600.0, + "grad_norm": 2.0117624080296244, + "language_loss": 0.7804898, + "learning_rate": 3.945372199954019e-06, + "loss": 0.8024627, + "num_input_tokens_seen": 36725815, + "step": 1702, + "time_per_iteration": 2.6567108631134033 + }, + { + "auxiliary_loss_clip": 0.01120265, + "auxiliary_loss_mlp": 0.0105307, + "balance_loss_clip": 1.04103768, + "balance_loss_mlp": 1.03201759, + "epoch": 0.10238989929355179, + "flos": 20777519473920.0, + "grad_norm": 2.494198029365681, + "language_loss": 0.94560385, + "learning_rate": 3.945281759499494e-06, + "loss": 0.96733725, + "num_input_tokens_seen": 36742345, + "step": 1703, + "time_per_iteration": 2.623626708984375 + }, + { + "auxiliary_loss_clip": 0.00997305, + "auxiliary_loss_mlp": 0.01003435, + "balance_loss_clip": 1.00889444, + "balance_loss_mlp": 1.00033534, + "epoch": 0.10245002254621975, + "flos": 57698322451200.0, + "grad_norm": 0.8863983206313294, + "language_loss": 0.55009377, + "learning_rate": 3.94519124527969e-06, + "loss": 0.57010114, + "num_input_tokens_seen": 36798775, + "step": 1704, + "time_per_iteration": 3.2022879123687744 + }, + { + "auxiliary_loss_clip": 0.01146237, + "auxiliary_loss_mlp": 0.01053041, + "balance_loss_clip": 1.0434432, + "balance_loss_mlp": 1.03068972, + "epoch": 0.10251014579888772, + "flos": 16800125558400.0, + "grad_norm": 2.2812526261440063, + "language_loss": 0.84052455, + "learning_rate": 3.945100657298039e-06, + "loss": 0.86251736, + "num_input_tokens_seen": 36816295, + "step": 1705, + "time_per_iteration": 2.5826079845428467 + }, + { + "auxiliary_loss_clip": 0.01034511, + "auxiliary_loss_mlp": 0.01028439, + "balance_loss_clip": 1.02128899, + "balance_loss_mlp": 1.02450526, + "epoch": 0.1025702690515557, + "flos": 68565500922240.0, + "grad_norm": 0.7708903991813096, + "language_loss": 0.60402983, + "learning_rate": 3.9450099955579765e-06, + "loss": 0.6246593, + "num_input_tokens_seen": 36882030, + "step": 1706, + "time_per_iteration": 3.266179084777832 + }, + { + "auxiliary_loss_clip": 0.01109748, + "auxiliary_loss_mlp": 0.01050828, + "balance_loss_clip": 1.04076767, + "balance_loss_mlp": 1.02844071, + "epoch": 0.10263039230422366, + "flos": 14866623336960.0, + "grad_norm": 2.1458022465556583, + "language_loss": 0.86358738, + "learning_rate": 3.94491926006294e-06, + "loss": 0.88519311, + "num_input_tokens_seen": 36899245, + "step": 1707, + "time_per_iteration": 2.814596176147461 + }, + { + "auxiliary_loss_clip": 0.01135412, + "auxiliary_loss_mlp": 0.01050066, + "balance_loss_clip": 1.04596734, + "balance_loss_mlp": 1.02907336, + "epoch": 0.10269051555689163, + "flos": 25337599816320.0, + "grad_norm": 1.5249041127890097, + "language_loss": 0.72849679, + "learning_rate": 3.944828450816369e-06, + "loss": 0.75035155, + "num_input_tokens_seen": 36920950, + "step": 1708, + "time_per_iteration": 2.6484365463256836 + }, + { + "auxiliary_loss_clip": 0.01122464, + "auxiliary_loss_mlp": 0.00749577, + "balance_loss_clip": 1.04396415, + "balance_loss_mlp": 1.00078249, + "epoch": 0.10275063880955959, + "flos": 21068826773760.0, + "grad_norm": 1.7028804922008076, + "language_loss": 0.91098058, + "learning_rate": 3.944737567821709e-06, + "loss": 0.92970097, + "num_input_tokens_seen": 36938900, + "step": 1709, + "time_per_iteration": 2.824906826019287 + }, + { + "auxiliary_loss_clip": 0.01086519, + "auxiliary_loss_mlp": 0.01054907, + "balance_loss_clip": 1.04079282, + "balance_loss_mlp": 1.03219795, + "epoch": 0.10281076206222757, + "flos": 30366780802560.0, + "grad_norm": 1.8048320066798091, + "language_loss": 0.88317871, + "learning_rate": 3.944646611082406e-06, + "loss": 0.90459293, + "num_input_tokens_seen": 36957010, + "step": 1710, + "time_per_iteration": 2.8969314098358154 + }, + { + "auxiliary_loss_clip": 0.01130708, + "auxiliary_loss_mlp": 0.01062088, + "balance_loss_clip": 1.04182649, + "balance_loss_mlp": 1.04013014, + "epoch": 0.10287088531489554, + "flos": 22418313765120.0, + "grad_norm": 1.8661179563898984, + "language_loss": 0.79077065, + "learning_rate": 3.944555580601908e-06, + "loss": 0.8126986, + "num_input_tokens_seen": 36977690, + "step": 1711, + "time_per_iteration": 2.6842050552368164 + }, + { + "auxiliary_loss_clip": 0.0112259, + "auxiliary_loss_mlp": 0.01057393, + "balance_loss_clip": 1.04623342, + "balance_loss_mlp": 1.03488648, + "epoch": 0.1029310085675635, + "flos": 25115994858240.0, + "grad_norm": 1.9299034150693042, + "language_loss": 0.73667538, + "learning_rate": 3.944464476383668e-06, + "loss": 0.75847518, + "num_input_tokens_seen": 36997300, + "step": 1712, + "time_per_iteration": 2.7346713542938232 + }, + { + "auxiliary_loss_clip": 0.01103822, + "auxiliary_loss_mlp": 0.01066493, + "balance_loss_clip": 1.04592204, + "balance_loss_mlp": 1.04420102, + "epoch": 0.10299113182023148, + "flos": 19865639877120.0, + "grad_norm": 1.7849626918532393, + "language_loss": 0.86918533, + "learning_rate": 3.94437329843114e-06, + "loss": 0.89088851, + "num_input_tokens_seen": 37016110, + "step": 1713, + "time_per_iteration": 2.723792791366577 + }, + { + "auxiliary_loss_clip": 0.01132281, + "auxiliary_loss_mlp": 0.01064062, + "balance_loss_clip": 1.04166722, + "balance_loss_mlp": 1.04389167, + "epoch": 0.10305125507289944, + "flos": 20447608032000.0, + "grad_norm": 1.5888353019798334, + "language_loss": 0.72437203, + "learning_rate": 3.944282046747782e-06, + "loss": 0.74633545, + "num_input_tokens_seen": 37036405, + "step": 1714, + "time_per_iteration": 2.65096116065979 + }, + { + "auxiliary_loss_clip": 0.01137737, + "auxiliary_loss_mlp": 0.01061262, + "balance_loss_clip": 1.04352403, + "balance_loss_mlp": 1.03863621, + "epoch": 0.10311137832556741, + "flos": 26250772302720.0, + "grad_norm": 1.980264774047044, + "language_loss": 0.90894401, + "learning_rate": 3.944190721337053e-06, + "loss": 0.93093407, + "num_input_tokens_seen": 37057580, + "step": 1715, + "time_per_iteration": 2.8235554695129395 + }, + { + "auxiliary_loss_clip": 0.01133026, + "auxiliary_loss_mlp": 0.01068556, + "balance_loss_clip": 1.0417136, + "balance_loss_mlp": 1.04551244, + "epoch": 0.10317150157823539, + "flos": 35298932175360.0, + "grad_norm": 1.8375623380777433, + "language_loss": 0.76163018, + "learning_rate": 3.944099322202418e-06, + "loss": 0.78364605, + "num_input_tokens_seen": 37079120, + "step": 1716, + "time_per_iteration": 2.7621145248413086 + }, + { + "auxiliary_loss_clip": 0.01123538, + "auxiliary_loss_mlp": 0.01078323, + "balance_loss_clip": 1.04082751, + "balance_loss_mlp": 1.05506492, + "epoch": 0.10323162483090335, + "flos": 25739943033600.0, + "grad_norm": 2.0371852176346588, + "language_loss": 0.85544485, + "learning_rate": 3.944007849347342e-06, + "loss": 0.87746346, + "num_input_tokens_seen": 37099710, + "step": 1717, + "time_per_iteration": 2.793485403060913 + }, + { + "auxiliary_loss_clip": 0.01084975, + "auxiliary_loss_mlp": 0.01070225, + "balance_loss_clip": 1.03724539, + "balance_loss_mlp": 1.04675305, + "epoch": 0.10329174808357132, + "flos": 16289870906880.0, + "grad_norm": 3.423831776541767, + "language_loss": 0.82716131, + "learning_rate": 3.943916302775292e-06, + "loss": 0.84871328, + "num_input_tokens_seen": 37117775, + "step": 1718, + "time_per_iteration": 2.7096028327941895 + }, + { + "auxiliary_loss_clip": 0.01137311, + "auxiliary_loss_mlp": 0.01054624, + "balance_loss_clip": 1.04631305, + "balance_loss_mlp": 1.03236771, + "epoch": 0.10335187133623928, + "flos": 36687166963200.0, + "grad_norm": 1.7368194681651534, + "language_loss": 0.72970098, + "learning_rate": 3.943824682489742e-06, + "loss": 0.75162029, + "num_input_tokens_seen": 37140280, + "step": 1719, + "time_per_iteration": 2.739509105682373 + }, + { + "auxiliary_loss_clip": 0.01136442, + "auxiliary_loss_mlp": 0.01047537, + "balance_loss_clip": 1.04689038, + "balance_loss_mlp": 1.02671099, + "epoch": 0.10341199458890726, + "flos": 14975648092800.0, + "grad_norm": 1.8688523817737923, + "language_loss": 0.92898542, + "learning_rate": 3.9437329884941665e-06, + "loss": 0.95082521, + "num_input_tokens_seen": 37158350, + "step": 1720, + "time_per_iteration": 2.5771613121032715 + }, + { + "auxiliary_loss_clip": 0.01103086, + "auxiliary_loss_mlp": 0.0105265, + "balance_loss_clip": 1.03887606, + "balance_loss_mlp": 1.03027439, + "epoch": 0.10347211784157523, + "flos": 21031587348480.0, + "grad_norm": 1.8892445200294947, + "language_loss": 0.78894222, + "learning_rate": 3.943641220792039e-06, + "loss": 0.81049955, + "num_input_tokens_seen": 37177120, + "step": 1721, + "time_per_iteration": 2.6645569801330566 + }, + { + "auxiliary_loss_clip": 0.01095558, + "auxiliary_loss_mlp": 0.0106644, + "balance_loss_clip": 1.04180896, + "balance_loss_mlp": 1.0398922, + "epoch": 0.1035322410942432, + "flos": 19792094780160.0, + "grad_norm": 1.7249355244243056, + "language_loss": 0.80666035, + "learning_rate": 3.9435493793868434e-06, + "loss": 0.82828033, + "num_input_tokens_seen": 37195895, + "step": 1722, + "time_per_iteration": 2.6809768676757812 + }, + { + "auxiliary_loss_clip": 0.01023855, + "auxiliary_loss_mlp": 0.01044476, + "balance_loss_clip": 1.01010323, + "balance_loss_mlp": 1.04181802, + "epoch": 0.10359236434691117, + "flos": 52698874947840.0, + "grad_norm": 0.965392381858129, + "language_loss": 0.67217183, + "learning_rate": 3.943457464282059e-06, + "loss": 0.69285512, + "num_input_tokens_seen": 37247270, + "step": 1723, + "time_per_iteration": 3.0165443420410156 + }, + { + "auxiliary_loss_clip": 0.01135175, + "auxiliary_loss_mlp": 0.01055371, + "balance_loss_clip": 1.04096031, + "balance_loss_mlp": 1.03413987, + "epoch": 0.10365248759957914, + "flos": 18405404277120.0, + "grad_norm": 2.6935244309471704, + "language_loss": 0.77784199, + "learning_rate": 3.9433654754811745e-06, + "loss": 0.79974747, + "num_input_tokens_seen": 37265595, + "step": 1724, + "time_per_iteration": 2.5592029094696045 + }, + { + "auxiliary_loss_clip": 0.01114206, + "auxiliary_loss_mlp": 0.0105791, + "balance_loss_clip": 1.04315722, + "balance_loss_mlp": 1.03633332, + "epoch": 0.1037126108522471, + "flos": 47553555335040.0, + "grad_norm": 1.8431464060350935, + "language_loss": 0.74874759, + "learning_rate": 3.943273412987676e-06, + "loss": 0.77046871, + "num_input_tokens_seen": 37286660, + "step": 1725, + "time_per_iteration": 2.8741588592529297 + }, + { + "auxiliary_loss_clip": 0.0110303, + "auxiliary_loss_mlp": 0.01058885, + "balance_loss_clip": 1.04094481, + "balance_loss_mlp": 1.03648567, + "epoch": 0.10377273410491508, + "flos": 22816670572800.0, + "grad_norm": 1.9797023909794662, + "language_loss": 0.74995291, + "learning_rate": 3.943181276805054e-06, + "loss": 0.77157199, + "num_input_tokens_seen": 37304915, + "step": 1726, + "time_per_iteration": 2.676145315170288 + }, + { + "auxiliary_loss_clip": 0.01111528, + "auxiliary_loss_mlp": 0.01064559, + "balance_loss_clip": 1.04075122, + "balance_loss_mlp": 1.04138517, + "epoch": 0.10383285735758305, + "flos": 26138694890880.0, + "grad_norm": 2.661667880773427, + "language_loss": 0.7397579, + "learning_rate": 3.9430890669368035e-06, + "loss": 0.76151872, + "num_input_tokens_seen": 37325265, + "step": 1727, + "time_per_iteration": 2.7354698181152344 + }, + { + "auxiliary_loss_clip": 0.01120697, + "auxiliary_loss_mlp": 0.01058838, + "balance_loss_clip": 1.04075074, + "balance_loss_mlp": 1.03690362, + "epoch": 0.10389298061025101, + "flos": 17091791994240.0, + "grad_norm": 2.148740699834808, + "language_loss": 0.84850156, + "learning_rate": 3.942996783386422e-06, + "loss": 0.87029696, + "num_input_tokens_seen": 37341650, + "step": 1728, + "time_per_iteration": 2.6342825889587402 + }, + { + "auxiliary_loss_clip": 0.01125027, + "auxiliary_loss_mlp": 0.01054669, + "balance_loss_clip": 1.04435682, + "balance_loss_mlp": 1.03231704, + "epoch": 0.10395310386291898, + "flos": 20776513893120.0, + "grad_norm": 2.4471239306828347, + "language_loss": 0.70711094, + "learning_rate": 3.942904426157406e-06, + "loss": 0.72890782, + "num_input_tokens_seen": 37360270, + "step": 1729, + "time_per_iteration": 2.684610605239868 + }, + { + "auxiliary_loss_clip": 0.01119684, + "auxiliary_loss_mlp": 0.01060506, + "balance_loss_clip": 1.04071474, + "balance_loss_mlp": 1.0365808, + "epoch": 0.10401322711558696, + "flos": 12820540913280.0, + "grad_norm": 2.34400123340936, + "language_loss": 0.81488973, + "learning_rate": 3.9428119952532605e-06, + "loss": 0.83669162, + "num_input_tokens_seen": 37375225, + "step": 1730, + "time_per_iteration": 2.6035685539245605 + }, + { + "auxiliary_loss_clip": 0.010489, + "auxiliary_loss_mlp": 0.01053084, + "balance_loss_clip": 1.03867221, + "balance_loss_mlp": 1.03207886, + "epoch": 0.10407335036825492, + "flos": 23184683366400.0, + "grad_norm": 1.758380710676101, + "language_loss": 0.75808769, + "learning_rate": 3.942719490677489e-06, + "loss": 0.77910751, + "num_input_tokens_seen": 37395165, + "step": 1731, + "time_per_iteration": 3.178776741027832 + }, + { + "auxiliary_loss_clip": 0.01097668, + "auxiliary_loss_mlp": 0.01051292, + "balance_loss_clip": 1.04243231, + "balance_loss_mlp": 1.03054929, + "epoch": 0.10413347362092289, + "flos": 26104184899200.0, + "grad_norm": 1.747133552507575, + "language_loss": 0.82494128, + "learning_rate": 3.9426269124336e-06, + "loss": 0.8464309, + "num_input_tokens_seen": 37414845, + "step": 1732, + "time_per_iteration": 3.023968458175659 + }, + { + "auxiliary_loss_clip": 0.01116325, + "auxiliary_loss_mlp": 0.01050535, + "balance_loss_clip": 1.04845524, + "balance_loss_mlp": 1.03025723, + "epoch": 0.10419359687359087, + "flos": 12641059630080.0, + "grad_norm": 2.0524778748031998, + "language_loss": 0.83124316, + "learning_rate": 3.942534260525104e-06, + "loss": 0.85291177, + "num_input_tokens_seen": 37432490, + "step": 1733, + "time_per_iteration": 2.7370924949645996 + }, + { + "auxiliary_loss_clip": 0.01127583, + "auxiliary_loss_mlp": 0.0104824, + "balance_loss_clip": 1.04646683, + "balance_loss_mlp": 1.0274384, + "epoch": 0.10425372012625883, + "flos": 12125094716160.0, + "grad_norm": 2.2654475180944713, + "language_loss": 0.76657116, + "learning_rate": 3.942441534955514e-06, + "loss": 0.78832942, + "num_input_tokens_seen": 37449435, + "step": 1734, + "time_per_iteration": 2.6695330142974854 + }, + { + "auxiliary_loss_clip": 0.011121, + "auxiliary_loss_mlp": 0.01048718, + "balance_loss_clip": 1.04490507, + "balance_loss_mlp": 1.02753448, + "epoch": 0.1043138433789268, + "flos": 25337563902720.0, + "grad_norm": 1.9612874172264296, + "language_loss": 0.75052434, + "learning_rate": 3.9423487357283465e-06, + "loss": 0.77213252, + "num_input_tokens_seen": 37469105, + "step": 1735, + "time_per_iteration": 2.7547974586486816 + }, + { + "auxiliary_loss_clip": 0.01133492, + "auxiliary_loss_mlp": 0.01053941, + "balance_loss_clip": 1.04354608, + "balance_loss_mlp": 1.03232861, + "epoch": 0.10437396663159478, + "flos": 29167149352320.0, + "grad_norm": 1.677715218409099, + "language_loss": 0.7856977, + "learning_rate": 3.94225586284712e-06, + "loss": 0.80757207, + "num_input_tokens_seen": 37490540, + "step": 1736, + "time_per_iteration": 2.693942070007324 + }, + { + "auxiliary_loss_clip": 0.01131669, + "auxiliary_loss_mlp": 0.01061705, + "balance_loss_clip": 1.04250252, + "balance_loss_mlp": 1.04034233, + "epoch": 0.10443408988426274, + "flos": 25080946162560.0, + "grad_norm": 1.850602475901, + "language_loss": 0.70891172, + "learning_rate": 3.942162916315356e-06, + "loss": 0.73084545, + "num_input_tokens_seen": 37511905, + "step": 1737, + "time_per_iteration": 2.6840970516204834 + }, + { + "auxiliary_loss_clip": 0.01114524, + "auxiliary_loss_mlp": 0.01057773, + "balance_loss_clip": 1.03780258, + "balance_loss_mlp": 1.0324893, + "epoch": 0.1044942131369307, + "flos": 26759662237440.0, + "grad_norm": 1.7886377156510462, + "language_loss": 0.81712729, + "learning_rate": 3.942069896136581e-06, + "loss": 0.83885026, + "num_input_tokens_seen": 37533635, + "step": 1738, + "time_per_iteration": 4.507697820663452 + }, + { + "auxiliary_loss_clip": 0.0114756, + "auxiliary_loss_mlp": 0.01063482, + "balance_loss_clip": 1.04333127, + "balance_loss_mlp": 1.04018879, + "epoch": 0.10455433638959867, + "flos": 18442571875200.0, + "grad_norm": 2.0425551192950433, + "language_loss": 0.75450635, + "learning_rate": 3.9419768023143196e-06, + "loss": 0.77661675, + "num_input_tokens_seen": 37552035, + "step": 1739, + "time_per_iteration": 2.6901183128356934 + }, + { + "auxiliary_loss_clip": 0.01110684, + "auxiliary_loss_mlp": 0.01057241, + "balance_loss_clip": 1.04113245, + "balance_loss_mlp": 1.03571212, + "epoch": 0.10461445964226665, + "flos": 23218977876480.0, + "grad_norm": 1.6897098208413672, + "language_loss": 0.7750299, + "learning_rate": 3.941883634852104e-06, + "loss": 0.79670912, + "num_input_tokens_seen": 37571540, + "step": 1740, + "time_per_iteration": 2.7996373176574707 + }, + { + "auxiliary_loss_clip": 0.01128722, + "auxiliary_loss_mlp": 0.01061645, + "balance_loss_clip": 1.04918802, + "balance_loss_mlp": 1.03998518, + "epoch": 0.10467458289493461, + "flos": 24345243797760.0, + "grad_norm": 2.4908627027725334, + "language_loss": 0.85910016, + "learning_rate": 3.941790393753467e-06, + "loss": 0.88100386, + "num_input_tokens_seen": 37588265, + "step": 1741, + "time_per_iteration": 4.2537384033203125 + }, + { + "auxiliary_loss_clip": 0.01126065, + "auxiliary_loss_mlp": 0.01059403, + "balance_loss_clip": 1.04357564, + "balance_loss_mlp": 1.03696787, + "epoch": 0.10473470614760258, + "flos": 21287953693440.0, + "grad_norm": 2.484823103560165, + "language_loss": 0.74832726, + "learning_rate": 3.941697079021942e-06, + "loss": 0.77018195, + "num_input_tokens_seen": 37606860, + "step": 1742, + "time_per_iteration": 4.261258840560913 + }, + { + "auxiliary_loss_clip": 0.01100244, + "auxiliary_loss_mlp": 0.01065498, + "balance_loss_clip": 1.04562616, + "balance_loss_mlp": 1.04475605, + "epoch": 0.10479482940027056, + "flos": 21687208341120.0, + "grad_norm": 3.1861694967844727, + "language_loss": 0.87612981, + "learning_rate": 3.94160369066107e-06, + "loss": 0.89778721, + "num_input_tokens_seen": 37625210, + "step": 1743, + "time_per_iteration": 4.488224267959595 + }, + { + "auxiliary_loss_clip": 0.01109672, + "auxiliary_loss_mlp": 0.01055791, + "balance_loss_clip": 1.04295516, + "balance_loss_mlp": 1.03182971, + "epoch": 0.10485495265293852, + "flos": 21573694385280.0, + "grad_norm": 1.934565016510413, + "language_loss": 0.76023483, + "learning_rate": 3.941510228674391e-06, + "loss": 0.78188944, + "num_input_tokens_seen": 37644110, + "step": 1744, + "time_per_iteration": 2.8674161434173584 + }, + { + "auxiliary_loss_clip": 0.01137802, + "auxiliary_loss_mlp": 0.01056847, + "balance_loss_clip": 1.0466944, + "balance_loss_mlp": 1.03666496, + "epoch": 0.10491507590560649, + "flos": 37961923708800.0, + "grad_norm": 2.0795103717305383, + "language_loss": 0.78808796, + "learning_rate": 3.941416693065451e-06, + "loss": 0.81003451, + "num_input_tokens_seen": 37665800, + "step": 1745, + "time_per_iteration": 2.9195799827575684 + }, + { + "auxiliary_loss_clip": 0.01146818, + "auxiliary_loss_mlp": 0.01067695, + "balance_loss_clip": 1.04498529, + "balance_loss_mlp": 1.04595113, + "epoch": 0.10497519915827447, + "flos": 26396282298240.0, + "grad_norm": 2.0489297772569843, + "language_loss": 0.82633543, + "learning_rate": 3.941323083837794e-06, + "loss": 0.84848058, + "num_input_tokens_seen": 37685095, + "step": 1746, + "time_per_iteration": 2.8124284744262695 + }, + { + "auxiliary_loss_clip": 0.01128023, + "auxiliary_loss_mlp": 0.01069838, + "balance_loss_clip": 1.0457778, + "balance_loss_mlp": 1.04797482, + "epoch": 0.10503532241094243, + "flos": 40662190581120.0, + "grad_norm": 1.5248355528416608, + "language_loss": 0.70147693, + "learning_rate": 3.941229400994971e-06, + "loss": 0.72345555, + "num_input_tokens_seen": 37707445, + "step": 1747, + "time_per_iteration": 3.094072103500366 + }, + { + "auxiliary_loss_clip": 0.01128811, + "auxiliary_loss_mlp": 0.01065148, + "balance_loss_clip": 1.04715145, + "balance_loss_mlp": 1.04235482, + "epoch": 0.1050954456636104, + "flos": 29789409588480.0, + "grad_norm": 2.1446400694277044, + "language_loss": 0.84044588, + "learning_rate": 3.941135644540535e-06, + "loss": 0.86238545, + "num_input_tokens_seen": 37728325, + "step": 1748, + "time_per_iteration": 2.88409686088562 + }, + { + "auxiliary_loss_clip": 0.01141126, + "auxiliary_loss_mlp": 0.01058321, + "balance_loss_clip": 1.04123735, + "balance_loss_mlp": 1.03548002, + "epoch": 0.10515556891627838, + "flos": 23948754497280.0, + "grad_norm": 2.073827013286465, + "language_loss": 0.71688831, + "learning_rate": 3.941041814478041e-06, + "loss": 0.73888284, + "num_input_tokens_seen": 37748910, + "step": 1749, + "time_per_iteration": 2.64902925491333 + }, + { + "auxiliary_loss_clip": 0.0112263, + "auxiliary_loss_mlp": 0.01058561, + "balance_loss_clip": 1.04180443, + "balance_loss_mlp": 1.03661418, + "epoch": 0.10521569216894634, + "flos": 18259606972800.0, + "grad_norm": 2.6146025009743257, + "language_loss": 0.8162837, + "learning_rate": 3.940947910811047e-06, + "loss": 0.83809555, + "num_input_tokens_seen": 37765745, + "step": 1750, + "time_per_iteration": 2.6857519149780273 + }, + { + "auxiliary_loss_clip": 0.01117399, + "auxiliary_loss_mlp": 0.0106866, + "balance_loss_clip": 1.04387093, + "balance_loss_mlp": 1.04653454, + "epoch": 0.10527581542161431, + "flos": 15630909949440.0, + "grad_norm": 2.3291007790843774, + "language_loss": 0.91867137, + "learning_rate": 3.940853933543114e-06, + "loss": 0.94053191, + "num_input_tokens_seen": 37780520, + "step": 1751, + "time_per_iteration": 2.691490888595581 + }, + { + "auxiliary_loss_clip": 0.0113311, + "auxiliary_loss_mlp": 0.01052218, + "balance_loss_clip": 1.04411316, + "balance_loss_mlp": 1.03127337, + "epoch": 0.10533593867428227, + "flos": 18296559089280.0, + "grad_norm": 2.2521185490137667, + "language_loss": 0.79459846, + "learning_rate": 3.940759882677805e-06, + "loss": 0.81645179, + "num_input_tokens_seen": 37799515, + "step": 1752, + "time_per_iteration": 2.6468639373779297 + }, + { + "auxiliary_loss_clip": 0.0108373, + "auxiliary_loss_mlp": 0.01056041, + "balance_loss_clip": 1.03996754, + "balance_loss_mlp": 1.03435707, + "epoch": 0.10539606192695025, + "flos": 29023219555200.0, + "grad_norm": 1.6983027685327758, + "language_loss": 0.7595771, + "learning_rate": 3.940665758218686e-06, + "loss": 0.78097481, + "num_input_tokens_seen": 37818695, + "step": 1753, + "time_per_iteration": 2.832050085067749 + }, + { + "auxiliary_loss_clip": 0.01117686, + "auxiliary_loss_mlp": 0.01059205, + "balance_loss_clip": 1.04703665, + "balance_loss_mlp": 1.03508854, + "epoch": 0.10545618517961822, + "flos": 19969313506560.0, + "grad_norm": 2.0684164450495155, + "language_loss": 0.83909214, + "learning_rate": 3.940571560169328e-06, + "loss": 0.86086106, + "num_input_tokens_seen": 37837860, + "step": 1754, + "time_per_iteration": 2.7053356170654297 + }, + { + "auxiliary_loss_clip": 0.01103717, + "auxiliary_loss_mlp": 0.01053896, + "balance_loss_clip": 1.04499316, + "balance_loss_mlp": 1.0293982, + "epoch": 0.10551630843228618, + "flos": 16143427157760.0, + "grad_norm": 2.234399406809884, + "language_loss": 0.68466055, + "learning_rate": 3.940477288533302e-06, + "loss": 0.70623672, + "num_input_tokens_seen": 37856260, + "step": 1755, + "time_per_iteration": 2.710303544998169 + }, + { + "auxiliary_loss_clip": 0.01126313, + "auxiliary_loss_mlp": 0.01063544, + "balance_loss_clip": 1.04397678, + "balance_loss_mlp": 1.04073906, + "epoch": 0.10557643168495416, + "flos": 23440115957760.0, + "grad_norm": 2.1630270318160316, + "language_loss": 0.7656436, + "learning_rate": 3.940382943314182e-06, + "loss": 0.7875421, + "num_input_tokens_seen": 37876960, + "step": 1756, + "time_per_iteration": 2.648263692855835 + }, + { + "auxiliary_loss_clip": 0.01148882, + "auxiliary_loss_mlp": 0.01058735, + "balance_loss_clip": 1.04502439, + "balance_loss_mlp": 1.037063, + "epoch": 0.10563655493762213, + "flos": 21799034357760.0, + "grad_norm": 1.713323671616018, + "language_loss": 0.79945421, + "learning_rate": 3.940288524515547e-06, + "loss": 0.82153046, + "num_input_tokens_seen": 37897070, + "step": 1757, + "time_per_iteration": 2.64129376411438 + }, + { + "auxiliary_loss_clip": 0.01108191, + "auxiliary_loss_mlp": 0.01058725, + "balance_loss_clip": 1.03807592, + "balance_loss_mlp": 1.03674328, + "epoch": 0.10569667819029009, + "flos": 53800863275520.0, + "grad_norm": 2.085127208531435, + "language_loss": 0.78700829, + "learning_rate": 3.940194032140976e-06, + "loss": 0.80867743, + "num_input_tokens_seen": 37923635, + "step": 1758, + "time_per_iteration": 3.051929473876953 + }, + { + "auxiliary_loss_clip": 0.01130978, + "auxiliary_loss_mlp": 0.0105228, + "balance_loss_clip": 1.04480839, + "balance_loss_mlp": 1.0308702, + "epoch": 0.10575680144295807, + "flos": 22925515760640.0, + "grad_norm": 1.8762711736347293, + "language_loss": 0.91854596, + "learning_rate": 3.940099466194054e-06, + "loss": 0.94037855, + "num_input_tokens_seen": 37942650, + "step": 1759, + "time_per_iteration": 2.697845458984375 + }, + { + "auxiliary_loss_clip": 0.01116646, + "auxiliary_loss_mlp": 0.01058076, + "balance_loss_clip": 1.03974187, + "balance_loss_mlp": 1.03412724, + "epoch": 0.10581692469562604, + "flos": 14136667148160.0, + "grad_norm": 2.3910098977150755, + "language_loss": 0.7782203, + "learning_rate": 3.940004826678365e-06, + "loss": 0.79996753, + "num_input_tokens_seen": 37960660, + "step": 1760, + "time_per_iteration": 2.6826910972595215 + }, + { + "auxiliary_loss_clip": 0.01124386, + "auxiliary_loss_mlp": 0.01060485, + "balance_loss_clip": 1.04160213, + "balance_loss_mlp": 1.03684604, + "epoch": 0.105877047948294, + "flos": 25958674903680.0, + "grad_norm": 2.5569353115985147, + "language_loss": 0.89131236, + "learning_rate": 3.939910113597498e-06, + "loss": 0.9131611, + "num_input_tokens_seen": 37978625, + "step": 1761, + "time_per_iteration": 2.6421074867248535 + }, + { + "auxiliary_loss_clip": 0.01065066, + "auxiliary_loss_mlp": 0.00749521, + "balance_loss_clip": 1.03398848, + "balance_loss_mlp": 1.00062943, + "epoch": 0.10593717120096197, + "flos": 30664768032000.0, + "grad_norm": 8.625018954966231, + "language_loss": 0.77844501, + "learning_rate": 3.9398153269550464e-06, + "loss": 0.79659086, + "num_input_tokens_seen": 38000005, + "step": 1762, + "time_per_iteration": 2.8183751106262207 + }, + { + "auxiliary_loss_clip": 0.01025078, + "auxiliary_loss_mlp": 0.01012802, + "balance_loss_clip": 1.01386738, + "balance_loss_mlp": 1.01015592, + "epoch": 0.10599729445362994, + "flos": 66436682497920.0, + "grad_norm": 0.761543534602839, + "language_loss": 0.60565382, + "learning_rate": 3.939720466754602e-06, + "loss": 0.62603265, + "num_input_tokens_seen": 38066165, + "step": 1763, + "time_per_iteration": 3.351468801498413 + }, + { + "auxiliary_loss_clip": 0.011241, + "auxiliary_loss_mlp": 0.01051191, + "balance_loss_clip": 1.04210973, + "balance_loss_mlp": 1.03006685, + "epoch": 0.10605741770629791, + "flos": 23948179879680.0, + "grad_norm": 1.9771731898988347, + "language_loss": 0.80072922, + "learning_rate": 3.939625532999763e-06, + "loss": 0.82248211, + "num_input_tokens_seen": 38086150, + "step": 1764, + "time_per_iteration": 2.763956308364868 + }, + { + "auxiliary_loss_clip": 0.01097002, + "auxiliary_loss_mlp": 0.01059189, + "balance_loss_clip": 1.03714705, + "balance_loss_mlp": 1.03471494, + "epoch": 0.10611754095896588, + "flos": 19387524919680.0, + "grad_norm": 1.8329276393919995, + "language_loss": 0.79804885, + "learning_rate": 3.9395305256941314e-06, + "loss": 0.81961077, + "num_input_tokens_seen": 38104205, + "step": 1765, + "time_per_iteration": 2.8056910037994385 + }, + { + "auxiliary_loss_clip": 0.01131835, + "auxiliary_loss_mlp": 0.01057484, + "balance_loss_clip": 1.04229748, + "balance_loss_mlp": 1.03514433, + "epoch": 0.10617766421163385, + "flos": 22237755073920.0, + "grad_norm": 1.9097222614192018, + "language_loss": 0.76985168, + "learning_rate": 3.939435444841306e-06, + "loss": 0.79174489, + "num_input_tokens_seen": 38122005, + "step": 1766, + "time_per_iteration": 2.712857484817505 + }, + { + "auxiliary_loss_clip": 0.01150418, + "auxiliary_loss_mlp": 0.01066737, + "balance_loss_clip": 1.04804802, + "balance_loss_mlp": 1.04382467, + "epoch": 0.10623778746430182, + "flos": 28404407024640.0, + "grad_norm": 1.6588416537114432, + "language_loss": 0.7756108, + "learning_rate": 3.939340290444895e-06, + "loss": 0.79778236, + "num_input_tokens_seen": 38143365, + "step": 1767, + "time_per_iteration": 2.660721778869629 + }, + { + "auxiliary_loss_clip": 0.00977311, + "auxiliary_loss_mlp": 0.0102016, + "balance_loss_clip": 1.018085, + "balance_loss_mlp": 1.01632118, + "epoch": 0.10629791071696978, + "flos": 64234639221120.0, + "grad_norm": 0.6894899186307074, + "language_loss": 0.57944119, + "learning_rate": 3.939245062508506e-06, + "loss": 0.5994159, + "num_input_tokens_seen": 38210035, + "step": 1768, + "time_per_iteration": 3.7126479148864746 + }, + { + "auxiliary_loss_clip": 0.0110133, + "auxiliary_loss_mlp": 0.01048665, + "balance_loss_clip": 1.03849566, + "balance_loss_mlp": 1.02808952, + "epoch": 0.10635803396963776, + "flos": 22747578762240.0, + "grad_norm": 1.399508731419711, + "language_loss": 0.86315757, + "learning_rate": 3.939149761035749e-06, + "loss": 0.8846575, + "num_input_tokens_seen": 38231230, + "step": 1769, + "time_per_iteration": 2.9013500213623047 + }, + { + "auxiliary_loss_clip": 0.01100438, + "auxiliary_loss_mlp": 0.00749458, + "balance_loss_clip": 1.0386579, + "balance_loss_mlp": 1.00055671, + "epoch": 0.10641815722230573, + "flos": 31395586147200.0, + "grad_norm": 1.913110963238462, + "language_loss": 0.61329639, + "learning_rate": 3.9390543860302395e-06, + "loss": 0.63179541, + "num_input_tokens_seen": 38253890, + "step": 1770, + "time_per_iteration": 2.8653135299682617 + }, + { + "auxiliary_loss_clip": 0.01039236, + "auxiliary_loss_mlp": 0.01022604, + "balance_loss_clip": 1.01538205, + "balance_loss_mlp": 1.01971912, + "epoch": 0.1064782804749737, + "flos": 58552527784320.0, + "grad_norm": 0.8845339836653449, + "language_loss": 0.57035118, + "learning_rate": 3.9389589374955925e-06, + "loss": 0.59096956, + "num_input_tokens_seen": 38304290, + "step": 1771, + "time_per_iteration": 3.071337938308716 + }, + { + "auxiliary_loss_clip": 0.01114866, + "auxiliary_loss_mlp": 0.0107131, + "balance_loss_clip": 1.04539728, + "balance_loss_mlp": 1.04901814, + "epoch": 0.10653840372764166, + "flos": 23987825516160.0, + "grad_norm": 1.571427806933288, + "language_loss": 0.88260961, + "learning_rate": 3.938863415435429e-06, + "loss": 0.9044714, + "num_input_tokens_seen": 38324725, + "step": 1772, + "time_per_iteration": 2.7554433345794678 + }, + { + "auxiliary_loss_clip": 0.01154303, + "auxiliary_loss_mlp": 0.01064597, + "balance_loss_clip": 1.0478555, + "balance_loss_mlp": 1.0411483, + "epoch": 0.10659852698030964, + "flos": 18294655668480.0, + "grad_norm": 3.4427495263661925, + "language_loss": 0.75562578, + "learning_rate": 3.93876781985337e-06, + "loss": 0.77781481, + "num_input_tokens_seen": 38340735, + "step": 1773, + "time_per_iteration": 2.579251289367676 + }, + { + "auxiliary_loss_clip": 0.01093244, + "auxiliary_loss_mlp": 0.01068797, + "balance_loss_clip": 1.04173112, + "balance_loss_mlp": 1.04418039, + "epoch": 0.1066586502329776, + "flos": 32160591031680.0, + "grad_norm": 1.9060904666981697, + "language_loss": 0.82855833, + "learning_rate": 3.938672150753041e-06, + "loss": 0.85017884, + "num_input_tokens_seen": 38361315, + "step": 1774, + "time_per_iteration": 2.8478026390075684 + }, + { + "auxiliary_loss_clip": 0.01124959, + "auxiliary_loss_mlp": 0.007495, + "balance_loss_clip": 1.04488862, + "balance_loss_mlp": 1.00057077, + "epoch": 0.10671877348564557, + "flos": 17785155202560.0, + "grad_norm": 2.522180988593252, + "language_loss": 0.76425123, + "learning_rate": 3.9385764081380704e-06, + "loss": 0.78299582, + "num_input_tokens_seen": 38377425, + "step": 1775, + "time_per_iteration": 2.694572687149048 + }, + { + "auxiliary_loss_clip": 0.01041519, + "auxiliary_loss_mlp": 0.01027536, + "balance_loss_clip": 1.00952363, + "balance_loss_mlp": 1.02503228, + "epoch": 0.10677889673831355, + "flos": 63510177813120.0, + "grad_norm": 0.8516341084248567, + "language_loss": 0.57518804, + "learning_rate": 3.9384805920120876e-06, + "loss": 0.5958786, + "num_input_tokens_seen": 38440275, + "step": 1776, + "time_per_iteration": 3.1773014068603516 + }, + { + "auxiliary_loss_clip": 0.01111368, + "auxiliary_loss_mlp": 0.01068638, + "balance_loss_clip": 1.04089832, + "balance_loss_mlp": 1.04396129, + "epoch": 0.10683901999098151, + "flos": 22017694400640.0, + "grad_norm": 1.920371995867778, + "language_loss": 0.83392143, + "learning_rate": 3.938384702378727e-06, + "loss": 0.85572147, + "num_input_tokens_seen": 38461820, + "step": 1777, + "time_per_iteration": 2.761420726776123 + }, + { + "auxiliary_loss_clip": 0.0107854, + "auxiliary_loss_mlp": 0.00749457, + "balance_loss_clip": 1.04071176, + "balance_loss_mlp": 1.00059009, + "epoch": 0.10689914324364948, + "flos": 25042952551680.0, + "grad_norm": 1.7780240197199921, + "language_loss": 0.87392485, + "learning_rate": 3.938288739241625e-06, + "loss": 0.89220482, + "num_input_tokens_seen": 38482235, + "step": 1778, + "time_per_iteration": 2.8499207496643066 + }, + { + "auxiliary_loss_clip": 0.01123423, + "auxiliary_loss_mlp": 0.00749435, + "balance_loss_clip": 1.06174493, + "balance_loss_mlp": 1.00049806, + "epoch": 0.10695926649631746, + "flos": 16435129507200.0, + "grad_norm": 3.7788906244226625, + "language_loss": 0.8427763, + "learning_rate": 3.938192702604417e-06, + "loss": 0.86150485, + "num_input_tokens_seen": 38500690, + "step": 1779, + "time_per_iteration": 2.7714900970458984 + }, + { + "auxiliary_loss_clip": 0.01103695, + "auxiliary_loss_mlp": 0.0074948, + "balance_loss_clip": 1.03994048, + "balance_loss_mlp": 1.0006144, + "epoch": 0.10701938974898542, + "flos": 16979211792000.0, + "grad_norm": 2.224287787882823, + "language_loss": 0.67086148, + "learning_rate": 3.9380965924707495e-06, + "loss": 0.68939328, + "num_input_tokens_seen": 38518405, + "step": 1780, + "time_per_iteration": 2.627073049545288 + }, + { + "auxiliary_loss_clip": 0.01136053, + "auxiliary_loss_mlp": 0.01057368, + "balance_loss_clip": 1.04765522, + "balance_loss_mlp": 1.03489673, + "epoch": 0.10707951300165339, + "flos": 15888102307200.0, + "grad_norm": 3.0057532593523884, + "language_loss": 0.91911232, + "learning_rate": 3.938000408844265e-06, + "loss": 0.94104648, + "num_input_tokens_seen": 38535060, + "step": 1781, + "time_per_iteration": 2.5996663570404053 + }, + { + "auxiliary_loss_clip": 0.01105611, + "auxiliary_loss_mlp": 0.01057121, + "balance_loss_clip": 1.04607534, + "balance_loss_mlp": 1.03593791, + "epoch": 0.10713963625432135, + "flos": 14247164361600.0, + "grad_norm": 3.0984009826723113, + "language_loss": 0.79483163, + "learning_rate": 3.9379041517286105e-06, + "loss": 0.81645888, + "num_input_tokens_seen": 38552855, + "step": 1782, + "time_per_iteration": 2.680032253265381 + }, + { + "auxiliary_loss_clip": 0.01125486, + "auxiliary_loss_mlp": 0.01060892, + "balance_loss_clip": 1.04550791, + "balance_loss_mlp": 1.03802741, + "epoch": 0.10719975950698933, + "flos": 16756780821120.0, + "grad_norm": 2.3094581160834147, + "language_loss": 0.78816211, + "learning_rate": 3.937807821127436e-06, + "loss": 0.81002581, + "num_input_tokens_seen": 38570075, + "step": 1783, + "time_per_iteration": 2.6147279739379883 + }, + { + "auxiliary_loss_clip": 0.01132292, + "auxiliary_loss_mlp": 0.01061943, + "balance_loss_clip": 1.04726517, + "balance_loss_mlp": 1.03895915, + "epoch": 0.1072598827596573, + "flos": 22710626645760.0, + "grad_norm": 2.03554484988823, + "language_loss": 0.86557382, + "learning_rate": 3.937711417044395e-06, + "loss": 0.88751614, + "num_input_tokens_seen": 38587970, + "step": 1784, + "time_per_iteration": 2.651190996170044 + }, + { + "auxiliary_loss_clip": 0.01123832, + "auxiliary_loss_mlp": 0.01059561, + "balance_loss_clip": 1.04515421, + "balance_loss_mlp": 1.03685129, + "epoch": 0.10732000601232526, + "flos": 23258264376960.0, + "grad_norm": 2.1318652895072074, + "language_loss": 1.00791359, + "learning_rate": 3.937614939483143e-06, + "loss": 1.02974761, + "num_input_tokens_seen": 38605840, + "step": 1785, + "time_per_iteration": 4.233652114868164 + }, + { + "auxiliary_loss_clip": 0.01137747, + "auxiliary_loss_mlp": 0.01058288, + "balance_loss_clip": 1.04803443, + "balance_loss_mlp": 1.03625822, + "epoch": 0.10738012926499324, + "flos": 24207060176640.0, + "grad_norm": 1.9356003625217773, + "language_loss": 0.85008562, + "learning_rate": 3.937518388447339e-06, + "loss": 0.87204599, + "num_input_tokens_seen": 38627070, + "step": 1786, + "time_per_iteration": 2.688565731048584 + }, + { + "auxiliary_loss_clip": 0.01148535, + "auxiliary_loss_mlp": 0.01054703, + "balance_loss_clip": 1.0443182, + "balance_loss_mlp": 1.03041983, + "epoch": 0.1074402525176612, + "flos": 20923065383040.0, + "grad_norm": 1.760495208101736, + "language_loss": 0.7860508, + "learning_rate": 3.937421763940642e-06, + "loss": 0.80808318, + "num_input_tokens_seen": 38645840, + "step": 1787, + "time_per_iteration": 2.61877179145813 + }, + { + "auxiliary_loss_clip": 0.01140969, + "auxiliary_loss_mlp": 0.01048858, + "balance_loss_clip": 1.0456574, + "balance_loss_mlp": 1.02612436, + "epoch": 0.10750037577032917, + "flos": 16946928443520.0, + "grad_norm": 3.653402189328164, + "language_loss": 0.82507414, + "learning_rate": 3.937325065966719e-06, + "loss": 0.84697241, + "num_input_tokens_seen": 38664770, + "step": 1788, + "time_per_iteration": 5.864710092544556 + }, + { + "auxiliary_loss_clip": 0.01145998, + "auxiliary_loss_mlp": 0.01059736, + "balance_loss_clip": 1.04507875, + "balance_loss_mlp": 1.03818285, + "epoch": 0.10756049902299715, + "flos": 20266546550400.0, + "grad_norm": 2.210926738358408, + "language_loss": 0.78193003, + "learning_rate": 3.9372282945292335e-06, + "loss": 0.80398738, + "num_input_tokens_seen": 38683865, + "step": 1789, + "time_per_iteration": 4.3253333568573 + }, + { + "auxiliary_loss_clip": 0.01148646, + "auxiliary_loss_mlp": 0.01065454, + "balance_loss_clip": 1.04537702, + "balance_loss_mlp": 1.03987193, + "epoch": 0.10762062227566511, + "flos": 23586523793280.0, + "grad_norm": 2.5621393016873855, + "language_loss": 0.7448073, + "learning_rate": 3.937131449631859e-06, + "loss": 0.76694834, + "num_input_tokens_seen": 38702485, + "step": 1790, + "time_per_iteration": 2.6285345554351807 + }, + { + "auxiliary_loss_clip": 0.01141894, + "auxiliary_loss_mlp": 0.00749484, + "balance_loss_clip": 1.04754519, + "balance_loss_mlp": 1.00067091, + "epoch": 0.10768074552833308, + "flos": 24310626065280.0, + "grad_norm": 2.139627515612358, + "language_loss": 0.78224587, + "learning_rate": 3.9370345312782645e-06, + "loss": 0.80115968, + "num_input_tokens_seen": 38722475, + "step": 1791, + "time_per_iteration": 2.781200885772705 + }, + { + "auxiliary_loss_clip": 0.0110033, + "auxiliary_loss_mlp": 0.01054257, + "balance_loss_clip": 1.04156089, + "balance_loss_mlp": 1.03154731, + "epoch": 0.10774086878100106, + "flos": 25299965341440.0, + "grad_norm": 1.6921050452115949, + "language_loss": 0.7082175, + "learning_rate": 3.936937539472126e-06, + "loss": 0.72976339, + "num_input_tokens_seen": 38743285, + "step": 1792, + "time_per_iteration": 2.984818696975708 + }, + { + "auxiliary_loss_clip": 0.01119048, + "auxiliary_loss_mlp": 0.01051606, + "balance_loss_clip": 1.0442121, + "balance_loss_mlp": 1.02774012, + "epoch": 0.10780099203366902, + "flos": 22054035985920.0, + "grad_norm": 1.9219155066927462, + "language_loss": 0.76348686, + "learning_rate": 3.9368404742171236e-06, + "loss": 0.78519332, + "num_input_tokens_seen": 38763035, + "step": 1793, + "time_per_iteration": 2.712961435317993 + }, + { + "auxiliary_loss_clip": 0.01079714, + "auxiliary_loss_mlp": 0.01064493, + "balance_loss_clip": 1.03961504, + "balance_loss_mlp": 1.04119945, + "epoch": 0.10786111528633699, + "flos": 22747471021440.0, + "grad_norm": 2.1340617167371043, + "language_loss": 0.8500132, + "learning_rate": 3.936743335516936e-06, + "loss": 0.87145531, + "num_input_tokens_seen": 38784900, + "step": 1794, + "time_per_iteration": 2.7450973987579346 + }, + { + "auxiliary_loss_clip": 0.01078844, + "auxiliary_loss_mlp": 0.01050899, + "balance_loss_clip": 1.03708458, + "balance_loss_mlp": 1.02757025, + "epoch": 0.10792123853900495, + "flos": 20851064570880.0, + "grad_norm": 3.928077179341444, + "language_loss": 0.74529445, + "learning_rate": 3.936646123375246e-06, + "loss": 0.76659191, + "num_input_tokens_seen": 38804695, + "step": 1795, + "time_per_iteration": 2.8480348587036133 + }, + { + "auxiliary_loss_clip": 0.01082252, + "auxiliary_loss_mlp": 0.01060779, + "balance_loss_clip": 1.03557467, + "balance_loss_mlp": 1.03752172, + "epoch": 0.10798136179167293, + "flos": 17748705876480.0, + "grad_norm": 3.1597501533148584, + "language_loss": 0.81363058, + "learning_rate": 3.936548837795741e-06, + "loss": 0.83506095, + "num_input_tokens_seen": 38822395, + "step": 1796, + "time_per_iteration": 2.8758227825164795 + }, + { + "auxiliary_loss_clip": 0.01100034, + "auxiliary_loss_mlp": 0.01081379, + "balance_loss_clip": 1.04027486, + "balance_loss_mlp": 1.05619001, + "epoch": 0.1080414850443409, + "flos": 13589639948160.0, + "grad_norm": 2.441031829503232, + "language_loss": 0.74237752, + "learning_rate": 3.936451478782111e-06, + "loss": 0.76419163, + "num_input_tokens_seen": 38839865, + "step": 1797, + "time_per_iteration": 2.770054340362549 + }, + { + "auxiliary_loss_clip": 0.01121946, + "auxiliary_loss_mlp": 0.01050515, + "balance_loss_clip": 1.04146302, + "balance_loss_mlp": 1.02986777, + "epoch": 0.10810160829700886, + "flos": 16253421580800.0, + "grad_norm": 2.119277942364516, + "language_loss": 0.81873703, + "learning_rate": 3.936354046338046e-06, + "loss": 0.84046167, + "num_input_tokens_seen": 38857300, + "step": 1798, + "time_per_iteration": 2.717703342437744 + }, + { + "auxiliary_loss_clip": 0.01107975, + "auxiliary_loss_mlp": 0.01057356, + "balance_loss_clip": 1.03981137, + "balance_loss_mlp": 1.03418183, + "epoch": 0.10816173154967684, + "flos": 15158002464000.0, + "grad_norm": 2.2816138890499538, + "language_loss": 0.85353458, + "learning_rate": 3.936256540467242e-06, + "loss": 0.87518793, + "num_input_tokens_seen": 38874960, + "step": 1799, + "time_per_iteration": 2.8775973320007324 + }, + { + "auxiliary_loss_clip": 0.01118712, + "auxiliary_loss_mlp": 0.01061763, + "balance_loss_clip": 1.04237759, + "balance_loss_mlp": 1.03968525, + "epoch": 0.10822185480234481, + "flos": 17785334770560.0, + "grad_norm": 1.8502272620030327, + "language_loss": 0.77224928, + "learning_rate": 3.9361589611733955e-06, + "loss": 0.79405403, + "num_input_tokens_seen": 38893610, + "step": 1800, + "time_per_iteration": 2.7556188106536865 + }, + { + "auxiliary_loss_clip": 0.01141787, + "auxiliary_loss_mlp": 0.01049533, + "balance_loss_clip": 1.04295254, + "balance_loss_mlp": 1.02863538, + "epoch": 0.10828197805501277, + "flos": 25556654908800.0, + "grad_norm": 1.5103285511779352, + "language_loss": 0.72769678, + "learning_rate": 3.9360613084602075e-06, + "loss": 0.74961001, + "num_input_tokens_seen": 38913485, + "step": 1801, + "time_per_iteration": 2.6044065952301025 + }, + { + "auxiliary_loss_clip": 0.01152183, + "auxiliary_loss_mlp": 0.01052117, + "balance_loss_clip": 1.04534101, + "balance_loss_mlp": 1.0308497, + "epoch": 0.10834210130768075, + "flos": 28984435845120.0, + "grad_norm": 1.9660086233865623, + "language_loss": 0.66353929, + "learning_rate": 3.935963582331381e-06, + "loss": 0.68558228, + "num_input_tokens_seen": 38935650, + "step": 1802, + "time_per_iteration": 2.8172130584716797 + }, + { + "auxiliary_loss_clip": 0.01117825, + "auxiliary_loss_mlp": 0.01060952, + "balance_loss_clip": 1.04053783, + "balance_loss_mlp": 1.03843355, + "epoch": 0.10840222456034872, + "flos": 20264212166400.0, + "grad_norm": 1.7367534674088672, + "language_loss": 0.81597757, + "learning_rate": 3.935865782790621e-06, + "loss": 0.83776534, + "num_input_tokens_seen": 38954130, + "step": 1803, + "time_per_iteration": 2.7651188373565674 + }, + { + "auxiliary_loss_clip": 0.01127488, + "auxiliary_loss_mlp": 0.01056602, + "balance_loss_clip": 1.04123902, + "balance_loss_mlp": 1.03458405, + "epoch": 0.10846234781301668, + "flos": 19863054097920.0, + "grad_norm": 1.5589536153070713, + "language_loss": 0.91125858, + "learning_rate": 3.9357679098416365e-06, + "loss": 0.93309939, + "num_input_tokens_seen": 38972905, + "step": 1804, + "time_per_iteration": 2.699410915374756 + }, + { + "auxiliary_loss_clip": 0.01102913, + "auxiliary_loss_mlp": 0.01055228, + "balance_loss_clip": 1.04300773, + "balance_loss_mlp": 1.03202999, + "epoch": 0.10852247106568465, + "flos": 26469037296000.0, + "grad_norm": 2.16869686320314, + "language_loss": 0.76153237, + "learning_rate": 3.935669963488139e-06, + "loss": 0.78311378, + "num_input_tokens_seen": 38993255, + "step": 1805, + "time_per_iteration": 2.7459523677825928 + }, + { + "auxiliary_loss_clip": 0.01120452, + "auxiliary_loss_mlp": 0.01055098, + "balance_loss_clip": 1.04574561, + "balance_loss_mlp": 1.03468895, + "epoch": 0.10858259431835263, + "flos": 30081506987520.0, + "grad_norm": 1.7549633768506159, + "language_loss": 0.86351079, + "learning_rate": 3.935571943733843e-06, + "loss": 0.8852663, + "num_input_tokens_seen": 39012610, + "step": 1806, + "time_per_iteration": 2.7652337551116943 + }, + { + "auxiliary_loss_clip": 0.01133647, + "auxiliary_loss_mlp": 0.00749615, + "balance_loss_clip": 1.04177666, + "balance_loss_mlp": 1.00088906, + "epoch": 0.10864271757102059, + "flos": 19063180085760.0, + "grad_norm": 2.1855574340152804, + "language_loss": 0.80776888, + "learning_rate": 3.9354738505824635e-06, + "loss": 0.82660145, + "num_input_tokens_seen": 39030120, + "step": 1807, + "time_per_iteration": 2.680001735687256 + }, + { + "auxiliary_loss_clip": 0.01103638, + "auxiliary_loss_mlp": 0.01055298, + "balance_loss_clip": 1.03980124, + "balance_loss_mlp": 1.03589082, + "epoch": 0.10870284082368856, + "flos": 24715052271360.0, + "grad_norm": 1.8461798174670379, + "language_loss": 0.78517091, + "learning_rate": 3.9353756840377225e-06, + "loss": 0.80676031, + "num_input_tokens_seen": 39049875, + "step": 1808, + "time_per_iteration": 2.7952353954315186 + }, + { + "auxiliary_loss_clip": 0.01124461, + "auxiliary_loss_mlp": 0.01056143, + "balance_loss_clip": 1.04557848, + "balance_loss_mlp": 1.03467393, + "epoch": 0.10876296407635654, + "flos": 20627663932800.0, + "grad_norm": 1.6376609057943885, + "language_loss": 0.79125494, + "learning_rate": 3.935277444103342e-06, + "loss": 0.813061, + "num_input_tokens_seen": 39068935, + "step": 1809, + "time_per_iteration": 2.7185006141662598 + }, + { + "auxiliary_loss_clip": 0.01144139, + "auxiliary_loss_mlp": 0.01060363, + "balance_loss_clip": 1.04328096, + "balance_loss_mlp": 1.03948951, + "epoch": 0.1088230873290245, + "flos": 21579835610880.0, + "grad_norm": 2.9985464281864544, + "language_loss": 0.84899127, + "learning_rate": 3.935179130783046e-06, + "loss": 0.87103629, + "num_input_tokens_seen": 39087370, + "step": 1810, + "time_per_iteration": 2.6604561805725098 + }, + { + "auxiliary_loss_clip": 0.01081312, + "auxiliary_loss_mlp": 0.01057614, + "balance_loss_clip": 1.03443813, + "balance_loss_mlp": 1.03401077, + "epoch": 0.10888321058169247, + "flos": 26469037296000.0, + "grad_norm": 1.6433843290123598, + "language_loss": 0.63560486, + "learning_rate": 3.935080744080564e-06, + "loss": 0.6569941, + "num_input_tokens_seen": 39106635, + "step": 1811, + "time_per_iteration": 2.7399933338165283 + }, + { + "auxiliary_loss_clip": 0.01113736, + "auxiliary_loss_mlp": 0.01052411, + "balance_loss_clip": 1.03858829, + "balance_loss_mlp": 1.03045273, + "epoch": 0.10894333383436045, + "flos": 25848608653440.0, + "grad_norm": 2.194075339455121, + "language_loss": 0.73974323, + "learning_rate": 3.934982283999626e-06, + "loss": 0.76140475, + "num_input_tokens_seen": 39126335, + "step": 1812, + "time_per_iteration": 2.6953816413879395 + }, + { + "auxiliary_loss_clip": 0.01103633, + "auxiliary_loss_mlp": 0.01054051, + "balance_loss_clip": 1.03818583, + "balance_loss_mlp": 1.03268898, + "epoch": 0.10900345708702841, + "flos": 19537093152000.0, + "grad_norm": 1.711537664885882, + "language_loss": 0.72837198, + "learning_rate": 3.934883750543966e-06, + "loss": 0.74994874, + "num_input_tokens_seen": 39144820, + "step": 1813, + "time_per_iteration": 2.697995662689209 + }, + { + "auxiliary_loss_clip": 0.01097871, + "auxiliary_loss_mlp": 0.0105069, + "balance_loss_clip": 1.03880417, + "balance_loss_mlp": 1.02967381, + "epoch": 0.10906358033969638, + "flos": 23623296341760.0, + "grad_norm": 1.6832664567570226, + "language_loss": 0.82818526, + "learning_rate": 3.93478514371732e-06, + "loss": 0.84967089, + "num_input_tokens_seen": 39165945, + "step": 1814, + "time_per_iteration": 2.695582628250122 + }, + { + "auxiliary_loss_clip": 0.01108603, + "auxiliary_loss_mlp": 0.01053845, + "balance_loss_clip": 1.04040992, + "balance_loss_mlp": 1.03303123, + "epoch": 0.10912370359236434, + "flos": 21214731818880.0, + "grad_norm": 1.9755135135696313, + "language_loss": 0.84305286, + "learning_rate": 3.934686463523429e-06, + "loss": 0.86467737, + "num_input_tokens_seen": 39183520, + "step": 1815, + "time_per_iteration": 2.661320924758911 + }, + { + "auxiliary_loss_clip": 0.01112225, + "auxiliary_loss_mlp": 0.01052793, + "balance_loss_clip": 1.04289436, + "balance_loss_mlp": 1.03021431, + "epoch": 0.10918382684503232, + "flos": 13553190622080.0, + "grad_norm": 2.267151136529371, + "language_loss": 0.71996522, + "learning_rate": 3.9345877099660315e-06, + "loss": 0.74161541, + "num_input_tokens_seen": 39201190, + "step": 1816, + "time_per_iteration": 2.8815691471099854 + }, + { + "auxiliary_loss_clip": 0.01126832, + "auxiliary_loss_mlp": 0.01062741, + "balance_loss_clip": 1.04003525, + "balance_loss_mlp": 1.04037714, + "epoch": 0.10924395009770028, + "flos": 27964321591680.0, + "grad_norm": 2.028963836615618, + "language_loss": 0.73151159, + "learning_rate": 3.9344888830488744e-06, + "loss": 0.7534073, + "num_input_tokens_seen": 39221210, + "step": 1817, + "time_per_iteration": 3.031245708465576 + }, + { + "auxiliary_loss_clip": 0.01087538, + "auxiliary_loss_mlp": 0.01056148, + "balance_loss_clip": 1.04083753, + "balance_loss_mlp": 1.03448784, + "epoch": 0.10930407335036825, + "flos": 25593750679680.0, + "grad_norm": 1.5119202022515834, + "language_loss": 0.67211294, + "learning_rate": 3.934389982775706e-06, + "loss": 0.69354987, + "num_input_tokens_seen": 39242025, + "step": 1818, + "time_per_iteration": 2.913984537124634 + }, + { + "auxiliary_loss_clip": 0.01128292, + "auxiliary_loss_mlp": 0.01059547, + "balance_loss_clip": 1.04775047, + "balance_loss_mlp": 1.03749323, + "epoch": 0.10936419660303623, + "flos": 18406194376320.0, + "grad_norm": 2.5791612466617817, + "language_loss": 0.72509563, + "learning_rate": 3.934291009150275e-06, + "loss": 0.74697399, + "num_input_tokens_seen": 39259870, + "step": 1819, + "time_per_iteration": 3.258638381958008 + }, + { + "auxiliary_loss_clip": 0.01106148, + "auxiliary_loss_mlp": 0.00749566, + "balance_loss_clip": 1.04091263, + "balance_loss_mlp": 1.00107431, + "epoch": 0.1094243198557042, + "flos": 23840052963840.0, + "grad_norm": 2.8157965334086414, + "language_loss": 0.74273896, + "learning_rate": 3.934191962176335e-06, + "loss": 0.76129609, + "num_input_tokens_seen": 39278500, + "step": 1820, + "time_per_iteration": 2.7758800983428955 + }, + { + "auxiliary_loss_clip": 0.01145257, + "auxiliary_loss_mlp": 0.01057615, + "balance_loss_clip": 1.04473996, + "balance_loss_mlp": 1.03398788, + "epoch": 0.10948444310837216, + "flos": 14643940970880.0, + "grad_norm": 2.067764674945617, + "language_loss": 0.82746828, + "learning_rate": 3.934092841857642e-06, + "loss": 0.84949696, + "num_input_tokens_seen": 39294800, + "step": 1821, + "time_per_iteration": 2.7289645671844482 + }, + { + "auxiliary_loss_clip": 0.01120561, + "auxiliary_loss_mlp": 0.01052605, + "balance_loss_clip": 1.04363704, + "balance_loss_mlp": 1.03102827, + "epoch": 0.10954456636104014, + "flos": 27818811596160.0, + "grad_norm": 2.161530425504123, + "language_loss": 0.76084197, + "learning_rate": 3.933993648197955e-06, + "loss": 0.78257358, + "num_input_tokens_seen": 39314625, + "step": 1822, + "time_per_iteration": 2.756375789642334 + }, + { + "auxiliary_loss_clip": 0.01103924, + "auxiliary_loss_mlp": 0.01050184, + "balance_loss_clip": 1.03760016, + "balance_loss_mlp": 1.02956092, + "epoch": 0.1096046896137081, + "flos": 33620934372480.0, + "grad_norm": 1.7352904919155057, + "language_loss": 0.79385293, + "learning_rate": 3.933894381201034e-06, + "loss": 0.81539398, + "num_input_tokens_seen": 39336465, + "step": 1823, + "time_per_iteration": 2.7714109420776367 + }, + { + "auxiliary_loss_clip": 0.01113199, + "auxiliary_loss_mlp": 0.01046815, + "balance_loss_clip": 1.04198122, + "balance_loss_mlp": 1.02578616, + "epoch": 0.10966481286637607, + "flos": 26980010219520.0, + "grad_norm": 1.568386200048272, + "language_loss": 0.79338956, + "learning_rate": 3.933795040870645e-06, + "loss": 0.81498969, + "num_input_tokens_seen": 39357930, + "step": 1824, + "time_per_iteration": 2.749067783355713 + }, + { + "auxiliary_loss_clip": 0.01105924, + "auxiliary_loss_mlp": 0.01056322, + "balance_loss_clip": 1.03924203, + "balance_loss_mlp": 1.03554392, + "epoch": 0.10972493611904403, + "flos": 23036551678080.0, + "grad_norm": 1.7371892162606826, + "language_loss": 0.87901616, + "learning_rate": 3.933695627210554e-06, + "loss": 0.9006387, + "num_input_tokens_seen": 39376380, + "step": 1825, + "time_per_iteration": 2.756788492202759 + }, + { + "auxiliary_loss_clip": 0.01096376, + "auxiliary_loss_mlp": 0.01047381, + "balance_loss_clip": 1.03780627, + "balance_loss_mlp": 1.02645946, + "epoch": 0.10978505937171201, + "flos": 38104632443520.0, + "grad_norm": 2.241523405667347, + "language_loss": 0.76076877, + "learning_rate": 3.933596140224532e-06, + "loss": 0.78220636, + "num_input_tokens_seen": 39399935, + "step": 1826, + "time_per_iteration": 2.8189539909362793 + }, + { + "auxiliary_loss_clip": 0.01037209, + "auxiliary_loss_mlp": 0.01052452, + "balance_loss_clip": 1.0143553, + "balance_loss_mlp": 1.04988861, + "epoch": 0.10984518262437998, + "flos": 59849694616320.0, + "grad_norm": 0.8558151118580659, + "language_loss": 0.54984522, + "learning_rate": 3.93349657991635e-06, + "loss": 0.57074177, + "num_input_tokens_seen": 39460685, + "step": 1827, + "time_per_iteration": 3.2083351612091064 + }, + { + "auxiliary_loss_clip": 0.01039939, + "auxiliary_loss_mlp": 0.01017367, + "balance_loss_clip": 1.00780559, + "balance_loss_mlp": 1.01466048, + "epoch": 0.10990530587704794, + "flos": 66719837410560.0, + "grad_norm": 0.7387257635060599, + "language_loss": 0.5539943, + "learning_rate": 3.933396946289784e-06, + "loss": 0.57456738, + "num_input_tokens_seen": 39524765, + "step": 1828, + "time_per_iteration": 3.179222583770752 + }, + { + "auxiliary_loss_clip": 0.01123638, + "auxiliary_loss_mlp": 0.01056856, + "balance_loss_clip": 1.04324484, + "balance_loss_mlp": 1.03455162, + "epoch": 0.10996542912971592, + "flos": 25447199189760.0, + "grad_norm": 2.322990933956786, + "language_loss": 0.84449399, + "learning_rate": 3.933297239348612e-06, + "loss": 0.86629891, + "num_input_tokens_seen": 39543640, + "step": 1829, + "time_per_iteration": 2.643672227859497 + }, + { + "auxiliary_loss_clip": 0.01103701, + "auxiliary_loss_mlp": 0.01064326, + "balance_loss_clip": 1.04784989, + "balance_loss_mlp": 1.04122376, + "epoch": 0.11002555238238389, + "flos": 44018186186880.0, + "grad_norm": 2.147991257251499, + "language_loss": 0.89043903, + "learning_rate": 3.933197459096614e-06, + "loss": 0.91211933, + "num_input_tokens_seen": 39567525, + "step": 1830, + "time_per_iteration": 3.0030064582824707 + }, + { + "auxiliary_loss_clip": 0.01024098, + "auxiliary_loss_mlp": 0.01018633, + "balance_loss_clip": 1.012779, + "balance_loss_mlp": 1.01591516, + "epoch": 0.11008567563505185, + "flos": 54065133590400.0, + "grad_norm": 0.6955248336108595, + "language_loss": 0.55525279, + "learning_rate": 3.9330976055375756e-06, + "loss": 0.57568002, + "num_input_tokens_seen": 39628470, + "step": 1831, + "time_per_iteration": 3.253530502319336 + }, + { + "auxiliary_loss_clip": 0.01128468, + "auxiliary_loss_mlp": 0.01072174, + "balance_loss_clip": 1.04858851, + "balance_loss_mlp": 1.04890442, + "epoch": 0.11014579888771983, + "flos": 24243150366720.0, + "grad_norm": 2.34123693287463, + "language_loss": 0.91003573, + "learning_rate": 3.932997678675282e-06, + "loss": 0.93204212, + "num_input_tokens_seen": 39646670, + "step": 1832, + "time_per_iteration": 4.299706220626831 + }, + { + "auxiliary_loss_clip": 0.0103289, + "auxiliary_loss_mlp": 0.01028876, + "balance_loss_clip": 1.01124454, + "balance_loss_mlp": 1.0259794, + "epoch": 0.1102059221403878, + "flos": 57743965658880.0, + "grad_norm": 0.7341650692187619, + "language_loss": 0.59918237, + "learning_rate": 3.932897678513523e-06, + "loss": 0.61980009, + "num_input_tokens_seen": 39712915, + "step": 1833, + "time_per_iteration": 3.162855863571167 + }, + { + "auxiliary_loss_clip": 0.01133817, + "auxiliary_loss_mlp": 0.01054876, + "balance_loss_clip": 1.04198551, + "balance_loss_mlp": 1.03275132, + "epoch": 0.11026604539305576, + "flos": 16795923667200.0, + "grad_norm": 2.9790878309960287, + "language_loss": 0.80715358, + "learning_rate": 3.93279760505609e-06, + "loss": 0.82904059, + "num_input_tokens_seen": 39730650, + "step": 1834, + "time_per_iteration": 2.625675678253174 + }, + { + "auxiliary_loss_clip": 0.01102831, + "auxiliary_loss_mlp": 0.01056194, + "balance_loss_clip": 1.0416683, + "balance_loss_mlp": 1.03251958, + "epoch": 0.11032616864572373, + "flos": 23988076911360.0, + "grad_norm": 2.449463521770174, + "language_loss": 0.90839446, + "learning_rate": 3.932697458306779e-06, + "loss": 0.92998475, + "num_input_tokens_seen": 39751065, + "step": 1835, + "time_per_iteration": 2.766592502593994 + }, + { + "auxiliary_loss_clip": 0.01104068, + "auxiliary_loss_mlp": 0.01059471, + "balance_loss_clip": 1.04625988, + "balance_loss_mlp": 1.03537905, + "epoch": 0.1103862918983917, + "flos": 19683141851520.0, + "grad_norm": 2.2672459815320574, + "language_loss": 0.63617986, + "learning_rate": 3.932597238269386e-06, + "loss": 0.65781522, + "num_input_tokens_seen": 39769245, + "step": 1836, + "time_per_iteration": 7.419116020202637 + }, + { + "auxiliary_loss_clip": 0.01101331, + "auxiliary_loss_mlp": 0.01059628, + "balance_loss_clip": 1.03864765, + "balance_loss_mlp": 1.03821802, + "epoch": 0.11044641515105967, + "flos": 32160878340480.0, + "grad_norm": 2.274590262872657, + "language_loss": 0.72736442, + "learning_rate": 3.932496944947711e-06, + "loss": 0.74897397, + "num_input_tokens_seen": 39790830, + "step": 1837, + "time_per_iteration": 2.9370784759521484 + }, + { + "auxiliary_loss_clip": 0.01130682, + "auxiliary_loss_mlp": 0.01057745, + "balance_loss_clip": 1.04482114, + "balance_loss_mlp": 1.03602469, + "epoch": 0.11050653840372764, + "flos": 16689233295360.0, + "grad_norm": 1.7931228825425016, + "language_loss": 0.78152782, + "learning_rate": 3.93239657834556e-06, + "loss": 0.80341208, + "num_input_tokens_seen": 39809475, + "step": 1838, + "time_per_iteration": 2.7263224124908447 + }, + { + "auxiliary_loss_clip": 0.01120992, + "auxiliary_loss_mlp": 0.01064959, + "balance_loss_clip": 1.04323435, + "balance_loss_mlp": 1.04320359, + "epoch": 0.11056666165639562, + "flos": 21208877902080.0, + "grad_norm": 1.9692551304018373, + "language_loss": 0.71639073, + "learning_rate": 3.932296138466736e-06, + "loss": 0.7382502, + "num_input_tokens_seen": 39826355, + "step": 1839, + "time_per_iteration": 2.647721290588379 + }, + { + "auxiliary_loss_clip": 0.01153457, + "auxiliary_loss_mlp": 0.00749595, + "balance_loss_clip": 1.04980254, + "balance_loss_mlp": 1.00112379, + "epoch": 0.11062678490906358, + "flos": 19165488998400.0, + "grad_norm": 2.1011520893241924, + "language_loss": 0.78175354, + "learning_rate": 3.93219562531505e-06, + "loss": 0.80078405, + "num_input_tokens_seen": 39845335, + "step": 1840, + "time_per_iteration": 2.656963586807251 + }, + { + "auxiliary_loss_clip": 0.01118115, + "auxiliary_loss_mlp": 0.01047534, + "balance_loss_clip": 1.04008126, + "balance_loss_mlp": 1.02577806, + "epoch": 0.11068690816173155, + "flos": 24895287740160.0, + "grad_norm": 1.8896639976589007, + "language_loss": 0.88091558, + "learning_rate": 3.932095038894311e-06, + "loss": 0.90257215, + "num_input_tokens_seen": 39865065, + "step": 1841, + "time_per_iteration": 2.677173137664795 + }, + { + "auxiliary_loss_clip": 0.01091733, + "auxiliary_loss_mlp": 0.01058326, + "balance_loss_clip": 1.03914523, + "balance_loss_mlp": 1.03688073, + "epoch": 0.11074703141439952, + "flos": 16472368932480.0, + "grad_norm": 1.911957114216583, + "language_loss": 0.90696406, + "learning_rate": 3.931994379208334e-06, + "loss": 0.92846459, + "num_input_tokens_seen": 39882780, + "step": 1842, + "time_per_iteration": 2.655001401901245 + }, + { + "auxiliary_loss_clip": 0.01119649, + "auxiliary_loss_mlp": 0.01060639, + "balance_loss_clip": 1.04026961, + "balance_loss_mlp": 1.03974128, + "epoch": 0.11080715466706749, + "flos": 19172420323200.0, + "grad_norm": 2.062597130045207, + "language_loss": 0.86115336, + "learning_rate": 3.931893646260937e-06, + "loss": 0.88295627, + "num_input_tokens_seen": 39900295, + "step": 1843, + "time_per_iteration": 2.585875988006592 + }, + { + "auxiliary_loss_clip": 0.01087845, + "auxiliary_loss_mlp": 0.0074971, + "balance_loss_clip": 1.04202557, + "balance_loss_mlp": 1.00116777, + "epoch": 0.11086727791973545, + "flos": 27704687109120.0, + "grad_norm": 1.5153921714272507, + "language_loss": 0.7440266, + "learning_rate": 3.931792840055941e-06, + "loss": 0.76240212, + "num_input_tokens_seen": 39922075, + "step": 1844, + "time_per_iteration": 2.786679744720459 + }, + { + "auxiliary_loss_clip": 0.01145697, + "auxiliary_loss_mlp": 0.01053964, + "balance_loss_clip": 1.04475713, + "balance_loss_mlp": 1.03068256, + "epoch": 0.11092740117240343, + "flos": 18514967736960.0, + "grad_norm": 1.9882379882193502, + "language_loss": 0.75796425, + "learning_rate": 3.931691960597165e-06, + "loss": 0.77996087, + "num_input_tokens_seen": 39940115, + "step": 1845, + "time_per_iteration": 2.5771656036376953 + }, + { + "auxiliary_loss_clip": 0.01118393, + "auxiliary_loss_mlp": 0.01054621, + "balance_loss_clip": 1.04158592, + "balance_loss_mlp": 1.03414047, + "epoch": 0.1109875244250714, + "flos": 20522446018560.0, + "grad_norm": 1.5340096865321402, + "language_loss": 0.75913978, + "learning_rate": 3.9315910078884375e-06, + "loss": 0.78086984, + "num_input_tokens_seen": 39959920, + "step": 1846, + "time_per_iteration": 2.5851125717163086 + }, + { + "auxiliary_loss_clip": 0.01140499, + "auxiliary_loss_mlp": 0.0105352, + "balance_loss_clip": 1.04703879, + "balance_loss_mlp": 1.03228843, + "epoch": 0.11104764767773936, + "flos": 14098601710080.0, + "grad_norm": 2.6786399830107297, + "language_loss": 0.86047482, + "learning_rate": 3.931489981933584e-06, + "loss": 0.882415, + "num_input_tokens_seen": 39974755, + "step": 1847, + "time_per_iteration": 2.5704455375671387 + }, + { + "auxiliary_loss_clip": 0.01143917, + "auxiliary_loss_mlp": 0.01052245, + "balance_loss_clip": 1.04257298, + "balance_loss_mlp": 1.02982187, + "epoch": 0.11110777093040733, + "flos": 20594518657920.0, + "grad_norm": 1.9970895663180477, + "language_loss": 0.77209646, + "learning_rate": 3.931388882736438e-06, + "loss": 0.79405808, + "num_input_tokens_seen": 39993355, + "step": 1848, + "time_per_iteration": 2.565737009048462 + }, + { + "auxiliary_loss_clip": 0.01140827, + "auxiliary_loss_mlp": 0.01054592, + "balance_loss_clip": 1.05239487, + "balance_loss_mlp": 1.03398108, + "epoch": 0.11116789418307531, + "flos": 21870065502720.0, + "grad_norm": 1.8034758788234657, + "language_loss": 0.77452469, + "learning_rate": 3.931287710300832e-06, + "loss": 0.79647887, + "num_input_tokens_seen": 40012410, + "step": 1849, + "time_per_iteration": 2.623406410217285 + }, + { + "auxiliary_loss_clip": 0.01105945, + "auxiliary_loss_mlp": 0.00749607, + "balance_loss_clip": 1.04059994, + "balance_loss_mlp": 1.00101483, + "epoch": 0.11122801743574327, + "flos": 15523106256000.0, + "grad_norm": 2.5753838889808347, + "language_loss": 0.71944839, + "learning_rate": 3.931186464630601e-06, + "loss": 0.73800385, + "num_input_tokens_seen": 40029315, + "step": 1850, + "time_per_iteration": 2.662773609161377 + }, + { + "auxiliary_loss_clip": 0.01132724, + "auxiliary_loss_mlp": 0.01055707, + "balance_loss_clip": 1.04353046, + "balance_loss_mlp": 1.03304553, + "epoch": 0.11128814068841124, + "flos": 14392279307520.0, + "grad_norm": 2.147436605979569, + "language_loss": 0.81195593, + "learning_rate": 3.931085145729588e-06, + "loss": 0.83384025, + "num_input_tokens_seen": 40045765, + "step": 1851, + "time_per_iteration": 2.628924608230591 + }, + { + "auxiliary_loss_clip": 0.0113649, + "auxiliary_loss_mlp": 0.01057694, + "balance_loss_clip": 1.04880154, + "balance_loss_mlp": 1.03617668, + "epoch": 0.11134826394107922, + "flos": 16653933204480.0, + "grad_norm": 2.6729729065208407, + "language_loss": 0.88635826, + "learning_rate": 3.930983753601631e-06, + "loss": 0.90830004, + "num_input_tokens_seen": 40061660, + "step": 1852, + "time_per_iteration": 2.591027021408081 + }, + { + "auxiliary_loss_clip": 0.0113378, + "auxiliary_loss_mlp": 0.01064259, + "balance_loss_clip": 1.04452491, + "balance_loss_mlp": 1.04061985, + "epoch": 0.11140838719374718, + "flos": 16690993061760.0, + "grad_norm": 3.172475969409247, + "language_loss": 0.72013706, + "learning_rate": 3.930882288250578e-06, + "loss": 0.74211746, + "num_input_tokens_seen": 40080180, + "step": 1853, + "time_per_iteration": 2.6222727298736572 + }, + { + "auxiliary_loss_clip": 0.0104308, + "auxiliary_loss_mlp": 0.01021141, + "balance_loss_clip": 1.01277018, + "balance_loss_mlp": 1.01860189, + "epoch": 0.11146851044641515, + "flos": 60976355587200.0, + "grad_norm": 0.7751522233636527, + "language_loss": 0.53707653, + "learning_rate": 3.930780749680273e-06, + "loss": 0.55771875, + "num_input_tokens_seen": 40138910, + "step": 1854, + "time_per_iteration": 3.2061901092529297 + }, + { + "auxiliary_loss_clip": 0.01122452, + "auxiliary_loss_mlp": 0.01055557, + "balance_loss_clip": 1.0417192, + "balance_loss_mlp": 1.03241849, + "epoch": 0.11152863369908313, + "flos": 22193835719040.0, + "grad_norm": 2.290634675695484, + "language_loss": 0.84869814, + "learning_rate": 3.9306791378945705e-06, + "loss": 0.87047827, + "num_input_tokens_seen": 40157745, + "step": 1855, + "time_per_iteration": 2.660950183868408 + }, + { + "auxiliary_loss_clip": 0.0111688, + "auxiliary_loss_mlp": 0.01069778, + "balance_loss_clip": 1.04357469, + "balance_loss_mlp": 1.04784346, + "epoch": 0.11158875695175109, + "flos": 19537524115200.0, + "grad_norm": 1.97466171623185, + "language_loss": 0.81240237, + "learning_rate": 3.9305774528973205e-06, + "loss": 0.83426899, + "num_input_tokens_seen": 40175375, + "step": 1856, + "time_per_iteration": 2.6889617443084717 + }, + { + "auxiliary_loss_clip": 0.01133634, + "auxiliary_loss_mlp": 0.01049523, + "balance_loss_clip": 1.04520297, + "balance_loss_mlp": 1.02692068, + "epoch": 0.11164888020441906, + "flos": 25442709989760.0, + "grad_norm": 1.8846365657404356, + "language_loss": 0.83144158, + "learning_rate": 3.93047569469238e-06, + "loss": 0.85327309, + "num_input_tokens_seen": 40195715, + "step": 1857, + "time_per_iteration": 2.746687889099121 + }, + { + "auxiliary_loss_clip": 0.01108951, + "auxiliary_loss_mlp": 0.01046449, + "balance_loss_clip": 1.03837132, + "balance_loss_mlp": 1.02505088, + "epoch": 0.11170900345708702, + "flos": 15632741543040.0, + "grad_norm": 3.6507890442792985, + "language_loss": 0.82836348, + "learning_rate": 3.930373863283608e-06, + "loss": 0.84991753, + "num_input_tokens_seen": 40213975, + "step": 1858, + "time_per_iteration": 2.7005584239959717 + }, + { + "auxiliary_loss_clip": 0.01096552, + "auxiliary_loss_mlp": 0.01057608, + "balance_loss_clip": 1.03833532, + "balance_loss_mlp": 1.03455305, + "epoch": 0.111769126709755, + "flos": 23039424766080.0, + "grad_norm": 2.057127627319252, + "language_loss": 0.91347933, + "learning_rate": 3.930271958674866e-06, + "loss": 0.93502086, + "num_input_tokens_seen": 40233905, + "step": 1859, + "time_per_iteration": 2.69526743888855 + }, + { + "auxiliary_loss_clip": 0.01136058, + "auxiliary_loss_mlp": 0.01047314, + "balance_loss_clip": 1.04333806, + "balance_loss_mlp": 1.02478361, + "epoch": 0.11182924996242297, + "flos": 20850705434880.0, + "grad_norm": 2.326238068031393, + "language_loss": 0.81433934, + "learning_rate": 3.930169980870018e-06, + "loss": 0.83617306, + "num_input_tokens_seen": 40252810, + "step": 1860, + "time_per_iteration": 2.621760368347168 + }, + { + "auxiliary_loss_clip": 0.01119427, + "auxiliary_loss_mlp": 0.01058849, + "balance_loss_clip": 1.04384756, + "balance_loss_mlp": 1.03677166, + "epoch": 0.11188937321509093, + "flos": 17455315587840.0, + "grad_norm": 2.031481834693103, + "language_loss": 0.74835193, + "learning_rate": 3.930067929872931e-06, + "loss": 0.77013469, + "num_input_tokens_seen": 40272000, + "step": 1861, + "time_per_iteration": 2.691840410232544 + }, + { + "auxiliary_loss_clip": 0.01144278, + "auxiliary_loss_mlp": 0.01056496, + "balance_loss_clip": 1.04465914, + "balance_loss_mlp": 1.03553939, + "epoch": 0.11194949646775891, + "flos": 24095916518400.0, + "grad_norm": 1.873638284748468, + "language_loss": 0.89081109, + "learning_rate": 3.929965805687474e-06, + "loss": 0.91281879, + "num_input_tokens_seen": 40290660, + "step": 1862, + "time_per_iteration": 2.5758769512176514 + }, + { + "auxiliary_loss_clip": 0.01130993, + "auxiliary_loss_mlp": 0.01064216, + "balance_loss_clip": 1.04768658, + "balance_loss_mlp": 1.04207885, + "epoch": 0.11200961972042688, + "flos": 25153880728320.0, + "grad_norm": 2.2254305298389725, + "language_loss": 0.8737036, + "learning_rate": 3.92986360831752e-06, + "loss": 0.89565563, + "num_input_tokens_seen": 40307820, + "step": 1863, + "time_per_iteration": 2.693206548690796 + }, + { + "auxiliary_loss_clip": 0.01125871, + "auxiliary_loss_mlp": 0.01050629, + "balance_loss_clip": 1.04306138, + "balance_loss_mlp": 1.02651286, + "epoch": 0.11206974297309484, + "flos": 21288312829440.0, + "grad_norm": 2.495181791969052, + "language_loss": 0.64497626, + "learning_rate": 3.929761337766945e-06, + "loss": 0.66674125, + "num_input_tokens_seen": 40327430, + "step": 1864, + "time_per_iteration": 2.6470725536346436 + }, + { + "auxiliary_loss_clip": 0.01088543, + "auxiliary_loss_mlp": 0.01054038, + "balance_loss_clip": 1.04568934, + "balance_loss_mlp": 1.03299785, + "epoch": 0.11212986622576282, + "flos": 18915982151040.0, + "grad_norm": 1.943463696003662, + "language_loss": 0.73822153, + "learning_rate": 3.929658994039627e-06, + "loss": 0.75964737, + "num_input_tokens_seen": 40344545, + "step": 1865, + "time_per_iteration": 2.7704999446868896 + }, + { + "auxiliary_loss_clip": 0.01077478, + "auxiliary_loss_mlp": 0.01060745, + "balance_loss_clip": 1.04067123, + "balance_loss_mlp": 1.03546083, + "epoch": 0.11218998947843078, + "flos": 22054754257920.0, + "grad_norm": 2.533540638787177, + "language_loss": 0.84402263, + "learning_rate": 3.929556577139446e-06, + "loss": 0.86540484, + "num_input_tokens_seen": 40362300, + "step": 1866, + "time_per_iteration": 2.701897144317627 + }, + { + "auxiliary_loss_clip": 0.01050786, + "auxiliary_loss_mlp": 0.00749483, + "balance_loss_clip": 1.0316422, + "balance_loss_mlp": 1.0007875, + "epoch": 0.11225011273109875, + "flos": 24571697091840.0, + "grad_norm": 1.50420394275771, + "language_loss": 0.81275892, + "learning_rate": 3.929454087070286e-06, + "loss": 0.83076155, + "num_input_tokens_seen": 40384720, + "step": 1867, + "time_per_iteration": 2.7666237354278564 + }, + { + "auxiliary_loss_clip": 0.01149504, + "auxiliary_loss_mlp": 0.01059946, + "balance_loss_clip": 1.04875576, + "balance_loss_mlp": 1.03882217, + "epoch": 0.11231023598376672, + "flos": 28438665621120.0, + "grad_norm": 2.067569132601815, + "language_loss": 0.86405087, + "learning_rate": 3.929351523836035e-06, + "loss": 0.88614535, + "num_input_tokens_seen": 40404000, + "step": 1868, + "time_per_iteration": 2.6453070640563965 + }, + { + "auxiliary_loss_clip": 0.01126335, + "auxiliary_loss_mlp": 0.00749445, + "balance_loss_clip": 1.04539442, + "balance_loss_mlp": 1.00070524, + "epoch": 0.1123703592364347, + "flos": 14426466076800.0, + "grad_norm": 2.4023910441335485, + "language_loss": 0.68107545, + "learning_rate": 3.9292488874405795e-06, + "loss": 0.69983321, + "num_input_tokens_seen": 40418665, + "step": 1869, + "time_per_iteration": 2.623420000076294 + }, + { + "auxiliary_loss_clip": 0.01100759, + "auxiliary_loss_mlp": 0.01067064, + "balance_loss_clip": 1.04248095, + "balance_loss_mlp": 1.04306722, + "epoch": 0.11243048248910266, + "flos": 22236282616320.0, + "grad_norm": 1.596217089066123, + "language_loss": 0.77108699, + "learning_rate": 3.929146177887814e-06, + "loss": 0.79276526, + "num_input_tokens_seen": 40437870, + "step": 1870, + "time_per_iteration": 2.7235655784606934 + }, + { + "auxiliary_loss_clip": 0.01106635, + "auxiliary_loss_mlp": 0.01061572, + "balance_loss_clip": 1.0453645, + "balance_loss_mlp": 1.03708673, + "epoch": 0.11249060574177062, + "flos": 18584167288320.0, + "grad_norm": 2.080843378537957, + "language_loss": 0.76176822, + "learning_rate": 3.929043395181631e-06, + "loss": 0.78345031, + "num_input_tokens_seen": 40455570, + "step": 1871, + "time_per_iteration": 2.79702091217041 + }, + { + "auxiliary_loss_clip": 0.01086269, + "auxiliary_loss_mlp": 0.01049744, + "balance_loss_clip": 1.04519892, + "balance_loss_mlp": 1.02811968, + "epoch": 0.1125507289944386, + "flos": 22856567604480.0, + "grad_norm": 3.7724487000335998, + "language_loss": 0.81767386, + "learning_rate": 3.928940539325929e-06, + "loss": 0.83903402, + "num_input_tokens_seen": 40473600, + "step": 1872, + "time_per_iteration": 3.0332999229431152 + }, + { + "auxiliary_loss_clip": 0.01151336, + "auxiliary_loss_mlp": 0.01058709, + "balance_loss_clip": 1.04805434, + "balance_loss_mlp": 1.03616607, + "epoch": 0.11261085224710657, + "flos": 19676390094720.0, + "grad_norm": 2.2652107460373814, + "language_loss": 0.83618343, + "learning_rate": 3.9288376103246095e-06, + "loss": 0.85828388, + "num_input_tokens_seen": 40490025, + "step": 1873, + "time_per_iteration": 2.723940372467041 + }, + { + "auxiliary_loss_clip": 0.01116351, + "auxiliary_loss_mlp": 0.01060939, + "balance_loss_clip": 1.04332852, + "balance_loss_mlp": 1.03638148, + "epoch": 0.11267097549977453, + "flos": 26063246373120.0, + "grad_norm": 2.123393177371943, + "language_loss": 0.92239058, + "learning_rate": 3.928734608181575e-06, + "loss": 0.94416344, + "num_input_tokens_seen": 40511580, + "step": 1874, + "time_per_iteration": 2.9065611362457275 + }, + { + "auxiliary_loss_clip": 0.01102441, + "auxiliary_loss_mlp": 0.01067681, + "balance_loss_clip": 1.03839231, + "balance_loss_mlp": 1.04525769, + "epoch": 0.11273109875244251, + "flos": 21068036674560.0, + "grad_norm": 1.4176165856231782, + "language_loss": 0.75402224, + "learning_rate": 3.928631532900729e-06, + "loss": 0.77572346, + "num_input_tokens_seen": 40530155, + "step": 1875, + "time_per_iteration": 3.0070035457611084 + }, + { + "auxiliary_loss_clip": 0.01134272, + "auxiliary_loss_mlp": 0.0106187, + "balance_loss_clip": 1.04652464, + "balance_loss_mlp": 1.0412823, + "epoch": 0.11279122200511048, + "flos": 27088999061760.0, + "grad_norm": 1.9599888023007677, + "language_loss": 0.7182765, + "learning_rate": 3.928528384485984e-06, + "loss": 0.74023795, + "num_input_tokens_seen": 40549500, + "step": 1876, + "time_per_iteration": 2.8247952461242676 + }, + { + "auxiliary_loss_clip": 0.01113681, + "auxiliary_loss_mlp": 0.01056877, + "balance_loss_clip": 1.04310322, + "balance_loss_mlp": 1.03468049, + "epoch": 0.11285134525777844, + "flos": 20187901722240.0, + "grad_norm": 2.2376649840922562, + "language_loss": 0.76664203, + "learning_rate": 3.9284251629412475e-06, + "loss": 0.7883476, + "num_input_tokens_seen": 40567475, + "step": 1877, + "time_per_iteration": 2.865095615386963 + }, + { + "auxiliary_loss_clip": 0.01135262, + "auxiliary_loss_mlp": 0.01065882, + "balance_loss_clip": 1.04501212, + "balance_loss_mlp": 1.04313755, + "epoch": 0.11291146851044641, + "flos": 12458453863680.0, + "grad_norm": 2.6899459750103683, + "language_loss": 0.87722301, + "learning_rate": 3.928321868270436e-06, + "loss": 0.89923453, + "num_input_tokens_seen": 40583280, + "step": 1878, + "time_per_iteration": 2.6264352798461914 + }, + { + "auxiliary_loss_clip": 0.01111071, + "auxiliary_loss_mlp": 0.01054416, + "balance_loss_clip": 1.04279089, + "balance_loss_mlp": 1.03226709, + "epoch": 0.11297159176311439, + "flos": 23842315520640.0, + "grad_norm": 2.164400246481085, + "language_loss": 0.81138831, + "learning_rate": 3.928218500477466e-06, + "loss": 0.83304316, + "num_input_tokens_seen": 40603080, + "step": 1879, + "time_per_iteration": 4.403631687164307 + }, + { + "auxiliary_loss_clip": 0.01119626, + "auxiliary_loss_mlp": 0.01065668, + "balance_loss_clip": 1.04155993, + "balance_loss_mlp": 1.0420531, + "epoch": 0.11303171501578235, + "flos": 29930538124800.0, + "grad_norm": 1.996770882008451, + "language_loss": 0.69787276, + "learning_rate": 3.928115059566259e-06, + "loss": 0.71972573, + "num_input_tokens_seen": 40623255, + "step": 1880, + "time_per_iteration": 2.701503276824951 + }, + { + "auxiliary_loss_clip": 0.0111927, + "auxiliary_loss_mlp": 0.01055778, + "balance_loss_clip": 1.04170156, + "balance_loss_mlp": 1.03352118, + "epoch": 0.11309183826845032, + "flos": 16180558842240.0, + "grad_norm": 1.5773428891701924, + "language_loss": 0.72344834, + "learning_rate": 3.928011545540734e-06, + "loss": 0.74519873, + "num_input_tokens_seen": 40641570, + "step": 1881, + "time_per_iteration": 2.582202672958374 + }, + { + "auxiliary_loss_clip": 0.01110357, + "auxiliary_loss_mlp": 0.00749488, + "balance_loss_clip": 1.04000354, + "balance_loss_mlp": 1.00065446, + "epoch": 0.1131519615211183, + "flos": 12020702814720.0, + "grad_norm": 2.445882655583624, + "language_loss": 0.74196303, + "learning_rate": 3.927907958404819e-06, + "loss": 0.76056147, + "num_input_tokens_seen": 40658775, + "step": 1882, + "time_per_iteration": 2.8965647220611572 + }, + { + "auxiliary_loss_clip": 0.01143769, + "auxiliary_loss_mlp": 0.01055157, + "balance_loss_clip": 1.04389393, + "balance_loss_mlp": 1.03188705, + "epoch": 0.11321208477378626, + "flos": 26250125857920.0, + "grad_norm": 1.961160949593407, + "language_loss": 0.79120928, + "learning_rate": 3.92780429816244e-06, + "loss": 0.81319851, + "num_input_tokens_seen": 40679555, + "step": 1883, + "time_per_iteration": 5.80859637260437 + }, + { + "auxiliary_loss_clip": 0.01111357, + "auxiliary_loss_mlp": 0.01061328, + "balance_loss_clip": 1.04294157, + "balance_loss_mlp": 1.03798699, + "epoch": 0.11327220802645423, + "flos": 13626376583040.0, + "grad_norm": 1.9991424137555336, + "language_loss": 0.77138156, + "learning_rate": 3.927700564817529e-06, + "loss": 0.79310846, + "num_input_tokens_seen": 40697295, + "step": 1884, + "time_per_iteration": 2.857499599456787 + }, + { + "auxiliary_loss_clip": 0.0102805, + "auxiliary_loss_mlp": 0.01014952, + "balance_loss_clip": 1.00693107, + "balance_loss_mlp": 1.01262748, + "epoch": 0.1133323312791222, + "flos": 57191802814080.0, + "grad_norm": 0.8657596450191127, + "language_loss": 0.55249763, + "learning_rate": 3.927596758374019e-06, + "loss": 0.57292771, + "num_input_tokens_seen": 40758095, + "step": 1885, + "time_per_iteration": 3.0511181354522705 + }, + { + "auxiliary_loss_clip": 0.01050552, + "auxiliary_loss_mlp": 0.01048395, + "balance_loss_clip": 1.03369212, + "balance_loss_mlp": 1.02702069, + "epoch": 0.11339245453179017, + "flos": 24351708245760.0, + "grad_norm": 2.28190756928075, + "language_loss": 0.9053275, + "learning_rate": 3.927492878835848e-06, + "loss": 0.92631692, + "num_input_tokens_seen": 40777140, + "step": 1886, + "time_per_iteration": 2.8297767639160156 + }, + { + "auxiliary_loss_clip": 0.01073367, + "auxiliary_loss_mlp": 0.01057418, + "balance_loss_clip": 1.03781724, + "balance_loss_mlp": 1.0354836, + "epoch": 0.11345257778445814, + "flos": 22670693700480.0, + "grad_norm": 1.9661692970552223, + "language_loss": 0.85048175, + "learning_rate": 3.927388926206953e-06, + "loss": 0.87178957, + "num_input_tokens_seen": 40797505, + "step": 1887, + "time_per_iteration": 2.744987726211548 + }, + { + "auxiliary_loss_clip": 0.01109399, + "auxiliary_loss_mlp": 0.01056875, + "balance_loss_clip": 1.04258895, + "balance_loss_mlp": 1.03671682, + "epoch": 0.11351270103712612, + "flos": 20988242611200.0, + "grad_norm": 4.290979081190958, + "language_loss": 0.76347268, + "learning_rate": 3.927284900491277e-06, + "loss": 0.78513539, + "num_input_tokens_seen": 40812970, + "step": 1888, + "time_per_iteration": 2.707838773727417 + }, + { + "auxiliary_loss_clip": 0.01101049, + "auxiliary_loss_mlp": 0.01061698, + "balance_loss_clip": 1.04456615, + "balance_loss_mlp": 1.03838062, + "epoch": 0.11357282428979408, + "flos": 37347923600640.0, + "grad_norm": 1.889249425897915, + "language_loss": 0.67758864, + "learning_rate": 3.927180801692764e-06, + "loss": 0.69921613, + "num_input_tokens_seen": 40837745, + "step": 1889, + "time_per_iteration": 2.8926286697387695 + }, + { + "auxiliary_loss_clip": 0.0114404, + "auxiliary_loss_mlp": 0.01047829, + "balance_loss_clip": 1.04563951, + "balance_loss_mlp": 1.02619267, + "epoch": 0.11363294754246205, + "flos": 21757018423680.0, + "grad_norm": 2.116129184646634, + "language_loss": 0.8408919, + "learning_rate": 3.927076629815362e-06, + "loss": 0.86281061, + "num_input_tokens_seen": 40856490, + "step": 1890, + "time_per_iteration": 2.5822229385375977 + }, + { + "auxiliary_loss_clip": 0.01113026, + "auxiliary_loss_mlp": 0.01055418, + "balance_loss_clip": 1.04054713, + "balance_loss_mlp": 1.03353119, + "epoch": 0.11369307079513001, + "flos": 22601637803520.0, + "grad_norm": 2.1257602449634647, + "language_loss": 0.64989412, + "learning_rate": 3.926972384863022e-06, + "loss": 0.67157853, + "num_input_tokens_seen": 40874070, + "step": 1891, + "time_per_iteration": 2.6812376976013184 + }, + { + "auxiliary_loss_clip": 0.01104653, + "auxiliary_loss_mlp": 0.0104595, + "balance_loss_clip": 1.03884828, + "balance_loss_mlp": 1.0250051, + "epoch": 0.11375319404779799, + "flos": 21944257044480.0, + "grad_norm": 2.2703808673863, + "language_loss": 0.88228226, + "learning_rate": 3.9268680668396956e-06, + "loss": 0.90378821, + "num_input_tokens_seen": 40892425, + "step": 1892, + "time_per_iteration": 2.7137811183929443 + }, + { + "auxiliary_loss_clip": 0.0108302, + "auxiliary_loss_mlp": 0.0107356, + "balance_loss_clip": 1.04317927, + "balance_loss_mlp": 1.04986119, + "epoch": 0.11381331730046595, + "flos": 26395456285440.0, + "grad_norm": 2.5474221263893346, + "language_loss": 0.72802299, + "learning_rate": 3.926763675749339e-06, + "loss": 0.74958885, + "num_input_tokens_seen": 40912190, + "step": 1893, + "time_per_iteration": 2.807018756866455 + }, + { + "auxiliary_loss_clip": 0.0114034, + "auxiliary_loss_mlp": 0.01064152, + "balance_loss_clip": 1.04198897, + "balance_loss_mlp": 1.04187238, + "epoch": 0.11387344055313392, + "flos": 23804716959360.0, + "grad_norm": 2.069802887046996, + "language_loss": 0.79339725, + "learning_rate": 3.92665921159591e-06, + "loss": 0.8154422, + "num_input_tokens_seen": 40928395, + "step": 1894, + "time_per_iteration": 2.694516658782959 + }, + { + "auxiliary_loss_clip": 0.01126461, + "auxiliary_loss_mlp": 0.01059396, + "balance_loss_clip": 1.04725766, + "balance_loss_mlp": 1.03684187, + "epoch": 0.1139335638058019, + "flos": 34522865902080.0, + "grad_norm": 2.90727558413795, + "language_loss": 0.7932806, + "learning_rate": 3.926554674383371e-06, + "loss": 0.81513917, + "num_input_tokens_seen": 40946555, + "step": 1895, + "time_per_iteration": 2.7925631999969482 + }, + { + "auxiliary_loss_clip": 0.01034909, + "auxiliary_loss_mlp": 0.01035009, + "balance_loss_clip": 1.00547278, + "balance_loss_mlp": 1.03275633, + "epoch": 0.11399368705846986, + "flos": 70587811520640.0, + "grad_norm": 0.810484967209812, + "language_loss": 0.63379127, + "learning_rate": 3.926450064115686e-06, + "loss": 0.65449047, + "num_input_tokens_seen": 41004910, + "step": 1896, + "time_per_iteration": 3.300889730453491 + }, + { + "auxiliary_loss_clip": 0.01117254, + "auxiliary_loss_mlp": 0.01058723, + "balance_loss_clip": 1.04493356, + "balance_loss_mlp": 1.03556097, + "epoch": 0.11405381031113783, + "flos": 21324259365120.0, + "grad_norm": 1.826176097552258, + "language_loss": 0.85129654, + "learning_rate": 3.926345380796821e-06, + "loss": 0.87305629, + "num_input_tokens_seen": 41026385, + "step": 1897, + "time_per_iteration": 2.7087273597717285 + }, + { + "auxiliary_loss_clip": 0.0114229, + "auxiliary_loss_mlp": 0.00749343, + "balance_loss_clip": 1.04299068, + "balance_loss_mlp": 1.00067115, + "epoch": 0.11411393356380581, + "flos": 19719627091200.0, + "grad_norm": 2.2239082767370184, + "language_loss": 0.79698205, + "learning_rate": 3.9262406244307465e-06, + "loss": 0.8158983, + "num_input_tokens_seen": 41045315, + "step": 1898, + "time_per_iteration": 2.674971342086792 + }, + { + "auxiliary_loss_clip": 0.0107747, + "auxiliary_loss_mlp": 0.01058427, + "balance_loss_clip": 1.03493285, + "balance_loss_mlp": 1.034549, + "epoch": 0.11417405681647377, + "flos": 17530440883200.0, + "grad_norm": 16.92837712949887, + "language_loss": 0.72899413, + "learning_rate": 3.926135795021435e-06, + "loss": 0.7503531, + "num_input_tokens_seen": 41063390, + "step": 1899, + "time_per_iteration": 2.6905739307403564 + }, + { + "auxiliary_loss_clip": 0.00999657, + "auxiliary_loss_mlp": 0.01003173, + "balance_loss_clip": 1.00917447, + "balance_loss_mlp": 1.00064552, + "epoch": 0.11423418006914174, + "flos": 59674666619520.0, + "grad_norm": 0.907067964211528, + "language_loss": 0.63425547, + "learning_rate": 3.92603089257286e-06, + "loss": 0.65428376, + "num_input_tokens_seen": 41124180, + "step": 1900, + "time_per_iteration": 3.3141350746154785 + }, + { + "auxiliary_loss_clip": 0.01074115, + "auxiliary_loss_mlp": 0.01055501, + "balance_loss_clip": 1.03629458, + "balance_loss_mlp": 1.0330658, + "epoch": 0.1142943033218097, + "flos": 22963114321920.0, + "grad_norm": 1.6617470825296818, + "language_loss": 0.78399491, + "learning_rate": 3.925925917089001e-06, + "loss": 0.80529112, + "num_input_tokens_seen": 41143485, + "step": 1901, + "time_per_iteration": 2.7537965774536133 + }, + { + "auxiliary_loss_clip": 0.0113562, + "auxiliary_loss_mlp": 0.010544, + "balance_loss_clip": 1.04532623, + "balance_loss_mlp": 1.03328812, + "epoch": 0.11435442657447768, + "flos": 18256267008000.0, + "grad_norm": 2.436907106649645, + "language_loss": 0.83930498, + "learning_rate": 3.925820868573839e-06, + "loss": 0.86120522, + "num_input_tokens_seen": 41161695, + "step": 1902, + "time_per_iteration": 2.614823579788208 + }, + { + "auxiliary_loss_clip": 0.01121048, + "auxiliary_loss_mlp": 0.01049948, + "balance_loss_clip": 1.04078293, + "balance_loss_mlp": 1.02659512, + "epoch": 0.11441454982714565, + "flos": 24061191045120.0, + "grad_norm": 1.6837847361171687, + "language_loss": 0.77646053, + "learning_rate": 3.925715747031356e-06, + "loss": 0.79817045, + "num_input_tokens_seen": 41181715, + "step": 1903, + "time_per_iteration": 2.6047439575195312 + }, + { + "auxiliary_loss_clip": 0.01109988, + "auxiliary_loss_mlp": 0.01043504, + "balance_loss_clip": 1.04171431, + "balance_loss_mlp": 1.02496719, + "epoch": 0.11447467307981361, + "flos": 25337707557120.0, + "grad_norm": 2.0332254846396474, + "language_loss": 0.75630832, + "learning_rate": 3.925610552465539e-06, + "loss": 0.77784324, + "num_input_tokens_seen": 41201770, + "step": 1904, + "time_per_iteration": 2.664895534515381 + }, + { + "auxiliary_loss_clip": 0.01109209, + "auxiliary_loss_mlp": 0.01058129, + "balance_loss_clip": 1.03990412, + "balance_loss_mlp": 1.03626633, + "epoch": 0.11453479633248159, + "flos": 21726063878400.0, + "grad_norm": 2.012701415477347, + "language_loss": 0.91988027, + "learning_rate": 3.9255052848803764e-06, + "loss": 0.94155365, + "num_input_tokens_seen": 41220590, + "step": 1905, + "time_per_iteration": 2.6087934970855713 + }, + { + "auxiliary_loss_clip": 0.01123607, + "auxiliary_loss_mlp": 0.01049999, + "balance_loss_clip": 1.0390954, + "balance_loss_mlp": 1.02754021, + "epoch": 0.11459491958514956, + "flos": 12969714096000.0, + "grad_norm": 2.3483807734576985, + "language_loss": 0.77115381, + "learning_rate": 3.925399944279861e-06, + "loss": 0.79288989, + "num_input_tokens_seen": 41237250, + "step": 1906, + "time_per_iteration": 2.6993191242218018 + }, + { + "auxiliary_loss_clip": 0.01145873, + "auxiliary_loss_mlp": 0.01057352, + "balance_loss_clip": 1.04596424, + "balance_loss_mlp": 1.03582263, + "epoch": 0.11465504283781752, + "flos": 22711273090560.0, + "grad_norm": 1.9022678381672349, + "language_loss": 0.8136372, + "learning_rate": 3.925294530667986e-06, + "loss": 0.83566946, + "num_input_tokens_seen": 41256680, + "step": 1907, + "time_per_iteration": 2.557236909866333 + }, + { + "auxiliary_loss_clip": 0.01097404, + "auxiliary_loss_mlp": 0.01061306, + "balance_loss_clip": 1.04067218, + "balance_loss_mlp": 1.03970504, + "epoch": 0.1147151660904855, + "flos": 23398387332480.0, + "grad_norm": 2.672744884732703, + "language_loss": 0.85008049, + "learning_rate": 3.92518904404875e-06, + "loss": 0.87166756, + "num_input_tokens_seen": 41270955, + "step": 1908, + "time_per_iteration": 2.8047163486480713 + }, + { + "auxiliary_loss_clip": 0.00993697, + "auxiliary_loss_mlp": 0.01024144, + "balance_loss_clip": 1.00453472, + "balance_loss_mlp": 1.02154493, + "epoch": 0.11477528934315347, + "flos": 63011843498880.0, + "grad_norm": 0.9426021179881537, + "language_loss": 0.61027998, + "learning_rate": 3.925083484426153e-06, + "loss": 0.63045841, + "num_input_tokens_seen": 41319180, + "step": 1909, + "time_per_iteration": 3.1102678775787354 + }, + { + "auxiliary_loss_clip": 0.01147418, + "auxiliary_loss_mlp": 0.01050401, + "balance_loss_clip": 1.04826093, + "balance_loss_mlp": 1.02984953, + "epoch": 0.11483541259582143, + "flos": 16325601960960.0, + "grad_norm": 1.7368661772361653, + "language_loss": 0.79122859, + "learning_rate": 3.924977851804197e-06, + "loss": 0.81320673, + "num_input_tokens_seen": 41337480, + "step": 1910, + "time_per_iteration": 2.5859363079071045 + }, + { + "auxiliary_loss_clip": 0.01119969, + "auxiliary_loss_mlp": 0.01052544, + "balance_loss_clip": 1.0448997, + "balance_loss_mlp": 1.0314436, + "epoch": 0.1148955358484894, + "flos": 21580410228480.0, + "grad_norm": 2.1143871766185023, + "language_loss": 0.77060711, + "learning_rate": 3.9248721461868875e-06, + "loss": 0.79233229, + "num_input_tokens_seen": 41354650, + "step": 1911, + "time_per_iteration": 2.647540330886841 + }, + { + "auxiliary_loss_clip": 0.01118439, + "auxiliary_loss_mlp": 0.01048245, + "balance_loss_clip": 1.04146338, + "balance_loss_mlp": 1.02744246, + "epoch": 0.11495565910115738, + "flos": 27673696650240.0, + "grad_norm": 1.9577089195763389, + "language_loss": 0.79407018, + "learning_rate": 3.9247663675782336e-06, + "loss": 0.81573701, + "num_input_tokens_seen": 41376935, + "step": 1912, + "time_per_iteration": 2.771252393722534 + }, + { + "auxiliary_loss_clip": 0.01142219, + "auxiliary_loss_mlp": 0.00749332, + "balance_loss_clip": 1.04421401, + "balance_loss_mlp": 1.00043845, + "epoch": 0.11501578235382534, + "flos": 20632368614400.0, + "grad_norm": 3.2125421324918806, + "language_loss": 0.77815753, + "learning_rate": 3.924660515982246e-06, + "loss": 0.79707301, + "num_input_tokens_seen": 41396105, + "step": 1913, + "time_per_iteration": 2.774667263031006 + }, + { + "auxiliary_loss_clip": 0.01130847, + "auxiliary_loss_mlp": 0.01054137, + "balance_loss_clip": 1.0412215, + "balance_loss_mlp": 1.03269076, + "epoch": 0.1150759056064933, + "flos": 19829046896640.0, + "grad_norm": 1.9047717523403114, + "language_loss": 0.70128316, + "learning_rate": 3.924554591402939e-06, + "loss": 0.72313297, + "num_input_tokens_seen": 41415600, + "step": 1914, + "time_per_iteration": 2.7084238529205322 + }, + { + "auxiliary_loss_clip": 0.00989681, + "auxiliary_loss_mlp": 0.01012268, + "balance_loss_clip": 1.01795304, + "balance_loss_mlp": 1.00811911, + "epoch": 0.11513602885916129, + "flos": 70045776311040.0, + "grad_norm": 0.7623804400503781, + "language_loss": 0.61048603, + "learning_rate": 3.92444859384433e-06, + "loss": 0.63050544, + "num_input_tokens_seen": 41478760, + "step": 1915, + "time_per_iteration": 3.4852206707000732 + }, + { + "auxiliary_loss_clip": 0.0112997, + "auxiliary_loss_mlp": 0.01058102, + "balance_loss_clip": 1.04554093, + "balance_loss_mlp": 1.03676319, + "epoch": 0.11519615211182925, + "flos": 15741730385280.0, + "grad_norm": 2.1656078672793795, + "language_loss": 0.93244046, + "learning_rate": 3.924342523310436e-06, + "loss": 0.95432115, + "num_input_tokens_seen": 41495720, + "step": 1916, + "time_per_iteration": 2.749086380004883 + }, + { + "auxiliary_loss_clip": 0.0112552, + "auxiliary_loss_mlp": 0.010631, + "balance_loss_clip": 1.04431129, + "balance_loss_mlp": 1.03896022, + "epoch": 0.11525627536449722, + "flos": 20667632791680.0, + "grad_norm": 2.0128684378867554, + "language_loss": 0.72803682, + "learning_rate": 3.9242363798052806e-06, + "loss": 0.74992299, + "num_input_tokens_seen": 41513585, + "step": 1917, + "time_per_iteration": 2.6295597553253174 + }, + { + "auxiliary_loss_clip": 0.01104725, + "auxiliary_loss_mlp": 0.01047171, + "balance_loss_clip": 1.04314542, + "balance_loss_mlp": 1.02545142, + "epoch": 0.1153163986171652, + "flos": 20303283185280.0, + "grad_norm": 2.010532370820538, + "language_loss": 0.74616647, + "learning_rate": 3.92413016333289e-06, + "loss": 0.76768541, + "num_input_tokens_seen": 41533390, + "step": 1918, + "time_per_iteration": 2.692603826522827 + }, + { + "auxiliary_loss_clip": 0.01114535, + "auxiliary_loss_mlp": 0.01050982, + "balance_loss_clip": 1.04350185, + "balance_loss_mlp": 1.03025091, + "epoch": 0.11537652186983316, + "flos": 17639321984640.0, + "grad_norm": 2.0697805967225267, + "language_loss": 0.8627823, + "learning_rate": 3.92402387389729e-06, + "loss": 0.8844375, + "num_input_tokens_seen": 41551015, + "step": 1919, + "time_per_iteration": 2.69608473777771 + }, + { + "auxiliary_loss_clip": 0.0109712, + "auxiliary_loss_mlp": 0.01067633, + "balance_loss_clip": 1.0384109, + "balance_loss_mlp": 1.04407775, + "epoch": 0.11543664512250112, + "flos": 21069401391360.0, + "grad_norm": 1.9732312520447388, + "language_loss": 0.86319351, + "learning_rate": 3.923917511502512e-06, + "loss": 0.88484108, + "num_input_tokens_seen": 41568055, + "step": 1920, + "time_per_iteration": 2.662210702896118 + }, + { + "auxiliary_loss_clip": 0.0112911, + "auxiliary_loss_mlp": 0.0105028, + "balance_loss_clip": 1.04272676, + "balance_loss_mlp": 1.02876234, + "epoch": 0.11549676837516909, + "flos": 22747542848640.0, + "grad_norm": 2.029314966829319, + "language_loss": 0.79384798, + "learning_rate": 3.923811076152589e-06, + "loss": 0.81564188, + "num_input_tokens_seen": 41587435, + "step": 1921, + "time_per_iteration": 2.5845963954925537 + }, + { + "auxiliary_loss_clip": 0.01134395, + "auxiliary_loss_mlp": 0.01059618, + "balance_loss_clip": 1.04185462, + "balance_loss_mlp": 1.0368371, + "epoch": 0.11555689162783707, + "flos": 19168972617600.0, + "grad_norm": 1.8467713631781846, + "language_loss": 0.78900051, + "learning_rate": 3.923704567851557e-06, + "loss": 0.81094062, + "num_input_tokens_seen": 41604975, + "step": 1922, + "time_per_iteration": 2.572787046432495 + }, + { + "auxiliary_loss_clip": 0.01060141, + "auxiliary_loss_mlp": 0.01057799, + "balance_loss_clip": 1.03680158, + "balance_loss_mlp": 1.03700936, + "epoch": 0.11561701488050503, + "flos": 24572056227840.0, + "grad_norm": 1.8417624805912733, + "language_loss": 0.84305823, + "learning_rate": 3.923597986603456e-06, + "loss": 0.86423767, + "num_input_tokens_seen": 41626155, + "step": 1923, + "time_per_iteration": 2.8477230072021484 + }, + { + "auxiliary_loss_clip": 0.01135683, + "auxiliary_loss_mlp": 0.01055032, + "balance_loss_clip": 1.04577959, + "balance_loss_mlp": 1.03278756, + "epoch": 0.115677138133173, + "flos": 17092546179840.0, + "grad_norm": 2.1544923165708587, + "language_loss": 0.80798304, + "learning_rate": 3.9234913324123264e-06, + "loss": 0.82989019, + "num_input_tokens_seen": 41644805, + "step": 1924, + "time_per_iteration": 2.7907025814056396 + }, + { + "auxiliary_loss_clip": 0.01015313, + "auxiliary_loss_mlp": 0.01006453, + "balance_loss_clip": 1.00522637, + "balance_loss_mlp": 1.0041045, + "epoch": 0.11573726138584098, + "flos": 62703875266560.0, + "grad_norm": 0.818843080764095, + "language_loss": 0.6119467, + "learning_rate": 3.923384605282212e-06, + "loss": 0.63216436, + "num_input_tokens_seen": 41709345, + "step": 1925, + "time_per_iteration": 4.7567384243011475 + }, + { + "auxiliary_loss_clip": 0.01118213, + "auxiliary_loss_mlp": 0.01070493, + "balance_loss_clip": 1.04199314, + "balance_loss_mlp": 1.04784298, + "epoch": 0.11579738463850894, + "flos": 22601135013120.0, + "grad_norm": 1.7803642270389692, + "language_loss": 0.75117719, + "learning_rate": 3.923277805217161e-06, + "loss": 0.7730642, + "num_input_tokens_seen": 41730210, + "step": 1926, + "time_per_iteration": 2.7335703372955322 + }, + { + "auxiliary_loss_clip": 0.01074086, + "auxiliary_loss_mlp": 0.00749339, + "balance_loss_clip": 1.03538191, + "balance_loss_mlp": 1.00042617, + "epoch": 0.11585750789117691, + "flos": 21726135705600.0, + "grad_norm": 4.362108976197282, + "language_loss": 0.7269243, + "learning_rate": 3.923170932221222e-06, + "loss": 0.74515855, + "num_input_tokens_seen": 41750270, + "step": 1927, + "time_per_iteration": 2.772796154022217 + }, + { + "auxiliary_loss_clip": 0.01103711, + "auxiliary_loss_mlp": 0.01057095, + "balance_loss_clip": 1.04157341, + "balance_loss_mlp": 1.03474343, + "epoch": 0.11591763114384489, + "flos": 26287544851200.0, + "grad_norm": 2.0307554973250412, + "language_loss": 0.86869371, + "learning_rate": 3.92306398629845e-06, + "loss": 0.89030182, + "num_input_tokens_seen": 41772975, + "step": 1928, + "time_per_iteration": 2.745093822479248 + }, + { + "auxiliary_loss_clip": 0.0109644, + "auxiliary_loss_mlp": 0.01061145, + "balance_loss_clip": 1.04320264, + "balance_loss_mlp": 1.03855443, + "epoch": 0.11597775439651285, + "flos": 23000461488000.0, + "grad_norm": 2.0319440669751403, + "language_loss": 0.77528203, + "learning_rate": 3.922956967452898e-06, + "loss": 0.79685795, + "num_input_tokens_seen": 41791765, + "step": 1929, + "time_per_iteration": 2.740773916244507 + }, + { + "auxiliary_loss_clip": 0.01142947, + "auxiliary_loss_mlp": 0.0106024, + "balance_loss_clip": 1.04509306, + "balance_loss_mlp": 1.0397836, + "epoch": 0.11603787764918082, + "flos": 31941715507200.0, + "grad_norm": 1.5410662467280403, + "language_loss": 0.76704001, + "learning_rate": 3.922849875688626e-06, + "loss": 0.78907192, + "num_input_tokens_seen": 41815615, + "step": 1930, + "time_per_iteration": 6.1206276416778564 + }, + { + "auxiliary_loss_clip": 0.01105494, + "auxiliary_loss_mlp": 0.01052363, + "balance_loss_clip": 1.03699076, + "balance_loss_mlp": 1.03020215, + "epoch": 0.1160980009018488, + "flos": 22271654534400.0, + "grad_norm": 1.806573599440605, + "language_loss": 0.72476965, + "learning_rate": 3.922742711009693e-06, + "loss": 0.74634826, + "num_input_tokens_seen": 41834810, + "step": 1931, + "time_per_iteration": 2.7697412967681885 + }, + { + "auxiliary_loss_clip": 0.01112741, + "auxiliary_loss_mlp": 0.01061484, + "balance_loss_clip": 1.04158723, + "balance_loss_mlp": 1.03742743, + "epoch": 0.11615812415451676, + "flos": 22783633038720.0, + "grad_norm": 1.9956868366187956, + "language_loss": 0.82510173, + "learning_rate": 3.922635473420164e-06, + "loss": 0.84684408, + "num_input_tokens_seen": 41854975, + "step": 1932, + "time_per_iteration": 2.741558313369751 + }, + { + "auxiliary_loss_clip": 0.0101054, + "auxiliary_loss_mlp": 0.01009004, + "balance_loss_clip": 1.01500535, + "balance_loss_mlp": 1.00523722, + "epoch": 0.11621824740718473, + "flos": 67146096107520.0, + "grad_norm": 0.7711920044090875, + "language_loss": 0.61104953, + "learning_rate": 3.922528162924105e-06, + "loss": 0.63124496, + "num_input_tokens_seen": 41911105, + "step": 1933, + "time_per_iteration": 3.2212488651275635 + }, + { + "auxiliary_loss_clip": 0.01072987, + "auxiliary_loss_mlp": 0.00749309, + "balance_loss_clip": 1.03761017, + "balance_loss_mlp": 1.00048399, + "epoch": 0.11627837065985269, + "flos": 20375930442240.0, + "grad_norm": 2.4167842073484658, + "language_loss": 0.85897398, + "learning_rate": 3.922420779525586e-06, + "loss": 0.87719691, + "num_input_tokens_seen": 41931750, + "step": 1934, + "time_per_iteration": 2.8135993480682373 + }, + { + "auxiliary_loss_clip": 0.01083017, + "auxiliary_loss_mlp": 0.01059102, + "balance_loss_clip": 1.03947055, + "balance_loss_mlp": 1.03580832, + "epoch": 0.11633849391252067, + "flos": 21725812483200.0, + "grad_norm": 2.582774479893349, + "language_loss": 0.65772116, + "learning_rate": 3.9223133232286776e-06, + "loss": 0.67914236, + "num_input_tokens_seen": 41949400, + "step": 1935, + "time_per_iteration": 2.7915759086608887 + }, + { + "auxiliary_loss_clip": 0.01149636, + "auxiliary_loss_mlp": 0.01049371, + "balance_loss_clip": 1.04602861, + "balance_loss_mlp": 1.02959394, + "epoch": 0.11639861716518864, + "flos": 18805341283200.0, + "grad_norm": 2.066320006284908, + "language_loss": 0.7595197, + "learning_rate": 3.922205794037456e-06, + "loss": 0.78150976, + "num_input_tokens_seen": 41968100, + "step": 1936, + "time_per_iteration": 2.634237051010132 + }, + { + "auxiliary_loss_clip": 0.01145469, + "auxiliary_loss_mlp": 0.01049879, + "balance_loss_clip": 1.04305089, + "balance_loss_mlp": 1.02728844, + "epoch": 0.1164587404178566, + "flos": 21214983214080.0, + "grad_norm": 1.8978717997145602, + "language_loss": 0.84233868, + "learning_rate": 3.922098191955998e-06, + "loss": 0.8642922, + "num_input_tokens_seen": 41986375, + "step": 1937, + "time_per_iteration": 2.7015957832336426 + }, + { + "auxiliary_loss_clip": 0.01118335, + "auxiliary_loss_mlp": 0.01045228, + "balance_loss_clip": 1.04080844, + "balance_loss_mlp": 1.0246644, + "epoch": 0.11651886367052458, + "flos": 27818632028160.0, + "grad_norm": 2.441003203895154, + "language_loss": 0.76233393, + "learning_rate": 3.921990516988384e-06, + "loss": 0.78396952, + "num_input_tokens_seen": 42006055, + "step": 1938, + "time_per_iteration": 2.754767417907715 + }, + { + "auxiliary_loss_clip": 0.01148385, + "auxiliary_loss_mlp": 0.01056696, + "balance_loss_clip": 1.04489553, + "balance_loss_mlp": 1.03551304, + "epoch": 0.11657898692319255, + "flos": 22889569224960.0, + "grad_norm": 2.6292345989909753, + "language_loss": 0.79740983, + "learning_rate": 3.921882769138696e-06, + "loss": 0.81946063, + "num_input_tokens_seen": 42024995, + "step": 1939, + "time_per_iteration": 2.5627219676971436 + }, + { + "auxiliary_loss_clip": 0.01110763, + "auxiliary_loss_mlp": 0.01055906, + "balance_loss_clip": 1.03939533, + "balance_loss_mlp": 1.03347111, + "epoch": 0.11663911017586051, + "flos": 24315905364480.0, + "grad_norm": 2.852434448660947, + "language_loss": 0.86274016, + "learning_rate": 3.9217749484110215e-06, + "loss": 0.88440686, + "num_input_tokens_seen": 42042640, + "step": 1940, + "time_per_iteration": 2.6696228981018066 + }, + { + "auxiliary_loss_clip": 0.01120589, + "auxiliary_loss_mlp": 0.01058324, + "balance_loss_clip": 1.04360545, + "balance_loss_mlp": 1.03822529, + "epoch": 0.11669923342852849, + "flos": 42340152470400.0, + "grad_norm": 1.4263301696256563, + "language_loss": 0.75858343, + "learning_rate": 3.921667054809449e-06, + "loss": 0.7803725, + "num_input_tokens_seen": 42067005, + "step": 1941, + "time_per_iteration": 2.834040880203247 + }, + { + "auxiliary_loss_clip": 0.01119656, + "auxiliary_loss_mlp": 0.00749377, + "balance_loss_clip": 1.04143584, + "balance_loss_mlp": 1.00048804, + "epoch": 0.11675935668119646, + "flos": 14642288945280.0, + "grad_norm": 2.418105888346411, + "language_loss": 0.88651752, + "learning_rate": 3.921559088338068e-06, + "loss": 0.90520787, + "num_input_tokens_seen": 42082295, + "step": 1942, + "time_per_iteration": 2.6270864009857178 + }, + { + "auxiliary_loss_clip": 0.01129035, + "auxiliary_loss_mlp": 0.01048151, + "balance_loss_clip": 1.04222059, + "balance_loss_mlp": 1.02815986, + "epoch": 0.11681947993386442, + "flos": 35116470063360.0, + "grad_norm": 1.5391503245338738, + "language_loss": 0.67677057, + "learning_rate": 3.921451049000975e-06, + "loss": 0.69854242, + "num_input_tokens_seen": 42105295, + "step": 1943, + "time_per_iteration": 2.7075583934783936 + }, + { + "auxiliary_loss_clip": 0.01119726, + "auxiliary_loss_mlp": 0.01048152, + "balance_loss_clip": 1.0428673, + "balance_loss_mlp": 1.02698016, + "epoch": 0.11687960318653239, + "flos": 38983259024640.0, + "grad_norm": 1.886030743165608, + "language_loss": 0.69422698, + "learning_rate": 3.921342936802265e-06, + "loss": 0.71590573, + "num_input_tokens_seen": 42125520, + "step": 1944, + "time_per_iteration": 2.8006327152252197 + }, + { + "auxiliary_loss_clip": 0.01119876, + "auxiliary_loss_mlp": 0.01050673, + "balance_loss_clip": 1.03896129, + "balance_loss_mlp": 1.03105092, + "epoch": 0.11693972643920036, + "flos": 25994980575360.0, + "grad_norm": 1.5157960475532912, + "language_loss": 0.82720077, + "learning_rate": 3.921234751746038e-06, + "loss": 0.84890628, + "num_input_tokens_seen": 42146335, + "step": 1945, + "time_per_iteration": 2.7075955867767334 + }, + { + "auxiliary_loss_clip": 0.0109814, + "auxiliary_loss_mlp": 0.01059219, + "balance_loss_clip": 1.03481746, + "balance_loss_mlp": 1.03819036, + "epoch": 0.11699984969186833, + "flos": 27272107618560.0, + "grad_norm": 2.1226509111453127, + "language_loss": 0.75974292, + "learning_rate": 3.9211264938363975e-06, + "loss": 0.78131652, + "num_input_tokens_seen": 42165320, + "step": 1946, + "time_per_iteration": 2.6923577785491943 + }, + { + "auxiliary_loss_clip": 0.01096409, + "auxiliary_loss_mlp": 0.01056151, + "balance_loss_clip": 1.03770971, + "balance_loss_mlp": 1.0362078, + "epoch": 0.1170599729445363, + "flos": 15267853232640.0, + "grad_norm": 2.1628316131260155, + "language_loss": 0.68488675, + "learning_rate": 3.921018163077448e-06, + "loss": 0.70641243, + "num_input_tokens_seen": 42182955, + "step": 1947, + "time_per_iteration": 2.6523725986480713 + }, + { + "auxiliary_loss_clip": 0.01117228, + "auxiliary_loss_mlp": 0.01065167, + "balance_loss_clip": 1.04499078, + "balance_loss_mlp": 1.04429317, + "epoch": 0.11712009619720427, + "flos": 17164439251200.0, + "grad_norm": 1.7034947701588996, + "language_loss": 0.85066783, + "learning_rate": 3.920909759473295e-06, + "loss": 0.87249172, + "num_input_tokens_seen": 42200760, + "step": 1948, + "time_per_iteration": 2.6323390007019043 + }, + { + "auxiliary_loss_clip": 0.01013554, + "auxiliary_loss_mlp": 0.00748909, + "balance_loss_clip": 1.00350142, + "balance_loss_mlp": 1.0006007, + "epoch": 0.11718021944987224, + "flos": 70940991997440.0, + "grad_norm": 0.810396646511961, + "language_loss": 0.65094686, + "learning_rate": 3.920801283028054e-06, + "loss": 0.66857147, + "num_input_tokens_seen": 42265745, + "step": 1949, + "time_per_iteration": 3.284055233001709 + }, + { + "auxiliary_loss_clip": 0.01124866, + "auxiliary_loss_mlp": 0.01055648, + "balance_loss_clip": 1.04353976, + "balance_loss_mlp": 1.0352751, + "epoch": 0.1172403427025402, + "flos": 27453456408960.0, + "grad_norm": 1.5028736094611486, + "language_loss": 0.71760583, + "learning_rate": 3.920692733745835e-06, + "loss": 0.73941094, + "num_input_tokens_seen": 42286245, + "step": 1950, + "time_per_iteration": 2.6572365760803223 + }, + { + "auxiliary_loss_clip": 0.01138017, + "auxiliary_loss_mlp": 0.01055702, + "balance_loss_clip": 1.04617178, + "balance_loss_mlp": 1.03449428, + "epoch": 0.11730046595520818, + "flos": 15668723992320.0, + "grad_norm": 2.0732577803203545, + "language_loss": 0.76433516, + "learning_rate": 3.920584111630755e-06, + "loss": 0.78627241, + "num_input_tokens_seen": 42302710, + "step": 1951, + "time_per_iteration": 2.6002161502838135 + }, + { + "auxiliary_loss_clip": 0.01090654, + "auxiliary_loss_mlp": 0.01060292, + "balance_loss_clip": 1.04027879, + "balance_loss_mlp": 1.0398953, + "epoch": 0.11736058920787615, + "flos": 25630164092160.0, + "grad_norm": 1.7146865631718553, + "language_loss": 0.76214385, + "learning_rate": 3.9204754166869325e-06, + "loss": 0.78365338, + "num_input_tokens_seen": 42324115, + "step": 1952, + "time_per_iteration": 2.7244203090667725 + }, + { + "auxiliary_loss_clip": 0.01083787, + "auxiliary_loss_mlp": 0.01059113, + "balance_loss_clip": 1.03475499, + "balance_loss_mlp": 1.0379653, + "epoch": 0.11742071246054411, + "flos": 21434289701760.0, + "grad_norm": 2.2612459730521453, + "language_loss": 0.7231518, + "learning_rate": 3.920366648918491e-06, + "loss": 0.74458081, + "num_input_tokens_seen": 42342505, + "step": 1953, + "time_per_iteration": 2.6915693283081055 + }, + { + "auxiliary_loss_clip": 0.01109097, + "auxiliary_loss_mlp": 0.00749317, + "balance_loss_clip": 1.03960836, + "balance_loss_mlp": 1.00057197, + "epoch": 0.11748083571321208, + "flos": 15997845335040.0, + "grad_norm": 2.914850603233899, + "language_loss": 0.79316306, + "learning_rate": 3.920257808329552e-06, + "loss": 0.81174719, + "num_input_tokens_seen": 42360525, + "step": 1954, + "time_per_iteration": 2.6057276725769043 + }, + { + "auxiliary_loss_clip": 0.01080192, + "auxiliary_loss_mlp": 0.01056658, + "balance_loss_clip": 1.03815246, + "balance_loss_mlp": 1.03446162, + "epoch": 0.11754095896588006, + "flos": 16180056051840.0, + "grad_norm": 2.0225495862484797, + "language_loss": 0.8579793, + "learning_rate": 3.920148894924246e-06, + "loss": 0.8793478, + "num_input_tokens_seen": 42377045, + "step": 1955, + "time_per_iteration": 2.820762872695923 + }, + { + "auxiliary_loss_clip": 0.0112395, + "auxiliary_loss_mlp": 0.00749338, + "balance_loss_clip": 1.03767538, + "balance_loss_mlp": 1.00068688, + "epoch": 0.11760108221854802, + "flos": 13261596013440.0, + "grad_norm": 2.091554856252755, + "language_loss": 0.7790035, + "learning_rate": 3.920039908706701e-06, + "loss": 0.79773641, + "num_input_tokens_seen": 42393960, + "step": 1956, + "time_per_iteration": 2.698910713195801 + }, + { + "auxiliary_loss_clip": 0.01124161, + "auxiliary_loss_mlp": 0.01053019, + "balance_loss_clip": 1.04213405, + "balance_loss_mlp": 1.03128767, + "epoch": 0.11766120547121599, + "flos": 24498439303680.0, + "grad_norm": 2.047732749117111, + "language_loss": 0.80254865, + "learning_rate": 3.91993084968105e-06, + "loss": 0.82432044, + "num_input_tokens_seen": 42413160, + "step": 1957, + "time_per_iteration": 2.775144577026367 + }, + { + "auxiliary_loss_clip": 0.01133109, + "auxiliary_loss_mlp": 0.01047261, + "balance_loss_clip": 1.04401159, + "balance_loss_mlp": 1.0267812, + "epoch": 0.11772132872388397, + "flos": 17784005967360.0, + "grad_norm": 1.9609709313136687, + "language_loss": 0.78412199, + "learning_rate": 3.919821717851428e-06, + "loss": 0.80592573, + "num_input_tokens_seen": 42432590, + "step": 1958, + "time_per_iteration": 2.7348361015319824 + }, + { + "auxiliary_loss_clip": 0.01117372, + "auxiliary_loss_mlp": 0.01045254, + "balance_loss_clip": 1.04154253, + "balance_loss_mlp": 1.02304506, + "epoch": 0.11778145197655193, + "flos": 13217030213760.0, + "grad_norm": 2.0279689730241373, + "language_loss": 0.76615578, + "learning_rate": 3.919712513221976e-06, + "loss": 0.78778201, + "num_input_tokens_seen": 42450135, + "step": 1959, + "time_per_iteration": 2.626073122024536 + }, + { + "auxiliary_loss_clip": 0.01118659, + "auxiliary_loss_mlp": 0.010506, + "balance_loss_clip": 1.03906333, + "balance_loss_mlp": 1.02873731, + "epoch": 0.1178415752292199, + "flos": 20230204965120.0, + "grad_norm": 2.0375257922312064, + "language_loss": 0.70168436, + "learning_rate": 3.919603235796832e-06, + "loss": 0.72337693, + "num_input_tokens_seen": 42470050, + "step": 1960, + "time_per_iteration": 2.660550594329834 + }, + { + "auxiliary_loss_clip": 0.01126419, + "auxiliary_loss_mlp": 0.01052393, + "balance_loss_clip": 1.04336143, + "balance_loss_mlp": 1.03058934, + "epoch": 0.11790169848188788, + "flos": 13040134709760.0, + "grad_norm": 4.200538290567167, + "language_loss": 0.81717587, + "learning_rate": 3.9194938855801406e-06, + "loss": 0.83896399, + "num_input_tokens_seen": 42484335, + "step": 1961, + "time_per_iteration": 2.5917601585388184 + }, + { + "auxiliary_loss_clip": 0.01123353, + "auxiliary_loss_mlp": 0.00749266, + "balance_loss_clip": 1.03954029, + "balance_loss_mlp": 1.00058019, + "epoch": 0.11796182173455584, + "flos": 22265728790400.0, + "grad_norm": 1.8053879347365593, + "language_loss": 0.9216888, + "learning_rate": 3.919384462576049e-06, + "loss": 0.94041497, + "num_input_tokens_seen": 42502720, + "step": 1962, + "time_per_iteration": 2.5877528190612793 + }, + { + "auxiliary_loss_clip": 0.01101488, + "auxiliary_loss_mlp": 0.01056445, + "balance_loss_clip": 1.03852797, + "balance_loss_mlp": 1.0350107, + "epoch": 0.1180219449872238, + "flos": 10635017892480.0, + "grad_norm": 2.202070205926197, + "language_loss": 0.87379777, + "learning_rate": 3.919274966788707e-06, + "loss": 0.89537704, + "num_input_tokens_seen": 42519460, + "step": 1963, + "time_per_iteration": 2.645113945007324 + }, + { + "auxiliary_loss_clip": 0.01119406, + "auxiliary_loss_mlp": 0.00749369, + "balance_loss_clip": 1.0400182, + "balance_loss_mlp": 1.00064898, + "epoch": 0.11808206823989177, + "flos": 20923532259840.0, + "grad_norm": 2.637883306523617, + "language_loss": 0.8389473, + "learning_rate": 3.919165398222265e-06, + "loss": 0.85763508, + "num_input_tokens_seen": 42539420, + "step": 1964, + "time_per_iteration": 2.700819492340088 + }, + { + "auxiliary_loss_clip": 0.01083118, + "auxiliary_loss_mlp": 0.01065971, + "balance_loss_clip": 1.04025817, + "balance_loss_mlp": 1.04415619, + "epoch": 0.11814219149255975, + "flos": 20777770869120.0, + "grad_norm": 1.8880764200764881, + "language_loss": 0.82833922, + "learning_rate": 3.919055756880879e-06, + "loss": 0.84983009, + "num_input_tokens_seen": 42558225, + "step": 1965, + "time_per_iteration": 2.7557899951934814 + }, + { + "auxiliary_loss_clip": 0.01140505, + "auxiliary_loss_mlp": 0.01048053, + "balance_loss_clip": 1.04105878, + "balance_loss_mlp": 1.02703619, + "epoch": 0.11820231474522772, + "flos": 48759938542080.0, + "grad_norm": 1.5453011578888718, + "language_loss": 0.74438941, + "learning_rate": 3.918946042768707e-06, + "loss": 0.76627499, + "num_input_tokens_seen": 42580790, + "step": 1966, + "time_per_iteration": 2.7822394371032715 + }, + { + "auxiliary_loss_clip": 0.01127059, + "auxiliary_loss_mlp": 0.01053997, + "balance_loss_clip": 1.04779518, + "balance_loss_mlp": 1.03261042, + "epoch": 0.11826243799789568, + "flos": 16690598012160.0, + "grad_norm": 3.2935487769317717, + "language_loss": 0.73168737, + "learning_rate": 3.918836255889908e-06, + "loss": 0.75349796, + "num_input_tokens_seen": 42597355, + "step": 1967, + "time_per_iteration": 2.592772960662842 + }, + { + "auxiliary_loss_clip": 0.01128409, + "auxiliary_loss_mlp": 0.01049299, + "balance_loss_clip": 1.04255545, + "balance_loss_mlp": 1.02730513, + "epoch": 0.11832256125056366, + "flos": 16909868586240.0, + "grad_norm": 2.1665426526264895, + "language_loss": 0.88363564, + "learning_rate": 3.9187263962486456e-06, + "loss": 0.90541267, + "num_input_tokens_seen": 42616060, + "step": 1968, + "time_per_iteration": 2.650290012359619 + }, + { + "auxiliary_loss_clip": 0.01127937, + "auxiliary_loss_mlp": 0.01047655, + "balance_loss_clip": 1.04595435, + "balance_loss_mlp": 1.02544665, + "epoch": 0.11838268450323162, + "flos": 22820405587200.0, + "grad_norm": 1.8292941941601515, + "language_loss": 0.6710062, + "learning_rate": 3.918616463849087e-06, + "loss": 0.69276214, + "num_input_tokens_seen": 42636285, + "step": 1969, + "time_per_iteration": 2.7327091693878174 + }, + { + "auxiliary_loss_clip": 0.01087664, + "auxiliary_loss_mlp": 0.01055476, + "balance_loss_clip": 1.0382154, + "balance_loss_mlp": 1.03314829, + "epoch": 0.11844280775589959, + "flos": 33545844990720.0, + "grad_norm": 2.077630905255444, + "language_loss": 0.80660284, + "learning_rate": 3.918506458695399e-06, + "loss": 0.82803422, + "num_input_tokens_seen": 42658320, + "step": 1970, + "time_per_iteration": 2.7714505195617676 + }, + { + "auxiliary_loss_clip": 0.01023592, + "auxiliary_loss_mlp": 0.01022647, + "balance_loss_clip": 1.0027585, + "balance_loss_mlp": 1.02013171, + "epoch": 0.11850293100856757, + "flos": 66350998604160.0, + "grad_norm": 0.8066020996326384, + "language_loss": 0.66153049, + "learning_rate": 3.918396380791754e-06, + "loss": 0.68199289, + "num_input_tokens_seen": 42721500, + "step": 1971, + "time_per_iteration": 3.2258071899414062 + }, + { + "auxiliary_loss_clip": 0.01120844, + "auxiliary_loss_mlp": 0.01053278, + "balance_loss_clip": 1.04322958, + "balance_loss_mlp": 1.03223753, + "epoch": 0.11856305426123553, + "flos": 24681045070080.0, + "grad_norm": 2.138672370667727, + "language_loss": 0.79738516, + "learning_rate": 3.918286230142327e-06, + "loss": 0.81912637, + "num_input_tokens_seen": 42739825, + "step": 1972, + "time_per_iteration": 4.310074090957642 + }, + { + "auxiliary_loss_clip": 0.01097487, + "auxiliary_loss_mlp": 0.00749219, + "balance_loss_clip": 1.04007339, + "balance_loss_mlp": 1.00057006, + "epoch": 0.1186231775139035, + "flos": 24280102483200.0, + "grad_norm": 2.5171809407752996, + "language_loss": 0.72902244, + "learning_rate": 3.918176006751292e-06, + "loss": 0.74748951, + "num_input_tokens_seen": 42758695, + "step": 1973, + "time_per_iteration": 2.7097551822662354 + }, + { + "auxiliary_loss_clip": 0.01104885, + "auxiliary_loss_mlp": 0.01044917, + "balance_loss_clip": 1.0426743, + "balance_loss_mlp": 1.02401948, + "epoch": 0.11868330076657148, + "flos": 21757413473280.0, + "grad_norm": 2.000504912806785, + "language_loss": 0.72026622, + "learning_rate": 3.918065710622832e-06, + "loss": 0.74176425, + "num_input_tokens_seen": 42778510, + "step": 1974, + "time_per_iteration": 2.7397842407226562 + }, + { + "auxiliary_loss_clip": 0.01086848, + "auxiliary_loss_mlp": 0.0104327, + "balance_loss_clip": 1.03788757, + "balance_loss_mlp": 1.02212262, + "epoch": 0.11874342401923944, + "flos": 17193274894080.0, + "grad_norm": 2.322708762013432, + "language_loss": 0.77568018, + "learning_rate": 3.917955341761128e-06, + "loss": 0.79698139, + "num_input_tokens_seen": 42793995, + "step": 1975, + "time_per_iteration": 2.841695547103882 + }, + { + "auxiliary_loss_clip": 0.01093055, + "auxiliary_loss_mlp": 0.01053588, + "balance_loss_clip": 1.04137206, + "balance_loss_mlp": 1.03398991, + "epoch": 0.11880354727190741, + "flos": 15229572312960.0, + "grad_norm": 2.334001288785183, + "language_loss": 0.75505954, + "learning_rate": 3.917844900170364e-06, + "loss": 0.77652597, + "num_input_tokens_seen": 42809000, + "step": 1976, + "time_per_iteration": 2.8093783855438232 + }, + { + "auxiliary_loss_clip": 0.01128971, + "auxiliary_loss_mlp": 0.01052062, + "balance_loss_clip": 1.04225576, + "balance_loss_mlp": 1.03170085, + "epoch": 0.11886367052457537, + "flos": 27309706179840.0, + "grad_norm": 1.5855048674054468, + "language_loss": 0.74877363, + "learning_rate": 3.91773438585473e-06, + "loss": 0.77058399, + "num_input_tokens_seen": 42831585, + "step": 1977, + "time_per_iteration": 4.310959577560425 + }, + { + "auxiliary_loss_clip": 0.01142873, + "auxiliary_loss_mlp": 0.01060727, + "balance_loss_clip": 1.0440073, + "balance_loss_mlp": 1.03998435, + "epoch": 0.11892379377724335, + "flos": 21798280172160.0, + "grad_norm": 2.3023879651394195, + "language_loss": 0.73976624, + "learning_rate": 3.9176237988184165e-06, + "loss": 0.76180232, + "num_input_tokens_seen": 42848420, + "step": 1978, + "time_per_iteration": 5.743606328964233 + }, + { + "auxiliary_loss_clip": 0.01110696, + "auxiliary_loss_mlp": 0.01051777, + "balance_loss_clip": 1.04684579, + "balance_loss_mlp": 1.03117728, + "epoch": 0.11898391702991132, + "flos": 13991013498240.0, + "grad_norm": 2.1247774526652714, + "language_loss": 0.73304814, + "learning_rate": 3.917513139065616e-06, + "loss": 0.75467288, + "num_input_tokens_seen": 42866645, + "step": 1979, + "time_per_iteration": 2.830354690551758 + }, + { + "auxiliary_loss_clip": 0.01105916, + "auxiliary_loss_mlp": 0.01050813, + "balance_loss_clip": 1.04399168, + "balance_loss_mlp": 1.02998757, + "epoch": 0.11904404028257928, + "flos": 32234567091840.0, + "grad_norm": 1.679468800259317, + "language_loss": 0.98231637, + "learning_rate": 3.917402406600525e-06, + "loss": 1.0038836, + "num_input_tokens_seen": 42888515, + "step": 1980, + "time_per_iteration": 2.8262226581573486 + }, + { + "auxiliary_loss_clip": 0.01122372, + "auxiliary_loss_mlp": 0.01053025, + "balance_loss_clip": 1.04414201, + "balance_loss_mlp": 1.03098357, + "epoch": 0.11910416353524726, + "flos": 23586272398080.0, + "grad_norm": 2.0728396896844887, + "language_loss": 0.86030459, + "learning_rate": 3.917291601427342e-06, + "loss": 0.88205856, + "num_input_tokens_seen": 42909035, + "step": 1981, + "time_per_iteration": 2.6937546730041504 + }, + { + "auxiliary_loss_clip": 0.01122802, + "auxiliary_loss_mlp": 0.01061869, + "balance_loss_clip": 1.04536235, + "balance_loss_mlp": 1.03946972, + "epoch": 0.11916428678791523, + "flos": 25333038789120.0, + "grad_norm": 1.8667122823270337, + "language_loss": 0.85128689, + "learning_rate": 3.91718072355027e-06, + "loss": 0.8731336, + "num_input_tokens_seen": 42927555, + "step": 1982, + "time_per_iteration": 2.6153290271759033 + }, + { + "auxiliary_loss_clip": 0.01102941, + "auxiliary_loss_mlp": 0.01054871, + "balance_loss_clip": 1.03982186, + "balance_loss_mlp": 1.03380632, + "epoch": 0.11922441004058319, + "flos": 19788431592960.0, + "grad_norm": 1.9360976739446611, + "language_loss": 0.85406327, + "learning_rate": 3.917069772973513e-06, + "loss": 0.87564135, + "num_input_tokens_seen": 42945300, + "step": 1983, + "time_per_iteration": 2.655773401260376 + }, + { + "auxiliary_loss_clip": 0.01094765, + "auxiliary_loss_mlp": 0.0105642, + "balance_loss_clip": 1.04279912, + "balance_loss_mlp": 1.03455675, + "epoch": 0.11928453329325117, + "flos": 21536347219200.0, + "grad_norm": 2.7072799691619327, + "language_loss": 0.76794034, + "learning_rate": 3.916958749701277e-06, + "loss": 0.7894522, + "num_input_tokens_seen": 42961295, + "step": 1984, + "time_per_iteration": 2.6403911113739014 + }, + { + "auxiliary_loss_clip": 0.01125917, + "auxiliary_loss_mlp": 0.01057113, + "balance_loss_clip": 1.04298997, + "balance_loss_mlp": 1.03688323, + "epoch": 0.11934465654591914, + "flos": 20815010294400.0, + "grad_norm": 1.8809049694905997, + "language_loss": 0.83057743, + "learning_rate": 3.9168476537377745e-06, + "loss": 0.85240775, + "num_input_tokens_seen": 42980330, + "step": 1985, + "time_per_iteration": 2.5918548107147217 + }, + { + "auxiliary_loss_clip": 0.01111784, + "auxiliary_loss_mlp": 0.01048591, + "balance_loss_clip": 1.03829384, + "balance_loss_mlp": 1.0276103, + "epoch": 0.1194047797985871, + "flos": 19060486565760.0, + "grad_norm": 2.2160978361060644, + "language_loss": 0.74309635, + "learning_rate": 3.916736485087216e-06, + "loss": 0.76470011, + "num_input_tokens_seen": 42996125, + "step": 1986, + "time_per_iteration": 2.5880424976348877 + }, + { + "auxiliary_loss_clip": 0.01104106, + "auxiliary_loss_mlp": 0.0105682, + "balance_loss_clip": 1.03816605, + "balance_loss_mlp": 1.0358032, + "epoch": 0.11946490305125507, + "flos": 27190805184000.0, + "grad_norm": 2.1639740787848756, + "language_loss": 0.71894467, + "learning_rate": 3.916625243753819e-06, + "loss": 0.74055392, + "num_input_tokens_seen": 43014180, + "step": 1987, + "time_per_iteration": 2.7063939571380615 + }, + { + "auxiliary_loss_clip": 0.01119555, + "auxiliary_loss_mlp": 0.01053854, + "balance_loss_clip": 1.04179204, + "balance_loss_mlp": 1.03170466, + "epoch": 0.11952502630392305, + "flos": 21140791672320.0, + "grad_norm": 1.9811434187987178, + "language_loss": 0.72175044, + "learning_rate": 3.916513929741799e-06, + "loss": 0.74348456, + "num_input_tokens_seen": 43032120, + "step": 1988, + "time_per_iteration": 2.660163402557373 + }, + { + "auxiliary_loss_clip": 0.01127101, + "auxiliary_loss_mlp": 0.01061071, + "balance_loss_clip": 1.04144943, + "balance_loss_mlp": 1.03811169, + "epoch": 0.11958514955659101, + "flos": 22124241118080.0, + "grad_norm": 1.8169051600929207, + "language_loss": 0.80757689, + "learning_rate": 3.91640254305538e-06, + "loss": 0.82945859, + "num_input_tokens_seen": 43052215, + "step": 1989, + "time_per_iteration": 2.652146100997925 + }, + { + "auxiliary_loss_clip": 0.01093422, + "auxiliary_loss_mlp": 0.01056021, + "balance_loss_clip": 1.0371387, + "balance_loss_mlp": 1.03306174, + "epoch": 0.11964527280925898, + "flos": 17421452040960.0, + "grad_norm": 2.3985290172031317, + "language_loss": 0.75365555, + "learning_rate": 3.916291083698784e-06, + "loss": 0.77514994, + "num_input_tokens_seen": 43069720, + "step": 1990, + "time_per_iteration": 2.6510674953460693 + }, + { + "auxiliary_loss_clip": 0.01011047, + "auxiliary_loss_mlp": 0.01004305, + "balance_loss_clip": 1.00330377, + "balance_loss_mlp": 1.00161076, + "epoch": 0.11970539606192696, + "flos": 70679741402880.0, + "grad_norm": 0.8663094618625188, + "language_loss": 0.55201733, + "learning_rate": 3.916179551676238e-06, + "loss": 0.57217085, + "num_input_tokens_seen": 43123130, + "step": 1991, + "time_per_iteration": 3.209773063659668 + }, + { + "auxiliary_loss_clip": 0.01104467, + "auxiliary_loss_mlp": 0.01057263, + "balance_loss_clip": 1.04350209, + "balance_loss_mlp": 1.03659201, + "epoch": 0.11976551931459492, + "flos": 21215019127680.0, + "grad_norm": 2.2629860263635244, + "language_loss": 0.77827001, + "learning_rate": 3.916067946991971e-06, + "loss": 0.79988742, + "num_input_tokens_seen": 43140015, + "step": 1992, + "time_per_iteration": 2.688441514968872 + }, + { + "auxiliary_loss_clip": 0.01141354, + "auxiliary_loss_mlp": 0.01053022, + "balance_loss_clip": 1.0432775, + "balance_loss_mlp": 1.03183866, + "epoch": 0.11982564256726289, + "flos": 25989306226560.0, + "grad_norm": 1.6818982957226898, + "language_loss": 0.79146779, + "learning_rate": 3.915956269650216e-06, + "loss": 0.81341159, + "num_input_tokens_seen": 43160105, + "step": 1993, + "time_per_iteration": 2.604370355606079 + }, + { + "auxiliary_loss_clip": 0.0108811, + "auxiliary_loss_mlp": 0.01057818, + "balance_loss_clip": 1.03634334, + "balance_loss_mlp": 1.03651524, + "epoch": 0.11988576581993086, + "flos": 21650866755840.0, + "grad_norm": 2.0493934187093967, + "language_loss": 0.82273388, + "learning_rate": 3.915844519655208e-06, + "loss": 0.84419316, + "num_input_tokens_seen": 43179835, + "step": 1994, + "time_per_iteration": 2.648043632507324 + }, + { + "auxiliary_loss_clip": 0.01113624, + "auxiliary_loss_mlp": 0.01058308, + "balance_loss_clip": 1.04057932, + "balance_loss_mlp": 1.03851891, + "epoch": 0.11994588907259883, + "flos": 17857407409920.0, + "grad_norm": 2.13484145663112, + "language_loss": 0.88014364, + "learning_rate": 3.915732697011183e-06, + "loss": 0.90186298, + "num_input_tokens_seen": 43197210, + "step": 1995, + "time_per_iteration": 2.6747305393218994 + }, + { + "auxiliary_loss_clip": 0.01116256, + "auxiliary_loss_mlp": 0.01061691, + "balance_loss_clip": 1.04275537, + "balance_loss_mlp": 1.04010177, + "epoch": 0.1200060123252668, + "flos": 24462744163200.0, + "grad_norm": 1.874036638365742, + "language_loss": 0.74210864, + "learning_rate": 3.9156208017223825e-06, + "loss": 0.76388812, + "num_input_tokens_seen": 43215050, + "step": 1996, + "time_per_iteration": 2.6567742824554443 + }, + { + "auxiliary_loss_clip": 0.01106189, + "auxiliary_loss_mlp": 0.01051845, + "balance_loss_clip": 1.04179716, + "balance_loss_mlp": 1.02961278, + "epoch": 0.12006613557793476, + "flos": 18732191235840.0, + "grad_norm": 1.9386446892881808, + "language_loss": 0.87655258, + "learning_rate": 3.915508833793048e-06, + "loss": 0.89813292, + "num_input_tokens_seen": 43233900, + "step": 1997, + "time_per_iteration": 2.7078640460968018 + }, + { + "auxiliary_loss_clip": 0.01131777, + "auxiliary_loss_mlp": 0.00749352, + "balance_loss_clip": 1.04314971, + "balance_loss_mlp": 1.00056243, + "epoch": 0.12012625883060274, + "flos": 22267739952000.0, + "grad_norm": 1.952716973126847, + "language_loss": 0.78985989, + "learning_rate": 3.915396793227428e-06, + "loss": 0.80867124, + "num_input_tokens_seen": 43252105, + "step": 1998, + "time_per_iteration": 2.612963914871216 + }, + { + "auxiliary_loss_clip": 0.01126601, + "auxiliary_loss_mlp": 0.0074927, + "balance_loss_clip": 1.04229546, + "balance_loss_mlp": 1.0006094, + "epoch": 0.1201863820832707, + "flos": 21758885930880.0, + "grad_norm": 1.6521383917958787, + "language_loss": 0.73557031, + "learning_rate": 3.915284680029769e-06, + "loss": 0.75432897, + "num_input_tokens_seen": 43270315, + "step": 1999, + "time_per_iteration": 2.6475048065185547 + }, + { + "auxiliary_loss_clip": 0.01142792, + "auxiliary_loss_mlp": 0.01066261, + "balance_loss_clip": 1.04466605, + "balance_loss_mlp": 1.04499364, + "epoch": 0.12024650533593867, + "flos": 21907987286400.0, + "grad_norm": 2.601274722291908, + "language_loss": 0.74708271, + "learning_rate": 3.915172494204323e-06, + "loss": 0.76917326, + "num_input_tokens_seen": 43289935, + "step": 2000, + "time_per_iteration": 2.6618330478668213 + }, + { + "auxiliary_loss_clip": 0.01113642, + "auxiliary_loss_mlp": 0.01057001, + "balance_loss_clip": 1.03944468, + "balance_loss_mlp": 1.03568625, + "epoch": 0.12030662858860665, + "flos": 21689219502720.0, + "grad_norm": 1.5247860845504884, + "language_loss": 0.85176325, + "learning_rate": 3.915060235755344e-06, + "loss": 0.87346971, + "num_input_tokens_seen": 43309325, + "step": 2001, + "time_per_iteration": 2.7019333839416504 + }, + { + "auxiliary_loss_clip": 0.01115232, + "auxiliary_loss_mlp": 0.01054747, + "balance_loss_clip": 1.04010069, + "balance_loss_mlp": 1.03404081, + "epoch": 0.12036675184127461, + "flos": 12933228856320.0, + "grad_norm": 2.3197300304511668, + "language_loss": 0.74309385, + "learning_rate": 3.91494790468709e-06, + "loss": 0.76479363, + "num_input_tokens_seen": 43327010, + "step": 2002, + "time_per_iteration": 2.7070388793945312 + }, + { + "auxiliary_loss_clip": 0.01098391, + "auxiliary_loss_mlp": 0.01063601, + "balance_loss_clip": 1.04403889, + "balance_loss_mlp": 1.04016399, + "epoch": 0.12042687509394258, + "flos": 20851028657280.0, + "grad_norm": 1.992490476500758, + "language_loss": 0.78069264, + "learning_rate": 3.9148355010038185e-06, + "loss": 0.80231255, + "num_input_tokens_seen": 43345650, + "step": 2003, + "time_per_iteration": 2.663583517074585 + }, + { + "auxiliary_loss_clip": 0.01126615, + "auxiliary_loss_mlp": 0.01060261, + "balance_loss_clip": 1.04071736, + "balance_loss_mlp": 1.0372417, + "epoch": 0.12048699834661056, + "flos": 23878513451520.0, + "grad_norm": 1.732036642882627, + "language_loss": 0.72044563, + "learning_rate": 3.914723024709793e-06, + "loss": 0.74231446, + "num_input_tokens_seen": 43365555, + "step": 2004, + "time_per_iteration": 2.5959408283233643 + }, + { + "auxiliary_loss_clip": 0.01124387, + "auxiliary_loss_mlp": 0.01062346, + "balance_loss_clip": 1.0441047, + "balance_loss_mlp": 1.03895724, + "epoch": 0.12054712159927852, + "flos": 19756363726080.0, + "grad_norm": 1.6291682318499177, + "language_loss": 0.78410828, + "learning_rate": 3.914610475809279e-06, + "loss": 0.80597562, + "num_input_tokens_seen": 43384990, + "step": 2005, + "time_per_iteration": 2.6368770599365234 + }, + { + "auxiliary_loss_clip": 0.01033688, + "auxiliary_loss_mlp": 0.00748904, + "balance_loss_clip": 1.00357425, + "balance_loss_mlp": 1.00062788, + "epoch": 0.12060724485194649, + "flos": 51672763123200.0, + "grad_norm": 0.9289058821944445, + "language_loss": 0.58131289, + "learning_rate": 3.914497854306543e-06, + "loss": 0.59913886, + "num_input_tokens_seen": 43436335, + "step": 2006, + "time_per_iteration": 2.9212698936462402 + }, + { + "auxiliary_loss_clip": 0.01115818, + "auxiliary_loss_mlp": 0.01052937, + "balance_loss_clip": 1.04157996, + "balance_loss_mlp": 1.0322541, + "epoch": 0.12066736810461445, + "flos": 18990425088000.0, + "grad_norm": 1.9837758355569968, + "language_loss": 0.76463389, + "learning_rate": 3.9143851602058575e-06, + "loss": 0.78632146, + "num_input_tokens_seen": 43456495, + "step": 2007, + "time_per_iteration": 2.6222574710845947 + }, + { + "auxiliary_loss_clip": 0.01110156, + "auxiliary_loss_mlp": 0.01059742, + "balance_loss_clip": 1.04196429, + "balance_loss_mlp": 1.03742588, + "epoch": 0.12072749135728243, + "flos": 16471973882880.0, + "grad_norm": 4.560541798441693, + "language_loss": 0.83112091, + "learning_rate": 3.914272393511494e-06, + "loss": 0.85281992, + "num_input_tokens_seen": 43473085, + "step": 2008, + "time_per_iteration": 2.6608200073242188 + }, + { + "auxiliary_loss_clip": 0.01139139, + "auxiliary_loss_mlp": 0.01051896, + "balance_loss_clip": 1.04229903, + "balance_loss_mlp": 1.03064048, + "epoch": 0.1207876146099504, + "flos": 18077108947200.0, + "grad_norm": 2.357280896078497, + "language_loss": 0.84117961, + "learning_rate": 3.91415955422773e-06, + "loss": 0.86309004, + "num_input_tokens_seen": 43491135, + "step": 2009, + "time_per_iteration": 2.5655291080474854 + }, + { + "auxiliary_loss_clip": 0.01141975, + "auxiliary_loss_mlp": 0.01055914, + "balance_loss_clip": 1.04555559, + "balance_loss_mlp": 1.03210831, + "epoch": 0.12084773786261836, + "flos": 21871573873920.0, + "grad_norm": 1.744290379691486, + "language_loss": 0.84110761, + "learning_rate": 3.914046642358844e-06, + "loss": 0.86308652, + "num_input_tokens_seen": 43510440, + "step": 2010, + "time_per_iteration": 2.570551872253418 + }, + { + "auxiliary_loss_clip": 0.01104353, + "auxiliary_loss_mlp": 0.00749295, + "balance_loss_clip": 1.04165006, + "balance_loss_mlp": 1.00041676, + "epoch": 0.12090786111528634, + "flos": 18333044328960.0, + "grad_norm": 2.0524387591771647, + "language_loss": 0.84213549, + "learning_rate": 3.9139336579091174e-06, + "loss": 0.860672, + "num_input_tokens_seen": 43530145, + "step": 2011, + "time_per_iteration": 2.6148581504821777 + }, + { + "auxiliary_loss_clip": 0.01110543, + "auxiliary_loss_mlp": 0.01061341, + "balance_loss_clip": 1.04196107, + "balance_loss_mlp": 1.0395968, + "epoch": 0.1209679843679543, + "flos": 21105850717440.0, + "grad_norm": 3.0094117785908567, + "language_loss": 0.9623239, + "learning_rate": 3.913820600882834e-06, + "loss": 0.98404276, + "num_input_tokens_seen": 43549315, + "step": 2012, + "time_per_iteration": 2.682955026626587 + }, + { + "auxiliary_loss_clip": 0.01110856, + "auxiliary_loss_mlp": 0.01043674, + "balance_loss_clip": 1.04261017, + "balance_loss_mlp": 1.02206111, + "epoch": 0.12102810762062227, + "flos": 29241053585280.0, + "grad_norm": 2.0687113997134, + "language_loss": 0.80388641, + "learning_rate": 3.913707471284283e-06, + "loss": 0.8254317, + "num_input_tokens_seen": 43569240, + "step": 2013, + "time_per_iteration": 2.736534833908081 + }, + { + "auxiliary_loss_clip": 0.01085925, + "auxiliary_loss_mlp": 0.01052635, + "balance_loss_clip": 1.03897524, + "balance_loss_mlp": 1.02911496, + "epoch": 0.12108823087329025, + "flos": 17930701111680.0, + "grad_norm": 2.873962923878547, + "language_loss": 0.76504803, + "learning_rate": 3.9135942691177515e-06, + "loss": 0.7864337, + "num_input_tokens_seen": 43587710, + "step": 2014, + "time_per_iteration": 2.7166945934295654 + }, + { + "auxiliary_loss_clip": 0.01126601, + "auxiliary_loss_mlp": 0.01045469, + "balance_loss_clip": 1.04240429, + "balance_loss_mlp": 1.02382052, + "epoch": 0.12114835412595822, + "flos": 22091850028800.0, + "grad_norm": 1.9289511885050887, + "language_loss": 0.8710866, + "learning_rate": 3.913480994387535e-06, + "loss": 0.89280725, + "num_input_tokens_seen": 43606000, + "step": 2015, + "time_per_iteration": 2.686041831970215 + }, + { + "auxiliary_loss_clip": 0.01135877, + "auxiliary_loss_mlp": 0.01051726, + "balance_loss_clip": 1.04105198, + "balance_loss_mlp": 1.03037584, + "epoch": 0.12120847737862618, + "flos": 20412343854720.0, + "grad_norm": 2.11235764059212, + "language_loss": 0.68626523, + "learning_rate": 3.913367647097926e-06, + "loss": 0.70814127, + "num_input_tokens_seen": 43624815, + "step": 2016, + "time_per_iteration": 2.606714963912964 + }, + { + "auxiliary_loss_clip": 0.01119021, + "auxiliary_loss_mlp": 0.01048398, + "balance_loss_clip": 1.04359341, + "balance_loss_mlp": 1.02499712, + "epoch": 0.12126860063129415, + "flos": 22309037614080.0, + "grad_norm": 2.473421446240025, + "language_loss": 0.80723035, + "learning_rate": 3.913254227253225e-06, + "loss": 0.82890451, + "num_input_tokens_seen": 43643960, + "step": 2017, + "time_per_iteration": 2.730211019515991 + }, + { + "auxiliary_loss_clip": 0.0111668, + "auxiliary_loss_mlp": 0.01049635, + "balance_loss_clip": 1.04049659, + "balance_loss_mlp": 1.02650809, + "epoch": 0.12132872388396213, + "flos": 13699275235200.0, + "grad_norm": 2.266539203732548, + "language_loss": 0.68791282, + "learning_rate": 3.913140734857731e-06, + "loss": 0.70957607, + "num_input_tokens_seen": 43662650, + "step": 2018, + "time_per_iteration": 2.5959668159484863 + }, + { + "auxiliary_loss_clip": 0.01109679, + "auxiliary_loss_mlp": 0.01052747, + "balance_loss_clip": 1.04684508, + "balance_loss_mlp": 1.0310626, + "epoch": 0.12138884713663009, + "flos": 26466954307200.0, + "grad_norm": 1.6519749966985524, + "language_loss": 0.7216599, + "learning_rate": 3.91302716991575e-06, + "loss": 0.74328411, + "num_input_tokens_seen": 43684205, + "step": 2019, + "time_per_iteration": 4.357996463775635 + }, + { + "auxiliary_loss_clip": 0.0108437, + "auxiliary_loss_mlp": 0.01062794, + "balance_loss_clip": 1.04502511, + "balance_loss_mlp": 1.03990579, + "epoch": 0.12144897038929806, + "flos": 26141603892480.0, + "grad_norm": 1.7936321511277438, + "language_loss": 0.92131662, + "learning_rate": 3.912913532431586e-06, + "loss": 0.94278824, + "num_input_tokens_seen": 43706320, + "step": 2020, + "time_per_iteration": 2.7882461547851562 + }, + { + "auxiliary_loss_clip": 0.01101243, + "auxiliary_loss_mlp": 0.0105041, + "balance_loss_clip": 1.04011559, + "balance_loss_mlp": 1.02897668, + "epoch": 0.12150909364196603, + "flos": 24717530309760.0, + "grad_norm": 1.8736220086883815, + "language_loss": 0.77620471, + "learning_rate": 3.912799822409549e-06, + "loss": 0.79772127, + "num_input_tokens_seen": 43724805, + "step": 2021, + "time_per_iteration": 2.6653945446014404 + }, + { + "auxiliary_loss_clip": 0.01140174, + "auxiliary_loss_mlp": 0.01052846, + "balance_loss_clip": 1.04532027, + "balance_loss_mlp": 1.03215086, + "epoch": 0.121569216894634, + "flos": 25186990089600.0, + "grad_norm": 2.029737283918273, + "language_loss": 0.80964506, + "learning_rate": 3.912686039853952e-06, + "loss": 0.83157527, + "num_input_tokens_seen": 43742320, + "step": 2022, + "time_per_iteration": 2.70937180519104 + }, + { + "auxiliary_loss_clip": 0.0111053, + "auxiliary_loss_mlp": 0.01060199, + "balance_loss_clip": 1.04341841, + "balance_loss_mlp": 1.03781164, + "epoch": 0.12162934014730196, + "flos": 13444094039040.0, + "grad_norm": 1.8751062973945878, + "language_loss": 0.85169494, + "learning_rate": 3.912572184769108e-06, + "loss": 0.87340224, + "num_input_tokens_seen": 43760665, + "step": 2023, + "time_per_iteration": 2.694432497024536 + }, + { + "auxiliary_loss_clip": 0.01104427, + "auxiliary_loss_mlp": 0.01051296, + "balance_loss_clip": 1.03963113, + "balance_loss_mlp": 1.02897954, + "epoch": 0.12168946339996994, + "flos": 16946138344320.0, + "grad_norm": 2.005074885546632, + "language_loss": 0.85049045, + "learning_rate": 3.912458257159335e-06, + "loss": 0.87204766, + "num_input_tokens_seen": 43779020, + "step": 2024, + "time_per_iteration": 4.303445339202881 + }, + { + "auxiliary_loss_clip": 0.01136981, + "auxiliary_loss_mlp": 0.01056111, + "balance_loss_clip": 1.03950071, + "balance_loss_mlp": 1.03455782, + "epoch": 0.12174958665263791, + "flos": 29821585196160.0, + "grad_norm": 2.250029121050548, + "language_loss": 0.72275347, + "learning_rate": 3.912344257028954e-06, + "loss": 0.7446844, + "num_input_tokens_seen": 43798850, + "step": 2025, + "time_per_iteration": 4.345383644104004 + }, + { + "auxiliary_loss_clip": 0.01117153, + "auxiliary_loss_mlp": 0.01047204, + "balance_loss_clip": 1.04126704, + "balance_loss_mlp": 1.02611542, + "epoch": 0.12180970990530587, + "flos": 24641902224000.0, + "grad_norm": 1.530446100097363, + "language_loss": 0.76028574, + "learning_rate": 3.912230184382286e-06, + "loss": 0.78192937, + "num_input_tokens_seen": 43820130, + "step": 2026, + "time_per_iteration": 2.6975314617156982 + }, + { + "auxiliary_loss_clip": 0.01108559, + "auxiliary_loss_mlp": 0.01051715, + "balance_loss_clip": 1.03861082, + "balance_loss_mlp": 1.03011382, + "epoch": 0.12186983315797385, + "flos": 20521691832960.0, + "grad_norm": 2.5192889462762826, + "language_loss": 0.88665533, + "learning_rate": 3.912116039223659e-06, + "loss": 0.90825808, + "num_input_tokens_seen": 43838485, + "step": 2027, + "time_per_iteration": 2.596642255783081 + }, + { + "auxiliary_loss_clip": 0.01111043, + "auxiliary_loss_mlp": 0.01053736, + "balance_loss_clip": 1.04005003, + "balance_loss_mlp": 1.03403056, + "epoch": 0.12192995641064182, + "flos": 27818344719360.0, + "grad_norm": 3.1767914439645786, + "language_loss": 0.75883698, + "learning_rate": 3.912001821557399e-06, + "loss": 0.7804848, + "num_input_tokens_seen": 43859080, + "step": 2028, + "time_per_iteration": 2.693021535873413 + }, + { + "auxiliary_loss_clip": 0.01088499, + "auxiliary_loss_mlp": 0.01059838, + "balance_loss_clip": 1.03878844, + "balance_loss_mlp": 1.0377723, + "epoch": 0.12199007966330978, + "flos": 22017119783040.0, + "grad_norm": 2.4775004921810533, + "language_loss": 0.77061188, + "learning_rate": 3.911887531387839e-06, + "loss": 0.79209518, + "num_input_tokens_seen": 43879030, + "step": 2029, + "time_per_iteration": 2.818805694580078 + }, + { + "auxiliary_loss_clip": 0.01123098, + "auxiliary_loss_mlp": 0.01053037, + "balance_loss_clip": 1.03860998, + "balance_loss_mlp": 1.0316031, + "epoch": 0.12205020291597775, + "flos": 23295216493440.0, + "grad_norm": 1.792126341059684, + "language_loss": 0.78690577, + "learning_rate": 3.911773168719313e-06, + "loss": 0.80866712, + "num_input_tokens_seen": 43898505, + "step": 2030, + "time_per_iteration": 2.6013028621673584 + }, + { + "auxiliary_loss_clip": 0.0113629, + "auxiliary_loss_mlp": 0.01048702, + "balance_loss_clip": 1.04250491, + "balance_loss_mlp": 1.02694666, + "epoch": 0.12211032616864573, + "flos": 26031609469440.0, + "grad_norm": 1.9867938584957767, + "language_loss": 0.73824573, + "learning_rate": 3.911658733556155e-06, + "loss": 0.7600956, + "num_input_tokens_seen": 43917945, + "step": 2031, + "time_per_iteration": 2.5538887977600098 + }, + { + "auxiliary_loss_clip": 0.01137661, + "auxiliary_loss_mlp": 0.0104672, + "balance_loss_clip": 1.04332447, + "balance_loss_mlp": 1.02680016, + "epoch": 0.12217044942131369, + "flos": 20410943224320.0, + "grad_norm": 2.016478150515993, + "language_loss": 0.7513532, + "learning_rate": 3.911544225902707e-06, + "loss": 0.77319705, + "num_input_tokens_seen": 43937385, + "step": 2032, + "time_per_iteration": 2.5515406131744385 + }, + { + "auxiliary_loss_clip": 0.011181, + "auxiliary_loss_mlp": 0.01044713, + "balance_loss_clip": 1.03769231, + "balance_loss_mlp": 1.02512646, + "epoch": 0.12223057267398166, + "flos": 22857142222080.0, + "grad_norm": 1.565332963671523, + "language_loss": 0.88892323, + "learning_rate": 3.911429645763311e-06, + "loss": 0.91055137, + "num_input_tokens_seen": 43958130, + "step": 2033, + "time_per_iteration": 2.611618757247925 + }, + { + "auxiliary_loss_clip": 0.01115094, + "auxiliary_loss_mlp": 0.01052561, + "balance_loss_clip": 1.04165983, + "balance_loss_mlp": 1.03135419, + "epoch": 0.12229069592664964, + "flos": 20047563285120.0, + "grad_norm": 2.064745455855644, + "language_loss": 0.65507376, + "learning_rate": 3.911314993142311e-06, + "loss": 0.67675036, + "num_input_tokens_seen": 43976800, + "step": 2034, + "time_per_iteration": 2.6381773948669434 + }, + { + "auxiliary_loss_clip": 0.01118054, + "auxiliary_loss_mlp": 0.0105458, + "balance_loss_clip": 1.04242694, + "balance_loss_mlp": 1.03252614, + "epoch": 0.1223508191793176, + "flos": 22274240313600.0, + "grad_norm": 1.5375696299169386, + "language_loss": 0.76475143, + "learning_rate": 3.911200268044055e-06, + "loss": 0.78647774, + "num_input_tokens_seen": 43996620, + "step": 2035, + "time_per_iteration": 2.678516149520874 + }, + { + "auxiliary_loss_clip": 0.01141084, + "auxiliary_loss_mlp": 0.01050699, + "balance_loss_clip": 1.04303408, + "balance_loss_mlp": 1.02917004, + "epoch": 0.12241094243198557, + "flos": 21285978445440.0, + "grad_norm": 1.9851249914934908, + "language_loss": 0.71166062, + "learning_rate": 3.911085470472892e-06, + "loss": 0.73357844, + "num_input_tokens_seen": 44016175, + "step": 2036, + "time_per_iteration": 2.622586727142334 + }, + { + "auxiliary_loss_clip": 0.0111456, + "auxiliary_loss_mlp": 0.01061225, + "balance_loss_clip": 1.04522109, + "balance_loss_mlp": 1.03955293, + "epoch": 0.12247106568465355, + "flos": 17382381022080.0, + "grad_norm": 1.7417141000264995, + "language_loss": 0.83038759, + "learning_rate": 3.910970600433178e-06, + "loss": 0.85214543, + "num_input_tokens_seen": 44035060, + "step": 2037, + "time_per_iteration": 2.607386350631714 + }, + { + "auxiliary_loss_clip": 0.01118674, + "auxiliary_loss_mlp": 0.01057486, + "balance_loss_clip": 1.04302311, + "balance_loss_mlp": 1.03438342, + "epoch": 0.12253118893732151, + "flos": 27045438842880.0, + "grad_norm": 2.5947644742179055, + "language_loss": 0.80376744, + "learning_rate": 3.910855657929267e-06, + "loss": 0.8255291, + "num_input_tokens_seen": 44053330, + "step": 2038, + "time_per_iteration": 2.704285144805908 + }, + { + "auxiliary_loss_clip": 0.01023151, + "auxiliary_loss_mlp": 0.00748712, + "balance_loss_clip": 1.00382102, + "balance_loss_mlp": 1.00052416, + "epoch": 0.12259131218998948, + "flos": 53861518368000.0, + "grad_norm": 0.8234816522938772, + "language_loss": 0.58688509, + "learning_rate": 3.910740642965518e-06, + "loss": 0.60460377, + "num_input_tokens_seen": 44107575, + "step": 2039, + "time_per_iteration": 3.1740145683288574 + }, + { + "auxiliary_loss_clip": 0.01088372, + "auxiliary_loss_mlp": 0.01058832, + "balance_loss_clip": 1.03976417, + "balance_loss_mlp": 1.03483522, + "epoch": 0.12265143544265744, + "flos": 17891917401600.0, + "grad_norm": 2.295646617901876, + "language_loss": 0.8062079, + "learning_rate": 3.910625555546292e-06, + "loss": 0.82767987, + "num_input_tokens_seen": 44126075, + "step": 2040, + "time_per_iteration": 2.9100759029388428 + }, + { + "auxiliary_loss_clip": 0.01117912, + "auxiliary_loss_mlp": 0.01055326, + "balance_loss_clip": 1.04219902, + "balance_loss_mlp": 1.0334866, + "epoch": 0.12271155869532542, + "flos": 21799932197760.0, + "grad_norm": 2.490706181138635, + "language_loss": 0.83128697, + "learning_rate": 3.910510395675953e-06, + "loss": 0.85301936, + "num_input_tokens_seen": 44145605, + "step": 2041, + "time_per_iteration": 2.791006565093994 + }, + { + "auxiliary_loss_clip": 0.0109219, + "auxiliary_loss_mlp": 0.01054482, + "balance_loss_clip": 1.03614116, + "balance_loss_mlp": 1.03104544, + "epoch": 0.12277168194799339, + "flos": 19828759587840.0, + "grad_norm": 1.642605125471624, + "language_loss": 0.67173129, + "learning_rate": 3.9103951633588694e-06, + "loss": 0.69319797, + "num_input_tokens_seen": 44164770, + "step": 2042, + "time_per_iteration": 2.683478593826294 + }, + { + "auxiliary_loss_clip": 0.01100549, + "auxiliary_loss_mlp": 0.01052317, + "balance_loss_clip": 1.03840303, + "balance_loss_mlp": 1.03053725, + "epoch": 0.12283180520066135, + "flos": 23221024951680.0, + "grad_norm": 1.9322707488061737, + "language_loss": 0.81454766, + "learning_rate": 3.910279858599409e-06, + "loss": 0.83607626, + "num_input_tokens_seen": 44184025, + "step": 2043, + "time_per_iteration": 2.7790169715881348 + }, + { + "auxiliary_loss_clip": 0.01111867, + "auxiliary_loss_mlp": 0.01047599, + "balance_loss_clip": 1.03753376, + "balance_loss_mlp": 1.02523541, + "epoch": 0.12289192845332933, + "flos": 18588476920320.0, + "grad_norm": 1.6462464459144326, + "language_loss": 0.80142272, + "learning_rate": 3.910164481401946e-06, + "loss": 0.82301736, + "num_input_tokens_seen": 44202950, + "step": 2044, + "time_per_iteration": 2.6279940605163574 + }, + { + "auxiliary_loss_clip": 0.01087826, + "auxiliary_loss_mlp": 0.01052565, + "balance_loss_clip": 1.04126537, + "balance_loss_mlp": 1.03107131, + "epoch": 0.1229520517059973, + "flos": 25769532862080.0, + "grad_norm": 1.7585537028329699, + "language_loss": 0.78400838, + "learning_rate": 3.910049031770853e-06, + "loss": 0.80541229, + "num_input_tokens_seen": 44221115, + "step": 2045, + "time_per_iteration": 2.685872793197632 + }, + { + "auxiliary_loss_clip": 0.01131096, + "auxiliary_loss_mlp": 0.01063527, + "balance_loss_clip": 1.04397452, + "balance_loss_mlp": 1.04053164, + "epoch": 0.12301217495866526, + "flos": 20887154760960.0, + "grad_norm": 1.8197449035473208, + "language_loss": 0.67404252, + "learning_rate": 3.90993350971051e-06, + "loss": 0.69598877, + "num_input_tokens_seen": 44240575, + "step": 2046, + "time_per_iteration": 2.701859474182129 + }, + { + "auxiliary_loss_clip": 0.01140257, + "auxiliary_loss_mlp": 0.01056282, + "balance_loss_clip": 1.04570568, + "balance_loss_mlp": 1.03458595, + "epoch": 0.12307229821133324, + "flos": 22378811783040.0, + "grad_norm": 2.0917440120852993, + "language_loss": 0.72374582, + "learning_rate": 3.909817915225297e-06, + "loss": 0.74571127, + "num_input_tokens_seen": 44257145, + "step": 2047, + "time_per_iteration": 2.6677284240722656 + }, + { + "auxiliary_loss_clip": 0.01123152, + "auxiliary_loss_mlp": 0.01058448, + "balance_loss_clip": 1.04192996, + "balance_loss_mlp": 1.03611994, + "epoch": 0.1231324214640012, + "flos": 23367396873600.0, + "grad_norm": 1.6450406279055028, + "language_loss": 0.76628959, + "learning_rate": 3.909702248319597e-06, + "loss": 0.78810561, + "num_input_tokens_seen": 44278035, + "step": 2048, + "time_per_iteration": 2.6775777339935303 + }, + { + "auxiliary_loss_clip": 0.01113436, + "auxiliary_loss_mlp": 0.01050447, + "balance_loss_clip": 1.04126811, + "balance_loss_mlp": 1.03106403, + "epoch": 0.12319254471666917, + "flos": 23767154311680.0, + "grad_norm": 1.972378157048913, + "language_loss": 0.85104448, + "learning_rate": 3.909586508997797e-06, + "loss": 0.87268329, + "num_input_tokens_seen": 44296980, + "step": 2049, + "time_per_iteration": 2.6715638637542725 + }, + { + "auxiliary_loss_clip": 0.01087956, + "auxiliary_loss_mlp": 0.01051793, + "balance_loss_clip": 1.03897882, + "balance_loss_mlp": 1.03022754, + "epoch": 0.12325266796933713, + "flos": 23550146294400.0, + "grad_norm": 1.7174524146557733, + "language_loss": 0.75319731, + "learning_rate": 3.909470697264285e-06, + "loss": 0.77459478, + "num_input_tokens_seen": 44318005, + "step": 2050, + "time_per_iteration": 2.7224321365356445 + }, + { + "auxiliary_loss_clip": 0.01103485, + "auxiliary_loss_mlp": 0.01055633, + "balance_loss_clip": 1.04036641, + "balance_loss_mlp": 1.03411579, + "epoch": 0.12331279122200511, + "flos": 24423996366720.0, + "grad_norm": 2.0344103640810567, + "language_loss": 0.80840492, + "learning_rate": 3.909354813123452e-06, + "loss": 0.82999611, + "num_input_tokens_seen": 44335260, + "step": 2051, + "time_per_iteration": 2.6827218532562256 + }, + { + "auxiliary_loss_clip": 0.01137458, + "auxiliary_loss_mlp": 0.0074919, + "balance_loss_clip": 1.04355288, + "balance_loss_mlp": 1.00041986, + "epoch": 0.12337291447467308, + "flos": 25484294960640.0, + "grad_norm": 1.6504515243060462, + "language_loss": 0.79984725, + "learning_rate": 3.909238856579693e-06, + "loss": 0.81871378, + "num_input_tokens_seen": 44355315, + "step": 2052, + "time_per_iteration": 2.6456713676452637 + }, + { + "auxiliary_loss_clip": 0.01127393, + "auxiliary_loss_mlp": 0.01058859, + "balance_loss_clip": 1.04001999, + "balance_loss_mlp": 1.03589892, + "epoch": 0.12343303772734104, + "flos": 23550002640000.0, + "grad_norm": 2.039332279758271, + "language_loss": 0.73584747, + "learning_rate": 3.909122827637406e-06, + "loss": 0.75770998, + "num_input_tokens_seen": 44373020, + "step": 2053, + "time_per_iteration": 2.637922763824463 + }, + { + "auxiliary_loss_clip": 0.01136683, + "auxiliary_loss_mlp": 0.00749147, + "balance_loss_clip": 1.03927112, + "balance_loss_mlp": 1.00039363, + "epoch": 0.12349316098000902, + "flos": 47557074867840.0, + "grad_norm": 1.5082788763182324, + "language_loss": 0.74469495, + "learning_rate": 3.909006726300991e-06, + "loss": 0.76355326, + "num_input_tokens_seen": 44397525, + "step": 2054, + "time_per_iteration": 2.727147102355957 + }, + { + "auxiliary_loss_clip": 0.01113806, + "auxiliary_loss_mlp": 0.01045683, + "balance_loss_clip": 1.04007745, + "balance_loss_mlp": 1.02526271, + "epoch": 0.12355328423267699, + "flos": 25045969294080.0, + "grad_norm": 2.11937413691987, + "language_loss": 0.85130256, + "learning_rate": 3.908890552574849e-06, + "loss": 0.87289745, + "num_input_tokens_seen": 44415890, + "step": 2055, + "time_per_iteration": 2.7953107357025146 + }, + { + "auxiliary_loss_clip": 0.01102498, + "auxiliary_loss_mlp": 0.01048483, + "balance_loss_clip": 1.0457139, + "balance_loss_mlp": 1.02828908, + "epoch": 0.12361340748534495, + "flos": 27709140395520.0, + "grad_norm": 3.0741083567147567, + "language_loss": 0.77371442, + "learning_rate": 3.908774306463384e-06, + "loss": 0.79522431, + "num_input_tokens_seen": 44436625, + "step": 2056, + "time_per_iteration": 2.8846986293792725 + }, + { + "auxiliary_loss_clip": 0.01124206, + "auxiliary_loss_mlp": 0.01054832, + "balance_loss_clip": 1.03975403, + "balance_loss_mlp": 1.03437555, + "epoch": 0.12367353073801293, + "flos": 26140598311680.0, + "grad_norm": 2.093173665468882, + "language_loss": 0.83092308, + "learning_rate": 3.908657987971009e-06, + "loss": 0.85271347, + "num_input_tokens_seen": 44455265, + "step": 2057, + "time_per_iteration": 2.7774693965911865 + }, + { + "auxiliary_loss_clip": 0.0110705, + "auxiliary_loss_mlp": 0.01053823, + "balance_loss_clip": 1.03731227, + "balance_loss_mlp": 1.03203177, + "epoch": 0.1237336539906809, + "flos": 25156035544320.0, + "grad_norm": 1.4923118317326334, + "language_loss": 0.77974522, + "learning_rate": 3.90854159710213e-06, + "loss": 0.80135393, + "num_input_tokens_seen": 44475815, + "step": 2058, + "time_per_iteration": 2.8040006160736084 + }, + { + "auxiliary_loss_clip": 0.01099828, + "auxiliary_loss_mlp": 0.01054836, + "balance_loss_clip": 1.03673029, + "balance_loss_mlp": 1.0316143, + "epoch": 0.12379377724334886, + "flos": 15304589867520.0, + "grad_norm": 2.337211040553978, + "language_loss": 0.83484817, + "learning_rate": 3.9084251338611624e-06, + "loss": 0.85639483, + "num_input_tokens_seen": 44494045, + "step": 2059, + "time_per_iteration": 2.7072036266326904 + }, + { + "auxiliary_loss_clip": 0.01110201, + "auxiliary_loss_mlp": 0.01062207, + "balance_loss_clip": 1.04412425, + "balance_loss_mlp": 1.0385921, + "epoch": 0.12385390049601683, + "flos": 21316717509120.0, + "grad_norm": 3.4083346621835315, + "language_loss": 0.81462103, + "learning_rate": 3.908308598252523e-06, + "loss": 0.83634514, + "num_input_tokens_seen": 44509120, + "step": 2060, + "time_per_iteration": 2.851085901260376 + }, + { + "auxiliary_loss_clip": 0.0111346, + "auxiliary_loss_mlp": 0.01056469, + "balance_loss_clip": 1.03893232, + "balance_loss_mlp": 1.03404558, + "epoch": 0.1239140237486848, + "flos": 15116309752320.0, + "grad_norm": 1.960725187763668, + "language_loss": 0.86303753, + "learning_rate": 3.9081919902806306e-06, + "loss": 0.88473678, + "num_input_tokens_seen": 44525780, + "step": 2061, + "time_per_iteration": 2.688969373703003 + }, + { + "auxiliary_loss_clip": 0.01121137, + "auxiliary_loss_mlp": 0.01044728, + "balance_loss_clip": 1.04016805, + "balance_loss_mlp": 1.02520132, + "epoch": 0.12397414700135277, + "flos": 21976791788160.0, + "grad_norm": 1.9712869541621336, + "language_loss": 0.85061252, + "learning_rate": 3.908075309949906e-06, + "loss": 0.87227118, + "num_input_tokens_seen": 44543125, + "step": 2062, + "time_per_iteration": 2.6399924755096436 + }, + { + "auxiliary_loss_clip": 0.01089673, + "auxiliary_loss_mlp": 0.01059731, + "balance_loss_clip": 1.04040062, + "balance_loss_mlp": 1.03704596, + "epoch": 0.12403427025402074, + "flos": 13400892956160.0, + "grad_norm": 1.8015044031070382, + "language_loss": 0.79233903, + "learning_rate": 3.907958557264774e-06, + "loss": 0.81383312, + "num_input_tokens_seen": 44560275, + "step": 2063, + "time_per_iteration": 2.686713695526123 + }, + { + "auxiliary_loss_clip": 0.01093345, + "auxiliary_loss_mlp": 0.01055599, + "balance_loss_clip": 1.04197025, + "balance_loss_mlp": 1.03265154, + "epoch": 0.12409439350668872, + "flos": 15304374385920.0, + "grad_norm": 1.9760857041556488, + "language_loss": 0.78927147, + "learning_rate": 3.907841732229663e-06, + "loss": 0.81076097, + "num_input_tokens_seen": 44577640, + "step": 2064, + "time_per_iteration": 2.684230327606201 + }, + { + "auxiliary_loss_clip": 0.01109802, + "auxiliary_loss_mlp": 0.01052789, + "balance_loss_clip": 1.0384649, + "balance_loss_mlp": 1.03198695, + "epoch": 0.12415451675935668, + "flos": 25009376313600.0, + "grad_norm": 2.222271309043735, + "language_loss": 0.92630506, + "learning_rate": 3.907724834849002e-06, + "loss": 0.94793093, + "num_input_tokens_seen": 44594860, + "step": 2065, + "time_per_iteration": 2.753760814666748 + }, + { + "auxiliary_loss_clip": 0.01113788, + "auxiliary_loss_mlp": 0.01046365, + "balance_loss_clip": 1.0377202, + "balance_loss_mlp": 1.02434683, + "epoch": 0.12421464001202465, + "flos": 23659673840640.0, + "grad_norm": 2.153584598331364, + "language_loss": 0.80668294, + "learning_rate": 3.907607865127225e-06, + "loss": 0.8282845, + "num_input_tokens_seen": 44614780, + "step": 2066, + "time_per_iteration": 4.453660488128662 + }, + { + "auxiliary_loss_clip": 0.00996544, + "auxiliary_loss_mlp": 0.01003506, + "balance_loss_clip": 1.00750017, + "balance_loss_mlp": 1.00111008, + "epoch": 0.12427476326469263, + "flos": 65732904345600.0, + "grad_norm": 0.8644042826315003, + "language_loss": 0.63327509, + "learning_rate": 3.907490823068766e-06, + "loss": 0.65327555, + "num_input_tokens_seen": 44671240, + "step": 2067, + "time_per_iteration": 3.3303115367889404 + }, + { + "auxiliary_loss_clip": 0.01078485, + "auxiliary_loss_mlp": 0.01057302, + "balance_loss_clip": 1.03440332, + "balance_loss_mlp": 1.03484273, + "epoch": 0.12433488651736059, + "flos": 24535427333760.0, + "grad_norm": 1.7732672116130388, + "language_loss": 0.92970455, + "learning_rate": 3.907373708678063e-06, + "loss": 0.95106238, + "num_input_tokens_seen": 44691050, + "step": 2068, + "time_per_iteration": 3.040593147277832 + }, + { + "auxiliary_loss_clip": 0.01129129, + "auxiliary_loss_mlp": 0.01048366, + "balance_loss_clip": 1.04316902, + "balance_loss_mlp": 1.0294838, + "epoch": 0.12439500977002856, + "flos": 21031659175680.0, + "grad_norm": 2.0549695709169433, + "language_loss": 0.81127369, + "learning_rate": 3.9072565219595596e-06, + "loss": 0.83304864, + "num_input_tokens_seen": 44709850, + "step": 2069, + "time_per_iteration": 2.829876184463501 + }, + { + "auxiliary_loss_clip": 0.01077755, + "auxiliary_loss_mlp": 0.01059326, + "balance_loss_clip": 1.03646421, + "balance_loss_mlp": 1.03748679, + "epoch": 0.12445513302269653, + "flos": 26830621555200.0, + "grad_norm": 2.375835817091381, + "language_loss": 0.77365005, + "learning_rate": 3.907139262917696e-06, + "loss": 0.79502088, + "num_input_tokens_seen": 44731475, + "step": 2070, + "time_per_iteration": 2.9128479957580566 + }, + { + "auxiliary_loss_clip": 0.01120875, + "auxiliary_loss_mlp": 0.01049604, + "balance_loss_clip": 1.04255271, + "balance_loss_mlp": 1.02769363, + "epoch": 0.1245152562753645, + "flos": 18368919037440.0, + "grad_norm": 2.047351037495372, + "language_loss": 0.80752581, + "learning_rate": 3.907021931556922e-06, + "loss": 0.82923061, + "num_input_tokens_seen": 44749685, + "step": 2071, + "time_per_iteration": 5.9543092250823975 + }, + { + "auxiliary_loss_clip": 0.01123459, + "auxiliary_loss_mlp": 0.01053565, + "balance_loss_clip": 1.0429318, + "balance_loss_mlp": 1.0322032, + "epoch": 0.12457537952803246, + "flos": 33107986200960.0, + "grad_norm": 1.783679967369325, + "language_loss": 0.7823602, + "learning_rate": 3.906904527881684e-06, + "loss": 0.80413043, + "num_input_tokens_seen": 44772165, + "step": 2072, + "time_per_iteration": 4.2418811321258545 + }, + { + "auxiliary_loss_clip": 0.01121146, + "auxiliary_loss_mlp": 0.01054784, + "balance_loss_clip": 1.04876101, + "balance_loss_mlp": 1.03389835, + "epoch": 0.12463550278070043, + "flos": 22270217990400.0, + "grad_norm": 1.7571273682642945, + "language_loss": 0.75121623, + "learning_rate": 3.9067870518964355e-06, + "loss": 0.77297556, + "num_input_tokens_seen": 44790580, + "step": 2073, + "time_per_iteration": 2.662353754043579 + }, + { + "auxiliary_loss_clip": 0.01056851, + "auxiliary_loss_mlp": 0.01050412, + "balance_loss_clip": 1.03130841, + "balance_loss_mlp": 1.0289191, + "epoch": 0.12469562603336841, + "flos": 14679025580160.0, + "grad_norm": 2.0094414524886117, + "language_loss": 0.90560818, + "learning_rate": 3.906669503605631e-06, + "loss": 0.92668086, + "num_input_tokens_seen": 44806730, + "step": 2074, + "time_per_iteration": 2.7496707439422607 + }, + { + "auxiliary_loss_clip": 0.01072317, + "auxiliary_loss_mlp": 0.01052642, + "balance_loss_clip": 1.03675818, + "balance_loss_mlp": 1.02969408, + "epoch": 0.12475574928603637, + "flos": 24644775312000.0, + "grad_norm": 2.5420928133465996, + "language_loss": 0.83465219, + "learning_rate": 3.906551883013728e-06, + "loss": 0.85590178, + "num_input_tokens_seen": 44825550, + "step": 2075, + "time_per_iteration": 2.7725024223327637 + }, + { + "auxiliary_loss_clip": 0.01068965, + "auxiliary_loss_mlp": 0.01054375, + "balance_loss_clip": 1.03285432, + "balance_loss_mlp": 1.03258371, + "epoch": 0.12481587253870434, + "flos": 21762980081280.0, + "grad_norm": 1.7682871475160928, + "language_loss": 0.73804903, + "learning_rate": 3.9064341901251865e-06, + "loss": 0.75928247, + "num_input_tokens_seen": 44844155, + "step": 2076, + "time_per_iteration": 2.699232339859009 + }, + { + "auxiliary_loss_clip": 0.01064293, + "auxiliary_loss_mlp": 0.01043948, + "balance_loss_clip": 1.03595805, + "balance_loss_mlp": 1.02476692, + "epoch": 0.12487599579137232, + "flos": 21432529935360.0, + "grad_norm": 1.8289431682716266, + "language_loss": 0.75827861, + "learning_rate": 3.906316424944469e-06, + "loss": 0.77936107, + "num_input_tokens_seen": 44863780, + "step": 2077, + "time_per_iteration": 2.7105536460876465 + }, + { + "auxiliary_loss_clip": 0.01124013, + "auxiliary_loss_mlp": 0.01054904, + "balance_loss_clip": 1.04070497, + "balance_loss_mlp": 1.03312421, + "epoch": 0.12493611904404028, + "flos": 16107624276480.0, + "grad_norm": 2.149693662234729, + "language_loss": 0.82590866, + "learning_rate": 3.906198587476043e-06, + "loss": 0.84769785, + "num_input_tokens_seen": 44881480, + "step": 2078, + "time_per_iteration": 2.6717309951782227 + }, + { + "auxiliary_loss_clip": 0.0110853, + "auxiliary_loss_mlp": 0.01052395, + "balance_loss_clip": 1.03997016, + "balance_loss_mlp": 1.03061557, + "epoch": 0.12499624229670825, + "flos": 21580266574080.0, + "grad_norm": 1.593077781353054, + "language_loss": 0.74975562, + "learning_rate": 3.906080677724374e-06, + "loss": 0.77136493, + "num_input_tokens_seen": 44900390, + "step": 2079, + "time_per_iteration": 2.6831064224243164 + }, + { + "auxiliary_loss_clip": 0.01134926, + "auxiliary_loss_mlp": 0.01060186, + "balance_loss_clip": 1.04646349, + "balance_loss_mlp": 1.03828704, + "epoch": 0.1250563655493762, + "flos": 25699040421120.0, + "grad_norm": 5.471481759053198, + "language_loss": 0.83885074, + "learning_rate": 3.905962695693935e-06, + "loss": 0.86080182, + "num_input_tokens_seen": 44920375, + "step": 2080, + "time_per_iteration": 2.68324875831604 + }, + { + "auxiliary_loss_clip": 0.01122109, + "auxiliary_loss_mlp": 0.01058951, + "balance_loss_clip": 1.04185748, + "balance_loss_mlp": 1.03799379, + "epoch": 0.12511648880204418, + "flos": 16909509450240.0, + "grad_norm": 1.9790458666840565, + "language_loss": 0.84874129, + "learning_rate": 3.9058446413892e-06, + "loss": 0.87055182, + "num_input_tokens_seen": 44938415, + "step": 2081, + "time_per_iteration": 2.5665721893310547 + }, + { + "auxiliary_loss_clip": 0.01123681, + "auxiliary_loss_mlp": 0.01043283, + "balance_loss_clip": 1.04139543, + "balance_loss_mlp": 1.02289796, + "epoch": 0.12517661205471217, + "flos": 17567500740480.0, + "grad_norm": 1.7222054678129282, + "language_loss": 0.76953858, + "learning_rate": 3.905726514814646e-06, + "loss": 0.79120821, + "num_input_tokens_seen": 44957135, + "step": 2082, + "time_per_iteration": 2.7391629219055176 + }, + { + "auxiliary_loss_clip": 0.01129225, + "auxiliary_loss_mlp": 0.01052818, + "balance_loss_clip": 1.05154419, + "balance_loss_mlp": 1.03037095, + "epoch": 0.12523673530738014, + "flos": 16033791870720.0, + "grad_norm": 2.4800335477970634, + "language_loss": 0.79138821, + "learning_rate": 3.9056083159747495e-06, + "loss": 0.81320864, + "num_input_tokens_seen": 44974480, + "step": 2083, + "time_per_iteration": 2.599543809890747 + }, + { + "auxiliary_loss_clip": 0.01113325, + "auxiliary_loss_mlp": 0.01047619, + "balance_loss_clip": 1.04024267, + "balance_loss_mlp": 1.0254817, + "epoch": 0.1252968585600481, + "flos": 18807747494400.0, + "grad_norm": 4.956180704657822, + "language_loss": 0.90687776, + "learning_rate": 3.9054900448739966e-06, + "loss": 0.92848712, + "num_input_tokens_seen": 44990310, + "step": 2084, + "time_per_iteration": 2.6386358737945557 + }, + { + "auxiliary_loss_clip": 0.01096654, + "auxiliary_loss_mlp": 0.01052054, + "balance_loss_clip": 1.03903031, + "balance_loss_mlp": 1.03077555, + "epoch": 0.12535698181271607, + "flos": 27271568914560.0, + "grad_norm": 2.367969278759624, + "language_loss": 0.80313599, + "learning_rate": 3.905371701516869e-06, + "loss": 0.82462311, + "num_input_tokens_seen": 45010720, + "step": 2085, + "time_per_iteration": 2.7117087841033936 + }, + { + "auxiliary_loss_clip": 0.01135316, + "auxiliary_loss_mlp": 0.01049343, + "balance_loss_clip": 1.04289746, + "balance_loss_mlp": 1.0284456, + "epoch": 0.12541710506538403, + "flos": 22054107813120.0, + "grad_norm": 1.8448171443020573, + "language_loss": 0.88086236, + "learning_rate": 3.905253285907856e-06, + "loss": 0.90270889, + "num_input_tokens_seen": 45030360, + "step": 2086, + "time_per_iteration": 2.5268614292144775 + }, + { + "auxiliary_loss_clip": 0.01096569, + "auxiliary_loss_mlp": 0.01046145, + "balance_loss_clip": 1.03699994, + "balance_loss_mlp": 1.0268805, + "epoch": 0.125477228318052, + "flos": 12603173760000.0, + "grad_norm": 2.0818697785276488, + "language_loss": 0.87168992, + "learning_rate": 3.905134798051447e-06, + "loss": 0.89311707, + "num_input_tokens_seen": 45045085, + "step": 2087, + "time_per_iteration": 2.5989067554473877 + }, + { + "auxiliary_loss_clip": 0.01111161, + "auxiliary_loss_mlp": 0.01051129, + "balance_loss_clip": 1.04042459, + "balance_loss_mlp": 1.02926564, + "epoch": 0.12553735157071996, + "flos": 23878549365120.0, + "grad_norm": 1.7708082074772673, + "language_loss": 0.73532522, + "learning_rate": 3.905016237952136e-06, + "loss": 0.75694811, + "num_input_tokens_seen": 45065145, + "step": 2088, + "time_per_iteration": 2.693638563156128 + }, + { + "auxiliary_loss_clip": 0.01022225, + "auxiliary_loss_mlp": 0.01008331, + "balance_loss_clip": 1.00470877, + "balance_loss_mlp": 1.00587583, + "epoch": 0.12559747482338796, + "flos": 69920841830400.0, + "grad_norm": 0.763791117545733, + "language_loss": 0.61763418, + "learning_rate": 3.904897605614418e-06, + "loss": 0.63793981, + "num_input_tokens_seen": 45126230, + "step": 2089, + "time_per_iteration": 3.1594808101654053 + }, + { + "auxiliary_loss_clip": 0.01114515, + "auxiliary_loss_mlp": 0.010599, + "balance_loss_clip": 1.04198527, + "balance_loss_mlp": 1.03818047, + "epoch": 0.12565759807605592, + "flos": 24279563779200.0, + "grad_norm": 1.7556345088568168, + "language_loss": 0.77939552, + "learning_rate": 3.904778901042793e-06, + "loss": 0.80113971, + "num_input_tokens_seen": 45145545, + "step": 2090, + "time_per_iteration": 2.6819114685058594 + }, + { + "auxiliary_loss_clip": 0.01012805, + "auxiliary_loss_mlp": 0.01005879, + "balance_loss_clip": 1.00631666, + "balance_loss_mlp": 1.00310123, + "epoch": 0.12571772132872389, + "flos": 56451180286080.0, + "grad_norm": 0.7454981556932675, + "language_loss": 0.59395325, + "learning_rate": 3.90466012424176e-06, + "loss": 0.61414003, + "num_input_tokens_seen": 45206845, + "step": 2091, + "time_per_iteration": 3.127596616744995 + }, + { + "auxiliary_loss_clip": 0.01125598, + "auxiliary_loss_mlp": 0.01048696, + "balance_loss_clip": 1.044065, + "balance_loss_mlp": 1.02897847, + "epoch": 0.12577784458139185, + "flos": 41245846675200.0, + "grad_norm": 1.9817261006417197, + "language_loss": 0.63484943, + "learning_rate": 3.904541275215825e-06, + "loss": 0.65659237, + "num_input_tokens_seen": 45228495, + "step": 2092, + "time_per_iteration": 2.8396012783050537 + }, + { + "auxiliary_loss_clip": 0.01118696, + "auxiliary_loss_mlp": 0.01066052, + "balance_loss_clip": 1.04417396, + "balance_loss_mlp": 1.04321146, + "epoch": 0.12583796783405982, + "flos": 19755501799680.0, + "grad_norm": 2.1816338204947727, + "language_loss": 0.80364728, + "learning_rate": 3.904422353969493e-06, + "loss": 0.82549477, + "num_input_tokens_seen": 45245720, + "step": 2093, + "time_per_iteration": 2.705756664276123 + }, + { + "auxiliary_loss_clip": 0.01110625, + "auxiliary_loss_mlp": 0.0105957, + "balance_loss_clip": 1.04050398, + "balance_loss_mlp": 1.03867292, + "epoch": 0.12589809108672778, + "flos": 22602104680320.0, + "grad_norm": 1.7427531727666112, + "language_loss": 0.75634438, + "learning_rate": 3.904303360507276e-06, + "loss": 0.77804625, + "num_input_tokens_seen": 45265650, + "step": 2094, + "time_per_iteration": 2.723785400390625 + }, + { + "auxiliary_loss_clip": 0.01080914, + "auxiliary_loss_mlp": 0.01053848, + "balance_loss_clip": 1.03565109, + "balance_loss_mlp": 1.03307033, + "epoch": 0.12595821433939577, + "flos": 45222845541120.0, + "grad_norm": 1.7758942396052222, + "language_loss": 0.76791435, + "learning_rate": 3.9041842948336835e-06, + "loss": 0.78926194, + "num_input_tokens_seen": 45287790, + "step": 2095, + "time_per_iteration": 2.923630475997925 + }, + { + "auxiliary_loss_clip": 0.01111049, + "auxiliary_loss_mlp": 0.01055422, + "balance_loss_clip": 1.03742385, + "balance_loss_mlp": 1.03488219, + "epoch": 0.12601833759206374, + "flos": 14319811618560.0, + "grad_norm": 2.4878675143001434, + "language_loss": 0.82638896, + "learning_rate": 3.904065156953232e-06, + "loss": 0.84805363, + "num_input_tokens_seen": 45305720, + "step": 2096, + "time_per_iteration": 2.649806261062622 + }, + { + "auxiliary_loss_clip": 0.01124939, + "auxiliary_loss_mlp": 0.01047544, + "balance_loss_clip": 1.04047978, + "balance_loss_mlp": 1.02764773, + "epoch": 0.1260784608447317, + "flos": 21288241002240.0, + "grad_norm": 1.8299765984983498, + "language_loss": 0.7558111, + "learning_rate": 3.903945946870439e-06, + "loss": 0.77753597, + "num_input_tokens_seen": 45325290, + "step": 2097, + "time_per_iteration": 2.6133852005004883 + }, + { + "auxiliary_loss_clip": 0.01122239, + "auxiliary_loss_mlp": 0.01054179, + "balance_loss_clip": 1.0428226, + "balance_loss_mlp": 1.03538001, + "epoch": 0.12613858409739967, + "flos": 26251311006720.0, + "grad_norm": 2.885700933318526, + "language_loss": 0.86995488, + "learning_rate": 3.9038266645898246e-06, + "loss": 0.8917191, + "num_input_tokens_seen": 45344465, + "step": 2098, + "time_per_iteration": 2.672680616378784 + }, + { + "auxiliary_loss_clip": 0.01081923, + "auxiliary_loss_mlp": 0.01065007, + "balance_loss_clip": 1.03609633, + "balance_loss_mlp": 1.04106975, + "epoch": 0.12619870735006763, + "flos": 21579979265280.0, + "grad_norm": 1.827762024726778, + "language_loss": 0.70071149, + "learning_rate": 3.903707310115912e-06, + "loss": 0.72218072, + "num_input_tokens_seen": 45362465, + "step": 2099, + "time_per_iteration": 2.7244656085968018 + }, + { + "auxiliary_loss_clip": 0.0110149, + "auxiliary_loss_mlp": 0.01059581, + "balance_loss_clip": 1.0349009, + "balance_loss_mlp": 1.03714597, + "epoch": 0.1262588306027356, + "flos": 23367037737600.0, + "grad_norm": 3.5145895600384147, + "language_loss": 0.81722933, + "learning_rate": 3.903587883453228e-06, + "loss": 0.83884007, + "num_input_tokens_seen": 45382700, + "step": 2100, + "time_per_iteration": 2.6950199604034424 + }, + { + "auxiliary_loss_clip": 0.01123511, + "auxiliary_loss_mlp": 0.01053881, + "balance_loss_clip": 1.04595399, + "balance_loss_mlp": 1.03253055, + "epoch": 0.12631895385540357, + "flos": 23949185460480.0, + "grad_norm": 3.0091948233794747, + "language_loss": 0.802248, + "learning_rate": 3.903468384606302e-06, + "loss": 0.82402188, + "num_input_tokens_seen": 45401005, + "step": 2101, + "time_per_iteration": 2.9483213424682617 + }, + { + "auxiliary_loss_clip": 0.01030137, + "auxiliary_loss_mlp": 0.01007006, + "balance_loss_clip": 1.00315845, + "balance_loss_mlp": 1.00434804, + "epoch": 0.12637907710807156, + "flos": 70282138780800.0, + "grad_norm": 0.7065212373612985, + "language_loss": 0.57090276, + "learning_rate": 3.903348813579662e-06, + "loss": 0.5912742, + "num_input_tokens_seen": 45466555, + "step": 2102, + "time_per_iteration": 3.2081053256988525 + }, + { + "auxiliary_loss_clip": 0.01104722, + "auxiliary_loss_mlp": 0.01052346, + "balance_loss_clip": 1.04075837, + "balance_loss_mlp": 1.03230679, + "epoch": 0.12643920036073952, + "flos": 18915084311040.0, + "grad_norm": 1.9839733494330525, + "language_loss": 0.93378574, + "learning_rate": 3.903229170377845e-06, + "loss": 0.95535642, + "num_input_tokens_seen": 45485165, + "step": 2103, + "time_per_iteration": 2.7431986331939697 + }, + { + "auxiliary_loss_clip": 0.01107065, + "auxiliary_loss_mlp": 0.0103537, + "balance_loss_clip": 1.03695142, + "balance_loss_mlp": 1.01645148, + "epoch": 0.1264993236134075, + "flos": 27782470010880.0, + "grad_norm": 1.6949330666562838, + "language_loss": 0.78173625, + "learning_rate": 3.903109455005387e-06, + "loss": 0.80316061, + "num_input_tokens_seen": 45504630, + "step": 2104, + "time_per_iteration": 2.7150657176971436 + }, + { + "auxiliary_loss_clip": 0.01098067, + "auxiliary_loss_mlp": 0.01054141, + "balance_loss_clip": 1.04200029, + "balance_loss_mlp": 1.03513908, + "epoch": 0.12655944686607545, + "flos": 24754697907840.0, + "grad_norm": 2.0844665497799295, + "language_loss": 0.81201994, + "learning_rate": 3.902989667466828e-06, + "loss": 0.83354199, + "num_input_tokens_seen": 45524885, + "step": 2105, + "time_per_iteration": 2.6774723529815674 + }, + { + "auxiliary_loss_clip": 0.01128048, + "auxiliary_loss_mlp": 0.01056485, + "balance_loss_clip": 1.04298842, + "balance_loss_mlp": 1.03501558, + "epoch": 0.12661957011874342, + "flos": 24133048202880.0, + "grad_norm": 2.0710379787325857, + "language_loss": 0.82956088, + "learning_rate": 3.90286980776671e-06, + "loss": 0.85140622, + "num_input_tokens_seen": 45545000, + "step": 2106, + "time_per_iteration": 2.649883508682251 + }, + { + "auxiliary_loss_clip": 0.0108514, + "auxiliary_loss_mlp": 0.01049936, + "balance_loss_clip": 1.03935933, + "balance_loss_mlp": 1.02943182, + "epoch": 0.12667969337141138, + "flos": 24569614103040.0, + "grad_norm": 1.7809185366911007, + "language_loss": 0.73335981, + "learning_rate": 3.902749875909578e-06, + "loss": 0.75471061, + "num_input_tokens_seen": 45564210, + "step": 2107, + "time_per_iteration": 2.7776081562042236 + }, + { + "auxiliary_loss_clip": 0.01128764, + "auxiliary_loss_mlp": 0.01047273, + "balance_loss_clip": 1.03956819, + "balance_loss_mlp": 1.02834249, + "epoch": 0.12673981662407935, + "flos": 22961677777920.0, + "grad_norm": 2.0458705881057098, + "language_loss": 0.79315245, + "learning_rate": 3.90262987189998e-06, + "loss": 0.81491286, + "num_input_tokens_seen": 45583030, + "step": 2108, + "time_per_iteration": 2.6750071048736572 + }, + { + "auxiliary_loss_clip": 0.01132798, + "auxiliary_loss_mlp": 0.010491, + "balance_loss_clip": 1.03984988, + "balance_loss_mlp": 1.02932334, + "epoch": 0.12679993987674734, + "flos": 17274864637440.0, + "grad_norm": 3.0188371821946385, + "language_loss": 0.75343686, + "learning_rate": 3.902509795742467e-06, + "loss": 0.7752558, + "num_input_tokens_seen": 45602265, + "step": 2109, + "time_per_iteration": 2.5908379554748535 + }, + { + "auxiliary_loss_clip": 0.01067327, + "auxiliary_loss_mlp": 0.01049359, + "balance_loss_clip": 1.03294003, + "balance_loss_mlp": 1.02989244, + "epoch": 0.1268600631294153, + "flos": 17275080119040.0, + "grad_norm": 1.7209151427468978, + "language_loss": 0.83105016, + "learning_rate": 3.902389647441592e-06, + "loss": 0.85221702, + "num_input_tokens_seen": 45620595, + "step": 2110, + "time_per_iteration": 2.7784719467163086 + }, + { + "auxiliary_loss_clip": 0.01104829, + "auxiliary_loss_mlp": 0.00749225, + "balance_loss_clip": 1.03751302, + "balance_loss_mlp": 1.00051749, + "epoch": 0.12692018638208327, + "flos": 24061047390720.0, + "grad_norm": 1.6574912007167308, + "language_loss": 0.78518343, + "learning_rate": 3.90226942700191e-06, + "loss": 0.80372393, + "num_input_tokens_seen": 45641140, + "step": 2111, + "time_per_iteration": 2.669520854949951 + }, + { + "auxiliary_loss_clip": 0.01093376, + "auxiliary_loss_mlp": 0.01069117, + "balance_loss_clip": 1.03888297, + "balance_loss_mlp": 1.04603791, + "epoch": 0.12698030963475124, + "flos": 31831900652160.0, + "grad_norm": 2.1393385746749276, + "language_loss": 0.76372284, + "learning_rate": 3.902149134427982e-06, + "loss": 0.7853477, + "num_input_tokens_seen": 45662315, + "step": 2112, + "time_per_iteration": 2.8622899055480957 + }, + { + "auxiliary_loss_clip": 0.01087911, + "auxiliary_loss_mlp": 0.01058358, + "balance_loss_clip": 1.03497863, + "balance_loss_mlp": 1.03779471, + "epoch": 0.1270404328874192, + "flos": 25187744275200.0, + "grad_norm": 1.8896351077319182, + "language_loss": 0.85508764, + "learning_rate": 3.902028769724367e-06, + "loss": 0.87655038, + "num_input_tokens_seen": 45680335, + "step": 2113, + "time_per_iteration": 4.464522838592529 + }, + { + "auxiliary_loss_clip": 0.01094515, + "auxiliary_loss_mlp": 0.01062007, + "balance_loss_clip": 1.03662217, + "balance_loss_mlp": 1.04106236, + "epoch": 0.12710055614008717, + "flos": 15997342544640.0, + "grad_norm": 2.7025007424071514, + "language_loss": 0.73936117, + "learning_rate": 3.9019083328956315e-06, + "loss": 0.76092637, + "num_input_tokens_seen": 45696240, + "step": 2114, + "time_per_iteration": 2.7963058948516846 + }, + { + "auxiliary_loss_clip": 0.01124238, + "auxiliary_loss_mlp": 0.01056124, + "balance_loss_clip": 1.04467988, + "balance_loss_mlp": 1.03471422, + "epoch": 0.12716067939275516, + "flos": 15085642515840.0, + "grad_norm": 1.828959556005836, + "language_loss": 0.83457685, + "learning_rate": 3.901787823946341e-06, + "loss": 0.85638046, + "num_input_tokens_seen": 45713695, + "step": 2115, + "time_per_iteration": 2.7019214630126953 + }, + { + "auxiliary_loss_clip": 0.01113228, + "auxiliary_loss_mlp": 0.01056285, + "balance_loss_clip": 1.03990364, + "balance_loss_mlp": 1.03618598, + "epoch": 0.12722080264542313, + "flos": 28366736636160.0, + "grad_norm": 1.5426434955987225, + "language_loss": 0.86600465, + "learning_rate": 3.901667242881065e-06, + "loss": 0.88769972, + "num_input_tokens_seen": 45736655, + "step": 2116, + "time_per_iteration": 2.79936146736145 + }, + { + "auxiliary_loss_clip": 0.01098562, + "auxiliary_loss_mlp": 0.00749053, + "balance_loss_clip": 1.03755569, + "balance_loss_mlp": 1.00044751, + "epoch": 0.1272809258980911, + "flos": 32379897519360.0, + "grad_norm": 1.6268944115895736, + "language_loss": 0.70665276, + "learning_rate": 3.9015465897043775e-06, + "loss": 0.72512889, + "num_input_tokens_seen": 45758195, + "step": 2117, + "time_per_iteration": 2.836818218231201 + }, + { + "auxiliary_loss_clip": 0.01084802, + "auxiliary_loss_mlp": 0.01056851, + "balance_loss_clip": 1.03523552, + "balance_loss_mlp": 1.03485692, + "epoch": 0.12734104915075906, + "flos": 16034402401920.0, + "grad_norm": 2.220240299952267, + "language_loss": 0.86538315, + "learning_rate": 3.901425864420852e-06, + "loss": 0.88679963, + "num_input_tokens_seen": 45774280, + "step": 2118, + "time_per_iteration": 4.323772668838501 + }, + { + "auxiliary_loss_clip": 0.01118852, + "auxiliary_loss_mlp": 0.01050309, + "balance_loss_clip": 1.03843212, + "balance_loss_mlp": 1.03193879, + "epoch": 0.12740117240342702, + "flos": 18260325244800.0, + "grad_norm": 1.7479309150484323, + "language_loss": 0.87113446, + "learning_rate": 3.901305067035068e-06, + "loss": 0.89282608, + "num_input_tokens_seen": 45792760, + "step": 2119, + "time_per_iteration": 4.194917440414429 + }, + { + "auxiliary_loss_clip": 0.01107211, + "auxiliary_loss_mlp": 0.00749136, + "balance_loss_clip": 1.0388875, + "balance_loss_mlp": 1.00052392, + "epoch": 0.127461295656095, + "flos": 12121790664960.0, + "grad_norm": 3.2906376659355683, + "language_loss": 0.87764513, + "learning_rate": 3.901184197551605e-06, + "loss": 0.89620864, + "num_input_tokens_seen": 45804300, + "step": 2120, + "time_per_iteration": 2.768893003463745 + }, + { + "auxiliary_loss_clip": 0.01130833, + "auxiliary_loss_mlp": 0.01045084, + "balance_loss_clip": 1.04018378, + "balance_loss_mlp": 1.02522385, + "epoch": 0.12752141890876295, + "flos": 23149095966720.0, + "grad_norm": 1.9620865713596383, + "language_loss": 0.75362462, + "learning_rate": 3.901063255975046e-06, + "loss": 0.77538383, + "num_input_tokens_seen": 45823780, + "step": 2121, + "time_per_iteration": 2.588378667831421 + }, + { + "auxiliary_loss_clip": 0.01067789, + "auxiliary_loss_mlp": 0.01049019, + "balance_loss_clip": 1.03213573, + "balance_loss_mlp": 1.02936125, + "epoch": 0.12758154216143094, + "flos": 21615997628160.0, + "grad_norm": 2.3256042990263794, + "language_loss": 0.82949543, + "learning_rate": 3.900942242309978e-06, + "loss": 0.85066342, + "num_input_tokens_seen": 45840495, + "step": 2122, + "time_per_iteration": 2.724975824356079 + }, + { + "auxiliary_loss_clip": 0.01112501, + "auxiliary_loss_mlp": 0.01048457, + "balance_loss_clip": 1.04105425, + "balance_loss_mlp": 1.02856064, + "epoch": 0.1276416654140989, + "flos": 15924874855680.0, + "grad_norm": 1.9733621912465587, + "language_loss": 0.78914881, + "learning_rate": 3.90082115656099e-06, + "loss": 0.81075835, + "num_input_tokens_seen": 45857735, + "step": 2123, + "time_per_iteration": 2.5901384353637695 + }, + { + "auxiliary_loss_clip": 0.0113425, + "auxiliary_loss_mlp": 0.01048543, + "balance_loss_clip": 1.04223132, + "balance_loss_mlp": 1.02855134, + "epoch": 0.12770178866676687, + "flos": 22382690451840.0, + "grad_norm": 1.6421281514903565, + "language_loss": 0.79135549, + "learning_rate": 3.900699998732673e-06, + "loss": 0.81318343, + "num_input_tokens_seen": 45876485, + "step": 2124, + "time_per_iteration": 2.530357599258423 + }, + { + "auxiliary_loss_clip": 0.01122221, + "auxiliary_loss_mlp": 0.00749167, + "balance_loss_clip": 1.03843784, + "balance_loss_mlp": 1.00049341, + "epoch": 0.12776191191943484, + "flos": 21652482867840.0, + "grad_norm": 1.8149893106315604, + "language_loss": 0.75492036, + "learning_rate": 3.900578768829623e-06, + "loss": 0.77363425, + "num_input_tokens_seen": 45894645, + "step": 2125, + "time_per_iteration": 2.622450113296509 + }, + { + "auxiliary_loss_clip": 0.01121081, + "auxiliary_loss_mlp": 0.00749089, + "balance_loss_clip": 1.03893304, + "balance_loss_mlp": 1.00042331, + "epoch": 0.1278220351721028, + "flos": 25735561574400.0, + "grad_norm": 2.088864951865538, + "language_loss": 0.77901828, + "learning_rate": 3.900457466856434e-06, + "loss": 0.79771996, + "num_input_tokens_seen": 45913755, + "step": 2126, + "time_per_iteration": 2.6224164962768555 + }, + { + "auxiliary_loss_clip": 0.01084241, + "auxiliary_loss_mlp": 0.01049315, + "balance_loss_clip": 1.03736043, + "balance_loss_mlp": 1.0299201, + "epoch": 0.12788215842477077, + "flos": 41243224982400.0, + "grad_norm": 1.508008366823693, + "language_loss": 0.6885736, + "learning_rate": 3.9003360928177085e-06, + "loss": 0.7099092, + "num_input_tokens_seen": 45936095, + "step": 2127, + "time_per_iteration": 2.7969272136688232 + }, + { + "auxiliary_loss_clip": 0.00992115, + "auxiliary_loss_mlp": 0.00748663, + "balance_loss_clip": 1.00654948, + "balance_loss_mlp": 1.0006218, + "epoch": 0.12794228167743876, + "flos": 70877430881280.0, + "grad_norm": 0.8483266789841635, + "language_loss": 0.62818372, + "learning_rate": 3.900214646718047e-06, + "loss": 0.6455915, + "num_input_tokens_seen": 46004655, + "step": 2128, + "time_per_iteration": 3.2640204429626465 + }, + { + "auxiliary_loss_clip": 0.0110592, + "auxiliary_loss_mlp": 0.01046679, + "balance_loss_clip": 1.03673375, + "balance_loss_mlp": 1.02499521, + "epoch": 0.12800240493010673, + "flos": 16289727252480.0, + "grad_norm": 2.9879329620067447, + "language_loss": 0.7738378, + "learning_rate": 3.900093128562056e-06, + "loss": 0.79536378, + "num_input_tokens_seen": 46023610, + "step": 2129, + "time_per_iteration": 2.6087467670440674 + }, + { + "auxiliary_loss_clip": 0.01083296, + "auxiliary_loss_mlp": 0.0105048, + "balance_loss_clip": 1.03601241, + "balance_loss_mlp": 1.02808082, + "epoch": 0.1280625281827747, + "flos": 20631542601600.0, + "grad_norm": 2.420547979788456, + "language_loss": 0.78710175, + "learning_rate": 3.899971538354343e-06, + "loss": 0.80843949, + "num_input_tokens_seen": 46041725, + "step": 2130, + "time_per_iteration": 2.6926047801971436 + }, + { + "auxiliary_loss_clip": 0.01098919, + "auxiliary_loss_mlp": 0.01048531, + "balance_loss_clip": 1.03691816, + "balance_loss_mlp": 1.02842069, + "epoch": 0.12812265143544266, + "flos": 22638230784000.0, + "grad_norm": 1.8017313813355635, + "language_loss": 0.70786524, + "learning_rate": 3.899849876099518e-06, + "loss": 0.72933972, + "num_input_tokens_seen": 46061095, + "step": 2131, + "time_per_iteration": 2.723433494567871 + }, + { + "auxiliary_loss_clip": 0.01067198, + "auxiliary_loss_mlp": 0.01048156, + "balance_loss_clip": 1.03773975, + "balance_loss_mlp": 1.02796149, + "epoch": 0.12818277468811062, + "flos": 34714701463680.0, + "grad_norm": 3.546883648518352, + "language_loss": 0.7250976, + "learning_rate": 3.899728141802197e-06, + "loss": 0.74625117, + "num_input_tokens_seen": 46082670, + "step": 2132, + "time_per_iteration": 2.887087821960449 + }, + { + "auxiliary_loss_clip": 0.01058639, + "auxiliary_loss_mlp": 0.01056781, + "balance_loss_clip": 1.03097606, + "balance_loss_mlp": 1.03554952, + "epoch": 0.1282428979407786, + "flos": 23112107936640.0, + "grad_norm": 1.8602584834313574, + "language_loss": 0.81822568, + "learning_rate": 3.8996063354669935e-06, + "loss": 0.83937985, + "num_input_tokens_seen": 46102410, + "step": 2133, + "time_per_iteration": 2.8314414024353027 + }, + { + "auxiliary_loss_clip": 0.01122887, + "auxiliary_loss_mlp": 0.01054568, + "balance_loss_clip": 1.03802991, + "balance_loss_mlp": 1.03237128, + "epoch": 0.12830302119344655, + "flos": 20886508316160.0, + "grad_norm": 3.259993197001186, + "language_loss": 0.79687905, + "learning_rate": 3.899484457098528e-06, + "loss": 0.81865358, + "num_input_tokens_seen": 46121145, + "step": 2134, + "time_per_iteration": 2.6857759952545166 + }, + { + "auxiliary_loss_clip": 0.01122944, + "auxiliary_loss_mlp": 0.01045263, + "balance_loss_clip": 1.04267955, + "balance_loss_mlp": 1.02539051, + "epoch": 0.12836314444611455, + "flos": 21397768548480.0, + "grad_norm": 1.9683230333165553, + "language_loss": 0.82404184, + "learning_rate": 3.899362506701421e-06, + "loss": 0.84572387, + "num_input_tokens_seen": 46140740, + "step": 2135, + "time_per_iteration": 2.5977635383605957 + }, + { + "auxiliary_loss_clip": 0.0110701, + "auxiliary_loss_mlp": 0.01055958, + "balance_loss_clip": 1.03888381, + "balance_loss_mlp": 1.03538275, + "epoch": 0.1284232676987825, + "flos": 13662466773120.0, + "grad_norm": 2.2352509784205656, + "language_loss": 0.77609754, + "learning_rate": 3.899240484280298e-06, + "loss": 0.79772723, + "num_input_tokens_seen": 46156805, + "step": 2136, + "time_per_iteration": 2.6174237728118896 + }, + { + "auxiliary_loss_clip": 0.00988241, + "auxiliary_loss_mlp": 0.01005972, + "balance_loss_clip": 1.00342762, + "balance_loss_mlp": 1.00302744, + "epoch": 0.12848339095145048, + "flos": 59994737735040.0, + "grad_norm": 1.001244495167274, + "language_loss": 0.59137207, + "learning_rate": 3.899118389839785e-06, + "loss": 0.61131418, + "num_input_tokens_seen": 46222085, + "step": 2137, + "time_per_iteration": 3.418200969696045 + }, + { + "auxiliary_loss_clip": 0.01118822, + "auxiliary_loss_mlp": 0.01055228, + "balance_loss_clip": 1.03894746, + "balance_loss_mlp": 1.03572536, + "epoch": 0.12854351420411844, + "flos": 13881378211200.0, + "grad_norm": 2.469184103512273, + "language_loss": 0.82003945, + "learning_rate": 3.898996223384512e-06, + "loss": 0.84177995, + "num_input_tokens_seen": 46239970, + "step": 2138, + "time_per_iteration": 2.6262850761413574 + }, + { + "auxiliary_loss_clip": 0.01122725, + "auxiliary_loss_mlp": 0.01054552, + "balance_loss_clip": 1.04131496, + "balance_loss_mlp": 1.0320456, + "epoch": 0.1286036374567864, + "flos": 22637943475200.0, + "grad_norm": 2.915685178003298, + "language_loss": 0.78877854, + "learning_rate": 3.898873984919113e-06, + "loss": 0.81055129, + "num_input_tokens_seen": 46257740, + "step": 2139, + "time_per_iteration": 2.630955934524536 + }, + { + "auxiliary_loss_clip": 0.01099838, + "auxiliary_loss_mlp": 0.01043873, + "balance_loss_clip": 1.03762698, + "balance_loss_mlp": 1.02361894, + "epoch": 0.12866376070945437, + "flos": 16324775948160.0, + "grad_norm": 3.958834232828788, + "language_loss": 0.85047328, + "learning_rate": 3.8987516744482215e-06, + "loss": 0.87191039, + "num_input_tokens_seen": 46275445, + "step": 2140, + "time_per_iteration": 2.6724693775177 + }, + { + "auxiliary_loss_clip": 0.01107735, + "auxiliary_loss_mlp": 0.01046813, + "balance_loss_clip": 1.03748989, + "balance_loss_mlp": 1.02739382, + "epoch": 0.12872388396212234, + "flos": 11874546374400.0, + "grad_norm": 2.005203523977762, + "language_loss": 0.8573693, + "learning_rate": 3.898629291976476e-06, + "loss": 0.87891477, + "num_input_tokens_seen": 46291710, + "step": 2141, + "time_per_iteration": 2.6822190284729004 + }, + { + "auxiliary_loss_clip": 0.01110823, + "auxiliary_loss_mlp": 0.01049185, + "balance_loss_clip": 1.0382098, + "balance_loss_mlp": 1.02853835, + "epoch": 0.12878400721479033, + "flos": 28366700722560.0, + "grad_norm": 3.9257125378012825, + "language_loss": 0.67804003, + "learning_rate": 3.898506837508518e-06, + "loss": 0.6996401, + "num_input_tokens_seen": 46311335, + "step": 2142, + "time_per_iteration": 2.7243120670318604 + }, + { + "auxiliary_loss_clip": 0.01130519, + "auxiliary_loss_mlp": 0.00749141, + "balance_loss_clip": 1.04430008, + "balance_loss_mlp": 1.00043249, + "epoch": 0.1288441304674583, + "flos": 25885632597120.0, + "grad_norm": 2.2820728418154435, + "language_loss": 0.83623672, + "learning_rate": 3.89838431104899e-06, + "loss": 0.8550334, + "num_input_tokens_seen": 46330985, + "step": 2143, + "time_per_iteration": 2.610957384109497 + }, + { + "auxiliary_loss_clip": 0.01138032, + "auxiliary_loss_mlp": 0.00749105, + "balance_loss_clip": 1.04572189, + "balance_loss_mlp": 1.00053251, + "epoch": 0.12890425372012626, + "flos": 20813789232000.0, + "grad_norm": 1.7436004931159959, + "language_loss": 0.81852055, + "learning_rate": 3.898261712602539e-06, + "loss": 0.83739191, + "num_input_tokens_seen": 46351295, + "step": 2144, + "time_per_iteration": 2.6443893909454346 + }, + { + "auxiliary_loss_clip": 0.01093397, + "auxiliary_loss_mlp": 0.01054309, + "balance_loss_clip": 1.0338974, + "balance_loss_mlp": 1.03257692, + "epoch": 0.12896437697279423, + "flos": 22565870835840.0, + "grad_norm": 2.0290495099615766, + "language_loss": 0.78247529, + "learning_rate": 3.898139042173813e-06, + "loss": 0.80395234, + "num_input_tokens_seen": 46368600, + "step": 2145, + "time_per_iteration": 2.738581895828247 + }, + { + "auxiliary_loss_clip": 0.01133102, + "auxiliary_loss_mlp": 0.01046965, + "balance_loss_clip": 1.03933477, + "balance_loss_mlp": 1.02548337, + "epoch": 0.1290245002254622, + "flos": 17493776075520.0, + "grad_norm": 2.006534523716082, + "language_loss": 0.82184547, + "learning_rate": 3.898016299767465e-06, + "loss": 0.84364617, + "num_input_tokens_seen": 46387370, + "step": 2146, + "time_per_iteration": 2.6169912815093994 + }, + { + "auxiliary_loss_clip": 0.01107129, + "auxiliary_loss_mlp": 0.01050665, + "balance_loss_clip": 1.03930426, + "balance_loss_mlp": 1.02981544, + "epoch": 0.12908462347813016, + "flos": 36315957859200.0, + "grad_norm": 3.8368781033382087, + "language_loss": 0.70417917, + "learning_rate": 3.897893485388149e-06, + "loss": 0.72575712, + "num_input_tokens_seen": 46409570, + "step": 2147, + "time_per_iteration": 2.7140872478485107 + }, + { + "auxiliary_loss_clip": 0.01114508, + "auxiliary_loss_mlp": 0.01053035, + "balance_loss_clip": 1.04276693, + "balance_loss_mlp": 1.03269804, + "epoch": 0.12914474673079815, + "flos": 22528703237760.0, + "grad_norm": 2.1496661586889307, + "language_loss": 0.7170006, + "learning_rate": 3.897770599040521e-06, + "loss": 0.73867601, + "num_input_tokens_seen": 46429320, + "step": 2148, + "time_per_iteration": 2.6336958408355713 + }, + { + "auxiliary_loss_clip": 0.01130544, + "auxiliary_loss_mlp": 0.01044284, + "balance_loss_clip": 1.04243839, + "balance_loss_mlp": 1.0251627, + "epoch": 0.12920486998346611, + "flos": 21471888263040.0, + "grad_norm": 1.6231876657669677, + "language_loss": 0.78699589, + "learning_rate": 3.897647640729242e-06, + "loss": 0.80874419, + "num_input_tokens_seen": 46450155, + "step": 2149, + "time_per_iteration": 2.589250087738037 + }, + { + "auxiliary_loss_clip": 0.01121776, + "auxiliary_loss_mlp": 0.0104236, + "balance_loss_clip": 1.04078484, + "balance_loss_mlp": 1.02126002, + "epoch": 0.12926499323613408, + "flos": 27308556944640.0, + "grad_norm": 1.9729053122549924, + "language_loss": 0.76059067, + "learning_rate": 3.897524610458975e-06, + "loss": 0.78223205, + "num_input_tokens_seen": 46470280, + "step": 2150, + "time_per_iteration": 2.6272480487823486 + }, + { + "auxiliary_loss_clip": 0.01118987, + "auxiliary_loss_mlp": 0.01048822, + "balance_loss_clip": 1.03965592, + "balance_loss_mlp": 1.02835345, + "epoch": 0.12932511648880204, + "flos": 22091131756800.0, + "grad_norm": 2.8479340668038415, + "language_loss": 0.70968843, + "learning_rate": 3.8974015082343835e-06, + "loss": 0.73136652, + "num_input_tokens_seen": 46487605, + "step": 2151, + "time_per_iteration": 2.6015565395355225 + }, + { + "auxiliary_loss_clip": 0.01129556, + "auxiliary_loss_mlp": 0.01042387, + "balance_loss_clip": 1.04133248, + "balance_loss_mlp": 1.02313483, + "epoch": 0.12938523974147, + "flos": 20302780394880.0, + "grad_norm": 1.988077277518082, + "language_loss": 0.83984858, + "learning_rate": 3.897278334060137e-06, + "loss": 0.86156797, + "num_input_tokens_seen": 46505100, + "step": 2152, + "time_per_iteration": 2.5700583457946777 + }, + { + "auxiliary_loss_clip": 0.01117393, + "auxiliary_loss_mlp": 0.0105563, + "balance_loss_clip": 1.03663838, + "balance_loss_mlp": 1.03572261, + "epoch": 0.12944536299413797, + "flos": 19499961467520.0, + "grad_norm": 1.7547207248998848, + "language_loss": 0.78565007, + "learning_rate": 3.897155087940906e-06, + "loss": 0.80738032, + "num_input_tokens_seen": 46524020, + "step": 2153, + "time_per_iteration": 2.6502063274383545 + }, + { + "auxiliary_loss_clip": 0.0108183, + "auxiliary_loss_mlp": 0.0074913, + "balance_loss_clip": 1.03734887, + "balance_loss_mlp": 1.00052166, + "epoch": 0.12950548624680594, + "flos": 27707919333120.0, + "grad_norm": 1.7355796743981156, + "language_loss": 0.79992193, + "learning_rate": 3.897031769881364e-06, + "loss": 0.81823152, + "num_input_tokens_seen": 46544640, + "step": 2154, + "time_per_iteration": 2.708670139312744 + }, + { + "auxiliary_loss_clip": 0.0110977, + "auxiliary_loss_mlp": 0.0104763, + "balance_loss_clip": 1.03794813, + "balance_loss_mlp": 1.02778172, + "epoch": 0.12956560949947393, + "flos": 17565740974080.0, + "grad_norm": 1.931932259607275, + "language_loss": 0.83298242, + "learning_rate": 3.896908379886188e-06, + "loss": 0.8545565, + "num_input_tokens_seen": 46561395, + "step": 2155, + "time_per_iteration": 2.577969789505005 + }, + { + "auxiliary_loss_clip": 0.01121428, + "auxiliary_loss_mlp": 0.01051847, + "balance_loss_clip": 1.03777659, + "balance_loss_mlp": 1.03137875, + "epoch": 0.1296257327521419, + "flos": 20740711011840.0, + "grad_norm": 2.3787970585576037, + "language_loss": 0.76083887, + "learning_rate": 3.896784917960055e-06, + "loss": 0.78257161, + "num_input_tokens_seen": 46579395, + "step": 2156, + "time_per_iteration": 2.6496763229370117 + }, + { + "auxiliary_loss_clip": 0.01063648, + "auxiliary_loss_mlp": 0.01047674, + "balance_loss_clip": 1.03515553, + "balance_loss_mlp": 1.02781415, + "epoch": 0.12968585600480986, + "flos": 16395735265920.0, + "grad_norm": 2.9074019900050536, + "language_loss": 0.8704527, + "learning_rate": 3.896661384107648e-06, + "loss": 0.89156586, + "num_input_tokens_seen": 46597090, + "step": 2157, + "time_per_iteration": 2.7373995780944824 + }, + { + "auxiliary_loss_clip": 0.01131185, + "auxiliary_loss_mlp": 0.01050086, + "balance_loss_clip": 1.03763354, + "balance_loss_mlp": 1.02946281, + "epoch": 0.12974597925747783, + "flos": 28329533124480.0, + "grad_norm": 2.437722677599873, + "language_loss": 0.811818, + "learning_rate": 3.896537778333651e-06, + "loss": 0.83363068, + "num_input_tokens_seen": 46617355, + "step": 2158, + "time_per_iteration": 2.688331365585327 + }, + { + "auxiliary_loss_clip": 0.01134814, + "auxiliary_loss_mlp": 0.01057879, + "balance_loss_clip": 1.04113126, + "balance_loss_mlp": 1.0379231, + "epoch": 0.1298061025101458, + "flos": 9683025782400.0, + "grad_norm": 2.321014852968811, + "language_loss": 0.74692321, + "learning_rate": 3.896414100642752e-06, + "loss": 0.76885009, + "num_input_tokens_seen": 46633130, + "step": 2159, + "time_per_iteration": 2.5661709308624268 + }, + { + "auxiliary_loss_clip": 0.01082066, + "auxiliary_loss_mlp": 0.01049545, + "balance_loss_clip": 1.03259695, + "balance_loss_mlp": 1.027825, + "epoch": 0.12986622576281376, + "flos": 27709535445120.0, + "grad_norm": 2.235377761709267, + "language_loss": 0.82616723, + "learning_rate": 3.89629035103964e-06, + "loss": 0.84748334, + "num_input_tokens_seen": 46650575, + "step": 2160, + "time_per_iteration": 2.7805216312408447 + }, + { + "auxiliary_loss_clip": 0.01116056, + "auxiliary_loss_mlp": 0.01044328, + "balance_loss_clip": 1.04243207, + "balance_loss_mlp": 1.02493262, + "epoch": 0.12992634901548175, + "flos": 18802719590400.0, + "grad_norm": 1.6294306697725431, + "language_loss": 0.81826663, + "learning_rate": 3.896166529529008e-06, + "loss": 0.83987039, + "num_input_tokens_seen": 46668780, + "step": 2161, + "time_per_iteration": 4.170602321624756 + }, + { + "auxiliary_loss_clip": 0.01104763, + "auxiliary_loss_mlp": 0.01050092, + "balance_loss_clip": 1.03673089, + "balance_loss_mlp": 1.02919507, + "epoch": 0.12998647226814972, + "flos": 29127575543040.0, + "grad_norm": 2.0489246111983888, + "language_loss": 0.82570827, + "learning_rate": 3.896042636115551e-06, + "loss": 0.84725678, + "num_input_tokens_seen": 46687550, + "step": 2162, + "time_per_iteration": 2.8024611473083496 + }, + { + "auxiliary_loss_clip": 0.01100803, + "auxiliary_loss_mlp": 0.01051481, + "balance_loss_clip": 1.03886294, + "balance_loss_mlp": 1.03127456, + "epoch": 0.13004659552081768, + "flos": 19573686132480.0, + "grad_norm": 2.7520037129606787, + "language_loss": 0.72621238, + "learning_rate": 3.895918670803968e-06, + "loss": 0.74773514, + "num_input_tokens_seen": 46706730, + "step": 2163, + "time_per_iteration": 2.6347203254699707 + }, + { + "auxiliary_loss_clip": 0.01133969, + "auxiliary_loss_mlp": 0.00749202, + "balance_loss_clip": 1.04041481, + "balance_loss_mlp": 1.0005188, + "epoch": 0.13010671877348565, + "flos": 22490709626880.0, + "grad_norm": 2.2194522178294447, + "language_loss": 0.81288898, + "learning_rate": 3.895794633598958e-06, + "loss": 0.83172071, + "num_input_tokens_seen": 46724250, + "step": 2164, + "time_per_iteration": 2.622056007385254 + }, + { + "auxiliary_loss_clip": 0.01079328, + "auxiliary_loss_mlp": 0.01045789, + "balance_loss_clip": 1.03436244, + "balance_loss_mlp": 1.02579784, + "epoch": 0.1301668420261536, + "flos": 23878226142720.0, + "grad_norm": 1.9632004113944879, + "language_loss": 0.7236852, + "learning_rate": 3.8956705245052256e-06, + "loss": 0.74493635, + "num_input_tokens_seen": 46744105, + "step": 2165, + "time_per_iteration": 2.7288413047790527 + }, + { + "auxiliary_loss_clip": 0.01072743, + "auxiliary_loss_mlp": 0.01046035, + "balance_loss_clip": 1.03689432, + "balance_loss_mlp": 1.0247798, + "epoch": 0.13022696527882158, + "flos": 23150065633920.0, + "grad_norm": 1.5890124898904792, + "language_loss": 0.74509537, + "learning_rate": 3.8955463435274765e-06, + "loss": 0.76628315, + "num_input_tokens_seen": 46764250, + "step": 2166, + "time_per_iteration": 5.931979179382324 + }, + { + "auxiliary_loss_clip": 0.01132131, + "auxiliary_loss_mlp": 0.01046704, + "balance_loss_clip": 1.03838205, + "balance_loss_mlp": 1.02761829, + "epoch": 0.13028708853148954, + "flos": 26908548111360.0, + "grad_norm": 1.808725781916101, + "language_loss": 0.83515728, + "learning_rate": 3.895422090670421e-06, + "loss": 0.85694563, + "num_input_tokens_seen": 46786865, + "step": 2167, + "time_per_iteration": 2.589409589767456 + }, + { + "auxiliary_loss_clip": 0.01067149, + "auxiliary_loss_mlp": 0.01060212, + "balance_loss_clip": 1.03272188, + "balance_loss_mlp": 1.03918362, + "epoch": 0.13034721178415754, + "flos": 21251468453760.0, + "grad_norm": 1.598240773053832, + "language_loss": 0.83114886, + "learning_rate": 3.89529776593877e-06, + "loss": 0.85242248, + "num_input_tokens_seen": 46807030, + "step": 2168, + "time_per_iteration": 2.7244696617126465 + }, + { + "auxiliary_loss_clip": 0.01037237, + "auxiliary_loss_mlp": 0.01058444, + "balance_loss_clip": 1.0304848, + "balance_loss_mlp": 1.03650928, + "epoch": 0.1304073350368255, + "flos": 18767239931520.0, + "grad_norm": 1.8342132379767189, + "language_loss": 0.80140549, + "learning_rate": 3.8951733693372375e-06, + "loss": 0.8223623, + "num_input_tokens_seen": 46826280, + "step": 2169, + "time_per_iteration": 2.7309393882751465 + }, + { + "auxiliary_loss_clip": 0.01132907, + "auxiliary_loss_mlp": 0.01044935, + "balance_loss_clip": 1.04158235, + "balance_loss_mlp": 1.02369189, + "epoch": 0.13046745828949347, + "flos": 28364653647360.0, + "grad_norm": 2.214670521489935, + "language_loss": 0.66442537, + "learning_rate": 3.8950489008705406e-06, + "loss": 0.68620384, + "num_input_tokens_seen": 46846505, + "step": 2170, + "time_per_iteration": 2.6298348903656006 + }, + { + "auxiliary_loss_clip": 0.01108848, + "auxiliary_loss_mlp": 0.01048814, + "balance_loss_clip": 1.04112303, + "balance_loss_mlp": 1.0278213, + "epoch": 0.13052758154216143, + "flos": 29605044055680.0, + "grad_norm": 2.0127346323454876, + "language_loss": 0.67121184, + "learning_rate": 3.8949243605434e-06, + "loss": 0.69278842, + "num_input_tokens_seen": 46867380, + "step": 2171, + "time_per_iteration": 2.767422676086426 + }, + { + "auxiliary_loss_clip": 0.0112124, + "auxiliary_loss_mlp": 0.01050728, + "balance_loss_clip": 1.03936124, + "balance_loss_mlp": 1.02868652, + "epoch": 0.1305877047948294, + "flos": 19390864884480.0, + "grad_norm": 3.077797037340465, + "language_loss": 0.72054458, + "learning_rate": 3.894799748360537e-06, + "loss": 0.74226427, + "num_input_tokens_seen": 46886810, + "step": 2172, + "time_per_iteration": 2.590054750442505 + }, + { + "auxiliary_loss_clip": 0.01089652, + "auxiliary_loss_mlp": 0.01043027, + "balance_loss_clip": 1.03987145, + "balance_loss_mlp": 1.02358449, + "epoch": 0.13064782804749736, + "flos": 16873527000960.0, + "grad_norm": 1.7421077314776237, + "language_loss": 0.75649035, + "learning_rate": 3.894675064326678e-06, + "loss": 0.77781719, + "num_input_tokens_seen": 46905620, + "step": 2173, + "time_per_iteration": 2.759568929672241 + }, + { + "auxiliary_loss_clip": 0.01101797, + "auxiliary_loss_mlp": 0.01057997, + "balance_loss_clip": 1.04316354, + "balance_loss_mlp": 1.03583598, + "epoch": 0.13070795130016533, + "flos": 24499085748480.0, + "grad_norm": 3.8453322728681933, + "language_loss": 0.71091771, + "learning_rate": 3.894550308446551e-06, + "loss": 0.73251563, + "num_input_tokens_seen": 46925120, + "step": 2174, + "time_per_iteration": 2.7292821407318115 + }, + { + "auxiliary_loss_clip": 0.01013371, + "auxiliary_loss_mlp": 0.01007186, + "balance_loss_clip": 1.0057534, + "balance_loss_mlp": 1.00360954, + "epoch": 0.13076807455283332, + "flos": 71054505953280.0, + "grad_norm": 0.8044094602605089, + "language_loss": 0.59045124, + "learning_rate": 3.894425480724886e-06, + "loss": 0.61065674, + "num_input_tokens_seen": 46988195, + "step": 2175, + "time_per_iteration": 3.331988573074341 + }, + { + "auxiliary_loss_clip": 0.01120699, + "auxiliary_loss_mlp": 0.0104911, + "balance_loss_clip": 1.0399667, + "balance_loss_mlp": 1.02948856, + "epoch": 0.13082819780550128, + "flos": 20264499475200.0, + "grad_norm": 3.5902772575345674, + "language_loss": 0.80056036, + "learning_rate": 3.894300581166417e-06, + "loss": 0.82225847, + "num_input_tokens_seen": 47004720, + "step": 2176, + "time_per_iteration": 2.6318519115448 + }, + { + "auxiliary_loss_clip": 0.01132018, + "auxiliary_loss_mlp": 0.01055141, + "balance_loss_clip": 1.04030192, + "balance_loss_mlp": 1.03324246, + "epoch": 0.13088832105816925, + "flos": 34203441231360.0, + "grad_norm": 2.231598355983225, + "language_loss": 0.745561, + "learning_rate": 3.894175609775881e-06, + "loss": 0.76743263, + "num_input_tokens_seen": 47024255, + "step": 2177, + "time_per_iteration": 2.7628185749053955 + }, + { + "auxiliary_loss_clip": 0.01094809, + "auxiliary_loss_mlp": 0.01050868, + "balance_loss_clip": 1.03736854, + "balance_loss_mlp": 1.02851653, + "epoch": 0.13094844431083721, + "flos": 17894970057600.0, + "grad_norm": 3.624795943468729, + "language_loss": 0.82248968, + "learning_rate": 3.894050566558015e-06, + "loss": 0.84394646, + "num_input_tokens_seen": 47042465, + "step": 2178, + "time_per_iteration": 2.800412178039551 + }, + { + "auxiliary_loss_clip": 0.01132041, + "auxiliary_loss_mlp": 0.0104789, + "balance_loss_clip": 1.04231334, + "balance_loss_mlp": 1.02700412, + "epoch": 0.13100856756350518, + "flos": 17311313963520.0, + "grad_norm": 2.683373827366529, + "language_loss": 0.74728835, + "learning_rate": 3.893925451517562e-06, + "loss": 0.76908767, + "num_input_tokens_seen": 47060370, + "step": 2179, + "time_per_iteration": 2.64490008354187 + }, + { + "auxiliary_loss_clip": 0.01093196, + "auxiliary_loss_mlp": 0.01049598, + "balance_loss_clip": 1.03711772, + "balance_loss_mlp": 1.02890313, + "epoch": 0.13106869081617314, + "flos": 22200551562240.0, + "grad_norm": 2.1386126565310843, + "language_loss": 0.8497076, + "learning_rate": 3.893800264659266e-06, + "loss": 0.87113553, + "num_input_tokens_seen": 47081415, + "step": 2180, + "time_per_iteration": 2.725971221923828 + }, + { + "auxiliary_loss_clip": 0.01122697, + "auxiliary_loss_mlp": 0.01056523, + "balance_loss_clip": 1.04239392, + "balance_loss_mlp": 1.03625715, + "epoch": 0.13112881406884114, + "flos": 21763123735680.0, + "grad_norm": 1.788712278474314, + "language_loss": 0.9004246, + "learning_rate": 3.8936750059878746e-06, + "loss": 0.92221683, + "num_input_tokens_seen": 47099860, + "step": 2181, + "time_per_iteration": 2.6515750885009766 + }, + { + "auxiliary_loss_clip": 0.011155, + "auxiliary_loss_mlp": 0.01054645, + "balance_loss_clip": 1.03935742, + "balance_loss_mlp": 1.03366387, + "epoch": 0.1311889373215091, + "flos": 23331091201920.0, + "grad_norm": 1.8751764669942579, + "language_loss": 0.68682504, + "learning_rate": 3.893549675508137e-06, + "loss": 0.70852649, + "num_input_tokens_seen": 47118540, + "step": 2182, + "time_per_iteration": 2.6796438694000244 + }, + { + "auxiliary_loss_clip": 0.01081462, + "auxiliary_loss_mlp": 0.01056123, + "balance_loss_clip": 1.03556931, + "balance_loss_mlp": 1.03481996, + "epoch": 0.13124906057417707, + "flos": 21467363149440.0, + "grad_norm": 2.1597246360520215, + "language_loss": 0.78476501, + "learning_rate": 3.893424273224806e-06, + "loss": 0.80614084, + "num_input_tokens_seen": 47136710, + "step": 2183, + "time_per_iteration": 2.663121223449707 + }, + { + "auxiliary_loss_clip": 0.01128778, + "auxiliary_loss_mlp": 0.01047353, + "balance_loss_clip": 1.03941822, + "balance_loss_mlp": 1.02699184, + "epoch": 0.13130918382684503, + "flos": 23255319461760.0, + "grad_norm": 1.7525184061457861, + "language_loss": 0.86096656, + "learning_rate": 3.893298799142636e-06, + "loss": 0.88272786, + "num_input_tokens_seen": 47157155, + "step": 2184, + "time_per_iteration": 2.5194590091705322 + }, + { + "auxiliary_loss_clip": 0.01099895, + "auxiliary_loss_mlp": 0.01052082, + "balance_loss_clip": 1.03844881, + "balance_loss_mlp": 1.03081548, + "epoch": 0.131369307079513, + "flos": 20850274471680.0, + "grad_norm": 1.9904711638120127, + "language_loss": 0.82536983, + "learning_rate": 3.893173253266387e-06, + "loss": 0.84688962, + "num_input_tokens_seen": 47176820, + "step": 2185, + "time_per_iteration": 2.6289303302764893 + }, + { + "auxiliary_loss_clip": 0.01106906, + "auxiliary_loss_mlp": 0.01050735, + "balance_loss_clip": 1.03785467, + "balance_loss_mlp": 1.02944434, + "epoch": 0.13142943033218096, + "flos": 17858341163520.0, + "grad_norm": 2.139242095673954, + "language_loss": 0.72826827, + "learning_rate": 3.893047635600818e-06, + "loss": 0.74984467, + "num_input_tokens_seen": 47195855, + "step": 2186, + "time_per_iteration": 2.6018829345703125 + }, + { + "auxiliary_loss_clip": 0.01118144, + "auxiliary_loss_mlp": 0.01047003, + "balance_loss_clip": 1.03893447, + "balance_loss_mlp": 1.02496099, + "epoch": 0.13148955358484893, + "flos": 20996035862400.0, + "grad_norm": 2.1086945472560155, + "language_loss": 0.80162966, + "learning_rate": 3.892921946150693e-06, + "loss": 0.82328111, + "num_input_tokens_seen": 47214535, + "step": 2187, + "time_per_iteration": 2.5756006240844727 + }, + { + "auxiliary_loss_clip": 0.00998021, + "auxiliary_loss_mlp": 0.01019875, + "balance_loss_clip": 1.01085234, + "balance_loss_mlp": 1.01693082, + "epoch": 0.13154967683751692, + "flos": 70172467580160.0, + "grad_norm": 0.8447834476332876, + "language_loss": 0.59046346, + "learning_rate": 3.892796184920778e-06, + "loss": 0.61064243, + "num_input_tokens_seen": 47270300, + "step": 2188, + "time_per_iteration": 3.2770378589630127 + }, + { + "auxiliary_loss_clip": 0.01051061, + "auxiliary_loss_mlp": 0.01052439, + "balance_loss_clip": 1.03586853, + "balance_loss_mlp": 1.03169692, + "epoch": 0.1316098000901849, + "flos": 20376145923840.0, + "grad_norm": 1.9729743174997318, + "language_loss": 0.74280208, + "learning_rate": 3.892670351915842e-06, + "loss": 0.7638371, + "num_input_tokens_seen": 47290720, + "step": 2189, + "time_per_iteration": 2.7862634658813477 + }, + { + "auxiliary_loss_clip": 0.01116463, + "auxiliary_loss_mlp": 0.01045559, + "balance_loss_clip": 1.03930163, + "balance_loss_mlp": 1.02561498, + "epoch": 0.13166992334285285, + "flos": 23221132692480.0, + "grad_norm": 4.353817284419629, + "language_loss": 0.72765136, + "learning_rate": 3.892544447140657e-06, + "loss": 0.74927151, + "num_input_tokens_seen": 47311820, + "step": 2190, + "time_per_iteration": 2.7206637859344482 + }, + { + "auxiliary_loss_clip": 0.01122567, + "auxiliary_loss_mlp": 0.01053464, + "balance_loss_clip": 1.0439167, + "balance_loss_mlp": 1.03363991, + "epoch": 0.13173004659552082, + "flos": 23330947547520.0, + "grad_norm": 1.8924532118191693, + "language_loss": 0.74487489, + "learning_rate": 3.892418470599996e-06, + "loss": 0.76663524, + "num_input_tokens_seen": 47331605, + "step": 2191, + "time_per_iteration": 2.666151523590088 + }, + { + "auxiliary_loss_clip": 0.01092367, + "auxiliary_loss_mlp": 0.01051266, + "balance_loss_clip": 1.04082465, + "balance_loss_mlp": 1.02948642, + "epoch": 0.13179016984818878, + "flos": 21251504367360.0, + "grad_norm": 2.0840982444484903, + "language_loss": 0.79519701, + "learning_rate": 3.892292422298637e-06, + "loss": 0.81663334, + "num_input_tokens_seen": 47350455, + "step": 2192, + "time_per_iteration": 2.6791515350341797 + }, + { + "auxiliary_loss_clip": 0.01070931, + "auxiliary_loss_mlp": 0.0105195, + "balance_loss_clip": 1.03307259, + "balance_loss_mlp": 1.03182709, + "epoch": 0.13185029310085675, + "flos": 17778690754560.0, + "grad_norm": 1.9280501224473765, + "language_loss": 0.85342544, + "learning_rate": 3.892166302241361e-06, + "loss": 0.87465423, + "num_input_tokens_seen": 47368225, + "step": 2193, + "time_per_iteration": 2.713296413421631 + }, + { + "auxiliary_loss_clip": 0.01021882, + "auxiliary_loss_mlp": 0.01006912, + "balance_loss_clip": 1.01659679, + "balance_loss_mlp": 1.00381267, + "epoch": 0.1319104163535247, + "flos": 69851785933440.0, + "grad_norm": 0.7579653598123862, + "language_loss": 0.54081917, + "learning_rate": 3.8920401104329475e-06, + "loss": 0.5611071, + "num_input_tokens_seen": 47427125, + "step": 2194, + "time_per_iteration": 3.1732046604156494 + }, + { + "auxiliary_loss_clip": 0.011282, + "auxiliary_loss_mlp": 0.01049608, + "balance_loss_clip": 1.03934836, + "balance_loss_mlp": 1.0296998, + "epoch": 0.1319705396061927, + "flos": 25193095401600.0, + "grad_norm": 1.7778021944663511, + "language_loss": 0.7222333, + "learning_rate": 3.891913846878185e-06, + "loss": 0.7440114, + "num_input_tokens_seen": 47450275, + "step": 2195, + "time_per_iteration": 2.7312943935394287 + }, + { + "auxiliary_loss_clip": 0.01099796, + "auxiliary_loss_mlp": 0.0074931, + "balance_loss_clip": 1.03819942, + "balance_loss_mlp": 1.00067437, + "epoch": 0.13203066285886067, + "flos": 20740459616640.0, + "grad_norm": 3.536142055432574, + "language_loss": 0.77841163, + "learning_rate": 3.891787511581859e-06, + "loss": 0.79690272, + "num_input_tokens_seen": 47469155, + "step": 2196, + "time_per_iteration": 2.7818524837493896 + }, + { + "auxiliary_loss_clip": 0.01121907, + "auxiliary_loss_mlp": 0.01055035, + "balance_loss_clip": 1.038306, + "balance_loss_mlp": 1.03509164, + "epoch": 0.13209078611152864, + "flos": 22054395121920.0, + "grad_norm": 1.9769463180348033, + "language_loss": 0.7508496, + "learning_rate": 3.89166110454876e-06, + "loss": 0.77261901, + "num_input_tokens_seen": 47488405, + "step": 2197, + "time_per_iteration": 2.7421326637268066 + }, + { + "auxiliary_loss_clip": 0.01132402, + "auxiliary_loss_mlp": 0.01049488, + "balance_loss_clip": 1.03982842, + "balance_loss_mlp": 1.02928209, + "epoch": 0.1321509093641966, + "flos": 16284950743680.0, + "grad_norm": 1.9331305552610885, + "language_loss": 0.79794484, + "learning_rate": 3.891534625783685e-06, + "loss": 0.81976372, + "num_input_tokens_seen": 47505650, + "step": 2198, + "time_per_iteration": 2.8318421840667725 + }, + { + "auxiliary_loss_clip": 0.01129586, + "auxiliary_loss_mlp": 0.01057382, + "balance_loss_clip": 1.04078639, + "balance_loss_mlp": 1.03781962, + "epoch": 0.13221103261686457, + "flos": 16983018633600.0, + "grad_norm": 2.825142742877047, + "language_loss": 0.82728535, + "learning_rate": 3.891408075291425e-06, + "loss": 0.84915501, + "num_input_tokens_seen": 47521540, + "step": 2199, + "time_per_iteration": 2.785219430923462 + }, + { + "auxiliary_loss_clip": 0.01084855, + "auxiliary_loss_mlp": 0.01053191, + "balance_loss_clip": 1.03817189, + "balance_loss_mlp": 1.03257918, + "epoch": 0.13227115586953253, + "flos": 34233605677440.0, + "grad_norm": 1.6997729857459007, + "language_loss": 0.69482136, + "learning_rate": 3.8912814530767826e-06, + "loss": 0.71620178, + "num_input_tokens_seen": 47543625, + "step": 2200, + "time_per_iteration": 2.8861804008483887 + }, + { + "auxiliary_loss_clip": 0.01130572, + "auxiliary_loss_mlp": 0.01053639, + "balance_loss_clip": 1.04099655, + "balance_loss_mlp": 1.03324223, + "epoch": 0.13233127912220052, + "flos": 20704656735360.0, + "grad_norm": 1.848719284113498, + "language_loss": 0.84320378, + "learning_rate": 3.891154759144557e-06, + "loss": 0.86504591, + "num_input_tokens_seen": 47563740, + "step": 2201, + "time_per_iteration": 2.626117706298828 + }, + { + "auxiliary_loss_clip": 0.01132974, + "auxiliary_loss_mlp": 0.01056605, + "balance_loss_clip": 1.04158711, + "balance_loss_mlp": 1.03617215, + "epoch": 0.1323914023748685, + "flos": 25805048434560.0, + "grad_norm": 2.01347640226095, + "language_loss": 0.86798227, + "learning_rate": 3.891027993499554e-06, + "loss": 0.88987803, + "num_input_tokens_seen": 47582655, + "step": 2202, + "time_per_iteration": 2.6782021522521973 + }, + { + "auxiliary_loss_clip": 0.01101382, + "auxiliary_loss_mlp": 0.0104988, + "balance_loss_clip": 1.03778493, + "balance_loss_mlp": 1.03002024, + "epoch": 0.13245152562753645, + "flos": 21251540280960.0, + "grad_norm": 2.297422773281766, + "language_loss": 0.729186, + "learning_rate": 3.89090115614658e-06, + "loss": 0.75069869, + "num_input_tokens_seen": 47600875, + "step": 2203, + "time_per_iteration": 2.866755485534668 + }, + { + "auxiliary_loss_clip": 0.01079865, + "auxiliary_loss_mlp": 0.01052609, + "balance_loss_clip": 1.03391683, + "balance_loss_mlp": 1.03315389, + "epoch": 0.13251164888020442, + "flos": 26610955931520.0, + "grad_norm": 2.235225337541576, + "language_loss": 0.73382211, + "learning_rate": 3.890774247090444e-06, + "loss": 0.75514686, + "num_input_tokens_seen": 47619250, + "step": 2204, + "time_per_iteration": 2.71286678314209 + }, + { + "auxiliary_loss_clip": 0.011221, + "auxiliary_loss_mlp": 0.01049753, + "balance_loss_clip": 1.04335487, + "balance_loss_mlp": 1.02873623, + "epoch": 0.13257177213287238, + "flos": 29826541272960.0, + "grad_norm": 1.65327041937397, + "language_loss": 0.78653765, + "learning_rate": 3.89064726633596e-06, + "loss": 0.80825621, + "num_input_tokens_seen": 47639445, + "step": 2205, + "time_per_iteration": 2.649826765060425 + }, + { + "auxiliary_loss_clip": 0.01087138, + "auxiliary_loss_mlp": 0.01048352, + "balance_loss_clip": 1.03722143, + "balance_loss_mlp": 1.02873063, + "epoch": 0.13263189538554035, + "flos": 21288456483840.0, + "grad_norm": 1.8436744916047771, + "language_loss": 0.78855598, + "learning_rate": 3.890520213887941e-06, + "loss": 0.80991089, + "num_input_tokens_seen": 47658740, + "step": 2206, + "time_per_iteration": 2.7777719497680664 + }, + { + "auxiliary_loss_clip": 0.01085144, + "auxiliary_loss_mlp": 0.01049594, + "balance_loss_clip": 1.03617859, + "balance_loss_mlp": 1.0307467, + "epoch": 0.13269201863820831, + "flos": 16874101618560.0, + "grad_norm": 1.8484758285527816, + "language_loss": 0.74217832, + "learning_rate": 3.890393089751208e-06, + "loss": 0.76352566, + "num_input_tokens_seen": 47676880, + "step": 2207, + "time_per_iteration": 2.6762924194335938 + }, + { + "auxiliary_loss_clip": 0.01104705, + "auxiliary_loss_mlp": 0.01051055, + "balance_loss_clip": 1.03733087, + "balance_loss_mlp": 1.03016973, + "epoch": 0.1327521418908763, + "flos": 23768914078080.0, + "grad_norm": 2.6859592027866372, + "language_loss": 0.83874607, + "learning_rate": 3.890265893930578e-06, + "loss": 0.8603037, + "num_input_tokens_seen": 47696635, + "step": 2208, + "time_per_iteration": 4.150737047195435 + }, + { + "auxiliary_loss_clip": 0.01111127, + "auxiliary_loss_mlp": 0.0105263, + "balance_loss_clip": 1.03988981, + "balance_loss_mlp": 1.03437877, + "epoch": 0.13281226514354427, + "flos": 26505594362880.0, + "grad_norm": 2.778119770029398, + "language_loss": 0.85249776, + "learning_rate": 3.890138626430876e-06, + "loss": 0.87413532, + "num_input_tokens_seen": 47717760, + "step": 2209, + "time_per_iteration": 2.6461126804351807 + }, + { + "auxiliary_loss_clip": 0.01089347, + "auxiliary_loss_mlp": 0.00749043, + "balance_loss_clip": 1.03432178, + "balance_loss_mlp": 1.00050497, + "epoch": 0.13287238839621224, + "flos": 24498762526080.0, + "grad_norm": 1.8789146584224559, + "language_loss": 0.82084084, + "learning_rate": 3.890011287256929e-06, + "loss": 0.8392247, + "num_input_tokens_seen": 47737685, + "step": 2210, + "time_per_iteration": 2.6513500213623047 + }, + { + "auxiliary_loss_clip": 0.0101032, + "auxiliary_loss_mlp": 0.0074875, + "balance_loss_clip": 1.01536942, + "balance_loss_mlp": 1.0008086, + "epoch": 0.1329325116488802, + "flos": 67694344369920.0, + "grad_norm": 0.7561704666939266, + "language_loss": 0.58005244, + "learning_rate": 3.889883876413563e-06, + "loss": 0.5976432, + "num_input_tokens_seen": 47802415, + "step": 2211, + "time_per_iteration": 3.357830286026001 + }, + { + "auxiliary_loss_clip": 0.01012744, + "auxiliary_loss_mlp": 0.01019896, + "balance_loss_clip": 1.00570345, + "balance_loss_mlp": 1.01710689, + "epoch": 0.13299263490154817, + "flos": 72261894741120.0, + "grad_norm": 0.8120383764259567, + "language_loss": 0.55331373, + "learning_rate": 3.889756393905611e-06, + "loss": 0.57364011, + "num_input_tokens_seen": 47871485, + "step": 2212, + "time_per_iteration": 3.267930507659912 + }, + { + "auxiliary_loss_clip": 0.01087768, + "auxiliary_loss_mlp": 0.0105179, + "balance_loss_clip": 1.03489733, + "balance_loss_mlp": 1.03105938, + "epoch": 0.13305275815421613, + "flos": 17931275729280.0, + "grad_norm": 2.3578009490630247, + "language_loss": 0.74574625, + "learning_rate": 3.889628839737908e-06, + "loss": 0.76714182, + "num_input_tokens_seen": 47888315, + "step": 2213, + "time_per_iteration": 5.796499967575073 + }, + { + "auxiliary_loss_clip": 0.010684, + "auxiliary_loss_mlp": 0.01044915, + "balance_loss_clip": 1.03137219, + "balance_loss_mlp": 1.025877, + "epoch": 0.13311288140688413, + "flos": 22340889999360.0, + "grad_norm": 1.7329152993518528, + "language_loss": 0.79186487, + "learning_rate": 3.889501213915291e-06, + "loss": 0.81299806, + "num_input_tokens_seen": 47906600, + "step": 2214, + "time_per_iteration": 2.7550902366638184 + }, + { + "auxiliary_loss_clip": 0.01097372, + "auxiliary_loss_mlp": 0.01051882, + "balance_loss_clip": 1.03674889, + "balance_loss_mlp": 1.03149724, + "epoch": 0.1331730046595521, + "flos": 31868888682240.0, + "grad_norm": 1.8015151042551456, + "language_loss": 0.68928838, + "learning_rate": 3.889373516442597e-06, + "loss": 0.71078086, + "num_input_tokens_seen": 47927630, + "step": 2215, + "time_per_iteration": 2.7122459411621094 + }, + { + "auxiliary_loss_clip": 0.01121991, + "auxiliary_loss_mlp": 0.01049181, + "balance_loss_clip": 1.04050851, + "balance_loss_mlp": 1.02936804, + "epoch": 0.13323312791222006, + "flos": 22566589107840.0, + "grad_norm": 2.158748936338003, + "language_loss": 0.81471896, + "learning_rate": 3.889245747324671e-06, + "loss": 0.83643067, + "num_input_tokens_seen": 47947935, + "step": 2216, + "time_per_iteration": 2.6373419761657715 + }, + { + "auxiliary_loss_clip": 0.01116494, + "auxiliary_loss_mlp": 0.01055313, + "balance_loss_clip": 1.03921151, + "balance_loss_mlp": 1.03452325, + "epoch": 0.13329325116488802, + "flos": 15085319293440.0, + "grad_norm": 2.1999513298823485, + "language_loss": 0.87118036, + "learning_rate": 3.889117906566356e-06, + "loss": 0.89289844, + "num_input_tokens_seen": 47965515, + "step": 2217, + "time_per_iteration": 2.542715072631836 + }, + { + "auxiliary_loss_clip": 0.01103901, + "auxiliary_loss_mlp": 0.01050762, + "balance_loss_clip": 1.03714991, + "balance_loss_mlp": 1.02938747, + "epoch": 0.133353374417556, + "flos": 27453671890560.0, + "grad_norm": 2.4551556116268793, + "language_loss": 0.73282444, + "learning_rate": 3.888989994172501e-06, + "loss": 0.75437105, + "num_input_tokens_seen": 47985675, + "step": 2218, + "time_per_iteration": 2.6663975715637207 + }, + { + "auxiliary_loss_clip": 0.01085851, + "auxiliary_loss_mlp": 0.01045999, + "balance_loss_clip": 1.03577626, + "balance_loss_mlp": 1.02548289, + "epoch": 0.13341349767022395, + "flos": 24094695456000.0, + "grad_norm": 1.9301122063118017, + "language_loss": 0.87316668, + "learning_rate": 3.8888620101479565e-06, + "loss": 0.89448524, + "num_input_tokens_seen": 48004985, + "step": 2219, + "time_per_iteration": 2.720529556274414 + }, + { + "auxiliary_loss_clip": 0.01094288, + "auxiliary_loss_mlp": 0.0105173, + "balance_loss_clip": 1.03690827, + "balance_loss_mlp": 1.03225112, + "epoch": 0.13347362092289192, + "flos": 24133335511680.0, + "grad_norm": 4.1817714327076985, + "language_loss": 0.77028835, + "learning_rate": 3.888733954497574e-06, + "loss": 0.79174852, + "num_input_tokens_seen": 48024965, + "step": 2220, + "time_per_iteration": 2.662120819091797 + }, + { + "auxiliary_loss_clip": 0.0110289, + "auxiliary_loss_mlp": 0.01047227, + "balance_loss_clip": 1.03536725, + "balance_loss_mlp": 1.02804637, + "epoch": 0.1335337441755599, + "flos": 18436538390400.0, + "grad_norm": 7.902099178879727, + "language_loss": 0.78756505, + "learning_rate": 3.888605827226212e-06, + "loss": 0.80906624, + "num_input_tokens_seen": 48040890, + "step": 2221, + "time_per_iteration": 2.682983160018921 + }, + { + "auxiliary_loss_clip": 0.01023822, + "auxiliary_loss_mlp": 0.0101127, + "balance_loss_clip": 1.00874472, + "balance_loss_mlp": 1.00843334, + "epoch": 0.13359386742822787, + "flos": 50611997652480.0, + "grad_norm": 0.9755744652893494, + "language_loss": 0.69006395, + "learning_rate": 3.8884776283387275e-06, + "loss": 0.71041489, + "num_input_tokens_seen": 48091855, + "step": 2222, + "time_per_iteration": 3.0308797359466553 + }, + { + "auxiliary_loss_clip": 0.01094349, + "auxiliary_loss_mlp": 0.01047603, + "balance_loss_clip": 1.04084849, + "balance_loss_mlp": 1.02837443, + "epoch": 0.13365399068089584, + "flos": 22778569221120.0, + "grad_norm": 2.093507331684533, + "language_loss": 0.67344493, + "learning_rate": 3.888349357839982e-06, + "loss": 0.69486439, + "num_input_tokens_seen": 48111350, + "step": 2223, + "time_per_iteration": 2.7923688888549805 + }, + { + "auxiliary_loss_clip": 0.01119318, + "auxiliary_loss_mlp": 0.01053025, + "balance_loss_clip": 1.03954148, + "balance_loss_mlp": 1.03180552, + "epoch": 0.1337141139335638, + "flos": 12531603911040.0, + "grad_norm": 2.2331220818471915, + "language_loss": 0.82664144, + "learning_rate": 3.88822101573484e-06, + "loss": 0.84836495, + "num_input_tokens_seen": 48129840, + "step": 2224, + "time_per_iteration": 2.5526678562164307 + }, + { + "auxiliary_loss_clip": 0.01134306, + "auxiliary_loss_mlp": 0.01047523, + "balance_loss_clip": 1.04109383, + "balance_loss_mlp": 1.02643538, + "epoch": 0.13377423718623177, + "flos": 23038957889280.0, + "grad_norm": 1.9283801817652093, + "language_loss": 0.65910393, + "learning_rate": 3.888092602028167e-06, + "loss": 0.68092215, + "num_input_tokens_seen": 48149240, + "step": 2225, + "time_per_iteration": 2.645509958267212 + }, + { + "auxiliary_loss_clip": 0.01112787, + "auxiliary_loss_mlp": 0.01054421, + "balance_loss_clip": 1.03755534, + "balance_loss_mlp": 1.03330898, + "epoch": 0.13383436043889974, + "flos": 16216397637120.0, + "grad_norm": 2.17444484480741, + "language_loss": 0.89480108, + "learning_rate": 3.887964116724835e-06, + "loss": 0.91647321, + "num_input_tokens_seen": 48166330, + "step": 2226, + "time_per_iteration": 2.6195781230926514 + }, + { + "auxiliary_loss_clip": 0.0110777, + "auxiliary_loss_mlp": 0.01053641, + "balance_loss_clip": 1.03758514, + "balance_loss_mlp": 1.03339946, + "epoch": 0.1338944836915677, + "flos": 24279671520000.0, + "grad_norm": 1.8383647200485016, + "language_loss": 0.73816264, + "learning_rate": 3.887835559829712e-06, + "loss": 0.75977677, + "num_input_tokens_seen": 48187600, + "step": 2227, + "time_per_iteration": 2.7525644302368164 + }, + { + "auxiliary_loss_clip": 0.01118391, + "auxiliary_loss_mlp": 0.01052384, + "balance_loss_clip": 1.03880596, + "balance_loss_mlp": 1.03149867, + "epoch": 0.1339546069442357, + "flos": 17598742594560.0, + "grad_norm": 2.1441400180775014, + "language_loss": 0.84696734, + "learning_rate": 3.8877069313476764e-06, + "loss": 0.86867505, + "num_input_tokens_seen": 48204400, + "step": 2228, + "time_per_iteration": 2.6075711250305176 + }, + { + "auxiliary_loss_clip": 0.01093838, + "auxiliary_loss_mlp": 0.01051722, + "balance_loss_clip": 1.03631473, + "balance_loss_mlp": 1.03080082, + "epoch": 0.13401473019690366, + "flos": 18990065952000.0, + "grad_norm": 1.7959335346649812, + "language_loss": 0.81170559, + "learning_rate": 3.8875782312836054e-06, + "loss": 0.83316118, + "num_input_tokens_seen": 48222180, + "step": 2229, + "time_per_iteration": 2.650719165802002 + }, + { + "auxiliary_loss_clip": 0.01069797, + "auxiliary_loss_mlp": 0.01061672, + "balance_loss_clip": 1.03533638, + "balance_loss_mlp": 1.04090583, + "epoch": 0.13407485344957162, + "flos": 26943812288640.0, + "grad_norm": 1.989383580770474, + "language_loss": 0.73856789, + "learning_rate": 3.887449459642378e-06, + "loss": 0.75988257, + "num_input_tokens_seen": 48243245, + "step": 2230, + "time_per_iteration": 2.7091333866119385 + }, + { + "auxiliary_loss_clip": 0.01094113, + "auxiliary_loss_mlp": 0.01061513, + "balance_loss_clip": 1.03909636, + "balance_loss_mlp": 1.04086614, + "epoch": 0.1341349767022396, + "flos": 20339373375360.0, + "grad_norm": 1.8252480344652298, + "language_loss": 0.80070961, + "learning_rate": 3.8873206164288785e-06, + "loss": 0.8222658, + "num_input_tokens_seen": 48262600, + "step": 2231, + "time_per_iteration": 2.6635866165161133 + }, + { + "auxiliary_loss_clip": 0.01072633, + "auxiliary_loss_mlp": 0.01060857, + "balance_loss_clip": 1.03647804, + "balance_loss_mlp": 1.03802872, + "epoch": 0.13419509995490755, + "flos": 29862020931840.0, + "grad_norm": 1.610576842458878, + "language_loss": 0.7205373, + "learning_rate": 3.887191701647992e-06, + "loss": 0.74187219, + "num_input_tokens_seen": 48285075, + "step": 2232, + "time_per_iteration": 2.7680118083953857 + }, + { + "auxiliary_loss_clip": 0.01083326, + "auxiliary_loss_mlp": 0.01055223, + "balance_loss_clip": 1.03622377, + "balance_loss_mlp": 1.03291869, + "epoch": 0.13425522320757552, + "flos": 26942986275840.0, + "grad_norm": 2.520848264541714, + "language_loss": 0.65791589, + "learning_rate": 3.8870627153046066e-06, + "loss": 0.67930138, + "num_input_tokens_seen": 48301285, + "step": 2233, + "time_per_iteration": 2.718388319015503 + }, + { + "auxiliary_loss_clip": 0.01130308, + "auxiliary_loss_mlp": 0.01048486, + "balance_loss_clip": 1.03772998, + "balance_loss_mlp": 1.02701604, + "epoch": 0.1343153464602435, + "flos": 15777281871360.0, + "grad_norm": 2.9436367294650685, + "language_loss": 0.81587255, + "learning_rate": 3.886933657403615e-06, + "loss": 0.83766055, + "num_input_tokens_seen": 48317835, + "step": 2234, + "time_per_iteration": 2.5814907550811768 + }, + { + "auxiliary_loss_clip": 0.0110517, + "auxiliary_loss_mlp": 0.01058305, + "balance_loss_clip": 1.03826785, + "balance_loss_mlp": 1.03701448, + "epoch": 0.13437546971291148, + "flos": 24314756129280.0, + "grad_norm": 1.8981933725409075, + "language_loss": 0.8217786, + "learning_rate": 3.886804527949909e-06, + "loss": 0.84341335, + "num_input_tokens_seen": 48335670, + "step": 2235, + "time_per_iteration": 2.7858662605285645 + }, + { + "auxiliary_loss_clip": 0.01118258, + "auxiliary_loss_mlp": 0.01054748, + "balance_loss_clip": 1.03955328, + "balance_loss_mlp": 1.03191948, + "epoch": 0.13443559296557944, + "flos": 26650673395200.0, + "grad_norm": 1.914758685480515, + "language_loss": 0.86578357, + "learning_rate": 3.8866753269483864e-06, + "loss": 0.88751364, + "num_input_tokens_seen": 48357805, + "step": 2236, + "time_per_iteration": 2.6923182010650635 + }, + { + "auxiliary_loss_clip": 0.01132334, + "auxiliary_loss_mlp": 0.01047533, + "balance_loss_clip": 1.04141402, + "balance_loss_mlp": 1.02555096, + "epoch": 0.1344957162182474, + "flos": 21796197183360.0, + "grad_norm": 1.95968195703391, + "language_loss": 0.7730068, + "learning_rate": 3.886546054403946e-06, + "loss": 0.79480547, + "num_input_tokens_seen": 48377845, + "step": 2237, + "time_per_iteration": 2.659921169281006 + }, + { + "auxiliary_loss_clip": 0.0110922, + "auxiliary_loss_mlp": 0.01056755, + "balance_loss_clip": 1.03882098, + "balance_loss_mlp": 1.03360474, + "epoch": 0.13455583947091537, + "flos": 19865568049920.0, + "grad_norm": 2.02306814026929, + "language_loss": 0.78640062, + "learning_rate": 3.886416710321491e-06, + "loss": 0.80806035, + "num_input_tokens_seen": 48394735, + "step": 2238, + "time_per_iteration": 2.691607713699341 + }, + { + "auxiliary_loss_clip": 0.01107951, + "auxiliary_loss_mlp": 0.01053881, + "balance_loss_clip": 1.03908825, + "balance_loss_mlp": 1.03093362, + "epoch": 0.13461596272358334, + "flos": 30846835094400.0, + "grad_norm": 2.688194611084432, + "language_loss": 0.68341649, + "learning_rate": 3.886287294705924e-06, + "loss": 0.70503479, + "num_input_tokens_seen": 48414200, + "step": 2239, + "time_per_iteration": 2.770528793334961 + }, + { + "auxiliary_loss_clip": 0.01109317, + "auxiliary_loss_mlp": 0.01056375, + "balance_loss_clip": 1.03879797, + "balance_loss_mlp": 1.03504896, + "epoch": 0.1346760859762513, + "flos": 12494436312960.0, + "grad_norm": 2.255814981332963, + "language_loss": 0.81018567, + "learning_rate": 3.8861578075621555e-06, + "loss": 0.83184254, + "num_input_tokens_seen": 48431065, + "step": 2240, + "time_per_iteration": 2.6946136951446533 + }, + { + "auxiliary_loss_clip": 0.01071765, + "auxiliary_loss_mlp": 0.01051537, + "balance_loss_clip": 1.03470397, + "balance_loss_mlp": 1.02988863, + "epoch": 0.1347362092289193, + "flos": 21836022387840.0, + "grad_norm": 3.6332183639704034, + "language_loss": 0.78084123, + "learning_rate": 3.886028248895093e-06, + "loss": 0.80207425, + "num_input_tokens_seen": 48450335, + "step": 2241, + "time_per_iteration": 2.7706832885742188 + }, + { + "auxiliary_loss_clip": 0.01127466, + "auxiliary_loss_mlp": 0.01040406, + "balance_loss_clip": 1.0411731, + "balance_loss_mlp": 1.02155852, + "epoch": 0.13479633248158726, + "flos": 23509459163520.0, + "grad_norm": 1.7743160202298947, + "language_loss": 0.83239555, + "learning_rate": 3.88589861870965e-06, + "loss": 0.85407424, + "num_input_tokens_seen": 48468555, + "step": 2242, + "time_per_iteration": 2.5767881870269775 + }, + { + "auxiliary_loss_clip": 0.01132428, + "auxiliary_loss_mlp": 0.01057934, + "balance_loss_clip": 1.04125273, + "balance_loss_mlp": 1.03553474, + "epoch": 0.13485645573425523, + "flos": 29344332165120.0, + "grad_norm": 2.331790560257259, + "language_loss": 0.65091175, + "learning_rate": 3.885768917010744e-06, + "loss": 0.67281538, + "num_input_tokens_seen": 48488515, + "step": 2243, + "time_per_iteration": 2.678446054458618 + }, + { + "auxiliary_loss_clip": 0.01088535, + "auxiliary_loss_mlp": 0.01046572, + "balance_loss_clip": 1.03364909, + "balance_loss_mlp": 1.02495956, + "epoch": 0.1349165789869232, + "flos": 28037112503040.0, + "grad_norm": 1.377229601307913, + "language_loss": 0.72470731, + "learning_rate": 3.8856391438032895e-06, + "loss": 0.74605834, + "num_input_tokens_seen": 48510515, + "step": 2244, + "time_per_iteration": 2.808137893676758 + }, + { + "auxiliary_loss_clip": 0.01118533, + "auxiliary_loss_mlp": 0.01051756, + "balance_loss_clip": 1.04026651, + "balance_loss_mlp": 1.03212237, + "epoch": 0.13497670223959116, + "flos": 22853730430080.0, + "grad_norm": 1.5200732021140835, + "language_loss": 0.86202347, + "learning_rate": 3.88550929909221e-06, + "loss": 0.88372636, + "num_input_tokens_seen": 48529940, + "step": 2245, + "time_per_iteration": 2.633540153503418 + }, + { + "auxiliary_loss_clip": 0.01113601, + "auxiliary_loss_mlp": 0.0104993, + "balance_loss_clip": 1.03843975, + "balance_loss_mlp": 1.03021264, + "epoch": 0.13503682549225912, + "flos": 16504580453760.0, + "grad_norm": 1.8503721073068156, + "language_loss": 0.78848529, + "learning_rate": 3.88537938288243e-06, + "loss": 0.81012058, + "num_input_tokens_seen": 48548190, + "step": 2246, + "time_per_iteration": 2.5949087142944336 + }, + { + "auxiliary_loss_clip": 0.00986418, + "auxiliary_loss_mlp": 0.01027104, + "balance_loss_clip": 1.0122081, + "balance_loss_mlp": 1.02437389, + "epoch": 0.1350969487449271, + "flos": 70756303242240.0, + "grad_norm": 0.7648904298603032, + "language_loss": 0.6055367, + "learning_rate": 3.885249395178874e-06, + "loss": 0.62567186, + "num_input_tokens_seen": 48613165, + "step": 2247, + "time_per_iteration": 3.4792075157165527 + }, + { + "auxiliary_loss_clip": 0.01126057, + "auxiliary_loss_mlp": 0.01058962, + "balance_loss_clip": 1.04206085, + "balance_loss_mlp": 1.0355258, + "epoch": 0.13515707199759508, + "flos": 23075981832960.0, + "grad_norm": 2.7698085319268992, + "language_loss": 0.81056798, + "learning_rate": 3.885119335986473e-06, + "loss": 0.8324182, + "num_input_tokens_seen": 48631705, + "step": 2248, + "time_per_iteration": 2.597028970718384 + }, + { + "auxiliary_loss_clip": 0.01106007, + "auxiliary_loss_mlp": 0.01043065, + "balance_loss_clip": 1.03852534, + "balance_loss_mlp": 1.02364552, + "epoch": 0.13521719525026304, + "flos": 23186371305600.0, + "grad_norm": 1.7292222308455172, + "language_loss": 0.77443635, + "learning_rate": 3.884989205310157e-06, + "loss": 0.79592711, + "num_input_tokens_seen": 48649740, + "step": 2249, + "time_per_iteration": 2.6595380306243896 + }, + { + "auxiliary_loss_clip": 0.01094449, + "auxiliary_loss_mlp": 0.01056767, + "balance_loss_clip": 1.04054499, + "balance_loss_mlp": 1.03703809, + "epoch": 0.135277318502931, + "flos": 24790931752320.0, + "grad_norm": 1.4375802830127264, + "language_loss": 0.84575105, + "learning_rate": 3.884859003154862e-06, + "loss": 0.86726326, + "num_input_tokens_seen": 48671565, + "step": 2250, + "time_per_iteration": 2.724522113800049 + }, + { + "auxiliary_loss_clip": 0.01122872, + "auxiliary_loss_mlp": 0.01058126, + "balance_loss_clip": 1.04162478, + "balance_loss_mlp": 1.03553605, + "epoch": 0.13533744175559898, + "flos": 21908525990400.0, + "grad_norm": 1.9553035220439332, + "language_loss": 0.81705326, + "learning_rate": 3.884728729525524e-06, + "loss": 0.83886325, + "num_input_tokens_seen": 48690425, + "step": 2251, + "time_per_iteration": 2.625663995742798 + }, + { + "auxiliary_loss_clip": 0.01129618, + "auxiliary_loss_mlp": 0.01057886, + "balance_loss_clip": 1.03912699, + "balance_loss_mlp": 1.03533149, + "epoch": 0.13539756500826694, + "flos": 21211643249280.0, + "grad_norm": 13.187037289709387, + "language_loss": 0.86077201, + "learning_rate": 3.884598384427084e-06, + "loss": 0.88264704, + "num_input_tokens_seen": 48707505, + "step": 2252, + "time_per_iteration": 2.598798990249634 + }, + { + "auxiliary_loss_clip": 0.01021547, + "auxiliary_loss_mlp": 0.01023461, + "balance_loss_clip": 1.00669384, + "balance_loss_mlp": 1.02100492, + "epoch": 0.1354576882609349, + "flos": 63242103634560.0, + "grad_norm": 0.7590411927916831, + "language_loss": 0.61753583, + "learning_rate": 3.884467967864485e-06, + "loss": 0.63798583, + "num_input_tokens_seen": 48775895, + "step": 2253, + "time_per_iteration": 3.340942621231079 + }, + { + "auxiliary_loss_clip": 0.01120148, + "auxiliary_loss_mlp": 0.01058478, + "balance_loss_clip": 1.04110503, + "balance_loss_mlp": 1.03874922, + "epoch": 0.1355178115136029, + "flos": 25483037984640.0, + "grad_norm": 1.6568061009038095, + "language_loss": 0.89202869, + "learning_rate": 3.884337479842671e-06, + "loss": 0.91381496, + "num_input_tokens_seen": 48798370, + "step": 2254, + "time_per_iteration": 2.682813882827759 + }, + { + "auxiliary_loss_clip": 0.01090254, + "auxiliary_loss_mlp": 0.01058762, + "balance_loss_clip": 1.03365791, + "balance_loss_mlp": 1.0342648, + "epoch": 0.13557793476627086, + "flos": 21616967295360.0, + "grad_norm": 3.1740571026557034, + "language_loss": 0.83846229, + "learning_rate": 3.884206920366591e-06, + "loss": 0.85995245, + "num_input_tokens_seen": 48817955, + "step": 2255, + "time_per_iteration": 4.227473020553589 + }, + { + "auxiliary_loss_clip": 0.01128926, + "auxiliary_loss_mlp": 0.01052736, + "balance_loss_clip": 1.03924656, + "balance_loss_mlp": 1.03118336, + "epoch": 0.13563805801893883, + "flos": 24928253447040.0, + "grad_norm": 2.2882295320983776, + "language_loss": 0.74681044, + "learning_rate": 3.884076289441196e-06, + "loss": 0.76862705, + "num_input_tokens_seen": 48836330, + "step": 2256, + "time_per_iteration": 2.6574857234954834 + }, + { + "auxiliary_loss_clip": 0.0109495, + "auxiliary_loss_mlp": 0.01054986, + "balance_loss_clip": 1.03760517, + "balance_loss_mlp": 1.0334568, + "epoch": 0.1356981812716068, + "flos": 14750272206720.0, + "grad_norm": 4.346698355490639, + "language_loss": 0.83136487, + "learning_rate": 3.88394558707144e-06, + "loss": 0.85286427, + "num_input_tokens_seen": 48851890, + "step": 2257, + "time_per_iteration": 2.6910364627838135 + }, + { + "auxiliary_loss_clip": 0.01115392, + "auxiliary_loss_mlp": 0.00749211, + "balance_loss_clip": 1.03956628, + "balance_loss_mlp": 1.00057507, + "epoch": 0.13575830452427476, + "flos": 11108571822720.0, + "grad_norm": 2.1156620003552327, + "language_loss": 0.81823534, + "learning_rate": 3.883814813262277e-06, + "loss": 0.83688134, + "num_input_tokens_seen": 48865510, + "step": 2258, + "time_per_iteration": 2.7229394912719727 + }, + { + "auxiliary_loss_clip": 0.0112034, + "auxiliary_loss_mlp": 0.01052441, + "balance_loss_clip": 1.03940749, + "balance_loss_mlp": 1.02937412, + "epoch": 0.13581842777694272, + "flos": 17960290940160.0, + "grad_norm": 2.3293097149334074, + "language_loss": 0.8231408, + "learning_rate": 3.883683968018669e-06, + "loss": 0.84486866, + "num_input_tokens_seen": 48882360, + "step": 2259, + "time_per_iteration": 2.613722801208496 + }, + { + "auxiliary_loss_clip": 0.01091289, + "auxiliary_loss_mlp": 0.01061354, + "balance_loss_clip": 1.03889179, + "balance_loss_mlp": 1.04181516, + "epoch": 0.1358785510296107, + "flos": 22857142222080.0, + "grad_norm": 1.8561053011428745, + "language_loss": 0.73175836, + "learning_rate": 3.8835530513455755e-06, + "loss": 0.75328481, + "num_input_tokens_seen": 48902700, + "step": 2260, + "time_per_iteration": 6.146433353424072 + }, + { + "auxiliary_loss_clip": 0.0110861, + "auxiliary_loss_mlp": 0.01063701, + "balance_loss_clip": 1.04009366, + "balance_loss_mlp": 1.04303014, + "epoch": 0.13593867428227868, + "flos": 25739404329600.0, + "grad_norm": 2.842597304805878, + "language_loss": 0.75325203, + "learning_rate": 3.883422063247961e-06, + "loss": 0.77497518, + "num_input_tokens_seen": 48922525, + "step": 2261, + "time_per_iteration": 4.324607849121094 + }, + { + "auxiliary_loss_clip": 0.01130572, + "auxiliary_loss_mlp": 0.01052706, + "balance_loss_clip": 1.03932881, + "balance_loss_mlp": 1.03232169, + "epoch": 0.13599879753494665, + "flos": 31249214225280.0, + "grad_norm": 2.1373997756882974, + "language_loss": 0.63294852, + "learning_rate": 3.883291003730794e-06, + "loss": 0.65478134, + "num_input_tokens_seen": 48942510, + "step": 2262, + "time_per_iteration": 2.6620185375213623 + }, + { + "auxiliary_loss_clip": 0.01112047, + "auxiliary_loss_mlp": 0.01050236, + "balance_loss_clip": 1.04043901, + "balance_loss_mlp": 1.03068566, + "epoch": 0.1360589207876146, + "flos": 23915034604800.0, + "grad_norm": 3.0895199705454566, + "language_loss": 0.82073343, + "learning_rate": 3.883159872799043e-06, + "loss": 0.8423562, + "num_input_tokens_seen": 48962625, + "step": 2263, + "time_per_iteration": 2.6701409816741943 + }, + { + "auxiliary_loss_clip": 0.01075216, + "auxiliary_loss_mlp": 0.01064236, + "balance_loss_clip": 1.04117322, + "balance_loss_mlp": 1.04100227, + "epoch": 0.13611904404028258, + "flos": 19974197756160.0, + "grad_norm": 1.8811315942410023, + "language_loss": 0.88208318, + "learning_rate": 3.8830286704576815e-06, + "loss": 0.90347767, + "num_input_tokens_seen": 48982525, + "step": 2264, + "time_per_iteration": 2.797001838684082 + }, + { + "auxiliary_loss_clip": 0.01123753, + "auxiliary_loss_mlp": 0.01057568, + "balance_loss_clip": 1.04152226, + "balance_loss_mlp": 1.03534722, + "epoch": 0.13617916729295054, + "flos": 15340644144000.0, + "grad_norm": 4.666843892996156, + "language_loss": 0.7230289, + "learning_rate": 3.882897396711683e-06, + "loss": 0.74484205, + "num_input_tokens_seen": 48997605, + "step": 2265, + "time_per_iteration": 2.571197032928467 + }, + { + "auxiliary_loss_clip": 0.01085202, + "auxiliary_loss_mlp": 0.01054286, + "balance_loss_clip": 1.04292727, + "balance_loss_mlp": 1.03384137, + "epoch": 0.1362392905456185, + "flos": 27451445247360.0, + "grad_norm": 2.2758588899107144, + "language_loss": 0.66729909, + "learning_rate": 3.882766051566027e-06, + "loss": 0.68869394, + "num_input_tokens_seen": 49018535, + "step": 2266, + "time_per_iteration": 2.775515079498291 + }, + { + "auxiliary_loss_clip": 0.01100805, + "auxiliary_loss_mlp": 0.01065485, + "balance_loss_clip": 1.05009127, + "balance_loss_mlp": 1.04550529, + "epoch": 0.1362994137982865, + "flos": 25009017177600.0, + "grad_norm": 1.86189580608015, + "language_loss": 0.764184, + "learning_rate": 3.882634635025694e-06, + "loss": 0.78584695, + "num_input_tokens_seen": 49038865, + "step": 2267, + "time_per_iteration": 2.8225221633911133 + }, + { + "auxiliary_loss_clip": 0.01093623, + "auxiliary_loss_mlp": 0.01051956, + "balance_loss_clip": 1.03531075, + "balance_loss_mlp": 1.03130913, + "epoch": 0.13635953705095447, + "flos": 20303031790080.0, + "grad_norm": 2.1505464446028384, + "language_loss": 0.82038021, + "learning_rate": 3.882503147095667e-06, + "loss": 0.84183604, + "num_input_tokens_seen": 49058010, + "step": 2268, + "time_per_iteration": 2.660654067993164 + }, + { + "auxiliary_loss_clip": 0.01116957, + "auxiliary_loss_mlp": 0.01047231, + "balance_loss_clip": 1.0406152, + "balance_loss_mlp": 1.02692986, + "epoch": 0.13641966030362243, + "flos": 31358418549120.0, + "grad_norm": 1.6840248509560523, + "language_loss": 0.76137531, + "learning_rate": 3.882371587780931e-06, + "loss": 0.78301716, + "num_input_tokens_seen": 49080330, + "step": 2269, + "time_per_iteration": 2.6772522926330566 + }, + { + "auxiliary_loss_clip": 0.01096354, + "auxiliary_loss_mlp": 0.0104753, + "balance_loss_clip": 1.03909492, + "balance_loss_mlp": 1.02695501, + "epoch": 0.1364797835562904, + "flos": 20478095700480.0, + "grad_norm": 2.0228348483914265, + "language_loss": 0.81211585, + "learning_rate": 3.882239957086477e-06, + "loss": 0.83355474, + "num_input_tokens_seen": 49097035, + "step": 2270, + "time_per_iteration": 2.6886746883392334 + }, + { + "auxiliary_loss_clip": 0.01099242, + "auxiliary_loss_mlp": 0.01052571, + "balance_loss_clip": 1.03545153, + "balance_loss_mlp": 1.03131592, + "epoch": 0.13653990680895836, + "flos": 13078343802240.0, + "grad_norm": 3.6220331753516106, + "language_loss": 0.75364846, + "learning_rate": 3.882108255017295e-06, + "loss": 0.77516657, + "num_input_tokens_seen": 49113945, + "step": 2271, + "time_per_iteration": 2.5969772338867188 + }, + { + "auxiliary_loss_clip": 0.01115647, + "auxiliary_loss_mlp": 0.0105653, + "balance_loss_clip": 1.03700113, + "balance_loss_mlp": 1.03550124, + "epoch": 0.13660003006162633, + "flos": 16946712961920.0, + "grad_norm": 2.279762265572756, + "language_loss": 0.8029725, + "learning_rate": 3.881976481578379e-06, + "loss": 0.82469428, + "num_input_tokens_seen": 49132855, + "step": 2272, + "time_per_iteration": 2.600585460662842 + }, + { + "auxiliary_loss_clip": 0.01017524, + "auxiliary_loss_mlp": 0.01016519, + "balance_loss_clip": 1.00311232, + "balance_loss_mlp": 1.01400399, + "epoch": 0.1366601533142943, + "flos": 68682749892480.0, + "grad_norm": 0.7081870944696871, + "language_loss": 0.60645211, + "learning_rate": 3.8818446367747255e-06, + "loss": 0.62679255, + "num_input_tokens_seen": 49198310, + "step": 2273, + "time_per_iteration": 3.2407164573669434 + }, + { + "auxiliary_loss_clip": 0.01128183, + "auxiliary_loss_mlp": 0.00749086, + "balance_loss_clip": 1.04097176, + "balance_loss_mlp": 1.00046515, + "epoch": 0.13672027656696228, + "flos": 19244241567360.0, + "grad_norm": 1.7307630791022477, + "language_loss": 0.7787571, + "learning_rate": 3.881712720611336e-06, + "loss": 0.79752982, + "num_input_tokens_seen": 49217250, + "step": 2274, + "time_per_iteration": 2.6013059616088867 + }, + { + "auxiliary_loss_clip": 0.01112265, + "auxiliary_loss_mlp": 0.01048116, + "balance_loss_clip": 1.03828669, + "balance_loss_mlp": 1.02667022, + "epoch": 0.13678039981963025, + "flos": 24534924543360.0, + "grad_norm": 2.7090424433224682, + "language_loss": 0.78408897, + "learning_rate": 3.881580733093211e-06, + "loss": 0.80569273, + "num_input_tokens_seen": 49236615, + "step": 2275, + "time_per_iteration": 2.6329259872436523 + }, + { + "auxiliary_loss_clip": 0.01118779, + "auxiliary_loss_mlp": 0.01043047, + "balance_loss_clip": 1.04064298, + "balance_loss_mlp": 1.02352095, + "epoch": 0.13684052307229821, + "flos": 15669334523520.0, + "grad_norm": 2.2405372321552557, + "language_loss": 0.81104833, + "learning_rate": 3.881448674225356e-06, + "loss": 0.83266664, + "num_input_tokens_seen": 49253935, + "step": 2276, + "time_per_iteration": 2.649515151977539 + }, + { + "auxiliary_loss_clip": 0.01125885, + "auxiliary_loss_mlp": 0.01055977, + "balance_loss_clip": 1.04060054, + "balance_loss_mlp": 1.03263617, + "epoch": 0.13690064632496618, + "flos": 28364689560960.0, + "grad_norm": 2.3367797664149395, + "language_loss": 0.69013345, + "learning_rate": 3.881316544012779e-06, + "loss": 0.71195209, + "num_input_tokens_seen": 49273605, + "step": 2277, + "time_per_iteration": 2.70871901512146 + }, + { + "auxiliary_loss_clip": 0.0112267, + "auxiliary_loss_mlp": 0.00749198, + "balance_loss_clip": 1.04111099, + "balance_loss_mlp": 1.00063837, + "epoch": 0.13696076957763414, + "flos": 23404779953280.0, + "grad_norm": 1.9366139189792964, + "language_loss": 0.80053425, + "learning_rate": 3.88118434246049e-06, + "loss": 0.81925291, + "num_input_tokens_seen": 49291785, + "step": 2278, + "time_per_iteration": 2.7663471698760986 + }, + { + "auxiliary_loss_clip": 0.01125154, + "auxiliary_loss_mlp": 0.01052912, + "balance_loss_clip": 1.04943609, + "balance_loss_mlp": 1.03139472, + "epoch": 0.1370208928303021, + "flos": 37196595601920.0, + "grad_norm": 3.349466256719168, + "language_loss": 0.75205863, + "learning_rate": 3.881052069573502e-06, + "loss": 0.77383935, + "num_input_tokens_seen": 49311405, + "step": 2279, + "time_per_iteration": 2.920569896697998 + }, + { + "auxiliary_loss_clip": 0.01063546, + "auxiliary_loss_mlp": 0.01050208, + "balance_loss_clip": 1.03505337, + "balance_loss_mlp": 1.02971625, + "epoch": 0.13708101608297008, + "flos": 26976311118720.0, + "grad_norm": 2.2810298657256785, + "language_loss": 0.76786447, + "learning_rate": 3.880919725356831e-06, + "loss": 0.789002, + "num_input_tokens_seen": 49331835, + "step": 2280, + "time_per_iteration": 2.85040545463562 + }, + { + "auxiliary_loss_clip": 0.01070722, + "auxiliary_loss_mlp": 0.01045225, + "balance_loss_clip": 1.03312242, + "balance_loss_mlp": 1.02538824, + "epoch": 0.13714113933563807, + "flos": 32556864850560.0, + "grad_norm": 2.072826025161089, + "language_loss": 0.7970773, + "learning_rate": 3.880787309815496e-06, + "loss": 0.81823683, + "num_input_tokens_seen": 49352290, + "step": 2281, + "time_per_iteration": 2.811084270477295 + }, + { + "auxiliary_loss_clip": 0.01136219, + "auxiliary_loss_mlp": 0.01052394, + "balance_loss_clip": 1.04316938, + "balance_loss_mlp": 1.03155589, + "epoch": 0.13720126258830603, + "flos": 16101267569280.0, + "grad_norm": 1.7274239652509815, + "language_loss": 0.83642513, + "learning_rate": 3.880654822954518e-06, + "loss": 0.85831118, + "num_input_tokens_seen": 49370285, + "step": 2282, + "time_per_iteration": 2.6039836406707764 + }, + { + "auxiliary_loss_clip": 0.01102843, + "auxiliary_loss_mlp": 0.01049625, + "balance_loss_clip": 1.03609443, + "balance_loss_mlp": 1.03011012, + "epoch": 0.137261385840974, + "flos": 18953544798720.0, + "grad_norm": 1.5080368225715666, + "language_loss": 0.737809, + "learning_rate": 3.8805222647789195e-06, + "loss": 0.75933373, + "num_input_tokens_seen": 49389610, + "step": 2283, + "time_per_iteration": 2.7435781955718994 + }, + { + "auxiliary_loss_clip": 0.01116908, + "auxiliary_loss_mlp": 0.01051737, + "balance_loss_clip": 1.04148614, + "balance_loss_mlp": 1.03236532, + "epoch": 0.13732150909364196, + "flos": 23295360147840.0, + "grad_norm": 2.140837990371825, + "language_loss": 0.84292161, + "learning_rate": 3.880389635293729e-06, + "loss": 0.86460805, + "num_input_tokens_seen": 49408390, + "step": 2284, + "time_per_iteration": 2.623734951019287 + }, + { + "auxiliary_loss_clip": 0.01111216, + "auxiliary_loss_mlp": 0.01054316, + "balance_loss_clip": 1.03785646, + "balance_loss_mlp": 1.0322386, + "epoch": 0.13738163234630993, + "flos": 29351263489920.0, + "grad_norm": 2.355094420129757, + "language_loss": 0.75199455, + "learning_rate": 3.880256934503974e-06, + "loss": 0.77364987, + "num_input_tokens_seen": 49427725, + "step": 2285, + "time_per_iteration": 2.6698386669158936 + }, + { + "auxiliary_loss_clip": 0.01107432, + "auxiliary_loss_mlp": 0.01050231, + "balance_loss_clip": 1.03977108, + "balance_loss_mlp": 1.03063309, + "epoch": 0.1374417555989779, + "flos": 26651319840000.0, + "grad_norm": 5.843383600272883, + "language_loss": 0.7452485, + "learning_rate": 3.880124162414689e-06, + "loss": 0.76682508, + "num_input_tokens_seen": 49449000, + "step": 2286, + "time_per_iteration": 2.715031385421753 + }, + { + "auxiliary_loss_clip": 0.01091179, + "auxiliary_loss_mlp": 0.01052043, + "balance_loss_clip": 1.03864968, + "balance_loss_mlp": 1.02913094, + "epoch": 0.1375018788516459, + "flos": 28403401443840.0, + "grad_norm": 2.0250096509681206, + "language_loss": 0.86208284, + "learning_rate": 3.879991319030908e-06, + "loss": 0.883515, + "num_input_tokens_seen": 49468360, + "step": 2287, + "time_per_iteration": 2.726977586746216 + }, + { + "auxiliary_loss_clip": 0.01084293, + "auxiliary_loss_mlp": 0.01050554, + "balance_loss_clip": 1.03537154, + "balance_loss_mlp": 1.02942991, + "epoch": 0.13756200210431385, + "flos": 37413783187200.0, + "grad_norm": 1.8213039457602003, + "language_loss": 0.68589389, + "learning_rate": 3.879858404357666e-06, + "loss": 0.70724237, + "num_input_tokens_seen": 49493450, + "step": 2288, + "time_per_iteration": 2.746729850769043 + }, + { + "auxiliary_loss_clip": 0.01075839, + "auxiliary_loss_mlp": 0.01060944, + "balance_loss_clip": 1.0388912, + "balance_loss_mlp": 1.03811574, + "epoch": 0.13762212535698182, + "flos": 22711021695360.0, + "grad_norm": 5.65935871094972, + "language_loss": 0.86554354, + "learning_rate": 3.879725418400005e-06, + "loss": 0.88691139, + "num_input_tokens_seen": 49511220, + "step": 2289, + "time_per_iteration": 2.8234684467315674 + }, + { + "auxiliary_loss_clip": 0.01091702, + "auxiliary_loss_mlp": 0.00749137, + "balance_loss_clip": 1.03419018, + "balance_loss_mlp": 1.0006628, + "epoch": 0.13768224860964978, + "flos": 23952130375680.0, + "grad_norm": 2.089876513515539, + "language_loss": 0.74524492, + "learning_rate": 3.879592361162969e-06, + "loss": 0.76365334, + "num_input_tokens_seen": 49529820, + "step": 2290, + "time_per_iteration": 2.7099549770355225 + }, + { + "auxiliary_loss_clip": 0.00998188, + "auxiliary_loss_mlp": 0.0102304, + "balance_loss_clip": 1.00272727, + "balance_loss_mlp": 1.02013099, + "epoch": 0.13774237186231775, + "flos": 63590438753280.0, + "grad_norm": 0.7045550823479458, + "language_loss": 0.51569325, + "learning_rate": 3.8794592326516015e-06, + "loss": 0.53590554, + "num_input_tokens_seen": 49595325, + "step": 2291, + "time_per_iteration": 3.2894673347473145 + }, + { + "auxiliary_loss_clip": 0.01119106, + "auxiliary_loss_mlp": 0.01053612, + "balance_loss_clip": 1.03888929, + "balance_loss_mlp": 1.03199935, + "epoch": 0.1378024951149857, + "flos": 24279456038400.0, + "grad_norm": 1.868520516190162, + "language_loss": 0.7066505, + "learning_rate": 3.879326032870952e-06, + "loss": 0.7283777, + "num_input_tokens_seen": 49615850, + "step": 2292, + "time_per_iteration": 2.682908296585083 + }, + { + "auxiliary_loss_clip": 0.01117729, + "auxiliary_loss_mlp": 0.01047108, + "balance_loss_clip": 1.04029083, + "balance_loss_mlp": 1.0265801, + "epoch": 0.13786261836765368, + "flos": 14021537080320.0, + "grad_norm": 2.697626419988066, + "language_loss": 0.80100447, + "learning_rate": 3.879192761826071e-06, + "loss": 0.82265282, + "num_input_tokens_seen": 49631860, + "step": 2293, + "time_per_iteration": 2.5557289123535156 + }, + { + "auxiliary_loss_clip": 0.01118823, + "auxiliary_loss_mlp": 0.01053733, + "balance_loss_clip": 1.03904605, + "balance_loss_mlp": 1.03331256, + "epoch": 0.13792274162032167, + "flos": 28878679226880.0, + "grad_norm": 4.516727799100684, + "language_loss": 0.78145635, + "learning_rate": 3.879059419522011e-06, + "loss": 0.80318189, + "num_input_tokens_seen": 49652145, + "step": 2294, + "time_per_iteration": 2.6534981727600098 + }, + { + "auxiliary_loss_clip": 0.01086744, + "auxiliary_loss_mlp": 0.01049301, + "balance_loss_clip": 1.03753757, + "balance_loss_mlp": 1.03082335, + "epoch": 0.13798286487298964, + "flos": 21141150808320.0, + "grad_norm": 2.146939470367959, + "language_loss": 0.8008523, + "learning_rate": 3.878926005963831e-06, + "loss": 0.82221276, + "num_input_tokens_seen": 49669880, + "step": 2295, + "time_per_iteration": 2.6585373878479004 + }, + { + "auxiliary_loss_clip": 0.01115737, + "auxiliary_loss_mlp": 0.01048731, + "balance_loss_clip": 1.0378654, + "balance_loss_mlp": 1.02810812, + "epoch": 0.1380429881256576, + "flos": 22487477402880.0, + "grad_norm": 1.8220092444400517, + "language_loss": 0.77861935, + "learning_rate": 3.878792521156588e-06, + "loss": 0.800264, + "num_input_tokens_seen": 49687255, + "step": 2296, + "time_per_iteration": 2.606640100479126 + }, + { + "auxiliary_loss_clip": 0.01113839, + "auxiliary_loss_mlp": 0.01060008, + "balance_loss_clip": 1.04046988, + "balance_loss_mlp": 1.03940845, + "epoch": 0.13810311137832557, + "flos": 21393674398080.0, + "grad_norm": 1.9765489142397095, + "language_loss": 0.78941423, + "learning_rate": 3.8786589651053446e-06, + "loss": 0.8111527, + "num_input_tokens_seen": 49706650, + "step": 2297, + "time_per_iteration": 2.615492343902588 + }, + { + "auxiliary_loss_clip": 0.01076606, + "auxiliary_loss_mlp": 0.01051417, + "balance_loss_clip": 1.04301572, + "balance_loss_mlp": 1.03230822, + "epoch": 0.13816323463099353, + "flos": 25989844930560.0, + "grad_norm": 2.200208375914907, + "language_loss": 0.69493926, + "learning_rate": 3.878525337815164e-06, + "loss": 0.71621954, + "num_input_tokens_seen": 49725715, + "step": 2298, + "time_per_iteration": 2.79105281829834 + }, + { + "auxiliary_loss_clip": 0.01098406, + "auxiliary_loss_mlp": 0.01058309, + "balance_loss_clip": 1.03939438, + "balance_loss_mlp": 1.03766227, + "epoch": 0.1382233578836615, + "flos": 19244313394560.0, + "grad_norm": 1.8304785092190805, + "language_loss": 0.86690521, + "learning_rate": 3.878391639291116e-06, + "loss": 0.88847232, + "num_input_tokens_seen": 49744710, + "step": 2299, + "time_per_iteration": 2.6195569038391113 + }, + { + "auxiliary_loss_clip": 0.01128145, + "auxiliary_loss_mlp": 0.01063757, + "balance_loss_clip": 1.03928399, + "balance_loss_mlp": 1.04308653, + "epoch": 0.1382834811363295, + "flos": 25666290195840.0, + "grad_norm": 1.6843508469504807, + "language_loss": 0.75106442, + "learning_rate": 3.878257869538267e-06, + "loss": 0.77298349, + "num_input_tokens_seen": 49764300, + "step": 2300, + "time_per_iteration": 2.5821192264556885 + }, + { + "auxiliary_loss_clip": 0.01097703, + "auxiliary_loss_mlp": 0.01056222, + "balance_loss_clip": 1.04189539, + "balance_loss_mlp": 1.0369097, + "epoch": 0.13834360438899745, + "flos": 19784193788160.0, + "grad_norm": 2.7899618585114245, + "language_loss": 0.82285005, + "learning_rate": 3.878124028561692e-06, + "loss": 0.84438932, + "num_input_tokens_seen": 49778380, + "step": 2301, + "time_per_iteration": 2.6534533500671387 + }, + { + "auxiliary_loss_clip": 0.01100466, + "auxiliary_loss_mlp": 0.00749174, + "balance_loss_clip": 1.03772438, + "balance_loss_mlp": 1.00062943, + "epoch": 0.13840372764166542, + "flos": 26651858544000.0, + "grad_norm": 2.3106381490573003, + "language_loss": 0.85571998, + "learning_rate": 3.877990116366466e-06, + "loss": 0.87421644, + "num_input_tokens_seen": 49797460, + "step": 2302, + "time_per_iteration": 4.269487619400024 + }, + { + "auxiliary_loss_clip": 0.01021795, + "auxiliary_loss_mlp": 0.01036809, + "balance_loss_clip": 1.00767827, + "balance_loss_mlp": 1.03424644, + "epoch": 0.13846385089433338, + "flos": 70510998286080.0, + "grad_norm": 0.7597556642928304, + "language_loss": 0.65599805, + "learning_rate": 3.877856132957667e-06, + "loss": 0.67658412, + "num_input_tokens_seen": 49868005, + "step": 2303, + "time_per_iteration": 3.348855495452881 + }, + { + "auxiliary_loss_clip": 0.01112203, + "auxiliary_loss_mlp": 0.0104981, + "balance_loss_clip": 1.03786922, + "balance_loss_mlp": 1.03036714, + "epoch": 0.13852397414700135, + "flos": 17348732956800.0, + "grad_norm": 1.8930510740017574, + "language_loss": 0.78593779, + "learning_rate": 3.877722078340374e-06, + "loss": 0.80755788, + "num_input_tokens_seen": 49885825, + "step": 2304, + "time_per_iteration": 2.6335084438323975 + }, + { + "auxiliary_loss_clip": 0.01119506, + "auxiliary_loss_mlp": 0.01044367, + "balance_loss_clip": 1.04128885, + "balance_loss_mlp": 1.0254966, + "epoch": 0.13858409739966931, + "flos": 21543781334400.0, + "grad_norm": 2.0118310450903625, + "language_loss": 0.77565008, + "learning_rate": 3.877587952519672e-06, + "loss": 0.79728884, + "num_input_tokens_seen": 49905975, + "step": 2305, + "time_per_iteration": 2.6997718811035156 + }, + { + "auxiliary_loss_clip": 0.01031453, + "auxiliary_loss_mlp": 0.01049974, + "balance_loss_clip": 1.02840114, + "balance_loss_mlp": 1.03063822, + "epoch": 0.13864422065233728, + "flos": 21579907438080.0, + "grad_norm": 2.662514912574531, + "language_loss": 0.87606764, + "learning_rate": 3.877453755500647e-06, + "loss": 0.89688194, + "num_input_tokens_seen": 49925800, + "step": 2306, + "time_per_iteration": 2.985947370529175 + }, + { + "auxiliary_loss_clip": 0.0102578, + "auxiliary_loss_mlp": 0.01005225, + "balance_loss_clip": 1.00221324, + "balance_loss_mlp": 1.00255489, + "epoch": 0.13870434390500527, + "flos": 53371156872960.0, + "grad_norm": 0.8727243995191335, + "language_loss": 0.59037501, + "learning_rate": 3.877319487288387e-06, + "loss": 0.61068505, + "num_input_tokens_seen": 49977620, + "step": 2307, + "time_per_iteration": 5.067146062850952 + }, + { + "auxiliary_loss_clip": 0.01133007, + "auxiliary_loss_mlp": 0.00749229, + "balance_loss_clip": 1.04227769, + "balance_loss_mlp": 1.00067127, + "epoch": 0.13876446715767324, + "flos": 22565906749440.0, + "grad_norm": 2.075682511115886, + "language_loss": 0.79553211, + "learning_rate": 3.877185147887984e-06, + "loss": 0.81435442, + "num_input_tokens_seen": 49996650, + "step": 2308, + "time_per_iteration": 5.787481784820557 + }, + { + "auxiliary_loss_clip": 0.01093656, + "auxiliary_loss_mlp": 0.01043071, + "balance_loss_clip": 1.03746319, + "balance_loss_mlp": 1.02254319, + "epoch": 0.1388245904103412, + "flos": 20705231352960.0, + "grad_norm": 2.5121754097657445, + "language_loss": 0.78257734, + "learning_rate": 3.877050737304533e-06, + "loss": 0.80394459, + "num_input_tokens_seen": 50015640, + "step": 2309, + "time_per_iteration": 2.7008821964263916 + }, + { + "auxiliary_loss_clip": 0.01092735, + "auxiliary_loss_mlp": 0.01044536, + "balance_loss_clip": 1.03783977, + "balance_loss_mlp": 1.02396059, + "epoch": 0.13888471366300917, + "flos": 20554729367040.0, + "grad_norm": 2.1378185746802605, + "language_loss": 0.67610908, + "learning_rate": 3.876916255543129e-06, + "loss": 0.69748181, + "num_input_tokens_seen": 50033500, + "step": 2310, + "time_per_iteration": 2.6194581985473633 + }, + { + "auxiliary_loss_clip": 0.01129326, + "auxiliary_loss_mlp": 0.01054158, + "balance_loss_clip": 1.04142094, + "balance_loss_mlp": 1.03295064, + "epoch": 0.13894483691567713, + "flos": 13838033473920.0, + "grad_norm": 1.820530670599048, + "language_loss": 0.83703613, + "learning_rate": 3.8767817026088725e-06, + "loss": 0.85887098, + "num_input_tokens_seen": 50050075, + "step": 2311, + "time_per_iteration": 2.671502113342285 + }, + { + "auxiliary_loss_clip": 0.01135724, + "auxiliary_loss_mlp": 0.01043835, + "balance_loss_clip": 1.04367447, + "balance_loss_mlp": 1.02410626, + "epoch": 0.1390049601683451, + "flos": 28031186759040.0, + "grad_norm": 1.9355007744088966, + "language_loss": 0.82043314, + "learning_rate": 3.876647078506866e-06, + "loss": 0.84222871, + "num_input_tokens_seen": 50070080, + "step": 2312, + "time_per_iteration": 2.613568067550659 + }, + { + "auxiliary_loss_clip": 0.0109908, + "auxiliary_loss_mlp": 0.00749265, + "balance_loss_clip": 1.04250598, + "balance_loss_mlp": 1.00080156, + "epoch": 0.13906508342101306, + "flos": 26756860976640.0, + "grad_norm": 4.687130418758417, + "language_loss": 0.86606407, + "learning_rate": 3.876512383242215e-06, + "loss": 0.88454747, + "num_input_tokens_seen": 50090040, + "step": 2313, + "time_per_iteration": 2.7741286754608154 + }, + { + "auxiliary_loss_clip": 0.01130017, + "auxiliary_loss_mlp": 0.01057985, + "balance_loss_clip": 1.04137254, + "balance_loss_mlp": 1.03748119, + "epoch": 0.13912520667368106, + "flos": 24535104111360.0, + "grad_norm": 1.9402963938423499, + "language_loss": 0.80173242, + "learning_rate": 3.876377616820024e-06, + "loss": 0.82361245, + "num_input_tokens_seen": 50110595, + "step": 2314, + "time_per_iteration": 2.590015411376953 + }, + { + "auxiliary_loss_clip": 0.01087767, + "auxiliary_loss_mlp": 0.01047271, + "balance_loss_clip": 1.03705394, + "balance_loss_mlp": 1.0276258, + "epoch": 0.13918532992634902, + "flos": 19383215287680.0, + "grad_norm": 2.556594299071215, + "language_loss": 0.85693121, + "learning_rate": 3.876242779245409e-06, + "loss": 0.87828159, + "num_input_tokens_seen": 50125430, + "step": 2315, + "time_per_iteration": 2.618927001953125 + }, + { + "auxiliary_loss_clip": 0.01117401, + "auxiliary_loss_mlp": 0.0105282, + "balance_loss_clip": 1.03880763, + "balance_loss_mlp": 1.0308969, + "epoch": 0.139245453179017, + "flos": 21323756574720.0, + "grad_norm": 1.9366742229724698, + "language_loss": 0.77353853, + "learning_rate": 3.876107870523477e-06, + "loss": 0.7952407, + "num_input_tokens_seen": 50144120, + "step": 2316, + "time_per_iteration": 2.5823593139648438 + }, + { + "auxiliary_loss_clip": 0.01126929, + "auxiliary_loss_mlp": 0.00749228, + "balance_loss_clip": 1.04065168, + "balance_loss_mlp": 1.00077629, + "epoch": 0.13930557643168495, + "flos": 19500607912320.0, + "grad_norm": 1.7070381904556646, + "language_loss": 0.76926768, + "learning_rate": 3.875972890659349e-06, + "loss": 0.78802919, + "num_input_tokens_seen": 50162500, + "step": 2317, + "time_per_iteration": 2.5551059246063232 + }, + { + "auxiliary_loss_clip": 0.01106359, + "auxiliary_loss_mlp": 0.01050152, + "balance_loss_clip": 1.03902698, + "balance_loss_mlp": 1.02944517, + "epoch": 0.13936569968435292, + "flos": 25410821690880.0, + "grad_norm": 2.0524487999679746, + "language_loss": 0.80465043, + "learning_rate": 3.875837839658139e-06, + "loss": 0.82621551, + "num_input_tokens_seen": 50182415, + "step": 2318, + "time_per_iteration": 2.6829566955566406 + }, + { + "auxiliary_loss_clip": 0.01007493, + "auxiliary_loss_mlp": 0.01023471, + "balance_loss_clip": 1.00576711, + "balance_loss_mlp": 1.02045524, + "epoch": 0.13942582293702088, + "flos": 70771063731840.0, + "grad_norm": 0.8561068607108178, + "language_loss": 0.59047133, + "learning_rate": 3.87570271752497e-06, + "loss": 0.61078095, + "num_input_tokens_seen": 50245160, + "step": 2319, + "time_per_iteration": 3.254345655441284 + }, + { + "auxiliary_loss_clip": 0.01075015, + "auxiliary_loss_mlp": 0.01051241, + "balance_loss_clip": 1.03387415, + "balance_loss_mlp": 1.03000987, + "epoch": 0.13948594618968888, + "flos": 35590885920000.0, + "grad_norm": 2.33978082350926, + "language_loss": 0.64753145, + "learning_rate": 3.875567524264967e-06, + "loss": 0.66879404, + "num_input_tokens_seen": 50268215, + "step": 2320, + "time_per_iteration": 2.7939393520355225 + }, + { + "auxiliary_loss_clip": 0.01052373, + "auxiliary_loss_mlp": 0.01047933, + "balance_loss_clip": 1.03127789, + "balance_loss_mlp": 1.02729774, + "epoch": 0.13954606944235684, + "flos": 21105204272640.0, + "grad_norm": 1.5225413053177606, + "language_loss": 0.70694089, + "learning_rate": 3.875432259883256e-06, + "loss": 0.7279439, + "num_input_tokens_seen": 50288575, + "step": 2321, + "time_per_iteration": 2.6957523822784424 + }, + { + "auxiliary_loss_clip": 0.01072568, + "auxiliary_loss_mlp": 0.01054426, + "balance_loss_clip": 1.03071702, + "balance_loss_mlp": 1.031335, + "epoch": 0.1396061926950248, + "flos": 25044425009280.0, + "grad_norm": 1.7892556420264012, + "language_loss": 0.85610414, + "learning_rate": 3.875296924384965e-06, + "loss": 0.87737411, + "num_input_tokens_seen": 50308735, + "step": 2322, + "time_per_iteration": 2.7968294620513916 + }, + { + "auxiliary_loss_clip": 0.01078996, + "auxiliary_loss_mlp": 0.01055429, + "balance_loss_clip": 1.03229511, + "balance_loss_mlp": 1.03571141, + "epoch": 0.13966631594769277, + "flos": 37634023428480.0, + "grad_norm": 1.506652558670011, + "language_loss": 0.66880918, + "learning_rate": 3.875161517775226e-06, + "loss": 0.69015342, + "num_input_tokens_seen": 50331025, + "step": 2323, + "time_per_iteration": 2.878345012664795 + }, + { + "auxiliary_loss_clip": 0.0108851, + "auxiliary_loss_mlp": 0.01056501, + "balance_loss_clip": 1.03594577, + "balance_loss_mlp": 1.03530586, + "epoch": 0.13972643920036074, + "flos": 16690993061760.0, + "grad_norm": 1.8793095096332357, + "language_loss": 0.88540316, + "learning_rate": 3.875026040059175e-06, + "loss": 0.9068532, + "num_input_tokens_seen": 50349725, + "step": 2324, + "time_per_iteration": 2.720571994781494 + }, + { + "auxiliary_loss_clip": 0.0111779, + "auxiliary_loss_mlp": 0.0105914, + "balance_loss_clip": 1.03791165, + "balance_loss_mlp": 1.03819478, + "epoch": 0.1397865624530287, + "flos": 23331055288320.0, + "grad_norm": 3.3321776488215336, + "language_loss": 0.70485771, + "learning_rate": 3.8748904912419485e-06, + "loss": 0.72662693, + "num_input_tokens_seen": 50367965, + "step": 2325, + "time_per_iteration": 2.5989086627960205 + }, + { + "auxiliary_loss_clip": 0.0109475, + "auxiliary_loss_mlp": 0.00749202, + "balance_loss_clip": 1.03901839, + "balance_loss_mlp": 1.00078762, + "epoch": 0.13984668570569667, + "flos": 22778317825920.0, + "grad_norm": 1.770113031783541, + "language_loss": 0.81437087, + "learning_rate": 3.874754871328688e-06, + "loss": 0.8328104, + "num_input_tokens_seen": 50385605, + "step": 2326, + "time_per_iteration": 2.694370985031128 + }, + { + "auxiliary_loss_clip": 0.01117068, + "auxiliary_loss_mlp": 0.01054876, + "balance_loss_clip": 1.04156685, + "balance_loss_mlp": 1.03607702, + "epoch": 0.13990680895836466, + "flos": 19464553635840.0, + "grad_norm": 1.7862917327664307, + "language_loss": 0.89064074, + "learning_rate": 3.874619180324534e-06, + "loss": 0.91236019, + "num_input_tokens_seen": 50403985, + "step": 2327, + "time_per_iteration": 2.597097158432007 + }, + { + "auxiliary_loss_clip": 0.01084061, + "auxiliary_loss_mlp": 0.01071667, + "balance_loss_clip": 1.0364933, + "balance_loss_mlp": 1.050138, + "epoch": 0.13996693221103262, + "flos": 20303283185280.0, + "grad_norm": 2.291404040524306, + "language_loss": 0.8503049, + "learning_rate": 3.874483418234632e-06, + "loss": 0.87186223, + "num_input_tokens_seen": 50421590, + "step": 2328, + "time_per_iteration": 2.6295437812805176 + }, + { + "auxiliary_loss_clip": 0.01117391, + "auxiliary_loss_mlp": 0.01049908, + "balance_loss_clip": 1.04055345, + "balance_loss_mlp": 1.02940452, + "epoch": 0.1400270554637006, + "flos": 26617707688320.0, + "grad_norm": 1.612123641692061, + "language_loss": 0.73688489, + "learning_rate": 3.874347585064131e-06, + "loss": 0.75855792, + "num_input_tokens_seen": 50443945, + "step": 2329, + "time_per_iteration": 2.6902403831481934 + }, + { + "auxiliary_loss_clip": 0.0111473, + "auxiliary_loss_mlp": 0.0105214, + "balance_loss_clip": 1.0369916, + "balance_loss_mlp": 1.03174376, + "epoch": 0.14008717871636855, + "flos": 19391475415680.0, + "grad_norm": 2.300094175721146, + "language_loss": 0.7841357, + "learning_rate": 3.874211680818183e-06, + "loss": 0.80580437, + "num_input_tokens_seen": 50462065, + "step": 2330, + "time_per_iteration": 2.683105945587158 + }, + { + "auxiliary_loss_clip": 0.0110507, + "auxiliary_loss_mlp": 0.01047121, + "balance_loss_clip": 1.03764868, + "balance_loss_mlp": 1.02736855, + "epoch": 0.14014730196903652, + "flos": 15304266645120.0, + "grad_norm": 1.9675836540682023, + "language_loss": 0.71936542, + "learning_rate": 3.87407570550194e-06, + "loss": 0.74088734, + "num_input_tokens_seen": 50479565, + "step": 2331, + "time_per_iteration": 2.840654134750366 + }, + { + "auxiliary_loss_clip": 0.01122031, + "auxiliary_loss_mlp": 0.01057975, + "balance_loss_clip": 1.04055476, + "balance_loss_mlp": 1.03773355, + "epoch": 0.14020742522170448, + "flos": 14939701557120.0, + "grad_norm": 2.0319356972776688, + "language_loss": 0.72248709, + "learning_rate": 3.873939659120557e-06, + "loss": 0.74428713, + "num_input_tokens_seen": 50497305, + "step": 2332, + "time_per_iteration": 2.6408886909484863 + }, + { + "auxiliary_loss_clip": 0.0102056, + "auxiliary_loss_mlp": 0.01036495, + "balance_loss_clip": 1.00705123, + "balance_loss_mlp": 1.03383696, + "epoch": 0.14026754847437245, + "flos": 48824580044160.0, + "grad_norm": 0.8280300000130248, + "language_loss": 0.5611245, + "learning_rate": 3.873803541679196e-06, + "loss": 0.58169508, + "num_input_tokens_seen": 50549735, + "step": 2333, + "time_per_iteration": 3.1143412590026855 + }, + { + "auxiliary_loss_clip": 0.01091781, + "auxiliary_loss_mlp": 0.01047117, + "balance_loss_clip": 1.03751612, + "balance_loss_mlp": 1.02675653, + "epoch": 0.14032767172704044, + "flos": 25773267876480.0, + "grad_norm": 1.6374951614243738, + "language_loss": 0.82789046, + "learning_rate": 3.873667353183016e-06, + "loss": 0.8492794, + "num_input_tokens_seen": 50570100, + "step": 2334, + "time_per_iteration": 2.7511088848114014 + }, + { + "auxiliary_loss_clip": 0.01093361, + "auxiliary_loss_mlp": 0.01043236, + "balance_loss_clip": 1.03731179, + "balance_loss_mlp": 1.02456748, + "epoch": 0.1403877949797084, + "flos": 21216312017280.0, + "grad_norm": 2.2386406282717974, + "language_loss": 0.81358558, + "learning_rate": 3.8735310936371825e-06, + "loss": 0.83495152, + "num_input_tokens_seen": 50589185, + "step": 2335, + "time_per_iteration": 2.910728931427002 + }, + { + "auxiliary_loss_clip": 0.01077693, + "auxiliary_loss_mlp": 0.01052324, + "balance_loss_clip": 1.03949142, + "balance_loss_mlp": 1.02901912, + "epoch": 0.14044791823237637, + "flos": 22747973811840.0, + "grad_norm": 1.7771369111349526, + "language_loss": 0.82008505, + "learning_rate": 3.873394763046862e-06, + "loss": 0.84138519, + "num_input_tokens_seen": 50609645, + "step": 2336, + "time_per_iteration": 2.8569998741149902 + }, + { + "auxiliary_loss_clip": 0.01118257, + "auxiliary_loss_mlp": 0.01052108, + "balance_loss_clip": 1.04442477, + "balance_loss_mlp": 1.03160417, + "epoch": 0.14050804148504434, + "flos": 22964443125120.0, + "grad_norm": 1.7127668628193982, + "language_loss": 0.80608404, + "learning_rate": 3.873258361417225e-06, + "loss": 0.82778764, + "num_input_tokens_seen": 50628385, + "step": 2337, + "time_per_iteration": 2.6187987327575684 + }, + { + "auxiliary_loss_clip": 0.01114377, + "auxiliary_loss_mlp": 0.01048647, + "balance_loss_clip": 1.03770065, + "balance_loss_mlp": 1.02873874, + "epoch": 0.1405681647377123, + "flos": 22200336080640.0, + "grad_norm": 2.9814259428224963, + "language_loss": 0.79395175, + "learning_rate": 3.873121888753442e-06, + "loss": 0.81558192, + "num_input_tokens_seen": 50647260, + "step": 2338, + "time_per_iteration": 2.7368664741516113 + }, + { + "auxiliary_loss_clip": 0.01119273, + "auxiliary_loss_mlp": 0.01046923, + "balance_loss_clip": 1.04176962, + "balance_loss_mlp": 1.02581131, + "epoch": 0.14062828799038027, + "flos": 23732787974400.0, + "grad_norm": 2.792195109478964, + "language_loss": 0.79590642, + "learning_rate": 3.87298534506069e-06, + "loss": 0.81756842, + "num_input_tokens_seen": 50666130, + "step": 2339, + "time_per_iteration": 2.6620404720306396 + }, + { + "auxiliary_loss_clip": 0.01057656, + "auxiliary_loss_mlp": 0.01060225, + "balance_loss_clip": 1.03424263, + "balance_loss_mlp": 1.03936315, + "epoch": 0.14068841124304826, + "flos": 39202493685120.0, + "grad_norm": 2.760518487552127, + "language_loss": 0.65736407, + "learning_rate": 3.872848730344146e-06, + "loss": 0.67854291, + "num_input_tokens_seen": 50687440, + "step": 2340, + "time_per_iteration": 2.8954861164093018 + }, + { + "auxiliary_loss_clip": 0.01114955, + "auxiliary_loss_mlp": 0.01051443, + "balance_loss_clip": 1.04237211, + "balance_loss_mlp": 1.03054571, + "epoch": 0.14074853449571623, + "flos": 20192283181440.0, + "grad_norm": 2.1879536023012007, + "language_loss": 0.78418136, + "learning_rate": 3.87271204460899e-06, + "loss": 0.80584526, + "num_input_tokens_seen": 50704030, + "step": 2341, + "time_per_iteration": 2.647156238555908 + }, + { + "auxiliary_loss_clip": 0.01123291, + "auxiliary_loss_mlp": 0.0105612, + "balance_loss_clip": 1.03917587, + "balance_loss_mlp": 1.03633177, + "epoch": 0.1408086577483842, + "flos": 18405871153920.0, + "grad_norm": 1.9981506775196918, + "language_loss": 0.80644912, + "learning_rate": 3.8725752878604066e-06, + "loss": 0.82824326, + "num_input_tokens_seen": 50723305, + "step": 2342, + "time_per_iteration": 2.5651793479919434 + }, + { + "auxiliary_loss_clip": 0.01116805, + "auxiliary_loss_mlp": 0.01047624, + "balance_loss_clip": 1.04398179, + "balance_loss_mlp": 1.02814484, + "epoch": 0.14086878100105216, + "flos": 25264593423360.0, + "grad_norm": 1.9367056397186122, + "language_loss": 0.77271128, + "learning_rate": 3.87243846010358e-06, + "loss": 0.79435563, + "num_input_tokens_seen": 50743270, + "step": 2343, + "time_per_iteration": 2.6915366649627686 + }, + { + "auxiliary_loss_clip": 0.01007497, + "auxiliary_loss_mlp": 0.01004988, + "balance_loss_clip": 1.00452089, + "balance_loss_mlp": 1.00215042, + "epoch": 0.14092890425372012, + "flos": 65978388869760.0, + "grad_norm": 0.8477711500491186, + "language_loss": 0.61542571, + "learning_rate": 3.872301561343699e-06, + "loss": 0.63555062, + "num_input_tokens_seen": 50802710, + "step": 2344, + "time_per_iteration": 3.209350109100342 + }, + { + "auxiliary_loss_clip": 0.01107325, + "auxiliary_loss_mlp": 0.01043042, + "balance_loss_clip": 1.0358429, + "balance_loss_mlp": 1.02405238, + "epoch": 0.1409890275063881, + "flos": 23694973931520.0, + "grad_norm": 1.4809651908876063, + "language_loss": 0.64338493, + "learning_rate": 3.872164591585956e-06, + "loss": 0.66488856, + "num_input_tokens_seen": 50822625, + "step": 2345, + "time_per_iteration": 2.6079318523406982 + }, + { + "auxiliary_loss_clip": 0.01112971, + "auxiliary_loss_mlp": 0.0104732, + "balance_loss_clip": 1.03399682, + "balance_loss_mlp": 1.02610075, + "epoch": 0.14104915075905605, + "flos": 23623152687360.0, + "grad_norm": 3.034526819140187, + "language_loss": 0.7369594, + "learning_rate": 3.8720275508355435e-06, + "loss": 0.75856233, + "num_input_tokens_seen": 50842330, + "step": 2346, + "time_per_iteration": 2.570545196533203 + }, + { + "auxiliary_loss_clip": 0.0111683, + "auxiliary_loss_mlp": 0.01047093, + "balance_loss_clip": 1.04084158, + "balance_loss_mlp": 1.02725649, + "epoch": 0.14110927401172405, + "flos": 20595165102720.0, + "grad_norm": 1.8830176451314913, + "language_loss": 0.77338588, + "learning_rate": 3.8718904390976585e-06, + "loss": 0.79502511, + "num_input_tokens_seen": 50861035, + "step": 2347, + "time_per_iteration": 2.7415695190429688 + }, + { + "auxiliary_loss_clip": 0.01123689, + "auxiliary_loss_mlp": 0.0105012, + "balance_loss_clip": 1.03667021, + "balance_loss_mlp": 1.03084373, + "epoch": 0.141169397264392, + "flos": 28548049512960.0, + "grad_norm": 1.7054126703971908, + "language_loss": 0.7666415, + "learning_rate": 3.8717532563775e-06, + "loss": 0.78837955, + "num_input_tokens_seen": 50880105, + "step": 2348, + "time_per_iteration": 2.6089625358581543 + }, + { + "auxiliary_loss_clip": 0.01104658, + "auxiliary_loss_mlp": 0.01044689, + "balance_loss_clip": 1.03585684, + "balance_loss_mlp": 1.02522206, + "epoch": 0.14122952051705998, + "flos": 17092258871040.0, + "grad_norm": 1.6905866875869495, + "language_loss": 0.86642039, + "learning_rate": 3.871616002680272e-06, + "loss": 0.88791382, + "num_input_tokens_seen": 50897720, + "step": 2349, + "time_per_iteration": 2.5619289875030518 + }, + { + "auxiliary_loss_clip": 0.01113028, + "auxiliary_loss_mlp": 0.01048092, + "balance_loss_clip": 1.03950274, + "balance_loss_mlp": 1.0288043, + "epoch": 0.14128964376972794, + "flos": 28946801370240.0, + "grad_norm": 1.6754334773833017, + "language_loss": 0.88659161, + "learning_rate": 3.871478678011177e-06, + "loss": 0.90820283, + "num_input_tokens_seen": 50918385, + "step": 2350, + "time_per_iteration": 4.23794150352478 + }, + { + "auxiliary_loss_clip": 0.01102219, + "auxiliary_loss_mlp": 0.0104361, + "balance_loss_clip": 1.03807223, + "balance_loss_mlp": 1.02334487, + "epoch": 0.1413497670223959, + "flos": 18989778643200.0, + "grad_norm": 2.3409436344712993, + "language_loss": 0.80910009, + "learning_rate": 3.871341282375423e-06, + "loss": 0.83055842, + "num_input_tokens_seen": 50938270, + "step": 2351, + "time_per_iteration": 2.7166125774383545 + }, + { + "auxiliary_loss_clip": 0.01107803, + "auxiliary_loss_mlp": 0.01044669, + "balance_loss_clip": 1.03563309, + "balance_loss_mlp": 1.02554774, + "epoch": 0.14140989027506387, + "flos": 29862236413440.0, + "grad_norm": 2.6166142390618568, + "language_loss": 0.8363173, + "learning_rate": 3.871203815778219e-06, + "loss": 0.85784197, + "num_input_tokens_seen": 50958155, + "step": 2352, + "time_per_iteration": 2.66216778755188 + }, + { + "auxiliary_loss_clip": 0.01014767, + "auxiliary_loss_mlp": 0.01021332, + "balance_loss_clip": 1.00239325, + "balance_loss_mlp": 1.01868558, + "epoch": 0.14147001352773186, + "flos": 62079532041600.0, + "grad_norm": 0.9097539532910822, + "language_loss": 0.61926872, + "learning_rate": 3.87106627822478e-06, + "loss": 0.63962972, + "num_input_tokens_seen": 51020705, + "step": 2353, + "time_per_iteration": 3.143355131149292 + }, + { + "auxiliary_loss_clip": 0.01097859, + "auxiliary_loss_mlp": 0.01051261, + "balance_loss_clip": 1.0374347, + "balance_loss_mlp": 1.03261638, + "epoch": 0.14153013678039983, + "flos": 22017514832640.0, + "grad_norm": 1.6500242716578593, + "language_loss": 0.87065995, + "learning_rate": 3.8709286697203196e-06, + "loss": 0.89215124, + "num_input_tokens_seen": 51039995, + "step": 2354, + "time_per_iteration": 2.6146929264068604 + }, + { + "auxiliary_loss_clip": 0.01089353, + "auxiliary_loss_mlp": 0.01045053, + "balance_loss_clip": 1.03811181, + "balance_loss_mlp": 1.02447748, + "epoch": 0.1415902600330678, + "flos": 19720093968000.0, + "grad_norm": 1.861763936542285, + "language_loss": 0.7492671, + "learning_rate": 3.870790990270057e-06, + "loss": 0.77061117, + "num_input_tokens_seen": 51059075, + "step": 2355, + "time_per_iteration": 4.3965981006622314 + }, + { + "auxiliary_loss_clip": 0.01014644, + "auxiliary_loss_mlp": 0.01009014, + "balance_loss_clip": 1.00278926, + "balance_loss_mlp": 1.00649858, + "epoch": 0.14165038328573576, + "flos": 65900929190400.0, + "grad_norm": 0.7246143736345684, + "language_loss": 0.51841772, + "learning_rate": 3.870653239879212e-06, + "loss": 0.53865427, + "num_input_tokens_seen": 51120380, + "step": 2356, + "time_per_iteration": 6.538336515426636 + }, + { + "auxiliary_loss_clip": 0.01124978, + "auxiliary_loss_mlp": 0.0105306, + "balance_loss_clip": 1.03930819, + "balance_loss_mlp": 1.03422499, + "epoch": 0.14171050653840372, + "flos": 12130158533760.0, + "grad_norm": 2.682010191899663, + "language_loss": 0.70450264, + "learning_rate": 3.8705154185530095e-06, + "loss": 0.72628301, + "num_input_tokens_seen": 51136950, + "step": 2357, + "time_per_iteration": 2.5645205974578857 + }, + { + "auxiliary_loss_clip": 0.01071897, + "auxiliary_loss_mlp": 0.01055638, + "balance_loss_clip": 1.03218555, + "balance_loss_mlp": 1.03633809, + "epoch": 0.1417706297910717, + "flos": 20412487509120.0, + "grad_norm": 3.5854198789786014, + "language_loss": 0.8160454, + "learning_rate": 3.870377526296674e-06, + "loss": 0.83732069, + "num_input_tokens_seen": 51155175, + "step": 2358, + "time_per_iteration": 2.6737349033355713 + }, + { + "auxiliary_loss_clip": 0.01109699, + "auxiliary_loss_mlp": 0.01050937, + "balance_loss_clip": 1.03974998, + "balance_loss_mlp": 1.03076661, + "epoch": 0.14183075304373965, + "flos": 22380607463040.0, + "grad_norm": 1.9412636606278149, + "language_loss": 0.71590531, + "learning_rate": 3.870239563115436e-06, + "loss": 0.73751169, + "num_input_tokens_seen": 51174500, + "step": 2359, + "time_per_iteration": 2.6496341228485107 + }, + { + "auxiliary_loss_clip": 0.01076313, + "auxiliary_loss_mlp": 0.00749164, + "balance_loss_clip": 1.03630924, + "balance_loss_mlp": 1.0006932, + "epoch": 0.14189087629640765, + "flos": 21580913018880.0, + "grad_norm": 2.163150030489238, + "language_loss": 0.75837451, + "learning_rate": 3.870101529014526e-06, + "loss": 0.77662927, + "num_input_tokens_seen": 51194270, + "step": 2360, + "time_per_iteration": 2.7338554859161377 + }, + { + "auxiliary_loss_clip": 0.01073266, + "auxiliary_loss_mlp": 0.01044907, + "balance_loss_clip": 1.03819454, + "balance_loss_mlp": 1.02303255, + "epoch": 0.1419509995490756, + "flos": 20008564093440.0, + "grad_norm": 2.122090014845417, + "language_loss": 0.81294286, + "learning_rate": 3.869963423999178e-06, + "loss": 0.83412462, + "num_input_tokens_seen": 51211850, + "step": 2361, + "time_per_iteration": 2.7970871925354004 + }, + { + "auxiliary_loss_clip": 0.01110767, + "auxiliary_loss_mlp": 0.0105149, + "balance_loss_clip": 1.03810775, + "balance_loss_mlp": 1.03238034, + "epoch": 0.14201112280174358, + "flos": 31941464112000.0, + "grad_norm": 1.7742649707676237, + "language_loss": 0.73994648, + "learning_rate": 3.86982524807463e-06, + "loss": 0.76156902, + "num_input_tokens_seen": 51233545, + "step": 2362, + "time_per_iteration": 2.826509475708008 + }, + { + "auxiliary_loss_clip": 0.01117252, + "auxiliary_loss_mlp": 0.01045904, + "balance_loss_clip": 1.04237676, + "balance_loss_mlp": 1.02652049, + "epoch": 0.14207124605441154, + "flos": 41464147582080.0, + "grad_norm": 1.642303513147655, + "language_loss": 0.73558176, + "learning_rate": 3.869687001246122e-06, + "loss": 0.75721329, + "num_input_tokens_seen": 51257615, + "step": 2363, + "time_per_iteration": 2.877086877822876 + }, + { + "auxiliary_loss_clip": 0.01080833, + "auxiliary_loss_mlp": 0.01046494, + "balance_loss_clip": 1.03288686, + "balance_loss_mlp": 1.02720618, + "epoch": 0.1421313693070795, + "flos": 31905086613120.0, + "grad_norm": 1.8752454351449843, + "language_loss": 0.7310667, + "learning_rate": 3.8695486835188946e-06, + "loss": 0.75233996, + "num_input_tokens_seen": 51279645, + "step": 2364, + "time_per_iteration": 2.7697110176086426 + }, + { + "auxiliary_loss_clip": 0.01096787, + "auxiliary_loss_mlp": 0.0104624, + "balance_loss_clip": 1.03526318, + "balance_loss_mlp": 1.02876413, + "epoch": 0.14219149255974747, + "flos": 26871165031680.0, + "grad_norm": 2.167886209383182, + "language_loss": 0.90203154, + "learning_rate": 3.869410294898195e-06, + "loss": 0.92346179, + "num_input_tokens_seen": 51299775, + "step": 2365, + "time_per_iteration": 2.6784236431121826 + }, + { + "auxiliary_loss_clip": 0.0107732, + "auxiliary_loss_mlp": 0.01045043, + "balance_loss_clip": 1.03121686, + "balance_loss_mlp": 1.02470565, + "epoch": 0.14225161581241544, + "flos": 27454426076160.0, + "grad_norm": 1.6480495237387514, + "language_loss": 0.65366417, + "learning_rate": 3.869271835389268e-06, + "loss": 0.67488778, + "num_input_tokens_seen": 51319430, + "step": 2366, + "time_per_iteration": 2.664045810699463 + }, + { + "auxiliary_loss_clip": 0.01101404, + "auxiliary_loss_mlp": 0.01054559, + "balance_loss_clip": 1.0381006, + "balance_loss_mlp": 1.03451955, + "epoch": 0.14231173906508343, + "flos": 10561436881920.0, + "grad_norm": 2.060958191720695, + "language_loss": 0.80446267, + "learning_rate": 3.8691333049973665e-06, + "loss": 0.82602227, + "num_input_tokens_seen": 51336045, + "step": 2367, + "time_per_iteration": 2.6883764266967773 + }, + { + "auxiliary_loss_clip": 0.01095372, + "auxiliary_loss_mlp": 0.01058232, + "balance_loss_clip": 1.0367713, + "balance_loss_mlp": 1.03659534, + "epoch": 0.1423718623177514, + "flos": 28360882719360.0, + "grad_norm": 1.7799369757866563, + "language_loss": 0.82155919, + "learning_rate": 3.868994703727742e-06, + "loss": 0.84309524, + "num_input_tokens_seen": 51357030, + "step": 2368, + "time_per_iteration": 2.679892063140869 + }, + { + "auxiliary_loss_clip": 0.01081424, + "auxiliary_loss_mlp": 0.01054071, + "balance_loss_clip": 1.03737414, + "balance_loss_mlp": 1.03241026, + "epoch": 0.14243198557041936, + "flos": 19354235990400.0, + "grad_norm": 2.3192866199282154, + "language_loss": 0.86853194, + "learning_rate": 3.868856031585652e-06, + "loss": 0.88988686, + "num_input_tokens_seen": 51374890, + "step": 2369, + "time_per_iteration": 2.688962697982788 + }, + { + "auxiliary_loss_clip": 0.01096427, + "auxiliary_loss_mlp": 0.01048076, + "balance_loss_clip": 1.04119074, + "balance_loss_mlp": 1.02794111, + "epoch": 0.14249210882308733, + "flos": 28806857982720.0, + "grad_norm": 1.495457292459728, + "language_loss": 0.75878209, + "learning_rate": 3.868717288576354e-06, + "loss": 0.78022707, + "num_input_tokens_seen": 51398100, + "step": 2370, + "time_per_iteration": 2.7679364681243896 + }, + { + "auxiliary_loss_clip": 0.01109322, + "auxiliary_loss_mlp": 0.00749009, + "balance_loss_clip": 1.03641438, + "balance_loss_mlp": 1.00047183, + "epoch": 0.1425522320757553, + "flos": 21835016807040.0, + "grad_norm": 1.6626926318827542, + "language_loss": 0.82725799, + "learning_rate": 3.868578474705109e-06, + "loss": 0.84584129, + "num_input_tokens_seen": 51418745, + "step": 2371, + "time_per_iteration": 2.725554943084717 + }, + { + "auxiliary_loss_clip": 0.01126768, + "auxiliary_loss_mlp": 0.01051149, + "balance_loss_clip": 1.04065502, + "balance_loss_mlp": 1.03137243, + "epoch": 0.14261235532842326, + "flos": 17311457617920.0, + "grad_norm": 1.7938358648932036, + "language_loss": 0.82656568, + "learning_rate": 3.868439589977181e-06, + "loss": 0.8483448, + "num_input_tokens_seen": 51437455, + "step": 2372, + "time_per_iteration": 2.642245292663574 + }, + { + "auxiliary_loss_clip": 0.01128368, + "auxiliary_loss_mlp": 0.01047549, + "balance_loss_clip": 1.0417161, + "balance_loss_mlp": 1.02736688, + "epoch": 0.14267247858109125, + "flos": 18806741913600.0, + "grad_norm": 2.2230008749341486, + "language_loss": 0.8422364, + "learning_rate": 3.868300634397836e-06, + "loss": 0.86399555, + "num_input_tokens_seen": 51455710, + "step": 2373, + "time_per_iteration": 2.6462416648864746 + }, + { + "auxiliary_loss_clip": 0.01095036, + "auxiliary_loss_mlp": 0.01051953, + "balance_loss_clip": 1.03555155, + "balance_loss_mlp": 1.03371453, + "epoch": 0.14273260183375922, + "flos": 11358904682880.0, + "grad_norm": 2.2180435718305143, + "language_loss": 0.8598516, + "learning_rate": 3.8681616079723445e-06, + "loss": 0.88132155, + "num_input_tokens_seen": 51471270, + "step": 2374, + "time_per_iteration": 2.6569087505340576 + }, + { + "auxiliary_loss_clip": 0.01117194, + "auxiliary_loss_mlp": 0.01051569, + "balance_loss_clip": 1.03805768, + "balance_loss_mlp": 1.03089821, + "epoch": 0.14279272508642718, + "flos": 27567688636800.0, + "grad_norm": 1.5968338084361822, + "language_loss": 0.79088199, + "learning_rate": 3.868022510705977e-06, + "loss": 0.81256968, + "num_input_tokens_seen": 51492705, + "step": 2375, + "time_per_iteration": 2.6861865520477295 + }, + { + "auxiliary_loss_clip": 0.01114127, + "auxiliary_loss_mlp": 0.01061542, + "balance_loss_clip": 1.03999662, + "balance_loss_mlp": 1.04165828, + "epoch": 0.14285284833909515, + "flos": 16252559654400.0, + "grad_norm": 2.2358784851941462, + "language_loss": 0.76350033, + "learning_rate": 3.867883342604009e-06, + "loss": 0.78525704, + "num_input_tokens_seen": 51510780, + "step": 2376, + "time_per_iteration": 2.626532554626465 + }, + { + "auxiliary_loss_clip": 0.01111231, + "auxiliary_loss_mlp": 0.01047127, + "balance_loss_clip": 1.03757787, + "balance_loss_mlp": 1.02776766, + "epoch": 0.1429129715917631, + "flos": 19755609540480.0, + "grad_norm": 1.7287626302788173, + "language_loss": 0.9286772, + "learning_rate": 3.867744103671717e-06, + "loss": 0.95026076, + "num_input_tokens_seen": 51531400, + "step": 2377, + "time_per_iteration": 2.736403703689575 + }, + { + "auxiliary_loss_clip": 0.0110323, + "auxiliary_loss_mlp": 0.01048282, + "balance_loss_clip": 1.03749239, + "balance_loss_mlp": 1.02652669, + "epoch": 0.14297309484443108, + "flos": 21137092571520.0, + "grad_norm": 1.7970711974713411, + "language_loss": 0.91515511, + "learning_rate": 3.867604793914382e-06, + "loss": 0.93667024, + "num_input_tokens_seen": 51548215, + "step": 2378, + "time_per_iteration": 2.6585280895233154 + }, + { + "auxiliary_loss_clip": 0.0111765, + "auxiliary_loss_mlp": 0.01044005, + "balance_loss_clip": 1.03937697, + "balance_loss_mlp": 1.02444267, + "epoch": 0.14303321809709904, + "flos": 23586667447680.0, + "grad_norm": 2.5522955766000845, + "language_loss": 0.73858833, + "learning_rate": 3.8674654133372864e-06, + "loss": 0.76020491, + "num_input_tokens_seen": 51566820, + "step": 2379, + "time_per_iteration": 2.717411756515503 + }, + { + "auxiliary_loss_clip": 0.01091197, + "auxiliary_loss_mlp": 0.01055707, + "balance_loss_clip": 1.03614187, + "balance_loss_mlp": 1.03570414, + "epoch": 0.14309334134976703, + "flos": 15888281875200.0, + "grad_norm": 1.862130276688129, + "language_loss": 0.78810662, + "learning_rate": 3.867325961945714e-06, + "loss": 0.80957568, + "num_input_tokens_seen": 51585075, + "step": 2380, + "time_per_iteration": 2.6520612239837646 + }, + { + "auxiliary_loss_clip": 0.01080099, + "auxiliary_loss_mlp": 0.01053931, + "balance_loss_clip": 1.0383203, + "balance_loss_mlp": 1.03342652, + "epoch": 0.143153464602435, + "flos": 16325601960960.0, + "grad_norm": 2.4286574189251966, + "language_loss": 0.88196605, + "learning_rate": 3.867186439744955e-06, + "loss": 0.90330637, + "num_input_tokens_seen": 51603185, + "step": 2381, + "time_per_iteration": 2.6870715618133545 + }, + { + "auxiliary_loss_clip": 0.01093986, + "auxiliary_loss_mlp": 0.01046651, + "balance_loss_clip": 1.0375371, + "balance_loss_mlp": 1.02669573, + "epoch": 0.14321358785510296, + "flos": 17092079303040.0, + "grad_norm": 2.6626948637439587, + "language_loss": 0.76637328, + "learning_rate": 3.867046846740299e-06, + "loss": 0.78777969, + "num_input_tokens_seen": 51620880, + "step": 2382, + "time_per_iteration": 2.736151695251465 + }, + { + "auxiliary_loss_clip": 0.01090339, + "auxiliary_loss_mlp": 0.01052976, + "balance_loss_clip": 1.03662014, + "balance_loss_mlp": 1.0331037, + "epoch": 0.14327371110777093, + "flos": 26322916769280.0, + "grad_norm": 1.8178318359520478, + "language_loss": 0.76507008, + "learning_rate": 3.866907182937039e-06, + "loss": 0.78650326, + "num_input_tokens_seen": 51640170, + "step": 2383, + "time_per_iteration": 2.7229576110839844 + }, + { + "auxiliary_loss_clip": 0.01092097, + "auxiliary_loss_mlp": 0.01054282, + "balance_loss_clip": 1.0361613, + "balance_loss_mlp": 1.03231215, + "epoch": 0.1433338343604389, + "flos": 18076462502400.0, + "grad_norm": 3.9307945226893786, + "language_loss": 0.8757382, + "learning_rate": 3.866767448340471e-06, + "loss": 0.89720201, + "num_input_tokens_seen": 51656580, + "step": 2384, + "time_per_iteration": 2.7009596824645996 + }, + { + "auxiliary_loss_clip": 0.01119605, + "auxiliary_loss_mlp": 0.01050104, + "balance_loss_clip": 1.04092526, + "balance_loss_mlp": 1.02812147, + "epoch": 0.14339395761310686, + "flos": 15522783033600.0, + "grad_norm": 2.3767304784570498, + "language_loss": 0.79613137, + "learning_rate": 3.866627642955895e-06, + "loss": 0.81782842, + "num_input_tokens_seen": 51674645, + "step": 2385, + "time_per_iteration": 2.6814887523651123 + }, + { + "auxiliary_loss_clip": 0.01107978, + "auxiliary_loss_mlp": 0.01048766, + "balance_loss_clip": 1.03657293, + "balance_loss_mlp": 1.02902532, + "epoch": 0.14345408086577485, + "flos": 28548767784960.0, + "grad_norm": 1.852204083852833, + "language_loss": 0.75149977, + "learning_rate": 3.866487766788612e-06, + "loss": 0.77306724, + "num_input_tokens_seen": 51695770, + "step": 2386, + "time_per_iteration": 2.6866190433502197 + }, + { + "auxiliary_loss_clip": 0.01124442, + "auxiliary_loss_mlp": 0.01043764, + "balance_loss_clip": 1.03927016, + "balance_loss_mlp": 1.02466631, + "epoch": 0.14351420411844282, + "flos": 20230061310720.0, + "grad_norm": 2.2542778903605662, + "language_loss": 0.78301036, + "learning_rate": 3.866347819843925e-06, + "loss": 0.80469239, + "num_input_tokens_seen": 51714165, + "step": 2387, + "time_per_iteration": 2.528275489807129 + }, + { + "auxiliary_loss_clip": 0.01098635, + "auxiliary_loss_mlp": 0.01051939, + "balance_loss_clip": 1.03665972, + "balance_loss_mlp": 1.03106534, + "epoch": 0.14357432737111078, + "flos": 19865029345920.0, + "grad_norm": 2.4673796175325995, + "language_loss": 0.82473236, + "learning_rate": 3.866207802127143e-06, + "loss": 0.84623814, + "num_input_tokens_seen": 51734440, + "step": 2388, + "time_per_iteration": 2.6341068744659424 + }, + { + "auxiliary_loss_clip": 0.01108041, + "auxiliary_loss_mlp": 0.01042739, + "balance_loss_clip": 1.03879261, + "balance_loss_mlp": 1.02353406, + "epoch": 0.14363445062377875, + "flos": 28256814040320.0, + "grad_norm": 1.9143272814282508, + "language_loss": 0.82013667, + "learning_rate": 3.866067713643573e-06, + "loss": 0.84164447, + "num_input_tokens_seen": 51753730, + "step": 2389, + "time_per_iteration": 2.739168643951416 + }, + { + "auxiliary_loss_clip": 0.01106022, + "auxiliary_loss_mlp": 0.01050093, + "balance_loss_clip": 1.03863418, + "balance_loss_mlp": 1.02950537, + "epoch": 0.1436945738764467, + "flos": 18186672407040.0, + "grad_norm": 2.057382457901879, + "language_loss": 0.83189678, + "learning_rate": 3.8659275543985285e-06, + "loss": 0.85345793, + "num_input_tokens_seen": 51771195, + "step": 2390, + "time_per_iteration": 2.7372958660125732 + }, + { + "auxiliary_loss_clip": 0.01116075, + "auxiliary_loss_mlp": 0.01050484, + "balance_loss_clip": 1.04186404, + "balance_loss_mlp": 1.03085005, + "epoch": 0.14375469712911468, + "flos": 27307910499840.0, + "grad_norm": 1.7819858459284776, + "language_loss": 0.74978369, + "learning_rate": 3.865787324397324e-06, + "loss": 0.77144933, + "num_input_tokens_seen": 51792290, + "step": 2391, + "time_per_iteration": 2.6966969966888428 + }, + { + "auxiliary_loss_clip": 0.00995369, + "auxiliary_loss_mlp": 0.01014028, + "balance_loss_clip": 1.00346208, + "balance_loss_mlp": 1.01142967, + "epoch": 0.14381482038178264, + "flos": 56891445287040.0, + "grad_norm": 0.8683978873087262, + "language_loss": 0.61779749, + "learning_rate": 3.865647023645277e-06, + "loss": 0.63789153, + "num_input_tokens_seen": 51843675, + "step": 2392, + "time_per_iteration": 3.060222864151001 + }, + { + "auxiliary_loss_clip": 0.0111743, + "auxiliary_loss_mlp": 0.01054315, + "balance_loss_clip": 1.0382514, + "balance_loss_mlp": 1.03242826, + "epoch": 0.14387494363445064, + "flos": 14282177143680.0, + "grad_norm": 2.0568449222517167, + "language_loss": 0.76500416, + "learning_rate": 3.865506652147709e-06, + "loss": 0.78672159, + "num_input_tokens_seen": 51860285, + "step": 2393, + "time_per_iteration": 2.6575417518615723 + }, + { + "auxiliary_loss_clip": 0.0112678, + "auxiliary_loss_mlp": 0.0105189, + "balance_loss_clip": 1.04065537, + "balance_loss_mlp": 1.03288794, + "epoch": 0.1439350668871186, + "flos": 26761493831040.0, + "grad_norm": 1.9845445596345592, + "language_loss": 0.76623058, + "learning_rate": 3.865366209909941e-06, + "loss": 0.78801727, + "num_input_tokens_seen": 51880105, + "step": 2394, + "time_per_iteration": 2.6729538440704346 + }, + { + "auxiliary_loss_clip": 0.01124072, + "auxiliary_loss_mlp": 0.01051566, + "balance_loss_clip": 1.0382576, + "balance_loss_mlp": 1.03213501, + "epoch": 0.14399519013978657, + "flos": 40700040537600.0, + "grad_norm": 1.4867570057239456, + "language_loss": 0.85896373, + "learning_rate": 3.8652256969372994e-06, + "loss": 0.88072014, + "num_input_tokens_seen": 51905175, + "step": 2395, + "time_per_iteration": 2.7726175785064697 + }, + { + "auxiliary_loss_clip": 0.01079865, + "auxiliary_loss_mlp": 0.01054097, + "balance_loss_clip": 1.03509665, + "balance_loss_mlp": 1.03391528, + "epoch": 0.14405531339245453, + "flos": 20557530627840.0, + "grad_norm": 1.531742326625273, + "language_loss": 0.83070648, + "learning_rate": 3.865085113235113e-06, + "loss": 0.85204613, + "num_input_tokens_seen": 51924490, + "step": 2396, + "time_per_iteration": 4.337973356246948 + }, + { + "auxiliary_loss_clip": 0.01082902, + "auxiliary_loss_mlp": 0.00748956, + "balance_loss_clip": 1.03396177, + "balance_loss_mlp": 1.00043869, + "epoch": 0.1441154366451225, + "flos": 19572931946880.0, + "grad_norm": 2.3655600002249617, + "language_loss": 0.82627386, + "learning_rate": 3.864944458808712e-06, + "loss": 0.84459245, + "num_input_tokens_seen": 51940490, + "step": 2397, + "time_per_iteration": 2.6809632778167725 + }, + { + "auxiliary_loss_clip": 0.01126758, + "auxiliary_loss_mlp": 0.01047722, + "balance_loss_clip": 1.03970909, + "balance_loss_mlp": 1.02810001, + "epoch": 0.14417555989779046, + "flos": 18515721922560.0, + "grad_norm": 1.6749611495857268, + "language_loss": 0.79895818, + "learning_rate": 3.86480373366343e-06, + "loss": 0.82070303, + "num_input_tokens_seen": 51957910, + "step": 2398, + "time_per_iteration": 2.5774288177490234 + }, + { + "auxiliary_loss_clip": 0.01109972, + "auxiliary_loss_mlp": 0.01053948, + "balance_loss_clip": 1.03799939, + "balance_loss_mlp": 1.03521979, + "epoch": 0.14423568315045843, + "flos": 26031681296640.0, + "grad_norm": 2.330137754535888, + "language_loss": 0.64179981, + "learning_rate": 3.864662937804603e-06, + "loss": 0.66343892, + "num_input_tokens_seen": 51978010, + "step": 2399, + "time_per_iteration": 2.6686623096466064 + }, + { + "auxiliary_loss_clip": 0.01087945, + "auxiliary_loss_mlp": 0.01051635, + "balance_loss_clip": 1.03513932, + "balance_loss_mlp": 1.03175139, + "epoch": 0.14429580640312642, + "flos": 21288743792640.0, + "grad_norm": 1.6018797723543219, + "language_loss": 0.82275927, + "learning_rate": 3.864522071237571e-06, + "loss": 0.84415507, + "num_input_tokens_seen": 51998515, + "step": 2400, + "time_per_iteration": 2.7551138401031494 + }, + { + "auxiliary_loss_clip": 0.01110378, + "auxiliary_loss_mlp": 0.01058154, + "balance_loss_clip": 1.04264855, + "balance_loss_mlp": 1.03666055, + "epoch": 0.14435592965579438, + "flos": 25627865621760.0, + "grad_norm": 1.6033647986279609, + "language_loss": 0.74711126, + "learning_rate": 3.864381133967676e-06, + "loss": 0.76879656, + "num_input_tokens_seen": 52019270, + "step": 2401, + "time_per_iteration": 2.805370807647705 + }, + { + "auxiliary_loss_clip": 0.01095449, + "auxiliary_loss_mlp": 0.01047589, + "balance_loss_clip": 1.03650498, + "balance_loss_mlp": 1.02883792, + "epoch": 0.14441605290846235, + "flos": 22965053656320.0, + "grad_norm": 1.4964196754689862, + "language_loss": 0.80866492, + "learning_rate": 3.86424012600026e-06, + "loss": 0.83009529, + "num_input_tokens_seen": 52039315, + "step": 2402, + "time_per_iteration": 4.467408895492554 + }, + { + "auxiliary_loss_clip": 0.01075841, + "auxiliary_loss_mlp": 0.01050608, + "balance_loss_clip": 1.03282309, + "balance_loss_mlp": 1.03090274, + "epoch": 0.14447617616113032, + "flos": 17347655548800.0, + "grad_norm": 2.2054374987346113, + "language_loss": 0.84683686, + "learning_rate": 3.864099047340673e-06, + "loss": 0.86810136, + "num_input_tokens_seen": 52056555, + "step": 2403, + "time_per_iteration": 6.141287803649902 + }, + { + "auxiliary_loss_clip": 0.01083533, + "auxiliary_loss_mlp": 0.00749143, + "balance_loss_clip": 1.03303182, + "balance_loss_mlp": 1.0004288, + "epoch": 0.14453629941379828, + "flos": 24060185464320.0, + "grad_norm": 1.8633695007149655, + "language_loss": 0.69847196, + "learning_rate": 3.863957897994262e-06, + "loss": 0.71679872, + "num_input_tokens_seen": 52075800, + "step": 2404, + "time_per_iteration": 2.7245566844940186 + }, + { + "auxiliary_loss_clip": 0.01097224, + "auxiliary_loss_mlp": 0.01052976, + "balance_loss_clip": 1.03491116, + "balance_loss_mlp": 1.03398573, + "epoch": 0.14459642266646625, + "flos": 14429554646400.0, + "grad_norm": 2.271946429715857, + "language_loss": 0.73838234, + "learning_rate": 3.863816677966381e-06, + "loss": 0.7598843, + "num_input_tokens_seen": 52092585, + "step": 2405, + "time_per_iteration": 2.8616700172424316 + }, + { + "auxiliary_loss_clip": 0.0105944, + "auxiliary_loss_mlp": 0.01047193, + "balance_loss_clip": 1.03043747, + "balance_loss_mlp": 1.02728486, + "epoch": 0.14465654591913424, + "flos": 9867032179200.0, + "grad_norm": 2.204906621752469, + "language_loss": 0.73513305, + "learning_rate": 3.863675387262386e-06, + "loss": 0.75619936, + "num_input_tokens_seen": 52108990, + "step": 2406, + "time_per_iteration": 2.679295778274536 + }, + { + "auxiliary_loss_clip": 0.01114871, + "auxiliary_loss_mlp": 0.01052951, + "balance_loss_clip": 1.03892338, + "balance_loss_mlp": 1.03173208, + "epoch": 0.1447166691718022, + "flos": 24972926987520.0, + "grad_norm": 3.127051356086633, + "language_loss": 0.75626129, + "learning_rate": 3.8635340258876325e-06, + "loss": 0.77793956, + "num_input_tokens_seen": 52125385, + "step": 2407, + "time_per_iteration": 2.584122657775879 + }, + { + "auxiliary_loss_clip": 0.01122747, + "auxiliary_loss_mlp": 0.01046349, + "balance_loss_clip": 1.03788614, + "balance_loss_mlp": 1.0275619, + "epoch": 0.14477679242447017, + "flos": 21908023200000.0, + "grad_norm": 1.7920017831419368, + "language_loss": 0.79317808, + "learning_rate": 3.8633925938474826e-06, + "loss": 0.81486905, + "num_input_tokens_seen": 52144985, + "step": 2408, + "time_per_iteration": 2.562160015106201 + }, + { + "auxiliary_loss_clip": 0.01116547, + "auxiliary_loss_mlp": 0.01053806, + "balance_loss_clip": 1.04203546, + "balance_loss_mlp": 1.03228879, + "epoch": 0.14483691567713813, + "flos": 20740746925440.0, + "grad_norm": 3.3771727939015186, + "language_loss": 0.82058239, + "learning_rate": 3.863251091147299e-06, + "loss": 0.84228593, + "num_input_tokens_seen": 52163885, + "step": 2409, + "time_per_iteration": 2.5730109214782715 + }, + { + "auxiliary_loss_clip": 0.01066626, + "auxiliary_loss_mlp": 0.01055917, + "balance_loss_clip": 1.03242314, + "balance_loss_mlp": 1.03541279, + "epoch": 0.1448970389298061, + "flos": 35407705536000.0, + "grad_norm": 1.9730753643319423, + "language_loss": 0.74684727, + "learning_rate": 3.863109517792446e-06, + "loss": 0.76807278, + "num_input_tokens_seen": 52184325, + "step": 2410, + "time_per_iteration": 2.9331512451171875 + }, + { + "auxiliary_loss_clip": 0.0112518, + "auxiliary_loss_mlp": 0.0104652, + "balance_loss_clip": 1.03989434, + "balance_loss_mlp": 1.02758932, + "epoch": 0.14495716218247406, + "flos": 15414368808960.0, + "grad_norm": 13.153415630439955, + "language_loss": 0.81022269, + "learning_rate": 3.8629678737882945e-06, + "loss": 0.8319397, + "num_input_tokens_seen": 52202740, + "step": 2411, + "time_per_iteration": 2.628406286239624 + }, + { + "auxiliary_loss_clip": 0.01100046, + "auxiliary_loss_mlp": 0.01052992, + "balance_loss_clip": 1.03993189, + "balance_loss_mlp": 1.03308415, + "epoch": 0.14501728543514203, + "flos": 33693222493440.0, + "grad_norm": 1.8984426567338477, + "language_loss": 0.69787252, + "learning_rate": 3.862826159140214e-06, + "loss": 0.71940285, + "num_input_tokens_seen": 52223100, + "step": 2412, + "time_per_iteration": 2.8360283374786377 + }, + { + "auxiliary_loss_clip": 0.01118557, + "auxiliary_loss_mlp": 0.01048747, + "balance_loss_clip": 1.04577494, + "balance_loss_mlp": 1.02831411, + "epoch": 0.14507740868781002, + "flos": 15596112648960.0, + "grad_norm": 1.8794897037592246, + "language_loss": 0.77125132, + "learning_rate": 3.862684373853579e-06, + "loss": 0.79292434, + "num_input_tokens_seen": 52239690, + "step": 2413, + "time_per_iteration": 2.652838945388794 + }, + { + "auxiliary_loss_clip": 0.01017556, + "auxiliary_loss_mlp": 0.01011796, + "balance_loss_clip": 1.00509512, + "balance_loss_mlp": 1.00972199, + "epoch": 0.145137531940478, + "flos": 66675343438080.0, + "grad_norm": 0.9152537527058981, + "language_loss": 0.58940619, + "learning_rate": 3.8625425179337656e-06, + "loss": 0.60969973, + "num_input_tokens_seen": 52296705, + "step": 2414, + "time_per_iteration": 3.1234848499298096 + }, + { + "auxiliary_loss_clip": 0.01005363, + "auxiliary_loss_mlp": 0.01008489, + "balance_loss_clip": 1.00341892, + "balance_loss_mlp": 1.00634277, + "epoch": 0.14519765519314595, + "flos": 67521578929920.0, + "grad_norm": 0.8477404362979363, + "language_loss": 0.62183905, + "learning_rate": 3.862400591386154e-06, + "loss": 0.64197755, + "num_input_tokens_seen": 52361830, + "step": 2415, + "time_per_iteration": 3.183547019958496 + }, + { + "auxiliary_loss_clip": 0.01109302, + "auxiliary_loss_mlp": 0.0104876, + "balance_loss_clip": 1.03783429, + "balance_loss_mlp": 1.02754092, + "epoch": 0.14525777844581392, + "flos": 17198913329280.0, + "grad_norm": 1.9964923793006786, + "language_loss": 0.72101104, + "learning_rate": 3.8622585942161245e-06, + "loss": 0.74259162, + "num_input_tokens_seen": 52379420, + "step": 2416, + "time_per_iteration": 2.604307174682617 + }, + { + "auxiliary_loss_clip": 0.00996054, + "auxiliary_loss_mlp": 0.01002303, + "balance_loss_clip": 1.00376177, + "balance_loss_mlp": 0.99988317, + "epoch": 0.14531790169848188, + "flos": 65404609015680.0, + "grad_norm": 0.709754216667338, + "language_loss": 0.60416764, + "learning_rate": 3.8621165264290635e-06, + "loss": 0.62415123, + "num_input_tokens_seen": 52446290, + "step": 2417, + "time_per_iteration": 3.3290934562683105 + }, + { + "auxiliary_loss_clip": 0.01126848, + "auxiliary_loss_mlp": 0.01050441, + "balance_loss_clip": 1.03813457, + "balance_loss_mlp": 1.02941251, + "epoch": 0.14537802495114985, + "flos": 32562467372160.0, + "grad_norm": 2.6028819143751103, + "language_loss": 0.79195487, + "learning_rate": 3.861974388030356e-06, + "loss": 0.81372774, + "num_input_tokens_seen": 52467295, + "step": 2418, + "time_per_iteration": 2.627027750015259 + }, + { + "auxiliary_loss_clip": 0.01067754, + "auxiliary_loss_mlp": 0.0105717, + "balance_loss_clip": 1.03078508, + "balance_loss_mlp": 1.03680897, + "epoch": 0.1454381482038178, + "flos": 20226685432320.0, + "grad_norm": 1.8156032863837197, + "language_loss": 0.7165513, + "learning_rate": 3.861832179025394e-06, + "loss": 0.73780048, + "num_input_tokens_seen": 52487295, + "step": 2419, + "time_per_iteration": 2.6939005851745605 + }, + { + "auxiliary_loss_clip": 0.01099348, + "auxiliary_loss_mlp": 0.01049141, + "balance_loss_clip": 1.03768229, + "balance_loss_mlp": 1.02780247, + "epoch": 0.1454982714564858, + "flos": 22893124671360.0, + "grad_norm": 2.363762163312259, + "language_loss": 0.89951944, + "learning_rate": 3.861689899419569e-06, + "loss": 0.9210043, + "num_input_tokens_seen": 52504220, + "step": 2420, + "time_per_iteration": 2.643911123275757 + }, + { + "auxiliary_loss_clip": 0.0111109, + "auxiliary_loss_mlp": 0.01051362, + "balance_loss_clip": 1.03725111, + "balance_loss_mlp": 1.03172851, + "epoch": 0.14555839470915377, + "flos": 20229845829120.0, + "grad_norm": 2.3084422713794814, + "language_loss": 0.82925546, + "learning_rate": 3.861547549218276e-06, + "loss": 0.85088003, + "num_input_tokens_seen": 52521900, + "step": 2421, + "time_per_iteration": 2.627946615219116 + }, + { + "auxiliary_loss_clip": 0.01055694, + "auxiliary_loss_mlp": 0.01054158, + "balance_loss_clip": 1.03068638, + "balance_loss_mlp": 1.03330851, + "epoch": 0.14561851796182174, + "flos": 22236282616320.0, + "grad_norm": 1.4703368638970442, + "language_loss": 0.81854928, + "learning_rate": 3.861405128426914e-06, + "loss": 0.83964777, + "num_input_tokens_seen": 52540495, + "step": 2422, + "time_per_iteration": 2.722416400909424 + }, + { + "auxiliary_loss_clip": 0.01018173, + "auxiliary_loss_mlp": 0.00748615, + "balance_loss_clip": 1.02723384, + "balance_loss_mlp": 1.00060833, + "epoch": 0.1456786412144897, + "flos": 52636786289280.0, + "grad_norm": 0.9043591555453663, + "language_loss": 0.63327414, + "learning_rate": 3.861262637050883e-06, + "loss": 0.65094203, + "num_input_tokens_seen": 52603305, + "step": 2423, + "time_per_iteration": 3.3691232204437256 + }, + { + "auxiliary_loss_clip": 0.01079432, + "auxiliary_loss_mlp": 0.00749109, + "balance_loss_clip": 1.04184604, + "balance_loss_mlp": 1.00050545, + "epoch": 0.14573876446715767, + "flos": 23221671396480.0, + "grad_norm": 1.7310444981692548, + "language_loss": 0.82688642, + "learning_rate": 3.861120075095585e-06, + "loss": 0.84517181, + "num_input_tokens_seen": 52623435, + "step": 2424, + "time_per_iteration": 2.762530565261841 + }, + { + "auxiliary_loss_clip": 0.0110363, + "auxiliary_loss_mlp": 0.01049966, + "balance_loss_clip": 1.03953862, + "balance_loss_mlp": 1.03038001, + "epoch": 0.14579888771982563, + "flos": 18114384286080.0, + "grad_norm": 2.2395065812100765, + "language_loss": 0.78594816, + "learning_rate": 3.860977442566429e-06, + "loss": 0.80748409, + "num_input_tokens_seen": 52642255, + "step": 2425, + "time_per_iteration": 2.8593854904174805 + }, + { + "auxiliary_loss_clip": 0.01115227, + "auxiliary_loss_mlp": 0.01047751, + "balance_loss_clip": 1.0410428, + "balance_loss_mlp": 1.02814126, + "epoch": 0.14585901097249362, + "flos": 23001107932800.0, + "grad_norm": 2.6054712951239507, + "language_loss": 0.83524454, + "learning_rate": 3.860834739468821e-06, + "loss": 0.85687435, + "num_input_tokens_seen": 52658700, + "step": 2426, + "time_per_iteration": 2.74167799949646 + }, + { + "auxiliary_loss_clip": 0.01124465, + "auxiliary_loss_mlp": 0.01046401, + "balance_loss_clip": 1.04200768, + "balance_loss_mlp": 1.02695775, + "epoch": 0.1459191342251616, + "flos": 21908669644800.0, + "grad_norm": 1.8085187977158106, + "language_loss": 0.8737402, + "learning_rate": 3.860691965808173e-06, + "loss": 0.89544886, + "num_input_tokens_seen": 52678140, + "step": 2427, + "time_per_iteration": 2.5815086364746094 + }, + { + "auxiliary_loss_clip": 0.01081547, + "auxiliary_loss_mlp": 0.0105241, + "balance_loss_clip": 1.03490317, + "balance_loss_mlp": 1.0300343, + "epoch": 0.14597925747782955, + "flos": 14975504438400.0, + "grad_norm": 1.6735101447827376, + "language_loss": 0.67117381, + "learning_rate": 3.8605491215899e-06, + "loss": 0.69251347, + "num_input_tokens_seen": 52696825, + "step": 2428, + "time_per_iteration": 2.6287026405334473 + }, + { + "auxiliary_loss_clip": 0.01113426, + "auxiliary_loss_mlp": 0.01051801, + "balance_loss_clip": 1.03901887, + "balance_loss_mlp": 1.03179812, + "epoch": 0.14603938073049752, + "flos": 21068898600960.0, + "grad_norm": 1.7296988583734518, + "language_loss": 0.83212793, + "learning_rate": 3.860406206819417e-06, + "loss": 0.85378027, + "num_input_tokens_seen": 52715125, + "step": 2429, + "time_per_iteration": 2.643005132675171 + }, + { + "auxiliary_loss_clip": 0.01078409, + "auxiliary_loss_mlp": 0.01049199, + "balance_loss_clip": 1.03292251, + "balance_loss_mlp": 1.03065038, + "epoch": 0.14609950398316549, + "flos": 19864777950720.0, + "grad_norm": 1.6784997659921113, + "language_loss": 0.78915155, + "learning_rate": 3.860263221502145e-06, + "loss": 0.81042767, + "num_input_tokens_seen": 52734015, + "step": 2430, + "time_per_iteration": 2.633965492248535 + }, + { + "auxiliary_loss_clip": 0.01130812, + "auxiliary_loss_mlp": 0.01048641, + "balance_loss_clip": 1.04469752, + "balance_loss_mlp": 1.0287571, + "epoch": 0.14615962723583345, + "flos": 22418852469120.0, + "grad_norm": 3.1137534972122656, + "language_loss": 0.82633835, + "learning_rate": 3.860120165643504e-06, + "loss": 0.84813285, + "num_input_tokens_seen": 52753025, + "step": 2431, + "time_per_iteration": 2.5511560440063477 + }, + { + "auxiliary_loss_clip": 0.0112062, + "auxiliary_loss_mlp": 0.01058269, + "balance_loss_clip": 1.0411489, + "balance_loss_mlp": 1.03695416, + "epoch": 0.14621975048850142, + "flos": 22346241125760.0, + "grad_norm": 2.361630717868701, + "language_loss": 0.78574002, + "learning_rate": 3.859977039248921e-06, + "loss": 0.80752885, + "num_input_tokens_seen": 52773420, + "step": 2432, + "time_per_iteration": 2.6105058193206787 + }, + { + "auxiliary_loss_clip": 0.01125883, + "auxiliary_loss_mlp": 0.00749258, + "balance_loss_clip": 1.04098272, + "balance_loss_mlp": 1.00058055, + "epoch": 0.1462798737411694, + "flos": 24389163152640.0, + "grad_norm": 1.9661718485937398, + "language_loss": 0.79738533, + "learning_rate": 3.859833842323822e-06, + "loss": 0.81613672, + "num_input_tokens_seen": 52792870, + "step": 2433, + "time_per_iteration": 2.5862526893615723 + }, + { + "auxiliary_loss_clip": 0.01091165, + "auxiliary_loss_mlp": 0.01057215, + "balance_loss_clip": 1.0430038, + "balance_loss_mlp": 1.03626955, + "epoch": 0.14633999699383737, + "flos": 19244672530560.0, + "grad_norm": 1.9321456330667617, + "language_loss": 0.78411072, + "learning_rate": 3.859690574873638e-06, + "loss": 0.80559456, + "num_input_tokens_seen": 52811615, + "step": 2434, + "time_per_iteration": 2.7235946655273438 + }, + { + "auxiliary_loss_clip": 0.01003729, + "auxiliary_loss_mlp": 0.01028115, + "balance_loss_clip": 1.01038265, + "balance_loss_mlp": 1.02587438, + "epoch": 0.14640012024650534, + "flos": 62660638270080.0, + "grad_norm": 0.8435806309317307, + "language_loss": 0.58399588, + "learning_rate": 3.8595472369038e-06, + "loss": 0.60431433, + "num_input_tokens_seen": 52873230, + "step": 2435, + "time_per_iteration": 3.2831785678863525 + }, + { + "auxiliary_loss_clip": 0.01122918, + "auxiliary_loss_mlp": 0.01046354, + "balance_loss_clip": 1.04079497, + "balance_loss_mlp": 1.02741134, + "epoch": 0.1464602434991733, + "flos": 12276243146880.0, + "grad_norm": 2.1821370312894666, + "language_loss": 0.88696945, + "learning_rate": 3.859403828419744e-06, + "loss": 0.90866208, + "num_input_tokens_seen": 52889325, + "step": 2436, + "time_per_iteration": 2.530792713165283 + }, + { + "auxiliary_loss_clip": 0.01116868, + "auxiliary_loss_mlp": 0.00749135, + "balance_loss_clip": 1.04239607, + "balance_loss_mlp": 1.00058126, + "epoch": 0.14652036675184127, + "flos": 20922311197440.0, + "grad_norm": 1.942567232356588, + "language_loss": 0.75115645, + "learning_rate": 3.85926034942691e-06, + "loss": 0.76981652, + "num_input_tokens_seen": 52909705, + "step": 2437, + "time_per_iteration": 2.7113804817199707 + }, + { + "auxiliary_loss_clip": 0.01129946, + "auxiliary_loss_mlp": 0.01051647, + "balance_loss_clip": 1.04269743, + "balance_loss_mlp": 1.02946198, + "epoch": 0.14658049000450923, + "flos": 27703681528320.0, + "grad_norm": 2.248702942109598, + "language_loss": 0.7329675, + "learning_rate": 3.859116799930736e-06, + "loss": 0.75478345, + "num_input_tokens_seen": 52930300, + "step": 2438, + "time_per_iteration": 2.709806442260742 + }, + { + "auxiliary_loss_clip": 0.01117817, + "auxiliary_loss_mlp": 0.01048549, + "balance_loss_clip": 1.04494333, + "balance_loss_mlp": 1.03001237, + "epoch": 0.14664061325717723, + "flos": 24936513575040.0, + "grad_norm": 2.1936643936174978, + "language_loss": 0.74409312, + "learning_rate": 3.858973179936668e-06, + "loss": 0.76575673, + "num_input_tokens_seen": 52949955, + "step": 2439, + "time_per_iteration": 2.778987407684326 + }, + { + "auxiliary_loss_clip": 0.01114671, + "auxiliary_loss_mlp": 0.01056217, + "balance_loss_clip": 1.04209018, + "balance_loss_mlp": 1.03605855, + "epoch": 0.1467007365098452, + "flos": 40297661406720.0, + "grad_norm": 1.8222666737796638, + "language_loss": 0.7457186, + "learning_rate": 3.85882948945015e-06, + "loss": 0.76742744, + "num_input_tokens_seen": 52972905, + "step": 2440, + "time_per_iteration": 2.824841260910034 + }, + { + "auxiliary_loss_clip": 0.01122917, + "auxiliary_loss_mlp": 0.01054074, + "balance_loss_clip": 1.04145205, + "balance_loss_mlp": 1.03507209, + "epoch": 0.14676085976251316, + "flos": 26541074021760.0, + "grad_norm": 1.520778688889431, + "language_loss": 0.83006966, + "learning_rate": 3.85868572847663e-06, + "loss": 0.85183954, + "num_input_tokens_seen": 52994850, + "step": 2441, + "time_per_iteration": 2.5935325622558594 + }, + { + "auxiliary_loss_clip": 0.01122013, + "auxiliary_loss_mlp": 0.01052839, + "balance_loss_clip": 1.04502106, + "balance_loss_mlp": 1.03144085, + "epoch": 0.14682098301518112, + "flos": 23550110380800.0, + "grad_norm": 2.3660668238804026, + "language_loss": 0.71341991, + "learning_rate": 3.858541897021563e-06, + "loss": 0.73516846, + "num_input_tokens_seen": 53014740, + "step": 2442, + "time_per_iteration": 2.636371612548828 + }, + { + "auxiliary_loss_clip": 0.01097187, + "auxiliary_loss_mlp": 0.01050663, + "balance_loss_clip": 1.04247642, + "balance_loss_mlp": 1.03055203, + "epoch": 0.1468811062678491, + "flos": 11651073909120.0, + "grad_norm": 3.13981602107041, + "language_loss": 0.80663872, + "learning_rate": 3.8583979950904e-06, + "loss": 0.82811725, + "num_input_tokens_seen": 53029780, + "step": 2443, + "time_per_iteration": 4.24943995475769 + }, + { + "auxiliary_loss_clip": 0.01106543, + "auxiliary_loss_mlp": 0.01057941, + "balance_loss_clip": 1.0414834, + "balance_loss_mlp": 1.0362215, + "epoch": 0.14694122952051705, + "flos": 23002616304000.0, + "grad_norm": 1.636792307115119, + "language_loss": 0.82803273, + "learning_rate": 3.858254022688599e-06, + "loss": 0.84967756, + "num_input_tokens_seen": 53048620, + "step": 2444, + "time_per_iteration": 2.688391923904419 + }, + { + "auxiliary_loss_clip": 0.01101347, + "auxiliary_loss_mlp": 0.01059134, + "balance_loss_clip": 1.04207325, + "balance_loss_mlp": 1.03964305, + "epoch": 0.14700135277318502, + "flos": 26502972670080.0, + "grad_norm": 1.485573721093736, + "language_loss": 0.71019244, + "learning_rate": 3.85810997982162e-06, + "loss": 0.73179722, + "num_input_tokens_seen": 53070055, + "step": 2445, + "time_per_iteration": 2.837233781814575 + }, + { + "auxiliary_loss_clip": 0.01026083, + "auxiliary_loss_mlp": 0.0101078, + "balance_loss_clip": 1.0040617, + "balance_loss_mlp": 1.00890875, + "epoch": 0.147061476025853, + "flos": 59449434387840.0, + "grad_norm": 0.8363020409378822, + "language_loss": 0.63123703, + "learning_rate": 3.857965866494923e-06, + "loss": 0.65160573, + "num_input_tokens_seen": 53126945, + "step": 2446, + "time_per_iteration": 3.197559356689453 + }, + { + "auxiliary_loss_clip": 0.01078668, + "auxiliary_loss_mlp": 0.01049049, + "balance_loss_clip": 1.03796625, + "balance_loss_mlp": 1.02867627, + "epoch": 0.14712159927852098, + "flos": 28330897841280.0, + "grad_norm": 1.5008254157890553, + "language_loss": 0.74861717, + "learning_rate": 3.857821682713975e-06, + "loss": 0.76989436, + "num_input_tokens_seen": 53149130, + "step": 2447, + "time_per_iteration": 2.8706347942352295 + }, + { + "auxiliary_loss_clip": 0.01121652, + "auxiliary_loss_mlp": 0.01043781, + "balance_loss_clip": 1.03864837, + "balance_loss_mlp": 1.02483904, + "epoch": 0.14718172253118894, + "flos": 27089825074560.0, + "grad_norm": 2.1172292789340865, + "language_loss": 0.85873842, + "learning_rate": 3.857677428484242e-06, + "loss": 0.88039267, + "num_input_tokens_seen": 53167120, + "step": 2448, + "time_per_iteration": 2.6403465270996094 + }, + { + "auxiliary_loss_clip": 0.01023857, + "auxiliary_loss_mlp": 0.01003828, + "balance_loss_clip": 1.00185537, + "balance_loss_mlp": 1.00192106, + "epoch": 0.1472418457838569, + "flos": 66706764860160.0, + "grad_norm": 0.7661445924111081, + "language_loss": 0.56823385, + "learning_rate": 3.857533103811195e-06, + "loss": 0.58851063, + "num_input_tokens_seen": 53227945, + "step": 2449, + "time_per_iteration": 3.0984907150268555 + }, + { + "auxiliary_loss_clip": 0.0108928, + "auxiliary_loss_mlp": 0.01046128, + "balance_loss_clip": 1.03606737, + "balance_loss_mlp": 1.02698326, + "epoch": 0.14730196903652487, + "flos": 19573578391680.0, + "grad_norm": 1.7530435042602113, + "language_loss": 0.84888011, + "learning_rate": 3.857388708700307e-06, + "loss": 0.87023425, + "num_input_tokens_seen": 53244615, + "step": 2450, + "time_per_iteration": 7.620276689529419 + }, + { + "auxiliary_loss_clip": 0.01110978, + "auxiliary_loss_mlp": 0.010565, + "balance_loss_clip": 1.0368098, + "balance_loss_mlp": 1.03610349, + "epoch": 0.14736209228919284, + "flos": 16071031296000.0, + "grad_norm": 2.1776170109469923, + "language_loss": 0.74930191, + "learning_rate": 3.857244243157052e-06, + "loss": 0.77097672, + "num_input_tokens_seen": 53262205, + "step": 2451, + "time_per_iteration": 2.6205286979675293 + }, + { + "auxiliary_loss_clip": 0.01087793, + "auxiliary_loss_mlp": 0.01040749, + "balance_loss_clip": 1.03386354, + "balance_loss_mlp": 1.02279568, + "epoch": 0.1474222155418608, + "flos": 23039460679680.0, + "grad_norm": 1.6429868752024162, + "language_loss": 0.82284999, + "learning_rate": 3.85709970718691e-06, + "loss": 0.8441354, + "num_input_tokens_seen": 53282445, + "step": 2452, + "time_per_iteration": 2.62549090385437 + }, + { + "auxiliary_loss_clip": 0.01059339, + "auxiliary_loss_mlp": 0.01038824, + "balance_loss_clip": 1.04057932, + "balance_loss_mlp": 1.02094281, + "epoch": 0.1474823387945288, + "flos": 17018641946880.0, + "grad_norm": 1.532914697617229, + "language_loss": 0.73929632, + "learning_rate": 3.856955100795361e-06, + "loss": 0.76027799, + "num_input_tokens_seen": 53299060, + "step": 2453, + "time_per_iteration": 2.8596084117889404 + }, + { + "auxiliary_loss_clip": 0.01103125, + "auxiliary_loss_mlp": 0.01050466, + "balance_loss_clip": 1.03961039, + "balance_loss_mlp": 1.03061771, + "epoch": 0.14754246204719676, + "flos": 17895041884800.0, + "grad_norm": 2.24727899716957, + "language_loss": 0.75782663, + "learning_rate": 3.856810423987889e-06, + "loss": 0.77936256, + "num_input_tokens_seen": 53315970, + "step": 2454, + "time_per_iteration": 2.73248291015625 + }, + { + "auxiliary_loss_clip": 0.0110598, + "auxiliary_loss_mlp": 0.01044323, + "balance_loss_clip": 1.03909135, + "balance_loss_mlp": 1.02536845, + "epoch": 0.14760258529986472, + "flos": 13079097987840.0, + "grad_norm": 5.313096829653563, + "language_loss": 0.83231777, + "learning_rate": 3.856665676769979e-06, + "loss": 0.8538208, + "num_input_tokens_seen": 53332940, + "step": 2455, + "time_per_iteration": 2.617842435836792 + }, + { + "auxiliary_loss_clip": 0.01088342, + "auxiliary_loss_mlp": 0.01048137, + "balance_loss_clip": 1.03986979, + "balance_loss_mlp": 1.02853894, + "epoch": 0.1476627085525327, + "flos": 30806399358720.0, + "grad_norm": 3.059912431577015, + "language_loss": 0.84453857, + "learning_rate": 3.85652085914712e-06, + "loss": 0.86590338, + "num_input_tokens_seen": 53353295, + "step": 2456, + "time_per_iteration": 2.806432008743286 + }, + { + "auxiliary_loss_clip": 0.01109984, + "auxiliary_loss_mlp": 0.01039458, + "balance_loss_clip": 1.03951001, + "balance_loss_mlp": 1.02021718, + "epoch": 0.14772283180520066, + "flos": 21689434984320.0, + "grad_norm": 2.9178837890145672, + "language_loss": 0.84158283, + "learning_rate": 3.856375971124805e-06, + "loss": 0.86307722, + "num_input_tokens_seen": 53373410, + "step": 2457, + "time_per_iteration": 2.6281447410583496 + }, + { + "auxiliary_loss_clip": 0.01111815, + "auxiliary_loss_mlp": 0.01041494, + "balance_loss_clip": 1.04090238, + "balance_loss_mlp": 1.02307582, + "epoch": 0.14778295505786862, + "flos": 18770400328320.0, + "grad_norm": 2.0727317039819804, + "language_loss": 0.75638038, + "learning_rate": 3.856231012708527e-06, + "loss": 0.77791345, + "num_input_tokens_seen": 53391430, + "step": 2458, + "time_per_iteration": 2.640674114227295 + }, + { + "auxiliary_loss_clip": 0.0107769, + "auxiliary_loss_mlp": 0.01053638, + "balance_loss_clip": 1.03954411, + "balance_loss_mlp": 1.03233576, + "epoch": 0.1478430783105366, + "flos": 22893555634560.0, + "grad_norm": 1.9724254930368823, + "language_loss": 0.83539855, + "learning_rate": 3.856085983903782e-06, + "loss": 0.85671186, + "num_input_tokens_seen": 53409960, + "step": 2459, + "time_per_iteration": 2.7894446849823 + }, + { + "auxiliary_loss_clip": 0.01086485, + "auxiliary_loss_mlp": 0.01043539, + "balance_loss_clip": 1.03589153, + "balance_loss_mlp": 1.02456045, + "epoch": 0.14790320156320458, + "flos": 15085319293440.0, + "grad_norm": 3.3583826059868156, + "language_loss": 0.7522378, + "learning_rate": 3.855940884716071e-06, + "loss": 0.77353805, + "num_input_tokens_seen": 53426160, + "step": 2460, + "time_per_iteration": 2.679151773452759 + }, + { + "auxiliary_loss_clip": 0.01095937, + "auxiliary_loss_mlp": 0.01044441, + "balance_loss_clip": 1.04069042, + "balance_loss_mlp": 1.02459252, + "epoch": 0.14796332481587254, + "flos": 26504768350080.0, + "grad_norm": 1.6094160369766886, + "language_loss": 0.81545478, + "learning_rate": 3.855795715150896e-06, + "loss": 0.83685863, + "num_input_tokens_seen": 53448530, + "step": 2461, + "time_per_iteration": 2.718417167663574 + }, + { + "auxiliary_loss_clip": 0.01114778, + "auxiliary_loss_mlp": 0.01047506, + "balance_loss_clip": 1.04044294, + "balance_loss_mlp": 1.02703762, + "epoch": 0.1480234480685405, + "flos": 17563191108480.0, + "grad_norm": 2.1326961433290546, + "language_loss": 0.65988225, + "learning_rate": 3.855650475213761e-06, + "loss": 0.68150508, + "num_input_tokens_seen": 53465915, + "step": 2462, + "time_per_iteration": 2.686859607696533 + }, + { + "auxiliary_loss_clip": 0.01099129, + "auxiliary_loss_mlp": 0.0104918, + "balance_loss_clip": 1.03880763, + "balance_loss_mlp": 1.02961755, + "epoch": 0.14808357132120847, + "flos": 53582203232640.0, + "grad_norm": 1.6572211830011554, + "language_loss": 0.67116797, + "learning_rate": 3.8555051649101745e-06, + "loss": 0.69265109, + "num_input_tokens_seen": 53496055, + "step": 2463, + "time_per_iteration": 3.037108898162842 + }, + { + "auxiliary_loss_clip": 0.0111127, + "auxiliary_loss_mlp": 0.01049972, + "balance_loss_clip": 1.03703403, + "balance_loss_mlp": 1.0299567, + "epoch": 0.14814369457387644, + "flos": 19829190551040.0, + "grad_norm": 2.8193766189246565, + "language_loss": 0.76649392, + "learning_rate": 3.855359784245646e-06, + "loss": 0.78810632, + "num_input_tokens_seen": 53513790, + "step": 2464, + "time_per_iteration": 2.6424615383148193 + }, + { + "auxiliary_loss_clip": 0.01087387, + "auxiliary_loss_mlp": 0.01050675, + "balance_loss_clip": 1.03567505, + "balance_loss_mlp": 1.03224516, + "epoch": 0.1482038178265444, + "flos": 23914962777600.0, + "grad_norm": 1.5854663533994255, + "language_loss": 0.79509878, + "learning_rate": 3.855214333225688e-06, + "loss": 0.81647933, + "num_input_tokens_seen": 53533410, + "step": 2465, + "time_per_iteration": 2.686262369155884 + }, + { + "auxiliary_loss_clip": 0.01129689, + "auxiliary_loss_mlp": 0.01045754, + "balance_loss_clip": 1.04171443, + "balance_loss_mlp": 1.02600074, + "epoch": 0.1482639410792124, + "flos": 24170503109760.0, + "grad_norm": 1.579089311191031, + "language_loss": 0.76331729, + "learning_rate": 3.855068811855817e-06, + "loss": 0.78507173, + "num_input_tokens_seen": 53554775, + "step": 2466, + "time_per_iteration": 2.6223719120025635 + }, + { + "auxiliary_loss_clip": 0.00975871, + "auxiliary_loss_mlp": 0.0100155, + "balance_loss_clip": 1.00665081, + "balance_loss_mlp": 0.99960709, + "epoch": 0.14832406433188036, + "flos": 66191051341440.0, + "grad_norm": 0.7848622004296375, + "language_loss": 0.60069352, + "learning_rate": 3.854923220141551e-06, + "loss": 0.62046772, + "num_input_tokens_seen": 53609675, + "step": 2467, + "time_per_iteration": 3.4628255367279053 + }, + { + "auxiliary_loss_clip": 0.01097608, + "auxiliary_loss_mlp": 0.01040775, + "balance_loss_clip": 1.03585553, + "balance_loss_mlp": 1.0217607, + "epoch": 0.14838418758454833, + "flos": 25411252654080.0, + "grad_norm": 1.9623796267209108, + "language_loss": 0.88190162, + "learning_rate": 3.85477755808841e-06, + "loss": 0.9032855, + "num_input_tokens_seen": 53626950, + "step": 2468, + "time_per_iteration": 2.8293423652648926 + }, + { + "auxiliary_loss_clip": 0.01087056, + "auxiliary_loss_mlp": 0.010478, + "balance_loss_clip": 1.03535652, + "balance_loss_mlp": 1.02703404, + "epoch": 0.1484443108372163, + "flos": 23289901280640.0, + "grad_norm": 2.5277358208323566, + "language_loss": 0.76052654, + "learning_rate": 3.854631825701919e-06, + "loss": 0.78187507, + "num_input_tokens_seen": 53644200, + "step": 2469, + "time_per_iteration": 2.6411287784576416 + }, + { + "auxiliary_loss_clip": 0.01091035, + "auxiliary_loss_mlp": 0.01048591, + "balance_loss_clip": 1.03735352, + "balance_loss_mlp": 1.02870715, + "epoch": 0.14850443408988426, + "flos": 14647675985280.0, + "grad_norm": 5.266810279602295, + "language_loss": 0.75590861, + "learning_rate": 3.854486022987603e-06, + "loss": 0.77730489, + "num_input_tokens_seen": 53659650, + "step": 2470, + "time_per_iteration": 2.6254093647003174 + }, + { + "auxiliary_loss_clip": 0.01119434, + "auxiliary_loss_mlp": 0.01045641, + "balance_loss_clip": 1.03786111, + "balance_loss_mlp": 1.02610242, + "epoch": 0.14856455734255222, + "flos": 23548314700800.0, + "grad_norm": 2.1544935169920807, + "language_loss": 0.72195923, + "learning_rate": 3.8543401499509905e-06, + "loss": 0.74361002, + "num_input_tokens_seen": 53680275, + "step": 2471, + "time_per_iteration": 2.578871488571167 + }, + { + "auxiliary_loss_clip": 0.01089742, + "auxiliary_loss_mlp": 0.01053547, + "balance_loss_clip": 1.03572512, + "balance_loss_mlp": 1.03222096, + "epoch": 0.1486246805952202, + "flos": 18077288515200.0, + "grad_norm": 1.9535947742908653, + "language_loss": 0.89843541, + "learning_rate": 3.854194206597615e-06, + "loss": 0.91986829, + "num_input_tokens_seen": 53698270, + "step": 2472, + "time_per_iteration": 2.6363108158111572 + }, + { + "auxiliary_loss_clip": 0.01081487, + "auxiliary_loss_mlp": 0.01049395, + "balance_loss_clip": 1.03549886, + "balance_loss_mlp": 1.02871275, + "epoch": 0.14868480384788818, + "flos": 19353625459200.0, + "grad_norm": 2.1630677112984977, + "language_loss": 0.80697018, + "learning_rate": 3.854048192933008e-06, + "loss": 0.82827902, + "num_input_tokens_seen": 53716845, + "step": 2473, + "time_per_iteration": 2.6330182552337646 + }, + { + "auxiliary_loss_clip": 0.01118211, + "auxiliary_loss_mlp": 0.01054511, + "balance_loss_clip": 1.04012382, + "balance_loss_mlp": 1.03499627, + "epoch": 0.14874492710055615, + "flos": 22200192426240.0, + "grad_norm": 2.59640903696701, + "language_loss": 0.77431452, + "learning_rate": 3.853902108962709e-06, + "loss": 0.79604173, + "num_input_tokens_seen": 53734970, + "step": 2474, + "time_per_iteration": 2.6280858516693115 + }, + { + "auxiliary_loss_clip": 0.01093601, + "auxiliary_loss_mlp": 0.01050875, + "balance_loss_clip": 1.04164016, + "balance_loss_mlp": 1.03016782, + "epoch": 0.1488050503532241, + "flos": 21103444506240.0, + "grad_norm": 2.209660513332006, + "language_loss": 0.8226589, + "learning_rate": 3.853755954692255e-06, + "loss": 0.84410363, + "num_input_tokens_seen": 53753415, + "step": 2475, + "time_per_iteration": 2.678272008895874 + }, + { + "auxiliary_loss_clip": 0.0106565, + "auxiliary_loss_mlp": 0.01052749, + "balance_loss_clip": 1.0376302, + "balance_loss_mlp": 1.03321075, + "epoch": 0.14886517360589208, + "flos": 12786569625600.0, + "grad_norm": 1.7926805645982296, + "language_loss": 0.80740839, + "learning_rate": 3.85360973012719e-06, + "loss": 0.82859242, + "num_input_tokens_seen": 53770305, + "step": 2476, + "time_per_iteration": 2.7660295963287354 + }, + { + "auxiliary_loss_clip": 0.01106892, + "auxiliary_loss_mlp": 0.01046505, + "balance_loss_clip": 1.03828979, + "balance_loss_mlp": 1.02827811, + "epoch": 0.14892529685856004, + "flos": 29022860419200.0, + "grad_norm": 1.7800479802944387, + "language_loss": 0.77927625, + "learning_rate": 3.853463435273058e-06, + "loss": 0.80081022, + "num_input_tokens_seen": 53788895, + "step": 2477, + "time_per_iteration": 2.6593568325042725 + }, + { + "auxiliary_loss_clip": 0.01003457, + "auxiliary_loss_mlp": 0.01003886, + "balance_loss_clip": 1.00266981, + "balance_loss_mlp": 1.00194263, + "epoch": 0.148985420111228, + "flos": 61926121054080.0, + "grad_norm": 0.8220043904391855, + "language_loss": 0.60150266, + "learning_rate": 3.853317070135407e-06, + "loss": 0.62157607, + "num_input_tokens_seen": 53850260, + "step": 2478, + "time_per_iteration": 3.325594902038574 + }, + { + "auxiliary_loss_clip": 0.01063236, + "auxiliary_loss_mlp": 0.01047119, + "balance_loss_clip": 1.0344919, + "balance_loss_mlp": 1.02840328, + "epoch": 0.149045543363896, + "flos": 23915106432000.0, + "grad_norm": 2.3749759123890377, + "language_loss": 0.71137989, + "learning_rate": 3.853170634719787e-06, + "loss": 0.73248345, + "num_input_tokens_seen": 53867520, + "step": 2479, + "time_per_iteration": 2.797619104385376 + }, + { + "auxiliary_loss_clip": 0.01101737, + "auxiliary_loss_mlp": 0.01048896, + "balance_loss_clip": 1.03788304, + "balance_loss_mlp": 1.02892816, + "epoch": 0.14910566661656396, + "flos": 23654394541440.0, + "grad_norm": 1.4989021431951077, + "language_loss": 0.80710524, + "learning_rate": 3.853024129031751e-06, + "loss": 0.82861155, + "num_input_tokens_seen": 53886620, + "step": 2480, + "time_per_iteration": 2.802222490310669 + }, + { + "auxiliary_loss_clip": 0.01098997, + "auxiliary_loss_mlp": 0.01050623, + "balance_loss_clip": 1.04011798, + "balance_loss_mlp": 1.03066754, + "epoch": 0.14916578986923193, + "flos": 20515299212160.0, + "grad_norm": 2.388995157421833, + "language_loss": 0.84190977, + "learning_rate": 3.852877553076854e-06, + "loss": 0.86340594, + "num_input_tokens_seen": 53902230, + "step": 2481, + "time_per_iteration": 2.7300667762756348 + }, + { + "auxiliary_loss_clip": 0.01111903, + "auxiliary_loss_mlp": 0.01055783, + "balance_loss_clip": 1.03939307, + "balance_loss_mlp": 1.03413427, + "epoch": 0.1492259131218999, + "flos": 22491822948480.0, + "grad_norm": 8.432839787793137, + "language_loss": 0.77769268, + "learning_rate": 3.8527309068606546e-06, + "loss": 0.79936951, + "num_input_tokens_seen": 53919475, + "step": 2482, + "time_per_iteration": 2.7774746417999268 + }, + { + "auxiliary_loss_clip": 0.01097012, + "auxiliary_loss_mlp": 0.01043607, + "balance_loss_clip": 1.0386951, + "balance_loss_mlp": 1.02265024, + "epoch": 0.14928603637456786, + "flos": 23185868515200.0, + "grad_norm": 2.309646264458803, + "language_loss": 0.78073317, + "learning_rate": 3.852584190388713e-06, + "loss": 0.8021394, + "num_input_tokens_seen": 53939150, + "step": 2483, + "time_per_iteration": 2.7315549850463867 + }, + { + "auxiliary_loss_clip": 0.0110864, + "auxiliary_loss_mlp": 0.0074894, + "balance_loss_clip": 1.03816152, + "balance_loss_mlp": 1.00048447, + "epoch": 0.14934615962723582, + "flos": 21653237053440.0, + "grad_norm": 1.4865356364685158, + "language_loss": 0.7040472, + "learning_rate": 3.852437403666595e-06, + "loss": 0.72262299, + "num_input_tokens_seen": 53958735, + "step": 2484, + "time_per_iteration": 2.670778512954712 + }, + { + "auxiliary_loss_clip": 0.01102217, + "auxiliary_loss_mlp": 0.00749083, + "balance_loss_clip": 1.03751743, + "balance_loss_mlp": 1.00038528, + "epoch": 0.1494062828799038, + "flos": 27010066924800.0, + "grad_norm": 2.345774565594429, + "language_loss": 0.84726608, + "learning_rate": 3.852290546699863e-06, + "loss": 0.86577916, + "num_input_tokens_seen": 53975065, + "step": 2485, + "time_per_iteration": 2.6807172298431396 + }, + { + "auxiliary_loss_clip": 0.01102504, + "auxiliary_loss_mlp": 0.01047938, + "balance_loss_clip": 1.03870833, + "balance_loss_mlp": 1.02707684, + "epoch": 0.14946640613257178, + "flos": 21214947300480.0, + "grad_norm": 2.274608331619074, + "language_loss": 0.84632361, + "learning_rate": 3.8521436194940894e-06, + "loss": 0.86782801, + "num_input_tokens_seen": 53993330, + "step": 2486, + "time_per_iteration": 2.626779079437256 + }, + { + "auxiliary_loss_clip": 0.01110322, + "auxiliary_loss_mlp": 0.01044583, + "balance_loss_clip": 1.03832376, + "balance_loss_mlp": 1.02735758, + "epoch": 0.14952652938523975, + "flos": 13370872164480.0, + "grad_norm": 2.716383034522316, + "language_loss": 0.747572, + "learning_rate": 3.851996622054842e-06, + "loss": 0.76912105, + "num_input_tokens_seen": 54010515, + "step": 2487, + "time_per_iteration": 2.628329277038574 + }, + { + "auxiliary_loss_clip": 0.01114026, + "auxiliary_loss_mlp": 0.01050135, + "balance_loss_clip": 1.03940928, + "balance_loss_mlp": 1.02979803, + "epoch": 0.1495866526379077, + "flos": 35517699959040.0, + "grad_norm": 1.8388810564420985, + "language_loss": 0.71884382, + "learning_rate": 3.8518495543877e-06, + "loss": 0.74048543, + "num_input_tokens_seen": 54031315, + "step": 2488, + "time_per_iteration": 2.7908642292022705 + }, + { + "auxiliary_loss_clip": 0.01103399, + "auxiliary_loss_mlp": 0.01052926, + "balance_loss_clip": 1.04022455, + "balance_loss_mlp": 1.03334022, + "epoch": 0.14964677589057568, + "flos": 17632749795840.0, + "grad_norm": 2.3176300535565773, + "language_loss": 0.71070433, + "learning_rate": 3.851702416498235e-06, + "loss": 0.73226762, + "num_input_tokens_seen": 54045965, + "step": 2489, + "time_per_iteration": 2.673642873764038 + }, + { + "auxiliary_loss_clip": 0.01097379, + "auxiliary_loss_mlp": 0.01054175, + "balance_loss_clip": 1.03690815, + "balance_loss_mlp": 1.03394485, + "epoch": 0.14970689914324364, + "flos": 20185280029440.0, + "grad_norm": 2.8378657227190223, + "language_loss": 0.81857061, + "learning_rate": 3.8515552083920295e-06, + "loss": 0.84008616, + "num_input_tokens_seen": 54059960, + "step": 2490, + "time_per_iteration": 2.684695243835449 + }, + { + "auxiliary_loss_clip": 0.01081074, + "auxiliary_loss_mlp": 0.01050342, + "balance_loss_clip": 1.03720629, + "balance_loss_mlp": 1.0307678, + "epoch": 0.1497670223959116, + "flos": 37228699382400.0, + "grad_norm": 1.8093181706892245, + "language_loss": 0.80149424, + "learning_rate": 3.851407930074666e-06, + "loss": 0.82280833, + "num_input_tokens_seen": 54079330, + "step": 2491, + "time_per_iteration": 4.387061834335327 + }, + { + "auxiliary_loss_clip": 0.0109906, + "auxiliary_loss_mlp": 0.01051468, + "balance_loss_clip": 1.03502202, + "balance_loss_mlp": 1.02918792, + "epoch": 0.1498271456485796, + "flos": 24455848752000.0, + "grad_norm": 1.7479705926748146, + "language_loss": 0.90902185, + "learning_rate": 3.851260581551727e-06, + "loss": 0.93052715, + "num_input_tokens_seen": 54097555, + "step": 2492, + "time_per_iteration": 2.5813939571380615 + }, + { + "auxiliary_loss_clip": 0.0111392, + "auxiliary_loss_mlp": 0.01056047, + "balance_loss_clip": 1.04063857, + "balance_loss_mlp": 1.03602004, + "epoch": 0.14988726890124757, + "flos": 16253601148800.0, + "grad_norm": 2.741573258950494, + "language_loss": 0.78804606, + "learning_rate": 3.851113162828802e-06, + "loss": 0.80974573, + "num_input_tokens_seen": 54115600, + "step": 2493, + "time_per_iteration": 2.5826878547668457 + }, + { + "auxiliary_loss_clip": 0.01114433, + "auxiliary_loss_mlp": 0.01049178, + "balance_loss_clip": 1.03921473, + "balance_loss_mlp": 1.02811325, + "epoch": 0.14994739215391553, + "flos": 20666555383680.0, + "grad_norm": 1.6632791415705035, + "language_loss": 0.79798746, + "learning_rate": 3.85096567391148e-06, + "loss": 0.81962347, + "num_input_tokens_seen": 54135220, + "step": 2494, + "time_per_iteration": 2.625983953475952 + }, + { + "auxiliary_loss_clip": 0.01097559, + "auxiliary_loss_mlp": 0.01050692, + "balance_loss_clip": 1.03675556, + "balance_loss_mlp": 1.02912736, + "epoch": 0.1500075154065835, + "flos": 70652375239680.0, + "grad_norm": 1.8020928476664575, + "language_loss": 0.66499674, + "learning_rate": 3.850818114805354e-06, + "loss": 0.68647921, + "num_input_tokens_seen": 54161065, + "step": 2495, + "time_per_iteration": 3.054612874984741 + }, + { + "auxiliary_loss_clip": 0.0101616, + "auxiliary_loss_mlp": 0.01010127, + "balance_loss_clip": 1.00403357, + "balance_loss_mlp": 1.00814819, + "epoch": 0.15006763865925146, + "flos": 68011937447040.0, + "grad_norm": 0.8850064862326312, + "language_loss": 0.59541696, + "learning_rate": 3.850670485516019e-06, + "loss": 0.61567986, + "num_input_tokens_seen": 54225095, + "step": 2496, + "time_per_iteration": 3.241241455078125 + }, + { + "auxiliary_loss_clip": 0.01125506, + "auxiliary_loss_mlp": 0.01056026, + "balance_loss_clip": 1.0378747, + "balance_loss_mlp": 1.03362679, + "epoch": 0.15012776191191943, + "flos": 18916269459840.0, + "grad_norm": 2.0152921112983733, + "language_loss": 0.65705776, + "learning_rate": 3.850522786049075e-06, + "loss": 0.67887306, + "num_input_tokens_seen": 54243750, + "step": 2497, + "time_per_iteration": 4.128999471664429 + }, + { + "auxiliary_loss_clip": 0.01079995, + "auxiliary_loss_mlp": 0.01054966, + "balance_loss_clip": 1.03608727, + "balance_loss_mlp": 1.03374648, + "epoch": 0.1501878851645874, + "flos": 23701330638720.0, + "grad_norm": 1.4495907467646956, + "language_loss": 0.75308043, + "learning_rate": 3.850375016410121e-06, + "loss": 0.77443004, + "num_input_tokens_seen": 54266185, + "step": 2498, + "time_per_iteration": 6.199479103088379 + }, + { + "auxiliary_loss_clip": 0.01098915, + "auxiliary_loss_mlp": 0.01050973, + "balance_loss_clip": 1.04149342, + "balance_loss_mlp": 1.02950311, + "epoch": 0.15024800841725539, + "flos": 20412523422720.0, + "grad_norm": 2.2260657395632553, + "language_loss": 0.71969819, + "learning_rate": 3.850227176604761e-06, + "loss": 0.74119711, + "num_input_tokens_seen": 54283940, + "step": 2499, + "time_per_iteration": 2.7926759719848633 + }, + { + "auxiliary_loss_clip": 0.01092375, + "auxiliary_loss_mlp": 0.01052839, + "balance_loss_clip": 1.03633153, + "balance_loss_mlp": 1.03120303, + "epoch": 0.15030813166992335, + "flos": 31831002812160.0, + "grad_norm": 1.9909412032179252, + "language_loss": 0.72154784, + "learning_rate": 3.850079266638601e-06, + "loss": 0.74300003, + "num_input_tokens_seen": 54304830, + "step": 2500, + "time_per_iteration": 2.748649835586548 + }, + { + "auxiliary_loss_clip": 0.01088453, + "auxiliary_loss_mlp": 0.01058065, + "balance_loss_clip": 1.03626001, + "balance_loss_mlp": 1.03688157, + "epoch": 0.15036825492259132, + "flos": 35657822914560.0, + "grad_norm": 1.748941877102306, + "language_loss": 0.65307164, + "learning_rate": 3.849931286517249e-06, + "loss": 0.67453688, + "num_input_tokens_seen": 54325595, + "step": 2501, + "time_per_iteration": 2.936119794845581 + }, + { + "auxiliary_loss_clip": 0.01100547, + "auxiliary_loss_mlp": 0.01052728, + "balance_loss_clip": 1.0367589, + "balance_loss_mlp": 1.03080487, + "epoch": 0.15042837817525928, + "flos": 18838163335680.0, + "grad_norm": 2.063661595865035, + "language_loss": 0.83423555, + "learning_rate": 3.849783236246318e-06, + "loss": 0.85576832, + "num_input_tokens_seen": 54342180, + "step": 2502, + "time_per_iteration": 2.7966859340667725 + }, + { + "auxiliary_loss_clip": 0.01087627, + "auxiliary_loss_mlp": 0.01052837, + "balance_loss_clip": 1.03615093, + "balance_loss_mlp": 1.03316736, + "epoch": 0.15048850142792725, + "flos": 19535548867200.0, + "grad_norm": 3.274505790693669, + "language_loss": 0.77392679, + "learning_rate": 3.849635115831421e-06, + "loss": 0.79533142, + "num_input_tokens_seen": 54360255, + "step": 2503, + "time_per_iteration": 2.6978962421417236 + }, + { + "auxiliary_loss_clip": 0.01120138, + "auxiliary_loss_mlp": 0.01047319, + "balance_loss_clip": 1.03742754, + "balance_loss_mlp": 1.02868688, + "epoch": 0.1505486246805952, + "flos": 22017550746240.0, + "grad_norm": 1.8764312596878516, + "language_loss": 0.84621912, + "learning_rate": 3.849486925278176e-06, + "loss": 0.8678937, + "num_input_tokens_seen": 54378260, + "step": 2504, + "time_per_iteration": 2.6452724933624268 + }, + { + "auxiliary_loss_clip": 0.01108183, + "auxiliary_loss_mlp": 0.01046304, + "balance_loss_clip": 1.03656387, + "balance_loss_mlp": 1.02754045, + "epoch": 0.15060874793326318, + "flos": 20743153136640.0, + "grad_norm": 1.6139929636304005, + "language_loss": 0.83338624, + "learning_rate": 3.8493386645922e-06, + "loss": 0.85493112, + "num_input_tokens_seen": 54399745, + "step": 2505, + "time_per_iteration": 2.6445508003234863 + }, + { + "auxiliary_loss_clip": 0.01079865, + "auxiliary_loss_mlp": 0.01053858, + "balance_loss_clip": 1.03433907, + "balance_loss_mlp": 1.03436732, + "epoch": 0.15066887118593117, + "flos": 16471902055680.0, + "grad_norm": 2.096535043632177, + "language_loss": 0.7586596, + "learning_rate": 3.849190333779117e-06, + "loss": 0.77999687, + "num_input_tokens_seen": 54417105, + "step": 2506, + "time_per_iteration": 2.625277519226074 + }, + { + "auxiliary_loss_clip": 0.01127512, + "auxiliary_loss_mlp": 0.01045563, + "balance_loss_clip": 1.04078543, + "balance_loss_mlp": 1.02521443, + "epoch": 0.15072899443859913, + "flos": 19859319083520.0, + "grad_norm": 3.8245250938391666, + "language_loss": 0.76723325, + "learning_rate": 3.849041932844552e-06, + "loss": 0.78896397, + "num_input_tokens_seen": 54433920, + "step": 2507, + "time_per_iteration": 2.616366147994995 + }, + { + "auxiliary_loss_clip": 0.01108131, + "auxiliary_loss_mlp": 0.01042938, + "balance_loss_clip": 1.03621638, + "balance_loss_mlp": 1.0240556, + "epoch": 0.1507891176912671, + "flos": 20776226584320.0, + "grad_norm": 1.8738225091664253, + "language_loss": 0.69521761, + "learning_rate": 3.848893461794131e-06, + "loss": 0.71672833, + "num_input_tokens_seen": 54451540, + "step": 2508, + "time_per_iteration": 2.6157054901123047 + }, + { + "auxiliary_loss_clip": 0.01087562, + "auxiliary_loss_mlp": 0.01048828, + "balance_loss_clip": 1.036901, + "balance_loss_mlp": 1.02962291, + "epoch": 0.15084924094393506, + "flos": 23586631534080.0, + "grad_norm": 2.4983463496642524, + "language_loss": 0.7754367, + "learning_rate": 3.8487449206334845e-06, + "loss": 0.79680061, + "num_input_tokens_seen": 54470800, + "step": 2509, + "time_per_iteration": 2.784550666809082 + }, + { + "auxiliary_loss_clip": 0.01102775, + "auxiliary_loss_mlp": 0.00749083, + "balance_loss_clip": 1.03665268, + "balance_loss_mlp": 1.00042391, + "epoch": 0.15090936419660303, + "flos": 18911313383040.0, + "grad_norm": 3.94782469604716, + "language_loss": 0.80099368, + "learning_rate": 3.848596309368246e-06, + "loss": 0.81951225, + "num_input_tokens_seen": 54486525, + "step": 2510, + "time_per_iteration": 2.7444963455200195 + }, + { + "auxiliary_loss_clip": 0.01113593, + "auxiliary_loss_mlp": 0.0105239, + "balance_loss_clip": 1.03923893, + "balance_loss_mlp": 1.03237486, + "epoch": 0.150969487449271, + "flos": 17928223073280.0, + "grad_norm": 1.8199569489254206, + "language_loss": 0.74113595, + "learning_rate": 3.8484476280040495e-06, + "loss": 0.76279587, + "num_input_tokens_seen": 54503795, + "step": 2511, + "time_per_iteration": 2.688664674758911 + }, + { + "auxiliary_loss_clip": 0.0105298, + "auxiliary_loss_mlp": 0.01042813, + "balance_loss_clip": 1.032179, + "balance_loss_mlp": 1.02447855, + "epoch": 0.151029610701939, + "flos": 24243078539520.0, + "grad_norm": 2.150244668862509, + "language_loss": 0.69049639, + "learning_rate": 3.848298876546534e-06, + "loss": 0.71145427, + "num_input_tokens_seen": 54523025, + "step": 2512, + "time_per_iteration": 2.7898011207580566 + }, + { + "auxiliary_loss_clip": 0.01109006, + "auxiliary_loss_mlp": 0.01046203, + "balance_loss_clip": 1.03759432, + "balance_loss_mlp": 1.02726102, + "epoch": 0.15108973395460695, + "flos": 30262496641920.0, + "grad_norm": 2.119575083162003, + "language_loss": 0.73988402, + "learning_rate": 3.84815005500134e-06, + "loss": 0.76143616, + "num_input_tokens_seen": 54545025, + "step": 2513, + "time_per_iteration": 2.693732500076294 + }, + { + "auxiliary_loss_clip": 0.00988236, + "auxiliary_loss_mlp": 0.01008744, + "balance_loss_clip": 1.01761854, + "balance_loss_mlp": 1.00640762, + "epoch": 0.15114985720727492, + "flos": 60437624428800.0, + "grad_norm": 0.8675198697736707, + "language_loss": 0.64809632, + "learning_rate": 3.84800116337411e-06, + "loss": 0.66806608, + "num_input_tokens_seen": 54604545, + "step": 2514, + "time_per_iteration": 3.2533321380615234 + }, + { + "auxiliary_loss_clip": 0.01109482, + "auxiliary_loss_mlp": 0.01046631, + "balance_loss_clip": 1.03865051, + "balance_loss_mlp": 1.02774835, + "epoch": 0.15120998045994288, + "flos": 20521691832960.0, + "grad_norm": 2.282038666612428, + "language_loss": 0.72819948, + "learning_rate": 3.8478522016704916e-06, + "loss": 0.74976063, + "num_input_tokens_seen": 54620590, + "step": 2515, + "time_per_iteration": 2.6270134449005127 + }, + { + "auxiliary_loss_clip": 0.01088261, + "auxiliary_loss_mlp": 0.01042755, + "balance_loss_clip": 1.0348649, + "balance_loss_mlp": 1.02318132, + "epoch": 0.15127010371261085, + "flos": 21178893024000.0, + "grad_norm": 1.8770070837687152, + "language_loss": 0.77426207, + "learning_rate": 3.8477031698961325e-06, + "loss": 0.79557222, + "num_input_tokens_seen": 54640410, + "step": 2516, + "time_per_iteration": 2.644505023956299 + }, + { + "auxiliary_loss_clip": 0.01015155, + "auxiliary_loss_mlp": 0.01001755, + "balance_loss_clip": 1.00352621, + "balance_loss_mlp": 0.9997763, + "epoch": 0.1513302269652788, + "flos": 65320648974720.0, + "grad_norm": 0.7223212072305442, + "language_loss": 0.5463599, + "learning_rate": 3.8475540680566835e-06, + "loss": 0.56652898, + "num_input_tokens_seen": 54701430, + "step": 2517, + "time_per_iteration": 3.1475789546966553 + }, + { + "auxiliary_loss_clip": 0.01064723, + "auxiliary_loss_mlp": 0.01048344, + "balance_loss_clip": 1.02931821, + "balance_loss_mlp": 1.02735102, + "epoch": 0.15139035021794678, + "flos": 19135827342720.0, + "grad_norm": 1.9272054239308707, + "language_loss": 0.78474778, + "learning_rate": 3.8474048961577995e-06, + "loss": 0.80587846, + "num_input_tokens_seen": 54720845, + "step": 2518, + "time_per_iteration": 2.7585957050323486 + }, + { + "auxiliary_loss_clip": 0.01107602, + "auxiliary_loss_mlp": 0.01049403, + "balance_loss_clip": 1.03740561, + "balance_loss_mlp": 1.02910113, + "epoch": 0.15145047347061477, + "flos": 26578564842240.0, + "grad_norm": 1.8937082836845223, + "language_loss": 0.69873476, + "learning_rate": 3.847255654205137e-06, + "loss": 0.72030485, + "num_input_tokens_seen": 54740495, + "step": 2519, + "time_per_iteration": 2.755262613296509 + }, + { + "auxiliary_loss_clip": 0.01112675, + "auxiliary_loss_mlp": 0.01050045, + "balance_loss_clip": 1.03790689, + "balance_loss_mlp": 1.03047085, + "epoch": 0.15151059672328274, + "flos": 20302959962880.0, + "grad_norm": 1.7788380930708594, + "language_loss": 0.7869637, + "learning_rate": 3.847106342204354e-06, + "loss": 0.80859089, + "num_input_tokens_seen": 54758415, + "step": 2520, + "time_per_iteration": 2.75107479095459 + }, + { + "auxiliary_loss_clip": 0.01105765, + "auxiliary_loss_mlp": 0.01050075, + "balance_loss_clip": 1.03816056, + "balance_loss_mlp": 1.02905869, + "epoch": 0.1515707199759507, + "flos": 27228367831680.0, + "grad_norm": 1.7065376484313255, + "language_loss": 0.74883294, + "learning_rate": 3.846956960161114e-06, + "loss": 0.7703914, + "num_input_tokens_seen": 54779355, + "step": 2521, + "time_per_iteration": 2.8475728034973145 + }, + { + "auxiliary_loss_clip": 0.01094954, + "auxiliary_loss_mlp": 0.0104635, + "balance_loss_clip": 1.03630352, + "balance_loss_mlp": 1.02489209, + "epoch": 0.15163084322861867, + "flos": 23587349806080.0, + "grad_norm": 2.6020418940652785, + "language_loss": 0.82098866, + "learning_rate": 3.84680750808108e-06, + "loss": 0.84240174, + "num_input_tokens_seen": 54799465, + "step": 2522, + "time_per_iteration": 2.8311898708343506 + }, + { + "auxiliary_loss_clip": 0.00981849, + "auxiliary_loss_mlp": 0.01007696, + "balance_loss_clip": 1.00418186, + "balance_loss_mlp": 1.00539517, + "epoch": 0.15169096648128663, + "flos": 66889622021760.0, + "grad_norm": 0.819733600583675, + "language_loss": 0.57877904, + "learning_rate": 3.846657985969922e-06, + "loss": 0.59867448, + "num_input_tokens_seen": 54857665, + "step": 2523, + "time_per_iteration": 3.2466259002685547 + }, + { + "auxiliary_loss_clip": 0.01100168, + "auxiliary_loss_mlp": 0.01055479, + "balance_loss_clip": 1.03540838, + "balance_loss_mlp": 1.03379536, + "epoch": 0.1517510897339546, + "flos": 29095435848960.0, + "grad_norm": 1.8153442558165793, + "language_loss": 0.74890363, + "learning_rate": 3.8465083938333066e-06, + "loss": 0.77046013, + "num_input_tokens_seen": 54879895, + "step": 2524, + "time_per_iteration": 2.7553486824035645 + }, + { + "auxiliary_loss_clip": 0.0110093, + "auxiliary_loss_mlp": 0.0104673, + "balance_loss_clip": 1.03614068, + "balance_loss_mlp": 1.02716732, + "epoch": 0.1518112129866226, + "flos": 18406553512320.0, + "grad_norm": 1.615603230433024, + "language_loss": 0.74587345, + "learning_rate": 3.8463587316769085e-06, + "loss": 0.76735008, + "num_input_tokens_seen": 54898245, + "step": 2525, + "time_per_iteration": 2.660527229309082 + }, + { + "auxiliary_loss_clip": 0.01102286, + "auxiliary_loss_mlp": 0.01046821, + "balance_loss_clip": 1.03733623, + "balance_loss_mlp": 1.02601922, + "epoch": 0.15187133623929056, + "flos": 19425410789760.0, + "grad_norm": 1.92130019243945, + "language_loss": 0.79472697, + "learning_rate": 3.846208999506402e-06, + "loss": 0.81621802, + "num_input_tokens_seen": 54917060, + "step": 2526, + "time_per_iteration": 2.6691365242004395 + }, + { + "auxiliary_loss_clip": 0.01095151, + "auxiliary_loss_mlp": 0.01050338, + "balance_loss_clip": 1.03589547, + "balance_loss_mlp": 1.03163362, + "epoch": 0.15193145949195852, + "flos": 17566207850880.0, + "grad_norm": 1.7017022663359784, + "language_loss": 0.84701258, + "learning_rate": 3.846059197327466e-06, + "loss": 0.86846745, + "num_input_tokens_seen": 54936365, + "step": 2527, + "time_per_iteration": 2.6021177768707275 + }, + { + "auxiliary_loss_clip": 0.01087714, + "auxiliary_loss_mlp": 0.01047431, + "balance_loss_clip": 1.0362308, + "balance_loss_mlp": 1.02762985, + "epoch": 0.15199158274462649, + "flos": 36176265866880.0, + "grad_norm": 2.153167652844247, + "language_loss": 0.68964195, + "learning_rate": 3.845909325145779e-06, + "loss": 0.71099335, + "num_input_tokens_seen": 54961365, + "step": 2528, + "time_per_iteration": 2.792447805404663 + }, + { + "auxiliary_loss_clip": 0.01100721, + "auxiliary_loss_mlp": 0.01051895, + "balance_loss_clip": 1.04078484, + "balance_loss_mlp": 1.03221393, + "epoch": 0.15205170599729445, + "flos": 23074042498560.0, + "grad_norm": 1.7525276056305283, + "language_loss": 0.86710089, + "learning_rate": 3.845759382967026e-06, + "loss": 0.88862711, + "num_input_tokens_seen": 54980750, + "step": 2529, + "time_per_iteration": 2.6176085472106934 + }, + { + "auxiliary_loss_clip": 0.01094425, + "auxiliary_loss_mlp": 0.01040324, + "balance_loss_clip": 1.03702688, + "balance_loss_mlp": 1.02079749, + "epoch": 0.15211182924996242, + "flos": 21908382336000.0, + "grad_norm": 1.7995405770535546, + "language_loss": 0.83105677, + "learning_rate": 3.845609370796893e-06, + "loss": 0.85240424, + "num_input_tokens_seen": 54999675, + "step": 2530, + "time_per_iteration": 2.615280866622925 + }, + { + "auxiliary_loss_clip": 0.0108702, + "auxiliary_loss_mlp": 0.01050551, + "balance_loss_clip": 1.03607392, + "balance_loss_mlp": 1.03029704, + "epoch": 0.15217195250263038, + "flos": 13881521865600.0, + "grad_norm": 3.603303337347771, + "language_loss": 0.80284482, + "learning_rate": 3.845459288641066e-06, + "loss": 0.82422054, + "num_input_tokens_seen": 55018295, + "step": 2531, + "time_per_iteration": 2.595996618270874 + }, + { + "auxiliary_loss_clip": 0.01103783, + "auxiliary_loss_mlp": 0.01046938, + "balance_loss_clip": 1.03669393, + "balance_loss_mlp": 1.02801943, + "epoch": 0.15223207575529837, + "flos": 24535319592960.0, + "grad_norm": 1.7794063286053559, + "language_loss": 0.78892195, + "learning_rate": 3.8453091365052394e-06, + "loss": 0.8104291, + "num_input_tokens_seen": 55037975, + "step": 2532, + "time_per_iteration": 2.603313684463501 + }, + { + "auxiliary_loss_clip": 0.01109172, + "auxiliary_loss_mlp": 0.01049578, + "balance_loss_clip": 1.03810859, + "balance_loss_mlp": 1.02932477, + "epoch": 0.15229219900796634, + "flos": 25556798563200.0, + "grad_norm": 1.7392647460901576, + "language_loss": 0.87530708, + "learning_rate": 3.845158914395105e-06, + "loss": 0.89689457, + "num_input_tokens_seen": 55057135, + "step": 2533, + "time_per_iteration": 2.754121780395508 + }, + { + "auxiliary_loss_clip": 0.01074294, + "auxiliary_loss_mlp": 0.01050129, + "balance_loss_clip": 1.03303587, + "balance_loss_mlp": 1.03042388, + "epoch": 0.1523523222606343, + "flos": 18217806520320.0, + "grad_norm": 2.31330588149726, + "language_loss": 0.78834283, + "learning_rate": 3.84500862231636e-06, + "loss": 0.809587, + "num_input_tokens_seen": 55075525, + "step": 2534, + "time_per_iteration": 2.686509370803833 + }, + { + "auxiliary_loss_clip": 0.01125489, + "auxiliary_loss_mlp": 0.01047271, + "balance_loss_clip": 1.03791618, + "balance_loss_mlp": 1.02668357, + "epoch": 0.15241244551330227, + "flos": 13260087642240.0, + "grad_norm": 3.132093553159022, + "language_loss": 0.76666039, + "learning_rate": 3.844858260274702e-06, + "loss": 0.78838801, + "num_input_tokens_seen": 55090845, + "step": 2535, + "time_per_iteration": 2.7409212589263916 + }, + { + "auxiliary_loss_clip": 0.01102998, + "auxiliary_loss_mlp": 0.01046396, + "balance_loss_clip": 1.03585005, + "balance_loss_mlp": 1.02670252, + "epoch": 0.15247256876597023, + "flos": 19715568854400.0, + "grad_norm": 2.413705524649606, + "language_loss": 0.77873999, + "learning_rate": 3.844707828275835e-06, + "loss": 0.8002339, + "num_input_tokens_seen": 55108750, + "step": 2536, + "time_per_iteration": 2.6730308532714844 + }, + { + "auxiliary_loss_clip": 0.01092483, + "auxiliary_loss_mlp": 0.01048614, + "balance_loss_clip": 1.03959537, + "balance_loss_mlp": 1.02946949, + "epoch": 0.1525326920186382, + "flos": 20375858615040.0, + "grad_norm": 2.301313320539569, + "language_loss": 0.75806534, + "learning_rate": 3.844557326325461e-06, + "loss": 0.77947634, + "num_input_tokens_seen": 55126750, + "step": 2537, + "time_per_iteration": 2.6214048862457275 + }, + { + "auxiliary_loss_clip": 0.01110948, + "auxiliary_loss_mlp": 0.01045953, + "balance_loss_clip": 1.03853202, + "balance_loss_mlp": 1.02604473, + "epoch": 0.15259281527130616, + "flos": 13589963170560.0, + "grad_norm": 2.049205212112759, + "language_loss": 0.77884459, + "learning_rate": 3.8444067544292896e-06, + "loss": 0.80041367, + "num_input_tokens_seen": 55144690, + "step": 2538, + "time_per_iteration": 4.182161808013916 + }, + { + "auxiliary_loss_clip": 0.0107119, + "auxiliary_loss_mlp": 0.01045044, + "balance_loss_clip": 1.03758383, + "balance_loss_mlp": 1.02678144, + "epoch": 0.15265293852397416, + "flos": 22860374446080.0, + "grad_norm": 1.868281326707354, + "language_loss": 0.8977266, + "learning_rate": 3.844256112593029e-06, + "loss": 0.91888893, + "num_input_tokens_seen": 55166055, + "step": 2539, + "time_per_iteration": 2.7484018802642822 + }, + { + "auxiliary_loss_clip": 0.01107532, + "auxiliary_loss_mlp": 0.01051503, + "balance_loss_clip": 1.03765082, + "balance_loss_mlp": 1.03167868, + "epoch": 0.15271306177664212, + "flos": 29238108670080.0, + "grad_norm": 1.983388984051604, + "language_loss": 0.93431175, + "learning_rate": 3.844105400822391e-06, + "loss": 0.9559021, + "num_input_tokens_seen": 55186285, + "step": 2540, + "time_per_iteration": 2.6872408390045166 + }, + { + "auxiliary_loss_clip": 0.01091162, + "auxiliary_loss_mlp": 0.01041381, + "balance_loss_clip": 1.03303456, + "balance_loss_mlp": 1.02268934, + "epoch": 0.1527731850293101, + "flos": 31246269310080.0, + "grad_norm": 1.6534836731287863, + "language_loss": 0.75261021, + "learning_rate": 3.843954619123092e-06, + "loss": 0.77393562, + "num_input_tokens_seen": 55207915, + "step": 2541, + "time_per_iteration": 2.7311153411865234 + }, + { + "auxiliary_loss_clip": 0.01083312, + "auxiliary_loss_mlp": 0.01048234, + "balance_loss_clip": 1.03627658, + "balance_loss_mlp": 1.02843308, + "epoch": 0.15283330828197805, + "flos": 22382079920640.0, + "grad_norm": 1.5412213232231846, + "language_loss": 0.8115766, + "learning_rate": 3.84380376750085e-06, + "loss": 0.83289206, + "num_input_tokens_seen": 55227860, + "step": 2542, + "time_per_iteration": 2.8111839294433594 + }, + { + "auxiliary_loss_clip": 0.01121406, + "auxiliary_loss_mlp": 0.01051612, + "balance_loss_clip": 1.03744864, + "balance_loss_mlp": 1.03172803, + "epoch": 0.15289343153464602, + "flos": 25520133755520.0, + "grad_norm": 1.8747031277026327, + "language_loss": 0.77233553, + "learning_rate": 3.843652845961383e-06, + "loss": 0.79406571, + "num_input_tokens_seen": 55247330, + "step": 2543, + "time_per_iteration": 2.6499383449554443 + }, + { + "auxiliary_loss_clip": 0.01109437, + "auxiliary_loss_mlp": 0.01049188, + "balance_loss_clip": 1.03781033, + "balance_loss_mlp": 1.03024578, + "epoch": 0.15295355478731398, + "flos": 22710016114560.0, + "grad_norm": 1.9320867179730539, + "language_loss": 0.86337447, + "learning_rate": 3.843501854510416e-06, + "loss": 0.88496077, + "num_input_tokens_seen": 55266195, + "step": 2544, + "time_per_iteration": 4.381317615509033 + }, + { + "auxiliary_loss_clip": 0.01113024, + "auxiliary_loss_mlp": 0.01055212, + "balance_loss_clip": 1.03646874, + "balance_loss_mlp": 1.03375435, + "epoch": 0.15301367803998198, + "flos": 23251907669760.0, + "grad_norm": 2.0170201415230813, + "language_loss": 0.82510614, + "learning_rate": 3.843350793153673e-06, + "loss": 0.84678853, + "num_input_tokens_seen": 55283305, + "step": 2545, + "time_per_iteration": 5.9127185344696045 + }, + { + "auxiliary_loss_clip": 0.01122486, + "auxiliary_loss_mlp": 0.01045216, + "balance_loss_clip": 1.0401268, + "balance_loss_mlp": 1.02566552, + "epoch": 0.15307380129264994, + "flos": 25886279041920.0, + "grad_norm": 2.346052340270257, + "language_loss": 0.71188205, + "learning_rate": 3.843199661896884e-06, + "loss": 0.73355907, + "num_input_tokens_seen": 55303035, + "step": 2546, + "time_per_iteration": 2.5935049057006836 + }, + { + "auxiliary_loss_clip": 0.01095123, + "auxiliary_loss_mlp": 0.0104706, + "balance_loss_clip": 1.03803349, + "balance_loss_mlp": 1.02635312, + "epoch": 0.1531339245453179, + "flos": 46973239205760.0, + "grad_norm": 1.6746989742701424, + "language_loss": 0.77326691, + "learning_rate": 3.843048460745779e-06, + "loss": 0.79468882, + "num_input_tokens_seen": 55327570, + "step": 2547, + "time_per_iteration": 2.8480355739593506 + }, + { + "auxiliary_loss_clip": 0.01073998, + "auxiliary_loss_mlp": 0.01050246, + "balance_loss_clip": 1.03535676, + "balance_loss_mlp": 1.03019476, + "epoch": 0.15319404779798587, + "flos": 35882049565440.0, + "grad_norm": 2.359449921336011, + "language_loss": 0.74246627, + "learning_rate": 3.842897189706092e-06, + "loss": 0.76370871, + "num_input_tokens_seen": 55351090, + "step": 2548, + "time_per_iteration": 2.8092849254608154 + }, + { + "auxiliary_loss_clip": 0.01104422, + "auxiliary_loss_mlp": 0.01053672, + "balance_loss_clip": 1.03820682, + "balance_loss_mlp": 1.03334641, + "epoch": 0.15325417105065384, + "flos": 25664638170240.0, + "grad_norm": 1.4910309051819186, + "language_loss": 0.80447435, + "learning_rate": 3.842745848783558e-06, + "loss": 0.82605529, + "num_input_tokens_seen": 55371050, + "step": 2549, + "time_per_iteration": 2.725484609603882 + }, + { + "auxiliary_loss_clip": 0.01111161, + "auxiliary_loss_mlp": 0.01049421, + "balance_loss_clip": 1.03831434, + "balance_loss_mlp": 1.0294888, + "epoch": 0.1533142943033218, + "flos": 18770831291520.0, + "grad_norm": 1.5018716647289763, + "language_loss": 0.74870217, + "learning_rate": 3.842594437983917e-06, + "loss": 0.77030802, + "num_input_tokens_seen": 55390375, + "step": 2550, + "time_per_iteration": 2.6414120197296143 + }, + { + "auxiliary_loss_clip": 0.01111278, + "auxiliary_loss_mlp": 0.01044309, + "balance_loss_clip": 1.03742957, + "balance_loss_mlp": 1.02398372, + "epoch": 0.15337441755598977, + "flos": 23107367341440.0, + "grad_norm": 2.455083842875954, + "language_loss": 0.76799679, + "learning_rate": 3.8424429573129115e-06, + "loss": 0.78955269, + "num_input_tokens_seen": 55408890, + "step": 2551, + "time_per_iteration": 2.594953775405884 + }, + { + "auxiliary_loss_clip": 0.01019586, + "auxiliary_loss_mlp": 0.01004547, + "balance_loss_clip": 1.00754714, + "balance_loss_mlp": 1.00234115, + "epoch": 0.15343454080865776, + "flos": 59861079227520.0, + "grad_norm": 0.9915198084116514, + "language_loss": 0.56709278, + "learning_rate": 3.842291406776283e-06, + "loss": 0.58733404, + "num_input_tokens_seen": 55463815, + "step": 2552, + "time_per_iteration": 3.0742220878601074 + }, + { + "auxiliary_loss_clip": 0.01075526, + "auxiliary_loss_mlp": 0.010454, + "balance_loss_clip": 1.0349772, + "balance_loss_mlp": 1.02513397, + "epoch": 0.15349466406132573, + "flos": 11910887959680.0, + "grad_norm": 2.2282716634984108, + "language_loss": 0.89180291, + "learning_rate": 3.84213978637978e-06, + "loss": 0.91301221, + "num_input_tokens_seen": 55481050, + "step": 2553, + "time_per_iteration": 2.8085885047912598 + }, + { + "auxiliary_loss_clip": 0.01113365, + "auxiliary_loss_mlp": 0.01050156, + "balance_loss_clip": 1.03966773, + "balance_loss_mlp": 1.0296042, + "epoch": 0.1535547873139937, + "flos": 24096922099200.0, + "grad_norm": 1.8488967864264612, + "language_loss": 0.78257096, + "learning_rate": 3.841988096129152e-06, + "loss": 0.80420619, + "num_input_tokens_seen": 55500050, + "step": 2554, + "time_per_iteration": 2.730175495147705 + }, + { + "auxiliary_loss_clip": 0.01060387, + "auxiliary_loss_mlp": 0.01056037, + "balance_loss_clip": 1.03722584, + "balance_loss_mlp": 1.03482962, + "epoch": 0.15361491056666166, + "flos": 17566459246080.0, + "grad_norm": 2.1423327347035883, + "language_loss": 0.77940118, + "learning_rate": 3.841836336030151e-06, + "loss": 0.80056536, + "num_input_tokens_seen": 55518125, + "step": 2555, + "time_per_iteration": 2.7457752227783203 + }, + { + "auxiliary_loss_clip": 0.01091463, + "auxiliary_loss_mlp": 0.01048364, + "balance_loss_clip": 1.03803229, + "balance_loss_mlp": 1.0292666, + "epoch": 0.15367503381932962, + "flos": 25046041121280.0, + "grad_norm": 1.814251328434845, + "language_loss": 0.76852542, + "learning_rate": 3.8416845060885305e-06, + "loss": 0.78992367, + "num_input_tokens_seen": 55540960, + "step": 2556, + "time_per_iteration": 2.725924015045166 + }, + { + "auxiliary_loss_clip": 0.01107688, + "auxiliary_loss_mlp": 0.00749007, + "balance_loss_clip": 1.04053819, + "balance_loss_mlp": 1.00048447, + "epoch": 0.15373515707199759, + "flos": 21507332008320.0, + "grad_norm": 1.8577272429063234, + "language_loss": 0.90062517, + "learning_rate": 3.84153260631005e-06, + "loss": 0.91919208, + "num_input_tokens_seen": 55559210, + "step": 2557, + "time_per_iteration": 2.6549901962280273 + }, + { + "auxiliary_loss_clip": 0.0109615, + "auxiliary_loss_mlp": 0.01049856, + "balance_loss_clip": 1.03484941, + "balance_loss_mlp": 1.02942359, + "epoch": 0.15379528032466555, + "flos": 25994729180160.0, + "grad_norm": 1.884757378751505, + "language_loss": 0.70440894, + "learning_rate": 3.841380636700468e-06, + "loss": 0.72586906, + "num_input_tokens_seen": 55578925, + "step": 2558, + "time_per_iteration": 2.7261126041412354 + }, + { + "auxiliary_loss_clip": 0.01098313, + "auxiliary_loss_mlp": 0.01051685, + "balance_loss_clip": 1.03672874, + "balance_loss_mlp": 1.03178859, + "epoch": 0.15385540357733354, + "flos": 19277315015040.0, + "grad_norm": 1.9999206186612815, + "language_loss": 0.92121112, + "learning_rate": 3.841228597265548e-06, + "loss": 0.94271106, + "num_input_tokens_seen": 55597255, + "step": 2559, + "time_per_iteration": 2.737381935119629 + }, + { + "auxiliary_loss_clip": 0.01105589, + "auxiliary_loss_mlp": 0.01059271, + "balance_loss_clip": 1.04250717, + "balance_loss_mlp": 1.03818274, + "epoch": 0.1539155268300015, + "flos": 28549126920960.0, + "grad_norm": 2.49932596730132, + "language_loss": 0.63658595, + "learning_rate": 3.841076488011055e-06, + "loss": 0.6582346, + "num_input_tokens_seen": 55619515, + "step": 2560, + "time_per_iteration": 2.741715431213379 + }, + { + "auxiliary_loss_clip": 0.01101377, + "auxiliary_loss_mlp": 0.0104907, + "balance_loss_clip": 1.03821445, + "balance_loss_mlp": 1.02904284, + "epoch": 0.15397565008266947, + "flos": 23547883737600.0, + "grad_norm": 2.273264713688543, + "language_loss": 0.8796674, + "learning_rate": 3.8409243089427574e-06, + "loss": 0.9011718, + "num_input_tokens_seen": 55640050, + "step": 2561, + "time_per_iteration": 2.7815303802490234 + }, + { + "auxiliary_loss_clip": 0.01108695, + "auxiliary_loss_mlp": 0.01042768, + "balance_loss_clip": 1.0384655, + "balance_loss_mlp": 1.02390885, + "epoch": 0.15403577333533744, + "flos": 17129821518720.0, + "grad_norm": 1.757062950797462, + "language_loss": 0.82781434, + "learning_rate": 3.840772060066425e-06, + "loss": 0.84932888, + "num_input_tokens_seen": 55658695, + "step": 2562, + "time_per_iteration": 2.6648948192596436 + }, + { + "auxiliary_loss_clip": 0.01091501, + "auxiliary_loss_mlp": 0.00749086, + "balance_loss_clip": 1.03743184, + "balance_loss_mlp": 1.00049007, + "epoch": 0.1540958965880054, + "flos": 17894503180800.0, + "grad_norm": 2.439634397729529, + "language_loss": 0.74597651, + "learning_rate": 3.840619741387832e-06, + "loss": 0.76438236, + "num_input_tokens_seen": 55676340, + "step": 2563, + "time_per_iteration": 2.7012457847595215 + }, + { + "auxiliary_loss_clip": 0.01081985, + "auxiliary_loss_mlp": 0.01044113, + "balance_loss_clip": 1.03552508, + "balance_loss_mlp": 1.02397823, + "epoch": 0.15415601984067337, + "flos": 32161057908480.0, + "grad_norm": 2.0933588834342554, + "language_loss": 0.75668293, + "learning_rate": 3.8404673529127534e-06, + "loss": 0.77794391, + "num_input_tokens_seen": 55698890, + "step": 2564, + "time_per_iteration": 2.8028295040130615 + }, + { + "auxiliary_loss_clip": 0.01095918, + "auxiliary_loss_mlp": 0.01055709, + "balance_loss_clip": 1.0358448, + "balance_loss_mlp": 1.03638482, + "epoch": 0.15421614309334136, + "flos": 24024418496640.0, + "grad_norm": 2.771298945765033, + "language_loss": 0.70218599, + "learning_rate": 3.840314894646969e-06, + "loss": 0.72370219, + "num_input_tokens_seen": 55718535, + "step": 2565, + "time_per_iteration": 2.769265651702881 + }, + { + "auxiliary_loss_clip": 0.01099198, + "auxiliary_loss_mlp": 0.01049625, + "balance_loss_clip": 1.03371274, + "balance_loss_mlp": 1.0298717, + "epoch": 0.15427626634600933, + "flos": 24386290064640.0, + "grad_norm": 2.332748923041009, + "language_loss": 0.71788108, + "learning_rate": 3.840162366596259e-06, + "loss": 0.73936933, + "num_input_tokens_seen": 55738970, + "step": 2566, + "time_per_iteration": 2.659841537475586 + }, + { + "auxiliary_loss_clip": 0.0111587, + "auxiliary_loss_mlp": 0.01042013, + "balance_loss_clip": 1.03538311, + "balance_loss_mlp": 1.022856, + "epoch": 0.1543363895986773, + "flos": 23331522165120.0, + "grad_norm": 1.759619396913442, + "language_loss": 0.85007346, + "learning_rate": 3.840009768766408e-06, + "loss": 0.87165225, + "num_input_tokens_seen": 55759585, + "step": 2567, + "time_per_iteration": 2.6884102821350098 + }, + { + "auxiliary_loss_clip": 0.01065308, + "auxiliary_loss_mlp": 0.01048237, + "balance_loss_clip": 1.0325222, + "balance_loss_mlp": 1.0297122, + "epoch": 0.15439651285134526, + "flos": 24274284480000.0, + "grad_norm": 2.167075128073181, + "language_loss": 0.7781378, + "learning_rate": 3.839857101163202e-06, + "loss": 0.79927325, + "num_input_tokens_seen": 55779250, + "step": 2568, + "time_per_iteration": 2.750138282775879 + }, + { + "auxiliary_loss_clip": 0.01084431, + "auxiliary_loss_mlp": 0.01039753, + "balance_loss_clip": 1.0358882, + "balance_loss_mlp": 1.01960707, + "epoch": 0.15445663610401322, + "flos": 22456163721600.0, + "grad_norm": 1.8097761061582616, + "language_loss": 0.70813715, + "learning_rate": 3.83970436379243e-06, + "loss": 0.729379, + "num_input_tokens_seen": 55800470, + "step": 2569, + "time_per_iteration": 2.7509026527404785 + }, + { + "auxiliary_loss_clip": 0.01092458, + "auxiliary_loss_mlp": 0.01043912, + "balance_loss_clip": 1.03462625, + "balance_loss_mlp": 1.02572107, + "epoch": 0.1545167593566812, + "flos": 22049510872320.0, + "grad_norm": 3.00058063681417, + "language_loss": 0.76790196, + "learning_rate": 3.839551556659884e-06, + "loss": 0.78926563, + "num_input_tokens_seen": 55817795, + "step": 2570, + "time_per_iteration": 2.6414763927459717 + }, + { + "auxiliary_loss_clip": 0.0110567, + "auxiliary_loss_mlp": 0.01044759, + "balance_loss_clip": 1.03801918, + "balance_loss_mlp": 1.02575707, + "epoch": 0.15457688260934915, + "flos": 19318253541120.0, + "grad_norm": 2.245598491672239, + "language_loss": 0.77240694, + "learning_rate": 3.839398679771359e-06, + "loss": 0.79391122, + "num_input_tokens_seen": 55836125, + "step": 2571, + "time_per_iteration": 2.586853504180908 + }, + { + "auxiliary_loss_clip": 0.01093992, + "auxiliary_loss_mlp": 0.01046139, + "balance_loss_clip": 1.03489387, + "balance_loss_mlp": 1.02696991, + "epoch": 0.15463700586201715, + "flos": 24133981956480.0, + "grad_norm": 2.0429433152898255, + "language_loss": 0.82284009, + "learning_rate": 3.839245733132652e-06, + "loss": 0.84424144, + "num_input_tokens_seen": 55855280, + "step": 2572, + "time_per_iteration": 2.652027130126953 + }, + { + "auxiliary_loss_clip": 0.01121992, + "auxiliary_loss_mlp": 0.01043731, + "balance_loss_clip": 1.03793681, + "balance_loss_mlp": 1.02486038, + "epoch": 0.1546971291146851, + "flos": 22420935457920.0, + "grad_norm": 1.6114213244869247, + "language_loss": 0.9060784, + "learning_rate": 3.839092716749563e-06, + "loss": 0.92773569, + "num_input_tokens_seen": 55875695, + "step": 2573, + "time_per_iteration": 2.5761146545410156 + }, + { + "auxiliary_loss_clip": 0.01049686, + "auxiliary_loss_mlp": 0.01054457, + "balance_loss_clip": 1.03153002, + "balance_loss_mlp": 1.03416717, + "epoch": 0.15475725236735308, + "flos": 17530225401600.0, + "grad_norm": 1.7076701196850663, + "language_loss": 0.70178115, + "learning_rate": 3.838939630627893e-06, + "loss": 0.72282261, + "num_input_tokens_seen": 55894575, + "step": 2574, + "time_per_iteration": 2.719402313232422 + }, + { + "auxiliary_loss_clip": 0.01091387, + "auxiliary_loss_mlp": 0.01050163, + "balance_loss_clip": 1.03366077, + "balance_loss_mlp": 1.02964735, + "epoch": 0.15481737562002104, + "flos": 22561740771840.0, + "grad_norm": 1.9587535780965515, + "language_loss": 0.82354629, + "learning_rate": 3.838786474773448e-06, + "loss": 0.84496188, + "num_input_tokens_seen": 55912855, + "step": 2575, + "time_per_iteration": 2.757969617843628 + }, + { + "auxiliary_loss_clip": 0.01093614, + "auxiliary_loss_mlp": 0.01046358, + "balance_loss_clip": 1.03246975, + "balance_loss_mlp": 1.02752328, + "epoch": 0.154877498872689, + "flos": 24900567039360.0, + "grad_norm": 2.0975489069179134, + "language_loss": 0.8486951, + "learning_rate": 3.838633249192036e-06, + "loss": 0.87009478, + "num_input_tokens_seen": 55932375, + "step": 2576, + "time_per_iteration": 2.5908966064453125 + }, + { + "auxiliary_loss_clip": 0.01117034, + "auxiliary_loss_mlp": 0.01044235, + "balance_loss_clip": 1.03520536, + "balance_loss_mlp": 1.02554274, + "epoch": 0.15493762212535697, + "flos": 28147501975680.0, + "grad_norm": 2.215858653387284, + "language_loss": 0.81763232, + "learning_rate": 3.838479953889465e-06, + "loss": 0.83924508, + "num_input_tokens_seen": 55953970, + "step": 2577, + "time_per_iteration": 2.637603998184204 + }, + { + "auxiliary_loss_clip": 0.01087904, + "auxiliary_loss_mlp": 0.01051191, + "balance_loss_clip": 1.03694773, + "balance_loss_mlp": 1.03217745, + "epoch": 0.15499774537802496, + "flos": 25411073086080.0, + "grad_norm": 2.032423974820109, + "language_loss": 0.7651186, + "learning_rate": 3.8383265888715525e-06, + "loss": 0.78650951, + "num_input_tokens_seen": 55973120, + "step": 2578, + "time_per_iteration": 2.7077956199645996 + }, + { + "auxiliary_loss_clip": 0.0108062, + "auxiliary_loss_mlp": 0.01047804, + "balance_loss_clip": 1.03502429, + "balance_loss_mlp": 1.02759814, + "epoch": 0.15505786863069293, + "flos": 22091562720000.0, + "grad_norm": 1.9128049185780458, + "language_loss": 0.82708013, + "learning_rate": 3.83817315414411e-06, + "loss": 0.84836435, + "num_input_tokens_seen": 55993260, + "step": 2579, + "time_per_iteration": 2.703061103820801 + }, + { + "auxiliary_loss_clip": 0.01103059, + "auxiliary_loss_mlp": 0.01046085, + "balance_loss_clip": 1.0399946, + "balance_loss_mlp": 1.02716649, + "epoch": 0.1551179918833609, + "flos": 18917131386240.0, + "grad_norm": 1.6012398738705582, + "language_loss": 0.80719197, + "learning_rate": 3.838019649712958e-06, + "loss": 0.82868338, + "num_input_tokens_seen": 56012130, + "step": 2580, + "time_per_iteration": 2.7430126667022705 + }, + { + "auxiliary_loss_clip": 0.01015213, + "auxiliary_loss_mlp": 0.01010229, + "balance_loss_clip": 1.00359344, + "balance_loss_mlp": 1.00783288, + "epoch": 0.15517811513602886, + "flos": 66239172587520.0, + "grad_norm": 0.8457452514610277, + "language_loss": 0.58833075, + "learning_rate": 3.8378660755839166e-06, + "loss": 0.60858524, + "num_input_tokens_seen": 56079045, + "step": 2581, + "time_per_iteration": 3.3474323749542236 + }, + { + "auxiliary_loss_clip": 0.01065503, + "auxiliary_loss_mlp": 0.01050857, + "balance_loss_clip": 1.03114665, + "balance_loss_mlp": 1.03000689, + "epoch": 0.15523823838869683, + "flos": 24021078531840.0, + "grad_norm": 1.7841326447961285, + "language_loss": 0.85092795, + "learning_rate": 3.8377124317628095e-06, + "loss": 0.87209153, + "num_input_tokens_seen": 56098745, + "step": 2582, + "time_per_iteration": 2.8524010181427 + }, + { + "auxiliary_loss_clip": 0.01108392, + "auxiliary_loss_mlp": 0.01051483, + "balance_loss_clip": 1.03786123, + "balance_loss_mlp": 1.03165889, + "epoch": 0.1552983616413648, + "flos": 20485062938880.0, + "grad_norm": 2.1700816138662247, + "language_loss": 0.78640896, + "learning_rate": 3.8375587182554625e-06, + "loss": 0.80800766, + "num_input_tokens_seen": 56117655, + "step": 2583, + "time_per_iteration": 2.6576101779937744 + }, + { + "auxiliary_loss_clip": 0.01099767, + "auxiliary_loss_mlp": 0.01053257, + "balance_loss_clip": 1.03583395, + "balance_loss_mlp": 1.03208542, + "epoch": 0.15535848489403276, + "flos": 32123710742400.0, + "grad_norm": 1.6696757727507219, + "language_loss": 0.76153833, + "learning_rate": 3.837404935067705e-06, + "loss": 0.78306866, + "num_input_tokens_seen": 56141960, + "step": 2584, + "time_per_iteration": 2.7816476821899414 + }, + { + "auxiliary_loss_clip": 0.01095518, + "auxiliary_loss_mlp": 0.01041766, + "balance_loss_clip": 1.03364897, + "balance_loss_mlp": 1.02253747, + "epoch": 0.15541860814670075, + "flos": 19098444263040.0, + "grad_norm": 2.0714147319962635, + "language_loss": 0.75598228, + "learning_rate": 3.837251082205368e-06, + "loss": 0.77735507, + "num_input_tokens_seen": 56161430, + "step": 2585, + "time_per_iteration": 4.235036373138428 + }, + { + "auxiliary_loss_clip": 0.01082029, + "auxiliary_loss_mlp": 0.01044713, + "balance_loss_clip": 1.03485084, + "balance_loss_mlp": 1.02610493, + "epoch": 0.1554787313993687, + "flos": 19172097100800.0, + "grad_norm": 2.4139723065961083, + "language_loss": 0.61220276, + "learning_rate": 3.837097159674286e-06, + "loss": 0.63347018, + "num_input_tokens_seen": 56179390, + "step": 2586, + "time_per_iteration": 2.673238754272461 + }, + { + "auxiliary_loss_clip": 0.01085697, + "auxiliary_loss_mlp": 0.01044861, + "balance_loss_clip": 1.03315043, + "balance_loss_mlp": 1.02607334, + "epoch": 0.15553885465203668, + "flos": 16143822207360.0, + "grad_norm": 1.7916086382924878, + "language_loss": 0.81389844, + "learning_rate": 3.836943167480296e-06, + "loss": 0.83520401, + "num_input_tokens_seen": 56198020, + "step": 2587, + "time_per_iteration": 2.719332456588745 + }, + { + "auxiliary_loss_clip": 0.01123111, + "auxiliary_loss_mlp": 0.01051629, + "balance_loss_clip": 1.03825307, + "balance_loss_mlp": 1.02911055, + "epoch": 0.15559897790470464, + "flos": 25337779384320.0, + "grad_norm": 2.1958606606963946, + "language_loss": 0.8919127, + "learning_rate": 3.836789105629236e-06, + "loss": 0.91366011, + "num_input_tokens_seen": 56218165, + "step": 2588, + "time_per_iteration": 2.7594802379608154 + }, + { + "auxiliary_loss_clip": 0.0105814, + "auxiliary_loss_mlp": 0.01057277, + "balance_loss_clip": 1.03281927, + "balance_loss_mlp": 1.0359621, + "epoch": 0.1556591011573726, + "flos": 23148772744320.0, + "grad_norm": 2.0165721686114533, + "language_loss": 0.64137834, + "learning_rate": 3.83663497412695e-06, + "loss": 0.66253257, + "num_input_tokens_seen": 56237160, + "step": 2589, + "time_per_iteration": 2.685192346572876 + }, + { + "auxiliary_loss_clip": 0.01058779, + "auxiliary_loss_mlp": 0.01044112, + "balance_loss_clip": 1.02974093, + "balance_loss_mlp": 1.02311921, + "epoch": 0.15571922441004057, + "flos": 25370888745600.0, + "grad_norm": 2.0775918740595465, + "language_loss": 0.82873857, + "learning_rate": 3.836480772979281e-06, + "loss": 0.84976745, + "num_input_tokens_seen": 56257610, + "step": 2590, + "time_per_iteration": 2.6452953815460205 + }, + { + "auxiliary_loss_clip": 0.01085338, + "auxiliary_loss_mlp": 0.01041007, + "balance_loss_clip": 1.03426099, + "balance_loss_mlp": 1.02216029, + "epoch": 0.15577934766270854, + "flos": 14501375890560.0, + "grad_norm": 2.1759810305008465, + "language_loss": 0.79361963, + "learning_rate": 3.836326502192077e-06, + "loss": 0.81488311, + "num_input_tokens_seen": 56275215, + "step": 2591, + "time_per_iteration": 4.579212665557861 + }, + { + "auxiliary_loss_clip": 0.01107783, + "auxiliary_loss_mlp": 0.01047208, + "balance_loss_clip": 1.03724432, + "balance_loss_mlp": 1.02925515, + "epoch": 0.15583947091537653, + "flos": 37414537372800.0, + "grad_norm": 2.6456929700686063, + "language_loss": 0.64767468, + "learning_rate": 3.836172161771189e-06, + "loss": 0.66922456, + "num_input_tokens_seen": 56297130, + "step": 2592, + "time_per_iteration": 5.903934001922607 + }, + { + "auxiliary_loss_clip": 0.01102184, + "auxiliary_loss_mlp": 0.01051099, + "balance_loss_clip": 1.04093027, + "balance_loss_mlp": 1.03067899, + "epoch": 0.1558995941680445, + "flos": 21834729498240.0, + "grad_norm": 2.035256613419279, + "language_loss": 0.82060266, + "learning_rate": 3.836017751722467e-06, + "loss": 0.84213549, + "num_input_tokens_seen": 56314995, + "step": 2593, + "time_per_iteration": 2.739927053451538 + }, + { + "auxiliary_loss_clip": 0.01106816, + "auxiliary_loss_mlp": 0.01045684, + "balance_loss_clip": 1.0369699, + "balance_loss_mlp": 1.02591896, + "epoch": 0.15595971742071246, + "flos": 19792633484160.0, + "grad_norm": 2.0613922992459495, + "language_loss": 0.72811979, + "learning_rate": 3.8358632720517695e-06, + "loss": 0.74964476, + "num_input_tokens_seen": 56334005, + "step": 2594, + "time_per_iteration": 2.567192554473877 + }, + { + "auxiliary_loss_clip": 0.01071905, + "auxiliary_loss_mlp": 0.01039354, + "balance_loss_clip": 1.02948976, + "balance_loss_mlp": 1.01988721, + "epoch": 0.15601984067338043, + "flos": 26722135503360.0, + "grad_norm": 5.374310685900384, + "language_loss": 0.81620556, + "learning_rate": 3.835708722764952e-06, + "loss": 0.83731818, + "num_input_tokens_seen": 56353795, + "step": 2595, + "time_per_iteration": 2.630751848220825 + }, + { + "auxiliary_loss_clip": 0.01116995, + "auxiliary_loss_mlp": 0.01043007, + "balance_loss_clip": 1.03527141, + "balance_loss_mlp": 1.02394557, + "epoch": 0.1560799639260484, + "flos": 18369278173440.0, + "grad_norm": 1.8183791986265139, + "language_loss": 0.86705095, + "learning_rate": 3.835554103867876e-06, + "loss": 0.88865095, + "num_input_tokens_seen": 56373195, + "step": 2596, + "time_per_iteration": 2.526045322418213 + }, + { + "auxiliary_loss_clip": 0.01107721, + "auxiliary_loss_mlp": 0.0104226, + "balance_loss_clip": 1.0385536, + "balance_loss_mlp": 1.02379441, + "epoch": 0.15614008717871636, + "flos": 22598980197120.0, + "grad_norm": 1.6786679865798149, + "language_loss": 0.68751931, + "learning_rate": 3.835399415366404e-06, + "loss": 0.70901906, + "num_input_tokens_seen": 56391525, + "step": 2597, + "time_per_iteration": 2.593975305557251 + }, + { + "auxiliary_loss_clip": 0.01094444, + "auxiliary_loss_mlp": 0.01044672, + "balance_loss_clip": 1.03987861, + "balance_loss_mlp": 1.02630234, + "epoch": 0.15620021043138435, + "flos": 22746860490240.0, + "grad_norm": 1.770601473301218, + "language_loss": 0.79903984, + "learning_rate": 3.8352446572664035e-06, + "loss": 0.82043099, + "num_input_tokens_seen": 56410715, + "step": 2598, + "time_per_iteration": 2.767094612121582 + }, + { + "auxiliary_loss_clip": 0.0109353, + "auxiliary_loss_mlp": 0.00749007, + "balance_loss_clip": 1.03506017, + "balance_loss_mlp": 1.00057352, + "epoch": 0.15626033368405232, + "flos": 13114936782720.0, + "grad_norm": 1.7808795920065772, + "language_loss": 0.82596332, + "learning_rate": 3.8350898295737405e-06, + "loss": 0.84438872, + "num_input_tokens_seen": 56429170, + "step": 2599, + "time_per_iteration": 2.7019450664520264 + }, + { + "auxiliary_loss_clip": 0.01122373, + "auxiliary_loss_mlp": 0.01050483, + "balance_loss_clip": 1.03708386, + "balance_loss_mlp": 1.02932286, + "epoch": 0.15632045693672028, + "flos": 16472297105280.0, + "grad_norm": 2.1852012197062196, + "language_loss": 0.81582773, + "learning_rate": 3.834934932294287e-06, + "loss": 0.83755624, + "num_input_tokens_seen": 56445685, + "step": 2600, + "time_per_iteration": 2.538729429244995 + }, + { + "auxiliary_loss_clip": 0.01122432, + "auxiliary_loss_mlp": 0.00749099, + "balance_loss_clip": 1.04049766, + "balance_loss_mlp": 1.00069857, + "epoch": 0.15638058018938825, + "flos": 20850346298880.0, + "grad_norm": 1.8975311140249402, + "language_loss": 0.88414162, + "learning_rate": 3.834779965433917e-06, + "loss": 0.90285695, + "num_input_tokens_seen": 56465900, + "step": 2601, + "time_per_iteration": 2.581544876098633 + }, + { + "auxiliary_loss_clip": 0.01122876, + "auxiliary_loss_mlp": 0.01070141, + "balance_loss_clip": 1.04023886, + "balance_loss_mlp": 1.04821897, + "epoch": 0.1564407034420562, + "flos": 21872220318720.0, + "grad_norm": 1.7050261172649024, + "language_loss": 0.7891531, + "learning_rate": 3.834624928998508e-06, + "loss": 0.81108332, + "num_input_tokens_seen": 56485020, + "step": 2602, + "time_per_iteration": 2.606022357940674 + }, + { + "auxiliary_loss_clip": 0.01084905, + "auxiliary_loss_mlp": 0.010444, + "balance_loss_clip": 1.03510201, + "balance_loss_mlp": 1.02501678, + "epoch": 0.15650082669472418, + "flos": 21834549930240.0, + "grad_norm": 1.7478272072826124, + "language_loss": 0.73942971, + "learning_rate": 3.8344698229939376e-06, + "loss": 0.76072276, + "num_input_tokens_seen": 56505205, + "step": 2603, + "time_per_iteration": 2.6962881088256836 + }, + { + "auxiliary_loss_clip": 0.01104837, + "auxiliary_loss_mlp": 0.01056785, + "balance_loss_clip": 1.03515339, + "balance_loss_mlp": 1.03750849, + "epoch": 0.15656094994739214, + "flos": 13800542653440.0, + "grad_norm": 3.1159579421029924, + "language_loss": 0.87216818, + "learning_rate": 3.8343146474260865e-06, + "loss": 0.8937844, + "num_input_tokens_seen": 56521495, + "step": 2604, + "time_per_iteration": 2.6195836067199707 + }, + { + "auxiliary_loss_clip": 0.01109814, + "auxiliary_loss_mlp": 0.01045922, + "balance_loss_clip": 1.03739214, + "balance_loss_mlp": 1.02606153, + "epoch": 0.15662107320006013, + "flos": 27308197808640.0, + "grad_norm": 1.9762791107916347, + "language_loss": 0.85344744, + "learning_rate": 3.834159402300841e-06, + "loss": 0.87500477, + "num_input_tokens_seen": 56540665, + "step": 2605, + "time_per_iteration": 2.7030436992645264 + }, + { + "auxiliary_loss_clip": 0.01112313, + "auxiliary_loss_mlp": 0.0104979, + "balance_loss_clip": 1.03639126, + "balance_loss_mlp": 1.02866578, + "epoch": 0.1566811964527281, + "flos": 26685075646080.0, + "grad_norm": 3.784516635529001, + "language_loss": 0.73450065, + "learning_rate": 3.834004087624087e-06, + "loss": 0.7561217, + "num_input_tokens_seen": 56560805, + "step": 2606, + "time_per_iteration": 2.665766477584839 + }, + { + "auxiliary_loss_clip": 0.01121604, + "auxiliary_loss_mlp": 0.01050924, + "balance_loss_clip": 1.04097807, + "balance_loss_mlp": 1.03252959, + "epoch": 0.15674131970539606, + "flos": 16103422385280.0, + "grad_norm": 2.084667302159628, + "language_loss": 0.76291108, + "learning_rate": 3.8338487034017145e-06, + "loss": 0.78463626, + "num_input_tokens_seen": 56576335, + "step": 2607, + "time_per_iteration": 2.563485860824585 + }, + { + "auxiliary_loss_clip": 0.01084207, + "auxiliary_loss_mlp": 0.01046946, + "balance_loss_clip": 1.0368005, + "balance_loss_mlp": 1.02857602, + "epoch": 0.15680144295806403, + "flos": 19169690889600.0, + "grad_norm": 1.8375580624875598, + "language_loss": 0.82037532, + "learning_rate": 3.833693249639615e-06, + "loss": 0.8416869, + "num_input_tokens_seen": 56595880, + "step": 2608, + "time_per_iteration": 2.8398027420043945 + }, + { + "auxiliary_loss_clip": 0.01092269, + "auxiliary_loss_mlp": 0.01049066, + "balance_loss_clip": 1.03564572, + "balance_loss_mlp": 1.02661908, + "epoch": 0.156861566210732, + "flos": 20813430096000.0, + "grad_norm": 2.0505939094682053, + "language_loss": 0.72495449, + "learning_rate": 3.833537726343684e-06, + "loss": 0.74636781, + "num_input_tokens_seen": 56615130, + "step": 2609, + "time_per_iteration": 2.6297507286071777 + }, + { + "auxiliary_loss_clip": 0.01103472, + "auxiliary_loss_mlp": 0.01048291, + "balance_loss_clip": 1.03427482, + "balance_loss_mlp": 1.0277034, + "epoch": 0.15692168946339996, + "flos": 20047922421120.0, + "grad_norm": 1.7664757905289832, + "language_loss": 0.72021449, + "learning_rate": 3.833382133519818e-06, + "loss": 0.74173212, + "num_input_tokens_seen": 56634005, + "step": 2610, + "time_per_iteration": 2.6239371299743652 + }, + { + "auxiliary_loss_clip": 0.01120825, + "auxiliary_loss_mlp": 0.01052065, + "balance_loss_clip": 1.03728306, + "balance_loss_mlp": 1.03076243, + "epoch": 0.15698181271606793, + "flos": 21398019943680.0, + "grad_norm": 1.7750345028601437, + "language_loss": 0.72534108, + "learning_rate": 3.833226471173919e-06, + "loss": 0.74706995, + "num_input_tokens_seen": 56653480, + "step": 2611, + "time_per_iteration": 2.523017406463623 + }, + { + "auxiliary_loss_clip": 0.01101007, + "auxiliary_loss_mlp": 0.01051767, + "balance_loss_clip": 1.03554928, + "balance_loss_mlp": 1.03146565, + "epoch": 0.15704193596873592, + "flos": 20845785271680.0, + "grad_norm": 2.000787152998562, + "language_loss": 0.70834363, + "learning_rate": 3.833070739311887e-06, + "loss": 0.72987139, + "num_input_tokens_seen": 56672270, + "step": 2612, + "time_per_iteration": 2.749708652496338 + }, + { + "auxiliary_loss_clip": 0.0107356, + "auxiliary_loss_mlp": 0.01052581, + "balance_loss_clip": 1.03427589, + "balance_loss_mlp": 1.03235173, + "epoch": 0.15710205922140388, + "flos": 21762908254080.0, + "grad_norm": 2.026409925248897, + "language_loss": 0.75583994, + "learning_rate": 3.83291493793963e-06, + "loss": 0.7771014, + "num_input_tokens_seen": 56691510, + "step": 2613, + "time_per_iteration": 2.762510061264038 + }, + { + "auxiliary_loss_clip": 0.0107943, + "auxiliary_loss_mlp": 0.01057507, + "balance_loss_clip": 1.03414917, + "balance_loss_mlp": 1.03719378, + "epoch": 0.15716218247407185, + "flos": 25007760201600.0, + "grad_norm": 2.7270801145285612, + "language_loss": 0.65841031, + "learning_rate": 3.832759067063055e-06, + "loss": 0.67977971, + "num_input_tokens_seen": 56712230, + "step": 2614, + "time_per_iteration": 2.857241630554199 + }, + { + "auxiliary_loss_clip": 0.01112765, + "auxiliary_loss_mlp": 0.01046756, + "balance_loss_clip": 1.03836083, + "balance_loss_mlp": 1.02643108, + "epoch": 0.1572223057267398, + "flos": 20191780391040.0, + "grad_norm": 2.241065084839668, + "language_loss": 0.7546196, + "learning_rate": 3.832603126688072e-06, + "loss": 0.77621484, + "num_input_tokens_seen": 56727490, + "step": 2615, + "time_per_iteration": 2.655266284942627 + }, + { + "auxiliary_loss_clip": 0.01098117, + "auxiliary_loss_mlp": 0.01052479, + "balance_loss_clip": 1.03746283, + "balance_loss_mlp": 1.03314376, + "epoch": 0.15728242897940778, + "flos": 20959514709120.0, + "grad_norm": 1.6054460406185747, + "language_loss": 0.73019505, + "learning_rate": 3.832447116820594e-06, + "loss": 0.751701, + "num_input_tokens_seen": 56747385, + "step": 2616, + "time_per_iteration": 2.547410726547241 + }, + { + "auxiliary_loss_clip": 0.010979, + "auxiliary_loss_mlp": 0.0105557, + "balance_loss_clip": 1.03782034, + "balance_loss_mlp": 1.03531671, + "epoch": 0.15734255223207574, + "flos": 23038275530880.0, + "grad_norm": 1.9521739577566257, + "language_loss": 0.72640812, + "learning_rate": 3.832291037466539e-06, + "loss": 0.74794281, + "num_input_tokens_seen": 56768055, + "step": 2617, + "time_per_iteration": 2.5789036750793457 + }, + { + "auxiliary_loss_clip": 0.0109866, + "auxiliary_loss_mlp": 0.01047035, + "balance_loss_clip": 1.03716779, + "balance_loss_mlp": 1.02692497, + "epoch": 0.15740267548474374, + "flos": 20551281661440.0, + "grad_norm": 2.3801812628684536, + "language_loss": 0.74755275, + "learning_rate": 3.8321348886318235e-06, + "loss": 0.76900965, + "num_input_tokens_seen": 56785110, + "step": 2618, + "time_per_iteration": 2.556917190551758 + }, + { + "auxiliary_loss_clip": 0.01121294, + "auxiliary_loss_mlp": 0.01049283, + "balance_loss_clip": 1.03711033, + "balance_loss_mlp": 1.02707422, + "epoch": 0.1574627987374117, + "flos": 22666922772480.0, + "grad_norm": 1.9330453428363996, + "language_loss": 0.78714252, + "learning_rate": 3.8319786703223695e-06, + "loss": 0.80884832, + "num_input_tokens_seen": 56804975, + "step": 2619, + "time_per_iteration": 2.5277576446533203 + }, + { + "auxiliary_loss_clip": 0.01086732, + "auxiliary_loss_mlp": 0.01053703, + "balance_loss_clip": 1.03731966, + "balance_loss_mlp": 1.03417635, + "epoch": 0.15752292199007967, + "flos": 16800664262400.0, + "grad_norm": 2.591973741422475, + "language_loss": 0.77035069, + "learning_rate": 3.831822382544101e-06, + "loss": 0.79175502, + "num_input_tokens_seen": 56822470, + "step": 2620, + "time_per_iteration": 2.598659038543701 + }, + { + "auxiliary_loss_clip": 0.01098058, + "auxiliary_loss_mlp": 0.01050049, + "balance_loss_clip": 1.03758478, + "balance_loss_mlp": 1.02911615, + "epoch": 0.15758304524274763, + "flos": 29826002568960.0, + "grad_norm": 1.984254468326998, + "language_loss": 0.70898306, + "learning_rate": 3.831666025302944e-06, + "loss": 0.73046416, + "num_input_tokens_seen": 56842100, + "step": 2621, + "time_per_iteration": 2.717297315597534 + }, + { + "auxiliary_loss_clip": 0.01066043, + "auxiliary_loss_mlp": 0.0105476, + "balance_loss_clip": 1.03518152, + "balance_loss_mlp": 1.0326829, + "epoch": 0.1576431684954156, + "flos": 53577426723840.0, + "grad_norm": 1.9756882938200502, + "language_loss": 0.7218219, + "learning_rate": 3.831509598604828e-06, + "loss": 0.74302995, + "num_input_tokens_seen": 56865920, + "step": 2622, + "time_per_iteration": 2.996148109436035 + }, + { + "auxiliary_loss_clip": 0.01048131, + "auxiliary_loss_mlp": 0.01048015, + "balance_loss_clip": 1.03061509, + "balance_loss_mlp": 1.02892923, + "epoch": 0.15770329174808356, + "flos": 20813609664000.0, + "grad_norm": 1.7757990891293123, + "language_loss": 0.88006538, + "learning_rate": 3.831353102455684e-06, + "loss": 0.90102684, + "num_input_tokens_seen": 56885265, + "step": 2623, + "time_per_iteration": 2.7239370346069336 + }, + { + "auxiliary_loss_clip": 0.01120356, + "auxiliary_loss_mlp": 0.01045047, + "balance_loss_clip": 1.03917623, + "balance_loss_mlp": 1.0262841, + "epoch": 0.15776341500075153, + "flos": 24974004395520.0, + "grad_norm": 1.5924174242637026, + "language_loss": 0.81667519, + "learning_rate": 3.831196536861448e-06, + "loss": 0.83832926, + "num_input_tokens_seen": 56906710, + "step": 2624, + "time_per_iteration": 2.5778136253356934 + }, + { + "auxiliary_loss_clip": 0.01086209, + "auxiliary_loss_mlp": 0.01049553, + "balance_loss_clip": 1.03509545, + "balance_loss_mlp": 1.02916861, + "epoch": 0.15782353825341952, + "flos": 21907915459200.0, + "grad_norm": 2.3160274208035943, + "language_loss": 0.79702711, + "learning_rate": 3.831039901828054e-06, + "loss": 0.81838477, + "num_input_tokens_seen": 56924275, + "step": 2625, + "time_per_iteration": 2.7280921936035156 + }, + { + "auxiliary_loss_clip": 0.01119591, + "auxiliary_loss_mlp": 0.01048082, + "balance_loss_clip": 1.03946877, + "balance_loss_mlp": 1.02966428, + "epoch": 0.15788366150608749, + "flos": 26177191292160.0, + "grad_norm": 2.510838192969288, + "language_loss": 0.80452967, + "learning_rate": 3.830883197361445e-06, + "loss": 0.82620639, + "num_input_tokens_seen": 56941525, + "step": 2626, + "time_per_iteration": 2.604332208633423 + }, + { + "auxiliary_loss_clip": 0.01063483, + "auxiliary_loss_mlp": 0.01053378, + "balance_loss_clip": 1.03943443, + "balance_loss_mlp": 1.03144372, + "epoch": 0.15794378475875545, + "flos": 27709822753920.0, + "grad_norm": 2.7524611800088485, + "language_loss": 0.73679, + "learning_rate": 3.830726423467561e-06, + "loss": 0.75795865, + "num_input_tokens_seen": 56962145, + "step": 2627, + "time_per_iteration": 2.7852869033813477 + }, + { + "auxiliary_loss_clip": 0.01075519, + "auxiliary_loss_mlp": 0.01052214, + "balance_loss_clip": 1.03564215, + "balance_loss_mlp": 1.03198457, + "epoch": 0.15800390801142342, + "flos": 12130158533760.0, + "grad_norm": 2.2809934873424127, + "language_loss": 0.85106778, + "learning_rate": 3.830569580152348e-06, + "loss": 0.87234509, + "num_input_tokens_seen": 56977505, + "step": 2628, + "time_per_iteration": 2.7011489868164062 + }, + { + "auxiliary_loss_clip": 0.01095442, + "auxiliary_loss_mlp": 0.01043214, + "balance_loss_clip": 1.03737187, + "balance_loss_mlp": 1.02505827, + "epoch": 0.15806403126409138, + "flos": 20704728562560.0, + "grad_norm": 2.628734617197477, + "language_loss": 0.76978785, + "learning_rate": 3.830412667421752e-06, + "loss": 0.79117441, + "num_input_tokens_seen": 56996770, + "step": 2629, + "time_per_iteration": 2.765458822250366 + }, + { + "auxiliary_loss_clip": 0.01110365, + "auxiliary_loss_mlp": 0.01050355, + "balance_loss_clip": 1.03986418, + "balance_loss_mlp": 1.03028023, + "epoch": 0.15812415451675935, + "flos": 17821712269440.0, + "grad_norm": 3.417301247278976, + "language_loss": 0.73255867, + "learning_rate": 3.8302556852817245e-06, + "loss": 0.75416589, + "num_input_tokens_seen": 57014970, + "step": 2630, + "time_per_iteration": 2.6574244499206543 + }, + { + "auxiliary_loss_clip": 0.01113613, + "auxiliary_loss_mlp": 0.01048629, + "balance_loss_clip": 1.03790438, + "balance_loss_mlp": 1.02816105, + "epoch": 0.15818427776942734, + "flos": 20084048524800.0, + "grad_norm": 1.9893560916060873, + "language_loss": 0.83684051, + "learning_rate": 3.8300986337382184e-06, + "loss": 0.85846293, + "num_input_tokens_seen": 57034045, + "step": 2631, + "time_per_iteration": 2.585500478744507 + }, + { + "auxiliary_loss_clip": 0.01119296, + "auxiliary_loss_mlp": 0.01045477, + "balance_loss_clip": 1.03756785, + "balance_loss_mlp": 1.02647531, + "epoch": 0.1582444010220953, + "flos": 21214911386880.0, + "grad_norm": 1.628715989528351, + "language_loss": 0.78751343, + "learning_rate": 3.8299415127971895e-06, + "loss": 0.80916119, + "num_input_tokens_seen": 57053695, + "step": 2632, + "time_per_iteration": 2.5342092514038086 + }, + { + "auxiliary_loss_clip": 0.01111811, + "auxiliary_loss_mlp": 0.0105765, + "balance_loss_clip": 1.03852224, + "balance_loss_mlp": 1.03769433, + "epoch": 0.15830452427476327, + "flos": 17858341163520.0, + "grad_norm": 2.031608535778632, + "language_loss": 0.8332938, + "learning_rate": 3.829784322464594e-06, + "loss": 0.85498834, + "num_input_tokens_seen": 57071290, + "step": 2633, + "time_per_iteration": 4.209028482437134 + }, + { + "auxiliary_loss_clip": 0.01125764, + "auxiliary_loss_mlp": 0.01043457, + "balance_loss_clip": 1.04173911, + "balance_loss_mlp": 1.02332222, + "epoch": 0.15836464752743123, + "flos": 24534960456960.0, + "grad_norm": 1.7166259524719314, + "language_loss": 0.77141893, + "learning_rate": 3.829627062746394e-06, + "loss": 0.79311109, + "num_input_tokens_seen": 57091465, + "step": 2634, + "time_per_iteration": 2.6552302837371826 + }, + { + "auxiliary_loss_clip": 0.01091047, + "auxiliary_loss_mlp": 0.00749227, + "balance_loss_clip": 1.03879786, + "balance_loss_mlp": 1.00077081, + "epoch": 0.1584247707800992, + "flos": 20120821073280.0, + "grad_norm": 1.949497566741892, + "language_loss": 0.89202297, + "learning_rate": 3.829469733648552e-06, + "loss": 0.91042578, + "num_input_tokens_seen": 57110075, + "step": 2635, + "time_per_iteration": 2.657991409301758 + }, + { + "auxiliary_loss_clip": 0.01048543, + "auxiliary_loss_mlp": 0.01055061, + "balance_loss_clip": 1.03022194, + "balance_loss_mlp": 1.03295946, + "epoch": 0.15848489403276717, + "flos": 20375966355840.0, + "grad_norm": 1.9954080212583598, + "language_loss": 0.75336516, + "learning_rate": 3.829312335177034e-06, + "loss": 0.77440119, + "num_input_tokens_seen": 57128945, + "step": 2636, + "time_per_iteration": 2.7004711627960205 + }, + { + "auxiliary_loss_clip": 0.01089922, + "auxiliary_loss_mlp": 0.01047152, + "balance_loss_clip": 1.03902125, + "balance_loss_mlp": 1.02562261, + "epoch": 0.15854501728543513, + "flos": 39346890359040.0, + "grad_norm": 2.216164647755345, + "language_loss": 0.7250185, + "learning_rate": 3.82915486733781e-06, + "loss": 0.74638927, + "num_input_tokens_seen": 57152385, + "step": 2637, + "time_per_iteration": 2.822871685028076 + }, + { + "auxiliary_loss_clip": 0.01107615, + "auxiliary_loss_mlp": 0.01044701, + "balance_loss_clip": 1.03684044, + "balance_loss_mlp": 1.02648592, + "epoch": 0.15860514053810312, + "flos": 24864225454080.0, + "grad_norm": 1.922365202879586, + "language_loss": 0.77753866, + "learning_rate": 3.82899733013685e-06, + "loss": 0.79906178, + "num_input_tokens_seen": 57172620, + "step": 2638, + "time_per_iteration": 2.6535587310791016 + }, + { + "auxiliary_loss_clip": 0.01088461, + "auxiliary_loss_mlp": 0.01065551, + "balance_loss_clip": 1.03505683, + "balance_loss_mlp": 1.04337788, + "epoch": 0.1586652637907711, + "flos": 26177694082560.0, + "grad_norm": 3.3458725967128378, + "language_loss": 0.75430024, + "learning_rate": 3.828839723580128e-06, + "loss": 0.7758404, + "num_input_tokens_seen": 57194680, + "step": 2639, + "time_per_iteration": 4.376213788986206 + }, + { + "auxiliary_loss_clip": 0.01060056, + "auxiliary_loss_mlp": 0.0105688, + "balance_loss_clip": 1.03485799, + "balance_loss_mlp": 1.03649521, + "epoch": 0.15872538704343905, + "flos": 19792058866560.0, + "grad_norm": 1.7423875407918452, + "language_loss": 0.81305492, + "learning_rate": 3.82868204767362e-06, + "loss": 0.83422428, + "num_input_tokens_seen": 57214675, + "step": 2640, + "time_per_iteration": 5.926401376724243 + }, + { + "auxiliary_loss_clip": 0.01086493, + "auxiliary_loss_mlp": 0.0105251, + "balance_loss_clip": 1.0337677, + "balance_loss_mlp": 1.03125465, + "epoch": 0.15878551029610702, + "flos": 28475366342400.0, + "grad_norm": 1.5108872981790422, + "language_loss": 0.66883737, + "learning_rate": 3.828524302423306e-06, + "loss": 0.69022745, + "num_input_tokens_seen": 57235830, + "step": 2641, + "time_per_iteration": 2.851408004760742 + }, + { + "auxiliary_loss_clip": 0.01099369, + "auxiliary_loss_mlp": 0.0105589, + "balance_loss_clip": 1.03633904, + "balance_loss_mlp": 1.0353148, + "epoch": 0.15884563354877498, + "flos": 24206701040640.0, + "grad_norm": 2.1598239486035524, + "language_loss": 0.7529664, + "learning_rate": 3.828366487835167e-06, + "loss": 0.77451903, + "num_input_tokens_seen": 57255970, + "step": 2642, + "time_per_iteration": 2.7080190181732178 + }, + { + "auxiliary_loss_clip": 0.01111053, + "auxiliary_loss_mlp": 0.01050228, + "balance_loss_clip": 1.04046583, + "balance_loss_mlp": 1.03078508, + "epoch": 0.15890575680144295, + "flos": 23949795991680.0, + "grad_norm": 2.1442496817636503, + "language_loss": 0.70246297, + "learning_rate": 3.828208603915186e-06, + "loss": 0.72407579, + "num_input_tokens_seen": 57274435, + "step": 2643, + "time_per_iteration": 2.6151952743530273 + }, + { + "auxiliary_loss_clip": 0.01117273, + "auxiliary_loss_mlp": 0.0104676, + "balance_loss_clip": 1.03783762, + "balance_loss_mlp": 1.02848577, + "epoch": 0.15896588005411091, + "flos": 21215019127680.0, + "grad_norm": 1.9853670224441096, + "language_loss": 0.78166485, + "learning_rate": 3.828050650669353e-06, + "loss": 0.80330515, + "num_input_tokens_seen": 57293115, + "step": 2644, + "time_per_iteration": 2.619187831878662 + }, + { + "auxiliary_loss_clip": 0.0110867, + "auxiliary_loss_mlp": 0.01052581, + "balance_loss_clip": 1.03730249, + "balance_loss_mlp": 1.03298283, + "epoch": 0.1590260033067789, + "flos": 24352390604160.0, + "grad_norm": 1.8539155984819653, + "language_loss": 0.82102197, + "learning_rate": 3.827892628103657e-06, + "loss": 0.8426345, + "num_input_tokens_seen": 57312565, + "step": 2645, + "time_per_iteration": 2.6373932361602783 + }, + { + "auxiliary_loss_clip": 0.01121521, + "auxiliary_loss_mlp": 0.01047744, + "balance_loss_clip": 1.03786635, + "balance_loss_mlp": 1.02684712, + "epoch": 0.15908612655944687, + "flos": 32048944583040.0, + "grad_norm": 2.0856591732728944, + "language_loss": 0.70033348, + "learning_rate": 3.827734536224087e-06, + "loss": 0.72202611, + "num_input_tokens_seen": 57333360, + "step": 2646, + "time_per_iteration": 2.646376371383667 + }, + { + "auxiliary_loss_clip": 0.01096691, + "auxiliary_loss_mlp": 0.01043856, + "balance_loss_clip": 1.0382967, + "balance_loss_mlp": 1.02528358, + "epoch": 0.15914624981211484, + "flos": 17785370684160.0, + "grad_norm": 2.0883547947720986, + "language_loss": 0.62282693, + "learning_rate": 3.827576375036642e-06, + "loss": 0.64423239, + "num_input_tokens_seen": 57350575, + "step": 2647, + "time_per_iteration": 2.650660991668701 + }, + { + "auxiliary_loss_clip": 0.01121435, + "auxiliary_loss_mlp": 0.01045203, + "balance_loss_clip": 1.04107034, + "balance_loss_mlp": 1.02635598, + "epoch": 0.1592063730647828, + "flos": 17712507945600.0, + "grad_norm": 1.8837551688732648, + "language_loss": 0.89163768, + "learning_rate": 3.827418144547318e-06, + "loss": 0.91330409, + "num_input_tokens_seen": 57367570, + "step": 2648, + "time_per_iteration": 2.581962823867798 + }, + { + "auxiliary_loss_clip": 0.01116445, + "auxiliary_loss_mlp": 0.01045245, + "balance_loss_clip": 1.03859031, + "balance_loss_mlp": 1.02738762, + "epoch": 0.15926649631745077, + "flos": 18803545603200.0, + "grad_norm": 1.8005664026095811, + "language_loss": 0.9135294, + "learning_rate": 3.827259844762114e-06, + "loss": 0.93514633, + "num_input_tokens_seen": 57383980, + "step": 2649, + "time_per_iteration": 2.643777370452881 + }, + { + "auxiliary_loss_clip": 0.01067104, + "auxiliary_loss_mlp": 0.01043949, + "balance_loss_clip": 1.04051089, + "balance_loss_mlp": 1.02277732, + "epoch": 0.15932661957011873, + "flos": 17566243764480.0, + "grad_norm": 3.730561303871316, + "language_loss": 0.71185935, + "learning_rate": 3.827101475687033e-06, + "loss": 0.73296982, + "num_input_tokens_seen": 57400840, + "step": 2650, + "time_per_iteration": 2.8130834102630615 + }, + { + "auxiliary_loss_clip": 0.0110224, + "auxiliary_loss_mlp": 0.01043385, + "balance_loss_clip": 1.03526521, + "balance_loss_mlp": 1.02598035, + "epoch": 0.15938674282278673, + "flos": 13334351011200.0, + "grad_norm": 1.9569324656047902, + "language_loss": 0.70826089, + "learning_rate": 3.826943037328082e-06, + "loss": 0.72971714, + "num_input_tokens_seen": 57419230, + "step": 2651, + "time_per_iteration": 2.543311595916748 + }, + { + "auxiliary_loss_clip": 0.01074077, + "auxiliary_loss_mlp": 0.00749143, + "balance_loss_clip": 1.03384161, + "balance_loss_mlp": 1.00084043, + "epoch": 0.1594468660754547, + "flos": 22488842119680.0, + "grad_norm": 1.9111897374986162, + "language_loss": 0.80112237, + "learning_rate": 3.8267845296912674e-06, + "loss": 0.81935453, + "num_input_tokens_seen": 57439315, + "step": 2652, + "time_per_iteration": 2.6799232959747314 + }, + { + "auxiliary_loss_clip": 0.01094944, + "auxiliary_loss_mlp": 0.00749073, + "balance_loss_clip": 1.03695512, + "balance_loss_mlp": 1.00086188, + "epoch": 0.15950698932812266, + "flos": 15007320910080.0, + "grad_norm": 2.4535553070089224, + "language_loss": 0.70193374, + "learning_rate": 3.826625952782601e-06, + "loss": 0.72037393, + "num_input_tokens_seen": 57454635, + "step": 2653, + "time_per_iteration": 2.5790743827819824 + }, + { + "auxiliary_loss_clip": 0.01108659, + "auxiliary_loss_mlp": 0.01041942, + "balance_loss_clip": 1.03879952, + "balance_loss_mlp": 1.02261853, + "epoch": 0.15956711258079062, + "flos": 30155052084480.0, + "grad_norm": 4.140857821661535, + "language_loss": 0.76585329, + "learning_rate": 3.826467306608095e-06, + "loss": 0.78735936, + "num_input_tokens_seen": 57476805, + "step": 2654, + "time_per_iteration": 2.643021821975708 + }, + { + "auxiliary_loss_clip": 0.01077563, + "auxiliary_loss_mlp": 0.0104298, + "balance_loss_clip": 1.03442359, + "balance_loss_mlp": 1.02455008, + "epoch": 0.1596272358334586, + "flos": 21032700670080.0, + "grad_norm": 3.2749951953698755, + "language_loss": 0.82129103, + "learning_rate": 3.826308591173765e-06, + "loss": 0.84249645, + "num_input_tokens_seen": 57496400, + "step": 2655, + "time_per_iteration": 2.705331563949585 + }, + { + "auxiliary_loss_clip": 0.01072852, + "auxiliary_loss_mlp": 0.01043752, + "balance_loss_clip": 1.03285336, + "balance_loss_mlp": 1.02538204, + "epoch": 0.15968735908612655, + "flos": 15268032800640.0, + "grad_norm": 1.9170918093734632, + "language_loss": 0.73882842, + "learning_rate": 3.826149806485631e-06, + "loss": 0.75999439, + "num_input_tokens_seen": 57513700, + "step": 2656, + "time_per_iteration": 2.7507965564727783 + }, + { + "auxiliary_loss_clip": 0.01079656, + "auxiliary_loss_mlp": 0.01044246, + "balance_loss_clip": 1.03535414, + "balance_loss_mlp": 1.02632916, + "epoch": 0.15974748233879452, + "flos": 52665726695040.0, + "grad_norm": 1.9708824322037766, + "language_loss": 0.7770977, + "learning_rate": 3.825990952549713e-06, + "loss": 0.79833663, + "num_input_tokens_seen": 57536180, + "step": 2657, + "time_per_iteration": 3.0204272270202637 + }, + { + "auxiliary_loss_clip": 0.01099047, + "auxiliary_loss_mlp": 0.01047426, + "balance_loss_clip": 1.03666556, + "balance_loss_mlp": 1.02806592, + "epoch": 0.1598076055914625, + "flos": 18733232730240.0, + "grad_norm": 1.6981806236622452, + "language_loss": 0.74754262, + "learning_rate": 3.825832029372035e-06, + "loss": 0.76900738, + "num_input_tokens_seen": 57555025, + "step": 2658, + "time_per_iteration": 2.6903743743896484 + }, + { + "auxiliary_loss_clip": 0.01096368, + "auxiliary_loss_mlp": 0.01054566, + "balance_loss_clip": 1.04147625, + "balance_loss_mlp": 1.03383589, + "epoch": 0.15986772884413047, + "flos": 34349238535680.0, + "grad_norm": 1.7102932398548019, + "language_loss": 0.75290382, + "learning_rate": 3.825673036958624e-06, + "loss": 0.77441323, + "num_input_tokens_seen": 57577660, + "step": 2659, + "time_per_iteration": 2.9174563884735107 + }, + { + "auxiliary_loss_clip": 0.01088988, + "auxiliary_loss_mlp": 0.01049265, + "balance_loss_clip": 1.03883171, + "balance_loss_mlp": 1.02917838, + "epoch": 0.15992785209679844, + "flos": 22054969739520.0, + "grad_norm": 2.076416434621994, + "language_loss": 0.90532517, + "learning_rate": 3.825513975315508e-06, + "loss": 0.92670774, + "num_input_tokens_seen": 57596335, + "step": 2660, + "time_per_iteration": 2.833186626434326 + }, + { + "auxiliary_loss_clip": 0.01070143, + "auxiliary_loss_mlp": 0.01050927, + "balance_loss_clip": 1.03655612, + "balance_loss_mlp": 1.03023171, + "epoch": 0.1599879753494664, + "flos": 33066652625280.0, + "grad_norm": 4.675814665962016, + "language_loss": 0.77549446, + "learning_rate": 3.82535484444872e-06, + "loss": 0.79670519, + "num_input_tokens_seen": 57616830, + "step": 2661, + "time_per_iteration": 2.9155900478363037 + }, + { + "auxiliary_loss_clip": 0.01092485, + "auxiliary_loss_mlp": 0.00749133, + "balance_loss_clip": 1.03581464, + "balance_loss_mlp": 1.00086999, + "epoch": 0.16004809860213437, + "flos": 28038010343040.0, + "grad_norm": 1.8806772257982287, + "language_loss": 0.74507022, + "learning_rate": 3.825195644364292e-06, + "loss": 0.76348644, + "num_input_tokens_seen": 57635515, + "step": 2662, + "time_per_iteration": 2.7384793758392334 + }, + { + "auxiliary_loss_clip": 0.01088182, + "auxiliary_loss_mlp": 0.00749079, + "balance_loss_clip": 1.03450942, + "balance_loss_mlp": 1.00074899, + "epoch": 0.16010822185480234, + "flos": 22780113505920.0, + "grad_norm": 5.456573344142505, + "language_loss": 0.81748092, + "learning_rate": 3.825036375068263e-06, + "loss": 0.83585358, + "num_input_tokens_seen": 57654250, + "step": 2663, + "time_per_iteration": 2.6600639820098877 + }, + { + "auxiliary_loss_clip": 0.01071667, + "auxiliary_loss_mlp": 0.01049608, + "balance_loss_clip": 1.03730416, + "balance_loss_mlp": 1.03015351, + "epoch": 0.16016834510747033, + "flos": 20084012611200.0, + "grad_norm": 1.927671721063683, + "language_loss": 0.79831457, + "learning_rate": 3.824877036566672e-06, + "loss": 0.81952739, + "num_input_tokens_seen": 57672645, + "step": 2664, + "time_per_iteration": 2.6971065998077393 + }, + { + "auxiliary_loss_clip": 0.01110329, + "auxiliary_loss_mlp": 0.01052754, + "balance_loss_clip": 1.03745127, + "balance_loss_mlp": 1.03313255, + "epoch": 0.1602284683601383, + "flos": 21173829206400.0, + "grad_norm": 2.839808693854912, + "language_loss": 0.9399066, + "learning_rate": 3.824717628865561e-06, + "loss": 0.96153748, + "num_input_tokens_seen": 57691055, + "step": 2665, + "time_per_iteration": 2.621201515197754 + }, + { + "auxiliary_loss_clip": 0.01086388, + "auxiliary_loss_mlp": 0.0104243, + "balance_loss_clip": 1.03568983, + "balance_loss_mlp": 1.02334452, + "epoch": 0.16028859161280626, + "flos": 14647568244480.0, + "grad_norm": 2.6462918601729104, + "language_loss": 0.85229874, + "learning_rate": 3.824558151970974e-06, + "loss": 0.87358689, + "num_input_tokens_seen": 57707235, + "step": 2666, + "time_per_iteration": 2.658231019973755 + }, + { + "auxiliary_loss_clip": 0.01088116, + "auxiliary_loss_mlp": 0.00749253, + "balance_loss_clip": 1.03776264, + "balance_loss_mlp": 1.00092506, + "epoch": 0.16034871486547422, + "flos": 20990325600000.0, + "grad_norm": 1.9422991732288495, + "language_loss": 0.81677711, + "learning_rate": 3.8243986058889595e-06, + "loss": 0.83515084, + "num_input_tokens_seen": 57724190, + "step": 2667, + "time_per_iteration": 2.756727933883667 + }, + { + "auxiliary_loss_clip": 0.01119925, + "auxiliary_loss_mlp": 0.01049663, + "balance_loss_clip": 1.03955817, + "balance_loss_mlp": 1.02965999, + "epoch": 0.1604088381181422, + "flos": 21397732634880.0, + "grad_norm": 1.740814455588283, + "language_loss": 0.73459202, + "learning_rate": 3.824238990625567e-06, + "loss": 0.75628781, + "num_input_tokens_seen": 57743620, + "step": 2668, + "time_per_iteration": 2.6132376194000244 + }, + { + "auxiliary_loss_clip": 0.01109852, + "auxiliary_loss_mlp": 0.01047889, + "balance_loss_clip": 1.0396831, + "balance_loss_mlp": 1.02845788, + "epoch": 0.16046896137081015, + "flos": 23877040993920.0, + "grad_norm": 1.5287966400981008, + "language_loss": 0.76854712, + "learning_rate": 3.824079306186848e-06, + "loss": 0.79012454, + "num_input_tokens_seen": 57764810, + "step": 2669, + "time_per_iteration": 2.7647929191589355 + }, + { + "auxiliary_loss_clip": 0.01020617, + "auxiliary_loss_mlp": 0.01025724, + "balance_loss_clip": 1.00801492, + "balance_loss_mlp": 1.02342343, + "epoch": 0.16052908462347812, + "flos": 59806709015040.0, + "grad_norm": 0.8072420564006894, + "language_loss": 0.55575752, + "learning_rate": 3.823919552578861e-06, + "loss": 0.57622093, + "num_input_tokens_seen": 57824390, + "step": 2670, + "time_per_iteration": 3.1728720664978027 + }, + { + "auxiliary_loss_clip": 0.01108406, + "auxiliary_loss_mlp": 0.01046609, + "balance_loss_clip": 1.03780222, + "balance_loss_mlp": 1.02746391, + "epoch": 0.1605892078761461, + "flos": 18296559089280.0, + "grad_norm": 2.318108993164018, + "language_loss": 0.77464724, + "learning_rate": 3.82375972980766e-06, + "loss": 0.79619741, + "num_input_tokens_seen": 57843665, + "step": 2671, + "time_per_iteration": 2.7051570415496826 + }, + { + "auxiliary_loss_clip": 0.01105725, + "auxiliary_loss_mlp": 0.01041655, + "balance_loss_clip": 1.03811502, + "balance_loss_mlp": 1.02315354, + "epoch": 0.16064933112881408, + "flos": 32160734686080.0, + "grad_norm": 2.7763668313924073, + "language_loss": 0.64853996, + "learning_rate": 3.8235998378793086e-06, + "loss": 0.67001379, + "num_input_tokens_seen": 57863305, + "step": 2672, + "time_per_iteration": 2.7132952213287354 + }, + { + "auxiliary_loss_clip": 0.01116646, + "auxiliary_loss_mlp": 0.01046005, + "balance_loss_clip": 1.04428351, + "balance_loss_mlp": 1.02472663, + "epoch": 0.16070945438148204, + "flos": 19828795501440.0, + "grad_norm": 1.6295914583458924, + "language_loss": 0.85525823, + "learning_rate": 3.8234398767998675e-06, + "loss": 0.8768847, + "num_input_tokens_seen": 57883025, + "step": 2673, + "time_per_iteration": 2.5781795978546143 + }, + { + "auxiliary_loss_clip": 0.01090271, + "auxiliary_loss_mlp": 0.01053529, + "balance_loss_clip": 1.0400598, + "balance_loss_mlp": 1.03484929, + "epoch": 0.16076957763415, + "flos": 18913144976640.0, + "grad_norm": 2.927781874539236, + "language_loss": 0.72534013, + "learning_rate": 3.823279846575403e-06, + "loss": 0.74677813, + "num_input_tokens_seen": 57901430, + "step": 2674, + "time_per_iteration": 2.6773953437805176 + }, + { + "auxiliary_loss_clip": 0.01109594, + "auxiliary_loss_mlp": 0.01048493, + "balance_loss_clip": 1.0385797, + "balance_loss_mlp": 1.02841842, + "epoch": 0.16082970088681797, + "flos": 16764358590720.0, + "grad_norm": 3.0274772865262407, + "language_loss": 0.84457684, + "learning_rate": 3.823119747211986e-06, + "loss": 0.86615777, + "num_input_tokens_seen": 57919550, + "step": 2675, + "time_per_iteration": 2.6718034744262695 + }, + { + "auxiliary_loss_clip": 0.01067377, + "auxiliary_loss_mlp": 0.01050101, + "balance_loss_clip": 1.03459024, + "balance_loss_mlp": 1.02952576, + "epoch": 0.16088982413948594, + "flos": 35150261783040.0, + "grad_norm": 1.8979617975093073, + "language_loss": 0.8240602, + "learning_rate": 3.822959578715685e-06, + "loss": 0.84523499, + "num_input_tokens_seen": 57939890, + "step": 2676, + "time_per_iteration": 2.8181471824645996 + }, + { + "auxiliary_loss_clip": 0.01111283, + "auxiliary_loss_mlp": 0.01050328, + "balance_loss_clip": 1.04302907, + "balance_loss_mlp": 1.03232777, + "epoch": 0.1609499473921539, + "flos": 18625105814400.0, + "grad_norm": 2.0291792788176894, + "language_loss": 0.73321009, + "learning_rate": 3.822799341092573e-06, + "loss": 0.75482619, + "num_input_tokens_seen": 57957410, + "step": 2677, + "time_per_iteration": 2.593935012817383 + }, + { + "auxiliary_loss_clip": 0.01095028, + "auxiliary_loss_mlp": 0.01042473, + "balance_loss_clip": 1.0375402, + "balance_loss_mlp": 1.02416253, + "epoch": 0.1610100706448219, + "flos": 33145728416640.0, + "grad_norm": 1.623152052783519, + "language_loss": 0.76167238, + "learning_rate": 3.822639034348728e-06, + "loss": 0.78304732, + "num_input_tokens_seen": 57977900, + "step": 2678, + "time_per_iteration": 2.7082467079162598 + }, + { + "auxiliary_loss_clip": 0.01112221, + "auxiliary_loss_mlp": 0.01052224, + "balance_loss_clip": 1.04088342, + "balance_loss_mlp": 1.03195834, + "epoch": 0.16107019389748986, + "flos": 34676707852800.0, + "grad_norm": 2.0915954029710906, + "language_loss": 0.70533603, + "learning_rate": 3.822478658490228e-06, + "loss": 0.72698045, + "num_input_tokens_seen": 57998210, + "step": 2679, + "time_per_iteration": 2.859734535217285 + }, + { + "auxiliary_loss_clip": 0.00994292, + "auxiliary_loss_mlp": 0.00748502, + "balance_loss_clip": 1.00345445, + "balance_loss_mlp": 1.00070667, + "epoch": 0.16113031715015783, + "flos": 65713403260800.0, + "grad_norm": 0.7839610394257133, + "language_loss": 0.51829332, + "learning_rate": 3.822318213523154e-06, + "loss": 0.5357213, + "num_input_tokens_seen": 58059420, + "step": 2680, + "time_per_iteration": 3.28495454788208 + }, + { + "auxiliary_loss_clip": 0.01101819, + "auxiliary_loss_mlp": 0.01047597, + "balance_loss_clip": 1.03709602, + "balance_loss_mlp": 1.02697349, + "epoch": 0.1611904404028258, + "flos": 20810413353600.0, + "grad_norm": 1.617644803154739, + "language_loss": 0.80499923, + "learning_rate": 3.8221576994535925e-06, + "loss": 0.82649338, + "num_input_tokens_seen": 58078370, + "step": 2681, + "time_per_iteration": 4.195562839508057 + }, + { + "auxiliary_loss_clip": 0.01096782, + "auxiliary_loss_mlp": 0.01061641, + "balance_loss_clip": 1.03846622, + "balance_loss_mlp": 1.04278243, + "epoch": 0.16125056365549376, + "flos": 27013335062400.0, + "grad_norm": 2.166331844266385, + "language_loss": 0.68924046, + "learning_rate": 3.821997116287627e-06, + "loss": 0.71082467, + "num_input_tokens_seen": 58097395, + "step": 2682, + "time_per_iteration": 2.669118642807007 + }, + { + "auxiliary_loss_clip": 0.01104999, + "auxiliary_loss_mlp": 0.0105258, + "balance_loss_clip": 1.04527068, + "balance_loss_mlp": 1.03248119, + "epoch": 0.16131068690816172, + "flos": 19276524915840.0, + "grad_norm": 1.8555018212720535, + "language_loss": 0.87598217, + "learning_rate": 3.821836464031348e-06, + "loss": 0.89755797, + "num_input_tokens_seen": 58115630, + "step": 2683, + "time_per_iteration": 2.72782039642334 + }, + { + "auxiliary_loss_clip": 0.01121861, + "auxiliary_loss_mlp": 0.01056272, + "balance_loss_clip": 1.04048967, + "balance_loss_mlp": 1.03630447, + "epoch": 0.16137081016082971, + "flos": 35337931367040.0, + "grad_norm": 1.8726658782881556, + "language_loss": 0.74478555, + "learning_rate": 3.821675742690849e-06, + "loss": 0.76656693, + "num_input_tokens_seen": 58138655, + "step": 2684, + "time_per_iteration": 2.7039504051208496 + }, + { + "auxiliary_loss_clip": 0.01091983, + "auxiliary_loss_mlp": 0.00749315, + "balance_loss_clip": 1.03840268, + "balance_loss_mlp": 1.00105309, + "epoch": 0.16143093341349768, + "flos": 34235257703040.0, + "grad_norm": 1.7109502985345872, + "language_loss": 0.70223385, + "learning_rate": 3.821514952272223e-06, + "loss": 0.72064686, + "num_input_tokens_seen": 58157440, + "step": 2685, + "time_per_iteration": 2.839031219482422 + }, + { + "auxiliary_loss_clip": 0.01075851, + "auxiliary_loss_mlp": 0.01055224, + "balance_loss_clip": 1.03786552, + "balance_loss_mlp": 1.03469634, + "epoch": 0.16149105666616564, + "flos": 27999262546560.0, + "grad_norm": 1.9571187090242432, + "language_loss": 0.71451443, + "learning_rate": 3.821354092781567e-06, + "loss": 0.73582518, + "num_input_tokens_seen": 58176660, + "step": 2686, + "time_per_iteration": 4.375270366668701 + }, + { + "auxiliary_loss_clip": 0.01110166, + "auxiliary_loss_mlp": 0.01050341, + "balance_loss_clip": 1.04098344, + "balance_loss_mlp": 1.03151822, + "epoch": 0.1615511799188336, + "flos": 19422214479360.0, + "grad_norm": 2.5214061293411376, + "language_loss": 0.81333745, + "learning_rate": 3.821193164224981e-06, + "loss": 0.83494258, + "num_input_tokens_seen": 58195085, + "step": 2687, + "time_per_iteration": 4.345925331115723 + }, + { + "auxiliary_loss_clip": 0.01112337, + "auxiliary_loss_mlp": 0.01047787, + "balance_loss_clip": 1.03709698, + "balance_loss_mlp": 1.02708077, + "epoch": 0.16161130317150157, + "flos": 22854915578880.0, + "grad_norm": 1.744573592301871, + "language_loss": 0.71575004, + "learning_rate": 3.821032166608568e-06, + "loss": 0.7373513, + "num_input_tokens_seen": 58213540, + "step": 2688, + "time_per_iteration": 4.249330520629883 + }, + { + "auxiliary_loss_clip": 0.01076209, + "auxiliary_loss_mlp": 0.01049041, + "balance_loss_clip": 1.0342927, + "balance_loss_mlp": 1.03053951, + "epoch": 0.16167142642416954, + "flos": 26110577520000.0, + "grad_norm": 1.7301231547803961, + "language_loss": 0.75526392, + "learning_rate": 3.8208710999384325e-06, + "loss": 0.77651644, + "num_input_tokens_seen": 58236995, + "step": 2689, + "time_per_iteration": 2.753854751586914 + }, + { + "auxiliary_loss_clip": 0.01121084, + "auxiliary_loss_mlp": 0.01053143, + "balance_loss_clip": 1.04122806, + "balance_loss_mlp": 1.03372383, + "epoch": 0.1617315496768375, + "flos": 22779646629120.0, + "grad_norm": 2.660955973885462, + "language_loss": 0.87321281, + "learning_rate": 3.820709964220683e-06, + "loss": 0.89495504, + "num_input_tokens_seen": 58257230, + "step": 2690, + "time_per_iteration": 2.608721971511841 + }, + { + "auxiliary_loss_clip": 0.01105373, + "auxiliary_loss_mlp": 0.01045649, + "balance_loss_clip": 1.03829074, + "balance_loss_mlp": 1.02837539, + "epoch": 0.1617916729295055, + "flos": 22017299351040.0, + "grad_norm": 1.6188595743235445, + "language_loss": 0.88049114, + "learning_rate": 3.8205487594614284e-06, + "loss": 0.90200138, + "num_input_tokens_seen": 58277080, + "step": 2691, + "time_per_iteration": 2.574166774749756 + }, + { + "auxiliary_loss_clip": 0.01111174, + "auxiliary_loss_mlp": 0.01050739, + "balance_loss_clip": 1.0372901, + "balance_loss_mlp": 1.02926898, + "epoch": 0.16185179618217346, + "flos": 23438248450560.0, + "grad_norm": 2.3914531922392563, + "language_loss": 0.82219887, + "learning_rate": 3.820387485666784e-06, + "loss": 0.84381795, + "num_input_tokens_seen": 58294815, + "step": 2692, + "time_per_iteration": 2.666609287261963 + }, + { + "auxiliary_loss_clip": 0.01123408, + "auxiliary_loss_mlp": 0.01052954, + "balance_loss_clip": 1.03793108, + "balance_loss_mlp": 1.03239083, + "epoch": 0.16191191943484143, + "flos": 25666110627840.0, + "grad_norm": 3.2445771268958628, + "language_loss": 0.81236625, + "learning_rate": 3.820226142842862e-06, + "loss": 0.83412993, + "num_input_tokens_seen": 58313215, + "step": 2693, + "time_per_iteration": 2.57403302192688 + }, + { + "auxiliary_loss_clip": 0.01118018, + "auxiliary_loss_mlp": 0.0105514, + "balance_loss_clip": 1.03970003, + "balance_loss_mlp": 1.03708005, + "epoch": 0.1619720426875094, + "flos": 23477355383040.0, + "grad_norm": 1.5463860617752097, + "language_loss": 0.83602941, + "learning_rate": 3.820064730995783e-06, + "loss": 0.85776097, + "num_input_tokens_seen": 58333215, + "step": 2694, + "time_per_iteration": 2.597130537033081 + }, + { + "auxiliary_loss_clip": 0.01073401, + "auxiliary_loss_mlp": 0.01056053, + "balance_loss_clip": 1.03512859, + "balance_loss_mlp": 1.0349288, + "epoch": 0.16203216594017736, + "flos": 24133658734080.0, + "grad_norm": 1.7895286797729906, + "language_loss": 0.69205546, + "learning_rate": 3.819903250131667e-06, + "loss": 0.71335, + "num_input_tokens_seen": 58351160, + "step": 2695, + "time_per_iteration": 2.67341947555542 + }, + { + "auxiliary_loss_clip": 0.01114348, + "auxiliary_loss_mlp": 0.01050471, + "balance_loss_clip": 1.04100323, + "balance_loss_mlp": 1.03040862, + "epoch": 0.16209228919284532, + "flos": 22340889999360.0, + "grad_norm": 2.5850548696225473, + "language_loss": 0.82419693, + "learning_rate": 3.819741700256637e-06, + "loss": 0.8458451, + "num_input_tokens_seen": 58368505, + "step": 2696, + "time_per_iteration": 2.7528998851776123 + }, + { + "auxiliary_loss_clip": 0.01127154, + "auxiliary_loss_mlp": 0.01054547, + "balance_loss_clip": 1.03958201, + "balance_loss_mlp": 1.03358972, + "epoch": 0.1621524124455133, + "flos": 15815131827840.0, + "grad_norm": 2.4683486637362146, + "language_loss": 0.88506705, + "learning_rate": 3.8195800813768194e-06, + "loss": 0.90688407, + "num_input_tokens_seen": 58385085, + "step": 2697, + "time_per_iteration": 2.5958919525146484 + }, + { + "auxiliary_loss_clip": 0.01114366, + "auxiliary_loss_mlp": 0.01047768, + "balance_loss_clip": 1.03679299, + "balance_loss_mlp": 1.03022075, + "epoch": 0.16221253569818128, + "flos": 30186688988160.0, + "grad_norm": 1.441408103454088, + "language_loss": 0.8064068, + "learning_rate": 3.819418393498343e-06, + "loss": 0.82802814, + "num_input_tokens_seen": 58406985, + "step": 2698, + "time_per_iteration": 2.6276795864105225 + }, + { + "auxiliary_loss_clip": 0.01106804, + "auxiliary_loss_mlp": 0.01050732, + "balance_loss_clip": 1.03935003, + "balance_loss_mlp": 1.03187346, + "epoch": 0.16227265895084925, + "flos": 24605991601920.0, + "grad_norm": 1.5360282406164498, + "language_loss": 0.7754398, + "learning_rate": 3.819256636627339e-06, + "loss": 0.79701519, + "num_input_tokens_seen": 58426205, + "step": 2699, + "time_per_iteration": 2.651684045791626 + }, + { + "auxiliary_loss_clip": 0.01091844, + "auxiliary_loss_mlp": 0.01042144, + "balance_loss_clip": 1.03476381, + "balance_loss_mlp": 1.02407217, + "epoch": 0.1623327822035172, + "flos": 19573326996480.0, + "grad_norm": 1.8858152276631936, + "language_loss": 0.85990918, + "learning_rate": 3.81909481076994e-06, + "loss": 0.88124907, + "num_input_tokens_seen": 58443830, + "step": 2700, + "time_per_iteration": 2.7537708282470703 + }, + { + "auxiliary_loss_clip": 0.01101268, + "auxiliary_loss_mlp": 0.00749178, + "balance_loss_clip": 1.03376198, + "balance_loss_mlp": 1.00104332, + "epoch": 0.16239290545618518, + "flos": 26468462678400.0, + "grad_norm": 1.5417761800083274, + "language_loss": 0.80465722, + "learning_rate": 3.818932915932284e-06, + "loss": 0.82316166, + "num_input_tokens_seen": 58464405, + "step": 2701, + "time_per_iteration": 2.700873851776123 + }, + { + "auxiliary_loss_clip": 0.0109607, + "auxiliary_loss_mlp": 0.0104615, + "balance_loss_clip": 1.037902, + "balance_loss_mlp": 1.02767277, + "epoch": 0.16245302870885314, + "flos": 15851940289920.0, + "grad_norm": 2.0927499602788155, + "language_loss": 0.7325263, + "learning_rate": 3.818770952120511e-06, + "loss": 0.75394845, + "num_input_tokens_seen": 58483295, + "step": 2702, + "time_per_iteration": 2.629004716873169 + }, + { + "auxiliary_loss_clip": 0.0110821, + "auxiliary_loss_mlp": 0.0104695, + "balance_loss_clip": 1.03714168, + "balance_loss_mlp": 1.02730393, + "epoch": 0.1625131519615211, + "flos": 14756521173120.0, + "grad_norm": 1.9977417793428398, + "language_loss": 0.73203814, + "learning_rate": 3.81860891934076e-06, + "loss": 0.75358975, + "num_input_tokens_seen": 58501205, + "step": 2703, + "time_per_iteration": 2.6065280437469482 + }, + { + "auxiliary_loss_clip": 0.01117738, + "auxiliary_loss_mlp": 0.01050024, + "balance_loss_clip": 1.0357008, + "balance_loss_mlp": 1.02907872, + "epoch": 0.1625732752141891, + "flos": 28220508368640.0, + "grad_norm": 1.908755786508406, + "language_loss": 0.70336211, + "learning_rate": 3.818446817599176e-06, + "loss": 0.72503972, + "num_input_tokens_seen": 58522315, + "step": 2704, + "time_per_iteration": 2.607732057571411 + }, + { + "auxiliary_loss_clip": 0.00996125, + "auxiliary_loss_mlp": 0.01005866, + "balance_loss_clip": 1.00673103, + "balance_loss_mlp": 1.00338602, + "epoch": 0.16263339846685707, + "flos": 67327947688320.0, + "grad_norm": 0.7813682904533398, + "language_loss": 0.53298771, + "learning_rate": 3.818284646901907e-06, + "loss": 0.5530076, + "num_input_tokens_seen": 58586695, + "step": 2705, + "time_per_iteration": 3.2694203853607178 + }, + { + "auxiliary_loss_clip": 0.01091461, + "auxiliary_loss_mlp": 0.00749278, + "balance_loss_clip": 1.03665137, + "balance_loss_mlp": 1.00099778, + "epoch": 0.16269352171952503, + "flos": 14319165173760.0, + "grad_norm": 2.4617144390767294, + "language_loss": 0.76279438, + "learning_rate": 3.818122407255102e-06, + "loss": 0.78120172, + "num_input_tokens_seen": 58602435, + "step": 2706, + "time_per_iteration": 2.7115092277526855 + }, + { + "auxiliary_loss_clip": 0.01080352, + "auxiliary_loss_mlp": 0.01048878, + "balance_loss_clip": 1.03446245, + "balance_loss_mlp": 1.03038836, + "epoch": 0.162753644972193, + "flos": 28361205941760.0, + "grad_norm": 1.7443679399584648, + "language_loss": 0.72077823, + "learning_rate": 3.817960098664914e-06, + "loss": 0.74207056, + "num_input_tokens_seen": 58621275, + "step": 2707, + "time_per_iteration": 2.8673183917999268 + }, + { + "auxiliary_loss_clip": 0.0109757, + "auxiliary_loss_mlp": 0.01046512, + "balance_loss_clip": 1.03744936, + "balance_loss_mlp": 1.02835608, + "epoch": 0.16281376822486096, + "flos": 19937856170880.0, + "grad_norm": 2.738103492358851, + "language_loss": 0.83875418, + "learning_rate": 3.817797721137495e-06, + "loss": 0.86019492, + "num_input_tokens_seen": 58637550, + "step": 2708, + "time_per_iteration": 2.7405619621276855 + }, + { + "auxiliary_loss_clip": 0.01056289, + "auxiliary_loss_mlp": 0.0074926, + "balance_loss_clip": 1.03229332, + "balance_loss_mlp": 1.00103045, + "epoch": 0.16287389147752893, + "flos": 21251719848960.0, + "grad_norm": 1.989720806619123, + "language_loss": 0.86088651, + "learning_rate": 3.817635274679006e-06, + "loss": 0.87894201, + "num_input_tokens_seen": 58654135, + "step": 2709, + "time_per_iteration": 2.969589948654175 + }, + { + "auxiliary_loss_clip": 0.01095226, + "auxiliary_loss_mlp": 0.00749193, + "balance_loss_clip": 1.03566337, + "balance_loss_mlp": 1.00100732, + "epoch": 0.1629340147301969, + "flos": 19244672530560.0, + "grad_norm": 1.5901332094421738, + "language_loss": 0.91532457, + "learning_rate": 3.817472759295605e-06, + "loss": 0.93376875, + "num_input_tokens_seen": 58674320, + "step": 2710, + "time_per_iteration": 2.6496496200561523 + }, + { + "auxiliary_loss_clip": 0.01079961, + "auxiliary_loss_mlp": 0.01053982, + "balance_loss_clip": 1.03976703, + "balance_loss_mlp": 1.03420568, + "epoch": 0.16299413798286488, + "flos": 21249816428160.0, + "grad_norm": 2.557685548841217, + "language_loss": 0.81189394, + "learning_rate": 3.817310174993453e-06, + "loss": 0.83323336, + "num_input_tokens_seen": 58691000, + "step": 2711, + "time_per_iteration": 2.698448419570923 + }, + { + "auxiliary_loss_clip": 0.01102307, + "auxiliary_loss_mlp": 0.01041928, + "balance_loss_clip": 1.03539181, + "balance_loss_mlp": 1.02256835, + "epoch": 0.16305426123553285, + "flos": 18770579896320.0, + "grad_norm": 2.541911322569207, + "language_loss": 0.81228185, + "learning_rate": 3.817147521778719e-06, + "loss": 0.83372414, + "num_input_tokens_seen": 58710230, + "step": 2712, + "time_per_iteration": 2.703768491744995 + }, + { + "auxiliary_loss_clip": 0.01123582, + "auxiliary_loss_mlp": 0.01053971, + "balance_loss_clip": 1.03932333, + "balance_loss_mlp": 1.03424191, + "epoch": 0.16311438448820081, + "flos": 22087648137600.0, + "grad_norm": 1.7844380977557004, + "language_loss": 0.77276021, + "learning_rate": 3.816984799657568e-06, + "loss": 0.79453576, + "num_input_tokens_seen": 58728610, + "step": 2713, + "time_per_iteration": 2.599879503250122 + }, + { + "auxiliary_loss_clip": 0.01110598, + "auxiliary_loss_mlp": 0.01056783, + "balance_loss_clip": 1.04225826, + "balance_loss_mlp": 1.03720856, + "epoch": 0.16317450774086878, + "flos": 16467700164480.0, + "grad_norm": 2.2281256604032946, + "language_loss": 0.79141247, + "learning_rate": 3.8168220086361715e-06, + "loss": 0.81308627, + "num_input_tokens_seen": 58744385, + "step": 2714, + "time_per_iteration": 2.548794984817505 + }, + { + "auxiliary_loss_clip": 0.01103948, + "auxiliary_loss_mlp": 0.01054919, + "balance_loss_clip": 1.03816748, + "balance_loss_mlp": 1.03598857, + "epoch": 0.16323463099353674, + "flos": 24352929308160.0, + "grad_norm": 1.6911099680296575, + "language_loss": 0.78094542, + "learning_rate": 3.816659148720702e-06, + "loss": 0.8025341, + "num_input_tokens_seen": 58763905, + "step": 2715, + "time_per_iteration": 2.7049334049224854 + }, + { + "auxiliary_loss_clip": 0.01086729, + "auxiliary_loss_mlp": 0.01042849, + "balance_loss_clip": 1.0332948, + "balance_loss_mlp": 1.02478933, + "epoch": 0.1632947542462047, + "flos": 24900782520960.0, + "grad_norm": 2.2453001933899555, + "language_loss": 0.81950712, + "learning_rate": 3.816496219917336e-06, + "loss": 0.84080285, + "num_input_tokens_seen": 58785580, + "step": 2716, + "time_per_iteration": 2.6782453060150146 + }, + { + "auxiliary_loss_clip": 0.01101392, + "auxiliary_loss_mlp": 0.01055246, + "balance_loss_clip": 1.04082859, + "balance_loss_mlp": 1.03666127, + "epoch": 0.1633548774988727, + "flos": 24900279730560.0, + "grad_norm": 2.5884395070459636, + "language_loss": 0.86071825, + "learning_rate": 3.816333222232251e-06, + "loss": 0.88228464, + "num_input_tokens_seen": 58806075, + "step": 2717, + "time_per_iteration": 2.653041124343872 + }, + { + "auxiliary_loss_clip": 0.01095345, + "auxiliary_loss_mlp": 0.010437, + "balance_loss_clip": 1.03803039, + "balance_loss_mlp": 1.02587867, + "epoch": 0.16341500075154067, + "flos": 30441798357120.0, + "grad_norm": 2.0933090253794795, + "language_loss": 0.76445979, + "learning_rate": 3.816170155671629e-06, + "loss": 0.78585029, + "num_input_tokens_seen": 58827405, + "step": 2718, + "time_per_iteration": 2.7374253273010254 + }, + { + "auxiliary_loss_clip": 0.01103603, + "auxiliary_loss_mlp": 0.01042736, + "balance_loss_clip": 1.04026246, + "balance_loss_mlp": 1.02472317, + "epoch": 0.16347512400420863, + "flos": 22784530878720.0, + "grad_norm": 1.9234698365342064, + "language_loss": 0.73650193, + "learning_rate": 3.816007020241652e-06, + "loss": 0.75796533, + "num_input_tokens_seen": 58847205, + "step": 2719, + "time_per_iteration": 2.645486354827881 + }, + { + "auxiliary_loss_clip": 0.01079289, + "auxiliary_loss_mlp": 0.01045201, + "balance_loss_clip": 1.032776, + "balance_loss_mlp": 1.02667594, + "epoch": 0.1635352472568766, + "flos": 22633274707200.0, + "grad_norm": 1.6668092381813948, + "language_loss": 0.72247756, + "learning_rate": 3.815843815948507e-06, + "loss": 0.74372244, + "num_input_tokens_seen": 58866865, + "step": 2720, + "time_per_iteration": 2.6574387550354004 + }, + { + "auxiliary_loss_clip": 0.01066369, + "auxiliary_loss_mlp": 0.01053516, + "balance_loss_clip": 1.03635502, + "balance_loss_mlp": 1.03260684, + "epoch": 0.16359537050954456, + "flos": 15522998515200.0, + "grad_norm": 1.996165260256733, + "language_loss": 0.74662888, + "learning_rate": 3.8156805427983824e-06, + "loss": 0.76782763, + "num_input_tokens_seen": 58885200, + "step": 2721, + "time_per_iteration": 2.708609104156494 + }, + { + "auxiliary_loss_clip": 0.01063533, + "auxiliary_loss_mlp": 0.01055043, + "balance_loss_clip": 1.03106594, + "balance_loss_mlp": 1.03375196, + "epoch": 0.16365549376221253, + "flos": 22090162089600.0, + "grad_norm": 1.8158131935080122, + "language_loss": 0.78805506, + "learning_rate": 3.8155172007974695e-06, + "loss": 0.80924082, + "num_input_tokens_seen": 58906385, + "step": 2722, + "time_per_iteration": 2.82029390335083 + }, + { + "auxiliary_loss_clip": 0.01101944, + "auxiliary_loss_mlp": 0.00749316, + "balance_loss_clip": 1.03668821, + "balance_loss_mlp": 1.00100327, + "epoch": 0.1637156170148805, + "flos": 24060400945920.0, + "grad_norm": 3.2055152313292314, + "language_loss": 0.84734541, + "learning_rate": 3.8153537899519624e-06, + "loss": 0.86585796, + "num_input_tokens_seen": 58925040, + "step": 2723, + "time_per_iteration": 2.632753610610962 + }, + { + "auxiliary_loss_clip": 0.01060871, + "auxiliary_loss_mlp": 0.01042931, + "balance_loss_clip": 1.03017843, + "balance_loss_mlp": 1.02338064, + "epoch": 0.1637757402675485, + "flos": 26685362954880.0, + "grad_norm": 42.188729975357134, + "language_loss": 0.71415365, + "learning_rate": 3.815190310268058e-06, + "loss": 0.7351917, + "num_input_tokens_seen": 58944790, + "step": 2724, + "time_per_iteration": 2.782533884048462 + }, + { + "auxiliary_loss_clip": 0.01073507, + "auxiliary_loss_mlp": 0.01041442, + "balance_loss_clip": 1.03565538, + "balance_loss_mlp": 1.02338171, + "epoch": 0.16383586352021645, + "flos": 16106941918080.0, + "grad_norm": 2.2373759817874492, + "language_loss": 0.71482629, + "learning_rate": 3.815026761751955e-06, + "loss": 0.7359758, + "num_input_tokens_seen": 58962500, + "step": 2725, + "time_per_iteration": 2.7130801677703857 + }, + { + "auxiliary_loss_clip": 0.01067867, + "auxiliary_loss_mlp": 0.0103909, + "balance_loss_clip": 1.03348899, + "balance_loss_mlp": 1.02179289, + "epoch": 0.16389598677288442, + "flos": 19165991788800.0, + "grad_norm": 1.743829201212, + "language_loss": 0.88705498, + "learning_rate": 3.814863144409855e-06, + "loss": 0.90812451, + "num_input_tokens_seen": 58980355, + "step": 2726, + "time_per_iteration": 2.753385066986084 + }, + { + "auxiliary_loss_clip": 0.01113334, + "auxiliary_loss_mlp": 0.01049528, + "balance_loss_clip": 1.04198122, + "balance_loss_mlp": 1.0300734, + "epoch": 0.16395611002555238, + "flos": 21507008785920.0, + "grad_norm": 1.870421699509261, + "language_loss": 0.74193084, + "learning_rate": 3.814699458247963e-06, + "loss": 0.76355946, + "num_input_tokens_seen": 58999505, + "step": 2727, + "time_per_iteration": 2.6716089248657227 + }, + { + "auxiliary_loss_clip": 0.01104786, + "auxiliary_loss_mlp": 0.01048512, + "balance_loss_clip": 1.03787482, + "balance_loss_mlp": 1.03082144, + "epoch": 0.16401623327822035, + "flos": 21470918595840.0, + "grad_norm": 1.7252171516061363, + "language_loss": 0.82481563, + "learning_rate": 3.8145357032724855e-06, + "loss": 0.84634858, + "num_input_tokens_seen": 59017930, + "step": 2728, + "time_per_iteration": 4.19445013999939 + }, + { + "auxiliary_loss_clip": 0.01112043, + "auxiliary_loss_mlp": 0.01046894, + "balance_loss_clip": 1.03923059, + "balance_loss_mlp": 1.02726066, + "epoch": 0.1640763565308883, + "flos": 13626232928640.0, + "grad_norm": 2.3488072359086423, + "language_loss": 0.8476913, + "learning_rate": 3.814371879489633e-06, + "loss": 0.8692807, + "num_input_tokens_seen": 59035130, + "step": 2729, + "time_per_iteration": 2.5909667015075684 + }, + { + "auxiliary_loss_clip": 0.01120059, + "auxiliary_loss_mlp": 0.01047822, + "balance_loss_clip": 1.03854012, + "balance_loss_mlp": 1.02974951, + "epoch": 0.16413647978355628, + "flos": 15451464579840.0, + "grad_norm": 1.8770548545625059, + "language_loss": 0.72297132, + "learning_rate": 3.814207986905616e-06, + "loss": 0.74465013, + "num_input_tokens_seen": 59053080, + "step": 2730, + "time_per_iteration": 2.511378288269043 + }, + { + "auxiliary_loss_clip": 0.01098782, + "auxiliary_loss_mlp": 0.01052439, + "balance_loss_clip": 1.03520405, + "balance_loss_mlp": 1.03121948, + "epoch": 0.16419660303622427, + "flos": 45878682015360.0, + "grad_norm": 1.692938575225803, + "language_loss": 0.74529064, + "learning_rate": 3.814044025526651e-06, + "loss": 0.76680291, + "num_input_tokens_seen": 59075610, + "step": 2731, + "time_per_iteration": 2.9368042945861816 + }, + { + "auxiliary_loss_clip": 0.01072542, + "auxiliary_loss_mlp": 0.01054233, + "balance_loss_clip": 1.03332448, + "balance_loss_mlp": 1.03346705, + "epoch": 0.16425672628889224, + "flos": 18952826526720.0, + "grad_norm": 1.9849058539609759, + "language_loss": 0.79272938, + "learning_rate": 3.8138799953589548e-06, + "loss": 0.81399715, + "num_input_tokens_seen": 59094555, + "step": 2732, + "time_per_iteration": 2.6952226161956787 + }, + { + "auxiliary_loss_clip": 0.01095226, + "auxiliary_loss_mlp": 0.01047648, + "balance_loss_clip": 1.03657532, + "balance_loss_mlp": 1.02869391, + "epoch": 0.1643168495415602, + "flos": 24312996362880.0, + "grad_norm": 2.0917875933635792, + "language_loss": 0.69278073, + "learning_rate": 3.8137158964087473e-06, + "loss": 0.71420944, + "num_input_tokens_seen": 59113515, + "step": 2733, + "time_per_iteration": 4.2469704151153564 + }, + { + "auxiliary_loss_clip": 0.01095752, + "auxiliary_loss_mlp": 0.0104862, + "balance_loss_clip": 1.03655863, + "balance_loss_mlp": 1.02812755, + "epoch": 0.16437697279422817, + "flos": 26428421992320.0, + "grad_norm": 2.8935448316862824, + "language_loss": 0.80904818, + "learning_rate": 3.8135517286822508e-06, + "loss": 0.8304919, + "num_input_tokens_seen": 59133275, + "step": 2734, + "time_per_iteration": 4.252305746078491 + }, + { + "auxiliary_loss_clip": 0.01087976, + "auxiliary_loss_mlp": 0.01052579, + "balance_loss_clip": 1.03509903, + "balance_loss_mlp": 1.03202784, + "epoch": 0.16443709604689613, + "flos": 34532239351680.0, + "grad_norm": 2.4998394657645076, + "language_loss": 0.82161057, + "learning_rate": 3.8133874921856914e-06, + "loss": 0.84301615, + "num_input_tokens_seen": 59154095, + "step": 2735, + "time_per_iteration": 4.298838138580322 + }, + { + "auxiliary_loss_clip": 0.01031632, + "auxiliary_loss_mlp": 0.01043191, + "balance_loss_clip": 1.03070951, + "balance_loss_mlp": 1.02403402, + "epoch": 0.1644972192995641, + "flos": 23258048895360.0, + "grad_norm": 1.9909482768054707, + "language_loss": 0.78638875, + "learning_rate": 3.813223186925296e-06, + "loss": 0.80713701, + "num_input_tokens_seen": 59173795, + "step": 2736, + "time_per_iteration": 2.8476157188415527 + }, + { + "auxiliary_loss_clip": 0.01101342, + "auxiliary_loss_mlp": 0.01054757, + "balance_loss_clip": 1.04038775, + "balance_loss_mlp": 1.0358268, + "epoch": 0.1645573425522321, + "flos": 26979543342720.0, + "grad_norm": 1.5721659276595856, + "language_loss": 0.81464851, + "learning_rate": 3.8130588129072964e-06, + "loss": 0.83620948, + "num_input_tokens_seen": 59191610, + "step": 2737, + "time_per_iteration": 2.7321362495422363 + }, + { + "auxiliary_loss_clip": 0.01099345, + "auxiliary_loss_mlp": 0.01046982, + "balance_loss_clip": 1.03551674, + "balance_loss_mlp": 1.02725267, + "epoch": 0.16461746580490005, + "flos": 28731768600960.0, + "grad_norm": 1.8377788518432132, + "language_loss": 0.87633049, + "learning_rate": 3.8128943701379246e-06, + "loss": 0.89779377, + "num_input_tokens_seen": 59213000, + "step": 2738, + "time_per_iteration": 2.6590383052825928 + }, + { + "auxiliary_loss_clip": 0.01089137, + "auxiliary_loss_mlp": 0.01055769, + "balance_loss_clip": 1.0345552, + "balance_loss_mlp": 1.03582585, + "epoch": 0.16467758905756802, + "flos": 24930156867840.0, + "grad_norm": 2.015662995946275, + "language_loss": 0.71744549, + "learning_rate": 3.8127298586234167e-06, + "loss": 0.73889458, + "num_input_tokens_seen": 59232340, + "step": 2739, + "time_per_iteration": 2.648906707763672 + }, + { + "auxiliary_loss_clip": 0.011088, + "auxiliary_loss_mlp": 0.01048274, + "balance_loss_clip": 1.03719711, + "balance_loss_mlp": 1.02870035, + "epoch": 0.16473771231023598, + "flos": 24826519152000.0, + "grad_norm": 2.243375513083543, + "language_loss": 0.81436849, + "learning_rate": 3.8125652783700104e-06, + "loss": 0.83593929, + "num_input_tokens_seen": 59253950, + "step": 2740, + "time_per_iteration": 2.6066062450408936 + }, + { + "auxiliary_loss_clip": 0.01057044, + "auxiliary_loss_mlp": 0.01059189, + "balance_loss_clip": 1.0308342, + "balance_loss_mlp": 1.03591943, + "epoch": 0.16479783556290395, + "flos": 39896072375040.0, + "grad_norm": 1.8119892620561744, + "language_loss": 0.69292122, + "learning_rate": 3.8124006293839475e-06, + "loss": 0.71408355, + "num_input_tokens_seen": 59275545, + "step": 2741, + "time_per_iteration": 2.879621982574463 + }, + { + "auxiliary_loss_clip": 0.01118577, + "auxiliary_loss_mlp": 0.01046915, + "balance_loss_clip": 1.03648901, + "balance_loss_mlp": 1.02762699, + "epoch": 0.16485795881557191, + "flos": 19897061299200.0, + "grad_norm": 1.7751370533866309, + "language_loss": 0.79886794, + "learning_rate": 3.812235911671472e-06, + "loss": 0.82052279, + "num_input_tokens_seen": 59293480, + "step": 2742, + "time_per_iteration": 2.578153133392334 + }, + { + "auxiliary_loss_clip": 0.01093748, + "auxiliary_loss_mlp": 0.01051521, + "balance_loss_clip": 1.03568542, + "balance_loss_mlp": 1.03172016, + "epoch": 0.16491808206823988, + "flos": 20556129997440.0, + "grad_norm": 1.8497553732318406, + "language_loss": 0.84705061, + "learning_rate": 3.8120711252388274e-06, + "loss": 0.86850333, + "num_input_tokens_seen": 59313435, + "step": 2743, + "time_per_iteration": 2.714317798614502 + }, + { + "auxiliary_loss_clip": 0.01114014, + "auxiliary_loss_mlp": 0.01047462, + "balance_loss_clip": 1.03542483, + "balance_loss_mlp": 1.02793574, + "epoch": 0.16497820532090787, + "flos": 23800802376960.0, + "grad_norm": 1.7165708096703223, + "language_loss": 0.85695529, + "learning_rate": 3.811906270092265e-06, + "loss": 0.87857008, + "num_input_tokens_seen": 59331535, + "step": 2744, + "time_per_iteration": 2.5899596214294434 + }, + { + "auxiliary_loss_clip": 0.01090069, + "auxiliary_loss_mlp": 0.01046428, + "balance_loss_clip": 1.03628731, + "balance_loss_mlp": 1.02824843, + "epoch": 0.16503832857357584, + "flos": 25482642935040.0, + "grad_norm": 2.014034337713589, + "language_loss": 0.83076096, + "learning_rate": 3.811741346238036e-06, + "loss": 0.85212588, + "num_input_tokens_seen": 59350680, + "step": 2745, + "time_per_iteration": 2.684401750564575 + }, + { + "auxiliary_loss_clip": 0.01092009, + "auxiliary_loss_mlp": 0.01051862, + "balance_loss_clip": 1.04060984, + "balance_loss_mlp": 1.0325377, + "epoch": 0.1650984518262438, + "flos": 17676058619520.0, + "grad_norm": 1.8156652908344153, + "language_loss": 0.76805866, + "learning_rate": 3.8115763536823923e-06, + "loss": 0.78949738, + "num_input_tokens_seen": 59367020, + "step": 2746, + "time_per_iteration": 2.6561033725738525 + }, + { + "auxiliary_loss_clip": 0.0111806, + "auxiliary_loss_mlp": 0.01047942, + "balance_loss_clip": 1.03793836, + "balance_loss_mlp": 1.02839136, + "epoch": 0.16515857507891177, + "flos": 18698327688960.0, + "grad_norm": 1.5732696885370496, + "language_loss": 0.80871487, + "learning_rate": 3.811411292431592e-06, + "loss": 0.83037484, + "num_input_tokens_seen": 59386075, + "step": 2747, + "time_per_iteration": 2.594788074493408 + }, + { + "auxiliary_loss_clip": 0.01111693, + "auxiliary_loss_mlp": 0.01044208, + "balance_loss_clip": 1.04031825, + "balance_loss_mlp": 1.02463365, + "epoch": 0.16521869833157973, + "flos": 15010481306880.0, + "grad_norm": 2.079210124061283, + "language_loss": 0.69804293, + "learning_rate": 3.8112461624918945e-06, + "loss": 0.71960193, + "num_input_tokens_seen": 59402690, + "step": 2748, + "time_per_iteration": 2.627718687057495 + }, + { + "auxiliary_loss_clip": 0.0112347, + "auxiliary_loss_mlp": 0.00749168, + "balance_loss_clip": 1.04190063, + "balance_loss_mlp": 1.00094581, + "epoch": 0.1652788215842477, + "flos": 22121152548480.0, + "grad_norm": 2.1321713883758306, + "language_loss": 0.88077223, + "learning_rate": 3.811080963869561e-06, + "loss": 0.89949864, + "num_input_tokens_seen": 59421130, + "step": 2749, + "time_per_iteration": 2.659911870956421 + }, + { + "auxiliary_loss_clip": 0.01108684, + "auxiliary_loss_mlp": 0.01047224, + "balance_loss_clip": 1.03644049, + "balance_loss_mlp": 1.0274353, + "epoch": 0.16533894483691566, + "flos": 18333080242560.0, + "grad_norm": 1.8911465467584316, + "language_loss": 0.79201031, + "learning_rate": 3.8109156965708557e-06, + "loss": 0.81356943, + "num_input_tokens_seen": 59438970, + "step": 2750, + "time_per_iteration": 2.689019203186035 + }, + { + "auxiliary_loss_clip": 0.01109518, + "auxiliary_loss_mlp": 0.01044151, + "balance_loss_clip": 1.03985667, + "balance_loss_mlp": 1.02550662, + "epoch": 0.16539906808958366, + "flos": 22382115834240.0, + "grad_norm": 1.8350092634749857, + "language_loss": 0.9503808, + "learning_rate": 3.8107503606020455e-06, + "loss": 0.97191751, + "num_input_tokens_seen": 59458510, + "step": 2751, + "time_per_iteration": 2.6260664463043213 + }, + { + "auxiliary_loss_clip": 0.0103059, + "auxiliary_loss_mlp": 0.01053111, + "balance_loss_clip": 1.03560019, + "balance_loss_mlp": 1.03353667, + "epoch": 0.16545919134225162, + "flos": 22711093522560.0, + "grad_norm": 2.193462826786188, + "language_loss": 0.70895988, + "learning_rate": 3.8105849559693997e-06, + "loss": 0.72979695, + "num_input_tokens_seen": 59477110, + "step": 2752, + "time_per_iteration": 2.8382837772369385 + }, + { + "auxiliary_loss_clip": 0.0102755, + "auxiliary_loss_mlp": 0.0101461, + "balance_loss_clip": 1.01277924, + "balance_loss_mlp": 1.01196408, + "epoch": 0.1655193145949196, + "flos": 67802974076160.0, + "grad_norm": 0.7732643643835889, + "language_loss": 0.54129076, + "learning_rate": 3.810419482679192e-06, + "loss": 0.56171238, + "num_input_tokens_seen": 59541155, + "step": 2753, + "time_per_iteration": 3.5223655700683594 + }, + { + "auxiliary_loss_clip": 0.01118616, + "auxiliary_loss_mlp": 0.0074929, + "balance_loss_clip": 1.03767419, + "balance_loss_mlp": 1.00110745, + "epoch": 0.16557943784758755, + "flos": 24280389792000.0, + "grad_norm": 1.6681124102071485, + "language_loss": 0.74987704, + "learning_rate": 3.8102539407376954e-06, + "loss": 0.76855612, + "num_input_tokens_seen": 59561155, + "step": 2754, + "time_per_iteration": 2.567675828933716 + }, + { + "auxiliary_loss_clip": 0.01093446, + "auxiliary_loss_mlp": 0.0105597, + "balance_loss_clip": 1.03796935, + "balance_loss_mlp": 1.03314126, + "epoch": 0.16563956110025552, + "flos": 20083617561600.0, + "grad_norm": 2.243323985980874, + "language_loss": 0.86229777, + "learning_rate": 3.810088330151188e-06, + "loss": 0.88379192, + "num_input_tokens_seen": 59580460, + "step": 2755, + "time_per_iteration": 2.618633985519409 + }, + { + "auxiliary_loss_clip": 0.01077282, + "auxiliary_loss_mlp": 0.01056543, + "balance_loss_clip": 1.03287756, + "balance_loss_mlp": 1.03577626, + "epoch": 0.16569968435292348, + "flos": 28034454896640.0, + "grad_norm": 1.9505295690203441, + "language_loss": 0.73583984, + "learning_rate": 3.80992265092595e-06, + "loss": 0.75717807, + "num_input_tokens_seen": 59600025, + "step": 2756, + "time_per_iteration": 2.691145658493042 + }, + { + "auxiliary_loss_clip": 0.01080927, + "auxiliary_loss_mlp": 0.01049286, + "balance_loss_clip": 1.03556311, + "balance_loss_mlp": 1.02937865, + "epoch": 0.16575980760559147, + "flos": 26250233598720.0, + "grad_norm": 1.4703277345038093, + "language_loss": 0.74642754, + "learning_rate": 3.8097569030682636e-06, + "loss": 0.76772964, + "num_input_tokens_seen": 59620600, + "step": 2757, + "time_per_iteration": 2.696477174758911 + }, + { + "auxiliary_loss_clip": 0.01099964, + "auxiliary_loss_mlp": 0.01052511, + "balance_loss_clip": 1.03933287, + "balance_loss_mlp": 1.03366423, + "epoch": 0.16581993085825944, + "flos": 26943955943040.0, + "grad_norm": 1.820528894485311, + "language_loss": 0.84678674, + "learning_rate": 3.8095910865844137e-06, + "loss": 0.86831146, + "num_input_tokens_seen": 59641385, + "step": 2758, + "time_per_iteration": 2.6655337810516357 + }, + { + "auxiliary_loss_clip": 0.01124314, + "auxiliary_loss_mlp": 0.01058212, + "balance_loss_clip": 1.04288912, + "balance_loss_mlp": 1.03907847, + "epoch": 0.1658800541109274, + "flos": 21653632103040.0, + "grad_norm": 1.7754772008413948, + "language_loss": 0.79146898, + "learning_rate": 3.809425201480689e-06, + "loss": 0.81329417, + "num_input_tokens_seen": 59659865, + "step": 2759, + "time_per_iteration": 2.5555784702301025 + }, + { + "auxiliary_loss_clip": 0.01059151, + "auxiliary_loss_mlp": 0.01048366, + "balance_loss_clip": 1.03463197, + "balance_loss_mlp": 1.02869654, + "epoch": 0.16594017736359537, + "flos": 16435488643200.0, + "grad_norm": 2.2101456453782826, + "language_loss": 0.74887133, + "learning_rate": 3.8092592477633793e-06, + "loss": 0.76994652, + "num_input_tokens_seen": 59678780, + "step": 2760, + "time_per_iteration": 2.655364513397217 + }, + { + "auxiliary_loss_clip": 0.01077432, + "auxiliary_loss_mlp": 0.0104703, + "balance_loss_clip": 1.03564668, + "balance_loss_mlp": 1.02799261, + "epoch": 0.16600030061626334, + "flos": 22637297030400.0, + "grad_norm": 1.7202374544167034, + "language_loss": 0.73320645, + "learning_rate": 3.8090932254387774e-06, + "loss": 0.75445104, + "num_input_tokens_seen": 59698795, + "step": 2761, + "time_per_iteration": 2.786618232727051 + }, + { + "auxiliary_loss_clip": 0.01090215, + "auxiliary_loss_mlp": 0.01044796, + "balance_loss_clip": 1.03669727, + "balance_loss_mlp": 1.0258534, + "epoch": 0.1660604238689313, + "flos": 26396569607040.0, + "grad_norm": 1.7236983815726115, + "language_loss": 0.8914811, + "learning_rate": 3.8089271345131788e-06, + "loss": 0.91283119, + "num_input_tokens_seen": 59718795, + "step": 2762, + "time_per_iteration": 2.7995142936706543 + }, + { + "auxiliary_loss_clip": 0.01075358, + "auxiliary_loss_mlp": 0.01053605, + "balance_loss_clip": 1.036731, + "balance_loss_mlp": 1.03419781, + "epoch": 0.16612054712159927, + "flos": 23039999383680.0, + "grad_norm": 1.7702183142141799, + "language_loss": 0.88120407, + "learning_rate": 3.8087609749928822e-06, + "loss": 0.90249366, + "num_input_tokens_seen": 59737555, + "step": 2763, + "time_per_iteration": 2.7776103019714355 + }, + { + "auxiliary_loss_clip": 0.01034171, + "auxiliary_loss_mlp": 0.01011859, + "balance_loss_clip": 1.01195657, + "balance_loss_mlp": 1.00890243, + "epoch": 0.16618067037426726, + "flos": 59241225202560.0, + "grad_norm": 0.7793562878897391, + "language_loss": 0.5978387, + "learning_rate": 3.8085947468841885e-06, + "loss": 0.61829901, + "num_input_tokens_seen": 59800915, + "step": 2764, + "time_per_iteration": 3.1685211658477783 + }, + { + "auxiliary_loss_clip": 0.01109523, + "auxiliary_loss_mlp": 0.01051084, + "balance_loss_clip": 1.03971434, + "balance_loss_mlp": 1.02954268, + "epoch": 0.16624079362693522, + "flos": 27198813916800.0, + "grad_norm": 1.8606477501920529, + "language_loss": 0.81879401, + "learning_rate": 3.808428450193401e-06, + "loss": 0.84040004, + "num_input_tokens_seen": 59822910, + "step": 2765, + "time_per_iteration": 2.6779701709747314 + }, + { + "auxiliary_loss_clip": 0.01129631, + "auxiliary_loss_mlp": 0.01048507, + "balance_loss_clip": 1.04242337, + "balance_loss_mlp": 1.02720439, + "epoch": 0.1663009168796032, + "flos": 10925068216320.0, + "grad_norm": 2.16954040497924, + "language_loss": 0.7026273, + "learning_rate": 3.8082620849268244e-06, + "loss": 0.72440863, + "num_input_tokens_seen": 59838805, + "step": 2766, + "time_per_iteration": 2.5968945026397705 + }, + { + "auxiliary_loss_clip": 0.01110052, + "auxiliary_loss_mlp": 0.01043632, + "balance_loss_clip": 1.04197598, + "balance_loss_mlp": 1.02510726, + "epoch": 0.16636104013227115, + "flos": 17894431353600.0, + "grad_norm": 2.1758726819163714, + "language_loss": 0.887344, + "learning_rate": 3.808095651090769e-06, + "loss": 0.90888083, + "num_input_tokens_seen": 59855345, + "step": 2767, + "time_per_iteration": 2.6447176933288574 + }, + { + "auxiliary_loss_clip": 0.01018221, + "auxiliary_loss_mlp": 0.01002836, + "balance_loss_clip": 1.00589991, + "balance_loss_mlp": 1.00029683, + "epoch": 0.16642116338493912, + "flos": 66726050463360.0, + "grad_norm": 0.6401943065963012, + "language_loss": 0.52877378, + "learning_rate": 3.8079291486915447e-06, + "loss": 0.54898429, + "num_input_tokens_seen": 59917710, + "step": 2768, + "time_per_iteration": 3.2628889083862305 + }, + { + "auxiliary_loss_clip": 0.01098354, + "auxiliary_loss_mlp": 0.0105147, + "balance_loss_clip": 1.03758478, + "balance_loss_mlp": 1.03096592, + "epoch": 0.16648128663760708, + "flos": 19026048401280.0, + "grad_norm": 2.679611524805191, + "language_loss": 0.84750056, + "learning_rate": 3.8077625777354667e-06, + "loss": 0.86899889, + "num_input_tokens_seen": 59935105, + "step": 2769, + "time_per_iteration": 2.6230385303497314 + }, + { + "auxiliary_loss_clip": 0.01016871, + "auxiliary_loss_mlp": 0.0100537, + "balance_loss_clip": 1.01287329, + "balance_loss_mlp": 1.00289011, + "epoch": 0.16654140989027508, + "flos": 70134976759680.0, + "grad_norm": 0.8111458769569485, + "language_loss": 0.57483667, + "learning_rate": 3.80759593822885e-06, + "loss": 0.59505904, + "num_input_tokens_seen": 59984085, + "step": 2770, + "time_per_iteration": 3.08870530128479 + }, + { + "auxiliary_loss_clip": 0.00999913, + "auxiliary_loss_mlp": 0.01003961, + "balance_loss_clip": 1.00840819, + "balance_loss_mlp": 1.00143361, + "epoch": 0.16660153314294304, + "flos": 70272406195200.0, + "grad_norm": 0.8831802187143378, + "language_loss": 0.56250632, + "learning_rate": 3.807429230178015e-06, + "loss": 0.58254504, + "num_input_tokens_seen": 60043470, + "step": 2771, + "time_per_iteration": 3.0530550479888916 + }, + { + "auxiliary_loss_clip": 0.01074167, + "auxiliary_loss_mlp": 0.0105882, + "balance_loss_clip": 1.03658104, + "balance_loss_mlp": 1.03888822, + "epoch": 0.166661656395611, + "flos": 23075048079360.0, + "grad_norm": 3.0506881461948576, + "language_loss": 0.7079758, + "learning_rate": 3.8072624535892817e-06, + "loss": 0.72930562, + "num_input_tokens_seen": 60063045, + "step": 2772, + "time_per_iteration": 2.6381847858428955 + }, + { + "auxiliary_loss_clip": 0.01104371, + "auxiliary_loss_mlp": 0.01046081, + "balance_loss_clip": 1.03542757, + "balance_loss_mlp": 1.02699542, + "epoch": 0.16672177964827897, + "flos": 28366341586560.0, + "grad_norm": 1.7095405586958563, + "language_loss": 0.85854572, + "learning_rate": 3.807095608468975e-06, + "loss": 0.88005024, + "num_input_tokens_seen": 60081945, + "step": 2773, + "time_per_iteration": 2.6115758419036865 + }, + { + "auxiliary_loss_clip": 0.01056614, + "auxiliary_loss_mlp": 0.01044957, + "balance_loss_clip": 1.03384566, + "balance_loss_mlp": 1.02645612, + "epoch": 0.16678190290094694, + "flos": 19091010147840.0, + "grad_norm": 2.789384027660934, + "language_loss": 0.826002, + "learning_rate": 3.8069286948234224e-06, + "loss": 0.84701777, + "num_input_tokens_seen": 60096820, + "step": 2774, + "time_per_iteration": 2.7024476528167725 + }, + { + "auxiliary_loss_clip": 0.01079603, + "auxiliary_loss_mlp": 0.01048744, + "balance_loss_clip": 1.03642178, + "balance_loss_mlp": 1.02909803, + "epoch": 0.1668420261536149, + "flos": 21799106184960.0, + "grad_norm": 2.1976356740851606, + "language_loss": 0.83323717, + "learning_rate": 3.806761712658952e-06, + "loss": 0.85452062, + "num_input_tokens_seen": 60116140, + "step": 2775, + "time_per_iteration": 4.374493360519409 + }, + { + "auxiliary_loss_clip": 0.01107901, + "auxiliary_loss_mlp": 0.01049429, + "balance_loss_clip": 1.04008102, + "balance_loss_mlp": 1.03120184, + "epoch": 0.16690214940628287, + "flos": 19062533640960.0, + "grad_norm": 1.8174237892609797, + "language_loss": 0.80342531, + "learning_rate": 3.806594661981897e-06, + "loss": 0.82499862, + "num_input_tokens_seen": 60134235, + "step": 2776, + "time_per_iteration": 2.657485246658325 + }, + { + "auxiliary_loss_clip": 0.01104349, + "auxiliary_loss_mlp": 0.01048556, + "balance_loss_clip": 1.04261792, + "balance_loss_mlp": 1.02874351, + "epoch": 0.16696227265895086, + "flos": 18588548747520.0, + "grad_norm": 1.8571061366708328, + "language_loss": 0.80065697, + "learning_rate": 3.8064275427985906e-06, + "loss": 0.82218605, + "num_input_tokens_seen": 60153275, + "step": 2777, + "time_per_iteration": 2.752082586288452 + }, + { + "auxiliary_loss_clip": 0.01104852, + "auxiliary_loss_mlp": 0.01048534, + "balance_loss_clip": 1.03658676, + "balance_loss_mlp": 1.02944827, + "epoch": 0.16702239591161883, + "flos": 23294139085440.0, + "grad_norm": 1.641091237529655, + "language_loss": 0.85096711, + "learning_rate": 3.806260355115371e-06, + "loss": 0.87250096, + "num_input_tokens_seen": 60173215, + "step": 2778, + "time_per_iteration": 2.586927652359009 + }, + { + "auxiliary_loss_clip": 0.01098565, + "auxiliary_loss_mlp": 0.01040197, + "balance_loss_clip": 1.03858292, + "balance_loss_mlp": 1.02110004, + "epoch": 0.1670825191642868, + "flos": 24425648392320.0, + "grad_norm": 1.7932657608891398, + "language_loss": 0.73972285, + "learning_rate": 3.8060930989385778e-06, + "loss": 0.76111042, + "num_input_tokens_seen": 60190515, + "step": 2779, + "time_per_iteration": 2.6408610343933105 + }, + { + "auxiliary_loss_clip": 0.01072386, + "auxiliary_loss_mlp": 0.00749086, + "balance_loss_clip": 1.03541195, + "balance_loss_mlp": 1.00082755, + "epoch": 0.16714264241695476, + "flos": 26797512193920.0, + "grad_norm": 1.91680987539001, + "language_loss": 0.65323842, + "learning_rate": 3.805925774274554e-06, + "loss": 0.67145312, + "num_input_tokens_seen": 60211655, + "step": 2780, + "time_per_iteration": 4.343729496002197 + }, + { + "auxiliary_loss_clip": 0.01079416, + "auxiliary_loss_mlp": 0.01046453, + "balance_loss_clip": 1.03485155, + "balance_loss_mlp": 1.02642632, + "epoch": 0.16720276566962272, + "flos": 21835304115840.0, + "grad_norm": 2.2123202167254217, + "language_loss": 0.78662103, + "learning_rate": 3.805758381129643e-06, + "loss": 0.80787975, + "num_input_tokens_seen": 60230860, + "step": 2781, + "time_per_iteration": 2.632368564605713 + }, + { + "auxiliary_loss_clip": 0.01052018, + "auxiliary_loss_mlp": 0.01050873, + "balance_loss_clip": 1.03216279, + "balance_loss_mlp": 1.03189445, + "epoch": 0.1672628889222907, + "flos": 21470415805440.0, + "grad_norm": 1.5033466389711532, + "language_loss": 0.75335693, + "learning_rate": 3.805590919510193e-06, + "loss": 0.77438587, + "num_input_tokens_seen": 60250535, + "step": 2782, + "time_per_iteration": 5.910945177078247 + }, + { + "auxiliary_loss_clip": 0.01087575, + "auxiliary_loss_mlp": 0.01053446, + "balance_loss_clip": 1.03703952, + "balance_loss_mlp": 1.03291821, + "epoch": 0.16732301217495865, + "flos": 30774008269440.0, + "grad_norm": 2.258325956273268, + "language_loss": 0.67537189, + "learning_rate": 3.8054233894225547e-06, + "loss": 0.69678211, + "num_input_tokens_seen": 60269530, + "step": 2783, + "time_per_iteration": 2.8330583572387695 + }, + { + "auxiliary_loss_clip": 0.01116055, + "auxiliary_loss_mlp": 0.01050913, + "balance_loss_clip": 1.03803742, + "balance_loss_mlp": 1.03204226, + "epoch": 0.16738313542762664, + "flos": 23474625949440.0, + "grad_norm": 1.6289701445466296, + "language_loss": 0.70027602, + "learning_rate": 3.805255790873081e-06, + "loss": 0.72194564, + "num_input_tokens_seen": 60289900, + "step": 2784, + "time_per_iteration": 2.738060712814331 + }, + { + "auxiliary_loss_clip": 0.01098081, + "auxiliary_loss_mlp": 0.01052745, + "balance_loss_clip": 1.03673291, + "balance_loss_mlp": 1.03231227, + "epoch": 0.1674432586802946, + "flos": 29789086366080.0, + "grad_norm": 1.738988537744791, + "language_loss": 0.61035848, + "learning_rate": 3.805088123868126e-06, + "loss": 0.63186681, + "num_input_tokens_seen": 60310025, + "step": 2785, + "time_per_iteration": 2.8349499702453613 + }, + { + "auxiliary_loss_clip": 0.0101057, + "auxiliary_loss_mlp": 0.01013361, + "balance_loss_clip": 1.00772595, + "balance_loss_mlp": 1.01067924, + "epoch": 0.16750338193296258, + "flos": 66136073575680.0, + "grad_norm": 0.7973950859615493, + "language_loss": 0.58870286, + "learning_rate": 3.8049203884140492e-06, + "loss": 0.60894221, + "num_input_tokens_seen": 60377800, + "step": 2786, + "time_per_iteration": 3.403820514678955 + }, + { + "auxiliary_loss_clip": 0.01096767, + "auxiliary_loss_mlp": 0.01050992, + "balance_loss_clip": 1.03623903, + "balance_loss_mlp": 1.03077459, + "epoch": 0.16756350518563054, + "flos": 25696777864320.0, + "grad_norm": 1.7252030234725364, + "language_loss": 0.7619344, + "learning_rate": 3.80475258451721e-06, + "loss": 0.78341192, + "num_input_tokens_seen": 60398215, + "step": 2787, + "time_per_iteration": 2.774123430252075 + }, + { + "auxiliary_loss_clip": 0.01109228, + "auxiliary_loss_mlp": 0.0104391, + "balance_loss_clip": 1.03965509, + "balance_loss_mlp": 1.02546787, + "epoch": 0.1676236284382985, + "flos": 23836102467840.0, + "grad_norm": 2.0384545984857767, + "language_loss": 0.77716291, + "learning_rate": 3.804584712183972e-06, + "loss": 0.79869431, + "num_input_tokens_seen": 60416910, + "step": 2788, + "time_per_iteration": 2.757657527923584 + }, + { + "auxiliary_loss_clip": 0.01009791, + "auxiliary_loss_mlp": 0.01004951, + "balance_loss_clip": 1.00752521, + "balance_loss_mlp": 1.00251949, + "epoch": 0.16768375169096647, + "flos": 59874902985600.0, + "grad_norm": 0.8777262091715563, + "language_loss": 0.59393442, + "learning_rate": 3.8044167714207013e-06, + "loss": 0.61408186, + "num_input_tokens_seen": 60468660, + "step": 2789, + "time_per_iteration": 3.087498426437378 + }, + { + "auxiliary_loss_clip": 0.01108216, + "auxiliary_loss_mlp": 0.01049391, + "balance_loss_clip": 1.03932762, + "balance_loss_mlp": 1.02965033, + "epoch": 0.16774387494363446, + "flos": 38435657207040.0, + "grad_norm": 1.405843227195074, + "language_loss": 0.7037484, + "learning_rate": 3.804248762233765e-06, + "loss": 0.72532451, + "num_input_tokens_seen": 60492370, + "step": 2790, + "time_per_iteration": 2.7795722484588623 + }, + { + "auxiliary_loss_clip": 0.01084318, + "auxiliary_loss_mlp": 0.01050089, + "balance_loss_clip": 1.03663576, + "balance_loss_mlp": 1.03191006, + "epoch": 0.16780399819630243, + "flos": 22637620252800.0, + "grad_norm": 1.8238111340056287, + "language_loss": 0.79529107, + "learning_rate": 3.8040806846295356e-06, + "loss": 0.81663513, + "num_input_tokens_seen": 60512655, + "step": 2791, + "time_per_iteration": 2.732046604156494 + }, + { + "auxiliary_loss_clip": 0.01083312, + "auxiliary_loss_mlp": 0.01043397, + "balance_loss_clip": 1.03824592, + "balance_loss_mlp": 1.02402604, + "epoch": 0.1678641214489704, + "flos": 32891516887680.0, + "grad_norm": 2.7666457077873585, + "language_loss": 0.71427, + "learning_rate": 3.8039125386143853e-06, + "loss": 0.73553705, + "num_input_tokens_seen": 60533090, + "step": 2792, + "time_per_iteration": 2.8519084453582764 + }, + { + "auxiliary_loss_clip": 0.01091012, + "auxiliary_loss_mlp": 0.01045652, + "balance_loss_clip": 1.04001141, + "balance_loss_mlp": 1.02637577, + "epoch": 0.16792424470163836, + "flos": 19974916028160.0, + "grad_norm": 2.1611569560550943, + "language_loss": 0.7150929, + "learning_rate": 3.803744324194691e-06, + "loss": 0.73645949, + "num_input_tokens_seen": 60553190, + "step": 2793, + "time_per_iteration": 2.7790913581848145 + }, + { + "auxiliary_loss_clip": 0.01107114, + "auxiliary_loss_mlp": 0.01054267, + "balance_loss_clip": 1.03978372, + "balance_loss_mlp": 1.03536081, + "epoch": 0.16798436795430632, + "flos": 19719878486400.0, + "grad_norm": 2.1839666161995437, + "language_loss": 0.77443361, + "learning_rate": 3.803576041376831e-06, + "loss": 0.79604733, + "num_input_tokens_seen": 60571995, + "step": 2794, + "time_per_iteration": 2.5807929039001465 + }, + { + "auxiliary_loss_clip": 0.01098185, + "auxiliary_loss_mlp": 0.01049226, + "balance_loss_clip": 1.04072118, + "balance_loss_mlp": 1.03052163, + "epoch": 0.1680444912069743, + "flos": 28104839596800.0, + "grad_norm": 2.2860490653888186, + "language_loss": 0.71605605, + "learning_rate": 3.803407690167187e-06, + "loss": 0.73753011, + "num_input_tokens_seen": 60591275, + "step": 2795, + "time_per_iteration": 2.719031572341919 + }, + { + "auxiliary_loss_clip": 0.01092068, + "auxiliary_loss_mlp": 0.01039609, + "balance_loss_clip": 1.03466392, + "balance_loss_mlp": 1.02178693, + "epoch": 0.16810461445964225, + "flos": 18075205526400.0, + "grad_norm": 1.8235283653375358, + "language_loss": 0.84584033, + "learning_rate": 3.803239270572142e-06, + "loss": 0.8671571, + "num_input_tokens_seen": 60609235, + "step": 2796, + "time_per_iteration": 2.66691517829895 + }, + { + "auxiliary_loss_clip": 0.01062953, + "auxiliary_loss_mlp": 0.01043074, + "balance_loss_clip": 1.03443885, + "balance_loss_mlp": 1.02446556, + "epoch": 0.16816473771231025, + "flos": 23878657105920.0, + "grad_norm": 1.6685076052855723, + "language_loss": 0.81359994, + "learning_rate": 3.8030707825980838e-06, + "loss": 0.83466017, + "num_input_tokens_seen": 60629880, + "step": 2797, + "time_per_iteration": 2.835592031478882 + }, + { + "auxiliary_loss_clip": 0.0109857, + "auxiliary_loss_mlp": 0.01041088, + "balance_loss_clip": 1.03586197, + "balance_loss_mlp": 1.0248996, + "epoch": 0.1682248609649782, + "flos": 22783597125120.0, + "grad_norm": 1.5280278326801358, + "language_loss": 0.75109124, + "learning_rate": 3.802902226251401e-06, + "loss": 0.77248776, + "num_input_tokens_seen": 60651175, + "step": 2798, + "time_per_iteration": 2.6792984008789062 + }, + { + "auxiliary_loss_clip": 0.01119045, + "auxiliary_loss_mlp": 0.01048085, + "balance_loss_clip": 1.04059827, + "balance_loss_mlp": 1.03106236, + "epoch": 0.16828498421764618, + "flos": 20705123612160.0, + "grad_norm": 1.6176094202658111, + "language_loss": 0.79787302, + "learning_rate": 3.8027336015384845e-06, + "loss": 0.81954432, + "num_input_tokens_seen": 60670210, + "step": 2799, + "time_per_iteration": 2.5465235710144043 + }, + { + "auxiliary_loss_clip": 0.01037065, + "auxiliary_loss_mlp": 0.01043951, + "balance_loss_clip": 1.03200173, + "balance_loss_mlp": 1.02349436, + "epoch": 0.16834510747031414, + "flos": 29420606695680.0, + "grad_norm": 2.508422446835175, + "language_loss": 0.70547593, + "learning_rate": 3.8025649084657296e-06, + "loss": 0.72628611, + "num_input_tokens_seen": 60690895, + "step": 2800, + "time_per_iteration": 2.8608570098876953 + }, + { + "auxiliary_loss_clip": 0.01071596, + "auxiliary_loss_mlp": 0.0074905, + "balance_loss_clip": 1.0347805, + "balance_loss_mlp": 1.00079238, + "epoch": 0.1684052307229821, + "flos": 18145374744960.0, + "grad_norm": 1.7209996596016826, + "language_loss": 0.83569074, + "learning_rate": 3.8023961470395326e-06, + "loss": 0.85389721, + "num_input_tokens_seen": 60708280, + "step": 2801, + "time_per_iteration": 2.783426284790039 + }, + { + "auxiliary_loss_clip": 0.01084422, + "auxiliary_loss_mlp": 0.0105107, + "balance_loss_clip": 1.03425395, + "balance_loss_mlp": 1.03244948, + "epoch": 0.16846535397565007, + "flos": 16574929240320.0, + "grad_norm": 2.827718278434978, + "language_loss": 0.82543552, + "learning_rate": 3.8022273172662933e-06, + "loss": 0.84679043, + "num_input_tokens_seen": 60724150, + "step": 2802, + "time_per_iteration": 2.6621809005737305 + }, + { + "auxiliary_loss_clip": 0.01109435, + "auxiliary_loss_mlp": 0.01045669, + "balance_loss_clip": 1.03931057, + "balance_loss_mlp": 1.02642822, + "epoch": 0.16852547722831807, + "flos": 30408868563840.0, + "grad_norm": 1.5746874457437912, + "language_loss": 0.80680776, + "learning_rate": 3.802058419152413e-06, + "loss": 0.82835877, + "num_input_tokens_seen": 60746485, + "step": 2803, + "time_per_iteration": 2.743356466293335 + }, + { + "auxiliary_loss_clip": 0.01103797, + "auxiliary_loss_mlp": 0.01046808, + "balance_loss_clip": 1.03796697, + "balance_loss_mlp": 1.02816367, + "epoch": 0.16858560048098603, + "flos": 33507420416640.0, + "grad_norm": 3.3731533205178033, + "language_loss": 0.76062238, + "learning_rate": 3.801889452704297e-06, + "loss": 0.78212851, + "num_input_tokens_seen": 60762875, + "step": 2804, + "time_per_iteration": 2.7265243530273438 + }, + { + "auxiliary_loss_clip": 0.00996165, + "auxiliary_loss_mlp": 0.01014258, + "balance_loss_clip": 1.00621045, + "balance_loss_mlp": 1.01192176, + "epoch": 0.168645723733654, + "flos": 67370502326400.0, + "grad_norm": 0.8346829919773359, + "language_loss": 0.55433905, + "learning_rate": 3.8017204179283526e-06, + "loss": 0.57444334, + "num_input_tokens_seen": 60825510, + "step": 2805, + "time_per_iteration": 3.193873405456543 + }, + { + "auxiliary_loss_clip": 0.01092673, + "auxiliary_loss_mlp": 0.01041237, + "balance_loss_clip": 1.03268778, + "balance_loss_mlp": 1.02371335, + "epoch": 0.16870584698632196, + "flos": 21324618501120.0, + "grad_norm": 2.140716975559554, + "language_loss": 0.72858286, + "learning_rate": 3.8015513148309892e-06, + "loss": 0.74992198, + "num_input_tokens_seen": 60844440, + "step": 2806, + "time_per_iteration": 2.61918306350708 + }, + { + "auxiliary_loss_clip": 0.01080106, + "auxiliary_loss_mlp": 0.0104384, + "balance_loss_clip": 1.03753948, + "balance_loss_mlp": 1.02629256, + "epoch": 0.16876597023898993, + "flos": 20740746925440.0, + "grad_norm": 1.878368602053995, + "language_loss": 0.69549179, + "learning_rate": 3.80138214341862e-06, + "loss": 0.71673119, + "num_input_tokens_seen": 60863210, + "step": 2807, + "time_per_iteration": 2.6647861003875732 + }, + { + "auxiliary_loss_clip": 0.01086937, + "auxiliary_loss_mlp": 0.01046725, + "balance_loss_clip": 1.03253484, + "balance_loss_mlp": 1.02713895, + "epoch": 0.1688260934916579, + "flos": 20303498666880.0, + "grad_norm": 3.119439173917109, + "language_loss": 0.70865965, + "learning_rate": 3.8012129036976587e-06, + "loss": 0.72999632, + "num_input_tokens_seen": 60882510, + "step": 2808, + "time_per_iteration": 2.630368232727051 + }, + { + "auxiliary_loss_clip": 0.01081575, + "auxiliary_loss_mlp": 0.01046403, + "balance_loss_clip": 1.03471279, + "balance_loss_mlp": 1.02614975, + "epoch": 0.16888621674432586, + "flos": 20340702178560.0, + "grad_norm": 3.199487257417087, + "language_loss": 0.80432403, + "learning_rate": 3.8010435956745236e-06, + "loss": 0.82560378, + "num_input_tokens_seen": 60901105, + "step": 2809, + "time_per_iteration": 2.7630672454833984 + }, + { + "auxiliary_loss_clip": 0.01112273, + "auxiliary_loss_mlp": 0.01046409, + "balance_loss_clip": 1.03838325, + "balance_loss_mlp": 1.02752614, + "epoch": 0.16894633999699385, + "flos": 16244802316800.0, + "grad_norm": 2.5566360216389326, + "language_loss": 0.88246614, + "learning_rate": 3.8008742193556358e-06, + "loss": 0.90405297, + "num_input_tokens_seen": 60915340, + "step": 2810, + "time_per_iteration": 2.6478404998779297 + }, + { + "auxiliary_loss_clip": 0.01109462, + "auxiliary_loss_mlp": 0.01051612, + "balance_loss_clip": 1.03810906, + "balance_loss_mlp": 1.03230023, + "epoch": 0.16900646324966181, + "flos": 19610171372160.0, + "grad_norm": 2.1876170965153694, + "language_loss": 0.92147493, + "learning_rate": 3.800704774747416e-06, + "loss": 0.94308573, + "num_input_tokens_seen": 60933735, + "step": 2811, + "time_per_iteration": 2.604495048522949 + }, + { + "auxiliary_loss_clip": 0.01092825, + "auxiliary_loss_mlp": 0.01043741, + "balance_loss_clip": 1.03565359, + "balance_loss_mlp": 1.02558541, + "epoch": 0.16906658650232978, + "flos": 22018089450240.0, + "grad_norm": 3.8074724411418726, + "language_loss": 0.78825486, + "learning_rate": 3.800535261856291e-06, + "loss": 0.8096205, + "num_input_tokens_seen": 60953105, + "step": 2812, + "time_per_iteration": 2.627218008041382 + }, + { + "auxiliary_loss_clip": 0.01104888, + "auxiliary_loss_mlp": 0.01047521, + "balance_loss_clip": 1.03917193, + "balance_loss_mlp": 1.02969968, + "epoch": 0.16912670975499774, + "flos": 11763690024960.0, + "grad_norm": 2.6158413454484997, + "language_loss": 0.74840939, + "learning_rate": 3.8003656806886887e-06, + "loss": 0.76993358, + "num_input_tokens_seen": 60969150, + "step": 2813, + "time_per_iteration": 2.5601556301116943 + }, + { + "auxiliary_loss_clip": 0.01093276, + "auxiliary_loss_mlp": 0.01044915, + "balance_loss_clip": 1.03507447, + "balance_loss_mlp": 1.02560329, + "epoch": 0.1691868330076657, + "flos": 17161386595200.0, + "grad_norm": 2.428034109395419, + "language_loss": 0.69264382, + "learning_rate": 3.8001960312510396e-06, + "loss": 0.71402574, + "num_input_tokens_seen": 60982825, + "step": 2814, + "time_per_iteration": 2.57685923576355 + }, + { + "auxiliary_loss_clip": 0.01116785, + "auxiliary_loss_mlp": 0.01045903, + "balance_loss_clip": 1.03812861, + "balance_loss_mlp": 1.02762818, + "epoch": 0.16924695626033368, + "flos": 22416553998720.0, + "grad_norm": 2.0002724924971904, + "language_loss": 0.61713737, + "learning_rate": 3.800026313549776e-06, + "loss": 0.63876426, + "num_input_tokens_seen": 61000875, + "step": 2815, + "time_per_iteration": 2.5633280277252197 + }, + { + "auxiliary_loss_clip": 0.01084116, + "auxiliary_loss_mlp": 0.01040292, + "balance_loss_clip": 1.03314018, + "balance_loss_mlp": 1.02188563, + "epoch": 0.16930707951300164, + "flos": 25739655724800.0, + "grad_norm": 1.6436439857442333, + "language_loss": 0.82031202, + "learning_rate": 3.7998565275913342e-06, + "loss": 0.84155607, + "num_input_tokens_seen": 61021940, + "step": 2816, + "time_per_iteration": 2.622021198272705 + }, + { + "auxiliary_loss_clip": 0.01095733, + "auxiliary_loss_mlp": 0.01049102, + "balance_loss_clip": 1.03833115, + "balance_loss_mlp": 1.0305171, + "epoch": 0.16936720276566963, + "flos": 22747040058240.0, + "grad_norm": 2.5176388881633223, + "language_loss": 0.87074184, + "learning_rate": 3.799686673382153e-06, + "loss": 0.89219022, + "num_input_tokens_seen": 61040285, + "step": 2817, + "time_per_iteration": 2.6115012168884277 + }, + { + "auxiliary_loss_clip": 0.0110032, + "auxiliary_loss_mlp": 0.01050936, + "balance_loss_clip": 1.04106164, + "balance_loss_mlp": 1.03164756, + "epoch": 0.1694273260183376, + "flos": 19573973441280.0, + "grad_norm": 1.5651000844156184, + "language_loss": 0.81213874, + "learning_rate": 3.799516750928672e-06, + "loss": 0.8336513, + "num_input_tokens_seen": 61059020, + "step": 2818, + "time_per_iteration": 2.6197590827941895 + }, + { + "auxiliary_loss_clip": 0.01115519, + "auxiliary_loss_mlp": 0.01044585, + "balance_loss_clip": 1.03804576, + "balance_loss_mlp": 1.02553499, + "epoch": 0.16948744927100556, + "flos": 12457843332480.0, + "grad_norm": 2.3043503960125893, + "language_loss": 0.80922049, + "learning_rate": 3.799346760237336e-06, + "loss": 0.83082151, + "num_input_tokens_seen": 61074245, + "step": 2819, + "time_per_iteration": 2.487739324569702 + }, + { + "auxiliary_loss_clip": 0.01005339, + "auxiliary_loss_mlp": 0.01008648, + "balance_loss_clip": 1.00407183, + "balance_loss_mlp": 1.00609708, + "epoch": 0.16954757252367353, + "flos": 71291694435840.0, + "grad_norm": 0.9381127069674672, + "language_loss": 0.61027807, + "learning_rate": 3.7991767013145902e-06, + "loss": 0.63041794, + "num_input_tokens_seen": 61127080, + "step": 2820, + "time_per_iteration": 3.0850119590759277 + }, + { + "auxiliary_loss_clip": 0.01080004, + "auxiliary_loss_mlp": 0.01050076, + "balance_loss_clip": 1.03374195, + "balance_loss_mlp": 1.03137243, + "epoch": 0.1696076957763415, + "flos": 29606516513280.0, + "grad_norm": 1.836961558486389, + "language_loss": 0.78967661, + "learning_rate": 3.7990065741668844e-06, + "loss": 0.8109774, + "num_input_tokens_seen": 61146955, + "step": 2821, + "time_per_iteration": 2.75980281829834 + }, + { + "auxiliary_loss_clip": 0.01092485, + "auxiliary_loss_mlp": 0.01054681, + "balance_loss_clip": 1.0362221, + "balance_loss_mlp": 1.03422499, + "epoch": 0.16966781902900946, + "flos": 24388588535040.0, + "grad_norm": 1.8735892507722147, + "language_loss": 0.78235483, + "learning_rate": 3.7988363788006685e-06, + "loss": 0.80382645, + "num_input_tokens_seen": 61166605, + "step": 2822, + "time_per_iteration": 4.331564664840698 + }, + { + "auxiliary_loss_clip": 0.01104357, + "auxiliary_loss_mlp": 0.00749055, + "balance_loss_clip": 1.03873134, + "balance_loss_mlp": 1.00075257, + "epoch": 0.16972794228167745, + "flos": 23038814234880.0, + "grad_norm": 1.7520789923933338, + "language_loss": 0.74906158, + "learning_rate": 3.7986661152223967e-06, + "loss": 0.76759565, + "num_input_tokens_seen": 61186535, + "step": 2823, + "time_per_iteration": 2.7054083347320557 + }, + { + "auxiliary_loss_clip": 0.01095191, + "auxiliary_loss_mlp": 0.01061676, + "balance_loss_clip": 1.03927016, + "balance_loss_mlp": 1.04251933, + "epoch": 0.16978806553434542, + "flos": 35228691129600.0, + "grad_norm": 2.168773036837378, + "language_loss": 0.59986496, + "learning_rate": 3.7984957834385257e-06, + "loss": 0.62143362, + "num_input_tokens_seen": 61208965, + "step": 2824, + "time_per_iteration": 2.7630858421325684 + }, + { + "auxiliary_loss_clip": 0.01095869, + "auxiliary_loss_mlp": 0.01049904, + "balance_loss_clip": 1.03932607, + "balance_loss_mlp": 1.03018641, + "epoch": 0.16984818878701338, + "flos": 32014290936960.0, + "grad_norm": 1.6455102724546102, + "language_loss": 0.73183227, + "learning_rate": 3.7983253834555144e-06, + "loss": 0.75329006, + "num_input_tokens_seen": 61230670, + "step": 2825, + "time_per_iteration": 2.749830722808838 + }, + { + "auxiliary_loss_clip": 0.01119554, + "auxiliary_loss_mlp": 0.01056778, + "balance_loss_clip": 1.03790557, + "balance_loss_mlp": 1.03639364, + "epoch": 0.16990831203968135, + "flos": 22818609907200.0, + "grad_norm": 2.035751755239223, + "language_loss": 0.85452151, + "learning_rate": 3.7981549152798245e-06, + "loss": 0.87628484, + "num_input_tokens_seen": 61249510, + "step": 2826, + "time_per_iteration": 4.283624172210693 + }, + { + "auxiliary_loss_clip": 0.01094913, + "auxiliary_loss_mlp": 0.0105684, + "balance_loss_clip": 1.03574193, + "balance_loss_mlp": 1.03726602, + "epoch": 0.1699684352923493, + "flos": 23039604334080.0, + "grad_norm": 1.5393763312335391, + "language_loss": 0.8238312, + "learning_rate": 3.7979843789179196e-06, + "loss": 0.84534872, + "num_input_tokens_seen": 61269440, + "step": 2827, + "time_per_iteration": 2.603602647781372 + }, + { + "auxiliary_loss_clip": 0.01089162, + "auxiliary_loss_mlp": 0.01049563, + "balance_loss_clip": 1.03471017, + "balance_loss_mlp": 1.0289042, + "epoch": 0.17002855854501728, + "flos": 21434110133760.0, + "grad_norm": 1.9207484980674185, + "language_loss": 0.73727739, + "learning_rate": 3.797813774376267e-06, + "loss": 0.75866461, + "num_input_tokens_seen": 61288195, + "step": 2828, + "time_per_iteration": 2.6102044582366943 + }, + { + "auxiliary_loss_clip": 0.01014219, + "auxiliary_loss_mlp": 0.01005123, + "balance_loss_clip": 1.01951742, + "balance_loss_mlp": 1.00273883, + "epoch": 0.17008868179768524, + "flos": 71453509205760.0, + "grad_norm": 0.7642428266646846, + "language_loss": 0.56543946, + "learning_rate": 3.797643101661336e-06, + "loss": 0.58563286, + "num_input_tokens_seen": 61350850, + "step": 2829, + "time_per_iteration": 4.863545894622803 + }, + { + "auxiliary_loss_clip": 0.01062846, + "auxiliary_loss_mlp": 0.01050906, + "balance_loss_clip": 1.02940941, + "balance_loss_mlp": 1.0318923, + "epoch": 0.17014880505035324, + "flos": 24900315644160.0, + "grad_norm": 1.666900102293369, + "language_loss": 0.82996804, + "learning_rate": 3.7974723607795983e-06, + "loss": 0.85110557, + "num_input_tokens_seen": 61370765, + "step": 2830, + "time_per_iteration": 4.387189865112305 + }, + { + "auxiliary_loss_clip": 0.01080785, + "auxiliary_loss_mlp": 0.01045419, + "balance_loss_clip": 1.03381991, + "balance_loss_mlp": 1.02598739, + "epoch": 0.1702089283030212, + "flos": 29862415981440.0, + "grad_norm": 2.162278294791228, + "language_loss": 0.78390443, + "learning_rate": 3.797301551737529e-06, + "loss": 0.80516648, + "num_input_tokens_seen": 61388935, + "step": 2831, + "time_per_iteration": 2.692689895629883 + }, + { + "auxiliary_loss_clip": 0.01082632, + "auxiliary_loss_mlp": 0.01049338, + "balance_loss_clip": 1.03495049, + "balance_loss_mlp": 1.02910852, + "epoch": 0.17026905155568917, + "flos": 17744180762880.0, + "grad_norm": 2.137825741470876, + "language_loss": 0.79564345, + "learning_rate": 3.7971306745416044e-06, + "loss": 0.81696314, + "num_input_tokens_seen": 61407350, + "step": 2832, + "time_per_iteration": 2.6190314292907715 + }, + { + "auxiliary_loss_clip": 0.01086645, + "auxiliary_loss_mlp": 0.01052088, + "balance_loss_clip": 1.03523982, + "balance_loss_mlp": 1.03316975, + "epoch": 0.17032917480835713, + "flos": 23148665003520.0, + "grad_norm": 1.6461107807168458, + "language_loss": 0.89207965, + "learning_rate": 3.7969597291983046e-06, + "loss": 0.91346693, + "num_input_tokens_seen": 61429010, + "step": 2833, + "time_per_iteration": 2.6316726207733154 + }, + { + "auxiliary_loss_clip": 0.01115413, + "auxiliary_loss_mlp": 0.01046842, + "balance_loss_clip": 1.03708792, + "balance_loss_mlp": 1.02851915, + "epoch": 0.1703892980610251, + "flos": 39202565512320.0, + "grad_norm": 2.044952947465716, + "language_loss": 0.71814919, + "learning_rate": 3.7967887157141115e-06, + "loss": 0.73977172, + "num_input_tokens_seen": 61450040, + "step": 2834, + "time_per_iteration": 2.7195770740509033 + }, + { + "auxiliary_loss_clip": 0.01082353, + "auxiliary_loss_mlp": 0.01056304, + "balance_loss_clip": 1.0378412, + "balance_loss_mlp": 1.03762436, + "epoch": 0.17044942131369306, + "flos": 23039101543680.0, + "grad_norm": 1.9235376745357617, + "language_loss": 0.86494124, + "learning_rate": 3.7966176340955106e-06, + "loss": 0.88632774, + "num_input_tokens_seen": 61468585, + "step": 2835, + "time_per_iteration": 2.700536012649536 + }, + { + "auxiliary_loss_clip": 0.01111506, + "auxiliary_loss_mlp": 0.01050206, + "balance_loss_clip": 1.03815484, + "balance_loss_mlp": 1.02867663, + "epoch": 0.17050954456636103, + "flos": 17054983532160.0, + "grad_norm": 2.2294070290756536, + "language_loss": 0.73630023, + "learning_rate": 3.796446484348989e-06, + "loss": 0.7579174, + "num_input_tokens_seen": 61486330, + "step": 2836, + "time_per_iteration": 2.535108804702759 + }, + { + "auxiliary_loss_clip": 0.0106245, + "auxiliary_loss_mlp": 0.01049551, + "balance_loss_clip": 1.03427136, + "balance_loss_mlp": 1.02829576, + "epoch": 0.17056966781902902, + "flos": 16836969934080.0, + "grad_norm": 2.0891595153788884, + "language_loss": 0.8003974, + "learning_rate": 3.796275266481036e-06, + "loss": 0.82151741, + "num_input_tokens_seen": 61503950, + "step": 2837, + "time_per_iteration": 2.657458782196045 + }, + { + "auxiliary_loss_clip": 0.01103415, + "auxiliary_loss_mlp": 0.01045616, + "balance_loss_clip": 1.03903496, + "balance_loss_mlp": 1.0270431, + "epoch": 0.17062979107169698, + "flos": 17712543859200.0, + "grad_norm": 1.7666201964275927, + "language_loss": 0.83271253, + "learning_rate": 3.7961039804981456e-06, + "loss": 0.85420281, + "num_input_tokens_seen": 61523550, + "step": 2838, + "time_per_iteration": 2.5816593170166016 + }, + { + "auxiliary_loss_clip": 0.01075517, + "auxiliary_loss_mlp": 0.0104616, + "balance_loss_clip": 1.04014874, + "balance_loss_mlp": 1.02727771, + "epoch": 0.17068991432436495, + "flos": 22525040050560.0, + "grad_norm": 1.6195469040157946, + "language_loss": 0.93329513, + "learning_rate": 3.795932626406812e-06, + "loss": 0.95451188, + "num_input_tokens_seen": 61542720, + "step": 2839, + "time_per_iteration": 2.721256971359253 + }, + { + "auxiliary_loss_clip": 0.01083163, + "auxiliary_loss_mlp": 0.01045853, + "balance_loss_clip": 1.03578508, + "balance_loss_mlp": 1.02530098, + "epoch": 0.17075003757703291, + "flos": 25882939077120.0, + "grad_norm": 1.9080610889967566, + "language_loss": 0.83555615, + "learning_rate": 3.7957612042135336e-06, + "loss": 0.85684633, + "num_input_tokens_seen": 61563040, + "step": 2840, + "time_per_iteration": 2.6543498039245605 + }, + { + "auxiliary_loss_clip": 0.0110878, + "auxiliary_loss_mlp": 0.01047283, + "balance_loss_clip": 1.03705311, + "balance_loss_mlp": 1.02617097, + "epoch": 0.17081016082970088, + "flos": 20120713332480.0, + "grad_norm": 1.8988873140730849, + "language_loss": 0.76059914, + "learning_rate": 3.79558971392481e-06, + "loss": 0.78215975, + "num_input_tokens_seen": 61581890, + "step": 2841, + "time_per_iteration": 2.6069176197052 + }, + { + "auxiliary_loss_clip": 0.01094021, + "auxiliary_loss_mlp": 0.01048304, + "balance_loss_clip": 1.03543818, + "balance_loss_mlp": 1.0288012, + "epoch": 0.17087028408236885, + "flos": 24936477661440.0, + "grad_norm": 2.0151226955248314, + "language_loss": 0.77378619, + "learning_rate": 3.7954181555471443e-06, + "loss": 0.79520953, + "num_input_tokens_seen": 61602095, + "step": 2842, + "time_per_iteration": 2.8203134536743164 + }, + { + "auxiliary_loss_clip": 0.01114553, + "auxiliary_loss_mlp": 0.01045649, + "balance_loss_clip": 1.03836691, + "balance_loss_mlp": 1.02726674, + "epoch": 0.17093040733503684, + "flos": 19057864872960.0, + "grad_norm": 2.824633115463221, + "language_loss": 0.85381287, + "learning_rate": 3.795246529087043e-06, + "loss": 0.87541485, + "num_input_tokens_seen": 61620400, + "step": 2843, + "time_per_iteration": 2.7441112995147705 + }, + { + "auxiliary_loss_clip": 0.01115717, + "auxiliary_loss_mlp": 0.0104492, + "balance_loss_clip": 1.03969657, + "balance_loss_mlp": 1.02629924, + "epoch": 0.1709905305877048, + "flos": 13078954333440.0, + "grad_norm": 1.6374891544343821, + "language_loss": 0.68285584, + "learning_rate": 3.7950748345510126e-06, + "loss": 0.70446211, + "num_input_tokens_seen": 61637680, + "step": 2844, + "time_per_iteration": 2.5407657623291016 + }, + { + "auxiliary_loss_clip": 0.010925, + "auxiliary_loss_mlp": 0.00749188, + "balance_loss_clip": 1.03555632, + "balance_loss_mlp": 1.00081921, + "epoch": 0.17105065384037277, + "flos": 19209336526080.0, + "grad_norm": 1.744984080440914, + "language_loss": 0.78439677, + "learning_rate": 3.7949030719455646e-06, + "loss": 0.80281365, + "num_input_tokens_seen": 61655630, + "step": 2845, + "time_per_iteration": 2.645127058029175 + }, + { + "auxiliary_loss_clip": 0.01103463, + "auxiliary_loss_mlp": 0.0104318, + "balance_loss_clip": 1.03583097, + "balance_loss_mlp": 1.02457142, + "epoch": 0.17111077709304073, + "flos": 18515183218560.0, + "grad_norm": 2.3315392460550197, + "language_loss": 0.77404332, + "learning_rate": 3.7947312412772127e-06, + "loss": 0.79550976, + "num_input_tokens_seen": 61673475, + "step": 2846, + "time_per_iteration": 2.635387420654297 + }, + { + "auxiliary_loss_clip": 0.01105296, + "auxiliary_loss_mlp": 0.01043482, + "balance_loss_clip": 1.0382452, + "balance_loss_mlp": 1.02494478, + "epoch": 0.1711709003457087, + "flos": 25082670015360.0, + "grad_norm": 3.9748602957669643, + "language_loss": 0.79851711, + "learning_rate": 3.794559342552472e-06, + "loss": 0.82000494, + "num_input_tokens_seen": 61693370, + "step": 2847, + "time_per_iteration": 2.717679738998413 + }, + { + "auxiliary_loss_clip": 0.01101709, + "auxiliary_loss_mlp": 0.01048927, + "balance_loss_clip": 1.03254426, + "balance_loss_mlp": 1.02976966, + "epoch": 0.17123102359837666, + "flos": 17566387418880.0, + "grad_norm": 2.5130398196295793, + "language_loss": 0.86661738, + "learning_rate": 3.7943873757778614e-06, + "loss": 0.88812375, + "num_input_tokens_seen": 61710820, + "step": 2848, + "time_per_iteration": 2.6553237438201904 + }, + { + "auxiliary_loss_clip": 0.01072538, + "auxiliary_loss_mlp": 0.01046272, + "balance_loss_clip": 1.03387809, + "balance_loss_mlp": 1.02679348, + "epoch": 0.17129114685104463, + "flos": 26173635845760.0, + "grad_norm": 1.7763346101085324, + "language_loss": 0.75272572, + "learning_rate": 3.794215340959902e-06, + "loss": 0.77391386, + "num_input_tokens_seen": 61729855, + "step": 2849, + "time_per_iteration": 2.807727336883545 + }, + { + "auxiliary_loss_clip": 0.00993556, + "auxiliary_loss_mlp": 0.01012718, + "balance_loss_clip": 1.00533009, + "balance_loss_mlp": 1.01046467, + "epoch": 0.17135127010371262, + "flos": 69269710037760.0, + "grad_norm": 0.7952711324656244, + "language_loss": 0.57545877, + "learning_rate": 3.7940432381051163e-06, + "loss": 0.59552145, + "num_input_tokens_seen": 61790290, + "step": 2850, + "time_per_iteration": 3.222541570663452 + }, + { + "auxiliary_loss_clip": 0.01079295, + "auxiliary_loss_mlp": 0.0104772, + "balance_loss_clip": 1.03594971, + "balance_loss_mlp": 1.02843189, + "epoch": 0.1714113933563806, + "flos": 23550110380800.0, + "grad_norm": 3.6837584635892053, + "language_loss": 0.8093878, + "learning_rate": 3.793871067220031e-06, + "loss": 0.83065796, + "num_input_tokens_seen": 61809265, + "step": 2851, + "time_per_iteration": 2.685858964920044 + }, + { + "auxiliary_loss_clip": 0.01082615, + "auxiliary_loss_mlp": 0.01043159, + "balance_loss_clip": 1.038656, + "balance_loss_mlp": 1.02521765, + "epoch": 0.17147151660904855, + "flos": 21142443697920.0, + "grad_norm": 6.3294751645245615, + "language_loss": 0.9346602, + "learning_rate": 3.7936988283111764e-06, + "loss": 0.95591795, + "num_input_tokens_seen": 61828980, + "step": 2852, + "time_per_iteration": 2.681030035018921 + }, + { + "auxiliary_loss_clip": 0.010833, + "auxiliary_loss_mlp": 0.01053385, + "balance_loss_clip": 1.03432703, + "balance_loss_mlp": 1.03427625, + "epoch": 0.17153163986171652, + "flos": 18624890332800.0, + "grad_norm": 1.8426349639574287, + "language_loss": 0.69092804, + "learning_rate": 3.7935265213850817e-06, + "loss": 0.71229488, + "num_input_tokens_seen": 61847915, + "step": 2853, + "time_per_iteration": 2.6312661170959473 + }, + { + "auxiliary_loss_clip": 0.0108395, + "auxiliary_loss_mlp": 0.01056869, + "balance_loss_clip": 1.04260564, + "balance_loss_mlp": 1.03864145, + "epoch": 0.17159176311438448, + "flos": 18223265387520.0, + "grad_norm": 10.053472931847471, + "language_loss": 0.66262317, + "learning_rate": 3.7933541464482815e-06, + "loss": 0.68403137, + "num_input_tokens_seen": 61865570, + "step": 2854, + "time_per_iteration": 2.7188892364501953 + }, + { + "auxiliary_loss_clip": 0.01078014, + "auxiliary_loss_mlp": 0.01047657, + "balance_loss_clip": 1.03188908, + "balance_loss_mlp": 1.02910757, + "epoch": 0.17165188636705245, + "flos": 20738987159040.0, + "grad_norm": 1.5421998786664017, + "language_loss": 0.89436054, + "learning_rate": 3.7931817035073124e-06, + "loss": 0.91561723, + "num_input_tokens_seen": 61883340, + "step": 2855, + "time_per_iteration": 2.7053022384643555 + }, + { + "auxiliary_loss_clip": 0.01116806, + "auxiliary_loss_mlp": 0.01052388, + "balance_loss_clip": 1.0372045, + "balance_loss_mlp": 1.03466213, + "epoch": 0.17171200961972044, + "flos": 24899884680960.0, + "grad_norm": 2.0718823323155466, + "language_loss": 0.83399749, + "learning_rate": 3.7930091925687134e-06, + "loss": 0.85568947, + "num_input_tokens_seen": 61900610, + "step": 2856, + "time_per_iteration": 2.5809688568115234 + }, + { + "auxiliary_loss_clip": 0.0110729, + "auxiliary_loss_mlp": 0.01049875, + "balance_loss_clip": 1.03853309, + "balance_loss_mlp": 1.03180254, + "epoch": 0.1717721328723884, + "flos": 20157234485760.0, + "grad_norm": 1.882044033982266, + "language_loss": 0.86196184, + "learning_rate": 3.792836613639026e-06, + "loss": 0.8835336, + "num_input_tokens_seen": 61916795, + "step": 2857, + "time_per_iteration": 2.6613707542419434 + }, + { + "auxiliary_loss_clip": 0.01104377, + "auxiliary_loss_mlp": 0.01055862, + "balance_loss_clip": 1.03655565, + "balance_loss_mlp": 1.03658605, + "epoch": 0.17183225612505637, + "flos": 23361650697600.0, + "grad_norm": 2.2405343966136186, + "language_loss": 0.78456444, + "learning_rate": 3.7926639667247947e-06, + "loss": 0.80616683, + "num_input_tokens_seen": 61936665, + "step": 2858, + "time_per_iteration": 2.688755750656128 + }, + { + "auxiliary_loss_clip": 0.01106231, + "auxiliary_loss_mlp": 0.01061973, + "balance_loss_clip": 1.03905392, + "balance_loss_mlp": 1.0400151, + "epoch": 0.17189237937772434, + "flos": 18114240631680.0, + "grad_norm": 1.8943571014306584, + "language_loss": 0.77169907, + "learning_rate": 3.7924912518325663e-06, + "loss": 0.79338115, + "num_input_tokens_seen": 61954415, + "step": 2859, + "time_per_iteration": 2.58884596824646 + }, + { + "auxiliary_loss_clip": 0.01071559, + "auxiliary_loss_mlp": 0.01050465, + "balance_loss_clip": 1.03725195, + "balance_loss_mlp": 1.03118849, + "epoch": 0.1719525026303923, + "flos": 23258408031360.0, + "grad_norm": 1.8848555924331607, + "language_loss": 0.76720047, + "learning_rate": 3.7923184689688902e-06, + "loss": 0.78842068, + "num_input_tokens_seen": 61973940, + "step": 2860, + "time_per_iteration": 2.8700525760650635 + }, + { + "auxiliary_loss_clip": 0.0110581, + "auxiliary_loss_mlp": 0.01047854, + "balance_loss_clip": 1.03617907, + "balance_loss_mlp": 1.02923346, + "epoch": 0.17201262588306027, + "flos": 20810413353600.0, + "grad_norm": 1.636645978761963, + "language_loss": 0.81410468, + "learning_rate": 3.792145618140317e-06, + "loss": 0.83564132, + "num_input_tokens_seen": 61991845, + "step": 2861, + "time_per_iteration": 2.687279224395752 + }, + { + "auxiliary_loss_clip": 0.01085747, + "auxiliary_loss_mlp": 0.01053465, + "balance_loss_clip": 1.03365719, + "balance_loss_mlp": 1.03489256, + "epoch": 0.17207274913572823, + "flos": 20375858615040.0, + "grad_norm": 2.4775337019733463, + "language_loss": 0.85557687, + "learning_rate": 3.7919726993534038e-06, + "loss": 0.87696904, + "num_input_tokens_seen": 62009395, + "step": 2862, + "time_per_iteration": 2.8735601902008057 + }, + { + "auxiliary_loss_clip": 0.01075317, + "auxiliary_loss_mlp": 0.0104652, + "balance_loss_clip": 1.03259361, + "balance_loss_mlp": 1.02930641, + "epoch": 0.17213287238839622, + "flos": 26797727675520.0, + "grad_norm": 2.7459327194036374, + "language_loss": 0.78126538, + "learning_rate": 3.7917997126147054e-06, + "loss": 0.8024838, + "num_input_tokens_seen": 62029005, + "step": 2863, + "time_per_iteration": 2.777056932449341 + }, + { + "auxiliary_loss_clip": 0.01082301, + "auxiliary_loss_mlp": 0.00749023, + "balance_loss_clip": 1.03325295, + "balance_loss_mlp": 1.00059021, + "epoch": 0.1721929956410642, + "flos": 26030819370240.0, + "grad_norm": 1.6816056117230305, + "language_loss": 0.72712064, + "learning_rate": 3.7916266579307823e-06, + "loss": 0.74543393, + "num_input_tokens_seen": 62048730, + "step": 2864, + "time_per_iteration": 2.9542272090911865 + }, + { + "auxiliary_loss_clip": 0.01081231, + "auxiliary_loss_mlp": 0.01052536, + "balance_loss_clip": 1.03483665, + "balance_loss_mlp": 1.03333139, + "epoch": 0.17225311889373215, + "flos": 22273091078400.0, + "grad_norm": 1.7822766712631364, + "language_loss": 0.7270627, + "learning_rate": 3.7914535353081973e-06, + "loss": 0.74840039, + "num_input_tokens_seen": 62069000, + "step": 2865, + "time_per_iteration": 2.8932197093963623 + }, + { + "auxiliary_loss_clip": 0.01105984, + "auxiliary_loss_mlp": 0.00748924, + "balance_loss_clip": 1.03986478, + "balance_loss_mlp": 1.00052595, + "epoch": 0.17231324214640012, + "flos": 21287774125440.0, + "grad_norm": 3.2613781921243783, + "language_loss": 0.78829449, + "learning_rate": 3.7912803447535145e-06, + "loss": 0.80684358, + "num_input_tokens_seen": 62086750, + "step": 2866, + "time_per_iteration": 2.7431013584136963 + }, + { + "auxiliary_loss_clip": 0.01119001, + "auxiliary_loss_mlp": 0.0104612, + "balance_loss_clip": 1.03924191, + "balance_loss_mlp": 1.02614069, + "epoch": 0.17237336539906808, + "flos": 19680735640320.0, + "grad_norm": 1.6481822886485953, + "language_loss": 0.79782468, + "learning_rate": 3.7911070862733016e-06, + "loss": 0.81947589, + "num_input_tokens_seen": 62106240, + "step": 2867, + "time_per_iteration": 2.5524213314056396 + }, + { + "auxiliary_loss_clip": 0.01094739, + "auxiliary_loss_mlp": 0.01041237, + "balance_loss_clip": 1.03670812, + "balance_loss_mlp": 1.02143621, + "epoch": 0.17243348865173605, + "flos": 17529650784000.0, + "grad_norm": 1.7590322623762904, + "language_loss": 0.7950871, + "learning_rate": 3.7909337598741276e-06, + "loss": 0.8164469, + "num_input_tokens_seen": 62124895, + "step": 2868, + "time_per_iteration": 2.6684916019439697 + }, + { + "auxiliary_loss_clip": 0.01081487, + "auxiliary_loss_mlp": 0.01046381, + "balance_loss_clip": 1.04232347, + "balance_loss_mlp": 1.02699792, + "epoch": 0.17249361190440402, + "flos": 18259858368000.0, + "grad_norm": 2.7998045839155763, + "language_loss": 0.83634841, + "learning_rate": 3.7907603655625674e-06, + "loss": 0.85762715, + "num_input_tokens_seen": 62143510, + "step": 2869, + "time_per_iteration": 4.326248407363892 + }, + { + "auxiliary_loss_clip": 0.01095294, + "auxiliary_loss_mlp": 0.0105369, + "balance_loss_clip": 1.03726482, + "balance_loss_mlp": 1.0334959, + "epoch": 0.172553735157072, + "flos": 21174367910400.0, + "grad_norm": 2.024692340023908, + "language_loss": 0.77299762, + "learning_rate": 3.7905869033451932e-06, + "loss": 0.79448742, + "num_input_tokens_seen": 62162285, + "step": 2870, + "time_per_iteration": 2.622706890106201 + }, + { + "auxiliary_loss_clip": 0.01113977, + "auxiliary_loss_mlp": 0.01043288, + "balance_loss_clip": 1.03976464, + "balance_loss_mlp": 1.02591872, + "epoch": 0.17261385840973997, + "flos": 22273270646400.0, + "grad_norm": 3.2427660840736023, + "language_loss": 0.77034336, + "learning_rate": 3.7904133732285857e-06, + "loss": 0.79191601, + "num_input_tokens_seen": 62180970, + "step": 2871, + "time_per_iteration": 2.661728620529175 + }, + { + "auxiliary_loss_clip": 0.01093309, + "auxiliary_loss_mlp": 0.01041626, + "balance_loss_clip": 1.03579068, + "balance_loss_mlp": 1.02232587, + "epoch": 0.17267398166240794, + "flos": 27922233830400.0, + "grad_norm": 3.8239107016592224, + "language_loss": 0.7433461, + "learning_rate": 3.7902397752193228e-06, + "loss": 0.76469541, + "num_input_tokens_seen": 62198965, + "step": 2872, + "time_per_iteration": 2.7469747066497803 + }, + { + "auxiliary_loss_clip": 0.01111654, + "auxiliary_loss_mlp": 0.01041225, + "balance_loss_clip": 1.0367558, + "balance_loss_mlp": 1.02221131, + "epoch": 0.1727341049150759, + "flos": 21945118970880.0, + "grad_norm": 1.7780901924166002, + "language_loss": 0.82589746, + "learning_rate": 3.790066109323988e-06, + "loss": 0.84742624, + "num_input_tokens_seen": 62219890, + "step": 2873, + "time_per_iteration": 4.31557297706604 + }, + { + "auxiliary_loss_clip": 0.01067697, + "auxiliary_loss_mlp": 0.01041669, + "balance_loss_clip": 1.03218865, + "balance_loss_mlp": 1.02205861, + "epoch": 0.17279422816774387, + "flos": 18107883924480.0, + "grad_norm": 2.0657657458748866, + "language_loss": 0.74940693, + "learning_rate": 3.7898923755491678e-06, + "loss": 0.77050054, + "num_input_tokens_seen": 62237140, + "step": 2874, + "time_per_iteration": 2.723419427871704 + }, + { + "auxiliary_loss_clip": 0.01115924, + "auxiliary_loss_mlp": 0.01047626, + "balance_loss_clip": 1.03803837, + "balance_loss_mlp": 1.02690756, + "epoch": 0.17285435142041183, + "flos": 21835447770240.0, + "grad_norm": 1.9043790757438397, + "language_loss": 0.80793869, + "learning_rate": 3.7897185739014487e-06, + "loss": 0.82957423, + "num_input_tokens_seen": 62255405, + "step": 2875, + "time_per_iteration": 2.5473403930664062 + }, + { + "auxiliary_loss_clip": 0.0110002, + "auxiliary_loss_mlp": 0.01052015, + "balance_loss_clip": 1.03910995, + "balance_loss_mlp": 1.03140426, + "epoch": 0.17291447467307983, + "flos": 18368452160640.0, + "grad_norm": 2.9924172876038018, + "language_loss": 0.87612665, + "learning_rate": 3.7895447043874217e-06, + "loss": 0.89764702, + "num_input_tokens_seen": 62271280, + "step": 2876, + "time_per_iteration": 4.25696063041687 + }, + { + "auxiliary_loss_clip": 0.01093999, + "auxiliary_loss_mlp": 0.01043917, + "balance_loss_clip": 1.03703249, + "balance_loss_mlp": 1.02532053, + "epoch": 0.1729745979257478, + "flos": 18624638937600.0, + "grad_norm": 2.2658304363824233, + "language_loss": 0.84827006, + "learning_rate": 3.789370767013681e-06, + "loss": 0.86964917, + "num_input_tokens_seen": 62289140, + "step": 2877, + "time_per_iteration": 4.217576503753662 + }, + { + "auxiliary_loss_clip": 0.01083025, + "auxiliary_loss_mlp": 0.01044372, + "balance_loss_clip": 1.03783822, + "balance_loss_mlp": 1.02446413, + "epoch": 0.17303472117841576, + "flos": 22998234844800.0, + "grad_norm": 1.9200984812910338, + "language_loss": 0.79118657, + "learning_rate": 3.7891967617868204e-06, + "loss": 0.8124606, + "num_input_tokens_seen": 62307490, + "step": 2878, + "time_per_iteration": 2.70849347114563 + }, + { + "auxiliary_loss_clip": 0.01090765, + "auxiliary_loss_mlp": 0.01045165, + "balance_loss_clip": 1.03413868, + "balance_loss_mlp": 1.02654421, + "epoch": 0.17309484443108372, + "flos": 25664386775040.0, + "grad_norm": 1.585088891562969, + "language_loss": 0.70300996, + "learning_rate": 3.78902268871344e-06, + "loss": 0.72436929, + "num_input_tokens_seen": 62328570, + "step": 2879, + "time_per_iteration": 2.6831817626953125 + }, + { + "auxiliary_loss_clip": 0.0109239, + "auxiliary_loss_mlp": 0.01049618, + "balance_loss_clip": 1.0351398, + "balance_loss_mlp": 1.03043699, + "epoch": 0.1731549676837517, + "flos": 13552903313280.0, + "grad_norm": 2.0737649233876265, + "language_loss": 0.833453, + "learning_rate": 3.78884854780014e-06, + "loss": 0.85487306, + "num_input_tokens_seen": 62345735, + "step": 2880, + "time_per_iteration": 2.6721394062042236 + }, + { + "auxiliary_loss_clip": 0.01066884, + "auxiliary_loss_mlp": 0.01050544, + "balance_loss_clip": 1.03439784, + "balance_loss_mlp": 1.02937281, + "epoch": 0.17321509093641965, + "flos": 22857070394880.0, + "grad_norm": 1.9187836801593836, + "language_loss": 0.80975342, + "learning_rate": 3.7886743390535236e-06, + "loss": 0.83092767, + "num_input_tokens_seen": 62365525, + "step": 2881, + "time_per_iteration": 2.7573459148406982 + }, + { + "auxiliary_loss_clip": 0.01091317, + "auxiliary_loss_mlp": 0.01044925, + "balance_loss_clip": 1.0345149, + "balance_loss_mlp": 1.02691293, + "epoch": 0.17327521418908762, + "flos": 24352785653760.0, + "grad_norm": 3.0553864153836465, + "language_loss": 0.76751447, + "learning_rate": 3.788500062480197e-06, + "loss": 0.78887689, + "num_input_tokens_seen": 62385160, + "step": 2882, + "time_per_iteration": 2.6344010829925537 + }, + { + "auxiliary_loss_clip": 0.01081281, + "auxiliary_loss_mlp": 0.01049976, + "balance_loss_clip": 1.04159188, + "balance_loss_mlp": 1.03140354, + "epoch": 0.1733353374417556, + "flos": 33105651816960.0, + "grad_norm": 3.683311748600786, + "language_loss": 0.76244354, + "learning_rate": 3.788325718086769e-06, + "loss": 0.78375614, + "num_input_tokens_seen": 62405280, + "step": 2883, + "time_per_iteration": 2.9778804779052734 + }, + { + "auxiliary_loss_clip": 0.01070721, + "auxiliary_loss_mlp": 0.01050964, + "balance_loss_clip": 1.03308249, + "balance_loss_mlp": 1.03156877, + "epoch": 0.17339546069442358, + "flos": 24388947671040.0, + "grad_norm": 1.829774790730933, + "language_loss": 0.85517484, + "learning_rate": 3.7881513058798503e-06, + "loss": 0.87639171, + "num_input_tokens_seen": 62423665, + "step": 2884, + "time_per_iteration": 2.78839373588562 + }, + { + "auxiliary_loss_clip": 0.01095835, + "auxiliary_loss_mlp": 0.0074904, + "balance_loss_clip": 1.03787589, + "balance_loss_mlp": 1.00051916, + "epoch": 0.17345558394709154, + "flos": 27454174680960.0, + "grad_norm": 1.8130234557879357, + "language_loss": 0.73725814, + "learning_rate": 3.787976825866055e-06, + "loss": 0.75570691, + "num_input_tokens_seen": 62445170, + "step": 2885, + "time_per_iteration": 2.767256259918213 + }, + { + "auxiliary_loss_clip": 0.01089457, + "auxiliary_loss_mlp": 0.01045836, + "balance_loss_clip": 1.03605294, + "balance_loss_mlp": 1.02790666, + "epoch": 0.1735157071997595, + "flos": 24682158391680.0, + "grad_norm": 1.5276756148097221, + "language_loss": 0.70785588, + "learning_rate": 3.7878022780519998e-06, + "loss": 0.72920883, + "num_input_tokens_seen": 62466135, + "step": 2886, + "time_per_iteration": 2.7768819332122803 + }, + { + "auxiliary_loss_clip": 0.0110364, + "auxiliary_loss_mlp": 0.01042031, + "balance_loss_clip": 1.03500843, + "balance_loss_mlp": 1.02349377, + "epoch": 0.17357583045242747, + "flos": 21688932193920.0, + "grad_norm": 3.0670918984907702, + "language_loss": 0.69237876, + "learning_rate": 3.7876276624443024e-06, + "loss": 0.71383548, + "num_input_tokens_seen": 62483910, + "step": 2887, + "time_per_iteration": 2.6439766883850098 + }, + { + "auxiliary_loss_clip": 0.01072154, + "auxiliary_loss_mlp": 0.01048158, + "balance_loss_clip": 1.03428698, + "balance_loss_mlp": 1.02881074, + "epoch": 0.17363595370509544, + "flos": 15375728753280.0, + "grad_norm": 1.6270980695343926, + "language_loss": 0.85415995, + "learning_rate": 3.787452979049585e-06, + "loss": 0.87536311, + "num_input_tokens_seen": 62501530, + "step": 2888, + "time_per_iteration": 2.6557776927948 + }, + { + "auxiliary_loss_clip": 0.01056083, + "auxiliary_loss_mlp": 0.01051108, + "balance_loss_clip": 1.03450513, + "balance_loss_mlp": 1.02985263, + "epoch": 0.1736960769577634, + "flos": 23440941970560.0, + "grad_norm": 1.9329677434362311, + "language_loss": 0.78362536, + "learning_rate": 3.7872782278744718e-06, + "loss": 0.80469728, + "num_input_tokens_seen": 62521295, + "step": 2889, + "time_per_iteration": 2.8989365100860596 + }, + { + "auxiliary_loss_clip": 0.01078399, + "auxiliary_loss_mlp": 0.00748977, + "balance_loss_clip": 1.03674603, + "balance_loss_mlp": 1.00047445, + "epoch": 0.1737562002104314, + "flos": 18587830475520.0, + "grad_norm": 3.82616484981771, + "language_loss": 0.84161484, + "learning_rate": 3.7871034089255883e-06, + "loss": 0.85988855, + "num_input_tokens_seen": 62539615, + "step": 2890, + "time_per_iteration": 2.8049111366271973 + }, + { + "auxiliary_loss_clip": 0.01104145, + "auxiliary_loss_mlp": 0.01053205, + "balance_loss_clip": 1.03762078, + "balance_loss_mlp": 1.03441715, + "epoch": 0.17381632346309936, + "flos": 15998060816640.0, + "grad_norm": 2.2597779415322212, + "language_loss": 0.82113665, + "learning_rate": 3.7869285222095653e-06, + "loss": 0.84271014, + "num_input_tokens_seen": 62556820, + "step": 2891, + "time_per_iteration": 2.714566707611084 + }, + { + "auxiliary_loss_clip": 0.01048648, + "auxiliary_loss_mlp": 0.0105005, + "balance_loss_clip": 1.02701044, + "balance_loss_mlp": 1.02774596, + "epoch": 0.17387644671576732, + "flos": 13369830670080.0, + "grad_norm": 2.337514208896664, + "language_loss": 0.81435823, + "learning_rate": 3.7867535677330334e-06, + "loss": 0.83534521, + "num_input_tokens_seen": 62572450, + "step": 2892, + "time_per_iteration": 2.8372695446014404 + }, + { + "auxiliary_loss_clip": 0.01109669, + "auxiliary_loss_mlp": 0.01056132, + "balance_loss_clip": 1.0408709, + "balance_loss_mlp": 1.03548551, + "epoch": 0.1739365699684353, + "flos": 26615516958720.0, + "grad_norm": 2.9320028049166815, + "language_loss": 0.74716449, + "learning_rate": 3.786578545502627e-06, + "loss": 0.76882249, + "num_input_tokens_seen": 62592580, + "step": 2893, + "time_per_iteration": 2.804553270339966 + }, + { + "auxiliary_loss_clip": 0.01088644, + "auxiliary_loss_mlp": 0.01047748, + "balance_loss_clip": 1.03364849, + "balance_loss_mlp": 1.02848411, + "epoch": 0.17399669322110325, + "flos": 23367971491200.0, + "grad_norm": 1.8134244163429942, + "language_loss": 0.82362604, + "learning_rate": 3.7864034555249828e-06, + "loss": 0.84498996, + "num_input_tokens_seen": 62611220, + "step": 2894, + "time_per_iteration": 2.814283847808838 + }, + { + "auxiliary_loss_clip": 0.01080598, + "auxiliary_loss_mlp": 0.010479, + "balance_loss_clip": 1.03740096, + "balance_loss_mlp": 1.02572691, + "epoch": 0.17405681647377122, + "flos": 22054107813120.0, + "grad_norm": 2.440835131727674, + "language_loss": 0.74041134, + "learning_rate": 3.786228297806741e-06, + "loss": 0.76169634, + "num_input_tokens_seen": 62629185, + "step": 2895, + "time_per_iteration": 2.8923182487487793 + }, + { + "auxiliary_loss_clip": 0.0100123, + "auxiliary_loss_mlp": 0.01033069, + "balance_loss_clip": 1.01953173, + "balance_loss_mlp": 1.03064919, + "epoch": 0.1741169397264392, + "flos": 61457559114240.0, + "grad_norm": 0.8726321926557801, + "language_loss": 0.62795264, + "learning_rate": 3.7860530723545435e-06, + "loss": 0.64829558, + "num_input_tokens_seen": 62691895, + "step": 2896, + "time_per_iteration": 3.466623306274414 + }, + { + "auxiliary_loss_clip": 0.01094357, + "auxiliary_loss_mlp": 0.0074897, + "balance_loss_clip": 1.035779, + "balance_loss_mlp": 1.00036454, + "epoch": 0.17417706297910718, + "flos": 27017680608000.0, + "grad_norm": 1.6924213348433397, + "language_loss": 0.75814402, + "learning_rate": 3.785877779175034e-06, + "loss": 0.77657735, + "num_input_tokens_seen": 62713790, + "step": 2897, + "time_per_iteration": 2.9596195220947266 + }, + { + "auxiliary_loss_clip": 0.01104865, + "auxiliary_loss_mlp": 0.01046441, + "balance_loss_clip": 1.03925252, + "balance_loss_mlp": 1.02685452, + "epoch": 0.17423718623177514, + "flos": 33508856960640.0, + "grad_norm": 1.8035250303324, + "language_loss": 0.69182038, + "learning_rate": 3.7857024182748606e-06, + "loss": 0.71333349, + "num_input_tokens_seen": 62736285, + "step": 2898, + "time_per_iteration": 2.8534634113311768 + }, + { + "auxiliary_loss_clip": 0.010864, + "auxiliary_loss_mlp": 0.01047393, + "balance_loss_clip": 1.03605855, + "balance_loss_mlp": 1.027843, + "epoch": 0.1742973094844431, + "flos": 27198634348800.0, + "grad_norm": 2.2504641709109015, + "language_loss": 0.76358509, + "learning_rate": 3.7855269896606717e-06, + "loss": 0.78492302, + "num_input_tokens_seen": 62756240, + "step": 2899, + "time_per_iteration": 2.8598318099975586 + }, + { + "auxiliary_loss_clip": 0.01060852, + "auxiliary_loss_mlp": 0.01048271, + "balance_loss_clip": 1.03542912, + "balance_loss_mlp": 1.02798152, + "epoch": 0.17435743273711107, + "flos": 22710734386560.0, + "grad_norm": 1.7978308089021078, + "language_loss": 0.72438216, + "learning_rate": 3.785351493339121e-06, + "loss": 0.74547344, + "num_input_tokens_seen": 62775910, + "step": 2900, + "time_per_iteration": 2.87996506690979 + }, + { + "auxiliary_loss_clip": 0.01070789, + "auxiliary_loss_mlp": 0.00748986, + "balance_loss_clip": 1.03538728, + "balance_loss_mlp": 1.00036407, + "epoch": 0.17441755598977904, + "flos": 41646466039680.0, + "grad_norm": 1.4624943461525524, + "language_loss": 0.69756663, + "learning_rate": 3.785175929316863e-06, + "loss": 0.7157644, + "num_input_tokens_seen": 62799385, + "step": 2901, + "time_per_iteration": 2.8357064723968506 + }, + { + "auxiliary_loss_clip": 0.01086092, + "auxiliary_loss_mlp": 0.01051385, + "balance_loss_clip": 1.03527641, + "balance_loss_mlp": 1.03219271, + "epoch": 0.174477679242447, + "flos": 26287077974400.0, + "grad_norm": 1.8531391300047284, + "language_loss": 0.76253593, + "learning_rate": 3.7850002976005543e-06, + "loss": 0.78391075, + "num_input_tokens_seen": 62819380, + "step": 2902, + "time_per_iteration": 2.7228612899780273 + }, + { + "auxiliary_loss_clip": 0.01108515, + "auxiliary_loss_mlp": 0.01057165, + "balance_loss_clip": 1.0378958, + "balance_loss_mlp": 1.03744793, + "epoch": 0.174537802495115, + "flos": 17858412990720.0, + "grad_norm": 3.3093659879142656, + "language_loss": 0.81261027, + "learning_rate": 3.7848245981968558e-06, + "loss": 0.83426702, + "num_input_tokens_seen": 62836205, + "step": 2903, + "time_per_iteration": 2.633989095687866 + }, + { + "auxiliary_loss_clip": 0.01095097, + "auxiliary_loss_mlp": 0.01043081, + "balance_loss_clip": 1.04103494, + "balance_loss_mlp": 1.02435279, + "epoch": 0.17459792574778296, + "flos": 16940715390720.0, + "grad_norm": 2.29328647066931, + "language_loss": 0.72955376, + "learning_rate": 3.784648831112429e-06, + "loss": 0.75093555, + "num_input_tokens_seen": 62854045, + "step": 2904, + "time_per_iteration": 2.6854026317596436 + }, + { + "auxiliary_loss_clip": 0.01058021, + "auxiliary_loss_mlp": 0.01047961, + "balance_loss_clip": 1.03046966, + "balance_loss_mlp": 1.02922118, + "epoch": 0.17465804900045093, + "flos": 25520026014720.0, + "grad_norm": 1.9807618566018874, + "language_loss": 0.64053202, + "learning_rate": 3.7844729963539406e-06, + "loss": 0.66159183, + "num_input_tokens_seen": 62873075, + "step": 2905, + "time_per_iteration": 2.8156843185424805 + }, + { + "auxiliary_loss_clip": 0.01092793, + "auxiliary_loss_mlp": 0.01050606, + "balance_loss_clip": 1.03836346, + "balance_loss_mlp": 1.03042412, + "epoch": 0.1747181722531189, + "flos": 24129708238080.0, + "grad_norm": 6.553068686345645, + "language_loss": 0.79773998, + "learning_rate": 3.7842970939280566e-06, + "loss": 0.81917399, + "num_input_tokens_seen": 62892675, + "step": 2906, + "time_per_iteration": 2.715179443359375 + }, + { + "auxiliary_loss_clip": 0.0111082, + "auxiliary_loss_mlp": 0.0105855, + "balance_loss_clip": 1.04279757, + "balance_loss_mlp": 1.03905964, + "epoch": 0.17477829550578686, + "flos": 17748813617280.0, + "grad_norm": 1.7684415470000696, + "language_loss": 0.80868983, + "learning_rate": 3.784121123841449e-06, + "loss": 0.83038354, + "num_input_tokens_seen": 62910675, + "step": 2907, + "time_per_iteration": 2.6953225135803223 + }, + { + "auxiliary_loss_clip": 0.01109201, + "auxiliary_loss_mlp": 0.01054118, + "balance_loss_clip": 1.0411253, + "balance_loss_mlp": 1.03500926, + "epoch": 0.17483841875845482, + "flos": 15377344865280.0, + "grad_norm": 2.1619690846223705, + "language_loss": 0.81205314, + "learning_rate": 3.7839450861007886e-06, + "loss": 0.83368635, + "num_input_tokens_seen": 62928130, + "step": 2908, + "time_per_iteration": 2.727759599685669 + }, + { + "auxiliary_loss_clip": 0.01091067, + "auxiliary_loss_mlp": 0.01054295, + "balance_loss_clip": 1.03807831, + "balance_loss_mlp": 1.03382659, + "epoch": 0.17489854201112282, + "flos": 17163254102400.0, + "grad_norm": 3.2598309478225502, + "language_loss": 0.80245024, + "learning_rate": 3.7837689807127518e-06, + "loss": 0.82390392, + "num_input_tokens_seen": 62944290, + "step": 2909, + "time_per_iteration": 2.858180284500122 + }, + { + "auxiliary_loss_clip": 0.0104859, + "auxiliary_loss_mlp": 0.0106351, + "balance_loss_clip": 1.03462696, + "balance_loss_mlp": 1.0401448, + "epoch": 0.17495866526379078, + "flos": 19755286318080.0, + "grad_norm": 7.669534449716569, + "language_loss": 0.76795727, + "learning_rate": 3.783592807684017e-06, + "loss": 0.7890783, + "num_input_tokens_seen": 62963505, + "step": 2910, + "time_per_iteration": 2.759878635406494 + }, + { + "auxiliary_loss_clip": 0.01118928, + "auxiliary_loss_mlp": 0.01052241, + "balance_loss_clip": 1.03928208, + "balance_loss_mlp": 1.03116441, + "epoch": 0.17501878851645875, + "flos": 28511133310080.0, + "grad_norm": 1.7186886519826097, + "language_loss": 0.8690511, + "learning_rate": 3.7834165670212645e-06, + "loss": 0.89076269, + "num_input_tokens_seen": 62985020, + "step": 2911, + "time_per_iteration": 2.6977615356445312 + }, + { + "auxiliary_loss_clip": 0.01116307, + "auxiliary_loss_mlp": 0.00748981, + "balance_loss_clip": 1.03773928, + "balance_loss_mlp": 1.00045204, + "epoch": 0.1750789117691267, + "flos": 17931203902080.0, + "grad_norm": 2.0536070891318623, + "language_loss": 0.89790571, + "learning_rate": 3.7832402587311764e-06, + "loss": 0.91655862, + "num_input_tokens_seen": 63001745, + "step": 2912, + "time_per_iteration": 2.5591676235198975 + }, + { + "auxiliary_loss_clip": 0.01106665, + "auxiliary_loss_mlp": 0.01049553, + "balance_loss_clip": 1.03667283, + "balance_loss_mlp": 1.02813101, + "epoch": 0.17513903502179468, + "flos": 18259427404800.0, + "grad_norm": 1.7267808401396088, + "language_loss": 0.72313333, + "learning_rate": 3.783063882820439e-06, + "loss": 0.74469548, + "num_input_tokens_seen": 63019750, + "step": 2913, + "time_per_iteration": 2.6562137603759766 + }, + { + "auxiliary_loss_clip": 0.01097369, + "auxiliary_loss_mlp": 0.01047116, + "balance_loss_clip": 1.03765261, + "balance_loss_mlp": 1.02722001, + "epoch": 0.17519915827446264, + "flos": 20704728562560.0, + "grad_norm": 2.2832351293490407, + "language_loss": 0.69066155, + "learning_rate": 3.782887439295741e-06, + "loss": 0.71210635, + "num_input_tokens_seen": 63039500, + "step": 2914, + "time_per_iteration": 2.8473198413848877 + }, + { + "auxiliary_loss_clip": 0.0110487, + "auxiliary_loss_mlp": 0.01049497, + "balance_loss_clip": 1.0386529, + "balance_loss_mlp": 1.02963686, + "epoch": 0.1752592815271306, + "flos": 20523415685760.0, + "grad_norm": 1.708108199591268, + "language_loss": 0.93417531, + "learning_rate": 3.782710928163772e-06, + "loss": 0.95571899, + "num_input_tokens_seen": 63059785, + "step": 2915, + "time_per_iteration": 2.6240508556365967 + }, + { + "auxiliary_loss_clip": 0.01077176, + "auxiliary_loss_mlp": 0.01047673, + "balance_loss_clip": 1.03528428, + "balance_loss_mlp": 1.02738321, + "epoch": 0.1753194047797986, + "flos": 21799178012160.0, + "grad_norm": 1.619184969687498, + "language_loss": 0.80894434, + "learning_rate": 3.782534349431226e-06, + "loss": 0.8301928, + "num_input_tokens_seen": 63079385, + "step": 2916, + "time_per_iteration": 4.298900604248047 + }, + { + "auxiliary_loss_clip": 0.01107799, + "auxiliary_loss_mlp": 0.01054387, + "balance_loss_clip": 1.03776932, + "balance_loss_mlp": 1.03456283, + "epoch": 0.17537952803246656, + "flos": 20668351063680.0, + "grad_norm": 1.5072230446553179, + "language_loss": 0.7357679, + "learning_rate": 3.782357703104799e-06, + "loss": 0.75738972, + "num_input_tokens_seen": 63098970, + "step": 2917, + "time_per_iteration": 2.559001922607422 + }, + { + "auxiliary_loss_clip": 0.01099528, + "auxiliary_loss_mlp": 0.01057544, + "balance_loss_clip": 1.03841448, + "balance_loss_mlp": 1.03650308, + "epoch": 0.17543965128513453, + "flos": 23295072839040.0, + "grad_norm": 1.8577343578026673, + "language_loss": 0.76575339, + "learning_rate": 3.7821809891911897e-06, + "loss": 0.78732413, + "num_input_tokens_seen": 63118750, + "step": 2918, + "time_per_iteration": 2.601958751678467 + }, + { + "auxiliary_loss_clip": 0.01056717, + "auxiliary_loss_mlp": 0.01047155, + "balance_loss_clip": 1.03412032, + "balance_loss_mlp": 1.02516079, + "epoch": 0.1754997745378025, + "flos": 29095615416960.0, + "grad_norm": 2.477976582277454, + "language_loss": 0.74045932, + "learning_rate": 3.782004207697098e-06, + "loss": 0.76149809, + "num_input_tokens_seen": 63136865, + "step": 2919, + "time_per_iteration": 2.9339654445648193 + }, + { + "auxiliary_loss_clip": 0.0108349, + "auxiliary_loss_mlp": 0.01048836, + "balance_loss_clip": 1.03484201, + "balance_loss_mlp": 1.02903569, + "epoch": 0.17555989779047046, + "flos": 30371844620160.0, + "grad_norm": 2.247975372725149, + "language_loss": 0.74352849, + "learning_rate": 3.781827358629228e-06, + "loss": 0.76485169, + "num_input_tokens_seen": 63158325, + "step": 2920, + "time_per_iteration": 4.518924951553345 + }, + { + "auxiliary_loss_clip": 0.01071351, + "auxiliary_loss_mlp": 0.01045748, + "balance_loss_clip": 1.0294106, + "balance_loss_mlp": 1.02611434, + "epoch": 0.17562002104313842, + "flos": 23287746464640.0, + "grad_norm": 2.0026620994134183, + "language_loss": 0.79549301, + "learning_rate": 3.7816504419942873e-06, + "loss": 0.81666398, + "num_input_tokens_seen": 63173115, + "step": 2921, + "time_per_iteration": 2.7677454948425293 + }, + { + "auxiliary_loss_clip": 0.01082198, + "auxiliary_loss_mlp": 0.01049828, + "balance_loss_clip": 1.03408265, + "balance_loss_mlp": 1.02986026, + "epoch": 0.1756801442958064, + "flos": 24790500789120.0, + "grad_norm": 1.6789729257126145, + "language_loss": 0.87725073, + "learning_rate": 3.7814734577989823e-06, + "loss": 0.89857101, + "num_input_tokens_seen": 63192880, + "step": 2922, + "time_per_iteration": 2.9007136821746826 + }, + { + "auxiliary_loss_clip": 0.01101842, + "auxiliary_loss_mlp": 0.01050207, + "balance_loss_clip": 1.03414154, + "balance_loss_mlp": 1.03025103, + "epoch": 0.17574026754847438, + "flos": 25771651764480.0, + "grad_norm": 3.159442490812838, + "language_loss": 0.62568718, + "learning_rate": 3.7812964060500253e-06, + "loss": 0.64720762, + "num_input_tokens_seen": 63214395, + "step": 2923, + "time_per_iteration": 4.349703550338745 + }, + { + "auxiliary_loss_clip": 0.01089048, + "auxiliary_loss_mlp": 0.01049567, + "balance_loss_clip": 1.0372932, + "balance_loss_mlp": 1.02907491, + "epoch": 0.17580039080114235, + "flos": 17456608477440.0, + "grad_norm": 2.311839763794186, + "language_loss": 0.80694783, + "learning_rate": 3.78111928675413e-06, + "loss": 0.82833397, + "num_input_tokens_seen": 63231020, + "step": 2924, + "time_per_iteration": 4.35402512550354 + }, + { + "auxiliary_loss_clip": 0.01094323, + "auxiliary_loss_mlp": 0.01057986, + "balance_loss_clip": 1.03535032, + "balance_loss_mlp": 1.03627777, + "epoch": 0.1758605140538103, + "flos": 14864648088960.0, + "grad_norm": 2.492340745817138, + "language_loss": 0.70630324, + "learning_rate": 3.7809420999180126e-06, + "loss": 0.72782624, + "num_input_tokens_seen": 63246245, + "step": 2925, + "time_per_iteration": 2.6151814460754395 + }, + { + "auxiliary_loss_clip": 0.01080632, + "auxiliary_loss_mlp": 0.01046626, + "balance_loss_clip": 1.03464925, + "balance_loss_mlp": 1.02776718, + "epoch": 0.17592063730647828, + "flos": 23004268329600.0, + "grad_norm": 1.8195188907344602, + "language_loss": 0.71959543, + "learning_rate": 3.7807648455483934e-06, + "loss": 0.74086797, + "num_input_tokens_seen": 63267790, + "step": 2926, + "time_per_iteration": 2.7463252544403076 + }, + { + "auxiliary_loss_clip": 0.01056969, + "auxiliary_loss_mlp": 0.01049253, + "balance_loss_clip": 1.03013968, + "balance_loss_mlp": 1.02657974, + "epoch": 0.17598076055914624, + "flos": 20741501111040.0, + "grad_norm": 1.7582986567734817, + "language_loss": 0.85168767, + "learning_rate": 3.7805875236519918e-06, + "loss": 0.87274992, + "num_input_tokens_seen": 63286830, + "step": 2927, + "time_per_iteration": 2.7960870265960693 + }, + { + "auxiliary_loss_clip": 0.01067778, + "auxiliary_loss_mlp": 0.01049134, + "balance_loss_clip": 1.03764999, + "balance_loss_mlp": 1.03089523, + "epoch": 0.1760408838118142, + "flos": 34092441227520.0, + "grad_norm": 2.449872804243385, + "language_loss": 0.71958655, + "learning_rate": 3.7804101342355336e-06, + "loss": 0.74075568, + "num_input_tokens_seen": 63308870, + "step": 2928, + "time_per_iteration": 2.8540875911712646 + }, + { + "auxiliary_loss_clip": 0.0107242, + "auxiliary_loss_mlp": 0.01046223, + "balance_loss_clip": 1.03303909, + "balance_loss_mlp": 1.02707767, + "epoch": 0.1761010070644822, + "flos": 24168384207360.0, + "grad_norm": 2.0348246096397893, + "language_loss": 0.82943612, + "learning_rate": 3.780232677305744e-06, + "loss": 0.85062253, + "num_input_tokens_seen": 63329005, + "step": 2929, + "time_per_iteration": 2.753391742706299 + }, + { + "auxiliary_loss_clip": 0.01080632, + "auxiliary_loss_mlp": 0.0104487, + "balance_loss_clip": 1.03266573, + "balance_loss_mlp": 1.02604699, + "epoch": 0.17616113031715017, + "flos": 26576697335040.0, + "grad_norm": 1.6311483949901053, + "language_loss": 0.79160726, + "learning_rate": 3.7800551528693535e-06, + "loss": 0.81286228, + "num_input_tokens_seen": 63349390, + "step": 2930, + "time_per_iteration": 2.7862579822540283 + }, + { + "auxiliary_loss_clip": 0.01117374, + "auxiliary_loss_mlp": 0.01046438, + "balance_loss_clip": 1.039276, + "balance_loss_mlp": 1.02630341, + "epoch": 0.17622125356981813, + "flos": 25666685245440.0, + "grad_norm": 8.10498188753054, + "language_loss": 0.76774979, + "learning_rate": 3.7798775609330927e-06, + "loss": 0.78938794, + "num_input_tokens_seen": 63368835, + "step": 2931, + "time_per_iteration": 2.6445305347442627 + }, + { + "auxiliary_loss_clip": 0.01029834, + "auxiliary_loss_mlp": 0.01047498, + "balance_loss_clip": 1.02944875, + "balance_loss_mlp": 1.02803123, + "epoch": 0.1762813768224861, + "flos": 16508530949760.0, + "grad_norm": 19.70100155852793, + "language_loss": 0.75432605, + "learning_rate": 3.779699901503696e-06, + "loss": 0.7750994, + "num_input_tokens_seen": 63385220, + "step": 2932, + "time_per_iteration": 2.815152645111084 + }, + { + "auxiliary_loss_clip": 0.01104946, + "auxiliary_loss_mlp": 0.01043599, + "balance_loss_clip": 1.03589618, + "balance_loss_mlp": 1.02264214, + "epoch": 0.17634150007515406, + "flos": 11211850402560.0, + "grad_norm": 2.153211469122124, + "language_loss": 0.90296167, + "learning_rate": 3.7795221745879016e-06, + "loss": 0.92444718, + "num_input_tokens_seen": 63400865, + "step": 2933, + "time_per_iteration": 2.637042999267578 + }, + { + "auxiliary_loss_clip": 0.01111478, + "auxiliary_loss_mlp": 0.01054025, + "balance_loss_clip": 1.0367682, + "balance_loss_mlp": 1.03577375, + "epoch": 0.17640162332782203, + "flos": 23659925235840.0, + "grad_norm": 1.7621488289909222, + "language_loss": 0.88064754, + "learning_rate": 3.779344380192448e-06, + "loss": 0.90230262, + "num_input_tokens_seen": 63421390, + "step": 2934, + "time_per_iteration": 2.604442834854126 + }, + { + "auxiliary_loss_clip": 0.01086251, + "auxiliary_loss_mlp": 0.01048089, + "balance_loss_clip": 1.03428531, + "balance_loss_mlp": 1.0300647, + "epoch": 0.17646174658049, + "flos": 53796984606720.0, + "grad_norm": 1.6514754165031447, + "language_loss": 0.70589197, + "learning_rate": 3.779166518324077e-06, + "loss": 0.72723538, + "num_input_tokens_seen": 63444715, + "step": 2935, + "time_per_iteration": 2.940330982208252 + }, + { + "auxiliary_loss_clip": 0.0108414, + "auxiliary_loss_mlp": 0.01043255, + "balance_loss_clip": 1.03492188, + "balance_loss_mlp": 1.02403879, + "epoch": 0.17652186983315798, + "flos": 24243868638720.0, + "grad_norm": 2.2083134138024167, + "language_loss": 0.69934982, + "learning_rate": 3.7789885889895325e-06, + "loss": 0.72062379, + "num_input_tokens_seen": 63465525, + "step": 2936, + "time_per_iteration": 2.787297487258911 + }, + { + "auxiliary_loss_clip": 0.01057485, + "auxiliary_loss_mlp": 0.01044656, + "balance_loss_clip": 1.03294313, + "balance_loss_mlp": 1.0262264, + "epoch": 0.17658199308582595, + "flos": 27454282421760.0, + "grad_norm": 6.5041608325030555, + "language_loss": 0.71999311, + "learning_rate": 3.7788105921955634e-06, + "loss": 0.74101448, + "num_input_tokens_seen": 63485815, + "step": 2937, + "time_per_iteration": 2.7589004039764404 + }, + { + "auxiliary_loss_clip": 0.01097035, + "auxiliary_loss_mlp": 0.0104776, + "balance_loss_clip": 1.03862917, + "balance_loss_mlp": 1.02750587, + "epoch": 0.17664211633849392, + "flos": 22418672901120.0, + "grad_norm": 2.198474633778035, + "language_loss": 0.75379556, + "learning_rate": 3.7786325279489184e-06, + "loss": 0.77524352, + "num_input_tokens_seen": 63503905, + "step": 2938, + "time_per_iteration": 2.6695058345794678 + }, + { + "auxiliary_loss_clip": 0.01102893, + "auxiliary_loss_mlp": 0.01040289, + "balance_loss_clip": 1.03585005, + "balance_loss_mlp": 1.0215137, + "epoch": 0.17670223959116188, + "flos": 24715124098560.0, + "grad_norm": 2.107610861364574, + "language_loss": 0.7045815, + "learning_rate": 3.7784543962563495e-06, + "loss": 0.7260133, + "num_input_tokens_seen": 63521985, + "step": 2939, + "time_per_iteration": 2.6869874000549316 + }, + { + "auxiliary_loss_clip": 0.01117332, + "auxiliary_loss_mlp": 0.01043963, + "balance_loss_clip": 1.04018319, + "balance_loss_mlp": 1.02548504, + "epoch": 0.17676236284382985, + "flos": 22527051212160.0, + "grad_norm": 2.415951568907851, + "language_loss": 0.738087, + "learning_rate": 3.7782761971246115e-06, + "loss": 0.75969994, + "num_input_tokens_seen": 63539830, + "step": 2940, + "time_per_iteration": 2.6376776695251465 + }, + { + "auxiliary_loss_clip": 0.01083704, + "auxiliary_loss_mlp": 0.01044947, + "balance_loss_clip": 1.03710866, + "balance_loss_mlp": 1.02489591, + "epoch": 0.1768224860964978, + "flos": 12385160161920.0, + "grad_norm": 2.384009739030544, + "language_loss": 0.85372096, + "learning_rate": 3.7780979305604616e-06, + "loss": 0.87500751, + "num_input_tokens_seen": 63555495, + "step": 2941, + "time_per_iteration": 2.7204091548919678 + }, + { + "auxiliary_loss_clip": 0.01114611, + "auxiliary_loss_mlp": 0.01042581, + "balance_loss_clip": 1.03718138, + "balance_loss_mlp": 1.02417493, + "epoch": 0.1768826093491658, + "flos": 24353360271360.0, + "grad_norm": 2.05339295546871, + "language_loss": 0.76714081, + "learning_rate": 3.7779195965706607e-06, + "loss": 0.78871274, + "num_input_tokens_seen": 63575290, + "step": 2942, + "time_per_iteration": 2.621356725692749 + }, + { + "auxiliary_loss_clip": 0.01058347, + "auxiliary_loss_mlp": 0.00749029, + "balance_loss_clip": 1.03336191, + "balance_loss_mlp": 1.00053704, + "epoch": 0.17694273260183377, + "flos": 23587062497280.0, + "grad_norm": 1.8524196500332695, + "language_loss": 0.80306125, + "learning_rate": 3.77774119516197e-06, + "loss": 0.82113504, + "num_input_tokens_seen": 63594670, + "step": 2943, + "time_per_iteration": 2.8056843280792236 + }, + { + "auxiliary_loss_clip": 0.01082932, + "auxiliary_loss_mlp": 0.01050314, + "balance_loss_clip": 1.03305912, + "balance_loss_mlp": 1.02929783, + "epoch": 0.17700285585450173, + "flos": 26760991040640.0, + "grad_norm": 1.8769647286434148, + "language_loss": 0.80672455, + "learning_rate": 3.777562726341155e-06, + "loss": 0.82805705, + "num_input_tokens_seen": 63614780, + "step": 2944, + "time_per_iteration": 2.7852180004119873 + }, + { + "auxiliary_loss_clip": 0.01115193, + "auxiliary_loss_mlp": 0.0106236, + "balance_loss_clip": 1.03708196, + "balance_loss_mlp": 1.04358447, + "epoch": 0.1770629791071697, + "flos": 42776323320960.0, + "grad_norm": 1.8407497426535129, + "language_loss": 0.73906755, + "learning_rate": 3.7773841901149835e-06, + "loss": 0.7608431, + "num_input_tokens_seen": 63637190, + "step": 2945, + "time_per_iteration": 2.8857030868530273 + }, + { + "auxiliary_loss_clip": 0.01102422, + "auxiliary_loss_mlp": 0.01043942, + "balance_loss_clip": 1.036461, + "balance_loss_mlp": 1.02541685, + "epoch": 0.17712310235983766, + "flos": 17345572560000.0, + "grad_norm": 3.007161243425335, + "language_loss": 0.78029311, + "learning_rate": 3.7772055864902256e-06, + "loss": 0.8017568, + "num_input_tokens_seen": 63652140, + "step": 2946, + "time_per_iteration": 2.6414437294006348 + }, + { + "auxiliary_loss_clip": 0.01056659, + "auxiliary_loss_mlp": 0.01049592, + "balance_loss_clip": 1.03051996, + "balance_loss_mlp": 1.03010142, + "epoch": 0.17718322561250563, + "flos": 23878477537920.0, + "grad_norm": 1.7375664083521736, + "language_loss": 0.75903767, + "learning_rate": 3.7770269154736535e-06, + "loss": 0.78010023, + "num_input_tokens_seen": 63671700, + "step": 2947, + "time_per_iteration": 2.737409830093384 + }, + { + "auxiliary_loss_clip": 0.0109736, + "auxiliary_loss_mlp": 0.01046626, + "balance_loss_clip": 1.03267586, + "balance_loss_mlp": 1.02695632, + "epoch": 0.1772433488651736, + "flos": 36466352104320.0, + "grad_norm": 1.7795624965143435, + "language_loss": 0.72574127, + "learning_rate": 3.7768481770720424e-06, + "loss": 0.74718118, + "num_input_tokens_seen": 63691685, + "step": 2948, + "time_per_iteration": 2.7204976081848145 + }, + { + "auxiliary_loss_clip": 0.01102244, + "auxiliary_loss_mlp": 0.01047416, + "balance_loss_clip": 1.03768635, + "balance_loss_mlp": 1.02871227, + "epoch": 0.1773034721178416, + "flos": 26684716510080.0, + "grad_norm": 4.314376406609778, + "language_loss": 0.82129067, + "learning_rate": 3.776669371292171e-06, + "loss": 0.84278727, + "num_input_tokens_seen": 63711720, + "step": 2949, + "time_per_iteration": 2.6456611156463623 + }, + { + "auxiliary_loss_clip": 0.01020565, + "auxiliary_loss_mlp": 0.01009678, + "balance_loss_clip": 1.01029253, + "balance_loss_mlp": 1.0069598, + "epoch": 0.17736359537050955, + "flos": 57117467617920.0, + "grad_norm": 0.7606666684509831, + "language_loss": 0.65032005, + "learning_rate": 3.7764904981408186e-06, + "loss": 0.67062247, + "num_input_tokens_seen": 63776280, + "step": 2950, + "time_per_iteration": 3.240630626678467 + }, + { + "auxiliary_loss_clip": 0.01077108, + "auxiliary_loss_mlp": 0.01044861, + "balance_loss_clip": 1.03349483, + "balance_loss_mlp": 1.02595472, + "epoch": 0.17742371862317752, + "flos": 27198203385600.0, + "grad_norm": 4.519807876440562, + "language_loss": 0.83483392, + "learning_rate": 3.7763115576247686e-06, + "loss": 0.85605365, + "num_input_tokens_seen": 63797535, + "step": 2951, + "time_per_iteration": 2.795196533203125 + }, + { + "auxiliary_loss_clip": 0.01078908, + "auxiliary_loss_mlp": 0.01051613, + "balance_loss_clip": 1.03345275, + "balance_loss_mlp": 1.03282523, + "epoch": 0.17748384187584548, + "flos": 20959694277120.0, + "grad_norm": 4.769396791038075, + "language_loss": 0.80072647, + "learning_rate": 3.776132549750806e-06, + "loss": 0.82203168, + "num_input_tokens_seen": 63817045, + "step": 2952, + "time_per_iteration": 2.7625339031219482 + }, + { + "auxiliary_loss_clip": 0.01115282, + "auxiliary_loss_mlp": 0.01051844, + "balance_loss_clip": 1.03775907, + "balance_loss_mlp": 1.0318526, + "epoch": 0.17754396512851345, + "flos": 25009986844800.0, + "grad_norm": 1.9482698127508735, + "language_loss": 0.79733944, + "learning_rate": 3.7759534745257194e-06, + "loss": 0.81901073, + "num_input_tokens_seen": 63837665, + "step": 2953, + "time_per_iteration": 2.5857203006744385 + }, + { + "auxiliary_loss_clip": 0.01073437, + "auxiliary_loss_mlp": 0.01049782, + "balance_loss_clip": 1.03478956, + "balance_loss_mlp": 1.03139997, + "epoch": 0.1776040883811814, + "flos": 32051566275840.0, + "grad_norm": 1.840834092074147, + "language_loss": 0.87795985, + "learning_rate": 3.7757743319562994e-06, + "loss": 0.89919204, + "num_input_tokens_seen": 63858455, + "step": 2954, + "time_per_iteration": 2.7254409790039062 + }, + { + "auxiliary_loss_clip": 0.01096197, + "auxiliary_loss_mlp": 0.01058346, + "balance_loss_clip": 1.03737807, + "balance_loss_mlp": 1.03946304, + "epoch": 0.17766421163384938, + "flos": 21574125348480.0, + "grad_norm": 1.912539487134435, + "language_loss": 0.85161269, + "learning_rate": 3.7755951220493386e-06, + "loss": 0.87315816, + "num_input_tokens_seen": 63876935, + "step": 2955, + "time_per_iteration": 2.636904001235962 + }, + { + "auxiliary_loss_clip": 0.01082111, + "auxiliary_loss_mlp": 0.01052397, + "balance_loss_clip": 1.03502464, + "balance_loss_mlp": 1.03316832, + "epoch": 0.17772433488651737, + "flos": 22419319345920.0, + "grad_norm": 1.5839128435956644, + "language_loss": 0.71242225, + "learning_rate": 3.7754158448116327e-06, + "loss": 0.73376727, + "num_input_tokens_seen": 63896815, + "step": 2956, + "time_per_iteration": 2.7318079471588135 + }, + { + "auxiliary_loss_clip": 0.01101981, + "auxiliary_loss_mlp": 0.01048526, + "balance_loss_clip": 1.03644335, + "balance_loss_mlp": 1.02975047, + "epoch": 0.17778445813918534, + "flos": 25629445820160.0, + "grad_norm": 2.490925778917758, + "language_loss": 0.8274051, + "learning_rate": 3.7752365002499795e-06, + "loss": 0.84891015, + "num_input_tokens_seen": 63916140, + "step": 2957, + "time_per_iteration": 2.709292411804199 + }, + { + "auxiliary_loss_clip": 0.0105159, + "auxiliary_loss_mlp": 0.01049204, + "balance_loss_clip": 1.03337491, + "balance_loss_mlp": 1.03040493, + "epoch": 0.1778445813918533, + "flos": 25628871202560.0, + "grad_norm": 2.1392813070661827, + "language_loss": 0.74883717, + "learning_rate": 3.7750570883711807e-06, + "loss": 0.76984507, + "num_input_tokens_seen": 63935220, + "step": 2958, + "time_per_iteration": 2.7618820667266846 + }, + { + "auxiliary_loss_clip": 0.01101464, + "auxiliary_loss_mlp": 0.01041808, + "balance_loss_clip": 1.04159772, + "balance_loss_mlp": 1.02355695, + "epoch": 0.17790470464452127, + "flos": 22345522853760.0, + "grad_norm": 1.8740385385417502, + "language_loss": 0.79881799, + "learning_rate": 3.7748776091820397e-06, + "loss": 0.82025063, + "num_input_tokens_seen": 63954550, + "step": 2959, + "time_per_iteration": 2.8025453090667725 + }, + { + "auxiliary_loss_clip": 0.0111763, + "auxiliary_loss_mlp": 0.01048685, + "balance_loss_clip": 1.0376153, + "balance_loss_mlp": 1.02895641, + "epoch": 0.17796482789718923, + "flos": 18765875214720.0, + "grad_norm": 1.579433562837929, + "language_loss": 0.51972687, + "learning_rate": 3.774698062689362e-06, + "loss": 0.54139, + "num_input_tokens_seen": 63972425, + "step": 2960, + "time_per_iteration": 2.5907247066497803 + }, + { + "auxiliary_loss_clip": 0.01067914, + "auxiliary_loss_mlp": 0.01061951, + "balance_loss_clip": 1.03760219, + "balance_loss_mlp": 1.04169726, + "epoch": 0.1780249511498572, + "flos": 23440941970560.0, + "grad_norm": 1.819605328705305, + "language_loss": 0.88432467, + "learning_rate": 3.7745184488999548e-06, + "loss": 0.90562332, + "num_input_tokens_seen": 63992165, + "step": 2961, + "time_per_iteration": 2.768982410430908 + }, + { + "auxiliary_loss_clip": 0.01074306, + "auxiliary_loss_mlp": 0.01060112, + "balance_loss_clip": 1.03686523, + "balance_loss_mlp": 1.03944063, + "epoch": 0.1780850744025252, + "flos": 23367468700800.0, + "grad_norm": 1.640526293787124, + "language_loss": 0.79348183, + "learning_rate": 3.774338767820631e-06, + "loss": 0.81482601, + "num_input_tokens_seen": 64013470, + "step": 2962, + "time_per_iteration": 2.6892282962799072 + }, + { + "auxiliary_loss_clip": 0.01096136, + "auxiliary_loss_mlp": 0.01058307, + "balance_loss_clip": 1.03669226, + "balance_loss_mlp": 1.03647959, + "epoch": 0.17814519765519315, + "flos": 13771994319360.0, + "grad_norm": 2.0359717742834476, + "language_loss": 0.74445295, + "learning_rate": 3.774159019458203e-06, + "loss": 0.76599741, + "num_input_tokens_seen": 64030975, + "step": 2963, + "time_per_iteration": 2.7391231060028076 + }, + { + "auxiliary_loss_clip": 0.0109771, + "auxiliary_loss_mlp": 0.01048205, + "balance_loss_clip": 1.03888893, + "balance_loss_mlp": 1.0279038, + "epoch": 0.17820532090786112, + "flos": 21976396738560.0, + "grad_norm": 1.8789455795424628, + "language_loss": 0.78700948, + "learning_rate": 3.7739792038194877e-06, + "loss": 0.8084687, + "num_input_tokens_seen": 64050075, + "step": 2964, + "time_per_iteration": 4.266571521759033 + }, + { + "auxiliary_loss_clip": 0.01107831, + "auxiliary_loss_mlp": 0.00749081, + "balance_loss_clip": 1.04071712, + "balance_loss_mlp": 1.00068402, + "epoch": 0.17826544416052909, + "flos": 24790752184320.0, + "grad_norm": 1.9797437876592459, + "language_loss": 0.812953, + "learning_rate": 3.7737993209113027e-06, + "loss": 0.83152211, + "num_input_tokens_seen": 64071920, + "step": 2965, + "time_per_iteration": 2.6511459350585938 + }, + { + "auxiliary_loss_clip": 0.01104362, + "auxiliary_loss_mlp": 0.01052668, + "balance_loss_clip": 1.03838062, + "balance_loss_mlp": 1.03519189, + "epoch": 0.17832556741319705, + "flos": 13879582531200.0, + "grad_norm": 2.335183155970505, + "language_loss": 0.94280219, + "learning_rate": 3.7736193707404698e-06, + "loss": 0.9643724, + "num_input_tokens_seen": 64086835, + "step": 2966, + "time_per_iteration": 2.5666215419769287 + }, + { + "auxiliary_loss_clip": 0.0106896, + "auxiliary_loss_mlp": 0.0074897, + "balance_loss_clip": 1.03705668, + "balance_loss_mlp": 1.00065303, + "epoch": 0.17838569066586502, + "flos": 36641703323520.0, + "grad_norm": 2.2327028750210705, + "language_loss": 0.72622591, + "learning_rate": 3.7734393533138127e-06, + "loss": 0.74440521, + "num_input_tokens_seen": 64107360, + "step": 2967, + "time_per_iteration": 4.419801235198975 + }, + { + "auxiliary_loss_clip": 0.01081243, + "auxiliary_loss_mlp": 0.01046872, + "balance_loss_clip": 1.03715551, + "balance_loss_mlp": 1.02755976, + "epoch": 0.17844581391853298, + "flos": 18727271072640.0, + "grad_norm": 1.8299739501378327, + "language_loss": 0.77113497, + "learning_rate": 3.773259268638157e-06, + "loss": 0.79241604, + "num_input_tokens_seen": 64124690, + "step": 2968, + "time_per_iteration": 2.6608004570007324 + }, + { + "auxiliary_loss_clip": 0.01032197, + "auxiliary_loss_mlp": 0.01045275, + "balance_loss_clip": 1.02897453, + "balance_loss_mlp": 1.02597511, + "epoch": 0.17850593717120097, + "flos": 27378259286400.0, + "grad_norm": 1.787204766210607, + "language_loss": 0.75542164, + "learning_rate": 3.7730791167203333e-06, + "loss": 0.77619636, + "num_input_tokens_seen": 64146315, + "step": 2969, + "time_per_iteration": 2.807457447052002 + }, + { + "auxiliary_loss_clip": 0.0101739, + "auxiliary_loss_mlp": 0.01011877, + "balance_loss_clip": 1.0192194, + "balance_loss_mlp": 1.00868249, + "epoch": 0.17856606042386894, + "flos": 66996025084800.0, + "grad_norm": 0.8394595641163886, + "language_loss": 0.6899693, + "learning_rate": 3.772898897567171e-06, + "loss": 0.71026206, + "num_input_tokens_seen": 64210875, + "step": 2970, + "time_per_iteration": 4.943094968795776 + }, + { + "auxiliary_loss_clip": 0.01083088, + "auxiliary_loss_mlp": 0.0104556, + "balance_loss_clip": 1.0363059, + "balance_loss_mlp": 1.02633142, + "epoch": 0.1786261836765369, + "flos": 36977001805440.0, + "grad_norm": 1.6898037698155133, + "language_loss": 0.6745683, + "learning_rate": 3.772718611185505e-06, + "loss": 0.69585478, + "num_input_tokens_seen": 64230740, + "step": 2971, + "time_per_iteration": 4.65270209312439 + }, + { + "auxiliary_loss_clip": 0.01068076, + "auxiliary_loss_mlp": 0.01054626, + "balance_loss_clip": 1.03487778, + "balance_loss_mlp": 1.03389549, + "epoch": 0.17868630692920487, + "flos": 24825441744000.0, + "grad_norm": 1.6259448953324152, + "language_loss": 0.89466178, + "learning_rate": 3.7725382575821717e-06, + "loss": 0.91588885, + "num_input_tokens_seen": 64252300, + "step": 2972, + "time_per_iteration": 3.039280414581299 + }, + { + "auxiliary_loss_clip": 0.01076395, + "auxiliary_loss_mlp": 0.01053024, + "balance_loss_clip": 1.03632891, + "balance_loss_mlp": 1.03292537, + "epoch": 0.17874643018187283, + "flos": 16981977139200.0, + "grad_norm": 2.7959731716581575, + "language_loss": 0.88277721, + "learning_rate": 3.77235783676401e-06, + "loss": 0.90407145, + "num_input_tokens_seen": 64270105, + "step": 2973, + "time_per_iteration": 2.7533798217773438 + }, + { + "auxiliary_loss_clip": 0.01114708, + "auxiliary_loss_mlp": 0.01055239, + "balance_loss_clip": 1.03900385, + "balance_loss_mlp": 1.03611779, + "epoch": 0.1788065534345408, + "flos": 21032233793280.0, + "grad_norm": 1.8847601558582772, + "language_loss": 0.76168692, + "learning_rate": 3.7721773487378615e-06, + "loss": 0.78338635, + "num_input_tokens_seen": 64287250, + "step": 2974, + "time_per_iteration": 2.5909976959228516 + }, + { + "auxiliary_loss_clip": 0.01096551, + "auxiliary_loss_mlp": 0.01049206, + "balance_loss_clip": 1.03897166, + "balance_loss_mlp": 1.0302043, + "epoch": 0.17886667668720876, + "flos": 23987717775360.0, + "grad_norm": 2.1012720153802507, + "language_loss": 0.74510717, + "learning_rate": 3.7719967935105705e-06, + "loss": 0.76656479, + "num_input_tokens_seen": 64307140, + "step": 2975, + "time_per_iteration": 2.628532648086548 + }, + { + "auxiliary_loss_clip": 0.01101669, + "auxiliary_loss_mlp": 0.01050416, + "balance_loss_clip": 1.0374676, + "balance_loss_mlp": 1.03198576, + "epoch": 0.17892679993987676, + "flos": 25739476156800.0, + "grad_norm": 1.587112919395289, + "language_loss": 0.73234379, + "learning_rate": 3.7718161710889833e-06, + "loss": 0.75386465, + "num_input_tokens_seen": 64328760, + "step": 2976, + "time_per_iteration": 2.835554361343384 + }, + { + "auxiliary_loss_clip": 0.01101439, + "auxiliary_loss_mlp": 0.01042583, + "balance_loss_clip": 1.03991377, + "balance_loss_mlp": 1.02628112, + "epoch": 0.17898692319254472, + "flos": 25699686865920.0, + "grad_norm": 1.4516857834195958, + "language_loss": 0.77280247, + "learning_rate": 3.7716354814799495e-06, + "loss": 0.79424268, + "num_input_tokens_seen": 64348800, + "step": 2977, + "time_per_iteration": 2.806300640106201 + }, + { + "auxiliary_loss_clip": 0.01083019, + "auxiliary_loss_mlp": 0.01053988, + "balance_loss_clip": 1.04078496, + "balance_loss_mlp": 1.03591633, + "epoch": 0.1790470464452127, + "flos": 19317786664320.0, + "grad_norm": 2.924025603320409, + "language_loss": 0.8006652, + "learning_rate": 3.7714547246903203e-06, + "loss": 0.82203531, + "num_input_tokens_seen": 64367955, + "step": 2978, + "time_per_iteration": 2.9136528968811035 + }, + { + "auxiliary_loss_clip": 0.01093423, + "auxiliary_loss_mlp": 0.01045877, + "balance_loss_clip": 1.03703451, + "balance_loss_mlp": 1.027161, + "epoch": 0.17910716969788065, + "flos": 30044267562240.0, + "grad_norm": 1.4290596238226907, + "language_loss": 0.76205355, + "learning_rate": 3.7712739007269508e-06, + "loss": 0.78344655, + "num_input_tokens_seen": 64389805, + "step": 2979, + "time_per_iteration": 2.8541250228881836 + }, + { + "auxiliary_loss_clip": 0.01076884, + "auxiliary_loss_mlp": 0.01046913, + "balance_loss_clip": 1.03632343, + "balance_loss_mlp": 1.02872193, + "epoch": 0.17916729295054862, + "flos": 19427709260160.0, + "grad_norm": 1.6466607774731352, + "language_loss": 0.69083154, + "learning_rate": 3.7710930095966976e-06, + "loss": 0.71206951, + "num_input_tokens_seen": 64408220, + "step": 2980, + "time_per_iteration": 2.969006061553955 + }, + { + "auxiliary_loss_clip": 0.01103547, + "auxiliary_loss_mlp": 0.0104439, + "balance_loss_clip": 1.03765702, + "balance_loss_mlp": 1.02396917, + "epoch": 0.17922741620321658, + "flos": 14611549881600.0, + "grad_norm": 1.8867910054726424, + "language_loss": 0.70466083, + "learning_rate": 3.7709120513064196e-06, + "loss": 0.7261402, + "num_input_tokens_seen": 64426380, + "step": 2981, + "time_per_iteration": 2.8199191093444824 + }, + { + "auxiliary_loss_clip": 0.01094, + "auxiliary_loss_mlp": 0.01061281, + "balance_loss_clip": 1.04009104, + "balance_loss_mlp": 1.04168332, + "epoch": 0.17928753945588458, + "flos": 17165301177600.0, + "grad_norm": 2.264234405724051, + "language_loss": 0.81766587, + "learning_rate": 3.7707310258629796e-06, + "loss": 0.83921868, + "num_input_tokens_seen": 64444355, + "step": 2982, + "time_per_iteration": 2.8578336238861084 + }, + { + "auxiliary_loss_clip": 0.01113475, + "auxiliary_loss_mlp": 0.01043223, + "balance_loss_clip": 1.03847098, + "balance_loss_mlp": 1.02567518, + "epoch": 0.17934766270855254, + "flos": 31395622060800.0, + "grad_norm": 1.690286798994299, + "language_loss": 0.82801187, + "learning_rate": 3.7705499332732413e-06, + "loss": 0.84957886, + "num_input_tokens_seen": 64467800, + "step": 2983, + "time_per_iteration": 2.7371363639831543 + }, + { + "auxiliary_loss_clip": 0.01104302, + "auxiliary_loss_mlp": 0.0104379, + "balance_loss_clip": 1.03507233, + "balance_loss_mlp": 1.02488375, + "epoch": 0.1794077859612205, + "flos": 20814184281600.0, + "grad_norm": 1.8528562511867661, + "language_loss": 0.85187483, + "learning_rate": 3.7703687735440718e-06, + "loss": 0.87335575, + "num_input_tokens_seen": 64487230, + "step": 2984, + "time_per_iteration": 2.673003911972046 + }, + { + "auxiliary_loss_clip": 0.01082034, + "auxiliary_loss_mlp": 0.01042184, + "balance_loss_clip": 1.03592527, + "balance_loss_mlp": 1.02364707, + "epoch": 0.17946790921388847, + "flos": 28986447006720.0, + "grad_norm": 1.4037743878200466, + "language_loss": 0.89280277, + "learning_rate": 3.7701875466823416e-06, + "loss": 0.91404498, + "num_input_tokens_seen": 64509165, + "step": 2985, + "time_per_iteration": 2.8553950786590576 + }, + { + "auxiliary_loss_clip": 0.01111188, + "auxiliary_loss_mlp": 0.0104404, + "balance_loss_clip": 1.03856885, + "balance_loss_mlp": 1.0277915, + "epoch": 0.17952803246655644, + "flos": 20737406960640.0, + "grad_norm": 1.8837697279953807, + "language_loss": 0.6954509, + "learning_rate": 3.770006252694922e-06, + "loss": 0.71700311, + "num_input_tokens_seen": 64527940, + "step": 2986, + "time_per_iteration": 2.5925040245056152 + }, + { + "auxiliary_loss_clip": 0.01110767, + "auxiliary_loss_mlp": 0.00748942, + "balance_loss_clip": 1.03674722, + "balance_loss_mlp": 1.00062561, + "epoch": 0.1795881557192244, + "flos": 28255988027520.0, + "grad_norm": 2.2510284345174645, + "language_loss": 0.77317691, + "learning_rate": 3.769824891588688e-06, + "loss": 0.79177392, + "num_input_tokens_seen": 64545230, + "step": 2987, + "time_per_iteration": 2.6200854778289795 + }, + { + "auxiliary_loss_clip": 0.01117194, + "auxiliary_loss_mlp": 0.01044461, + "balance_loss_clip": 1.03874493, + "balance_loss_mlp": 1.02537513, + "epoch": 0.17964827897189237, + "flos": 18552027594240.0, + "grad_norm": 2.3393989477207686, + "language_loss": 0.78137779, + "learning_rate": 3.7696434633705164e-06, + "loss": 0.80299431, + "num_input_tokens_seen": 64563820, + "step": 2988, + "time_per_iteration": 2.556612730026245 + }, + { + "auxiliary_loss_clip": 0.00991854, + "auxiliary_loss_mlp": 0.0074849, + "balance_loss_clip": 1.01420307, + "balance_loss_mlp": 1.0006547, + "epoch": 0.17970840222456036, + "flos": 58165088711040.0, + "grad_norm": 0.7588310947291594, + "language_loss": 0.62725317, + "learning_rate": 3.7694619680472875e-06, + "loss": 0.6446566, + "num_input_tokens_seen": 64621315, + "step": 2989, + "time_per_iteration": 3.2078702449798584 + }, + { + "auxiliary_loss_clip": 0.01097171, + "auxiliary_loss_mlp": 0.0104308, + "balance_loss_clip": 1.04065919, + "balance_loss_mlp": 1.02597392, + "epoch": 0.17976852547722832, + "flos": 20300805146880.0, + "grad_norm": 2.7266097324254095, + "language_loss": 0.70465219, + "learning_rate": 3.7692804056258837e-06, + "loss": 0.72605473, + "num_input_tokens_seen": 64639885, + "step": 2990, + "time_per_iteration": 2.6723546981811523 + }, + { + "auxiliary_loss_clip": 0.01093665, + "auxiliary_loss_mlp": 0.01047115, + "balance_loss_clip": 1.03730905, + "balance_loss_mlp": 1.02997255, + "epoch": 0.1798286487298963, + "flos": 39669367685760.0, + "grad_norm": 1.7576897267212315, + "language_loss": 0.68554938, + "learning_rate": 3.7690987761131893e-06, + "loss": 0.70695722, + "num_input_tokens_seen": 64661220, + "step": 2991, + "time_per_iteration": 2.8356704711914062 + }, + { + "auxiliary_loss_clip": 0.01068233, + "auxiliary_loss_mlp": 0.01044133, + "balance_loss_clip": 1.03775787, + "balance_loss_mlp": 1.02572751, + "epoch": 0.17988877198256426, + "flos": 25520313323520.0, + "grad_norm": 1.4471607529755524, + "language_loss": 0.82700562, + "learning_rate": 3.7689170795160924e-06, + "loss": 0.84812927, + "num_input_tokens_seen": 64682530, + "step": 2992, + "time_per_iteration": 2.792926788330078 + }, + { + "auxiliary_loss_clip": 0.01100053, + "auxiliary_loss_mlp": 0.01042162, + "balance_loss_clip": 1.03634501, + "balance_loss_mlp": 1.0251987, + "epoch": 0.17994889523523222, + "flos": 18807496099200.0, + "grad_norm": 1.9964282967271967, + "language_loss": 0.8208257, + "learning_rate": 3.7687353158414822e-06, + "loss": 0.84224784, + "num_input_tokens_seen": 64701025, + "step": 2993, + "time_per_iteration": 2.6625359058380127 + }, + { + "auxiliary_loss_clip": 0.01088239, + "auxiliary_loss_mlp": 0.0104751, + "balance_loss_clip": 1.03464162, + "balance_loss_mlp": 1.02927089, + "epoch": 0.18000901848790019, + "flos": 21104450087040.0, + "grad_norm": 1.7219655685260966, + "language_loss": 0.78431249, + "learning_rate": 3.7685534850962517e-06, + "loss": 0.80567002, + "num_input_tokens_seen": 64719570, + "step": 2994, + "time_per_iteration": 2.627735137939453 + }, + { + "auxiliary_loss_clip": 0.01115229, + "auxiliary_loss_mlp": 0.01045285, + "balance_loss_clip": 1.03901172, + "balance_loss_mlp": 1.02792776, + "epoch": 0.18006914174056818, + "flos": 19646441130240.0, + "grad_norm": 2.5277342191165184, + "language_loss": 0.80809557, + "learning_rate": 3.768371587287296e-06, + "loss": 0.82970071, + "num_input_tokens_seen": 64738110, + "step": 2995, + "time_per_iteration": 2.669476270675659 + }, + { + "auxiliary_loss_clip": 0.01101046, + "auxiliary_loss_mlp": 0.01042979, + "balance_loss_clip": 1.03675354, + "balance_loss_mlp": 1.02618289, + "epoch": 0.18012926499323614, + "flos": 19499889640320.0, + "grad_norm": 1.9299009987330866, + "language_loss": 0.84346831, + "learning_rate": 3.768189622421512e-06, + "loss": 0.86490858, + "num_input_tokens_seen": 64756345, + "step": 2996, + "time_per_iteration": 2.5968263149261475 + }, + { + "auxiliary_loss_clip": 0.01086759, + "auxiliary_loss_mlp": 0.01043507, + "balance_loss_clip": 1.04311275, + "balance_loss_mlp": 1.02666283, + "epoch": 0.1801893882459041, + "flos": 19464553635840.0, + "grad_norm": 1.6507241530971066, + "language_loss": 0.88033307, + "learning_rate": 3.7680075905058006e-06, + "loss": 0.90163577, + "num_input_tokens_seen": 64776375, + "step": 2997, + "time_per_iteration": 2.72291898727417 + }, + { + "auxiliary_loss_clip": 0.01087077, + "auxiliary_loss_mlp": 0.01041252, + "balance_loss_clip": 1.0318532, + "balance_loss_mlp": 1.02312016, + "epoch": 0.18024951149857207, + "flos": 26870590414080.0, + "grad_norm": 2.8858562625636646, + "language_loss": 0.84968883, + "learning_rate": 3.7678254915470643e-06, + "loss": 0.87097216, + "num_input_tokens_seen": 64796210, + "step": 2998, + "time_per_iteration": 2.65633487701416 + }, + { + "auxiliary_loss_clip": 0.01113316, + "auxiliary_loss_mlp": 0.01044928, + "balance_loss_clip": 1.0402844, + "balance_loss_mlp": 1.02766669, + "epoch": 0.18030963475124004, + "flos": 30226621933440.0, + "grad_norm": 1.7970193907562397, + "language_loss": 0.84471905, + "learning_rate": 3.7676433255522084e-06, + "loss": 0.86630154, + "num_input_tokens_seen": 64818590, + "step": 2999, + "time_per_iteration": 2.635490894317627 + }, + { + "auxiliary_loss_clip": 0.01095372, + "auxiliary_loss_mlp": 0.01045436, + "balance_loss_clip": 1.03495145, + "balance_loss_mlp": 1.0274117, + "epoch": 0.180369758003908, + "flos": 22307493329280.0, + "grad_norm": 1.8811485658421518, + "language_loss": 0.7491399, + "learning_rate": 3.76746109252814e-06, + "loss": 0.77054799, + "num_input_tokens_seen": 64838350, + "step": 3000, + "time_per_iteration": 2.5735485553741455 + }, + { + "auxiliary_loss_clip": 0.01086245, + "auxiliary_loss_mlp": 0.00748858, + "balance_loss_clip": 1.03522038, + "balance_loss_mlp": 1.00058544, + "epoch": 0.18042988125657597, + "flos": 23732033788800.0, + "grad_norm": 1.8057300911284753, + "language_loss": 0.7106179, + "learning_rate": 3.76727879248177e-06, + "loss": 0.72896892, + "num_input_tokens_seen": 64858065, + "step": 3001, + "time_per_iteration": 2.641813039779663 + }, + { + "auxiliary_loss_clip": 0.01107358, + "auxiliary_loss_mlp": 0.01049526, + "balance_loss_clip": 1.03864729, + "balance_loss_mlp": 1.03131056, + "epoch": 0.18049000450924396, + "flos": 24093582134400.0, + "grad_norm": 2.4246870587912466, + "language_loss": 0.88352585, + "learning_rate": 3.767096425420011e-06, + "loss": 0.90509474, + "num_input_tokens_seen": 64877305, + "step": 3002, + "time_per_iteration": 2.592283248901367 + }, + { + "auxiliary_loss_clip": 0.01112586, + "auxiliary_loss_mlp": 0.01047903, + "balance_loss_clip": 1.03851855, + "balance_loss_mlp": 1.0304265, + "epoch": 0.18055012776191193, + "flos": 22163168482560.0, + "grad_norm": 2.0046713244534704, + "language_loss": 0.80501914, + "learning_rate": 3.7669139913497788e-06, + "loss": 0.82662404, + "num_input_tokens_seen": 64896955, + "step": 3003, + "time_per_iteration": 2.734034538269043 + }, + { + "auxiliary_loss_clip": 0.01115464, + "auxiliary_loss_mlp": 0.01054072, + "balance_loss_clip": 1.03910351, + "balance_loss_mlp": 1.03672671, + "epoch": 0.1806102510145799, + "flos": 28913512440960.0, + "grad_norm": 2.0185204905262135, + "language_loss": 0.67249221, + "learning_rate": 3.7667314902779907e-06, + "loss": 0.69418758, + "num_input_tokens_seen": 64917080, + "step": 3004, + "time_per_iteration": 2.6948189735412598 + }, + { + "auxiliary_loss_clip": 0.01104836, + "auxiliary_loss_mlp": 0.01047439, + "balance_loss_clip": 1.03859627, + "balance_loss_mlp": 1.02931905, + "epoch": 0.18067037426724786, + "flos": 19025689265280.0, + "grad_norm": 2.4457540626526035, + "language_loss": 0.85439414, + "learning_rate": 3.7665489222115677e-06, + "loss": 0.87591696, + "num_input_tokens_seen": 64935215, + "step": 3005, + "time_per_iteration": 2.5747263431549072 + }, + { + "auxiliary_loss_clip": 0.01099709, + "auxiliary_loss_mlp": 0.01041193, + "balance_loss_clip": 1.03754401, + "balance_loss_mlp": 1.02520752, + "epoch": 0.18073049751991582, + "flos": 27453635976960.0, + "grad_norm": 1.921953731657349, + "language_loss": 0.82956797, + "learning_rate": 3.766366287157432e-06, + "loss": 0.850977, + "num_input_tokens_seen": 64956275, + "step": 3006, + "time_per_iteration": 2.6493120193481445 + }, + { + "auxiliary_loss_clip": 0.01079318, + "auxiliary_loss_mlp": 0.01051264, + "balance_loss_clip": 1.03229439, + "balance_loss_mlp": 1.03251278, + "epoch": 0.1807906207725838, + "flos": 28729039167360.0, + "grad_norm": 1.9995226009903453, + "language_loss": 0.77152222, + "learning_rate": 3.7661835851225103e-06, + "loss": 0.79282802, + "num_input_tokens_seen": 64979390, + "step": 3007, + "time_per_iteration": 2.7362282276153564 + }, + { + "auxiliary_loss_clip": 0.0101184, + "auxiliary_loss_mlp": 0.01064851, + "balance_loss_clip": 1.01063657, + "balance_loss_mlp": 1.06229961, + "epoch": 0.18085074402525175, + "flos": 64466515468800.0, + "grad_norm": 0.8374905816199927, + "language_loss": 0.56934279, + "learning_rate": 3.7660008161137294e-06, + "loss": 0.59010971, + "num_input_tokens_seen": 65043135, + "step": 3008, + "time_per_iteration": 3.3633410930633545 + }, + { + "auxiliary_loss_clip": 0.01085012, + "auxiliary_loss_mlp": 0.01049038, + "balance_loss_clip": 1.03600597, + "balance_loss_mlp": 1.03058493, + "epoch": 0.18091086727791975, + "flos": 23476960333440.0, + "grad_norm": 2.747388788668418, + "language_loss": 0.67062998, + "learning_rate": 3.765817980138021e-06, + "loss": 0.69197053, + "num_input_tokens_seen": 65062845, + "step": 3009, + "time_per_iteration": 2.6851701736450195 + }, + { + "auxiliary_loss_clip": 0.0111744, + "auxiliary_loss_mlp": 0.01044328, + "balance_loss_clip": 1.04088616, + "balance_loss_mlp": 1.02678084, + "epoch": 0.1809709905305877, + "flos": 24170467196160.0, + "grad_norm": 1.769710012927469, + "language_loss": 0.75458014, + "learning_rate": 3.7656350772023177e-06, + "loss": 0.77619779, + "num_input_tokens_seen": 65082110, + "step": 3010, + "time_per_iteration": 2.6162502765655518 + }, + { + "auxiliary_loss_clip": 0.01083546, + "auxiliary_loss_mlp": 0.01035355, + "balance_loss_clip": 1.03544307, + "balance_loss_mlp": 1.01963723, + "epoch": 0.18103111378325568, + "flos": 21650902669440.0, + "grad_norm": 1.5218450688844716, + "language_loss": 0.67067337, + "learning_rate": 3.7654521073135553e-06, + "loss": 0.6918624, + "num_input_tokens_seen": 65101985, + "step": 3011, + "time_per_iteration": 4.188769578933716 + }, + { + "auxiliary_loss_clip": 0.01058701, + "auxiliary_loss_mlp": 0.00748753, + "balance_loss_clip": 1.02988768, + "balance_loss_mlp": 1.00067449, + "epoch": 0.18109123703592364, + "flos": 53686918356480.0, + "grad_norm": 1.742024782692954, + "language_loss": 0.71558106, + "learning_rate": 3.7652690704786723e-06, + "loss": 0.73365557, + "num_input_tokens_seen": 65129295, + "step": 3012, + "time_per_iteration": 3.013439655303955 + }, + { + "auxiliary_loss_clip": 0.010873, + "auxiliary_loss_mlp": 0.01047717, + "balance_loss_clip": 1.03805542, + "balance_loss_mlp": 1.03039587, + "epoch": 0.1811513602885916, + "flos": 35845564325760.0, + "grad_norm": 1.92030485164378, + "language_loss": 0.62177354, + "learning_rate": 3.765085966704609e-06, + "loss": 0.64312369, + "num_input_tokens_seen": 65150625, + "step": 3013, + "time_per_iteration": 2.785625457763672 + }, + { + "auxiliary_loss_clip": 0.01084204, + "auxiliary_loss_mlp": 0.01053664, + "balance_loss_clip": 1.03517306, + "balance_loss_mlp": 1.03674793, + "epoch": 0.18121148354125957, + "flos": 23732572492800.0, + "grad_norm": 2.3435334163719532, + "language_loss": 0.761222, + "learning_rate": 3.764902795998309e-06, + "loss": 0.78260064, + "num_input_tokens_seen": 65170880, + "step": 3014, + "time_per_iteration": 4.186155796051025 + }, + { + "auxiliary_loss_clip": 0.01116806, + "auxiliary_loss_mlp": 0.01050358, + "balance_loss_clip": 1.03913856, + "balance_loss_mlp": 1.031142, + "epoch": 0.18127160679392756, + "flos": 28728320895360.0, + "grad_norm": 1.6122771017481345, + "language_loss": 0.65874463, + "learning_rate": 3.7647195583667184e-06, + "loss": 0.68041629, + "num_input_tokens_seen": 65192530, + "step": 3015, + "time_per_iteration": 2.6043522357940674 + }, + { + "auxiliary_loss_clip": 0.01088134, + "auxiliary_loss_mlp": 0.00748844, + "balance_loss_clip": 1.03788018, + "balance_loss_mlp": 1.00062478, + "epoch": 0.18133173004659553, + "flos": 20485062938880.0, + "grad_norm": 1.6921123257224775, + "language_loss": 0.77798736, + "learning_rate": 3.764536253816785e-06, + "loss": 0.79635715, + "num_input_tokens_seen": 65211675, + "step": 3016, + "time_per_iteration": 2.6319687366485596 + }, + { + "auxiliary_loss_clip": 0.01095396, + "auxiliary_loss_mlp": 0.01049597, + "balance_loss_clip": 1.03828096, + "balance_loss_mlp": 1.03119159, + "epoch": 0.1813918532992635, + "flos": 22852078404480.0, + "grad_norm": 1.6380472932757921, + "language_loss": 0.83348989, + "learning_rate": 3.7643528823554602e-06, + "loss": 0.85493982, + "num_input_tokens_seen": 65231185, + "step": 3017, + "time_per_iteration": 2.651494026184082 + }, + { + "auxiliary_loss_clip": 0.01091519, + "auxiliary_loss_mlp": 0.01038315, + "balance_loss_clip": 1.03429735, + "balance_loss_mlp": 1.02178025, + "epoch": 0.18145197655193146, + "flos": 36065122208640.0, + "grad_norm": 1.8467528495086831, + "language_loss": 0.67431182, + "learning_rate": 3.764169443989697e-06, + "loss": 0.69561017, + "num_input_tokens_seen": 65251645, + "step": 3018, + "time_per_iteration": 4.333236217498779 + }, + { + "auxiliary_loss_clip": 0.01100185, + "auxiliary_loss_mlp": 0.00748817, + "balance_loss_clip": 1.03582311, + "balance_loss_mlp": 1.00061524, + "epoch": 0.18151209980459942, + "flos": 24023951619840.0, + "grad_norm": 2.125560843852569, + "language_loss": 0.75810856, + "learning_rate": 3.7639859387264518e-06, + "loss": 0.77659857, + "num_input_tokens_seen": 65271125, + "step": 3019, + "time_per_iteration": 2.698683500289917 + }, + { + "auxiliary_loss_clip": 0.01085007, + "auxiliary_loss_mlp": 0.01049282, + "balance_loss_clip": 1.04201496, + "balance_loss_mlp": 1.03067338, + "epoch": 0.1815722230572674, + "flos": 23951627585280.0, + "grad_norm": 2.13565847807468, + "language_loss": 0.81719089, + "learning_rate": 3.7638023665726834e-06, + "loss": 0.8385337, + "num_input_tokens_seen": 65290600, + "step": 3020, + "time_per_iteration": 4.425551176071167 + }, + { + "auxiliary_loss_clip": 0.01088257, + "auxiliary_loss_mlp": 0.01044214, + "balance_loss_clip": 1.03581452, + "balance_loss_mlp": 1.02573681, + "epoch": 0.18163234630993536, + "flos": 24386469632640.0, + "grad_norm": 2.4150043495962343, + "language_loss": 0.77598667, + "learning_rate": 3.763618727535352e-06, + "loss": 0.79731137, + "num_input_tokens_seen": 65311040, + "step": 3021, + "time_per_iteration": 2.68129825592041 + }, + { + "auxiliary_loss_clip": 0.01095728, + "auxiliary_loss_mlp": 0.01040964, + "balance_loss_clip": 1.032933, + "balance_loss_mlp": 1.02407193, + "epoch": 0.18169246956260335, + "flos": 24681332378880.0, + "grad_norm": 1.5168723211366044, + "language_loss": 0.84972119, + "learning_rate": 3.763435021621422e-06, + "loss": 0.87108815, + "num_input_tokens_seen": 65332115, + "step": 3022, + "time_per_iteration": 2.6820549964904785 + }, + { + "auxiliary_loss_clip": 0.01091171, + "auxiliary_loss_mlp": 0.01046038, + "balance_loss_clip": 1.04139519, + "balance_loss_mlp": 1.02766776, + "epoch": 0.1817525928152713, + "flos": 24243294021120.0, + "grad_norm": 1.9609164827708263, + "language_loss": 0.69314802, + "learning_rate": 3.763251248837859e-06, + "loss": 0.7145201, + "num_input_tokens_seen": 65352210, + "step": 3023, + "time_per_iteration": 2.8455772399902344 + }, + { + "auxiliary_loss_clip": 0.01080163, + "auxiliary_loss_mlp": 0.01041082, + "balance_loss_clip": 1.0315758, + "balance_loss_mlp": 1.02342713, + "epoch": 0.18181271606793928, + "flos": 16472081623680.0, + "grad_norm": 1.6535848584955175, + "language_loss": 0.74254382, + "learning_rate": 3.7630674091916317e-06, + "loss": 0.76375628, + "num_input_tokens_seen": 65370600, + "step": 3024, + "time_per_iteration": 2.725461959838867 + }, + { + "auxiliary_loss_clip": 0.01099837, + "auxiliary_loss_mlp": 0.0103818, + "balance_loss_clip": 1.03605628, + "balance_loss_mlp": 1.02039421, + "epoch": 0.18187283932060724, + "flos": 18581042805120.0, + "grad_norm": 1.9835884259105019, + "language_loss": 0.88478386, + "learning_rate": 3.7628835026897123e-06, + "loss": 0.90616405, + "num_input_tokens_seen": 65387270, + "step": 3025, + "time_per_iteration": 2.71161150932312 + }, + { + "auxiliary_loss_clip": 0.01087056, + "auxiliary_loss_mlp": 0.01048853, + "balance_loss_clip": 1.03460121, + "balance_loss_mlp": 1.03065014, + "epoch": 0.1819329625732752, + "flos": 20266833859200.0, + "grad_norm": 1.96903330917026, + "language_loss": 0.79059708, + "learning_rate": 3.7626995293390735e-06, + "loss": 0.81195623, + "num_input_tokens_seen": 65406550, + "step": 3026, + "time_per_iteration": 2.907425880432129 + }, + { + "auxiliary_loss_clip": 0.01090131, + "auxiliary_loss_mlp": 0.01055749, + "balance_loss_clip": 1.03845644, + "balance_loss_mlp": 1.03806996, + "epoch": 0.18199308582594317, + "flos": 25915186512000.0, + "grad_norm": 1.6105596057974134, + "language_loss": 0.75652081, + "learning_rate": 3.762515489146692e-06, + "loss": 0.77797961, + "num_input_tokens_seen": 65425955, + "step": 3027, + "time_per_iteration": 2.754943370819092 + }, + { + "auxiliary_loss_clip": 0.01113773, + "auxiliary_loss_mlp": 0.01045901, + "balance_loss_clip": 1.03693938, + "balance_loss_mlp": 1.02789998, + "epoch": 0.18205320907861114, + "flos": 15377524433280.0, + "grad_norm": 2.2043430502332537, + "language_loss": 0.85297, + "learning_rate": 3.762331382119546e-06, + "loss": 0.87456673, + "num_input_tokens_seen": 65442820, + "step": 3028, + "time_per_iteration": 2.6144258975982666 + }, + { + "auxiliary_loss_clip": 0.01107119, + "auxiliary_loss_mlp": 0.01040688, + "balance_loss_clip": 1.03531313, + "balance_loss_mlp": 1.02303267, + "epoch": 0.18211333233127913, + "flos": 25624310175360.0, + "grad_norm": 1.7273604528375262, + "language_loss": 0.83219695, + "learning_rate": 3.7621472082646183e-06, + "loss": 0.85367501, + "num_input_tokens_seen": 65461825, + "step": 3029, + "time_per_iteration": 2.594177484512329 + }, + { + "auxiliary_loss_clip": 0.01059777, + "auxiliary_loss_mlp": 0.01048057, + "balance_loss_clip": 1.02967906, + "balance_loss_mlp": 1.02868533, + "epoch": 0.1821734555839471, + "flos": 14976007228800.0, + "grad_norm": 1.87261606034119, + "language_loss": 0.77884996, + "learning_rate": 3.761962967588891e-06, + "loss": 0.79992831, + "num_input_tokens_seen": 65479480, + "step": 3030, + "time_per_iteration": 2.6677770614624023 + }, + { + "auxiliary_loss_clip": 0.01085728, + "auxiliary_loss_mlp": 0.010493, + "balance_loss_clip": 1.03222728, + "balance_loss_mlp": 1.0311563, + "epoch": 0.18223357883661506, + "flos": 20194007034240.0, + "grad_norm": 2.3886370086953477, + "language_loss": 0.84664762, + "learning_rate": 3.761778660099352e-06, + "loss": 0.86799788, + "num_input_tokens_seen": 65497775, + "step": 3031, + "time_per_iteration": 2.5905184745788574 + }, + { + "auxiliary_loss_clip": 0.01071373, + "auxiliary_loss_mlp": 0.00748787, + "balance_loss_clip": 1.03258204, + "balance_loss_mlp": 1.00064993, + "epoch": 0.18229370208928303, + "flos": 15231978524160.0, + "grad_norm": 3.6692368008765737, + "language_loss": 0.79617518, + "learning_rate": 3.76159428580299e-06, + "loss": 0.81437677, + "num_input_tokens_seen": 65516505, + "step": 3032, + "time_per_iteration": 2.7831828594207764 + }, + { + "auxiliary_loss_clip": 0.01116144, + "auxiliary_loss_mlp": 0.01050787, + "balance_loss_clip": 1.03856158, + "balance_loss_mlp": 1.03217804, + "epoch": 0.182353825341951, + "flos": 23840483927040.0, + "grad_norm": 1.8030473721263875, + "language_loss": 0.80968487, + "learning_rate": 3.761409844706795e-06, + "loss": 0.8313542, + "num_input_tokens_seen": 65536160, + "step": 3033, + "time_per_iteration": 2.6869912147521973 + }, + { + "auxiliary_loss_clip": 0.01003237, + "auxiliary_loss_mlp": 0.01017209, + "balance_loss_clip": 1.03032231, + "balance_loss_mlp": 1.01439571, + "epoch": 0.18241394859461896, + "flos": 61190957393280.0, + "grad_norm": 0.8867234046912719, + "language_loss": 0.63534462, + "learning_rate": 3.7612253368177625e-06, + "loss": 0.65554905, + "num_input_tokens_seen": 65589375, + "step": 3034, + "time_per_iteration": 3.2934205532073975 + }, + { + "auxiliary_loss_clip": 0.01079741, + "auxiliary_loss_mlp": 0.0104249, + "balance_loss_clip": 1.03414738, + "balance_loss_mlp": 1.02566969, + "epoch": 0.18247407184728695, + "flos": 18471694826880.0, + "grad_norm": 1.836522655179998, + "language_loss": 0.79564983, + "learning_rate": 3.7610407621428893e-06, + "loss": 0.81687212, + "num_input_tokens_seen": 65606720, + "step": 3035, + "time_per_iteration": 2.629455089569092 + }, + { + "auxiliary_loss_clip": 0.01087997, + "auxiliary_loss_mlp": 0.01045343, + "balance_loss_clip": 1.03679502, + "balance_loss_mlp": 1.02928507, + "epoch": 0.18253419509995492, + "flos": 21795191602560.0, + "grad_norm": 1.8911668186282051, + "language_loss": 0.84818876, + "learning_rate": 3.7608561206891735e-06, + "loss": 0.86952221, + "num_input_tokens_seen": 65625495, + "step": 3036, + "time_per_iteration": 2.7071192264556885 + }, + { + "auxiliary_loss_clip": 0.01098175, + "auxiliary_loss_mlp": 0.01041108, + "balance_loss_clip": 1.03826058, + "balance_loss_mlp": 1.02454972, + "epoch": 0.18259431835262288, + "flos": 20149764456960.0, + "grad_norm": 2.4175715746669355, + "language_loss": 0.79934371, + "learning_rate": 3.760671412463617e-06, + "loss": 0.82073653, + "num_input_tokens_seen": 65643515, + "step": 3037, + "time_per_iteration": 2.673400640487671 + }, + { + "auxiliary_loss_clip": 0.01094907, + "auxiliary_loss_mlp": 0.00748716, + "balance_loss_clip": 1.04051685, + "balance_loss_mlp": 1.00051236, + "epoch": 0.18265444160529085, + "flos": 16981653916800.0, + "grad_norm": 2.404660798850911, + "language_loss": 0.79641449, + "learning_rate": 3.7604866374732246e-06, + "loss": 0.81485081, + "num_input_tokens_seen": 65658155, + "step": 3038, + "time_per_iteration": 2.6312577724456787 + }, + { + "auxiliary_loss_clip": 0.01082639, + "auxiliary_loss_mlp": 0.01048782, + "balance_loss_clip": 1.0372833, + "balance_loss_mlp": 1.0309962, + "epoch": 0.1827145648579588, + "flos": 34423250509440.0, + "grad_norm": 1.8872677495289567, + "language_loss": 0.67354691, + "learning_rate": 3.7603017957250023e-06, + "loss": 0.69486117, + "num_input_tokens_seen": 65679310, + "step": 3039, + "time_per_iteration": 2.7725470066070557 + }, + { + "auxiliary_loss_clip": 0.01078455, + "auxiliary_loss_mlp": 0.01047835, + "balance_loss_clip": 1.03369749, + "balance_loss_mlp": 1.03003716, + "epoch": 0.18277468811062678, + "flos": 53287017264000.0, + "grad_norm": 1.7787464368219563, + "language_loss": 0.73419869, + "learning_rate": 3.7601168872259593e-06, + "loss": 0.75546157, + "num_input_tokens_seen": 65705235, + "step": 3040, + "time_per_iteration": 2.9844729900360107 + }, + { + "auxiliary_loss_clip": 0.01100552, + "auxiliary_loss_mlp": 0.01044889, + "balance_loss_clip": 1.03909028, + "balance_loss_mlp": 1.02701902, + "epoch": 0.18283481136329474, + "flos": 31650659602560.0, + "grad_norm": 2.4223210173659817, + "language_loss": 0.59794652, + "learning_rate": 3.7599319119831075e-06, + "loss": 0.61940086, + "num_input_tokens_seen": 65727575, + "step": 3041, + "time_per_iteration": 2.6612417697906494 + }, + { + "auxiliary_loss_clip": 0.01072876, + "auxiliary_loss_mlp": 0.01056502, + "balance_loss_clip": 1.0351274, + "balance_loss_mlp": 1.0393002, + "epoch": 0.18289493461596273, + "flos": 53137664513280.0, + "grad_norm": 1.7492090369459752, + "language_loss": 0.60162175, + "learning_rate": 3.7597468700034616e-06, + "loss": 0.62291551, + "num_input_tokens_seen": 65751370, + "step": 3042, + "time_per_iteration": 3.021244525909424 + }, + { + "auxiliary_loss_clip": 0.01074662, + "auxiliary_loss_mlp": 0.01045817, + "balance_loss_clip": 1.034392, + "balance_loss_mlp": 1.0292232, + "epoch": 0.1829550578686307, + "flos": 25589369220480.0, + "grad_norm": 1.8468860378898149, + "language_loss": 0.87538511, + "learning_rate": 3.7595617612940374e-06, + "loss": 0.89658993, + "num_input_tokens_seen": 65771040, + "step": 3043, + "time_per_iteration": 2.762946844100952 + }, + { + "auxiliary_loss_clip": 0.01003503, + "auxiliary_loss_mlp": 0.01044584, + "balance_loss_clip": 1.02615499, + "balance_loss_mlp": 1.02629721, + "epoch": 0.18301518112129866, + "flos": 22601422321920.0, + "grad_norm": 2.1691240253522346, + "language_loss": 0.70725685, + "learning_rate": 3.7593765858618552e-06, + "loss": 0.72773772, + "num_input_tokens_seen": 65789345, + "step": 3044, + "time_per_iteration": 2.9028117656707764 + }, + { + "auxiliary_loss_clip": 0.01053807, + "auxiliary_loss_mlp": 0.01048905, + "balance_loss_clip": 1.03190923, + "balance_loss_mlp": 1.02961683, + "epoch": 0.18307530437396663, + "flos": 34020799551360.0, + "grad_norm": 1.8917929112312062, + "language_loss": 0.63772094, + "learning_rate": 3.7591913437139365e-06, + "loss": 0.65874815, + "num_input_tokens_seen": 65810990, + "step": 3045, + "time_per_iteration": 3.0983309745788574 + }, + { + "auxiliary_loss_clip": 0.01111352, + "auxiliary_loss_mlp": 0.01047849, + "balance_loss_clip": 1.03980458, + "balance_loss_mlp": 1.03152871, + "epoch": 0.1831354276266346, + "flos": 21279765392640.0, + "grad_norm": 3.136229379083822, + "language_loss": 0.79386795, + "learning_rate": 3.7590060348573066e-06, + "loss": 0.81545991, + "num_input_tokens_seen": 65827230, + "step": 3046, + "time_per_iteration": 2.734269142150879 + }, + { + "auxiliary_loss_clip": 0.0107584, + "auxiliary_loss_mlp": 0.01043654, + "balance_loss_clip": 1.03300166, + "balance_loss_mlp": 1.02590382, + "epoch": 0.18319555087930256, + "flos": 21032952065280.0, + "grad_norm": 1.7955774375699447, + "language_loss": 0.78708231, + "learning_rate": 3.7588206592989903e-06, + "loss": 0.80827725, + "num_input_tokens_seen": 65845900, + "step": 3047, + "time_per_iteration": 2.8413877487182617 + }, + { + "auxiliary_loss_clip": 0.01100498, + "auxiliary_loss_mlp": 0.01045585, + "balance_loss_clip": 1.03883266, + "balance_loss_mlp": 1.02883577, + "epoch": 0.18325567413197055, + "flos": 34382958428160.0, + "grad_norm": 1.485324170024271, + "language_loss": 0.80848831, + "learning_rate": 3.7586352170460194e-06, + "loss": 0.82994914, + "num_input_tokens_seen": 65868730, + "step": 3048, + "time_per_iteration": 2.7292141914367676 + }, + { + "auxiliary_loss_clip": 0.01090638, + "auxiliary_loss_mlp": 0.0104292, + "balance_loss_clip": 1.03450298, + "balance_loss_mlp": 1.02520585, + "epoch": 0.18331579738463852, + "flos": 20558464381440.0, + "grad_norm": 3.3986045548320227, + "language_loss": 0.85682589, + "learning_rate": 3.758449708105424e-06, + "loss": 0.87816149, + "num_input_tokens_seen": 65888420, + "step": 3049, + "time_per_iteration": 2.694544553756714 + }, + { + "auxiliary_loss_clip": 0.01096598, + "auxiliary_loss_mlp": 0.0104376, + "balance_loss_clip": 1.0365634, + "balance_loss_mlp": 1.02435315, + "epoch": 0.18337592063730648, + "flos": 19607872901760.0, + "grad_norm": 2.3755498523313046, + "language_loss": 0.77160931, + "learning_rate": 3.75826413248424e-06, + "loss": 0.79301286, + "num_input_tokens_seen": 65905840, + "step": 3050, + "time_per_iteration": 2.703836679458618 + }, + { + "auxiliary_loss_clip": 0.01079292, + "auxiliary_loss_mlp": 0.01040587, + "balance_loss_clip": 1.03133607, + "balance_loss_mlp": 1.02391016, + "epoch": 0.18343604388997445, + "flos": 20850885002880.0, + "grad_norm": 2.019431404651282, + "language_loss": 0.99758172, + "learning_rate": 3.7580784901895035e-06, + "loss": 1.01878047, + "num_input_tokens_seen": 65922845, + "step": 3051, + "time_per_iteration": 2.6278390884399414 + }, + { + "auxiliary_loss_clip": 0.01076334, + "auxiliary_loss_mlp": 0.01037346, + "balance_loss_clip": 1.03284979, + "balance_loss_mlp": 1.01942873, + "epoch": 0.1834961671426424, + "flos": 24394370624640.0, + "grad_norm": 1.4794478374084217, + "language_loss": 0.86193031, + "learning_rate": 3.7578927812282542e-06, + "loss": 0.88306713, + "num_input_tokens_seen": 65945555, + "step": 3052, + "time_per_iteration": 2.713686943054199 + }, + { + "auxiliary_loss_clip": 0.01107798, + "auxiliary_loss_mlp": 0.01044063, + "balance_loss_clip": 1.03590059, + "balance_loss_mlp": 1.02719486, + "epoch": 0.18355629039531038, + "flos": 21251612108160.0, + "grad_norm": 2.0431936630285796, + "language_loss": 0.7357015, + "learning_rate": 3.7577070056075356e-06, + "loss": 0.75722009, + "num_input_tokens_seen": 65963965, + "step": 3053, + "time_per_iteration": 2.580491542816162 + }, + { + "auxiliary_loss_clip": 0.01111759, + "auxiliary_loss_mlp": 0.01044455, + "balance_loss_clip": 1.03792787, + "balance_loss_mlp": 1.02703881, + "epoch": 0.18361641364797834, + "flos": 28656499651200.0, + "grad_norm": 1.8689796857300136, + "language_loss": 0.61589777, + "learning_rate": 3.7575211633343902e-06, + "loss": 0.63745999, + "num_input_tokens_seen": 65985965, + "step": 3054, + "time_per_iteration": 2.653799057006836 + }, + { + "auxiliary_loss_clip": 0.01068653, + "auxiliary_loss_mlp": 0.01047315, + "balance_loss_clip": 1.03636825, + "balance_loss_mlp": 1.03063786, + "epoch": 0.18367653690064634, + "flos": 20918827578240.0, + "grad_norm": 2.1711596440959617, + "language_loss": 0.78079033, + "learning_rate": 3.7573352544158663e-06, + "loss": 0.80194998, + "num_input_tokens_seen": 66005645, + "step": 3055, + "time_per_iteration": 2.8834617137908936 + }, + { + "auxiliary_loss_clip": 0.01053094, + "auxiliary_loss_mlp": 0.01052464, + "balance_loss_clip": 1.03094721, + "balance_loss_mlp": 1.03459406, + "epoch": 0.1837366601533143, + "flos": 28765596234240.0, + "grad_norm": 1.8825498456285839, + "language_loss": 0.70377433, + "learning_rate": 3.757149278859014e-06, + "loss": 0.72482991, + "num_input_tokens_seen": 66025675, + "step": 3056, + "time_per_iteration": 2.7236897945404053 + }, + { + "auxiliary_loss_clip": 0.0110014, + "auxiliary_loss_mlp": 0.0103837, + "balance_loss_clip": 1.036605, + "balance_loss_mlp": 1.02226508, + "epoch": 0.18379678340598227, + "flos": 21251432540160.0, + "grad_norm": 1.5155234552884391, + "language_loss": 0.80221933, + "learning_rate": 3.7569632366708842e-06, + "loss": 0.82360446, + "num_input_tokens_seen": 66046125, + "step": 3057, + "time_per_iteration": 4.210191011428833 + }, + { + "auxiliary_loss_clip": 0.01103292, + "auxiliary_loss_mlp": 0.01044843, + "balance_loss_clip": 1.03373694, + "balance_loss_mlp": 1.02518582, + "epoch": 0.18385690665865023, + "flos": 20449619193600.0, + "grad_norm": 2.080836047733498, + "language_loss": 0.8226738, + "learning_rate": 3.756777127858533e-06, + "loss": 0.84415519, + "num_input_tokens_seen": 66064375, + "step": 3058, + "time_per_iteration": 2.592301845550537 + }, + { + "auxiliary_loss_clip": 0.01070154, + "auxiliary_loss_mlp": 0.00748835, + "balance_loss_clip": 1.03123093, + "balance_loss_mlp": 1.00056982, + "epoch": 0.1839170299113182, + "flos": 26140562398080.0, + "grad_norm": 2.338757434743973, + "language_loss": 0.85670358, + "learning_rate": 3.756590952429017e-06, + "loss": 0.87489349, + "num_input_tokens_seen": 66084590, + "step": 3059, + "time_per_iteration": 2.743638515472412 + }, + { + "auxiliary_loss_clip": 0.01107068, + "auxiliary_loss_mlp": 0.00748747, + "balance_loss_clip": 1.03468215, + "balance_loss_mlp": 1.00060606, + "epoch": 0.18397715316398616, + "flos": 31758032332800.0, + "grad_norm": 1.6322225766999174, + "language_loss": 0.7255379, + "learning_rate": 3.756404710389396e-06, + "loss": 0.74409604, + "num_input_tokens_seen": 66107105, + "step": 3060, + "time_per_iteration": 2.6629955768585205 + }, + { + "auxiliary_loss_clip": 0.01103086, + "auxiliary_loss_mlp": 0.01041807, + "balance_loss_clip": 1.03743756, + "balance_loss_mlp": 1.0236156, + "epoch": 0.18403727641665413, + "flos": 24611989173120.0, + "grad_norm": 1.6784141907123087, + "language_loss": 0.72973561, + "learning_rate": 3.7562184017467323e-06, + "loss": 0.75118458, + "num_input_tokens_seen": 66129295, + "step": 3061, + "time_per_iteration": 4.548817157745361 + }, + { + "auxiliary_loss_clip": 0.01090466, + "auxiliary_loss_mlp": 0.01049001, + "balance_loss_clip": 1.0354743, + "balance_loss_mlp": 1.03035688, + "epoch": 0.18409739966932212, + "flos": 23439900476160.0, + "grad_norm": 1.7026593529624061, + "language_loss": 0.81580675, + "learning_rate": 3.7560320265080906e-06, + "loss": 0.83720136, + "num_input_tokens_seen": 66146910, + "step": 3062, + "time_per_iteration": 2.658405065536499 + }, + { + "auxiliary_loss_clip": 0.01100651, + "auxiliary_loss_mlp": 0.01042713, + "balance_loss_clip": 1.03692138, + "balance_loss_mlp": 1.02493942, + "epoch": 0.18415752292199009, + "flos": 21872112577920.0, + "grad_norm": 1.8891882006637117, + "language_loss": 0.7320118, + "learning_rate": 3.7558455846805383e-06, + "loss": 0.75344551, + "num_input_tokens_seen": 66165370, + "step": 3063, + "time_per_iteration": 2.542003870010376 + }, + { + "auxiliary_loss_clip": 0.01093844, + "auxiliary_loss_mlp": 0.01037676, + "balance_loss_clip": 1.03477716, + "balance_loss_mlp": 1.02228642, + "epoch": 0.18421764617465805, + "flos": 25410678036480.0, + "grad_norm": 1.930768128666993, + "language_loss": 0.65816063, + "learning_rate": 3.7556590762711463e-06, + "loss": 0.67947578, + "num_input_tokens_seen": 66186210, + "step": 3064, + "time_per_iteration": 2.6836555004119873 + }, + { + "auxiliary_loss_clip": 0.01097093, + "auxiliary_loss_mlp": 0.01043455, + "balance_loss_clip": 1.03557253, + "balance_loss_mlp": 1.02618146, + "epoch": 0.18427776942732602, + "flos": 27198131558400.0, + "grad_norm": 3.9003715196044193, + "language_loss": 0.68926966, + "learning_rate": 3.7554725012869853e-06, + "loss": 0.71067512, + "num_input_tokens_seen": 66204800, + "step": 3065, + "time_per_iteration": 4.205069541931152 + }, + { + "auxiliary_loss_clip": 0.0109187, + "auxiliary_loss_mlp": 0.01042524, + "balance_loss_clip": 1.03567255, + "balance_loss_mlp": 1.02409399, + "epoch": 0.18433789267999398, + "flos": 27852351920640.0, + "grad_norm": 2.016640340049062, + "language_loss": 0.72937775, + "learning_rate": 3.7552858597351318e-06, + "loss": 0.75072169, + "num_input_tokens_seen": 66222195, + "step": 3066, + "time_per_iteration": 2.6873836517333984 + }, + { + "auxiliary_loss_clip": 0.01076234, + "auxiliary_loss_mlp": 0.01036269, + "balance_loss_clip": 1.03193343, + "balance_loss_mlp": 1.01951981, + "epoch": 0.18439801593266195, + "flos": 17856940533120.0, + "grad_norm": 2.307564950778079, + "language_loss": 0.81516051, + "learning_rate": 3.7550991516226622e-06, + "loss": 0.83628553, + "num_input_tokens_seen": 66239505, + "step": 3067, + "time_per_iteration": 4.172776222229004 + }, + { + "auxiliary_loss_clip": 0.01036502, + "auxiliary_loss_mlp": 0.0074817, + "balance_loss_clip": 1.01698029, + "balance_loss_mlp": 1.00038362, + "epoch": 0.18445813918532994, + "flos": 56389522590720.0, + "grad_norm": 0.7883109094473385, + "language_loss": 0.59727955, + "learning_rate": 3.754912376956657e-06, + "loss": 0.61512625, + "num_input_tokens_seen": 66295695, + "step": 3068, + "time_per_iteration": 3.0234599113464355 + }, + { + "auxiliary_loss_clip": 0.01090916, + "auxiliary_loss_mlp": 0.01040881, + "balance_loss_clip": 1.04198074, + "balance_loss_mlp": 1.02363157, + "epoch": 0.1845182624379979, + "flos": 20957180325120.0, + "grad_norm": 1.8130596261282192, + "language_loss": 0.7624824, + "learning_rate": 3.7547255357441987e-06, + "loss": 0.78380036, + "num_input_tokens_seen": 66315315, + "step": 3069, + "time_per_iteration": 2.6990177631378174 + }, + { + "auxiliary_loss_clip": 0.01097301, + "auxiliary_loss_mlp": 0.01043627, + "balance_loss_clip": 1.03283453, + "balance_loss_mlp": 1.0259726, + "epoch": 0.18457838569066587, + "flos": 20485170679680.0, + "grad_norm": 1.7244461366089683, + "language_loss": 0.8507781, + "learning_rate": 3.7545386279923718e-06, + "loss": 0.87218738, + "num_input_tokens_seen": 66333675, + "step": 3070, + "time_per_iteration": 2.6576614379882812 + }, + { + "auxiliary_loss_clip": 0.01080436, + "auxiliary_loss_mlp": 0.01044395, + "balance_loss_clip": 1.03468108, + "balance_loss_mlp": 1.02626312, + "epoch": 0.18463850894333383, + "flos": 25010022758400.0, + "grad_norm": 2.443945146143619, + "language_loss": 0.77430868, + "learning_rate": 3.754351653708265e-06, + "loss": 0.79555702, + "num_input_tokens_seen": 66354075, + "step": 3071, + "time_per_iteration": 2.7638306617736816 + }, + { + "auxiliary_loss_clip": 0.01073183, + "auxiliary_loss_mlp": 0.01055511, + "balance_loss_clip": 1.03870511, + "balance_loss_mlp": 1.03714132, + "epoch": 0.1846986321960018, + "flos": 16800628348800.0, + "grad_norm": 3.160827921861565, + "language_loss": 0.77346939, + "learning_rate": 3.7541646128989674e-06, + "loss": 0.79475629, + "num_input_tokens_seen": 66372520, + "step": 3072, + "time_per_iteration": 2.680543899536133 + }, + { + "auxiliary_loss_clip": 0.01091388, + "auxiliary_loss_mlp": 0.01044973, + "balance_loss_clip": 1.03350616, + "balance_loss_mlp": 1.02622163, + "epoch": 0.18475875544866976, + "flos": 20814327936000.0, + "grad_norm": 2.27265622605345, + "language_loss": 0.86436677, + "learning_rate": 3.7539775055715715e-06, + "loss": 0.8857305, + "num_input_tokens_seen": 66390745, + "step": 3073, + "time_per_iteration": 2.635680675506592 + }, + { + "auxiliary_loss_clip": 0.01112783, + "auxiliary_loss_mlp": 0.01040807, + "balance_loss_clip": 1.03857565, + "balance_loss_mlp": 1.02463007, + "epoch": 0.18481887870133773, + "flos": 22601422321920.0, + "grad_norm": 2.098673064328515, + "language_loss": 0.92305112, + "learning_rate": 3.7537903317331732e-06, + "loss": 0.94458699, + "num_input_tokens_seen": 66410525, + "step": 3074, + "time_per_iteration": 2.6038923263549805 + }, + { + "auxiliary_loss_clip": 0.01049712, + "auxiliary_loss_mlp": 0.01044747, + "balance_loss_clip": 1.02802086, + "balance_loss_mlp": 1.02536333, + "epoch": 0.18487900195400572, + "flos": 29458815788160.0, + "grad_norm": 1.707714770225554, + "language_loss": 0.64976156, + "learning_rate": 3.75360309139087e-06, + "loss": 0.67070609, + "num_input_tokens_seen": 66432535, + "step": 3075, + "time_per_iteration": 2.7379043102264404 + }, + { + "auxiliary_loss_clip": 0.01091454, + "auxiliary_loss_mlp": 0.01048582, + "balance_loss_clip": 1.03720498, + "balance_loss_mlp": 1.03146386, + "epoch": 0.1849391252066737, + "flos": 20628777254400.0, + "grad_norm": 1.719680810389596, + "language_loss": 0.72574484, + "learning_rate": 3.753415784551761e-06, + "loss": 0.74714518, + "num_input_tokens_seen": 66450620, + "step": 3076, + "time_per_iteration": 2.5933468341827393 + }, + { + "auxiliary_loss_clip": 0.01087161, + "auxiliary_loss_mlp": 0.01046387, + "balance_loss_clip": 1.04277968, + "balance_loss_mlp": 1.02851772, + "epoch": 0.18499924845934165, + "flos": 14428549065600.0, + "grad_norm": 2.0865012945070376, + "language_loss": 0.80885112, + "learning_rate": 3.7532284112229507e-06, + "loss": 0.83018661, + "num_input_tokens_seen": 66467865, + "step": 3077, + "time_per_iteration": 2.674839973449707 + }, + { + "auxiliary_loss_clip": 0.01087888, + "auxiliary_loss_mlp": 0.01044905, + "balance_loss_clip": 1.03734946, + "balance_loss_mlp": 1.02776301, + "epoch": 0.18505937171200962, + "flos": 23727652329600.0, + "grad_norm": 1.7412449345400642, + "language_loss": 0.78760612, + "learning_rate": 3.7530409714115424e-06, + "loss": 0.80893397, + "num_input_tokens_seen": 66486245, + "step": 3078, + "time_per_iteration": 2.6305150985717773 + }, + { + "auxiliary_loss_clip": 0.01109836, + "auxiliary_loss_mlp": 0.0104302, + "balance_loss_clip": 1.0370543, + "balance_loss_mlp": 1.02649736, + "epoch": 0.18511949496467758, + "flos": 25957489754880.0, + "grad_norm": 2.146837130000241, + "language_loss": 0.77578735, + "learning_rate": 3.7528534651246453e-06, + "loss": 0.7973159, + "num_input_tokens_seen": 66506510, + "step": 3079, + "time_per_iteration": 2.6370975971221924 + }, + { + "auxiliary_loss_clip": 0.01077525, + "auxiliary_loss_mlp": 0.01037986, + "balance_loss_clip": 1.03333974, + "balance_loss_mlp": 1.02087927, + "epoch": 0.18517961821734555, + "flos": 42413553912960.0, + "grad_norm": 1.5918708766417766, + "language_loss": 0.81640732, + "learning_rate": 3.752665892369369e-06, + "loss": 0.83756244, + "num_input_tokens_seen": 66530960, + "step": 3080, + "time_per_iteration": 2.839545726776123 + }, + { + "auxiliary_loss_clip": 0.0107287, + "auxiliary_loss_mlp": 0.01044704, + "balance_loss_clip": 1.03435576, + "balance_loss_mlp": 1.02638125, + "epoch": 0.18523974147001354, + "flos": 24097568544000.0, + "grad_norm": 2.048957219049279, + "language_loss": 0.74359798, + "learning_rate": 3.7524782531528266e-06, + "loss": 0.76477373, + "num_input_tokens_seen": 66550275, + "step": 3081, + "time_per_iteration": 2.7199554443359375 + }, + { + "auxiliary_loss_clip": 0.01090979, + "auxiliary_loss_mlp": 0.01048819, + "balance_loss_clip": 1.03995585, + "balance_loss_mlp": 1.03040147, + "epoch": 0.1852998647226815, + "flos": 27375278457600.0, + "grad_norm": 2.182999563928667, + "language_loss": 0.71879959, + "learning_rate": 3.7522905474821334e-06, + "loss": 0.74019754, + "num_input_tokens_seen": 66569040, + "step": 3082, + "time_per_iteration": 2.7119436264038086 + }, + { + "auxiliary_loss_clip": 0.01083997, + "auxiliary_loss_mlp": 0.01045125, + "balance_loss_clip": 1.03860688, + "balance_loss_mlp": 1.02683866, + "epoch": 0.18535998797534947, + "flos": 18332757020160.0, + "grad_norm": 2.044246169396123, + "language_loss": 0.69432914, + "learning_rate": 3.752102775364407e-06, + "loss": 0.71562034, + "num_input_tokens_seen": 66587775, + "step": 3083, + "time_per_iteration": 2.7121963500976562 + }, + { + "auxiliary_loss_clip": 0.01079694, + "auxiliary_loss_mlp": 0.01042594, + "balance_loss_clip": 1.03422213, + "balance_loss_mlp": 1.02572608, + "epoch": 0.18542011122801744, + "flos": 37845859887360.0, + "grad_norm": 5.110216423776397, + "language_loss": 0.68366146, + "learning_rate": 3.751914936806767e-06, + "loss": 0.70488429, + "num_input_tokens_seen": 66610800, + "step": 3084, + "time_per_iteration": 2.787545680999756 + }, + { + "auxiliary_loss_clip": 0.01109086, + "auxiliary_loss_mlp": 0.01038002, + "balance_loss_clip": 1.03578329, + "balance_loss_mlp": 1.02151513, + "epoch": 0.1854802344806854, + "flos": 25186128163200.0, + "grad_norm": 1.6292125434582414, + "language_loss": 0.77817178, + "learning_rate": 3.7517270318163377e-06, + "loss": 0.79964262, + "num_input_tokens_seen": 66630960, + "step": 3085, + "time_per_iteration": 2.6030142307281494 + }, + { + "auxiliary_loss_clip": 0.01107683, + "auxiliary_loss_mlp": 0.01045356, + "balance_loss_clip": 1.034917, + "balance_loss_mlp": 1.02853537, + "epoch": 0.18554035773335337, + "flos": 26684788337280.0, + "grad_norm": 2.0060351736297033, + "language_loss": 0.73401058, + "learning_rate": 3.751539060400244e-06, + "loss": 0.75554091, + "num_input_tokens_seen": 66650585, + "step": 3086, + "time_per_iteration": 2.6310510635375977 + }, + { + "auxiliary_loss_clip": 0.01098156, + "auxiliary_loss_mlp": 0.01044847, + "balance_loss_clip": 1.03661871, + "balance_loss_mlp": 1.02682281, + "epoch": 0.18560048098602133, + "flos": 22346887570560.0, + "grad_norm": 2.90228195025199, + "language_loss": 0.69555432, + "learning_rate": 3.7513510225656132e-06, + "loss": 0.71698433, + "num_input_tokens_seen": 66670045, + "step": 3087, + "time_per_iteration": 2.6040573120117188 + }, + { + "auxiliary_loss_clip": 0.01069166, + "auxiliary_loss_mlp": 0.0104804, + "balance_loss_clip": 1.0352155, + "balance_loss_mlp": 1.02946687, + "epoch": 0.18566060423868933, + "flos": 17748526308480.0, + "grad_norm": 2.60337137462876, + "language_loss": 0.7245478, + "learning_rate": 3.7511629183195764e-06, + "loss": 0.74571991, + "num_input_tokens_seen": 66688790, + "step": 3088, + "time_per_iteration": 2.686263084411621 + }, + { + "auxiliary_loss_clip": 0.01077485, + "auxiliary_loss_mlp": 0.01038512, + "balance_loss_clip": 1.03169584, + "balance_loss_mlp": 1.02220392, + "epoch": 0.1857207274913573, + "flos": 24677274142080.0, + "grad_norm": 1.825852306722914, + "language_loss": 0.92252445, + "learning_rate": 3.7509747476692663e-06, + "loss": 0.9436844, + "num_input_tokens_seen": 66708090, + "step": 3089, + "time_per_iteration": 2.646122455596924 + }, + { + "auxiliary_loss_clip": 0.01052496, + "auxiliary_loss_mlp": 0.01045035, + "balance_loss_clip": 1.03054595, + "balance_loss_mlp": 1.02823877, + "epoch": 0.18578085074402526, + "flos": 28147825198080.0, + "grad_norm": 2.839577674095756, + "language_loss": 0.57394469, + "learning_rate": 3.7507865106218176e-06, + "loss": 0.59491998, + "num_input_tokens_seen": 66727320, + "step": 3090, + "time_per_iteration": 2.825593948364258 + }, + { + "auxiliary_loss_clip": 0.0107562, + "auxiliary_loss_mlp": 0.01045614, + "balance_loss_clip": 1.03090787, + "balance_loss_mlp": 1.02860284, + "epoch": 0.18584097399669322, + "flos": 23951878980480.0, + "grad_norm": 2.224248297196478, + "language_loss": 0.8159157, + "learning_rate": 3.7505982071843695e-06, + "loss": 0.83712804, + "num_input_tokens_seen": 66747505, + "step": 3091, + "time_per_iteration": 2.7518091201782227 + }, + { + "auxiliary_loss_clip": 0.0105421, + "auxiliary_loss_mlp": 0.01049958, + "balance_loss_clip": 1.03419495, + "balance_loss_mlp": 1.03204107, + "epoch": 0.18590109724936119, + "flos": 17201678676480.0, + "grad_norm": 2.651229042223476, + "language_loss": 0.84440899, + "learning_rate": 3.7504098373640617e-06, + "loss": 0.86545062, + "num_input_tokens_seen": 66766425, + "step": 3092, + "time_per_iteration": 2.8699803352355957 + }, + { + "auxiliary_loss_clip": 0.01086932, + "auxiliary_loss_mlp": 0.01044437, + "balance_loss_clip": 1.03281653, + "balance_loss_mlp": 1.02653122, + "epoch": 0.18596122050202915, + "flos": 17234644383360.0, + "grad_norm": 2.368626460158139, + "language_loss": 0.92904294, + "learning_rate": 3.750221401168038e-06, + "loss": 0.9503566, + "num_input_tokens_seen": 66781130, + "step": 3093, + "time_per_iteration": 2.773148536682129 + }, + { + "auxiliary_loss_clip": 0.01080581, + "auxiliary_loss_mlp": 0.01040439, + "balance_loss_clip": 1.03564942, + "balance_loss_mlp": 1.02357125, + "epoch": 0.18602134375469712, + "flos": 19020733188480.0, + "grad_norm": 2.1924200809283017, + "language_loss": 0.77562541, + "learning_rate": 3.750032898603443e-06, + "loss": 0.79683566, + "num_input_tokens_seen": 66797535, + "step": 3094, + "time_per_iteration": 2.900209426879883 + }, + { + "auxiliary_loss_clip": 0.0105564, + "auxiliary_loss_mlp": 0.01043695, + "balance_loss_clip": 1.03458595, + "balance_loss_mlp": 1.02774477, + "epoch": 0.1860814670073651, + "flos": 50950094417280.0, + "grad_norm": 1.738784472981599, + "language_loss": 0.70208794, + "learning_rate": 3.749844329677425e-06, + "loss": 0.72308129, + "num_input_tokens_seen": 66821720, + "step": 3095, + "time_per_iteration": 3.317453622817993 + }, + { + "auxiliary_loss_clip": 0.01066219, + "auxiliary_loss_mlp": 0.01046178, + "balance_loss_clip": 1.03346467, + "balance_loss_mlp": 1.02667499, + "epoch": 0.18614159026003307, + "flos": 19390972625280.0, + "grad_norm": 1.8899497577895268, + "language_loss": 0.81241107, + "learning_rate": 3.749655694397135e-06, + "loss": 0.83353502, + "num_input_tokens_seen": 66839060, + "step": 3096, + "time_per_iteration": 2.8432629108428955 + }, + { + "auxiliary_loss_clip": 0.01097562, + "auxiliary_loss_mlp": 0.01040925, + "balance_loss_clip": 1.03494596, + "balance_loss_mlp": 1.02381825, + "epoch": 0.18620171351270104, + "flos": 21798782962560.0, + "grad_norm": 2.1557984456428825, + "language_loss": 0.75138426, + "learning_rate": 3.7494669927697255e-06, + "loss": 0.77276909, + "num_input_tokens_seen": 66857760, + "step": 3097, + "time_per_iteration": 2.6357808113098145 + }, + { + "auxiliary_loss_clip": 0.01087821, + "auxiliary_loss_mlp": 0.01046626, + "balance_loss_clip": 1.0370568, + "balance_loss_mlp": 1.02944791, + "epoch": 0.186261836765369, + "flos": 16362877299840.0, + "grad_norm": 2.334497116459061, + "language_loss": 0.6590507, + "learning_rate": 3.749278224802352e-06, + "loss": 0.68039519, + "num_input_tokens_seen": 66876460, + "step": 3098, + "time_per_iteration": 2.705333948135376 + }, + { + "auxiliary_loss_clip": 0.01110801, + "auxiliary_loss_mlp": 0.01048096, + "balance_loss_clip": 1.03572595, + "balance_loss_mlp": 1.02934384, + "epoch": 0.18632196001803697, + "flos": 23370054480000.0, + "grad_norm": 1.6123443798250243, + "language_loss": 0.69696891, + "learning_rate": 3.7490893905021733e-06, + "loss": 0.71855789, + "num_input_tokens_seen": 66897960, + "step": 3099, + "time_per_iteration": 2.7188522815704346 + }, + { + "auxiliary_loss_clip": 0.01098895, + "auxiliary_loss_mlp": 0.01050078, + "balance_loss_clip": 1.03510547, + "balance_loss_mlp": 1.03212523, + "epoch": 0.18638208327070493, + "flos": 22492002516480.0, + "grad_norm": 1.4924791402205153, + "language_loss": 0.71839952, + "learning_rate": 3.7489004898763494e-06, + "loss": 0.73988926, + "num_input_tokens_seen": 66917675, + "step": 3100, + "time_per_iteration": 2.726377248764038 + }, + { + "auxiliary_loss_clip": 0.01084389, + "auxiliary_loss_mlp": 0.01050096, + "balance_loss_clip": 1.03327107, + "balance_loss_mlp": 1.03141582, + "epoch": 0.18644220652337293, + "flos": 29165245931520.0, + "grad_norm": 1.7681071650842655, + "language_loss": 0.80354321, + "learning_rate": 3.7487115229320444e-06, + "loss": 0.82488805, + "num_input_tokens_seen": 66936000, + "step": 3101, + "time_per_iteration": 2.8762171268463135 + }, + { + "auxiliary_loss_clip": 0.01068902, + "auxiliary_loss_mlp": 0.01047846, + "balance_loss_clip": 1.03587794, + "balance_loss_mlp": 1.03097773, + "epoch": 0.1865023297760409, + "flos": 24243796811520.0, + "grad_norm": 1.8124618754497863, + "language_loss": 0.76720762, + "learning_rate": 3.7485224896764222e-06, + "loss": 0.78837514, + "num_input_tokens_seen": 66955700, + "step": 3102, + "time_per_iteration": 2.840177059173584 + }, + { + "auxiliary_loss_clip": 0.01101244, + "auxiliary_loss_mlp": 0.01041443, + "balance_loss_clip": 1.03499496, + "balance_loss_mlp": 1.02376485, + "epoch": 0.18656245302870886, + "flos": 19128716449920.0, + "grad_norm": 2.065029786735788, + "language_loss": 0.76831186, + "learning_rate": 3.7483333901166525e-06, + "loss": 0.78973871, + "num_input_tokens_seen": 66972815, + "step": 3103, + "time_per_iteration": 2.713676691055298 + }, + { + "auxiliary_loss_clip": 0.01090985, + "auxiliary_loss_mlp": 0.0105227, + "balance_loss_clip": 1.03752959, + "balance_loss_mlp": 1.03497326, + "epoch": 0.18662257628137682, + "flos": 17786088956160.0, + "grad_norm": 3.1260759092878128, + "language_loss": 0.79346645, + "learning_rate": 3.7481442242599054e-06, + "loss": 0.81489897, + "num_input_tokens_seen": 66992280, + "step": 3104, + "time_per_iteration": 2.75673508644104 + }, + { + "auxiliary_loss_clip": 0.01055804, + "auxiliary_loss_mlp": 0.01051989, + "balance_loss_clip": 1.03222346, + "balance_loss_mlp": 1.03403592, + "epoch": 0.1866826995340448, + "flos": 24024382583040.0, + "grad_norm": 1.8728624989975267, + "language_loss": 0.85276395, + "learning_rate": 3.747954992113354e-06, + "loss": 0.87384188, + "num_input_tokens_seen": 67012220, + "step": 3105, + "time_per_iteration": 4.452714443206787 + }, + { + "auxiliary_loss_clip": 0.01076095, + "auxiliary_loss_mlp": 0.01048522, + "balance_loss_clip": 1.03338945, + "balance_loss_mlp": 1.02888787, + "epoch": 0.18674282278671275, + "flos": 26141244756480.0, + "grad_norm": 1.8002805876955041, + "language_loss": 0.86853272, + "learning_rate": 3.7477656936841742e-06, + "loss": 0.88977885, + "num_input_tokens_seen": 67032030, + "step": 3106, + "time_per_iteration": 2.745670795440674 + }, + { + "auxiliary_loss_clip": 0.01097895, + "auxiliary_loss_mlp": 0.01043377, + "balance_loss_clip": 1.03485465, + "balance_loss_mlp": 1.02566254, + "epoch": 0.18680294603938072, + "flos": 19201938324480.0, + "grad_norm": 1.8537872295855127, + "language_loss": 0.77746129, + "learning_rate": 3.7475763289795445e-06, + "loss": 0.79887402, + "num_input_tokens_seen": 67048920, + "step": 3107, + "time_per_iteration": 2.7135980129241943 + }, + { + "auxiliary_loss_clip": 0.01104245, + "auxiliary_loss_mlp": 0.01046812, + "balance_loss_clip": 1.03679883, + "balance_loss_mlp": 1.02773845, + "epoch": 0.1868630692920487, + "flos": 28544889116160.0, + "grad_norm": 1.873953617178891, + "language_loss": 0.73989642, + "learning_rate": 3.7473868980066446e-06, + "loss": 0.7614069, + "num_input_tokens_seen": 67068645, + "step": 3108, + "time_per_iteration": 4.3371498584747314 + }, + { + "auxiliary_loss_clip": 0.01062067, + "auxiliary_loss_mlp": 0.01045575, + "balance_loss_clip": 1.03297007, + "balance_loss_mlp": 1.02812243, + "epoch": 0.18692319254471668, + "flos": 17238020261760.0, + "grad_norm": 1.6019221029054445, + "language_loss": 0.74358046, + "learning_rate": 3.747197400772658e-06, + "loss": 0.7646569, + "num_input_tokens_seen": 67087075, + "step": 3109, + "time_per_iteration": 2.8592023849487305 + }, + { + "auxiliary_loss_clip": 0.01097094, + "auxiliary_loss_mlp": 0.01049467, + "balance_loss_clip": 1.03336143, + "balance_loss_mlp": 1.03124034, + "epoch": 0.18698331579738464, + "flos": 23185186156800.0, + "grad_norm": 1.3951624824968447, + "language_loss": 0.84358639, + "learning_rate": 3.747007837284772e-06, + "loss": 0.86505204, + "num_input_tokens_seen": 67108040, + "step": 3110, + "time_per_iteration": 2.846590518951416 + }, + { + "auxiliary_loss_clip": 0.01101572, + "auxiliary_loss_mlp": 0.0103984, + "balance_loss_clip": 1.03870177, + "balance_loss_mlp": 1.0221796, + "epoch": 0.1870434390500526, + "flos": 25516721963520.0, + "grad_norm": 1.530612632527994, + "language_loss": 0.8463515, + "learning_rate": 3.7468182075501737e-06, + "loss": 0.86776561, + "num_input_tokens_seen": 67127605, + "step": 3111, + "time_per_iteration": 2.71266770362854 + }, + { + "auxiliary_loss_clip": 0.01084982, + "auxiliary_loss_mlp": 0.01036707, + "balance_loss_clip": 1.03503108, + "balance_loss_mlp": 1.01976728, + "epoch": 0.18710356230272057, + "flos": 19500823393920.0, + "grad_norm": 1.7776674705987918, + "language_loss": 0.76647878, + "learning_rate": 3.7466285115760536e-06, + "loss": 0.78769571, + "num_input_tokens_seen": 67145785, + "step": 3112, + "time_per_iteration": 4.2325921058654785 + }, + { + "auxiliary_loss_clip": 0.01100257, + "auxiliary_loss_mlp": 0.01043906, + "balance_loss_clip": 1.0352273, + "balance_loss_mlp": 1.02698994, + "epoch": 0.18716368555538854, + "flos": 26760847386240.0, + "grad_norm": 2.2465817991541743, + "language_loss": 0.6481992, + "learning_rate": 3.7464387493696046e-06, + "loss": 0.66964078, + "num_input_tokens_seen": 67165930, + "step": 3113, + "time_per_iteration": 4.26839804649353 + }, + { + "auxiliary_loss_clip": 0.01103221, + "auxiliary_loss_mlp": 0.01041903, + "balance_loss_clip": 1.03649902, + "balance_loss_mlp": 1.02409279, + "epoch": 0.1872238088080565, + "flos": 25189827264000.0, + "grad_norm": 2.107177404034876, + "language_loss": 0.81428182, + "learning_rate": 3.746248920938024e-06, + "loss": 0.83573306, + "num_input_tokens_seen": 67185830, + "step": 3114, + "time_per_iteration": 2.696362257003784 + }, + { + "auxiliary_loss_clip": 0.01059455, + "auxiliary_loss_mlp": 0.01050146, + "balance_loss_clip": 1.03267527, + "balance_loss_mlp": 1.03114355, + "epoch": 0.1872839320607245, + "flos": 24134305178880.0, + "grad_norm": 2.31549124575947, + "language_loss": 0.57840437, + "learning_rate": 3.74605902628851e-06, + "loss": 0.59950036, + "num_input_tokens_seen": 67206930, + "step": 3115, + "time_per_iteration": 2.965304136276245 + }, + { + "auxiliary_loss_clip": 0.01071863, + "auxiliary_loss_mlp": 0.01053416, + "balance_loss_clip": 1.03572011, + "balance_loss_mlp": 1.03470004, + "epoch": 0.18734405531339246, + "flos": 21173793292800.0, + "grad_norm": 1.8450221805151843, + "language_loss": 0.71349162, + "learning_rate": 3.745869065428261e-06, + "loss": 0.73474437, + "num_input_tokens_seen": 67226290, + "step": 3116, + "time_per_iteration": 2.851580858230591 + }, + { + "auxiliary_loss_clip": 0.01104189, + "auxiliary_loss_mlp": 0.01032169, + "balance_loss_clip": 1.03273916, + "balance_loss_mlp": 1.01531267, + "epoch": 0.18740417856606043, + "flos": 17237697039360.0, + "grad_norm": 1.9844873133600376, + "language_loss": 0.78903925, + "learning_rate": 3.7456790383644833e-06, + "loss": 0.81040287, + "num_input_tokens_seen": 67244410, + "step": 3117, + "time_per_iteration": 2.727642774581909 + }, + { + "auxiliary_loss_clip": 0.01090469, + "auxiliary_loss_mlp": 0.01045553, + "balance_loss_clip": 1.03725243, + "balance_loss_mlp": 1.0277431, + "epoch": 0.1874643018187284, + "flos": 32558049999360.0, + "grad_norm": 1.6040742326933284, + "language_loss": 0.84041893, + "learning_rate": 3.745488945104381e-06, + "loss": 0.86177915, + "num_input_tokens_seen": 67264470, + "step": 3118, + "time_per_iteration": 2.8996522426605225 + }, + { + "auxiliary_loss_clip": 0.01100149, + "auxiliary_loss_mlp": 0.01048981, + "balance_loss_clip": 1.03634501, + "balance_loss_mlp": 1.03200555, + "epoch": 0.18752442507139636, + "flos": 23258156636160.0, + "grad_norm": 1.8695590693229427, + "language_loss": 0.76591241, + "learning_rate": 3.7452987856551636e-06, + "loss": 0.7874037, + "num_input_tokens_seen": 67284315, + "step": 3119, + "time_per_iteration": 2.695936918258667 + }, + { + "auxiliary_loss_clip": 0.01110113, + "auxiliary_loss_mlp": 0.01048197, + "balance_loss_clip": 1.03561485, + "balance_loss_mlp": 1.03153157, + "epoch": 0.18758454832406432, + "flos": 21760933006080.0, + "grad_norm": 1.6260892233748896, + "language_loss": 0.82065356, + "learning_rate": 3.7451085600240406e-06, + "loss": 0.84223658, + "num_input_tokens_seen": 67302780, + "step": 3120, + "time_per_iteration": 2.605287790298462 + }, + { + "auxiliary_loss_clip": 0.01083352, + "auxiliary_loss_mlp": 0.01037682, + "balance_loss_clip": 1.03192413, + "balance_loss_mlp": 1.02179193, + "epoch": 0.1876446715767323, + "flos": 29570210841600.0, + "grad_norm": 1.6508735730530841, + "language_loss": 0.8516106, + "learning_rate": 3.7449182682182263e-06, + "loss": 0.87282091, + "num_input_tokens_seen": 67323405, + "step": 3121, + "time_per_iteration": 2.734083414077759 + }, + { + "auxiliary_loss_clip": 0.01046187, + "auxiliary_loss_mlp": 0.01042271, + "balance_loss_clip": 1.02887821, + "balance_loss_mlp": 1.02511668, + "epoch": 0.18770479482940028, + "flos": 30339992234880.0, + "grad_norm": 1.7624128779634156, + "language_loss": 0.69967532, + "learning_rate": 3.744727910244937e-06, + "loss": 0.72055995, + "num_input_tokens_seen": 67345800, + "step": 3122, + "time_per_iteration": 2.864678382873535 + }, + { + "auxiliary_loss_clip": 0.01107689, + "auxiliary_loss_mlp": 0.01046734, + "balance_loss_clip": 1.03490579, + "balance_loss_mlp": 1.02808988, + "epoch": 0.18776491808206824, + "flos": 14465357527680.0, + "grad_norm": 2.2294235149532704, + "language_loss": 0.70500058, + "learning_rate": 3.7445374861113905e-06, + "loss": 0.72654486, + "num_input_tokens_seen": 67363575, + "step": 3123, + "time_per_iteration": 2.6046555042266846 + }, + { + "auxiliary_loss_clip": 0.01095124, + "auxiliary_loss_mlp": 0.01043444, + "balance_loss_clip": 1.03465247, + "balance_loss_mlp": 1.02735114, + "epoch": 0.1878250413347362, + "flos": 24498547044480.0, + "grad_norm": 1.7986987219866721, + "language_loss": 0.73998749, + "learning_rate": 3.7443469958248066e-06, + "loss": 0.76137322, + "num_input_tokens_seen": 67381765, + "step": 3124, + "time_per_iteration": 2.620858907699585 + }, + { + "auxiliary_loss_clip": 0.01109514, + "auxiliary_loss_mlp": 0.01046141, + "balance_loss_clip": 1.03519225, + "balance_loss_mlp": 1.02811658, + "epoch": 0.18788516458740417, + "flos": 39786185692800.0, + "grad_norm": 1.8046215156172283, + "language_loss": 0.80814767, + "learning_rate": 3.7441564393924106e-06, + "loss": 0.82970423, + "num_input_tokens_seen": 67405000, + "step": 3125, + "time_per_iteration": 2.7099010944366455 + }, + { + "auxiliary_loss_clip": 0.0098815, + "auxiliary_loss_mlp": 0.01003545, + "balance_loss_clip": 1.00753033, + "balance_loss_mlp": 1.0008868, + "epoch": 0.18794528784007214, + "flos": 64699250664960.0, + "grad_norm": 0.9357483528502916, + "language_loss": 0.63666725, + "learning_rate": 3.7439658168214273e-06, + "loss": 0.65658414, + "num_input_tokens_seen": 67467140, + "step": 3126, + "time_per_iteration": 3.373117208480835 + }, + { + "auxiliary_loss_clip": 0.01086083, + "auxiliary_loss_mlp": 0.01041513, + "balance_loss_clip": 1.03687334, + "balance_loss_mlp": 1.02521682, + "epoch": 0.1880054110927401, + "flos": 28622061486720.0, + "grad_norm": 1.6117139588478684, + "language_loss": 0.81489491, + "learning_rate": 3.7437751281190857e-06, + "loss": 0.83617085, + "num_input_tokens_seen": 67487980, + "step": 3127, + "time_per_iteration": 2.720189094543457 + }, + { + "auxiliary_loss_clip": 0.01027249, + "auxiliary_loss_mlp": 0.01004325, + "balance_loss_clip": 1.00730324, + "balance_loss_mlp": 1.00202405, + "epoch": 0.1880655343454081, + "flos": 64488958490880.0, + "grad_norm": 0.7696489961278946, + "language_loss": 0.61885798, + "learning_rate": 3.7435843732926164e-06, + "loss": 0.63917375, + "num_input_tokens_seen": 67552500, + "step": 3128, + "time_per_iteration": 3.2579872608184814 + }, + { + "auxiliary_loss_clip": 0.0105843, + "auxiliary_loss_mlp": 0.01046883, + "balance_loss_clip": 1.03152251, + "balance_loss_mlp": 1.02819133, + "epoch": 0.18812565759807606, + "flos": 32124464928000.0, + "grad_norm": 2.029327489194398, + "language_loss": 0.71650892, + "learning_rate": 3.7433935523492536e-06, + "loss": 0.73756206, + "num_input_tokens_seen": 67573295, + "step": 3129, + "time_per_iteration": 2.829149007797241 + }, + { + "auxiliary_loss_clip": 0.01109202, + "auxiliary_loss_mlp": 0.01046694, + "balance_loss_clip": 1.03526378, + "balance_loss_mlp": 1.02968264, + "epoch": 0.18818578085074403, + "flos": 20624539449600.0, + "grad_norm": 1.8452425180378007, + "language_loss": 0.85358357, + "learning_rate": 3.7432026652962314e-06, + "loss": 0.87514257, + "num_input_tokens_seen": 67590010, + "step": 3130, + "time_per_iteration": 2.574023723602295 + }, + { + "auxiliary_loss_clip": 0.01065438, + "auxiliary_loss_mlp": 0.01048248, + "balance_loss_clip": 1.03324592, + "balance_loss_mlp": 1.0298059, + "epoch": 0.188245904103412, + "flos": 28840506048000.0, + "grad_norm": 1.7603720487755312, + "language_loss": 0.7670455, + "learning_rate": 3.7430117121407897e-06, + "loss": 0.78818232, + "num_input_tokens_seen": 67611110, + "step": 3131, + "time_per_iteration": 2.8444085121154785 + }, + { + "auxiliary_loss_clip": 0.01077174, + "auxiliary_loss_mlp": 0.01045932, + "balance_loss_clip": 1.0354048, + "balance_loss_mlp": 1.02814579, + "epoch": 0.18830602735607996, + "flos": 29420319386880.0, + "grad_norm": 1.827773663763364, + "language_loss": 0.81415689, + "learning_rate": 3.74282069289017e-06, + "loss": 0.835388, + "num_input_tokens_seen": 67631990, + "step": 3132, + "time_per_iteration": 2.8582890033721924 + }, + { + "auxiliary_loss_clip": 0.01047249, + "auxiliary_loss_mlp": 0.00748738, + "balance_loss_clip": 1.03121924, + "balance_loss_mlp": 1.00068831, + "epoch": 0.18836615060874792, + "flos": 28872933050880.0, + "grad_norm": 1.794654353935064, + "language_loss": 0.79821646, + "learning_rate": 3.742629607551614e-06, + "loss": 0.8161763, + "num_input_tokens_seen": 67650490, + "step": 3133, + "time_per_iteration": 2.847461462020874 + }, + { + "auxiliary_loss_clip": 0.0107693, + "auxiliary_loss_mlp": 0.0105016, + "balance_loss_clip": 1.03831458, + "balance_loss_mlp": 1.03217149, + "epoch": 0.18842627386141592, + "flos": 22601673717120.0, + "grad_norm": 1.826264486381365, + "language_loss": 0.82831794, + "learning_rate": 3.7424384561323698e-06, + "loss": 0.84958887, + "num_input_tokens_seen": 67668860, + "step": 3134, + "time_per_iteration": 2.7763078212738037 + }, + { + "auxiliary_loss_clip": 0.01081502, + "auxiliary_loss_mlp": 0.01047281, + "balance_loss_clip": 1.03208244, + "balance_loss_mlp": 1.03029323, + "epoch": 0.18848639711408388, + "flos": 24573600512640.0, + "grad_norm": 1.63985563078546, + "language_loss": 0.83102763, + "learning_rate": 3.742247238639684e-06, + "loss": 0.85231549, + "num_input_tokens_seen": 67690220, + "step": 3135, + "time_per_iteration": 2.8832285404205322 + }, + { + "auxiliary_loss_clip": 0.01098873, + "auxiliary_loss_mlp": 0.01041424, + "balance_loss_clip": 1.03563941, + "balance_loss_mlp": 1.02457929, + "epoch": 0.18854652036675185, + "flos": 34166920078080.0, + "grad_norm": 1.9311388710881348, + "language_loss": 0.78625095, + "learning_rate": 3.7420559550808083e-06, + "loss": 0.8076539, + "num_input_tokens_seen": 67709820, + "step": 3136, + "time_per_iteration": 2.702589273452759 + }, + { + "auxiliary_loss_clip": 0.01084544, + "auxiliary_loss_mlp": 0.01044494, + "balance_loss_clip": 1.03457296, + "balance_loss_mlp": 1.02651715, + "epoch": 0.1886066436194198, + "flos": 24200236592640.0, + "grad_norm": 1.8610468588601097, + "language_loss": 0.81273484, + "learning_rate": 3.741864605462996e-06, + "loss": 0.83402526, + "num_input_tokens_seen": 67729490, + "step": 3137, + "time_per_iteration": 2.7104458808898926 + }, + { + "auxiliary_loss_clip": 0.01111914, + "auxiliary_loss_mlp": 0.01040884, + "balance_loss_clip": 1.03987837, + "balance_loss_mlp": 1.02524352, + "epoch": 0.18866676687208778, + "flos": 21251109317760.0, + "grad_norm": 1.9282839629225355, + "language_loss": 0.81110871, + "learning_rate": 3.741673189793504e-06, + "loss": 0.83263671, + "num_input_tokens_seen": 67749665, + "step": 3138, + "time_per_iteration": 2.5544047355651855 + }, + { + "auxiliary_loss_clip": 0.01101388, + "auxiliary_loss_mlp": 0.01049812, + "balance_loss_clip": 1.03648734, + "balance_loss_mlp": 1.03231204, + "epoch": 0.18872689012475574, + "flos": 37308673013760.0, + "grad_norm": 2.2425853939281866, + "language_loss": 0.63567346, + "learning_rate": 3.7414817080795896e-06, + "loss": 0.65718544, + "num_input_tokens_seen": 67776230, + "step": 3139, + "time_per_iteration": 2.7604970932006836 + }, + { + "auxiliary_loss_clip": 0.0110811, + "auxiliary_loss_mlp": 0.01039155, + "balance_loss_clip": 1.03492928, + "balance_loss_mlp": 1.02203608, + "epoch": 0.1887870133774237, + "flos": 21652303299840.0, + "grad_norm": 2.0185135431765224, + "language_loss": 0.71666861, + "learning_rate": 3.741290160328514e-06, + "loss": 0.73814118, + "num_input_tokens_seen": 67795080, + "step": 3140, + "time_per_iteration": 2.7092745304107666 + }, + { + "auxiliary_loss_clip": 0.01109776, + "auxiliary_loss_mlp": 0.01043634, + "balance_loss_clip": 1.03580689, + "balance_loss_mlp": 1.02516901, + "epoch": 0.1888471366300917, + "flos": 15924659374080.0, + "grad_norm": 5.503642137188039, + "language_loss": 0.86694169, + "learning_rate": 3.7410985465475412e-06, + "loss": 0.88847578, + "num_input_tokens_seen": 67813110, + "step": 3141, + "time_per_iteration": 2.562562942504883 + }, + { + "auxiliary_loss_clip": 0.01081658, + "auxiliary_loss_mlp": 0.01040408, + "balance_loss_clip": 1.0340004, + "balance_loss_mlp": 1.02219296, + "epoch": 0.18890725988275966, + "flos": 18551955767040.0, + "grad_norm": 1.729412347043978, + "language_loss": 0.77069694, + "learning_rate": 3.7409068667439378e-06, + "loss": 0.79191756, + "num_input_tokens_seen": 67831070, + "step": 3142, + "time_per_iteration": 2.715684175491333 + }, + { + "auxiliary_loss_clip": 0.0108831, + "auxiliary_loss_mlp": 0.01041129, + "balance_loss_clip": 1.03764844, + "balance_loss_mlp": 1.02585816, + "epoch": 0.18896738313542763, + "flos": 28840865184000.0, + "grad_norm": 1.6893888246403992, + "language_loss": 0.78198528, + "learning_rate": 3.740715120924971e-06, + "loss": 0.80327964, + "num_input_tokens_seen": 67852170, + "step": 3143, + "time_per_iteration": 2.853677988052368 + }, + { + "auxiliary_loss_clip": 0.01072046, + "auxiliary_loss_mlp": 0.01047896, + "balance_loss_clip": 1.03281581, + "balance_loss_mlp": 1.0306108, + "epoch": 0.1890275063880956, + "flos": 22412747157120.0, + "grad_norm": 2.3157618658768424, + "language_loss": 0.71642029, + "learning_rate": 3.740523309097912e-06, + "loss": 0.73761964, + "num_input_tokens_seen": 67869945, + "step": 3144, + "time_per_iteration": 2.6585819721221924 + }, + { + "auxiliary_loss_clip": 0.01076673, + "auxiliary_loss_mlp": 0.01046996, + "balance_loss_clip": 1.03267503, + "balance_loss_mlp": 1.02856672, + "epoch": 0.18908762964076356, + "flos": 24243904552320.0, + "grad_norm": 2.218216376854763, + "language_loss": 0.73613447, + "learning_rate": 3.7403314312700356e-06, + "loss": 0.75737113, + "num_input_tokens_seen": 67890240, + "step": 3145, + "time_per_iteration": 2.702101945877075 + }, + { + "auxiliary_loss_clip": 0.01059283, + "auxiliary_loss_mlp": 0.01040768, + "balance_loss_clip": 1.0283823, + "balance_loss_mlp": 1.02424538, + "epoch": 0.18914775289343153, + "flos": 16982910892800.0, + "grad_norm": 2.0644843715556718, + "language_loss": 0.7636857, + "learning_rate": 3.740139487448616e-06, + "loss": 0.78468621, + "num_input_tokens_seen": 67907825, + "step": 3146, + "time_per_iteration": 2.7012646198272705 + }, + { + "auxiliary_loss_clip": 0.01043895, + "auxiliary_loss_mlp": 0.0104949, + "balance_loss_clip": 1.02782989, + "balance_loss_mlp": 1.03146517, + "epoch": 0.1892078761460995, + "flos": 21543781334400.0, + "grad_norm": 2.037155087493872, + "language_loss": 0.78530401, + "learning_rate": 3.7399474776409326e-06, + "loss": 0.80623794, + "num_input_tokens_seen": 67926670, + "step": 3147, + "time_per_iteration": 2.848998546600342 + }, + { + "auxiliary_loss_clip": 0.01098726, + "auxiliary_loss_mlp": 0.01044518, + "balance_loss_clip": 1.03629625, + "balance_loss_mlp": 1.02789998, + "epoch": 0.18926799939876748, + "flos": 23001538896000.0, + "grad_norm": 2.290091172653627, + "language_loss": 0.6612922, + "learning_rate": 3.739755401854267e-06, + "loss": 0.6827246, + "num_input_tokens_seen": 67943645, + "step": 3148, + "time_per_iteration": 2.617905616760254 + }, + { + "auxiliary_loss_clip": 0.01063473, + "auxiliary_loss_mlp": 0.01037959, + "balance_loss_clip": 1.02874923, + "balance_loss_mlp": 1.02093577, + "epoch": 0.18932812265143545, + "flos": 22273019251200.0, + "grad_norm": 2.9920317941335965, + "language_loss": 0.75848156, + "learning_rate": 3.739563260095902e-06, + "loss": 0.77949584, + "num_input_tokens_seen": 67962345, + "step": 3149, + "time_per_iteration": 2.7667272090911865 + }, + { + "auxiliary_loss_clip": 0.0108368, + "auxiliary_loss_mlp": 0.01046562, + "balance_loss_clip": 1.03561711, + "balance_loss_mlp": 1.03049278, + "epoch": 0.1893882459041034, + "flos": 18624423456000.0, + "grad_norm": 2.4370298743808596, + "language_loss": 0.80353636, + "learning_rate": 3.7393710523731245e-06, + "loss": 0.82483876, + "num_input_tokens_seen": 67979760, + "step": 3150, + "time_per_iteration": 2.770165205001831 + }, + { + "auxiliary_loss_clip": 0.01088178, + "auxiliary_loss_mlp": 0.01053301, + "balance_loss_clip": 1.03557634, + "balance_loss_mlp": 1.03597963, + "epoch": 0.18944836915677138, + "flos": 22892981016960.0, + "grad_norm": 1.9902912690389811, + "language_loss": 0.84387338, + "learning_rate": 3.7391787786932215e-06, + "loss": 0.86528826, + "num_input_tokens_seen": 67996895, + "step": 3151, + "time_per_iteration": 2.632974624633789 + }, + { + "auxiliary_loss_clip": 0.01077289, + "auxiliary_loss_mlp": 0.01047529, + "balance_loss_clip": 1.03522706, + "balance_loss_mlp": 1.03100634, + "epoch": 0.18950849240943934, + "flos": 26796542526720.0, + "grad_norm": 1.7351113058675693, + "language_loss": 0.74545848, + "learning_rate": 3.7389864390634857e-06, + "loss": 0.76670665, + "num_input_tokens_seen": 68018365, + "step": 3152, + "time_per_iteration": 4.533084154129028 + }, + { + "auxiliary_loss_clip": 0.01068687, + "auxiliary_loss_mlp": 0.01046349, + "balance_loss_clip": 1.0315243, + "balance_loss_mlp": 1.0285393, + "epoch": 0.1895686156621073, + "flos": 24971239048320.0, + "grad_norm": 2.024780091718936, + "language_loss": 0.75491911, + "learning_rate": 3.738794033491209e-06, + "loss": 0.7760694, + "num_input_tokens_seen": 68037985, + "step": 3153, + "time_per_iteration": 2.8312606811523438 + }, + { + "auxiliary_loss_clip": 0.01111115, + "auxiliary_loss_mlp": 0.01043357, + "balance_loss_clip": 1.0368228, + "balance_loss_mlp": 1.02648854, + "epoch": 0.1896287389147753, + "flos": 21944544353280.0, + "grad_norm": 2.017003658652727, + "language_loss": 0.78887326, + "learning_rate": 3.7386015619836887e-06, + "loss": 0.81041801, + "num_input_tokens_seen": 68057975, + "step": 3154, + "time_per_iteration": 2.700709342956543 + }, + { + "auxiliary_loss_clip": 0.01067661, + "auxiliary_loss_mlp": 0.01048681, + "balance_loss_clip": 1.03078151, + "balance_loss_mlp": 1.03019118, + "epoch": 0.18968886216744327, + "flos": 18179058723840.0, + "grad_norm": 2.5878717690264845, + "language_loss": 0.72380185, + "learning_rate": 3.738409024548223e-06, + "loss": 0.74496526, + "num_input_tokens_seen": 68074175, + "step": 3155, + "time_per_iteration": 2.772304058074951 + }, + { + "auxiliary_loss_clip": 0.01081578, + "auxiliary_loss_mlp": 0.01041693, + "balance_loss_clip": 1.03304064, + "balance_loss_mlp": 1.02529025, + "epoch": 0.18974898542011123, + "flos": 20412487509120.0, + "grad_norm": 1.7011008195353952, + "language_loss": 0.74191135, + "learning_rate": 3.7382164211921136e-06, + "loss": 0.76314408, + "num_input_tokens_seen": 68095230, + "step": 3156, + "time_per_iteration": 4.386704921722412 + }, + { + "auxiliary_loss_clip": 0.01108948, + "auxiliary_loss_mlp": 0.01044268, + "balance_loss_clip": 1.03587937, + "balance_loss_mlp": 1.02805603, + "epoch": 0.1898091086727792, + "flos": 23985024255360.0, + "grad_norm": 2.0492869456995604, + "language_loss": 0.68017352, + "learning_rate": 3.7380237519226623e-06, + "loss": 0.70170569, + "num_input_tokens_seen": 68113805, + "step": 3157, + "time_per_iteration": 2.6145143508911133 + }, + { + "auxiliary_loss_clip": 0.010714, + "auxiliary_loss_mlp": 0.01037225, + "balance_loss_clip": 1.03129542, + "balance_loss_mlp": 1.02032077, + "epoch": 0.18986923192544716, + "flos": 27637067756160.0, + "grad_norm": 1.7002105241066396, + "language_loss": 0.79923451, + "learning_rate": 3.737831016747176e-06, + "loss": 0.82032073, + "num_input_tokens_seen": 68133190, + "step": 3158, + "time_per_iteration": 2.7302463054656982 + }, + { + "auxiliary_loss_clip": 0.01113416, + "auxiliary_loss_mlp": 0.01042044, + "balance_loss_clip": 1.03748167, + "balance_loss_mlp": 1.02411532, + "epoch": 0.18992935517811513, + "flos": 25484151306240.0, + "grad_norm": 1.6985838237908908, + "language_loss": 0.72425985, + "learning_rate": 3.737638215672964e-06, + "loss": 0.74581444, + "num_input_tokens_seen": 68152330, + "step": 3159, + "time_per_iteration": 4.400169849395752 + }, + { + "auxiliary_loss_clip": 0.01099817, + "auxiliary_loss_mlp": 0.01046466, + "balance_loss_clip": 1.03659785, + "balance_loss_mlp": 1.02884698, + "epoch": 0.1899894784307831, + "flos": 17420805596160.0, + "grad_norm": 2.015383330247822, + "language_loss": 0.85477847, + "learning_rate": 3.7374453487073366e-06, + "loss": 0.87624127, + "num_input_tokens_seen": 68170185, + "step": 3160, + "time_per_iteration": 4.203662872314453 + }, + { + "auxiliary_loss_clip": 0.01081509, + "auxiliary_loss_mlp": 0.01047602, + "balance_loss_clip": 1.03347397, + "balance_loss_mlp": 1.03172326, + "epoch": 0.19004960168345109, + "flos": 27492240119040.0, + "grad_norm": 2.0454859931950464, + "language_loss": 0.73669076, + "learning_rate": 3.7372524158576074e-06, + "loss": 0.75798196, + "num_input_tokens_seen": 68191665, + "step": 3161, + "time_per_iteration": 2.6436028480529785 + }, + { + "auxiliary_loss_clip": 0.01087249, + "auxiliary_loss_mlp": 0.01051999, + "balance_loss_clip": 1.03356314, + "balance_loss_mlp": 1.03468955, + "epoch": 0.19010972493611905, + "flos": 38654676385920.0, + "grad_norm": 1.5666939609695762, + "language_loss": 0.80758053, + "learning_rate": 3.7370594171310926e-06, + "loss": 0.828973, + "num_input_tokens_seen": 68214635, + "step": 3162, + "time_per_iteration": 2.714617967605591 + }, + { + "auxiliary_loss_clip": 0.01109405, + "auxiliary_loss_mlp": 0.01041112, + "balance_loss_clip": 1.03585887, + "balance_loss_mlp": 1.02392244, + "epoch": 0.19016984818878702, + "flos": 19244744357760.0, + "grad_norm": 2.5378183860216024, + "language_loss": 0.75184333, + "learning_rate": 3.73686635253511e-06, + "loss": 0.77334857, + "num_input_tokens_seen": 68232150, + "step": 3163, + "time_per_iteration": 2.5103821754455566 + }, + { + "auxiliary_loss_clip": 0.01055487, + "auxiliary_loss_mlp": 0.01045412, + "balance_loss_clip": 1.03299296, + "balance_loss_mlp": 1.02795935, + "epoch": 0.19022997144145498, + "flos": 37596891744000.0, + "grad_norm": 1.4697397069343667, + "language_loss": 0.74143857, + "learning_rate": 3.736673222076982e-06, + "loss": 0.76244754, + "num_input_tokens_seen": 68253370, + "step": 3164, + "time_per_iteration": 2.8030500411987305 + }, + { + "auxiliary_loss_clip": 0.01095947, + "auxiliary_loss_mlp": 0.01035476, + "balance_loss_clip": 1.03537905, + "balance_loss_mlp": 1.01832128, + "epoch": 0.19029009469412295, + "flos": 61530921665280.0, + "grad_norm": 1.3721246803781357, + "language_loss": 0.66796988, + "learning_rate": 3.7364800257640313e-06, + "loss": 0.68928409, + "num_input_tokens_seen": 68278895, + "step": 3165, + "time_per_iteration": 2.9141104221343994 + }, + { + "auxiliary_loss_clip": 0.01096514, + "auxiliary_loss_mlp": 0.01045371, + "balance_loss_clip": 1.03583169, + "balance_loss_mlp": 1.02676225, + "epoch": 0.1903502179467909, + "flos": 13954851480960.0, + "grad_norm": 2.1756459556862193, + "language_loss": 0.7422722, + "learning_rate": 3.7362867636035835e-06, + "loss": 0.76369107, + "num_input_tokens_seen": 68294880, + "step": 3166, + "time_per_iteration": 2.5320208072662354 + }, + { + "auxiliary_loss_clip": 0.00997571, + "auxiliary_loss_mlp": 0.01030683, + "balance_loss_clip": 1.00880659, + "balance_loss_mlp": 1.02819109, + "epoch": 0.1904103411994589, + "flos": 66899641916160.0, + "grad_norm": 0.933021685509284, + "language_loss": 0.5041126, + "learning_rate": 3.736093435602968e-06, + "loss": 0.52439517, + "num_input_tokens_seen": 68359665, + "step": 3167, + "time_per_iteration": 3.332904815673828 + }, + { + "auxiliary_loss_clip": 0.01095634, + "auxiliary_loss_mlp": 0.01044989, + "balance_loss_clip": 1.03638554, + "balance_loss_mlp": 1.0284369, + "epoch": 0.19047046445212687, + "flos": 21908741472000.0, + "grad_norm": 1.8462644349828985, + "language_loss": 0.7429418, + "learning_rate": 3.7359000417695156e-06, + "loss": 0.76434803, + "num_input_tokens_seen": 68378950, + "step": 3168, + "time_per_iteration": 2.6800472736358643 + }, + { + "auxiliary_loss_clip": 0.00990814, + "auxiliary_loss_mlp": 0.01023294, + "balance_loss_clip": 1.01104164, + "balance_loss_mlp": 1.02038515, + "epoch": 0.19053058770479483, + "flos": 59255156701440.0, + "grad_norm": 0.8678135367081571, + "language_loss": 0.60023272, + "learning_rate": 3.73570658211056e-06, + "loss": 0.62037385, + "num_input_tokens_seen": 68434235, + "step": 3169, + "time_per_iteration": 3.2373902797698975 + }, + { + "auxiliary_loss_clip": 0.01062384, + "auxiliary_loss_mlp": 0.01049129, + "balance_loss_clip": 1.04033017, + "balance_loss_mlp": 1.03196263, + "epoch": 0.1905907109574628, + "flos": 23951304362880.0, + "grad_norm": 1.537294437822698, + "language_loss": 0.78375441, + "learning_rate": 3.735513056633436e-06, + "loss": 0.80486959, + "num_input_tokens_seen": 68453830, + "step": 3170, + "time_per_iteration": 2.989030599594116 + }, + { + "auxiliary_loss_clip": 0.0109603, + "auxiliary_loss_mlp": 0.01043236, + "balance_loss_clip": 1.03505373, + "balance_loss_mlp": 1.02643895, + "epoch": 0.19065083421013077, + "flos": 20812316774400.0, + "grad_norm": 2.432313389063624, + "language_loss": 0.7832166, + "learning_rate": 3.7353194653454834e-06, + "loss": 0.8046093, + "num_input_tokens_seen": 68473005, + "step": 3171, + "time_per_iteration": 2.70268177986145 + }, + { + "auxiliary_loss_clip": 0.01109849, + "auxiliary_loss_mlp": 0.01047754, + "balance_loss_clip": 1.03570735, + "balance_loss_mlp": 1.02981281, + "epoch": 0.19071095746279873, + "flos": 31284981192960.0, + "grad_norm": 1.9194787259015367, + "language_loss": 0.78672487, + "learning_rate": 3.7351258082540426e-06, + "loss": 0.80830091, + "num_input_tokens_seen": 68493470, + "step": 3172, + "time_per_iteration": 2.677872657775879 + }, + { + "auxiliary_loss_clip": 0.01095373, + "auxiliary_loss_mlp": 0.0105521, + "balance_loss_clip": 1.03402209, + "balance_loss_mlp": 1.03853226, + "epoch": 0.1907710807154667, + "flos": 14356117290240.0, + "grad_norm": 1.4964628794772985, + "language_loss": 0.80142522, + "learning_rate": 3.7349320853664576e-06, + "loss": 0.82293105, + "num_input_tokens_seen": 68511290, + "step": 3173, + "time_per_iteration": 2.628077745437622 + }, + { + "auxiliary_loss_clip": 0.01063604, + "auxiliary_loss_mlp": 0.00748962, + "balance_loss_clip": 1.03251815, + "balance_loss_mlp": 1.00099277, + "epoch": 0.1908312039681347, + "flos": 26907039740160.0, + "grad_norm": 1.7219726743324408, + "language_loss": 0.78532887, + "learning_rate": 3.7347382966900735e-06, + "loss": 0.80345452, + "num_input_tokens_seen": 68532575, + "step": 3174, + "time_per_iteration": 2.7708542346954346 + }, + { + "auxiliary_loss_clip": 0.01059419, + "auxiliary_loss_mlp": 0.01057211, + "balance_loss_clip": 1.03156292, + "balance_loss_mlp": 1.04011619, + "epoch": 0.19089132722080265, + "flos": 14494695960960.0, + "grad_norm": 2.3430888209691303, + "language_loss": 0.80957818, + "learning_rate": 3.7345444422322395e-06, + "loss": 0.8307445, + "num_input_tokens_seen": 68548760, + "step": 3175, + "time_per_iteration": 2.7465012073516846 + }, + { + "auxiliary_loss_clip": 0.01030464, + "auxiliary_loss_mlp": 0.01053035, + "balance_loss_clip": 1.02678442, + "balance_loss_mlp": 1.03541589, + "epoch": 0.19095145047347062, + "flos": 13952876232960.0, + "grad_norm": 1.9005031725127375, + "language_loss": 0.85807961, + "learning_rate": 3.7343505220003067e-06, + "loss": 0.87891459, + "num_input_tokens_seen": 68563100, + "step": 3176, + "time_per_iteration": 2.7078857421875 + }, + { + "auxiliary_loss_clip": 0.01089812, + "auxiliary_loss_mlp": 0.01059404, + "balance_loss_clip": 1.03806496, + "balance_loss_mlp": 1.04072404, + "epoch": 0.19101157372613858, + "flos": 25301832848640.0, + "grad_norm": 1.881901571437743, + "language_loss": 0.81613731, + "learning_rate": 3.7341565360016285e-06, + "loss": 0.8376295, + "num_input_tokens_seen": 68581650, + "step": 3177, + "time_per_iteration": 2.815142869949341 + }, + { + "auxiliary_loss_clip": 0.01070929, + "auxiliary_loss_mlp": 0.01053162, + "balance_loss_clip": 1.02996492, + "balance_loss_mlp": 1.0360198, + "epoch": 0.19107169697880655, + "flos": 20558212986240.0, + "grad_norm": 2.8976300332432774, + "language_loss": 0.7508167, + "learning_rate": 3.73396248424356e-06, + "loss": 0.77205765, + "num_input_tokens_seen": 68600360, + "step": 3178, + "time_per_iteration": 2.641263008117676 + }, + { + "auxiliary_loss_clip": 0.0109707, + "auxiliary_loss_mlp": 0.0104368, + "balance_loss_clip": 1.03478408, + "balance_loss_mlp": 1.02707374, + "epoch": 0.19113182023147451, + "flos": 22163204396160.0, + "grad_norm": 2.451833215802686, + "language_loss": 0.81471038, + "learning_rate": 3.7337683667334606e-06, + "loss": 0.83611786, + "num_input_tokens_seen": 68617885, + "step": 3179, + "time_per_iteration": 2.7308006286621094 + }, + { + "auxiliary_loss_clip": 0.01101145, + "auxiliary_loss_mlp": 0.01050333, + "balance_loss_clip": 1.03853095, + "balance_loss_mlp": 1.03363204, + "epoch": 0.19119194348414248, + "flos": 18581796990720.0, + "grad_norm": 2.566894046500242, + "language_loss": 0.79468, + "learning_rate": 3.733574183478691e-06, + "loss": 0.81619483, + "num_input_tokens_seen": 68634550, + "step": 3180, + "time_per_iteration": 2.6683542728424072 + }, + { + "auxiliary_loss_clip": 0.01083807, + "auxiliary_loss_mlp": 0.01053024, + "balance_loss_clip": 1.03476644, + "balance_loss_mlp": 1.03540468, + "epoch": 0.19125206673681047, + "flos": 19026623018880.0, + "grad_norm": 2.311269892709618, + "language_loss": 0.79525542, + "learning_rate": 3.733379934486615e-06, + "loss": 0.81662369, + "num_input_tokens_seen": 68651895, + "step": 3181, + "time_per_iteration": 2.863680362701416 + }, + { + "auxiliary_loss_clip": 0.0109976, + "auxiliary_loss_mlp": 0.01049478, + "balance_loss_clip": 1.03683078, + "balance_loss_mlp": 1.03282452, + "epoch": 0.19131218998947844, + "flos": 21690153256320.0, + "grad_norm": 1.829288311511835, + "language_loss": 0.73927534, + "learning_rate": 3.7331856197645973e-06, + "loss": 0.7607677, + "num_input_tokens_seen": 68671500, + "step": 3182, + "time_per_iteration": 2.6242847442626953 + }, + { + "auxiliary_loss_clip": 0.0108358, + "auxiliary_loss_mlp": 0.01041858, + "balance_loss_clip": 1.03823185, + "balance_loss_mlp": 1.02477551, + "epoch": 0.1913723132421464, + "flos": 18442500048000.0, + "grad_norm": 1.7976801447290207, + "language_loss": 0.64804375, + "learning_rate": 3.7329912393200084e-06, + "loss": 0.66929817, + "num_input_tokens_seen": 68690570, + "step": 3183, + "time_per_iteration": 2.6969122886657715 + }, + { + "auxiliary_loss_clip": 0.01084721, + "auxiliary_loss_mlp": 0.01047002, + "balance_loss_clip": 1.03344989, + "balance_loss_mlp": 1.02901363, + "epoch": 0.19143243649481437, + "flos": 27160102033920.0, + "grad_norm": 1.5741538349495463, + "language_loss": 0.73561305, + "learning_rate": 3.7327967931602173e-06, + "loss": 0.75693035, + "num_input_tokens_seen": 68709735, + "step": 3184, + "time_per_iteration": 2.680219888687134 + }, + { + "auxiliary_loss_clip": 0.01073851, + "auxiliary_loss_mlp": 0.01048037, + "balance_loss_clip": 1.03207707, + "balance_loss_mlp": 1.02904701, + "epoch": 0.19149255974748233, + "flos": 21718952985600.0, + "grad_norm": 1.8185178342031751, + "language_loss": 0.88096273, + "learning_rate": 3.732602281292598e-06, + "loss": 0.90218163, + "num_input_tokens_seen": 68727565, + "step": 3185, + "time_per_iteration": 2.7139999866485596 + }, + { + "auxiliary_loss_clip": 0.01107131, + "auxiliary_loss_mlp": 0.01040053, + "balance_loss_clip": 1.03633821, + "balance_loss_mlp": 1.02277923, + "epoch": 0.1915526830001503, + "flos": 22963293889920.0, + "grad_norm": 2.038663605157027, + "language_loss": 0.73094022, + "learning_rate": 3.7324077037245267e-06, + "loss": 0.75241208, + "num_input_tokens_seen": 68748110, + "step": 3186, + "time_per_iteration": 2.564085006713867 + }, + { + "auxiliary_loss_clip": 0.01089199, + "auxiliary_loss_mlp": 0.01044342, + "balance_loss_clip": 1.03683591, + "balance_loss_mlp": 1.02455306, + "epoch": 0.1916128062528183, + "flos": 26140741966080.0, + "grad_norm": 1.819110713555502, + "language_loss": 0.8338927, + "learning_rate": 3.7322130604633825e-06, + "loss": 0.85522813, + "num_input_tokens_seen": 68769765, + "step": 3187, + "time_per_iteration": 2.6744091510772705 + }, + { + "auxiliary_loss_clip": 0.01017475, + "auxiliary_loss_mlp": 0.01038697, + "balance_loss_clip": 1.00842953, + "balance_loss_mlp": 1.03605103, + "epoch": 0.19167292950548626, + "flos": 54925767457920.0, + "grad_norm": 0.8571672670102661, + "language_loss": 0.55877674, + "learning_rate": 3.732018351516544e-06, + "loss": 0.57933843, + "num_input_tokens_seen": 68826815, + "step": 3188, + "time_per_iteration": 3.1780059337615967 + }, + { + "auxiliary_loss_clip": 0.01095852, + "auxiliary_loss_mlp": 0.01048844, + "balance_loss_clip": 1.03644872, + "balance_loss_mlp": 1.03166628, + "epoch": 0.19173305275815422, + "flos": 29935601942400.0, + "grad_norm": 1.7265340119853627, + "language_loss": 0.6990335, + "learning_rate": 3.731823576891397e-06, + "loss": 0.7204805, + "num_input_tokens_seen": 68847585, + "step": 3189, + "time_per_iteration": 2.715411901473999 + }, + { + "auxiliary_loss_clip": 0.01079558, + "auxiliary_loss_mlp": 0.01031289, + "balance_loss_clip": 1.03360045, + "balance_loss_mlp": 1.01574385, + "epoch": 0.1917931760108222, + "flos": 24752471264640.0, + "grad_norm": 1.8786282957111538, + "language_loss": 0.74225444, + "learning_rate": 3.7316287365953266e-06, + "loss": 0.76336288, + "num_input_tokens_seen": 68866620, + "step": 3190, + "time_per_iteration": 2.6602694988250732 + }, + { + "auxiliary_loss_clip": 0.01063767, + "auxiliary_loss_mlp": 0.01052293, + "balance_loss_clip": 1.03177989, + "balance_loss_mlp": 1.03456616, + "epoch": 0.19185329926349015, + "flos": 18843550375680.0, + "grad_norm": 2.08068963293269, + "language_loss": 0.84504342, + "learning_rate": 3.73143383063572e-06, + "loss": 0.86620402, + "num_input_tokens_seen": 68885515, + "step": 3191, + "time_per_iteration": 2.781773805618286 + }, + { + "auxiliary_loss_clip": 0.01080667, + "auxiliary_loss_mlp": 0.01037407, + "balance_loss_clip": 1.03195763, + "balance_loss_mlp": 1.02152836, + "epoch": 0.19191342251615812, + "flos": 22086858038400.0, + "grad_norm": 1.79579251964214, + "language_loss": 0.8916924, + "learning_rate": 3.73123885901997e-06, + "loss": 0.91287315, + "num_input_tokens_seen": 68903225, + "step": 3192, + "time_per_iteration": 2.71334171295166 + }, + { + "auxiliary_loss_clip": 0.01084308, + "auxiliary_loss_mlp": 0.01045499, + "balance_loss_clip": 1.03873539, + "balance_loss_mlp": 1.02736735, + "epoch": 0.19197354576882608, + "flos": 22199115018240.0, + "grad_norm": 1.8100415469407949, + "language_loss": 0.74713802, + "learning_rate": 3.7310438217554687e-06, + "loss": 0.76843607, + "num_input_tokens_seen": 68922860, + "step": 3193, + "time_per_iteration": 2.86782169342041 + }, + { + "auxiliary_loss_clip": 0.0108199, + "auxiliary_loss_mlp": 0.00748918, + "balance_loss_clip": 1.03332424, + "balance_loss_mlp": 1.00094199, + "epoch": 0.19203366902149407, + "flos": 24896185580160.0, + "grad_norm": 1.9711814956659792, + "language_loss": 0.75176239, + "learning_rate": 3.730848718849612e-06, + "loss": 0.77007145, + "num_input_tokens_seen": 68943000, + "step": 3194, + "time_per_iteration": 2.6722331047058105 + }, + { + "auxiliary_loss_clip": 0.01014617, + "auxiliary_loss_mlp": 0.01013724, + "balance_loss_clip": 1.00525999, + "balance_loss_mlp": 1.01129198, + "epoch": 0.19209379227416204, + "flos": 68416722789120.0, + "grad_norm": 0.7869279205387278, + "language_loss": 0.68531924, + "learning_rate": 3.7306535503097985e-06, + "loss": 0.70560265, + "num_input_tokens_seen": 69000255, + "step": 3195, + "time_per_iteration": 3.1669986248016357 + }, + { + "auxiliary_loss_clip": 0.01072261, + "auxiliary_loss_mlp": 0.0106328, + "balance_loss_clip": 1.03687811, + "balance_loss_mlp": 1.04445744, + "epoch": 0.19215391552683, + "flos": 22055185221120.0, + "grad_norm": 4.4928834257874275, + "language_loss": 0.72767568, + "learning_rate": 3.730458316143429e-06, + "loss": 0.74903107, + "num_input_tokens_seen": 69019665, + "step": 3196, + "time_per_iteration": 2.8449671268463135 + }, + { + "auxiliary_loss_clip": 0.01091006, + "auxiliary_loss_mlp": 0.01050417, + "balance_loss_clip": 1.0399344, + "balance_loss_mlp": 1.03248835, + "epoch": 0.19221403877949797, + "flos": 20302959962880.0, + "grad_norm": 2.061844358870046, + "language_loss": 0.8359071, + "learning_rate": 3.7302630163579068e-06, + "loss": 0.85732132, + "num_input_tokens_seen": 69039055, + "step": 3197, + "time_per_iteration": 2.7902848720550537 + }, + { + "auxiliary_loss_clip": 0.01048272, + "auxiliary_loss_mlp": 0.01058676, + "balance_loss_clip": 1.03515351, + "balance_loss_mlp": 1.03912592, + "epoch": 0.19227416203216594, + "flos": 23185329811200.0, + "grad_norm": 1.943841441957698, + "language_loss": 0.79853153, + "learning_rate": 3.7300676509606373e-06, + "loss": 0.819601, + "num_input_tokens_seen": 69056370, + "step": 3198, + "time_per_iteration": 2.8671786785125732 + }, + { + "auxiliary_loss_clip": 0.01087649, + "auxiliary_loss_mlp": 0.0104946, + "balance_loss_clip": 1.03452301, + "balance_loss_mlp": 1.0313046, + "epoch": 0.1923342852848339, + "flos": 25776607841280.0, + "grad_norm": 1.9821818207350659, + "language_loss": 0.78550315, + "learning_rate": 3.729872219959029e-06, + "loss": 0.80687422, + "num_input_tokens_seen": 69075915, + "step": 3199, + "time_per_iteration": 4.40791392326355 + }, + { + "auxiliary_loss_clip": 0.01064022, + "auxiliary_loss_mlp": 0.0105671, + "balance_loss_clip": 1.03208268, + "balance_loss_mlp": 1.03870988, + "epoch": 0.19239440853750187, + "flos": 17128349061120.0, + "grad_norm": 2.257439337823435, + "language_loss": 0.83566701, + "learning_rate": 3.7296767233604934e-06, + "loss": 0.85687435, + "num_input_tokens_seen": 69094145, + "step": 3200, + "time_per_iteration": 2.666285753250122 + }, + { + "auxiliary_loss_clip": 0.01111268, + "auxiliary_loss_mlp": 0.01053215, + "balance_loss_clip": 1.03906047, + "balance_loss_mlp": 1.03634715, + "epoch": 0.19245453179016986, + "flos": 16435093593600.0, + "grad_norm": 1.6287914562747476, + "language_loss": 0.79280353, + "learning_rate": 3.729481161172443e-06, + "loss": 0.81444836, + "num_input_tokens_seen": 69111110, + "step": 3201, + "time_per_iteration": 2.601151704788208 + }, + { + "auxiliary_loss_clip": 0.01053394, + "auxiliary_loss_mlp": 0.01045081, + "balance_loss_clip": 1.02933931, + "balance_loss_mlp": 1.02743816, + "epoch": 0.19251465504283782, + "flos": 20230276792320.0, + "grad_norm": 1.9567593525951654, + "language_loss": 0.69473004, + "learning_rate": 3.7292855334022927e-06, + "loss": 0.71571475, + "num_input_tokens_seen": 69130280, + "step": 3202, + "time_per_iteration": 2.6970882415771484 + }, + { + "auxiliary_loss_clip": 0.01084882, + "auxiliary_loss_mlp": 0.01042725, + "balance_loss_clip": 1.03462505, + "balance_loss_mlp": 1.0259645, + "epoch": 0.1925747782955058, + "flos": 19464374067840.0, + "grad_norm": 1.8561828903738427, + "language_loss": 0.91238177, + "learning_rate": 3.7290898400574627e-06, + "loss": 0.93365788, + "num_input_tokens_seen": 69149570, + "step": 3203, + "time_per_iteration": 2.768216848373413 + }, + { + "auxiliary_loss_clip": 0.01101959, + "auxiliary_loss_mlp": 0.0104906, + "balance_loss_clip": 1.03675961, + "balance_loss_mlp": 1.0308212, + "epoch": 0.19263490154817375, + "flos": 17785586165760.0, + "grad_norm": 2.0926951532546973, + "language_loss": 0.81783891, + "learning_rate": 3.7288940811453725e-06, + "loss": 0.83934909, + "num_input_tokens_seen": 69168190, + "step": 3204, + "time_per_iteration": 4.2032976150512695 + }, + { + "auxiliary_loss_clip": 0.01073345, + "auxiliary_loss_mlp": 0.01040617, + "balance_loss_clip": 1.03267241, + "balance_loss_mlp": 1.02343917, + "epoch": 0.19269502480084172, + "flos": 17457075354240.0, + "grad_norm": 1.7677876993690187, + "language_loss": 0.75756693, + "learning_rate": 3.7286982566734454e-06, + "loss": 0.77870655, + "num_input_tokens_seen": 69186950, + "step": 3205, + "time_per_iteration": 2.7091453075408936 + }, + { + "auxiliary_loss_clip": 0.0109373, + "auxiliary_loss_mlp": 0.01047774, + "balance_loss_clip": 1.03962183, + "balance_loss_mlp": 1.03084612, + "epoch": 0.19275514805350968, + "flos": 21506901045120.0, + "grad_norm": 2.413388578874194, + "language_loss": 0.8326534, + "learning_rate": 3.728502366649107e-06, + "loss": 0.8540684, + "num_input_tokens_seen": 69204850, + "step": 3206, + "time_per_iteration": 4.222656965255737 + }, + { + "auxiliary_loss_clip": 0.01010504, + "auxiliary_loss_mlp": 0.01004202, + "balance_loss_clip": 1.00996184, + "balance_loss_mlp": 1.00138891, + "epoch": 0.19281527130617768, + "flos": 47695979738880.0, + "grad_norm": 0.8400623034617938, + "language_loss": 0.60535717, + "learning_rate": 3.728306411079786e-06, + "loss": 0.62550426, + "num_input_tokens_seen": 69259200, + "step": 3207, + "time_per_iteration": 4.657994985580444 + }, + { + "auxiliary_loss_clip": 0.01073962, + "auxiliary_loss_mlp": 0.01046439, + "balance_loss_clip": 1.03293395, + "balance_loss_mlp": 1.02914715, + "epoch": 0.19287539455884564, + "flos": 11801252672640.0, + "grad_norm": 4.6014210718512905, + "language_loss": 0.75477403, + "learning_rate": 3.7281103899729125e-06, + "loss": 0.77597803, + "num_input_tokens_seen": 69275835, + "step": 3208, + "time_per_iteration": 2.7559115886688232 + }, + { + "auxiliary_loss_clip": 0.01100347, + "auxiliary_loss_mlp": 0.00748961, + "balance_loss_clip": 1.03515863, + "balance_loss_mlp": 1.00095427, + "epoch": 0.1929355178115136, + "flos": 20631434860800.0, + "grad_norm": 2.23916190314135, + "language_loss": 0.60499728, + "learning_rate": 3.7279143033359195e-06, + "loss": 0.62349033, + "num_input_tokens_seen": 69294810, + "step": 3209, + "time_per_iteration": 2.6849420070648193 + }, + { + "auxiliary_loss_clip": 0.01112375, + "auxiliary_loss_mlp": 0.01049186, + "balance_loss_clip": 1.03628016, + "balance_loss_mlp": 1.03007627, + "epoch": 0.19299564106418157, + "flos": 40807916058240.0, + "grad_norm": 2.380830655492099, + "language_loss": 0.80285758, + "learning_rate": 3.727718151176243e-06, + "loss": 0.82447326, + "num_input_tokens_seen": 69316065, + "step": 3210, + "time_per_iteration": 2.729153871536255 + }, + { + "auxiliary_loss_clip": 0.01070682, + "auxiliary_loss_mlp": 0.01039843, + "balance_loss_clip": 1.03072333, + "balance_loss_mlp": 1.02361822, + "epoch": 0.19305576431684954, + "flos": 11361418634880.0, + "grad_norm": 1.8725315290564928, + "language_loss": 0.83135474, + "learning_rate": 3.7275219335013217e-06, + "loss": 0.85245997, + "num_input_tokens_seen": 69332900, + "step": 3211, + "time_per_iteration": 2.724181890487671 + }, + { + "auxiliary_loss_clip": 0.01022007, + "auxiliary_loss_mlp": 0.01002463, + "balance_loss_clip": 1.00244451, + "balance_loss_mlp": 0.99993563, + "epoch": 0.1931158875695175, + "flos": 54511895975040.0, + "grad_norm": 0.9661709166428186, + "language_loss": 0.63693607, + "learning_rate": 3.7273256503185953e-06, + "loss": 0.65718073, + "num_input_tokens_seen": 69382535, + "step": 3212, + "time_per_iteration": 2.995060682296753 + }, + { + "auxiliary_loss_clip": 0.01090568, + "auxiliary_loss_mlp": 0.01042735, + "balance_loss_clip": 1.03914988, + "balance_loss_mlp": 1.02624846, + "epoch": 0.19317601082218547, + "flos": 19828436365440.0, + "grad_norm": 1.6078734119089413, + "language_loss": 0.762909, + "learning_rate": 3.7271293016355074e-06, + "loss": 0.78424197, + "num_input_tokens_seen": 69400600, + "step": 3213, + "time_per_iteration": 2.730698585510254 + }, + { + "auxiliary_loss_clip": 0.01078183, + "auxiliary_loss_mlp": 0.01043549, + "balance_loss_clip": 1.03445494, + "balance_loss_mlp": 1.02562034, + "epoch": 0.19323613407485346, + "flos": 13152068467200.0, + "grad_norm": 2.0172045786186485, + "language_loss": 0.71064073, + "learning_rate": 3.726932887459503e-06, + "loss": 0.73185802, + "num_input_tokens_seen": 69417350, + "step": 3214, + "time_per_iteration": 2.6659696102142334 + }, + { + "auxiliary_loss_clip": 0.01106618, + "auxiliary_loss_mlp": 0.01049126, + "balance_loss_clip": 1.03465533, + "balance_loss_mlp": 1.03091145, + "epoch": 0.19329625732752143, + "flos": 14027247342720.0, + "grad_norm": 2.1743435264428324, + "language_loss": 0.75252533, + "learning_rate": 3.72673640779803e-06, + "loss": 0.77408278, + "num_input_tokens_seen": 69431845, + "step": 3215, + "time_per_iteration": 2.5783188343048096 + }, + { + "auxiliary_loss_clip": 0.01071832, + "auxiliary_loss_mlp": 0.0105231, + "balance_loss_clip": 1.03311479, + "balance_loss_mlp": 1.03554964, + "epoch": 0.1933563805801894, + "flos": 23441732069760.0, + "grad_norm": 1.850421192549396, + "language_loss": 0.88880146, + "learning_rate": 3.72653986265854e-06, + "loss": 0.91004294, + "num_input_tokens_seen": 69453275, + "step": 3216, + "time_per_iteration": 2.7607338428497314 + }, + { + "auxiliary_loss_clip": 0.01107637, + "auxiliary_loss_mlp": 0.01050288, + "balance_loss_clip": 1.03662884, + "balance_loss_mlp": 1.03381371, + "epoch": 0.19341650383285736, + "flos": 20485314334080.0, + "grad_norm": 1.6474060956602488, + "language_loss": 0.79909962, + "learning_rate": 3.726343252048485e-06, + "loss": 0.82067895, + "num_input_tokens_seen": 69471830, + "step": 3217, + "time_per_iteration": 2.646369695663452 + }, + { + "auxiliary_loss_clip": 0.01096119, + "auxiliary_loss_mlp": 0.01045643, + "balance_loss_clip": 1.03766203, + "balance_loss_mlp": 1.02636707, + "epoch": 0.19347662708552532, + "flos": 17858484817920.0, + "grad_norm": 2.193180328084572, + "language_loss": 0.61706328, + "learning_rate": 3.7261465759753206e-06, + "loss": 0.63848084, + "num_input_tokens_seen": 69489320, + "step": 3218, + "time_per_iteration": 2.663703203201294 + }, + { + "auxiliary_loss_clip": 0.0110975, + "auxiliary_loss_mlp": 0.01045269, + "balance_loss_clip": 1.0369072, + "balance_loss_mlp": 1.02806664, + "epoch": 0.1935367503381933, + "flos": 18187247024640.0, + "grad_norm": 1.7320360824681496, + "language_loss": 0.80372703, + "learning_rate": 3.7259498344465053e-06, + "loss": 0.82527721, + "num_input_tokens_seen": 69506665, + "step": 3219, + "time_per_iteration": 2.5074141025543213 + }, + { + "auxiliary_loss_clip": 0.0105715, + "auxiliary_loss_mlp": 0.01047371, + "balance_loss_clip": 1.03304553, + "balance_loss_mlp": 1.02975154, + "epoch": 0.19359687359086128, + "flos": 15957122290560.0, + "grad_norm": 2.062885189529884, + "language_loss": 0.85913754, + "learning_rate": 3.7257530274694993e-06, + "loss": 0.88018274, + "num_input_tokens_seen": 69523835, + "step": 3220, + "time_per_iteration": 2.742539644241333 + }, + { + "auxiliary_loss_clip": 0.01104515, + "auxiliary_loss_mlp": 0.01041397, + "balance_loss_clip": 1.03589487, + "balance_loss_mlp": 1.02615595, + "epoch": 0.19365699684352924, + "flos": 21215198695680.0, + "grad_norm": 2.030443128463622, + "language_loss": 0.84264076, + "learning_rate": 3.725556155051766e-06, + "loss": 0.86409986, + "num_input_tokens_seen": 69542620, + "step": 3221, + "time_per_iteration": 2.5915794372558594 + }, + { + "auxiliary_loss_clip": 0.01097177, + "auxiliary_loss_mlp": 0.01045936, + "balance_loss_clip": 1.03677917, + "balance_loss_mlp": 1.03005099, + "epoch": 0.1937171200961972, + "flos": 17311098481920.0, + "grad_norm": 2.0493216095024103, + "language_loss": 0.85510981, + "learning_rate": 3.7253592172007702e-06, + "loss": 0.8765409, + "num_input_tokens_seen": 69561130, + "step": 3222, + "time_per_iteration": 2.589468479156494 + }, + { + "auxiliary_loss_clip": 0.01026432, + "auxiliary_loss_mlp": 0.01041554, + "balance_loss_clip": 1.02861238, + "balance_loss_mlp": 1.02393544, + "epoch": 0.19377724334886517, + "flos": 22635968227200.0, + "grad_norm": 1.746487916960246, + "language_loss": 0.78442526, + "learning_rate": 3.72516221392398e-06, + "loss": 0.80510515, + "num_input_tokens_seen": 69580425, + "step": 3223, + "time_per_iteration": 2.760707378387451 + }, + { + "auxiliary_loss_clip": 0.01095442, + "auxiliary_loss_mlp": 0.01043172, + "balance_loss_clip": 1.03580511, + "balance_loss_mlp": 1.02606583, + "epoch": 0.19383736660153314, + "flos": 15077813351040.0, + "grad_norm": 1.823481096148058, + "language_loss": 0.75123781, + "learning_rate": 3.7249651452288653e-06, + "loss": 0.77262396, + "num_input_tokens_seen": 69597085, + "step": 3224, + "time_per_iteration": 2.5662834644317627 + }, + { + "auxiliary_loss_clip": 0.01051644, + "auxiliary_loss_mlp": 0.01051283, + "balance_loss_clip": 1.02979958, + "balance_loss_mlp": 1.03315115, + "epoch": 0.1938974898542011, + "flos": 47119934350080.0, + "grad_norm": 2.1803351492250407, + "language_loss": 0.70311058, + "learning_rate": 3.7247680111229e-06, + "loss": 0.72413987, + "num_input_tokens_seen": 69618885, + "step": 3225, + "time_per_iteration": 2.887077808380127 + }, + { + "auxiliary_loss_clip": 0.01067597, + "auxiliary_loss_mlp": 0.01050634, + "balance_loss_clip": 1.0313853, + "balance_loss_mlp": 1.03431392, + "epoch": 0.19395761310686907, + "flos": 25812554376960.0, + "grad_norm": 2.622973583755895, + "language_loss": 0.69022143, + "learning_rate": 3.7245708116135585e-06, + "loss": 0.71140373, + "num_input_tokens_seen": 69638200, + "step": 3226, + "time_per_iteration": 2.681907892227173 + }, + { + "auxiliary_loss_clip": 0.01076242, + "auxiliary_loss_mlp": 0.01039497, + "balance_loss_clip": 1.0352056, + "balance_loss_mlp": 1.0212456, + "epoch": 0.19401773635953706, + "flos": 23039604334080.0, + "grad_norm": 1.6886858599454855, + "language_loss": 0.76487792, + "learning_rate": 3.7243735467083193e-06, + "loss": 0.7860353, + "num_input_tokens_seen": 69657550, + "step": 3227, + "time_per_iteration": 2.6825180053710938 + }, + { + "auxiliary_loss_clip": 0.01075473, + "auxiliary_loss_mlp": 0.01042146, + "balance_loss_clip": 1.03456402, + "balance_loss_mlp": 1.02632689, + "epoch": 0.19407785961220503, + "flos": 15920780705280.0, + "grad_norm": 2.1330580464307434, + "language_loss": 0.69355333, + "learning_rate": 3.724176216414662e-06, + "loss": 0.71472949, + "num_input_tokens_seen": 69675005, + "step": 3228, + "time_per_iteration": 2.6719143390655518 + }, + { + "auxiliary_loss_clip": 0.01098025, + "auxiliary_loss_mlp": 0.01040586, + "balance_loss_clip": 1.0364722, + "balance_loss_mlp": 1.02373028, + "epoch": 0.194137982864873, + "flos": 25921722787200.0, + "grad_norm": 2.5762871779476413, + "language_loss": 0.74285382, + "learning_rate": 3.72397882074007e-06, + "loss": 0.76423997, + "num_input_tokens_seen": 69696455, + "step": 3229, + "time_per_iteration": 2.6306333541870117 + }, + { + "auxiliary_loss_clip": 0.01073927, + "auxiliary_loss_mlp": 0.01042667, + "balance_loss_clip": 1.03402519, + "balance_loss_mlp": 1.02569127, + "epoch": 0.19419810611754096, + "flos": 13261344618240.0, + "grad_norm": 1.6692732084987412, + "language_loss": 0.65295422, + "learning_rate": 3.7237813596920285e-06, + "loss": 0.67412019, + "num_input_tokens_seen": 69714245, + "step": 3230, + "time_per_iteration": 2.6688828468322754 + }, + { + "auxiliary_loss_clip": 0.01081036, + "auxiliary_loss_mlp": 0.00748822, + "balance_loss_clip": 1.03346562, + "balance_loss_mlp": 1.00092411, + "epoch": 0.19425822937020892, + "flos": 15705568368000.0, + "grad_norm": 1.8825029069655201, + "language_loss": 0.82000673, + "learning_rate": 3.7235838332780254e-06, + "loss": 0.83830529, + "num_input_tokens_seen": 69731515, + "step": 3231, + "time_per_iteration": 2.7388803958892822 + }, + { + "auxiliary_loss_clip": 0.01081911, + "auxiliary_loss_mlp": 0.010422, + "balance_loss_clip": 1.03373647, + "balance_loss_mlp": 1.02371049, + "epoch": 0.1943183526228769, + "flos": 23105392093440.0, + "grad_norm": 1.7683911216578814, + "language_loss": 0.87172115, + "learning_rate": 3.72338624150555e-06, + "loss": 0.89296222, + "num_input_tokens_seen": 69748885, + "step": 3232, + "time_per_iteration": 2.637477159500122 + }, + { + "auxiliary_loss_clip": 0.01048695, + "auxiliary_loss_mlp": 0.01055164, + "balance_loss_clip": 1.0300529, + "balance_loss_mlp": 1.03652, + "epoch": 0.19437847587554485, + "flos": 24712610146560.0, + "grad_norm": 1.7597045208810616, + "language_loss": 0.85022211, + "learning_rate": 3.723188584382096e-06, + "loss": 0.8712607, + "num_input_tokens_seen": 69767540, + "step": 3233, + "time_per_iteration": 2.782367467880249 + }, + { + "auxiliary_loss_clip": 0.01100497, + "auxiliary_loss_mlp": 0.01051308, + "balance_loss_clip": 1.03537762, + "balance_loss_mlp": 1.03421342, + "epoch": 0.19443859912821285, + "flos": 23116130259840.0, + "grad_norm": 1.6814664503478718, + "language_loss": 0.89507544, + "learning_rate": 3.722990861915158e-06, + "loss": 0.91659349, + "num_input_tokens_seen": 69789340, + "step": 3234, + "time_per_iteration": 2.7853007316589355 + }, + { + "auxiliary_loss_clip": 0.01085591, + "auxiliary_loss_mlp": 0.01043239, + "balance_loss_clip": 1.03251481, + "balance_loss_mlp": 1.02546453, + "epoch": 0.1944987223808808, + "flos": 15084385539840.0, + "grad_norm": 2.197004684110535, + "language_loss": 0.78138268, + "learning_rate": 3.722793074112234e-06, + "loss": 0.80267096, + "num_input_tokens_seen": 69806470, + "step": 3235, + "time_per_iteration": 2.651752471923828 + }, + { + "auxiliary_loss_clip": 0.01090886, + "auxiliary_loss_mlp": 0.01042978, + "balance_loss_clip": 1.03879821, + "balance_loss_mlp": 1.02711749, + "epoch": 0.19455884563354878, + "flos": 17126876603520.0, + "grad_norm": 2.121963924368493, + "language_loss": 0.79171121, + "learning_rate": 3.7225952209808233e-06, + "loss": 0.81304985, + "num_input_tokens_seen": 69822655, + "step": 3236, + "time_per_iteration": 2.6408727169036865 + }, + { + "auxiliary_loss_clip": 0.01109293, + "auxiliary_loss_mlp": 0.01041603, + "balance_loss_clip": 1.03835058, + "balance_loss_mlp": 1.02386451, + "epoch": 0.19461896888621674, + "flos": 20193396503040.0, + "grad_norm": 1.6410280256989744, + "language_loss": 0.75757176, + "learning_rate": 3.72239730252843e-06, + "loss": 0.77908069, + "num_input_tokens_seen": 69841895, + "step": 3237, + "time_per_iteration": 2.6627626419067383 + }, + { + "auxiliary_loss_clip": 0.01112443, + "auxiliary_loss_mlp": 0.01050662, + "balance_loss_clip": 1.0381422, + "balance_loss_mlp": 1.03394866, + "epoch": 0.1946790921388847, + "flos": 25301365971840.0, + "grad_norm": 2.060631929851792, + "language_loss": 0.75145763, + "learning_rate": 3.7221993187625583e-06, + "loss": 0.77308863, + "num_input_tokens_seen": 69862220, + "step": 3238, + "time_per_iteration": 2.6694836616516113 + }, + { + "auxiliary_loss_clip": 0.01060298, + "auxiliary_loss_mlp": 0.01046937, + "balance_loss_clip": 1.03090775, + "balance_loss_mlp": 1.02859068, + "epoch": 0.19473921539155267, + "flos": 20193396503040.0, + "grad_norm": 3.1074112375955987, + "language_loss": 0.73194981, + "learning_rate": 3.7220012696907155e-06, + "loss": 0.75302213, + "num_input_tokens_seen": 69881830, + "step": 3239, + "time_per_iteration": 2.6749727725982666 + }, + { + "auxiliary_loss_clip": 0.01083415, + "auxiliary_loss_mlp": 0.01047556, + "balance_loss_clip": 1.03324604, + "balance_loss_mlp": 1.02982926, + "epoch": 0.19479933864422067, + "flos": 20887549810560.0, + "grad_norm": 2.0577842704108225, + "language_loss": 0.73283261, + "learning_rate": 3.721803155320412e-06, + "loss": 0.75414228, + "num_input_tokens_seen": 69900515, + "step": 3240, + "time_per_iteration": 2.5954997539520264 + }, + { + "auxiliary_loss_clip": 0.01085122, + "auxiliary_loss_mlp": 0.01042103, + "balance_loss_clip": 1.03558123, + "balance_loss_mlp": 1.0252583, + "epoch": 0.19485946189688863, + "flos": 23295072839040.0, + "grad_norm": 2.4730433864021863, + "language_loss": 0.66550446, + "learning_rate": 3.7216049756591606e-06, + "loss": 0.68677676, + "num_input_tokens_seen": 69920060, + "step": 3241, + "time_per_iteration": 2.720423460006714 + }, + { + "auxiliary_loss_clip": 0.01077442, + "auxiliary_loss_mlp": 0.01043117, + "balance_loss_clip": 1.0334264, + "balance_loss_mlp": 1.0258913, + "epoch": 0.1949195851495566, + "flos": 23295036925440.0, + "grad_norm": 1.3059436739360482, + "language_loss": 0.82823718, + "learning_rate": 3.7214067307144754e-06, + "loss": 0.84944272, + "num_input_tokens_seen": 69939820, + "step": 3242, + "time_per_iteration": 2.6949424743652344 + }, + { + "auxiliary_loss_clip": 0.01024016, + "auxiliary_loss_mlp": 0.0100698, + "balance_loss_clip": 1.00553417, + "balance_loss_mlp": 1.0044769, + "epoch": 0.19497970840222456, + "flos": 64962871557120.0, + "grad_norm": 0.8370500394122821, + "language_loss": 0.57485366, + "learning_rate": 3.721208420493875e-06, + "loss": 0.59516358, + "num_input_tokens_seen": 70002145, + "step": 3243, + "time_per_iteration": 3.150280714035034 + }, + { + "auxiliary_loss_clip": 0.01087784, + "auxiliary_loss_mlp": 0.01049549, + "balance_loss_clip": 1.03340852, + "balance_loss_mlp": 1.03068972, + "epoch": 0.19503983165489253, + "flos": 19644717277440.0, + "grad_norm": 1.8378779116469381, + "language_loss": 0.83584744, + "learning_rate": 3.7210100450048784e-06, + "loss": 0.85722077, + "num_input_tokens_seen": 70020510, + "step": 3244, + "time_per_iteration": 2.5657808780670166 + }, + { + "auxiliary_loss_clip": 0.01098155, + "auxiliary_loss_mlp": 0.01045362, + "balance_loss_clip": 1.03775775, + "balance_loss_mlp": 1.02900648, + "epoch": 0.1950999549075605, + "flos": 21141976821120.0, + "grad_norm": 5.3525807835964745, + "language_loss": 0.77340752, + "learning_rate": 3.7208116042550088e-06, + "loss": 0.79484272, + "num_input_tokens_seen": 70040760, + "step": 3245, + "time_per_iteration": 2.6502013206481934 + }, + { + "auxiliary_loss_clip": 0.01097844, + "auxiliary_loss_mlp": 0.0104163, + "balance_loss_clip": 1.03592229, + "balance_loss_mlp": 1.02391577, + "epoch": 0.19516007816022846, + "flos": 20884820376960.0, + "grad_norm": 1.8988059504930925, + "language_loss": 0.83530712, + "learning_rate": 3.7206130982517906e-06, + "loss": 0.85670185, + "num_input_tokens_seen": 70058720, + "step": 3246, + "time_per_iteration": 2.5908849239349365 + }, + { + "auxiliary_loss_clip": 0.01099546, + "auxiliary_loss_mlp": 0.00748965, + "balance_loss_clip": 1.03642428, + "balance_loss_mlp": 1.00093961, + "epoch": 0.19522020141289645, + "flos": 16910515031040.0, + "grad_norm": 2.34392627806248, + "language_loss": 0.76222759, + "learning_rate": 3.7204145270027514e-06, + "loss": 0.78071278, + "num_input_tokens_seen": 70076470, + "step": 3247, + "time_per_iteration": 4.236788034439087 + }, + { + "auxiliary_loss_clip": 0.01080829, + "auxiliary_loss_mlp": 0.01045794, + "balance_loss_clip": 1.03988254, + "balance_loss_mlp": 1.02897406, + "epoch": 0.19528032466556441, + "flos": 26724829023360.0, + "grad_norm": 1.5221661257890362, + "language_loss": 0.75611544, + "learning_rate": 3.720215890515421e-06, + "loss": 0.77738166, + "num_input_tokens_seen": 70096220, + "step": 3248, + "time_per_iteration": 2.8293561935424805 + }, + { + "auxiliary_loss_clip": 0.01109366, + "auxiliary_loss_mlp": 0.01043994, + "balance_loss_clip": 1.03621268, + "balance_loss_mlp": 1.02668524, + "epoch": 0.19534044791823238, + "flos": 21032808410880.0, + "grad_norm": 2.3182610531758696, + "language_loss": 0.78285372, + "learning_rate": 3.7200171887973316e-06, + "loss": 0.80438733, + "num_input_tokens_seen": 70114800, + "step": 3249, + "time_per_iteration": 2.5961549282073975 + }, + { + "auxiliary_loss_clip": 0.01099085, + "auxiliary_loss_mlp": 0.01041724, + "balance_loss_clip": 1.03514647, + "balance_loss_mlp": 1.02538049, + "epoch": 0.19540057117090034, + "flos": 22344050396160.0, + "grad_norm": 1.5702302512955313, + "language_loss": 0.73088068, + "learning_rate": 3.7198184218560176e-06, + "loss": 0.7522887, + "num_input_tokens_seen": 70134930, + "step": 3250, + "time_per_iteration": 4.263609886169434 + }, + { + "auxiliary_loss_clip": 0.01052926, + "auxiliary_loss_mlp": 0.01040498, + "balance_loss_clip": 1.03309917, + "balance_loss_mlp": 1.02421355, + "epoch": 0.1954606944235683, + "flos": 20301631159680.0, + "grad_norm": 1.9338038047056494, + "language_loss": 0.79528582, + "learning_rate": 3.719619589699017e-06, + "loss": 0.81622005, + "num_input_tokens_seen": 70152045, + "step": 3251, + "time_per_iteration": 2.7433865070343018 + }, + { + "auxiliary_loss_clip": 0.01108574, + "auxiliary_loss_mlp": 0.01041271, + "balance_loss_clip": 1.03573167, + "balance_loss_mlp": 1.02394986, + "epoch": 0.19552081767623627, + "flos": 17346865449600.0, + "grad_norm": 2.6670688416052077, + "language_loss": 0.83406794, + "learning_rate": 3.7194206923338695e-06, + "loss": 0.85556638, + "num_input_tokens_seen": 70169240, + "step": 3252, + "time_per_iteration": 2.6913764476776123 + }, + { + "auxiliary_loss_clip": 0.01091282, + "auxiliary_loss_mlp": 0.01055119, + "balance_loss_clip": 1.03310454, + "balance_loss_mlp": 1.03469825, + "epoch": 0.19558094092890424, + "flos": 31977626129280.0, + "grad_norm": 1.8311534299171728, + "language_loss": 0.73792958, + "learning_rate": 3.719221729768117e-06, + "loss": 0.75939357, + "num_input_tokens_seen": 70192690, + "step": 3253, + "time_per_iteration": 2.781390428543091 + }, + { + "auxiliary_loss_clip": 0.01051078, + "auxiliary_loss_mlp": 0.01045202, + "balance_loss_clip": 1.02803695, + "balance_loss_mlp": 1.0272131, + "epoch": 0.19564106418157223, + "flos": 22268889187200.0, + "grad_norm": 2.0748562611714663, + "language_loss": 0.76801622, + "learning_rate": 3.7190227020093037e-06, + "loss": 0.78897899, + "num_input_tokens_seen": 70209685, + "step": 3254, + "time_per_iteration": 4.380268573760986 + }, + { + "auxiliary_loss_clip": 0.00986876, + "auxiliary_loss_mlp": 0.01003145, + "balance_loss_clip": 1.00775075, + "balance_loss_mlp": 1.00046277, + "epoch": 0.1957011874342402, + "flos": 54364554385920.0, + "grad_norm": 0.7656702812814526, + "language_loss": 0.55282867, + "learning_rate": 3.7188236090649774e-06, + "loss": 0.57272887, + "num_input_tokens_seen": 70265050, + "step": 3255, + "time_per_iteration": 3.283076047897339 + }, + { + "auxiliary_loss_clip": 0.01088728, + "auxiliary_loss_mlp": 0.01042855, + "balance_loss_clip": 1.03655863, + "balance_loss_mlp": 1.02598715, + "epoch": 0.19576131068690816, + "flos": 16506699356160.0, + "grad_norm": 2.4442859859366775, + "language_loss": 0.70565975, + "learning_rate": 3.718624450942688e-06, + "loss": 0.72697562, + "num_input_tokens_seen": 70281830, + "step": 3256, + "time_per_iteration": 2.7888989448547363 + }, + { + "auxiliary_loss_clip": 0.01106719, + "auxiliary_loss_mlp": 0.01041811, + "balance_loss_clip": 1.03561842, + "balance_loss_mlp": 1.02534795, + "epoch": 0.19582143393957613, + "flos": 14719676797440.0, + "grad_norm": 2.2418964225457962, + "language_loss": 0.80022889, + "learning_rate": 3.718425227649987e-06, + "loss": 0.82171416, + "num_input_tokens_seen": 70297420, + "step": 3257, + "time_per_iteration": 2.6599934101104736 + }, + { + "auxiliary_loss_clip": 0.01063802, + "auxiliary_loss_mlp": 0.01045168, + "balance_loss_clip": 1.0349896, + "balance_loss_mlp": 1.02874076, + "epoch": 0.1958815571922441, + "flos": 24425504737920.0, + "grad_norm": 1.8288044853223313, + "language_loss": 0.75233269, + "learning_rate": 3.7182259391944292e-06, + "loss": 0.77342236, + "num_input_tokens_seen": 70319210, + "step": 3258, + "time_per_iteration": 2.881833791732788 + }, + { + "auxiliary_loss_clip": 0.01037271, + "auxiliary_loss_mlp": 0.01043119, + "balance_loss_clip": 1.0286423, + "balance_loss_mlp": 1.02466536, + "epoch": 0.19594168044491206, + "flos": 24900279730560.0, + "grad_norm": 1.9460162651946127, + "language_loss": 0.73800153, + "learning_rate": 3.7180265855835714e-06, + "loss": 0.75880539, + "num_input_tokens_seen": 70339045, + "step": 3259, + "time_per_iteration": 2.884049654006958 + }, + { + "auxiliary_loss_clip": 0.01090861, + "auxiliary_loss_mlp": 0.01042612, + "balance_loss_clip": 1.0367806, + "balance_loss_mlp": 1.0244447, + "epoch": 0.19600180369758005, + "flos": 12057008486400.0, + "grad_norm": 2.313176527975873, + "language_loss": 0.77273917, + "learning_rate": 3.7178271668249735e-06, + "loss": 0.79407382, + "num_input_tokens_seen": 70356505, + "step": 3260, + "time_per_iteration": 2.614802598953247 + }, + { + "auxiliary_loss_clip": 0.01098464, + "auxiliary_loss_mlp": 0.0104217, + "balance_loss_clip": 1.03445303, + "balance_loss_mlp": 1.0243957, + "epoch": 0.19606192695024802, + "flos": 20850202644480.0, + "grad_norm": 2.278177381029293, + "language_loss": 0.82107955, + "learning_rate": 3.7176276829261975e-06, + "loss": 0.8424859, + "num_input_tokens_seen": 70375410, + "step": 3261, + "time_per_iteration": 2.591926336288452 + }, + { + "auxiliary_loss_clip": 0.01075823, + "auxiliary_loss_mlp": 0.01042016, + "balance_loss_clip": 1.03492403, + "balance_loss_mlp": 1.02384889, + "epoch": 0.19612205020291598, + "flos": 28475509996800.0, + "grad_norm": 1.8721391329202601, + "language_loss": 0.76512682, + "learning_rate": 3.717428133894807e-06, + "loss": 0.78630525, + "num_input_tokens_seen": 70396315, + "step": 3262, + "time_per_iteration": 2.8431284427642822 + }, + { + "auxiliary_loss_clip": 0.01103263, + "auxiliary_loss_mlp": 0.01044177, + "balance_loss_clip": 1.0409385, + "balance_loss_mlp": 1.02740419, + "epoch": 0.19618217345558395, + "flos": 25556618995200.0, + "grad_norm": 1.561280364900193, + "language_loss": 0.8641212, + "learning_rate": 3.71722851973837e-06, + "loss": 0.88559556, + "num_input_tokens_seen": 70417945, + "step": 3263, + "time_per_iteration": 2.7348592281341553 + }, + { + "auxiliary_loss_clip": 0.01088703, + "auxiliary_loss_mlp": 0.01040232, + "balance_loss_clip": 1.03690314, + "balance_loss_mlp": 1.02357841, + "epoch": 0.1962422967082519, + "flos": 25264413855360.0, + "grad_norm": 1.5953147036475952, + "language_loss": 0.73891014, + "learning_rate": 3.717028840464455e-06, + "loss": 0.76019949, + "num_input_tokens_seen": 70438690, + "step": 3264, + "time_per_iteration": 2.6650424003601074 + }, + { + "auxiliary_loss_clip": 0.01098348, + "auxiliary_loss_mlp": 0.01045137, + "balance_loss_clip": 1.03855681, + "balance_loss_mlp": 1.02875769, + "epoch": 0.19630241996091988, + "flos": 18807352444800.0, + "grad_norm": 2.0488110908470665, + "language_loss": 0.78290606, + "learning_rate": 3.7168290960806344e-06, + "loss": 0.80434084, + "num_input_tokens_seen": 70455385, + "step": 3265, + "time_per_iteration": 2.5624678134918213 + }, + { + "auxiliary_loss_clip": 0.00985075, + "auxiliary_loss_mlp": 0.01003168, + "balance_loss_clip": 1.00515723, + "balance_loss_mlp": 1.00074804, + "epoch": 0.19636254321358784, + "flos": 62321137896960.0, + "grad_norm": 0.806526977509256, + "language_loss": 0.53381979, + "learning_rate": 3.716629286594483e-06, + "loss": 0.55370224, + "num_input_tokens_seen": 70514280, + "step": 3266, + "time_per_iteration": 3.263702392578125 + }, + { + "auxiliary_loss_clip": 0.01089376, + "auxiliary_loss_mlp": 0.00748893, + "balance_loss_clip": 1.03712606, + "balance_loss_mlp": 1.00093317, + "epoch": 0.19642266646625584, + "flos": 21069329564160.0, + "grad_norm": 1.892852516973263, + "language_loss": 0.79934263, + "learning_rate": 3.7164294120135767e-06, + "loss": 0.81772542, + "num_input_tokens_seen": 70531800, + "step": 3267, + "time_per_iteration": 2.6236653327941895 + }, + { + "auxiliary_loss_clip": 0.01086657, + "auxiliary_loss_mlp": 0.01042538, + "balance_loss_clip": 1.03576469, + "balance_loss_mlp": 1.02594423, + "epoch": 0.1964827897189238, + "flos": 14538651229440.0, + "grad_norm": 1.8080609118773927, + "language_loss": 0.8678326, + "learning_rate": 3.7162294723454953e-06, + "loss": 0.88912451, + "num_input_tokens_seen": 70550615, + "step": 3268, + "time_per_iteration": 2.624964714050293 + }, + { + "auxiliary_loss_clip": 0.01065857, + "auxiliary_loss_mlp": 0.01042761, + "balance_loss_clip": 1.03826118, + "balance_loss_mlp": 1.02625108, + "epoch": 0.19654291297159177, + "flos": 19244636616960.0, + "grad_norm": 1.98825465490001, + "language_loss": 0.69089448, + "learning_rate": 3.7160294675978197e-06, + "loss": 0.71198064, + "num_input_tokens_seen": 70568690, + "step": 3269, + "time_per_iteration": 2.7435786724090576 + }, + { + "auxiliary_loss_clip": 0.01066027, + "auxiliary_loss_mlp": 0.01052346, + "balance_loss_clip": 1.03286743, + "balance_loss_mlp": 1.03424978, + "epoch": 0.19660303622425973, + "flos": 25775710001280.0, + "grad_norm": 1.956334091979053, + "language_loss": 0.8089577, + "learning_rate": 3.715829397778135e-06, + "loss": 0.83014143, + "num_input_tokens_seen": 70588665, + "step": 3270, + "time_per_iteration": 2.8137316703796387 + }, + { + "auxiliary_loss_clip": 0.01094578, + "auxiliary_loss_mlp": 0.01044436, + "balance_loss_clip": 1.03381181, + "balance_loss_mlp": 1.02818751, + "epoch": 0.1966631594769277, + "flos": 20595093275520.0, + "grad_norm": 1.9044591410559024, + "language_loss": 0.83799696, + "learning_rate": 3.715629262894028e-06, + "loss": 0.85938716, + "num_input_tokens_seen": 70606900, + "step": 3271, + "time_per_iteration": 2.705784559249878 + }, + { + "auxiliary_loss_clip": 0.01093673, + "auxiliary_loss_mlp": 0.01048692, + "balance_loss_clip": 1.03579473, + "balance_loss_mlp": 1.03242028, + "epoch": 0.19672328272959566, + "flos": 23623188600960.0, + "grad_norm": 1.9355091222432803, + "language_loss": 0.80323905, + "learning_rate": 3.715429062953087e-06, + "loss": 0.82466274, + "num_input_tokens_seen": 70625955, + "step": 3272, + "time_per_iteration": 2.630279779434204 + }, + { + "auxiliary_loss_clip": 0.01070498, + "auxiliary_loss_mlp": 0.01061582, + "balance_loss_clip": 1.0312506, + "balance_loss_mlp": 1.04223406, + "epoch": 0.19678340598226365, + "flos": 23110922787840.0, + "grad_norm": 2.22538013120067, + "language_loss": 0.80681306, + "learning_rate": 3.7152287979629043e-06, + "loss": 0.82813382, + "num_input_tokens_seen": 70646090, + "step": 3273, + "time_per_iteration": 2.722264528274536 + }, + { + "auxiliary_loss_clip": 0.01098023, + "auxiliary_loss_mlp": 0.01051972, + "balance_loss_clip": 1.03596389, + "balance_loss_mlp": 1.035151, + "epoch": 0.19684352923493162, + "flos": 24534852716160.0, + "grad_norm": 1.5952566628113007, + "language_loss": 0.77558291, + "learning_rate": 3.7150284679310735e-06, + "loss": 0.79708278, + "num_input_tokens_seen": 70666065, + "step": 3274, + "time_per_iteration": 2.6719717979431152 + }, + { + "auxiliary_loss_clip": 0.01098582, + "auxiliary_loss_mlp": 0.01048144, + "balance_loss_clip": 1.03639936, + "balance_loss_mlp": 1.02980936, + "epoch": 0.19690365248759958, + "flos": 21796448578560.0, + "grad_norm": 2.473437315455941, + "language_loss": 0.80956388, + "learning_rate": 3.7148280728651914e-06, + "loss": 0.83103114, + "num_input_tokens_seen": 70681580, + "step": 3275, + "time_per_iteration": 2.601017475128174 + }, + { + "auxiliary_loss_clip": 0.01071612, + "auxiliary_loss_mlp": 0.01048126, + "balance_loss_clip": 1.03442192, + "balance_loss_mlp": 1.0301609, + "epoch": 0.19696377574026755, + "flos": 19056643810560.0, + "grad_norm": 2.042077841000868, + "language_loss": 0.81089061, + "learning_rate": 3.7146276127728563e-06, + "loss": 0.83208799, + "num_input_tokens_seen": 70697745, + "step": 3276, + "time_per_iteration": 2.638556480407715 + }, + { + "auxiliary_loss_clip": 0.01097178, + "auxiliary_loss_mlp": 0.01036845, + "balance_loss_clip": 1.03484154, + "balance_loss_mlp": 1.01994121, + "epoch": 0.19702389899293551, + "flos": 22820656982400.0, + "grad_norm": 2.7184830351890716, + "language_loss": 0.89324522, + "learning_rate": 3.7144270876616713e-06, + "loss": 0.91458547, + "num_input_tokens_seen": 70715110, + "step": 3277, + "time_per_iteration": 2.694704055786133 + }, + { + "auxiliary_loss_clip": 0.01070269, + "auxiliary_loss_mlp": 0.01046489, + "balance_loss_clip": 1.03576946, + "balance_loss_mlp": 1.02652144, + "epoch": 0.19708402224560348, + "flos": 22894237992960.0, + "grad_norm": 2.1685326230168793, + "language_loss": 0.62631297, + "learning_rate": 3.714226497539239e-06, + "loss": 0.64748049, + "num_input_tokens_seen": 70734715, + "step": 3278, + "time_per_iteration": 2.7534332275390625 + }, + { + "auxiliary_loss_clip": 0.01067759, + "auxiliary_loss_mlp": 0.010519, + "balance_loss_clip": 1.0330497, + "balance_loss_mlp": 1.03376853, + "epoch": 0.19714414549827144, + "flos": 25662519267840.0, + "grad_norm": 2.3882867746135235, + "language_loss": 0.73469454, + "learning_rate": 3.714025842413166e-06, + "loss": 0.75589114, + "num_input_tokens_seen": 70752650, + "step": 3279, + "time_per_iteration": 2.685664415359497 + }, + { + "auxiliary_loss_clip": 0.01098221, + "auxiliary_loss_mlp": 0.01041496, + "balance_loss_clip": 1.03387582, + "balance_loss_mlp": 1.02515197, + "epoch": 0.19720426875093944, + "flos": 23915824704000.0, + "grad_norm": 2.6044325372802906, + "language_loss": 0.82905906, + "learning_rate": 3.713825122291061e-06, + "loss": 0.8504563, + "num_input_tokens_seen": 70772365, + "step": 3280, + "time_per_iteration": 2.6596574783325195 + }, + { + "auxiliary_loss_clip": 0.01069847, + "auxiliary_loss_mlp": 0.01049598, + "balance_loss_clip": 1.03644347, + "balance_loss_mlp": 1.03177667, + "epoch": 0.1972643920036074, + "flos": 13881952828800.0, + "grad_norm": 1.6876035953079238, + "language_loss": 0.77953005, + "learning_rate": 3.713624337180536e-06, + "loss": 0.80072451, + "num_input_tokens_seen": 70790340, + "step": 3281, + "time_per_iteration": 2.64453125 + }, + { + "auxiliary_loss_clip": 0.01076661, + "auxiliary_loss_mlp": 0.01041843, + "balance_loss_clip": 1.03351736, + "balance_loss_mlp": 1.02615559, + "epoch": 0.19732451525627537, + "flos": 19863592801920.0, + "grad_norm": 1.646431466859681, + "language_loss": 0.79347551, + "learning_rate": 3.7134234870892045e-06, + "loss": 0.81466055, + "num_input_tokens_seen": 70809295, + "step": 3282, + "time_per_iteration": 2.6922543048858643 + }, + { + "auxiliary_loss_clip": 0.01075793, + "auxiliary_loss_mlp": 0.01047266, + "balance_loss_clip": 1.04131556, + "balance_loss_mlp": 1.03014731, + "epoch": 0.19738463850894333, + "flos": 24973429777920.0, + "grad_norm": 1.9989115165957125, + "language_loss": 0.71995646, + "learning_rate": 3.7132225720246826e-06, + "loss": 0.74118704, + "num_input_tokens_seen": 70828765, + "step": 3283, + "time_per_iteration": 2.7703773975372314 + }, + { + "auxiliary_loss_clip": 0.01099406, + "auxiliary_loss_mlp": 0.01042876, + "balance_loss_clip": 1.03915286, + "balance_loss_mlp": 1.02607965, + "epoch": 0.1974447617616113, + "flos": 18368883123840.0, + "grad_norm": 1.8315839404872947, + "language_loss": 0.79095513, + "learning_rate": 3.7130215919945886e-06, + "loss": 0.81237793, + "num_input_tokens_seen": 70846805, + "step": 3284, + "time_per_iteration": 2.615443468093872 + }, + { + "auxiliary_loss_clip": 0.01078366, + "auxiliary_loss_mlp": 0.00748844, + "balance_loss_clip": 1.03297687, + "balance_loss_mlp": 1.00094676, + "epoch": 0.19750488501427926, + "flos": 22892945103360.0, + "grad_norm": 2.0280301185650016, + "language_loss": 0.86081028, + "learning_rate": 3.7128205470065445e-06, + "loss": 0.87908244, + "num_input_tokens_seen": 70863805, + "step": 3285, + "time_per_iteration": 2.66226863861084 + }, + { + "auxiliary_loss_clip": 0.01077174, + "auxiliary_loss_mlp": 0.01042728, + "balance_loss_clip": 1.03983998, + "balance_loss_mlp": 1.02574086, + "epoch": 0.19756500826694723, + "flos": 21871502046720.0, + "grad_norm": 2.076623928064305, + "language_loss": 0.88800192, + "learning_rate": 3.712619437068174e-06, + "loss": 0.90920091, + "num_input_tokens_seen": 70882660, + "step": 3286, + "time_per_iteration": 2.781040906906128 + }, + { + "auxiliary_loss_clip": 0.01072131, + "auxiliary_loss_mlp": 0.01048864, + "balance_loss_clip": 1.03490257, + "balance_loss_mlp": 1.02840769, + "epoch": 0.19762513151961522, + "flos": 15158972131200.0, + "grad_norm": 3.3480647996743023, + "language_loss": 0.78344113, + "learning_rate": 3.712418262187102e-06, + "loss": 0.80465108, + "num_input_tokens_seen": 70898765, + "step": 3287, + "time_per_iteration": 2.614086389541626 + }, + { + "auxiliary_loss_clip": 0.01089265, + "auxiliary_loss_mlp": 0.01045569, + "balance_loss_clip": 1.03775549, + "balance_loss_mlp": 1.02712727, + "epoch": 0.1976852547722832, + "flos": 16979175878400.0, + "grad_norm": 2.233771697053857, + "language_loss": 0.8124131, + "learning_rate": 3.7122170223709584e-06, + "loss": 0.83376145, + "num_input_tokens_seen": 70916370, + "step": 3288, + "time_per_iteration": 2.6270503997802734 + }, + { + "auxiliary_loss_clip": 0.01083556, + "auxiliary_loss_mlp": 0.0104855, + "balance_loss_clip": 1.03293729, + "balance_loss_mlp": 1.03143191, + "epoch": 0.19774537802495115, + "flos": 20302924049280.0, + "grad_norm": 1.743061380261895, + "language_loss": 0.73301685, + "learning_rate": 3.712015717627374e-06, + "loss": 0.75433791, + "num_input_tokens_seen": 70934870, + "step": 3289, + "time_per_iteration": 2.651911973953247 + }, + { + "auxiliary_loss_clip": 0.01091051, + "auxiliary_loss_mlp": 0.01046947, + "balance_loss_clip": 1.03801906, + "balance_loss_mlp": 1.0291611, + "epoch": 0.19780550127761912, + "flos": 27235478724480.0, + "grad_norm": 1.8371568365340099, + "language_loss": 0.79758704, + "learning_rate": 3.7118143479639813e-06, + "loss": 0.81896698, + "num_input_tokens_seen": 70955140, + "step": 3290, + "time_per_iteration": 2.7695415019989014 + }, + { + "auxiliary_loss_clip": 0.0100634, + "auxiliary_loss_mlp": 0.01005403, + "balance_loss_clip": 1.00693107, + "balance_loss_mlp": 1.00255382, + "epoch": 0.19786562453028708, + "flos": 63550972684800.0, + "grad_norm": 0.9150043482222148, + "language_loss": 0.603724, + "learning_rate": 3.711612913388418e-06, + "loss": 0.6238414, + "num_input_tokens_seen": 71012005, + "step": 3291, + "time_per_iteration": 3.358374834060669 + }, + { + "auxiliary_loss_clip": 0.01111278, + "auxiliary_loss_mlp": 0.01045036, + "balance_loss_clip": 1.03518844, + "balance_loss_mlp": 1.02583098, + "epoch": 0.19792574778295505, + "flos": 26286647011200.0, + "grad_norm": 1.9922831867540651, + "language_loss": 0.81153929, + "learning_rate": 3.7114114139083204e-06, + "loss": 0.83310241, + "num_input_tokens_seen": 71031140, + "step": 3292, + "time_per_iteration": 2.7982325553894043 + }, + { + "auxiliary_loss_clip": 0.01071925, + "auxiliary_loss_mlp": 0.00748784, + "balance_loss_clip": 1.03273547, + "balance_loss_mlp": 1.00086987, + "epoch": 0.19798587103562304, + "flos": 19938107566080.0, + "grad_norm": 1.6627682261826287, + "language_loss": 0.81450868, + "learning_rate": 3.7112098495313313e-06, + "loss": 0.83271575, + "num_input_tokens_seen": 71050250, + "step": 3293, + "time_per_iteration": 2.867763042449951 + }, + { + "auxiliary_loss_clip": 0.01090877, + "auxiliary_loss_mlp": 0.01052, + "balance_loss_clip": 1.03751552, + "balance_loss_mlp": 1.03221118, + "epoch": 0.198045994288291, + "flos": 20120282369280.0, + "grad_norm": 2.089790585532579, + "language_loss": 0.60962802, + "learning_rate": 3.711008220265093e-06, + "loss": 0.63105679, + "num_input_tokens_seen": 71068665, + "step": 3294, + "time_per_iteration": 2.784287214279175 + }, + { + "auxiliary_loss_clip": 0.01087766, + "auxiliary_loss_mlp": 0.01044003, + "balance_loss_clip": 1.03561044, + "balance_loss_mlp": 1.02703977, + "epoch": 0.19810611754095897, + "flos": 17967653228160.0, + "grad_norm": 2.0785860984031994, + "language_loss": 0.86956149, + "learning_rate": 3.710806526117251e-06, + "loss": 0.89087927, + "num_input_tokens_seen": 71085320, + "step": 3295, + "time_per_iteration": 4.229424953460693 + }, + { + "auxiliary_loss_clip": 0.0106387, + "auxiliary_loss_mlp": 0.01051529, + "balance_loss_clip": 1.03235722, + "balance_loss_mlp": 1.03481627, + "epoch": 0.19816624079362694, + "flos": 15084996071040.0, + "grad_norm": 3.8666532596234564, + "language_loss": 0.80882847, + "learning_rate": 3.7106047670954544e-06, + "loss": 0.82998252, + "num_input_tokens_seen": 71102020, + "step": 3296, + "time_per_iteration": 2.6451656818389893 + }, + { + "auxiliary_loss_clip": 0.01074675, + "auxiliary_loss_mlp": 0.01043636, + "balance_loss_clip": 1.03185964, + "balance_loss_mlp": 1.02372813, + "epoch": 0.1982263640462949, + "flos": 24900315644160.0, + "grad_norm": 2.2119527303386817, + "language_loss": 0.68226862, + "learning_rate": 3.710402943207354e-06, + "loss": 0.70345175, + "num_input_tokens_seen": 71123390, + "step": 3297, + "time_per_iteration": 4.286594390869141 + }, + { + "auxiliary_loss_clip": 0.01104902, + "auxiliary_loss_mlp": 0.01039833, + "balance_loss_clip": 1.03658509, + "balance_loss_mlp": 1.02295256, + "epoch": 0.19828648729896287, + "flos": 20376181837440.0, + "grad_norm": 1.7019117088482334, + "language_loss": 0.81347668, + "learning_rate": 3.7102010544606016e-06, + "loss": 0.83492404, + "num_input_tokens_seen": 71141800, + "step": 3298, + "time_per_iteration": 2.6485564708709717 + }, + { + "auxiliary_loss_clip": 0.01091713, + "auxiliary_loss_mlp": 0.0103961, + "balance_loss_clip": 1.03690124, + "balance_loss_mlp": 1.01942825, + "epoch": 0.19834661055163083, + "flos": 18880035615360.0, + "grad_norm": 1.9886473471269697, + "language_loss": 0.85122991, + "learning_rate": 3.7099991008628544e-06, + "loss": 0.87254316, + "num_input_tokens_seen": 71159505, + "step": 3299, + "time_per_iteration": 2.60247540473938 + }, + { + "auxiliary_loss_clip": 0.00991971, + "auxiliary_loss_mlp": 0.01012587, + "balance_loss_clip": 1.00318599, + "balance_loss_mlp": 1.0098809, + "epoch": 0.19840673380429882, + "flos": 60259184640000.0, + "grad_norm": 1.1559873811312618, + "language_loss": 0.53270119, + "learning_rate": 3.7097970824217706e-06, + "loss": 0.55274671, + "num_input_tokens_seen": 71223265, + "step": 3300, + "time_per_iteration": 3.1556522846221924 + }, + { + "auxiliary_loss_clip": 0.01049456, + "auxiliary_loss_mlp": 0.01055208, + "balance_loss_clip": 1.02868569, + "balance_loss_mlp": 1.03423929, + "epoch": 0.1984668570569668, + "flos": 19902017376000.0, + "grad_norm": 1.6850492838297235, + "language_loss": 0.73232174, + "learning_rate": 3.7095949991450093e-06, + "loss": 0.75336838, + "num_input_tokens_seen": 71242385, + "step": 3301, + "time_per_iteration": 4.2404091358184814 + }, + { + "auxiliary_loss_clip": 0.010738, + "auxiliary_loss_mlp": 0.01038778, + "balance_loss_clip": 1.03318095, + "balance_loss_mlp": 1.02205288, + "epoch": 0.19852698030963475, + "flos": 15630766295040.0, + "grad_norm": 2.7206617588294457, + "language_loss": 0.88124573, + "learning_rate": 3.709392851040235e-06, + "loss": 0.90237147, + "num_input_tokens_seen": 71258990, + "step": 3302, + "time_per_iteration": 2.6249802112579346 + }, + { + "auxiliary_loss_clip": 0.01082438, + "auxiliary_loss_mlp": 0.0104386, + "balance_loss_clip": 1.04089665, + "balance_loss_mlp": 1.02613401, + "epoch": 0.19858710356230272, + "flos": 43143007311360.0, + "grad_norm": 1.7586121360198201, + "language_loss": 0.73548639, + "learning_rate": 3.709190638115111e-06, + "loss": 0.75674939, + "num_input_tokens_seen": 71282770, + "step": 3303, + "time_per_iteration": 2.8883285522460938 + }, + { + "auxiliary_loss_clip": 0.01096209, + "auxiliary_loss_mlp": 0.0104439, + "balance_loss_clip": 1.0359273, + "balance_loss_mlp": 1.02708101, + "epoch": 0.19864722681497068, + "flos": 35144084643840.0, + "grad_norm": 2.046726532944569, + "language_loss": 0.74936539, + "learning_rate": 3.7089883603773084e-06, + "loss": 0.77077138, + "num_input_tokens_seen": 71301410, + "step": 3304, + "time_per_iteration": 2.6808505058288574 + }, + { + "auxiliary_loss_clip": 0.01085483, + "auxiliary_loss_mlp": 0.01036596, + "balance_loss_clip": 1.03378868, + "balance_loss_mlp": 1.02001357, + "epoch": 0.19870735006763865, + "flos": 19426200888960.0, + "grad_norm": 1.8444858841614957, + "language_loss": 0.86063057, + "learning_rate": 3.7087860178344955e-06, + "loss": 0.88185138, + "num_input_tokens_seen": 71319670, + "step": 3305, + "time_per_iteration": 2.6797358989715576 + }, + { + "auxiliary_loss_clip": 0.01079389, + "auxiliary_loss_mlp": 0.01039691, + "balance_loss_clip": 1.03061676, + "balance_loss_mlp": 1.02260852, + "epoch": 0.19876747332030664, + "flos": 23547380947200.0, + "grad_norm": 1.5816242217131278, + "language_loss": 0.68191004, + "learning_rate": 3.7085836104943445e-06, + "loss": 0.7031008, + "num_input_tokens_seen": 71339850, + "step": 3306, + "time_per_iteration": 2.6012818813323975 + }, + { + "auxiliary_loss_clip": 0.01069412, + "auxiliary_loss_mlp": 0.01039724, + "balance_loss_clip": 1.03053784, + "balance_loss_mlp": 1.02347541, + "epoch": 0.1988275965729746, + "flos": 19829406032640.0, + "grad_norm": 1.5997739175030576, + "language_loss": 0.76426101, + "learning_rate": 3.7083811383645332e-06, + "loss": 0.78535241, + "num_input_tokens_seen": 71359795, + "step": 3307, + "time_per_iteration": 2.6470580101013184 + }, + { + "auxiliary_loss_clip": 0.01108838, + "auxiliary_loss_mlp": 0.01044996, + "balance_loss_clip": 1.03841949, + "balance_loss_mlp": 1.02858067, + "epoch": 0.19888771982564257, + "flos": 23513625141120.0, + "grad_norm": 2.673482286005944, + "language_loss": 0.75576735, + "learning_rate": 3.708178601452737e-06, + "loss": 0.77730572, + "num_input_tokens_seen": 71378885, + "step": 3308, + "time_per_iteration": 2.5865390300750732 + }, + { + "auxiliary_loss_clip": 0.01062873, + "auxiliary_loss_mlp": 0.01040367, + "balance_loss_clip": 1.03520143, + "balance_loss_mlp": 1.02298641, + "epoch": 0.19894784307831054, + "flos": 18150510389760.0, + "grad_norm": 3.9549540033182184, + "language_loss": 0.75841779, + "learning_rate": 3.7079759997666374e-06, + "loss": 0.77945012, + "num_input_tokens_seen": 71397285, + "step": 3309, + "time_per_iteration": 2.7542686462402344 + }, + { + "auxiliary_loss_clip": 0.01095363, + "auxiliary_loss_mlp": 0.0104863, + "balance_loss_clip": 1.03523612, + "balance_loss_mlp": 1.0307132, + "epoch": 0.1990079663309785, + "flos": 24276044246400.0, + "grad_norm": 1.824883298540353, + "language_loss": 0.88048917, + "learning_rate": 3.707773333313917e-06, + "loss": 0.90192914, + "num_input_tokens_seen": 71415775, + "step": 3310, + "time_per_iteration": 2.7832770347595215 + }, + { + "auxiliary_loss_clip": 0.01104645, + "auxiliary_loss_mlp": 0.0103754, + "balance_loss_clip": 1.03515553, + "balance_loss_mlp": 1.02055311, + "epoch": 0.19906808958364647, + "flos": 34897666366080.0, + "grad_norm": 2.240417487353802, + "language_loss": 0.64462441, + "learning_rate": 3.70757060210226e-06, + "loss": 0.66604626, + "num_input_tokens_seen": 71437315, + "step": 3311, + "time_per_iteration": 2.6727054119110107 + }, + { + "auxiliary_loss_clip": 0.01061754, + "auxiliary_loss_mlp": 0.0103925, + "balance_loss_clip": 1.02937686, + "balance_loss_mlp": 1.02129734, + "epoch": 0.19912821283631443, + "flos": 24024885373440.0, + "grad_norm": 2.746217006709707, + "language_loss": 0.74367237, + "learning_rate": 3.707367806139355e-06, + "loss": 0.76468241, + "num_input_tokens_seen": 71456320, + "step": 3312, + "time_per_iteration": 2.71549916267395 + }, + { + "auxiliary_loss_clip": 0.01096084, + "auxiliary_loss_mlp": 0.01043317, + "balance_loss_clip": 1.03508568, + "balance_loss_mlp": 1.02702093, + "epoch": 0.19918833608898243, + "flos": 19859031774720.0, + "grad_norm": 2.2384206988115496, + "language_loss": 0.83548731, + "learning_rate": 3.7071649454328915e-06, + "loss": 0.85688132, + "num_input_tokens_seen": 71475360, + "step": 3313, + "time_per_iteration": 2.5707647800445557 + }, + { + "auxiliary_loss_clip": 0.01100171, + "auxiliary_loss_mlp": 0.01043729, + "balance_loss_clip": 1.03936183, + "balance_loss_mlp": 1.02712321, + "epoch": 0.1992484593416504, + "flos": 29095794984960.0, + "grad_norm": 6.20553617482805, + "language_loss": 0.80983812, + "learning_rate": 3.7069620199905625e-06, + "loss": 0.83127713, + "num_input_tokens_seen": 71496155, + "step": 3314, + "time_per_iteration": 2.7388579845428467 + }, + { + "auxiliary_loss_clip": 0.01061524, + "auxiliary_loss_mlp": 0.01042673, + "balance_loss_clip": 1.02961302, + "balance_loss_mlp": 1.02688956, + "epoch": 0.19930858259431836, + "flos": 23295001011840.0, + "grad_norm": 1.4703242270885513, + "language_loss": 0.87147999, + "learning_rate": 3.7067590298200627e-06, + "loss": 0.89252198, + "num_input_tokens_seen": 71517295, + "step": 3315, + "time_per_iteration": 2.6935508251190186 + }, + { + "auxiliary_loss_clip": 0.01076476, + "auxiliary_loss_mlp": 0.00748769, + "balance_loss_clip": 1.03838158, + "balance_loss_mlp": 1.00077224, + "epoch": 0.19936870584698632, + "flos": 25378825651200.0, + "grad_norm": 1.5098807178617686, + "language_loss": 0.7108435, + "learning_rate": 3.7065559749290892e-06, + "loss": 0.72909594, + "num_input_tokens_seen": 71540000, + "step": 3316, + "time_per_iteration": 2.8046939373016357 + }, + { + "auxiliary_loss_clip": 0.00977795, + "auxiliary_loss_mlp": 0.01001831, + "balance_loss_clip": 1.00821829, + "balance_loss_mlp": 0.9995783, + "epoch": 0.1994288290996543, + "flos": 62168053109760.0, + "grad_norm": 0.8242610523134765, + "language_loss": 0.66269094, + "learning_rate": 3.706352855325342e-06, + "loss": 0.68248719, + "num_input_tokens_seen": 71607880, + "step": 3317, + "time_per_iteration": 3.338111162185669 + }, + { + "auxiliary_loss_clip": 0.01099399, + "auxiliary_loss_mlp": 0.01048934, + "balance_loss_clip": 1.03493047, + "balance_loss_mlp": 1.03112435, + "epoch": 0.19948895235232225, + "flos": 19025832919680.0, + "grad_norm": 2.087521468247317, + "language_loss": 0.74211705, + "learning_rate": 3.7061496710165233e-06, + "loss": 0.76360041, + "num_input_tokens_seen": 71625695, + "step": 3318, + "time_per_iteration": 2.645632266998291 + }, + { + "auxiliary_loss_clip": 0.01070472, + "auxiliary_loss_mlp": 0.0103913, + "balance_loss_clip": 1.03220654, + "balance_loss_mlp": 1.02310848, + "epoch": 0.19954907560499022, + "flos": 37815803182080.0, + "grad_norm": 1.8963982023220871, + "language_loss": 0.78692645, + "learning_rate": 3.7059464220103385e-06, + "loss": 0.8080225, + "num_input_tokens_seen": 71648520, + "step": 3319, + "time_per_iteration": 2.7907052040100098 + }, + { + "auxiliary_loss_clip": 0.01090431, + "auxiliary_loss_mlp": 0.01043623, + "balance_loss_clip": 1.03741002, + "balance_loss_mlp": 1.02477562, + "epoch": 0.1996091988576582, + "flos": 49565199594240.0, + "grad_norm": 2.3028006520744446, + "language_loss": 0.75660515, + "learning_rate": 3.7057431083144945e-06, + "loss": 0.77794576, + "num_input_tokens_seen": 71672185, + "step": 3320, + "time_per_iteration": 2.907168388366699 + }, + { + "auxiliary_loss_clip": 0.01075175, + "auxiliary_loss_mlp": 0.01041073, + "balance_loss_clip": 1.03322864, + "balance_loss_mlp": 1.02486014, + "epoch": 0.19966932211032618, + "flos": 22635788659200.0, + "grad_norm": 1.949465089921535, + "language_loss": 0.80342251, + "learning_rate": 3.705539729936701e-06, + "loss": 0.82458496, + "num_input_tokens_seen": 71692890, + "step": 3321, + "time_per_iteration": 2.7746965885162354 + }, + { + "auxiliary_loss_clip": 0.00992017, + "auxiliary_loss_mlp": 0.01005598, + "balance_loss_clip": 1.00426364, + "balance_loss_mlp": 1.00326145, + "epoch": 0.19972944536299414, + "flos": 54082117745280.0, + "grad_norm": 0.8614871907723581, + "language_loss": 0.65172756, + "learning_rate": 3.7053362868846696e-06, + "loss": 0.6717037, + "num_input_tokens_seen": 71745815, + "step": 3322, + "time_per_iteration": 3.001962184906006 + }, + { + "auxiliary_loss_clip": 0.01014091, + "auxiliary_loss_mlp": 0.0102321, + "balance_loss_clip": 1.01569879, + "balance_loss_mlp": 1.01999092, + "epoch": 0.1997895686156621, + "flos": 69355031817600.0, + "grad_norm": 0.7986868906994548, + "language_loss": 0.57050276, + "learning_rate": 3.7051327791661153e-06, + "loss": 0.59087574, + "num_input_tokens_seen": 71806915, + "step": 3323, + "time_per_iteration": 3.347712516784668 + }, + { + "auxiliary_loss_clip": 0.01085252, + "auxiliary_loss_mlp": 0.00748583, + "balance_loss_clip": 1.03553116, + "balance_loss_mlp": 1.00072765, + "epoch": 0.19984969186833007, + "flos": 18552063507840.0, + "grad_norm": 2.1330524792655257, + "language_loss": 0.80806231, + "learning_rate": 3.7049292067887555e-06, + "loss": 0.8264007, + "num_input_tokens_seen": 71824645, + "step": 3324, + "time_per_iteration": 2.602530002593994 + }, + { + "auxiliary_loss_clip": 0.01083944, + "auxiliary_loss_mlp": 0.01042349, + "balance_loss_clip": 1.03009486, + "balance_loss_mlp": 1.02379954, + "epoch": 0.19990981512099804, + "flos": 26429678968320.0, + "grad_norm": 1.5932654675009084, + "language_loss": 0.53503031, + "learning_rate": 3.7047255697603092e-06, + "loss": 0.55629325, + "num_input_tokens_seen": 71845125, + "step": 3325, + "time_per_iteration": 2.642245054244995 + }, + { + "auxiliary_loss_clip": 0.01085394, + "auxiliary_loss_mlp": 0.01044995, + "balance_loss_clip": 1.03489518, + "balance_loss_mlp": 1.02829337, + "epoch": 0.19996993837366603, + "flos": 16325997010560.0, + "grad_norm": 2.600501324943444, + "language_loss": 0.86152351, + "learning_rate": 3.7045218680884984e-06, + "loss": 0.8828274, + "num_input_tokens_seen": 71863500, + "step": 3326, + "time_per_iteration": 2.634488821029663 + }, + { + "auxiliary_loss_clip": 0.01109241, + "auxiliary_loss_mlp": 0.0104438, + "balance_loss_clip": 1.04037499, + "balance_loss_mlp": 1.028597, + "epoch": 0.200030061626334, + "flos": 20844169159680.0, + "grad_norm": 1.8276274973442563, + "language_loss": 0.71696341, + "learning_rate": 3.7043181017810476e-06, + "loss": 0.73849964, + "num_input_tokens_seen": 71881845, + "step": 3327, + "time_per_iteration": 2.5579769611358643 + }, + { + "auxiliary_loss_clip": 0.01085269, + "auxiliary_loss_mlp": 0.01046374, + "balance_loss_clip": 1.03381515, + "balance_loss_mlp": 1.02848017, + "epoch": 0.20009018487900196, + "flos": 23762629198080.0, + "grad_norm": 1.8369786479233035, + "language_loss": 0.76319373, + "learning_rate": 3.7041142708456833e-06, + "loss": 0.7845102, + "num_input_tokens_seen": 71900940, + "step": 3328, + "time_per_iteration": 2.6488096714019775 + }, + { + "auxiliary_loss_clip": 0.01069286, + "auxiliary_loss_mlp": 0.0104072, + "balance_loss_clip": 1.03040624, + "balance_loss_mlp": 1.02530622, + "epoch": 0.20015030813166992, + "flos": 28111555440000.0, + "grad_norm": 1.7914634083899028, + "language_loss": 0.69573379, + "learning_rate": 3.7039103752901353e-06, + "loss": 0.71683389, + "num_input_tokens_seen": 71921925, + "step": 3329, + "time_per_iteration": 2.8077495098114014 + }, + { + "auxiliary_loss_clip": 0.01053149, + "auxiliary_loss_mlp": 0.01055849, + "balance_loss_clip": 1.03064561, + "balance_loss_mlp": 1.03559494, + "epoch": 0.2002104313843379, + "flos": 26067160955520.0, + "grad_norm": 1.6968319580210571, + "language_loss": 0.81193459, + "learning_rate": 3.7037064151221353e-06, + "loss": 0.83302462, + "num_input_tokens_seen": 71941855, + "step": 3330, + "time_per_iteration": 2.7446696758270264 + }, + { + "auxiliary_loss_clip": 0.01095679, + "auxiliary_loss_mlp": 0.01042681, + "balance_loss_clip": 1.03606558, + "balance_loss_mlp": 1.02595556, + "epoch": 0.20027055463700585, + "flos": 22966633854720.0, + "grad_norm": 2.111968037112584, + "language_loss": 0.76752698, + "learning_rate": 3.703502390349417e-06, + "loss": 0.78891057, + "num_input_tokens_seen": 71960915, + "step": 3331, + "time_per_iteration": 2.631920576095581 + }, + { + "auxiliary_loss_clip": 0.01043384, + "auxiliary_loss_mlp": 0.01055962, + "balance_loss_clip": 1.02824235, + "balance_loss_mlp": 1.0371747, + "epoch": 0.20033067788967382, + "flos": 17165660313600.0, + "grad_norm": 1.834888255475724, + "language_loss": 0.79619193, + "learning_rate": 3.7032983009797176e-06, + "loss": 0.8171854, + "num_input_tokens_seen": 71979220, + "step": 3332, + "time_per_iteration": 2.7175261974334717 + }, + { + "auxiliary_loss_clip": 0.01015859, + "auxiliary_loss_mlp": 0.01031754, + "balance_loss_clip": 1.00780034, + "balance_loss_mlp": 1.02933443, + "epoch": 0.2003908011423418, + "flos": 60825566292480.0, + "grad_norm": 0.9313406937300068, + "language_loss": 0.61966038, + "learning_rate": 3.703094147020776e-06, + "loss": 0.6401366, + "num_input_tokens_seen": 72033950, + "step": 3333, + "time_per_iteration": 3.0480990409851074 + }, + { + "auxiliary_loss_clip": 0.0107427, + "auxiliary_loss_mlp": 0.00748669, + "balance_loss_clip": 1.03443635, + "balance_loss_mlp": 1.00064814, + "epoch": 0.20045092439500978, + "flos": 24206234163840.0, + "grad_norm": 3.5257305384560236, + "language_loss": 0.81244135, + "learning_rate": 3.7028899284803334e-06, + "loss": 0.83067071, + "num_input_tokens_seen": 72051395, + "step": 3334, + "time_per_iteration": 2.7798967361450195 + }, + { + "auxiliary_loss_clip": 0.01054181, + "auxiliary_loss_mlp": 0.01048261, + "balance_loss_clip": 1.03179646, + "balance_loss_mlp": 1.02962828, + "epoch": 0.20051104764767774, + "flos": 29387605075200.0, + "grad_norm": 1.7773143403036769, + "language_loss": 0.74513233, + "learning_rate": 3.702685645366134e-06, + "loss": 0.76615673, + "num_input_tokens_seen": 72071305, + "step": 3335, + "time_per_iteration": 2.9625673294067383 + }, + { + "auxiliary_loss_clip": 0.01103519, + "auxiliary_loss_mlp": 0.01058192, + "balance_loss_clip": 1.04016519, + "balance_loss_mlp": 1.04103756, + "epoch": 0.2005711709003457, + "flos": 23513804709120.0, + "grad_norm": 1.619793793030427, + "language_loss": 0.79912031, + "learning_rate": 3.7024812976859243e-06, + "loss": 0.82073736, + "num_input_tokens_seen": 72090165, + "step": 3336, + "time_per_iteration": 2.665573835372925 + }, + { + "auxiliary_loss_clip": 0.01071068, + "auxiliary_loss_mlp": 0.01044807, + "balance_loss_clip": 1.03359473, + "balance_loss_mlp": 1.02581668, + "epoch": 0.20063129415301367, + "flos": 22523388024960.0, + "grad_norm": 1.9035987103555427, + "language_loss": 0.77872312, + "learning_rate": 3.7022768854474532e-06, + "loss": 0.79988194, + "num_input_tokens_seen": 72107210, + "step": 3337, + "time_per_iteration": 2.6547415256500244 + }, + { + "auxiliary_loss_clip": 0.01110726, + "auxiliary_loss_mlp": 0.01043306, + "balance_loss_clip": 1.03918338, + "balance_loss_mlp": 1.02439952, + "epoch": 0.20069141740568164, + "flos": 25958243940480.0, + "grad_norm": 2.5531955408834226, + "language_loss": 0.68882048, + "learning_rate": 3.7020724086584724e-06, + "loss": 0.71036077, + "num_input_tokens_seen": 72126315, + "step": 3338, + "time_per_iteration": 2.620541572570801 + }, + { + "auxiliary_loss_clip": 0.01067706, + "auxiliary_loss_mlp": 0.01050097, + "balance_loss_clip": 1.03305995, + "balance_loss_mlp": 1.03297853, + "epoch": 0.2007515406583496, + "flos": 24790608529920.0, + "grad_norm": 2.0813043868751437, + "language_loss": 0.68688762, + "learning_rate": 3.701867867326735e-06, + "loss": 0.70806563, + "num_input_tokens_seen": 72146470, + "step": 3339, + "time_per_iteration": 2.7047784328460693 + }, + { + "auxiliary_loss_clip": 0.01082364, + "auxiliary_loss_mlp": 0.01038571, + "balance_loss_clip": 1.04485822, + "balance_loss_mlp": 1.02170265, + "epoch": 0.2008116639110176, + "flos": 37925582123520.0, + "grad_norm": 2.2187873828952593, + "language_loss": 0.66699541, + "learning_rate": 3.7016632614599974e-06, + "loss": 0.68820471, + "num_input_tokens_seen": 72166600, + "step": 3340, + "time_per_iteration": 2.8924736976623535 + }, + { + "auxiliary_loss_clip": 0.01098079, + "auxiliary_loss_mlp": 0.01036483, + "balance_loss_clip": 1.03519249, + "balance_loss_mlp": 1.01863694, + "epoch": 0.20087178716368556, + "flos": 20740531443840.0, + "grad_norm": 2.4515751899678064, + "language_loss": 0.74306566, + "learning_rate": 3.701458591066019e-06, + "loss": 0.76441121, + "num_input_tokens_seen": 72185160, + "step": 3341, + "time_per_iteration": 4.262914657592773 + }, + { + "auxiliary_loss_clip": 0.01064647, + "auxiliary_loss_mlp": 0.01047524, + "balance_loss_clip": 1.03581071, + "balance_loss_mlp": 1.03040576, + "epoch": 0.20093191041635353, + "flos": 23842279607040.0, + "grad_norm": 2.317501787323118, + "language_loss": 0.71649802, + "learning_rate": 3.70125385615256e-06, + "loss": 0.73761976, + "num_input_tokens_seen": 72205160, + "step": 3342, + "time_per_iteration": 2.7583484649658203 + }, + { + "auxiliary_loss_clip": 0.01064273, + "auxiliary_loss_mlp": 0.01045177, + "balance_loss_clip": 1.03320575, + "balance_loss_mlp": 1.02838016, + "epoch": 0.2009920336690215, + "flos": 21792067119360.0, + "grad_norm": 2.365245008342798, + "language_loss": 0.71980184, + "learning_rate": 3.701049056727384e-06, + "loss": 0.74089628, + "num_input_tokens_seen": 72223555, + "step": 3343, + "time_per_iteration": 4.341833114624023 + }, + { + "auxiliary_loss_clip": 0.01068637, + "auxiliary_loss_mlp": 0.01042035, + "balance_loss_clip": 1.03175509, + "balance_loss_mlp": 1.0242846, + "epoch": 0.20105215692168946, + "flos": 26359222440960.0, + "grad_norm": 1.9568967772416974, + "language_loss": 0.81018066, + "learning_rate": 3.7008441927982574e-06, + "loss": 0.83128744, + "num_input_tokens_seen": 72242465, + "step": 3344, + "time_per_iteration": 2.655672550201416 + }, + { + "auxiliary_loss_clip": 0.01105147, + "auxiliary_loss_mlp": 0.01045113, + "balance_loss_clip": 1.03437495, + "balance_loss_mlp": 1.02865052, + "epoch": 0.20111228017435742, + "flos": 18807280617600.0, + "grad_norm": 2.2735121409993, + "language_loss": 0.83689773, + "learning_rate": 3.700639264372948e-06, + "loss": 0.85840034, + "num_input_tokens_seen": 72260655, + "step": 3345, + "time_per_iteration": 2.6278998851776123 + }, + { + "auxiliary_loss_clip": 0.01050025, + "auxiliary_loss_mlp": 0.01038442, + "balance_loss_clip": 1.03119075, + "balance_loss_mlp": 1.0233438, + "epoch": 0.20117240342702541, + "flos": 19975059682560.0, + "grad_norm": 1.7123470544601245, + "language_loss": 0.6785835, + "learning_rate": 3.7004342714592283e-06, + "loss": 0.69946814, + "num_input_tokens_seen": 72279055, + "step": 3346, + "time_per_iteration": 2.725142240524292 + }, + { + "auxiliary_loss_clip": 0.01063197, + "auxiliary_loss_mlp": 0.01046997, + "balance_loss_clip": 1.03001559, + "balance_loss_mlp": 1.03053451, + "epoch": 0.20123252667969338, + "flos": 23142703345920.0, + "grad_norm": 2.1426333295317064, + "language_loss": 0.73847222, + "learning_rate": 3.70022921406487e-06, + "loss": 0.75957417, + "num_input_tokens_seen": 72297895, + "step": 3347, + "time_per_iteration": 2.6316330432891846 + }, + { + "auxiliary_loss_clip": 0.01094814, + "auxiliary_loss_mlp": 0.01048423, + "balance_loss_clip": 1.03581083, + "balance_loss_mlp": 1.03289032, + "epoch": 0.20129264993236134, + "flos": 23221671396480.0, + "grad_norm": 1.860126090810896, + "language_loss": 0.87073851, + "learning_rate": 3.70002409219765e-06, + "loss": 0.89217091, + "num_input_tokens_seen": 72318385, + "step": 3348, + "time_per_iteration": 4.196394443511963 + }, + { + "auxiliary_loss_clip": 0.01055613, + "auxiliary_loss_mlp": 0.01039068, + "balance_loss_clip": 1.03062069, + "balance_loss_mlp": 1.02188957, + "epoch": 0.2013527731850293, + "flos": 21871466133120.0, + "grad_norm": 1.5662372368684456, + "language_loss": 0.70472509, + "learning_rate": 3.699818905865346e-06, + "loss": 0.72567189, + "num_input_tokens_seen": 72338235, + "step": 3349, + "time_per_iteration": 2.7047767639160156 + }, + { + "auxiliary_loss_clip": 0.01075017, + "auxiliary_loss_mlp": 0.01047451, + "balance_loss_clip": 1.03490424, + "balance_loss_mlp": 1.02960563, + "epoch": 0.20141289643769728, + "flos": 18040803275520.0, + "grad_norm": 1.6070624620580443, + "language_loss": 0.7151736, + "learning_rate": 3.6996136550757377e-06, + "loss": 0.73639822, + "num_input_tokens_seen": 72357825, + "step": 3350, + "time_per_iteration": 2.6094818115234375 + }, + { + "auxiliary_loss_clip": 0.01077141, + "auxiliary_loss_mlp": 0.01044434, + "balance_loss_clip": 1.03332615, + "balance_loss_mlp": 1.02505004, + "epoch": 0.20147301969036524, + "flos": 23951412103680.0, + "grad_norm": 3.170591745964458, + "language_loss": 0.76782435, + "learning_rate": 3.69940833983661e-06, + "loss": 0.78904009, + "num_input_tokens_seen": 72376335, + "step": 3351, + "time_per_iteration": 2.7097244262695312 + }, + { + "auxiliary_loss_clip": 0.01084362, + "auxiliary_loss_mlp": 0.01044517, + "balance_loss_clip": 1.03318417, + "balance_loss_mlp": 1.02682662, + "epoch": 0.2015331429430332, + "flos": 25588471380480.0, + "grad_norm": 1.7231311587212954, + "language_loss": 0.80336177, + "learning_rate": 3.699202960155748e-06, + "loss": 0.82465059, + "num_input_tokens_seen": 72395440, + "step": 3352, + "time_per_iteration": 2.6459882259368896 + }, + { + "auxiliary_loss_clip": 0.01094347, + "auxiliary_loss_mlp": 0.01038574, + "balance_loss_clip": 1.03521419, + "balance_loss_mlp": 1.02178907, + "epoch": 0.2015932661957012, + "flos": 26724972677760.0, + "grad_norm": 2.3662974895398246, + "language_loss": 0.80533022, + "learning_rate": 3.6989975160409396e-06, + "loss": 0.82665944, + "num_input_tokens_seen": 72414670, + "step": 3353, + "time_per_iteration": 2.670957326889038 + }, + { + "auxiliary_loss_clip": 0.01078605, + "auxiliary_loss_mlp": 0.01041152, + "balance_loss_clip": 1.03291321, + "balance_loss_mlp": 1.02492785, + "epoch": 0.20165338944836916, + "flos": 15633136592640.0, + "grad_norm": 2.781349380507882, + "language_loss": 0.89789301, + "learning_rate": 3.6987920074999747e-06, + "loss": 0.91909063, + "num_input_tokens_seen": 72432210, + "step": 3354, + "time_per_iteration": 2.588850975036621 + }, + { + "auxiliary_loss_clip": 0.01005915, + "auxiliary_loss_mlp": 0.00748494, + "balance_loss_clip": 1.00753486, + "balance_loss_mlp": 1.00123549, + "epoch": 0.20171351270103713, + "flos": 57912529207680.0, + "grad_norm": 0.8297465769169953, + "language_loss": 0.55913162, + "learning_rate": 3.6985864345406465e-06, + "loss": 0.57667577, + "num_input_tokens_seen": 72489225, + "step": 3355, + "time_per_iteration": 3.204645872116089 + }, + { + "auxiliary_loss_clip": 0.01080525, + "auxiliary_loss_mlp": 0.00748642, + "balance_loss_clip": 1.03473806, + "balance_loss_mlp": 1.00069165, + "epoch": 0.2017736359537051, + "flos": 20814363849600.0, + "grad_norm": 2.1351855529905786, + "language_loss": 0.84301162, + "learning_rate": 3.698380797170751e-06, + "loss": 0.86130321, + "num_input_tokens_seen": 72508715, + "step": 3356, + "time_per_iteration": 2.782589912414551 + }, + { + "auxiliary_loss_clip": 0.01072118, + "auxiliary_loss_mlp": 0.01045274, + "balance_loss_clip": 1.03169382, + "balance_loss_mlp": 1.02382779, + "epoch": 0.20183375920637306, + "flos": 17092043389440.0, + "grad_norm": 2.5132282487931583, + "language_loss": 0.69841343, + "learning_rate": 3.698175095398085e-06, + "loss": 0.71958733, + "num_input_tokens_seen": 72525135, + "step": 3357, + "time_per_iteration": 2.7259678840637207 + }, + { + "auxiliary_loss_clip": 0.01085105, + "auxiliary_loss_mlp": 0.01041377, + "balance_loss_clip": 1.034307, + "balance_loss_mlp": 1.02385283, + "epoch": 0.20189388245904102, + "flos": 18661339658880.0, + "grad_norm": 1.863145263530741, + "language_loss": 0.71824175, + "learning_rate": 3.6979693292304493e-06, + "loss": 0.7395066, + "num_input_tokens_seen": 72543690, + "step": 3358, + "time_per_iteration": 2.6916561126708984 + }, + { + "auxiliary_loss_clip": 0.01090006, + "auxiliary_loss_mlp": 0.01046092, + "balance_loss_clip": 1.03249574, + "balance_loss_mlp": 1.03071427, + "epoch": 0.20195400571170902, + "flos": 16797539779200.0, + "grad_norm": 1.728516762637495, + "language_loss": 0.83072788, + "learning_rate": 3.6977634986756463e-06, + "loss": 0.85208881, + "num_input_tokens_seen": 72560725, + "step": 3359, + "time_per_iteration": 2.66520094871521 + }, + { + "auxiliary_loss_clip": 0.01012054, + "auxiliary_loss_mlp": 0.01004383, + "balance_loss_clip": 1.0046556, + "balance_loss_mlp": 1.00150955, + "epoch": 0.20201412896437698, + "flos": 67174716268800.0, + "grad_norm": 0.7827516256338182, + "language_loss": 0.59016824, + "learning_rate": 3.697557603741482e-06, + "loss": 0.61033261, + "num_input_tokens_seen": 72621940, + "step": 3360, + "time_per_iteration": 3.289361000061035 + }, + { + "auxiliary_loss_clip": 0.01052792, + "auxiliary_loss_mlp": 0.01045271, + "balance_loss_clip": 1.03474903, + "balance_loss_mlp": 1.02732992, + "epoch": 0.20207425221704495, + "flos": 21325013550720.0, + "grad_norm": 3.875089606923427, + "language_loss": 0.6247381, + "learning_rate": 3.697351644435763e-06, + "loss": 0.64571869, + "num_input_tokens_seen": 72639135, + "step": 3361, + "time_per_iteration": 2.7718520164489746 + }, + { + "auxiliary_loss_clip": 0.01074701, + "auxiliary_loss_mlp": 0.01056563, + "balance_loss_clip": 1.0364418, + "balance_loss_mlp": 1.03896809, + "epoch": 0.2021343754697129, + "flos": 22527158952960.0, + "grad_norm": 1.889812307607499, + "language_loss": 0.75503188, + "learning_rate": 3.6971456207662993e-06, + "loss": 0.77634454, + "num_input_tokens_seen": 72658525, + "step": 3362, + "time_per_iteration": 2.7488081455230713 + }, + { + "auxiliary_loss_clip": 0.01088486, + "auxiliary_loss_mlp": 0.00748774, + "balance_loss_clip": 1.03422618, + "balance_loss_mlp": 1.0008719, + "epoch": 0.20219449872238088, + "flos": 19062785036160.0, + "grad_norm": 1.571585024245097, + "language_loss": 0.7642616, + "learning_rate": 3.6969395327409035e-06, + "loss": 0.78263426, + "num_input_tokens_seen": 72678085, + "step": 3363, + "time_per_iteration": 2.831935405731201 + }, + { + "auxiliary_loss_clip": 0.01092881, + "auxiliary_loss_mlp": 0.01043441, + "balance_loss_clip": 1.03301954, + "balance_loss_mlp": 1.02752614, + "epoch": 0.20225462197504884, + "flos": 24717027519360.0, + "grad_norm": 1.5443028339150382, + "language_loss": 0.75010383, + "learning_rate": 3.696733380367391e-06, + "loss": 0.77146703, + "num_input_tokens_seen": 72698695, + "step": 3364, + "time_per_iteration": 2.6485188007354736 + }, + { + "auxiliary_loss_clip": 0.01061912, + "auxiliary_loss_mlp": 0.01041619, + "balance_loss_clip": 1.03258705, + "balance_loss_mlp": 1.02365422, + "epoch": 0.2023147452277168, + "flos": 22018304931840.0, + "grad_norm": 2.3576169183758333, + "language_loss": 0.71281743, + "learning_rate": 3.6965271636535783e-06, + "loss": 0.73385274, + "num_input_tokens_seen": 72717880, + "step": 3365, + "time_per_iteration": 2.836880922317505 + }, + { + "auxiliary_loss_clip": 0.0106271, + "auxiliary_loss_mlp": 0.01044722, + "balance_loss_clip": 1.03389597, + "balance_loss_mlp": 1.0269835, + "epoch": 0.2023748684803848, + "flos": 17745365911680.0, + "grad_norm": 9.619542914834765, + "language_loss": 0.85317427, + "learning_rate": 3.696320882607286e-06, + "loss": 0.87424862, + "num_input_tokens_seen": 72736410, + "step": 3366, + "time_per_iteration": 2.6700289249420166 + }, + { + "auxiliary_loss_clip": 0.01065578, + "auxiliary_loss_mlp": 0.01040678, + "balance_loss_clip": 1.03177142, + "balance_loss_mlp": 1.02372622, + "epoch": 0.20243499173305277, + "flos": 31138932493440.0, + "grad_norm": 1.6163899447368608, + "language_loss": 0.69234765, + "learning_rate": 3.696114537236335e-06, + "loss": 0.71341026, + "num_input_tokens_seen": 72758295, + "step": 3367, + "time_per_iteration": 2.8838930130004883 + }, + { + "auxiliary_loss_clip": 0.01091227, + "auxiliary_loss_mlp": 0.01044221, + "balance_loss_clip": 1.03196383, + "balance_loss_mlp": 1.02386057, + "epoch": 0.20249511498572073, + "flos": 33839235279360.0, + "grad_norm": 2.4164010965649014, + "language_loss": 0.68373722, + "learning_rate": 3.6959081275485512e-06, + "loss": 0.70509171, + "num_input_tokens_seen": 72782495, + "step": 3368, + "time_per_iteration": 2.7278242111206055 + }, + { + "auxiliary_loss_clip": 0.01078458, + "auxiliary_loss_mlp": 0.01050471, + "balance_loss_clip": 1.03788877, + "balance_loss_mlp": 1.03284037, + "epoch": 0.2025552382383887, + "flos": 21215629658880.0, + "grad_norm": 1.602760131110158, + "language_loss": 0.77114689, + "learning_rate": 3.6957016535517615e-06, + "loss": 0.79243624, + "num_input_tokens_seen": 72801885, + "step": 3369, + "time_per_iteration": 2.7036406993865967 + }, + { + "auxiliary_loss_clip": 0.01081165, + "auxiliary_loss_mlp": 0.0104949, + "balance_loss_clip": 1.0314548, + "balance_loss_mlp": 1.03259754, + "epoch": 0.20261536149105666, + "flos": 14647388676480.0, + "grad_norm": 2.574888538687613, + "language_loss": 0.65195835, + "learning_rate": 3.695495115253795e-06, + "loss": 0.67326486, + "num_input_tokens_seen": 72816990, + "step": 3370, + "time_per_iteration": 2.6223084926605225 + }, + { + "auxiliary_loss_clip": 0.01014165, + "auxiliary_loss_mlp": 0.01001828, + "balance_loss_clip": 1.00606227, + "balance_loss_mlp": 0.9991104, + "epoch": 0.20267548474372463, + "flos": 66783649921920.0, + "grad_norm": 0.6746721438870348, + "language_loss": 0.58097488, + "learning_rate": 3.6952885126624834e-06, + "loss": 0.60113484, + "num_input_tokens_seen": 72879240, + "step": 3371, + "time_per_iteration": 3.3387680053710938 + }, + { + "auxiliary_loss_clip": 0.0106861, + "auxiliary_loss_mlp": 0.01038427, + "balance_loss_clip": 1.03076601, + "balance_loss_mlp": 1.0220592, + "epoch": 0.2027356079963926, + "flos": 24680793674880.0, + "grad_norm": 1.64220078968612, + "language_loss": 0.91534603, + "learning_rate": 3.6950818457856617e-06, + "loss": 0.93641639, + "num_input_tokens_seen": 72899030, + "step": 3372, + "time_per_iteration": 2.757596731185913 + }, + { + "auxiliary_loss_clip": 0.01086981, + "auxiliary_loss_mlp": 0.01049609, + "balance_loss_clip": 1.03315616, + "balance_loss_mlp": 1.0296061, + "epoch": 0.20279573124906058, + "flos": 26392762765440.0, + "grad_norm": 1.721143750940396, + "language_loss": 0.78835517, + "learning_rate": 3.694875114631167e-06, + "loss": 0.80972111, + "num_input_tokens_seen": 72919190, + "step": 3373, + "time_per_iteration": 2.690202236175537 + }, + { + "auxiliary_loss_clip": 0.01041561, + "auxiliary_loss_mlp": 0.01044831, + "balance_loss_clip": 1.02915573, + "balance_loss_mlp": 1.02551901, + "epoch": 0.20285585450172855, + "flos": 33799984692480.0, + "grad_norm": 1.9056257906597016, + "language_loss": 0.71148157, + "learning_rate": 3.6946683192068377e-06, + "loss": 0.73234546, + "num_input_tokens_seen": 72939720, + "step": 3374, + "time_per_iteration": 2.935732364654541 + }, + { + "auxiliary_loss_clip": 0.01000954, + "auxiliary_loss_mlp": 0.01002718, + "balance_loss_clip": 1.00370121, + "balance_loss_mlp": 0.99992859, + "epoch": 0.20291597775439651, + "flos": 71164823598720.0, + "grad_norm": 0.96421846210943, + "language_loss": 0.62458533, + "learning_rate": 3.694461459520516e-06, + "loss": 0.64462203, + "num_input_tokens_seen": 73000015, + "step": 3375, + "time_per_iteration": 3.2286150455474854 + }, + { + "auxiliary_loss_clip": 0.01103712, + "auxiliary_loss_mlp": 0.01045171, + "balance_loss_clip": 1.03562224, + "balance_loss_mlp": 1.02854145, + "epoch": 0.20297610100706448, + "flos": 19494287118720.0, + "grad_norm": 1.5121551666039788, + "language_loss": 0.82166791, + "learning_rate": 3.6942545355800463e-06, + "loss": 0.84315681, + "num_input_tokens_seen": 73017675, + "step": 3376, + "time_per_iteration": 2.7421720027923584 + }, + { + "auxiliary_loss_clip": 0.01090655, + "auxiliary_loss_mlp": 0.01039702, + "balance_loss_clip": 1.0329113, + "balance_loss_mlp": 1.02090228, + "epoch": 0.20303622425973245, + "flos": 25044245441280.0, + "grad_norm": 1.9677270313359292, + "language_loss": 0.81426442, + "learning_rate": 3.6940475473932743e-06, + "loss": 0.83556801, + "num_input_tokens_seen": 73036135, + "step": 3377, + "time_per_iteration": 2.6161112785339355 + }, + { + "auxiliary_loss_clip": 0.01079085, + "auxiliary_loss_mlp": 0.01044689, + "balance_loss_clip": 1.03398418, + "balance_loss_mlp": 1.02734423, + "epoch": 0.2030963475124004, + "flos": 21979988098560.0, + "grad_norm": 1.824258897253643, + "language_loss": 0.76292133, + "learning_rate": 3.69384049496805e-06, + "loss": 0.78415906, + "num_input_tokens_seen": 73054075, + "step": 3378, + "time_per_iteration": 2.7480721473693848 + }, + { + "auxiliary_loss_clip": 0.01043341, + "auxiliary_loss_mlp": 0.01047273, + "balance_loss_clip": 1.03417301, + "balance_loss_mlp": 1.02786541, + "epoch": 0.2031564707650684, + "flos": 19500392430720.0, + "grad_norm": 1.7967305321327651, + "language_loss": 0.79944628, + "learning_rate": 3.6936333783122242e-06, + "loss": 0.82035244, + "num_input_tokens_seen": 73073530, + "step": 3379, + "time_per_iteration": 2.7755026817321777 + }, + { + "auxiliary_loss_clip": 0.01089878, + "auxiliary_loss_mlp": 0.01036687, + "balance_loss_clip": 1.03469443, + "balance_loss_mlp": 1.02080798, + "epoch": 0.20321659401773637, + "flos": 22747075971840.0, + "grad_norm": 1.7129063968542155, + "language_loss": 0.86868364, + "learning_rate": 3.6934261974336505e-06, + "loss": 0.88994932, + "num_input_tokens_seen": 73092820, + "step": 3380, + "time_per_iteration": 2.683105707168579 + }, + { + "auxiliary_loss_clip": 0.01109006, + "auxiliary_loss_mlp": 0.01049762, + "balance_loss_clip": 1.04010439, + "balance_loss_mlp": 1.03216672, + "epoch": 0.20327671727040433, + "flos": 22455840499200.0, + "grad_norm": 3.762251473482563, + "language_loss": 0.74729633, + "learning_rate": 3.693218952340186e-06, + "loss": 0.768884, + "num_input_tokens_seen": 73113385, + "step": 3381, + "time_per_iteration": 2.7917518615722656 + }, + { + "auxiliary_loss_clip": 0.01066035, + "auxiliary_loss_mlp": 0.01051209, + "balance_loss_clip": 1.02992833, + "balance_loss_mlp": 1.03333998, + "epoch": 0.2033368405230723, + "flos": 19535010163200.0, + "grad_norm": 1.638875504452712, + "language_loss": 0.7929092, + "learning_rate": 3.6930116430396895e-06, + "loss": 0.81408167, + "num_input_tokens_seen": 73131195, + "step": 3382, + "time_per_iteration": 2.6264405250549316 + }, + { + "auxiliary_loss_clip": 0.01067723, + "auxiliary_loss_mlp": 0.0074893, + "balance_loss_clip": 1.03228617, + "balance_loss_mlp": 1.00093484, + "epoch": 0.20339696377574026, + "flos": 13809233744640.0, + "grad_norm": 1.7623214480993794, + "language_loss": 0.79777253, + "learning_rate": 3.6928042695400214e-06, + "loss": 0.81593907, + "num_input_tokens_seen": 73148850, + "step": 3383, + "time_per_iteration": 2.6607837677001953 + }, + { + "auxiliary_loss_clip": 0.01048695, + "auxiliary_loss_mlp": 0.01039297, + "balance_loss_clip": 1.0292052, + "balance_loss_mlp": 1.02171326, + "epoch": 0.20345708702840823, + "flos": 20339409288960.0, + "grad_norm": 1.8061808980272853, + "language_loss": 0.7454915, + "learning_rate": 3.6925968318490464e-06, + "loss": 0.76637137, + "num_input_tokens_seen": 73166775, + "step": 3384, + "time_per_iteration": 2.654067039489746 + }, + { + "auxiliary_loss_clip": 0.01099039, + "auxiliary_loss_mlp": 0.01044324, + "balance_loss_clip": 1.03541255, + "balance_loss_mlp": 1.02507138, + "epoch": 0.2035172102810762, + "flos": 20333950421760.0, + "grad_norm": 2.488075634260152, + "language_loss": 0.76730537, + "learning_rate": 3.6923893299746293e-06, + "loss": 0.78873903, + "num_input_tokens_seen": 73183215, + "step": 3385, + "time_per_iteration": 2.7511749267578125 + }, + { + "auxiliary_loss_clip": 0.01062778, + "auxiliary_loss_mlp": 0.01057013, + "balance_loss_clip": 1.03380239, + "balance_loss_mlp": 1.03875065, + "epoch": 0.2035773335337442, + "flos": 23330983461120.0, + "grad_norm": 1.6562946964206267, + "language_loss": 0.68572462, + "learning_rate": 3.692181763924639e-06, + "loss": 0.70692253, + "num_input_tokens_seen": 73203290, + "step": 3386, + "time_per_iteration": 2.9017062187194824 + }, + { + "auxiliary_loss_clip": 0.01056572, + "auxiliary_loss_mlp": 0.0105508, + "balance_loss_clip": 1.03233552, + "balance_loss_mlp": 1.0363524, + "epoch": 0.20363745678641215, + "flos": 28330287310080.0, + "grad_norm": 1.4191915205277827, + "language_loss": 0.80928421, + "learning_rate": 3.691974133706947e-06, + "loss": 0.83040071, + "num_input_tokens_seen": 73226185, + "step": 3387, + "time_per_iteration": 2.9127211570739746 + }, + { + "auxiliary_loss_clip": 0.01080126, + "auxiliary_loss_mlp": 0.01042364, + "balance_loss_clip": 1.03673792, + "balance_loss_mlp": 1.02448273, + "epoch": 0.20369758003908012, + "flos": 18915658928640.0, + "grad_norm": 2.232342964756414, + "language_loss": 0.79919326, + "learning_rate": 3.6917664393294262e-06, + "loss": 0.82041812, + "num_input_tokens_seen": 73243300, + "step": 3388, + "time_per_iteration": 4.392082929611206 + }, + { + "auxiliary_loss_clip": 0.0110864, + "auxiliary_loss_mlp": 0.01039558, + "balance_loss_clip": 1.03825831, + "balance_loss_mlp": 1.02152181, + "epoch": 0.20375770329174808, + "flos": 19206499351680.0, + "grad_norm": 2.2608540199426717, + "language_loss": 0.72153389, + "learning_rate": 3.6915586807999527e-06, + "loss": 0.74301589, + "num_input_tokens_seen": 73261490, + "step": 3389, + "time_per_iteration": 2.6804308891296387 + }, + { + "auxiliary_loss_clip": 0.01093021, + "auxiliary_loss_mlp": 0.0104553, + "balance_loss_clip": 1.03522205, + "balance_loss_mlp": 1.02842307, + "epoch": 0.20381782654441605, + "flos": 19391008538880.0, + "grad_norm": 1.7335401884765087, + "language_loss": 0.87212443, + "learning_rate": 3.691350858126404e-06, + "loss": 0.89350998, + "num_input_tokens_seen": 73280180, + "step": 3390, + "time_per_iteration": 2.7340004444122314 + }, + { + "auxiliary_loss_clip": 0.01074982, + "auxiliary_loss_mlp": 0.0104418, + "balance_loss_clip": 1.03399754, + "balance_loss_mlp": 1.02510631, + "epoch": 0.203877949797084, + "flos": 24827704300800.0, + "grad_norm": 1.8919602913363534, + "language_loss": 0.70941335, + "learning_rate": 3.691142971316662e-06, + "loss": 0.73060495, + "num_input_tokens_seen": 73300680, + "step": 3391, + "time_per_iteration": 4.420921325683594 + }, + { + "auxiliary_loss_clip": 0.01075396, + "auxiliary_loss_mlp": 0.01046847, + "balance_loss_clip": 1.03621733, + "balance_loss_mlp": 1.02983594, + "epoch": 0.20393807304975198, + "flos": 18003707504640.0, + "grad_norm": 2.1430301378292245, + "language_loss": 0.86272597, + "learning_rate": 3.6909350203786086e-06, + "loss": 0.88394845, + "num_input_tokens_seen": 73316760, + "step": 3392, + "time_per_iteration": 2.7444398403167725 + }, + { + "auxiliary_loss_clip": 0.01093899, + "auxiliary_loss_mlp": 0.01045689, + "balance_loss_clip": 1.0347904, + "balance_loss_mlp": 1.02885616, + "epoch": 0.20399819630241997, + "flos": 24206988349440.0, + "grad_norm": 1.3994856942366345, + "language_loss": 0.80788064, + "learning_rate": 3.69072700532013e-06, + "loss": 0.82927644, + "num_input_tokens_seen": 73339385, + "step": 3393, + "time_per_iteration": 2.6977930068969727 + }, + { + "auxiliary_loss_clip": 0.01068191, + "auxiliary_loss_mlp": 0.01041062, + "balance_loss_clip": 1.02977502, + "balance_loss_mlp": 1.02473044, + "epoch": 0.20405831955508794, + "flos": 20777124424320.0, + "grad_norm": 1.77063505920415, + "language_loss": 0.85863495, + "learning_rate": 3.6905189261491137e-06, + "loss": 0.87972748, + "num_input_tokens_seen": 73357235, + "step": 3394, + "time_per_iteration": 2.633598804473877 + }, + { + "auxiliary_loss_clip": 0.01093324, + "auxiliary_loss_mlp": 0.01045266, + "balance_loss_clip": 1.03691769, + "balance_loss_mlp": 1.02905369, + "epoch": 0.2041184428077559, + "flos": 15486908325120.0, + "grad_norm": 2.152754643387198, + "language_loss": 0.83687657, + "learning_rate": 3.69031078287345e-06, + "loss": 0.85826242, + "num_input_tokens_seen": 73374435, + "step": 3395, + "time_per_iteration": 5.923198223114014 + }, + { + "auxiliary_loss_clip": 0.0109604, + "auxiliary_loss_mlp": 0.01036167, + "balance_loss_clip": 1.03416538, + "balance_loss_mlp": 1.01855969, + "epoch": 0.20417856606042387, + "flos": 15588463052160.0, + "grad_norm": 2.17231175898879, + "language_loss": 0.83751059, + "learning_rate": 3.690102575501033e-06, + "loss": 0.85883266, + "num_input_tokens_seen": 73391025, + "step": 3396, + "time_per_iteration": 2.675570249557495 + }, + { + "auxiliary_loss_clip": 0.01064453, + "auxiliary_loss_mlp": 0.01041917, + "balance_loss_clip": 1.03191197, + "balance_loss_mlp": 1.02386856, + "epoch": 0.20423868931309183, + "flos": 24279348297600.0, + "grad_norm": 1.6492952676221122, + "language_loss": 0.77270055, + "learning_rate": 3.6898943040397556e-06, + "loss": 0.79376429, + "num_input_tokens_seen": 73409270, + "step": 3397, + "time_per_iteration": 2.6889779567718506 + }, + { + "auxiliary_loss_clip": 0.01082487, + "auxiliary_loss_mlp": 0.01045738, + "balance_loss_clip": 1.0349288, + "balance_loss_mlp": 1.03033662, + "epoch": 0.2042988125657598, + "flos": 18614870438400.0, + "grad_norm": 3.0616058212596315, + "language_loss": 0.87368977, + "learning_rate": 3.689685968497518e-06, + "loss": 0.89497197, + "num_input_tokens_seen": 73425225, + "step": 3398, + "time_per_iteration": 2.6224117279052734 + }, + { + "auxiliary_loss_clip": 0.01073029, + "auxiliary_loss_mlp": 0.01041775, + "balance_loss_clip": 1.03401637, + "balance_loss_mlp": 1.02437043, + "epoch": 0.2043589358184278, + "flos": 17851230270720.0, + "grad_norm": 2.0338433825452373, + "language_loss": 0.77767271, + "learning_rate": 3.6894775688822186e-06, + "loss": 0.79882073, + "num_input_tokens_seen": 73440940, + "step": 3399, + "time_per_iteration": 2.677757501602173 + }, + { + "auxiliary_loss_clip": 0.01094016, + "auxiliary_loss_mlp": 0.01041444, + "balance_loss_clip": 1.03449821, + "balance_loss_mlp": 1.0241344, + "epoch": 0.20441905907109575, + "flos": 21435223455360.0, + "grad_norm": 3.139274624880013, + "language_loss": 0.76687896, + "learning_rate": 3.6892691052017603e-06, + "loss": 0.78823364, + "num_input_tokens_seen": 73458805, + "step": 3400, + "time_per_iteration": 2.592064142227173 + }, + { + "auxiliary_loss_clip": 0.01072212, + "auxiliary_loss_mlp": 0.00748705, + "balance_loss_clip": 1.03509021, + "balance_loss_mlp": 1.00091696, + "epoch": 0.20447918232376372, + "flos": 27707703851520.0, + "grad_norm": 1.72512893219016, + "language_loss": 0.796009, + "learning_rate": 3.6890605774640487e-06, + "loss": 0.81421816, + "num_input_tokens_seen": 73479380, + "step": 3401, + "time_per_iteration": 2.723156690597534 + }, + { + "auxiliary_loss_clip": 0.01081649, + "auxiliary_loss_mlp": 0.0104512, + "balance_loss_clip": 1.03086948, + "balance_loss_mlp": 1.02709508, + "epoch": 0.20453930557643168, + "flos": 30524214113280.0, + "grad_norm": 1.7706431584368663, + "language_loss": 0.69793856, + "learning_rate": 3.688851985676991e-06, + "loss": 0.71920621, + "num_input_tokens_seen": 73505105, + "step": 3402, + "time_per_iteration": 2.7411506175994873 + }, + { + "auxiliary_loss_clip": 0.01069307, + "auxiliary_loss_mlp": 0.01038674, + "balance_loss_clip": 1.03314447, + "balance_loss_mlp": 1.02112651, + "epoch": 0.20459942882909965, + "flos": 18987767481600.0, + "grad_norm": 1.8422710648305538, + "language_loss": 0.80482787, + "learning_rate": 3.688643329848496e-06, + "loss": 0.82590771, + "num_input_tokens_seen": 73523700, + "step": 3403, + "time_per_iteration": 2.6792471408843994 + }, + { + "auxiliary_loss_clip": 0.01090481, + "auxiliary_loss_mlp": 0.01041265, + "balance_loss_clip": 1.03486013, + "balance_loss_mlp": 1.02467132, + "epoch": 0.20465955208176762, + "flos": 20339050152960.0, + "grad_norm": 2.452986593871692, + "language_loss": 0.83459604, + "learning_rate": 3.6884346099864772e-06, + "loss": 0.85591352, + "num_input_tokens_seen": 73542625, + "step": 3404, + "time_per_iteration": 2.656723976135254 + }, + { + "auxiliary_loss_clip": 0.01085424, + "auxiliary_loss_mlp": 0.01044503, + "balance_loss_clip": 1.0314045, + "balance_loss_mlp": 1.02775431, + "epoch": 0.20471967533443558, + "flos": 21251288885760.0, + "grad_norm": 2.2271438088053843, + "language_loss": 0.85861051, + "learning_rate": 3.6882258260988487e-06, + "loss": 0.87990975, + "num_input_tokens_seen": 73561450, + "step": 3405, + "time_per_iteration": 2.6969032287597656 + }, + { + "auxiliary_loss_clip": 0.01069257, + "auxiliary_loss_mlp": 0.01039751, + "balance_loss_clip": 1.03220177, + "balance_loss_mlp": 1.02328849, + "epoch": 0.20477979858710357, + "flos": 14501555458560.0, + "grad_norm": 2.0107850618693677, + "language_loss": 0.85074908, + "learning_rate": 3.6880169781935276e-06, + "loss": 0.87183917, + "num_input_tokens_seen": 73577155, + "step": 3406, + "time_per_iteration": 2.7257275581359863 + }, + { + "auxiliary_loss_clip": 0.01101859, + "auxiliary_loss_mlp": 0.01038281, + "balance_loss_clip": 1.03585553, + "balance_loss_mlp": 1.02271259, + "epoch": 0.20483992183977154, + "flos": 11400310085760.0, + "grad_norm": 2.2359470951931226, + "language_loss": 0.67857295, + "learning_rate": 3.6878080662784336e-06, + "loss": 0.6999743, + "num_input_tokens_seen": 73594900, + "step": 3407, + "time_per_iteration": 2.504708766937256 + }, + { + "auxiliary_loss_clip": 0.01099748, + "auxiliary_loss_mlp": 0.01043479, + "balance_loss_clip": 1.03282368, + "balance_loss_mlp": 1.02687275, + "epoch": 0.2049000450924395, + "flos": 19060271084160.0, + "grad_norm": 2.375780939186845, + "language_loss": 0.84460199, + "learning_rate": 3.6875990903614886e-06, + "loss": 0.86603421, + "num_input_tokens_seen": 73613810, + "step": 3408, + "time_per_iteration": 2.5931553840637207 + }, + { + "auxiliary_loss_clip": 0.01107686, + "auxiliary_loss_mlp": 0.01041692, + "balance_loss_clip": 1.03808618, + "balance_loss_mlp": 1.02490759, + "epoch": 0.20496016834510747, + "flos": 14574561851520.0, + "grad_norm": 2.4413023584334486, + "language_loss": 0.64692706, + "learning_rate": 3.6873900504506166e-06, + "loss": 0.66842085, + "num_input_tokens_seen": 73631495, + "step": 3409, + "time_per_iteration": 2.5371170043945312 + }, + { + "auxiliary_loss_clip": 0.01090583, + "auxiliary_loss_mlp": 0.01041422, + "balance_loss_clip": 1.03285885, + "balance_loss_mlp": 1.0250783, + "epoch": 0.20502029159777543, + "flos": 22126647329280.0, + "grad_norm": 2.1212146064456676, + "language_loss": 0.80290461, + "learning_rate": 3.687180946553745e-06, + "loss": 0.82422465, + "num_input_tokens_seen": 73652840, + "step": 3410, + "time_per_iteration": 2.587801933288574 + }, + { + "auxiliary_loss_clip": 0.01053698, + "auxiliary_loss_mlp": 0.01042923, + "balance_loss_clip": 1.03397071, + "balance_loss_mlp": 1.02671027, + "epoch": 0.2050804148504434, + "flos": 25367907916800.0, + "grad_norm": 2.6309439165299113, + "language_loss": 0.76167703, + "learning_rate": 3.686971778678803e-06, + "loss": 0.7826432, + "num_input_tokens_seen": 73672150, + "step": 3411, + "time_per_iteration": 2.84865140914917 + }, + { + "auxiliary_loss_clip": 0.01084293, + "auxiliary_loss_mlp": 0.01040454, + "balance_loss_clip": 1.03567672, + "balance_loss_mlp": 1.02395582, + "epoch": 0.2051405381031114, + "flos": 23620171858560.0, + "grad_norm": 1.9044196222606393, + "language_loss": 0.73560786, + "learning_rate": 3.686762546833722e-06, + "loss": 0.75685537, + "num_input_tokens_seen": 73691940, + "step": 3412, + "time_per_iteration": 2.649691343307495 + }, + { + "auxiliary_loss_clip": 0.01076704, + "auxiliary_loss_mlp": 0.01051646, + "balance_loss_clip": 1.03267574, + "balance_loss_mlp": 1.03316808, + "epoch": 0.20520066135577936, + "flos": 19565533745280.0, + "grad_norm": 2.182139664410313, + "language_loss": 0.77770901, + "learning_rate": 3.6865532510264362e-06, + "loss": 0.79899251, + "num_input_tokens_seen": 73709080, + "step": 3413, + "time_per_iteration": 2.66139554977417 + }, + { + "auxiliary_loss_clip": 0.01054568, + "auxiliary_loss_mlp": 0.01046828, + "balance_loss_clip": 1.03143489, + "balance_loss_mlp": 1.02981675, + "epoch": 0.20526078460844732, + "flos": 17676345928320.0, + "grad_norm": 1.8192104258168276, + "language_loss": 0.84994948, + "learning_rate": 3.6863438912648823e-06, + "loss": 0.87096345, + "num_input_tokens_seen": 73727670, + "step": 3414, + "time_per_iteration": 2.738478899002075 + }, + { + "auxiliary_loss_clip": 0.01092716, + "auxiliary_loss_mlp": 0.01037674, + "balance_loss_clip": 1.03493047, + "balance_loss_mlp": 1.02137196, + "epoch": 0.2053209078611153, + "flos": 21500328856320.0, + "grad_norm": 1.8774469155055769, + "language_loss": 0.80779833, + "learning_rate": 3.6861344675569986e-06, + "loss": 0.82910222, + "num_input_tokens_seen": 73747170, + "step": 3415, + "time_per_iteration": 2.793074131011963 + }, + { + "auxiliary_loss_clip": 0.01036334, + "auxiliary_loss_mlp": 0.01037017, + "balance_loss_clip": 1.03062308, + "balance_loss_mlp": 1.02210987, + "epoch": 0.20538103111378325, + "flos": 25663524848640.0, + "grad_norm": 1.964286156141792, + "language_loss": 0.73027235, + "learning_rate": 3.6859249799107275e-06, + "loss": 0.75100583, + "num_input_tokens_seen": 73767690, + "step": 3416, + "time_per_iteration": 2.775496482849121 + }, + { + "auxiliary_loss_clip": 0.01092535, + "auxiliary_loss_mlp": 0.01039547, + "balance_loss_clip": 1.03356504, + "balance_loss_mlp": 1.02307272, + "epoch": 0.20544115436645122, + "flos": 23148952312320.0, + "grad_norm": 2.3001261400514137, + "language_loss": 0.78593898, + "learning_rate": 3.6857154283340115e-06, + "loss": 0.8072598, + "num_input_tokens_seen": 73786900, + "step": 3417, + "time_per_iteration": 2.6615748405456543 + }, + { + "auxiliary_loss_clip": 0.01093717, + "auxiliary_loss_mlp": 0.01042868, + "balance_loss_clip": 1.03431845, + "balance_loss_mlp": 1.02569032, + "epoch": 0.20550127761911918, + "flos": 19390433921280.0, + "grad_norm": 2.2077745076845874, + "language_loss": 0.87717688, + "learning_rate": 3.685505812834798e-06, + "loss": 0.8985427, + "num_input_tokens_seen": 73804515, + "step": 3418, + "time_per_iteration": 2.5973942279815674 + }, + { + "auxiliary_loss_clip": 0.01089862, + "auxiliary_loss_mlp": 0.01038285, + "balance_loss_clip": 1.03942287, + "balance_loss_mlp": 1.02129769, + "epoch": 0.20556140087178718, + "flos": 22893124671360.0, + "grad_norm": 1.9206289714234974, + "language_loss": 0.6220386, + "learning_rate": 3.685296133421035e-06, + "loss": 0.64332008, + "num_input_tokens_seen": 73822910, + "step": 3419, + "time_per_iteration": 2.6267969608306885 + }, + { + "auxiliary_loss_clip": 0.01088642, + "auxiliary_loss_mlp": 0.0105031, + "balance_loss_clip": 1.03647661, + "balance_loss_mlp": 1.03158236, + "epoch": 0.20562152412445514, + "flos": 19789652655360.0, + "grad_norm": 2.36067490184312, + "language_loss": 0.86227345, + "learning_rate": 3.685086390100674e-06, + "loss": 0.88366294, + "num_input_tokens_seen": 73841160, + "step": 3420, + "time_per_iteration": 2.5872271060943604 + }, + { + "auxiliary_loss_clip": 0.01049549, + "auxiliary_loss_mlp": 0.00748915, + "balance_loss_clip": 1.02823734, + "balance_loss_mlp": 1.0009563, + "epoch": 0.2056816473771231, + "flos": 31501989210240.0, + "grad_norm": 2.192820930612507, + "language_loss": 0.71563846, + "learning_rate": 3.684876582881668e-06, + "loss": 0.73362315, + "num_input_tokens_seen": 73862795, + "step": 3421, + "time_per_iteration": 2.7394509315490723 + }, + { + "auxiliary_loss_clip": 0.01100261, + "auxiliary_loss_mlp": 0.01042298, + "balance_loss_clip": 1.03364396, + "balance_loss_mlp": 1.02526283, + "epoch": 0.20574177062979107, + "flos": 23258372117760.0, + "grad_norm": 2.639768007987311, + "language_loss": 0.70866954, + "learning_rate": 3.6846667117719732e-06, + "loss": 0.73009515, + "num_input_tokens_seen": 73881525, + "step": 3422, + "time_per_iteration": 2.560001850128174 + }, + { + "auxiliary_loss_clip": 0.01015184, + "auxiliary_loss_mlp": 0.01013666, + "balance_loss_clip": 1.00814569, + "balance_loss_mlp": 1.01123416, + "epoch": 0.20580189388245904, + "flos": 70312518708480.0, + "grad_norm": 0.7516494930535835, + "language_loss": 0.55467033, + "learning_rate": 3.684456776779548e-06, + "loss": 0.57495892, + "num_input_tokens_seen": 73937775, + "step": 3423, + "time_per_iteration": 3.2288687229156494 + }, + { + "auxiliary_loss_clip": 0.01060355, + "auxiliary_loss_mlp": 0.01040271, + "balance_loss_clip": 1.03321195, + "balance_loss_mlp": 1.02306914, + "epoch": 0.205862017135127, + "flos": 30737846252160.0, + "grad_norm": 1.8333285041468697, + "language_loss": 0.71373683, + "learning_rate": 3.684246777912353e-06, + "loss": 0.73474312, + "num_input_tokens_seen": 73958250, + "step": 3424, + "time_per_iteration": 2.7597413063049316 + }, + { + "auxiliary_loss_clip": 0.01071127, + "auxiliary_loss_mlp": 0.00748797, + "balance_loss_clip": 1.03528607, + "balance_loss_mlp": 1.0010426, + "epoch": 0.20592214038779497, + "flos": 21324546673920.0, + "grad_norm": 1.481337002006389, + "language_loss": 0.7516641, + "learning_rate": 3.684036715178351e-06, + "loss": 0.76986337, + "num_input_tokens_seen": 73977775, + "step": 3425, + "time_per_iteration": 2.6533827781677246 + }, + { + "auxiliary_loss_clip": 0.01060783, + "auxiliary_loss_mlp": 0.01052459, + "balance_loss_clip": 1.03698027, + "balance_loss_mlp": 1.0357573, + "epoch": 0.20598226364046296, + "flos": 22891652213760.0, + "grad_norm": 1.7784780307167274, + "language_loss": 0.88175118, + "learning_rate": 3.683826588585508e-06, + "loss": 0.90288359, + "num_input_tokens_seen": 73996590, + "step": 3426, + "time_per_iteration": 2.748080015182495 + }, + { + "auxiliary_loss_clip": 0.01094631, + "auxiliary_loss_mlp": 0.01039034, + "balance_loss_clip": 1.03931069, + "balance_loss_mlp": 1.02351332, + "epoch": 0.20604238689313092, + "flos": 23878549365120.0, + "grad_norm": 1.4647011847459552, + "language_loss": 0.77240837, + "learning_rate": 3.6836163981417926e-06, + "loss": 0.79374504, + "num_input_tokens_seen": 74015935, + "step": 3427, + "time_per_iteration": 2.8276922702789307 + }, + { + "auxiliary_loss_clip": 0.01107457, + "auxiliary_loss_mlp": 0.01044596, + "balance_loss_clip": 1.03875172, + "balance_loss_mlp": 1.02810311, + "epoch": 0.2061025101457989, + "flos": 22491535639680.0, + "grad_norm": 1.6802034586203487, + "language_loss": 0.73982871, + "learning_rate": 3.683406143855174e-06, + "loss": 0.7613492, + "num_input_tokens_seen": 74036575, + "step": 3428, + "time_per_iteration": 2.637951374053955 + }, + { + "auxiliary_loss_clip": 0.01079424, + "auxiliary_loss_mlp": 0.01038509, + "balance_loss_clip": 1.03125739, + "balance_loss_mlp": 1.02127159, + "epoch": 0.20616263339846685, + "flos": 22778928357120.0, + "grad_norm": 1.8754813302428364, + "language_loss": 0.73598427, + "learning_rate": 3.6831958257336256e-06, + "loss": 0.75716352, + "num_input_tokens_seen": 74055365, + "step": 3429, + "time_per_iteration": 2.6464321613311768 + }, + { + "auxiliary_loss_clip": 0.01087182, + "auxiliary_loss_mlp": 0.01043674, + "balance_loss_clip": 1.03821313, + "balance_loss_mlp": 1.02727056, + "epoch": 0.20622275665113482, + "flos": 20882198684160.0, + "grad_norm": 1.9530144048115818, + "language_loss": 0.85764468, + "learning_rate": 3.6829854437851237e-06, + "loss": 0.87895328, + "num_input_tokens_seen": 74074875, + "step": 3430, + "time_per_iteration": 2.655536413192749 + }, + { + "auxiliary_loss_clip": 0.01035472, + "auxiliary_loss_mlp": 0.01045273, + "balance_loss_clip": 1.03003716, + "balance_loss_mlp": 1.02777302, + "epoch": 0.20628287990380278, + "flos": 19354415558400.0, + "grad_norm": 1.6974597449653446, + "language_loss": 0.68891567, + "learning_rate": 3.6827749980176444e-06, + "loss": 0.70972311, + "num_input_tokens_seen": 74094505, + "step": 3431, + "time_per_iteration": 2.779843807220459 + }, + { + "auxiliary_loss_clip": 0.00993959, + "auxiliary_loss_mlp": 0.01006855, + "balance_loss_clip": 1.01425016, + "balance_loss_mlp": 1.0047096, + "epoch": 0.20634300315647078, + "flos": 71517932248320.0, + "grad_norm": 0.8073709170396804, + "language_loss": 0.60238171, + "learning_rate": 3.6825644884391693e-06, + "loss": 0.62238979, + "num_input_tokens_seen": 74158500, + "step": 3432, + "time_per_iteration": 3.4225411415100098 + }, + { + "auxiliary_loss_clip": 0.01097032, + "auxiliary_loss_mlp": 0.01040577, + "balance_loss_clip": 1.04011738, + "balance_loss_mlp": 1.02428091, + "epoch": 0.20640312640913874, + "flos": 21723944976000.0, + "grad_norm": 1.647609551187222, + "language_loss": 0.72305596, + "learning_rate": 3.682353915057679e-06, + "loss": 0.74443209, + "num_input_tokens_seen": 74176685, + "step": 3433, + "time_per_iteration": 2.6004221439361572 + }, + { + "auxiliary_loss_clip": 0.01042915, + "auxiliary_loss_mlp": 0.0104711, + "balance_loss_clip": 1.03031516, + "balance_loss_mlp": 1.02904963, + "epoch": 0.2064632496618067, + "flos": 20554621626240.0, + "grad_norm": 2.2073498287974793, + "language_loss": 0.86826229, + "learning_rate": 3.6821432778811604e-06, + "loss": 0.8891626, + "num_input_tokens_seen": 74194935, + "step": 3434, + "time_per_iteration": 2.7756905555725098 + }, + { + "auxiliary_loss_clip": 0.01098169, + "auxiliary_loss_mlp": 0.01037067, + "balance_loss_clip": 1.03538573, + "balance_loss_mlp": 1.0204258, + "epoch": 0.20652337291447467, + "flos": 29823273135360.0, + "grad_norm": 1.719872740582949, + "language_loss": 0.69671881, + "learning_rate": 3.6819325769176004e-06, + "loss": 0.71807122, + "num_input_tokens_seen": 74215400, + "step": 3435, + "time_per_iteration": 2.6499743461608887 + }, + { + "auxiliary_loss_clip": 0.01072012, + "auxiliary_loss_mlp": 0.01043022, + "balance_loss_clip": 1.03486919, + "balance_loss_mlp": 1.02574849, + "epoch": 0.20658349616714264, + "flos": 26213640618240.0, + "grad_norm": 1.6748455581616417, + "language_loss": 0.89365232, + "learning_rate": 3.681721812174988e-06, + "loss": 0.91480267, + "num_input_tokens_seen": 74234090, + "step": 3436, + "time_per_iteration": 4.309027671813965 + }, + { + "auxiliary_loss_clip": 0.01064423, + "auxiliary_loss_mlp": 0.01036751, + "balance_loss_clip": 1.03370023, + "balance_loss_mlp": 1.01928639, + "epoch": 0.2066436194198106, + "flos": 25994370044160.0, + "grad_norm": 1.604814256080503, + "language_loss": 0.77195919, + "learning_rate": 3.6815109836613163e-06, + "loss": 0.7929709, + "num_input_tokens_seen": 74253345, + "step": 3437, + "time_per_iteration": 2.7189338207244873 + }, + { + "auxiliary_loss_clip": 0.01091324, + "auxiliary_loss_mlp": 0.01038764, + "balance_loss_clip": 1.03357244, + "balance_loss_mlp": 1.02283716, + "epoch": 0.20670374267247857, + "flos": 21361067827200.0, + "grad_norm": 2.0657449736113027, + "language_loss": 0.77634108, + "learning_rate": 3.6813000913845795e-06, + "loss": 0.79764193, + "num_input_tokens_seen": 74271615, + "step": 3438, + "time_per_iteration": 4.182756423950195 + }, + { + "auxiliary_loss_clip": 0.01014195, + "auxiliary_loss_mlp": 0.01008493, + "balance_loss_clip": 1.00554299, + "balance_loss_mlp": 1.00567997, + "epoch": 0.20676386592514656, + "flos": 66383281952640.0, + "grad_norm": 0.8335342220051933, + "language_loss": 0.67049539, + "learning_rate": 3.6810891353527747e-06, + "loss": 0.69072223, + "num_input_tokens_seen": 74331390, + "step": 3439, + "time_per_iteration": 3.125953197479248 + }, + { + "auxiliary_loss_clip": 0.01096499, + "auxiliary_loss_mlp": 0.01038729, + "balance_loss_clip": 1.03600276, + "balance_loss_mlp": 1.02233756, + "epoch": 0.20682398917781453, + "flos": 17274577328640.0, + "grad_norm": 2.132915703535483, + "language_loss": 0.84211755, + "learning_rate": 3.6808781155739014e-06, + "loss": 0.86346984, + "num_input_tokens_seen": 74347335, + "step": 3440, + "time_per_iteration": 2.5719544887542725 + }, + { + "auxiliary_loss_clip": 0.01094847, + "auxiliary_loss_mlp": 0.01041909, + "balance_loss_clip": 1.03588533, + "balance_loss_mlp": 1.02564859, + "epoch": 0.2068841124304825, + "flos": 18077288515200.0, + "grad_norm": 2.4106364894655883, + "language_loss": 0.84540188, + "learning_rate": 3.6806670320559614e-06, + "loss": 0.86676937, + "num_input_tokens_seen": 74366310, + "step": 3441, + "time_per_iteration": 2.5805823802948 + }, + { + "auxiliary_loss_clip": 0.01058463, + "auxiliary_loss_mlp": 0.01041986, + "balance_loss_clip": 1.03372061, + "balance_loss_mlp": 1.02547562, + "epoch": 0.20694423568315046, + "flos": 27347017432320.0, + "grad_norm": 2.067829883615388, + "language_loss": 0.85852158, + "learning_rate": 3.680455884806959e-06, + "loss": 0.87952608, + "num_input_tokens_seen": 74387100, + "step": 3442, + "time_per_iteration": 4.393989324569702 + }, + { + "auxiliary_loss_clip": 0.01027333, + "auxiliary_loss_mlp": 0.01043738, + "balance_loss_clip": 1.03336155, + "balance_loss_mlp": 1.02659619, + "epoch": 0.20700435893581842, + "flos": 20229845829120.0, + "grad_norm": 2.65211977305794, + "language_loss": 0.72960162, + "learning_rate": 3.6802446738349014e-06, + "loss": 0.75031233, + "num_input_tokens_seen": 74404460, + "step": 3443, + "time_per_iteration": 4.366429090499878 + }, + { + "auxiliary_loss_clip": 0.01082547, + "auxiliary_loss_mlp": 0.00748752, + "balance_loss_clip": 1.03585029, + "balance_loss_mlp": 1.0009743, + "epoch": 0.2070644821884864, + "flos": 20631111638400.0, + "grad_norm": 1.8045652318675112, + "language_loss": 0.85559982, + "learning_rate": 3.680033399147797e-06, + "loss": 0.87391281, + "num_input_tokens_seen": 74423790, + "step": 3444, + "time_per_iteration": 2.703199625015259 + }, + { + "auxiliary_loss_clip": 0.00994112, + "auxiliary_loss_mlp": 0.01003226, + "balance_loss_clip": 1.01451254, + "balance_loss_mlp": 1.00068665, + "epoch": 0.20712460544115438, + "flos": 65941077617280.0, + "grad_norm": 0.6869359110710259, + "language_loss": 0.57121015, + "learning_rate": 3.6798220607536585e-06, + "loss": 0.59118354, + "num_input_tokens_seen": 74488130, + "step": 3445, + "time_per_iteration": 3.2185003757476807 + }, + { + "auxiliary_loss_clip": 0.01105764, + "auxiliary_loss_mlp": 0.00748848, + "balance_loss_clip": 1.03744292, + "balance_loss_mlp": 1.00106466, + "epoch": 0.20718472869382235, + "flos": 19425734012160.0, + "grad_norm": 1.5195922298544413, + "language_loss": 0.78322768, + "learning_rate": 3.6796106586604987e-06, + "loss": 0.80177379, + "num_input_tokens_seen": 74506720, + "step": 3446, + "time_per_iteration": 2.6060023307800293 + }, + { + "auxiliary_loss_clip": 0.01099187, + "auxiliary_loss_mlp": 0.01045578, + "balance_loss_clip": 1.03541398, + "balance_loss_mlp": 1.02569366, + "epoch": 0.2072448519464903, + "flos": 24499049834880.0, + "grad_norm": 1.847981977935163, + "language_loss": 0.62179816, + "learning_rate": 3.679399192876334e-06, + "loss": 0.64324582, + "num_input_tokens_seen": 74525330, + "step": 3447, + "time_per_iteration": 2.651954412460327 + }, + { + "auxiliary_loss_clip": 0.01049836, + "auxiliary_loss_mlp": 0.01059665, + "balance_loss_clip": 1.0311842, + "balance_loss_mlp": 1.04135418, + "epoch": 0.20730497519915828, + "flos": 23075694524160.0, + "grad_norm": 1.7107678486972102, + "language_loss": 0.86570162, + "learning_rate": 3.679187663409184e-06, + "loss": 0.88679659, + "num_input_tokens_seen": 74544535, + "step": 3448, + "time_per_iteration": 2.7527284622192383 + }, + { + "auxiliary_loss_clip": 0.01079836, + "auxiliary_loss_mlp": 0.01044706, + "balance_loss_clip": 1.03261995, + "balance_loss_mlp": 1.02557325, + "epoch": 0.20736509845182624, + "flos": 21069042255360.0, + "grad_norm": 2.0479061414154716, + "language_loss": 0.74786621, + "learning_rate": 3.6789760702670696e-06, + "loss": 0.76911163, + "num_input_tokens_seen": 74562300, + "step": 3449, + "time_per_iteration": 2.6721811294555664 + }, + { + "auxiliary_loss_clip": 0.0108793, + "auxiliary_loss_mlp": 0.01048445, + "balance_loss_clip": 1.03368974, + "balance_loss_mlp": 1.02958596, + "epoch": 0.2074252217044942, + "flos": 17633288499840.0, + "grad_norm": 2.852158208954475, + "language_loss": 0.76766372, + "learning_rate": 3.6787644134580134e-06, + "loss": 0.78902751, + "num_input_tokens_seen": 74580080, + "step": 3450, + "time_per_iteration": 2.6524157524108887 + }, + { + "auxiliary_loss_clip": 0.01077385, + "auxiliary_loss_mlp": 0.01047003, + "balance_loss_clip": 1.03495932, + "balance_loss_mlp": 1.02971721, + "epoch": 0.20748534495716217, + "flos": 23546985897600.0, + "grad_norm": 1.5236776441512005, + "language_loss": 0.82390308, + "learning_rate": 3.6785526929900436e-06, + "loss": 0.84514695, + "num_input_tokens_seen": 74598980, + "step": 3451, + "time_per_iteration": 2.7199201583862305 + }, + { + "auxiliary_loss_clip": 0.01022214, + "auxiliary_loss_mlp": 0.01002395, + "balance_loss_clip": 1.00423551, + "balance_loss_mlp": 0.99937916, + "epoch": 0.20754546820983016, + "flos": 52252935598080.0, + "grad_norm": 0.7915262258132928, + "language_loss": 0.56612569, + "learning_rate": 3.6783409088711875e-06, + "loss": 0.58637178, + "num_input_tokens_seen": 74655275, + "step": 3452, + "time_per_iteration": 3.0614216327667236 + }, + { + "auxiliary_loss_clip": 0.01075459, + "auxiliary_loss_mlp": 0.00749016, + "balance_loss_clip": 1.03393769, + "balance_loss_mlp": 1.00113964, + "epoch": 0.20760559146249813, + "flos": 20412379768320.0, + "grad_norm": 1.9699632213617646, + "language_loss": 0.8859266, + "learning_rate": 3.6781290611094755e-06, + "loss": 0.90417129, + "num_input_tokens_seen": 74674560, + "step": 3453, + "time_per_iteration": 2.726529598236084 + }, + { + "auxiliary_loss_clip": 0.01096035, + "auxiliary_loss_mlp": 0.01041027, + "balance_loss_clip": 1.03728175, + "balance_loss_mlp": 1.02268028, + "epoch": 0.2076657147151661, + "flos": 23186012169600.0, + "grad_norm": 1.5985999309590633, + "language_loss": 0.79828149, + "learning_rate": 3.6779171497129407e-06, + "loss": 0.81965208, + "num_input_tokens_seen": 74694500, + "step": 3454, + "time_per_iteration": 2.7234861850738525 + }, + { + "auxiliary_loss_clip": 0.01049428, + "auxiliary_loss_mlp": 0.00748953, + "balance_loss_clip": 1.02848983, + "balance_loss_mlp": 1.00113583, + "epoch": 0.20772583796783406, + "flos": 18293219124480.0, + "grad_norm": 2.799098259131522, + "language_loss": 0.7753498, + "learning_rate": 3.6777051746896202e-06, + "loss": 0.79333359, + "num_input_tokens_seen": 74710485, + "step": 3455, + "time_per_iteration": 2.834995985031128 + }, + { + "auxiliary_loss_clip": 0.0106315, + "auxiliary_loss_mlp": 0.01047999, + "balance_loss_clip": 1.0303551, + "balance_loss_mlp": 1.03163791, + "epoch": 0.20778596122050202, + "flos": 17602800831360.0, + "grad_norm": 1.5777644487870177, + "language_loss": 0.80558372, + "learning_rate": 3.6774931360475516e-06, + "loss": 0.82669514, + "num_input_tokens_seen": 74727450, + "step": 3456, + "time_per_iteration": 2.697075605392456 + }, + { + "auxiliary_loss_clip": 0.01059118, + "auxiliary_loss_mlp": 0.00748931, + "balance_loss_clip": 1.03312743, + "balance_loss_mlp": 1.00109434, + "epoch": 0.20784608447317, + "flos": 23805578885760.0, + "grad_norm": 1.7566358835899187, + "language_loss": 0.77687216, + "learning_rate": 3.6772810337947745e-06, + "loss": 0.79495263, + "num_input_tokens_seen": 74746725, + "step": 3457, + "time_per_iteration": 2.71195125579834 + }, + { + "auxiliary_loss_clip": 0.01027387, + "auxiliary_loss_mlp": 0.01049048, + "balance_loss_clip": 1.02767026, + "balance_loss_mlp": 1.02917624, + "epoch": 0.20790620772583795, + "flos": 17639286071040.0, + "grad_norm": 2.5410878439070586, + "language_loss": 0.83593965, + "learning_rate": 3.677068867939333e-06, + "loss": 0.856704, + "num_input_tokens_seen": 74765255, + "step": 3458, + "time_per_iteration": 2.811741828918457 + }, + { + "auxiliary_loss_clip": 0.01089489, + "auxiliary_loss_mlp": 0.00748918, + "balance_loss_clip": 1.03384709, + "balance_loss_mlp": 1.00124812, + "epoch": 0.20796633097850595, + "flos": 27673481168640.0, + "grad_norm": 1.6808895865314772, + "language_loss": 0.75964165, + "learning_rate": 3.676856638489272e-06, + "loss": 0.77802569, + "num_input_tokens_seen": 74785710, + "step": 3459, + "time_per_iteration": 2.67746639251709 + }, + { + "auxiliary_loss_clip": 0.0104219, + "auxiliary_loss_mlp": 0.01037936, + "balance_loss_clip": 1.02933025, + "balance_loss_mlp": 1.02101994, + "epoch": 0.2080264542311739, + "flos": 19245606284160.0, + "grad_norm": 1.8855916002259103, + "language_loss": 0.77253854, + "learning_rate": 3.6766443454526382e-06, + "loss": 0.79333979, + "num_input_tokens_seen": 74804490, + "step": 3460, + "time_per_iteration": 2.7244715690612793 + }, + { + "auxiliary_loss_clip": 0.01041148, + "auxiliary_loss_mlp": 0.01046752, + "balance_loss_clip": 1.03067923, + "balance_loss_mlp": 1.02900171, + "epoch": 0.20808657748384188, + "flos": 27525924097920.0, + "grad_norm": 2.1429629618399613, + "language_loss": 0.75803506, + "learning_rate": 3.6764319888374836e-06, + "loss": 0.77891403, + "num_input_tokens_seen": 74826340, + "step": 3461, + "time_per_iteration": 2.812173366546631 + }, + { + "auxiliary_loss_clip": 0.010754, + "auxiliary_loss_mlp": 0.0104292, + "balance_loss_clip": 1.03141975, + "balance_loss_mlp": 1.02409697, + "epoch": 0.20814670073650984, + "flos": 26906931999360.0, + "grad_norm": 2.0439563949808206, + "language_loss": 0.88644773, + "learning_rate": 3.6762195686518604e-06, + "loss": 0.90763092, + "num_input_tokens_seen": 74844960, + "step": 3462, + "time_per_iteration": 2.6723756790161133 + }, + { + "auxiliary_loss_clip": 0.00986396, + "auxiliary_loss_mlp": 0.00748419, + "balance_loss_clip": 1.00774777, + "balance_loss_mlp": 1.00114882, + "epoch": 0.2082068239891778, + "flos": 70175735717760.0, + "grad_norm": 0.7602688440070539, + "language_loss": 0.59053242, + "learning_rate": 3.6760070849038226e-06, + "loss": 0.60788059, + "num_input_tokens_seen": 74909075, + "step": 3463, + "time_per_iteration": 3.379409074783325 + }, + { + "auxiliary_loss_clip": 0.01084411, + "auxiliary_loss_mlp": 0.01049113, + "balance_loss_clip": 1.03220129, + "balance_loss_mlp": 1.03088558, + "epoch": 0.20826694724184577, + "flos": 24608074590720.0, + "grad_norm": 2.572685518466759, + "language_loss": 0.66166449, + "learning_rate": 3.675794537601429e-06, + "loss": 0.68299973, + "num_input_tokens_seen": 74928125, + "step": 3464, + "time_per_iteration": 2.7102444171905518 + }, + { + "auxiliary_loss_clip": 0.01075268, + "auxiliary_loss_mlp": 0.01047735, + "balance_loss_clip": 1.03284287, + "balance_loss_mlp": 1.02907848, + "epoch": 0.20832707049451377, + "flos": 12892829034240.0, + "grad_norm": 1.8975494126099153, + "language_loss": 0.83748984, + "learning_rate": 3.6755819267527373e-06, + "loss": 0.85871994, + "num_input_tokens_seen": 74945090, + "step": 3465, + "time_per_iteration": 2.6429662704467773 + }, + { + "auxiliary_loss_clip": 0.0105001, + "auxiliary_loss_mlp": 0.01042189, + "balance_loss_clip": 1.0290606, + "balance_loss_mlp": 1.02465391, + "epoch": 0.20838719374718173, + "flos": 22198827709440.0, + "grad_norm": 2.026002273855868, + "language_loss": 0.81389064, + "learning_rate": 3.6753692523658113e-06, + "loss": 0.83481264, + "num_input_tokens_seen": 74963630, + "step": 3466, + "time_per_iteration": 2.7250118255615234 + }, + { + "auxiliary_loss_clip": 0.01096414, + "auxiliary_loss_mlp": 0.01040976, + "balance_loss_clip": 1.03863358, + "balance_loss_mlp": 1.02614617, + "epoch": 0.2084473169998497, + "flos": 15158648908800.0, + "grad_norm": 1.8052367072007762, + "language_loss": 0.82202816, + "learning_rate": 3.675156514448716e-06, + "loss": 0.84340203, + "num_input_tokens_seen": 74981875, + "step": 3467, + "time_per_iteration": 2.6303226947784424 + }, + { + "auxiliary_loss_clip": 0.0110559, + "auxiliary_loss_mlp": 0.01042788, + "balance_loss_clip": 1.03995633, + "balance_loss_mlp": 1.02675486, + "epoch": 0.20850744025251766, + "flos": 17456788045440.0, + "grad_norm": 1.9141363324189913, + "language_loss": 0.81758726, + "learning_rate": 3.674943713009518e-06, + "loss": 0.83907104, + "num_input_tokens_seen": 74999155, + "step": 3468, + "time_per_iteration": 2.6322011947631836 + }, + { + "auxiliary_loss_clip": 0.01098163, + "auxiliary_loss_mlp": 0.01049661, + "balance_loss_clip": 1.03750515, + "balance_loss_mlp": 1.03021753, + "epoch": 0.20856756350518563, + "flos": 25698968593920.0, + "grad_norm": 1.915135815214893, + "language_loss": 0.90086925, + "learning_rate": 3.6747308480562856e-06, + "loss": 0.92234749, + "num_input_tokens_seen": 75017850, + "step": 3469, + "time_per_iteration": 2.6359875202178955 + }, + { + "auxiliary_loss_clip": 0.01085825, + "auxiliary_loss_mlp": 0.01043618, + "balance_loss_clip": 1.0407393, + "balance_loss_mlp": 1.02605844, + "epoch": 0.2086276867578536, + "flos": 37889060970240.0, + "grad_norm": 2.0228883506838384, + "language_loss": 0.76712871, + "learning_rate": 3.674517919597092e-06, + "loss": 0.78842318, + "num_input_tokens_seen": 75039270, + "step": 3470, + "time_per_iteration": 2.7845327854156494 + }, + { + "auxiliary_loss_clip": 0.0108355, + "auxiliary_loss_mlp": 0.01044729, + "balance_loss_clip": 1.03606319, + "balance_loss_mlp": 1.02780139, + "epoch": 0.20868781001052156, + "flos": 25557049958400.0, + "grad_norm": 2.78848112155982, + "language_loss": 0.75666839, + "learning_rate": 3.674304927640011e-06, + "loss": 0.77795112, + "num_input_tokens_seen": 75059350, + "step": 3471, + "time_per_iteration": 2.6707544326782227 + }, + { + "auxiliary_loss_clip": 0.01074354, + "auxiliary_loss_mlp": 0.01045774, + "balance_loss_clip": 1.03108919, + "balance_loss_mlp": 1.02734458, + "epoch": 0.20874793326318955, + "flos": 27529192235520.0, + "grad_norm": 1.6718439869502935, + "language_loss": 0.7592206, + "learning_rate": 3.67409187219312e-06, + "loss": 0.78042185, + "num_input_tokens_seen": 75080150, + "step": 3472, + "time_per_iteration": 2.7593414783477783 + }, + { + "auxiliary_loss_clip": 0.01091578, + "auxiliary_loss_mlp": 0.01040826, + "balance_loss_clip": 1.03490794, + "balance_loss_mlp": 1.0243032, + "epoch": 0.20880805651585752, + "flos": 18548795370240.0, + "grad_norm": 1.8656909943688542, + "language_loss": 0.84228206, + "learning_rate": 3.6738787532644966e-06, + "loss": 0.8636061, + "num_input_tokens_seen": 75097920, + "step": 3473, + "time_per_iteration": 2.5989553928375244 + }, + { + "auxiliary_loss_clip": 0.01012658, + "auxiliary_loss_mlp": 0.01014545, + "balance_loss_clip": 1.02567387, + "balance_loss_mlp": 1.01125455, + "epoch": 0.20886817976852548, + "flos": 65946644225280.0, + "grad_norm": 0.8870902586351654, + "language_loss": 0.63620389, + "learning_rate": 3.6736655708622235e-06, + "loss": 0.6564759, + "num_input_tokens_seen": 75152410, + "step": 3474, + "time_per_iteration": 3.1773433685302734 + }, + { + "auxiliary_loss_clip": 0.01087467, + "auxiliary_loss_mlp": 0.01043178, + "balance_loss_clip": 1.03560019, + "balance_loss_mlp": 1.02590442, + "epoch": 0.20892830302119345, + "flos": 36539178929280.0, + "grad_norm": 4.56937281252521, + "language_loss": 0.70430481, + "learning_rate": 3.6734523249943844e-06, + "loss": 0.72561121, + "num_input_tokens_seen": 75173265, + "step": 3475, + "time_per_iteration": 2.952235221862793 + }, + { + "auxiliary_loss_clip": 0.01108797, + "auxiliary_loss_mlp": 0.01044614, + "balance_loss_clip": 1.03939605, + "balance_loss_mlp": 1.02784121, + "epoch": 0.2089884262738614, + "flos": 20956749361920.0, + "grad_norm": 1.5774657591730161, + "language_loss": 0.70288169, + "learning_rate": 3.673239015669065e-06, + "loss": 0.72441578, + "num_input_tokens_seen": 75193640, + "step": 3476, + "time_per_iteration": 2.5820200443267822 + }, + { + "auxiliary_loss_clip": 0.01084588, + "auxiliary_loss_mlp": 0.01042137, + "balance_loss_clip": 1.03621018, + "balance_loss_mlp": 1.02526867, + "epoch": 0.20904854952652938, + "flos": 22784028088320.0, + "grad_norm": 2.0335791916372195, + "language_loss": 0.88847125, + "learning_rate": 3.6730256428943544e-06, + "loss": 0.90973848, + "num_input_tokens_seen": 75212545, + "step": 3477, + "time_per_iteration": 2.6405959129333496 + }, + { + "auxiliary_loss_clip": 0.01045134, + "auxiliary_loss_mlp": 0.01046155, + "balance_loss_clip": 1.02965152, + "balance_loss_mlp": 1.02858996, + "epoch": 0.20910867277919734, + "flos": 27303277645440.0, + "grad_norm": 2.173126680228309, + "language_loss": 0.67915237, + "learning_rate": 3.672812206678344e-06, + "loss": 0.7000652, + "num_input_tokens_seen": 75230865, + "step": 3478, + "time_per_iteration": 2.7815375328063965 + }, + { + "auxiliary_loss_clip": 0.01041533, + "auxiliary_loss_mlp": 0.01046579, + "balance_loss_clip": 1.02683723, + "balance_loss_mlp": 1.02823257, + "epoch": 0.20916879603186533, + "flos": 14319237000960.0, + "grad_norm": 2.2187360165714107, + "language_loss": 0.84989488, + "learning_rate": 3.672598707029127e-06, + "loss": 0.870776, + "num_input_tokens_seen": 75248285, + "step": 3479, + "time_per_iteration": 2.647615432739258 + }, + { + "auxiliary_loss_clip": 0.01063503, + "auxiliary_loss_mlp": 0.01050136, + "balance_loss_clip": 1.03331137, + "balance_loss_mlp": 1.03169417, + "epoch": 0.2092289192845333, + "flos": 22273019251200.0, + "grad_norm": 2.0971992019013874, + "language_loss": 0.74193096, + "learning_rate": 3.6723851439548003e-06, + "loss": 0.76306736, + "num_input_tokens_seen": 75266310, + "step": 3480, + "time_per_iteration": 2.636239767074585 + }, + { + "auxiliary_loss_clip": 0.01062295, + "auxiliary_loss_mlp": 0.01039304, + "balance_loss_clip": 1.03273702, + "balance_loss_mlp": 1.0247848, + "epoch": 0.20928904253720126, + "flos": 14830712714880.0, + "grad_norm": 1.986477131978899, + "language_loss": 0.75820845, + "learning_rate": 3.67217151746346e-06, + "loss": 0.7792244, + "num_input_tokens_seen": 75284175, + "step": 3481, + "time_per_iteration": 2.6748948097229004 + }, + { + "auxiliary_loss_clip": 0.01040841, + "auxiliary_loss_mlp": 0.01042842, + "balance_loss_clip": 1.02997756, + "balance_loss_mlp": 1.02597356, + "epoch": 0.20934916578986923, + "flos": 23259162216960.0, + "grad_norm": 2.853388775188801, + "language_loss": 0.852606, + "learning_rate": 3.671957827563209e-06, + "loss": 0.87344277, + "num_input_tokens_seen": 75303465, + "step": 3482, + "time_per_iteration": 2.763129472732544 + }, + { + "auxiliary_loss_clip": 0.01050726, + "auxiliary_loss_mlp": 0.01040531, + "balance_loss_clip": 1.03260827, + "balance_loss_mlp": 1.02379358, + "epoch": 0.2094092890425372, + "flos": 32014398677760.0, + "grad_norm": 2.4443978929095556, + "language_loss": 0.71191126, + "learning_rate": 3.6717440742621494e-06, + "loss": 0.73282385, + "num_input_tokens_seen": 75325290, + "step": 3483, + "time_per_iteration": 4.487018346786499 + }, + { + "auxiliary_loss_clip": 0.01084556, + "auxiliary_loss_mlp": 0.01047624, + "balance_loss_clip": 1.03616273, + "balance_loss_mlp": 1.03024304, + "epoch": 0.20946941229520516, + "flos": 20010647082240.0, + "grad_norm": 1.785388387907995, + "language_loss": 0.74808121, + "learning_rate": 3.6715302575683865e-06, + "loss": 0.76940298, + "num_input_tokens_seen": 75343895, + "step": 3484, + "time_per_iteration": 4.241856098175049 + }, + { + "auxiliary_loss_clip": 0.01071362, + "auxiliary_loss_mlp": 0.01044132, + "balance_loss_clip": 1.03706324, + "balance_loss_mlp": 1.02639377, + "epoch": 0.20952953554787315, + "flos": 30740072895360.0, + "grad_norm": 1.641857156328715, + "language_loss": 0.70852554, + "learning_rate": 3.6713163774900292e-06, + "loss": 0.72968054, + "num_input_tokens_seen": 75367100, + "step": 3485, + "time_per_iteration": 2.7203528881073 + }, + { + "auxiliary_loss_clip": 0.01038188, + "auxiliary_loss_mlp": 0.00748866, + "balance_loss_clip": 1.03142989, + "balance_loss_mlp": 1.00112176, + "epoch": 0.20958965880054112, + "flos": 27049209770880.0, + "grad_norm": 1.9996546191180566, + "language_loss": 0.83319247, + "learning_rate": 3.6711024340351875e-06, + "loss": 0.85106301, + "num_input_tokens_seen": 75389925, + "step": 3486, + "time_per_iteration": 2.829118251800537 + }, + { + "auxiliary_loss_clip": 0.0109486, + "auxiliary_loss_mlp": 0.010481, + "balance_loss_clip": 1.0361886, + "balance_loss_mlp": 1.03203082, + "epoch": 0.20964978205320908, + "flos": 34204123589760.0, + "grad_norm": 2.15391494927714, + "language_loss": 0.87527943, + "learning_rate": 3.6708884272119737e-06, + "loss": 0.89670902, + "num_input_tokens_seen": 75408575, + "step": 3487, + "time_per_iteration": 2.716686725616455 + }, + { + "auxiliary_loss_clip": 0.01065612, + "auxiliary_loss_mlp": 0.01046557, + "balance_loss_clip": 1.03398371, + "balance_loss_mlp": 1.02917624, + "epoch": 0.20970990530587705, + "flos": 23477391296640.0, + "grad_norm": 2.061511983717983, + "language_loss": 0.72516727, + "learning_rate": 3.670674357028504e-06, + "loss": 0.7462889, + "num_input_tokens_seen": 75427155, + "step": 3488, + "time_per_iteration": 2.749114990234375 + }, + { + "auxiliary_loss_clip": 0.01083907, + "auxiliary_loss_mlp": 0.01039048, + "balance_loss_clip": 1.03950906, + "balance_loss_mlp": 1.02252555, + "epoch": 0.209770028558545, + "flos": 18551452976640.0, + "grad_norm": 2.1134230089056776, + "language_loss": 0.80786991, + "learning_rate": 3.6704602234928945e-06, + "loss": 0.82909948, + "num_input_tokens_seen": 75444450, + "step": 3489, + "time_per_iteration": 4.258174657821655 + }, + { + "auxiliary_loss_clip": 0.01107561, + "auxiliary_loss_mlp": 0.01040963, + "balance_loss_clip": 1.03809214, + "balance_loss_mlp": 1.02433348, + "epoch": 0.20983015181121298, + "flos": 21617003208960.0, + "grad_norm": 2.0098449317575633, + "language_loss": 0.72492105, + "learning_rate": 3.670246026613266e-06, + "loss": 0.74640632, + "num_input_tokens_seen": 75462625, + "step": 3490, + "time_per_iteration": 4.159834384918213 + }, + { + "auxiliary_loss_clip": 0.0108002, + "auxiliary_loss_mlp": 0.01047832, + "balance_loss_clip": 1.03628695, + "balance_loss_mlp": 1.03195381, + "epoch": 0.20989027506388094, + "flos": 16614718531200.0, + "grad_norm": 2.049266459978127, + "language_loss": 0.70312881, + "learning_rate": 3.6700317663977415e-06, + "loss": 0.72440732, + "num_input_tokens_seen": 75480640, + "step": 3491, + "time_per_iteration": 2.6376895904541016 + }, + { + "auxiliary_loss_clip": 0.01093747, + "auxiliary_loss_mlp": 0.0074896, + "balance_loss_clip": 1.03473306, + "balance_loss_mlp": 1.00111306, + "epoch": 0.20995039831654894, + "flos": 23216823060480.0, + "grad_norm": 4.684190663652907, + "language_loss": 0.79677689, + "learning_rate": 3.669817442854444e-06, + "loss": 0.81520402, + "num_input_tokens_seen": 75494900, + "step": 3492, + "time_per_iteration": 2.6611180305480957 + }, + { + "auxiliary_loss_clip": 0.01096362, + "auxiliary_loss_mlp": 0.00748951, + "balance_loss_clip": 1.03776276, + "balance_loss_mlp": 1.00114751, + "epoch": 0.2100105215692169, + "flos": 18147493647360.0, + "grad_norm": 2.0205286745238595, + "language_loss": 0.8665216, + "learning_rate": 3.669603055991502e-06, + "loss": 0.88497472, + "num_input_tokens_seen": 75513370, + "step": 3493, + "time_per_iteration": 2.643366813659668 + }, + { + "auxiliary_loss_clip": 0.01059503, + "auxiliary_loss_mlp": 0.01039904, + "balance_loss_clip": 1.02906966, + "balance_loss_mlp": 1.02341688, + "epoch": 0.21007064482188487, + "flos": 15961611490560.0, + "grad_norm": 1.653778309178185, + "language_loss": 0.69159538, + "learning_rate": 3.6693886058170455e-06, + "loss": 0.71258944, + "num_input_tokens_seen": 75532480, + "step": 3494, + "time_per_iteration": 2.6624667644500732 + }, + { + "auxiliary_loss_clip": 0.01096491, + "auxiliary_loss_mlp": 0.01039445, + "balance_loss_clip": 1.03528047, + "balance_loss_mlp": 1.02276778, + "epoch": 0.21013076807455283, + "flos": 32234315696640.0, + "grad_norm": 1.7876148350542298, + "language_loss": 0.78957093, + "learning_rate": 3.6691740923392053e-06, + "loss": 0.81093025, + "num_input_tokens_seen": 75552745, + "step": 3495, + "time_per_iteration": 2.7759270668029785 + }, + { + "auxiliary_loss_clip": 0.01070566, + "auxiliary_loss_mlp": 0.01043386, + "balance_loss_clip": 1.03093696, + "balance_loss_mlp": 1.02657735, + "epoch": 0.2101908913272208, + "flos": 23696625957120.0, + "grad_norm": 2.162954113336845, + "language_loss": 0.77561688, + "learning_rate": 3.668959515566116e-06, + "loss": 0.79675633, + "num_input_tokens_seen": 75574355, + "step": 3496, + "time_per_iteration": 2.7304975986480713 + }, + { + "auxiliary_loss_clip": 0.01088027, + "auxiliary_loss_mlp": 0.01046446, + "balance_loss_clip": 1.03555191, + "balance_loss_mlp": 1.02860069, + "epoch": 0.21025101457988876, + "flos": 20375786787840.0, + "grad_norm": 2.454458592501398, + "language_loss": 0.82440281, + "learning_rate": 3.668744875505915e-06, + "loss": 0.84574753, + "num_input_tokens_seen": 75592215, + "step": 3497, + "time_per_iteration": 2.69571852684021 + }, + { + "auxiliary_loss_clip": 0.01096577, + "auxiliary_loss_mlp": 0.01047151, + "balance_loss_clip": 1.03593087, + "balance_loss_mlp": 1.02973437, + "epoch": 0.21031113783255675, + "flos": 25775638174080.0, + "grad_norm": 2.3063923351859072, + "language_loss": 0.67642766, + "learning_rate": 3.668530172166741e-06, + "loss": 0.69786489, + "num_input_tokens_seen": 75610740, + "step": 3498, + "time_per_iteration": 2.6998541355133057 + }, + { + "auxiliary_loss_clip": 0.0107328, + "auxiliary_loss_mlp": 0.01038808, + "balance_loss_clip": 1.03268743, + "balance_loss_mlp": 1.02079582, + "epoch": 0.21037126108522472, + "flos": 22018197191040.0, + "grad_norm": 1.943267860211212, + "language_loss": 0.80759192, + "learning_rate": 3.6683154055567352e-06, + "loss": 0.82871282, + "num_input_tokens_seen": 75631005, + "step": 3499, + "time_per_iteration": 2.818979024887085 + }, + { + "auxiliary_loss_clip": 0.01095325, + "auxiliary_loss_mlp": 0.01043135, + "balance_loss_clip": 1.03690791, + "balance_loss_mlp": 1.0266726, + "epoch": 0.21043138433789269, + "flos": 25334403505920.0, + "grad_norm": 1.5813144764281861, + "language_loss": 0.78289461, + "learning_rate": 3.668100575684043e-06, + "loss": 0.80427921, + "num_input_tokens_seen": 75650655, + "step": 3500, + "time_per_iteration": 2.644179105758667 + }, + { + "auxiliary_loss_clip": 0.01082169, + "auxiliary_loss_mlp": 0.0104158, + "balance_loss_clip": 1.03348684, + "balance_loss_mlp": 1.02433062, + "epoch": 0.21049150759056065, + "flos": 25556654908800.0, + "grad_norm": 1.5747147124646852, + "language_loss": 0.73806, + "learning_rate": 3.6678856825568094e-06, + "loss": 0.75929749, + "num_input_tokens_seen": 75669895, + "step": 3501, + "time_per_iteration": 2.6400535106658936 + }, + { + "auxiliary_loss_clip": 0.01092418, + "auxiliary_loss_mlp": 0.01041193, + "balance_loss_clip": 1.03455949, + "balance_loss_mlp": 1.02383649, + "epoch": 0.21055163084322862, + "flos": 24495602129280.0, + "grad_norm": 2.5735821258556575, + "language_loss": 0.75333059, + "learning_rate": 3.667670726183183e-06, + "loss": 0.77466679, + "num_input_tokens_seen": 75689535, + "step": 3502, + "time_per_iteration": 2.5634729862213135 + }, + { + "auxiliary_loss_clip": 0.01048547, + "auxiliary_loss_mlp": 0.01040365, + "balance_loss_clip": 1.0308615, + "balance_loss_mlp": 1.02288866, + "epoch": 0.21061175409589658, + "flos": 25739045193600.0, + "grad_norm": 1.924994156545708, + "language_loss": 0.77239347, + "learning_rate": 3.667455706571316e-06, + "loss": 0.79328257, + "num_input_tokens_seen": 75709265, + "step": 3503, + "time_per_iteration": 2.7361135482788086 + }, + { + "auxiliary_loss_clip": 0.0105481, + "auxiliary_loss_mlp": 0.01045181, + "balance_loss_clip": 1.03404331, + "balance_loss_mlp": 1.02485633, + "epoch": 0.21067187734856455, + "flos": 18989168112000.0, + "grad_norm": 2.4200578013847367, + "language_loss": 0.78787935, + "learning_rate": 3.6672406237293617e-06, + "loss": 0.80887926, + "num_input_tokens_seen": 75727050, + "step": 3504, + "time_per_iteration": 2.7139179706573486 + }, + { + "auxiliary_loss_clip": 0.01075051, + "auxiliary_loss_mlp": 0.0104942, + "balance_loss_clip": 1.0334363, + "balance_loss_mlp": 1.03038216, + "epoch": 0.21073200060123254, + "flos": 24681368292480.0, + "grad_norm": 1.4530155605935169, + "language_loss": 0.76644278, + "learning_rate": 3.6670254776654754e-06, + "loss": 0.78768742, + "num_input_tokens_seen": 75747175, + "step": 3505, + "time_per_iteration": 2.73928165435791 + }, + { + "auxiliary_loss_clip": 0.01077457, + "auxiliary_loss_mlp": 0.01050354, + "balance_loss_clip": 1.03506088, + "balance_loss_mlp": 1.03330684, + "epoch": 0.2107921238539005, + "flos": 28549342402560.0, + "grad_norm": 1.7278396417946482, + "language_loss": 0.63835692, + "learning_rate": 3.6668102683878163e-06, + "loss": 0.65963507, + "num_input_tokens_seen": 75767690, + "step": 3506, + "time_per_iteration": 2.685103178024292 + }, + { + "auxiliary_loss_clip": 0.0109172, + "auxiliary_loss_mlp": 0.01048913, + "balance_loss_clip": 1.03392088, + "balance_loss_mlp": 1.03141284, + "epoch": 0.21085224710656847, + "flos": 25885848078720.0, + "grad_norm": 1.7655084774648633, + "language_loss": 0.81798053, + "learning_rate": 3.6665949959045443e-06, + "loss": 0.83938688, + "num_input_tokens_seen": 75787255, + "step": 3507, + "time_per_iteration": 2.6983208656311035 + }, + { + "auxiliary_loss_clip": 0.01092792, + "auxiliary_loss_mlp": 0.0104768, + "balance_loss_clip": 1.03368902, + "balance_loss_mlp": 1.03022802, + "epoch": 0.21091237035923643, + "flos": 14976294537600.0, + "grad_norm": 1.646704255406727, + "language_loss": 0.75625467, + "learning_rate": 3.666379660223824e-06, + "loss": 0.77765942, + "num_input_tokens_seen": 75805890, + "step": 3508, + "time_per_iteration": 2.6230814456939697 + }, + { + "auxiliary_loss_clip": 0.01107912, + "auxiliary_loss_mlp": 0.01039759, + "balance_loss_clip": 1.0363009, + "balance_loss_mlp": 1.02204454, + "epoch": 0.2109724936119044, + "flos": 16362518163840.0, + "grad_norm": 2.1864664390533437, + "language_loss": 0.84999281, + "learning_rate": 3.6661642613538192e-06, + "loss": 0.8714695, + "num_input_tokens_seen": 75821620, + "step": 3509, + "time_per_iteration": 2.4975669384002686 + }, + { + "auxiliary_loss_clip": 0.01075689, + "auxiliary_loss_mlp": 0.01044855, + "balance_loss_clip": 1.03535557, + "balance_loss_mlp": 1.02686584, + "epoch": 0.21103261686457236, + "flos": 31502492000640.0, + "grad_norm": 1.760703921374217, + "language_loss": 0.67759347, + "learning_rate": 3.6659487993026987e-06, + "loss": 0.69879889, + "num_input_tokens_seen": 75842490, + "step": 3510, + "time_per_iteration": 2.802562952041626 + }, + { + "auxiliary_loss_clip": 0.01106355, + "auxiliary_loss_mlp": 0.01037506, + "balance_loss_clip": 1.03459406, + "balance_loss_mlp": 1.02066159, + "epoch": 0.21109274011724033, + "flos": 27344072517120.0, + "grad_norm": 1.7673602568002804, + "language_loss": 0.72386384, + "learning_rate": 3.6657332740786327e-06, + "loss": 0.74530244, + "num_input_tokens_seen": 75865985, + "step": 3511, + "time_per_iteration": 2.6435132026672363 + }, + { + "auxiliary_loss_clip": 0.01027664, + "auxiliary_loss_mlp": 0.01037862, + "balance_loss_clip": 1.03068805, + "balance_loss_mlp": 1.01883614, + "epoch": 0.21115286336990832, + "flos": 17820383466240.0, + "grad_norm": 2.7465694932483617, + "language_loss": 0.68806511, + "learning_rate": 3.665517685689794e-06, + "loss": 0.70872039, + "num_input_tokens_seen": 75882745, + "step": 3512, + "time_per_iteration": 2.766188859939575 + }, + { + "auxiliary_loss_clip": 0.01094068, + "auxiliary_loss_mlp": 0.01044712, + "balance_loss_clip": 1.03347707, + "balance_loss_mlp": 1.02617526, + "epoch": 0.2112129866225763, + "flos": 27197987904000.0, + "grad_norm": 1.6948453558581338, + "language_loss": 0.72951984, + "learning_rate": 3.6653020341443584e-06, + "loss": 0.7509076, + "num_input_tokens_seen": 75904305, + "step": 3513, + "time_per_iteration": 2.6363019943237305 + }, + { + "auxiliary_loss_clip": 0.01074776, + "auxiliary_loss_mlp": 0.01036178, + "balance_loss_clip": 1.03323352, + "balance_loss_mlp": 1.01982248, + "epoch": 0.21127310987524425, + "flos": 23731279603200.0, + "grad_norm": 1.7925458260772846, + "language_loss": 0.74390936, + "learning_rate": 3.665086319450502e-06, + "loss": 0.76501888, + "num_input_tokens_seen": 75923710, + "step": 3514, + "time_per_iteration": 2.646941900253296 + }, + { + "auxiliary_loss_clip": 0.01093661, + "auxiliary_loss_mlp": 0.01037844, + "balance_loss_clip": 1.04029107, + "balance_loss_mlp": 1.02144098, + "epoch": 0.21133323312791222, + "flos": 18332505624960.0, + "grad_norm": 1.728891372330761, + "language_loss": 0.76607913, + "learning_rate": 3.6648705416164062e-06, + "loss": 0.78739417, + "num_input_tokens_seen": 75942625, + "step": 3515, + "time_per_iteration": 2.6296892166137695 + }, + { + "auxiliary_loss_clip": 0.01086142, + "auxiliary_loss_mlp": 0.01042463, + "balance_loss_clip": 1.036587, + "balance_loss_mlp": 1.02585709, + "epoch": 0.21139335638058018, + "flos": 17931203902080.0, + "grad_norm": 1.7961853593065336, + "language_loss": 0.68380195, + "learning_rate": 3.6646547006502518e-06, + "loss": 0.70508796, + "num_input_tokens_seen": 75959930, + "step": 3516, + "time_per_iteration": 2.674283504486084 + }, + { + "auxiliary_loss_clip": 0.01073628, + "auxiliary_loss_mlp": 0.01048591, + "balance_loss_clip": 1.03539085, + "balance_loss_mlp": 1.03067398, + "epoch": 0.21145347963324815, + "flos": 24572092141440.0, + "grad_norm": 1.844264738646571, + "language_loss": 0.85148847, + "learning_rate": 3.664438796560225e-06, + "loss": 0.8727107, + "num_input_tokens_seen": 75980335, + "step": 3517, + "time_per_iteration": 2.7655391693115234 + }, + { + "auxiliary_loss_clip": 0.01079525, + "auxiliary_loss_mlp": 0.01033622, + "balance_loss_clip": 1.03259873, + "balance_loss_mlp": 1.01695633, + "epoch": 0.21151360288591614, + "flos": 35845959375360.0, + "grad_norm": 1.9222926022356355, + "language_loss": 0.6280129, + "learning_rate": 3.664222829354512e-06, + "loss": 0.64914435, + "num_input_tokens_seen": 76002095, + "step": 3518, + "time_per_iteration": 2.8222551345825195 + }, + { + "auxiliary_loss_clip": 0.0104909, + "auxiliary_loss_mlp": 0.01052096, + "balance_loss_clip": 1.03402424, + "balance_loss_mlp": 1.03603804, + "epoch": 0.2115737261385841, + "flos": 24641579001600.0, + "grad_norm": 2.0409123038486605, + "language_loss": 0.89020562, + "learning_rate": 3.664006799041303e-06, + "loss": 0.91121745, + "num_input_tokens_seen": 76020425, + "step": 3519, + "time_per_iteration": 2.747157096862793 + }, + { + "auxiliary_loss_clip": 0.01085919, + "auxiliary_loss_mlp": 0.01050293, + "balance_loss_clip": 1.03455806, + "balance_loss_mlp": 1.03291178, + "epoch": 0.21163384939125207, + "flos": 25226887121280.0, + "grad_norm": 1.9049012350463352, + "language_loss": 0.81165504, + "learning_rate": 3.6637907056287886e-06, + "loss": 0.83301711, + "num_input_tokens_seen": 76041210, + "step": 3520, + "time_per_iteration": 2.6691110134124756 + }, + { + "auxiliary_loss_clip": 0.01080319, + "auxiliary_loss_mlp": 0.01045713, + "balance_loss_clip": 1.03441858, + "balance_loss_mlp": 1.02990532, + "epoch": 0.21169397264392004, + "flos": 26067520091520.0, + "grad_norm": 1.6200752092853792, + "language_loss": 0.75913358, + "learning_rate": 3.6635745491251642e-06, + "loss": 0.78039384, + "num_input_tokens_seen": 76062685, + "step": 3521, + "time_per_iteration": 2.6235830783843994 + }, + { + "auxiliary_loss_clip": 0.01049956, + "auxiliary_loss_mlp": 0.01036211, + "balance_loss_clip": 1.03222549, + "balance_loss_mlp": 1.02142954, + "epoch": 0.211754095896588, + "flos": 23108265181440.0, + "grad_norm": 2.0342983067406077, + "language_loss": 0.75649369, + "learning_rate": 3.663358329538626e-06, + "loss": 0.77735537, + "num_input_tokens_seen": 76082300, + "step": 3522, + "time_per_iteration": 2.8147501945495605 + }, + { + "auxiliary_loss_clip": 0.01104088, + "auxiliary_loss_mlp": 0.01049236, + "balance_loss_clip": 1.03572881, + "balance_loss_mlp": 1.03239226, + "epoch": 0.21181421914925597, + "flos": 27922341571200.0, + "grad_norm": 2.596407994609368, + "language_loss": 0.7000891, + "learning_rate": 3.663142046877374e-06, + "loss": 0.72162235, + "num_input_tokens_seen": 76101135, + "step": 3523, + "time_per_iteration": 2.6104047298431396 + }, + { + "auxiliary_loss_clip": 0.01087564, + "auxiliary_loss_mlp": 0.01045456, + "balance_loss_clip": 1.03383791, + "balance_loss_mlp": 1.02968478, + "epoch": 0.21187434240192393, + "flos": 17128636369920.0, + "grad_norm": 2.4274300572461343, + "language_loss": 0.77012742, + "learning_rate": 3.6629257011496085e-06, + "loss": 0.79145765, + "num_input_tokens_seen": 76119320, + "step": 3524, + "time_per_iteration": 2.5538852214813232 + }, + { + "auxiliary_loss_clip": 0.01083568, + "auxiliary_loss_mlp": 0.01041054, + "balance_loss_clip": 1.03210402, + "balance_loss_mlp": 1.0242933, + "epoch": 0.21193446565459192, + "flos": 22347318533760.0, + "grad_norm": 1.8509746190458756, + "language_loss": 0.81517947, + "learning_rate": 3.6627092923635338e-06, + "loss": 0.83642572, + "num_input_tokens_seen": 76137445, + "step": 3525, + "time_per_iteration": 2.6212921142578125 + }, + { + "auxiliary_loss_clip": 0.01040837, + "auxiliary_loss_mlp": 0.01044671, + "balance_loss_clip": 1.02893043, + "balance_loss_mlp": 1.02814853, + "epoch": 0.2119945889072599, + "flos": 27199316707200.0, + "grad_norm": 1.84509613349619, + "language_loss": 0.75184655, + "learning_rate": 3.662492820527356e-06, + "loss": 0.77270162, + "num_input_tokens_seen": 76159500, + "step": 3526, + "time_per_iteration": 2.779656171798706 + }, + { + "auxiliary_loss_clip": 0.01104708, + "auxiliary_loss_mlp": 0.01039448, + "balance_loss_clip": 1.03497887, + "balance_loss_mlp": 1.02277088, + "epoch": 0.21205471215992786, + "flos": 20991869884800.0, + "grad_norm": 2.1269214428438823, + "language_loss": 0.77364624, + "learning_rate": 3.662276285649284e-06, + "loss": 0.79508781, + "num_input_tokens_seen": 76177990, + "step": 3527, + "time_per_iteration": 2.5961387157440186 + }, + { + "auxiliary_loss_clip": 0.01104277, + "auxiliary_loss_mlp": 0.01047181, + "balance_loss_clip": 1.03568399, + "balance_loss_mlp": 1.02913249, + "epoch": 0.21211483541259582, + "flos": 20777663128320.0, + "grad_norm": 1.6106488289505978, + "language_loss": 0.77941501, + "learning_rate": 3.662059687737528e-06, + "loss": 0.80092955, + "num_input_tokens_seen": 76197125, + "step": 3528, + "time_per_iteration": 2.5903127193450928 + }, + { + "auxiliary_loss_clip": 0.01093757, + "auxiliary_loss_mlp": 0.01045198, + "balance_loss_clip": 1.03646588, + "balance_loss_mlp": 1.02908039, + "epoch": 0.21217495866526379, + "flos": 18989994124800.0, + "grad_norm": 1.9523003446599392, + "language_loss": 0.81898129, + "learning_rate": 3.6618430268003024e-06, + "loss": 0.84037083, + "num_input_tokens_seen": 76216215, + "step": 3529, + "time_per_iteration": 2.5682120323181152 + }, + { + "auxiliary_loss_clip": 0.01084795, + "auxiliary_loss_mlp": 0.00748873, + "balance_loss_clip": 1.03327692, + "balance_loss_mlp": 1.00107467, + "epoch": 0.21223508191793175, + "flos": 20667309569280.0, + "grad_norm": 2.1809390854914006, + "language_loss": 0.76537263, + "learning_rate": 3.6616263028458235e-06, + "loss": 0.78370935, + "num_input_tokens_seen": 76237010, + "step": 3530, + "time_per_iteration": 4.24065899848938 + }, + { + "auxiliary_loss_clip": 0.01103169, + "auxiliary_loss_mlp": 0.01043832, + "balance_loss_clip": 1.0361402, + "balance_loss_mlp": 1.02826357, + "epoch": 0.21229520517059972, + "flos": 21616464504960.0, + "grad_norm": 2.189917284114767, + "language_loss": 0.82893413, + "learning_rate": 3.661409515882308e-06, + "loss": 0.85040414, + "num_input_tokens_seen": 76255965, + "step": 3531, + "time_per_iteration": 4.246731996536255 + }, + { + "auxiliary_loss_clip": 0.01071953, + "auxiliary_loss_mlp": 0.0104042, + "balance_loss_clip": 1.03270459, + "balance_loss_mlp": 1.02280045, + "epoch": 0.2123553284232677, + "flos": 13991049411840.0, + "grad_norm": 3.1540761271976825, + "language_loss": 0.73426449, + "learning_rate": 3.661192665917977e-06, + "loss": 0.7553882, + "num_input_tokens_seen": 76272150, + "step": 3532, + "time_per_iteration": 2.617694139480591 + }, + { + "auxiliary_loss_clip": 0.01080652, + "auxiliary_loss_mlp": 0.01043674, + "balance_loss_clip": 1.03846717, + "balance_loss_mlp": 1.02668715, + "epoch": 0.21241545167593567, + "flos": 18296774570880.0, + "grad_norm": 1.6258155392252014, + "language_loss": 0.73573267, + "learning_rate": 3.660975752961054e-06, + "loss": 0.75697601, + "num_input_tokens_seen": 76291425, + "step": 3533, + "time_per_iteration": 2.7851996421813965 + }, + { + "auxiliary_loss_clip": 0.01098744, + "auxiliary_loss_mlp": 0.01041142, + "balance_loss_clip": 1.03654242, + "balance_loss_mlp": 1.02471495, + "epoch": 0.21247557492860364, + "flos": 34713121265280.0, + "grad_norm": 2.5707955731540784, + "language_loss": 0.71125484, + "learning_rate": 3.6607587770197634e-06, + "loss": 0.73265374, + "num_input_tokens_seen": 76313975, + "step": 3534, + "time_per_iteration": 2.6647534370422363 + }, + { + "auxiliary_loss_clip": 0.01085758, + "auxiliary_loss_mlp": 0.01040983, + "balance_loss_clip": 1.03589928, + "balance_loss_mlp": 1.02361441, + "epoch": 0.2125356981812716, + "flos": 22053820504320.0, + "grad_norm": 1.9948254760488486, + "language_loss": 0.71710229, + "learning_rate": 3.6605417381023346e-06, + "loss": 0.73836982, + "num_input_tokens_seen": 76330955, + "step": 3535, + "time_per_iteration": 2.6331028938293457 + }, + { + "auxiliary_loss_clip": 0.01090646, + "auxiliary_loss_mlp": 0.01047801, + "balance_loss_clip": 1.03419566, + "balance_loss_mlp": 1.0319581, + "epoch": 0.21259582143393957, + "flos": 28548336821760.0, + "grad_norm": 2.9755785363943628, + "language_loss": 0.70380229, + "learning_rate": 3.660324636216996e-06, + "loss": 0.72518671, + "num_input_tokens_seen": 76352680, + "step": 3536, + "time_per_iteration": 2.6839518547058105 + }, + { + "auxiliary_loss_clip": 0.01107319, + "auxiliary_loss_mlp": 0.0104356, + "balance_loss_clip": 1.03653812, + "balance_loss_mlp": 1.02626324, + "epoch": 0.21265594468660753, + "flos": 20120892900480.0, + "grad_norm": 2.2584585130877035, + "language_loss": 0.87727141, + "learning_rate": 3.660107471371981e-06, + "loss": 0.89878023, + "num_input_tokens_seen": 76370750, + "step": 3537, + "time_per_iteration": 5.670110464096069 + }, + { + "auxiliary_loss_clip": 0.01091881, + "auxiliary_loss_mlp": 0.00748778, + "balance_loss_clip": 1.03380668, + "balance_loss_mlp": 1.00097156, + "epoch": 0.21271606793927553, + "flos": 23076161400960.0, + "grad_norm": 1.6803945650605256, + "language_loss": 0.80406708, + "learning_rate": 3.659890243575524e-06, + "loss": 0.82247365, + "num_input_tokens_seen": 76390610, + "step": 3538, + "time_per_iteration": 2.563546895980835 + }, + { + "auxiliary_loss_clip": 0.01032455, + "auxiliary_loss_mlp": 0.01044396, + "balance_loss_clip": 1.02862656, + "balance_loss_mlp": 1.02734947, + "epoch": 0.2127761911919435, + "flos": 26388201738240.0, + "grad_norm": 1.8902036174882773, + "language_loss": 0.86855716, + "learning_rate": 3.659672952835863e-06, + "loss": 0.88932562, + "num_input_tokens_seen": 76408860, + "step": 3539, + "time_per_iteration": 2.719123601913452 + }, + { + "auxiliary_loss_clip": 0.01074136, + "auxiliary_loss_mlp": 0.01047295, + "balance_loss_clip": 1.03134489, + "balance_loss_mlp": 1.03097534, + "epoch": 0.21283631444461146, + "flos": 20228265630720.0, + "grad_norm": 2.1135809003062715, + "language_loss": 0.58098865, + "learning_rate": 3.659455599161237e-06, + "loss": 0.60220295, + "num_input_tokens_seen": 76424980, + "step": 3540, + "time_per_iteration": 2.5735745429992676 + }, + { + "auxiliary_loss_clip": 0.01104209, + "auxiliary_loss_mlp": 0.0103841, + "balance_loss_clip": 1.03504598, + "balance_loss_mlp": 1.02205443, + "epoch": 0.21289643769727942, + "flos": 13516992691200.0, + "grad_norm": 2.6203719268868637, + "language_loss": 0.75790155, + "learning_rate": 3.659238182559888e-06, + "loss": 0.77932775, + "num_input_tokens_seen": 76443135, + "step": 3541, + "time_per_iteration": 2.509526491165161 + }, + { + "auxiliary_loss_clip": 0.01061543, + "auxiliary_loss_mlp": 0.01045527, + "balance_loss_clip": 1.0328536, + "balance_loss_mlp": 1.02868223, + "epoch": 0.2129565609499474, + "flos": 24827021942400.0, + "grad_norm": 1.8452500582104014, + "language_loss": 0.69231117, + "learning_rate": 3.6590207030400615e-06, + "loss": 0.71338177, + "num_input_tokens_seen": 76462470, + "step": 3542, + "time_per_iteration": 2.704183578491211 + }, + { + "auxiliary_loss_clip": 0.0110214, + "auxiliary_loss_mlp": 0.01040525, + "balance_loss_clip": 1.03588831, + "balance_loss_mlp": 1.02470589, + "epoch": 0.21301668420261535, + "flos": 23659242877440.0, + "grad_norm": 1.9190987770170054, + "language_loss": 0.76459622, + "learning_rate": 3.658803160610004e-06, + "loss": 0.7860229, + "num_input_tokens_seen": 76481995, + "step": 3543, + "time_per_iteration": 2.6518354415893555 + }, + { + "auxiliary_loss_clip": 0.01080984, + "auxiliary_loss_mlp": 0.01036427, + "balance_loss_clip": 1.03493464, + "balance_loss_mlp": 1.02017903, + "epoch": 0.21307680745528332, + "flos": 16362805472640.0, + "grad_norm": 1.9550247561774683, + "language_loss": 0.66766202, + "learning_rate": 3.6585855552779634e-06, + "loss": 0.6888361, + "num_input_tokens_seen": 76500245, + "step": 3544, + "time_per_iteration": 2.673837661743164 + }, + { + "auxiliary_loss_clip": 0.01076387, + "auxiliary_loss_mlp": 0.01040861, + "balance_loss_clip": 1.03458238, + "balance_loss_mlp": 1.02520895, + "epoch": 0.2131369307079513, + "flos": 19099054794240.0, + "grad_norm": 1.7986636832808152, + "language_loss": 0.70490873, + "learning_rate": 3.6583678870521934e-06, + "loss": 0.72608125, + "num_input_tokens_seen": 76519535, + "step": 3545, + "time_per_iteration": 2.5974678993225098 + }, + { + "auxiliary_loss_clip": 0.01081636, + "auxiliary_loss_mlp": 0.01048597, + "balance_loss_clip": 1.03471899, + "balance_loss_mlp": 1.03215802, + "epoch": 0.21319705396061928, + "flos": 30372275583360.0, + "grad_norm": 1.694728213353448, + "language_loss": 0.72233891, + "learning_rate": 3.658150155940946e-06, + "loss": 0.74364126, + "num_input_tokens_seen": 76542065, + "step": 3546, + "time_per_iteration": 2.687530994415283 + }, + { + "auxiliary_loss_clip": 0.01062645, + "auxiliary_loss_mlp": 0.01043499, + "balance_loss_clip": 1.03379726, + "balance_loss_mlp": 1.02745342, + "epoch": 0.21325717721328724, + "flos": 21756192410880.0, + "grad_norm": 2.075296039304432, + "language_loss": 0.8027966, + "learning_rate": 3.657932361952479e-06, + "loss": 0.82385802, + "num_input_tokens_seen": 76560540, + "step": 3547, + "time_per_iteration": 2.8264284133911133 + }, + { + "auxiliary_loss_clip": 0.01106542, + "auxiliary_loss_mlp": 0.01041415, + "balance_loss_clip": 1.0351665, + "balance_loss_mlp": 1.02455831, + "epoch": 0.2133173004659552, + "flos": 28730870760960.0, + "grad_norm": 2.516816167587101, + "language_loss": 0.74285138, + "learning_rate": 3.6577145050950504e-06, + "loss": 0.76433098, + "num_input_tokens_seen": 76581760, + "step": 3548, + "time_per_iteration": 2.6386473178863525 + }, + { + "auxiliary_loss_clip": 0.0106767, + "auxiliary_loss_mlp": 0.01048905, + "balance_loss_clip": 1.03374016, + "balance_loss_mlp": 1.03061867, + "epoch": 0.21337742371862317, + "flos": 16837077674880.0, + "grad_norm": 1.9418790281255953, + "language_loss": 0.74071127, + "learning_rate": 3.657496585376922e-06, + "loss": 0.76187706, + "num_input_tokens_seen": 76599940, + "step": 3549, + "time_per_iteration": 2.680847406387329 + }, + { + "auxiliary_loss_clip": 0.01071853, + "auxiliary_loss_mlp": 0.01043864, + "balance_loss_clip": 1.03568482, + "balance_loss_mlp": 1.0270915, + "epoch": 0.21343754697129114, + "flos": 24424930120320.0, + "grad_norm": 1.6887350501473504, + "language_loss": 0.80580115, + "learning_rate": 3.657278602806357e-06, + "loss": 0.8269583, + "num_input_tokens_seen": 76619580, + "step": 3550, + "time_per_iteration": 2.71230149269104 + }, + { + "auxiliary_loss_clip": 0.01103571, + "auxiliary_loss_mlp": 0.01042841, + "balance_loss_clip": 1.03764319, + "balance_loss_mlp": 1.02728462, + "epoch": 0.21349767022395913, + "flos": 19277817805440.0, + "grad_norm": 1.6135020337100552, + "language_loss": 0.8800714, + "learning_rate": 3.657060557391621e-06, + "loss": 0.90153551, + "num_input_tokens_seen": 76638195, + "step": 3551, + "time_per_iteration": 2.658407211303711 + }, + { + "auxiliary_loss_clip": 0.01101241, + "auxiliary_loss_mlp": 0.01042748, + "balance_loss_clip": 1.03421855, + "balance_loss_mlp": 1.02615404, + "epoch": 0.2135577934766271, + "flos": 17347547808000.0, + "grad_norm": 2.4115287524459883, + "language_loss": 0.83221239, + "learning_rate": 3.656842449140983e-06, + "loss": 0.85365224, + "num_input_tokens_seen": 76656695, + "step": 3552, + "time_per_iteration": 2.628222703933716 + }, + { + "auxiliary_loss_clip": 0.01083448, + "auxiliary_loss_mlp": 0.01047795, + "balance_loss_clip": 1.03107953, + "balance_loss_mlp": 1.03073668, + "epoch": 0.21361791672929506, + "flos": 24057204635520.0, + "grad_norm": 1.6118215681176546, + "language_loss": 0.76287115, + "learning_rate": 3.656624278062713e-06, + "loss": 0.78418356, + "num_input_tokens_seen": 76677430, + "step": 3553, + "time_per_iteration": 2.6406595706939697 + }, + { + "auxiliary_loss_clip": 0.01091718, + "auxiliary_loss_mlp": 0.010411, + "balance_loss_clip": 1.03529656, + "balance_loss_mlp": 1.02590048, + "epoch": 0.21367803998196302, + "flos": 22162306556160.0, + "grad_norm": 1.6312554193785616, + "language_loss": 0.72274524, + "learning_rate": 3.6564060441650843e-06, + "loss": 0.74407339, + "num_input_tokens_seen": 76697615, + "step": 3554, + "time_per_iteration": 2.603689432144165 + }, + { + "auxiliary_loss_clip": 0.01044225, + "auxiliary_loss_mlp": 0.00748624, + "balance_loss_clip": 1.02964306, + "balance_loss_mlp": 1.00097191, + "epoch": 0.213738163234631, + "flos": 20886867452160.0, + "grad_norm": 1.907119711956455, + "language_loss": 0.68023366, + "learning_rate": 3.6561877474563724e-06, + "loss": 0.6981622, + "num_input_tokens_seen": 76715685, + "step": 3555, + "time_per_iteration": 2.842349052429199 + }, + { + "auxiliary_loss_clip": 0.01070899, + "auxiliary_loss_mlp": 0.01040786, + "balance_loss_clip": 1.03320336, + "balance_loss_mlp": 1.02381039, + "epoch": 0.21379828648729896, + "flos": 28403114135040.0, + "grad_norm": 1.9358833203927246, + "language_loss": 0.64764971, + "learning_rate": 3.6559693879448553e-06, + "loss": 0.66876656, + "num_input_tokens_seen": 76735405, + "step": 3556, + "time_per_iteration": 2.9192254543304443 + }, + { + "auxiliary_loss_clip": 0.0109342, + "auxiliary_loss_mlp": 0.01049189, + "balance_loss_clip": 1.03589559, + "balance_loss_mlp": 1.03171325, + "epoch": 0.21385840973996692, + "flos": 25479662106240.0, + "grad_norm": 1.6688541106496126, + "language_loss": 0.72763181, + "learning_rate": 3.6557509656388125e-06, + "loss": 0.74905783, + "num_input_tokens_seen": 76754395, + "step": 3557, + "time_per_iteration": 2.910963535308838 + }, + { + "auxiliary_loss_clip": 0.01094061, + "auxiliary_loss_mlp": 0.00748803, + "balance_loss_clip": 1.03936851, + "balance_loss_mlp": 1.00110483, + "epoch": 0.2139185329926349, + "flos": 28074280101120.0, + "grad_norm": 1.638411879895096, + "language_loss": 0.66844487, + "learning_rate": 3.655532480546528e-06, + "loss": 0.68687356, + "num_input_tokens_seen": 76777210, + "step": 3558, + "time_per_iteration": 2.919551372528076 + }, + { + "auxiliary_loss_clip": 0.0110835, + "auxiliary_loss_mlp": 0.01043455, + "balance_loss_clip": 1.0358187, + "balance_loss_mlp": 1.02674222, + "epoch": 0.21397865624530288, + "flos": 19608698914560.0, + "grad_norm": 1.8062870544206018, + "language_loss": 0.79375482, + "learning_rate": 3.655313932676286e-06, + "loss": 0.81527287, + "num_input_tokens_seen": 76795830, + "step": 3559, + "time_per_iteration": 2.7818093299865723 + }, + { + "auxiliary_loss_clip": 0.01102642, + "auxiliary_loss_mlp": 0.01043631, + "balance_loss_clip": 1.0340395, + "balance_loss_mlp": 1.02737093, + "epoch": 0.21403877949797084, + "flos": 24681476033280.0, + "grad_norm": 1.669211284666128, + "language_loss": 0.67513418, + "learning_rate": 3.655095322036373e-06, + "loss": 0.69659692, + "num_input_tokens_seen": 76814700, + "step": 3560, + "time_per_iteration": 2.767604351043701 + }, + { + "auxiliary_loss_clip": 0.01094709, + "auxiliary_loss_mlp": 0.01039466, + "balance_loss_clip": 1.03557527, + "balance_loss_mlp": 1.02303886, + "epoch": 0.2140989027506388, + "flos": 19861150677120.0, + "grad_norm": 2.0107274467306024, + "language_loss": 0.73103791, + "learning_rate": 3.65487664863508e-06, + "loss": 0.7523796, + "num_input_tokens_seen": 76833400, + "step": 3561, + "time_per_iteration": 2.831329584121704 + }, + { + "auxiliary_loss_clip": 0.01077819, + "auxiliary_loss_mlp": 0.01044356, + "balance_loss_clip": 1.03264284, + "balance_loss_mlp": 1.02742863, + "epoch": 0.21415902600330677, + "flos": 19135324552320.0, + "grad_norm": 2.669926069676961, + "language_loss": 0.77745837, + "learning_rate": 3.654657912480698e-06, + "loss": 0.79868019, + "num_input_tokens_seen": 76850645, + "step": 3562, + "time_per_iteration": 2.8181893825531006 + }, + { + "auxiliary_loss_clip": 0.01102913, + "auxiliary_loss_mlp": 0.01041698, + "balance_loss_clip": 1.03484249, + "balance_loss_mlp": 1.02535486, + "epoch": 0.21421914925597474, + "flos": 22272624201600.0, + "grad_norm": 1.5498169493662672, + "language_loss": 0.8438189, + "learning_rate": 3.6544391135815237e-06, + "loss": 0.86526501, + "num_input_tokens_seen": 76870135, + "step": 3563, + "time_per_iteration": 2.727949857711792 + }, + { + "auxiliary_loss_clip": 0.01103601, + "auxiliary_loss_mlp": 0.01034942, + "balance_loss_clip": 1.03573239, + "balance_loss_mlp": 1.01990342, + "epoch": 0.2142792725086427, + "flos": 33875109987840.0, + "grad_norm": 1.557430756169701, + "language_loss": 0.76625234, + "learning_rate": 3.6542202519458507e-06, + "loss": 0.78763777, + "num_input_tokens_seen": 76893905, + "step": 3564, + "time_per_iteration": 2.8902835845947266 + }, + { + "auxiliary_loss_clip": 0.01080697, + "auxiliary_loss_mlp": 0.01037225, + "balance_loss_clip": 1.03457367, + "balance_loss_mlp": 1.02146518, + "epoch": 0.2143393957613107, + "flos": 19860216923520.0, + "grad_norm": 1.7011683222505602, + "language_loss": 0.88271224, + "learning_rate": 3.654001327581981e-06, + "loss": 0.90389144, + "num_input_tokens_seen": 76914205, + "step": 3565, + "time_per_iteration": 2.876243829727173 + }, + { + "auxiliary_loss_clip": 0.01006213, + "auxiliary_loss_mlp": 0.01038069, + "balance_loss_clip": 1.00844038, + "balance_loss_mlp": 1.03564918, + "epoch": 0.21439951901397866, + "flos": 68530093090560.0, + "grad_norm": 0.8329099792471709, + "language_loss": 0.52198911, + "learning_rate": 3.653782340498215e-06, + "loss": 0.54243189, + "num_input_tokens_seen": 76975650, + "step": 3566, + "time_per_iteration": 3.2990596294403076 + }, + { + "auxiliary_loss_clip": 0.01088867, + "auxiliary_loss_mlp": 0.01035883, + "balance_loss_clip": 1.03407204, + "balance_loss_mlp": 1.02122021, + "epoch": 0.21445964226664663, + "flos": 19682998197120.0, + "grad_norm": 1.9190100396760428, + "language_loss": 0.67079747, + "learning_rate": 3.6535632907028566e-06, + "loss": 0.69204497, + "num_input_tokens_seen": 76992615, + "step": 3567, + "time_per_iteration": 2.7149605751037598 + }, + { + "auxiliary_loss_clip": 0.0106759, + "auxiliary_loss_mlp": 0.01045229, + "balance_loss_clip": 1.02981305, + "balance_loss_mlp": 1.02933884, + "epoch": 0.2145197655193146, + "flos": 31107259676160.0, + "grad_norm": 1.5356905706155937, + "language_loss": 0.74067879, + "learning_rate": 3.6533441782042126e-06, + "loss": 0.76180696, + "num_input_tokens_seen": 77017005, + "step": 3568, + "time_per_iteration": 2.7787585258483887 + }, + { + "auxiliary_loss_clip": 0.01090525, + "auxiliary_loss_mlp": 0.0104847, + "balance_loss_clip": 1.03386748, + "balance_loss_mlp": 1.03243685, + "epoch": 0.21457988877198256, + "flos": 20120785159680.0, + "grad_norm": 1.6883370748520339, + "language_loss": 0.77681589, + "learning_rate": 3.6531250030105917e-06, + "loss": 0.79820585, + "num_input_tokens_seen": 77034990, + "step": 3569, + "time_per_iteration": 2.683690309524536 + }, + { + "auxiliary_loss_clip": 0.0109523, + "auxiliary_loss_mlp": 0.01042004, + "balance_loss_clip": 1.03597307, + "balance_loss_mlp": 1.02401495, + "epoch": 0.21464001202465052, + "flos": 18588045957120.0, + "grad_norm": 2.38813572712963, + "language_loss": 0.69904804, + "learning_rate": 3.6529057651303053e-06, + "loss": 0.72042042, + "num_input_tokens_seen": 77052610, + "step": 3570, + "time_per_iteration": 2.638705253601074 + }, + { + "auxiliary_loss_clip": 0.01105981, + "auxiliary_loss_mlp": 0.01040084, + "balance_loss_clip": 1.03631234, + "balance_loss_mlp": 1.02337742, + "epoch": 0.21470013527731852, + "flos": 21835160461440.0, + "grad_norm": 2.178536871394689, + "language_loss": 0.78843594, + "learning_rate": 3.6526864645716666e-06, + "loss": 0.80989659, + "num_input_tokens_seen": 77072475, + "step": 3571, + "time_per_iteration": 2.619476318359375 + }, + { + "auxiliary_loss_clip": 0.01084429, + "auxiliary_loss_mlp": 0.01042983, + "balance_loss_clip": 1.03367901, + "balance_loss_mlp": 1.02439809, + "epoch": 0.21476025852998648, + "flos": 17603195880960.0, + "grad_norm": 2.3569262325008404, + "language_loss": 0.82675695, + "learning_rate": 3.652467101342991e-06, + "loss": 0.84803104, + "num_input_tokens_seen": 77089930, + "step": 3572, + "time_per_iteration": 2.6472442150115967 + }, + { + "auxiliary_loss_clip": 0.01088265, + "auxiliary_loss_mlp": 0.01043158, + "balance_loss_clip": 1.03798366, + "balance_loss_mlp": 1.02681446, + "epoch": 0.21482038178265445, + "flos": 24828135264000.0, + "grad_norm": 2.653373098485397, + "language_loss": 0.6520496, + "learning_rate": 3.652247675452598e-06, + "loss": 0.6733638, + "num_input_tokens_seen": 77108970, + "step": 3573, + "time_per_iteration": 2.692199945449829 + }, + { + "auxiliary_loss_clip": 0.01097158, + "auxiliary_loss_mlp": 0.01040874, + "balance_loss_clip": 1.03328109, + "balance_loss_mlp": 1.02574599, + "epoch": 0.2148805050353224, + "flos": 23258228463360.0, + "grad_norm": 1.7472573839121095, + "language_loss": 0.75115389, + "learning_rate": 3.652028186908807e-06, + "loss": 0.77253425, + "num_input_tokens_seen": 77126045, + "step": 3574, + "time_per_iteration": 2.602625608444214 + }, + { + "auxiliary_loss_clip": 0.01087561, + "auxiliary_loss_mlp": 0.0103869, + "balance_loss_clip": 1.03208756, + "balance_loss_mlp": 1.02170277, + "epoch": 0.21494062828799038, + "flos": 21321098968320.0, + "grad_norm": 2.649471149597977, + "language_loss": 0.72025919, + "learning_rate": 3.6518086357199416e-06, + "loss": 0.74152166, + "num_input_tokens_seen": 77144600, + "step": 3575, + "time_per_iteration": 2.643298387527466 + }, + { + "auxiliary_loss_clip": 0.0107392, + "auxiliary_loss_mlp": 0.01034546, + "balance_loss_clip": 1.03272986, + "balance_loss_mlp": 1.0191319, + "epoch": 0.21500075154065834, + "flos": 18843334894080.0, + "grad_norm": 1.7410576053108135, + "language_loss": 0.68138289, + "learning_rate": 3.6515890218943277e-06, + "loss": 0.70246756, + "num_input_tokens_seen": 77162965, + "step": 3576, + "time_per_iteration": 2.7319278717041016 + }, + { + "auxiliary_loss_clip": 0.01085498, + "auxiliary_loss_mlp": 0.01045064, + "balance_loss_clip": 1.03249121, + "balance_loss_mlp": 1.02734923, + "epoch": 0.2150608747933263, + "flos": 18441997257600.0, + "grad_norm": 2.468955017043943, + "language_loss": 0.88621736, + "learning_rate": 3.651369345440292e-06, + "loss": 0.90752292, + "num_input_tokens_seen": 77179960, + "step": 3577, + "time_per_iteration": 2.653883934020996 + }, + { + "auxiliary_loss_clip": 0.01003873, + "auxiliary_loss_mlp": 0.01024618, + "balance_loss_clip": 1.00495577, + "balance_loss_mlp": 1.02225745, + "epoch": 0.2151209980459943, + "flos": 66598242894720.0, + "grad_norm": 0.8030866210170723, + "language_loss": 0.56212914, + "learning_rate": 3.6511496063661654e-06, + "loss": 0.58241403, + "num_input_tokens_seen": 77239500, + "step": 3578, + "time_per_iteration": 6.28125 + }, + { + "auxiliary_loss_clip": 0.01089534, + "auxiliary_loss_mlp": 0.00748739, + "balance_loss_clip": 1.03425741, + "balance_loss_mlp": 1.00109255, + "epoch": 0.21518112129866226, + "flos": 21575885114880.0, + "grad_norm": 1.9534277599139755, + "language_loss": 0.88867801, + "learning_rate": 3.6509298046802807e-06, + "loss": 0.90706074, + "num_input_tokens_seen": 77254680, + "step": 3579, + "time_per_iteration": 2.5737290382385254 + }, + { + "auxiliary_loss_clip": 0.01087791, + "auxiliary_loss_mlp": 0.01039898, + "balance_loss_clip": 1.03154087, + "balance_loss_mlp": 1.02303028, + "epoch": 0.21524124455133023, + "flos": 20047635112320.0, + "grad_norm": 1.8410863530388566, + "language_loss": 0.778651, + "learning_rate": 3.650709940390972e-06, + "loss": 0.79992789, + "num_input_tokens_seen": 77274060, + "step": 3580, + "time_per_iteration": 2.5652143955230713 + }, + { + "auxiliary_loss_clip": 0.01094122, + "auxiliary_loss_mlp": 0.01044222, + "balance_loss_clip": 1.0370456, + "balance_loss_mlp": 1.02784252, + "epoch": 0.2153013678039982, + "flos": 23951807153280.0, + "grad_norm": 1.9080746165356377, + "language_loss": 0.72817171, + "learning_rate": 3.6504900135065775e-06, + "loss": 0.74955517, + "num_input_tokens_seen": 77293255, + "step": 3581, + "time_per_iteration": 2.6909115314483643 + }, + { + "auxiliary_loss_clip": 0.01088755, + "auxiliary_loss_mlp": 0.01044387, + "balance_loss_clip": 1.03346968, + "balance_loss_mlp": 1.02633858, + "epoch": 0.21536149105666616, + "flos": 20594841880320.0, + "grad_norm": 2.5253884997365397, + "language_loss": 0.71410513, + "learning_rate": 3.6502700240354357e-06, + "loss": 0.73543656, + "num_input_tokens_seen": 77312390, + "step": 3582, + "time_per_iteration": 2.621842622756958 + }, + { + "auxiliary_loss_clip": 0.01101158, + "auxiliary_loss_mlp": 0.01043648, + "balance_loss_clip": 1.03410816, + "balance_loss_mlp": 1.02669597, + "epoch": 0.21542161430933413, + "flos": 12860042895360.0, + "grad_norm": 2.435929240236372, + "language_loss": 0.84736067, + "learning_rate": 3.650049971985889e-06, + "loss": 0.86880875, + "num_input_tokens_seen": 77330985, + "step": 3583, + "time_per_iteration": 2.489683151245117 + }, + { + "auxiliary_loss_clip": 0.01084404, + "auxiliary_loss_mlp": 0.01040459, + "balance_loss_clip": 1.03424072, + "balance_loss_mlp": 1.02410364, + "epoch": 0.21548173756200212, + "flos": 26103933504000.0, + "grad_norm": 2.932378521534517, + "language_loss": 0.83305144, + "learning_rate": 3.6498298573662824e-06, + "loss": 0.85430014, + "num_input_tokens_seen": 77350770, + "step": 3584, + "time_per_iteration": 4.333279132843018 + }, + { + "auxiliary_loss_clip": 0.010704, + "auxiliary_loss_mlp": 0.00748732, + "balance_loss_clip": 1.03426325, + "balance_loss_mlp": 1.00104022, + "epoch": 0.21554186081467008, + "flos": 22163779013760.0, + "grad_norm": 1.8508915443800493, + "language_loss": 0.90095282, + "learning_rate": 3.6496096801849625e-06, + "loss": 0.91914415, + "num_input_tokens_seen": 77370510, + "step": 3585, + "time_per_iteration": 4.202632665634155 + }, + { + "auxiliary_loss_clip": 0.01092323, + "auxiliary_loss_mlp": 0.01040177, + "balance_loss_clip": 1.03532851, + "balance_loss_mlp": 1.02396452, + "epoch": 0.21560198406733805, + "flos": 22966741595520.0, + "grad_norm": 1.7432304614078906, + "language_loss": 0.74571246, + "learning_rate": 3.649389440450277e-06, + "loss": 0.76703751, + "num_input_tokens_seen": 77390645, + "step": 3586, + "time_per_iteration": 2.5569331645965576 + }, + { + "auxiliary_loss_clip": 0.0107004, + "auxiliary_loss_mlp": 0.01045028, + "balance_loss_clip": 1.03762341, + "balance_loss_mlp": 1.0294826, + "epoch": 0.215662107320006, + "flos": 22784064001920.0, + "grad_norm": 2.2681410494775847, + "language_loss": 0.83232272, + "learning_rate": 3.6491691381705804e-06, + "loss": 0.85347342, + "num_input_tokens_seen": 77409655, + "step": 3587, + "time_per_iteration": 2.735135316848755 + }, + { + "auxiliary_loss_clip": 0.01062569, + "auxiliary_loss_mlp": 0.0074886, + "balance_loss_clip": 1.0351336, + "balance_loss_mlp": 1.00109386, + "epoch": 0.21572223057267398, + "flos": 30883859038080.0, + "grad_norm": 1.9358091704783642, + "language_loss": 0.76078284, + "learning_rate": 3.648948773354224e-06, + "loss": 0.77889717, + "num_input_tokens_seen": 77430560, + "step": 3588, + "time_per_iteration": 2.84493088722229 + }, + { + "auxiliary_loss_clip": 0.01082859, + "auxiliary_loss_mlp": 0.01037875, + "balance_loss_clip": 1.03136969, + "balance_loss_mlp": 1.02168596, + "epoch": 0.21578235382534194, + "flos": 26910487445760.0, + "grad_norm": 1.6636178497618939, + "language_loss": 0.81106085, + "learning_rate": 3.6487283460095643e-06, + "loss": 0.83226818, + "num_input_tokens_seen": 77455000, + "step": 3589, + "time_per_iteration": 2.711561441421509 + }, + { + "auxiliary_loss_clip": 0.01105615, + "auxiliary_loss_mlp": 0.0103904, + "balance_loss_clip": 1.03654289, + "balance_loss_mlp": 1.02331674, + "epoch": 0.2158424770780099, + "flos": 24425720219520.0, + "grad_norm": 2.11198408154781, + "language_loss": 0.7275641, + "learning_rate": 3.648507856144961e-06, + "loss": 0.74901068, + "num_input_tokens_seen": 77475075, + "step": 3590, + "time_per_iteration": 2.5977895259857178 + }, + { + "auxiliary_loss_clip": 0.01085633, + "auxiliary_loss_mlp": 0.01045773, + "balance_loss_clip": 1.03402615, + "balance_loss_mlp": 1.0279038, + "epoch": 0.2159026003306779, + "flos": 23949975559680.0, + "grad_norm": 1.9222241992426519, + "language_loss": 0.83882558, + "learning_rate": 3.648287303768775e-06, + "loss": 0.86013961, + "num_input_tokens_seen": 77495945, + "step": 3591, + "time_per_iteration": 2.75075101852417 + }, + { + "auxiliary_loss_clip": 0.0107809, + "auxiliary_loss_mlp": 0.01043497, + "balance_loss_clip": 1.03682446, + "balance_loss_mlp": 1.02407813, + "epoch": 0.21596272358334587, + "flos": 30040963511040.0, + "grad_norm": 1.7174297680834103, + "language_loss": 0.69469225, + "learning_rate": 3.6480666888893686e-06, + "loss": 0.71590823, + "num_input_tokens_seen": 77517140, + "step": 3592, + "time_per_iteration": 2.740867853164673 + }, + { + "auxiliary_loss_clip": 0.01067212, + "auxiliary_loss_mlp": 0.01044983, + "balance_loss_clip": 1.03282261, + "balance_loss_mlp": 1.02770925, + "epoch": 0.21602284683601383, + "flos": 20376217751040.0, + "grad_norm": 2.584883447575204, + "language_loss": 0.83766562, + "learning_rate": 3.647846011515108e-06, + "loss": 0.85878754, + "num_input_tokens_seen": 77536085, + "step": 3593, + "time_per_iteration": 2.686692237854004 + }, + { + "auxiliary_loss_clip": 0.01072914, + "auxiliary_loss_mlp": 0.01044385, + "balance_loss_clip": 1.03270769, + "balance_loss_mlp": 1.02737355, + "epoch": 0.2160829700886818, + "flos": 20777339905920.0, + "grad_norm": 2.6370203188169197, + "language_loss": 0.75636393, + "learning_rate": 3.6476252716543625e-06, + "loss": 0.77753687, + "num_input_tokens_seen": 77553675, + "step": 3594, + "time_per_iteration": 2.7177655696868896 + }, + { + "auxiliary_loss_clip": 0.01092585, + "auxiliary_loss_mlp": 0.01043059, + "balance_loss_clip": 1.03592765, + "balance_loss_mlp": 1.02613187, + "epoch": 0.21614309334134976, + "flos": 22309755886080.0, + "grad_norm": 2.141561474506547, + "language_loss": 0.80300754, + "learning_rate": 3.6474044693155007e-06, + "loss": 0.82436395, + "num_input_tokens_seen": 77573360, + "step": 3595, + "time_per_iteration": 2.6328299045562744 + }, + { + "auxiliary_loss_clip": 0.01075413, + "auxiliary_loss_mlp": 0.01037455, + "balance_loss_clip": 1.03568971, + "balance_loss_mlp": 1.02081358, + "epoch": 0.21620321659401773, + "flos": 19609524927360.0, + "grad_norm": 2.1085574651164336, + "language_loss": 0.7918126, + "learning_rate": 3.647183604506897e-06, + "loss": 0.81294125, + "num_input_tokens_seen": 77591865, + "step": 3596, + "time_per_iteration": 2.6735763549804688 + }, + { + "auxiliary_loss_clip": 0.0103741, + "auxiliary_loss_mlp": 0.01039924, + "balance_loss_clip": 1.03230512, + "balance_loss_mlp": 1.02465272, + "epoch": 0.2162633398466857, + "flos": 18844555956480.0, + "grad_norm": 1.5424683564811972, + "language_loss": 0.82863963, + "learning_rate": 3.6469626772369253e-06, + "loss": 0.84941292, + "num_input_tokens_seen": 77611600, + "step": 3597, + "time_per_iteration": 2.9286704063415527 + }, + { + "auxiliary_loss_clip": 0.01082703, + "auxiliary_loss_mlp": 0.00748871, + "balance_loss_clip": 1.03416681, + "balance_loss_mlp": 1.00099421, + "epoch": 0.21632346309935369, + "flos": 18768820129920.0, + "grad_norm": 1.5539635785085073, + "language_loss": 0.80537093, + "learning_rate": 3.6467416875139642e-06, + "loss": 0.8236866, + "num_input_tokens_seen": 77630665, + "step": 3598, + "time_per_iteration": 2.646491527557373 + }, + { + "auxiliary_loss_clip": 0.01069633, + "auxiliary_loss_mlp": 0.01049311, + "balance_loss_clip": 1.03173888, + "balance_loss_mlp": 1.03111982, + "epoch": 0.21638358635202165, + "flos": 26324173745280.0, + "grad_norm": 1.725612616339128, + "language_loss": 0.81818932, + "learning_rate": 3.6465206353463934e-06, + "loss": 0.83937871, + "num_input_tokens_seen": 77650835, + "step": 3599, + "time_per_iteration": 2.6880784034729004 + }, + { + "auxiliary_loss_clip": 0.01059558, + "auxiliary_loss_mlp": 0.00748847, + "balance_loss_clip": 1.03120184, + "balance_loss_mlp": 1.00097394, + "epoch": 0.21644370960468962, + "flos": 20740854666240.0, + "grad_norm": 1.8639019757384025, + "language_loss": 0.76712245, + "learning_rate": 3.6462995207425947e-06, + "loss": 0.78520656, + "num_input_tokens_seen": 77669000, + "step": 3600, + "time_per_iteration": 2.7395496368408203 + }, + { + "auxiliary_loss_clip": 0.01063139, + "auxiliary_loss_mlp": 0.01043373, + "balance_loss_clip": 1.03323793, + "balance_loss_mlp": 1.02819729, + "epoch": 0.21650383285735758, + "flos": 23952238116480.0, + "grad_norm": 2.2128121872626267, + "language_loss": 0.79949939, + "learning_rate": 3.6460783437109533e-06, + "loss": 0.82056457, + "num_input_tokens_seen": 77688745, + "step": 3601, + "time_per_iteration": 2.760021209716797 + }, + { + "auxiliary_loss_clip": 0.01103712, + "auxiliary_loss_mlp": 0.01046181, + "balance_loss_clip": 1.03639317, + "balance_loss_mlp": 1.03051746, + "epoch": 0.21656395611002555, + "flos": 23696087253120.0, + "grad_norm": 1.7660871035991377, + "language_loss": 0.82792497, + "learning_rate": 3.6458571042598565e-06, + "loss": 0.84942394, + "num_input_tokens_seen": 77708445, + "step": 3602, + "time_per_iteration": 2.7338976860046387 + }, + { + "auxiliary_loss_clip": 0.01103388, + "auxiliary_loss_mlp": 0.01044501, + "balance_loss_clip": 1.03512585, + "balance_loss_mlp": 1.02822924, + "epoch": 0.2166240793626935, + "flos": 20666052593280.0, + "grad_norm": 1.9619764653369685, + "language_loss": 0.74395621, + "learning_rate": 3.645635802397693e-06, + "loss": 0.7654351, + "num_input_tokens_seen": 77728465, + "step": 3603, + "time_per_iteration": 2.708219528198242 + }, + { + "auxiliary_loss_clip": 0.01061096, + "auxiliary_loss_mlp": 0.01045764, + "balance_loss_clip": 1.03071797, + "balance_loss_mlp": 1.0292294, + "epoch": 0.2166842026153615, + "flos": 21580410228480.0, + "grad_norm": 1.6307611888511309, + "language_loss": 0.74113119, + "learning_rate": 3.645414438132855e-06, + "loss": 0.76219976, + "num_input_tokens_seen": 77746735, + "step": 3604, + "time_per_iteration": 2.621624231338501 + }, + { + "auxiliary_loss_clip": 0.01088101, + "auxiliary_loss_mlp": 0.01040444, + "balance_loss_clip": 1.03344452, + "balance_loss_mlp": 1.02516174, + "epoch": 0.21674432586802947, + "flos": 25629948610560.0, + "grad_norm": 1.587655207763605, + "language_loss": 0.79828167, + "learning_rate": 3.6451930114737366e-06, + "loss": 0.81956708, + "num_input_tokens_seen": 77768105, + "step": 3605, + "time_per_iteration": 2.6612019538879395 + }, + { + "auxiliary_loss_clip": 0.01025119, + "auxiliary_loss_mlp": 0.01010625, + "balance_loss_clip": 1.00739384, + "balance_loss_mlp": 1.00821686, + "epoch": 0.21680444912069743, + "flos": 56417783616000.0, + "grad_norm": 0.6924585414000378, + "language_loss": 0.58364797, + "learning_rate": 3.6449715224287347e-06, + "loss": 0.6040054, + "num_input_tokens_seen": 77833750, + "step": 3606, + "time_per_iteration": 3.291102170944214 + }, + { + "auxiliary_loss_clip": 0.01105443, + "auxiliary_loss_mlp": 0.01043435, + "balance_loss_clip": 1.03665376, + "balance_loss_mlp": 1.02663803, + "epoch": 0.2168645723733654, + "flos": 23878944414720.0, + "grad_norm": 2.1925948218151388, + "language_loss": 0.73585474, + "learning_rate": 3.644749971006248e-06, + "loss": 0.75734353, + "num_input_tokens_seen": 77853780, + "step": 3607, + "time_per_iteration": 2.726083993911743 + }, + { + "auxiliary_loss_clip": 0.01085277, + "auxiliary_loss_mlp": 0.01046618, + "balance_loss_clip": 1.03420424, + "balance_loss_mlp": 1.02923703, + "epoch": 0.21692469562603336, + "flos": 16946174257920.0, + "grad_norm": 1.9566573761846293, + "language_loss": 0.76843274, + "learning_rate": 3.6445283572146765e-06, + "loss": 0.78975165, + "num_input_tokens_seen": 77872575, + "step": 3608, + "time_per_iteration": 2.5812063217163086 + }, + { + "auxiliary_loss_clip": 0.01035907, + "auxiliary_loss_mlp": 0.01051202, + "balance_loss_clip": 1.03032839, + "balance_loss_mlp": 1.03527522, + "epoch": 0.21698481887870133, + "flos": 25119047514240.0, + "grad_norm": 2.1225587261403, + "language_loss": 0.73892719, + "learning_rate": 3.6443066810624255e-06, + "loss": 0.75979829, + "num_input_tokens_seen": 77892700, + "step": 3609, + "time_per_iteration": 2.7963833808898926 + }, + { + "auxiliary_loss_clip": 0.01083035, + "auxiliary_loss_mlp": 0.01049473, + "balance_loss_clip": 1.03440952, + "balance_loss_mlp": 1.03317738, + "epoch": 0.2170449421313693, + "flos": 17894682748800.0, + "grad_norm": 1.8115311342345648, + "language_loss": 0.88630998, + "learning_rate": 3.6440849425579e-06, + "loss": 0.90763503, + "num_input_tokens_seen": 77911060, + "step": 3610, + "time_per_iteration": 2.677700996398926 + }, + { + "auxiliary_loss_clip": 0.01103643, + "auxiliary_loss_mlp": 0.0104465, + "balance_loss_clip": 1.03604937, + "balance_loss_mlp": 1.02802062, + "epoch": 0.2171050653840373, + "flos": 22638446265600.0, + "grad_norm": 1.7385940679780627, + "language_loss": 0.77688742, + "learning_rate": 3.6438631417095095e-06, + "loss": 0.79837036, + "num_input_tokens_seen": 77929930, + "step": 3611, + "time_per_iteration": 2.628937244415283 + }, + { + "auxiliary_loss_clip": 0.01035586, + "auxiliary_loss_mlp": 0.01044389, + "balance_loss_clip": 1.02940977, + "balance_loss_mlp": 1.02821791, + "epoch": 0.21716518863670525, + "flos": 19499997381120.0, + "grad_norm": 2.634151225670582, + "language_loss": 0.63020897, + "learning_rate": 3.6436412785256637e-06, + "loss": 0.65100873, + "num_input_tokens_seen": 77949060, + "step": 3612, + "time_per_iteration": 2.9092063903808594 + }, + { + "auxiliary_loss_clip": 0.01036906, + "auxiliary_loss_mlp": 0.01046265, + "balance_loss_clip": 1.02645278, + "balance_loss_mlp": 1.02881229, + "epoch": 0.21722531188937322, + "flos": 19792022952960.0, + "grad_norm": 2.853574473972562, + "language_loss": 0.75749981, + "learning_rate": 3.643419353014776e-06, + "loss": 0.77833152, + "num_input_tokens_seen": 77967920, + "step": 3613, + "time_per_iteration": 2.7532854080200195 + }, + { + "auxiliary_loss_clip": 0.01059371, + "auxiliary_loss_mlp": 0.01048308, + "balance_loss_clip": 1.03424001, + "balance_loss_mlp": 1.03080773, + "epoch": 0.21728543514204118, + "flos": 13334386924800.0, + "grad_norm": 5.521599960353436, + "language_loss": 0.71002144, + "learning_rate": 3.643197365185261e-06, + "loss": 0.73109829, + "num_input_tokens_seen": 77985330, + "step": 3614, + "time_per_iteration": 2.718763828277588 + }, + { + "auxiliary_loss_clip": 0.01091185, + "auxiliary_loss_mlp": 0.01048654, + "balance_loss_clip": 1.03536081, + "balance_loss_mlp": 1.03324032, + "epoch": 0.21734555839470915, + "flos": 15231870783360.0, + "grad_norm": 2.0195586883486496, + "language_loss": 0.73205519, + "learning_rate": 3.6429753150455378e-06, + "loss": 0.75345361, + "num_input_tokens_seen": 78003105, + "step": 3615, + "time_per_iteration": 2.6533401012420654 + }, + { + "auxiliary_loss_clip": 0.01088708, + "auxiliary_loss_mlp": 0.01045404, + "balance_loss_clip": 1.03202009, + "balance_loss_mlp": 1.02764177, + "epoch": 0.2174056816473771, + "flos": 19973982274560.0, + "grad_norm": 2.2720138828421157, + "language_loss": 0.90029424, + "learning_rate": 3.6427532026040263e-06, + "loss": 0.92163527, + "num_input_tokens_seen": 78019655, + "step": 3616, + "time_per_iteration": 2.688185453414917 + }, + { + "auxiliary_loss_clip": 0.01049322, + "auxiliary_loss_mlp": 0.01040887, + "balance_loss_clip": 1.02963448, + "balance_loss_mlp": 1.02411437, + "epoch": 0.21746580490004508, + "flos": 16687293960960.0, + "grad_norm": 2.091215837266511, + "language_loss": 0.81129622, + "learning_rate": 3.642531027869148e-06, + "loss": 0.83219838, + "num_input_tokens_seen": 78036025, + "step": 3617, + "time_per_iteration": 2.827134609222412 + }, + { + "auxiliary_loss_clip": 0.01082346, + "auxiliary_loss_mlp": 0.01046546, + "balance_loss_clip": 1.03417969, + "balance_loss_mlp": 1.0308814, + "epoch": 0.21752592815271307, + "flos": 25772298209280.0, + "grad_norm": 1.8884152193583772, + "language_loss": 0.75598884, + "learning_rate": 3.642308790849329e-06, + "loss": 0.77727771, + "num_input_tokens_seen": 78055645, + "step": 3618, + "time_per_iteration": 2.722771644592285 + }, + { + "auxiliary_loss_clip": 0.01095798, + "auxiliary_loss_mlp": 0.01051288, + "balance_loss_clip": 1.0357089, + "balance_loss_mlp": 1.03444386, + "epoch": 0.21758605140538104, + "flos": 11254692349440.0, + "grad_norm": 2.0341737048388118, + "language_loss": 0.69404739, + "learning_rate": 3.642086491552996e-06, + "loss": 0.7155183, + "num_input_tokens_seen": 78071660, + "step": 3619, + "time_per_iteration": 2.62458872795105 + }, + { + "auxiliary_loss_clip": 0.01094342, + "auxiliary_loss_mlp": 0.01043628, + "balance_loss_clip": 1.03623497, + "balance_loss_mlp": 1.02711105, + "epoch": 0.217646174658049, + "flos": 19242625455360.0, + "grad_norm": 1.5969798675023985, + "language_loss": 0.78565478, + "learning_rate": 3.641864129988579e-06, + "loss": 0.80703449, + "num_input_tokens_seen": 78091265, + "step": 3620, + "time_per_iteration": 2.6236724853515625 + }, + { + "auxiliary_loss_clip": 0.01096968, + "auxiliary_loss_mlp": 0.01038139, + "balance_loss_clip": 1.03317511, + "balance_loss_mlp": 1.02258229, + "epoch": 0.21770629791071697, + "flos": 21945083057280.0, + "grad_norm": 1.5481795652886805, + "language_loss": 0.79578948, + "learning_rate": 3.641641706164509e-06, + "loss": 0.81714052, + "num_input_tokens_seen": 78110095, + "step": 3621, + "time_per_iteration": 2.635934829711914 + }, + { + "auxiliary_loss_clip": 0.01091227, + "auxiliary_loss_mlp": 0.01034839, + "balance_loss_clip": 1.03375912, + "balance_loss_mlp": 1.01974702, + "epoch": 0.21776642116338493, + "flos": 24936764970240.0, + "grad_norm": 1.6482404913918518, + "language_loss": 0.87385136, + "learning_rate": 3.641419220089221e-06, + "loss": 0.89511204, + "num_input_tokens_seen": 78129475, + "step": 3622, + "time_per_iteration": 2.6272497177124023 + }, + { + "auxiliary_loss_clip": 0.01094738, + "auxiliary_loss_mlp": 0.01038324, + "balance_loss_clip": 1.03500915, + "balance_loss_mlp": 1.02051985, + "epoch": 0.2178265444160529, + "flos": 17821317219840.0, + "grad_norm": 2.2474254025139806, + "language_loss": 0.76980555, + "learning_rate": 3.641196671771152e-06, + "loss": 0.79113615, + "num_input_tokens_seen": 78146880, + "step": 3623, + "time_per_iteration": 2.6205015182495117 + }, + { + "auxiliary_loss_clip": 0.01071421, + "auxiliary_loss_mlp": 0.010417, + "balance_loss_clip": 1.03464437, + "balance_loss_mlp": 1.0242238, + "epoch": 0.2178866676687209, + "flos": 17712902995200.0, + "grad_norm": 1.8662425992976, + "language_loss": 0.84296095, + "learning_rate": 3.640974061218741e-06, + "loss": 0.86409217, + "num_input_tokens_seen": 78165065, + "step": 3624, + "time_per_iteration": 2.613036632537842 + }, + { + "auxiliary_loss_clip": 0.01083362, + "auxiliary_loss_mlp": 0.01050754, + "balance_loss_clip": 1.03345299, + "balance_loss_mlp": 1.03423119, + "epoch": 0.21794679092138886, + "flos": 16945851035520.0, + "grad_norm": 2.3351666363313703, + "language_loss": 0.77621114, + "learning_rate": 3.640751388440429e-06, + "loss": 0.79755229, + "num_input_tokens_seen": 78180005, + "step": 3625, + "time_per_iteration": 4.226773023605347 + }, + { + "auxiliary_loss_clip": 0.01011823, + "auxiliary_loss_mlp": 0.01005321, + "balance_loss_clip": 1.00534999, + "balance_loss_mlp": 1.00318718, + "epoch": 0.21800691417405682, + "flos": 63718566566400.0, + "grad_norm": 0.8179345138575731, + "language_loss": 0.60684884, + "learning_rate": 3.64052865344466e-06, + "loss": 0.6270203, + "num_input_tokens_seen": 78245350, + "step": 3626, + "time_per_iteration": 4.859921216964722 + }, + { + "auxiliary_loss_clip": 0.010717, + "auxiliary_loss_mlp": 0.0074887, + "balance_loss_clip": 1.03019977, + "balance_loss_mlp": 1.00115633, + "epoch": 0.21806703742672479, + "flos": 21616392677760.0, + "grad_norm": 1.8884738046272522, + "language_loss": 0.90566409, + "learning_rate": 3.6403058562398795e-06, + "loss": 0.92386973, + "num_input_tokens_seen": 78264165, + "step": 3627, + "time_per_iteration": 2.632420063018799 + }, + { + "auxiliary_loss_clip": 0.01048506, + "auxiliary_loss_mlp": 0.0103695, + "balance_loss_clip": 1.03498006, + "balance_loss_mlp": 1.01966465, + "epoch": 0.21812716067939275, + "flos": 19354882435200.0, + "grad_norm": 1.677718978823107, + "language_loss": 0.73879611, + "learning_rate": 3.6400829968345365e-06, + "loss": 0.75965071, + "num_input_tokens_seen": 78283745, + "step": 3628, + "time_per_iteration": 2.9175448417663574 + }, + { + "auxiliary_loss_clip": 0.01097794, + "auxiliary_loss_mlp": 0.01035941, + "balance_loss_clip": 1.03129745, + "balance_loss_mlp": 1.02008641, + "epoch": 0.21818728393206072, + "flos": 23548063305600.0, + "grad_norm": 1.8785318992043512, + "language_loss": 0.7711941, + "learning_rate": 3.6398600752370826e-06, + "loss": 0.79253143, + "num_input_tokens_seen": 78302900, + "step": 3629, + "time_per_iteration": 2.761587381362915 + }, + { + "auxiliary_loss_clip": 0.01091275, + "auxiliary_loss_mlp": 0.01041807, + "balance_loss_clip": 1.0344609, + "balance_loss_mlp": 1.02615452, + "epoch": 0.21824740718472868, + "flos": 30225652266240.0, + "grad_norm": 1.5460269642750228, + "language_loss": 0.71270788, + "learning_rate": 3.63963709145597e-06, + "loss": 0.73403865, + "num_input_tokens_seen": 78326470, + "step": 3630, + "time_per_iteration": 2.845128297805786 + }, + { + "auxiliary_loss_clip": 0.01033107, + "auxiliary_loss_mlp": 0.01035363, + "balance_loss_clip": 1.02779078, + "balance_loss_mlp": 1.02113521, + "epoch": 0.21830753043739667, + "flos": 26134672567680.0, + "grad_norm": 2.1118480233070724, + "language_loss": 0.76622868, + "learning_rate": 3.6394140454996544e-06, + "loss": 0.78691339, + "num_input_tokens_seen": 78345810, + "step": 3631, + "time_per_iteration": 4.582738161087036 + }, + { + "auxiliary_loss_clip": 0.01100193, + "auxiliary_loss_mlp": 0.01033979, + "balance_loss_clip": 1.03363538, + "balance_loss_mlp": 1.01891696, + "epoch": 0.21836765369006464, + "flos": 21720712752000.0, + "grad_norm": 4.866458478835243, + "language_loss": 0.75586653, + "learning_rate": 3.639190937376594e-06, + "loss": 0.77720821, + "num_input_tokens_seen": 78364085, + "step": 3632, + "time_per_iteration": 4.246539354324341 + }, + { + "auxiliary_loss_clip": 0.01098294, + "auxiliary_loss_mlp": 0.01034707, + "balance_loss_clip": 1.03337073, + "balance_loss_mlp": 1.01992512, + "epoch": 0.2184277769427326, + "flos": 19937604775680.0, + "grad_norm": 1.9958205277341659, + "language_loss": 0.83844447, + "learning_rate": 3.638967767095249e-06, + "loss": 0.85977441, + "num_input_tokens_seen": 78381385, + "step": 3633, + "time_per_iteration": 2.541369676589966 + }, + { + "auxiliary_loss_clip": 0.01067371, + "auxiliary_loss_mlp": 0.01043242, + "balance_loss_clip": 1.03297091, + "balance_loss_mlp": 1.02775669, + "epoch": 0.21848790019540057, + "flos": 20340235301760.0, + "grad_norm": 1.8677115261003123, + "language_loss": 0.81770682, + "learning_rate": 3.6387445346640823e-06, + "loss": 0.83881295, + "num_input_tokens_seen": 78400500, + "step": 3634, + "time_per_iteration": 2.6684491634368896 + }, + { + "auxiliary_loss_clip": 0.01092046, + "auxiliary_loss_mlp": 0.01033969, + "balance_loss_clip": 1.03462708, + "balance_loss_mlp": 1.01810801, + "epoch": 0.21854802344806853, + "flos": 15450818135040.0, + "grad_norm": 1.7299366487368921, + "language_loss": 0.75010806, + "learning_rate": 3.638521240091558e-06, + "loss": 0.77136827, + "num_input_tokens_seen": 78418340, + "step": 3635, + "time_per_iteration": 2.5727312564849854 + }, + { + "auxiliary_loss_clip": 0.01073881, + "auxiliary_loss_mlp": 0.01049607, + "balance_loss_clip": 1.03311682, + "balance_loss_mlp": 1.03372836, + "epoch": 0.2186081467007365, + "flos": 16320717711360.0, + "grad_norm": 2.108443584999876, + "language_loss": 0.88202977, + "learning_rate": 3.6382978833861445e-06, + "loss": 0.90326464, + "num_input_tokens_seen": 78434375, + "step": 3636, + "time_per_iteration": 2.6896185874938965 + }, + { + "auxiliary_loss_clip": 0.01070852, + "auxiliary_loss_mlp": 0.00748862, + "balance_loss_clip": 1.03426409, + "balance_loss_mlp": 1.00116062, + "epoch": 0.2186682699534045, + "flos": 21689255416320.0, + "grad_norm": 2.0285042993835583, + "language_loss": 0.76384699, + "learning_rate": 3.638074464556311e-06, + "loss": 0.78204417, + "num_input_tokens_seen": 78451735, + "step": 3637, + "time_per_iteration": 2.722625732421875 + }, + { + "auxiliary_loss_clip": 0.0108616, + "auxiliary_loss_mlp": 0.01036759, + "balance_loss_clip": 1.03541327, + "balance_loss_mlp": 1.02034402, + "epoch": 0.21872839320607246, + "flos": 17739260599680.0, + "grad_norm": 2.24516845731209, + "language_loss": 0.90061116, + "learning_rate": 3.63785098361053e-06, + "loss": 0.92184031, + "num_input_tokens_seen": 78462730, + "step": 3638, + "time_per_iteration": 2.5459420680999756 + }, + { + "auxiliary_loss_clip": 0.01088069, + "auxiliary_loss_mlp": 0.01046212, + "balance_loss_clip": 1.03329408, + "balance_loss_mlp": 1.02948725, + "epoch": 0.21878851645874042, + "flos": 18652289431680.0, + "grad_norm": 2.340524179731659, + "language_loss": 0.89421451, + "learning_rate": 3.637627440557275e-06, + "loss": 0.91555732, + "num_input_tokens_seen": 78476300, + "step": 3639, + "time_per_iteration": 2.5035135746002197 + }, + { + "auxiliary_loss_clip": 0.0107969, + "auxiliary_loss_mlp": 0.00748865, + "balance_loss_clip": 1.03281188, + "balance_loss_mlp": 1.00107753, + "epoch": 0.2188486397114084, + "flos": 25557301353600.0, + "grad_norm": 1.657357791059345, + "language_loss": 0.79327691, + "learning_rate": 3.637403835405024e-06, + "loss": 0.81156248, + "num_input_tokens_seen": 78496135, + "step": 3640, + "time_per_iteration": 2.6865875720977783 + }, + { + "auxiliary_loss_clip": 0.01095455, + "auxiliary_loss_mlp": 0.01047634, + "balance_loss_clip": 1.03741121, + "balance_loss_mlp": 1.03036094, + "epoch": 0.21890876296407635, + "flos": 17892061056000.0, + "grad_norm": 2.5105153022402216, + "language_loss": 0.71806127, + "learning_rate": 3.637180168162255e-06, + "loss": 0.73949218, + "num_input_tokens_seen": 78513855, + "step": 3641, + "time_per_iteration": 2.530064582824707 + }, + { + "auxiliary_loss_clip": 0.01075588, + "auxiliary_loss_mlp": 0.01039908, + "balance_loss_clip": 1.03311157, + "balance_loss_mlp": 1.02361155, + "epoch": 0.21896888621674432, + "flos": 17749100926080.0, + "grad_norm": 1.9160072813153197, + "language_loss": 0.81608427, + "learning_rate": 3.63695643883745e-06, + "loss": 0.83723927, + "num_input_tokens_seen": 78531740, + "step": 3642, + "time_per_iteration": 2.546509027481079 + }, + { + "auxiliary_loss_clip": 0.01095505, + "auxiliary_loss_mlp": 0.01042118, + "balance_loss_clip": 1.03683054, + "balance_loss_mlp": 1.02544081, + "epoch": 0.21902900946941228, + "flos": 23076161400960.0, + "grad_norm": 1.7068030556064706, + "language_loss": 0.71773887, + "learning_rate": 3.6367326474390928e-06, + "loss": 0.73911512, + "num_input_tokens_seen": 78549600, + "step": 3643, + "time_per_iteration": 2.680616617202759 + }, + { + "auxiliary_loss_clip": 0.01103766, + "auxiliary_loss_mlp": 0.01047273, + "balance_loss_clip": 1.03605008, + "balance_loss_mlp": 1.03060782, + "epoch": 0.21908913272208028, + "flos": 48178545004800.0, + "grad_norm": 1.6605992361768889, + "language_loss": 0.6796397, + "learning_rate": 3.6365087939756696e-06, + "loss": 0.70115006, + "num_input_tokens_seen": 78573350, + "step": 3644, + "time_per_iteration": 2.858248472213745 + }, + { + "auxiliary_loss_clip": 0.01103009, + "auxiliary_loss_mlp": 0.01040534, + "balance_loss_clip": 1.03445041, + "balance_loss_mlp": 1.0243212, + "epoch": 0.21914925597474824, + "flos": 22236749493120.0, + "grad_norm": 2.217715480166424, + "language_loss": 0.7752521, + "learning_rate": 3.636284878455669e-06, + "loss": 0.79668754, + "num_input_tokens_seen": 78591005, + "step": 3645, + "time_per_iteration": 2.5512797832489014 + }, + { + "auxiliary_loss_clip": 0.01091178, + "auxiliary_loss_mlp": 0.01048737, + "balance_loss_clip": 1.03703928, + "balance_loss_mlp": 1.03322768, + "epoch": 0.2192093792274162, + "flos": 22125605834880.0, + "grad_norm": 1.7066818815196325, + "language_loss": 0.82564223, + "learning_rate": 3.636060900887582e-06, + "loss": 0.84704143, + "num_input_tokens_seen": 78610645, + "step": 3646, + "time_per_iteration": 2.754199504852295 + }, + { + "auxiliary_loss_clip": 0.01084778, + "auxiliary_loss_mlp": 0.01037324, + "balance_loss_clip": 1.03256845, + "balance_loss_mlp": 1.02171993, + "epoch": 0.21926950248008417, + "flos": 15669442264320.0, + "grad_norm": 1.6646191421254686, + "language_loss": 0.82561409, + "learning_rate": 3.635836861279901e-06, + "loss": 0.84683514, + "num_input_tokens_seen": 78628340, + "step": 3647, + "time_per_iteration": 2.650495767593384 + }, + { + "auxiliary_loss_clip": 0.0109799, + "auxiliary_loss_mlp": 0.01043139, + "balance_loss_clip": 1.03295183, + "balance_loss_mlp": 1.0277369, + "epoch": 0.21932962573275214, + "flos": 30262496641920.0, + "grad_norm": 1.8531490752782183, + "language_loss": 0.72304684, + "learning_rate": 3.635612759641123e-06, + "loss": 0.74445808, + "num_input_tokens_seen": 78649355, + "step": 3648, + "time_per_iteration": 2.6552813053131104 + }, + { + "auxiliary_loss_clip": 0.0106055, + "auxiliary_loss_mlp": 0.01045318, + "balance_loss_clip": 1.02992249, + "balance_loss_mlp": 1.02680445, + "epoch": 0.2193897489854201, + "flos": 10780132838400.0, + "grad_norm": 2.384985157022465, + "language_loss": 0.74523437, + "learning_rate": 3.635388595979745e-06, + "loss": 0.76629305, + "num_input_tokens_seen": 78664915, + "step": 3649, + "time_per_iteration": 2.6981568336486816 + }, + { + "auxiliary_loss_clip": 0.01083636, + "auxiliary_loss_mlp": 0.01036839, + "balance_loss_clip": 1.03155375, + "balance_loss_mlp": 1.02220023, + "epoch": 0.21944987223808807, + "flos": 19133313390720.0, + "grad_norm": 1.9602992205902836, + "language_loss": 0.86621237, + "learning_rate": 3.635164370304267e-06, + "loss": 0.88741708, + "num_input_tokens_seen": 78681475, + "step": 3650, + "time_per_iteration": 2.703587293624878 + }, + { + "auxiliary_loss_clip": 0.01075496, + "auxiliary_loss_mlp": 0.01044782, + "balance_loss_clip": 1.03039432, + "balance_loss_mlp": 1.02866483, + "epoch": 0.21950999549075606, + "flos": 22711093522560.0, + "grad_norm": 1.9452639751154235, + "language_loss": 0.83279532, + "learning_rate": 3.6349400826231927e-06, + "loss": 0.85399812, + "num_input_tokens_seen": 78702300, + "step": 3651, + "time_per_iteration": 2.7661163806915283 + }, + { + "auxiliary_loss_clip": 0.01086205, + "auxiliary_loss_mlp": 0.01042661, + "balance_loss_clip": 1.03064728, + "balance_loss_mlp": 1.02738476, + "epoch": 0.21957011874342403, + "flos": 10561329141120.0, + "grad_norm": 3.3993487486772755, + "language_loss": 0.7448113, + "learning_rate": 3.634715732945027e-06, + "loss": 0.76609993, + "num_input_tokens_seen": 78720230, + "step": 3652, + "time_per_iteration": 2.7291228771209717 + }, + { + "auxiliary_loss_clip": 0.00984306, + "auxiliary_loss_mlp": 0.01003475, + "balance_loss_clip": 1.00634336, + "balance_loss_mlp": 1.00112653, + "epoch": 0.219630241996092, + "flos": 65747913252480.0, + "grad_norm": 0.7345545495689367, + "language_loss": 0.51546073, + "learning_rate": 3.6344913212782764e-06, + "loss": 0.53533852, + "num_input_tokens_seen": 78780200, + "step": 3653, + "time_per_iteration": 3.260892391204834 + }, + { + "auxiliary_loss_clip": 0.01068846, + "auxiliary_loss_mlp": 0.01048076, + "balance_loss_clip": 1.03272343, + "balance_loss_mlp": 1.03232884, + "epoch": 0.21969036524875996, + "flos": 23696518216320.0, + "grad_norm": 2.436385411839234, + "language_loss": 0.75120795, + "learning_rate": 3.6342668476314514e-06, + "loss": 0.77237719, + "num_input_tokens_seen": 78800575, + "step": 3654, + "time_per_iteration": 2.7644691467285156 + }, + { + "auxiliary_loss_clip": 0.01094288, + "auxiliary_loss_mlp": 0.01040938, + "balance_loss_clip": 1.03706837, + "balance_loss_mlp": 1.02498746, + "epoch": 0.21975048850142792, + "flos": 19640910435840.0, + "grad_norm": 2.446915701440405, + "language_loss": 0.72673821, + "learning_rate": 3.634042312013064e-06, + "loss": 0.74809051, + "num_input_tokens_seen": 78819585, + "step": 3655, + "time_per_iteration": 2.7041878700256348 + }, + { + "auxiliary_loss_clip": 0.01072206, + "auxiliary_loss_mlp": 0.01043937, + "balance_loss_clip": 1.03246593, + "balance_loss_mlp": 1.02791536, + "epoch": 0.21981061175409589, + "flos": 22448550038400.0, + "grad_norm": 2.1443179533386707, + "language_loss": 0.8060109, + "learning_rate": 3.6338177144316276e-06, + "loss": 0.8271724, + "num_input_tokens_seen": 78837330, + "step": 3656, + "time_per_iteration": 2.7428274154663086 + }, + { + "auxiliary_loss_clip": 0.01067639, + "auxiliary_loss_mlp": 0.00748715, + "balance_loss_clip": 1.0352664, + "balance_loss_mlp": 1.00108194, + "epoch": 0.21987073500676388, + "flos": 18151049093760.0, + "grad_norm": 2.3769043110843597, + "language_loss": 0.85025746, + "learning_rate": 3.63359305489566e-06, + "loss": 0.86842108, + "num_input_tokens_seen": 78854955, + "step": 3657, + "time_per_iteration": 2.7353668212890625 + }, + { + "auxiliary_loss_clip": 0.01090807, + "auxiliary_loss_mlp": 0.01037713, + "balance_loss_clip": 1.03669214, + "balance_loss_mlp": 1.02171481, + "epoch": 0.21993085825943184, + "flos": 25626177682560.0, + "grad_norm": 1.64341842726504, + "language_loss": 0.80442131, + "learning_rate": 3.6333683334136803e-06, + "loss": 0.82570654, + "num_input_tokens_seen": 78874965, + "step": 3658, + "time_per_iteration": 2.716094732284546 + }, + { + "auxiliary_loss_clip": 0.00993624, + "auxiliary_loss_mlp": 0.01004207, + "balance_loss_clip": 1.00684321, + "balance_loss_mlp": 1.00181127, + "epoch": 0.2199909815120998, + "flos": 70923217743360.0, + "grad_norm": 0.7795844943015319, + "language_loss": 0.58229172, + "learning_rate": 3.6331435499942095e-06, + "loss": 0.60227001, + "num_input_tokens_seen": 78937740, + "step": 3659, + "time_per_iteration": 3.2972121238708496 + }, + { + "auxiliary_loss_clip": 0.01056021, + "auxiliary_loss_mlp": 0.01038193, + "balance_loss_clip": 1.03105998, + "balance_loss_mlp": 1.02260065, + "epoch": 0.22005110476476777, + "flos": 21543529939200.0, + "grad_norm": 2.3543566809051173, + "language_loss": 0.73703939, + "learning_rate": 3.632918704645772e-06, + "loss": 0.75798154, + "num_input_tokens_seen": 78955055, + "step": 3660, + "time_per_iteration": 2.8568317890167236 + }, + { + "auxiliary_loss_clip": 0.01091848, + "auxiliary_loss_mlp": 0.01038777, + "balance_loss_clip": 1.03516185, + "balance_loss_mlp": 1.02282739, + "epoch": 0.22011122801743574, + "flos": 22054502862720.0, + "grad_norm": 1.6666631521020567, + "language_loss": 0.80788368, + "learning_rate": 3.632693797376893e-06, + "loss": 0.82918996, + "num_input_tokens_seen": 78974895, + "step": 3661, + "time_per_iteration": 2.716066837310791 + }, + { + "auxiliary_loss_clip": 0.01066102, + "auxiliary_loss_mlp": 0.01044119, + "balance_loss_clip": 1.02958763, + "balance_loss_mlp": 1.02842546, + "epoch": 0.2201713512701037, + "flos": 26687589598080.0, + "grad_norm": 1.7149442784119715, + "language_loss": 0.73073667, + "learning_rate": 3.632468828196102e-06, + "loss": 0.75183892, + "num_input_tokens_seen": 78994990, + "step": 3662, + "time_per_iteration": 2.7532129287719727 + }, + { + "auxiliary_loss_clip": 0.01079393, + "auxiliary_loss_mlp": 0.01042927, + "balance_loss_clip": 1.03568792, + "balance_loss_mlp": 1.02855599, + "epoch": 0.22023147452277167, + "flos": 22162198815360.0, + "grad_norm": 1.956777149223151, + "language_loss": 0.78039825, + "learning_rate": 3.632243797111929e-06, + "loss": 0.80162144, + "num_input_tokens_seen": 79014405, + "step": 3663, + "time_per_iteration": 2.727691173553467 + }, + { + "auxiliary_loss_clip": 0.01083679, + "auxiliary_loss_mlp": 0.010511, + "balance_loss_clip": 1.03432858, + "balance_loss_mlp": 1.03368366, + "epoch": 0.22029159777543966, + "flos": 22523280284160.0, + "grad_norm": 1.6673724390742373, + "language_loss": 0.80469823, + "learning_rate": 3.632018704132908e-06, + "loss": 0.82604599, + "num_input_tokens_seen": 79032375, + "step": 3664, + "time_per_iteration": 2.7545535564422607 + }, + { + "auxiliary_loss_clip": 0.01082132, + "auxiliary_loss_mlp": 0.01040587, + "balance_loss_clip": 1.03456926, + "balance_loss_mlp": 1.02271795, + "epoch": 0.22035172102810763, + "flos": 13042469093760.0, + "grad_norm": 2.520726607636793, + "language_loss": 0.77131677, + "learning_rate": 3.6317935492675742e-06, + "loss": 0.79254389, + "num_input_tokens_seen": 79049635, + "step": 3665, + "time_per_iteration": 2.6843156814575195 + }, + { + "auxiliary_loss_clip": 0.01075696, + "auxiliary_loss_mlp": 0.01047023, + "balance_loss_clip": 1.03319061, + "balance_loss_mlp": 1.03123999, + "epoch": 0.2204118442807756, + "flos": 12165817760640.0, + "grad_norm": 3.7809850900455872, + "language_loss": 0.97902799, + "learning_rate": 3.631568332524466e-06, + "loss": 1.00025511, + "num_input_tokens_seen": 79062890, + "step": 3666, + "time_per_iteration": 2.775705575942993 + }, + { + "auxiliary_loss_clip": 0.01087102, + "auxiliary_loss_mlp": 0.00748821, + "balance_loss_clip": 1.03208709, + "balance_loss_mlp": 1.00101864, + "epoch": 0.22047196753344356, + "flos": 40108806673920.0, + "grad_norm": 1.575680405315493, + "language_loss": 0.80569434, + "learning_rate": 3.631343053912122e-06, + "loss": 0.82405359, + "num_input_tokens_seen": 79085495, + "step": 3667, + "time_per_iteration": 2.8589835166931152 + }, + { + "auxiliary_loss_clip": 0.0109165, + "auxiliary_loss_mlp": 0.01048531, + "balance_loss_clip": 1.03537989, + "balance_loss_mlp": 1.0318892, + "epoch": 0.22053209078611152, + "flos": 20701137202560.0, + "grad_norm": 2.0483181074849903, + "language_loss": 0.77315128, + "learning_rate": 3.631117713439087e-06, + "loss": 0.7945531, + "num_input_tokens_seen": 79101820, + "step": 3668, + "time_per_iteration": 2.8226964473724365 + }, + { + "auxiliary_loss_clip": 0.01089805, + "auxiliary_loss_mlp": 0.01045374, + "balance_loss_clip": 1.0364325, + "balance_loss_mlp": 1.02967453, + "epoch": 0.2205922140387795, + "flos": 24716309247360.0, + "grad_norm": 1.7278614046748957, + "language_loss": 0.71473587, + "learning_rate": 3.630892311113904e-06, + "loss": 0.73608768, + "num_input_tokens_seen": 79123320, + "step": 3669, + "time_per_iteration": 2.676558256149292 + }, + { + "auxiliary_loss_clip": 0.01098327, + "auxiliary_loss_mlp": 0.01037754, + "balance_loss_clip": 1.03233719, + "balance_loss_mlp": 1.02230453, + "epoch": 0.22065233729144745, + "flos": 23477247642240.0, + "grad_norm": 2.0428731232204362, + "language_loss": 0.86074007, + "learning_rate": 3.6306668469451215e-06, + "loss": 0.88210094, + "num_input_tokens_seen": 79141615, + "step": 3670, + "time_per_iteration": 2.59916615486145 + }, + { + "auxiliary_loss_clip": 0.01078109, + "auxiliary_loss_mlp": 0.01042866, + "balance_loss_clip": 1.0337162, + "balance_loss_mlp": 1.02667725, + "epoch": 0.22071246054411545, + "flos": 35225566646400.0, + "grad_norm": 1.7649511007348413, + "language_loss": 0.76967597, + "learning_rate": 3.6304413209412886e-06, + "loss": 0.79088569, + "num_input_tokens_seen": 79164910, + "step": 3671, + "time_per_iteration": 2.787665367126465 + }, + { + "auxiliary_loss_clip": 0.01079003, + "auxiliary_loss_mlp": 0.01037847, + "balance_loss_clip": 1.03585124, + "balance_loss_mlp": 1.02186155, + "epoch": 0.2207725837967834, + "flos": 18150294908160.0, + "grad_norm": 2.3950552717012523, + "language_loss": 0.80882204, + "learning_rate": 3.6302157331109573e-06, + "loss": 0.82999051, + "num_input_tokens_seen": 79179685, + "step": 3672, + "time_per_iteration": 4.226728200912476 + }, + { + "auxiliary_loss_clip": 0.01093411, + "auxiliary_loss_mlp": 0.01048121, + "balance_loss_clip": 1.03604925, + "balance_loss_mlp": 1.03295732, + "epoch": 0.22083270704945138, + "flos": 20479675898880.0, + "grad_norm": 1.8701668902285293, + "language_loss": 0.73585308, + "learning_rate": 3.629990083462682e-06, + "loss": 0.75726837, + "num_input_tokens_seen": 79196285, + "step": 3673, + "time_per_iteration": 4.27129054069519 + }, + { + "auxiliary_loss_clip": 0.01064985, + "auxiliary_loss_mlp": 0.01045531, + "balance_loss_clip": 1.03689563, + "balance_loss_mlp": 1.02848411, + "epoch": 0.22089283030211934, + "flos": 34125801984000.0, + "grad_norm": 1.8939480249652172, + "language_loss": 0.76549065, + "learning_rate": 3.6297643720050203e-06, + "loss": 0.78659582, + "num_input_tokens_seen": 79216060, + "step": 3674, + "time_per_iteration": 2.905825138092041 + }, + { + "auxiliary_loss_clip": 0.01101983, + "auxiliary_loss_mlp": 0.01040766, + "balance_loss_clip": 1.03587723, + "balance_loss_mlp": 1.02351642, + "epoch": 0.2209529535547873, + "flos": 18077216688000.0, + "grad_norm": 1.8442036616763426, + "language_loss": 0.74542069, + "learning_rate": 3.6295385987465293e-06, + "loss": 0.76684821, + "num_input_tokens_seen": 79235145, + "step": 3675, + "time_per_iteration": 2.5690274238586426 + }, + { + "auxiliary_loss_clip": 0.01102872, + "auxiliary_loss_mlp": 0.0104302, + "balance_loss_clip": 1.03566825, + "balance_loss_mlp": 1.02637815, + "epoch": 0.22101307680745527, + "flos": 27235335070080.0, + "grad_norm": 1.6983499483234672, + "language_loss": 0.80162269, + "learning_rate": 3.629312763695772e-06, + "loss": 0.82308161, + "num_input_tokens_seen": 79256960, + "step": 3676, + "time_per_iteration": 2.729081869125366 + }, + { + "auxiliary_loss_clip": 0.01078945, + "auxiliary_loss_mlp": 0.01048173, + "balance_loss_clip": 1.03212333, + "balance_loss_mlp": 1.03292656, + "epoch": 0.22107320006012326, + "flos": 16543256423040.0, + "grad_norm": 1.8351454311688409, + "language_loss": 0.75415564, + "learning_rate": 3.6290868668613107e-06, + "loss": 0.77542681, + "num_input_tokens_seen": 79274860, + "step": 3677, + "time_per_iteration": 2.691317558288574 + }, + { + "auxiliary_loss_clip": 0.0106313, + "auxiliary_loss_mlp": 0.01040574, + "balance_loss_clip": 1.03197551, + "balance_loss_mlp": 1.02467132, + "epoch": 0.22113332331279123, + "flos": 22054466949120.0, + "grad_norm": 1.8544665769608208, + "language_loss": 0.83264053, + "learning_rate": 3.628860908251712e-06, + "loss": 0.85367751, + "num_input_tokens_seen": 79294005, + "step": 3678, + "time_per_iteration": 2.9000933170318604 + }, + { + "auxiliary_loss_clip": 0.01036407, + "auxiliary_loss_mlp": 0.01044329, + "balance_loss_clip": 1.0293684, + "balance_loss_mlp": 1.02746058, + "epoch": 0.2211934465654592, + "flos": 26612787525120.0, + "grad_norm": 2.0864666891879335, + "language_loss": 0.88721973, + "learning_rate": 3.6286348878755452e-06, + "loss": 0.90802705, + "num_input_tokens_seen": 79314005, + "step": 3679, + "time_per_iteration": 5.91292929649353 + }, + { + "auxiliary_loss_clip": 0.01096109, + "auxiliary_loss_mlp": 0.01046346, + "balance_loss_clip": 1.03672361, + "balance_loss_mlp": 1.02940679, + "epoch": 0.22125356981812716, + "flos": 16360363347840.0, + "grad_norm": 2.4728162108826104, + "language_loss": 0.87088096, + "learning_rate": 3.6284088057413803e-06, + "loss": 0.89230549, + "num_input_tokens_seen": 79331030, + "step": 3680, + "time_per_iteration": 2.5861921310424805 + }, + { + "auxiliary_loss_clip": 0.0106721, + "auxiliary_loss_mlp": 0.01044614, + "balance_loss_clip": 1.0344193, + "balance_loss_mlp": 1.0287534, + "epoch": 0.22131369307079513, + "flos": 21651118151040.0, + "grad_norm": 1.8794686467851118, + "language_loss": 0.81549698, + "learning_rate": 3.6281826618577894e-06, + "loss": 0.8366152, + "num_input_tokens_seen": 79348560, + "step": 3681, + "time_per_iteration": 2.7686407566070557 + }, + { + "auxiliary_loss_clip": 0.01097319, + "auxiliary_loss_mlp": 0.00748783, + "balance_loss_clip": 1.03500056, + "balance_loss_mlp": 1.00110328, + "epoch": 0.2213738163234631, + "flos": 19609524927360.0, + "grad_norm": 2.2084773041123, + "language_loss": 0.79731905, + "learning_rate": 3.62795645623335e-06, + "loss": 0.81578004, + "num_input_tokens_seen": 79367175, + "step": 3682, + "time_per_iteration": 2.7065982818603516 + }, + { + "auxiliary_loss_clip": 0.01067706, + "auxiliary_loss_mlp": 0.01042459, + "balance_loss_clip": 1.0309242, + "balance_loss_mlp": 1.02515018, + "epoch": 0.22143393957613106, + "flos": 23623404082560.0, + "grad_norm": 1.783420673679914, + "language_loss": 0.77604866, + "learning_rate": 3.627730188876638e-06, + "loss": 0.79715025, + "num_input_tokens_seen": 79388435, + "step": 3683, + "time_per_iteration": 2.9025588035583496 + }, + { + "auxiliary_loss_clip": 0.01072697, + "auxiliary_loss_mlp": 0.01045394, + "balance_loss_clip": 1.02903426, + "balance_loss_mlp": 1.02952743, + "epoch": 0.22149406282879905, + "flos": 26177801823360.0, + "grad_norm": 3.2979313276143705, + "language_loss": 0.72642171, + "learning_rate": 3.627503859796234e-06, + "loss": 0.74760264, + "num_input_tokens_seen": 79407910, + "step": 3684, + "time_per_iteration": 2.8148088455200195 + }, + { + "auxiliary_loss_clip": 0.01031037, + "auxiliary_loss_mlp": 0.01041143, + "balance_loss_clip": 1.02950716, + "balance_loss_mlp": 1.02407241, + "epoch": 0.221554186081467, + "flos": 14538758970240.0, + "grad_norm": 2.4427714263353053, + "language_loss": 0.80540675, + "learning_rate": 3.6272774690007207e-06, + "loss": 0.82612854, + "num_input_tokens_seen": 79424020, + "step": 3685, + "time_per_iteration": 2.8148672580718994 + }, + { + "auxiliary_loss_clip": 0.0109558, + "auxiliary_loss_mlp": 0.01038636, + "balance_loss_clip": 1.03206766, + "balance_loss_mlp": 1.02383614, + "epoch": 0.22161430933413498, + "flos": 22238257864320.0, + "grad_norm": 1.3781580518055585, + "language_loss": 0.87323868, + "learning_rate": 3.6270510164986823e-06, + "loss": 0.89458084, + "num_input_tokens_seen": 79445605, + "step": 3686, + "time_per_iteration": 2.7134904861450195 + }, + { + "auxiliary_loss_clip": 0.01085561, + "auxiliary_loss_mlp": 0.01041697, + "balance_loss_clip": 1.03246355, + "balance_loss_mlp": 1.02562714, + "epoch": 0.22167443258680294, + "flos": 23476529370240.0, + "grad_norm": 1.8655872557502609, + "language_loss": 0.77278346, + "learning_rate": 3.626824502298707e-06, + "loss": 0.794056, + "num_input_tokens_seen": 79463850, + "step": 3687, + "time_per_iteration": 2.701997995376587 + }, + { + "auxiliary_loss_clip": 0.01067911, + "auxiliary_loss_mlp": 0.0105202, + "balance_loss_clip": 1.03056383, + "balance_loss_mlp": 1.03418624, + "epoch": 0.2217345558394709, + "flos": 23221132692480.0, + "grad_norm": 1.7650818398818173, + "language_loss": 0.84934092, + "learning_rate": 3.626597926409383e-06, + "loss": 0.87054026, + "num_input_tokens_seen": 79482845, + "step": 3688, + "time_per_iteration": 2.8258142471313477 + }, + { + "auxiliary_loss_clip": 0.01055141, + "auxiliary_loss_mlp": 0.01042537, + "balance_loss_clip": 1.03119349, + "balance_loss_mlp": 1.02566862, + "epoch": 0.22179467909213887, + "flos": 20011078045440.0, + "grad_norm": 2.095719003829699, + "language_loss": 0.81204855, + "learning_rate": 3.6263712888393027e-06, + "loss": 0.83302534, + "num_input_tokens_seen": 79501550, + "step": 3689, + "time_per_iteration": 2.8326988220214844 + }, + { + "auxiliary_loss_clip": 0.01077771, + "auxiliary_loss_mlp": 0.0104267, + "balance_loss_clip": 1.03316665, + "balance_loss_mlp": 1.02663612, + "epoch": 0.22185480234480687, + "flos": 19683034110720.0, + "grad_norm": 1.8687059573982456, + "language_loss": 0.6951499, + "learning_rate": 3.626144589597061e-06, + "loss": 0.71635437, + "num_input_tokens_seen": 79519680, + "step": 3690, + "time_per_iteration": 2.776367664337158 + }, + { + "auxiliary_loss_clip": 0.01095199, + "auxiliary_loss_mlp": 0.00749015, + "balance_loss_clip": 1.03419578, + "balance_loss_mlp": 1.00123048, + "epoch": 0.22191492559747483, + "flos": 21981316901760.0, + "grad_norm": 2.2106039559050203, + "language_loss": 0.7224679, + "learning_rate": 3.6259178286912528e-06, + "loss": 0.74091005, + "num_input_tokens_seen": 79539000, + "step": 3691, + "time_per_iteration": 2.7460262775421143 + }, + { + "auxiliary_loss_clip": 0.01092752, + "auxiliary_loss_mlp": 0.01045268, + "balance_loss_clip": 1.03642416, + "balance_loss_mlp": 1.0280062, + "epoch": 0.2219750488501428, + "flos": 23222066446080.0, + "grad_norm": 1.9289280193584264, + "language_loss": 0.71081436, + "learning_rate": 3.625691006130477e-06, + "loss": 0.73219454, + "num_input_tokens_seen": 79559695, + "step": 3692, + "time_per_iteration": 2.7134146690368652 + }, + { + "auxiliary_loss_clip": 0.01093315, + "auxiliary_loss_mlp": 0.01047487, + "balance_loss_clip": 1.03496134, + "balance_loss_mlp": 1.03101254, + "epoch": 0.22203517210281076, + "flos": 22453685683200.0, + "grad_norm": 1.6612384843536434, + "language_loss": 0.87240636, + "learning_rate": 3.6254641219233362e-06, + "loss": 0.89381444, + "num_input_tokens_seen": 79579095, + "step": 3693, + "time_per_iteration": 2.731602191925049 + }, + { + "auxiliary_loss_clip": 0.01090029, + "auxiliary_loss_mlp": 0.01038758, + "balance_loss_clip": 1.03473639, + "balance_loss_mlp": 1.02382064, + "epoch": 0.22209529535547873, + "flos": 17564555825280.0, + "grad_norm": 1.9988266179288934, + "language_loss": 0.84870195, + "learning_rate": 3.6252371760784325e-06, + "loss": 0.86998987, + "num_input_tokens_seen": 79596430, + "step": 3694, + "time_per_iteration": 2.5934667587280273 + }, + { + "auxiliary_loss_clip": 0.01053262, + "auxiliary_loss_mlp": 0.01041913, + "balance_loss_clip": 1.02650809, + "balance_loss_mlp": 1.02465129, + "epoch": 0.2221554186081467, + "flos": 21469015175040.0, + "grad_norm": 1.9278293241803548, + "language_loss": 0.69576883, + "learning_rate": 3.6250101686043725e-06, + "loss": 0.71672058, + "num_input_tokens_seen": 79615825, + "step": 3695, + "time_per_iteration": 2.717094898223877 + }, + { + "auxiliary_loss_clip": 0.01068867, + "auxiliary_loss_mlp": 0.01037708, + "balance_loss_clip": 1.03333282, + "balance_loss_mlp": 1.02265179, + "epoch": 0.22221554186081466, + "flos": 27673445255040.0, + "grad_norm": 1.4803769946703016, + "language_loss": 0.71644789, + "learning_rate": 3.6247830995097637e-06, + "loss": 0.73751366, + "num_input_tokens_seen": 79637875, + "step": 3696, + "time_per_iteration": 2.7205264568328857 + }, + { + "auxiliary_loss_clip": 0.01091283, + "auxiliary_loss_mlp": 0.01036749, + "balance_loss_clip": 1.03371024, + "balance_loss_mlp": 1.02064395, + "epoch": 0.22227566511348265, + "flos": 25958926298880.0, + "grad_norm": 1.852127950558493, + "language_loss": 0.87844634, + "learning_rate": 3.624555968803217e-06, + "loss": 0.89972663, + "num_input_tokens_seen": 79656970, + "step": 3697, + "time_per_iteration": 2.930898904800415 + }, + { + "auxiliary_loss_clip": 0.01068483, + "auxiliary_loss_mlp": 0.01041162, + "balance_loss_clip": 1.03039026, + "balance_loss_mlp": 1.02639174, + "epoch": 0.22233578836615062, + "flos": 39203678833920.0, + "grad_norm": 1.5794592929524427, + "language_loss": 0.66121602, + "learning_rate": 3.624328776493346e-06, + "loss": 0.68231237, + "num_input_tokens_seen": 79680275, + "step": 3698, + "time_per_iteration": 2.933245897293091 + }, + { + "auxiliary_loss_clip": 0.01090469, + "auxiliary_loss_mlp": 0.01037392, + "balance_loss_clip": 1.03359234, + "balance_loss_mlp": 1.02049994, + "epoch": 0.22239591161881858, + "flos": 36283782251520.0, + "grad_norm": 2.305972590001303, + "language_loss": 0.82582688, + "learning_rate": 3.6241015225887637e-06, + "loss": 0.8471055, + "num_input_tokens_seen": 79701255, + "step": 3699, + "time_per_iteration": 2.8050434589385986 + }, + { + "auxiliary_loss_clip": 0.01075062, + "auxiliary_loss_mlp": 0.01041607, + "balance_loss_clip": 1.03189957, + "balance_loss_mlp": 1.02470326, + "epoch": 0.22245603487148655, + "flos": 19719591177600.0, + "grad_norm": 8.226846510687153, + "language_loss": 0.79443884, + "learning_rate": 3.62387420709809e-06, + "loss": 0.81560552, + "num_input_tokens_seen": 79721315, + "step": 3700, + "time_per_iteration": 2.708136796951294 + }, + { + "auxiliary_loss_clip": 0.01056288, + "auxiliary_loss_mlp": 0.0104241, + "balance_loss_clip": 1.03186083, + "balance_loss_mlp": 1.0250653, + "epoch": 0.2225161581241545, + "flos": 46280450615040.0, + "grad_norm": 1.9427966118328028, + "language_loss": 0.72324473, + "learning_rate": 3.623646830029943e-06, + "loss": 0.74423176, + "num_input_tokens_seen": 79742705, + "step": 3701, + "time_per_iteration": 3.016767740249634 + }, + { + "auxiliary_loss_clip": 0.01086984, + "auxiliary_loss_mlp": 0.01042408, + "balance_loss_clip": 1.03162408, + "balance_loss_mlp": 1.02600455, + "epoch": 0.22257628137682248, + "flos": 23696194993920.0, + "grad_norm": 1.7940500519389508, + "language_loss": 0.80463457, + "learning_rate": 3.6234193913929454e-06, + "loss": 0.82592851, + "num_input_tokens_seen": 79763000, + "step": 3702, + "time_per_iteration": 2.687634229660034 + }, + { + "auxiliary_loss_clip": 0.01082391, + "auxiliary_loss_mlp": 0.01036176, + "balance_loss_clip": 1.02980947, + "balance_loss_mlp": 1.01977289, + "epoch": 0.22263640462949044, + "flos": 19353984595200.0, + "grad_norm": 1.893476112814983, + "language_loss": 0.78214073, + "learning_rate": 3.623191891195723e-06, + "loss": 0.80332637, + "num_input_tokens_seen": 79781335, + "step": 3703, + "time_per_iteration": 2.5813944339752197 + }, + { + "auxiliary_loss_clip": 0.01092396, + "auxiliary_loss_mlp": 0.01040636, + "balance_loss_clip": 1.03350794, + "balance_loss_mlp": 1.02259994, + "epoch": 0.22269652788215843, + "flos": 20776047016320.0, + "grad_norm": 1.960478079032007, + "language_loss": 0.73943949, + "learning_rate": 3.6229643294469005e-06, + "loss": 0.76076984, + "num_input_tokens_seen": 79800150, + "step": 3704, + "time_per_iteration": 2.6057567596435547 + }, + { + "auxiliary_loss_clip": 0.01060208, + "auxiliary_loss_mlp": 0.01042307, + "balance_loss_clip": 1.0340445, + "balance_loss_mlp": 1.02641606, + "epoch": 0.2227566511348264, + "flos": 47958843467520.0, + "grad_norm": 1.8774726321583433, + "language_loss": 0.64398676, + "learning_rate": 3.6227367061551074e-06, + "loss": 0.66501188, + "num_input_tokens_seen": 79822390, + "step": 3705, + "time_per_iteration": 2.935626983642578 + }, + { + "auxiliary_loss_clip": 0.01006049, + "auxiliary_loss_mlp": 0.01009427, + "balance_loss_clip": 1.02039301, + "balance_loss_mlp": 1.00556457, + "epoch": 0.22281677438749437, + "flos": 66218953230720.0, + "grad_norm": 1.3247608070957506, + "language_loss": 0.65235341, + "learning_rate": 3.6225090213289766e-06, + "loss": 0.67250824, + "num_input_tokens_seen": 79873350, + "step": 3706, + "time_per_iteration": 3.238664388656616 + }, + { + "auxiliary_loss_clip": 0.01063606, + "auxiliary_loss_mlp": 0.01040341, + "balance_loss_clip": 1.02932048, + "balance_loss_mlp": 1.024248, + "epoch": 0.22287689764016233, + "flos": 21871609787520.0, + "grad_norm": 2.2764908608153673, + "language_loss": 0.80724794, + "learning_rate": 3.622281274977141e-06, + "loss": 0.82828736, + "num_input_tokens_seen": 79891715, + "step": 3707, + "time_per_iteration": 2.647758960723877 + }, + { + "auxiliary_loss_clip": 0.01099332, + "auxiliary_loss_mlp": 0.01036384, + "balance_loss_clip": 1.03374791, + "balance_loss_mlp": 1.02076769, + "epoch": 0.2229370208928303, + "flos": 27672475587840.0, + "grad_norm": 2.0151718435081984, + "language_loss": 0.78624946, + "learning_rate": 3.6220534671082367e-06, + "loss": 0.80760658, + "num_input_tokens_seen": 79911175, + "step": 3708, + "time_per_iteration": 2.6016902923583984 + }, + { + "auxiliary_loss_clip": 0.01078518, + "auxiliary_loss_mlp": 0.01045093, + "balance_loss_clip": 1.03474998, + "balance_loss_mlp": 1.02840412, + "epoch": 0.22299714414549826, + "flos": 30154657034880.0, + "grad_norm": 1.9900820517108944, + "language_loss": 0.80460763, + "learning_rate": 3.6218255977309024e-06, + "loss": 0.82584369, + "num_input_tokens_seen": 79931875, + "step": 3709, + "time_per_iteration": 2.774142265319824 + }, + { + "auxiliary_loss_clip": 0.0108971, + "auxiliary_loss_mlp": 0.00749071, + "balance_loss_clip": 1.03288853, + "balance_loss_mlp": 1.00127602, + "epoch": 0.22305726739816625, + "flos": 23143134309120.0, + "grad_norm": 1.7606476786867038, + "language_loss": 0.68954456, + "learning_rate": 3.6215976668537787e-06, + "loss": 0.70793241, + "num_input_tokens_seen": 79952445, + "step": 3710, + "time_per_iteration": 2.807751178741455 + }, + { + "auxiliary_loss_clip": 0.01053866, + "auxiliary_loss_mlp": 0.01051462, + "balance_loss_clip": 1.02850747, + "balance_loss_mlp": 1.03426564, + "epoch": 0.22311739065083422, + "flos": 19172061187200.0, + "grad_norm": 2.1238712684697947, + "language_loss": 0.91009915, + "learning_rate": 3.6213696744855096e-06, + "loss": 0.9311524, + "num_input_tokens_seen": 79971030, + "step": 3711, + "time_per_iteration": 2.8004343509674072 + }, + { + "auxiliary_loss_clip": 0.01077869, + "auxiliary_loss_mlp": 0.01052861, + "balance_loss_clip": 1.03436816, + "balance_loss_mlp": 1.03480124, + "epoch": 0.22317751390350218, + "flos": 13617757319040.0, + "grad_norm": 8.349168525612718, + "language_loss": 0.89385307, + "learning_rate": 3.6211416206347395e-06, + "loss": 0.91516036, + "num_input_tokens_seen": 79982085, + "step": 3712, + "time_per_iteration": 2.68656587600708 + }, + { + "auxiliary_loss_clip": 0.01100287, + "auxiliary_loss_mlp": 0.010482, + "balance_loss_clip": 1.03521562, + "balance_loss_mlp": 1.03201175, + "epoch": 0.22323763715617015, + "flos": 11029065068160.0, + "grad_norm": 2.6883336844605705, + "language_loss": 0.7549026, + "learning_rate": 3.620913505310117e-06, + "loss": 0.77638745, + "num_input_tokens_seen": 79997460, + "step": 3713, + "time_per_iteration": 2.597381830215454 + }, + { + "auxiliary_loss_clip": 0.0105262, + "auxiliary_loss_mlp": 0.01044364, + "balance_loss_clip": 1.03481889, + "balance_loss_mlp": 1.02771091, + "epoch": 0.22329776040883811, + "flos": 41351531466240.0, + "grad_norm": 2.024520332567683, + "language_loss": 0.62857163, + "learning_rate": 3.6206853285202917e-06, + "loss": 0.6495415, + "num_input_tokens_seen": 80022450, + "step": 3714, + "time_per_iteration": 3.1307051181793213 + }, + { + "auxiliary_loss_clip": 0.01072092, + "auxiliary_loss_mlp": 0.01035925, + "balance_loss_clip": 1.0345664, + "balance_loss_mlp": 1.01985574, + "epoch": 0.22335788366150608, + "flos": 25119478477440.0, + "grad_norm": 1.793303857998491, + "language_loss": 0.79160756, + "learning_rate": 3.6204570902739164e-06, + "loss": 0.81268775, + "num_input_tokens_seen": 80042100, + "step": 3715, + "time_per_iteration": 2.9169294834136963 + }, + { + "auxiliary_loss_clip": 0.01073057, + "auxiliary_loss_mlp": 0.01051037, + "balance_loss_clip": 1.03896689, + "balance_loss_mlp": 1.0347892, + "epoch": 0.22341800691417404, + "flos": 16983377769600.0, + "grad_norm": 2.3640048155186495, + "language_loss": 0.77239281, + "learning_rate": 3.620228790579645e-06, + "loss": 0.79363376, + "num_input_tokens_seen": 80059690, + "step": 3716, + "time_per_iteration": 2.7945055961608887 + }, + { + "auxiliary_loss_clip": 0.01074798, + "auxiliary_loss_mlp": 0.01045365, + "balance_loss_clip": 1.03233719, + "balance_loss_mlp": 1.02906895, + "epoch": 0.22347813016684204, + "flos": 14136738975360.0, + "grad_norm": 1.9764469106312392, + "language_loss": 0.79254711, + "learning_rate": 3.6200004294461367e-06, + "loss": 0.81374872, + "num_input_tokens_seen": 80076060, + "step": 3717, + "time_per_iteration": 2.819689989089966 + }, + { + "auxiliary_loss_clip": 0.01033465, + "auxiliary_loss_mlp": 0.0104443, + "balance_loss_clip": 1.03138375, + "balance_loss_mlp": 1.0265131, + "epoch": 0.22353825341951, + "flos": 23583147914880.0, + "grad_norm": 2.114989856174041, + "language_loss": 0.67885399, + "learning_rate": 3.6197720068820497e-06, + "loss": 0.69963288, + "num_input_tokens_seen": 80094760, + "step": 3718, + "time_per_iteration": 2.8355143070220947 + }, + { + "auxiliary_loss_clip": 0.01073202, + "auxiliary_loss_mlp": 0.01042956, + "balance_loss_clip": 1.03043497, + "balance_loss_mlp": 1.02526546, + "epoch": 0.22359837667217797, + "flos": 29824206888960.0, + "grad_norm": 1.4372651627814494, + "language_loss": 0.80840993, + "learning_rate": 3.619543522896045e-06, + "loss": 0.82957149, + "num_input_tokens_seen": 80114475, + "step": 3719, + "time_per_iteration": 2.798834800720215 + }, + { + "auxiliary_loss_clip": 0.01086472, + "auxiliary_loss_mlp": 0.01053472, + "balance_loss_clip": 1.03791833, + "balance_loss_mlp": 1.03534031, + "epoch": 0.22365849992484593, + "flos": 17603088140160.0, + "grad_norm": 2.8950849399413134, + "language_loss": 0.86591315, + "learning_rate": 3.6193149774967885e-06, + "loss": 0.88731259, + "num_input_tokens_seen": 80132920, + "step": 3720, + "time_per_iteration": 4.170133113861084 + }, + { + "auxiliary_loss_clip": 0.01072574, + "auxiliary_loss_mlp": 0.01033733, + "balance_loss_clip": 1.0346446, + "balance_loss_mlp": 1.01772285, + "epoch": 0.2237186231775139, + "flos": 22710949868160.0, + "grad_norm": 1.755849185444099, + "language_loss": 0.74625921, + "learning_rate": 3.619086370692945e-06, + "loss": 0.7673223, + "num_input_tokens_seen": 80152845, + "step": 3721, + "time_per_iteration": 4.231414318084717 + }, + { + "auxiliary_loss_clip": 0.01107587, + "auxiliary_loss_mlp": 0.01044196, + "balance_loss_clip": 1.03820348, + "balance_loss_mlp": 1.02693391, + "epoch": 0.22377874643018186, + "flos": 13371518609280.0, + "grad_norm": 2.362133236056242, + "language_loss": 0.7886703, + "learning_rate": 3.6188577024931844e-06, + "loss": 0.81018817, + "num_input_tokens_seen": 80170680, + "step": 3722, + "time_per_iteration": 2.5309383869171143 + }, + { + "auxiliary_loss_clip": 0.01069689, + "auxiliary_loss_mlp": 0.0103905, + "balance_loss_clip": 1.03660023, + "balance_loss_mlp": 1.02381492, + "epoch": 0.22383886968284986, + "flos": 17894970057600.0, + "grad_norm": 2.1417879388930685, + "language_loss": 0.82213742, + "learning_rate": 3.618628972906178e-06, + "loss": 0.84322476, + "num_input_tokens_seen": 80189030, + "step": 3723, + "time_per_iteration": 2.6639633178710938 + }, + { + "auxiliary_loss_clip": 0.01105186, + "auxiliary_loss_mlp": 0.01045622, + "balance_loss_clip": 1.03708744, + "balance_loss_mlp": 1.02935004, + "epoch": 0.22389899293551782, + "flos": 23879123982720.0, + "grad_norm": 2.527128379953545, + "language_loss": 0.84721446, + "learning_rate": 3.6184001819405984e-06, + "loss": 0.86872256, + "num_input_tokens_seen": 80208365, + "step": 3724, + "time_per_iteration": 2.585466146469116 + }, + { + "auxiliary_loss_clip": 0.01067855, + "auxiliary_loss_mlp": 0.0103981, + "balance_loss_clip": 1.03157222, + "balance_loss_mlp": 1.02379441, + "epoch": 0.2239591161881858, + "flos": 27272430840960.0, + "grad_norm": 2.1204370686519467, + "language_loss": 0.78997505, + "learning_rate": 3.618171329605121e-06, + "loss": 0.81105167, + "num_input_tokens_seen": 80228685, + "step": 3725, + "time_per_iteration": 2.674821376800537 + }, + { + "auxiliary_loss_clip": 0.01047066, + "auxiliary_loss_mlp": 0.01036655, + "balance_loss_clip": 1.03234339, + "balance_loss_mlp": 1.02106202, + "epoch": 0.22401923944085375, + "flos": 22236857233920.0, + "grad_norm": 1.9225473124112027, + "language_loss": 0.77140296, + "learning_rate": 3.6179424159084254e-06, + "loss": 0.7922402, + "num_input_tokens_seen": 80247635, + "step": 3726, + "time_per_iteration": 5.901752948760986 + }, + { + "auxiliary_loss_clip": 0.01098017, + "auxiliary_loss_mlp": 0.01041967, + "balance_loss_clip": 1.03421116, + "balance_loss_mlp": 1.02359664, + "epoch": 0.22407936269352172, + "flos": 12053668521600.0, + "grad_norm": 8.93949460624947, + "language_loss": 0.72511083, + "learning_rate": 3.6177134408591914e-06, + "loss": 0.74651068, + "num_input_tokens_seen": 80260045, + "step": 3727, + "time_per_iteration": 2.5029568672180176 + }, + { + "auxiliary_loss_clip": 0.01106887, + "auxiliary_loss_mlp": 0.01044755, + "balance_loss_clip": 1.0365212, + "balance_loss_mlp": 1.02582455, + "epoch": 0.22413948594618968, + "flos": 19353553632000.0, + "grad_norm": 3.3650209688367285, + "language_loss": 0.86695218, + "learning_rate": 3.6174844044661013e-06, + "loss": 0.88846862, + "num_input_tokens_seen": 80277680, + "step": 3728, + "time_per_iteration": 2.7039694786071777 + }, + { + "auxiliary_loss_clip": 0.01071962, + "auxiliary_loss_mlp": 0.0104614, + "balance_loss_clip": 1.03332353, + "balance_loss_mlp": 1.02715015, + "epoch": 0.22419960919885765, + "flos": 24170000319360.0, + "grad_norm": 3.6500673957289926, + "language_loss": 0.80237162, + "learning_rate": 3.6172553067378406e-06, + "loss": 0.82355261, + "num_input_tokens_seen": 80294795, + "step": 3729, + "time_per_iteration": 2.7407493591308594 + }, + { + "auxiliary_loss_clip": 0.01077508, + "auxiliary_loss_mlp": 0.01045061, + "balance_loss_clip": 1.03368926, + "balance_loss_mlp": 1.02955222, + "epoch": 0.22425973245152564, + "flos": 27378977558400.0, + "grad_norm": 1.6185047883482646, + "language_loss": 0.86655873, + "learning_rate": 3.6170261476830964e-06, + "loss": 0.88778448, + "num_input_tokens_seen": 80315425, + "step": 3730, + "time_per_iteration": 2.7388317584991455 + }, + { + "auxiliary_loss_clip": 0.01073083, + "auxiliary_loss_mlp": 0.00748964, + "balance_loss_clip": 1.03144026, + "balance_loss_mlp": 1.0012908, + "epoch": 0.2243198557041936, + "flos": 13735652734080.0, + "grad_norm": 1.8532277958709902, + "language_loss": 0.72846454, + "learning_rate": 3.616796927310559e-06, + "loss": 0.74668503, + "num_input_tokens_seen": 80333905, + "step": 3731, + "time_per_iteration": 2.753880500793457 + }, + { + "auxiliary_loss_clip": 0.01071861, + "auxiliary_loss_mlp": 0.01037738, + "balance_loss_clip": 1.03390062, + "balance_loss_mlp": 1.02159691, + "epoch": 0.22437997895686157, + "flos": 19530700531200.0, + "grad_norm": 1.7644122135007188, + "language_loss": 0.7526381, + "learning_rate": 3.6165676456289195e-06, + "loss": 0.77373409, + "num_input_tokens_seen": 80352165, + "step": 3732, + "time_per_iteration": 2.936622142791748 + }, + { + "auxiliary_loss_clip": 0.01102371, + "auxiliary_loss_mlp": 0.01052216, + "balance_loss_clip": 1.03574383, + "balance_loss_mlp": 1.03543162, + "epoch": 0.22444010220952954, + "flos": 23696230907520.0, + "grad_norm": 1.6910891982477403, + "language_loss": 0.88365138, + "learning_rate": 3.616338302646873e-06, + "loss": 0.9051972, + "num_input_tokens_seen": 80371305, + "step": 3733, + "time_per_iteration": 2.607025384902954 + }, + { + "auxiliary_loss_clip": 0.01056128, + "auxiliary_loss_mlp": 0.01041482, + "balance_loss_clip": 1.03001761, + "balance_loss_mlp": 1.02513838, + "epoch": 0.2245002254621975, + "flos": 22382905933440.0, + "grad_norm": 1.6451250065686687, + "language_loss": 0.84476256, + "learning_rate": 3.6161088983731166e-06, + "loss": 0.86573869, + "num_input_tokens_seen": 80391020, + "step": 3734, + "time_per_iteration": 2.6635210514068604 + }, + { + "auxiliary_loss_clip": 0.01076809, + "auxiliary_loss_mlp": 0.01045526, + "balance_loss_clip": 1.03354239, + "balance_loss_mlp": 1.02912283, + "epoch": 0.22456034871486547, + "flos": 26942303917440.0, + "grad_norm": 2.0276812207567976, + "language_loss": 0.76781642, + "learning_rate": 3.6158794328163482e-06, + "loss": 0.78903985, + "num_input_tokens_seen": 80411365, + "step": 3735, + "time_per_iteration": 2.6992745399475098 + }, + { + "auxiliary_loss_clip": 0.01085622, + "auxiliary_loss_mlp": 0.0104741, + "balance_loss_clip": 1.035097, + "balance_loss_mlp": 1.03235388, + "epoch": 0.22462047196753343, + "flos": 28983538005120.0, + "grad_norm": 1.984879219927335, + "language_loss": 0.84329212, + "learning_rate": 3.6156499059852702e-06, + "loss": 0.86462241, + "num_input_tokens_seen": 80431075, + "step": 3736, + "time_per_iteration": 2.698493719100952 + }, + { + "auxiliary_loss_clip": 0.01069439, + "auxiliary_loss_mlp": 0.01040745, + "balance_loss_clip": 1.03611875, + "balance_loss_mlp": 1.02483082, + "epoch": 0.22468059522020142, + "flos": 20011329440640.0, + "grad_norm": 1.614873875272634, + "language_loss": 0.86293089, + "learning_rate": 3.615420317888586e-06, + "loss": 0.88403273, + "num_input_tokens_seen": 80449240, + "step": 3737, + "time_per_iteration": 2.6874730587005615 + }, + { + "auxiliary_loss_clip": 0.01102996, + "auxiliary_loss_mlp": 0.01048972, + "balance_loss_clip": 1.03543866, + "balance_loss_mlp": 1.03193748, + "epoch": 0.2247407184728694, + "flos": 29314239546240.0, + "grad_norm": 1.7689673232359384, + "language_loss": 0.78990185, + "learning_rate": 3.6151906685350006e-06, + "loss": 0.81142157, + "num_input_tokens_seen": 80467900, + "step": 3738, + "time_per_iteration": 2.621351718902588 + }, + { + "auxiliary_loss_clip": 0.01067617, + "auxiliary_loss_mlp": 0.01039867, + "balance_loss_clip": 1.03125751, + "balance_loss_mlp": 1.02457261, + "epoch": 0.22480084172553735, + "flos": 22310366417280.0, + "grad_norm": 1.6147948363359834, + "language_loss": 0.76005852, + "learning_rate": 3.614960957933224e-06, + "loss": 0.78113341, + "num_input_tokens_seen": 80487100, + "step": 3739, + "time_per_iteration": 2.6994450092315674 + }, + { + "auxiliary_loss_clip": 0.01057336, + "auxiliary_loss_mlp": 0.01046632, + "balance_loss_clip": 1.02894807, + "balance_loss_mlp": 1.02871466, + "epoch": 0.22486096497820532, + "flos": 25591272641280.0, + "grad_norm": 1.8859452087260826, + "language_loss": 0.74458694, + "learning_rate": 3.6147311860919655e-06, + "loss": 0.76562667, + "num_input_tokens_seen": 80508625, + "step": 3740, + "time_per_iteration": 2.766537666320801 + }, + { + "auxiliary_loss_clip": 0.01099218, + "auxiliary_loss_mlp": 0.01039041, + "balance_loss_clip": 1.03428769, + "balance_loss_mlp": 1.02276897, + "epoch": 0.22492108823087328, + "flos": 17639824775040.0, + "grad_norm": 1.8092856955510837, + "language_loss": 0.75410706, + "learning_rate": 3.614501353019939e-06, + "loss": 0.77548963, + "num_input_tokens_seen": 80527345, + "step": 3741, + "time_per_iteration": 2.5346553325653076 + }, + { + "auxiliary_loss_clip": 0.01079069, + "auxiliary_loss_mlp": 0.01037224, + "balance_loss_clip": 1.03436434, + "balance_loss_mlp": 1.02127409, + "epoch": 0.22498121148354125, + "flos": 16034653797120.0, + "grad_norm": 1.632086829667128, + "language_loss": 0.87620437, + "learning_rate": 3.6142714587258592e-06, + "loss": 0.89736736, + "num_input_tokens_seen": 80545545, + "step": 3742, + "time_per_iteration": 2.647282600402832 + }, + { + "auxiliary_loss_clip": 0.01039308, + "auxiliary_loss_mlp": 0.01045648, + "balance_loss_clip": 1.02882624, + "balance_loss_mlp": 1.02907825, + "epoch": 0.22504133473620924, + "flos": 24023772051840.0, + "grad_norm": 2.0551159409629816, + "language_loss": 0.81461191, + "learning_rate": 3.614041503218444e-06, + "loss": 0.83546144, + "num_input_tokens_seen": 80565040, + "step": 3743, + "time_per_iteration": 2.917734146118164 + }, + { + "auxiliary_loss_clip": 0.01086983, + "auxiliary_loss_mlp": 0.01040221, + "balance_loss_clip": 1.03188968, + "balance_loss_mlp": 1.02436662, + "epoch": 0.2251014579888772, + "flos": 16763963541120.0, + "grad_norm": 2.808207844068317, + "language_loss": 0.63292539, + "learning_rate": 3.6138114865064134e-06, + "loss": 0.65419739, + "num_input_tokens_seen": 80582815, + "step": 3744, + "time_per_iteration": 2.6017205715179443 + }, + { + "auxiliary_loss_clip": 0.01086939, + "auxiliary_loss_mlp": 0.01038014, + "balance_loss_clip": 1.03229308, + "balance_loss_mlp": 1.02183747, + "epoch": 0.22516158124154517, + "flos": 13991013498240.0, + "grad_norm": 2.877043635086266, + "language_loss": 0.7638675, + "learning_rate": 3.613581408598489e-06, + "loss": 0.78511703, + "num_input_tokens_seen": 80600865, + "step": 3745, + "time_per_iteration": 2.603858709335327 + }, + { + "auxiliary_loss_clip": 0.01067085, + "auxiliary_loss_mlp": 0.01039802, + "balance_loss_clip": 1.03236854, + "balance_loss_mlp": 1.02387512, + "epoch": 0.22522170449421314, + "flos": 14390016750720.0, + "grad_norm": 1.756713823538413, + "language_loss": 0.80396813, + "learning_rate": 3.6133512695033965e-06, + "loss": 0.825037, + "num_input_tokens_seen": 80617455, + "step": 3746, + "time_per_iteration": 2.6134204864501953 + }, + { + "auxiliary_loss_clip": 0.01090487, + "auxiliary_loss_mlp": 0.01041996, + "balance_loss_clip": 1.03324819, + "balance_loss_mlp": 1.02631962, + "epoch": 0.2252818277468811, + "flos": 23805542972160.0, + "grad_norm": 2.785666808048251, + "language_loss": 0.86197037, + "learning_rate": 3.613121069229862e-06, + "loss": 0.88329518, + "num_input_tokens_seen": 80635125, + "step": 3747, + "time_per_iteration": 2.6832199096679688 + }, + { + "auxiliary_loss_clip": 0.01087142, + "auxiliary_loss_mlp": 0.00749009, + "balance_loss_clip": 1.03142941, + "balance_loss_mlp": 1.00145984, + "epoch": 0.22534195099954907, + "flos": 24718033100160.0, + "grad_norm": 1.6621957918384496, + "language_loss": 0.76276875, + "learning_rate": 3.6128908077866145e-06, + "loss": 0.78113025, + "num_input_tokens_seen": 80656370, + "step": 3748, + "time_per_iteration": 2.616558313369751 + }, + { + "auxiliary_loss_clip": 0.01102771, + "auxiliary_loss_mlp": 0.01044312, + "balance_loss_clip": 1.03542161, + "balance_loss_mlp": 1.02762318, + "epoch": 0.22540207425221703, + "flos": 21032341534080.0, + "grad_norm": 1.6148503275336483, + "language_loss": 0.79482162, + "learning_rate": 3.6126604851823864e-06, + "loss": 0.81629246, + "num_input_tokens_seen": 80676495, + "step": 3749, + "time_per_iteration": 2.5301363468170166 + }, + { + "auxiliary_loss_clip": 0.01074287, + "auxiliary_loss_mlp": 0.01036385, + "balance_loss_clip": 1.03155208, + "balance_loss_mlp": 1.0215435, + "epoch": 0.22546219750488503, + "flos": 19390362094080.0, + "grad_norm": 1.7931659850739745, + "language_loss": 0.79692221, + "learning_rate": 3.6124301014259108e-06, + "loss": 0.81802893, + "num_input_tokens_seen": 80694755, + "step": 3750, + "time_per_iteration": 2.7342379093170166 + }, + { + "auxiliary_loss_clip": 0.01046328, + "auxiliary_loss_mlp": 0.0104865, + "balance_loss_clip": 1.03262818, + "balance_loss_mlp": 1.03205585, + "epoch": 0.225522320757553, + "flos": 25192628524800.0, + "grad_norm": 3.1701394992943452, + "language_loss": 0.81808567, + "learning_rate": 3.6121996565259244e-06, + "loss": 0.83903545, + "num_input_tokens_seen": 80713670, + "step": 3751, + "time_per_iteration": 2.723283290863037 + }, + { + "auxiliary_loss_clip": 0.01067251, + "auxiliary_loss_mlp": 0.01042755, + "balance_loss_clip": 1.03244507, + "balance_loss_mlp": 1.02649486, + "epoch": 0.22558244401022096, + "flos": 17163110448000.0, + "grad_norm": 1.8358285999671042, + "language_loss": 0.83932853, + "learning_rate": 3.611969150491165e-06, + "loss": 0.86042857, + "num_input_tokens_seen": 80731450, + "step": 3752, + "time_per_iteration": 2.611708164215088 + }, + { + "auxiliary_loss_clip": 0.01097179, + "auxiliary_loss_mlp": 0.01034654, + "balance_loss_clip": 1.03326678, + "balance_loss_mlp": 1.01999187, + "epoch": 0.22564256726288892, + "flos": 15231008856960.0, + "grad_norm": 1.6426451219288003, + "language_loss": 0.7874583, + "learning_rate": 3.611738583330375e-06, + "loss": 0.80877662, + "num_input_tokens_seen": 80748415, + "step": 3753, + "time_per_iteration": 2.5134570598602295 + }, + { + "auxiliary_loss_clip": 0.01074884, + "auxiliary_loss_mlp": 0.01037149, + "balance_loss_clip": 1.03037882, + "balance_loss_mlp": 1.02073383, + "epoch": 0.2257026905155569, + "flos": 34568652764160.0, + "grad_norm": 1.9238764031583862, + "language_loss": 0.78601325, + "learning_rate": 3.611507955052295e-06, + "loss": 0.80713356, + "num_input_tokens_seen": 80770835, + "step": 3754, + "time_per_iteration": 2.7372541427612305 + }, + { + "auxiliary_loss_clip": 0.01077917, + "auxiliary_loss_mlp": 0.01038858, + "balance_loss_clip": 1.03496504, + "balance_loss_mlp": 1.02269316, + "epoch": 0.22576281376822485, + "flos": 19938430788480.0, + "grad_norm": 1.797181380051248, + "language_loss": 0.70086658, + "learning_rate": 3.6112772656656727e-06, + "loss": 0.72203434, + "num_input_tokens_seen": 80787840, + "step": 3755, + "time_per_iteration": 0.0216367244720459 + }, + { + "auxiliary_loss_clip": 0.01075057, + "auxiliary_loss_mlp": 0.01047366, + "balance_loss_clip": 1.03466892, + "balance_loss_mlp": 1.03096318, + "epoch": 0.22582293702089282, + "flos": 24602005192320.0, + "grad_norm": 1.9234184703733368, + "language_loss": 0.77140868, + "learning_rate": 3.6110465151792547e-06, + "loss": 0.79263294, + "num_input_tokens_seen": 80806335, + "step": 3756, + "time_per_iteration": 2.775583028793335 + }, + { + "auxiliary_loss_clip": 0.01077341, + "auxiliary_loss_mlp": 0.01044158, + "balance_loss_clip": 1.03624988, + "balance_loss_mlp": 1.027457, + "epoch": 0.2258830602735608, + "flos": 23035438356480.0, + "grad_norm": 1.7178722206708015, + "language_loss": 0.82446349, + "learning_rate": 3.6108157036017916e-06, + "loss": 0.84567851, + "num_input_tokens_seen": 80825355, + "step": 3757, + "time_per_iteration": 2.7068870067596436 + }, + { + "auxiliary_loss_clip": 0.01090629, + "auxiliary_loss_mlp": 0.01046001, + "balance_loss_clip": 1.03508496, + "balance_loss_mlp": 1.02926397, + "epoch": 0.22594318352622877, + "flos": 22158427887360.0, + "grad_norm": 2.0482773055247043, + "language_loss": 0.73188096, + "learning_rate": 3.6105848309420358e-06, + "loss": 0.75324726, + "num_input_tokens_seen": 80842570, + "step": 3758, + "time_per_iteration": 2.6077725887298584 + }, + { + "auxiliary_loss_clip": 0.01082927, + "auxiliary_loss_mlp": 0.01048419, + "balance_loss_clip": 1.03445196, + "balance_loss_mlp": 1.03188467, + "epoch": 0.22600330677889674, + "flos": 20594303176320.0, + "grad_norm": 1.9320642944112414, + "language_loss": 0.77186382, + "learning_rate": 3.6103538972087412e-06, + "loss": 0.79317725, + "num_input_tokens_seen": 80858745, + "step": 3759, + "time_per_iteration": 2.6604063510894775 + }, + { + "auxiliary_loss_clip": 0.01055493, + "auxiliary_loss_mlp": 0.01042073, + "balance_loss_clip": 1.02924132, + "balance_loss_mlp": 1.02518129, + "epoch": 0.2260634300315647, + "flos": 35659798162560.0, + "grad_norm": 1.6681980213911127, + "language_loss": 0.7794717, + "learning_rate": 3.6101229024106655e-06, + "loss": 0.80044734, + "num_input_tokens_seen": 80880085, + "step": 3760, + "time_per_iteration": 2.875321865081787 + }, + { + "auxiliary_loss_clip": 0.00998275, + "auxiliary_loss_mlp": 0.01006101, + "balance_loss_clip": 1.01087201, + "balance_loss_mlp": 1.00374019, + "epoch": 0.22612355328423267, + "flos": 72090455126400.0, + "grad_norm": 0.9501438431268608, + "language_loss": 0.60099608, + "learning_rate": 3.609891846556569e-06, + "loss": 0.62103987, + "num_input_tokens_seen": 80937660, + "step": 3761, + "time_per_iteration": 3.2924840450286865 + }, + { + "auxiliary_loss_clip": 0.01068465, + "auxiliary_loss_mlp": 0.01039802, + "balance_loss_clip": 1.03194714, + "balance_loss_mlp": 1.02337456, + "epoch": 0.22618367653690064, + "flos": 22783776693120.0, + "grad_norm": 2.622906489526956, + "language_loss": 0.77735573, + "learning_rate": 3.609660729655211e-06, + "loss": 0.79843831, + "num_input_tokens_seen": 80956265, + "step": 3762, + "time_per_iteration": 2.683746099472046 + }, + { + "auxiliary_loss_clip": 0.01072635, + "auxiliary_loss_mlp": 0.01039078, + "balance_loss_clip": 1.03186893, + "balance_loss_mlp": 1.021209, + "epoch": 0.22624379978956863, + "flos": 20448254476800.0, + "grad_norm": 2.2174163088335175, + "language_loss": 0.78999001, + "learning_rate": 3.6094295517153573e-06, + "loss": 0.8111071, + "num_input_tokens_seen": 80975185, + "step": 3763, + "time_per_iteration": 2.7782247066497803 + }, + { + "auxiliary_loss_clip": 0.01093858, + "auxiliary_loss_mlp": 0.01050177, + "balance_loss_clip": 1.03774345, + "balance_loss_mlp": 1.03214049, + "epoch": 0.2263039230422366, + "flos": 17494314779520.0, + "grad_norm": 1.605927287010501, + "language_loss": 0.91335708, + "learning_rate": 3.6091983127457743e-06, + "loss": 0.93479741, + "num_input_tokens_seen": 80992830, + "step": 3764, + "time_per_iteration": 2.569957733154297 + }, + { + "auxiliary_loss_clip": 0.01078636, + "auxiliary_loss_mlp": 0.01047566, + "balance_loss_clip": 1.03361428, + "balance_loss_mlp": 1.03074598, + "epoch": 0.22636404629490456, + "flos": 28329748606080.0, + "grad_norm": 1.7867170437840325, + "language_loss": 0.7531541, + "learning_rate": 3.6089670127552293e-06, + "loss": 0.77441609, + "num_input_tokens_seen": 81013675, + "step": 3765, + "time_per_iteration": 2.6404547691345215 + }, + { + "auxiliary_loss_clip": 0.01090701, + "auxiliary_loss_mlp": 0.0104513, + "balance_loss_clip": 1.0354377, + "balance_loss_mlp": 1.02908468, + "epoch": 0.22642416954757252, + "flos": 17489143221120.0, + "grad_norm": 2.044498145127444, + "language_loss": 0.89834446, + "learning_rate": 3.608735651752494e-06, + "loss": 0.91970277, + "num_input_tokens_seen": 81030345, + "step": 3766, + "time_per_iteration": 2.601520538330078 + }, + { + "auxiliary_loss_clip": 0.01076344, + "auxiliary_loss_mlp": 0.01038017, + "balance_loss_clip": 1.03418934, + "balance_loss_mlp": 1.02191234, + "epoch": 0.2264842928002405, + "flos": 24384530298240.0, + "grad_norm": 1.7252102131437326, + "language_loss": 0.74748433, + "learning_rate": 3.6085042297463417e-06, + "loss": 0.768628, + "num_input_tokens_seen": 81051000, + "step": 3767, + "time_per_iteration": 4.304690837860107 + }, + { + "auxiliary_loss_clip": 0.01087612, + "auxiliary_loss_mlp": 0.0104093, + "balance_loss_clip": 1.03190339, + "balance_loss_mlp": 1.02406216, + "epoch": 0.22654441605290845, + "flos": 19830519354240.0, + "grad_norm": 1.5794016063762093, + "language_loss": 0.71847296, + "learning_rate": 3.6082727467455477e-06, + "loss": 0.73975837, + "num_input_tokens_seen": 81071205, + "step": 3768, + "time_per_iteration": 4.197840690612793 + }, + { + "auxiliary_loss_clip": 0.01095802, + "auxiliary_loss_mlp": 0.01050873, + "balance_loss_clip": 1.0395633, + "balance_loss_mlp": 1.03355193, + "epoch": 0.22660453930557642, + "flos": 27454569730560.0, + "grad_norm": 1.609756883247935, + "language_loss": 0.78096604, + "learning_rate": 3.6080412027588905e-06, + "loss": 0.80243278, + "num_input_tokens_seen": 81091880, + "step": 3769, + "time_per_iteration": 2.6022768020629883 + }, + { + "auxiliary_loss_clip": 0.01068374, + "auxiliary_loss_mlp": 0.0104302, + "balance_loss_clip": 1.02932763, + "balance_loss_mlp": 1.02485263, + "epoch": 0.2266646625582444, + "flos": 23988148738560.0, + "grad_norm": 1.8510157808265912, + "language_loss": 0.68606132, + "learning_rate": 3.6078095977951488e-06, + "loss": 0.70717525, + "num_input_tokens_seen": 81113290, + "step": 3770, + "time_per_iteration": 2.770599365234375 + }, + { + "auxiliary_loss_clip": 0.01102354, + "auxiliary_loss_mlp": 0.01042083, + "balance_loss_clip": 1.03508472, + "balance_loss_mlp": 1.02655041, + "epoch": 0.22672478581091238, + "flos": 26028054023040.0, + "grad_norm": 1.7016009363535376, + "language_loss": 0.80419159, + "learning_rate": 3.6075779318631067e-06, + "loss": 0.82563597, + "num_input_tokens_seen": 81133535, + "step": 3771, + "time_per_iteration": 2.6614174842834473 + }, + { + "auxiliary_loss_clip": 0.01063309, + "auxiliary_loss_mlp": 0.01044867, + "balance_loss_clip": 1.03179097, + "balance_loss_mlp": 1.02900028, + "epoch": 0.22678490906358034, + "flos": 23841812730240.0, + "grad_norm": 1.5801045681754813, + "language_loss": 0.78909075, + "learning_rate": 3.6073462049715486e-06, + "loss": 0.81017256, + "num_input_tokens_seen": 81154650, + "step": 3772, + "time_per_iteration": 2.7101683616638184 + }, + { + "auxiliary_loss_clip": 0.01002806, + "auxiliary_loss_mlp": 0.01012218, + "balance_loss_clip": 1.02587581, + "balance_loss_mlp": 1.00927341, + "epoch": 0.2268450323162483, + "flos": 65048088574080.0, + "grad_norm": 0.6576894424494754, + "language_loss": 0.54428518, + "learning_rate": 3.607114417129261e-06, + "loss": 0.56443548, + "num_input_tokens_seen": 81221240, + "step": 3773, + "time_per_iteration": 6.579836845397949 + }, + { + "auxiliary_loss_clip": 0.01066476, + "auxiliary_loss_mlp": 0.01036544, + "balance_loss_clip": 1.03296518, + "balance_loss_mlp": 1.02048635, + "epoch": 0.22690515556891627, + "flos": 22526081544960.0, + "grad_norm": 3.2810883971405858, + "language_loss": 0.70478827, + "learning_rate": 3.6068825683450334e-06, + "loss": 0.72581846, + "num_input_tokens_seen": 81241520, + "step": 3774, + "time_per_iteration": 2.733337640762329 + }, + { + "auxiliary_loss_clip": 0.01074777, + "auxiliary_loss_mlp": 0.01044804, + "balance_loss_clip": 1.03218937, + "balance_loss_mlp": 1.02850819, + "epoch": 0.22696527882158424, + "flos": 18223444955520.0, + "grad_norm": 2.1771264685727623, + "language_loss": 0.7455104, + "learning_rate": 3.606650658627658e-06, + "loss": 0.76670623, + "num_input_tokens_seen": 81256825, + "step": 3775, + "time_per_iteration": 2.7164084911346436 + }, + { + "auxiliary_loss_clip": 0.01100239, + "auxiliary_loss_mlp": 0.01040992, + "balance_loss_clip": 1.03458405, + "balance_loss_mlp": 1.02522016, + "epoch": 0.22702540207425223, + "flos": 17019252478080.0, + "grad_norm": 2.0297522920257633, + "language_loss": 0.81991756, + "learning_rate": 3.606418687985928e-06, + "loss": 0.84132993, + "num_input_tokens_seen": 81275695, + "step": 3776, + "time_per_iteration": 2.732024908065796 + }, + { + "auxiliary_loss_clip": 0.01081314, + "auxiliary_loss_mlp": 0.01041742, + "balance_loss_clip": 1.03385687, + "balance_loss_mlp": 1.02599406, + "epoch": 0.2270855253269202, + "flos": 21325731822720.0, + "grad_norm": 2.104092309277011, + "language_loss": 0.82453632, + "learning_rate": 3.606186656428641e-06, + "loss": 0.8457669, + "num_input_tokens_seen": 81294920, + "step": 3777, + "time_per_iteration": 2.837646484375 + }, + { + "auxiliary_loss_clip": 0.01079768, + "auxiliary_loss_mlp": 0.0104121, + "balance_loss_clip": 1.03747284, + "balance_loss_mlp": 1.02471173, + "epoch": 0.22714564857958816, + "flos": 23550469516800.0, + "grad_norm": 2.092571269687961, + "language_loss": 0.71928638, + "learning_rate": 3.6059545639645955e-06, + "loss": 0.74049616, + "num_input_tokens_seen": 81314275, + "step": 3778, + "time_per_iteration": 2.8876214027404785 + }, + { + "auxiliary_loss_clip": 0.01058065, + "auxiliary_loss_mlp": 0.01036903, + "balance_loss_clip": 1.03208447, + "balance_loss_mlp": 1.02063131, + "epoch": 0.22720577183225613, + "flos": 25989880844160.0, + "grad_norm": 2.368131967564731, + "language_loss": 0.64420676, + "learning_rate": 3.605722410602591e-06, + "loss": 0.66515648, + "num_input_tokens_seen": 81333890, + "step": 3779, + "time_per_iteration": 2.9628865718841553 + }, + { + "auxiliary_loss_clip": 0.01076115, + "auxiliary_loss_mlp": 0.01039137, + "balance_loss_clip": 1.03227842, + "balance_loss_mlp": 1.02329385, + "epoch": 0.2272658950849241, + "flos": 20814076540800.0, + "grad_norm": 1.6309217437150099, + "language_loss": 0.70258963, + "learning_rate": 3.6054901963514323e-06, + "loss": 0.72374213, + "num_input_tokens_seen": 81353640, + "step": 3780, + "time_per_iteration": 2.860170841217041 + }, + { + "auxiliary_loss_clip": 0.01092759, + "auxiliary_loss_mlp": 0.01043347, + "balance_loss_clip": 1.03876138, + "balance_loss_mlp": 1.02616858, + "epoch": 0.22732601833759206, + "flos": 23909324342400.0, + "grad_norm": 1.6679398067791222, + "language_loss": 0.89576769, + "learning_rate": 3.6052579212199246e-06, + "loss": 0.91712868, + "num_input_tokens_seen": 81371595, + "step": 3781, + "time_per_iteration": 2.5888285636901855 + }, + { + "auxiliary_loss_clip": 0.0110381, + "auxiliary_loss_mlp": 0.01042483, + "balance_loss_clip": 1.03744149, + "balance_loss_mlp": 1.02578175, + "epoch": 0.22738614159026002, + "flos": 15924407978880.0, + "grad_norm": 1.9366312433411603, + "language_loss": 0.74563485, + "learning_rate": 3.6050255852168753e-06, + "loss": 0.76709777, + "num_input_tokens_seen": 81388435, + "step": 3782, + "time_per_iteration": 2.504472017288208 + }, + { + "auxiliary_loss_clip": 0.0107694, + "auxiliary_loss_mlp": 0.01044731, + "balance_loss_clip": 1.03212285, + "balance_loss_mlp": 1.03023493, + "epoch": 0.22744626484292801, + "flos": 24205515891840.0, + "grad_norm": 1.6598672834864134, + "language_loss": 0.82791746, + "learning_rate": 3.604793188351095e-06, + "loss": 0.84913415, + "num_input_tokens_seen": 81410195, + "step": 3783, + "time_per_iteration": 2.686450242996216 + }, + { + "auxiliary_loss_clip": 0.01079065, + "auxiliary_loss_mlp": 0.01041028, + "balance_loss_clip": 1.03380716, + "balance_loss_mlp": 1.02419567, + "epoch": 0.22750638809559598, + "flos": 24791614110720.0, + "grad_norm": 1.8368394166018325, + "language_loss": 0.76195848, + "learning_rate": 3.6045607306313964e-06, + "loss": 0.78315943, + "num_input_tokens_seen": 81430060, + "step": 3784, + "time_per_iteration": 2.723313093185425 + }, + { + "auxiliary_loss_clip": 0.01095453, + "auxiliary_loss_mlp": 0.0104354, + "balance_loss_clip": 1.03279209, + "balance_loss_mlp": 1.02737486, + "epoch": 0.22756651134826394, + "flos": 22236498097920.0, + "grad_norm": 1.7419595011111162, + "language_loss": 0.70741725, + "learning_rate": 3.604328212066594e-06, + "loss": 0.72880721, + "num_input_tokens_seen": 81447375, + "step": 3785, + "time_per_iteration": 2.5609779357910156 + }, + { + "auxiliary_loss_clip": 0.00997749, + "auxiliary_loss_mlp": 0.01004283, + "balance_loss_clip": 1.00985658, + "balance_loss_mlp": 1.00150537, + "epoch": 0.2276266346009319, + "flos": 62707466626560.0, + "grad_norm": 0.8236169116748004, + "language_loss": 0.6194886, + "learning_rate": 3.6040956326655047e-06, + "loss": 0.63950896, + "num_input_tokens_seen": 81505235, + "step": 3786, + "time_per_iteration": 3.3305485248565674 + }, + { + "auxiliary_loss_clip": 0.01081153, + "auxiliary_loss_mlp": 0.01039229, + "balance_loss_clip": 1.03428781, + "balance_loss_mlp": 1.02257514, + "epoch": 0.22768675785359987, + "flos": 18613936684800.0, + "grad_norm": 3.342861042151342, + "language_loss": 0.86378968, + "learning_rate": 3.6038629924369486e-06, + "loss": 0.88499355, + "num_input_tokens_seen": 81518685, + "step": 3787, + "time_per_iteration": 2.738596200942993 + }, + { + "auxiliary_loss_clip": 0.0107462, + "auxiliary_loss_mlp": 0.01036961, + "balance_loss_clip": 1.0322423, + "balance_loss_mlp": 1.02188146, + "epoch": 0.22774688110626784, + "flos": 26870195364480.0, + "grad_norm": 1.5134839541194562, + "language_loss": 0.72232425, + "learning_rate": 3.6036302913897474e-06, + "loss": 0.74344003, + "num_input_tokens_seen": 81538940, + "step": 3788, + "time_per_iteration": 2.682823896408081 + }, + { + "auxiliary_loss_clip": 0.01074473, + "auxiliary_loss_mlp": 0.01031214, + "balance_loss_clip": 1.03269911, + "balance_loss_mlp": 1.01599073, + "epoch": 0.2278070043589358, + "flos": 15553593924480.0, + "grad_norm": 4.042752650506552, + "language_loss": 0.67120779, + "learning_rate": 3.6033975295327243e-06, + "loss": 0.69226468, + "num_input_tokens_seen": 81555525, + "step": 3789, + "time_per_iteration": 2.53851318359375 + }, + { + "auxiliary_loss_clip": 0.01058597, + "auxiliary_loss_mlp": 0.01042863, + "balance_loss_clip": 1.02895594, + "balance_loss_mlp": 1.02673388, + "epoch": 0.2278671276116038, + "flos": 22416805393920.0, + "grad_norm": 1.9534229527965719, + "language_loss": 0.76421738, + "learning_rate": 3.6031647068747065e-06, + "loss": 0.78523195, + "num_input_tokens_seen": 81576305, + "step": 3790, + "time_per_iteration": 2.6698758602142334 + }, + { + "auxiliary_loss_clip": 0.01046792, + "auxiliary_loss_mlp": 0.01039637, + "balance_loss_clip": 1.03152275, + "balance_loss_mlp": 1.02297163, + "epoch": 0.22792725086427176, + "flos": 20631363033600.0, + "grad_norm": 1.8591757622324432, + "language_loss": 0.90729499, + "learning_rate": 3.602931823424522e-06, + "loss": 0.92815936, + "num_input_tokens_seen": 81594115, + "step": 3791, + "time_per_iteration": 2.8326406478881836 + }, + { + "auxiliary_loss_clip": 0.01090764, + "auxiliary_loss_mlp": 0.01035389, + "balance_loss_clip": 1.0335772, + "balance_loss_mlp": 1.01998687, + "epoch": 0.22798737411693973, + "flos": 31428946903680.0, + "grad_norm": 1.6165895185031292, + "language_loss": 0.82538831, + "learning_rate": 3.6026988791910026e-06, + "loss": 0.84664989, + "num_input_tokens_seen": 81615355, + "step": 3792, + "time_per_iteration": 2.807514190673828 + }, + { + "auxiliary_loss_clip": 0.01022941, + "auxiliary_loss_mlp": 0.01004118, + "balance_loss_clip": 1.0048821, + "balance_loss_mlp": 1.00156689, + "epoch": 0.2280474973696077, + "flos": 52396685827200.0, + "grad_norm": 1.1348249293687356, + "language_loss": 0.65623927, + "learning_rate": 3.602465874182981e-06, + "loss": 0.67650986, + "num_input_tokens_seen": 81662075, + "step": 3793, + "time_per_iteration": 2.9920496940612793 + }, + { + "auxiliary_loss_clip": 0.01101225, + "auxiliary_loss_mlp": 0.01047181, + "balance_loss_clip": 1.033427, + "balance_loss_mlp": 1.03034854, + "epoch": 0.22810762062227566, + "flos": 26396066816640.0, + "grad_norm": 2.5894549261852973, + "language_loss": 0.77759969, + "learning_rate": 3.602232808409293e-06, + "loss": 0.79908383, + "num_input_tokens_seen": 81681625, + "step": 3794, + "time_per_iteration": 2.6228675842285156 + }, + { + "auxiliary_loss_clip": 0.01050734, + "auxiliary_loss_mlp": 0.01043318, + "balance_loss_clip": 1.02781379, + "balance_loss_mlp": 1.026474, + "epoch": 0.22816774387494362, + "flos": 25630271832960.0, + "grad_norm": 2.645268814560435, + "language_loss": 0.80666971, + "learning_rate": 3.6019996818787755e-06, + "loss": 0.82761025, + "num_input_tokens_seen": 81701170, + "step": 3795, + "time_per_iteration": 2.6948063373565674 + }, + { + "auxiliary_loss_clip": 0.01086094, + "auxiliary_loss_mlp": 0.0104421, + "balance_loss_clip": 1.03315032, + "balance_loss_mlp": 1.02857006, + "epoch": 0.22822786712761162, + "flos": 22451602694400.0, + "grad_norm": 1.9914430404445487, + "language_loss": 0.76854575, + "learning_rate": 3.6017664946002704e-06, + "loss": 0.7898488, + "num_input_tokens_seen": 81721265, + "step": 3796, + "time_per_iteration": 2.661241292953491 + }, + { + "auxiliary_loss_clip": 0.01064065, + "auxiliary_loss_mlp": 0.00748855, + "balance_loss_clip": 1.03344524, + "balance_loss_mlp": 1.00114536, + "epoch": 0.22828799038027958, + "flos": 12202554395520.0, + "grad_norm": 2.2224308450300168, + "language_loss": 0.95182431, + "learning_rate": 3.6015332465826188e-06, + "loss": 0.96995354, + "num_input_tokens_seen": 81736565, + "step": 3797, + "time_per_iteration": 2.6831936836242676 + }, + { + "auxiliary_loss_clip": 0.01087427, + "auxiliary_loss_mlp": 0.00748779, + "balance_loss_clip": 1.03345442, + "balance_loss_mlp": 1.0012573, + "epoch": 0.22834811363294755, + "flos": 22085708803200.0, + "grad_norm": 1.7129232007281001, + "language_loss": 0.81371605, + "learning_rate": 3.601299937834666e-06, + "loss": 0.8320781, + "num_input_tokens_seen": 81756240, + "step": 3798, + "time_per_iteration": 2.6957294940948486 + }, + { + "auxiliary_loss_clip": 0.01062502, + "auxiliary_loss_mlp": 0.010361, + "balance_loss_clip": 1.03144908, + "balance_loss_mlp": 1.01967263, + "epoch": 0.2284082368856155, + "flos": 24860634094080.0, + "grad_norm": 1.9291134920016604, + "language_loss": 0.7896986, + "learning_rate": 3.6010665683652596e-06, + "loss": 0.81068456, + "num_input_tokens_seen": 81775720, + "step": 3799, + "time_per_iteration": 2.736569881439209 + }, + { + "auxiliary_loss_clip": 0.01069254, + "auxiliary_loss_mlp": 0.01045786, + "balance_loss_clip": 1.0314002, + "balance_loss_mlp": 1.02914393, + "epoch": 0.22846836013828348, + "flos": 23292882109440.0, + "grad_norm": 1.705862958848125, + "language_loss": 0.75220525, + "learning_rate": 3.6008331381832484e-06, + "loss": 0.77335566, + "num_input_tokens_seen": 81795830, + "step": 3800, + "time_per_iteration": 2.6642842292785645 + }, + { + "auxiliary_loss_clip": 0.01071493, + "auxiliary_loss_mlp": 0.01036743, + "balance_loss_clip": 1.03229201, + "balance_loss_mlp": 1.02213347, + "epoch": 0.22852848339095144, + "flos": 27416288810880.0, + "grad_norm": 1.804448122191541, + "language_loss": 0.63602912, + "learning_rate": 3.600599647297484e-06, + "loss": 0.65711147, + "num_input_tokens_seen": 81815745, + "step": 3801, + "time_per_iteration": 2.7994391918182373 + }, + { + "auxiliary_loss_clip": 0.01079492, + "auxiliary_loss_mlp": 0.01034287, + "balance_loss_clip": 1.03547978, + "balance_loss_mlp": 1.02032125, + "epoch": 0.2285886066436194, + "flos": 26321157002880.0, + "grad_norm": 1.7126473072900723, + "language_loss": 0.81624258, + "learning_rate": 3.60036609571682e-06, + "loss": 0.83738029, + "num_input_tokens_seen": 81835155, + "step": 3802, + "time_per_iteration": 2.88370418548584 + }, + { + "auxiliary_loss_clip": 0.01073551, + "auxiliary_loss_mlp": 0.0104969, + "balance_loss_clip": 1.03464162, + "balance_loss_mlp": 1.03341222, + "epoch": 0.2286487298962874, + "flos": 29716475022720.0, + "grad_norm": 1.911753597137487, + "language_loss": 0.78540599, + "learning_rate": 3.600132483450114e-06, + "loss": 0.80663836, + "num_input_tokens_seen": 81855655, + "step": 3803, + "time_per_iteration": 2.7283968925476074 + }, + { + "auxiliary_loss_clip": 0.01059803, + "auxiliary_loss_mlp": 0.01035826, + "balance_loss_clip": 1.02877223, + "balance_loss_mlp": 1.02013767, + "epoch": 0.22870885314895537, + "flos": 21287199507840.0, + "grad_norm": 1.7064989949688956, + "language_loss": 0.84876019, + "learning_rate": 3.5998988105062235e-06, + "loss": 0.86971647, + "num_input_tokens_seen": 81876385, + "step": 3804, + "time_per_iteration": 2.7261457443237305 + }, + { + "auxiliary_loss_clip": 0.01089538, + "auxiliary_loss_mlp": 0.01038343, + "balance_loss_clip": 1.0324173, + "balance_loss_mlp": 1.02342999, + "epoch": 0.22876897640162333, + "flos": 14939450161920.0, + "grad_norm": 2.4460518885466964, + "language_loss": 0.76663154, + "learning_rate": 3.59966507689401e-06, + "loss": 0.78791034, + "num_input_tokens_seen": 81893225, + "step": 3805, + "time_per_iteration": 2.5956361293792725 + }, + { + "auxiliary_loss_clip": 0.01079895, + "auxiliary_loss_mlp": 0.00748827, + "balance_loss_clip": 1.03296065, + "balance_loss_mlp": 1.00108826, + "epoch": 0.2288290996542913, + "flos": 18113917409280.0, + "grad_norm": 2.7045999165101167, + "language_loss": 0.78698361, + "learning_rate": 3.5994312826223363e-06, + "loss": 0.80527091, + "num_input_tokens_seen": 81911350, + "step": 3806, + "time_per_iteration": 2.6757054328918457 + }, + { + "auxiliary_loss_clip": 0.01068918, + "auxiliary_loss_mlp": 0.01049557, + "balance_loss_clip": 1.03134823, + "balance_loss_mlp": 1.03297472, + "epoch": 0.22888922290695926, + "flos": 39855457071360.0, + "grad_norm": 2.2752496516744594, + "language_loss": 0.69857204, + "learning_rate": 3.5991974277000684e-06, + "loss": 0.71975684, + "num_input_tokens_seen": 81935420, + "step": 3807, + "time_per_iteration": 2.8143723011016846 + }, + { + "auxiliary_loss_clip": 0.01093464, + "auxiliary_loss_mlp": 0.01047639, + "balance_loss_clip": 1.03551006, + "balance_loss_mlp": 1.03080714, + "epoch": 0.22894934615962723, + "flos": 23403774372480.0, + "grad_norm": 3.6021787562693066, + "language_loss": 0.65921515, + "learning_rate": 3.5989635121360733e-06, + "loss": 0.68062615, + "num_input_tokens_seen": 81953845, + "step": 3808, + "time_per_iteration": 2.6302497386932373 + }, + { + "auxiliary_loss_clip": 0.01048989, + "auxiliary_loss_mlp": 0.01047821, + "balance_loss_clip": 1.03052735, + "balance_loss_mlp": 1.03097653, + "epoch": 0.22900946941229522, + "flos": 18843011671680.0, + "grad_norm": 1.7276500950143336, + "language_loss": 0.74883538, + "learning_rate": 3.598729535939222e-06, + "loss": 0.76980352, + "num_input_tokens_seen": 81972100, + "step": 3809, + "time_per_iteration": 2.754693031311035 + }, + { + "auxiliary_loss_clip": 0.01076597, + "auxiliary_loss_mlp": 0.01042406, + "balance_loss_clip": 1.03390634, + "balance_loss_mlp": 1.02731395, + "epoch": 0.22906959266496318, + "flos": 22929394429440.0, + "grad_norm": 1.682057381120164, + "language_loss": 0.81540579, + "learning_rate": 3.5984954991183862e-06, + "loss": 0.83659583, + "num_input_tokens_seen": 81992760, + "step": 3810, + "time_per_iteration": 2.739485740661621 + }, + { + "auxiliary_loss_clip": 0.01089344, + "auxiliary_loss_mlp": 0.01040499, + "balance_loss_clip": 1.03850031, + "balance_loss_mlp": 1.02570534, + "epoch": 0.22912971591763115, + "flos": 19354523299200.0, + "grad_norm": 2.098632920491321, + "language_loss": 0.7862559, + "learning_rate": 3.598261401682441e-06, + "loss": 0.8075543, + "num_input_tokens_seen": 82009080, + "step": 3811, + "time_per_iteration": 2.6314239501953125 + }, + { + "auxiliary_loss_clip": 0.01075135, + "auxiliary_loss_mlp": 0.00748782, + "balance_loss_clip": 1.03192747, + "balance_loss_mlp": 1.00119328, + "epoch": 0.22918983917029911, + "flos": 19933546538880.0, + "grad_norm": 1.8215000173052327, + "language_loss": 0.827667, + "learning_rate": 3.5980272436402632e-06, + "loss": 0.84590614, + "num_input_tokens_seen": 82026705, + "step": 3812, + "time_per_iteration": 2.7007017135620117 + }, + { + "auxiliary_loss_clip": 0.01038129, + "auxiliary_loss_mlp": 0.01054332, + "balance_loss_clip": 1.03169465, + "balance_loss_mlp": 1.0368917, + "epoch": 0.22924996242296708, + "flos": 16690885320960.0, + "grad_norm": 2.471753608703041, + "language_loss": 0.82322741, + "learning_rate": 3.5977930250007324e-06, + "loss": 0.84415203, + "num_input_tokens_seen": 82043245, + "step": 3813, + "time_per_iteration": 2.729369878768921 + }, + { + "auxiliary_loss_clip": 0.01077082, + "auxiliary_loss_mlp": 0.01040785, + "balance_loss_clip": 1.03025961, + "balance_loss_mlp": 1.02572894, + "epoch": 0.22931008567563504, + "flos": 33036164956800.0, + "grad_norm": 3.9182286273686104, + "language_loss": 0.6982224, + "learning_rate": 3.5975587457727298e-06, + "loss": 0.71940106, + "num_input_tokens_seen": 82066870, + "step": 3814, + "time_per_iteration": 2.730839490890503 + }, + { + "auxiliary_loss_clip": 0.01085144, + "auxiliary_loss_mlp": 0.01041016, + "balance_loss_clip": 1.03053916, + "balance_loss_mlp": 1.0258882, + "epoch": 0.229370208928303, + "flos": 23330696152320.0, + "grad_norm": 2.188131244756755, + "language_loss": 0.67273331, + "learning_rate": 3.597324405965139e-06, + "loss": 0.69399494, + "num_input_tokens_seen": 82083180, + "step": 3815, + "time_per_iteration": 4.334621906280518 + }, + { + "auxiliary_loss_clip": 0.01084849, + "auxiliary_loss_mlp": 0.01039021, + "balance_loss_clip": 1.0321846, + "balance_loss_mlp": 1.02388728, + "epoch": 0.229430332180971, + "flos": 28617213150720.0, + "grad_norm": 1.8528611165420903, + "language_loss": 0.8330065, + "learning_rate": 3.597090005586848e-06, + "loss": 0.85424519, + "num_input_tokens_seen": 82102950, + "step": 3816, + "time_per_iteration": 2.7061269283294678 + }, + { + "auxiliary_loss_clip": 0.01086557, + "auxiliary_loss_mlp": 0.01036688, + "balance_loss_clip": 1.03313053, + "balance_loss_mlp": 1.02066588, + "epoch": 0.22949045543363897, + "flos": 17238199829760.0, + "grad_norm": 2.5925163389816044, + "language_loss": 0.86875927, + "learning_rate": 3.596855544646742e-06, + "loss": 0.88999176, + "num_input_tokens_seen": 82119510, + "step": 3817, + "time_per_iteration": 2.5643136501312256 + }, + { + "auxiliary_loss_clip": 0.01071597, + "auxiliary_loss_mlp": 0.01038793, + "balance_loss_clip": 1.03044736, + "balance_loss_mlp": 1.02285445, + "epoch": 0.22955057868630693, + "flos": 27489438858240.0, + "grad_norm": 2.1663797941745027, + "language_loss": 0.74777865, + "learning_rate": 3.5966210231537154e-06, + "loss": 0.76888257, + "num_input_tokens_seen": 82140095, + "step": 3818, + "time_per_iteration": 2.754240036010742 + }, + { + "auxiliary_loss_clip": 0.01087679, + "auxiliary_loss_mlp": 0.01039069, + "balance_loss_clip": 1.03343034, + "balance_loss_mlp": 1.02337468, + "epoch": 0.2296107019389749, + "flos": 23476421629440.0, + "grad_norm": 1.6841808855012423, + "language_loss": 0.74488318, + "learning_rate": 3.596386441116659e-06, + "loss": 0.76615071, + "num_input_tokens_seen": 82159510, + "step": 3819, + "time_per_iteration": 2.640479564666748 + }, + { + "auxiliary_loss_clip": 0.01088666, + "auxiliary_loss_mlp": 0.01037625, + "balance_loss_clip": 1.03376782, + "balance_loss_mlp": 1.0223186, + "epoch": 0.22967082519164286, + "flos": 31285160760960.0, + "grad_norm": 1.9055629771766789, + "language_loss": 0.80977476, + "learning_rate": 3.5961517985444684e-06, + "loss": 0.83103764, + "num_input_tokens_seen": 82179580, + "step": 3820, + "time_per_iteration": 2.6662158966064453 + }, + { + "auxiliary_loss_clip": 0.01079511, + "auxiliary_loss_mlp": 0.01041642, + "balance_loss_clip": 1.03364456, + "balance_loss_mlp": 1.02488136, + "epoch": 0.22973094844431083, + "flos": 14642935390080.0, + "grad_norm": 1.8991294954293194, + "language_loss": 0.69430447, + "learning_rate": 3.595917095446042e-06, + "loss": 0.71551603, + "num_input_tokens_seen": 82195585, + "step": 3821, + "time_per_iteration": 5.853618860244751 + }, + { + "auxiliary_loss_clip": 0.01053141, + "auxiliary_loss_mlp": 0.01034715, + "balance_loss_clip": 1.0326885, + "balance_loss_mlp": 1.01876521, + "epoch": 0.2297910716969788, + "flos": 22823853292800.0, + "grad_norm": 1.583719357454463, + "language_loss": 0.82840407, + "learning_rate": 3.5956823318302796e-06, + "loss": 0.84928268, + "num_input_tokens_seen": 82217530, + "step": 3822, + "time_per_iteration": 2.721061944961548 + }, + { + "auxiliary_loss_clip": 0.01097003, + "auxiliary_loss_mlp": 0.01039756, + "balance_loss_clip": 1.0321101, + "balance_loss_mlp": 1.02365088, + "epoch": 0.2298511949496468, + "flos": 23039029716480.0, + "grad_norm": 1.4983837890573655, + "language_loss": 0.6624043, + "learning_rate": 3.5954475077060833e-06, + "loss": 0.68377191, + "num_input_tokens_seen": 82237980, + "step": 3823, + "time_per_iteration": 2.625699758529663 + }, + { + "auxiliary_loss_clip": 0.01014652, + "auxiliary_loss_mlp": 0.01009201, + "balance_loss_clip": 1.00743842, + "balance_loss_mlp": 1.00685263, + "epoch": 0.22991131820231475, + "flos": 66890914911360.0, + "grad_norm": 0.8300506330831519, + "language_loss": 0.56744337, + "learning_rate": 3.595212623082357e-06, + "loss": 0.58768189, + "num_input_tokens_seen": 82301785, + "step": 3824, + "time_per_iteration": 3.313156843185425 + }, + { + "auxiliary_loss_clip": 0.01072606, + "auxiliary_loss_mlp": 0.01033725, + "balance_loss_clip": 1.03102207, + "balance_loss_mlp": 1.01937222, + "epoch": 0.22997144145498272, + "flos": 17887248633600.0, + "grad_norm": 2.9466356626943626, + "language_loss": 0.73156005, + "learning_rate": 3.594977677968009e-06, + "loss": 0.75262332, + "num_input_tokens_seen": 82317355, + "step": 3825, + "time_per_iteration": 2.6005301475524902 + }, + { + "auxiliary_loss_clip": 0.01091243, + "auxiliary_loss_mlp": 0.01043168, + "balance_loss_clip": 1.03585374, + "balance_loss_mlp": 1.02671099, + "epoch": 0.23003156470765068, + "flos": 24676843178880.0, + "grad_norm": 1.8231936207099637, + "language_loss": 0.87746167, + "learning_rate": 3.5947426723719473e-06, + "loss": 0.89880574, + "num_input_tokens_seen": 82336645, + "step": 3826, + "time_per_iteration": 2.6212778091430664 + }, + { + "auxiliary_loss_clip": 0.01078273, + "auxiliary_loss_mlp": 0.01044248, + "balance_loss_clip": 1.03128386, + "balance_loss_mlp": 1.02730882, + "epoch": 0.23009168796031865, + "flos": 15814126247040.0, + "grad_norm": 2.3205672371801866, + "language_loss": 0.81808311, + "learning_rate": 3.594507606303083e-06, + "loss": 0.83930826, + "num_input_tokens_seen": 82354225, + "step": 3827, + "time_per_iteration": 2.577910900115967 + }, + { + "auxiliary_loss_clip": 0.01034364, + "auxiliary_loss_mlp": 0.01043058, + "balance_loss_clip": 1.02954912, + "balance_loss_mlp": 1.02731669, + "epoch": 0.2301518112129866, + "flos": 16212842190720.0, + "grad_norm": 2.2840572538749373, + "language_loss": 0.8634789, + "learning_rate": 3.5942724797703314e-06, + "loss": 0.88425308, + "num_input_tokens_seen": 82370240, + "step": 3828, + "time_per_iteration": 2.6835217475891113 + }, + { + "auxiliary_loss_clip": 0.01075198, + "auxiliary_loss_mlp": 0.01043217, + "balance_loss_clip": 1.03217721, + "balance_loss_mlp": 1.02688539, + "epoch": 0.2302119344656546, + "flos": 20595452411520.0, + "grad_norm": 2.0940532616981486, + "language_loss": 0.70329076, + "learning_rate": 3.594037292782607e-06, + "loss": 0.72447491, + "num_input_tokens_seen": 82389145, + "step": 3829, + "time_per_iteration": 2.7591423988342285 + }, + { + "auxiliary_loss_clip": 0.01035687, + "auxiliary_loss_mlp": 0.0104073, + "balance_loss_clip": 1.02927959, + "balance_loss_mlp": 1.02649021, + "epoch": 0.23027205771832257, + "flos": 26796901662720.0, + "grad_norm": 1.7575170029771727, + "language_loss": 0.84161341, + "learning_rate": 3.5938020453488293e-06, + "loss": 0.86237752, + "num_input_tokens_seen": 82409185, + "step": 3830, + "time_per_iteration": 2.820098638534546 + }, + { + "auxiliary_loss_clip": 0.01089498, + "auxiliary_loss_mlp": 0.01045313, + "balance_loss_clip": 1.03726232, + "balance_loss_mlp": 1.02938676, + "epoch": 0.23033218097099054, + "flos": 43873143068160.0, + "grad_norm": 1.7836016460217252, + "language_loss": 0.66881257, + "learning_rate": 3.5935667374779177e-06, + "loss": 0.69016069, + "num_input_tokens_seen": 82432070, + "step": 3831, + "time_per_iteration": 2.893550395965576 + }, + { + "auxiliary_loss_clip": 0.01058549, + "auxiliary_loss_mlp": 0.01047018, + "balance_loss_clip": 1.03264499, + "balance_loss_mlp": 1.03081703, + "epoch": 0.2303923042236585, + "flos": 26067663745920.0, + "grad_norm": 7.650144575184964, + "language_loss": 0.7577759, + "learning_rate": 3.5933313691787957e-06, + "loss": 0.77883154, + "num_input_tokens_seen": 82450625, + "step": 3832, + "time_per_iteration": 2.7117505073547363 + }, + { + "auxiliary_loss_clip": 0.01055502, + "auxiliary_loss_mlp": 0.01040339, + "balance_loss_clip": 1.03203487, + "balance_loss_mlp": 1.02354288, + "epoch": 0.23045242747632647, + "flos": 18296379521280.0, + "grad_norm": 1.9927637127055238, + "language_loss": 0.87740421, + "learning_rate": 3.593095940460389e-06, + "loss": 0.89836258, + "num_input_tokens_seen": 82468575, + "step": 3833, + "time_per_iteration": 2.7518489360809326 + }, + { + "auxiliary_loss_clip": 0.01062446, + "auxiliary_loss_mlp": 0.01043174, + "balance_loss_clip": 1.03061306, + "balance_loss_mlp": 1.0269202, + "epoch": 0.23051255072899443, + "flos": 25520528805120.0, + "grad_norm": 1.7665126100199804, + "language_loss": 0.74859488, + "learning_rate": 3.592860451331624e-06, + "loss": 0.76965106, + "num_input_tokens_seen": 82488655, + "step": 3834, + "time_per_iteration": 2.7434682846069336 + }, + { + "auxiliary_loss_clip": 0.01044731, + "auxiliary_loss_mlp": 0.01056146, + "balance_loss_clip": 1.02664268, + "balance_loss_mlp": 1.03853869, + "epoch": 0.2305726739816624, + "flos": 21215198695680.0, + "grad_norm": 1.9953214463189315, + "language_loss": 0.85427547, + "learning_rate": 3.592624901801432e-06, + "loss": 0.87528425, + "num_input_tokens_seen": 82507220, + "step": 3835, + "time_per_iteration": 2.7146174907684326 + }, + { + "auxiliary_loss_clip": 0.01058996, + "auxiliary_loss_mlp": 0.01051957, + "balance_loss_clip": 1.03027749, + "balance_loss_mlp": 1.03327692, + "epoch": 0.2306327972343304, + "flos": 23331127115520.0, + "grad_norm": 3.7240547782548337, + "language_loss": 0.82390952, + "learning_rate": 3.5923892918787432e-06, + "loss": 0.8450191, + "num_input_tokens_seen": 82527920, + "step": 3836, + "time_per_iteration": 2.734572410583496 + }, + { + "auxiliary_loss_clip": 0.01089603, + "auxiliary_loss_mlp": 0.01043181, + "balance_loss_clip": 1.03530383, + "balance_loss_mlp": 1.02766585, + "epoch": 0.23069292048699835, + "flos": 20666734951680.0, + "grad_norm": 1.6445108800591832, + "language_loss": 0.79413402, + "learning_rate": 3.5921536215724934e-06, + "loss": 0.81546187, + "num_input_tokens_seen": 82549040, + "step": 3837, + "time_per_iteration": 2.6221024990081787 + }, + { + "auxiliary_loss_clip": 0.01009047, + "auxiliary_loss_mlp": 0.01021379, + "balance_loss_clip": 1.01941895, + "balance_loss_mlp": 1.01893508, + "epoch": 0.23075304373966632, + "flos": 70454832393600.0, + "grad_norm": 0.9637241390852103, + "language_loss": 0.65526134, + "learning_rate": 3.5919178908916184e-06, + "loss": 0.67556554, + "num_input_tokens_seen": 82604070, + "step": 3838, + "time_per_iteration": 3.2468321323394775 + }, + { + "auxiliary_loss_clip": 0.01086902, + "auxiliary_loss_mlp": 0.01043997, + "balance_loss_clip": 1.03417623, + "balance_loss_mlp": 1.02853572, + "epoch": 0.23081316699233428, + "flos": 16617986668800.0, + "grad_norm": 2.435051517814905, + "language_loss": 0.75435954, + "learning_rate": 3.591682099845058e-06, + "loss": 0.77566862, + "num_input_tokens_seen": 82619665, + "step": 3839, + "time_per_iteration": 2.7305495738983154 + }, + { + "auxiliary_loss_clip": 0.01068286, + "auxiliary_loss_mlp": 0.01040954, + "balance_loss_clip": 1.03338373, + "balance_loss_mlp": 1.02481937, + "epoch": 0.23087329024500225, + "flos": 13298081253120.0, + "grad_norm": 1.8656306482508278, + "language_loss": 0.68684745, + "learning_rate": 3.591446248441752e-06, + "loss": 0.70793986, + "num_input_tokens_seen": 82637530, + "step": 3840, + "time_per_iteration": 2.665651321411133 + }, + { + "auxiliary_loss_clip": 0.01102826, + "auxiliary_loss_mlp": 0.01042943, + "balance_loss_clip": 1.03675497, + "balance_loss_mlp": 1.02483499, + "epoch": 0.23093341349767021, + "flos": 17785729820160.0, + "grad_norm": 1.9802210414623103, + "language_loss": 0.79742026, + "learning_rate": 3.591210336690645e-06, + "loss": 0.81887794, + "num_input_tokens_seen": 82656130, + "step": 3841, + "time_per_iteration": 2.580143690109253 + }, + { + "auxiliary_loss_clip": 0.01091278, + "auxiliary_loss_mlp": 0.01035574, + "balance_loss_clip": 1.03716159, + "balance_loss_mlp": 1.0209347, + "epoch": 0.23099353675033818, + "flos": 23988076911360.0, + "grad_norm": 2.241960614073197, + "language_loss": 0.82801211, + "learning_rate": 3.590974364600683e-06, + "loss": 0.84928071, + "num_input_tokens_seen": 82675295, + "step": 3842, + "time_per_iteration": 2.6760551929473877 + }, + { + "auxiliary_loss_clip": 0.01089421, + "auxiliary_loss_mlp": 0.0104425, + "balance_loss_clip": 1.03396654, + "balance_loss_mlp": 1.02731001, + "epoch": 0.23105366000300617, + "flos": 35995168471680.0, + "grad_norm": 1.6231785334597921, + "language_loss": 0.66318047, + "learning_rate": 3.5907383321808135e-06, + "loss": 0.68451715, + "num_input_tokens_seen": 82703260, + "step": 3843, + "time_per_iteration": 2.987231731414795 + }, + { + "auxiliary_loss_clip": 0.01084241, + "auxiliary_loss_mlp": 0.01045055, + "balance_loss_clip": 1.03286028, + "balance_loss_mlp": 1.02908134, + "epoch": 0.23111378325567414, + "flos": 31245335556480.0, + "grad_norm": 1.868922325391781, + "language_loss": 0.77504861, + "learning_rate": 3.590502239439987e-06, + "loss": 0.79634154, + "num_input_tokens_seen": 82725060, + "step": 3844, + "time_per_iteration": 2.684224843978882 + }, + { + "auxiliary_loss_clip": 0.01083037, + "auxiliary_loss_mlp": 0.01042762, + "balance_loss_clip": 1.03243673, + "balance_loss_mlp": 1.02520227, + "epoch": 0.2311739065083421, + "flos": 19208223204480.0, + "grad_norm": 1.5525885078844488, + "language_loss": 0.78203231, + "learning_rate": 3.590266086387156e-06, + "loss": 0.80329037, + "num_input_tokens_seen": 82742960, + "step": 3845, + "time_per_iteration": 2.5465736389160156 + }, + { + "auxiliary_loss_clip": 0.01058245, + "auxiliary_loss_mlp": 0.0103498, + "balance_loss_clip": 1.02875566, + "balance_loss_mlp": 1.02050757, + "epoch": 0.23123402976101007, + "flos": 23360178240000.0, + "grad_norm": 2.049756586607065, + "language_loss": 0.75671434, + "learning_rate": 3.590029873031276e-06, + "loss": 0.77764654, + "num_input_tokens_seen": 82760205, + "step": 3846, + "time_per_iteration": 2.709916591644287 + }, + { + "auxiliary_loss_clip": 0.01076318, + "auxiliary_loss_mlp": 0.01043651, + "balance_loss_clip": 1.03211343, + "balance_loss_mlp": 1.02787995, + "epoch": 0.23129415301367803, + "flos": 13735365425280.0, + "grad_norm": 2.6444282247983253, + "language_loss": 0.69710553, + "learning_rate": 3.589793599381304e-06, + "loss": 0.71830523, + "num_input_tokens_seen": 82778590, + "step": 3847, + "time_per_iteration": 2.582157850265503 + }, + { + "auxiliary_loss_clip": 0.01032704, + "auxiliary_loss_mlp": 0.01026874, + "balance_loss_clip": 1.02592921, + "balance_loss_mlp": 1.02303529, + "epoch": 0.231354276266346, + "flos": 69737015001600.0, + "grad_norm": 0.7964409621731687, + "language_loss": 0.61018252, + "learning_rate": 3.589557265446198e-06, + "loss": 0.63077831, + "num_input_tokens_seen": 82833925, + "step": 3848, + "time_per_iteration": 3.0888314247131348 + }, + { + "auxiliary_loss_clip": 0.01087161, + "auxiliary_loss_mlp": 0.0104331, + "balance_loss_clip": 1.03405023, + "balance_loss_mlp": 1.02719259, + "epoch": 0.231414399519014, + "flos": 18835900778880.0, + "grad_norm": 2.151143602408737, + "language_loss": 0.78034198, + "learning_rate": 3.589320871234923e-06, + "loss": 0.80164665, + "num_input_tokens_seen": 82850625, + "step": 3849, + "time_per_iteration": 2.5524442195892334 + }, + { + "auxiliary_loss_clip": 0.01089023, + "auxiliary_loss_mlp": 0.01039903, + "balance_loss_clip": 1.03436685, + "balance_loss_mlp": 1.02420306, + "epoch": 0.23147452277168196, + "flos": 36135470995200.0, + "grad_norm": 1.7639540489836372, + "language_loss": 0.7129944, + "learning_rate": 3.5890844167564405e-06, + "loss": 0.73428369, + "num_input_tokens_seen": 82872105, + "step": 3850, + "time_per_iteration": 2.730196714401245 + }, + { + "auxiliary_loss_clip": 0.01065746, + "auxiliary_loss_mlp": 0.00748894, + "balance_loss_clip": 1.02948046, + "balance_loss_mlp": 1.00129223, + "epoch": 0.23153464602434992, + "flos": 20812927305600.0, + "grad_norm": 1.8853670252423238, + "language_loss": 0.76267397, + "learning_rate": 3.588847902019718e-06, + "loss": 0.78082043, + "num_input_tokens_seen": 82890595, + "step": 3851, + "time_per_iteration": 2.5983872413635254 + }, + { + "auxiliary_loss_clip": 0.01095552, + "auxiliary_loss_mlp": 0.01040641, + "balance_loss_clip": 1.03341341, + "balance_loss_mlp": 1.02405906, + "epoch": 0.2315947692770179, + "flos": 19939256801280.0, + "grad_norm": 1.578846948975976, + "language_loss": 0.69349003, + "learning_rate": 3.588611327033723e-06, + "loss": 0.71485198, + "num_input_tokens_seen": 82908910, + "step": 3852, + "time_per_iteration": 2.535006046295166 + }, + { + "auxiliary_loss_clip": 0.01054181, + "auxiliary_loss_mlp": 0.01044242, + "balance_loss_clip": 1.02947378, + "balance_loss_mlp": 1.02733803, + "epoch": 0.23165489252968585, + "flos": 12855553695360.0, + "grad_norm": 2.6216419928425276, + "language_loss": 0.67627531, + "learning_rate": 3.588374691807428e-06, + "loss": 0.69725955, + "num_input_tokens_seen": 82925405, + "step": 3853, + "time_per_iteration": 2.622579574584961 + }, + { + "auxiliary_loss_clip": 0.01088299, + "auxiliary_loss_mlp": 0.01044007, + "balance_loss_clip": 1.03287733, + "balance_loss_mlp": 1.02815247, + "epoch": 0.23171501578235382, + "flos": 30628282792320.0, + "grad_norm": 1.7472178956597075, + "language_loss": 0.79694968, + "learning_rate": 3.5881379963498053e-06, + "loss": 0.81827271, + "num_input_tokens_seen": 82945615, + "step": 3854, + "time_per_iteration": 2.7187647819519043 + }, + { + "auxiliary_loss_clip": 0.01057569, + "auxiliary_loss_mlp": 0.01054889, + "balance_loss_clip": 1.0279386, + "balance_loss_mlp": 1.03659034, + "epoch": 0.23177513903502178, + "flos": 23842782397440.0, + "grad_norm": 2.6411874637738086, + "language_loss": 0.65358639, + "learning_rate": 3.587901240669831e-06, + "loss": 0.67471099, + "num_input_tokens_seen": 82967570, + "step": 3855, + "time_per_iteration": 2.683525562286377 + }, + { + "auxiliary_loss_clip": 0.01098213, + "auxiliary_loss_mlp": 0.01053501, + "balance_loss_clip": 1.03230155, + "balance_loss_mlp": 1.03780067, + "epoch": 0.23183526228768978, + "flos": 29570282668800.0, + "grad_norm": 2.4170729787785854, + "language_loss": 0.70955795, + "learning_rate": 3.5876644247764815e-06, + "loss": 0.73107505, + "num_input_tokens_seen": 82987435, + "step": 3856, + "time_per_iteration": 2.692706823348999 + }, + { + "auxiliary_loss_clip": 0.0105979, + "auxiliary_loss_mlp": 0.01042001, + "balance_loss_clip": 1.03678012, + "balance_loss_mlp": 1.02745712, + "epoch": 0.23189538554035774, + "flos": 34458694254720.0, + "grad_norm": 2.1759423474052495, + "language_loss": 0.77414787, + "learning_rate": 3.5874275486787387e-06, + "loss": 0.79516584, + "num_input_tokens_seen": 83010505, + "step": 3857, + "time_per_iteration": 2.9447031021118164 + }, + { + "auxiliary_loss_clip": 0.01083053, + "auxiliary_loss_mlp": 0.00748912, + "balance_loss_clip": 1.03435802, + "balance_loss_mlp": 1.00113654, + "epoch": 0.2319555087930257, + "flos": 18003815245440.0, + "grad_norm": 2.6501105865655945, + "language_loss": 0.91032863, + "learning_rate": 3.587190612385584e-06, + "loss": 0.92864823, + "num_input_tokens_seen": 83026705, + "step": 3858, + "time_per_iteration": 2.698115825653076 + }, + { + "auxiliary_loss_clip": 0.0105359, + "auxiliary_loss_mlp": 0.01046278, + "balance_loss_clip": 1.0350585, + "balance_loss_mlp": 1.03149021, + "epoch": 0.23201563204569367, + "flos": 23143852581120.0, + "grad_norm": 1.9863583305261199, + "language_loss": 0.7612288, + "learning_rate": 3.5869536159060026e-06, + "loss": 0.78222752, + "num_input_tokens_seen": 83046500, + "step": 3859, + "time_per_iteration": 2.7677829265594482 + }, + { + "auxiliary_loss_clip": 0.01083652, + "auxiliary_loss_mlp": 0.01037352, + "balance_loss_clip": 1.03072739, + "balance_loss_mlp": 1.02193248, + "epoch": 0.23207575529836164, + "flos": 20667991927680.0, + "grad_norm": 2.007190574512248, + "language_loss": 0.84135294, + "learning_rate": 3.58671655924898e-06, + "loss": 0.86256295, + "num_input_tokens_seen": 83065280, + "step": 3860, + "time_per_iteration": 2.8248701095581055 + }, + { + "auxiliary_loss_clip": 0.01041646, + "auxiliary_loss_mlp": 0.01049215, + "balance_loss_clip": 1.02885103, + "balance_loss_mlp": 1.03372955, + "epoch": 0.2321358785510296, + "flos": 16472189364480.0, + "grad_norm": 1.8844935382254793, + "language_loss": 0.82980752, + "learning_rate": 3.586479442423508e-06, + "loss": 0.85071611, + "num_input_tokens_seen": 83082310, + "step": 3861, + "time_per_iteration": 2.87374210357666 + }, + { + "auxiliary_loss_clip": 0.01075313, + "auxiliary_loss_mlp": 0.00748952, + "balance_loss_clip": 1.03140974, + "balance_loss_mlp": 1.00128388, + "epoch": 0.2321960018036976, + "flos": 21616320850560.0, + "grad_norm": 1.9185198962510712, + "language_loss": 0.86095214, + "learning_rate": 3.586242265438576e-06, + "loss": 0.87919474, + "num_input_tokens_seen": 83102065, + "step": 3862, + "time_per_iteration": 4.384010076522827 + }, + { + "auxiliary_loss_clip": 0.01062764, + "auxiliary_loss_mlp": 0.01048623, + "balance_loss_clip": 1.03145516, + "balance_loss_mlp": 1.03468704, + "epoch": 0.23225612505636556, + "flos": 22271474966400.0, + "grad_norm": 1.4280150454544656, + "language_loss": 0.74754333, + "learning_rate": 3.5860050283031773e-06, + "loss": 0.76865721, + "num_input_tokens_seen": 83121445, + "step": 3863, + "time_per_iteration": 2.798661231994629 + }, + { + "auxiliary_loss_clip": 0.01065385, + "auxiliary_loss_mlp": 0.01047245, + "balance_loss_clip": 1.03509903, + "balance_loss_mlp": 1.03294003, + "epoch": 0.23231624830903352, + "flos": 17052325925760.0, + "grad_norm": 1.8383037837786382, + "language_loss": 0.74529111, + "learning_rate": 3.58576773102631e-06, + "loss": 0.76641738, + "num_input_tokens_seen": 83138175, + "step": 3864, + "time_per_iteration": 2.7930896282196045 + }, + { + "auxiliary_loss_clip": 0.01094653, + "auxiliary_loss_mlp": 0.01037409, + "balance_loss_clip": 1.03181255, + "balance_loss_mlp": 1.02215028, + "epoch": 0.2323763715617015, + "flos": 34640043045120.0, + "grad_norm": 2.525008423344866, + "language_loss": 0.70224178, + "learning_rate": 3.5855303736169714e-06, + "loss": 0.72356242, + "num_input_tokens_seen": 83161975, + "step": 3865, + "time_per_iteration": 2.7988927364349365 + }, + { + "auxiliary_loss_clip": 0.01105725, + "auxiliary_loss_mlp": 0.01049435, + "balance_loss_clip": 1.03566992, + "balance_loss_mlp": 1.03180408, + "epoch": 0.23243649481436945, + "flos": 25551698832000.0, + "grad_norm": 1.9195558897501726, + "language_loss": 0.94894153, + "learning_rate": 3.5852929560841617e-06, + "loss": 0.97049308, + "num_input_tokens_seen": 83180905, + "step": 3866, + "time_per_iteration": 2.596435546875 + }, + { + "auxiliary_loss_clip": 0.0107915, + "auxiliary_loss_mlp": 0.01040896, + "balance_loss_clip": 1.03187895, + "balance_loss_mlp": 1.02572036, + "epoch": 0.23249661806703742, + "flos": 20483482740480.0, + "grad_norm": 2.3058971558131285, + "language_loss": 0.73415935, + "learning_rate": 3.5850554784368846e-06, + "loss": 0.75535983, + "num_input_tokens_seen": 83196390, + "step": 3867, + "time_per_iteration": 2.605855703353882 + }, + { + "auxiliary_loss_clip": 0.01081121, + "auxiliary_loss_mlp": 0.01039003, + "balance_loss_clip": 1.03470588, + "balance_loss_mlp": 1.02319598, + "epoch": 0.23255674131970538, + "flos": 20376612800640.0, + "grad_norm": 1.9717211764603022, + "language_loss": 0.82391745, + "learning_rate": 3.584817940684145e-06, + "loss": 0.8451187, + "num_input_tokens_seen": 83216165, + "step": 3868, + "time_per_iteration": 6.026679039001465 + }, + { + "auxiliary_loss_clip": 0.01072162, + "auxiliary_loss_mlp": 0.01037317, + "balance_loss_clip": 1.03061569, + "balance_loss_mlp": 1.02280962, + "epoch": 0.23261686457237338, + "flos": 17056096853760.0, + "grad_norm": 1.787925562497311, + "language_loss": 0.72973108, + "learning_rate": 3.58458034283495e-06, + "loss": 0.75082588, + "num_input_tokens_seen": 83233845, + "step": 3869, + "time_per_iteration": 2.525522232055664 + }, + { + "auxiliary_loss_clip": 0.01084187, + "auxiliary_loss_mlp": 0.01045978, + "balance_loss_clip": 1.03361177, + "balance_loss_mlp": 1.03090429, + "epoch": 0.23267698782504134, + "flos": 29169878785920.0, + "grad_norm": 2.735948218209285, + "language_loss": 0.79629183, + "learning_rate": 3.5843426848983097e-06, + "loss": 0.81759346, + "num_input_tokens_seen": 83254930, + "step": 3870, + "time_per_iteration": 2.66774582862854 + }, + { + "auxiliary_loss_clip": 0.01100186, + "auxiliary_loss_mlp": 0.01038647, + "balance_loss_clip": 1.03347218, + "balance_loss_mlp": 1.02231574, + "epoch": 0.2327371110777093, + "flos": 21174655219200.0, + "grad_norm": 7.030983738011464, + "language_loss": 0.70295823, + "learning_rate": 3.5841049668832357e-06, + "loss": 0.72434652, + "num_input_tokens_seen": 83272095, + "step": 3871, + "time_per_iteration": 2.6757800579071045 + }, + { + "auxiliary_loss_clip": 0.01087776, + "auxiliary_loss_mlp": 0.01051122, + "balance_loss_clip": 1.03372002, + "balance_loss_mlp": 1.03362203, + "epoch": 0.23279723433037727, + "flos": 24863112132480.0, + "grad_norm": 2.1691391320018374, + "language_loss": 0.68887115, + "learning_rate": 3.5838671887987433e-06, + "loss": 0.71026015, + "num_input_tokens_seen": 83290980, + "step": 3872, + "time_per_iteration": 2.794193983078003 + }, + { + "auxiliary_loss_clip": 0.01094207, + "auxiliary_loss_mlp": 0.01040464, + "balance_loss_clip": 1.03514457, + "balance_loss_mlp": 1.02362001, + "epoch": 0.23285735758304524, + "flos": 38800617344640.0, + "grad_norm": 1.6680670638319468, + "language_loss": 0.77772892, + "learning_rate": 3.5836293506538474e-06, + "loss": 0.7990756, + "num_input_tokens_seen": 83315175, + "step": 3873, + "time_per_iteration": 2.7987818717956543 + }, + { + "auxiliary_loss_clip": 0.01005178, + "auxiliary_loss_mlp": 0.0102471, + "balance_loss_clip": 1.0082581, + "balance_loss_mlp": 1.02220631, + "epoch": 0.2329174808357132, + "flos": 53944113692160.0, + "grad_norm": 0.854613014776789, + "language_loss": 0.6057654, + "learning_rate": 3.5833914524575687e-06, + "loss": 0.62606424, + "num_input_tokens_seen": 83372060, + "step": 3874, + "time_per_iteration": 3.2120089530944824 + }, + { + "auxiliary_loss_clip": 0.01077331, + "auxiliary_loss_mlp": 0.0103853, + "balance_loss_clip": 1.03339243, + "balance_loss_mlp": 1.02241337, + "epoch": 0.23297760408838117, + "flos": 21216024708480.0, + "grad_norm": 2.513365053540447, + "language_loss": 0.80876154, + "learning_rate": 3.583153494218927e-06, + "loss": 0.82992017, + "num_input_tokens_seen": 83389795, + "step": 3875, + "time_per_iteration": 2.749260902404785 + }, + { + "auxiliary_loss_clip": 0.01097089, + "auxiliary_loss_mlp": 0.0074878, + "balance_loss_clip": 1.03508306, + "balance_loss_mlp": 1.00127554, + "epoch": 0.23303772734104916, + "flos": 28403006394240.0, + "grad_norm": 1.5665197378498534, + "language_loss": 0.6054253, + "learning_rate": 3.5829154759469464e-06, + "loss": 0.62388396, + "num_input_tokens_seen": 83410005, + "step": 3876, + "time_per_iteration": 2.672584056854248 + }, + { + "auxiliary_loss_clip": 0.01068608, + "auxiliary_loss_mlp": 0.01050123, + "balance_loss_clip": 1.03303933, + "balance_loss_mlp": 1.03344536, + "epoch": 0.23309785059371713, + "flos": 24314720215680.0, + "grad_norm": 1.7494869627762106, + "language_loss": 0.70240539, + "learning_rate": 3.5826773976506523e-06, + "loss": 0.7235927, + "num_input_tokens_seen": 83430250, + "step": 3877, + "time_per_iteration": 2.663957357406616 + }, + { + "auxiliary_loss_clip": 0.0108991, + "auxiliary_loss_mlp": 0.01052862, + "balance_loss_clip": 1.03523254, + "balance_loss_mlp": 1.03589892, + "epoch": 0.2331579738463851, + "flos": 15992925171840.0, + "grad_norm": 2.4205837402530954, + "language_loss": 0.80648643, + "learning_rate": 3.582439259339073e-06, + "loss": 0.82791424, + "num_input_tokens_seen": 83447950, + "step": 3878, + "time_per_iteration": 2.6230740547180176 + }, + { + "auxiliary_loss_clip": 0.01037301, + "auxiliary_loss_mlp": 0.01039128, + "balance_loss_clip": 1.02791953, + "balance_loss_mlp": 1.0221765, + "epoch": 0.23321809709905306, + "flos": 36426957863040.0, + "grad_norm": 2.224568031883798, + "language_loss": 0.74920738, + "learning_rate": 3.5822010610212374e-06, + "loss": 0.76997161, + "num_input_tokens_seen": 83467785, + "step": 3879, + "time_per_iteration": 3.0417661666870117 + }, + { + "auxiliary_loss_clip": 0.01045873, + "auxiliary_loss_mlp": 0.01044268, + "balance_loss_clip": 1.0301379, + "balance_loss_mlp": 1.02779341, + "epoch": 0.23327822035172102, + "flos": 21324762155520.0, + "grad_norm": 2.2056112743211327, + "language_loss": 0.89325422, + "learning_rate": 3.5819628027061795e-06, + "loss": 0.91415566, + "num_input_tokens_seen": 83485390, + "step": 3880, + "time_per_iteration": 2.7665786743164062 + }, + { + "auxiliary_loss_clip": 0.01081303, + "auxiliary_loss_mlp": 0.01042368, + "balance_loss_clip": 1.03611231, + "balance_loss_mlp": 1.02671599, + "epoch": 0.233338343604389, + "flos": 19171881619200.0, + "grad_norm": 1.650084585051573, + "language_loss": 0.71593237, + "learning_rate": 3.5817244844029334e-06, + "loss": 0.73716903, + "num_input_tokens_seen": 83504890, + "step": 3881, + "time_per_iteration": 2.723123550415039 + }, + { + "auxiliary_loss_clip": 0.01096629, + "auxiliary_loss_mlp": 0.01039721, + "balance_loss_clip": 1.03366137, + "balance_loss_mlp": 1.02421212, + "epoch": 0.23339846685705698, + "flos": 26908368543360.0, + "grad_norm": 1.7557725784060518, + "language_loss": 0.68054509, + "learning_rate": 3.581486106120537e-06, + "loss": 0.70190859, + "num_input_tokens_seen": 83526475, + "step": 3882, + "time_per_iteration": 2.657529830932617 + }, + { + "auxiliary_loss_clip": 0.01059447, + "auxiliary_loss_mlp": 0.01043929, + "balance_loss_clip": 1.03042459, + "balance_loss_mlp": 1.02737117, + "epoch": 0.23345859010972494, + "flos": 32343160884480.0, + "grad_norm": 1.9131569687465237, + "language_loss": 0.76533979, + "learning_rate": 3.5812476678680287e-06, + "loss": 0.78637356, + "num_input_tokens_seen": 83546620, + "step": 3883, + "time_per_iteration": 2.7601823806762695 + }, + { + "auxiliary_loss_clip": 0.01003202, + "auxiliary_loss_mlp": 0.01004873, + "balance_loss_clip": 1.00556421, + "balance_loss_mlp": 1.00174928, + "epoch": 0.2335187133623929, + "flos": 58484229050880.0, + "grad_norm": 0.7759001727435099, + "language_loss": 0.59151936, + "learning_rate": 3.58100916965445e-06, + "loss": 0.6116001, + "num_input_tokens_seen": 83616160, + "step": 3884, + "time_per_iteration": 3.4479715824127197 + }, + { + "auxiliary_loss_clip": 0.01066542, + "auxiliary_loss_mlp": 0.0103439, + "balance_loss_clip": 1.03322649, + "balance_loss_mlp": 1.01915503, + "epoch": 0.23357883661506088, + "flos": 24502317972480.0, + "grad_norm": 1.6114632602571484, + "language_loss": 0.8061043, + "learning_rate": 3.5807706114888455e-06, + "loss": 0.82711369, + "num_input_tokens_seen": 83636795, + "step": 3885, + "time_per_iteration": 2.7355740070343018 + }, + { + "auxiliary_loss_clip": 0.01086396, + "auxiliary_loss_mlp": 0.01033198, + "balance_loss_clip": 1.03407216, + "balance_loss_mlp": 1.01777232, + "epoch": 0.23363895986772884, + "flos": 18948516894720.0, + "grad_norm": 2.3337667131502777, + "language_loss": 0.87914944, + "learning_rate": 3.580531993380261e-06, + "loss": 0.90034539, + "num_input_tokens_seen": 83654050, + "step": 3886, + "time_per_iteration": 2.5938258171081543 + }, + { + "auxiliary_loss_clip": 0.01100875, + "auxiliary_loss_mlp": 0.01036948, + "balance_loss_clip": 1.03568125, + "balance_loss_mlp": 1.02151084, + "epoch": 0.2336990831203968, + "flos": 31686821619840.0, + "grad_norm": 1.8841753573367508, + "language_loss": 0.7325145, + "learning_rate": 3.5802933153377445e-06, + "loss": 0.75389278, + "num_input_tokens_seen": 83673720, + "step": 3887, + "time_per_iteration": 2.6387665271759033 + }, + { + "auxiliary_loss_clip": 0.01087289, + "auxiliary_loss_mlp": 0.01037574, + "balance_loss_clip": 1.03217709, + "balance_loss_mlp": 1.02171922, + "epoch": 0.23375920637306477, + "flos": 27709750926720.0, + "grad_norm": 2.7348139727780407, + "language_loss": 0.84373307, + "learning_rate": 3.5800545773703475e-06, + "loss": 0.86498165, + "num_input_tokens_seen": 83693470, + "step": 3888, + "time_per_iteration": 2.642845869064331 + }, + { + "auxiliary_loss_clip": 0.01070644, + "auxiliary_loss_mlp": 0.01048194, + "balance_loss_clip": 1.03285551, + "balance_loss_mlp": 1.03119516, + "epoch": 0.23381932962573276, + "flos": 17675627656320.0, + "grad_norm": 2.251203074055154, + "language_loss": 0.87327152, + "learning_rate": 3.5798157794871225e-06, + "loss": 0.89445996, + "num_input_tokens_seen": 83711620, + "step": 3889, + "time_per_iteration": 2.5860698223114014 + }, + { + "auxiliary_loss_clip": 0.01088788, + "auxiliary_loss_mlp": 0.01035228, + "balance_loss_clip": 1.03330231, + "balance_loss_mlp": 1.0199095, + "epoch": 0.23387945287840073, + "flos": 14390842763520.0, + "grad_norm": 2.8045663209488914, + "language_loss": 0.76733541, + "learning_rate": 3.579576921697125e-06, + "loss": 0.78857559, + "num_input_tokens_seen": 83727890, + "step": 3890, + "time_per_iteration": 2.5333433151245117 + }, + { + "auxiliary_loss_clip": 0.01054222, + "auxiliary_loss_mlp": 0.0074874, + "balance_loss_clip": 1.03169453, + "balance_loss_mlp": 1.00118387, + "epoch": 0.2339395761310687, + "flos": 46097988503040.0, + "grad_norm": 1.7327665081992372, + "language_loss": 0.73671257, + "learning_rate": 3.579338004009412e-06, + "loss": 0.75474215, + "num_input_tokens_seen": 83749370, + "step": 3891, + "time_per_iteration": 3.0255212783813477 + }, + { + "auxiliary_loss_clip": 0.01092676, + "auxiliary_loss_mlp": 0.01035281, + "balance_loss_clip": 1.03244567, + "balance_loss_mlp": 1.0200702, + "epoch": 0.23399969938373666, + "flos": 22382044007040.0, + "grad_norm": 1.8700257223454773, + "language_loss": 0.82920659, + "learning_rate": 3.5790990264330433e-06, + "loss": 0.85048616, + "num_input_tokens_seen": 83769560, + "step": 3892, + "time_per_iteration": 2.693650484085083 + }, + { + "auxiliary_loss_clip": 0.01041659, + "auxiliary_loss_mlp": 0.01043264, + "balance_loss_clip": 1.0258826, + "balance_loss_mlp": 1.02578831, + "epoch": 0.23405982263640462, + "flos": 43508542066560.0, + "grad_norm": 4.07837326044524, + "language_loss": 0.64706665, + "learning_rate": 3.578859988977082e-06, + "loss": 0.66791582, + "num_input_tokens_seen": 83795635, + "step": 3893, + "time_per_iteration": 2.8838346004486084 + }, + { + "auxiliary_loss_clip": 0.01054788, + "auxiliary_loss_mlp": 0.01038857, + "balance_loss_clip": 1.03076935, + "balance_loss_mlp": 1.0223825, + "epoch": 0.2341199458890726, + "flos": 22564685687040.0, + "grad_norm": 1.8159212836288658, + "language_loss": 0.79169345, + "learning_rate": 3.5786208916505916e-06, + "loss": 0.81262994, + "num_input_tokens_seen": 83814090, + "step": 3894, + "time_per_iteration": 2.7352774143218994 + }, + { + "auxiliary_loss_clip": 0.01081388, + "auxiliary_loss_mlp": 0.01037874, + "balance_loss_clip": 1.02999949, + "balance_loss_mlp": 1.02315176, + "epoch": 0.23418006914174055, + "flos": 25633970933760.0, + "grad_norm": 1.563469533614716, + "language_loss": 0.81765467, + "learning_rate": 3.5783817344626383e-06, + "loss": 0.83884728, + "num_input_tokens_seen": 83836870, + "step": 3895, + "time_per_iteration": 2.6506121158599854 + }, + { + "auxiliary_loss_clip": 0.01086766, + "auxiliary_loss_mlp": 0.01044539, + "balance_loss_clip": 1.03287399, + "balance_loss_mlp": 1.02892852, + "epoch": 0.23424019239440855, + "flos": 13545936074880.0, + "grad_norm": 1.9850196331325287, + "language_loss": 0.80028492, + "learning_rate": 3.578142517422292e-06, + "loss": 0.82159793, + "num_input_tokens_seen": 83853275, + "step": 3896, + "time_per_iteration": 2.5937137603759766 + }, + { + "auxiliary_loss_clip": 0.01072787, + "auxiliary_loss_mlp": 0.01039573, + "balance_loss_clip": 1.03082943, + "balance_loss_mlp": 1.02349114, + "epoch": 0.2343003156470765, + "flos": 22419498913920.0, + "grad_norm": 1.6540842143269283, + "language_loss": 0.82798207, + "learning_rate": 3.577903240538623e-06, + "loss": 0.84910566, + "num_input_tokens_seen": 83872340, + "step": 3897, + "time_per_iteration": 2.610732316970825 + }, + { + "auxiliary_loss_clip": 0.01090437, + "auxiliary_loss_mlp": 0.0104618, + "balance_loss_clip": 1.03636372, + "balance_loss_mlp": 1.02969313, + "epoch": 0.23436043889974448, + "flos": 14790815683200.0, + "grad_norm": 1.6411396811676249, + "language_loss": 0.78971744, + "learning_rate": 3.577663903820705e-06, + "loss": 0.81108356, + "num_input_tokens_seen": 83888795, + "step": 3898, + "time_per_iteration": 2.605320930480957 + }, + { + "auxiliary_loss_clip": 0.0105768, + "auxiliary_loss_mlp": 0.01050747, + "balance_loss_clip": 1.03013873, + "balance_loss_mlp": 1.03514278, + "epoch": 0.23442056215241244, + "flos": 22965700101120.0, + "grad_norm": 1.9736399987038413, + "language_loss": 0.73681831, + "learning_rate": 3.577424507277614e-06, + "loss": 0.75790262, + "num_input_tokens_seen": 83906820, + "step": 3899, + "time_per_iteration": 2.7541909217834473 + }, + { + "auxiliary_loss_clip": 0.01062234, + "auxiliary_loss_mlp": 0.01049348, + "balance_loss_clip": 1.03003454, + "balance_loss_mlp": 1.03307569, + "epoch": 0.2344806854050804, + "flos": 23071887682560.0, + "grad_norm": 1.833761378066402, + "language_loss": 0.74907953, + "learning_rate": 3.5771850509184277e-06, + "loss": 0.77019531, + "num_input_tokens_seen": 83926370, + "step": 3900, + "time_per_iteration": 2.7064120769500732 + }, + { + "auxiliary_loss_clip": 0.01050721, + "auxiliary_loss_mlp": 0.01043948, + "balance_loss_clip": 1.0307126, + "balance_loss_mlp": 1.02775896, + "epoch": 0.23454080865774837, + "flos": 16327074418560.0, + "grad_norm": 2.010208905448109, + "language_loss": 0.66598612, + "learning_rate": 3.5769455347522256e-06, + "loss": 0.6869328, + "num_input_tokens_seen": 83944600, + "step": 3901, + "time_per_iteration": 2.7792232036590576 + }, + { + "auxiliary_loss_clip": 0.00983023, + "auxiliary_loss_mlp": 0.01008365, + "balance_loss_clip": 1.00681281, + "balance_loss_mlp": 1.00586128, + "epoch": 0.23460093191041637, + "flos": 67760958142080.0, + "grad_norm": 0.7629766763572006, + "language_loss": 0.58254796, + "learning_rate": 3.576705958788091e-06, + "loss": 0.60246187, + "num_input_tokens_seen": 84005100, + "step": 3902, + "time_per_iteration": 3.2780263423919678 + }, + { + "auxiliary_loss_clip": 0.01081471, + "auxiliary_loss_mlp": 0.01042372, + "balance_loss_clip": 1.03598809, + "balance_loss_mlp": 1.02554011, + "epoch": 0.23466105516308433, + "flos": 20077619990400.0, + "grad_norm": 1.9086374094646041, + "language_loss": 0.8043623, + "learning_rate": 3.576466323035108e-06, + "loss": 0.82560074, + "num_input_tokens_seen": 84023775, + "step": 3903, + "time_per_iteration": 2.6666970252990723 + }, + { + "auxiliary_loss_clip": 0.01039676, + "auxiliary_loss_mlp": 0.0103634, + "balance_loss_clip": 1.02597237, + "balance_loss_mlp": 1.01984119, + "epoch": 0.2347211784157523, + "flos": 24535714642560.0, + "grad_norm": 2.1214557515684116, + "language_loss": 0.81911278, + "learning_rate": 3.5762266275023645e-06, + "loss": 0.8398729, + "num_input_tokens_seen": 84042605, + "step": 3904, + "time_per_iteration": 2.823025941848755 + }, + { + "auxiliary_loss_clip": 0.01098805, + "auxiliary_loss_mlp": 0.01040857, + "balance_loss_clip": 1.03555226, + "balance_loss_mlp": 1.02489543, + "epoch": 0.23478130166842026, + "flos": 23805040181760.0, + "grad_norm": 3.90557326914149, + "language_loss": 0.7117148, + "learning_rate": 3.57598687219895e-06, + "loss": 0.7331115, + "num_input_tokens_seen": 84061520, + "step": 3905, + "time_per_iteration": 2.619251251220703 + }, + { + "auxiliary_loss_clip": 0.01094978, + "auxiliary_loss_mlp": 0.01036189, + "balance_loss_clip": 1.03309917, + "balance_loss_mlp": 1.02104354, + "epoch": 0.23484142492108823, + "flos": 24093618048000.0, + "grad_norm": 1.6504239152379394, + "language_loss": 0.70857131, + "learning_rate": 3.5757470571339543e-06, + "loss": 0.72988302, + "num_input_tokens_seen": 84081800, + "step": 3906, + "time_per_iteration": 2.6263182163238525 + }, + { + "auxiliary_loss_clip": 0.01090665, + "auxiliary_loss_mlp": 0.01037619, + "balance_loss_clip": 1.03181481, + "balance_loss_mlp": 1.01997566, + "epoch": 0.2349015481737562, + "flos": 29095830898560.0, + "grad_norm": 2.112419576385739, + "language_loss": 0.73457032, + "learning_rate": 3.575507182316473e-06, + "loss": 0.75585318, + "num_input_tokens_seen": 84102340, + "step": 3907, + "time_per_iteration": 2.634868860244751 + }, + { + "auxiliary_loss_clip": 0.01088627, + "auxiliary_loss_mlp": 0.01048638, + "balance_loss_clip": 1.03384054, + "balance_loss_mlp": 1.03197265, + "epoch": 0.23496167142642416, + "flos": 18916305373440.0, + "grad_norm": 1.6681865488936576, + "language_loss": 0.72575867, + "learning_rate": 3.575267247755601e-06, + "loss": 0.74713135, + "num_input_tokens_seen": 84120370, + "step": 3908, + "time_per_iteration": 2.5581512451171875 + }, + { + "auxiliary_loss_clip": 0.01007177, + "auxiliary_loss_mlp": 0.01004244, + "balance_loss_clip": 1.00932574, + "balance_loss_mlp": 1.00163329, + "epoch": 0.23502179467909215, + "flos": 55868062896000.0, + "grad_norm": 1.0191581962581477, + "language_loss": 0.73289764, + "learning_rate": 3.5750272534604367e-06, + "loss": 0.75301182, + "num_input_tokens_seen": 84165515, + "step": 3909, + "time_per_iteration": 4.502224445343018 + }, + { + "auxiliary_loss_clip": 0.01090031, + "auxiliary_loss_mlp": 0.01036142, + "balance_loss_clip": 1.03457987, + "balance_loss_mlp": 1.02009034, + "epoch": 0.23508191793176011, + "flos": 23401763210880.0, + "grad_norm": 1.7603754410600592, + "language_loss": 0.87633997, + "learning_rate": 3.5747871994400822e-06, + "loss": 0.89760172, + "num_input_tokens_seen": 84184540, + "step": 3910, + "time_per_iteration": 2.6371467113494873 + }, + { + "auxiliary_loss_clip": 0.01089648, + "auxiliary_loss_mlp": 0.01038418, + "balance_loss_clip": 1.03500342, + "balance_loss_mlp": 1.02312374, + "epoch": 0.23514204118442808, + "flos": 20047671025920.0, + "grad_norm": 2.3595403505381785, + "language_loss": 0.76086432, + "learning_rate": 3.5745470857036386e-06, + "loss": 0.78214502, + "num_input_tokens_seen": 84202025, + "step": 3911, + "time_per_iteration": 2.538794755935669 + }, + { + "auxiliary_loss_clip": 0.01084064, + "auxiliary_loss_mlp": 0.01041781, + "balance_loss_clip": 1.0331018, + "balance_loss_mlp": 1.02684999, + "epoch": 0.23520216443709605, + "flos": 21580589796480.0, + "grad_norm": 1.5395895021289177, + "language_loss": 0.81724751, + "learning_rate": 3.5743069122602122e-06, + "loss": 0.83850592, + "num_input_tokens_seen": 84221895, + "step": 3912, + "time_per_iteration": 2.5580971240997314 + }, + { + "auxiliary_loss_clip": 0.01075273, + "auxiliary_loss_mlp": 0.01045199, + "balance_loss_clip": 1.03161812, + "balance_loss_mlp": 1.02923691, + "epoch": 0.235262287689764, + "flos": 23185796688000.0, + "grad_norm": 1.9502357601678548, + "language_loss": 0.7169407, + "learning_rate": 3.574066679118909e-06, + "loss": 0.73814541, + "num_input_tokens_seen": 84240455, + "step": 3913, + "time_per_iteration": 2.723090171813965 + }, + { + "auxiliary_loss_clip": 0.01092479, + "auxiliary_loss_mlp": 0.00748926, + "balance_loss_clip": 1.03382778, + "balance_loss_mlp": 1.00106359, + "epoch": 0.23532241094243198, + "flos": 23185222070400.0, + "grad_norm": 1.5658116152228976, + "language_loss": 0.75808382, + "learning_rate": 3.57382638628884e-06, + "loss": 0.77649784, + "num_input_tokens_seen": 84261605, + "step": 3914, + "time_per_iteration": 2.6869962215423584 + }, + { + "auxiliary_loss_clip": 0.01045603, + "auxiliary_loss_mlp": 0.01042196, + "balance_loss_clip": 1.03598571, + "balance_loss_mlp": 1.02513742, + "epoch": 0.23538253419509997, + "flos": 17019324305280.0, + "grad_norm": 3.321741320533552, + "language_loss": 0.89542365, + "learning_rate": 3.5735860337791174e-06, + "loss": 0.91630161, + "num_input_tokens_seen": 84278675, + "step": 3915, + "time_per_iteration": 2.7027511596679688 + }, + { + "auxiliary_loss_clip": 0.00999869, + "auxiliary_loss_mlp": 0.01005183, + "balance_loss_clip": 1.00538659, + "balance_loss_mlp": 1.00260818, + "epoch": 0.23544265744776793, + "flos": 63448588967040.0, + "grad_norm": 0.8216768161114864, + "language_loss": 0.59357637, + "learning_rate": 3.573345621598854e-06, + "loss": 0.6136269, + "num_input_tokens_seen": 84329765, + "step": 3916, + "time_per_iteration": 4.852906942367554 + }, + { + "auxiliary_loss_clip": 0.00985066, + "auxiliary_loss_mlp": 0.01006147, + "balance_loss_clip": 1.00881112, + "balance_loss_mlp": 1.00366712, + "epoch": 0.2355027807004359, + "flos": 70515343831680.0, + "grad_norm": 0.7695527232431005, + "language_loss": 0.49491823, + "learning_rate": 3.5731051497571675e-06, + "loss": 0.51483035, + "num_input_tokens_seen": 84393680, + "step": 3917, + "time_per_iteration": 3.3651931285858154 + }, + { + "auxiliary_loss_clip": 0.01067207, + "auxiliary_loss_mlp": 0.01050785, + "balance_loss_clip": 1.03311253, + "balance_loss_mlp": 1.03456068, + "epoch": 0.23556290395310386, + "flos": 21434289701760.0, + "grad_norm": 1.9143624533021097, + "language_loss": 0.75725549, + "learning_rate": 3.5728646182631756e-06, + "loss": 0.77843541, + "num_input_tokens_seen": 84412640, + "step": 3918, + "time_per_iteration": 2.855384111404419 + }, + { + "auxiliary_loss_clip": 0.01049075, + "auxiliary_loss_mlp": 0.01044891, + "balance_loss_clip": 1.03051758, + "balance_loss_mlp": 1.0281539, + "epoch": 0.23562302720577183, + "flos": 18186421011840.0, + "grad_norm": 1.8455231718426486, + "language_loss": 0.68818772, + "learning_rate": 3.5726240271259995e-06, + "loss": 0.70912743, + "num_input_tokens_seen": 84431605, + "step": 3919, + "time_per_iteration": 2.9568159580230713 + }, + { + "auxiliary_loss_clip": 0.01064795, + "auxiliary_loss_mlp": 0.0103992, + "balance_loss_clip": 1.03429151, + "balance_loss_mlp": 1.02369523, + "epoch": 0.2356831504584398, + "flos": 33730497832320.0, + "grad_norm": 1.628112937608069, + "language_loss": 0.701316, + "learning_rate": 3.5723833763547634e-06, + "loss": 0.72236311, + "num_input_tokens_seen": 84454210, + "step": 3920, + "time_per_iteration": 2.9610960483551025 + }, + { + "auxiliary_loss_clip": 0.01077914, + "auxiliary_loss_mlp": 0.01045734, + "balance_loss_clip": 1.03410149, + "balance_loss_mlp": 1.02978373, + "epoch": 0.23574327371110776, + "flos": 24932778560640.0, + "grad_norm": 1.5884417068517713, + "language_loss": 0.76957798, + "learning_rate": 3.5721426659585916e-06, + "loss": 0.79081452, + "num_input_tokens_seen": 84475540, + "step": 3921, + "time_per_iteration": 2.826441526412964 + }, + { + "auxiliary_loss_clip": 0.01066737, + "auxiliary_loss_mlp": 0.01043493, + "balance_loss_clip": 1.03279757, + "balance_loss_mlp": 1.02734017, + "epoch": 0.23580339696377575, + "flos": 17822107319040.0, + "grad_norm": 2.19675879068404, + "language_loss": 0.74945086, + "learning_rate": 3.571901895946612e-06, + "loss": 0.77055317, + "num_input_tokens_seen": 84494580, + "step": 3922, + "time_per_iteration": 2.8890347480773926 + }, + { + "auxiliary_loss_clip": 0.01072602, + "auxiliary_loss_mlp": 0.0103743, + "balance_loss_clip": 1.03160191, + "balance_loss_mlp": 1.02212334, + "epoch": 0.23586352021644372, + "flos": 26286611097600.0, + "grad_norm": 2.0019716207309317, + "language_loss": 0.79956865, + "learning_rate": 3.571661066327956e-06, + "loss": 0.82066894, + "num_input_tokens_seen": 84513850, + "step": 3923, + "time_per_iteration": 2.828691005706787 + }, + { + "auxiliary_loss_clip": 0.01045152, + "auxiliary_loss_mlp": 0.01045401, + "balance_loss_clip": 1.03017712, + "balance_loss_mlp": 1.02850926, + "epoch": 0.23592364346911168, + "flos": 14246697484800.0, + "grad_norm": 2.1474532226025125, + "language_loss": 0.74565363, + "learning_rate": 3.571420177111754e-06, + "loss": 0.76655912, + "num_input_tokens_seen": 84532315, + "step": 3924, + "time_per_iteration": 2.8578009605407715 + }, + { + "auxiliary_loss_clip": 0.01100116, + "auxiliary_loss_mlp": 0.0103817, + "balance_loss_clip": 1.03640974, + "balance_loss_mlp": 1.02282739, + "epoch": 0.23598376672177965, + "flos": 18587938216320.0, + "grad_norm": 1.8566263768950704, + "language_loss": 0.82387424, + "learning_rate": 3.5711792283071416e-06, + "loss": 0.84525704, + "num_input_tokens_seen": 84550970, + "step": 3925, + "time_per_iteration": 2.5814406871795654 + }, + { + "auxiliary_loss_clip": 0.01076521, + "auxiliary_loss_mlp": 0.0104108, + "balance_loss_clip": 1.03350854, + "balance_loss_mlp": 1.02509451, + "epoch": 0.2360438899744476, + "flos": 22675542036480.0, + "grad_norm": 1.9047070074266947, + "language_loss": 0.59428656, + "learning_rate": 3.5709382199232564e-06, + "loss": 0.61546254, + "num_input_tokens_seen": 84571655, + "step": 3926, + "time_per_iteration": 2.751163959503174 + }, + { + "auxiliary_loss_clip": 0.01081405, + "auxiliary_loss_mlp": 0.01037693, + "balance_loss_clip": 1.03398025, + "balance_loss_mlp": 1.02270222, + "epoch": 0.23610401322711558, + "flos": 29570139014400.0, + "grad_norm": 2.755841962422441, + "language_loss": 0.71664131, + "learning_rate": 3.570697151969235e-06, + "loss": 0.73783231, + "num_input_tokens_seen": 84593130, + "step": 3927, + "time_per_iteration": 2.723944664001465 + }, + { + "auxiliary_loss_clip": 0.01072205, + "auxiliary_loss_mlp": 0.01039532, + "balance_loss_clip": 1.03168857, + "balance_loss_mlp": 1.02459466, + "epoch": 0.23616413647978354, + "flos": 17858520731520.0, + "grad_norm": 1.823022676777323, + "language_loss": 0.74741662, + "learning_rate": 3.570456024454221e-06, + "loss": 0.768534, + "num_input_tokens_seen": 84612410, + "step": 3928, + "time_per_iteration": 2.628067970275879 + }, + { + "auxiliary_loss_clip": 0.01074742, + "auxiliary_loss_mlp": 0.0104107, + "balance_loss_clip": 1.03236127, + "balance_loss_mlp": 1.02434444, + "epoch": 0.23622425973245154, + "flos": 11034847157760.0, + "grad_norm": 2.1495801414092233, + "language_loss": 0.81686139, + "learning_rate": 3.5702148373873576e-06, + "loss": 0.83801949, + "num_input_tokens_seen": 84627610, + "step": 3929, + "time_per_iteration": 2.5835373401641846 + }, + { + "auxiliary_loss_clip": 0.0110725, + "auxiliary_loss_mlp": 0.0104733, + "balance_loss_clip": 1.03855038, + "balance_loss_mlp": 1.02985406, + "epoch": 0.2362843829851195, + "flos": 23404061681280.0, + "grad_norm": 3.333939181032104, + "language_loss": 0.71721101, + "learning_rate": 3.569973590777789e-06, + "loss": 0.73875678, + "num_input_tokens_seen": 84648415, + "step": 3930, + "time_per_iteration": 2.6100308895111084 + }, + { + "auxiliary_loss_clip": 0.01097137, + "auxiliary_loss_mlp": 0.01034617, + "balance_loss_clip": 1.03298008, + "balance_loss_mlp": 1.01915586, + "epoch": 0.23634450623778747, + "flos": 39529855261440.0, + "grad_norm": 1.8841605248788182, + "language_loss": 0.74060285, + "learning_rate": 3.569732284634665e-06, + "loss": 0.76192039, + "num_input_tokens_seen": 84670080, + "step": 3931, + "time_per_iteration": 2.8868939876556396 + }, + { + "auxiliary_loss_clip": 0.01090783, + "auxiliary_loss_mlp": 0.01042389, + "balance_loss_clip": 1.03662825, + "balance_loss_mlp": 1.02507997, + "epoch": 0.23640462949045543, + "flos": 24207167917440.0, + "grad_norm": 2.1711322989300044, + "language_loss": 0.80305982, + "learning_rate": 3.569490918967136e-06, + "loss": 0.82439154, + "num_input_tokens_seen": 84686465, + "step": 3932, + "time_per_iteration": 2.6480765342712402 + }, + { + "auxiliary_loss_clip": 0.01064166, + "auxiliary_loss_mlp": 0.01036386, + "balance_loss_clip": 1.03406775, + "balance_loss_mlp": 1.02172363, + "epoch": 0.2364647527431234, + "flos": 26177622255360.0, + "grad_norm": 1.4836644739347165, + "language_loss": 0.85130781, + "learning_rate": 3.5692494937843537e-06, + "loss": 0.87231332, + "num_input_tokens_seen": 84708825, + "step": 3933, + "time_per_iteration": 2.7328476905822754 + }, + { + "auxiliary_loss_clip": 0.01050068, + "auxiliary_loss_mlp": 0.01046173, + "balance_loss_clip": 1.03168631, + "balance_loss_mlp": 1.0280652, + "epoch": 0.23652487599579136, + "flos": 22637009721600.0, + "grad_norm": 1.9402614250270098, + "language_loss": 0.82776403, + "learning_rate": 3.5690080090954727e-06, + "loss": 0.84872645, + "num_input_tokens_seen": 84726165, + "step": 3934, + "time_per_iteration": 2.749204397201538 + }, + { + "auxiliary_loss_clip": 0.01101218, + "auxiliary_loss_mlp": 0.01040241, + "balance_loss_clip": 1.03568578, + "balance_loss_mlp": 1.02424324, + "epoch": 0.23658499924845935, + "flos": 21762261809280.0, + "grad_norm": 1.82254358465067, + "language_loss": 0.78367758, + "learning_rate": 3.5687664649096515e-06, + "loss": 0.80509222, + "num_input_tokens_seen": 84745815, + "step": 3935, + "time_per_iteration": 2.539243221282959 + }, + { + "auxiliary_loss_clip": 0.0108786, + "auxiliary_loss_mlp": 0.010349, + "balance_loss_clip": 1.03591013, + "balance_loss_mlp": 1.0198437, + "epoch": 0.23664512250112732, + "flos": 21798998444160.0, + "grad_norm": 1.568665028429326, + "language_loss": 0.79374301, + "learning_rate": 3.5685248612360487e-06, + "loss": 0.81497067, + "num_input_tokens_seen": 84765415, + "step": 3936, + "time_per_iteration": 2.6463756561279297 + }, + { + "auxiliary_loss_clip": 0.01078198, + "auxiliary_loss_mlp": 0.0103679, + "balance_loss_clip": 1.0335536, + "balance_loss_mlp": 1.0207082, + "epoch": 0.23670524575379528, + "flos": 22637871648000.0, + "grad_norm": 1.455496290160756, + "language_loss": 0.79145366, + "learning_rate": 3.568283198083826e-06, + "loss": 0.81260359, + "num_input_tokens_seen": 84787080, + "step": 3937, + "time_per_iteration": 2.665813446044922 + }, + { + "auxiliary_loss_clip": 0.01085078, + "auxiliary_loss_mlp": 0.0103664, + "balance_loss_clip": 1.03450298, + "balance_loss_mlp": 1.0221684, + "epoch": 0.23676536900646325, + "flos": 16725000263040.0, + "grad_norm": 2.2085688672429202, + "language_loss": 0.85441625, + "learning_rate": 3.568041475462147e-06, + "loss": 0.87563342, + "num_input_tokens_seen": 84805395, + "step": 3938, + "time_per_iteration": 2.609914779663086 + }, + { + "auxiliary_loss_clip": 0.01096868, + "auxiliary_loss_mlp": 0.01050622, + "balance_loss_clip": 1.03446364, + "balance_loss_mlp": 1.03482664, + "epoch": 0.23682549225913122, + "flos": 11135611785600.0, + "grad_norm": 2.2802464708642356, + "language_loss": 0.9410162, + "learning_rate": 3.5677996933801785e-06, + "loss": 0.9624911, + "num_input_tokens_seen": 84818090, + "step": 3939, + "time_per_iteration": 2.6134610176086426 + }, + { + "auxiliary_loss_clip": 0.01100188, + "auxiliary_loss_mlp": 0.01045117, + "balance_loss_clip": 1.03382123, + "balance_loss_mlp": 1.02882147, + "epoch": 0.23688561551179918, + "flos": 22559226819840.0, + "grad_norm": 1.7045928155507233, + "language_loss": 0.8240546, + "learning_rate": 3.567557851847088e-06, + "loss": 0.84550774, + "num_input_tokens_seen": 84837695, + "step": 3940, + "time_per_iteration": 2.6089465618133545 + }, + { + "auxiliary_loss_clip": 0.01080692, + "auxiliary_loss_mlp": 0.00748939, + "balance_loss_clip": 1.03418195, + "balance_loss_mlp": 1.00102711, + "epoch": 0.23694573876446715, + "flos": 18514895909760.0, + "grad_norm": 2.2921424838241253, + "language_loss": 0.896662, + "learning_rate": 3.5673159508720464e-06, + "loss": 0.9149583, + "num_input_tokens_seen": 84854630, + "step": 3941, + "time_per_iteration": 2.6756412982940674 + }, + { + "auxiliary_loss_clip": 0.01096689, + "auxiliary_loss_mlp": 0.01043697, + "balance_loss_clip": 1.03081977, + "balance_loss_mlp": 1.02632833, + "epoch": 0.23700586201713514, + "flos": 15335723980800.0, + "grad_norm": 2.150851999878922, + "language_loss": 0.84524584, + "learning_rate": 3.5670739904642274e-06, + "loss": 0.86664963, + "num_input_tokens_seen": 84871805, + "step": 3942, + "time_per_iteration": 2.6118648052215576 + }, + { + "auxiliary_loss_clip": 0.01059117, + "auxiliary_loss_mlp": 0.01047142, + "balance_loss_clip": 1.03037643, + "balance_loss_mlp": 1.03013098, + "epoch": 0.2370659852698031, + "flos": 23947605262080.0, + "grad_norm": 1.7695706586319426, + "language_loss": 0.81092817, + "learning_rate": 3.5668319706328065e-06, + "loss": 0.83199072, + "num_input_tokens_seen": 84889815, + "step": 3943, + "time_per_iteration": 2.7584054470062256 + }, + { + "auxiliary_loss_clip": 0.01068765, + "auxiliary_loss_mlp": 0.01044077, + "balance_loss_clip": 1.03306377, + "balance_loss_mlp": 1.02581441, + "epoch": 0.23712610852247107, + "flos": 15332527670400.0, + "grad_norm": 2.450767678075849, + "language_loss": 0.67741466, + "learning_rate": 3.566589891386959e-06, + "loss": 0.69854313, + "num_input_tokens_seen": 84904380, + "step": 3944, + "time_per_iteration": 2.736520528793335 + }, + { + "auxiliary_loss_clip": 0.01068489, + "auxiliary_loss_mlp": 0.01042526, + "balance_loss_clip": 1.0307672, + "balance_loss_mlp": 1.02540731, + "epoch": 0.23718623177513903, + "flos": 19682567233920.0, + "grad_norm": 1.839807332589577, + "language_loss": 0.75477815, + "learning_rate": 3.566347752735866e-06, + "loss": 0.77588832, + "num_input_tokens_seen": 84922935, + "step": 3945, + "time_per_iteration": 2.739623785018921 + }, + { + "auxiliary_loss_clip": 0.01076438, + "auxiliary_loss_mlp": 0.01046769, + "balance_loss_clip": 1.03210473, + "balance_loss_mlp": 1.03050852, + "epoch": 0.237246355027807, + "flos": 24973322037120.0, + "grad_norm": 1.7967103551840289, + "language_loss": 0.6343599, + "learning_rate": 3.5661055546887094e-06, + "loss": 0.65559196, + "num_input_tokens_seen": 84943685, + "step": 3946, + "time_per_iteration": 2.7130489349365234 + }, + { + "auxiliary_loss_clip": 0.01081223, + "auxiliary_loss_mlp": 0.01041152, + "balance_loss_clip": 1.0312289, + "balance_loss_mlp": 1.02383077, + "epoch": 0.23730647828047496, + "flos": 15377416692480.0, + "grad_norm": 2.3641452842528206, + "language_loss": 0.76917052, + "learning_rate": 3.5658632972546734e-06, + "loss": 0.79039431, + "num_input_tokens_seen": 84959505, + "step": 3947, + "time_per_iteration": 2.5593345165252686 + }, + { + "auxiliary_loss_clip": 0.01099853, + "auxiliary_loss_mlp": 0.01041186, + "balance_loss_clip": 1.04104853, + "balance_loss_mlp": 1.02484226, + "epoch": 0.23736660153314296, + "flos": 28150662372480.0, + "grad_norm": 2.6503025660205273, + "language_loss": 0.80572426, + "learning_rate": 3.565620980442944e-06, + "loss": 0.82713473, + "num_input_tokens_seen": 84982130, + "step": 3948, + "time_per_iteration": 2.6483235359191895 + }, + { + "auxiliary_loss_clip": 0.01084782, + "auxiliary_loss_mlp": 0.01047912, + "balance_loss_clip": 1.03668022, + "balance_loss_mlp": 1.03090048, + "epoch": 0.23742672478581092, + "flos": 22086570729600.0, + "grad_norm": 1.8606284936458888, + "language_loss": 0.80336297, + "learning_rate": 3.5653786042627107e-06, + "loss": 0.82468992, + "num_input_tokens_seen": 85000640, + "step": 3949, + "time_per_iteration": 2.6483263969421387 + }, + { + "auxiliary_loss_clip": 0.01080654, + "auxiliary_loss_mlp": 0.01040934, + "balance_loss_clip": 1.03492284, + "balance_loss_mlp": 1.02369678, + "epoch": 0.2374868480384789, + "flos": 19537093152000.0, + "grad_norm": 1.728016387567024, + "language_loss": 0.73218668, + "learning_rate": 3.565136168723163e-06, + "loss": 0.75340253, + "num_input_tokens_seen": 85018970, + "step": 3950, + "time_per_iteration": 2.61842679977417 + }, + { + "auxiliary_loss_clip": 0.01095927, + "auxiliary_loss_mlp": 0.01035561, + "balance_loss_clip": 1.0330143, + "balance_loss_mlp": 1.02075529, + "epoch": 0.23754697129114685, + "flos": 19422501788160.0, + "grad_norm": 1.8423247830549228, + "language_loss": 0.72938156, + "learning_rate": 3.564893673833495e-06, + "loss": 0.75069642, + "num_input_tokens_seen": 85035905, + "step": 3951, + "time_per_iteration": 2.5554893016815186 + }, + { + "auxiliary_loss_clip": 0.01082422, + "auxiliary_loss_mlp": 0.01041964, + "balance_loss_clip": 1.03729784, + "balance_loss_mlp": 1.02466643, + "epoch": 0.23760709454381482, + "flos": 19501002961920.0, + "grad_norm": 1.9035693029383407, + "language_loss": 0.73416531, + "learning_rate": 3.564651119602903e-06, + "loss": 0.75540924, + "num_input_tokens_seen": 85054560, + "step": 3952, + "time_per_iteration": 2.6183502674102783 + }, + { + "auxiliary_loss_clip": 0.01047513, + "auxiliary_loss_mlp": 0.01042748, + "balance_loss_clip": 1.02797341, + "balance_loss_mlp": 1.02678573, + "epoch": 0.23766721779648278, + "flos": 27636600879360.0, + "grad_norm": 1.6424811845817484, + "language_loss": 0.71409315, + "learning_rate": 3.564408506040583e-06, + "loss": 0.73499572, + "num_input_tokens_seen": 85074425, + "step": 3953, + "time_per_iteration": 2.863253355026245 + }, + { + "auxiliary_loss_clip": 0.01102742, + "auxiliary_loss_mlp": 0.01045736, + "balance_loss_clip": 1.03643465, + "balance_loss_mlp": 1.02825952, + "epoch": 0.23772734104915075, + "flos": 23404348990080.0, + "grad_norm": 1.945322846193168, + "language_loss": 0.8129375, + "learning_rate": 3.5641658331557356e-06, + "loss": 0.83442229, + "num_input_tokens_seen": 85092865, + "step": 3954, + "time_per_iteration": 2.675283908843994 + }, + { + "auxiliary_loss_clip": 0.01080449, + "auxiliary_loss_mlp": 0.01041599, + "balance_loss_clip": 1.03405571, + "balance_loss_mlp": 1.02418232, + "epoch": 0.23778746430181874, + "flos": 15705496540800.0, + "grad_norm": 2.290535149730749, + "language_loss": 0.66088665, + "learning_rate": 3.5639231009575634e-06, + "loss": 0.68210709, + "num_input_tokens_seen": 85110175, + "step": 3955, + "time_per_iteration": 2.8049583435058594 + }, + { + "auxiliary_loss_clip": 0.01100159, + "auxiliary_loss_mlp": 0.01046927, + "balance_loss_clip": 1.03540301, + "balance_loss_mlp": 1.03101242, + "epoch": 0.2378475875544867, + "flos": 19426452284160.0, + "grad_norm": 1.4207261000212659, + "language_loss": 0.83755004, + "learning_rate": 3.5636803094552704e-06, + "loss": 0.85902095, + "num_input_tokens_seen": 85129925, + "step": 3956, + "time_per_iteration": 4.3586578369140625 + }, + { + "auxiliary_loss_clip": 0.01051557, + "auxiliary_loss_mlp": 0.01036291, + "balance_loss_clip": 1.0292362, + "balance_loss_mlp": 1.02054346, + "epoch": 0.23790771080715467, + "flos": 22268565964800.0, + "grad_norm": 9.9766151385791, + "language_loss": 0.85077822, + "learning_rate": 3.5634374586580635e-06, + "loss": 0.87165666, + "num_input_tokens_seen": 85147755, + "step": 3957, + "time_per_iteration": 2.895632743835449 + }, + { + "auxiliary_loss_clip": 0.01044708, + "auxiliary_loss_mlp": 0.01039027, + "balance_loss_clip": 1.03205729, + "balance_loss_mlp": 1.02369654, + "epoch": 0.23796783405982264, + "flos": 20047311889920.0, + "grad_norm": 2.0527692310601, + "language_loss": 0.70371628, + "learning_rate": 3.563194548575151e-06, + "loss": 0.72455359, + "num_input_tokens_seen": 85165270, + "step": 3958, + "time_per_iteration": 2.848989725112915 + }, + { + "auxiliary_loss_clip": 0.01053061, + "auxiliary_loss_mlp": 0.0104445, + "balance_loss_clip": 1.02983046, + "balance_loss_mlp": 1.02550721, + "epoch": 0.2380279573124906, + "flos": 14245943299200.0, + "grad_norm": 2.3315190829673256, + "language_loss": 0.65842593, + "learning_rate": 3.562951579215745e-06, + "loss": 0.67940104, + "num_input_tokens_seen": 85181555, + "step": 3959, + "time_per_iteration": 2.6691689491271973 + }, + { + "auxiliary_loss_clip": 0.01057626, + "auxiliary_loss_mlp": 0.01039539, + "balance_loss_clip": 1.03344631, + "balance_loss_mlp": 1.02380395, + "epoch": 0.23808808056515857, + "flos": 21179180332800.0, + "grad_norm": 1.7090541285039151, + "language_loss": 0.71976656, + "learning_rate": 3.5627085505890586e-06, + "loss": 0.74073815, + "num_input_tokens_seen": 85199455, + "step": 3960, + "time_per_iteration": 2.724287509918213 + }, + { + "auxiliary_loss_clip": 0.01035906, + "auxiliary_loss_mlp": 0.01039778, + "balance_loss_clip": 1.0410223, + "balance_loss_mlp": 1.02269554, + "epoch": 0.23814820381782653, + "flos": 22528308188160.0, + "grad_norm": 1.6628325859072757, + "language_loss": 0.73603046, + "learning_rate": 3.562465462704307e-06, + "loss": 0.7567873, + "num_input_tokens_seen": 85219170, + "step": 3961, + "time_per_iteration": 2.903794527053833 + }, + { + "auxiliary_loss_clip": 0.01100885, + "auxiliary_loss_mlp": 0.01050722, + "balance_loss_clip": 1.03397179, + "balance_loss_mlp": 1.0329361, + "epoch": 0.23820832707049452, + "flos": 22304332932480.0, + "grad_norm": 1.6578059810379284, + "language_loss": 0.65851748, + "learning_rate": 3.5622223155707085e-06, + "loss": 0.68003356, + "num_input_tokens_seen": 85238480, + "step": 3962, + "time_per_iteration": 4.313721656799316 + }, + { + "auxiliary_loss_clip": 0.01070808, + "auxiliary_loss_mlp": 0.01044119, + "balance_loss_clip": 1.03053904, + "balance_loss_mlp": 1.02783501, + "epoch": 0.2382684503231625, + "flos": 24864225454080.0, + "grad_norm": 2.173934876771309, + "language_loss": 0.74344635, + "learning_rate": 3.561979109197483e-06, + "loss": 0.76459563, + "num_input_tokens_seen": 85259180, + "step": 3963, + "time_per_iteration": 4.45715856552124 + }, + { + "auxiliary_loss_clip": 0.01072777, + "auxiliary_loss_mlp": 0.0104199, + "balance_loss_clip": 1.03582859, + "balance_loss_mlp": 1.02570629, + "epoch": 0.23832857357583045, + "flos": 21871609787520.0, + "grad_norm": 1.8647861775941055, + "language_loss": 0.77259314, + "learning_rate": 3.5617358435938538e-06, + "loss": 0.79374081, + "num_input_tokens_seen": 85278550, + "step": 3964, + "time_per_iteration": 2.7892680168151855 + }, + { + "auxiliary_loss_clip": 0.01057142, + "auxiliary_loss_mlp": 0.01039853, + "balance_loss_clip": 1.03039265, + "balance_loss_mlp": 1.02405739, + "epoch": 0.23838869682849842, + "flos": 21288061434240.0, + "grad_norm": 1.9568799894436066, + "language_loss": 0.71563458, + "learning_rate": 3.561492518769045e-06, + "loss": 0.73660457, + "num_input_tokens_seen": 85297345, + "step": 3965, + "time_per_iteration": 2.9039363861083984 + }, + { + "auxiliary_loss_clip": 0.01060844, + "auxiliary_loss_mlp": 0.01043506, + "balance_loss_clip": 1.02928388, + "balance_loss_mlp": 1.02735305, + "epoch": 0.23844882008116638, + "flos": 16180594755840.0, + "grad_norm": 1.8239773552466874, + "language_loss": 0.78316903, + "learning_rate": 3.561249134732282e-06, + "loss": 0.80421245, + "num_input_tokens_seen": 85315105, + "step": 3966, + "time_per_iteration": 2.6085329055786133 + }, + { + "auxiliary_loss_clip": 0.01075685, + "auxiliary_loss_mlp": 0.01042988, + "balance_loss_clip": 1.03282273, + "balance_loss_mlp": 1.02750301, + "epoch": 0.23850894333383435, + "flos": 21069724613760.0, + "grad_norm": 1.829491606663195, + "language_loss": 0.6860708, + "learning_rate": 3.561005691492797e-06, + "loss": 0.70725751, + "num_input_tokens_seen": 85334735, + "step": 3967, + "time_per_iteration": 2.6903717517852783 + }, + { + "auxiliary_loss_clip": 0.01066266, + "auxiliary_loss_mlp": 0.01049148, + "balance_loss_clip": 1.03129339, + "balance_loss_mlp": 1.03236318, + "epoch": 0.23856906658650234, + "flos": 17201606849280.0, + "grad_norm": 1.8668922509637411, + "language_loss": 0.68032563, + "learning_rate": 3.5607621890598185e-06, + "loss": 0.70147973, + "num_input_tokens_seen": 85352875, + "step": 3968, + "time_per_iteration": 2.6825292110443115 + }, + { + "auxiliary_loss_clip": 0.01058321, + "auxiliary_loss_mlp": 0.01043335, + "balance_loss_clip": 1.03559303, + "balance_loss_mlp": 1.02670527, + "epoch": 0.2386291898391703, + "flos": 29494223619840.0, + "grad_norm": 2.4738707826406827, + "language_loss": 0.75985575, + "learning_rate": 3.5605186274425823e-06, + "loss": 0.78087223, + "num_input_tokens_seen": 85372205, + "step": 3969, + "time_per_iteration": 2.796715497970581 + }, + { + "auxiliary_loss_clip": 0.0107476, + "auxiliary_loss_mlp": 0.01036698, + "balance_loss_clip": 1.03410137, + "balance_loss_mlp": 1.02145123, + "epoch": 0.23868931309183827, + "flos": 21142443697920.0, + "grad_norm": 2.234840952210217, + "language_loss": 0.76030576, + "learning_rate": 3.5602750066503225e-06, + "loss": 0.78142035, + "num_input_tokens_seen": 85389705, + "step": 3970, + "time_per_iteration": 2.6352758407592773 + }, + { + "auxiliary_loss_clip": 0.0106076, + "auxiliary_loss_mlp": 0.01046136, + "balance_loss_clip": 1.02912927, + "balance_loss_mlp": 1.02899408, + "epoch": 0.23874943634450624, + "flos": 25659394784640.0, + "grad_norm": 2.3696781791902426, + "language_loss": 0.85335749, + "learning_rate": 3.5600313266922793e-06, + "loss": 0.87442642, + "num_input_tokens_seen": 85407855, + "step": 3971, + "time_per_iteration": 2.78140926361084 + }, + { + "auxiliary_loss_clip": 0.01015245, + "auxiliary_loss_mlp": 0.01018901, + "balance_loss_clip": 1.00787282, + "balance_loss_mlp": 1.01644504, + "epoch": 0.2388095595971742, + "flos": 58986618624000.0, + "grad_norm": 0.753439458282678, + "language_loss": 0.62899691, + "learning_rate": 3.5597875875776915e-06, + "loss": 0.64933836, + "num_input_tokens_seen": 85470885, + "step": 3972, + "time_per_iteration": 3.2159264087677 + }, + { + "auxiliary_loss_clip": 0.0107723, + "auxiliary_loss_mlp": 0.01037523, + "balance_loss_clip": 1.03430307, + "balance_loss_mlp": 1.02202582, + "epoch": 0.23886968284984217, + "flos": 16800341040000.0, + "grad_norm": 2.7358231590166753, + "language_loss": 0.81636524, + "learning_rate": 3.5595437893158013e-06, + "loss": 0.83751273, + "num_input_tokens_seen": 85488460, + "step": 3973, + "time_per_iteration": 2.662085771560669 + }, + { + "auxiliary_loss_clip": 0.01065068, + "auxiliary_loss_mlp": 0.01044712, + "balance_loss_clip": 1.03191686, + "balance_loss_mlp": 1.02824903, + "epoch": 0.23892980610251013, + "flos": 22382654538240.0, + "grad_norm": 1.6840855416418596, + "language_loss": 0.79315025, + "learning_rate": 3.5592999319158546e-06, + "loss": 0.81424803, + "num_input_tokens_seen": 85508590, + "step": 3974, + "time_per_iteration": 2.6469085216522217 + }, + { + "auxiliary_loss_clip": 0.01079543, + "auxiliary_loss_mlp": 0.01045894, + "balance_loss_clip": 1.03213382, + "balance_loss_mlp": 1.02878785, + "epoch": 0.23898992935517813, + "flos": 12823198519680.0, + "grad_norm": 1.879358466693296, + "language_loss": 0.85002625, + "learning_rate": 3.5590560153870984e-06, + "loss": 0.87128055, + "num_input_tokens_seen": 85525970, + "step": 3975, + "time_per_iteration": 2.597234010696411 + }, + { + "auxiliary_loss_clip": 0.01074744, + "auxiliary_loss_mlp": 0.01046868, + "balance_loss_clip": 1.03158474, + "balance_loss_mlp": 1.0307982, + "epoch": 0.2390500526078461, + "flos": 22345666508160.0, + "grad_norm": 2.2939262627147556, + "language_loss": 0.83507192, + "learning_rate": 3.5588120397387816e-06, + "loss": 0.85628808, + "num_input_tokens_seen": 85543700, + "step": 3976, + "time_per_iteration": 2.649742841720581 + }, + { + "auxiliary_loss_clip": 0.01037574, + "auxiliary_loss_mlp": 0.01039198, + "balance_loss_clip": 1.03206885, + "balance_loss_mlp": 1.02442765, + "epoch": 0.23911017586051406, + "flos": 22635142214400.0, + "grad_norm": 1.7726141902841823, + "language_loss": 0.74612558, + "learning_rate": 3.5585680049801566e-06, + "loss": 0.76689339, + "num_input_tokens_seen": 85562765, + "step": 3977, + "time_per_iteration": 2.9264626502990723 + }, + { + "auxiliary_loss_clip": 0.01101663, + "auxiliary_loss_mlp": 0.01047191, + "balance_loss_clip": 1.03654957, + "balance_loss_mlp": 1.03099084, + "epoch": 0.23917029911318202, + "flos": 23653281219840.0, + "grad_norm": 1.7596574724649243, + "language_loss": 0.71745026, + "learning_rate": 3.5583239111204764e-06, + "loss": 0.73893881, + "num_input_tokens_seen": 85581755, + "step": 3978, + "time_per_iteration": 2.6443612575531006 + }, + { + "auxiliary_loss_clip": 0.01071564, + "auxiliary_loss_mlp": 0.01047343, + "balance_loss_clip": 1.03219891, + "balance_loss_mlp": 1.03071368, + "epoch": 0.23923042236585, + "flos": 22783597125120.0, + "grad_norm": 2.4666528342177094, + "language_loss": 0.78859651, + "learning_rate": 3.558079758168997e-06, + "loss": 0.8097856, + "num_input_tokens_seen": 85599455, + "step": 3979, + "time_per_iteration": 2.696812868118286 + }, + { + "auxiliary_loss_clip": 0.0107458, + "auxiliary_loss_mlp": 0.0104884, + "balance_loss_clip": 1.03225338, + "balance_loss_mlp": 1.03222179, + "epoch": 0.23929054561851795, + "flos": 28147717457280.0, + "grad_norm": 1.6374264916196526, + "language_loss": 0.81587613, + "learning_rate": 3.557835546134977e-06, + "loss": 0.83711034, + "num_input_tokens_seen": 85619970, + "step": 3980, + "time_per_iteration": 2.774095058441162 + }, + { + "auxiliary_loss_clip": 0.01051771, + "auxiliary_loss_mlp": 0.01041426, + "balance_loss_clip": 1.03255892, + "balance_loss_mlp": 1.02560687, + "epoch": 0.23935066887118592, + "flos": 21686525982720.0, + "grad_norm": 1.6795105275498436, + "language_loss": 0.84201741, + "learning_rate": 3.5575912750276775e-06, + "loss": 0.86294937, + "num_input_tokens_seen": 85638850, + "step": 3981, + "time_per_iteration": 2.7101664543151855 + }, + { + "auxiliary_loss_clip": 0.01073587, + "auxiliary_loss_mlp": 0.01043284, + "balance_loss_clip": 1.03319395, + "balance_loss_mlp": 1.0271312, + "epoch": 0.2394107921238539, + "flos": 32122274198400.0, + "grad_norm": 2.2844188313530998, + "language_loss": 0.76628411, + "learning_rate": 3.5573469448563607e-06, + "loss": 0.78745276, + "num_input_tokens_seen": 85656285, + "step": 3982, + "time_per_iteration": 2.9703257083892822 + }, + { + "auxiliary_loss_clip": 0.01066563, + "auxiliary_loss_mlp": 0.01046852, + "balance_loss_clip": 1.03441691, + "balance_loss_mlp": 1.03152168, + "epoch": 0.23947091537652188, + "flos": 17019180650880.0, + "grad_norm": 1.7761096856408103, + "language_loss": 0.78126192, + "learning_rate": 3.5571025556302915e-06, + "loss": 0.80239606, + "num_input_tokens_seen": 85673020, + "step": 3983, + "time_per_iteration": 2.794051170349121 + }, + { + "auxiliary_loss_clip": 0.01088325, + "auxiliary_loss_mlp": 0.00748749, + "balance_loss_clip": 1.03504348, + "balance_loss_mlp": 1.00111294, + "epoch": 0.23953103862918984, + "flos": 20593584904320.0, + "grad_norm": 1.6399544818652014, + "language_loss": 0.72589588, + "learning_rate": 3.556858107358737e-06, + "loss": 0.74426663, + "num_input_tokens_seen": 85692565, + "step": 3984, + "time_per_iteration": 2.6442129611968994 + }, + { + "auxiliary_loss_clip": 0.01059123, + "auxiliary_loss_mlp": 0.01049227, + "balance_loss_clip": 1.03273845, + "balance_loss_mlp": 1.03148913, + "epoch": 0.2395911618818578, + "flos": 20704405340160.0, + "grad_norm": 1.9628017415657164, + "language_loss": 0.78918773, + "learning_rate": 3.5566136000509674e-06, + "loss": 0.81027126, + "num_input_tokens_seen": 85709730, + "step": 3985, + "time_per_iteration": 2.6519670486450195 + }, + { + "auxiliary_loss_clip": 0.01051067, + "auxiliary_loss_mlp": 0.01047491, + "balance_loss_clip": 1.0314815, + "balance_loss_mlp": 1.03061104, + "epoch": 0.23965128513452577, + "flos": 27053519402880.0, + "grad_norm": 1.7845328063902026, + "language_loss": 0.73384202, + "learning_rate": 3.556369033716254e-06, + "loss": 0.75482756, + "num_input_tokens_seen": 85730045, + "step": 3986, + "time_per_iteration": 2.789649248123169 + }, + { + "auxiliary_loss_clip": 0.0109251, + "auxiliary_loss_mlp": 0.01047809, + "balance_loss_clip": 1.03508461, + "balance_loss_mlp": 1.03206146, + "epoch": 0.23971140838719374, + "flos": 23144319457920.0, + "grad_norm": 1.9995734725143077, + "language_loss": 0.87689227, + "learning_rate": 3.556124408363871e-06, + "loss": 0.89829546, + "num_input_tokens_seen": 85747590, + "step": 3987, + "time_per_iteration": 2.588937282562256 + }, + { + "auxiliary_loss_clip": 0.01081277, + "auxiliary_loss_mlp": 0.01039937, + "balance_loss_clip": 1.03182006, + "balance_loss_mlp": 1.02575064, + "epoch": 0.23977153163986173, + "flos": 18034554309120.0, + "grad_norm": 2.507042358983893, + "language_loss": 0.83597612, + "learning_rate": 3.5558797240030945e-06, + "loss": 0.85718828, + "num_input_tokens_seen": 85763460, + "step": 3988, + "time_per_iteration": 2.6042025089263916 + }, + { + "auxiliary_loss_clip": 0.01086373, + "auxiliary_loss_mlp": 0.01039566, + "balance_loss_clip": 1.03322363, + "balance_loss_mlp": 1.02274561, + "epoch": 0.2398316548925297, + "flos": 18113378705280.0, + "grad_norm": 1.7145607129074187, + "language_loss": 0.85451293, + "learning_rate": 3.5556349806432035e-06, + "loss": 0.8757723, + "num_input_tokens_seen": 85782050, + "step": 3989, + "time_per_iteration": 2.571769952774048 + }, + { + "auxiliary_loss_clip": 0.01096852, + "auxiliary_loss_mlp": 0.01043567, + "balance_loss_clip": 1.03340065, + "balance_loss_mlp": 1.02792692, + "epoch": 0.23989177814519766, + "flos": 12567730014720.0, + "grad_norm": 2.033827632581445, + "language_loss": 0.85234678, + "learning_rate": 3.555390178293477e-06, + "loss": 0.87375093, + "num_input_tokens_seen": 85797400, + "step": 3990, + "time_per_iteration": 2.5107765197753906 + }, + { + "auxiliary_loss_clip": 0.01079502, + "auxiliary_loss_mlp": 0.01043415, + "balance_loss_clip": 1.0302459, + "balance_loss_mlp": 1.02825201, + "epoch": 0.23995190139786562, + "flos": 25264593423360.0, + "grad_norm": 1.4732245613448807, + "language_loss": 0.75600177, + "learning_rate": 3.5551453169631994e-06, + "loss": 0.77723098, + "num_input_tokens_seen": 85818995, + "step": 3991, + "time_per_iteration": 2.6114654541015625 + }, + { + "auxiliary_loss_clip": 0.01006305, + "auxiliary_loss_mlp": 0.01009609, + "balance_loss_clip": 1.01115584, + "balance_loss_mlp": 1.00631893, + "epoch": 0.2400120246505336, + "flos": 61960379650560.0, + "grad_norm": 1.0548624819197994, + "language_loss": 0.63792825, + "learning_rate": 3.554900396661656e-06, + "loss": 0.65808737, + "num_input_tokens_seen": 85876695, + "step": 3992, + "time_per_iteration": 3.103231430053711 + }, + { + "auxiliary_loss_clip": 0.01016018, + "auxiliary_loss_mlp": 0.01007666, + "balance_loss_clip": 1.00877249, + "balance_loss_mlp": 1.00480509, + "epoch": 0.24007214790320155, + "flos": 66708560540160.0, + "grad_norm": 0.7608078703276655, + "language_loss": 0.62952811, + "learning_rate": 3.5546554173981334e-06, + "loss": 0.64976496, + "num_input_tokens_seen": 85940990, + "step": 3993, + "time_per_iteration": 3.2339136600494385 + }, + { + "auxiliary_loss_clip": 0.01067251, + "auxiliary_loss_mlp": 0.01046327, + "balance_loss_clip": 1.03701258, + "balance_loss_mlp": 1.02899384, + "epoch": 0.24013227115586952, + "flos": 25809070757760.0, + "grad_norm": 1.5287433589653008, + "language_loss": 0.76471114, + "learning_rate": 3.5544103791819218e-06, + "loss": 0.78584695, + "num_input_tokens_seen": 85961165, + "step": 3994, + "time_per_iteration": 2.6806280612945557 + }, + { + "auxiliary_loss_clip": 0.01071703, + "auxiliary_loss_mlp": 0.01049266, + "balance_loss_clip": 1.03051734, + "balance_loss_mlp": 1.03122973, + "epoch": 0.2401923944085375, + "flos": 25557480921600.0, + "grad_norm": 1.9567609920113327, + "language_loss": 0.78346509, + "learning_rate": 3.5541652820223124e-06, + "loss": 0.80467474, + "num_input_tokens_seen": 85982710, + "step": 3995, + "time_per_iteration": 2.7320120334625244 + }, + { + "auxiliary_loss_clip": 0.00996983, + "auxiliary_loss_mlp": 0.01002747, + "balance_loss_clip": 1.00815535, + "balance_loss_mlp": 0.99950409, + "epoch": 0.24025251766120548, + "flos": 54941138478720.0, + "grad_norm": 0.9057892906123628, + "language_loss": 0.63418519, + "learning_rate": 3.5539201259286006e-06, + "loss": 0.65418243, + "num_input_tokens_seen": 86046935, + "step": 3996, + "time_per_iteration": 3.2903668880462646 + }, + { + "auxiliary_loss_clip": 0.01080952, + "auxiliary_loss_mlp": 0.01043609, + "balance_loss_clip": 1.03415561, + "balance_loss_mlp": 1.02683663, + "epoch": 0.24031264091387344, + "flos": 20631075724800.0, + "grad_norm": 5.943407288999388, + "language_loss": 0.69860935, + "learning_rate": 3.5536749109100808e-06, + "loss": 0.71985495, + "num_input_tokens_seen": 86064355, + "step": 3997, + "time_per_iteration": 2.605867624282837 + }, + { + "auxiliary_loss_clip": 0.01090048, + "auxiliary_loss_mlp": 0.0104491, + "balance_loss_clip": 1.03490448, + "balance_loss_mlp": 1.02782679, + "epoch": 0.2403727641665414, + "flos": 20886256920960.0, + "grad_norm": 1.94200593759851, + "language_loss": 0.87133896, + "learning_rate": 3.5534296369760535e-06, + "loss": 0.89268851, + "num_input_tokens_seen": 86081340, + "step": 3998, + "time_per_iteration": 2.641174077987671 + }, + { + "auxiliary_loss_clip": 0.01076234, + "auxiliary_loss_mlp": 0.01035816, + "balance_loss_clip": 1.02886426, + "balance_loss_mlp": 1.01968694, + "epoch": 0.24043288741920937, + "flos": 22820046451200.0, + "grad_norm": 1.5192496755676639, + "language_loss": 0.75950336, + "learning_rate": 3.5531843041358183e-06, + "loss": 0.78062391, + "num_input_tokens_seen": 86102260, + "step": 3999, + "time_per_iteration": 2.7782301902770996 + }, + { + "auxiliary_loss_clip": 0.01068966, + "auxiliary_loss_mlp": 0.01039693, + "balance_loss_clip": 1.03327966, + "balance_loss_mlp": 1.02338552, + "epoch": 0.24049301067187734, + "flos": 27959652823680.0, + "grad_norm": 1.720047685854952, + "language_loss": 0.71886039, + "learning_rate": 3.552938912398679e-06, + "loss": 0.73994702, + "num_input_tokens_seen": 86123400, + "step": 4000, + "time_per_iteration": 2.8030309677124023 + }, + { + "auxiliary_loss_clip": 0.010943, + "auxiliary_loss_mlp": 0.01037094, + "balance_loss_clip": 1.03726399, + "balance_loss_mlp": 1.0199635, + "epoch": 0.24055313392454533, + "flos": 27451409333760.0, + "grad_norm": 1.966330964328354, + "language_loss": 0.66699201, + "learning_rate": 3.5526934617739397e-06, + "loss": 0.68830591, + "num_input_tokens_seen": 86144060, + "step": 4001, + "time_per_iteration": 2.6555354595184326 + }, + { + "auxiliary_loss_clip": 0.0109898, + "auxiliary_loss_mlp": 0.01040864, + "balance_loss_clip": 1.03328323, + "balance_loss_mlp": 1.02391219, + "epoch": 0.2406132571772133, + "flos": 25556618995200.0, + "grad_norm": 1.6555188186442675, + "language_loss": 0.82896686, + "learning_rate": 3.5524479522709095e-06, + "loss": 0.85036528, + "num_input_tokens_seen": 86163005, + "step": 4002, + "time_per_iteration": 2.613917827606201 + }, + { + "auxiliary_loss_clip": 0.01063357, + "auxiliary_loss_mlp": 0.0103809, + "balance_loss_clip": 1.03398871, + "balance_loss_mlp": 1.02275991, + "epoch": 0.24067338042988126, + "flos": 24791398629120.0, + "grad_norm": 1.907372005407236, + "language_loss": 0.82721597, + "learning_rate": 3.552202383898897e-06, + "loss": 0.84823048, + "num_input_tokens_seen": 86182580, + "step": 4003, + "time_per_iteration": 2.753763437271118 + }, + { + "auxiliary_loss_clip": 0.0107021, + "auxiliary_loss_mlp": 0.01041093, + "balance_loss_clip": 1.03414786, + "balance_loss_mlp": 1.02390337, + "epoch": 0.24073350368254923, + "flos": 21177923356800.0, + "grad_norm": 1.9893017940540167, + "language_loss": 0.8701694, + "learning_rate": 3.551956756667215e-06, + "loss": 0.89128244, + "num_input_tokens_seen": 86200665, + "step": 4004, + "time_per_iteration": 5.961357831954956 + }, + { + "auxiliary_loss_clip": 0.01067279, + "auxiliary_loss_mlp": 0.01048491, + "balance_loss_clip": 1.03138459, + "balance_loss_mlp": 1.03201652, + "epoch": 0.2407936269352172, + "flos": 22494300986880.0, + "grad_norm": 2.029068659375744, + "language_loss": 0.781111, + "learning_rate": 3.551711070585177e-06, + "loss": 0.80226874, + "num_input_tokens_seen": 86221640, + "step": 4005, + "time_per_iteration": 2.8019657135009766 + }, + { + "auxiliary_loss_clip": 0.01041141, + "auxiliary_loss_mlp": 0.01035466, + "balance_loss_clip": 1.02910554, + "balance_loss_mlp": 1.01912224, + "epoch": 0.24085375018788516, + "flos": 18551129754240.0, + "grad_norm": 1.5225159696722124, + "language_loss": 0.79097003, + "learning_rate": 3.5514653256620995e-06, + "loss": 0.81173611, + "num_input_tokens_seen": 86240795, + "step": 4006, + "time_per_iteration": 2.7373528480529785 + }, + { + "auxiliary_loss_clip": 0.01080881, + "auxiliary_loss_mlp": 0.00749012, + "balance_loss_clip": 1.03482103, + "balance_loss_mlp": 1.00125015, + "epoch": 0.24091387344055312, + "flos": 24170539023360.0, + "grad_norm": 2.11887664121378, + "language_loss": 0.71320546, + "learning_rate": 3.551219521907302e-06, + "loss": 0.73150438, + "num_input_tokens_seen": 86262000, + "step": 4007, + "time_per_iteration": 2.719639301300049 + }, + { + "auxiliary_loss_clip": 0.01054563, + "auxiliary_loss_mlp": 0.01043588, + "balance_loss_clip": 1.03262842, + "balance_loss_mlp": 1.02820945, + "epoch": 0.24097399669322112, + "flos": 11036319615360.0, + "grad_norm": 2.542162204391899, + "language_loss": 0.75901484, + "learning_rate": 3.5509736593301042e-06, + "loss": 0.7799964, + "num_input_tokens_seen": 86279680, + "step": 4008, + "time_per_iteration": 2.635385751724243 + }, + { + "auxiliary_loss_clip": 0.01090261, + "auxiliary_loss_mlp": 0.01036971, + "balance_loss_clip": 1.03569484, + "balance_loss_mlp": 1.02117515, + "epoch": 0.24103411994588908, + "flos": 17165085696000.0, + "grad_norm": 2.471274236051246, + "language_loss": 0.74760616, + "learning_rate": 3.5507277379398295e-06, + "loss": 0.76887846, + "num_input_tokens_seen": 86297180, + "step": 4009, + "time_per_iteration": 2.5825068950653076 + }, + { + "auxiliary_loss_clip": 0.01090861, + "auxiliary_loss_mlp": 0.01043335, + "balance_loss_clip": 1.03775835, + "balance_loss_mlp": 1.02821898, + "epoch": 0.24109424319855705, + "flos": 20667956014080.0, + "grad_norm": 2.0557638305575314, + "language_loss": 0.80115825, + "learning_rate": 3.550481757745804e-06, + "loss": 0.82250023, + "num_input_tokens_seen": 86317660, + "step": 4010, + "time_per_iteration": 4.239682912826538 + }, + { + "auxiliary_loss_clip": 0.01079813, + "auxiliary_loss_mlp": 0.01052757, + "balance_loss_clip": 1.03494573, + "balance_loss_mlp": 1.0339818, + "epoch": 0.241154366451225, + "flos": 28181796485760.0, + "grad_norm": 1.934998679266715, + "language_loss": 0.708125, + "learning_rate": 3.5502357187573555e-06, + "loss": 0.7294507, + "num_input_tokens_seen": 86338325, + "step": 4011, + "time_per_iteration": 4.279928207397461 + }, + { + "auxiliary_loss_clip": 0.01021606, + "auxiliary_loss_mlp": 0.01045276, + "balance_loss_clip": 1.02995825, + "balance_loss_mlp": 1.02846777, + "epoch": 0.24121448970389298, + "flos": 21689722293120.0, + "grad_norm": 1.6839807652130723, + "language_loss": 0.69171411, + "learning_rate": 3.5499896209838118e-06, + "loss": 0.71238297, + "num_input_tokens_seen": 86357615, + "step": 4012, + "time_per_iteration": 2.7659449577331543 + }, + { + "auxiliary_loss_clip": 0.01090526, + "auxiliary_loss_mlp": 0.01040319, + "balance_loss_clip": 1.03406024, + "balance_loss_mlp": 1.02244925, + "epoch": 0.24127461295656094, + "flos": 39676191269760.0, + "grad_norm": 1.5662422452185176, + "language_loss": 0.73597562, + "learning_rate": 3.5497434644345073e-06, + "loss": 0.75728405, + "num_input_tokens_seen": 86380355, + "step": 4013, + "time_per_iteration": 2.7152187824249268 + }, + { + "auxiliary_loss_clip": 0.0110258, + "auxiliary_loss_mlp": 0.01036496, + "balance_loss_clip": 1.03684497, + "balance_loss_mlp": 1.02041411, + "epoch": 0.2413347362092289, + "flos": 19135863256320.0, + "grad_norm": 2.014437659243194, + "language_loss": 0.88266122, + "learning_rate": 3.5494972491187753e-06, + "loss": 0.90405196, + "num_input_tokens_seen": 86399125, + "step": 4014, + "time_per_iteration": 2.5276594161987305 + }, + { + "auxiliary_loss_clip": 0.01062083, + "auxiliary_loss_mlp": 0.01047667, + "balance_loss_clip": 1.02894926, + "balance_loss_mlp": 1.02973819, + "epoch": 0.2413948594618969, + "flos": 26939430829440.0, + "grad_norm": 2.099283275027565, + "language_loss": 0.94748449, + "learning_rate": 3.549250975045952e-06, + "loss": 0.96858203, + "num_input_tokens_seen": 86418625, + "step": 4015, + "time_per_iteration": 2.6321072578430176 + }, + { + "auxiliary_loss_clip": 0.01070961, + "auxiliary_loss_mlp": 0.01038483, + "balance_loss_clip": 1.03052139, + "balance_loss_mlp": 1.0220921, + "epoch": 0.24145498271456486, + "flos": 25228108183680.0, + "grad_norm": 2.4447538570219143, + "language_loss": 0.82260829, + "learning_rate": 3.5490046422253768e-06, + "loss": 0.84370273, + "num_input_tokens_seen": 86438375, + "step": 4016, + "time_per_iteration": 2.797612428665161 + }, + { + "auxiliary_loss_clip": 0.01056153, + "auxiliary_loss_mlp": 0.01039084, + "balance_loss_clip": 1.03119779, + "balance_loss_mlp": 1.02336061, + "epoch": 0.24151510596723283, + "flos": 40661759617920.0, + "grad_norm": 1.9384039755185067, + "language_loss": 0.68931186, + "learning_rate": 3.54875825066639e-06, + "loss": 0.71026421, + "num_input_tokens_seen": 86463230, + "step": 4017, + "time_per_iteration": 2.954058885574341 + }, + { + "auxiliary_loss_clip": 0.01092429, + "auxiliary_loss_mlp": 0.01048579, + "balance_loss_clip": 1.03443944, + "balance_loss_mlp": 1.03162766, + "epoch": 0.2415752292199008, + "flos": 18146667634560.0, + "grad_norm": 1.5742084725443735, + "language_loss": 0.84634, + "learning_rate": 3.5485118003783353e-06, + "loss": 0.86775005, + "num_input_tokens_seen": 86481230, + "step": 4018, + "time_per_iteration": 2.8150477409362793 + }, + { + "auxiliary_loss_clip": 0.0101492, + "auxiliary_loss_mlp": 0.01003758, + "balance_loss_clip": 1.00693893, + "balance_loss_mlp": 1.00098038, + "epoch": 0.24163535247256876, + "flos": 67288409792640.0, + "grad_norm": 0.830962033183201, + "language_loss": 0.60683507, + "learning_rate": 3.548265291370558e-06, + "loss": 0.62702185, + "num_input_tokens_seen": 86541260, + "step": 4019, + "time_per_iteration": 3.252655267715454 + }, + { + "auxiliary_loss_clip": 0.01067662, + "auxiliary_loss_mlp": 0.01042658, + "balance_loss_clip": 1.03079331, + "balance_loss_mlp": 1.02695251, + "epoch": 0.24169547572523672, + "flos": 24929941386240.0, + "grad_norm": 1.6958330932022827, + "language_loss": 0.73237288, + "learning_rate": 3.5480187236524055e-06, + "loss": 0.75347602, + "num_input_tokens_seen": 86559580, + "step": 4020, + "time_per_iteration": 2.7043192386627197 + }, + { + "auxiliary_loss_clip": 0.01066259, + "auxiliary_loss_mlp": 0.01041421, + "balance_loss_clip": 1.03303659, + "balance_loss_mlp": 1.02585232, + "epoch": 0.24175559897790472, + "flos": 18728312567040.0, + "grad_norm": 2.2980088773998713, + "language_loss": 0.81788969, + "learning_rate": 3.5477720972332285e-06, + "loss": 0.83896655, + "num_input_tokens_seen": 86577560, + "step": 4021, + "time_per_iteration": 2.639955997467041 + }, + { + "auxiliary_loss_clip": 0.01104031, + "auxiliary_loss_mlp": 0.01051674, + "balance_loss_clip": 1.03647792, + "balance_loss_mlp": 1.03399515, + "epoch": 0.24181572223057268, + "flos": 23039281111680.0, + "grad_norm": 3.255972967130634, + "language_loss": 0.76484567, + "learning_rate": 3.547525412122378e-06, + "loss": 0.7864027, + "num_input_tokens_seen": 86595350, + "step": 4022, + "time_per_iteration": 2.629298448562622 + }, + { + "auxiliary_loss_clip": 0.01053126, + "auxiliary_loss_mlp": 0.01049144, + "balance_loss_clip": 1.02868414, + "balance_loss_mlp": 1.03008258, + "epoch": 0.24187584548324065, + "flos": 20376145923840.0, + "grad_norm": 2.228144433550213, + "language_loss": 0.75471783, + "learning_rate": 3.5472786683292083e-06, + "loss": 0.7757405, + "num_input_tokens_seen": 86614805, + "step": 4023, + "time_per_iteration": 2.7667248249053955 + }, + { + "auxiliary_loss_clip": 0.01076901, + "auxiliary_loss_mlp": 0.01052608, + "balance_loss_clip": 1.03461039, + "balance_loss_mlp": 1.03671741, + "epoch": 0.2419359687359086, + "flos": 21397517153280.0, + "grad_norm": 1.908076603323115, + "language_loss": 0.82498991, + "learning_rate": 3.5470318658630766e-06, + "loss": 0.84628499, + "num_input_tokens_seen": 86633700, + "step": 4024, + "time_per_iteration": 2.6909892559051514 + }, + { + "auxiliary_loss_clip": 0.01085931, + "auxiliary_loss_mlp": 0.01043557, + "balance_loss_clip": 1.03341091, + "balance_loss_mlp": 1.02753508, + "epoch": 0.24199609198857658, + "flos": 18369385914240.0, + "grad_norm": 1.981190709321454, + "language_loss": 0.86316371, + "learning_rate": 3.5467850047333424e-06, + "loss": 0.88445854, + "num_input_tokens_seen": 86650905, + "step": 4025, + "time_per_iteration": 2.562662124633789 + }, + { + "auxiliary_loss_clip": 0.01046733, + "auxiliary_loss_mlp": 0.01057499, + "balance_loss_clip": 1.03051591, + "balance_loss_mlp": 1.0388788, + "epoch": 0.24205621524124454, + "flos": 19463871277440.0, + "grad_norm": 1.7857644771261951, + "language_loss": 0.713148, + "learning_rate": 3.546538084949365e-06, + "loss": 0.73419034, + "num_input_tokens_seen": 86669185, + "step": 4026, + "time_per_iteration": 2.8944311141967773 + }, + { + "auxiliary_loss_clip": 0.01090653, + "auxiliary_loss_mlp": 0.01046833, + "balance_loss_clip": 1.03664279, + "balance_loss_mlp": 1.03153801, + "epoch": 0.2421163384939125, + "flos": 14976330451200.0, + "grad_norm": 2.578385762133411, + "language_loss": 0.64573753, + "learning_rate": 3.546291106520509e-06, + "loss": 0.66711235, + "num_input_tokens_seen": 86686805, + "step": 4027, + "time_per_iteration": 2.7558867931365967 + }, + { + "auxiliary_loss_clip": 0.01091318, + "auxiliary_loss_mlp": 0.00748845, + "balance_loss_clip": 1.03736901, + "balance_loss_mlp": 1.00133049, + "epoch": 0.2421764617465805, + "flos": 18662057930880.0, + "grad_norm": 2.014316248638377, + "language_loss": 0.70567214, + "learning_rate": 3.5460440694561388e-06, + "loss": 0.72407377, + "num_input_tokens_seen": 86705520, + "step": 4028, + "time_per_iteration": 2.7505640983581543 + }, + { + "auxiliary_loss_clip": 0.0101424, + "auxiliary_loss_mlp": 0.01013785, + "balance_loss_clip": 1.00644779, + "balance_loss_mlp": 1.01143706, + "epoch": 0.24223658499924847, + "flos": 64347327164160.0, + "grad_norm": 0.8745894593665072, + "language_loss": 0.55362624, + "learning_rate": 3.545796973765623e-06, + "loss": 0.57390648, + "num_input_tokens_seen": 86767320, + "step": 4029, + "time_per_iteration": 3.213508129119873 + }, + { + "auxiliary_loss_clip": 0.0108427, + "auxiliary_loss_mlp": 0.01038521, + "balance_loss_clip": 1.03251791, + "balance_loss_mlp": 1.0217129, + "epoch": 0.24229670825191643, + "flos": 25775243124480.0, + "grad_norm": 1.542131868439749, + "language_loss": 0.73931718, + "learning_rate": 3.54554981945833e-06, + "loss": 0.76054507, + "num_input_tokens_seen": 86788110, + "step": 4030, + "time_per_iteration": 2.8234763145446777 + }, + { + "auxiliary_loss_clip": 0.01098704, + "auxiliary_loss_mlp": 0.01048736, + "balance_loss_clip": 1.03459358, + "balance_loss_mlp": 1.03272676, + "epoch": 0.2423568315045844, + "flos": 20667094087680.0, + "grad_norm": 6.972108028647074, + "language_loss": 0.76812857, + "learning_rate": 3.5453026065436343e-06, + "loss": 0.78960299, + "num_input_tokens_seen": 86807640, + "step": 4031, + "time_per_iteration": 2.7473604679107666 + }, + { + "auxiliary_loss_clip": 0.01082806, + "auxiliary_loss_mlp": 0.00749024, + "balance_loss_clip": 1.03334475, + "balance_loss_mlp": 1.00153661, + "epoch": 0.24241695475725236, + "flos": 22416805393920.0, + "grad_norm": 1.8995085220652346, + "language_loss": 0.65629387, + "learning_rate": 3.5450553350309083e-06, + "loss": 0.67461222, + "num_input_tokens_seen": 86826795, + "step": 4032, + "time_per_iteration": 2.6405627727508545 + }, + { + "auxiliary_loss_clip": 0.0108572, + "auxiliary_loss_mlp": 0.01041779, + "balance_loss_clip": 1.03258467, + "balance_loss_mlp": 1.02559662, + "epoch": 0.24247707800992033, + "flos": 17128995505920.0, + "grad_norm": 2.6593716971648957, + "language_loss": 0.81378472, + "learning_rate": 3.5448080049295286e-06, + "loss": 0.8350597, + "num_input_tokens_seen": 86843175, + "step": 4033, + "time_per_iteration": 2.6171152591705322 + }, + { + "auxiliary_loss_clip": 0.01045245, + "auxiliary_loss_mlp": 0.01040309, + "balance_loss_clip": 1.0254333, + "balance_loss_mlp": 1.0240252, + "epoch": 0.2425372012625883, + "flos": 31613743399680.0, + "grad_norm": 2.0593111924817444, + "language_loss": 0.69067395, + "learning_rate": 3.5445606162488754e-06, + "loss": 0.71152949, + "num_input_tokens_seen": 86863185, + "step": 4034, + "time_per_iteration": 2.7018861770629883 + }, + { + "auxiliary_loss_clip": 0.01081823, + "auxiliary_loss_mlp": 0.01034476, + "balance_loss_clip": 1.03496146, + "balance_loss_mlp": 1.01769125, + "epoch": 0.24259732451525629, + "flos": 16326032924160.0, + "grad_norm": 2.0232203748495574, + "language_loss": 0.96201819, + "learning_rate": 3.5443131689983283e-06, + "loss": 0.98318112, + "num_input_tokens_seen": 86880040, + "step": 4035, + "time_per_iteration": 2.7563395500183105 + }, + { + "auxiliary_loss_clip": 0.01069555, + "auxiliary_loss_mlp": 0.01044413, + "balance_loss_clip": 1.0305388, + "balance_loss_mlp": 1.02997649, + "epoch": 0.24265744776792425, + "flos": 22856639431680.0, + "grad_norm": 2.094563401603495, + "language_loss": 0.77682292, + "learning_rate": 3.5440656631872715e-06, + "loss": 0.79796261, + "num_input_tokens_seen": 86900610, + "step": 4036, + "time_per_iteration": 2.735811471939087 + }, + { + "auxiliary_loss_clip": 0.01091357, + "auxiliary_loss_mlp": 0.01040496, + "balance_loss_clip": 1.0352509, + "balance_loss_mlp": 1.02449763, + "epoch": 0.24271757102059222, + "flos": 21871573873920.0, + "grad_norm": 1.6523012954828622, + "language_loss": 0.74105155, + "learning_rate": 3.5438180988250898e-06, + "loss": 0.76237011, + "num_input_tokens_seen": 86919385, + "step": 4037, + "time_per_iteration": 2.7248570919036865 + }, + { + "auxiliary_loss_clip": 0.01045937, + "auxiliary_loss_mlp": 0.01045712, + "balance_loss_clip": 1.02652264, + "balance_loss_mlp": 1.02829576, + "epoch": 0.24277769427326018, + "flos": 19208582340480.0, + "grad_norm": 2.698834452810587, + "language_loss": 0.76673454, + "learning_rate": 3.543570475921171e-06, + "loss": 0.787651, + "num_input_tokens_seen": 86938885, + "step": 4038, + "time_per_iteration": 2.8368594646453857 + }, + { + "auxiliary_loss_clip": 0.01091157, + "auxiliary_loss_mlp": 0.01044099, + "balance_loss_clip": 1.03677928, + "balance_loss_mlp": 1.02647972, + "epoch": 0.24283781752592815, + "flos": 19499889640320.0, + "grad_norm": 1.9119096184063127, + "language_loss": 0.72286856, + "learning_rate": 3.543322794484905e-06, + "loss": 0.74422109, + "num_input_tokens_seen": 86957705, + "step": 4039, + "time_per_iteration": 2.7068357467651367 + }, + { + "auxiliary_loss_clip": 0.0107807, + "auxiliary_loss_mlp": 0.01046508, + "balance_loss_clip": 1.03092813, + "balance_loss_mlp": 1.0293541, + "epoch": 0.2428979407785961, + "flos": 19902196944000.0, + "grad_norm": 2.9426758387988867, + "language_loss": 0.78357589, + "learning_rate": 3.5430750545256843e-06, + "loss": 0.80482167, + "num_input_tokens_seen": 86975845, + "step": 4040, + "time_per_iteration": 2.852123498916626 + }, + { + "auxiliary_loss_clip": 0.01049301, + "auxiliary_loss_mlp": 0.01034502, + "balance_loss_clip": 1.03006756, + "balance_loss_mlp": 1.02040601, + "epoch": 0.2429580640312641, + "flos": 24715878284160.0, + "grad_norm": 1.8236852535018486, + "language_loss": 0.80567348, + "learning_rate": 3.5428272560529027e-06, + "loss": 0.8265115, + "num_input_tokens_seen": 86994800, + "step": 4041, + "time_per_iteration": 2.9039220809936523 + }, + { + "auxiliary_loss_clip": 0.01058527, + "auxiliary_loss_mlp": 0.01046739, + "balance_loss_clip": 1.02994049, + "balance_loss_mlp": 1.03061032, + "epoch": 0.24301818728393207, + "flos": 25630343660160.0, + "grad_norm": 2.2084507554782675, + "language_loss": 0.76532066, + "learning_rate": 3.542579399075957e-06, + "loss": 0.78637332, + "num_input_tokens_seen": 87016845, + "step": 4042, + "time_per_iteration": 2.8491597175598145 + }, + { + "auxiliary_loss_clip": 0.01013631, + "auxiliary_loss_mlp": 0.01033772, + "balance_loss_clip": 1.02912462, + "balance_loss_mlp": 1.01919317, + "epoch": 0.24307831053660003, + "flos": 26141388410880.0, + "grad_norm": 3.7525197797083343, + "language_loss": 0.8152073, + "learning_rate": 3.542331483604246e-06, + "loss": 0.83568132, + "num_input_tokens_seen": 87036270, + "step": 4043, + "time_per_iteration": 3.0045440196990967 + }, + { + "auxiliary_loss_clip": 0.010772, + "auxiliary_loss_mlp": 0.0103697, + "balance_loss_clip": 1.02950883, + "balance_loss_mlp": 1.02019715, + "epoch": 0.243138433789268, + "flos": 14972415868800.0, + "grad_norm": 2.12295784853546, + "language_loss": 0.72899032, + "learning_rate": 3.5420835096471706e-06, + "loss": 0.75013196, + "num_input_tokens_seen": 87049920, + "step": 4044, + "time_per_iteration": 2.714925765991211 + }, + { + "auxiliary_loss_clip": 0.01088934, + "auxiliary_loss_mlp": 0.01039701, + "balance_loss_clip": 1.03495061, + "balance_loss_mlp": 1.02369118, + "epoch": 0.24319855704193596, + "flos": 25191694771200.0, + "grad_norm": 1.816706364423925, + "language_loss": 0.83379447, + "learning_rate": 3.5418354772141337e-06, + "loss": 0.85508084, + "num_input_tokens_seen": 87068230, + "step": 4045, + "time_per_iteration": 2.695220470428467 + }, + { + "auxiliary_loss_clip": 0.01035557, + "auxiliary_loss_mlp": 0.01046028, + "balance_loss_clip": 1.03334618, + "balance_loss_mlp": 1.03082311, + "epoch": 0.24325868029460393, + "flos": 22127221946880.0, + "grad_norm": 1.5604997499328523, + "language_loss": 0.86471564, + "learning_rate": 3.541587386314541e-06, + "loss": 0.88553149, + "num_input_tokens_seen": 87086435, + "step": 4046, + "time_per_iteration": 2.8475351333618164 + }, + { + "auxiliary_loss_clip": 0.01072618, + "auxiliary_loss_mlp": 0.010404, + "balance_loss_clip": 1.03032827, + "balance_loss_mlp": 1.02386022, + "epoch": 0.2433188035472719, + "flos": 23582106420480.0, + "grad_norm": 1.7171098669734877, + "language_loss": 0.73138922, + "learning_rate": 3.5413392369578e-06, + "loss": 0.75251943, + "num_input_tokens_seen": 87105340, + "step": 4047, + "time_per_iteration": 2.698923349380493 + }, + { + "auxiliary_loss_clip": 0.01074584, + "auxiliary_loss_mlp": 0.01039559, + "balance_loss_clip": 1.03029728, + "balance_loss_mlp": 1.02203548, + "epoch": 0.2433789267999399, + "flos": 24462815990400.0, + "grad_norm": 3.52650888410499, + "language_loss": 0.73289979, + "learning_rate": 3.5410910291533213e-06, + "loss": 0.75404119, + "num_input_tokens_seen": 87125780, + "step": 4048, + "time_per_iteration": 2.570957660675049 + }, + { + "auxiliary_loss_clip": 0.01067643, + "auxiliary_loss_mlp": 0.01042356, + "balance_loss_clip": 1.03442144, + "balance_loss_mlp": 1.02804458, + "epoch": 0.24343905005260785, + "flos": 16727909264640.0, + "grad_norm": 2.1582664880442253, + "language_loss": 0.73270488, + "learning_rate": 3.5408427629105155e-06, + "loss": 0.7538048, + "num_input_tokens_seen": 87144470, + "step": 4049, + "time_per_iteration": 2.626147508621216 + }, + { + "auxiliary_loss_clip": 0.0104782, + "auxiliary_loss_mlp": 0.01043132, + "balance_loss_clip": 1.02806282, + "balance_loss_mlp": 1.02787304, + "epoch": 0.24349917330527582, + "flos": 20043756443520.0, + "grad_norm": 1.7359621623651809, + "language_loss": 0.73502535, + "learning_rate": 3.5405944382387985e-06, + "loss": 0.75593483, + "num_input_tokens_seen": 87162830, + "step": 4050, + "time_per_iteration": 4.2016377449035645 + }, + { + "auxiliary_loss_clip": 0.01071311, + "auxiliary_loss_mlp": 0.01037614, + "balance_loss_clip": 1.03025997, + "balance_loss_mlp": 1.02345753, + "epoch": 0.24355929655794378, + "flos": 17420554200960.0, + "grad_norm": 2.304198969337667, + "language_loss": 0.75131655, + "learning_rate": 3.5403460551475854e-06, + "loss": 0.7724058, + "num_input_tokens_seen": 87180905, + "step": 4051, + "time_per_iteration": 4.322691440582275 + }, + { + "auxiliary_loss_clip": 0.01045996, + "auxiliary_loss_mlp": 0.01044429, + "balance_loss_clip": 1.02868366, + "balance_loss_mlp": 1.02882457, + "epoch": 0.24361941981061175, + "flos": 25410929431680.0, + "grad_norm": 2.030371340478379, + "language_loss": 0.70296741, + "learning_rate": 3.540097613646296e-06, + "loss": 0.72387171, + "num_input_tokens_seen": 87202290, + "step": 4052, + "time_per_iteration": 2.8123087882995605 + }, + { + "auxiliary_loss_clip": 0.01076487, + "auxiliary_loss_mlp": 0.01047035, + "balance_loss_clip": 1.03551638, + "balance_loss_mlp": 1.03124595, + "epoch": 0.2436795430632797, + "flos": 22820800636800.0, + "grad_norm": 1.5969094214998725, + "language_loss": 0.80876207, + "learning_rate": 3.539849113744351e-06, + "loss": 0.8299973, + "num_input_tokens_seen": 87221650, + "step": 4053, + "time_per_iteration": 2.804704427719116 + }, + { + "auxiliary_loss_clip": 0.01098204, + "auxiliary_loss_mlp": 0.01034196, + "balance_loss_clip": 1.03450871, + "balance_loss_mlp": 1.01879442, + "epoch": 0.2437396663159477, + "flos": 15157786982400.0, + "grad_norm": 1.7257824799827888, + "language_loss": 0.7770614, + "learning_rate": 3.539600555451172e-06, + "loss": 0.79838538, + "num_input_tokens_seen": 87238515, + "step": 4054, + "time_per_iteration": 2.7940022945404053 + }, + { + "auxiliary_loss_clip": 0.01041933, + "auxiliary_loss_mlp": 0.01047214, + "balance_loss_clip": 1.02450573, + "balance_loss_mlp": 1.03201437, + "epoch": 0.24379978956861567, + "flos": 22091131756800.0, + "grad_norm": 1.6490100855646366, + "language_loss": 0.83867383, + "learning_rate": 3.5393519387761866e-06, + "loss": 0.85956526, + "num_input_tokens_seen": 87256290, + "step": 4055, + "time_per_iteration": 2.7835793495178223 + }, + { + "auxiliary_loss_clip": 0.01061564, + "auxiliary_loss_mlp": 0.01038858, + "balance_loss_clip": 1.02874374, + "balance_loss_mlp": 1.02278888, + "epoch": 0.24385991282128364, + "flos": 31467766527360.0, + "grad_norm": 2.360760586834523, + "language_loss": 0.54706395, + "learning_rate": 3.5391032637288217e-06, + "loss": 0.56806821, + "num_input_tokens_seen": 87277085, + "step": 4056, + "time_per_iteration": 2.748476982116699 + }, + { + "auxiliary_loss_clip": 0.01086704, + "auxiliary_loss_mlp": 0.01045071, + "balance_loss_clip": 1.03176594, + "balance_loss_mlp": 1.02870393, + "epoch": 0.2439200360739516, + "flos": 23838795987840.0, + "grad_norm": 2.5559552304201616, + "language_loss": 0.80515176, + "learning_rate": 3.538854530318506e-06, + "loss": 0.82646954, + "num_input_tokens_seen": 87293020, + "step": 4057, + "time_per_iteration": 4.226075649261475 + }, + { + "auxiliary_loss_clip": 0.01083636, + "auxiliary_loss_mlp": 0.01039845, + "balance_loss_clip": 1.0318284, + "balance_loss_mlp": 1.02494359, + "epoch": 0.24398015932661957, + "flos": 19169978198400.0, + "grad_norm": 1.8425206983790885, + "language_loss": 0.79398191, + "learning_rate": 3.538605738554673e-06, + "loss": 0.81521672, + "num_input_tokens_seen": 87311445, + "step": 4058, + "time_per_iteration": 4.155444383621216 + }, + { + "auxiliary_loss_clip": 0.01097471, + "auxiliary_loss_mlp": 0.01042478, + "balance_loss_clip": 1.03238237, + "balance_loss_mlp": 1.02840519, + "epoch": 0.24404028257928753, + "flos": 25262474520960.0, + "grad_norm": 1.5639533154239171, + "language_loss": 0.85557991, + "learning_rate": 3.538356888446756e-06, + "loss": 0.87697941, + "num_input_tokens_seen": 87332055, + "step": 4059, + "time_per_iteration": 2.5726613998413086 + }, + { + "auxiliary_loss_clip": 0.01084911, + "auxiliary_loss_mlp": 0.01043713, + "balance_loss_clip": 1.03653526, + "balance_loss_mlp": 1.02906179, + "epoch": 0.2441004058319555, + "flos": 26467600752000.0, + "grad_norm": 1.5395706847168842, + "language_loss": 0.74213505, + "learning_rate": 3.5381079800041913e-06, + "loss": 0.7634213, + "num_input_tokens_seen": 87351295, + "step": 4060, + "time_per_iteration": 2.6175496578216553 + }, + { + "auxiliary_loss_clip": 0.01068566, + "auxiliary_loss_mlp": 0.0104388, + "balance_loss_clip": 1.03197312, + "balance_loss_mlp": 1.02610612, + "epoch": 0.2441605290846235, + "flos": 26760524163840.0, + "grad_norm": 1.833444565869857, + "language_loss": 0.73085058, + "learning_rate": 3.5378590132364182e-06, + "loss": 0.75197506, + "num_input_tokens_seen": 87370650, + "step": 4061, + "time_per_iteration": 2.722365140914917 + }, + { + "auxiliary_loss_clip": 0.0109406, + "auxiliary_loss_mlp": 0.01038137, + "balance_loss_clip": 1.03304875, + "balance_loss_mlp": 1.02469015, + "epoch": 0.24422065233729146, + "flos": 21105850717440.0, + "grad_norm": 2.0284865668381467, + "language_loss": 0.75763786, + "learning_rate": 3.5376099881528768e-06, + "loss": 0.77895987, + "num_input_tokens_seen": 87389020, + "step": 4062, + "time_per_iteration": 2.632327079772949 + }, + { + "auxiliary_loss_clip": 0.01059343, + "auxiliary_loss_mlp": 0.01035492, + "balance_loss_clip": 1.03114617, + "balance_loss_mlp": 1.02093625, + "epoch": 0.24428077558995942, + "flos": 25263156879360.0, + "grad_norm": 6.195833218195549, + "language_loss": 0.85121447, + "learning_rate": 3.537360904763011e-06, + "loss": 0.87216282, + "num_input_tokens_seen": 87409695, + "step": 4063, + "time_per_iteration": 2.671229839324951 + }, + { + "auxiliary_loss_clip": 0.010677, + "auxiliary_loss_mlp": 0.01036954, + "balance_loss_clip": 1.03074312, + "balance_loss_mlp": 1.02062225, + "epoch": 0.24434089884262739, + "flos": 20485278420480.0, + "grad_norm": 7.782382728101178, + "language_loss": 0.67907768, + "learning_rate": 3.5371117630762656e-06, + "loss": 0.7001242, + "num_input_tokens_seen": 87428250, + "step": 4064, + "time_per_iteration": 2.9384028911590576 + }, + { + "auxiliary_loss_clip": 0.01087411, + "auxiliary_loss_mlp": 0.01038349, + "balance_loss_clip": 1.0314312, + "balance_loss_mlp": 1.02244687, + "epoch": 0.24440102209529535, + "flos": 23621895711360.0, + "grad_norm": 1.4337093564777945, + "language_loss": 0.69945908, + "learning_rate": 3.536862563102088e-06, + "loss": 0.72071671, + "num_input_tokens_seen": 87449380, + "step": 4065, + "time_per_iteration": 2.9587960243225098 + }, + { + "auxiliary_loss_clip": 0.0110002, + "auxiliary_loss_mlp": 0.01047875, + "balance_loss_clip": 1.03403449, + "balance_loss_mlp": 1.03072047, + "epoch": 0.24446114534796332, + "flos": 20554729367040.0, + "grad_norm": 1.796312943059901, + "language_loss": 0.83801633, + "learning_rate": 3.5366133048499282e-06, + "loss": 0.85949522, + "num_input_tokens_seen": 87465365, + "step": 4066, + "time_per_iteration": 2.639923095703125 + }, + { + "auxiliary_loss_clip": 0.01020375, + "auxiliary_loss_mlp": 0.01006101, + "balance_loss_clip": 1.00433064, + "balance_loss_mlp": 1.00383651, + "epoch": 0.24452126860063128, + "flos": 60389575009920.0, + "grad_norm": 0.7471674384530729, + "language_loss": 0.52284813, + "learning_rate": 3.5363639883292374e-06, + "loss": 0.54311287, + "num_input_tokens_seen": 87522525, + "step": 4067, + "time_per_iteration": 3.1348493099212646 + }, + { + "auxiliary_loss_clip": 0.01075398, + "auxiliary_loss_mlp": 0.01044803, + "balance_loss_clip": 1.03328133, + "balance_loss_mlp": 1.02916837, + "epoch": 0.24458139185329927, + "flos": 15121660878720.0, + "grad_norm": 3.3101231168514875, + "language_loss": 0.72364318, + "learning_rate": 3.5361146135494706e-06, + "loss": 0.74484515, + "num_input_tokens_seen": 87539170, + "step": 4068, + "time_per_iteration": 2.8209288120269775 + }, + { + "auxiliary_loss_clip": 0.01039898, + "auxiliary_loss_mlp": 0.01043328, + "balance_loss_clip": 1.029737, + "balance_loss_mlp": 1.02780724, + "epoch": 0.24464151510596724, + "flos": 27998723842560.0, + "grad_norm": 1.5984649746969408, + "language_loss": 0.77725846, + "learning_rate": 3.5358651805200835e-06, + "loss": 0.7980907, + "num_input_tokens_seen": 87558875, + "step": 4069, + "time_per_iteration": 2.752119779586792 + }, + { + "auxiliary_loss_clip": 0.01074464, + "auxiliary_loss_mlp": 0.01039215, + "balance_loss_clip": 1.03479636, + "balance_loss_mlp": 1.02331805, + "epoch": 0.2447016383586352, + "flos": 19792884879360.0, + "grad_norm": 2.6384068698748013, + "language_loss": 0.8005079, + "learning_rate": 3.5356156892505347e-06, + "loss": 0.82164466, + "num_input_tokens_seen": 87576485, + "step": 4070, + "time_per_iteration": 2.6489312648773193 + }, + { + "auxiliary_loss_clip": 0.01069058, + "auxiliary_loss_mlp": 0.0104596, + "balance_loss_clip": 1.02804756, + "balance_loss_mlp": 1.03103518, + "epoch": 0.24476176161130317, + "flos": 26067340523520.0, + "grad_norm": 1.6070649135318058, + "language_loss": 0.84112728, + "learning_rate": 3.5353661397502854e-06, + "loss": 0.86227751, + "num_input_tokens_seen": 87598620, + "step": 4071, + "time_per_iteration": 2.675706386566162 + }, + { + "auxiliary_loss_clip": 0.01069457, + "auxiliary_loss_mlp": 0.01053827, + "balance_loss_clip": 1.030182, + "balance_loss_mlp": 1.03549242, + "epoch": 0.24482188486397113, + "flos": 18843550375680.0, + "grad_norm": 1.8011864983226002, + "language_loss": 0.79924452, + "learning_rate": 3.535116532028798e-06, + "loss": 0.82047737, + "num_input_tokens_seen": 87616595, + "step": 4072, + "time_per_iteration": 2.5824429988861084 + }, + { + "auxiliary_loss_clip": 0.01084498, + "auxiliary_loss_mlp": 0.01040049, + "balance_loss_clip": 1.03336847, + "balance_loss_mlp": 1.02575612, + "epoch": 0.2448820081166391, + "flos": 21251791676160.0, + "grad_norm": 2.453932153151477, + "language_loss": 0.70301062, + "learning_rate": 3.5348668660955382e-06, + "loss": 0.72425616, + "num_input_tokens_seen": 87635755, + "step": 4073, + "time_per_iteration": 2.6045444011688232 + }, + { + "auxiliary_loss_clip": 0.01059593, + "auxiliary_loss_mlp": 0.01042563, + "balance_loss_clip": 1.02968001, + "balance_loss_mlp": 1.02810884, + "epoch": 0.2449421313693071, + "flos": 23950586090880.0, + "grad_norm": 2.1593008552653394, + "language_loss": 0.67165983, + "learning_rate": 3.5346171419599728e-06, + "loss": 0.69268131, + "num_input_tokens_seen": 87652885, + "step": 4074, + "time_per_iteration": 2.5977659225463867 + }, + { + "auxiliary_loss_clip": 0.01018809, + "auxiliary_loss_mlp": 0.01007336, + "balance_loss_clip": 1.003178, + "balance_loss_mlp": 1.00505888, + "epoch": 0.24500225462197506, + "flos": 60687669980160.0, + "grad_norm": 0.9045691767886899, + "language_loss": 0.68718559, + "learning_rate": 3.5343673596315718e-06, + "loss": 0.70744705, + "num_input_tokens_seen": 87713220, + "step": 4075, + "time_per_iteration": 3.2435669898986816 + }, + { + "auxiliary_loss_clip": 0.0109524, + "auxiliary_loss_mlp": 0.01038952, + "balance_loss_clip": 1.03392887, + "balance_loss_mlp": 1.02441418, + "epoch": 0.24506237787464302, + "flos": 26284204886400.0, + "grad_norm": 2.3513756847363823, + "language_loss": 0.79402608, + "learning_rate": 3.5341175191198063e-06, + "loss": 0.815368, + "num_input_tokens_seen": 87732680, + "step": 4076, + "time_per_iteration": 2.6201460361480713 + }, + { + "auxiliary_loss_clip": 0.01076065, + "auxiliary_loss_mlp": 0.00749024, + "balance_loss_clip": 1.03174651, + "balance_loss_mlp": 1.00167167, + "epoch": 0.245122501127311, + "flos": 20552287242240.0, + "grad_norm": 1.8231094734482907, + "language_loss": 0.8217957, + "learning_rate": 3.533867620434151e-06, + "loss": 0.84004653, + "num_input_tokens_seen": 87751880, + "step": 4077, + "time_per_iteration": 2.7369658946990967 + }, + { + "auxiliary_loss_clip": 0.0109737, + "auxiliary_loss_mlp": 0.01041337, + "balance_loss_clip": 1.0337491, + "balance_loss_mlp": 1.02507639, + "epoch": 0.24518262437997895, + "flos": 29132603447040.0, + "grad_norm": 2.550955290020612, + "language_loss": 0.62525749, + "learning_rate": 3.533617663584082e-06, + "loss": 0.64664453, + "num_input_tokens_seen": 87771795, + "step": 4078, + "time_per_iteration": 2.608107566833496 + }, + { + "auxiliary_loss_clip": 0.01074487, + "auxiliary_loss_mlp": 0.01037453, + "balance_loss_clip": 1.03675127, + "balance_loss_mlp": 1.02264714, + "epoch": 0.24524274763264692, + "flos": 23476924419840.0, + "grad_norm": 1.4900206548599595, + "language_loss": 0.75618935, + "learning_rate": 3.5333676485790765e-06, + "loss": 0.77730864, + "num_input_tokens_seen": 87793640, + "step": 4079, + "time_per_iteration": 2.7708661556243896 + }, + { + "auxiliary_loss_clip": 0.01092249, + "auxiliary_loss_mlp": 0.01040654, + "balance_loss_clip": 1.03153253, + "balance_loss_mlp": 1.02456033, + "epoch": 0.24530287088531488, + "flos": 17201175886080.0, + "grad_norm": 1.9447627475450748, + "language_loss": 0.74788618, + "learning_rate": 3.5331175754286173e-06, + "loss": 0.76921523, + "num_input_tokens_seen": 87812390, + "step": 4080, + "time_per_iteration": 2.5570764541625977 + }, + { + "auxiliary_loss_clip": 0.01070073, + "auxiliary_loss_mlp": 0.01039324, + "balance_loss_clip": 1.03046942, + "balance_loss_mlp": 1.02525091, + "epoch": 0.24536299413798288, + "flos": 14867449349760.0, + "grad_norm": 1.8119960288144012, + "language_loss": 0.82731503, + "learning_rate": 3.532867444142186e-06, + "loss": 0.848409, + "num_input_tokens_seen": 87830640, + "step": 4081, + "time_per_iteration": 2.7862703800201416 + }, + { + "auxiliary_loss_clip": 0.01069031, + "auxiliary_loss_mlp": 0.01037583, + "balance_loss_clip": 1.03076029, + "balance_loss_mlp": 1.0233314, + "epoch": 0.24542311739065084, + "flos": 35262051886080.0, + "grad_norm": 2.4230400619952372, + "language_loss": 0.73506749, + "learning_rate": 3.532617254729267e-06, + "loss": 0.75613356, + "num_input_tokens_seen": 87850450, + "step": 4082, + "time_per_iteration": 2.8977739810943604 + }, + { + "auxiliary_loss_clip": 0.01055144, + "auxiliary_loss_mlp": 0.01040445, + "balance_loss_clip": 1.02903962, + "balance_loss_mlp": 1.0274508, + "epoch": 0.2454832406433188, + "flos": 21503130117120.0, + "grad_norm": 1.6180341265896292, + "language_loss": 0.7160809, + "learning_rate": 3.5323670071993485e-06, + "loss": 0.73703682, + "num_input_tokens_seen": 87868810, + "step": 4083, + "time_per_iteration": 2.957594633102417 + }, + { + "auxiliary_loss_clip": 0.01064371, + "auxiliary_loss_mlp": 0.01039973, + "balance_loss_clip": 1.02890444, + "balance_loss_mlp": 1.02372479, + "epoch": 0.24554336389598677, + "flos": 14756664827520.0, + "grad_norm": 2.1655043740348217, + "language_loss": 0.74446142, + "learning_rate": 3.532116701561919e-06, + "loss": 0.76550496, + "num_input_tokens_seen": 87885685, + "step": 4084, + "time_per_iteration": 2.8244073390960693 + }, + { + "auxiliary_loss_clip": 0.01073896, + "auxiliary_loss_mlp": 0.01037459, + "balance_loss_clip": 1.02973723, + "balance_loss_mlp": 1.02258205, + "epoch": 0.24560348714865474, + "flos": 14976402278400.0, + "grad_norm": 2.432463156067834, + "language_loss": 0.85310495, + "learning_rate": 3.531866337826471e-06, + "loss": 0.87421852, + "num_input_tokens_seen": 87903715, + "step": 4085, + "time_per_iteration": 2.6530139446258545 + }, + { + "auxiliary_loss_clip": 0.01073543, + "auxiliary_loss_mlp": 0.01049187, + "balance_loss_clip": 1.03521955, + "balance_loss_mlp": 1.03428626, + "epoch": 0.2456636104013227, + "flos": 22675326554880.0, + "grad_norm": 1.764230724867112, + "language_loss": 0.79074305, + "learning_rate": 3.5316159160024982e-06, + "loss": 0.81197035, + "num_input_tokens_seen": 87923375, + "step": 4086, + "time_per_iteration": 2.6634795665740967 + }, + { + "auxiliary_loss_clip": 0.01038754, + "auxiliary_loss_mlp": 0.01047225, + "balance_loss_clip": 1.03099144, + "balance_loss_mlp": 1.03229988, + "epoch": 0.2457237336539907, + "flos": 27417869009280.0, + "grad_norm": 1.5491050450504944, + "language_loss": 0.75275981, + "learning_rate": 3.531365436099496e-06, + "loss": 0.77361953, + "num_input_tokens_seen": 87943115, + "step": 4087, + "time_per_iteration": 2.782198429107666 + }, + { + "auxiliary_loss_clip": 0.01046795, + "auxiliary_loss_mlp": 0.01045159, + "balance_loss_clip": 1.03584218, + "balance_loss_mlp": 1.02895784, + "epoch": 0.24578385690665866, + "flos": 20412379768320.0, + "grad_norm": 4.713158745293083, + "language_loss": 0.79327106, + "learning_rate": 3.5311148981269635e-06, + "loss": 0.81419057, + "num_input_tokens_seen": 87959505, + "step": 4088, + "time_per_iteration": 2.710116386413574 + }, + { + "auxiliary_loss_clip": 0.0105345, + "auxiliary_loss_mlp": 0.01029115, + "balance_loss_clip": 1.02814794, + "balance_loss_mlp": 1.01575732, + "epoch": 0.24584398015932662, + "flos": 23915393740800.0, + "grad_norm": 1.5794885921665804, + "language_loss": 0.77201766, + "learning_rate": 3.5308643020944e-06, + "loss": 0.79284322, + "num_input_tokens_seen": 87979725, + "step": 4089, + "time_per_iteration": 2.7063682079315186 + }, + { + "auxiliary_loss_clip": 0.01085218, + "auxiliary_loss_mlp": 0.01043477, + "balance_loss_clip": 1.03562486, + "balance_loss_mlp": 1.02792072, + "epoch": 0.2459041034119946, + "flos": 41496359103360.0, + "grad_norm": 2.141045809749959, + "language_loss": 0.81054705, + "learning_rate": 3.530613648011309e-06, + "loss": 0.83183396, + "num_input_tokens_seen": 87998270, + "step": 4090, + "time_per_iteration": 2.7326314449310303 + }, + { + "auxiliary_loss_clip": 0.01065547, + "auxiliary_loss_mlp": 0.01045598, + "balance_loss_clip": 1.02853394, + "balance_loss_mlp": 1.02968383, + "epoch": 0.24596422666466256, + "flos": 19936814676480.0, + "grad_norm": 1.8983563553976963, + "language_loss": 0.73474193, + "learning_rate": 3.5303629358871946e-06, + "loss": 0.75585341, + "num_input_tokens_seen": 88016760, + "step": 4091, + "time_per_iteration": 2.599666118621826 + }, + { + "auxiliary_loss_clip": 0.01061547, + "auxiliary_loss_mlp": 0.01042714, + "balance_loss_clip": 1.03648686, + "balance_loss_mlp": 1.02763987, + "epoch": 0.24602434991733052, + "flos": 21544391865600.0, + "grad_norm": 2.1036974753941404, + "language_loss": 0.76885784, + "learning_rate": 3.5301121657315653e-06, + "loss": 0.78990042, + "num_input_tokens_seen": 88036465, + "step": 4092, + "time_per_iteration": 2.701508045196533 + }, + { + "auxiliary_loss_clip": 0.01059482, + "auxiliary_loss_mlp": 0.010375, + "balance_loss_clip": 1.02667975, + "balance_loss_mlp": 1.02176404, + "epoch": 0.24608447316999849, + "flos": 23185078416000.0, + "grad_norm": 2.6989405497395413, + "language_loss": 0.81316769, + "learning_rate": 3.5298613375539287e-06, + "loss": 0.83413756, + "num_input_tokens_seen": 88053270, + "step": 4093, + "time_per_iteration": 2.642014980316162 + }, + { + "auxiliary_loss_clip": 0.0108694, + "auxiliary_loss_mlp": 0.01041091, + "balance_loss_clip": 1.03226805, + "balance_loss_mlp": 1.02532005, + "epoch": 0.24614459642266648, + "flos": 19641951930240.0, + "grad_norm": 2.011285389895963, + "language_loss": 0.8691228, + "learning_rate": 3.529610451363797e-06, + "loss": 0.89040309, + "num_input_tokens_seen": 88072305, + "step": 4094, + "time_per_iteration": 2.6264960765838623 + }, + { + "auxiliary_loss_clip": 0.00972615, + "auxiliary_loss_mlp": 0.01012568, + "balance_loss_clip": 1.01173031, + "balance_loss_mlp": 1.00903988, + "epoch": 0.24620471967533444, + "flos": 61739816186880.0, + "grad_norm": 0.7715834780276283, + "language_loss": 0.57531983, + "learning_rate": 3.5293595071706833e-06, + "loss": 0.59517163, + "num_input_tokens_seen": 88137995, + "step": 4095, + "time_per_iteration": 3.3580195903778076 + }, + { + "auxiliary_loss_clip": 0.01005057, + "auxiliary_loss_mlp": 0.01010009, + "balance_loss_clip": 1.00869203, + "balance_loss_mlp": 1.00773227, + "epoch": 0.2462648429280024, + "flos": 69154436315520.0, + "grad_norm": 0.6422868530821179, + "language_loss": 0.56206292, + "learning_rate": 3.5291085049841042e-06, + "loss": 0.58221358, + "num_input_tokens_seen": 88208490, + "step": 4096, + "time_per_iteration": 3.31860089302063 + }, + { + "auxiliary_loss_clip": 0.01075445, + "auxiliary_loss_mlp": 0.01038687, + "balance_loss_clip": 1.03504455, + "balance_loss_mlp": 1.02444792, + "epoch": 0.24632496618067037, + "flos": 29459605887360.0, + "grad_norm": 1.8158506260376965, + "language_loss": 0.77332759, + "learning_rate": 3.5288574448135773e-06, + "loss": 0.79446888, + "num_input_tokens_seen": 88228050, + "step": 4097, + "time_per_iteration": 2.745915651321411 + }, + { + "auxiliary_loss_clip": 0.01064973, + "auxiliary_loss_mlp": 0.01041333, + "balance_loss_clip": 1.0308845, + "balance_loss_mlp": 1.0241785, + "epoch": 0.24638508943333834, + "flos": 24316444068480.0, + "grad_norm": 2.9556183978191792, + "language_loss": 0.76332706, + "learning_rate": 3.5286063266686235e-06, + "loss": 0.78439009, + "num_input_tokens_seen": 88248090, + "step": 4098, + "time_per_iteration": 5.9578797817230225 + }, + { + "auxiliary_loss_clip": 0.01077768, + "auxiliary_loss_mlp": 0.01036489, + "balance_loss_clip": 1.03478944, + "balance_loss_mlp": 1.02257776, + "epoch": 0.2464452126860063, + "flos": 26613254401920.0, + "grad_norm": 2.0439976071095467, + "language_loss": 0.68053806, + "learning_rate": 3.528355150558764e-06, + "loss": 0.70168066, + "num_input_tokens_seen": 88267545, + "step": 4099, + "time_per_iteration": 2.732388734817505 + }, + { + "auxiliary_loss_clip": 0.01082332, + "auxiliary_loss_mlp": 0.01040743, + "balance_loss_clip": 1.03355229, + "balance_loss_mlp": 1.02649188, + "epoch": 0.24650533593867427, + "flos": 31212405763200.0, + "grad_norm": 2.449742318442343, + "language_loss": 0.6543566, + "learning_rate": 3.5281039164935237e-06, + "loss": 0.67558742, + "num_input_tokens_seen": 88289785, + "step": 4100, + "time_per_iteration": 2.7051470279693604 + }, + { + "auxiliary_loss_clip": 0.01001346, + "auxiliary_loss_mlp": 0.01006838, + "balance_loss_clip": 1.00554502, + "balance_loss_mlp": 1.00464439, + "epoch": 0.24656545919134226, + "flos": 68494002900480.0, + "grad_norm": 0.7110129711409637, + "language_loss": 0.61498892, + "learning_rate": 3.5278526244824304e-06, + "loss": 0.63507074, + "num_input_tokens_seen": 88357320, + "step": 4101, + "time_per_iteration": 3.3148860931396484 + }, + { + "auxiliary_loss_clip": 0.0109396, + "auxiliary_loss_mlp": 0.01039956, + "balance_loss_clip": 1.03436685, + "balance_loss_mlp": 1.02522814, + "epoch": 0.24662558244401023, + "flos": 20084192179200.0, + "grad_norm": 1.5180388580044153, + "language_loss": 0.73571771, + "learning_rate": 3.527601274535012e-06, + "loss": 0.75705683, + "num_input_tokens_seen": 88377040, + "step": 4102, + "time_per_iteration": 2.546248197555542 + }, + { + "auxiliary_loss_clip": 0.01075527, + "auxiliary_loss_mlp": 0.01037745, + "balance_loss_clip": 1.03432846, + "balance_loss_mlp": 1.02329719, + "epoch": 0.2466857056966782, + "flos": 30701361012480.0, + "grad_norm": 2.046834541015926, + "language_loss": 0.76210952, + "learning_rate": 3.5273498666608004e-06, + "loss": 0.78324223, + "num_input_tokens_seen": 88395085, + "step": 4103, + "time_per_iteration": 2.7607452869415283 + }, + { + "auxiliary_loss_clip": 0.01084002, + "auxiliary_loss_mlp": 0.01042493, + "balance_loss_clip": 1.03398061, + "balance_loss_mlp": 1.02702546, + "epoch": 0.24674582894934616, + "flos": 22528523669760.0, + "grad_norm": 2.004325836705833, + "language_loss": 0.78325403, + "learning_rate": 3.5270984008693288e-06, + "loss": 0.80451906, + "num_input_tokens_seen": 88413205, + "step": 4104, + "time_per_iteration": 4.370748519897461 + }, + { + "auxiliary_loss_clip": 0.01084399, + "auxiliary_loss_mlp": 0.01039964, + "balance_loss_clip": 1.03374648, + "balance_loss_mlp": 1.02295268, + "epoch": 0.24680595220201412, + "flos": 20704297599360.0, + "grad_norm": 1.6826874869933555, + "language_loss": 0.83073533, + "learning_rate": 3.526846877170133e-06, + "loss": 0.8519789, + "num_input_tokens_seen": 88431525, + "step": 4105, + "time_per_iteration": 4.245366334915161 + }, + { + "auxiliary_loss_clip": 0.01100623, + "auxiliary_loss_mlp": 0.01041332, + "balance_loss_clip": 1.03893161, + "balance_loss_mlp": 1.02690792, + "epoch": 0.2468660754546821, + "flos": 21831174051840.0, + "grad_norm": 1.6778890874537784, + "language_loss": 0.760144, + "learning_rate": 3.52659529557275e-06, + "loss": 0.78156352, + "num_input_tokens_seen": 88451210, + "step": 4106, + "time_per_iteration": 2.5609731674194336 + }, + { + "auxiliary_loss_clip": 0.01051949, + "auxiliary_loss_mlp": 0.01048149, + "balance_loss_clip": 1.02741146, + "balance_loss_mlp": 1.03072047, + "epoch": 0.24692619870735008, + "flos": 15267709578240.0, + "grad_norm": 2.043455062111776, + "language_loss": 0.72684479, + "learning_rate": 3.5263436560867205e-06, + "loss": 0.74784577, + "num_input_tokens_seen": 88467790, + "step": 4107, + "time_per_iteration": 2.672790050506592 + }, + { + "auxiliary_loss_clip": 0.01095282, + "auxiliary_loss_mlp": 0.01047302, + "balance_loss_clip": 1.03417289, + "balance_loss_mlp": 1.0318284, + "epoch": 0.24698632196001805, + "flos": 29680097523840.0, + "grad_norm": 1.7745887959846407, + "language_loss": 0.65474296, + "learning_rate": 3.526091958721587e-06, + "loss": 0.6761688, + "num_input_tokens_seen": 88490330, + "step": 4108, + "time_per_iteration": 2.6327288150787354 + }, + { + "auxiliary_loss_clip": 0.01048716, + "auxiliary_loss_mlp": 0.01043111, + "balance_loss_clip": 1.03131223, + "balance_loss_mlp": 1.02731562, + "epoch": 0.247046445212686, + "flos": 39165469741440.0, + "grad_norm": 1.8176038681622377, + "language_loss": 0.72660631, + "learning_rate": 3.5258402034868936e-06, + "loss": 0.74752462, + "num_input_tokens_seen": 88512435, + "step": 4109, + "time_per_iteration": 2.855057716369629 + }, + { + "auxiliary_loss_clip": 0.01062898, + "auxiliary_loss_mlp": 0.01044799, + "balance_loss_clip": 1.03190613, + "balance_loss_mlp": 1.02960587, + "epoch": 0.24710656846535398, + "flos": 22998845376000.0, + "grad_norm": 2.1537520782606903, + "language_loss": 0.79073042, + "learning_rate": 3.5255883903921866e-06, + "loss": 0.81180739, + "num_input_tokens_seen": 88529780, + "step": 4110, + "time_per_iteration": 2.657759666442871 + }, + { + "auxiliary_loss_clip": 0.01066391, + "auxiliary_loss_mlp": 0.01039427, + "balance_loss_clip": 1.03209686, + "balance_loss_mlp": 1.02402568, + "epoch": 0.24716669171802194, + "flos": 26432803451520.0, + "grad_norm": 2.1181968093903256, + "language_loss": 0.80895245, + "learning_rate": 3.5253365194470144e-06, + "loss": 0.83001071, + "num_input_tokens_seen": 88547200, + "step": 4111, + "time_per_iteration": 2.670206069946289 + }, + { + "auxiliary_loss_clip": 0.01093954, + "auxiliary_loss_mlp": 0.01041439, + "balance_loss_clip": 1.03305149, + "balance_loss_mlp": 1.02750969, + "epoch": 0.2472268149706899, + "flos": 23329870139520.0, + "grad_norm": 1.899901120849572, + "language_loss": 0.75258273, + "learning_rate": 3.5250845906609294e-06, + "loss": 0.77393663, + "num_input_tokens_seen": 88566415, + "step": 4112, + "time_per_iteration": 2.556819200515747 + }, + { + "auxiliary_loss_clip": 0.01065488, + "auxiliary_loss_mlp": 0.00749019, + "balance_loss_clip": 1.03231478, + "balance_loss_mlp": 1.00168753, + "epoch": 0.24728693822335787, + "flos": 23768734510080.0, + "grad_norm": 1.8842393783427909, + "language_loss": 0.82469285, + "learning_rate": 3.5248326040434835e-06, + "loss": 0.84283793, + "num_input_tokens_seen": 88585225, + "step": 4113, + "time_per_iteration": 2.764155626296997 + }, + { + "auxiliary_loss_clip": 0.01092388, + "auxiliary_loss_mlp": 0.01036905, + "balance_loss_clip": 1.03060234, + "balance_loss_mlp": 1.02167594, + "epoch": 0.24734706147602586, + "flos": 19317499355520.0, + "grad_norm": 2.113519084675534, + "language_loss": 0.87294757, + "learning_rate": 3.5245805596042322e-06, + "loss": 0.8942405, + "num_input_tokens_seen": 88603280, + "step": 4114, + "time_per_iteration": 2.580672025680542 + }, + { + "auxiliary_loss_clip": 0.01055, + "auxiliary_loss_mlp": 0.01035437, + "balance_loss_clip": 1.03275394, + "balance_loss_mlp": 1.02105427, + "epoch": 0.24740718472869383, + "flos": 28036932935040.0, + "grad_norm": 2.0699603390335146, + "language_loss": 0.75127554, + "learning_rate": 3.524328457352734e-06, + "loss": 0.77217996, + "num_input_tokens_seen": 88624925, + "step": 4115, + "time_per_iteration": 2.7825961112976074 + }, + { + "auxiliary_loss_clip": 0.00971526, + "auxiliary_loss_mlp": 0.01004188, + "balance_loss_clip": 1.0061692, + "balance_loss_mlp": 1.00193453, + "epoch": 0.2474673079813618, + "flos": 68107569408000.0, + "grad_norm": 0.6600894958738409, + "language_loss": 0.58228749, + "learning_rate": 3.5240762972985475e-06, + "loss": 0.60204464, + "num_input_tokens_seen": 88691475, + "step": 4116, + "time_per_iteration": 3.4304134845733643 + }, + { + "auxiliary_loss_clip": 0.01071573, + "auxiliary_loss_mlp": 0.01035064, + "balance_loss_clip": 1.03139806, + "balance_loss_mlp": 1.02088416, + "epoch": 0.24752743123402976, + "flos": 29462119839360.0, + "grad_norm": 1.4853064046833782, + "language_loss": 0.83687729, + "learning_rate": 3.523824079451235e-06, + "loss": 0.85794365, + "num_input_tokens_seen": 88713425, + "step": 4117, + "time_per_iteration": 2.8725714683532715 + }, + { + "auxiliary_loss_clip": 0.01002658, + "auxiliary_loss_mlp": 0.0074875, + "balance_loss_clip": 1.00719869, + "balance_loss_mlp": 1.00237203, + "epoch": 0.24758755448669773, + "flos": 58350459824640.0, + "grad_norm": 0.9045566045184423, + "language_loss": 0.63476223, + "learning_rate": 3.5235718038203602e-06, + "loss": 0.65227628, + "num_input_tokens_seen": 88769995, + "step": 4118, + "time_per_iteration": 3.1270008087158203 + }, + { + "auxiliary_loss_clip": 0.01075288, + "auxiliary_loss_mlp": 0.01040473, + "balance_loss_clip": 1.03008628, + "balance_loss_mlp": 1.02570283, + "epoch": 0.2476476777393657, + "flos": 20484416494080.0, + "grad_norm": 1.6354830787119097, + "language_loss": 0.79321879, + "learning_rate": 3.523319470415491e-06, + "loss": 0.81437635, + "num_input_tokens_seen": 88789970, + "step": 4119, + "time_per_iteration": 2.7273600101470947 + }, + { + "auxiliary_loss_clip": 0.01083197, + "auxiliary_loss_mlp": 0.01038918, + "balance_loss_clip": 1.03425872, + "balance_loss_mlp": 1.02468395, + "epoch": 0.24770780099203366, + "flos": 20485853038080.0, + "grad_norm": 1.4649066609636825, + "language_loss": 0.73901087, + "learning_rate": 3.5230670792461943e-06, + "loss": 0.76023203, + "num_input_tokens_seen": 88810000, + "step": 4120, + "time_per_iteration": 2.6467502117156982 + }, + { + "auxiliary_loss_clip": 0.01083603, + "auxiliary_loss_mlp": 0.01043454, + "balance_loss_clip": 1.03353453, + "balance_loss_mlp": 1.02807629, + "epoch": 0.24776792424470165, + "flos": 15153405523200.0, + "grad_norm": 2.1056053986188115, + "language_loss": 0.88098288, + "learning_rate": 3.522814630322041e-06, + "loss": 0.90225351, + "num_input_tokens_seen": 88827515, + "step": 4121, + "time_per_iteration": 2.576950788497925 + }, + { + "auxiliary_loss_clip": 0.01096256, + "auxiliary_loss_mlp": 0.01039147, + "balance_loss_clip": 1.03377867, + "balance_loss_mlp": 1.0233283, + "epoch": 0.2478280474973696, + "flos": 21725453347200.0, + "grad_norm": 2.0861558202011836, + "language_loss": 0.69441056, + "learning_rate": 3.5225621236526045e-06, + "loss": 0.71576458, + "num_input_tokens_seen": 88845025, + "step": 4122, + "time_per_iteration": 2.545970916748047 + }, + { + "auxiliary_loss_clip": 0.01095979, + "auxiliary_loss_mlp": 0.0103696, + "balance_loss_clip": 1.03298688, + "balance_loss_mlp": 1.02059281, + "epoch": 0.24788817075003758, + "flos": 20412200200320.0, + "grad_norm": 1.9325367388103147, + "language_loss": 0.79797292, + "learning_rate": 3.5223095592474596e-06, + "loss": 0.81930232, + "num_input_tokens_seen": 88861740, + "step": 4123, + "time_per_iteration": 2.5848448276519775 + }, + { + "auxiliary_loss_clip": 0.0104074, + "auxiliary_loss_mlp": 0.01041043, + "balance_loss_clip": 1.03150856, + "balance_loss_mlp": 1.02651143, + "epoch": 0.24794829400270554, + "flos": 22594455083520.0, + "grad_norm": 1.782891776477047, + "language_loss": 0.74679154, + "learning_rate": 3.5220569371161846e-06, + "loss": 0.76760936, + "num_input_tokens_seen": 88879740, + "step": 4124, + "time_per_iteration": 2.7080230712890625 + }, + { + "auxiliary_loss_clip": 0.01082335, + "auxiliary_loss_mlp": 0.01036991, + "balance_loss_clip": 1.03189981, + "balance_loss_mlp": 1.02276969, + "epoch": 0.2480084172553735, + "flos": 39676047615360.0, + "grad_norm": 1.7591083207306915, + "language_loss": 0.73421919, + "learning_rate": 3.521804257268357e-06, + "loss": 0.75541246, + "num_input_tokens_seen": 88904095, + "step": 4125, + "time_per_iteration": 2.70957612991333 + }, + { + "auxiliary_loss_clip": 0.01060763, + "auxiliary_loss_mlp": 0.00748909, + "balance_loss_clip": 1.03001082, + "balance_loss_mlp": 1.00155497, + "epoch": 0.24806854050804147, + "flos": 22053712763520.0, + "grad_norm": 1.7432517403055605, + "language_loss": 0.69885993, + "learning_rate": 3.5215515197135595e-06, + "loss": 0.71695668, + "num_input_tokens_seen": 88920740, + "step": 4126, + "time_per_iteration": 2.6192595958709717 + }, + { + "auxiliary_loss_clip": 0.01081646, + "auxiliary_loss_mlp": 0.01044826, + "balance_loss_clip": 1.03149939, + "balance_loss_mlp": 1.02957916, + "epoch": 0.24812866376070947, + "flos": 15486764670720.0, + "grad_norm": 9.523440004547655, + "language_loss": 0.81005657, + "learning_rate": 3.5212987244613764e-06, + "loss": 0.8313213, + "num_input_tokens_seen": 88938510, + "step": 4127, + "time_per_iteration": 2.5676982402801514 + }, + { + "auxiliary_loss_clip": 0.01084091, + "auxiliary_loss_mlp": 0.00748848, + "balance_loss_clip": 1.03416932, + "balance_loss_mlp": 1.00144458, + "epoch": 0.24818878701337743, + "flos": 14757419013120.0, + "grad_norm": 2.906521092351516, + "language_loss": 0.84092212, + "learning_rate": 3.5210458715213927e-06, + "loss": 0.8592515, + "num_input_tokens_seen": 88955235, + "step": 4128, + "time_per_iteration": 2.5999011993408203 + }, + { + "auxiliary_loss_clip": 0.01066429, + "auxiliary_loss_mlp": 0.01047964, + "balance_loss_clip": 1.03162599, + "balance_loss_mlp": 1.03171611, + "epoch": 0.2482489102660454, + "flos": 27089501852160.0, + "grad_norm": 2.3186063594719934, + "language_loss": 0.65764028, + "learning_rate": 3.5207929609031973e-06, + "loss": 0.67878419, + "num_input_tokens_seen": 88975210, + "step": 4129, + "time_per_iteration": 2.6606078147888184 + }, + { + "auxiliary_loss_clip": 0.01048785, + "auxiliary_loss_mlp": 0.01040071, + "balance_loss_clip": 1.02930522, + "balance_loss_mlp": 1.02471662, + "epoch": 0.24830903351871336, + "flos": 26467528924800.0, + "grad_norm": 1.714902621697703, + "language_loss": 0.75446481, + "learning_rate": 3.5205399926163806e-06, + "loss": 0.77535337, + "num_input_tokens_seen": 88996120, + "step": 4130, + "time_per_iteration": 2.823603630065918 + }, + { + "auxiliary_loss_clip": 0.01033066, + "auxiliary_loss_mlp": 0.01044687, + "balance_loss_clip": 1.02920377, + "balance_loss_mlp": 1.02901113, + "epoch": 0.24836915677138133, + "flos": 10228436870400.0, + "grad_norm": 2.034182718245518, + "language_loss": 0.767416, + "learning_rate": 3.520286966670535e-06, + "loss": 0.78819358, + "num_input_tokens_seen": 89008685, + "step": 4131, + "time_per_iteration": 2.680129051208496 + }, + { + "auxiliary_loss_clip": 0.01082837, + "auxiliary_loss_mlp": 0.01037062, + "balance_loss_clip": 1.0328064, + "balance_loss_mlp": 1.02348423, + "epoch": 0.2484292800240493, + "flos": 30080429579520.0, + "grad_norm": 1.5741823898230705, + "language_loss": 0.84149593, + "learning_rate": 3.520033883075255e-06, + "loss": 0.86269492, + "num_input_tokens_seen": 89031160, + "step": 4132, + "time_per_iteration": 2.6681268215179443 + }, + { + "auxiliary_loss_clip": 0.01071643, + "auxiliary_loss_mlp": 0.01039966, + "balance_loss_clip": 1.03153539, + "balance_loss_mlp": 1.02383697, + "epoch": 0.24848940327671726, + "flos": 13442944803840.0, + "grad_norm": 1.5820955958198668, + "language_loss": 0.71140975, + "learning_rate": 3.5197807418401386e-06, + "loss": 0.73252583, + "num_input_tokens_seen": 89047235, + "step": 4133, + "time_per_iteration": 2.7813992500305176 + }, + { + "auxiliary_loss_clip": 0.01103359, + "auxiliary_loss_mlp": 0.01039906, + "balance_loss_clip": 1.03594494, + "balance_loss_mlp": 1.02157211, + "epoch": 0.24854952652938525, + "flos": 19970247260160.0, + "grad_norm": 2.072431583629941, + "language_loss": 0.61474192, + "learning_rate": 3.5195275429747834e-06, + "loss": 0.63617456, + "num_input_tokens_seen": 89064790, + "step": 4134, + "time_per_iteration": 2.5756685733795166 + }, + { + "auxiliary_loss_clip": 0.01082934, + "auxiliary_loss_mlp": 0.01039649, + "balance_loss_clip": 1.0315975, + "balance_loss_mlp": 1.02451587, + "epoch": 0.24860964978205322, + "flos": 18150187167360.0, + "grad_norm": 2.428226339634667, + "language_loss": 0.78780234, + "learning_rate": 3.5192742864887914e-06, + "loss": 0.80902815, + "num_input_tokens_seen": 89083250, + "step": 4135, + "time_per_iteration": 2.6598756313323975 + }, + { + "auxiliary_loss_clip": 0.01076751, + "auxiliary_loss_mlp": 0.01030325, + "balance_loss_clip": 1.03542101, + "balance_loss_mlp": 1.0162468, + "epoch": 0.24866977303472118, + "flos": 11728641329280.0, + "grad_norm": 2.943974722897228, + "language_loss": 0.82937384, + "learning_rate": 3.5190209723917662e-06, + "loss": 0.85044456, + "num_input_tokens_seen": 89100905, + "step": 4136, + "time_per_iteration": 2.6123294830322266 + }, + { + "auxiliary_loss_clip": 0.01069539, + "auxiliary_loss_mlp": 0.01039628, + "balance_loss_clip": 1.03415, + "balance_loss_mlp": 1.02483988, + "epoch": 0.24872989628738915, + "flos": 34823582565120.0, + "grad_norm": 1.6374599196762416, + "language_loss": 0.70572108, + "learning_rate": 3.518767600693314e-06, + "loss": 0.72681272, + "num_input_tokens_seen": 89122630, + "step": 4137, + "time_per_iteration": 2.790609359741211 + }, + { + "auxiliary_loss_clip": 0.01081931, + "auxiliary_loss_mlp": 0.00748966, + "balance_loss_clip": 1.02899957, + "balance_loss_mlp": 1.00154591, + "epoch": 0.2487900195400571, + "flos": 13699347062400.0, + "grad_norm": 1.8542367020882347, + "language_loss": 0.66723466, + "learning_rate": 3.518514171403042e-06, + "loss": 0.68554354, + "num_input_tokens_seen": 89141050, + "step": 4138, + "time_per_iteration": 2.5676867961883545 + }, + { + "auxiliary_loss_clip": 0.01062081, + "auxiliary_loss_mlp": 0.01038636, + "balance_loss_clip": 1.03233576, + "balance_loss_mlp": 1.02445602, + "epoch": 0.24885014279272508, + "flos": 25337815297920.0, + "grad_norm": 1.997858087461506, + "language_loss": 0.83922559, + "learning_rate": 3.51826068453056e-06, + "loss": 0.86023283, + "num_input_tokens_seen": 89160810, + "step": 4139, + "time_per_iteration": 2.7553086280822754 + }, + { + "auxiliary_loss_clip": 0.01061668, + "auxiliary_loss_mlp": 0.01042484, + "balance_loss_clip": 1.03041148, + "balance_loss_mlp": 1.02610481, + "epoch": 0.24891026604539307, + "flos": 20631434860800.0, + "grad_norm": 1.4542869758601182, + "language_loss": 0.78915799, + "learning_rate": 3.518007140085481e-06, + "loss": 0.8101995, + "num_input_tokens_seen": 89180610, + "step": 4140, + "time_per_iteration": 2.622673273086548 + }, + { + "auxiliary_loss_clip": 0.01018098, + "auxiliary_loss_mlp": 0.01008322, + "balance_loss_clip": 1.01393402, + "balance_loss_mlp": 1.00453079, + "epoch": 0.24897038929806103, + "flos": 66960294030720.0, + "grad_norm": 0.8227352683423965, + "language_loss": 0.61032414, + "learning_rate": 3.51775353807742e-06, + "loss": 0.63058829, + "num_input_tokens_seen": 89241880, + "step": 4141, + "time_per_iteration": 3.301473617553711 + }, + { + "auxiliary_loss_clip": 0.01099991, + "auxiliary_loss_mlp": 0.01045917, + "balance_loss_clip": 1.03638434, + "balance_loss_mlp": 1.03003883, + "epoch": 0.249030512550729, + "flos": 36392555612160.0, + "grad_norm": 2.5051174910727454, + "language_loss": 0.73107243, + "learning_rate": 3.5174998785159913e-06, + "loss": 0.75253153, + "num_input_tokens_seen": 89263340, + "step": 4142, + "time_per_iteration": 2.869194507598877 + }, + { + "auxiliary_loss_clip": 0.01082836, + "auxiliary_loss_mlp": 0.01038963, + "balance_loss_clip": 1.03236592, + "balance_loss_mlp": 1.02345443, + "epoch": 0.24909063580339696, + "flos": 20154576879360.0, + "grad_norm": 1.7223100303923746, + "language_loss": 0.81019044, + "learning_rate": 3.5172461614108157e-06, + "loss": 0.83140838, + "num_input_tokens_seen": 89282870, + "step": 4143, + "time_per_iteration": 2.7926273345947266 + }, + { + "auxiliary_loss_clip": 0.01068928, + "auxiliary_loss_mlp": 0.01034099, + "balance_loss_clip": 1.02965319, + "balance_loss_mlp": 1.02081311, + "epoch": 0.24915075905606493, + "flos": 26396569607040.0, + "grad_norm": 1.9205460606370781, + "language_loss": 0.58868313, + "learning_rate": 3.5169923867715137e-06, + "loss": 0.60971344, + "num_input_tokens_seen": 89303830, + "step": 4144, + "time_per_iteration": 2.7053706645965576 + }, + { + "auxiliary_loss_clip": 0.01080169, + "auxiliary_loss_mlp": 0.0104424, + "balance_loss_clip": 1.03019178, + "balance_loss_mlp": 1.02907681, + "epoch": 0.2492108823087329, + "flos": 27527216987520.0, + "grad_norm": 1.9649933866354194, + "language_loss": 0.78539741, + "learning_rate": 3.516738554607708e-06, + "loss": 0.80664158, + "num_input_tokens_seen": 89324350, + "step": 4145, + "time_per_iteration": 5.739285469055176 + }, + { + "auxiliary_loss_clip": 0.01091249, + "auxiliary_loss_mlp": 0.00749128, + "balance_loss_clip": 1.03281403, + "balance_loss_mlp": 1.00168633, + "epoch": 0.24927100556140086, + "flos": 16691388111360.0, + "grad_norm": 2.3497508995151106, + "language_loss": 0.6539216, + "learning_rate": 3.5164846649290253e-06, + "loss": 0.67232537, + "num_input_tokens_seen": 89342875, + "step": 4146, + "time_per_iteration": 2.5464742183685303 + }, + { + "auxiliary_loss_clip": 0.0100257, + "auxiliary_loss_mlp": 0.01002553, + "balance_loss_clip": 1.00870574, + "balance_loss_mlp": 1.00035977, + "epoch": 0.24933112881406885, + "flos": 62772464286720.0, + "grad_norm": 0.9560984993252745, + "language_loss": 0.67267978, + "learning_rate": 3.5162307177450915e-06, + "loss": 0.69273102, + "num_input_tokens_seen": 89404925, + "step": 4147, + "time_per_iteration": 3.2547342777252197 + }, + { + "auxiliary_loss_clip": 0.0107847, + "auxiliary_loss_mlp": 0.01039972, + "balance_loss_clip": 1.03548193, + "balance_loss_mlp": 1.02412915, + "epoch": 0.24939125206673682, + "flos": 26651894457600.0, + "grad_norm": 1.9666109079329566, + "language_loss": 0.88959932, + "learning_rate": 3.5159767130655366e-06, + "loss": 0.91078365, + "num_input_tokens_seen": 89425090, + "step": 4148, + "time_per_iteration": 2.662869930267334 + }, + { + "auxiliary_loss_clip": 0.01053143, + "auxiliary_loss_mlp": 0.01043007, + "balance_loss_clip": 1.03322411, + "balance_loss_mlp": 1.02389812, + "epoch": 0.24945137531940478, + "flos": 20704333512960.0, + "grad_norm": 2.14219475645604, + "language_loss": 0.68458331, + "learning_rate": 3.5157226508999935e-06, + "loss": 0.70554477, + "num_input_tokens_seen": 89442615, + "step": 4149, + "time_per_iteration": 2.744210720062256 + }, + { + "auxiliary_loss_clip": 0.01085559, + "auxiliary_loss_mlp": 0.01037074, + "balance_loss_clip": 1.03322434, + "balance_loss_mlp": 1.02125537, + "epoch": 0.24951149857207275, + "flos": 23768662682880.0, + "grad_norm": 1.6209773768755624, + "language_loss": 0.71779466, + "learning_rate": 3.515468531258095e-06, + "loss": 0.73902094, + "num_input_tokens_seen": 89463025, + "step": 4150, + "time_per_iteration": 2.5998754501342773 + }, + { + "auxiliary_loss_clip": 0.01054101, + "auxiliary_loss_mlp": 0.01043103, + "balance_loss_clip": 1.03289425, + "balance_loss_mlp": 1.02661657, + "epoch": 0.2495716218247407, + "flos": 15664881237120.0, + "grad_norm": 2.18136620348283, + "language_loss": 0.7264505, + "learning_rate": 3.515214354149478e-06, + "loss": 0.74742258, + "num_input_tokens_seen": 89480225, + "step": 4151, + "time_per_iteration": 4.42067289352417 + }, + { + "auxiliary_loss_clip": 0.01094787, + "auxiliary_loss_mlp": 0.01045427, + "balance_loss_clip": 1.03461361, + "balance_loss_mlp": 1.02861905, + "epoch": 0.24963174507740868, + "flos": 24052499953920.0, + "grad_norm": 2.4542667612758753, + "language_loss": 0.62978435, + "learning_rate": 3.514960119583781e-06, + "loss": 0.65118647, + "num_input_tokens_seen": 89496985, + "step": 4152, + "time_per_iteration": 4.308203458786011 + }, + { + "auxiliary_loss_clip": 0.01086584, + "auxiliary_loss_mlp": 0.01038916, + "balance_loss_clip": 1.03635967, + "balance_loss_mlp": 1.0233829, + "epoch": 0.24969186833007664, + "flos": 21799501234560.0, + "grad_norm": 1.9546511715156598, + "language_loss": 0.77223933, + "learning_rate": 3.514705827570645e-06, + "loss": 0.79349434, + "num_input_tokens_seen": 89514420, + "step": 4153, + "time_per_iteration": 2.6963720321655273 + }, + { + "auxiliary_loss_clip": 0.01087581, + "auxiliary_loss_mlp": 0.0103695, + "balance_loss_clip": 1.03501058, + "balance_loss_mlp": 1.02181053, + "epoch": 0.24975199158274464, + "flos": 19938143479680.0, + "grad_norm": 1.9984104211053433, + "language_loss": 0.76720887, + "learning_rate": 3.514451478119711e-06, + "loss": 0.78845417, + "num_input_tokens_seen": 89532925, + "step": 4154, + "time_per_iteration": 2.6481776237487793 + }, + { + "auxiliary_loss_clip": 0.01091463, + "auxiliary_loss_mlp": 0.01044302, + "balance_loss_clip": 1.03497458, + "balance_loss_mlp": 1.02639711, + "epoch": 0.2498121148354126, + "flos": 25338389915520.0, + "grad_norm": 2.355583119915818, + "language_loss": 0.70672709, + "learning_rate": 3.5141970712406258e-06, + "loss": 0.72808474, + "num_input_tokens_seen": 89552855, + "step": 4155, + "time_per_iteration": 2.6420609951019287 + }, + { + "auxiliary_loss_clip": 0.01082942, + "auxiliary_loss_mlp": 0.01048356, + "balance_loss_clip": 1.03686285, + "balance_loss_mlp": 1.03209567, + "epoch": 0.24987223808808057, + "flos": 20558787603840.0, + "grad_norm": 1.454312747715725, + "language_loss": 0.74806911, + "learning_rate": 3.513942606943036e-06, + "loss": 0.76938212, + "num_input_tokens_seen": 89572830, + "step": 4156, + "time_per_iteration": 2.6347219944000244 + }, + { + "auxiliary_loss_clip": 0.01086123, + "auxiliary_loss_mlp": 0.01037407, + "balance_loss_clip": 1.03487635, + "balance_loss_mlp": 1.02263117, + "epoch": 0.24993236134074853, + "flos": 19749037351680.0, + "grad_norm": 1.9647688604171323, + "language_loss": 0.76311165, + "learning_rate": 3.513688085236591e-06, + "loss": 0.78434694, + "num_input_tokens_seen": 89590345, + "step": 4157, + "time_per_iteration": 2.60664439201355 + }, + { + "auxiliary_loss_clip": 0.01038637, + "auxiliary_loss_mlp": 0.01038485, + "balance_loss_clip": 1.02922583, + "balance_loss_mlp": 1.02261829, + "epoch": 0.2499924845934165, + "flos": 18770292587520.0, + "grad_norm": 1.894687902873374, + "language_loss": 0.81253505, + "learning_rate": 3.513433506130942e-06, + "loss": 0.83330631, + "num_input_tokens_seen": 89610295, + "step": 4158, + "time_per_iteration": 2.963618755340576 + }, + { + "auxiliary_loss_clip": 0.01064826, + "auxiliary_loss_mlp": 0.0103586, + "balance_loss_clip": 1.03085709, + "balance_loss_mlp": 1.02083945, + "epoch": 0.25005260784608446, + "flos": 16872198197760.0, + "grad_norm": 1.8785289799530682, + "language_loss": 0.75918829, + "learning_rate": 3.5131788696357427e-06, + "loss": 0.78019518, + "num_input_tokens_seen": 89627795, + "step": 4159, + "time_per_iteration": 2.707329750061035 + }, + { + "auxiliary_loss_clip": 0.01088612, + "auxiliary_loss_mlp": 0.01036377, + "balance_loss_clip": 1.03421831, + "balance_loss_mlp": 1.01914573, + "epoch": 0.2501127310987524, + "flos": 22124923476480.0, + "grad_norm": 1.8604854536552748, + "language_loss": 0.71744508, + "learning_rate": 3.512924175760649e-06, + "loss": 0.73869497, + "num_input_tokens_seen": 89648090, + "step": 4160, + "time_per_iteration": 2.7156429290771484 + }, + { + "auxiliary_loss_clip": 0.01020408, + "auxiliary_loss_mlp": 0.0100438, + "balance_loss_clip": 1.00533664, + "balance_loss_mlp": 1.00222242, + "epoch": 0.2501728543514204, + "flos": 69458061980160.0, + "grad_norm": 0.7484639593328118, + "language_loss": 0.56737375, + "learning_rate": 3.5126694245153186e-06, + "loss": 0.58762157, + "num_input_tokens_seen": 89710345, + "step": 4161, + "time_per_iteration": 3.222325086593628 + }, + { + "auxiliary_loss_clip": 0.01092653, + "auxiliary_loss_mlp": 0.01047373, + "balance_loss_clip": 1.03548157, + "balance_loss_mlp": 1.03071928, + "epoch": 0.25023297760408836, + "flos": 16289978647680.0, + "grad_norm": 2.5870415490123606, + "language_loss": 0.80574238, + "learning_rate": 3.5124146159094125e-06, + "loss": 0.8271426, + "num_input_tokens_seen": 89729390, + "step": 4162, + "time_per_iteration": 2.6508305072784424 + }, + { + "auxiliary_loss_clip": 0.01077021, + "auxiliary_loss_mlp": 0.00749212, + "balance_loss_clip": 1.03026581, + "balance_loss_mlp": 1.00196505, + "epoch": 0.2502931008567563, + "flos": 12237998140800.0, + "grad_norm": 3.244587029373888, + "language_loss": 0.87951756, + "learning_rate": 3.5121597499525927e-06, + "loss": 0.89778, + "num_input_tokens_seen": 89742805, + "step": 4163, + "time_per_iteration": 2.7371954917907715 + }, + { + "auxiliary_loss_clip": 0.01090755, + "auxiliary_loss_mlp": 0.01043422, + "balance_loss_clip": 1.03813362, + "balance_loss_mlp": 1.02638698, + "epoch": 0.25035322410942434, + "flos": 23181882105600.0, + "grad_norm": 1.738166806366568, + "language_loss": 0.83284873, + "learning_rate": 3.5119048266545232e-06, + "loss": 0.85419047, + "num_input_tokens_seen": 89761145, + "step": 4164, + "time_per_iteration": 2.68856143951416 + }, + { + "auxiliary_loss_clip": 0.01088194, + "auxiliary_loss_mlp": 0.01045105, + "balance_loss_clip": 1.03993487, + "balance_loss_mlp": 1.03069258, + "epoch": 0.2504133473620923, + "flos": 20917534688640.0, + "grad_norm": 1.6102542191276783, + "language_loss": 0.74253327, + "learning_rate": 3.5116498460248716e-06, + "loss": 0.76386625, + "num_input_tokens_seen": 89780905, + "step": 4165, + "time_per_iteration": 2.638249397277832 + }, + { + "auxiliary_loss_clip": 0.01064494, + "auxiliary_loss_mlp": 0.01041525, + "balance_loss_clip": 1.03131533, + "balance_loss_mlp": 1.02483618, + "epoch": 0.2504734706147603, + "flos": 20776549806720.0, + "grad_norm": 1.7586288170652065, + "language_loss": 0.73778015, + "learning_rate": 3.5113948080733062e-06, + "loss": 0.75884032, + "num_input_tokens_seen": 89799230, + "step": 4166, + "time_per_iteration": 2.698652982711792 + }, + { + "auxiliary_loss_clip": 0.01066429, + "auxiliary_loss_mlp": 0.01042461, + "balance_loss_clip": 1.03446317, + "balance_loss_mlp": 1.02750576, + "epoch": 0.25053359386742824, + "flos": 24349373861760.0, + "grad_norm": 1.6430097445259084, + "language_loss": 0.81824738, + "learning_rate": 3.5111397128094973e-06, + "loss": 0.83933628, + "num_input_tokens_seen": 89818240, + "step": 4167, + "time_per_iteration": 2.7818994522094727 + }, + { + "auxiliary_loss_clip": 0.01085856, + "auxiliary_loss_mlp": 0.010394, + "balance_loss_clip": 1.0344491, + "balance_loss_mlp": 1.02417719, + "epoch": 0.2505937171200962, + "flos": 21214336769280.0, + "grad_norm": 1.9405826391584884, + "language_loss": 0.79818577, + "learning_rate": 3.51088456024312e-06, + "loss": 0.81943834, + "num_input_tokens_seen": 89834485, + "step": 4168, + "time_per_iteration": 2.5749809741973877 + }, + { + "auxiliary_loss_clip": 0.01089013, + "auxiliary_loss_mlp": 0.01040955, + "balance_loss_clip": 1.03241217, + "balance_loss_mlp": 1.02363431, + "epoch": 0.25065384037276417, + "flos": 41427231379200.0, + "grad_norm": 2.601243919913023, + "language_loss": 0.69815457, + "learning_rate": 3.510629350383849e-06, + "loss": 0.71945429, + "num_input_tokens_seen": 89855645, + "step": 4169, + "time_per_iteration": 2.783184051513672 + }, + { + "auxiliary_loss_clip": 0.01061814, + "auxiliary_loss_mlp": 0.01044007, + "balance_loss_clip": 1.03198719, + "balance_loss_mlp": 1.02871203, + "epoch": 0.25071396362543213, + "flos": 26102389219200.0, + "grad_norm": 2.7049047170840255, + "language_loss": 0.77631694, + "learning_rate": 3.510374083241361e-06, + "loss": 0.79737508, + "num_input_tokens_seen": 89874895, + "step": 4170, + "time_per_iteration": 2.689359426498413 + }, + { + "auxiliary_loss_clip": 0.01079675, + "auxiliary_loss_mlp": 0.0103756, + "balance_loss_clip": 1.03512239, + "balance_loss_mlp": 1.02215219, + "epoch": 0.2507740868781001, + "flos": 19098982967040.0, + "grad_norm": 2.432172499665017, + "language_loss": 0.76177841, + "learning_rate": 3.5101187588253368e-06, + "loss": 0.78295076, + "num_input_tokens_seen": 89891700, + "step": 4171, + "time_per_iteration": 2.589259386062622 + }, + { + "auxiliary_loss_clip": 0.01017347, + "auxiliary_loss_mlp": 0.01005427, + "balance_loss_clip": 1.0027684, + "balance_loss_mlp": 1.00338829, + "epoch": 0.25083421013076806, + "flos": 64341868296960.0, + "grad_norm": 0.8299959826738956, + "language_loss": 0.60049528, + "learning_rate": 3.509863377145458e-06, + "loss": 0.62072307, + "num_input_tokens_seen": 89955775, + "step": 4172, + "time_per_iteration": 3.1321663856506348 + }, + { + "auxiliary_loss_clip": 0.01070577, + "auxiliary_loss_mlp": 0.01042032, + "balance_loss_clip": 1.03001738, + "balance_loss_mlp": 1.02609396, + "epoch": 0.25089433338343603, + "flos": 24279599692800.0, + "grad_norm": 1.5353425116627888, + "language_loss": 0.78755713, + "learning_rate": 3.509607938211409e-06, + "loss": 0.80868328, + "num_input_tokens_seen": 89977150, + "step": 4173, + "time_per_iteration": 2.72853946685791 + }, + { + "auxiliary_loss_clip": 0.01099555, + "auxiliary_loss_mlp": 0.01041282, + "balance_loss_clip": 1.03683197, + "balance_loss_mlp": 1.02530801, + "epoch": 0.250954456636104, + "flos": 14721472477440.0, + "grad_norm": 2.0985402159153566, + "language_loss": 0.8351109, + "learning_rate": 3.509352442032875e-06, + "loss": 0.85651928, + "num_input_tokens_seen": 89994925, + "step": 4174, + "time_per_iteration": 2.7265307903289795 + }, + { + "auxiliary_loss_clip": 0.01039582, + "auxiliary_loss_mlp": 0.0104376, + "balance_loss_clip": 1.02816677, + "balance_loss_mlp": 1.02736878, + "epoch": 0.25101457988877196, + "flos": 22273593868800.0, + "grad_norm": 2.9401309586654776, + "language_loss": 0.71686614, + "learning_rate": 3.509096888619545e-06, + "loss": 0.73769957, + "num_input_tokens_seen": 90013235, + "step": 4175, + "time_per_iteration": 2.7614424228668213 + }, + { + "auxiliary_loss_clip": 0.01066813, + "auxiliary_loss_mlp": 0.01035395, + "balance_loss_clip": 1.0321312, + "balance_loss_mlp": 1.01890206, + "epoch": 0.2510747031414399, + "flos": 25188929424000.0, + "grad_norm": 1.9239902363727353, + "language_loss": 0.80418587, + "learning_rate": 3.50884127798111e-06, + "loss": 0.82520795, + "num_input_tokens_seen": 90032150, + "step": 4176, + "time_per_iteration": 2.703749656677246 + }, + { + "auxiliary_loss_clip": 0.01080553, + "auxiliary_loss_mlp": 0.01041058, + "balance_loss_clip": 1.03568602, + "balance_loss_mlp": 1.02331972, + "epoch": 0.25113482639410795, + "flos": 20704189858560.0, + "grad_norm": 2.0181353131009048, + "language_loss": 0.82445461, + "learning_rate": 3.5085856101272623e-06, + "loss": 0.8456707, + "num_input_tokens_seen": 90049085, + "step": 4177, + "time_per_iteration": 2.637443780899048 + }, + { + "auxiliary_loss_clip": 0.0106299, + "auxiliary_loss_mlp": 0.01043578, + "balance_loss_clip": 1.03498328, + "balance_loss_mlp": 1.02809834, + "epoch": 0.2511949496467759, + "flos": 21506936958720.0, + "grad_norm": 2.3073278397118666, + "language_loss": 0.82531881, + "learning_rate": 3.508329885067698e-06, + "loss": 0.84638453, + "num_input_tokens_seen": 90067695, + "step": 4178, + "time_per_iteration": 2.746147632598877 + }, + { + "auxiliary_loss_clip": 0.01090731, + "auxiliary_loss_mlp": 0.00749054, + "balance_loss_clip": 1.03025866, + "balance_loss_mlp": 1.00201297, + "epoch": 0.2512550728994439, + "flos": 20701999128960.0, + "grad_norm": 2.0692704500336268, + "language_loss": 0.75691628, + "learning_rate": 3.508074102812112e-06, + "loss": 0.77531421, + "num_input_tokens_seen": 90083890, + "step": 4179, + "time_per_iteration": 2.645827293395996 + }, + { + "auxiliary_loss_clip": 0.01054729, + "auxiliary_loss_mlp": 0.01046061, + "balance_loss_clip": 1.0290457, + "balance_loss_mlp": 1.03043306, + "epoch": 0.25131519615211184, + "flos": 18478626151680.0, + "grad_norm": 1.7824853115104835, + "language_loss": 0.700737, + "learning_rate": 3.507818263370206e-06, + "loss": 0.72174489, + "num_input_tokens_seen": 90100995, + "step": 4180, + "time_per_iteration": 2.7291226387023926 + }, + { + "auxiliary_loss_clip": 0.01095481, + "auxiliary_loss_mlp": 0.01047299, + "balance_loss_clip": 1.03378749, + "balance_loss_mlp": 1.03168249, + "epoch": 0.2513753194047798, + "flos": 20484955198080.0, + "grad_norm": 1.7789956469672226, + "language_loss": 0.86170357, + "learning_rate": 3.5075623667516796e-06, + "loss": 0.88313138, + "num_input_tokens_seen": 90120365, + "step": 4181, + "time_per_iteration": 2.658745288848877 + }, + { + "auxiliary_loss_clip": 0.01096885, + "auxiliary_loss_mlp": 0.01040959, + "balance_loss_clip": 1.03483534, + "balance_loss_mlp": 1.02585566, + "epoch": 0.25143544265744777, + "flos": 37670077704960.0, + "grad_norm": 1.8396090409592287, + "language_loss": 0.67720443, + "learning_rate": 3.507306412966238e-06, + "loss": 0.69858289, + "num_input_tokens_seen": 90142610, + "step": 4182, + "time_per_iteration": 2.7006869316101074 + }, + { + "auxiliary_loss_clip": 0.01003875, + "auxiliary_loss_mlp": 0.0100675, + "balance_loss_clip": 1.00878632, + "balance_loss_mlp": 1.00474763, + "epoch": 0.25149556591011574, + "flos": 69367457923200.0, + "grad_norm": 0.8520930654468767, + "language_loss": 0.70110798, + "learning_rate": 3.5070504020235853e-06, + "loss": 0.72121418, + "num_input_tokens_seen": 90200555, + "step": 4183, + "time_per_iteration": 3.240520477294922 + }, + { + "auxiliary_loss_clip": 0.01077705, + "auxiliary_loss_mlp": 0.01044718, + "balance_loss_clip": 1.03186131, + "balance_loss_mlp": 1.02770686, + "epoch": 0.2515556891627837, + "flos": 13990402967040.0, + "grad_norm": 1.6406259773693397, + "language_loss": 0.7415849, + "learning_rate": 3.506794333933431e-06, + "loss": 0.7628091, + "num_input_tokens_seen": 90218120, + "step": 4184, + "time_per_iteration": 2.5921218395233154 + }, + { + "auxiliary_loss_clip": 0.01087407, + "auxiliary_loss_mlp": 0.01045096, + "balance_loss_clip": 1.0346179, + "balance_loss_mlp": 1.02909851, + "epoch": 0.25161581241545167, + "flos": 22163527618560.0, + "grad_norm": 1.796329002307088, + "language_loss": 0.83284736, + "learning_rate": 3.506538208705484e-06, + "loss": 0.85417247, + "num_input_tokens_seen": 90236790, + "step": 4185, + "time_per_iteration": 2.81400728225708 + }, + { + "auxiliary_loss_clip": 0.00985628, + "auxiliary_loss_mlp": 0.01006728, + "balance_loss_clip": 1.01142824, + "balance_loss_mlp": 1.00417662, + "epoch": 0.25167593566811963, + "flos": 69358407696000.0, + "grad_norm": 0.7880518469749939, + "language_loss": 0.61539161, + "learning_rate": 3.5062820263494574e-06, + "loss": 0.63531518, + "num_input_tokens_seen": 90297070, + "step": 4186, + "time_per_iteration": 3.1732072830200195 + }, + { + "auxiliary_loss_clip": 0.01065273, + "auxiliary_loss_mlp": 0.01039782, + "balance_loss_clip": 1.03382945, + "balance_loss_mlp": 1.02324224, + "epoch": 0.2517360589207876, + "flos": 13261452359040.0, + "grad_norm": 1.9384427351770257, + "language_loss": 0.79428411, + "learning_rate": 3.5060257868750656e-06, + "loss": 0.81533468, + "num_input_tokens_seen": 90315255, + "step": 4187, + "time_per_iteration": 2.775850772857666 + }, + { + "auxiliary_loss_clip": 0.01043466, + "auxiliary_loss_mlp": 0.01055073, + "balance_loss_clip": 1.03212535, + "balance_loss_mlp": 1.03833628, + "epoch": 0.25179618217345556, + "flos": 20376828282240.0, + "grad_norm": 1.5066269292017993, + "language_loss": 0.7991218, + "learning_rate": 3.5057694902920244e-06, + "loss": 0.82010722, + "num_input_tokens_seen": 90334990, + "step": 4188, + "time_per_iteration": 2.715257167816162 + }, + { + "auxiliary_loss_clip": 0.01088079, + "auxiliary_loss_mlp": 0.01044345, + "balance_loss_clip": 1.03557467, + "balance_loss_mlp": 1.02812648, + "epoch": 0.25185630542612353, + "flos": 27664718250240.0, + "grad_norm": 1.84164396139388, + "language_loss": 0.74417973, + "learning_rate": 3.5055131366100534e-06, + "loss": 0.76550388, + "num_input_tokens_seen": 90351825, + "step": 4189, + "time_per_iteration": 2.659876823425293 + }, + { + "auxiliary_loss_clip": 0.01071224, + "auxiliary_loss_mlp": 0.01039017, + "balance_loss_clip": 1.03441429, + "balance_loss_mlp": 1.02486718, + "epoch": 0.25191642867879155, + "flos": 20996430912000.0, + "grad_norm": 1.9710279747696753, + "language_loss": 0.84372759, + "learning_rate": 3.5052567258388745e-06, + "loss": 0.86483002, + "num_input_tokens_seen": 90369860, + "step": 4190, + "time_per_iteration": 2.590183973312378 + }, + { + "auxiliary_loss_clip": 0.0106764, + "auxiliary_loss_mlp": 0.01043854, + "balance_loss_clip": 1.03115606, + "balance_loss_mlp": 1.02681959, + "epoch": 0.2519765519314595, + "flos": 21105671149440.0, + "grad_norm": 1.7788331296862665, + "language_loss": 0.75174332, + "learning_rate": 3.5050002579882082e-06, + "loss": 0.77285826, + "num_input_tokens_seen": 90389245, + "step": 4191, + "time_per_iteration": 2.7189254760742188 + }, + { + "auxiliary_loss_clip": 0.01017112, + "auxiliary_loss_mlp": 0.01008434, + "balance_loss_clip": 1.01195812, + "balance_loss_mlp": 1.00624013, + "epoch": 0.2520366751841275, + "flos": 62744993360640.0, + "grad_norm": 0.716658724434839, + "language_loss": 0.57161468, + "learning_rate": 3.5047437330677823e-06, + "loss": 0.59187013, + "num_input_tokens_seen": 90456735, + "step": 4192, + "time_per_iteration": 6.36509370803833 + }, + { + "auxiliary_loss_clip": 0.01084054, + "auxiliary_loss_mlp": 0.01043877, + "balance_loss_clip": 1.04366851, + "balance_loss_mlp": 1.02721763, + "epoch": 0.25209679843679544, + "flos": 22230716008320.0, + "grad_norm": 2.017287703280097, + "language_loss": 0.75924814, + "learning_rate": 3.504487151087323e-06, + "loss": 0.78052741, + "num_input_tokens_seen": 90474165, + "step": 4193, + "time_per_iteration": 2.64615797996521 + }, + { + "auxiliary_loss_clip": 0.01090893, + "auxiliary_loss_mlp": 0.01048693, + "balance_loss_clip": 1.03662801, + "balance_loss_mlp": 1.03307652, + "epoch": 0.2521569216894634, + "flos": 12166643773440.0, + "grad_norm": 2.014232629384664, + "language_loss": 0.83895516, + "learning_rate": 3.5042305120565598e-06, + "loss": 0.86035103, + "num_input_tokens_seen": 90491660, + "step": 4194, + "time_per_iteration": 2.634542465209961 + }, + { + "auxiliary_loss_clip": 0.01099787, + "auxiliary_loss_mlp": 0.0105021, + "balance_loss_clip": 1.03647518, + "balance_loss_mlp": 1.03550529, + "epoch": 0.2522170449421314, + "flos": 23699786353920.0, + "grad_norm": 1.4464563409910463, + "language_loss": 0.88070649, + "learning_rate": 3.5039738159852253e-06, + "loss": 0.90220648, + "num_input_tokens_seen": 90514025, + "step": 4195, + "time_per_iteration": 2.6630022525787354 + }, + { + "auxiliary_loss_clip": 0.01100175, + "auxiliary_loss_mlp": 0.01044109, + "balance_loss_clip": 1.03616881, + "balance_loss_mlp": 1.02589428, + "epoch": 0.25227716819479934, + "flos": 20955456472320.0, + "grad_norm": 2.199029743610487, + "language_loss": 0.86013031, + "learning_rate": 3.503717062883053e-06, + "loss": 0.88157308, + "num_input_tokens_seen": 90533530, + "step": 4196, + "time_per_iteration": 2.5893125534057617 + }, + { + "auxiliary_loss_clip": 0.0108715, + "auxiliary_loss_mlp": 0.01043311, + "balance_loss_clip": 1.03315139, + "balance_loss_mlp": 1.02762294, + "epoch": 0.2523372914474673, + "flos": 23331342597120.0, + "grad_norm": 1.9646671373862066, + "language_loss": 0.83352953, + "learning_rate": 3.5034602527597786e-06, + "loss": 0.85483414, + "num_input_tokens_seen": 90554025, + "step": 4197, + "time_per_iteration": 2.654261350631714 + }, + { + "auxiliary_loss_clip": 0.01090455, + "auxiliary_loss_mlp": 0.01049655, + "balance_loss_clip": 1.03577721, + "balance_loss_mlp": 1.03144002, + "epoch": 0.25239741470013527, + "flos": 36970321875840.0, + "grad_norm": 1.9523704175928251, + "language_loss": 0.7298972, + "learning_rate": 3.5032033856251405e-06, + "loss": 0.75129831, + "num_input_tokens_seen": 90576930, + "step": 4198, + "time_per_iteration": 4.333154678344727 + }, + { + "auxiliary_loss_clip": 0.01102753, + "auxiliary_loss_mlp": 0.01051888, + "balance_loss_clip": 1.03646457, + "balance_loss_mlp": 1.03522217, + "epoch": 0.25245753795280323, + "flos": 18515757836160.0, + "grad_norm": 1.7812378059444296, + "language_loss": 0.76648247, + "learning_rate": 3.50294646148888e-06, + "loss": 0.78802884, + "num_input_tokens_seen": 90595710, + "step": 4199, + "time_per_iteration": 2.543447494506836 + }, + { + "auxiliary_loss_clip": 0.01078227, + "auxiliary_loss_mlp": 0.00749131, + "balance_loss_clip": 1.03490996, + "balance_loss_mlp": 1.0019443, + "epoch": 0.2525176612054712, + "flos": 32344884737280.0, + "grad_norm": 1.7542901862351987, + "language_loss": 0.72755754, + "learning_rate": 3.502689480360739e-06, + "loss": 0.74583107, + "num_input_tokens_seen": 90617945, + "step": 4200, + "time_per_iteration": 4.284536123275757 + }, + { + "auxiliary_loss_clip": 0.01087513, + "auxiliary_loss_mlp": 0.01051978, + "balance_loss_clip": 1.03327084, + "balance_loss_mlp": 1.03691566, + "epoch": 0.25257778445813917, + "flos": 45258217459200.0, + "grad_norm": 1.5172319253626594, + "language_loss": 0.82463288, + "learning_rate": 3.5024324422504616e-06, + "loss": 0.84602785, + "num_input_tokens_seen": 90640855, + "step": 4201, + "time_per_iteration": 2.8477768898010254 + }, + { + "auxiliary_loss_clip": 0.01061034, + "auxiliary_loss_mlp": 0.01051722, + "balance_loss_clip": 1.03840613, + "balance_loss_mlp": 1.03499651, + "epoch": 0.25263790771080713, + "flos": 23367791923200.0, + "grad_norm": 1.642134152201458, + "language_loss": 0.74996364, + "learning_rate": 3.5021753471677965e-06, + "loss": 0.77109116, + "num_input_tokens_seen": 90661350, + "step": 4202, + "time_per_iteration": 2.762477397918701 + }, + { + "auxiliary_loss_clip": 0.01087543, + "auxiliary_loss_mlp": 0.01040612, + "balance_loss_clip": 1.03534126, + "balance_loss_mlp": 1.02495933, + "epoch": 0.25269803096347515, + "flos": 18515039564160.0, + "grad_norm": 1.950096884460389, + "language_loss": 0.73575181, + "learning_rate": 3.501918195122491e-06, + "loss": 0.75703335, + "num_input_tokens_seen": 90680540, + "step": 4203, + "time_per_iteration": 2.7343690395355225 + }, + { + "auxiliary_loss_clip": 0.01078199, + "auxiliary_loss_mlp": 0.01038068, + "balance_loss_clip": 1.03373396, + "balance_loss_mlp": 1.0224936, + "epoch": 0.2527581542161431, + "flos": 24610552629120.0, + "grad_norm": 1.4212258812768916, + "language_loss": 0.77266383, + "learning_rate": 3.501660986124297e-06, + "loss": 0.79382652, + "num_input_tokens_seen": 90703460, + "step": 4204, + "time_per_iteration": 2.824800491333008 + }, + { + "auxiliary_loss_clip": 0.01076669, + "auxiliary_loss_mlp": 0.01050298, + "balance_loss_clip": 1.04100323, + "balance_loss_mlp": 1.03490186, + "epoch": 0.2528182774688111, + "flos": 12641275111680.0, + "grad_norm": 2.0110518233345243, + "language_loss": 0.72473097, + "learning_rate": 3.5014037201829684e-06, + "loss": 0.74600065, + "num_input_tokens_seen": 90718815, + "step": 4205, + "time_per_iteration": 2.6526706218719482 + }, + { + "auxiliary_loss_clip": 0.01072888, + "auxiliary_loss_mlp": 0.01041683, + "balance_loss_clip": 1.03267705, + "balance_loss_mlp": 1.02688313, + "epoch": 0.25287840072147905, + "flos": 46936789879680.0, + "grad_norm": 1.411337204923265, + "language_loss": 0.75649631, + "learning_rate": 3.50114639730826e-06, + "loss": 0.77764201, + "num_input_tokens_seen": 90742125, + "step": 4206, + "time_per_iteration": 2.835476875305176 + }, + { + "auxiliary_loss_clip": 0.0105472, + "auxiliary_loss_mlp": 0.01044079, + "balance_loss_clip": 1.02908635, + "balance_loss_mlp": 1.0285697, + "epoch": 0.252938523974147, + "flos": 18879712392960.0, + "grad_norm": 1.9397344686210998, + "language_loss": 0.79320902, + "learning_rate": 3.5008890175099296e-06, + "loss": 0.814197, + "num_input_tokens_seen": 90760785, + "step": 4207, + "time_per_iteration": 2.6516754627227783 + }, + { + "auxiliary_loss_clip": 0.01084939, + "auxiliary_loss_mlp": 0.01041696, + "balance_loss_clip": 1.03501725, + "balance_loss_mlp": 1.02701545, + "epoch": 0.252998647226815, + "flos": 21434720664960.0, + "grad_norm": 1.491047232823479, + "language_loss": 0.76359928, + "learning_rate": 3.5006315807977375e-06, + "loss": 0.78486562, + "num_input_tokens_seen": 90780045, + "step": 4208, + "time_per_iteration": 2.5884695053100586 + }, + { + "auxiliary_loss_clip": 0.01083398, + "auxiliary_loss_mlp": 0.01038391, + "balance_loss_clip": 1.03338921, + "balance_loss_mlp": 1.02269089, + "epoch": 0.25305877047948294, + "flos": 25442171285760.0, + "grad_norm": 2.3323908759148377, + "language_loss": 0.70105803, + "learning_rate": 3.5003740871814456e-06, + "loss": 0.72227585, + "num_input_tokens_seen": 90797980, + "step": 4209, + "time_per_iteration": 2.623582363128662 + }, + { + "auxiliary_loss_clip": 0.01012344, + "auxiliary_loss_mlp": 0.01003624, + "balance_loss_clip": 1.00707316, + "balance_loss_mlp": 1.00159764, + "epoch": 0.2531188937321509, + "flos": 60185603629440.0, + "grad_norm": 0.7732712126385214, + "language_loss": 0.55174565, + "learning_rate": 3.5001165366708175e-06, + "loss": 0.57190531, + "num_input_tokens_seen": 90864865, + "step": 4210, + "time_per_iteration": 3.212961196899414 + }, + { + "auxiliary_loss_clip": 0.01070837, + "auxiliary_loss_mlp": 0.01035168, + "balance_loss_clip": 1.03622293, + "balance_loss_mlp": 1.02021933, + "epoch": 0.25317901698481887, + "flos": 19682387665920.0, + "grad_norm": 1.761028015751928, + "language_loss": 0.80287957, + "learning_rate": 3.4998589292756204e-06, + "loss": 0.82393956, + "num_input_tokens_seen": 90882885, + "step": 4211, + "time_per_iteration": 2.6842477321624756 + }, + { + "auxiliary_loss_clip": 0.01042627, + "auxiliary_loss_mlp": 0.01043634, + "balance_loss_clip": 1.02882576, + "balance_loss_mlp": 1.02921009, + "epoch": 0.25323914023748684, + "flos": 24424355502720.0, + "grad_norm": 1.5259195753959314, + "language_loss": 0.78074324, + "learning_rate": 3.499601265005622e-06, + "loss": 0.80160582, + "num_input_tokens_seen": 90902985, + "step": 4212, + "time_per_iteration": 2.7349660396575928 + }, + { + "auxiliary_loss_clip": 0.01084429, + "auxiliary_loss_mlp": 0.01038397, + "balance_loss_clip": 1.03219593, + "balance_loss_mlp": 1.02186251, + "epoch": 0.2532992634901548, + "flos": 25447450584960.0, + "grad_norm": 4.649863991753932, + "language_loss": 0.53408653, + "learning_rate": 3.4993435438705938e-06, + "loss": 0.55531478, + "num_input_tokens_seen": 90923550, + "step": 4213, + "time_per_iteration": 2.6222963333129883 + }, + { + "auxiliary_loss_clip": 0.01070484, + "auxiliary_loss_mlp": 0.01041688, + "balance_loss_clip": 1.03108358, + "balance_loss_mlp": 1.02497518, + "epoch": 0.25335938674282277, + "flos": 18880538405760.0, + "grad_norm": 2.188121278283469, + "language_loss": 0.64985251, + "learning_rate": 3.499085765880308e-06, + "loss": 0.67097425, + "num_input_tokens_seen": 90943260, + "step": 4214, + "time_per_iteration": 2.5894875526428223 + }, + { + "auxiliary_loss_clip": 0.01008706, + "auxiliary_loss_mlp": 0.01002598, + "balance_loss_clip": 1.00465512, + "balance_loss_mlp": 1.00053525, + "epoch": 0.25341950999549073, + "flos": 53062649936640.0, + "grad_norm": 0.8502310648344515, + "language_loss": 0.58068979, + "learning_rate": 3.4988279310445396e-06, + "loss": 0.60080278, + "num_input_tokens_seen": 90996295, + "step": 4215, + "time_per_iteration": 2.9185702800750732 + }, + { + "auxiliary_loss_clip": 0.0107791, + "auxiliary_loss_mlp": 0.01041229, + "balance_loss_clip": 1.03549123, + "balance_loss_mlp": 1.02514791, + "epoch": 0.2534796332481587, + "flos": 39020247054720.0, + "grad_norm": 1.56465464714472, + "language_loss": 0.83870125, + "learning_rate": 3.498570039373066e-06, + "loss": 0.85989267, + "num_input_tokens_seen": 91017545, + "step": 4216, + "time_per_iteration": 2.845701217651367 + }, + { + "auxiliary_loss_clip": 0.01088764, + "auxiliary_loss_mlp": 0.0104007, + "balance_loss_clip": 1.0370667, + "balance_loss_mlp": 1.02434635, + "epoch": 0.2535397565008267, + "flos": 23586990670080.0, + "grad_norm": 1.8688244894939585, + "language_loss": 0.79918104, + "learning_rate": 3.498312090875666e-06, + "loss": 0.82046938, + "num_input_tokens_seen": 91037715, + "step": 4217, + "time_per_iteration": 2.703059673309326 + }, + { + "auxiliary_loss_clip": 0.01064429, + "auxiliary_loss_mlp": 0.01038754, + "balance_loss_clip": 1.02767658, + "balance_loss_mlp": 1.02366257, + "epoch": 0.2535998797534947, + "flos": 19281373251840.0, + "grad_norm": 2.8492113331449422, + "language_loss": 0.74903309, + "learning_rate": 3.4980540855621218e-06, + "loss": 0.77006495, + "num_input_tokens_seen": 91055295, + "step": 4218, + "time_per_iteration": 2.724738359451294 + }, + { + "auxiliary_loss_clip": 0.0108597, + "auxiliary_loss_mlp": 0.01039538, + "balance_loss_clip": 1.03210449, + "balance_loss_mlp": 1.02327776, + "epoch": 0.25366000300616265, + "flos": 24024382583040.0, + "grad_norm": 1.7630986548238057, + "language_loss": 0.74588776, + "learning_rate": 3.4977960234422167e-06, + "loss": 0.76714289, + "num_input_tokens_seen": 91075485, + "step": 4219, + "time_per_iteration": 2.8078925609588623 + }, + { + "auxiliary_loss_clip": 0.01088666, + "auxiliary_loss_mlp": 0.0104824, + "balance_loss_clip": 1.03546524, + "balance_loss_mlp": 1.03182483, + "epoch": 0.2537201262588306, + "flos": 16289368116480.0, + "grad_norm": 1.8299655244633752, + "language_loss": 0.80997396, + "learning_rate": 3.497537904525736e-06, + "loss": 0.831343, + "num_input_tokens_seen": 91093620, + "step": 4220, + "time_per_iteration": 2.669975519180298 + }, + { + "auxiliary_loss_clip": 0.01050519, + "auxiliary_loss_mlp": 0.01049172, + "balance_loss_clip": 1.03230774, + "balance_loss_mlp": 1.03113556, + "epoch": 0.2537802495114986, + "flos": 23294677789440.0, + "grad_norm": 2.0267737713922918, + "language_loss": 0.70954722, + "learning_rate": 3.497279728822468e-06, + "loss": 0.73054409, + "num_input_tokens_seen": 91114110, + "step": 4221, + "time_per_iteration": 2.8157236576080322 + }, + { + "auxiliary_loss_clip": 0.01096843, + "auxiliary_loss_mlp": 0.01040075, + "balance_loss_clip": 1.0333786, + "balance_loss_mlp": 1.02393365, + "epoch": 0.25384037276416654, + "flos": 17639142416640.0, + "grad_norm": 1.648326119447625, + "language_loss": 0.61394459, + "learning_rate": 3.497021496342202e-06, + "loss": 0.63531375, + "num_input_tokens_seen": 91133135, + "step": 4222, + "time_per_iteration": 2.524116039276123 + }, + { + "auxiliary_loss_clip": 0.01091732, + "auxiliary_loss_mlp": 0.01053881, + "balance_loss_clip": 1.03718615, + "balance_loss_mlp": 1.03765643, + "epoch": 0.2539004960168345, + "flos": 21507044699520.0, + "grad_norm": 1.7196171754136746, + "language_loss": 0.7455014, + "learning_rate": 3.496763207094731e-06, + "loss": 0.76695752, + "num_input_tokens_seen": 91151805, + "step": 4223, + "time_per_iteration": 2.648899793624878 + }, + { + "auxiliary_loss_clip": 0.01045026, + "auxiliary_loss_mlp": 0.01036853, + "balance_loss_clip": 1.03225064, + "balance_loss_mlp": 1.02137923, + "epoch": 0.2539606192695025, + "flos": 23950909313280.0, + "grad_norm": 1.694619673615821, + "language_loss": 0.802001, + "learning_rate": 3.49650486108985e-06, + "loss": 0.82281983, + "num_input_tokens_seen": 91172270, + "step": 4224, + "time_per_iteration": 2.804105043411255 + }, + { + "auxiliary_loss_clip": 0.01084585, + "auxiliary_loss_mlp": 0.00748981, + "balance_loss_clip": 1.03343606, + "balance_loss_mlp": 1.00169003, + "epoch": 0.25402074252217044, + "flos": 24169784837760.0, + "grad_norm": 1.3859489308964277, + "language_loss": 0.77230346, + "learning_rate": 3.496246458337354e-06, + "loss": 0.79063916, + "num_input_tokens_seen": 91192080, + "step": 4225, + "time_per_iteration": 2.6393284797668457 + }, + { + "auxiliary_loss_clip": 0.01087486, + "auxiliary_loss_mlp": 0.01052229, + "balance_loss_clip": 1.03418219, + "balance_loss_mlp": 1.03590977, + "epoch": 0.2540808657748384, + "flos": 22303758314880.0, + "grad_norm": 1.6799233244325198, + "language_loss": 0.84459347, + "learning_rate": 3.4959879988470426e-06, + "loss": 0.86599064, + "num_input_tokens_seen": 91211450, + "step": 4226, + "time_per_iteration": 2.672074556350708 + }, + { + "auxiliary_loss_clip": 0.01094227, + "auxiliary_loss_mlp": 0.01043396, + "balance_loss_clip": 1.03308403, + "balance_loss_mlp": 1.02668262, + "epoch": 0.25414098902750637, + "flos": 27599541022080.0, + "grad_norm": 2.479329958653406, + "language_loss": 0.71063912, + "learning_rate": 3.4957294826287164e-06, + "loss": 0.73201525, + "num_input_tokens_seen": 91231835, + "step": 4227, + "time_per_iteration": 2.6512508392333984 + }, + { + "auxiliary_loss_clip": 0.01017597, + "auxiliary_loss_mlp": 0.01003598, + "balance_loss_clip": 1.00290799, + "balance_loss_mlp": 1.00163138, + "epoch": 0.25420111228017434, + "flos": 58170834887040.0, + "grad_norm": 0.9806085487550062, + "language_loss": 0.61868978, + "learning_rate": 3.4954709096921785e-06, + "loss": 0.63890171, + "num_input_tokens_seen": 91288755, + "step": 4228, + "time_per_iteration": 2.9999935626983643 + }, + { + "auxiliary_loss_clip": 0.01074481, + "auxiliary_loss_mlp": 0.01040895, + "balance_loss_clip": 1.03057003, + "balance_loss_mlp": 1.02380025, + "epoch": 0.2542612355328423, + "flos": 11464409905920.0, + "grad_norm": 2.114261791509004, + "language_loss": 0.85982788, + "learning_rate": 3.4952122800472336e-06, + "loss": 0.88098162, + "num_input_tokens_seen": 91302485, + "step": 4229, + "time_per_iteration": 2.7042653560638428 + }, + { + "auxiliary_loss_clip": 0.01056666, + "auxiliary_loss_mlp": 0.0105204, + "balance_loss_clip": 1.0318135, + "balance_loss_mlp": 1.03507674, + "epoch": 0.2543213587855103, + "flos": 22965879669120.0, + "grad_norm": 2.528779485793448, + "language_loss": 0.77128673, + "learning_rate": 3.4949535937036892e-06, + "loss": 0.79237384, + "num_input_tokens_seen": 91321120, + "step": 4230, + "time_per_iteration": 2.8684380054473877 + }, + { + "auxiliary_loss_clip": 0.01084618, + "auxiliary_loss_mlp": 0.010435, + "balance_loss_clip": 1.03245544, + "balance_loss_mlp": 1.02664411, + "epoch": 0.2543814820381783, + "flos": 18253178438400.0, + "grad_norm": 2.640812503161984, + "language_loss": 0.75233704, + "learning_rate": 3.4946948506713544e-06, + "loss": 0.77361822, + "num_input_tokens_seen": 91338575, + "step": 4231, + "time_per_iteration": 2.5856173038482666 + }, + { + "auxiliary_loss_clip": 0.01082202, + "auxiliary_loss_mlp": 0.0103952, + "balance_loss_clip": 1.03198957, + "balance_loss_mlp": 1.02381992, + "epoch": 0.25444160529084625, + "flos": 15632705629440.0, + "grad_norm": 1.7430044887483336, + "language_loss": 0.74216926, + "learning_rate": 3.4944360509600416e-06, + "loss": 0.76338649, + "num_input_tokens_seen": 91357355, + "step": 4232, + "time_per_iteration": 2.628979206085205 + }, + { + "auxiliary_loss_clip": 0.01099592, + "auxiliary_loss_mlp": 0.0104405, + "balance_loss_clip": 1.03548706, + "balance_loss_mlp": 1.02715826, + "epoch": 0.2545017285435142, + "flos": 24601610142720.0, + "grad_norm": 1.9501238121597648, + "language_loss": 0.86704993, + "learning_rate": 3.4941771945795637e-06, + "loss": 0.88848633, + "num_input_tokens_seen": 91376515, + "step": 4233, + "time_per_iteration": 2.610619306564331 + }, + { + "auxiliary_loss_clip": 0.01035993, + "auxiliary_loss_mlp": 0.01042283, + "balance_loss_clip": 1.03319323, + "balance_loss_mlp": 1.02725065, + "epoch": 0.2545618517961822, + "flos": 24679069822080.0, + "grad_norm": 1.481805055267923, + "language_loss": 0.74690253, + "learning_rate": 3.493918281539737e-06, + "loss": 0.76768523, + "num_input_tokens_seen": 91397595, + "step": 4234, + "time_per_iteration": 2.7704544067382812 + }, + { + "auxiliary_loss_clip": 0.01076553, + "auxiliary_loss_mlp": 0.0104179, + "balance_loss_clip": 1.03582835, + "balance_loss_mlp": 1.02669239, + "epoch": 0.25462197504885015, + "flos": 23915106432000.0, + "grad_norm": 2.276106450596277, + "language_loss": 0.74882215, + "learning_rate": 3.493659311850379e-06, + "loss": 0.77000558, + "num_input_tokens_seen": 91417775, + "step": 4235, + "time_per_iteration": 2.6556053161621094 + }, + { + "auxiliary_loss_clip": 0.01074678, + "auxiliary_loss_mlp": 0.00748994, + "balance_loss_clip": 1.0384376, + "balance_loss_mlp": 1.00165856, + "epoch": 0.2546820983015181, + "flos": 24789387467520.0, + "grad_norm": 28.920258978557285, + "language_loss": 0.64627063, + "learning_rate": 3.4934002855213106e-06, + "loss": 0.66450727, + "num_input_tokens_seen": 91437665, + "step": 4236, + "time_per_iteration": 2.706841230392456 + }, + { + "auxiliary_loss_clip": 0.01094544, + "auxiliary_loss_mlp": 0.01033818, + "balance_loss_clip": 1.03326893, + "balance_loss_mlp": 1.0198288, + "epoch": 0.2547422215541861, + "flos": 18734130570240.0, + "grad_norm": 1.5480326857433933, + "language_loss": 0.66888589, + "learning_rate": 3.493141202562354e-06, + "loss": 0.69016951, + "num_input_tokens_seen": 91456705, + "step": 4237, + "time_per_iteration": 2.561929225921631 + }, + { + "auxiliary_loss_clip": 0.01098302, + "auxiliary_loss_mlp": 0.01045272, + "balance_loss_clip": 1.03499973, + "balance_loss_mlp": 1.02954817, + "epoch": 0.25480234480685404, + "flos": 21032449274880.0, + "grad_norm": 1.9995772608334679, + "language_loss": 0.74601972, + "learning_rate": 3.492882062983333e-06, + "loss": 0.76745546, + "num_input_tokens_seen": 91475535, + "step": 4238, + "time_per_iteration": 2.703460216522217 + }, + { + "auxiliary_loss_clip": 0.01091259, + "auxiliary_loss_mlp": 0.01045875, + "balance_loss_clip": 1.03771305, + "balance_loss_mlp": 1.0287447, + "epoch": 0.254862468059522, + "flos": 25082167224960.0, + "grad_norm": 1.9099979537476073, + "language_loss": 0.80593193, + "learning_rate": 3.492622866794074e-06, + "loss": 0.82730329, + "num_input_tokens_seen": 91499140, + "step": 4239, + "time_per_iteration": 5.850543975830078 + }, + { + "auxiliary_loss_clip": 0.01087384, + "auxiliary_loss_mlp": 0.01044449, + "balance_loss_clip": 1.03856707, + "balance_loss_mlp": 1.02789116, + "epoch": 0.25492259131219, + "flos": 20558392554240.0, + "grad_norm": 1.822553186094856, + "language_loss": 0.77204633, + "learning_rate": 3.492363614004407e-06, + "loss": 0.79336464, + "num_input_tokens_seen": 91518335, + "step": 4240, + "time_per_iteration": 2.7311575412750244 + }, + { + "auxiliary_loss_clip": 0.01101621, + "auxiliary_loss_mlp": 0.01038262, + "balance_loss_clip": 1.03503549, + "balance_loss_mlp": 1.02096462, + "epoch": 0.25498271456485794, + "flos": 25042485674880.0, + "grad_norm": 2.450364382760227, + "language_loss": 0.83578289, + "learning_rate": 3.492104304624162e-06, + "loss": 0.85718167, + "num_input_tokens_seen": 91537655, + "step": 4241, + "time_per_iteration": 2.680002212524414 + }, + { + "auxiliary_loss_clip": 0.0108628, + "auxiliary_loss_mlp": 0.01045868, + "balance_loss_clip": 1.03332758, + "balance_loss_mlp": 1.02953672, + "epoch": 0.2550428378175259, + "flos": 26178412354560.0, + "grad_norm": 1.6260161165681617, + "language_loss": 0.73372781, + "learning_rate": 3.4918449386631725e-06, + "loss": 0.75504935, + "num_input_tokens_seen": 91557545, + "step": 4242, + "time_per_iteration": 2.7316033840179443 + }, + { + "auxiliary_loss_clip": 0.01097003, + "auxiliary_loss_mlp": 0.00748987, + "balance_loss_clip": 1.03385139, + "balance_loss_mlp": 1.00174999, + "epoch": 0.2551029610701939, + "flos": 15267170874240.0, + "grad_norm": 2.511746423771926, + "language_loss": 0.72220254, + "learning_rate": 3.491585516131273e-06, + "loss": 0.74066246, + "num_input_tokens_seen": 91574405, + "step": 4243, + "time_per_iteration": 2.6469435691833496 + }, + { + "auxiliary_loss_clip": 0.01086656, + "auxiliary_loss_mlp": 0.01042297, + "balance_loss_clip": 1.03348684, + "balance_loss_mlp": 1.02615666, + "epoch": 0.2551630843228619, + "flos": 18112193556480.0, + "grad_norm": 1.6005945205671646, + "language_loss": 0.81627524, + "learning_rate": 3.491326037038301e-06, + "loss": 0.83756471, + "num_input_tokens_seen": 91593755, + "step": 4244, + "time_per_iteration": 2.6341233253479004 + }, + { + "auxiliary_loss_clip": 0.01008249, + "auxiliary_loss_mlp": 0.01005493, + "balance_loss_clip": 1.00419497, + "balance_loss_mlp": 1.00345445, + "epoch": 0.25522320757552985, + "flos": 70520192167680.0, + "grad_norm": 0.6810210171290666, + "language_loss": 0.57735479, + "learning_rate": 3.4910665013940967e-06, + "loss": 0.59749222, + "num_input_tokens_seen": 91660335, + "step": 4245, + "time_per_iteration": 3.405109405517578 + }, + { + "auxiliary_loss_clip": 0.0109702, + "auxiliary_loss_mlp": 0.010458, + "balance_loss_clip": 1.03317618, + "balance_loss_mlp": 1.02992177, + "epoch": 0.2552833308281978, + "flos": 22893088757760.0, + "grad_norm": 2.040935404700235, + "language_loss": 0.64973319, + "learning_rate": 3.4908069092085015e-06, + "loss": 0.67116141, + "num_input_tokens_seen": 91678500, + "step": 4246, + "time_per_iteration": 4.230547666549683 + }, + { + "auxiliary_loss_clip": 0.01076317, + "auxiliary_loss_mlp": 0.01038636, + "balance_loss_clip": 1.02992284, + "balance_loss_mlp": 1.02403259, + "epoch": 0.2553434540808658, + "flos": 22053605022720.0, + "grad_norm": 1.7420341542544553, + "language_loss": 0.8156358, + "learning_rate": 3.4905472604913585e-06, + "loss": 0.83678538, + "num_input_tokens_seen": 91696430, + "step": 4247, + "time_per_iteration": 4.3282201290130615 + }, + { + "auxiliary_loss_clip": 0.01092066, + "auxiliary_loss_mlp": 0.01046078, + "balance_loss_clip": 1.03464174, + "balance_loss_mlp": 1.02799356, + "epoch": 0.25540357733353375, + "flos": 16544190176640.0, + "grad_norm": 2.6175931766152347, + "language_loss": 0.8318398, + "learning_rate": 3.490287555252514e-06, + "loss": 0.85322118, + "num_input_tokens_seen": 91713270, + "step": 4248, + "time_per_iteration": 2.6962027549743652 + }, + { + "auxiliary_loss_clip": 0.01059792, + "auxiliary_loss_mlp": 0.01040908, + "balance_loss_clip": 1.0295639, + "balance_loss_mlp": 1.02380109, + "epoch": 0.2554637005862017, + "flos": 17565022702080.0, + "grad_norm": 2.0701306551634655, + "language_loss": 0.83870435, + "learning_rate": 3.4900277935018166e-06, + "loss": 0.85971141, + "num_input_tokens_seen": 91728865, + "step": 4249, + "time_per_iteration": 2.582021951675415 + }, + { + "auxiliary_loss_clip": 0.00964126, + "auxiliary_loss_mlp": 0.01007084, + "balance_loss_clip": 1.00315452, + "balance_loss_mlp": 1.00511742, + "epoch": 0.2555238238388697, + "flos": 72244763953920.0, + "grad_norm": 0.7970282231582181, + "language_loss": 0.56323987, + "learning_rate": 3.489767975249115e-06, + "loss": 0.58295196, + "num_input_tokens_seen": 91787470, + "step": 4250, + "time_per_iteration": 3.412853240966797 + }, + { + "auxiliary_loss_clip": 0.01074252, + "auxiliary_loss_mlp": 0.0103731, + "balance_loss_clip": 1.03241646, + "balance_loss_mlp": 1.02040589, + "epoch": 0.25558394709153764, + "flos": 24389414547840.0, + "grad_norm": 1.8698700233290195, + "language_loss": 0.80959994, + "learning_rate": 3.4895081005042632e-06, + "loss": 0.83071554, + "num_input_tokens_seen": 91805640, + "step": 4251, + "time_per_iteration": 2.7846615314483643 + }, + { + "auxiliary_loss_clip": 0.00989268, + "auxiliary_loss_mlp": 0.01003228, + "balance_loss_clip": 1.00555348, + "balance_loss_mlp": 1.00128448, + "epoch": 0.2556440703442056, + "flos": 69231213636480.0, + "grad_norm": 0.7923964893205603, + "language_loss": 0.66086948, + "learning_rate": 3.4892481692771146e-06, + "loss": 0.68079442, + "num_input_tokens_seen": 91869695, + "step": 4252, + "time_per_iteration": 3.363536834716797 + }, + { + "auxiliary_loss_clip": 0.01082262, + "auxiliary_loss_mlp": 0.01033193, + "balance_loss_clip": 1.03251719, + "balance_loss_mlp": 1.01900661, + "epoch": 0.2557041935968736, + "flos": 24863902231680.0, + "grad_norm": 1.812145871556754, + "language_loss": 0.73601437, + "learning_rate": 3.4889881815775267e-06, + "loss": 0.75716895, + "num_input_tokens_seen": 91889920, + "step": 4253, + "time_per_iteration": 2.7574048042297363 + }, + { + "auxiliary_loss_clip": 0.01043737, + "auxiliary_loss_mlp": 0.01046488, + "balance_loss_clip": 1.02863264, + "balance_loss_mlp": 1.03035891, + "epoch": 0.25576431684954154, + "flos": 22492110257280.0, + "grad_norm": 2.0893733507412624, + "language_loss": 0.72705579, + "learning_rate": 3.488728137415357e-06, + "loss": 0.747958, + "num_input_tokens_seen": 91908665, + "step": 4254, + "time_per_iteration": 2.794356346130371 + }, + { + "auxiliary_loss_clip": 0.01045352, + "auxiliary_loss_mlp": 0.00749051, + "balance_loss_clip": 1.0288744, + "balance_loss_mlp": 1.00193489, + "epoch": 0.2558244401022095, + "flos": 19826748426240.0, + "grad_norm": 1.7173739512146775, + "language_loss": 0.80599463, + "learning_rate": 3.4884680368004675e-06, + "loss": 0.82393873, + "num_input_tokens_seen": 91927855, + "step": 4255, + "time_per_iteration": 2.9045166969299316 + }, + { + "auxiliary_loss_clip": 0.01072305, + "auxiliary_loss_mlp": 0.01039301, + "balance_loss_clip": 1.03230786, + "balance_loss_mlp": 1.02376223, + "epoch": 0.2558845633548775, + "flos": 23220486247680.0, + "grad_norm": 1.4779226516408699, + "language_loss": 0.85395318, + "learning_rate": 3.488207879742721e-06, + "loss": 0.87506926, + "num_input_tokens_seen": 91948500, + "step": 4256, + "time_per_iteration": 2.8705341815948486 + }, + { + "auxiliary_loss_clip": 0.01055502, + "auxiliary_loss_mlp": 0.0104718, + "balance_loss_clip": 1.03199697, + "balance_loss_mlp": 1.02982283, + "epoch": 0.2559446866075455, + "flos": 16837867774080.0, + "grad_norm": 1.6967588819039385, + "language_loss": 0.74719977, + "learning_rate": 3.4879476662519826e-06, + "loss": 0.76822662, + "num_input_tokens_seen": 91968375, + "step": 4257, + "time_per_iteration": 2.8770768642425537 + }, + { + "auxiliary_loss_clip": 0.00985319, + "auxiliary_loss_mlp": 0.01009255, + "balance_loss_clip": 1.01235831, + "balance_loss_mlp": 1.00656056, + "epoch": 0.25600480986021346, + "flos": 57593786895360.0, + "grad_norm": 0.7975925434303914, + "language_loss": 0.65256172, + "learning_rate": 3.4876873963381196e-06, + "loss": 0.67250746, + "num_input_tokens_seen": 92028490, + "step": 4258, + "time_per_iteration": 3.2729179859161377 + }, + { + "auxiliary_loss_clip": 0.0104489, + "auxiliary_loss_mlp": 0.00748908, + "balance_loss_clip": 1.03105903, + "balance_loss_mlp": 1.0020808, + "epoch": 0.2560649331128814, + "flos": 27819529868160.0, + "grad_norm": 1.6461786352426895, + "language_loss": 0.7648747, + "learning_rate": 3.4874270700110013e-06, + "loss": 0.78281271, + "num_input_tokens_seen": 92048060, + "step": 4259, + "time_per_iteration": 2.7879135608673096 + }, + { + "auxiliary_loss_clip": 0.00987451, + "auxiliary_loss_mlp": 0.01007969, + "balance_loss_clip": 1.00313401, + "balance_loss_mlp": 1.00601411, + "epoch": 0.2561250563655494, + "flos": 70950509101440.0, + "grad_norm": 0.7872974909454351, + "language_loss": 0.58384746, + "learning_rate": 3.4871666872804994e-06, + "loss": 0.60380167, + "num_input_tokens_seen": 92118180, + "step": 4260, + "time_per_iteration": 3.351181745529175 + }, + { + "auxiliary_loss_clip": 0.01087762, + "auxiliary_loss_mlp": 0.01043679, + "balance_loss_clip": 1.03504729, + "balance_loss_mlp": 1.02765775, + "epoch": 0.25618517961821735, + "flos": 27012329481600.0, + "grad_norm": 2.472762434321323, + "language_loss": 0.76810169, + "learning_rate": 3.4869062481564875e-06, + "loss": 0.78941607, + "num_input_tokens_seen": 92137570, + "step": 4261, + "time_per_iteration": 2.67094087600708 + }, + { + "auxiliary_loss_clip": 0.01094067, + "auxiliary_loss_mlp": 0.01037922, + "balance_loss_clip": 1.03466249, + "balance_loss_mlp": 1.0235455, + "epoch": 0.2562453028708853, + "flos": 23068296322560.0, + "grad_norm": 1.5426129011463463, + "language_loss": 0.82921672, + "learning_rate": 3.486645752648842e-06, + "loss": 0.85053658, + "num_input_tokens_seen": 92157625, + "step": 4262, + "time_per_iteration": 2.669355869293213 + }, + { + "auxiliary_loss_clip": 0.01093547, + "auxiliary_loss_mlp": 0.01046763, + "balance_loss_clip": 1.03896713, + "balance_loss_mlp": 1.02964461, + "epoch": 0.2563054261235533, + "flos": 15120942606720.0, + "grad_norm": 2.5569344719344804, + "language_loss": 0.74229622, + "learning_rate": 3.4863852007674405e-06, + "loss": 0.76369929, + "num_input_tokens_seen": 92175350, + "step": 4263, + "time_per_iteration": 2.6462576389312744 + }, + { + "auxiliary_loss_clip": 0.01076814, + "auxiliary_loss_mlp": 0.00748918, + "balance_loss_clip": 1.03995371, + "balance_loss_mlp": 1.00183141, + "epoch": 0.25636554937622125, + "flos": 27854865872640.0, + "grad_norm": 1.9376414256818253, + "language_loss": 0.82694775, + "learning_rate": 3.486124592522163e-06, + "loss": 0.84520507, + "num_input_tokens_seen": 92196070, + "step": 4264, + "time_per_iteration": 2.8280298709869385 + }, + { + "auxiliary_loss_clip": 0.01086862, + "auxiliary_loss_mlp": 0.0104015, + "balance_loss_clip": 1.03643739, + "balance_loss_mlp": 1.02430701, + "epoch": 0.2564256726288892, + "flos": 28906509288960.0, + "grad_norm": 1.9721099148448216, + "language_loss": 0.7436825, + "learning_rate": 3.4858639279228924e-06, + "loss": 0.76495254, + "num_input_tokens_seen": 92216310, + "step": 4265, + "time_per_iteration": 2.806913137435913 + }, + { + "auxiliary_loss_clip": 0.01060865, + "auxiliary_loss_mlp": 0.01035769, + "balance_loss_clip": 1.02949309, + "balance_loss_mlp": 1.02091527, + "epoch": 0.2564857958815572, + "flos": 18514931823360.0, + "grad_norm": 1.77355224796157, + "language_loss": 0.81606132, + "learning_rate": 3.485603206979513e-06, + "loss": 0.83702767, + "num_input_tokens_seen": 92234510, + "step": 4266, + "time_per_iteration": 2.8318116664886475 + }, + { + "auxiliary_loss_clip": 0.0104034, + "auxiliary_loss_mlp": 0.01042151, + "balance_loss_clip": 1.03050613, + "balance_loss_mlp": 1.02527142, + "epoch": 0.25654591913422514, + "flos": 25808280658560.0, + "grad_norm": 2.9008926777463113, + "language_loss": 0.79500431, + "learning_rate": 3.4853424297019103e-06, + "loss": 0.81582922, + "num_input_tokens_seen": 92254070, + "step": 4267, + "time_per_iteration": 2.8163599967956543 + }, + { + "auxiliary_loss_clip": 0.01047476, + "auxiliary_loss_mlp": 0.0104274, + "balance_loss_clip": 1.02967787, + "balance_loss_mlp": 1.02744555, + "epoch": 0.2566060423868931, + "flos": 19099665325440.0, + "grad_norm": 1.5493506065322067, + "language_loss": 0.79062891, + "learning_rate": 3.4850815960999736e-06, + "loss": 0.81153107, + "num_input_tokens_seen": 92275060, + "step": 4268, + "time_per_iteration": 2.714585781097412 + }, + { + "auxiliary_loss_clip": 0.01057864, + "auxiliary_loss_mlp": 0.00749106, + "balance_loss_clip": 1.03419757, + "balance_loss_mlp": 1.00187182, + "epoch": 0.25666616563956113, + "flos": 23842674656640.0, + "grad_norm": 5.377714665528424, + "language_loss": 0.68034387, + "learning_rate": 3.484820706183595e-06, + "loss": 0.69841355, + "num_input_tokens_seen": 92293610, + "step": 4269, + "time_per_iteration": 2.8214423656463623 + }, + { + "auxiliary_loss_clip": 0.01075958, + "auxiliary_loss_mlp": 0.01040863, + "balance_loss_clip": 1.03485131, + "balance_loss_mlp": 1.0249126, + "epoch": 0.2567262888922291, + "flos": 14604259420800.0, + "grad_norm": 2.640440917429385, + "language_loss": 0.7885707, + "learning_rate": 3.484559759962666e-06, + "loss": 0.80973887, + "num_input_tokens_seen": 92308305, + "step": 4270, + "time_per_iteration": 2.7090673446655273 + }, + { + "auxiliary_loss_clip": 0.01042903, + "auxiliary_loss_mlp": 0.01036746, + "balance_loss_clip": 1.02802622, + "balance_loss_mlp": 1.01898432, + "epoch": 0.25678641214489706, + "flos": 32923117877760.0, + "grad_norm": 5.506884532291166, + "language_loss": 0.67807239, + "learning_rate": 3.4842987574470816e-06, + "loss": 0.69886887, + "num_input_tokens_seen": 92329875, + "step": 4271, + "time_per_iteration": 2.8776907920837402 + }, + { + "auxiliary_loss_clip": 0.01085798, + "auxiliary_loss_mlp": 0.00748934, + "balance_loss_clip": 1.03328216, + "balance_loss_mlp": 1.00184691, + "epoch": 0.256846535397565, + "flos": 24098933260800.0, + "grad_norm": 1.623087116573892, + "language_loss": 0.87373477, + "learning_rate": 3.4840376986467403e-06, + "loss": 0.8920821, + "num_input_tokens_seen": 92348780, + "step": 4272, + "time_per_iteration": 2.6419389247894287 + }, + { + "auxiliary_loss_clip": 0.01079719, + "auxiliary_loss_mlp": 0.01043528, + "balance_loss_clip": 1.03843093, + "balance_loss_mlp": 1.02737534, + "epoch": 0.256906658650233, + "flos": 19718441942400.0, + "grad_norm": 1.7170191242767827, + "language_loss": 0.81882375, + "learning_rate": 3.483776583571541e-06, + "loss": 0.8400563, + "num_input_tokens_seen": 92368175, + "step": 4273, + "time_per_iteration": 2.7082481384277344 + }, + { + "auxiliary_loss_clip": 0.01045607, + "auxiliary_loss_mlp": 0.01043114, + "balance_loss_clip": 1.02633548, + "balance_loss_mlp": 1.02772439, + "epoch": 0.25696678190290095, + "flos": 22926018551040.0, + "grad_norm": 1.5273317155487711, + "language_loss": 0.76993865, + "learning_rate": 3.4835154122313846e-06, + "loss": 0.79082584, + "num_input_tokens_seen": 92387755, + "step": 4274, + "time_per_iteration": 2.7308220863342285 + }, + { + "auxiliary_loss_clip": 0.01064484, + "auxiliary_loss_mlp": 0.01037555, + "balance_loss_clip": 1.02940607, + "balance_loss_mlp": 1.02235603, + "epoch": 0.2570269051555689, + "flos": 27307838672640.0, + "grad_norm": 1.7042739003491594, + "language_loss": 0.83754897, + "learning_rate": 3.4832541846361743e-06, + "loss": 0.85856938, + "num_input_tokens_seen": 92409850, + "step": 4275, + "time_per_iteration": 2.775130033493042 + }, + { + "auxiliary_loss_clip": 0.01077476, + "auxiliary_loss_mlp": 0.01038168, + "balance_loss_clip": 1.03796411, + "balance_loss_mlp": 1.02230167, + "epoch": 0.2570870284082369, + "flos": 27563414918400.0, + "grad_norm": 2.229816066286624, + "language_loss": 0.7857452, + "learning_rate": 3.4829929007958175e-06, + "loss": 0.80690169, + "num_input_tokens_seen": 92431250, + "step": 4276, + "time_per_iteration": 2.8010029792785645 + }, + { + "auxiliary_loss_clip": 0.01086107, + "auxiliary_loss_mlp": 0.01038972, + "balance_loss_clip": 1.03521216, + "balance_loss_mlp": 1.02389193, + "epoch": 0.25714715166090485, + "flos": 28730834847360.0, + "grad_norm": 1.7587075204416585, + "language_loss": 0.78943747, + "learning_rate": 3.4827315607202214e-06, + "loss": 0.81068826, + "num_input_tokens_seen": 92452065, + "step": 4277, + "time_per_iteration": 2.854581832885742 + }, + { + "auxiliary_loss_clip": 0.01092356, + "auxiliary_loss_mlp": 0.01035801, + "balance_loss_clip": 1.03218269, + "balance_loss_mlp": 1.02155519, + "epoch": 0.2572072749135728, + "flos": 20116152305280.0, + "grad_norm": 1.8958352390765154, + "language_loss": 0.79168504, + "learning_rate": 3.482470164419295e-06, + "loss": 0.81296659, + "num_input_tokens_seen": 92470025, + "step": 4278, + "time_per_iteration": 2.7097365856170654 + }, + { + "auxiliary_loss_clip": 0.01076471, + "auxiliary_loss_mlp": 0.01035827, + "balance_loss_clip": 1.03380466, + "balance_loss_mlp": 1.02055001, + "epoch": 0.2572673981662408, + "flos": 26030855283840.0, + "grad_norm": 1.7467770680594337, + "language_loss": 0.74372017, + "learning_rate": 3.482208711902952e-06, + "loss": 0.76484317, + "num_input_tokens_seen": 92489825, + "step": 4279, + "time_per_iteration": 2.6615796089172363 + }, + { + "auxiliary_loss_clip": 0.01081924, + "auxiliary_loss_mlp": 0.01043823, + "balance_loss_clip": 1.03085721, + "balance_loss_mlp": 1.02858806, + "epoch": 0.25732752141890874, + "flos": 16106618695680.0, + "grad_norm": 3.4027722507495137, + "language_loss": 0.85323465, + "learning_rate": 3.4819472031811065e-06, + "loss": 0.87449217, + "num_input_tokens_seen": 92507270, + "step": 4280, + "time_per_iteration": 2.554797887802124 + }, + { + "auxiliary_loss_clip": 0.01083117, + "auxiliary_loss_mlp": 0.01038361, + "balance_loss_clip": 1.03093112, + "balance_loss_mlp": 1.0225302, + "epoch": 0.2573876446715767, + "flos": 22524429519360.0, + "grad_norm": 3.0526023949177925, + "language_loss": 0.79183829, + "learning_rate": 3.4816856382636744e-06, + "loss": 0.81305301, + "num_input_tokens_seen": 92526300, + "step": 4281, + "time_per_iteration": 2.618438482284546 + }, + { + "auxiliary_loss_clip": 0.01058083, + "auxiliary_loss_mlp": 0.01035036, + "balance_loss_clip": 1.03012073, + "balance_loss_mlp": 1.01921058, + "epoch": 0.2574477679242447, + "flos": 23950837486080.0, + "grad_norm": 1.67414731967321, + "language_loss": 0.87526417, + "learning_rate": 3.4814240171605737e-06, + "loss": 0.89619535, + "num_input_tokens_seen": 92546465, + "step": 4282, + "time_per_iteration": 2.7585608959198 + }, + { + "auxiliary_loss_clip": 0.01095193, + "auxiliary_loss_mlp": 0.01043217, + "balance_loss_clip": 1.03274703, + "balance_loss_mlp": 1.02796972, + "epoch": 0.2575078911769127, + "flos": 21981711951360.0, + "grad_norm": 1.8256930366126431, + "language_loss": 0.70178664, + "learning_rate": 3.4811623398817267e-06, + "loss": 0.7231707, + "num_input_tokens_seen": 92567260, + "step": 4283, + "time_per_iteration": 2.582841396331787 + }, + { + "auxiliary_loss_clip": 0.01089266, + "auxiliary_loss_mlp": 0.00748819, + "balance_loss_clip": 1.03303814, + "balance_loss_mlp": 1.00177884, + "epoch": 0.25756801442958066, + "flos": 21945406279680.0, + "grad_norm": 1.8029687002116876, + "language_loss": 0.80044317, + "learning_rate": 3.4809006064370553e-06, + "loss": 0.81882405, + "num_input_tokens_seen": 92585425, + "step": 4284, + "time_per_iteration": 2.563084125518799 + }, + { + "auxiliary_loss_clip": 0.01056744, + "auxiliary_loss_mlp": 0.01035543, + "balance_loss_clip": 1.04317832, + "balance_loss_mlp": 1.0219059, + "epoch": 0.2576281376822486, + "flos": 35261980058880.0, + "grad_norm": 1.811142751796735, + "language_loss": 0.70501953, + "learning_rate": 3.4806388168364835e-06, + "loss": 0.72594237, + "num_input_tokens_seen": 92604770, + "step": 4285, + "time_per_iteration": 2.8534276485443115 + }, + { + "auxiliary_loss_clip": 0.01072569, + "auxiliary_loss_mlp": 0.01035723, + "balance_loss_clip": 1.03318357, + "balance_loss_mlp": 1.02138233, + "epoch": 0.2576882609349166, + "flos": 14132285688960.0, + "grad_norm": 2.025512675104544, + "language_loss": 0.58475012, + "learning_rate": 3.4803769710899402e-06, + "loss": 0.60583305, + "num_input_tokens_seen": 92622635, + "step": 4286, + "time_per_iteration": 5.892765045166016 + }, + { + "auxiliary_loss_clip": 0.01087009, + "auxiliary_loss_mlp": 0.01043445, + "balance_loss_clip": 1.03469551, + "balance_loss_mlp": 1.02829981, + "epoch": 0.25774838418758456, + "flos": 23258336204160.0, + "grad_norm": 1.6628371166893352, + "language_loss": 0.64209884, + "learning_rate": 3.480115069207354e-06, + "loss": 0.66340339, + "num_input_tokens_seen": 92642960, + "step": 4287, + "time_per_iteration": 2.5573995113372803 + }, + { + "auxiliary_loss_clip": 0.01076188, + "auxiliary_loss_mlp": 0.01038193, + "balance_loss_clip": 1.03224647, + "balance_loss_mlp": 1.02201629, + "epoch": 0.2578085074402525, + "flos": 22601745544320.0, + "grad_norm": 1.8653871366462358, + "language_loss": 0.71730614, + "learning_rate": 3.4798531111986557e-06, + "loss": 0.73844999, + "num_input_tokens_seen": 92662455, + "step": 4288, + "time_per_iteration": 2.65138840675354 + }, + { + "auxiliary_loss_clip": 0.01059098, + "auxiliary_loss_mlp": 0.01034404, + "balance_loss_clip": 1.0308826, + "balance_loss_mlp": 1.02044427, + "epoch": 0.2578686306929205, + "flos": 24571840746240.0, + "grad_norm": 1.6573978441412853, + "language_loss": 0.77373719, + "learning_rate": 3.4795910970737786e-06, + "loss": 0.79467225, + "num_input_tokens_seen": 92683520, + "step": 4289, + "time_per_iteration": 2.6668784618377686 + }, + { + "auxiliary_loss_clip": 0.01090869, + "auxiliary_loss_mlp": 0.00748806, + "balance_loss_clip": 1.03143108, + "balance_loss_mlp": 1.00171113, + "epoch": 0.25792875394558845, + "flos": 18113953322880.0, + "grad_norm": 1.9704487445302534, + "language_loss": 0.85038495, + "learning_rate": 3.4793290268426592e-06, + "loss": 0.86878169, + "num_input_tokens_seen": 92701450, + "step": 4290, + "time_per_iteration": 2.546125888824463 + }, + { + "auxiliary_loss_clip": 0.01056452, + "auxiliary_loss_mlp": 0.01052051, + "balance_loss_clip": 1.03054738, + "balance_loss_mlp": 1.03389573, + "epoch": 0.2579888771982564, + "flos": 17712902995200.0, + "grad_norm": 2.177095677455465, + "language_loss": 0.72355533, + "learning_rate": 3.4790669005152354e-06, + "loss": 0.74464035, + "num_input_tokens_seen": 92720355, + "step": 4291, + "time_per_iteration": 2.6732990741729736 + }, + { + "auxiliary_loss_clip": 0.01094795, + "auxiliary_loss_mlp": 0.01037686, + "balance_loss_clip": 1.03309464, + "balance_loss_mlp": 1.02175939, + "epoch": 0.2580490004509244, + "flos": 16434878112000.0, + "grad_norm": 2.2749361531311956, + "language_loss": 0.80970168, + "learning_rate": 3.4788047181014458e-06, + "loss": 0.83102643, + "num_input_tokens_seen": 92736755, + "step": 4292, + "time_per_iteration": 2.4892208576202393 + }, + { + "auxiliary_loss_clip": 0.01096301, + "auxiliary_loss_mlp": 0.01040303, + "balance_loss_clip": 1.03507614, + "balance_loss_mlp": 1.02510405, + "epoch": 0.25810912370359235, + "flos": 33835141128960.0, + "grad_norm": 1.7483258275983526, + "language_loss": 0.67489409, + "learning_rate": 3.4785424796112337e-06, + "loss": 0.69626015, + "num_input_tokens_seen": 92757655, + "step": 4293, + "time_per_iteration": 4.209293842315674 + }, + { + "auxiliary_loss_clip": 0.01057293, + "auxiliary_loss_mlp": 0.01035914, + "balance_loss_clip": 1.0308938, + "balance_loss_mlp": 1.02197242, + "epoch": 0.2581692469562603, + "flos": 25192197561600.0, + "grad_norm": 1.7306809376409704, + "language_loss": 0.75604737, + "learning_rate": 3.478280185054542e-06, + "loss": 0.77697939, + "num_input_tokens_seen": 92776100, + "step": 4294, + "time_per_iteration": 4.2288126945495605 + }, + { + "auxiliary_loss_clip": 0.01048646, + "auxiliary_loss_mlp": 0.01052636, + "balance_loss_clip": 1.02817631, + "balance_loss_mlp": 1.03626859, + "epoch": 0.2582293702089283, + "flos": 34932212271360.0, + "grad_norm": 1.9021647675921385, + "language_loss": 0.80854851, + "learning_rate": 3.478017834441318e-06, + "loss": 0.82956135, + "num_input_tokens_seen": 92798880, + "step": 4295, + "time_per_iteration": 2.8099279403686523 + }, + { + "auxiliary_loss_clip": 0.01012407, + "auxiliary_loss_mlp": 0.01044812, + "balance_loss_clip": 1.03283262, + "balance_loss_mlp": 1.02845621, + "epoch": 0.2582894934615963, + "flos": 26833746038400.0, + "grad_norm": 1.781229672350698, + "language_loss": 0.72424865, + "learning_rate": 3.4777554277815096e-06, + "loss": 0.74482083, + "num_input_tokens_seen": 92817750, + "step": 4296, + "time_per_iteration": 3.1384730339050293 + }, + { + "auxiliary_loss_clip": 0.01046835, + "auxiliary_loss_mlp": 0.01032748, + "balance_loss_clip": 1.03647637, + "balance_loss_mlp": 1.01737618, + "epoch": 0.25834961671426426, + "flos": 23515241253120.0, + "grad_norm": 1.7553857970415754, + "language_loss": 0.86596161, + "learning_rate": 3.477492965085067e-06, + "loss": 0.88675743, + "num_input_tokens_seen": 92837995, + "step": 4297, + "time_per_iteration": 3.081211805343628 + }, + { + "auxiliary_loss_clip": 0.01095465, + "auxiliary_loss_mlp": 0.01045395, + "balance_loss_clip": 1.03461957, + "balance_loss_mlp": 1.03045821, + "epoch": 0.25840973996693223, + "flos": 22451028076800.0, + "grad_norm": 1.8989463607077817, + "language_loss": 0.84702647, + "learning_rate": 3.477230446361943e-06, + "loss": 0.86843503, + "num_input_tokens_seen": 92857245, + "step": 4298, + "time_per_iteration": 2.6047565937042236 + }, + { + "auxiliary_loss_clip": 0.01081535, + "auxiliary_loss_mlp": 0.00748806, + "balance_loss_clip": 1.03226161, + "balance_loss_mlp": 1.00164676, + "epoch": 0.2584698632196002, + "flos": 11290854366720.0, + "grad_norm": 3.7892204058842283, + "language_loss": 0.83396107, + "learning_rate": 3.4769678716220927e-06, + "loss": 0.85226452, + "num_input_tokens_seen": 92873265, + "step": 4299, + "time_per_iteration": 2.8321075439453125 + }, + { + "auxiliary_loss_clip": 0.01059853, + "auxiliary_loss_mlp": 0.01034306, + "balance_loss_clip": 1.0299629, + "balance_loss_mlp": 1.0204246, + "epoch": 0.25852998647226816, + "flos": 17929982839680.0, + "grad_norm": 2.5980094579198325, + "language_loss": 0.82787132, + "learning_rate": 3.4767052408754726e-06, + "loss": 0.84881288, + "num_input_tokens_seen": 92890880, + "step": 4300, + "time_per_iteration": 2.6102051734924316 + }, + { + "auxiliary_loss_clip": 0.0108373, + "auxiliary_loss_mlp": 0.01037709, + "balance_loss_clip": 1.03260684, + "balance_loss_mlp": 1.02255774, + "epoch": 0.2585901097249361, + "flos": 33256117889280.0, + "grad_norm": 2.096059537324636, + "language_loss": 0.67292881, + "learning_rate": 3.4764425541320417e-06, + "loss": 0.69414318, + "num_input_tokens_seen": 92910770, + "step": 4301, + "time_per_iteration": 2.73815655708313 + }, + { + "auxiliary_loss_clip": 0.01083638, + "auxiliary_loss_mlp": 0.01039589, + "balance_loss_clip": 1.03135204, + "balance_loss_mlp": 1.02435386, + "epoch": 0.2586502329776041, + "flos": 18441278985600.0, + "grad_norm": 3.794134463321372, + "language_loss": 0.81207627, + "learning_rate": 3.4761798114017617e-06, + "loss": 0.83330858, + "num_input_tokens_seen": 92929520, + "step": 4302, + "time_per_iteration": 2.650601625442505 + }, + { + "auxiliary_loss_clip": 0.01056188, + "auxiliary_loss_mlp": 0.01039468, + "balance_loss_clip": 1.03432369, + "balance_loss_mlp": 1.02462697, + "epoch": 0.25871035623027205, + "flos": 17968120104960.0, + "grad_norm": 1.7570711188507446, + "language_loss": 0.92225289, + "learning_rate": 3.475917012694595e-06, + "loss": 0.94320947, + "num_input_tokens_seen": 92947890, + "step": 4303, + "time_per_iteration": 2.8566298484802246 + }, + { + "auxiliary_loss_clip": 0.01087337, + "auxiliary_loss_mlp": 0.0103114, + "balance_loss_clip": 1.03562987, + "balance_loss_mlp": 1.016078, + "epoch": 0.25877047948294, + "flos": 27777729415680.0, + "grad_norm": 1.855612738314707, + "language_loss": 0.67285901, + "learning_rate": 3.475654158020507e-06, + "loss": 0.69404376, + "num_input_tokens_seen": 92967690, + "step": 4304, + "time_per_iteration": 2.755016326904297 + }, + { + "auxiliary_loss_clip": 0.01063266, + "auxiliary_loss_mlp": 0.01043913, + "balance_loss_clip": 1.03161192, + "balance_loss_mlp": 1.02946496, + "epoch": 0.258830602735608, + "flos": 27125843437440.0, + "grad_norm": 2.088972679394949, + "language_loss": 0.72415906, + "learning_rate": 3.4753912473894657e-06, + "loss": 0.74523085, + "num_input_tokens_seen": 92986830, + "step": 4305, + "time_per_iteration": 2.8269448280334473 + }, + { + "auxiliary_loss_clip": 0.01048686, + "auxiliary_loss_mlp": 0.00749003, + "balance_loss_clip": 1.03244662, + "balance_loss_mlp": 1.00189447, + "epoch": 0.25889072598827595, + "flos": 17891486438400.0, + "grad_norm": 2.5978175232249248, + "language_loss": 0.75752234, + "learning_rate": 3.4751282808114403e-06, + "loss": 0.77549928, + "num_input_tokens_seen": 93002740, + "step": 4306, + "time_per_iteration": 2.6485037803649902 + }, + { + "auxiliary_loss_clip": 0.01001415, + "auxiliary_loss_mlp": 0.01006401, + "balance_loss_clip": 1.00608373, + "balance_loss_mlp": 1.00445831, + "epoch": 0.2589508492409439, + "flos": 53934955724160.0, + "grad_norm": 0.8398028648112085, + "language_loss": 0.57205671, + "learning_rate": 3.474865258296403e-06, + "loss": 0.59213495, + "num_input_tokens_seen": 93058645, + "step": 4307, + "time_per_iteration": 3.199768543243408 + }, + { + "auxiliary_loss_clip": 0.01070943, + "auxiliary_loss_mlp": 0.01034285, + "balance_loss_clip": 1.03136373, + "balance_loss_mlp": 1.01987255, + "epoch": 0.2590109724936119, + "flos": 22125785402880.0, + "grad_norm": 1.6031956997033452, + "language_loss": 0.71493912, + "learning_rate": 3.474602179854327e-06, + "loss": 0.73599136, + "num_input_tokens_seen": 93077140, + "step": 4308, + "time_per_iteration": 2.8476297855377197 + }, + { + "auxiliary_loss_clip": 0.01094553, + "auxiliary_loss_mlp": 0.01039156, + "balance_loss_clip": 1.03318, + "balance_loss_mlp": 1.02390885, + "epoch": 0.2590710957462799, + "flos": 13474294398720.0, + "grad_norm": 1.904943307812678, + "language_loss": 0.84291989, + "learning_rate": 3.4743390454951886e-06, + "loss": 0.86425698, + "num_input_tokens_seen": 93093580, + "step": 4309, + "time_per_iteration": 2.7594096660614014 + }, + { + "auxiliary_loss_clip": 0.0108461, + "auxiliary_loss_mlp": 0.01040747, + "balance_loss_clip": 1.03604317, + "balance_loss_mlp": 1.02676928, + "epoch": 0.25913121899894787, + "flos": 22307098279680.0, + "grad_norm": 1.5913973454079418, + "language_loss": 0.84691012, + "learning_rate": 3.474075855228966e-06, + "loss": 0.8681637, + "num_input_tokens_seen": 93112345, + "step": 4310, + "time_per_iteration": 2.796213388442993 + }, + { + "auxiliary_loss_clip": 0.01085966, + "auxiliary_loss_mlp": 0.0103782, + "balance_loss_clip": 1.03404903, + "balance_loss_mlp": 1.02314568, + "epoch": 0.25919134225161583, + "flos": 25811728364160.0, + "grad_norm": 1.8216689606508392, + "language_loss": 0.77088439, + "learning_rate": 3.473812609065639e-06, + "loss": 0.79212224, + "num_input_tokens_seen": 93131545, + "step": 4311, + "time_per_iteration": 2.675748586654663 + }, + { + "auxiliary_loss_clip": 0.01050688, + "auxiliary_loss_mlp": 0.01037682, + "balance_loss_clip": 1.02701902, + "balance_loss_mlp": 1.02262616, + "epoch": 0.2592514655042838, + "flos": 31212262108800.0, + "grad_norm": 1.840909825415226, + "language_loss": 0.72203296, + "learning_rate": 3.4735493070151904e-06, + "loss": 0.7429167, + "num_input_tokens_seen": 93150730, + "step": 4312, + "time_per_iteration": 2.9243650436401367 + }, + { + "auxiliary_loss_clip": 0.01092898, + "auxiliary_loss_mlp": 0.01038693, + "balance_loss_clip": 1.03314161, + "balance_loss_mlp": 1.02385712, + "epoch": 0.25931158875695176, + "flos": 18474998878080.0, + "grad_norm": 1.8752072944533602, + "language_loss": 0.69802153, + "learning_rate": 3.4732859490876044e-06, + "loss": 0.71933746, + "num_input_tokens_seen": 93167895, + "step": 4313, + "time_per_iteration": 2.7163636684417725 + }, + { + "auxiliary_loss_clip": 0.01092045, + "auxiliary_loss_mlp": 0.01039418, + "balance_loss_clip": 1.03327084, + "balance_loss_mlp": 1.02618623, + "epoch": 0.2593717120096197, + "flos": 19207935895680.0, + "grad_norm": 1.8606178856452833, + "language_loss": 0.80508679, + "learning_rate": 3.473022535292867e-06, + "loss": 0.82640141, + "num_input_tokens_seen": 93187650, + "step": 4314, + "time_per_iteration": 2.7593741416931152 + }, + { + "auxiliary_loss_clip": 0.01052425, + "auxiliary_loss_mlp": 0.01043839, + "balance_loss_clip": 1.02806032, + "balance_loss_mlp": 1.02848518, + "epoch": 0.2594318352622877, + "flos": 31248100903680.0, + "grad_norm": 2.3750308967980898, + "language_loss": 0.67288619, + "learning_rate": 3.472759065640968e-06, + "loss": 0.69384885, + "num_input_tokens_seen": 93207370, + "step": 4315, + "time_per_iteration": 2.9638924598693848 + }, + { + "auxiliary_loss_clip": 0.01042448, + "auxiliary_loss_mlp": 0.01035777, + "balance_loss_clip": 1.02955246, + "balance_loss_mlp": 1.02212167, + "epoch": 0.25949195851495566, + "flos": 22237144542720.0, + "grad_norm": 1.51797165967561, + "language_loss": 0.7937839, + "learning_rate": 3.4724955401418976e-06, + "loss": 0.8145662, + "num_input_tokens_seen": 93227925, + "step": 4316, + "time_per_iteration": 2.8191702365875244 + }, + { + "auxiliary_loss_clip": 0.01046102, + "auxiliary_loss_mlp": 0.01032049, + "balance_loss_clip": 1.0301913, + "balance_loss_mlp": 1.01752925, + "epoch": 0.2595520817676236, + "flos": 28075716645120.0, + "grad_norm": 1.7076308199611785, + "language_loss": 0.77926493, + "learning_rate": 3.4722319588056487e-06, + "loss": 0.80004644, + "num_input_tokens_seen": 93250020, + "step": 4317, + "time_per_iteration": 2.74806809425354 + }, + { + "auxiliary_loss_clip": 0.01094144, + "auxiliary_loss_mlp": 0.01048596, + "balance_loss_clip": 1.03406155, + "balance_loss_mlp": 1.03372455, + "epoch": 0.2596122050202916, + "flos": 20190954378240.0, + "grad_norm": 3.4057531309630193, + "language_loss": 0.78115118, + "learning_rate": 3.4719683216422163e-06, + "loss": 0.80257857, + "num_input_tokens_seen": 93269070, + "step": 4318, + "time_per_iteration": 2.6386804580688477 + }, + { + "auxiliary_loss_clip": 0.0108911, + "auxiliary_loss_mlp": 0.01037771, + "balance_loss_clip": 1.0308665, + "balance_loss_mlp": 1.02248836, + "epoch": 0.25967232827295955, + "flos": 22527949052160.0, + "grad_norm": 2.052922902069151, + "language_loss": 0.76605988, + "learning_rate": 3.471704628661598e-06, + "loss": 0.78732872, + "num_input_tokens_seen": 93290250, + "step": 4319, + "time_per_iteration": 2.7796685695648193 + }, + { + "auxiliary_loss_clip": 0.01069066, + "auxiliary_loss_mlp": 0.01038071, + "balance_loss_clip": 1.03222847, + "balance_loss_mlp": 1.02415347, + "epoch": 0.2597324515256275, + "flos": 21068252156160.0, + "grad_norm": 1.8731123512268448, + "language_loss": 0.76512212, + "learning_rate": 3.4714408798737925e-06, + "loss": 0.78619349, + "num_input_tokens_seen": 93310090, + "step": 4320, + "time_per_iteration": 2.8881216049194336 + }, + { + "auxiliary_loss_clip": 0.01061621, + "auxiliary_loss_mlp": 0.01037158, + "balance_loss_clip": 1.0325954, + "balance_loss_mlp": 1.02248347, + "epoch": 0.2597925747782955, + "flos": 22050013662720.0, + "grad_norm": 5.590672638787276, + "language_loss": 0.71233475, + "learning_rate": 3.471177075288801e-06, + "loss": 0.7333225, + "num_input_tokens_seen": 93329570, + "step": 4321, + "time_per_iteration": 2.9336390495300293 + }, + { + "auxiliary_loss_clip": 0.01068133, + "auxiliary_loss_mlp": 0.01040176, + "balance_loss_clip": 1.03072214, + "balance_loss_mlp": 1.0236423, + "epoch": 0.2598526980309635, + "flos": 19536949497600.0, + "grad_norm": 2.6157723258872303, + "language_loss": 0.74711424, + "learning_rate": 3.4709132149166277e-06, + "loss": 0.7681973, + "num_input_tokens_seen": 93347920, + "step": 4322, + "time_per_iteration": 2.8083510398864746 + }, + { + "auxiliary_loss_clip": 0.01064572, + "auxiliary_loss_mlp": 0.01041998, + "balance_loss_clip": 1.03253961, + "balance_loss_mlp": 1.02696609, + "epoch": 0.25991282128363147, + "flos": 24495207079680.0, + "grad_norm": 2.0383485015767593, + "language_loss": 0.7361604, + "learning_rate": 3.470649298767278e-06, + "loss": 0.75722611, + "num_input_tokens_seen": 93367145, + "step": 4323, + "time_per_iteration": 2.910276412963867 + }, + { + "auxiliary_loss_clip": 0.01088061, + "auxiliary_loss_mlp": 0.00748975, + "balance_loss_clip": 1.03286397, + "balance_loss_mlp": 1.00177729, + "epoch": 0.25997294453629943, + "flos": 24201457655040.0, + "grad_norm": 27.173031677911048, + "language_loss": 0.67083502, + "learning_rate": 3.4703853268507597e-06, + "loss": 0.68920535, + "num_input_tokens_seen": 93386555, + "step": 4324, + "time_per_iteration": 2.863797426223755 + }, + { + "auxiliary_loss_clip": 0.01050781, + "auxiliary_loss_mlp": 0.01036671, + "balance_loss_clip": 1.03255558, + "balance_loss_mlp": 1.02352202, + "epoch": 0.2600330677889674, + "flos": 31431460855680.0, + "grad_norm": 2.595478551518625, + "language_loss": 0.70675683, + "learning_rate": 3.470121299177082e-06, + "loss": 0.72763133, + "num_input_tokens_seen": 93405590, + "step": 4325, + "time_per_iteration": 2.9889609813690186 + }, + { + "auxiliary_loss_clip": 0.01080614, + "auxiliary_loss_mlp": 0.01035665, + "balance_loss_clip": 1.02975368, + "balance_loss_mlp": 1.02071643, + "epoch": 0.26009319104163536, + "flos": 32266527217920.0, + "grad_norm": 1.933118491876586, + "language_loss": 0.73114496, + "learning_rate": 3.469857215756257e-06, + "loss": 0.75230771, + "num_input_tokens_seen": 93424750, + "step": 4326, + "time_per_iteration": 2.8654909133911133 + }, + { + "auxiliary_loss_clip": 0.0106273, + "auxiliary_loss_mlp": 0.00748867, + "balance_loss_clip": 1.03058398, + "balance_loss_mlp": 1.00199091, + "epoch": 0.26015331429430333, + "flos": 26286754752000.0, + "grad_norm": 1.7643883492087042, + "language_loss": 0.86957788, + "learning_rate": 3.4695930765982997e-06, + "loss": 0.88769388, + "num_input_tokens_seen": 93443465, + "step": 4327, + "time_per_iteration": 2.8639111518859863 + }, + { + "auxiliary_loss_clip": 0.01096799, + "auxiliary_loss_mlp": 0.00748842, + "balance_loss_clip": 1.03455853, + "balance_loss_mlp": 1.00171018, + "epoch": 0.2602134375469713, + "flos": 21142335957120.0, + "grad_norm": 1.5227075528698704, + "language_loss": 0.80164301, + "learning_rate": 3.4693288817132255e-06, + "loss": 0.82009947, + "num_input_tokens_seen": 93462580, + "step": 4328, + "time_per_iteration": 2.8489978313446045 + }, + { + "auxiliary_loss_clip": 0.01065185, + "auxiliary_loss_mlp": 0.00748751, + "balance_loss_clip": 1.02900505, + "balance_loss_mlp": 1.00176597, + "epoch": 0.26027356079963926, + "flos": 25921327737600.0, + "grad_norm": 1.511013563040561, + "language_loss": 0.8811627, + "learning_rate": 3.4690646311110525e-06, + "loss": 0.89930207, + "num_input_tokens_seen": 93482790, + "step": 4329, + "time_per_iteration": 2.8295435905456543 + }, + { + "auxiliary_loss_clip": 0.01088619, + "auxiliary_loss_mlp": 0.01034823, + "balance_loss_clip": 1.03099704, + "balance_loss_mlp": 1.02106619, + "epoch": 0.2603336840523072, + "flos": 26359222440960.0, + "grad_norm": 2.2341650219994595, + "language_loss": 0.77713561, + "learning_rate": 3.468800324801802e-06, + "loss": 0.79837006, + "num_input_tokens_seen": 93498795, + "step": 4330, + "time_per_iteration": 2.8824381828308105 + }, + { + "auxiliary_loss_clip": 0.01094136, + "auxiliary_loss_mlp": 0.01044025, + "balance_loss_clip": 1.03270125, + "balance_loss_mlp": 1.02920151, + "epoch": 0.2603938073049752, + "flos": 23513661054720.0, + "grad_norm": 1.3961479986388876, + "language_loss": 0.75384545, + "learning_rate": 3.4685359627954958e-06, + "loss": 0.77522707, + "num_input_tokens_seen": 93518335, + "step": 4331, + "time_per_iteration": 2.713181972503662 + }, + { + "auxiliary_loss_clip": 0.01070447, + "auxiliary_loss_mlp": 0.01040924, + "balance_loss_clip": 1.03434336, + "balance_loss_mlp": 1.02713156, + "epoch": 0.26045393055764315, + "flos": 25374300537600.0, + "grad_norm": 1.3700787787608, + "language_loss": 0.6881218, + "learning_rate": 3.4682715451021584e-06, + "loss": 0.70923555, + "num_input_tokens_seen": 93539170, + "step": 4332, + "time_per_iteration": 2.7054574489593506 + }, + { + "auxiliary_loss_clip": 0.01061146, + "auxiliary_loss_mlp": 0.01039653, + "balance_loss_clip": 1.03044057, + "balance_loss_mlp": 1.02500272, + "epoch": 0.2605140538103111, + "flos": 27635272076160.0, + "grad_norm": 1.8968443925282465, + "language_loss": 0.79534584, + "learning_rate": 3.4680070717318174e-06, + "loss": 0.8163538, + "num_input_tokens_seen": 93558480, + "step": 4333, + "time_per_iteration": 6.031085014343262 + }, + { + "auxiliary_loss_clip": 0.01089107, + "auxiliary_loss_mlp": 0.01036808, + "balance_loss_clip": 1.03128242, + "balance_loss_mlp": 1.02312279, + "epoch": 0.2605741770629791, + "flos": 13769839503360.0, + "grad_norm": 1.8213796662849995, + "language_loss": 0.80633277, + "learning_rate": 3.467742542694501e-06, + "loss": 0.82759196, + "num_input_tokens_seen": 93575220, + "step": 4334, + "time_per_iteration": 2.608891010284424 + }, + { + "auxiliary_loss_clip": 0.01068085, + "auxiliary_loss_mlp": 0.01035721, + "balance_loss_clip": 1.03034449, + "balance_loss_mlp": 1.02073622, + "epoch": 0.26063430031564705, + "flos": 26031681296640.0, + "grad_norm": 1.7052095707312447, + "language_loss": 0.7974391, + "learning_rate": 3.46747795800024e-06, + "loss": 0.81847721, + "num_input_tokens_seen": 93597015, + "step": 4335, + "time_per_iteration": 2.7931969165802 + }, + { + "auxiliary_loss_clip": 0.01006241, + "auxiliary_loss_mlp": 0.01008382, + "balance_loss_clip": 1.00199091, + "balance_loss_mlp": 1.0065701, + "epoch": 0.26069442356831507, + "flos": 62443809820800.0, + "grad_norm": 0.8370112503337188, + "language_loss": 0.60806555, + "learning_rate": 3.467213317659068e-06, + "loss": 0.6282118, + "num_input_tokens_seen": 93657775, + "step": 4336, + "time_per_iteration": 3.317962169647217 + }, + { + "auxiliary_loss_clip": 0.01061032, + "auxiliary_loss_mlp": 0.01046004, + "balance_loss_clip": 1.03152096, + "balance_loss_mlp": 1.0313772, + "epoch": 0.26075454682098304, + "flos": 13626376583040.0, + "grad_norm": 2.1292800091826756, + "language_loss": 0.771667, + "learning_rate": 3.46694862168102e-06, + "loss": 0.79273736, + "num_input_tokens_seen": 93676145, + "step": 4337, + "time_per_iteration": 2.7124710083007812 + }, + { + "auxiliary_loss_clip": 0.0107377, + "auxiliary_loss_mlp": 0.01043397, + "balance_loss_clip": 1.03263116, + "balance_loss_mlp": 1.02730417, + "epoch": 0.260814670073651, + "flos": 12126531260160.0, + "grad_norm": 2.1184033047253954, + "language_loss": 0.74039453, + "learning_rate": 3.4666838700761334e-06, + "loss": 0.76156616, + "num_input_tokens_seen": 93692480, + "step": 4338, + "time_per_iteration": 2.7271127700805664 + }, + { + "auxiliary_loss_clip": 0.01083802, + "auxiliary_loss_mlp": 0.01040987, + "balance_loss_clip": 1.03110719, + "balance_loss_mlp": 1.02538252, + "epoch": 0.26087479332631897, + "flos": 15122522805120.0, + "grad_norm": 2.2464264269625986, + "language_loss": 0.80524063, + "learning_rate": 3.466419062854447e-06, + "loss": 0.82648849, + "num_input_tokens_seen": 93710165, + "step": 4339, + "time_per_iteration": 2.6422271728515625 + }, + { + "auxiliary_loss_clip": 0.010417, + "auxiliary_loss_mlp": 0.0103679, + "balance_loss_clip": 1.02801633, + "balance_loss_mlp": 1.02322388, + "epoch": 0.26093491657898693, + "flos": 24680937329280.0, + "grad_norm": 1.6325114551968445, + "language_loss": 0.76596522, + "learning_rate": 3.4661542000260033e-06, + "loss": 0.78675008, + "num_input_tokens_seen": 93730185, + "step": 4340, + "time_per_iteration": 4.664350271224976 + }, + { + "auxiliary_loss_clip": 0.01024964, + "auxiliary_loss_mlp": 0.01040918, + "balance_loss_clip": 1.02901149, + "balance_loss_mlp": 1.02545631, + "epoch": 0.2609950398316549, + "flos": 25116138512640.0, + "grad_norm": 1.6714106493611984, + "language_loss": 0.82814741, + "learning_rate": 3.465889281600845e-06, + "loss": 0.8488062, + "num_input_tokens_seen": 93747690, + "step": 4341, + "time_per_iteration": 4.3407580852508545 + }, + { + "auxiliary_loss_clip": 0.01092981, + "auxiliary_loss_mlp": 0.01036357, + "balance_loss_clip": 1.0329951, + "balance_loss_mlp": 1.0214442, + "epoch": 0.26105516308432286, + "flos": 28548588216960.0, + "grad_norm": 1.9679358852180207, + "language_loss": 0.77142489, + "learning_rate": 3.4656243075890183e-06, + "loss": 0.79271829, + "num_input_tokens_seen": 93767405, + "step": 4342, + "time_per_iteration": 2.7739484310150146 + }, + { + "auxiliary_loss_clip": 0.01079231, + "auxiliary_loss_mlp": 0.01030965, + "balance_loss_clip": 1.03113544, + "balance_loss_mlp": 1.01563513, + "epoch": 0.2611152863369908, + "flos": 39530609447040.0, + "grad_norm": 2.357493364979128, + "language_loss": 0.66356742, + "learning_rate": 3.4653592780005707e-06, + "loss": 0.68466944, + "num_input_tokens_seen": 93789950, + "step": 4343, + "time_per_iteration": 2.7917134761810303 + }, + { + "auxiliary_loss_clip": 0.01026043, + "auxiliary_loss_mlp": 0.01041497, + "balance_loss_clip": 1.02732229, + "balance_loss_mlp": 1.02584517, + "epoch": 0.2611754095896588, + "flos": 13735329511680.0, + "grad_norm": 2.2603914705856623, + "language_loss": 0.73484981, + "learning_rate": 3.465094192845553e-06, + "loss": 0.75552523, + "num_input_tokens_seen": 93807835, + "step": 4344, + "time_per_iteration": 2.7207696437835693 + }, + { + "auxiliary_loss_clip": 0.01094662, + "auxiliary_loss_mlp": 0.01037871, + "balance_loss_clip": 1.03453422, + "balance_loss_mlp": 1.02301693, + "epoch": 0.26123553284232676, + "flos": 21506649649920.0, + "grad_norm": 2.1652052560149593, + "language_loss": 0.86462688, + "learning_rate": 3.4648290521340165e-06, + "loss": 0.88595217, + "num_input_tokens_seen": 93825670, + "step": 4345, + "time_per_iteration": 2.624692440032959 + }, + { + "auxiliary_loss_clip": 0.01065596, + "auxiliary_loss_mlp": 0.01034749, + "balance_loss_clip": 1.02915144, + "balance_loss_mlp": 1.02016401, + "epoch": 0.2612956560949947, + "flos": 21139786091520.0, + "grad_norm": 2.298112393250557, + "language_loss": 0.76147294, + "learning_rate": 3.464563855876015e-06, + "loss": 0.78247637, + "num_input_tokens_seen": 93844045, + "step": 4346, + "time_per_iteration": 2.632874011993408 + }, + { + "auxiliary_loss_clip": 0.01083985, + "auxiliary_loss_mlp": 0.01038819, + "balance_loss_clip": 1.03206289, + "balance_loss_mlp": 1.02387011, + "epoch": 0.2613557793476627, + "flos": 25119011600640.0, + "grad_norm": 1.5354451813871977, + "language_loss": 0.75700939, + "learning_rate": 3.464298604081606e-06, + "loss": 0.77823746, + "num_input_tokens_seen": 93864380, + "step": 4347, + "time_per_iteration": 2.6953141689300537 + }, + { + "auxiliary_loss_clip": 0.01054563, + "auxiliary_loss_mlp": 0.01033039, + "balance_loss_clip": 1.03147113, + "balance_loss_mlp": 1.01800656, + "epoch": 0.26141590260033065, + "flos": 26067699659520.0, + "grad_norm": 1.3215444434879637, + "language_loss": 0.73577839, + "learning_rate": 3.4640332967608476e-06, + "loss": 0.75665438, + "num_input_tokens_seen": 93885475, + "step": 4348, + "time_per_iteration": 2.753603935241699 + }, + { + "auxiliary_loss_clip": 0.01059117, + "auxiliary_loss_mlp": 0.01040416, + "balance_loss_clip": 1.03104019, + "balance_loss_mlp": 1.02574778, + "epoch": 0.2614760258529987, + "flos": 25701518459520.0, + "grad_norm": 1.767626192335294, + "language_loss": 0.90824878, + "learning_rate": 3.463767933923799e-06, + "loss": 0.92924404, + "num_input_tokens_seen": 93905545, + "step": 4349, + "time_per_iteration": 2.697484016418457 + }, + { + "auxiliary_loss_clip": 0.01079362, + "auxiliary_loss_mlp": 0.01037021, + "balance_loss_clip": 1.03285599, + "balance_loss_mlp": 1.02347291, + "epoch": 0.26153614910566664, + "flos": 17457147181440.0, + "grad_norm": 1.8471545031090348, + "language_loss": 0.80083334, + "learning_rate": 3.463502515580524e-06, + "loss": 0.82199723, + "num_input_tokens_seen": 93924185, + "step": 4350, + "time_per_iteration": 2.6594815254211426 + }, + { + "auxiliary_loss_clip": 0.01079086, + "auxiliary_loss_mlp": 0.01037243, + "balance_loss_clip": 1.03232479, + "balance_loss_mlp": 1.02340841, + "epoch": 0.2615962723583346, + "flos": 17712831168000.0, + "grad_norm": 1.8327718497580074, + "language_loss": 0.6204077, + "learning_rate": 3.4632370417410866e-06, + "loss": 0.64157099, + "num_input_tokens_seen": 93942825, + "step": 4351, + "time_per_iteration": 2.8125338554382324 + }, + { + "auxiliary_loss_clip": 0.0108321, + "auxiliary_loss_mlp": 0.01037029, + "balance_loss_clip": 1.03028464, + "balance_loss_mlp": 1.02223468, + "epoch": 0.26165639561100257, + "flos": 23257725672960.0, + "grad_norm": 1.8488094994412434, + "language_loss": 0.83791625, + "learning_rate": 3.462971512415555e-06, + "loss": 0.8591187, + "num_input_tokens_seen": 93962045, + "step": 4352, + "time_per_iteration": 2.550248146057129 + }, + { + "auxiliary_loss_clip": 0.01014293, + "auxiliary_loss_mlp": 0.01003303, + "balance_loss_clip": 1.00915956, + "balance_loss_mlp": 1.00134754, + "epoch": 0.26171651886367053, + "flos": 66737970800640.0, + "grad_norm": 0.7953970393869867, + "language_loss": 0.70552003, + "learning_rate": 3.462705927613996e-06, + "loss": 0.72569597, + "num_input_tokens_seen": 94021175, + "step": 4353, + "time_per_iteration": 3.0251171588897705 + }, + { + "auxiliary_loss_clip": 0.01054661, + "auxiliary_loss_mlp": 0.0104729, + "balance_loss_clip": 1.02588129, + "balance_loss_mlp": 1.03026748, + "epoch": 0.2617766421163385, + "flos": 22349581090560.0, + "grad_norm": 1.6379596344803344, + "language_loss": 0.77480507, + "learning_rate": 3.4624402873464816e-06, + "loss": 0.79582459, + "num_input_tokens_seen": 94043370, + "step": 4354, + "time_per_iteration": 2.630500078201294 + }, + { + "auxiliary_loss_clip": 0.01030932, + "auxiliary_loss_mlp": 0.01050309, + "balance_loss_clip": 1.02667904, + "balance_loss_mlp": 1.03436518, + "epoch": 0.26183676536900646, + "flos": 26067125041920.0, + "grad_norm": 3.2399572333301956, + "language_loss": 0.68404412, + "learning_rate": 3.462174591623085e-06, + "loss": 0.70485657, + "num_input_tokens_seen": 94063510, + "step": 4355, + "time_per_iteration": 2.7289211750030518 + }, + { + "auxiliary_loss_clip": 0.01047213, + "auxiliary_loss_mlp": 0.01036099, + "balance_loss_clip": 1.03263509, + "balance_loss_mlp": 1.01924896, + "epoch": 0.26189688862167443, + "flos": 20996466825600.0, + "grad_norm": 1.9452820371925839, + "language_loss": 0.66992861, + "learning_rate": 3.4619088404538815e-06, + "loss": 0.69076169, + "num_input_tokens_seen": 94083865, + "step": 4356, + "time_per_iteration": 2.7614798545837402 + }, + { + "auxiliary_loss_clip": 0.01006888, + "auxiliary_loss_mlp": 0.01002734, + "balance_loss_clip": 1.00161099, + "balance_loss_mlp": 1.00091004, + "epoch": 0.2619570118743424, + "flos": 65798261141760.0, + "grad_norm": 0.6915667764712592, + "language_loss": 0.53151798, + "learning_rate": 3.4616430338489487e-06, + "loss": 0.55161422, + "num_input_tokens_seen": 94144095, + "step": 4357, + "time_per_iteration": 3.108074188232422 + }, + { + "auxiliary_loss_clip": 0.01081136, + "auxiliary_loss_mlp": 0.01042997, + "balance_loss_clip": 1.03191864, + "balance_loss_mlp": 1.02795827, + "epoch": 0.26201713512701036, + "flos": 28766817296640.0, + "grad_norm": 1.6994530469956055, + "language_loss": 0.84154809, + "learning_rate": 3.4613771718183654e-06, + "loss": 0.86278939, + "num_input_tokens_seen": 94163035, + "step": 4358, + "time_per_iteration": 2.7317492961883545 + }, + { + "auxiliary_loss_clip": 0.01069613, + "auxiliary_loss_mlp": 0.01041267, + "balance_loss_clip": 1.0298934, + "balance_loss_mlp": 1.02426803, + "epoch": 0.2620772583796783, + "flos": 26432516142720.0, + "grad_norm": 3.4047808683423755, + "language_loss": 0.6652292, + "learning_rate": 3.4611112543722127e-06, + "loss": 0.68633807, + "num_input_tokens_seen": 94182520, + "step": 4359, + "time_per_iteration": 2.654468297958374 + }, + { + "auxiliary_loss_clip": 0.01064208, + "auxiliary_loss_mlp": 0.01036435, + "balance_loss_clip": 1.02832115, + "balance_loss_mlp": 1.02185583, + "epoch": 0.2621373816323463, + "flos": 20156552127360.0, + "grad_norm": 2.2883657447126646, + "language_loss": 0.78398985, + "learning_rate": 3.4608452815205757e-06, + "loss": 0.80499625, + "num_input_tokens_seen": 94201795, + "step": 4360, + "time_per_iteration": 2.7014665603637695 + }, + { + "auxiliary_loss_clip": 0.01066117, + "auxiliary_loss_mlp": 0.0103817, + "balance_loss_clip": 1.02989018, + "balance_loss_mlp": 1.0242821, + "epoch": 0.26219750488501425, + "flos": 28621235473920.0, + "grad_norm": 2.1141828955564037, + "language_loss": 0.682585, + "learning_rate": 3.4605792532735387e-06, + "loss": 0.70362782, + "num_input_tokens_seen": 94222390, + "step": 4361, + "time_per_iteration": 2.6876087188720703 + }, + { + "auxiliary_loss_clip": 0.01084239, + "auxiliary_loss_mlp": 0.01047059, + "balance_loss_clip": 1.03161573, + "balance_loss_mlp": 1.03156149, + "epoch": 0.2622576281376823, + "flos": 15042549173760.0, + "grad_norm": 1.8585806298531302, + "language_loss": 0.84288454, + "learning_rate": 3.46031316964119e-06, + "loss": 0.86419761, + "num_input_tokens_seen": 94239980, + "step": 4362, + "time_per_iteration": 2.7339916229248047 + }, + { + "auxiliary_loss_clip": 0.01053908, + "auxiliary_loss_mlp": 0.010442, + "balance_loss_clip": 1.0307436, + "balance_loss_mlp": 1.02813005, + "epoch": 0.26231775139035024, + "flos": 26396174557440.0, + "grad_norm": 1.6190525016941253, + "language_loss": 0.65397489, + "learning_rate": 3.4600470306336197e-06, + "loss": 0.67495596, + "num_input_tokens_seen": 94260715, + "step": 4363, + "time_per_iteration": 2.7293999195098877 + }, + { + "auxiliary_loss_clip": 0.01003001, + "auxiliary_loss_mlp": 0.01002245, + "balance_loss_clip": 1.0079993, + "balance_loss_mlp": 1.00025463, + "epoch": 0.2623778746430182, + "flos": 65408918647680.0, + "grad_norm": 0.8907929053719685, + "language_loss": 0.61098611, + "learning_rate": 3.4597808362609194e-06, + "loss": 0.63103861, + "num_input_tokens_seen": 94321285, + "step": 4364, + "time_per_iteration": 3.384305715560913 + }, + { + "auxiliary_loss_clip": 0.010982, + "auxiliary_loss_mlp": 0.0104993, + "balance_loss_clip": 1.03615022, + "balance_loss_mlp": 1.03331184, + "epoch": 0.26243799789568617, + "flos": 12604215254400.0, + "grad_norm": 2.4471821518210346, + "language_loss": 0.71558589, + "learning_rate": 3.459514586533184e-06, + "loss": 0.7370671, + "num_input_tokens_seen": 94335420, + "step": 4365, + "time_per_iteration": 2.6165595054626465 + }, + { + "auxiliary_loss_clip": 0.01068531, + "auxiliary_loss_mlp": 0.00748917, + "balance_loss_clip": 1.03373063, + "balance_loss_mlp": 1.00187445, + "epoch": 0.26249812114835414, + "flos": 28623821253120.0, + "grad_norm": 1.558918838735746, + "language_loss": 0.7710557, + "learning_rate": 3.459248281460509e-06, + "loss": 0.78923023, + "num_input_tokens_seen": 94357440, + "step": 4366, + "time_per_iteration": 2.6905078887939453 + }, + { + "auxiliary_loss_clip": 0.01095561, + "auxiliary_loss_mlp": 0.01041784, + "balance_loss_clip": 1.03522515, + "balance_loss_mlp": 1.027771, + "epoch": 0.2625582444010221, + "flos": 14465393441280.0, + "grad_norm": 1.545323761223063, + "language_loss": 0.75803089, + "learning_rate": 3.4589819210529927e-06, + "loss": 0.77940434, + "num_input_tokens_seen": 94375690, + "step": 4367, + "time_per_iteration": 2.621701240539551 + }, + { + "auxiliary_loss_clip": 0.01080712, + "auxiliary_loss_mlp": 0.01035901, + "balance_loss_clip": 1.03229117, + "balance_loss_mlp": 1.02178645, + "epoch": 0.26261836765369007, + "flos": 16613174246400.0, + "grad_norm": 2.9133685710272044, + "language_loss": 0.6963203, + "learning_rate": 3.458715505320736e-06, + "loss": 0.71748638, + "num_input_tokens_seen": 94393190, + "step": 4368, + "time_per_iteration": 2.651745557785034 + }, + { + "auxiliary_loss_clip": 0.01071292, + "auxiliary_loss_mlp": 0.01043377, + "balance_loss_clip": 1.03214049, + "balance_loss_mlp": 1.02787924, + "epoch": 0.26267849090635803, + "flos": 20519932066560.0, + "grad_norm": 1.7109948903622574, + "language_loss": 0.78692436, + "learning_rate": 3.458449034273841e-06, + "loss": 0.80807102, + "num_input_tokens_seen": 94410975, + "step": 4369, + "time_per_iteration": 2.6014747619628906 + }, + { + "auxiliary_loss_clip": 0.01073282, + "auxiliary_loss_mlp": 0.01042069, + "balance_loss_clip": 1.03615117, + "balance_loss_mlp": 1.02691734, + "epoch": 0.262738614159026, + "flos": 21323936142720.0, + "grad_norm": 1.9825589337362628, + "language_loss": 0.83203846, + "learning_rate": 3.4581825079224133e-06, + "loss": 0.85319197, + "num_input_tokens_seen": 94429985, + "step": 4370, + "time_per_iteration": 2.589517593383789 + }, + { + "auxiliary_loss_clip": 0.01080094, + "auxiliary_loss_mlp": 0.01044781, + "balance_loss_clip": 1.03171444, + "balance_loss_mlp": 1.02781725, + "epoch": 0.26279873741169396, + "flos": 17603590930560.0, + "grad_norm": 1.7230637448799297, + "language_loss": 0.7090596, + "learning_rate": 3.4579159262765575e-06, + "loss": 0.73030841, + "num_input_tokens_seen": 94448660, + "step": 4371, + "time_per_iteration": 2.6664047241210938 + }, + { + "auxiliary_loss_clip": 0.01019484, + "auxiliary_loss_mlp": 0.01003873, + "balance_loss_clip": 1.00419211, + "balance_loss_mlp": 1.00202489, + "epoch": 0.2628588606643619, + "flos": 60949746587520.0, + "grad_norm": 0.9572299419690689, + "language_loss": 0.56432444, + "learning_rate": 3.457649289346384e-06, + "loss": 0.58455795, + "num_input_tokens_seen": 94515630, + "step": 4372, + "time_per_iteration": 3.2542147636413574 + }, + { + "auxiliary_loss_clip": 0.01070628, + "auxiliary_loss_mlp": 0.0103468, + "balance_loss_clip": 1.03272867, + "balance_loss_mlp": 1.02035689, + "epoch": 0.2629189839170299, + "flos": 27016315891200.0, + "grad_norm": 1.8776847547819457, + "language_loss": 0.77458704, + "learning_rate": 3.4573825971420042e-06, + "loss": 0.79564011, + "num_input_tokens_seen": 94535385, + "step": 4373, + "time_per_iteration": 2.740720272064209 + }, + { + "auxiliary_loss_clip": 0.01058115, + "auxiliary_loss_mlp": 0.01038389, + "balance_loss_clip": 1.03212905, + "balance_loss_mlp": 1.02450705, + "epoch": 0.26297910716969786, + "flos": 17019863009280.0, + "grad_norm": 2.0409496215393057, + "language_loss": 0.72444946, + "learning_rate": 3.4571158496735294e-06, + "loss": 0.7454145, + "num_input_tokens_seen": 94552650, + "step": 4374, + "time_per_iteration": 2.624418258666992 + }, + { + "auxiliary_loss_clip": 0.01076302, + "auxiliary_loss_mlp": 0.01042772, + "balance_loss_clip": 1.03823304, + "balance_loss_mlp": 1.02674997, + "epoch": 0.2630392304223659, + "flos": 24897370728960.0, + "grad_norm": 2.260876303719953, + "language_loss": 0.80824077, + "learning_rate": 3.4568490469510756e-06, + "loss": 0.82943147, + "num_input_tokens_seen": 94574075, + "step": 4375, + "time_per_iteration": 2.6657745838165283 + }, + { + "auxiliary_loss_clip": 0.01061962, + "auxiliary_loss_mlp": 0.01036192, + "balance_loss_clip": 1.0283041, + "balance_loss_mlp": 1.02239323, + "epoch": 0.26309935367503384, + "flos": 32854026067200.0, + "grad_norm": 1.656759821537865, + "language_loss": 0.65959662, + "learning_rate": 3.4565821889847603e-06, + "loss": 0.68057817, + "num_input_tokens_seen": 94594255, + "step": 4376, + "time_per_iteration": 2.8101375102996826 + }, + { + "auxiliary_loss_clip": 0.0103939, + "auxiliary_loss_mlp": 0.01044968, + "balance_loss_clip": 1.02764273, + "balance_loss_mlp": 1.02979243, + "epoch": 0.2631594769277018, + "flos": 15887958652800.0, + "grad_norm": 2.067013818024078, + "language_loss": 0.69623572, + "learning_rate": 3.4563152757847026e-06, + "loss": 0.71707928, + "num_input_tokens_seen": 94611410, + "step": 4377, + "time_per_iteration": 2.672743797302246 + }, + { + "auxiliary_loss_clip": 0.01083798, + "auxiliary_loss_mlp": 0.01037736, + "balance_loss_clip": 1.03345466, + "balance_loss_mlp": 1.02327597, + "epoch": 0.2632196001803698, + "flos": 50804943557760.0, + "grad_norm": 1.653519038668073, + "language_loss": 0.78955424, + "learning_rate": 3.4560483073610233e-06, + "loss": 0.8107695, + "num_input_tokens_seen": 94636575, + "step": 4378, + "time_per_iteration": 2.882444381713867 + }, + { + "auxiliary_loss_clip": 0.0106894, + "auxiliary_loss_mlp": 0.01047278, + "balance_loss_clip": 1.03122663, + "balance_loss_mlp": 1.03397477, + "epoch": 0.26327972343303774, + "flos": 13733031041280.0, + "grad_norm": 2.0370851827743035, + "language_loss": 0.76947445, + "learning_rate": 3.455781283723846e-06, + "loss": 0.7906366, + "num_input_tokens_seen": 94654345, + "step": 4379, + "time_per_iteration": 2.739013910293579 + }, + { + "auxiliary_loss_clip": 0.01063881, + "auxiliary_loss_mlp": 0.0103682, + "balance_loss_clip": 1.03296125, + "balance_loss_mlp": 1.02054775, + "epoch": 0.2633398466857057, + "flos": 23769057732480.0, + "grad_norm": 2.2345068699937323, + "language_loss": 0.77755511, + "learning_rate": 3.4555142048832975e-06, + "loss": 0.79856211, + "num_input_tokens_seen": 94673985, + "step": 4380, + "time_per_iteration": 5.938634634017944 + }, + { + "auxiliary_loss_clip": 0.01065735, + "auxiliary_loss_mlp": 0.01040516, + "balance_loss_clip": 1.02885461, + "balance_loss_mlp": 1.02576351, + "epoch": 0.26339996993837367, + "flos": 27600223380480.0, + "grad_norm": 2.078663074650848, + "language_loss": 0.63787359, + "learning_rate": 3.4552470708495036e-06, + "loss": 0.65893614, + "num_input_tokens_seen": 94693145, + "step": 4381, + "time_per_iteration": 2.7611308097839355 + }, + { + "auxiliary_loss_clip": 0.0107916, + "auxiliary_loss_mlp": 0.01037716, + "balance_loss_clip": 1.02980959, + "balance_loss_mlp": 1.02390528, + "epoch": 0.26346009319104163, + "flos": 16946317912320.0, + "grad_norm": 1.8673147514705528, + "language_loss": 0.82583052, + "learning_rate": 3.454979881632595e-06, + "loss": 0.84699929, + "num_input_tokens_seen": 94710185, + "step": 4382, + "time_per_iteration": 2.5532684326171875 + }, + { + "auxiliary_loss_clip": 0.01054209, + "auxiliary_loss_mlp": 0.01042913, + "balance_loss_clip": 1.02960587, + "balance_loss_mlp": 1.02735603, + "epoch": 0.2635202164437096, + "flos": 37232218915200.0, + "grad_norm": 2.16959842466616, + "language_loss": 0.6996721, + "learning_rate": 3.4547126372427035e-06, + "loss": 0.72064334, + "num_input_tokens_seen": 94730280, + "step": 4383, + "time_per_iteration": 2.8844525814056396 + }, + { + "auxiliary_loss_clip": 0.01076864, + "auxiliary_loss_mlp": 0.01040613, + "balance_loss_clip": 1.03069103, + "balance_loss_mlp": 1.02680862, + "epoch": 0.26358033969637756, + "flos": 20996359084800.0, + "grad_norm": 2.071667380986444, + "language_loss": 0.69185913, + "learning_rate": 3.4544453376899638e-06, + "loss": 0.71303397, + "num_input_tokens_seen": 94748560, + "step": 4384, + "time_per_iteration": 2.6183018684387207 + }, + { + "auxiliary_loss_clip": 0.01077388, + "auxiliary_loss_mlp": 0.01038744, + "balance_loss_clip": 1.02898753, + "balance_loss_mlp": 1.02470136, + "epoch": 0.26364046294904553, + "flos": 27746092512000.0, + "grad_norm": 2.682016192357911, + "language_loss": 0.69887161, + "learning_rate": 3.45417798298451e-06, + "loss": 0.72003293, + "num_input_tokens_seen": 94767570, + "step": 4385, + "time_per_iteration": 2.80924916267395 + }, + { + "auxiliary_loss_clip": 0.01055013, + "auxiliary_loss_mlp": 0.010463, + "balance_loss_clip": 1.03012204, + "balance_loss_mlp": 1.03163195, + "epoch": 0.2637005862017135, + "flos": 22893088757760.0, + "grad_norm": 1.8419937412559657, + "language_loss": 0.85302055, + "learning_rate": 3.453910573136482e-06, + "loss": 0.87403375, + "num_input_tokens_seen": 94784985, + "step": 4386, + "time_per_iteration": 2.697143793106079 + }, + { + "auxiliary_loss_clip": 0.01070333, + "auxiliary_loss_mlp": 0.01039001, + "balance_loss_clip": 1.03092909, + "balance_loss_mlp": 1.02454126, + "epoch": 0.26376070945438146, + "flos": 15048834053760.0, + "grad_norm": 2.155713534597705, + "language_loss": 0.76914287, + "learning_rate": 3.4536431081560196e-06, + "loss": 0.79023623, + "num_input_tokens_seen": 94802545, + "step": 4387, + "time_per_iteration": 4.265554904937744 + }, + { + "auxiliary_loss_clip": 0.01080593, + "auxiliary_loss_mlp": 0.01038247, + "balance_loss_clip": 1.03402472, + "balance_loss_mlp": 1.02395999, + "epoch": 0.2638208327070494, + "flos": 21141833166720.0, + "grad_norm": 2.1591021647742497, + "language_loss": 0.76007998, + "learning_rate": 3.453375588053264e-06, + "loss": 0.78126842, + "num_input_tokens_seen": 94820730, + "step": 4388, + "time_per_iteration": 4.5406413078308105 + }, + { + "auxiliary_loss_clip": 0.01091193, + "auxiliary_loss_mlp": 0.01032414, + "balance_loss_clip": 1.03186619, + "balance_loss_mlp": 1.01800156, + "epoch": 0.26388095595971744, + "flos": 21725597001600.0, + "grad_norm": 2.195453042536923, + "language_loss": 0.8644048, + "learning_rate": 3.4531080128383617e-06, + "loss": 0.88564086, + "num_input_tokens_seen": 94839175, + "step": 4389, + "time_per_iteration": 2.532653570175171 + }, + { + "auxiliary_loss_clip": 0.01010056, + "auxiliary_loss_mlp": 0.01007461, + "balance_loss_clip": 1.00423717, + "balance_loss_mlp": 1.0055536, + "epoch": 0.2639410792123854, + "flos": 65515537192320.0, + "grad_norm": 0.8077854596030595, + "language_loss": 0.60283059, + "learning_rate": 3.452840382521457e-06, + "loss": 0.62300581, + "num_input_tokens_seen": 94898865, + "step": 4390, + "time_per_iteration": 3.1646289825439453 + }, + { + "auxiliary_loss_clip": 0.01069314, + "auxiliary_loss_mlp": 0.01033188, + "balance_loss_clip": 1.0296737, + "balance_loss_mlp": 1.01778054, + "epoch": 0.2640012024650534, + "flos": 23948574929280.0, + "grad_norm": 1.6239970922165905, + "language_loss": 0.77732158, + "learning_rate": 3.4525726971127e-06, + "loss": 0.79834658, + "num_input_tokens_seen": 94917490, + "step": 4391, + "time_per_iteration": 2.639099597930908 + }, + { + "auxiliary_loss_clip": 0.00987542, + "auxiliary_loss_mlp": 0.00748179, + "balance_loss_clip": 1.00292635, + "balance_loss_mlp": 1.00178993, + "epoch": 0.26406132571772134, + "flos": 56441163369600.0, + "grad_norm": 0.8915690589200455, + "language_loss": 0.58664155, + "learning_rate": 3.45230495662224e-06, + "loss": 0.60399878, + "num_input_tokens_seen": 94969065, + "step": 4392, + "time_per_iteration": 3.310913324356079 + }, + { + "auxiliary_loss_clip": 0.01079447, + "auxiliary_loss_mlp": 0.01042539, + "balance_loss_clip": 1.0318923, + "balance_loss_mlp": 1.0279299, + "epoch": 0.2641214489703893, + "flos": 22090557139200.0, + "grad_norm": 1.809689673897017, + "language_loss": 0.68712974, + "learning_rate": 3.4520371610602306e-06, + "loss": 0.70834965, + "num_input_tokens_seen": 94988540, + "step": 4393, + "time_per_iteration": 2.6058826446533203 + }, + { + "auxiliary_loss_clip": 0.01085248, + "auxiliary_loss_mlp": 0.01039741, + "balance_loss_clip": 1.03328121, + "balance_loss_mlp": 1.02417231, + "epoch": 0.26418157222305727, + "flos": 16544764794240.0, + "grad_norm": 2.638523057229543, + "language_loss": 0.83859307, + "learning_rate": 3.4517693104368267e-06, + "loss": 0.85984302, + "num_input_tokens_seen": 95004810, + "step": 4394, + "time_per_iteration": 2.666170597076416 + }, + { + "auxiliary_loss_clip": 0.01068642, + "auxiliary_loss_mlp": 0.01043675, + "balance_loss_clip": 1.0313015, + "balance_loss_mlp": 1.02629423, + "epoch": 0.26424169547572524, + "flos": 18002486442240.0, + "grad_norm": 2.138581875283665, + "language_loss": 0.69884002, + "learning_rate": 3.4515014047621856e-06, + "loss": 0.71996313, + "num_input_tokens_seen": 95024085, + "step": 4395, + "time_per_iteration": 2.586648464202881 + }, + { + "auxiliary_loss_clip": 0.01055925, + "auxiliary_loss_mlp": 0.01030282, + "balance_loss_clip": 1.02868426, + "balance_loss_mlp": 1.01508236, + "epoch": 0.2643018187283932, + "flos": 16983162288000.0, + "grad_norm": 2.167606747595536, + "language_loss": 0.86708754, + "learning_rate": 3.4512334440464655e-06, + "loss": 0.88794965, + "num_input_tokens_seen": 95042515, + "step": 4396, + "time_per_iteration": 2.6820030212402344 + }, + { + "auxiliary_loss_clip": 0.0096822, + "auxiliary_loss_mlp": 0.01022058, + "balance_loss_clip": 1.00383723, + "balance_loss_mlp": 1.01978076, + "epoch": 0.26436194198106117, + "flos": 59664359416320.0, + "grad_norm": 0.7968273277652504, + "language_loss": 0.55039734, + "learning_rate": 3.4509654282998277e-06, + "loss": 0.57030016, + "num_input_tokens_seen": 95094835, + "step": 4397, + "time_per_iteration": 3.1380274295806885 + }, + { + "auxiliary_loss_clip": 0.01078659, + "auxiliary_loss_mlp": 0.01043943, + "balance_loss_clip": 1.03084755, + "balance_loss_mlp": 1.02942324, + "epoch": 0.26442206523372913, + "flos": 32921322197760.0, + "grad_norm": 2.102808745902709, + "language_loss": 0.77707386, + "learning_rate": 3.450697357532435e-06, + "loss": 0.79829979, + "num_input_tokens_seen": 95113480, + "step": 4398, + "time_per_iteration": 3.1596789360046387 + }, + { + "auxiliary_loss_clip": 0.01086665, + "auxiliary_loss_mlp": 0.01034862, + "balance_loss_clip": 1.03632081, + "balance_loss_mlp": 1.01972198, + "epoch": 0.2644821884863971, + "flos": 21031300039680.0, + "grad_norm": 1.8738368225671953, + "language_loss": 0.6724633, + "learning_rate": 3.4504292317544534e-06, + "loss": 0.69367862, + "num_input_tokens_seen": 95132580, + "step": 4399, + "time_per_iteration": 2.691957950592041 + }, + { + "auxiliary_loss_clip": 0.01048555, + "auxiliary_loss_mlp": 0.0103621, + "balance_loss_clip": 1.03228772, + "balance_loss_mlp": 1.02179193, + "epoch": 0.26454231173906506, + "flos": 20776801201920.0, + "grad_norm": 1.9605470981210753, + "language_loss": 0.86401379, + "learning_rate": 3.4501610509760504e-06, + "loss": 0.88486141, + "num_input_tokens_seen": 95152375, + "step": 4400, + "time_per_iteration": 2.779716968536377 + }, + { + "auxiliary_loss_clip": 0.01064537, + "auxiliary_loss_mlp": 0.01033275, + "balance_loss_clip": 1.02970672, + "balance_loss_mlp": 1.01758742, + "epoch": 0.264602434991733, + "flos": 16618669027200.0, + "grad_norm": 2.243867176143178, + "language_loss": 0.75837803, + "learning_rate": 3.4498928152073944e-06, + "loss": 0.77935612, + "num_input_tokens_seen": 95170265, + "step": 4401, + "time_per_iteration": 2.7481181621551514 + }, + { + "auxiliary_loss_clip": 0.01051998, + "auxiliary_loss_mlp": 0.01042607, + "balance_loss_clip": 1.03092325, + "balance_loss_mlp": 1.02639461, + "epoch": 0.26466255824440105, + "flos": 19062677295360.0, + "grad_norm": 1.7706540817655476, + "language_loss": 0.878905, + "learning_rate": 3.4496245244586577e-06, + "loss": 0.89985108, + "num_input_tokens_seen": 95188655, + "step": 4402, + "time_per_iteration": 2.7202324867248535 + }, + { + "auxiliary_loss_clip": 0.01058901, + "auxiliary_loss_mlp": 0.01037925, + "balance_loss_clip": 1.03281212, + "balance_loss_mlp": 1.0222013, + "epoch": 0.264722681497069, + "flos": 22638554006400.0, + "grad_norm": 1.8144803013431496, + "language_loss": 0.78074884, + "learning_rate": 3.4493561787400137e-06, + "loss": 0.80171716, + "num_input_tokens_seen": 95209615, + "step": 4403, + "time_per_iteration": 2.812809944152832 + }, + { + "auxiliary_loss_clip": 0.01073946, + "auxiliary_loss_mlp": 0.01031878, + "balance_loss_clip": 1.03009152, + "balance_loss_mlp": 1.01661921, + "epoch": 0.264782804749737, + "flos": 22492253911680.0, + "grad_norm": 2.5548650212366, + "language_loss": 0.88264525, + "learning_rate": 3.4490877780616387e-06, + "loss": 0.90370345, + "num_input_tokens_seen": 95227810, + "step": 4404, + "time_per_iteration": 2.720912218093872 + }, + { + "auxiliary_loss_clip": 0.01067302, + "auxiliary_loss_mlp": 0.01038012, + "balance_loss_clip": 1.02886748, + "balance_loss_mlp": 1.0232302, + "epoch": 0.26484292800240494, + "flos": 16800269212800.0, + "grad_norm": 1.7484346154350727, + "language_loss": 0.75919807, + "learning_rate": 3.448819322433709e-06, + "loss": 0.78025126, + "num_input_tokens_seen": 95245890, + "step": 4405, + "time_per_iteration": 2.8454227447509766 + }, + { + "auxiliary_loss_clip": 0.01093725, + "auxiliary_loss_mlp": 0.01035618, + "balance_loss_clip": 1.03338528, + "balance_loss_mlp": 1.02005506, + "epoch": 0.2649030512550729, + "flos": 20449583280000.0, + "grad_norm": 4.536577161925495, + "language_loss": 0.70290774, + "learning_rate": 3.4485508118664066e-06, + "loss": 0.72420114, + "num_input_tokens_seen": 95264955, + "step": 4406, + "time_per_iteration": 2.6240768432617188 + }, + { + "auxiliary_loss_clip": 0.01079023, + "auxiliary_loss_mlp": 0.01043004, + "balance_loss_clip": 1.04057968, + "balance_loss_mlp": 1.02860928, + "epoch": 0.2649631745077409, + "flos": 22416123035520.0, + "grad_norm": 1.5782209993840837, + "language_loss": 0.83281952, + "learning_rate": 3.448282246369912e-06, + "loss": 0.85403979, + "num_input_tokens_seen": 95284245, + "step": 4407, + "time_per_iteration": 2.719435930252075 + }, + { + "auxiliary_loss_clip": 0.01053697, + "auxiliary_loss_mlp": 0.01031777, + "balance_loss_clip": 1.02932787, + "balance_loss_mlp": 1.01697063, + "epoch": 0.26502329776040884, + "flos": 35116110927360.0, + "grad_norm": 2.089362006353192, + "language_loss": 0.75940365, + "learning_rate": 3.4480136259544084e-06, + "loss": 0.78025842, + "num_input_tokens_seen": 95307125, + "step": 4408, + "time_per_iteration": 2.7904632091522217 + }, + { + "auxiliary_loss_clip": 0.01037747, + "auxiliary_loss_mlp": 0.01037266, + "balance_loss_clip": 1.02669656, + "balance_loss_mlp": 1.02241302, + "epoch": 0.2650834210130768, + "flos": 38687498438400.0, + "grad_norm": 1.6469477091661107, + "language_loss": 0.71031588, + "learning_rate": 3.447744950630084e-06, + "loss": 0.73106599, + "num_input_tokens_seen": 95329150, + "step": 4409, + "time_per_iteration": 2.776568651199341 + }, + { + "auxiliary_loss_clip": 0.0108131, + "auxiliary_loss_mlp": 0.01036532, + "balance_loss_clip": 1.03108335, + "balance_loss_mlp": 1.02051044, + "epoch": 0.26514354426574477, + "flos": 24716847951360.0, + "grad_norm": 1.606905677566858, + "language_loss": 0.73400831, + "learning_rate": 3.4474762204071253e-06, + "loss": 0.75518668, + "num_input_tokens_seen": 95349880, + "step": 4410, + "time_per_iteration": 2.5905115604400635 + }, + { + "auxiliary_loss_clip": 0.01089584, + "auxiliary_loss_mlp": 0.01045355, + "balance_loss_clip": 1.03497434, + "balance_loss_mlp": 1.03085923, + "epoch": 0.26520366751841273, + "flos": 20340055733760.0, + "grad_norm": 1.8435995184984253, + "language_loss": 0.73552525, + "learning_rate": 3.4472074352957244e-06, + "loss": 0.75687462, + "num_input_tokens_seen": 95368570, + "step": 4411, + "time_per_iteration": 2.6587026119232178 + }, + { + "auxiliary_loss_clip": 0.01054528, + "auxiliary_loss_mlp": 0.01039554, + "balance_loss_clip": 1.03586912, + "balance_loss_mlp": 1.02434897, + "epoch": 0.2652637907710807, + "flos": 22343870828160.0, + "grad_norm": 2.513662972748656, + "language_loss": 0.82253087, + "learning_rate": 3.446938595306071e-06, + "loss": 0.84347171, + "num_input_tokens_seen": 95387065, + "step": 4412, + "time_per_iteration": 2.8788695335388184 + }, + { + "auxiliary_loss_clip": 0.01080261, + "auxiliary_loss_mlp": 0.01046051, + "balance_loss_clip": 1.03101945, + "balance_loss_mlp": 1.03157902, + "epoch": 0.26532391402374866, + "flos": 19354235990400.0, + "grad_norm": 1.7104491357738638, + "language_loss": 0.74399149, + "learning_rate": 3.4466697004483622e-06, + "loss": 0.76525462, + "num_input_tokens_seen": 95406345, + "step": 4413, + "time_per_iteration": 2.7756118774414062 + }, + { + "auxiliary_loss_clip": 0.01011744, + "auxiliary_loss_mlp": 0.01030534, + "balance_loss_clip": 1.00660396, + "balance_loss_mlp": 1.02849531, + "epoch": 0.26538403727641663, + "flos": 44787611422080.0, + "grad_norm": 0.8807855531743255, + "language_loss": 0.56871068, + "learning_rate": 3.446400750732793e-06, + "loss": 0.58913344, + "num_input_tokens_seen": 95463595, + "step": 4414, + "time_per_iteration": 3.152437925338745 + }, + { + "auxiliary_loss_clip": 0.01059148, + "auxiliary_loss_mlp": 0.01043976, + "balance_loss_clip": 1.02955592, + "balance_loss_mlp": 1.03011155, + "epoch": 0.26544416052908465, + "flos": 28182119708160.0, + "grad_norm": 1.622871142990004, + "language_loss": 0.74466896, + "learning_rate": 3.4461317461695625e-06, + "loss": 0.76570022, + "num_input_tokens_seen": 95484115, + "step": 4415, + "time_per_iteration": 2.708122730255127 + }, + { + "auxiliary_loss_clip": 0.01037333, + "auxiliary_loss_mlp": 0.01037164, + "balance_loss_clip": 1.02494287, + "balance_loss_mlp": 1.02008152, + "epoch": 0.2655042837817526, + "flos": 17565274097280.0, + "grad_norm": 2.0541397979138902, + "language_loss": 0.86955851, + "learning_rate": 3.4458626867688707e-06, + "loss": 0.89030349, + "num_input_tokens_seen": 95501435, + "step": 4416, + "time_per_iteration": 2.842360496520996 + }, + { + "auxiliary_loss_clip": 0.0108647, + "auxiliary_loss_mlp": 0.01039279, + "balance_loss_clip": 1.03373194, + "balance_loss_mlp": 1.02293491, + "epoch": 0.2655644070344206, + "flos": 23404636298880.0, + "grad_norm": 1.6366517805977177, + "language_loss": 0.76187497, + "learning_rate": 3.4455935725409217e-06, + "loss": 0.78313249, + "num_input_tokens_seen": 95520135, + "step": 4417, + "time_per_iteration": 2.8811850547790527 + }, + { + "auxiliary_loss_clip": 0.01068871, + "auxiliary_loss_mlp": 0.01035082, + "balance_loss_clip": 1.03206646, + "balance_loss_mlp": 1.01854753, + "epoch": 0.26562453028708854, + "flos": 26468462678400.0, + "grad_norm": 1.9014212367200205, + "language_loss": 0.79991692, + "learning_rate": 3.4453244034959196e-06, + "loss": 0.82095647, + "num_input_tokens_seen": 95541705, + "step": 4418, + "time_per_iteration": 2.7942252159118652 + }, + { + "auxiliary_loss_clip": 0.01081139, + "auxiliary_loss_mlp": 0.01043261, + "balance_loss_clip": 1.03116703, + "balance_loss_mlp": 1.0277524, + "epoch": 0.2656846535397565, + "flos": 19207576759680.0, + "grad_norm": 2.111981573754184, + "language_loss": 0.66979277, + "learning_rate": 3.445055179644071e-06, + "loss": 0.69103682, + "num_input_tokens_seen": 95560300, + "step": 4419, + "time_per_iteration": 2.7454416751861572 + }, + { + "auxiliary_loss_clip": 0.01094438, + "auxiliary_loss_mlp": 0.01038635, + "balance_loss_clip": 1.03308296, + "balance_loss_mlp": 1.02235067, + "epoch": 0.2657447767924245, + "flos": 30551325903360.0, + "grad_norm": 1.8773900084049757, + "language_loss": 0.79132771, + "learning_rate": 3.444785900995585e-06, + "loss": 0.81265843, + "num_input_tokens_seen": 95580150, + "step": 4420, + "time_per_iteration": 2.747056484222412 + }, + { + "auxiliary_loss_clip": 0.01076517, + "auxiliary_loss_mlp": 0.01050244, + "balance_loss_clip": 1.03415084, + "balance_loss_mlp": 1.03316164, + "epoch": 0.26580490004509244, + "flos": 20922742160640.0, + "grad_norm": 2.0637406282353106, + "language_loss": 0.82069892, + "learning_rate": 3.444516567560673e-06, + "loss": 0.84196651, + "num_input_tokens_seen": 95597570, + "step": 4421, + "time_per_iteration": 2.758270263671875 + }, + { + "auxiliary_loss_clip": 0.01081642, + "auxiliary_loss_mlp": 0.01045371, + "balance_loss_clip": 1.03455687, + "balance_loss_mlp": 1.03032649, + "epoch": 0.2658650232977604, + "flos": 43945682584320.0, + "grad_norm": 2.028610874714385, + "language_loss": 0.66308689, + "learning_rate": 3.444247179349548e-06, + "loss": 0.68435705, + "num_input_tokens_seen": 95619415, + "step": 4422, + "time_per_iteration": 2.900254011154175 + }, + { + "auxiliary_loss_clip": 0.01083157, + "auxiliary_loss_mlp": 0.0104443, + "balance_loss_clip": 1.03301752, + "balance_loss_mlp": 1.02997637, + "epoch": 0.26592514655042837, + "flos": 29716439109120.0, + "grad_norm": 2.4248885742367525, + "language_loss": 0.74156475, + "learning_rate": 3.4439777363724252e-06, + "loss": 0.76284057, + "num_input_tokens_seen": 95639155, + "step": 4423, + "time_per_iteration": 2.8196702003479004 + }, + { + "auxiliary_loss_clip": 0.01074248, + "auxiliary_loss_mlp": 0.01048612, + "balance_loss_clip": 1.03030753, + "balance_loss_mlp": 1.03186321, + "epoch": 0.26598526980309634, + "flos": 46677730014720.0, + "grad_norm": 1.7943716113222659, + "language_loss": 0.77848363, + "learning_rate": 3.443708238639522e-06, + "loss": 0.79971218, + "num_input_tokens_seen": 95663320, + "step": 4424, + "time_per_iteration": 2.9730098247528076 + }, + { + "auxiliary_loss_clip": 0.01076056, + "auxiliary_loss_mlp": 0.0104277, + "balance_loss_clip": 1.03174973, + "balance_loss_mlp": 1.02715993, + "epoch": 0.2660453930557643, + "flos": 11509442582400.0, + "grad_norm": 1.8770263286647304, + "language_loss": 0.79647434, + "learning_rate": 3.4434386861610573e-06, + "loss": 0.8176626, + "num_input_tokens_seen": 95680260, + "step": 4425, + "time_per_iteration": 2.7971439361572266 + }, + { + "auxiliary_loss_clip": 0.01070525, + "auxiliary_loss_mlp": 0.01045071, + "balance_loss_clip": 1.03292286, + "balance_loss_mlp": 1.03091466, + "epoch": 0.26610551630843227, + "flos": 24791578197120.0, + "grad_norm": 1.624802547458645, + "language_loss": 0.80075294, + "learning_rate": 3.4431690789472532e-06, + "loss": 0.82190889, + "num_input_tokens_seen": 95701140, + "step": 4426, + "time_per_iteration": 4.386860370635986 + }, + { + "auxiliary_loss_clip": 0.0109536, + "auxiliary_loss_mlp": 0.01046255, + "balance_loss_clip": 1.03573847, + "balance_loss_mlp": 1.0306561, + "epoch": 0.26616563956110023, + "flos": 27636385397760.0, + "grad_norm": 1.5941090451095548, + "language_loss": 0.76727521, + "learning_rate": 3.442899417008333e-06, + "loss": 0.7886914, + "num_input_tokens_seen": 95722060, + "step": 4427, + "time_per_iteration": 4.199794769287109 + }, + { + "auxiliary_loss_clip": 0.01061646, + "auxiliary_loss_mlp": 0.01034738, + "balance_loss_clip": 1.03295255, + "balance_loss_mlp": 1.02054071, + "epoch": 0.26622576281376825, + "flos": 28362893880960.0, + "grad_norm": 1.6621540345013643, + "language_loss": 0.76670212, + "learning_rate": 3.4426297003545227e-06, + "loss": 0.78766596, + "num_input_tokens_seen": 95742495, + "step": 4428, + "time_per_iteration": 2.7201483249664307 + }, + { + "auxiliary_loss_clip": 0.01061509, + "auxiliary_loss_mlp": 0.00749179, + "balance_loss_clip": 1.03058207, + "balance_loss_mlp": 1.00219464, + "epoch": 0.2662858860664362, + "flos": 18041341979520.0, + "grad_norm": 2.0081697818393303, + "language_loss": 0.82903033, + "learning_rate": 3.4423599289960495e-06, + "loss": 0.84713721, + "num_input_tokens_seen": 95761510, + "step": 4429, + "time_per_iteration": 2.6972203254699707 + }, + { + "auxiliary_loss_clip": 0.01058682, + "auxiliary_loss_mlp": 0.01040139, + "balance_loss_clip": 1.03097034, + "balance_loss_mlp": 1.0252974, + "epoch": 0.2663460093191042, + "flos": 22745818995840.0, + "grad_norm": 1.6451938178917769, + "language_loss": 0.72052717, + "learning_rate": 3.442090102943143e-06, + "loss": 0.7415154, + "num_input_tokens_seen": 95782385, + "step": 4430, + "time_per_iteration": 2.853440523147583 + }, + { + "auxiliary_loss_clip": 0.01094846, + "auxiliary_loss_mlp": 0.01047446, + "balance_loss_clip": 1.03398418, + "balance_loss_mlp": 1.03094745, + "epoch": 0.26640613257177215, + "flos": 16508782344960.0, + "grad_norm": 1.859563274087441, + "language_loss": 0.8175391, + "learning_rate": 3.441820222206035e-06, + "loss": 0.83896202, + "num_input_tokens_seen": 95800595, + "step": 4431, + "time_per_iteration": 2.5838582515716553 + }, + { + "auxiliary_loss_clip": 0.01087809, + "auxiliary_loss_mlp": 0.01045692, + "balance_loss_clip": 1.03380609, + "balance_loss_mlp": 1.0296104, + "epoch": 0.2664662558244401, + "flos": 23075945919360.0, + "grad_norm": 2.1954548592256384, + "language_loss": 0.7612704, + "learning_rate": 3.44155028679496e-06, + "loss": 0.78260547, + "num_input_tokens_seen": 95818480, + "step": 4432, + "time_per_iteration": 2.594667673110962 + }, + { + "auxiliary_loss_clip": 0.01032169, + "auxiliary_loss_mlp": 0.01045065, + "balance_loss_clip": 1.02649736, + "balance_loss_mlp": 1.02760053, + "epoch": 0.2665263790771081, + "flos": 23769273214080.0, + "grad_norm": 2.0363400238194647, + "language_loss": 0.83352029, + "learning_rate": 3.441280296720154e-06, + "loss": 0.85429263, + "num_input_tokens_seen": 95837205, + "step": 4433, + "time_per_iteration": 2.707550048828125 + }, + { + "auxiliary_loss_clip": 0.01085659, + "auxiliary_loss_mlp": 0.01041513, + "balance_loss_clip": 1.03349495, + "balance_loss_mlp": 1.02524149, + "epoch": 0.26658650232977604, + "flos": 28001273708160.0, + "grad_norm": 2.340146385586256, + "language_loss": 0.76765776, + "learning_rate": 3.441010251991854e-06, + "loss": 0.78892946, + "num_input_tokens_seen": 95858395, + "step": 4434, + "time_per_iteration": 2.702364444732666 + }, + { + "auxiliary_loss_clip": 0.01091196, + "auxiliary_loss_mlp": 0.01038052, + "balance_loss_clip": 1.03195894, + "balance_loss_mlp": 1.02329385, + "epoch": 0.266646625582444, + "flos": 22163635359360.0, + "grad_norm": 1.874239620877546, + "language_loss": 0.82294464, + "learning_rate": 3.440740152620301e-06, + "loss": 0.84423721, + "num_input_tokens_seen": 95877875, + "step": 4435, + "time_per_iteration": 5.800493240356445 + }, + { + "auxiliary_loss_clip": 0.01048527, + "auxiliary_loss_mlp": 0.01050576, + "balance_loss_clip": 1.02874112, + "balance_loss_mlp": 1.03385067, + "epoch": 0.266706748835112, + "flos": 27853537069440.0, + "grad_norm": 2.236345330023416, + "language_loss": 0.87323642, + "learning_rate": 3.4404699986157376e-06, + "loss": 0.89422745, + "num_input_tokens_seen": 95895820, + "step": 4436, + "time_per_iteration": 2.917905569076538 + }, + { + "auxiliary_loss_clip": 0.01072098, + "auxiliary_loss_mlp": 0.01044383, + "balance_loss_clip": 1.03074455, + "balance_loss_mlp": 1.02868295, + "epoch": 0.26676687208777994, + "flos": 25812123413760.0, + "grad_norm": 1.554238880808541, + "language_loss": 0.79081011, + "learning_rate": 3.440199789988407e-06, + "loss": 0.81197488, + "num_input_tokens_seen": 95918025, + "step": 4437, + "time_per_iteration": 2.8434154987335205 + }, + { + "auxiliary_loss_clip": 0.01035183, + "auxiliary_loss_mlp": 0.01037658, + "balance_loss_clip": 1.03009403, + "balance_loss_mlp": 1.02245843, + "epoch": 0.2668269953404479, + "flos": 36064583504640.0, + "grad_norm": 4.252500002942317, + "language_loss": 0.64268637, + "learning_rate": 3.439929526748556e-06, + "loss": 0.66341478, + "num_input_tokens_seen": 95937725, + "step": 4438, + "time_per_iteration": 3.0575132369995117 + }, + { + "auxiliary_loss_clip": 0.01022971, + "auxiliary_loss_mlp": 0.01039334, + "balance_loss_clip": 1.02575421, + "balance_loss_mlp": 1.02403903, + "epoch": 0.26688711859311587, + "flos": 26570987072640.0, + "grad_norm": 1.8561511391891414, + "language_loss": 0.76036513, + "learning_rate": 3.4396592089064334e-06, + "loss": 0.7809881, + "num_input_tokens_seen": 95956335, + "step": 4439, + "time_per_iteration": 2.945699453353882 + }, + { + "auxiliary_loss_clip": 0.01041147, + "auxiliary_loss_mlp": 0.01035377, + "balance_loss_clip": 1.03019261, + "balance_loss_mlp": 1.01886702, + "epoch": 0.26694724184578383, + "flos": 26761565658240.0, + "grad_norm": 1.7364951932522794, + "language_loss": 0.71484858, + "learning_rate": 3.4393888364722897e-06, + "loss": 0.73561382, + "num_input_tokens_seen": 95977135, + "step": 4440, + "time_per_iteration": 2.8754987716674805 + }, + { + "auxiliary_loss_clip": 0.01067794, + "auxiliary_loss_mlp": 0.01041262, + "balance_loss_clip": 1.02965033, + "balance_loss_mlp": 1.02450705, + "epoch": 0.2670073650984518, + "flos": 20959586536320.0, + "grad_norm": 2.3814159300738265, + "language_loss": 0.66707861, + "learning_rate": 3.439118409456376e-06, + "loss": 0.68816918, + "num_input_tokens_seen": 95995435, + "step": 4441, + "time_per_iteration": 2.687188148498535 + }, + { + "auxiliary_loss_clip": 0.01081823, + "auxiliary_loss_mlp": 0.01040476, + "balance_loss_clip": 1.03176165, + "balance_loss_mlp": 1.02452564, + "epoch": 0.2670674883511198, + "flos": 28366054277760.0, + "grad_norm": 1.530085041901804, + "language_loss": 0.76025259, + "learning_rate": 3.4388479278689486e-06, + "loss": 0.78147554, + "num_input_tokens_seen": 96016340, + "step": 4442, + "time_per_iteration": 2.717083692550659 + }, + { + "auxiliary_loss_clip": 0.00975749, + "auxiliary_loss_mlp": 0.01021107, + "balance_loss_clip": 1.01071036, + "balance_loss_mlp": 1.0187943, + "epoch": 0.2671276116037878, + "flos": 58971319430400.0, + "grad_norm": 0.9339090453017306, + "language_loss": 0.61223423, + "learning_rate": 3.4385773917202637e-06, + "loss": 0.63220274, + "num_input_tokens_seen": 96071205, + "step": 4443, + "time_per_iteration": 3.1980340480804443 + }, + { + "auxiliary_loss_clip": 0.0106876, + "auxiliary_loss_mlp": 0.01038801, + "balance_loss_clip": 1.03454733, + "balance_loss_mlp": 1.02341151, + "epoch": 0.26718773485645575, + "flos": 43945072053120.0, + "grad_norm": 2.574923273619211, + "language_loss": 0.76000524, + "learning_rate": 3.4383068010205793e-06, + "loss": 0.7810809, + "num_input_tokens_seen": 96094240, + "step": 4444, + "time_per_iteration": 2.972691535949707 + }, + { + "auxiliary_loss_clip": 0.01084261, + "auxiliary_loss_mlp": 0.01039786, + "balance_loss_clip": 1.03178501, + "balance_loss_mlp": 1.02235806, + "epoch": 0.2672478581091237, + "flos": 25228323665280.0, + "grad_norm": 3.9804566543857187, + "language_loss": 0.802845, + "learning_rate": 3.438036155780158e-06, + "loss": 0.82408547, + "num_input_tokens_seen": 96114105, + "step": 4445, + "time_per_iteration": 2.6328327655792236 + }, + { + "auxiliary_loss_clip": 0.01076127, + "auxiliary_loss_mlp": 0.01041993, + "balance_loss_clip": 1.03477609, + "balance_loss_mlp": 1.02474296, + "epoch": 0.2673079813617917, + "flos": 15268176455040.0, + "grad_norm": 1.9560558701531836, + "language_loss": 0.8922478, + "learning_rate": 3.43776545600926e-06, + "loss": 0.91342902, + "num_input_tokens_seen": 96132140, + "step": 4446, + "time_per_iteration": 2.6442320346832275 + }, + { + "auxiliary_loss_clip": 0.01088016, + "auxiliary_loss_mlp": 0.01042785, + "balance_loss_clip": 1.0351845, + "balance_loss_mlp": 1.02778816, + "epoch": 0.26736810461445965, + "flos": 25812733944960.0, + "grad_norm": 2.6830019197475417, + "language_loss": 0.68117225, + "learning_rate": 3.437494701718153e-06, + "loss": 0.70248032, + "num_input_tokens_seen": 96152090, + "step": 4447, + "time_per_iteration": 2.7947158813476562 + }, + { + "auxiliary_loss_clip": 0.01085438, + "auxiliary_loss_mlp": 0.01039388, + "balance_loss_clip": 1.03214407, + "balance_loss_mlp": 1.02321148, + "epoch": 0.2674282278671276, + "flos": 24312709054080.0, + "grad_norm": 2.1761351944333187, + "language_loss": 0.83467603, + "learning_rate": 3.4372238929171026e-06, + "loss": 0.85592425, + "num_input_tokens_seen": 96170015, + "step": 4448, + "time_per_iteration": 2.7527992725372314 + }, + { + "auxiliary_loss_clip": 0.01063732, + "auxiliary_loss_mlp": 0.01042963, + "balance_loss_clip": 1.03284812, + "balance_loss_mlp": 1.02663124, + "epoch": 0.2674883511197956, + "flos": 22815521337600.0, + "grad_norm": 1.4923672417454197, + "language_loss": 0.84182799, + "learning_rate": 3.436953029616378e-06, + "loss": 0.86289489, + "num_input_tokens_seen": 96188065, + "step": 4449, + "time_per_iteration": 2.7753005027770996 + }, + { + "auxiliary_loss_clip": 0.01081499, + "auxiliary_loss_mlp": 0.0104661, + "balance_loss_clip": 1.03405917, + "balance_loss_mlp": 1.02835929, + "epoch": 0.26754847437246354, + "flos": 25370170473600.0, + "grad_norm": 2.160259763428544, + "language_loss": 0.8376379, + "learning_rate": 3.4366821118262506e-06, + "loss": 0.85891902, + "num_input_tokens_seen": 96205780, + "step": 4450, + "time_per_iteration": 2.7382431030273438 + }, + { + "auxiliary_loss_clip": 0.01052754, + "auxiliary_loss_mlp": 0.01039419, + "balance_loss_clip": 1.02772737, + "balance_loss_mlp": 1.02523303, + "epoch": 0.2676085976251315, + "flos": 20230420446720.0, + "grad_norm": 1.8955413813356377, + "language_loss": 0.81095791, + "learning_rate": 3.4364111395569937e-06, + "loss": 0.83187962, + "num_input_tokens_seen": 96224990, + "step": 4451, + "time_per_iteration": 2.795240879058838 + }, + { + "auxiliary_loss_clip": 0.01084701, + "auxiliary_loss_mlp": 0.01041248, + "balance_loss_clip": 1.03439629, + "balance_loss_mlp": 1.02643037, + "epoch": 0.26766872087779947, + "flos": 28038225824640.0, + "grad_norm": 1.7483119662773448, + "language_loss": 0.86277318, + "learning_rate": 3.436140112818882e-06, + "loss": 0.88403273, + "num_input_tokens_seen": 96245345, + "step": 4452, + "time_per_iteration": 2.716590404510498 + }, + { + "auxiliary_loss_clip": 0.01075634, + "auxiliary_loss_mlp": 0.01041275, + "balance_loss_clip": 1.03374577, + "balance_loss_mlp": 1.02552176, + "epoch": 0.26772884413046744, + "flos": 18325179250560.0, + "grad_norm": 2.0618968060094116, + "language_loss": 0.83320999, + "learning_rate": 3.435869031622194e-06, + "loss": 0.85437912, + "num_input_tokens_seen": 96259000, + "step": 4453, + "time_per_iteration": 2.6022584438323975 + }, + { + "auxiliary_loss_clip": 0.01084489, + "auxiliary_loss_mlp": 0.01049089, + "balance_loss_clip": 1.03472948, + "balance_loss_mlp": 1.03296578, + "epoch": 0.2677889673831354, + "flos": 22127509255680.0, + "grad_norm": 1.6855075591667419, + "language_loss": 0.79436135, + "learning_rate": 3.435597895977208e-06, + "loss": 0.81569707, + "num_input_tokens_seen": 96277000, + "step": 4454, + "time_per_iteration": 2.5634775161743164 + }, + { + "auxiliary_loss_clip": 0.01074942, + "auxiliary_loss_mlp": 0.01040383, + "balance_loss_clip": 1.03216624, + "balance_loss_mlp": 1.0247314, + "epoch": 0.2678490906358034, + "flos": 23729699404800.0, + "grad_norm": 1.561667903840592, + "language_loss": 0.72724789, + "learning_rate": 3.435326705894206e-06, + "loss": 0.74840117, + "num_input_tokens_seen": 96297010, + "step": 4455, + "time_per_iteration": 2.672898530960083 + }, + { + "auxiliary_loss_clip": 0.01056274, + "auxiliary_loss_mlp": 0.01036728, + "balance_loss_clip": 1.0298779, + "balance_loss_mlp": 1.02179098, + "epoch": 0.2679092138884714, + "flos": 21762872340480.0, + "grad_norm": 1.8168948664605116, + "language_loss": 0.73750526, + "learning_rate": 3.435055461383471e-06, + "loss": 0.75843525, + "num_input_tokens_seen": 96315780, + "step": 4456, + "time_per_iteration": 2.7712862491607666 + }, + { + "auxiliary_loss_clip": 0.01087114, + "auxiliary_loss_mlp": 0.01040504, + "balance_loss_clip": 1.03411567, + "balance_loss_mlp": 1.02435112, + "epoch": 0.26796933714113935, + "flos": 19861186590720.0, + "grad_norm": 2.1520014520949466, + "language_loss": 0.70734996, + "learning_rate": 3.4347841624552896e-06, + "loss": 0.72862613, + "num_input_tokens_seen": 96333465, + "step": 4457, + "time_per_iteration": 2.6768620014190674 + }, + { + "auxiliary_loss_clip": 0.01050247, + "auxiliary_loss_mlp": 0.01048345, + "balance_loss_clip": 1.02946317, + "balance_loss_mlp": 1.03176296, + "epoch": 0.2680294603938073, + "flos": 20047886507520.0, + "grad_norm": 1.91689591712708, + "language_loss": 0.78971279, + "learning_rate": 3.4345128091199493e-06, + "loss": 0.81069869, + "num_input_tokens_seen": 96352005, + "step": 4458, + "time_per_iteration": 2.7108023166656494 + }, + { + "auxiliary_loss_clip": 0.00994115, + "auxiliary_loss_mlp": 0.01019108, + "balance_loss_clip": 1.00969958, + "balance_loss_mlp": 1.01699853, + "epoch": 0.2680895836464753, + "flos": 72113763052800.0, + "grad_norm": 0.868219009456647, + "language_loss": 0.58691967, + "learning_rate": 3.434241401387739e-06, + "loss": 0.60705185, + "num_input_tokens_seen": 96406265, + "step": 4459, + "time_per_iteration": 3.440161943435669 + }, + { + "auxiliary_loss_clip": 0.01032401, + "auxiliary_loss_mlp": 0.01042345, + "balance_loss_clip": 1.02624226, + "balance_loss_mlp": 1.02681243, + "epoch": 0.26814970689914325, + "flos": 20449044576000.0, + "grad_norm": 2.342199755096495, + "language_loss": 0.85056961, + "learning_rate": 3.4339699392689507e-06, + "loss": 0.87131709, + "num_input_tokens_seen": 96425225, + "step": 4460, + "time_per_iteration": 2.993558883666992 + }, + { + "auxiliary_loss_clip": 0.0107701, + "auxiliary_loss_mlp": 0.01041612, + "balance_loss_clip": 1.03232253, + "balance_loss_mlp": 1.02530396, + "epoch": 0.2682098301518112, + "flos": 17566674727680.0, + "grad_norm": 2.403435422133673, + "language_loss": 0.68395203, + "learning_rate": 3.4336984227738796e-06, + "loss": 0.70513833, + "num_input_tokens_seen": 96443780, + "step": 4461, + "time_per_iteration": 2.6925575733184814 + }, + { + "auxiliary_loss_clip": 0.01056237, + "auxiliary_loss_mlp": 0.01049016, + "balance_loss_clip": 1.02935767, + "balance_loss_mlp": 1.03306603, + "epoch": 0.2682699534044792, + "flos": 18333259810560.0, + "grad_norm": 1.6282553622325557, + "language_loss": 0.67271316, + "learning_rate": 3.43342685191282e-06, + "loss": 0.6937657, + "num_input_tokens_seen": 96464530, + "step": 4462, + "time_per_iteration": 2.7858238220214844 + }, + { + "auxiliary_loss_clip": 0.01059977, + "auxiliary_loss_mlp": 0.01039427, + "balance_loss_clip": 1.03068113, + "balance_loss_mlp": 1.02330959, + "epoch": 0.26833007665714714, + "flos": 25301294144640.0, + "grad_norm": 1.6637974980812376, + "language_loss": 0.69333351, + "learning_rate": 3.4331552266960705e-06, + "loss": 0.71432757, + "num_input_tokens_seen": 96483345, + "step": 4463, + "time_per_iteration": 2.731358051300049 + }, + { + "auxiliary_loss_clip": 0.01056291, + "auxiliary_loss_mlp": 0.01043148, + "balance_loss_clip": 1.02802014, + "balance_loss_mlp": 1.02563632, + "epoch": 0.2683901999098151, + "flos": 16099759198080.0, + "grad_norm": 4.46450735976322, + "language_loss": 0.78591216, + "learning_rate": 3.432883547133931e-06, + "loss": 0.80690652, + "num_input_tokens_seen": 96498305, + "step": 4464, + "time_per_iteration": 2.6290645599365234 + }, + { + "auxiliary_loss_clip": 0.01079659, + "auxiliary_loss_mlp": 0.01038937, + "balance_loss_clip": 1.03031504, + "balance_loss_mlp": 1.02311754, + "epoch": 0.2684503231624831, + "flos": 27308054154240.0, + "grad_norm": 2.8416035951851932, + "language_loss": 0.71127522, + "learning_rate": 3.432611813236704e-06, + "loss": 0.73246121, + "num_input_tokens_seen": 96519740, + "step": 4465, + "time_per_iteration": 2.6990225315093994 + }, + { + "auxiliary_loss_clip": 0.0100085, + "auxiliary_loss_mlp": 0.0100469, + "balance_loss_clip": 1.00463367, + "balance_loss_mlp": 1.00252044, + "epoch": 0.26851044641515104, + "flos": 71858007239040.0, + "grad_norm": 0.6773649290236448, + "language_loss": 0.53006995, + "learning_rate": 3.4323400250146943e-06, + "loss": 0.55012536, + "num_input_tokens_seen": 96588870, + "step": 4466, + "time_per_iteration": 3.4155731201171875 + }, + { + "auxiliary_loss_clip": 0.01061797, + "auxiliary_loss_mlp": 0.01046658, + "balance_loss_clip": 1.02923298, + "balance_loss_mlp": 1.02940893, + "epoch": 0.268570569667819, + "flos": 18733771434240.0, + "grad_norm": 2.233478115104644, + "language_loss": 0.7381683, + "learning_rate": 3.4320681824782057e-06, + "loss": 0.75925285, + "num_input_tokens_seen": 96605100, + "step": 4467, + "time_per_iteration": 2.5733635425567627 + }, + { + "auxiliary_loss_clip": 0.01065684, + "auxiliary_loss_mlp": 0.00749024, + "balance_loss_clip": 1.02974832, + "balance_loss_mlp": 1.00196958, + "epoch": 0.268630692920487, + "flos": 18178376365440.0, + "grad_norm": 2.0204423941245877, + "language_loss": 0.80654669, + "learning_rate": 3.4317962856375493e-06, + "loss": 0.8246938, + "num_input_tokens_seen": 96621410, + "step": 4468, + "time_per_iteration": 2.7059366703033447 + }, + { + "auxiliary_loss_clip": 0.01017415, + "auxiliary_loss_mlp": 0.01003971, + "balance_loss_clip": 1.00208497, + "balance_loss_mlp": 1.00183761, + "epoch": 0.268690816173155, + "flos": 68731768978560.0, + "grad_norm": 0.8378069439258127, + "language_loss": 0.59522456, + "learning_rate": 3.4315243345030334e-06, + "loss": 0.61543846, + "num_input_tokens_seen": 96684810, + "step": 4469, + "time_per_iteration": 3.174206018447876 + }, + { + "auxiliary_loss_clip": 0.01096624, + "auxiliary_loss_mlp": 0.01041356, + "balance_loss_clip": 1.03462267, + "balance_loss_mlp": 1.02461934, + "epoch": 0.26875093942582295, + "flos": 23293636295040.0, + "grad_norm": 1.9911710791689887, + "language_loss": 0.81440145, + "learning_rate": 3.431252329084972e-06, + "loss": 0.83578122, + "num_input_tokens_seen": 96701920, + "step": 4470, + "time_per_iteration": 2.6002795696258545 + }, + { + "auxiliary_loss_clip": 0.01065256, + "auxiliary_loss_mlp": 0.01032395, + "balance_loss_clip": 1.02889085, + "balance_loss_mlp": 1.01749992, + "epoch": 0.2688110626784909, + "flos": 21543458112000.0, + "grad_norm": 1.8061937133695127, + "language_loss": 0.82659721, + "learning_rate": 3.4309802693936786e-06, + "loss": 0.8475737, + "num_input_tokens_seen": 96721260, + "step": 4471, + "time_per_iteration": 2.663607358932495 + }, + { + "auxiliary_loss_clip": 0.01079219, + "auxiliary_loss_mlp": 0.01035091, + "balance_loss_clip": 1.03160298, + "balance_loss_mlp": 1.02003455, + "epoch": 0.2688711859311589, + "flos": 28400600183040.0, + "grad_norm": 1.8940733086900072, + "language_loss": 0.69535148, + "learning_rate": 3.43070815543947e-06, + "loss": 0.71649462, + "num_input_tokens_seen": 96740385, + "step": 4472, + "time_per_iteration": 2.6636099815368652 + }, + { + "auxiliary_loss_clip": 0.01090703, + "auxiliary_loss_mlp": 0.01039984, + "balance_loss_clip": 1.03145325, + "balance_loss_mlp": 1.02509475, + "epoch": 0.26893130918382685, + "flos": 25994944661760.0, + "grad_norm": 1.691484066854212, + "language_loss": 0.67927694, + "learning_rate": 3.4304359872326656e-06, + "loss": 0.70058382, + "num_input_tokens_seen": 96761860, + "step": 4473, + "time_per_iteration": 4.175025701522827 + }, + { + "auxiliary_loss_clip": 0.01070005, + "auxiliary_loss_mlp": 0.0104075, + "balance_loss_clip": 1.03282738, + "balance_loss_mlp": 1.02566969, + "epoch": 0.2689914324364948, + "flos": 20339624770560.0, + "grad_norm": 1.6293804341973437, + "language_loss": 0.82901704, + "learning_rate": 3.4301637647835843e-06, + "loss": 0.8501246, + "num_input_tokens_seen": 96781890, + "step": 4474, + "time_per_iteration": 4.390273809432983 + }, + { + "auxiliary_loss_clip": 0.01077994, + "auxiliary_loss_mlp": 0.01043492, + "balance_loss_clip": 1.03126216, + "balance_loss_mlp": 1.02828085, + "epoch": 0.2690515556891628, + "flos": 19464553635840.0, + "grad_norm": 2.3838401796295723, + "language_loss": 0.69712877, + "learning_rate": 3.4298914881025494e-06, + "loss": 0.71834362, + "num_input_tokens_seen": 96800390, + "step": 4475, + "time_per_iteration": 2.63476300239563 + }, + { + "auxiliary_loss_clip": 0.01064194, + "auxiliary_loss_mlp": 0.00749051, + "balance_loss_clip": 1.03358948, + "balance_loss_mlp": 1.00192976, + "epoch": 0.26911167894183075, + "flos": 18146631720960.0, + "grad_norm": 1.7441466920443427, + "language_loss": 0.73377323, + "learning_rate": 3.4296191571998863e-06, + "loss": 0.75190568, + "num_input_tokens_seen": 96816685, + "step": 4476, + "time_per_iteration": 2.661247968673706 + }, + { + "auxiliary_loss_clip": 0.01068503, + "auxiliary_loss_mlp": 0.01033767, + "balance_loss_clip": 1.03140771, + "balance_loss_mlp": 1.01923573, + "epoch": 0.2691718021944987, + "flos": 19975131509760.0, + "grad_norm": 1.684517427533149, + "language_loss": 0.80782783, + "learning_rate": 3.429346772085922e-06, + "loss": 0.82885051, + "num_input_tokens_seen": 96836285, + "step": 4477, + "time_per_iteration": 2.6517019271850586 + }, + { + "auxiliary_loss_clip": 0.01050688, + "auxiliary_loss_mlp": 0.01041893, + "balance_loss_clip": 1.03143549, + "balance_loss_mlp": 1.02607381, + "epoch": 0.2692319254471667, + "flos": 37447215770880.0, + "grad_norm": 1.655206185671715, + "language_loss": 0.6475504, + "learning_rate": 3.429074332770984e-06, + "loss": 0.66847622, + "num_input_tokens_seen": 96857745, + "step": 4478, + "time_per_iteration": 2.8549506664276123 + }, + { + "auxiliary_loss_clip": 0.01069296, + "auxiliary_loss_mlp": 0.01041357, + "balance_loss_clip": 1.02734435, + "balance_loss_mlp": 1.0256567, + "epoch": 0.26929204869983464, + "flos": 22127796564480.0, + "grad_norm": 1.8691951450865112, + "language_loss": 0.80359447, + "learning_rate": 3.4288018392654047e-06, + "loss": 0.82470101, + "num_input_tokens_seen": 96877295, + "step": 4479, + "time_per_iteration": 2.6262664794921875 + }, + { + "auxiliary_loss_clip": 0.01068139, + "auxiliary_loss_mlp": 0.0074907, + "balance_loss_clip": 1.0293299, + "balance_loss_mlp": 1.00206482, + "epoch": 0.2693521719525026, + "flos": 19792813052160.0, + "grad_norm": 2.6988668439863073, + "language_loss": 0.80605614, + "learning_rate": 3.4285292915795166e-06, + "loss": 0.82422823, + "num_input_tokens_seen": 96896160, + "step": 4480, + "time_per_iteration": 2.6724650859832764 + }, + { + "auxiliary_loss_clip": 0.01048534, + "auxiliary_loss_mlp": 0.01041588, + "balance_loss_clip": 1.03059065, + "balance_loss_mlp": 1.02581704, + "epoch": 0.2694122952051706, + "flos": 20994383836800.0, + "grad_norm": 1.6284524434342205, + "language_loss": 0.78190529, + "learning_rate": 3.4282566897236543e-06, + "loss": 0.8028065, + "num_input_tokens_seen": 96915410, + "step": 4481, + "time_per_iteration": 2.697078227996826 + }, + { + "auxiliary_loss_clip": 0.01077391, + "auxiliary_loss_mlp": 0.01045689, + "balance_loss_clip": 1.02939093, + "balance_loss_mlp": 1.02935696, + "epoch": 0.2694724184578386, + "flos": 25849291011840.0, + "grad_norm": 1.6315428087317834, + "language_loss": 0.74124551, + "learning_rate": 3.4279840337081547e-06, + "loss": 0.76247633, + "num_input_tokens_seen": 96937865, + "step": 4482, + "time_per_iteration": 4.320103168487549 + }, + { + "auxiliary_loss_clip": 0.01066799, + "auxiliary_loss_mlp": 0.01035313, + "balance_loss_clip": 1.03074551, + "balance_loss_mlp": 1.01948214, + "epoch": 0.26953254171050656, + "flos": 21726961718400.0, + "grad_norm": 1.8528860098208657, + "language_loss": 0.72292072, + "learning_rate": 3.4277113235433584e-06, + "loss": 0.74394178, + "num_input_tokens_seen": 96957710, + "step": 4483, + "time_per_iteration": 4.238270044326782 + }, + { + "auxiliary_loss_clip": 0.01077474, + "auxiliary_loss_mlp": 0.01043046, + "balance_loss_clip": 1.0275799, + "balance_loss_mlp": 1.02626169, + "epoch": 0.2695926649631745, + "flos": 19682926369920.0, + "grad_norm": 5.796655410120223, + "language_loss": 0.8702848, + "learning_rate": 3.427438559239605e-06, + "loss": 0.89149004, + "num_input_tokens_seen": 96975890, + "step": 4484, + "time_per_iteration": 2.6494944095611572 + }, + { + "auxiliary_loss_clip": 0.01081904, + "auxiliary_loss_mlp": 0.01040596, + "balance_loss_clip": 1.03041816, + "balance_loss_mlp": 1.02596903, + "epoch": 0.2696527882158425, + "flos": 32886596724480.0, + "grad_norm": 1.5025074395401996, + "language_loss": 0.66264647, + "learning_rate": 3.427165740807239e-06, + "loss": 0.68387151, + "num_input_tokens_seen": 96998595, + "step": 4485, + "time_per_iteration": 2.71004319190979 + }, + { + "auxiliary_loss_clip": 0.01051336, + "auxiliary_loss_mlp": 0.01042426, + "balance_loss_clip": 1.02563119, + "balance_loss_mlp": 1.02658272, + "epoch": 0.26971291146851045, + "flos": 12124843320960.0, + "grad_norm": 2.6396022385871984, + "language_loss": 0.72789156, + "learning_rate": 3.426892868256604e-06, + "loss": 0.74882913, + "num_input_tokens_seen": 97013715, + "step": 4486, + "time_per_iteration": 2.624695062637329 + }, + { + "auxiliary_loss_clip": 0.01097608, + "auxiliary_loss_mlp": 0.01043437, + "balance_loss_clip": 1.03403032, + "balance_loss_mlp": 1.02791619, + "epoch": 0.2697730347211784, + "flos": 22634459856000.0, + "grad_norm": 1.7879141637615976, + "language_loss": 0.84343815, + "learning_rate": 3.4266199415980495e-06, + "loss": 0.86484849, + "num_input_tokens_seen": 97031570, + "step": 4487, + "time_per_iteration": 2.5276389122009277 + }, + { + "auxiliary_loss_clip": 0.01066438, + "auxiliary_loss_mlp": 0.01043857, + "balance_loss_clip": 1.03296566, + "balance_loss_mlp": 1.0279547, + "epoch": 0.2698331579738464, + "flos": 23513050523520.0, + "grad_norm": 2.1107782176307643, + "language_loss": 0.71639603, + "learning_rate": 3.4263469608419234e-06, + "loss": 0.737499, + "num_input_tokens_seen": 97049815, + "step": 4488, + "time_per_iteration": 2.7347278594970703 + }, + { + "auxiliary_loss_clip": 0.01020453, + "auxiliary_loss_mlp": 0.01052704, + "balance_loss_clip": 1.0279305, + "balance_loss_mlp": 1.03586006, + "epoch": 0.26989328122651435, + "flos": 24641040297600.0, + "grad_norm": 1.9037989487077103, + "language_loss": 0.84079909, + "learning_rate": 3.426073925998578e-06, + "loss": 0.86153066, + "num_input_tokens_seen": 97067570, + "step": 4489, + "time_per_iteration": 2.8473072052001953 + }, + { + "auxiliary_loss_clip": 0.01079111, + "auxiliary_loss_mlp": 0.01053675, + "balance_loss_clip": 1.03737926, + "balance_loss_mlp": 1.03686666, + "epoch": 0.2699534044791823, + "flos": 10772555068800.0, + "grad_norm": 3.080248141689975, + "language_loss": 0.90011334, + "learning_rate": 3.4258008370783656e-06, + "loss": 0.9214412, + "num_input_tokens_seen": 97082180, + "step": 4490, + "time_per_iteration": 2.971605062484741 + }, + { + "auxiliary_loss_clip": 0.01021595, + "auxiliary_loss_mlp": 0.01042459, + "balance_loss_clip": 1.02491128, + "balance_loss_mlp": 1.02616334, + "epoch": 0.2700135277318503, + "flos": 36171597098880.0, + "grad_norm": 1.8771473046375018, + "language_loss": 0.73130572, + "learning_rate": 3.4255276940916434e-06, + "loss": 0.75194621, + "num_input_tokens_seen": 97103470, + "step": 4491, + "time_per_iteration": 2.862990617752075 + }, + { + "auxiliary_loss_clip": 0.01097055, + "auxiliary_loss_mlp": 0.01038845, + "balance_loss_clip": 1.03538656, + "balance_loss_mlp": 1.0235858, + "epoch": 0.27007365098451824, + "flos": 17418614866560.0, + "grad_norm": 3.38634410811887, + "language_loss": 0.7424916, + "learning_rate": 3.4252544970487676e-06, + "loss": 0.76385057, + "num_input_tokens_seen": 97118100, + "step": 4492, + "time_per_iteration": 2.6694881916046143 + }, + { + "auxiliary_loss_clip": 0.01073654, + "auxiliary_loss_mlp": 0.01037593, + "balance_loss_clip": 1.03333199, + "balance_loss_mlp": 1.02232265, + "epoch": 0.2701337742371862, + "flos": 23185688947200.0, + "grad_norm": 1.9801184009761015, + "language_loss": 0.89273584, + "learning_rate": 3.4249812459600986e-06, + "loss": 0.91384828, + "num_input_tokens_seen": 97136765, + "step": 4493, + "time_per_iteration": 2.6142430305480957 + }, + { + "auxiliary_loss_clip": 0.01080625, + "auxiliary_loss_mlp": 0.01038166, + "balance_loss_clip": 1.03100932, + "balance_loss_mlp": 1.02418876, + "epoch": 0.2701938974898542, + "flos": 24389450461440.0, + "grad_norm": 1.4504978552614065, + "language_loss": 0.71097779, + "learning_rate": 3.424707940835998e-06, + "loss": 0.73216569, + "num_input_tokens_seen": 97157470, + "step": 4494, + "time_per_iteration": 2.586815118789673 + }, + { + "auxiliary_loss_clip": 0.01068146, + "auxiliary_loss_mlp": 0.01035239, + "balance_loss_clip": 1.03125787, + "balance_loss_mlp": 1.0209285, + "epoch": 0.2702540207425222, + "flos": 26214322976640.0, + "grad_norm": 3.8697763996086962, + "language_loss": 0.8637318, + "learning_rate": 3.42443458168683e-06, + "loss": 0.88476562, + "num_input_tokens_seen": 97176905, + "step": 4495, + "time_per_iteration": 2.6341159343719482 + }, + { + "auxiliary_loss_clip": 0.01093234, + "auxiliary_loss_mlp": 0.01039777, + "balance_loss_clip": 1.0330832, + "balance_loss_mlp": 1.0251379, + "epoch": 0.27031414399519016, + "flos": 22926377687040.0, + "grad_norm": 1.9801571261884876, + "language_loss": 0.76657349, + "learning_rate": 3.424161168522959e-06, + "loss": 0.78790355, + "num_input_tokens_seen": 97196380, + "step": 4496, + "time_per_iteration": 2.625271797180176 + }, + { + "auxiliary_loss_clip": 0.0101832, + "auxiliary_loss_mlp": 0.01003688, + "balance_loss_clip": 1.00283432, + "balance_loss_mlp": 1.00182807, + "epoch": 0.2703742672478581, + "flos": 63019780404480.0, + "grad_norm": 0.7269262165055961, + "language_loss": 0.50150096, + "learning_rate": 3.423887701354754e-06, + "loss": 0.52172101, + "num_input_tokens_seen": 97260100, + "step": 4497, + "time_per_iteration": 3.2787187099456787 + }, + { + "auxiliary_loss_clip": 0.01049126, + "auxiliary_loss_mlp": 0.01042042, + "balance_loss_clip": 1.03193569, + "balance_loss_mlp": 1.02748084, + "epoch": 0.2704343905005261, + "flos": 18840820942080.0, + "grad_norm": 1.9286373382202004, + "language_loss": 0.72318727, + "learning_rate": 3.4236141801925847e-06, + "loss": 0.74409896, + "num_input_tokens_seen": 97277935, + "step": 4498, + "time_per_iteration": 2.8257126808166504 + }, + { + "auxiliary_loss_clip": 0.00993965, + "auxiliary_loss_mlp": 0.01003355, + "balance_loss_clip": 1.00700188, + "balance_loss_mlp": 1.00136375, + "epoch": 0.27049451375319405, + "flos": 71233412618880.0, + "grad_norm": 0.7544740523976349, + "language_loss": 0.59243631, + "learning_rate": 3.4233406050468237e-06, + "loss": 0.61240959, + "num_input_tokens_seen": 97338845, + "step": 4499, + "time_per_iteration": 3.2715556621551514 + }, + { + "auxiliary_loss_clip": 0.01068096, + "auxiliary_loss_mlp": 0.01037937, + "balance_loss_clip": 1.03025293, + "balance_loss_mlp": 1.02271366, + "epoch": 0.270554637005862, + "flos": 24278594112000.0, + "grad_norm": 1.7957462732460598, + "language_loss": 0.73678583, + "learning_rate": 3.4230669759278438e-06, + "loss": 0.75784618, + "num_input_tokens_seen": 97356640, + "step": 4500, + "time_per_iteration": 2.6419551372528076 + }, + { + "auxiliary_loss_clip": 0.01053717, + "auxiliary_loss_mlp": 0.01046391, + "balance_loss_clip": 1.02412653, + "balance_loss_mlp": 1.0305959, + "epoch": 0.27061476025853, + "flos": 17632318832640.0, + "grad_norm": 3.8445060744904045, + "language_loss": 0.80884928, + "learning_rate": 3.4227932928460215e-06, + "loss": 0.82985038, + "num_input_tokens_seen": 97372585, + "step": 4501, + "time_per_iteration": 2.736591100692749 + }, + { + "auxiliary_loss_clip": 0.01052874, + "auxiliary_loss_mlp": 0.01045052, + "balance_loss_clip": 1.02934182, + "balance_loss_mlp": 1.02843404, + "epoch": 0.27067488351119795, + "flos": 22710123855360.0, + "grad_norm": 1.767281938887895, + "language_loss": 0.72572899, + "learning_rate": 3.422519555811735e-06, + "loss": 0.74670821, + "num_input_tokens_seen": 97393315, + "step": 4502, + "time_per_iteration": 2.768450975418091 + }, + { + "auxiliary_loss_clip": 0.01076485, + "auxiliary_loss_mlp": 0.01037516, + "balance_loss_clip": 1.03152084, + "balance_loss_mlp": 1.02071929, + "epoch": 0.2707350067638659, + "flos": 41719616087040.0, + "grad_norm": 1.9181497657846038, + "language_loss": 0.68672323, + "learning_rate": 3.4222457648353642e-06, + "loss": 0.70786327, + "num_input_tokens_seen": 97417860, + "step": 4503, + "time_per_iteration": 2.846681833267212 + }, + { + "auxiliary_loss_clip": 0.01044678, + "auxiliary_loss_mlp": 0.01040434, + "balance_loss_clip": 1.03020549, + "balance_loss_mlp": 1.02437592, + "epoch": 0.2707951300165339, + "flos": 20193037367040.0, + "grad_norm": 1.7539547481805364, + "language_loss": 0.68053204, + "learning_rate": 3.4219719199272918e-06, + "loss": 0.70138317, + "num_input_tokens_seen": 97436780, + "step": 4504, + "time_per_iteration": 2.790761947631836 + }, + { + "auxiliary_loss_clip": 0.0108672, + "auxiliary_loss_mlp": 0.01044218, + "balance_loss_clip": 1.03682923, + "balance_loss_mlp": 1.02981782, + "epoch": 0.27085525326920185, + "flos": 21433966479360.0, + "grad_norm": 1.4541743545745331, + "language_loss": 0.7527535, + "learning_rate": 3.421698021097902e-06, + "loss": 0.77406287, + "num_input_tokens_seen": 97456190, + "step": 4505, + "time_per_iteration": 2.785352945327759 + }, + { + "auxiliary_loss_clip": 0.01094659, + "auxiliary_loss_mlp": 0.01048459, + "balance_loss_clip": 1.03144431, + "balance_loss_mlp": 1.0313648, + "epoch": 0.2709153765218698, + "flos": 17675232606720.0, + "grad_norm": 2.977319878975585, + "language_loss": 0.73490536, + "learning_rate": 3.42142406835758e-06, + "loss": 0.75633657, + "num_input_tokens_seen": 97474545, + "step": 4506, + "time_per_iteration": 2.7065625190734863 + }, + { + "auxiliary_loss_clip": 0.01072012, + "auxiliary_loss_mlp": 0.01042218, + "balance_loss_clip": 1.02951956, + "balance_loss_mlp": 1.02570772, + "epoch": 0.2709754997745378, + "flos": 24456243801600.0, + "grad_norm": 4.275719752078487, + "language_loss": 0.80275965, + "learning_rate": 3.421150061716715e-06, + "loss": 0.82390201, + "num_input_tokens_seen": 97494520, + "step": 4507, + "time_per_iteration": 2.717808723449707 + }, + { + "auxiliary_loss_clip": 0.00999606, + "auxiliary_loss_mlp": 0.01011132, + "balance_loss_clip": 1.00462449, + "balance_loss_mlp": 1.00921237, + "epoch": 0.2710356230272058, + "flos": 65210798206080.0, + "grad_norm": 0.7344288384339249, + "language_loss": 0.50846922, + "learning_rate": 3.420876001185698e-06, + "loss": 0.52857661, + "num_input_tokens_seen": 97552455, + "step": 4508, + "time_per_iteration": 3.095885992050171 + }, + { + "auxiliary_loss_clip": 0.01012286, + "auxiliary_loss_mlp": 0.01038917, + "balance_loss_clip": 1.02379155, + "balance_loss_mlp": 1.02434981, + "epoch": 0.27109574627987376, + "flos": 25484438615040.0, + "grad_norm": 2.427861002506537, + "language_loss": 0.74795943, + "learning_rate": 3.4206018867749197e-06, + "loss": 0.76847148, + "num_input_tokens_seen": 97572650, + "step": 4509, + "time_per_iteration": 2.78183650970459 + }, + { + "auxiliary_loss_clip": 0.01074696, + "auxiliary_loss_mlp": 0.01036199, + "balance_loss_clip": 1.02919436, + "balance_loss_mlp": 1.02246594, + "epoch": 0.2711558695325417, + "flos": 19682782715520.0, + "grad_norm": 8.005989898381102, + "language_loss": 0.71809304, + "learning_rate": 3.4203277184947757e-06, + "loss": 0.73920202, + "num_input_tokens_seen": 97591150, + "step": 4510, + "time_per_iteration": 2.8775625228881836 + }, + { + "auxiliary_loss_clip": 0.0107973, + "auxiliary_loss_mlp": 0.01033578, + "balance_loss_clip": 1.03188455, + "balance_loss_mlp": 1.01895094, + "epoch": 0.2712159927852097, + "flos": 18587758648320.0, + "grad_norm": 2.4591510338702, + "language_loss": 0.70269185, + "learning_rate": 3.4200534963556627e-06, + "loss": 0.72382492, + "num_input_tokens_seen": 97607410, + "step": 4511, + "time_per_iteration": 2.6180224418640137 + }, + { + "auxiliary_loss_clip": 0.0106938, + "auxiliary_loss_mlp": 0.01037157, + "balance_loss_clip": 1.03119469, + "balance_loss_mlp": 1.02164793, + "epoch": 0.27127611603787766, + "flos": 25630235919360.0, + "grad_norm": 2.235716411177429, + "language_loss": 0.80866754, + "learning_rate": 3.419779220367979e-06, + "loss": 0.82973289, + "num_input_tokens_seen": 97626870, + "step": 4512, + "time_per_iteration": 2.6334187984466553 + }, + { + "auxiliary_loss_clip": 0.01089645, + "auxiliary_loss_mlp": 0.01033326, + "balance_loss_clip": 1.03258634, + "balance_loss_mlp": 1.01984334, + "epoch": 0.2713362392905456, + "flos": 23148952312320.0, + "grad_norm": 1.7264778238113476, + "language_loss": 0.80474615, + "learning_rate": 3.419504890542124e-06, + "loss": 0.82597584, + "num_input_tokens_seen": 97646595, + "step": 4513, + "time_per_iteration": 2.6277456283569336 + }, + { + "auxiliary_loss_clip": 0.01062529, + "auxiliary_loss_mlp": 0.01033344, + "balance_loss_clip": 1.02771735, + "balance_loss_mlp": 1.01921725, + "epoch": 0.2713963625432136, + "flos": 18366045949440.0, + "grad_norm": 1.8082684013812533, + "language_loss": 0.88303399, + "learning_rate": 3.4192305068885026e-06, + "loss": 0.90399277, + "num_input_tokens_seen": 97665485, + "step": 4514, + "time_per_iteration": 2.588426113128662 + }, + { + "auxiliary_loss_clip": 0.01070672, + "auxiliary_loss_mlp": 0.01042729, + "balance_loss_clip": 1.03165054, + "balance_loss_mlp": 1.02727938, + "epoch": 0.27145648579588155, + "flos": 22491751121280.0, + "grad_norm": 1.5866556615948006, + "language_loss": 0.91783309, + "learning_rate": 3.418956069417517e-06, + "loss": 0.93896705, + "num_input_tokens_seen": 97683800, + "step": 4515, + "time_per_iteration": 2.7203025817871094 + }, + { + "auxiliary_loss_clip": 0.01051045, + "auxiliary_loss_mlp": 0.01051716, + "balance_loss_clip": 1.03263986, + "balance_loss_mlp": 1.0338583, + "epoch": 0.2715166090485495, + "flos": 19239177749760.0, + "grad_norm": 2.3133994606272683, + "language_loss": 0.7279489, + "learning_rate": 3.4186815781395756e-06, + "loss": 0.74897659, + "num_input_tokens_seen": 97700505, + "step": 4516, + "time_per_iteration": 2.7002665996551514 + }, + { + "auxiliary_loss_clip": 0.01081788, + "auxiliary_loss_mlp": 0.01037204, + "balance_loss_clip": 1.03163052, + "balance_loss_mlp": 1.02170706, + "epoch": 0.2715767323012175, + "flos": 17709598944000.0, + "grad_norm": 4.526287088868518, + "language_loss": 0.76210058, + "learning_rate": 3.4184070330650866e-06, + "loss": 0.78329051, + "num_input_tokens_seen": 97717410, + "step": 4517, + "time_per_iteration": 2.659538984298706 + }, + { + "auxiliary_loss_clip": 0.0103541, + "auxiliary_loss_mlp": 0.01043877, + "balance_loss_clip": 1.02428019, + "balance_loss_mlp": 1.02720523, + "epoch": 0.27163685555388545, + "flos": 22382834106240.0, + "grad_norm": 2.073474489175344, + "language_loss": 0.77117234, + "learning_rate": 3.4181324342044607e-06, + "loss": 0.79196519, + "num_input_tokens_seen": 97734545, + "step": 4518, + "time_per_iteration": 2.710430145263672 + }, + { + "auxiliary_loss_clip": 0.01063549, + "auxiliary_loss_mlp": 0.0103498, + "balance_loss_clip": 1.03154278, + "balance_loss_mlp": 1.02067447, + "epoch": 0.2716969788065534, + "flos": 22346708002560.0, + "grad_norm": 1.7371393333952079, + "language_loss": 0.68476856, + "learning_rate": 3.41785778156811e-06, + "loss": 0.7057538, + "num_input_tokens_seen": 97754000, + "step": 4519, + "time_per_iteration": 2.8939108848571777 + }, + { + "auxiliary_loss_clip": 0.01080118, + "auxiliary_loss_mlp": 0.01035556, + "balance_loss_clip": 1.03165698, + "balance_loss_mlp": 1.02137566, + "epoch": 0.2717571020592214, + "flos": 25228467319680.0, + "grad_norm": 2.165207489997934, + "language_loss": 0.75151455, + "learning_rate": 3.417583075166451e-06, + "loss": 0.77267134, + "num_input_tokens_seen": 97772080, + "step": 4520, + "time_per_iteration": 4.47353458404541 + }, + { + "auxiliary_loss_clip": 0.01077457, + "auxiliary_loss_mlp": 0.01040476, + "balance_loss_clip": 1.0303427, + "balance_loss_mlp": 1.02354825, + "epoch": 0.2718172253118894, + "flos": 20189769229440.0, + "grad_norm": 2.424214328115676, + "language_loss": 0.75925726, + "learning_rate": 3.4173083150099e-06, + "loss": 0.78043664, + "num_input_tokens_seen": 97789370, + "step": 4521, + "time_per_iteration": 2.6768648624420166 + }, + { + "auxiliary_loss_clip": 0.01057133, + "auxiliary_loss_mlp": 0.01043015, + "balance_loss_clip": 1.02693987, + "balance_loss_mlp": 1.02663612, + "epoch": 0.27187734856455736, + "flos": 14319129260160.0, + "grad_norm": 2.0686842764657993, + "language_loss": 0.74873245, + "learning_rate": 3.417033501108875e-06, + "loss": 0.76973397, + "num_input_tokens_seen": 97807385, + "step": 4522, + "time_per_iteration": 4.292971849441528 + }, + { + "auxiliary_loss_clip": 0.0109593, + "auxiliary_loss_mlp": 0.01033062, + "balance_loss_clip": 1.03459597, + "balance_loss_mlp": 1.01805937, + "epoch": 0.27193747181722533, + "flos": 21107682311040.0, + "grad_norm": 2.822980087928001, + "language_loss": 0.7248109, + "learning_rate": 3.416758633473798e-06, + "loss": 0.74610078, + "num_input_tokens_seen": 97827930, + "step": 4523, + "time_per_iteration": 2.659059762954712 + }, + { + "auxiliary_loss_clip": 0.01067463, + "auxiliary_loss_mlp": 0.010367, + "balance_loss_clip": 1.03008461, + "balance_loss_mlp": 1.02208507, + "epoch": 0.2719975950698933, + "flos": 19682782715520.0, + "grad_norm": 1.6136065124122594, + "language_loss": 0.74301887, + "learning_rate": 3.4164837121150915e-06, + "loss": 0.7640605, + "num_input_tokens_seen": 97847440, + "step": 4524, + "time_per_iteration": 2.687711477279663 + }, + { + "auxiliary_loss_clip": 0.01092888, + "auxiliary_loss_mlp": 0.01035764, + "balance_loss_clip": 1.03342175, + "balance_loss_mlp": 1.02097023, + "epoch": 0.27205771832256126, + "flos": 24754482426240.0, + "grad_norm": 1.8060639498492772, + "language_loss": 0.7602402, + "learning_rate": 3.4162087370431803e-06, + "loss": 0.78152668, + "num_input_tokens_seen": 97867620, + "step": 4525, + "time_per_iteration": 2.575465679168701 + }, + { + "auxiliary_loss_clip": 0.01074136, + "auxiliary_loss_mlp": 0.01039562, + "balance_loss_clip": 1.02939141, + "balance_loss_mlp": 1.02530479, + "epoch": 0.2721178415752292, + "flos": 21755581879680.0, + "grad_norm": 1.7143157241922165, + "language_loss": 0.81600797, + "learning_rate": 3.4159337082684926e-06, + "loss": 0.83714497, + "num_input_tokens_seen": 97884345, + "step": 4526, + "time_per_iteration": 2.6728053092956543 + }, + { + "auxiliary_loss_clip": 0.0109459, + "auxiliary_loss_mlp": 0.01041071, + "balance_loss_clip": 1.03163362, + "balance_loss_mlp": 1.02456045, + "epoch": 0.2721779648278972, + "flos": 12676826597760.0, + "grad_norm": 2.068746531820656, + "language_loss": 0.76864225, + "learning_rate": 3.4156586258014566e-06, + "loss": 0.78999889, + "num_input_tokens_seen": 97901500, + "step": 4527, + "time_per_iteration": 2.6119699478149414 + }, + { + "auxiliary_loss_clip": 0.01061114, + "auxiliary_loss_mlp": 0.00749034, + "balance_loss_clip": 1.03189719, + "balance_loss_mlp": 1.00171304, + "epoch": 0.27223808808056515, + "flos": 16253206099200.0, + "grad_norm": 2.1246446307924622, + "language_loss": 0.81897509, + "learning_rate": 3.415383489652503e-06, + "loss": 0.83707654, + "num_input_tokens_seen": 97917800, + "step": 4528, + "time_per_iteration": 2.6747212409973145 + }, + { + "auxiliary_loss_clip": 0.01051064, + "auxiliary_loss_mlp": 0.01040245, + "balance_loss_clip": 1.02625871, + "balance_loss_mlp": 1.02510524, + "epoch": 0.2722982113332331, + "flos": 27745805203200.0, + "grad_norm": 1.8427794373970419, + "language_loss": 0.77482086, + "learning_rate": 3.4151082998320666e-06, + "loss": 0.79573393, + "num_input_tokens_seen": 97937225, + "step": 4529, + "time_per_iteration": 5.870492219924927 + }, + { + "auxiliary_loss_clip": 0.01072095, + "auxiliary_loss_mlp": 0.01042683, + "balance_loss_clip": 1.03164685, + "balance_loss_mlp": 1.02856839, + "epoch": 0.2723583345859011, + "flos": 21726243446400.0, + "grad_norm": 2.07290353946128, + "language_loss": 0.82297897, + "learning_rate": 3.4148330563505805e-06, + "loss": 0.84412682, + "num_input_tokens_seen": 97956845, + "step": 4530, + "time_per_iteration": 2.6576106548309326 + }, + { + "auxiliary_loss_clip": 0.01082041, + "auxiliary_loss_mlp": 0.01033483, + "balance_loss_clip": 1.03257418, + "balance_loss_mlp": 1.01818824, + "epoch": 0.27241845783856905, + "flos": 17347260499200.0, + "grad_norm": 2.0952252906490187, + "language_loss": 0.91381848, + "learning_rate": 3.4145577592184838e-06, + "loss": 0.93497372, + "num_input_tokens_seen": 97972465, + "step": 4531, + "time_per_iteration": 2.5355868339538574 + }, + { + "auxiliary_loss_clip": 0.01082612, + "auxiliary_loss_mlp": 0.01043751, + "balance_loss_clip": 1.03138447, + "balance_loss_mlp": 1.0288136, + "epoch": 0.272478581091237, + "flos": 24754302858240.0, + "grad_norm": 1.8275070507486961, + "language_loss": 0.76648051, + "learning_rate": 3.4142824084462155e-06, + "loss": 0.78774416, + "num_input_tokens_seen": 97990770, + "step": 4532, + "time_per_iteration": 2.5992934703826904 + }, + { + "auxiliary_loss_clip": 0.01056248, + "auxiliary_loss_mlp": 0.01029494, + "balance_loss_clip": 1.02932763, + "balance_loss_mlp": 1.01480126, + "epoch": 0.272538704343905, + "flos": 17890624512000.0, + "grad_norm": 2.3858635905719447, + "language_loss": 0.88670528, + "learning_rate": 3.4140070040442162e-06, + "loss": 0.90756273, + "num_input_tokens_seen": 98005775, + "step": 4533, + "time_per_iteration": 2.6307497024536133 + }, + { + "auxiliary_loss_clip": 0.01065115, + "auxiliary_loss_mlp": 0.01032238, + "balance_loss_clip": 1.02874792, + "balance_loss_mlp": 1.01764655, + "epoch": 0.272598827596573, + "flos": 22932016122240.0, + "grad_norm": 2.143785166054194, + "language_loss": 0.7147823, + "learning_rate": 3.413731546022929e-06, + "loss": 0.73575586, + "num_input_tokens_seen": 98025750, + "step": 4534, + "time_per_iteration": 2.7292661666870117 + }, + { + "auxiliary_loss_clip": 0.01070699, + "auxiliary_loss_mlp": 0.01037822, + "balance_loss_clip": 1.03104591, + "balance_loss_mlp": 1.02231884, + "epoch": 0.27265895084924097, + "flos": 24238409771520.0, + "grad_norm": 1.6318096804630828, + "language_loss": 0.91254079, + "learning_rate": 3.4134560343928005e-06, + "loss": 0.93362594, + "num_input_tokens_seen": 98044955, + "step": 4535, + "time_per_iteration": 2.6769330501556396 + }, + { + "auxiliary_loss_clip": 0.01070123, + "auxiliary_loss_mlp": 0.01037638, + "balance_loss_clip": 1.03089499, + "balance_loss_mlp": 1.02202153, + "epoch": 0.27271907410190893, + "flos": 27013155494400.0, + "grad_norm": 1.620439451196767, + "language_loss": 0.73098385, + "learning_rate": 3.4131804691642778e-06, + "loss": 0.75206149, + "num_input_tokens_seen": 98065860, + "step": 4536, + "time_per_iteration": 2.699521541595459 + }, + { + "auxiliary_loss_clip": 0.01078783, + "auxiliary_loss_mlp": 0.01034855, + "balance_loss_clip": 1.02890396, + "balance_loss_mlp": 1.01993608, + "epoch": 0.2727791973545769, + "flos": 34452588942720.0, + "grad_norm": 1.7324857656744501, + "language_loss": 0.71482307, + "learning_rate": 3.41290485034781e-06, + "loss": 0.73595953, + "num_input_tokens_seen": 98085450, + "step": 4537, + "time_per_iteration": 2.744513511657715 + }, + { + "auxiliary_loss_clip": 0.01061581, + "auxiliary_loss_mlp": 0.01035798, + "balance_loss_clip": 1.02845955, + "balance_loss_mlp": 1.02032471, + "epoch": 0.27283932060724486, + "flos": 15041723160960.0, + "grad_norm": 2.047987091797055, + "language_loss": 0.78273511, + "learning_rate": 3.4126291779538485e-06, + "loss": 0.80370891, + "num_input_tokens_seen": 98099115, + "step": 4538, + "time_per_iteration": 2.618748426437378 + }, + { + "auxiliary_loss_clip": 0.01079651, + "auxiliary_loss_mlp": 0.01043807, + "balance_loss_clip": 1.03079176, + "balance_loss_mlp": 1.02926302, + "epoch": 0.2728994438599128, + "flos": 21652411040640.0, + "grad_norm": 1.4325115892656062, + "language_loss": 0.89882672, + "learning_rate": 3.412353451992847e-06, + "loss": 0.92006129, + "num_input_tokens_seen": 98118415, + "step": 4539, + "time_per_iteration": 2.721798896789551 + }, + { + "auxiliary_loss_clip": 0.01067534, + "auxiliary_loss_mlp": 0.01039126, + "balance_loss_clip": 1.02868581, + "balance_loss_mlp": 1.02330732, + "epoch": 0.2729595671125808, + "flos": 17488424949120.0, + "grad_norm": 1.826498445540282, + "language_loss": 0.88065213, + "learning_rate": 3.4120776724752607e-06, + "loss": 0.90171868, + "num_input_tokens_seen": 98136300, + "step": 4540, + "time_per_iteration": 2.72990345954895 + }, + { + "auxiliary_loss_clip": 0.01078531, + "auxiliary_loss_mlp": 0.00748951, + "balance_loss_clip": 1.02891207, + "balance_loss_mlp": 1.00182462, + "epoch": 0.27301969036524876, + "flos": 19318145800320.0, + "grad_norm": 1.8295709065312615, + "language_loss": 0.82234752, + "learning_rate": 3.4118018394115476e-06, + "loss": 0.84062231, + "num_input_tokens_seen": 98154580, + "step": 4541, + "time_per_iteration": 2.616570472717285 + }, + { + "auxiliary_loss_clip": 0.01072384, + "auxiliary_loss_mlp": 0.01038764, + "balance_loss_clip": 1.03261828, + "balance_loss_mlp": 1.02380323, + "epoch": 0.2730798136179167, + "flos": 21065666376960.0, + "grad_norm": 2.030835346508393, + "language_loss": 0.79934973, + "learning_rate": 3.4115259528121678e-06, + "loss": 0.82046121, + "num_input_tokens_seen": 98173115, + "step": 4542, + "time_per_iteration": 2.74131178855896 + }, + { + "auxiliary_loss_clip": 0.01074483, + "auxiliary_loss_mlp": 0.01036371, + "balance_loss_clip": 1.03375316, + "balance_loss_mlp": 1.02180374, + "epoch": 0.2731399368705847, + "flos": 19171737964800.0, + "grad_norm": 2.3499738660552008, + "language_loss": 0.89230204, + "learning_rate": 3.411250012687582e-06, + "loss": 0.9134106, + "num_input_tokens_seen": 98190260, + "step": 4543, + "time_per_iteration": 2.741539478302002 + }, + { + "auxiliary_loss_clip": 0.0105944, + "auxiliary_loss_mlp": 0.00748736, + "balance_loss_clip": 1.02788401, + "balance_loss_mlp": 1.0016067, + "epoch": 0.27320006012325265, + "flos": 18290130554880.0, + "grad_norm": 2.2280029342331136, + "language_loss": 0.63121784, + "learning_rate": 3.410974019048255e-06, + "loss": 0.64929962, + "num_input_tokens_seen": 98207115, + "step": 4544, + "time_per_iteration": 2.832115411758423 + }, + { + "auxiliary_loss_clip": 0.01069989, + "auxiliary_loss_mlp": 0.01035282, + "balance_loss_clip": 1.03150988, + "balance_loss_mlp": 1.01974916, + "epoch": 0.2732601833759206, + "flos": 34860929731200.0, + "grad_norm": 1.8144710036431817, + "language_loss": 0.69967824, + "learning_rate": 3.410697971904651e-06, + "loss": 0.72073096, + "num_input_tokens_seen": 98230610, + "step": 4545, + "time_per_iteration": 2.7756550312042236 + }, + { + "auxiliary_loss_clip": 0.00997538, + "auxiliary_loss_mlp": 0.01008441, + "balance_loss_clip": 1.00360847, + "balance_loss_mlp": 1.00662887, + "epoch": 0.2733203066285886, + "flos": 53910824762880.0, + "grad_norm": 0.723430079624043, + "language_loss": 0.61572313, + "learning_rate": 3.4104218712672383e-06, + "loss": 0.63578296, + "num_input_tokens_seen": 98293585, + "step": 4546, + "time_per_iteration": 3.340125560760498 + }, + { + "auxiliary_loss_clip": 0.01007965, + "auxiliary_loss_mlp": 0.01049501, + "balance_loss_clip": 1.03350544, + "balance_loss_mlp": 1.03262091, + "epoch": 0.2733804298812566, + "flos": 20660378244480.0, + "grad_norm": 2.477935549417644, + "language_loss": 0.64893275, + "learning_rate": 3.410145717146488e-06, + "loss": 0.66950738, + "num_input_tokens_seen": 98311680, + "step": 4547, + "time_per_iteration": 3.083752155303955 + }, + { + "auxiliary_loss_clip": 0.01064768, + "auxiliary_loss_mlp": 0.00748827, + "balance_loss_clip": 1.02891636, + "balance_loss_mlp": 1.00184202, + "epoch": 0.27344055313392457, + "flos": 25884339707520.0, + "grad_norm": 1.9354693292946337, + "language_loss": 0.77479601, + "learning_rate": 3.4098695095528694e-06, + "loss": 0.79293197, + "num_input_tokens_seen": 98330770, + "step": 4548, + "time_per_iteration": 2.9562439918518066 + }, + { + "auxiliary_loss_clip": 0.0106816, + "auxiliary_loss_mlp": 0.01035184, + "balance_loss_clip": 1.03050017, + "balance_loss_mlp": 1.02172542, + "epoch": 0.27350067638659253, + "flos": 22929753565440.0, + "grad_norm": 1.8757541535718176, + "language_loss": 0.82638019, + "learning_rate": 3.4095932484968585e-06, + "loss": 0.84741366, + "num_input_tokens_seen": 98349860, + "step": 4549, + "time_per_iteration": 2.6744234561920166 + }, + { + "auxiliary_loss_clip": 0.01079806, + "auxiliary_loss_mlp": 0.01042147, + "balance_loss_clip": 1.02922893, + "balance_loss_mlp": 1.02549338, + "epoch": 0.2735607996392605, + "flos": 16574821499520.0, + "grad_norm": 2.062227183808276, + "language_loss": 0.70945156, + "learning_rate": 3.4093169339889305e-06, + "loss": 0.73067111, + "num_input_tokens_seen": 98367040, + "step": 4550, + "time_per_iteration": 2.557293176651001 + }, + { + "auxiliary_loss_clip": 0.01056751, + "auxiliary_loss_mlp": 0.01032521, + "balance_loss_clip": 1.03045034, + "balance_loss_mlp": 1.01913357, + "epoch": 0.27362092289192846, + "flos": 19645291895040.0, + "grad_norm": 2.0096314947124196, + "language_loss": 0.79032588, + "learning_rate": 3.409040566039563e-06, + "loss": 0.81121856, + "num_input_tokens_seen": 98384010, + "step": 4551, + "time_per_iteration": 2.6145827770233154 + }, + { + "auxiliary_loss_clip": 0.0104998, + "auxiliary_loss_mlp": 0.01046586, + "balance_loss_clip": 1.02708352, + "balance_loss_mlp": 1.0302186, + "epoch": 0.27368104614459643, + "flos": 17639142416640.0, + "grad_norm": 2.4024497292552, + "language_loss": 0.7043618, + "learning_rate": 3.4087641446592362e-06, + "loss": 0.72532743, + "num_input_tokens_seen": 98399625, + "step": 4552, + "time_per_iteration": 2.779672145843506 + }, + { + "auxiliary_loss_clip": 0.0107346, + "auxiliary_loss_mlp": 0.01034137, + "balance_loss_clip": 1.03351927, + "balance_loss_mlp": 1.0191344, + "epoch": 0.2737411693972644, + "flos": 21580015178880.0, + "grad_norm": 2.133559308204542, + "language_loss": 0.71963072, + "learning_rate": 3.408487669858431e-06, + "loss": 0.74070668, + "num_input_tokens_seen": 98417310, + "step": 4553, + "time_per_iteration": 2.715847969055176 + }, + { + "auxiliary_loss_clip": 0.01079205, + "auxiliary_loss_mlp": 0.01036674, + "balance_loss_clip": 1.03079605, + "balance_loss_mlp": 1.02135515, + "epoch": 0.27380129264993236, + "flos": 25484043565440.0, + "grad_norm": 1.6229873012826448, + "language_loss": 0.59177387, + "learning_rate": 3.4082111416476337e-06, + "loss": 0.61293268, + "num_input_tokens_seen": 98438670, + "step": 4554, + "time_per_iteration": 2.5972862243652344 + }, + { + "auxiliary_loss_clip": 0.01075708, + "auxiliary_loss_mlp": 0.01033411, + "balance_loss_clip": 1.03381073, + "balance_loss_mlp": 1.01762724, + "epoch": 0.2738614159026003, + "flos": 18661196004480.0, + "grad_norm": 3.7534101473042423, + "language_loss": 0.73918527, + "learning_rate": 3.4079345600373275e-06, + "loss": 0.76027656, + "num_input_tokens_seen": 98456060, + "step": 4555, + "time_per_iteration": 2.5625836849212646 + }, + { + "auxiliary_loss_clip": 0.01082594, + "auxiliary_loss_mlp": 0.010348, + "balance_loss_clip": 1.03228736, + "balance_loss_mlp": 1.0202806, + "epoch": 0.2739215391552683, + "flos": 23477139901440.0, + "grad_norm": 1.8397325649107255, + "language_loss": 0.77747387, + "learning_rate": 3.407657925038002e-06, + "loss": 0.79864776, + "num_input_tokens_seen": 98473765, + "step": 4556, + "time_per_iteration": 2.6510441303253174 + }, + { + "auxiliary_loss_clip": 0.01088289, + "auxiliary_loss_mlp": 0.01050886, + "balance_loss_clip": 1.03204274, + "balance_loss_mlp": 1.03414893, + "epoch": 0.27398166240793626, + "flos": 17128636369920.0, + "grad_norm": 2.3712973274841307, + "language_loss": 0.8267495, + "learning_rate": 3.4073812366601473e-06, + "loss": 0.84814125, + "num_input_tokens_seen": 98490590, + "step": 4557, + "time_per_iteration": 2.6806557178497314 + }, + { + "auxiliary_loss_clip": 0.01029592, + "auxiliary_loss_mlp": 0.0104464, + "balance_loss_clip": 1.02524686, + "balance_loss_mlp": 1.02932143, + "epoch": 0.2740417856606042, + "flos": 23404744039680.0, + "grad_norm": 1.7323191245015654, + "language_loss": 0.73109877, + "learning_rate": 3.4071044949142547e-06, + "loss": 0.75184107, + "num_input_tokens_seen": 98510590, + "step": 4558, + "time_per_iteration": 2.8114967346191406 + }, + { + "auxiliary_loss_clip": 0.01069799, + "auxiliary_loss_mlp": 0.01046573, + "balance_loss_clip": 1.02973115, + "balance_loss_mlp": 1.03156483, + "epoch": 0.2741019089132722, + "flos": 12780428400000.0, + "grad_norm": 2.2038222956376923, + "language_loss": 0.68033743, + "learning_rate": 3.406827699810819e-06, + "loss": 0.70150113, + "num_input_tokens_seen": 98527875, + "step": 4559, + "time_per_iteration": 2.6387736797332764 + }, + { + "auxiliary_loss_clip": 0.0106586, + "auxiliary_loss_mlp": 0.01048842, + "balance_loss_clip": 1.03056073, + "balance_loss_mlp": 1.03299344, + "epoch": 0.27416203216594015, + "flos": 20631542601600.0, + "grad_norm": 1.6887828120819066, + "language_loss": 0.71858644, + "learning_rate": 3.4065508513603353e-06, + "loss": 0.73973346, + "num_input_tokens_seen": 98547575, + "step": 4560, + "time_per_iteration": 2.69580340385437 + }, + { + "auxiliary_loss_clip": 0.01072776, + "auxiliary_loss_mlp": 0.01036531, + "balance_loss_clip": 1.031546, + "balance_loss_mlp": 1.02146268, + "epoch": 0.27422215541860817, + "flos": 26541576812160.0, + "grad_norm": 2.534637389135696, + "language_loss": 0.81270802, + "learning_rate": 3.406273949573303e-06, + "loss": 0.83380103, + "num_input_tokens_seen": 98566290, + "step": 4561, + "time_per_iteration": 2.8368396759033203 + }, + { + "auxiliary_loss_clip": 0.01094723, + "auxiliary_loss_mlp": 0.01038555, + "balance_loss_clip": 1.03207326, + "balance_loss_mlp": 1.02339792, + "epoch": 0.27428227867127614, + "flos": 23331163029120.0, + "grad_norm": 1.695051921150965, + "language_loss": 0.75208354, + "learning_rate": 3.4059969944602214e-06, + "loss": 0.7734164, + "num_input_tokens_seen": 98586255, + "step": 4562, + "time_per_iteration": 2.5795300006866455 + }, + { + "auxiliary_loss_clip": 0.01093212, + "auxiliary_loss_mlp": 0.01031585, + "balance_loss_clip": 1.03355253, + "balance_loss_mlp": 1.01733935, + "epoch": 0.2743424019239441, + "flos": 23035115134080.0, + "grad_norm": 1.6333809136754103, + "language_loss": 0.74567384, + "learning_rate": 3.4057199860315928e-06, + "loss": 0.76692176, + "num_input_tokens_seen": 98606030, + "step": 4563, + "time_per_iteration": 2.6181747913360596 + }, + { + "auxiliary_loss_clip": 0.01063981, + "auxiliary_loss_mlp": 0.01041311, + "balance_loss_clip": 1.02899742, + "balance_loss_mlp": 1.02418029, + "epoch": 0.27440252517661207, + "flos": 21981101420160.0, + "grad_norm": 1.5825964774816639, + "language_loss": 0.63033175, + "learning_rate": 3.4054429242979213e-06, + "loss": 0.65138471, + "num_input_tokens_seen": 98625225, + "step": 4564, + "time_per_iteration": 2.6804330348968506 + }, + { + "auxiliary_loss_clip": 0.01069948, + "auxiliary_loss_mlp": 0.01038242, + "balance_loss_clip": 1.0305872, + "balance_loss_mlp": 1.02241731, + "epoch": 0.27446264842928003, + "flos": 40187451502080.0, + "grad_norm": 2.1035952306137906, + "language_loss": 0.79123092, + "learning_rate": 3.4051658092697135e-06, + "loss": 0.81231284, + "num_input_tokens_seen": 98649470, + "step": 4565, + "time_per_iteration": 2.8139736652374268 + }, + { + "auxiliary_loss_clip": 0.01035353, + "auxiliary_loss_mlp": 0.01040805, + "balance_loss_clip": 1.02665651, + "balance_loss_mlp": 1.02579689, + "epoch": 0.274522771681948, + "flos": 13479681438720.0, + "grad_norm": 1.9309611450161803, + "language_loss": 0.68545979, + "learning_rate": 3.404888640957477e-06, + "loss": 0.70622146, + "num_input_tokens_seen": 98666915, + "step": 4566, + "time_per_iteration": 2.7075910568237305 + }, + { + "auxiliary_loss_clip": 0.01083647, + "auxiliary_loss_mlp": 0.01041543, + "balance_loss_clip": 1.03432226, + "balance_loss_mlp": 1.02785194, + "epoch": 0.27458289493461596, + "flos": 28622133313920.0, + "grad_norm": 1.5777485421359347, + "language_loss": 0.60838264, + "learning_rate": 3.404611419371723e-06, + "loss": 0.62963456, + "num_input_tokens_seen": 98688240, + "step": 4567, + "time_per_iteration": 2.79986310005188 + }, + { + "auxiliary_loss_clip": 0.01074994, + "auxiliary_loss_mlp": 0.01043326, + "balance_loss_clip": 1.03098595, + "balance_loss_mlp": 1.0253613, + "epoch": 0.2746430181872839, + "flos": 20119815492480.0, + "grad_norm": 2.148865165298235, + "language_loss": 0.82402706, + "learning_rate": 3.4043341445229627e-06, + "loss": 0.84521025, + "num_input_tokens_seen": 98708245, + "step": 4568, + "time_per_iteration": 4.384025573730469 + }, + { + "auxiliary_loss_clip": 0.01085694, + "auxiliary_loss_mlp": 0.01035907, + "balance_loss_clip": 1.03456998, + "balance_loss_mlp": 1.02040935, + "epoch": 0.2747031414399519, + "flos": 20193468330240.0, + "grad_norm": 2.451018019371965, + "language_loss": 0.68377686, + "learning_rate": 3.4040568164217117e-06, + "loss": 0.70499283, + "num_input_tokens_seen": 98724575, + "step": 4569, + "time_per_iteration": 4.140775680541992 + }, + { + "auxiliary_loss_clip": 0.01057731, + "auxiliary_loss_mlp": 0.01039082, + "balance_loss_clip": 1.02613115, + "balance_loss_mlp": 1.02286935, + "epoch": 0.27476326469261986, + "flos": 13516346246400.0, + "grad_norm": 2.2558557455188697, + "language_loss": 0.70951182, + "learning_rate": 3.4037794350784848e-06, + "loss": 0.73047996, + "num_input_tokens_seen": 98740700, + "step": 4570, + "time_per_iteration": 2.6250851154327393 + }, + { + "auxiliary_loss_clip": 0.00991957, + "auxiliary_loss_mlp": 0.01003244, + "balance_loss_clip": 1.00473857, + "balance_loss_mlp": 1.00152743, + "epoch": 0.2748233879452878, + "flos": 65937127121280.0, + "grad_norm": 0.7215876653556084, + "language_loss": 0.55835021, + "learning_rate": 3.4035020005038014e-06, + "loss": 0.57830215, + "num_input_tokens_seen": 98803030, + "step": 4571, + "time_per_iteration": 3.3730781078338623 + }, + { + "auxiliary_loss_clip": 0.01042313, + "auxiliary_loss_mlp": 0.01046912, + "balance_loss_clip": 1.0297029, + "balance_loss_mlp": 1.03129578, + "epoch": 0.2748835111979558, + "flos": 17384212615680.0, + "grad_norm": 2.025439035796505, + "language_loss": 0.77877796, + "learning_rate": 3.4032245127081812e-06, + "loss": 0.79967022, + "num_input_tokens_seen": 98820505, + "step": 4572, + "time_per_iteration": 2.711751699447632 + }, + { + "auxiliary_loss_clip": 0.01090538, + "auxiliary_loss_mlp": 0.0103456, + "balance_loss_clip": 1.03353286, + "balance_loss_mlp": 1.02134585, + "epoch": 0.27494363445062375, + "flos": 23587565287680.0, + "grad_norm": 1.5480052608423926, + "language_loss": 0.81669098, + "learning_rate": 3.402946971702147e-06, + "loss": 0.83794194, + "num_input_tokens_seen": 98842150, + "step": 4573, + "time_per_iteration": 2.596391201019287 + }, + { + "auxiliary_loss_clip": 0.01080046, + "auxiliary_loss_mlp": 0.01037269, + "balance_loss_clip": 1.03104067, + "balance_loss_mlp": 1.02187932, + "epoch": 0.2750037577032918, + "flos": 17164582905600.0, + "grad_norm": 1.5724441414133734, + "language_loss": 0.7969259, + "learning_rate": 3.402669377496223e-06, + "loss": 0.81809902, + "num_input_tokens_seen": 98861050, + "step": 4574, + "time_per_iteration": 2.552072286605835 + }, + { + "auxiliary_loss_clip": 0.01052309, + "auxiliary_loss_mlp": 0.01046059, + "balance_loss_clip": 1.03068709, + "balance_loss_mlp": 1.03168821, + "epoch": 0.27506388095595974, + "flos": 24491903028480.0, + "grad_norm": 1.869432414980856, + "language_loss": 0.73819274, + "learning_rate": 3.402391730100936e-06, + "loss": 0.75917643, + "num_input_tokens_seen": 98879695, + "step": 4575, + "time_per_iteration": 2.8721730709075928 + }, + { + "auxiliary_loss_clip": 0.01069728, + "auxiliary_loss_mlp": 0.01036901, + "balance_loss_clip": 1.03005266, + "balance_loss_mlp": 1.02309632, + "epoch": 0.2751240042086277, + "flos": 38764706722560.0, + "grad_norm": 1.5943011245506113, + "language_loss": 0.71655595, + "learning_rate": 3.402114029526814e-06, + "loss": 0.73762226, + "num_input_tokens_seen": 98902035, + "step": 4576, + "time_per_iteration": 4.507781267166138 + }, + { + "auxiliary_loss_clip": 0.01053601, + "auxiliary_loss_mlp": 0.00748995, + "balance_loss_clip": 1.03220582, + "balance_loss_mlp": 1.00189877, + "epoch": 0.27518412746129567, + "flos": 26907039740160.0, + "grad_norm": 1.626840113754343, + "language_loss": 0.73299086, + "learning_rate": 3.4018362757843866e-06, + "loss": 0.75101686, + "num_input_tokens_seen": 98921835, + "step": 4577, + "time_per_iteration": 2.7007620334625244 + }, + { + "auxiliary_loss_clip": 0.01076016, + "auxiliary_loss_mlp": 0.01041005, + "balance_loss_clip": 1.03414917, + "balance_loss_mlp": 1.02492976, + "epoch": 0.27524425071396363, + "flos": 24900531125760.0, + "grad_norm": 8.367653070403836, + "language_loss": 0.76717657, + "learning_rate": 3.401558468884188e-06, + "loss": 0.78834677, + "num_input_tokens_seen": 98939610, + "step": 4578, + "time_per_iteration": 2.755099296569824 + }, + { + "auxiliary_loss_clip": 0.01076872, + "auxiliary_loss_mlp": 0.01051738, + "balance_loss_clip": 1.0362252, + "balance_loss_mlp": 1.0324018, + "epoch": 0.2753043739666316, + "flos": 26288047641600.0, + "grad_norm": 1.3041543412889658, + "language_loss": 0.66126001, + "learning_rate": 3.4012806088367516e-06, + "loss": 0.68254608, + "num_input_tokens_seen": 98962250, + "step": 4579, + "time_per_iteration": 2.802694082260132 + }, + { + "auxiliary_loss_clip": 0.0106002, + "auxiliary_loss_mlp": 0.01047118, + "balance_loss_clip": 1.02982557, + "balance_loss_mlp": 1.029809, + "epoch": 0.27536449721929956, + "flos": 24206772867840.0, + "grad_norm": 2.12704042807679, + "language_loss": 0.80002427, + "learning_rate": 3.4010026956526137e-06, + "loss": 0.82109571, + "num_input_tokens_seen": 98981845, + "step": 4580, + "time_per_iteration": 2.8423047065734863 + }, + { + "auxiliary_loss_clip": 0.01083897, + "auxiliary_loss_mlp": 0.01042297, + "balance_loss_clip": 1.03263164, + "balance_loss_mlp": 1.02558434, + "epoch": 0.27542462047196753, + "flos": 19537272720000.0, + "grad_norm": 1.3701147628743722, + "language_loss": 0.67452645, + "learning_rate": 3.4007247293423137e-06, + "loss": 0.69578838, + "num_input_tokens_seen": 99001855, + "step": 4581, + "time_per_iteration": 2.7311325073242188 + }, + { + "auxiliary_loss_clip": 0.01074123, + "auxiliary_loss_mlp": 0.01041865, + "balance_loss_clip": 1.03351402, + "balance_loss_mlp": 1.02673745, + "epoch": 0.2754847437246355, + "flos": 14319165173760.0, + "grad_norm": 1.7693966782628736, + "language_loss": 0.77976251, + "learning_rate": 3.400446709916392e-06, + "loss": 0.80092239, + "num_input_tokens_seen": 99019880, + "step": 4582, + "time_per_iteration": 2.6449553966522217 + }, + { + "auxiliary_loss_clip": 0.01054935, + "auxiliary_loss_mlp": 0.01037963, + "balance_loss_clip": 1.03668761, + "balance_loss_mlp": 1.02332449, + "epoch": 0.27554486697730346, + "flos": 18838773866880.0, + "grad_norm": 1.7880965729540055, + "language_loss": 0.84295684, + "learning_rate": 3.4001686373853895e-06, + "loss": 0.86388576, + "num_input_tokens_seen": 99037570, + "step": 4583, + "time_per_iteration": 2.7480626106262207 + }, + { + "auxiliary_loss_clip": 0.01083095, + "auxiliary_loss_mlp": 0.01038874, + "balance_loss_clip": 1.03115857, + "balance_loss_mlp": 1.02361476, + "epoch": 0.2756049902299714, + "flos": 22382295402240.0, + "grad_norm": 1.758846636001794, + "language_loss": 0.6653496, + "learning_rate": 3.3998905117598528e-06, + "loss": 0.68656933, + "num_input_tokens_seen": 99056875, + "step": 4584, + "time_per_iteration": 2.695406675338745 + }, + { + "auxiliary_loss_clip": 0.01029174, + "auxiliary_loss_mlp": 0.01045392, + "balance_loss_clip": 1.02516866, + "balance_loss_mlp": 1.02896452, + "epoch": 0.2756651134826394, + "flos": 19573901614080.0, + "grad_norm": 1.7904598490506616, + "language_loss": 0.7663523, + "learning_rate": 3.399612333050327e-06, + "loss": 0.78709793, + "num_input_tokens_seen": 99074685, + "step": 4585, + "time_per_iteration": 2.7815041542053223 + }, + { + "auxiliary_loss_clip": 0.01086026, + "auxiliary_loss_mlp": 0.00749015, + "balance_loss_clip": 1.03337169, + "balance_loss_mlp": 1.00187039, + "epoch": 0.27572523673530736, + "flos": 23586559706880.0, + "grad_norm": 1.890871371625451, + "language_loss": 0.72000504, + "learning_rate": 3.399334101267362e-06, + "loss": 0.7383554, + "num_input_tokens_seen": 99095300, + "step": 4586, + "time_per_iteration": 2.6352622509002686 + }, + { + "auxiliary_loss_clip": 0.01071702, + "auxiliary_loss_mlp": 0.01035913, + "balance_loss_clip": 1.03212321, + "balance_loss_mlp": 1.02023721, + "epoch": 0.2757853599879754, + "flos": 22820118278400.0, + "grad_norm": 1.4773189299067973, + "language_loss": 0.80464673, + "learning_rate": 3.3990558164215073e-06, + "loss": 0.82572287, + "num_input_tokens_seen": 99115965, + "step": 4587, + "time_per_iteration": 2.6737256050109863 + }, + { + "auxiliary_loss_clip": 0.01085062, + "auxiliary_loss_mlp": 0.0103577, + "balance_loss_clip": 1.03410423, + "balance_loss_mlp": 1.02102387, + "epoch": 0.27584548324064334, + "flos": 18551704371840.0, + "grad_norm": 2.101344473723129, + "language_loss": 0.82857984, + "learning_rate": 3.398777478523316e-06, + "loss": 0.84978819, + "num_input_tokens_seen": 99134265, + "step": 4588, + "time_per_iteration": 2.627387523651123 + }, + { + "auxiliary_loss_clip": 0.01054667, + "auxiliary_loss_mlp": 0.01035633, + "balance_loss_clip": 1.02875257, + "balance_loss_mlp": 1.0203023, + "epoch": 0.2759056064933113, + "flos": 23769883745280.0, + "grad_norm": 1.3155975835834424, + "language_loss": 0.75523031, + "learning_rate": 3.398499087583342e-06, + "loss": 0.7761333, + "num_input_tokens_seen": 99156185, + "step": 4589, + "time_per_iteration": 2.7190823554992676 + }, + { + "auxiliary_loss_clip": 0.01077908, + "auxiliary_loss_mlp": 0.01039977, + "balance_loss_clip": 1.03058982, + "balance_loss_mlp": 1.0238483, + "epoch": 0.27596572974597927, + "flos": 24281898163200.0, + "grad_norm": 1.7612171182913454, + "language_loss": 0.88923693, + "learning_rate": 3.398220643612143e-06, + "loss": 0.91041577, + "num_input_tokens_seen": 99176735, + "step": 4590, + "time_per_iteration": 2.617180824279785 + }, + { + "auxiliary_loss_clip": 0.01082714, + "auxiliary_loss_mlp": 0.01042118, + "balance_loss_clip": 1.03224659, + "balance_loss_mlp": 1.026371, + "epoch": 0.27602585299864724, + "flos": 35040985632000.0, + "grad_norm": 1.4971359803836406, + "language_loss": 0.71043134, + "learning_rate": 3.397942146620277e-06, + "loss": 0.73167968, + "num_input_tokens_seen": 99199765, + "step": 4591, + "time_per_iteration": 2.699166774749756 + }, + { + "auxiliary_loss_clip": 0.01063773, + "auxiliary_loss_mlp": 0.01044531, + "balance_loss_clip": 1.03271723, + "balance_loss_mlp": 1.02796674, + "epoch": 0.2760859762513152, + "flos": 24309405002880.0, + "grad_norm": 1.8869111293284602, + "language_loss": 0.80064964, + "learning_rate": 3.3976635966183046e-06, + "loss": 0.82173264, + "num_input_tokens_seen": 99218435, + "step": 4592, + "time_per_iteration": 2.7606019973754883 + }, + { + "auxiliary_loss_clip": 0.01008028, + "auxiliary_loss_mlp": 0.0074852, + "balance_loss_clip": 1.00364208, + "balance_loss_mlp": 1.00201309, + "epoch": 0.27614609950398317, + "flos": 71260739890560.0, + "grad_norm": 0.7038608917252136, + "language_loss": 0.61605799, + "learning_rate": 3.3973849936167886e-06, + "loss": 0.63362348, + "num_input_tokens_seen": 99276200, + "step": 4593, + "time_per_iteration": 3.1179373264312744 + }, + { + "auxiliary_loss_clip": 0.01086106, + "auxiliary_loss_mlp": 0.01048618, + "balance_loss_clip": 1.03705287, + "balance_loss_mlp": 1.03214955, + "epoch": 0.27620622275665113, + "flos": 29674854138240.0, + "grad_norm": 2.0031606141227223, + "language_loss": 0.77461886, + "learning_rate": 3.3971063376262937e-06, + "loss": 0.79596603, + "num_input_tokens_seen": 99297625, + "step": 4594, + "time_per_iteration": 2.683196783065796 + }, + { + "auxiliary_loss_clip": 0.0108381, + "auxiliary_loss_mlp": 0.01037117, + "balance_loss_clip": 1.0338136, + "balance_loss_mlp": 1.02156043, + "epoch": 0.2762663460093191, + "flos": 15378063137280.0, + "grad_norm": 1.6201495557774126, + "language_loss": 0.91153175, + "learning_rate": 3.3968276286573866e-06, + "loss": 0.93274093, + "num_input_tokens_seen": 99315790, + "step": 4595, + "time_per_iteration": 2.671485185623169 + }, + { + "auxiliary_loss_clip": 0.01085324, + "auxiliary_loss_mlp": 0.01043729, + "balance_loss_clip": 1.0337944, + "balance_loss_mlp": 1.02720642, + "epoch": 0.27632646926198706, + "flos": 20704082117760.0, + "grad_norm": 2.6673790118991962, + "language_loss": 0.69527829, + "learning_rate": 3.3965488667206353e-06, + "loss": 0.71656877, + "num_input_tokens_seen": 99334615, + "step": 4596, + "time_per_iteration": 2.658801555633545 + }, + { + "auxiliary_loss_clip": 0.01068896, + "auxiliary_loss_mlp": 0.01037653, + "balance_loss_clip": 1.03137434, + "balance_loss_mlp": 1.02105927, + "epoch": 0.276386592514655, + "flos": 32813374849920.0, + "grad_norm": 2.110843790573467, + "language_loss": 0.63911772, + "learning_rate": 3.3962700518266113e-06, + "loss": 0.66018325, + "num_input_tokens_seen": 99356685, + "step": 4597, + "time_per_iteration": 2.9156010150909424 + }, + { + "auxiliary_loss_clip": 0.01093029, + "auxiliary_loss_mlp": 0.01042671, + "balance_loss_clip": 1.03428376, + "balance_loss_mlp": 1.02759147, + "epoch": 0.276446715767323, + "flos": 18551704371840.0, + "grad_norm": 2.0575662702223765, + "language_loss": 0.86632967, + "learning_rate": 3.395991183985887e-06, + "loss": 0.88768661, + "num_input_tokens_seen": 99374810, + "step": 4598, + "time_per_iteration": 2.726959228515625 + }, + { + "auxiliary_loss_clip": 0.01093861, + "auxiliary_loss_mlp": 0.01042273, + "balance_loss_clip": 1.03271937, + "balance_loss_mlp": 1.0257268, + "epoch": 0.27650683901999096, + "flos": 22819615488000.0, + "grad_norm": 3.6613586468615757, + "language_loss": 0.79817569, + "learning_rate": 3.395712263209037e-06, + "loss": 0.81953704, + "num_input_tokens_seen": 99391290, + "step": 4599, + "time_per_iteration": 2.60148286819458 + }, + { + "auxiliary_loss_clip": 0.01069452, + "auxiliary_loss_mlp": 0.01045619, + "balance_loss_clip": 1.02971768, + "balance_loss_mlp": 1.03009796, + "epoch": 0.276566962272659, + "flos": 21361534704000.0, + "grad_norm": 1.705566145998076, + "language_loss": 0.78808969, + "learning_rate": 3.395433289506639e-06, + "loss": 0.80924034, + "num_input_tokens_seen": 99409120, + "step": 4600, + "time_per_iteration": 2.6398956775665283 + }, + { + "auxiliary_loss_clip": 0.01069702, + "auxiliary_loss_mlp": 0.01043824, + "balance_loss_clip": 1.03516865, + "balance_loss_mlp": 1.02752793, + "epoch": 0.27662708552532694, + "flos": 17710604524800.0, + "grad_norm": 1.7283305618411178, + "language_loss": 0.73281956, + "learning_rate": 3.3951542628892694e-06, + "loss": 0.75395477, + "num_input_tokens_seen": 99426180, + "step": 4601, + "time_per_iteration": 2.7588930130004883 + }, + { + "auxiliary_loss_clip": 0.01083068, + "auxiliary_loss_mlp": 0.01043652, + "balance_loss_clip": 1.03291667, + "balance_loss_mlp": 1.02734411, + "epoch": 0.2766872087779949, + "flos": 21252725429760.0, + "grad_norm": 1.5614823039441692, + "language_loss": 0.80193806, + "learning_rate": 3.3948751833675113e-06, + "loss": 0.82320523, + "num_input_tokens_seen": 99447720, + "step": 4602, + "time_per_iteration": 2.622981548309326 + }, + { + "auxiliary_loss_clip": 0.01075957, + "auxiliary_loss_mlp": 0.01052169, + "balance_loss_clip": 1.03126442, + "balance_loss_mlp": 1.03428805, + "epoch": 0.2767473320306629, + "flos": 12931900053120.0, + "grad_norm": 2.465744412906056, + "language_loss": 0.77031249, + "learning_rate": 3.3945960509519455e-06, + "loss": 0.79159373, + "num_input_tokens_seen": 99464720, + "step": 4603, + "time_per_iteration": 2.7375452518463135 + }, + { + "auxiliary_loss_clip": 0.0107076, + "auxiliary_loss_mlp": 0.01040459, + "balance_loss_clip": 1.03370643, + "balance_loss_mlp": 1.02594495, + "epoch": 0.27680745528333084, + "flos": 15012851604480.0, + "grad_norm": 1.470181832974916, + "language_loss": 0.81676269, + "learning_rate": 3.3943168656531585e-06, + "loss": 0.83787483, + "num_input_tokens_seen": 99482310, + "step": 4604, + "time_per_iteration": 2.7637996673583984 + }, + { + "auxiliary_loss_clip": 0.01043743, + "auxiliary_loss_mlp": 0.01035705, + "balance_loss_clip": 1.0290724, + "balance_loss_mlp": 1.01998091, + "epoch": 0.2768675785359988, + "flos": 22637835734400.0, + "grad_norm": 1.8833712641185418, + "language_loss": 0.69734395, + "learning_rate": 3.3940376274817363e-06, + "loss": 0.7181384, + "num_input_tokens_seen": 99501255, + "step": 4605, + "time_per_iteration": 2.8565540313720703 + }, + { + "auxiliary_loss_clip": 0.01007527, + "auxiliary_loss_mlp": 0.010055, + "balance_loss_clip": 1.0020119, + "balance_loss_mlp": 1.00362873, + "epoch": 0.27692770178866677, + "flos": 66130542881280.0, + "grad_norm": 0.7869580584676441, + "language_loss": 0.57144982, + "learning_rate": 3.3937583364482673e-06, + "loss": 0.59158009, + "num_input_tokens_seen": 99568925, + "step": 4606, + "time_per_iteration": 3.275073528289795 + }, + { + "auxiliary_loss_clip": 0.01074502, + "auxiliary_loss_mlp": 0.01043363, + "balance_loss_clip": 1.03317213, + "balance_loss_mlp": 1.02738929, + "epoch": 0.27698782504133473, + "flos": 26464979059200.0, + "grad_norm": 1.6868997540292818, + "language_loss": 0.69372404, + "learning_rate": 3.3934789925633424e-06, + "loss": 0.7149027, + "num_input_tokens_seen": 99588455, + "step": 4607, + "time_per_iteration": 2.6676573753356934 + }, + { + "auxiliary_loss_clip": 0.01079022, + "auxiliary_loss_mlp": 0.01035398, + "balance_loss_clip": 1.03225696, + "balance_loss_mlp": 1.02034163, + "epoch": 0.2770479482940027, + "flos": 25884806584320.0, + "grad_norm": 1.5002805271912873, + "language_loss": 0.70273829, + "learning_rate": 3.393199595837555e-06, + "loss": 0.7238825, + "num_input_tokens_seen": 99609355, + "step": 4608, + "time_per_iteration": 2.6121606826782227 + }, + { + "auxiliary_loss_clip": 0.01048143, + "auxiliary_loss_mlp": 0.01036093, + "balance_loss_clip": 1.03480208, + "balance_loss_mlp": 1.02063179, + "epoch": 0.27710807154667066, + "flos": 22857249962880.0, + "grad_norm": 2.1776034087994334, + "language_loss": 0.72387218, + "learning_rate": 3.392920146281499e-06, + "loss": 0.74471462, + "num_input_tokens_seen": 99628780, + "step": 4609, + "time_per_iteration": 2.802952527999878 + }, + { + "auxiliary_loss_clip": 0.01050656, + "auxiliary_loss_mlp": 0.01052127, + "balance_loss_clip": 1.02807224, + "balance_loss_mlp": 1.03481734, + "epoch": 0.27716819479933863, + "flos": 17711071401600.0, + "grad_norm": 2.388084028498204, + "language_loss": 0.83445287, + "learning_rate": 3.3926406439057714e-06, + "loss": 0.85548073, + "num_input_tokens_seen": 99644545, + "step": 4610, + "time_per_iteration": 2.7584729194641113 + }, + { + "auxiliary_loss_clip": 0.01022018, + "auxiliary_loss_mlp": 0.00749158, + "balance_loss_clip": 1.02638054, + "balance_loss_mlp": 1.00197804, + "epoch": 0.2772283180520066, + "flos": 19646046080640.0, + "grad_norm": 3.8899621401503546, + "language_loss": 0.69376433, + "learning_rate": 3.3923610887209705e-06, + "loss": 0.71147609, + "num_input_tokens_seen": 99663125, + "step": 4611, + "time_per_iteration": 2.719632387161255 + }, + { + "auxiliary_loss_clip": 0.01089574, + "auxiliary_loss_mlp": 0.01039355, + "balance_loss_clip": 1.03275275, + "balance_loss_mlp": 1.02410269, + "epoch": 0.27728844130467456, + "flos": 21032628842880.0, + "grad_norm": 2.3932748188562774, + "language_loss": 0.73592037, + "learning_rate": 3.392081480737698e-06, + "loss": 0.75720966, + "num_input_tokens_seen": 99682645, + "step": 4612, + "time_per_iteration": 2.5404462814331055 + }, + { + "auxiliary_loss_clip": 0.01085464, + "auxiliary_loss_mlp": 0.00748941, + "balance_loss_clip": 1.03229558, + "balance_loss_mlp": 1.00185513, + "epoch": 0.2773485645573425, + "flos": 18989204025600.0, + "grad_norm": 2.0696894955839586, + "language_loss": 0.66249788, + "learning_rate": 3.3918018199665563e-06, + "loss": 0.68084192, + "num_input_tokens_seen": 99700520, + "step": 4613, + "time_per_iteration": 2.7100274562835693 + }, + { + "auxiliary_loss_clip": 0.01044687, + "auxiliary_loss_mlp": 0.01046843, + "balance_loss_clip": 1.03086042, + "balance_loss_mlp": 1.03057742, + "epoch": 0.27740868781001055, + "flos": 21468440557440.0, + "grad_norm": 1.8395659095964925, + "language_loss": 0.79386574, + "learning_rate": 3.39152210641815e-06, + "loss": 0.81478107, + "num_input_tokens_seen": 99720355, + "step": 4614, + "time_per_iteration": 2.7197093963623047 + }, + { + "auxiliary_loss_clip": 0.01074524, + "auxiliary_loss_mlp": 0.01042745, + "balance_loss_clip": 1.02960992, + "balance_loss_mlp": 1.02623475, + "epoch": 0.2774688110626785, + "flos": 19827825834240.0, + "grad_norm": 2.6266238164138436, + "language_loss": 0.79394603, + "learning_rate": 3.3912423401030865e-06, + "loss": 0.81511867, + "num_input_tokens_seen": 99736090, + "step": 4615, + "time_per_iteration": 4.31575870513916 + }, + { + "auxiliary_loss_clip": 0.01062723, + "auxiliary_loss_mlp": 0.0104403, + "balance_loss_clip": 1.02914619, + "balance_loss_mlp": 1.02796066, + "epoch": 0.2775289343153465, + "flos": 18216226321920.0, + "grad_norm": 2.637124811091768, + "language_loss": 0.63223851, + "learning_rate": 3.3909625210319735e-06, + "loss": 0.65330613, + "num_input_tokens_seen": 99751805, + "step": 4616, + "time_per_iteration": 4.165896654129028 + }, + { + "auxiliary_loss_clip": 0.01081492, + "auxiliary_loss_mlp": 0.01040676, + "balance_loss_clip": 1.03137052, + "balance_loss_mlp": 1.02542377, + "epoch": 0.27758905756801444, + "flos": 16472476673280.0, + "grad_norm": 3.967840733567182, + "language_loss": 0.82961231, + "learning_rate": 3.3906826492154226e-06, + "loss": 0.85083401, + "num_input_tokens_seen": 99770610, + "step": 4617, + "time_per_iteration": 2.540180206298828 + }, + { + "auxiliary_loss_clip": 0.0109379, + "auxiliary_loss_mlp": 0.01045588, + "balance_loss_clip": 1.03152299, + "balance_loss_mlp": 1.03028202, + "epoch": 0.2776491808206824, + "flos": 18728240739840.0, + "grad_norm": 1.9766877494676793, + "language_loss": 0.76688969, + "learning_rate": 3.3904027246640458e-06, + "loss": 0.78828347, + "num_input_tokens_seen": 99787305, + "step": 4618, + "time_per_iteration": 2.533740997314453 + }, + { + "auxiliary_loss_clip": 0.01094976, + "auxiliary_loss_mlp": 0.01035849, + "balance_loss_clip": 1.03367567, + "balance_loss_mlp": 1.0215559, + "epoch": 0.27770930407335037, + "flos": 28038189911040.0, + "grad_norm": 1.7747758173879098, + "language_loss": 0.84445143, + "learning_rate": 3.390122747388459e-06, + "loss": 0.86575967, + "num_input_tokens_seen": 99808940, + "step": 4619, + "time_per_iteration": 2.652782917022705 + }, + { + "auxiliary_loss_clip": 0.01071435, + "auxiliary_loss_mlp": 0.01043248, + "balance_loss_clip": 1.0322988, + "balance_loss_mlp": 1.0294081, + "epoch": 0.27776942732601834, + "flos": 23549823072000.0, + "grad_norm": 2.1402360846298487, + "language_loss": 0.76760745, + "learning_rate": 3.3898427173992778e-06, + "loss": 0.78875428, + "num_input_tokens_seen": 99829575, + "step": 4620, + "time_per_iteration": 2.631129503250122 + }, + { + "auxiliary_loss_clip": 0.01040504, + "auxiliary_loss_mlp": 0.01039443, + "balance_loss_clip": 1.02747202, + "balance_loss_mlp": 1.02422571, + "epoch": 0.2778295505786863, + "flos": 23908713811200.0, + "grad_norm": 1.7224709515783887, + "language_loss": 0.78116781, + "learning_rate": 3.389562634707122e-06, + "loss": 0.80196732, + "num_input_tokens_seen": 99847575, + "step": 4621, + "time_per_iteration": 2.7583773136138916 + }, + { + "auxiliary_loss_clip": 0.01063151, + "auxiliary_loss_mlp": 0.01045554, + "balance_loss_clip": 1.03173304, + "balance_loss_mlp": 1.0298779, + "epoch": 0.27788967383135427, + "flos": 25554571920000.0, + "grad_norm": 3.4041042898182674, + "language_loss": 0.87739748, + "learning_rate": 3.389282499322611e-06, + "loss": 0.89848447, + "num_input_tokens_seen": 99864995, + "step": 4622, + "time_per_iteration": 2.9137415885925293 + }, + { + "auxiliary_loss_clip": 0.01050421, + "auxiliary_loss_mlp": 0.01045688, + "balance_loss_clip": 1.03015113, + "balance_loss_mlp": 1.02971387, + "epoch": 0.27794979708402223, + "flos": 16252631481600.0, + "grad_norm": 2.259056574956788, + "language_loss": 0.81396449, + "learning_rate": 3.389002311256369e-06, + "loss": 0.83492553, + "num_input_tokens_seen": 99881540, + "step": 4623, + "time_per_iteration": 4.511802673339844 + }, + { + "auxiliary_loss_clip": 0.01068506, + "auxiliary_loss_mlp": 0.01042764, + "balance_loss_clip": 1.03553927, + "balance_loss_mlp": 1.02715993, + "epoch": 0.2780099203366902, + "flos": 20667632791680.0, + "grad_norm": 3.6421659623151523, + "language_loss": 0.81581646, + "learning_rate": 3.3887220705190204e-06, + "loss": 0.8369292, + "num_input_tokens_seen": 99899595, + "step": 4624, + "time_per_iteration": 2.7785816192626953 + }, + { + "auxiliary_loss_clip": 0.01062045, + "auxiliary_loss_mlp": 0.00748864, + "balance_loss_clip": 1.03086436, + "balance_loss_mlp": 1.00176227, + "epoch": 0.27807004358935816, + "flos": 17739583822080.0, + "grad_norm": 2.4338413695495844, + "language_loss": 0.76852882, + "learning_rate": 3.388441777121191e-06, + "loss": 0.7866379, + "num_input_tokens_seen": 99913020, + "step": 4625, + "time_per_iteration": 2.7539613246917725 + }, + { + "auxiliary_loss_clip": 0.01051837, + "auxiliary_loss_mlp": 0.01040484, + "balance_loss_clip": 1.02626562, + "balance_loss_mlp": 1.02459919, + "epoch": 0.2781301668420261, + "flos": 16727119165440.0, + "grad_norm": 1.962226387203868, + "language_loss": 0.69664335, + "learning_rate": 3.388161431073511e-06, + "loss": 0.71756661, + "num_input_tokens_seen": 99931405, + "step": 4626, + "time_per_iteration": 2.623762369155884 + }, + { + "auxiliary_loss_clip": 0.01050827, + "auxiliary_loss_mlp": 0.01040167, + "balance_loss_clip": 1.03019059, + "balance_loss_mlp": 1.02339435, + "epoch": 0.27819029009469415, + "flos": 13844749317120.0, + "grad_norm": 2.352054224293858, + "language_loss": 0.92662859, + "learning_rate": 3.38788103238661e-06, + "loss": 0.94753844, + "num_input_tokens_seen": 99948100, + "step": 4627, + "time_per_iteration": 2.8434135913848877 + }, + { + "auxiliary_loss_clip": 0.01096165, + "auxiliary_loss_mlp": 0.01035907, + "balance_loss_clip": 1.03460264, + "balance_loss_mlp": 1.02162611, + "epoch": 0.2782504133473621, + "flos": 27089286370560.0, + "grad_norm": 1.7473487394369782, + "language_loss": 0.85366172, + "learning_rate": 3.387600581071121e-06, + "loss": 0.87498248, + "num_input_tokens_seen": 99966470, + "step": 4628, + "time_per_iteration": 2.653341054916382 + }, + { + "auxiliary_loss_clip": 0.01059453, + "auxiliary_loss_mlp": 0.01041958, + "balance_loss_clip": 1.0299387, + "balance_loss_mlp": 1.02702069, + "epoch": 0.2783105366000301, + "flos": 21068826773760.0, + "grad_norm": 1.65909227117278, + "language_loss": 0.79482955, + "learning_rate": 3.387320077137679e-06, + "loss": 0.81584358, + "num_input_tokens_seen": 99985930, + "step": 4629, + "time_per_iteration": 2.761929988861084 + }, + { + "auxiliary_loss_clip": 0.01046269, + "auxiliary_loss_mlp": 0.01035023, + "balance_loss_clip": 1.02892423, + "balance_loss_mlp": 1.02125442, + "epoch": 0.27837065985269804, + "flos": 26501823434880.0, + "grad_norm": 1.6294346774863064, + "language_loss": 0.84425139, + "learning_rate": 3.3870395205969208e-06, + "loss": 0.86506426, + "num_input_tokens_seen": 100006235, + "step": 4630, + "time_per_iteration": 2.7556393146514893 + }, + { + "auxiliary_loss_clip": 0.01070991, + "auxiliary_loss_mlp": 0.01034585, + "balance_loss_clip": 1.03075278, + "balance_loss_mlp": 1.01880193, + "epoch": 0.278430783105366, + "flos": 20223201813120.0, + "grad_norm": 1.9836346176290336, + "language_loss": 0.81073529, + "learning_rate": 3.386758911459485e-06, + "loss": 0.83179104, + "num_input_tokens_seen": 100023655, + "step": 4631, + "time_per_iteration": 2.622403144836426 + }, + { + "auxiliary_loss_clip": 0.01097709, + "auxiliary_loss_mlp": 0.01045367, + "balance_loss_clip": 1.0359025, + "balance_loss_mlp": 1.0307759, + "epoch": 0.278490906358034, + "flos": 25592888753280.0, + "grad_norm": 2.608276517154532, + "language_loss": 0.7129249, + "learning_rate": 3.3864782497360126e-06, + "loss": 0.73435569, + "num_input_tokens_seen": 100043280, + "step": 4632, + "time_per_iteration": 2.564046859741211 + }, + { + "auxiliary_loss_clip": 0.0107843, + "auxiliary_loss_mlp": 0.01037096, + "balance_loss_clip": 1.03274477, + "balance_loss_mlp": 1.02303576, + "epoch": 0.27855102961070194, + "flos": 16171544528640.0, + "grad_norm": 2.0164060663798633, + "language_loss": 0.82607651, + "learning_rate": 3.386197535437145e-06, + "loss": 0.84723175, + "num_input_tokens_seen": 100057690, + "step": 4633, + "time_per_iteration": 2.5530359745025635 + }, + { + "auxiliary_loss_clip": 0.01071293, + "auxiliary_loss_mlp": 0.01036864, + "balance_loss_clip": 1.03056681, + "balance_loss_mlp": 1.02068746, + "epoch": 0.2786111528633699, + "flos": 22927598749440.0, + "grad_norm": 1.621880634796439, + "language_loss": 0.8768183, + "learning_rate": 3.385916768573529e-06, + "loss": 0.89789987, + "num_input_tokens_seen": 100075875, + "step": 4634, + "time_per_iteration": 2.676356792449951 + }, + { + "auxiliary_loss_clip": 0.01066616, + "auxiliary_loss_mlp": 0.01036392, + "balance_loss_clip": 1.0321058, + "balance_loss_mlp": 1.02076387, + "epoch": 0.27867127611603787, + "flos": 23404205335680.0, + "grad_norm": 1.527367559578374, + "language_loss": 0.76609409, + "learning_rate": 3.38563594915581e-06, + "loss": 0.78712416, + "num_input_tokens_seen": 100092930, + "step": 4635, + "time_per_iteration": 2.6983070373535156 + }, + { + "auxiliary_loss_clip": 0.01097422, + "auxiliary_loss_mlp": 0.01044829, + "balance_loss_clip": 1.03551269, + "balance_loss_mlp": 1.02903366, + "epoch": 0.27873139936870583, + "flos": 19829010983040.0, + "grad_norm": 1.660995503337756, + "language_loss": 0.65231693, + "learning_rate": 3.385355077194637e-06, + "loss": 0.67373943, + "num_input_tokens_seen": 100110790, + "step": 4636, + "time_per_iteration": 2.583508253097534 + }, + { + "auxiliary_loss_clip": 0.0108042, + "auxiliary_loss_mlp": 0.01043573, + "balance_loss_clip": 1.03221977, + "balance_loss_mlp": 1.02733707, + "epoch": 0.2787915226213738, + "flos": 17707659609600.0, + "grad_norm": 2.359498026610336, + "language_loss": 0.83630562, + "learning_rate": 3.3850741527006604e-06, + "loss": 0.8575455, + "num_input_tokens_seen": 100126970, + "step": 4637, + "time_per_iteration": 2.605501890182495 + }, + { + "auxiliary_loss_clip": 0.01059354, + "auxiliary_loss_mlp": 0.01039792, + "balance_loss_clip": 1.02779055, + "balance_loss_mlp": 1.02478349, + "epoch": 0.27885164587404176, + "flos": 22090557139200.0, + "grad_norm": 1.4779115311243567, + "language_loss": 0.76070946, + "learning_rate": 3.384793175684533e-06, + "loss": 0.78170091, + "num_input_tokens_seen": 100146720, + "step": 4638, + "time_per_iteration": 2.658421277999878 + }, + { + "auxiliary_loss_clip": 0.01078327, + "auxiliary_loss_mlp": 0.01044551, + "balance_loss_clip": 1.03075624, + "balance_loss_mlp": 1.02824306, + "epoch": 0.27891176912670973, + "flos": 19207684500480.0, + "grad_norm": 1.4348477224830527, + "language_loss": 0.71587944, + "learning_rate": 3.38451214615691e-06, + "loss": 0.73710823, + "num_input_tokens_seen": 100165920, + "step": 4639, + "time_per_iteration": 2.5526421070098877 + }, + { + "auxiliary_loss_clip": 0.01079328, + "auxiliary_loss_mlp": 0.01039172, + "balance_loss_clip": 1.03064883, + "balance_loss_mlp": 1.02289391, + "epoch": 0.27897189237937775, + "flos": 27600007898880.0, + "grad_norm": 2.0415223765989383, + "language_loss": 0.65363455, + "learning_rate": 3.384231064128447e-06, + "loss": 0.67481947, + "num_input_tokens_seen": 100185525, + "step": 4640, + "time_per_iteration": 2.620110511779785 + }, + { + "auxiliary_loss_clip": 0.01084111, + "auxiliary_loss_mlp": 0.01036206, + "balance_loss_clip": 1.03381634, + "balance_loss_mlp": 1.02189481, + "epoch": 0.2790320156320457, + "flos": 21178210665600.0, + "grad_norm": 2.517969728855396, + "language_loss": 0.7240833, + "learning_rate": 3.383949929609804e-06, + "loss": 0.74528646, + "num_input_tokens_seen": 100204850, + "step": 4641, + "time_per_iteration": 2.5984818935394287 + }, + { + "auxiliary_loss_clip": 0.0105858, + "auxiliary_loss_mlp": 0.01041363, + "balance_loss_clip": 1.03391862, + "balance_loss_mlp": 1.02426815, + "epoch": 0.2790921388847137, + "flos": 22783920347520.0, + "grad_norm": 1.7509464848049905, + "language_loss": 0.75194836, + "learning_rate": 3.383668742611641e-06, + "loss": 0.77294779, + "num_input_tokens_seen": 100224520, + "step": 4642, + "time_per_iteration": 2.7548203468322754 + }, + { + "auxiliary_loss_clip": 0.01052015, + "auxiliary_loss_mlp": 0.01043249, + "balance_loss_clip": 1.02824783, + "balance_loss_mlp": 1.0258925, + "epoch": 0.27915226213738165, + "flos": 23400649889280.0, + "grad_norm": 1.7155076862216676, + "language_loss": 0.86153328, + "learning_rate": 3.3833875031446205e-06, + "loss": 0.88248587, + "num_input_tokens_seen": 100243935, + "step": 4643, + "time_per_iteration": 2.6952996253967285 + }, + { + "auxiliary_loss_clip": 0.01057533, + "auxiliary_loss_mlp": 0.01040007, + "balance_loss_clip": 1.03252017, + "balance_loss_mlp": 1.02398491, + "epoch": 0.2792123853900496, + "flos": 22747794243840.0, + "grad_norm": 2.010211820838104, + "language_loss": 0.82746422, + "learning_rate": 3.383106211219407e-06, + "loss": 0.84843957, + "num_input_tokens_seen": 100262290, + "step": 4644, + "time_per_iteration": 2.7722132205963135 + }, + { + "auxiliary_loss_clip": 0.01079687, + "auxiliary_loss_mlp": 0.0103652, + "balance_loss_clip": 1.0311811, + "balance_loss_mlp": 1.02089167, + "epoch": 0.2792725086427176, + "flos": 15049372757760.0, + "grad_norm": 1.7712781533107054, + "language_loss": 0.78793025, + "learning_rate": 3.3828248668466673e-06, + "loss": 0.80909228, + "num_input_tokens_seen": 100280015, + "step": 4645, + "time_per_iteration": 2.5848770141601562 + }, + { + "auxiliary_loss_clip": 0.01002316, + "auxiliary_loss_mlp": 0.01005554, + "balance_loss_clip": 1.00602865, + "balance_loss_mlp": 1.00367093, + "epoch": 0.27933263189538554, + "flos": 62544861757440.0, + "grad_norm": 0.7892871361028274, + "language_loss": 0.62195331, + "learning_rate": 3.3825434700370705e-06, + "loss": 0.64203203, + "num_input_tokens_seen": 100338935, + "step": 4646, + "time_per_iteration": 3.2175910472869873 + }, + { + "auxiliary_loss_clip": 0.01067695, + "auxiliary_loss_mlp": 0.01031522, + "balance_loss_clip": 1.03197193, + "balance_loss_mlp": 1.01840293, + "epoch": 0.2793927551480535, + "flos": 25118365155840.0, + "grad_norm": 1.6321526860193372, + "language_loss": 0.89303935, + "learning_rate": 3.3822620208012865e-06, + "loss": 0.91403151, + "num_input_tokens_seen": 100359905, + "step": 4647, + "time_per_iteration": 2.7261807918548584 + }, + { + "auxiliary_loss_clip": 0.01083487, + "auxiliary_loss_mlp": 0.01042381, + "balance_loss_clip": 1.03110576, + "balance_loss_mlp": 1.02624035, + "epoch": 0.27945287840072147, + "flos": 21324582587520.0, + "grad_norm": 1.6344294235361905, + "language_loss": 0.87283444, + "learning_rate": 3.381980519149988e-06, + "loss": 0.8940931, + "num_input_tokens_seen": 100376955, + "step": 4648, + "time_per_iteration": 2.777099370956421 + }, + { + "auxiliary_loss_clip": 0.01082879, + "auxiliary_loss_mlp": 0.01037076, + "balance_loss_clip": 1.03218889, + "balance_loss_mlp": 1.02225888, + "epoch": 0.27951300165338944, + "flos": 27450547407360.0, + "grad_norm": 2.1518515444849218, + "language_loss": 0.72708941, + "learning_rate": 3.38169896509385e-06, + "loss": 0.74828899, + "num_input_tokens_seen": 100397545, + "step": 4649, + "time_per_iteration": 2.7298154830932617 + }, + { + "auxiliary_loss_clip": 0.01060513, + "auxiliary_loss_mlp": 0.01041143, + "balance_loss_clip": 1.03030849, + "balance_loss_mlp": 1.02537167, + "epoch": 0.2795731249060574, + "flos": 15159008044800.0, + "grad_norm": 2.146025444932135, + "language_loss": 0.80208755, + "learning_rate": 3.381417358643549e-06, + "loss": 0.82310408, + "num_input_tokens_seen": 100415080, + "step": 4650, + "time_per_iteration": 2.7204463481903076 + }, + { + "auxiliary_loss_clip": 0.01012879, + "auxiliary_loss_mlp": 0.00748511, + "balance_loss_clip": 1.01899028, + "balance_loss_mlp": 1.0021286, + "epoch": 0.27963324815872537, + "flos": 60120103178880.0, + "grad_norm": 0.8171644131685647, + "language_loss": 0.58784062, + "learning_rate": 3.3811356998097624e-06, + "loss": 0.60545456, + "num_input_tokens_seen": 100471105, + "step": 4651, + "time_per_iteration": 3.3086812496185303 + }, + { + "auxiliary_loss_clip": 0.01084351, + "auxiliary_loss_mlp": 0.01043407, + "balance_loss_clip": 1.03112125, + "balance_loss_mlp": 1.02653885, + "epoch": 0.27969337141139333, + "flos": 21765960910080.0, + "grad_norm": 2.2425613566418834, + "language_loss": 0.73668236, + "learning_rate": 3.3808539886031726e-06, + "loss": 0.75795996, + "num_input_tokens_seen": 100492520, + "step": 4652, + "time_per_iteration": 2.6500725746154785 + }, + { + "auxiliary_loss_clip": 0.01097121, + "auxiliary_loss_mlp": 0.01042028, + "balance_loss_clip": 1.0355773, + "balance_loss_mlp": 1.02688861, + "epoch": 0.27975349466406135, + "flos": 39851398834560.0, + "grad_norm": 2.2335192374933466, + "language_loss": 0.79480088, + "learning_rate": 3.380572225034461e-06, + "loss": 0.81619239, + "num_input_tokens_seen": 100512870, + "step": 4653, + "time_per_iteration": 2.6670725345611572 + }, + { + "auxiliary_loss_clip": 0.01072062, + "auxiliary_loss_mlp": 0.01046373, + "balance_loss_clip": 1.03286421, + "balance_loss_mlp": 1.03134632, + "epoch": 0.2798136179167293, + "flos": 21579799697280.0, + "grad_norm": 2.6625741186910066, + "language_loss": 0.78704023, + "learning_rate": 3.380290409114312e-06, + "loss": 0.80822456, + "num_input_tokens_seen": 100531655, + "step": 4654, + "time_per_iteration": 2.612750768661499 + }, + { + "auxiliary_loss_clip": 0.01046064, + "auxiliary_loss_mlp": 0.01041639, + "balance_loss_clip": 1.02793062, + "balance_loss_mlp": 1.02598667, + "epoch": 0.2798737411693973, + "flos": 21537676022400.0, + "grad_norm": 2.3368752573887708, + "language_loss": 0.80666983, + "learning_rate": 3.3800085408534127e-06, + "loss": 0.82754689, + "num_input_tokens_seen": 100548005, + "step": 4655, + "time_per_iteration": 2.736286163330078 + }, + { + "auxiliary_loss_clip": 0.01063276, + "auxiliary_loss_mlp": 0.00748903, + "balance_loss_clip": 1.03114641, + "balance_loss_mlp": 1.00175452, + "epoch": 0.27993386442206525, + "flos": 26981051713920.0, + "grad_norm": 1.5472540692248828, + "language_loss": 0.81325847, + "learning_rate": 3.3797266202624506e-06, + "loss": 0.83138025, + "num_input_tokens_seen": 100567980, + "step": 4656, + "time_per_iteration": 2.718240261077881 + }, + { + "auxiliary_loss_clip": 0.01072617, + "auxiliary_loss_mlp": 0.01040746, + "balance_loss_clip": 1.03326464, + "balance_loss_mlp": 1.02516532, + "epoch": 0.2799939876747332, + "flos": 24349876652160.0, + "grad_norm": 1.5804341299150761, + "language_loss": 0.8300941, + "learning_rate": 3.3794446473521176e-06, + "loss": 0.85122776, + "num_input_tokens_seen": 100588630, + "step": 4657, + "time_per_iteration": 2.657365560531616 + }, + { + "auxiliary_loss_clip": 0.01060868, + "auxiliary_loss_mlp": 0.01048008, + "balance_loss_clip": 1.03535545, + "balance_loss_mlp": 1.03161657, + "epoch": 0.2800541109274012, + "flos": 33656988648960.0, + "grad_norm": 1.737226680081893, + "language_loss": 0.63631898, + "learning_rate": 3.379162622133105e-06, + "loss": 0.65740776, + "num_input_tokens_seen": 100608775, + "step": 4658, + "time_per_iteration": 2.7603580951690674 + }, + { + "auxiliary_loss_clip": 0.01079621, + "auxiliary_loss_mlp": 0.01041784, + "balance_loss_clip": 1.02895188, + "balance_loss_mlp": 1.02646601, + "epoch": 0.28011423418006914, + "flos": 21614417429760.0, + "grad_norm": 2.4662960323621506, + "language_loss": 0.78523731, + "learning_rate": 3.3788805446161073e-06, + "loss": 0.80645138, + "num_input_tokens_seen": 100627975, + "step": 4659, + "time_per_iteration": 2.669605255126953 + }, + { + "auxiliary_loss_clip": 0.01055016, + "auxiliary_loss_mlp": 0.01048956, + "balance_loss_clip": 1.03073812, + "balance_loss_mlp": 1.03393602, + "epoch": 0.2801743574327371, + "flos": 23112431159040.0, + "grad_norm": 2.4311166437753386, + "language_loss": 0.79478514, + "learning_rate": 3.3785984148118215e-06, + "loss": 0.81582487, + "num_input_tokens_seen": 100645430, + "step": 4660, + "time_per_iteration": 2.8281381130218506 + }, + { + "auxiliary_loss_clip": 0.01055872, + "auxiliary_loss_mlp": 0.01036487, + "balance_loss_clip": 1.03049994, + "balance_loss_mlp": 1.02321279, + "epoch": 0.2802344806854051, + "flos": 12641418766080.0, + "grad_norm": 1.9486179295545594, + "language_loss": 0.80635989, + "learning_rate": 3.3783162327309453e-06, + "loss": 0.8272835, + "num_input_tokens_seen": 100663775, + "step": 4661, + "time_per_iteration": 4.2906858921051025 + }, + { + "auxiliary_loss_clip": 0.01072705, + "auxiliary_loss_mlp": 0.01052239, + "balance_loss_clip": 1.03357983, + "balance_loss_mlp": 1.03697979, + "epoch": 0.28029460393807304, + "flos": 37267878142080.0, + "grad_norm": 1.488119287544517, + "language_loss": 0.78760082, + "learning_rate": 3.3780339983841794e-06, + "loss": 0.80885029, + "num_input_tokens_seen": 100686085, + "step": 4662, + "time_per_iteration": 2.9397661685943604 + }, + { + "auxiliary_loss_clip": 0.01074972, + "auxiliary_loss_mlp": 0.010494, + "balance_loss_clip": 1.03220022, + "balance_loss_mlp": 1.03294957, + "epoch": 0.280354727190741, + "flos": 20741106061440.0, + "grad_norm": 2.2238075131086736, + "language_loss": 0.69652045, + "learning_rate": 3.377751711782227e-06, + "loss": 0.71776414, + "num_input_tokens_seen": 100705135, + "step": 4663, + "time_per_iteration": 4.596871614456177 + }, + { + "auxiliary_loss_clip": 0.0107157, + "auxiliary_loss_mlp": 0.01049808, + "balance_loss_clip": 1.03365576, + "balance_loss_mlp": 1.0330596, + "epoch": 0.28041485044340897, + "flos": 21471026336640.0, + "grad_norm": 1.6211195707981088, + "language_loss": 0.77681577, + "learning_rate": 3.377469372935791e-06, + "loss": 0.7980296, + "num_input_tokens_seen": 100724960, + "step": 4664, + "time_per_iteration": 2.736307382583618 + }, + { + "auxiliary_loss_clip": 0.01057475, + "auxiliary_loss_mlp": 0.0104318, + "balance_loss_clip": 1.03081393, + "balance_loss_mlp": 1.02797496, + "epoch": 0.28047497369607693, + "flos": 14794263388800.0, + "grad_norm": 1.7991601463273736, + "language_loss": 0.79823935, + "learning_rate": 3.377186981855578e-06, + "loss": 0.81924587, + "num_input_tokens_seen": 100741995, + "step": 4665, + "time_per_iteration": 2.717043876647949 + }, + { + "auxiliary_loss_clip": 0.01077657, + "auxiliary_loss_mlp": 0.0103766, + "balance_loss_clip": 1.02996385, + "balance_loss_mlp": 1.0227704, + "epoch": 0.2805350969487449, + "flos": 23070738447360.0, + "grad_norm": 1.7362946341459875, + "language_loss": 0.80642259, + "learning_rate": 3.3769045385522968e-06, + "loss": 0.8275758, + "num_input_tokens_seen": 100758985, + "step": 4666, + "time_per_iteration": 2.6120307445526123 + }, + { + "auxiliary_loss_clip": 0.01058863, + "auxiliary_loss_mlp": 0.0105652, + "balance_loss_clip": 1.03306448, + "balance_loss_mlp": 1.03935385, + "epoch": 0.2805952202014129, + "flos": 20479855466880.0, + "grad_norm": 1.8218211274158638, + "language_loss": 0.84660208, + "learning_rate": 3.376622043036658e-06, + "loss": 0.86775595, + "num_input_tokens_seen": 100777820, + "step": 4667, + "time_per_iteration": 2.746265172958374 + }, + { + "auxiliary_loss_clip": 0.01067695, + "auxiliary_loss_mlp": 0.00748716, + "balance_loss_clip": 1.03346372, + "balance_loss_mlp": 1.00160384, + "epoch": 0.2806553434540809, + "flos": 27417330305280.0, + "grad_norm": 1.5947964728947919, + "language_loss": 0.79098904, + "learning_rate": 3.376339495319373e-06, + "loss": 0.80915314, + "num_input_tokens_seen": 100798205, + "step": 4668, + "time_per_iteration": 2.8295488357543945 + }, + { + "auxiliary_loss_clip": 0.01038825, + "auxiliary_loss_mlp": 0.01039341, + "balance_loss_clip": 1.02860069, + "balance_loss_mlp": 1.023844, + "epoch": 0.28071546670674885, + "flos": 26505019745280.0, + "grad_norm": 1.4637352194438937, + "language_loss": 0.76166618, + "learning_rate": 3.3760568954111563e-06, + "loss": 0.78244781, + "num_input_tokens_seen": 100819800, + "step": 4669, + "time_per_iteration": 2.811812400817871 + }, + { + "auxiliary_loss_clip": 0.01083031, + "auxiliary_loss_mlp": 0.01040133, + "balance_loss_clip": 1.03338826, + "balance_loss_mlp": 1.02541673, + "epoch": 0.2807755899594168, + "flos": 20558679863040.0, + "grad_norm": 3.453133269760356, + "language_loss": 0.78851563, + "learning_rate": 3.375774243322725e-06, + "loss": 0.80974728, + "num_input_tokens_seen": 100837880, + "step": 4670, + "time_per_iteration": 2.6199121475219727 + }, + { + "auxiliary_loss_clip": 0.0105326, + "auxiliary_loss_mlp": 0.01041972, + "balance_loss_clip": 1.02989447, + "balance_loss_mlp": 1.02561688, + "epoch": 0.2808357132120848, + "flos": 24313319585280.0, + "grad_norm": 1.8155115346948063, + "language_loss": 0.79015827, + "learning_rate": 3.3754915390647955e-06, + "loss": 0.81111062, + "num_input_tokens_seen": 100856350, + "step": 4671, + "time_per_iteration": 6.135117053985596 + }, + { + "auxiliary_loss_clip": 0.01072428, + "auxiliary_loss_mlp": 0.01041026, + "balance_loss_clip": 1.03166723, + "balance_loss_mlp": 1.02545702, + "epoch": 0.28089583646475275, + "flos": 26432408401920.0, + "grad_norm": 1.7268261475110087, + "language_loss": 0.75356632, + "learning_rate": 3.37520878264809e-06, + "loss": 0.77470088, + "num_input_tokens_seen": 100876135, + "step": 4672, + "time_per_iteration": 2.7212929725646973 + }, + { + "auxiliary_loss_clip": 0.01067568, + "auxiliary_loss_mlp": 0.01045246, + "balance_loss_clip": 1.02839756, + "balance_loss_mlp": 1.02767467, + "epoch": 0.2809559597174207, + "flos": 23111820627840.0, + "grad_norm": 3.351793211070368, + "language_loss": 0.75719643, + "learning_rate": 3.3749259740833286e-06, + "loss": 0.7783246, + "num_input_tokens_seen": 100894790, + "step": 4673, + "time_per_iteration": 2.7173197269439697 + }, + { + "auxiliary_loss_clip": 0.01081548, + "auxiliary_loss_mlp": 0.01038671, + "balance_loss_clip": 1.03126979, + "balance_loss_mlp": 1.02345443, + "epoch": 0.2810160829700887, + "flos": 20923496346240.0, + "grad_norm": 1.7824756501995735, + "language_loss": 0.72195995, + "learning_rate": 3.374643113381237e-06, + "loss": 0.74316216, + "num_input_tokens_seen": 100915100, + "step": 4674, + "time_per_iteration": 2.600877285003662 + }, + { + "auxiliary_loss_clip": 0.01082766, + "auxiliary_loss_mlp": 0.01040226, + "balance_loss_clip": 1.03246999, + "balance_loss_mlp": 1.02385879, + "epoch": 0.28107620622275664, + "flos": 14355901808640.0, + "grad_norm": 1.7255257690330934, + "language_loss": 0.77555704, + "learning_rate": 3.374360200552541e-06, + "loss": 0.79678702, + "num_input_tokens_seen": 100932795, + "step": 4675, + "time_per_iteration": 2.5540547370910645 + }, + { + "auxiliary_loss_clip": 0.01095028, + "auxiliary_loss_mlp": 0.01043921, + "balance_loss_clip": 1.03227985, + "balance_loss_mlp": 1.02700531, + "epoch": 0.2811363294754246, + "flos": 20919078973440.0, + "grad_norm": 1.9608322227825297, + "language_loss": 0.70481789, + "learning_rate": 3.374077235607968e-06, + "loss": 0.72620738, + "num_input_tokens_seen": 100950505, + "step": 4676, + "time_per_iteration": 2.508380651473999 + }, + { + "auxiliary_loss_clip": 0.01088422, + "auxiliary_loss_mlp": 0.01038985, + "balance_loss_clip": 1.03227031, + "balance_loss_mlp": 1.02453125, + "epoch": 0.28119645272809257, + "flos": 20594841880320.0, + "grad_norm": 1.5375619888058585, + "language_loss": 0.70287597, + "learning_rate": 3.3737942185582487e-06, + "loss": 0.72415006, + "num_input_tokens_seen": 100968790, + "step": 4677, + "time_per_iteration": 2.5430641174316406 + }, + { + "auxiliary_loss_clip": 0.01071873, + "auxiliary_loss_mlp": 0.01042041, + "balance_loss_clip": 1.0313406, + "balance_loss_mlp": 1.02412355, + "epoch": 0.28125657598076054, + "flos": 25337420248320.0, + "grad_norm": 1.6852661687055042, + "language_loss": 0.63618362, + "learning_rate": 3.3735111494141153e-06, + "loss": 0.65732276, + "num_input_tokens_seen": 100990205, + "step": 4678, + "time_per_iteration": 2.649630069732666 + }, + { + "auxiliary_loss_clip": 0.01085483, + "auxiliary_loss_mlp": 0.01037215, + "balance_loss_clip": 1.03365982, + "balance_loss_mlp": 1.02176583, + "epoch": 0.2813166992334285, + "flos": 24827093769600.0, + "grad_norm": 1.7416659423182357, + "language_loss": 0.70518756, + "learning_rate": 3.3732280281863013e-06, + "loss": 0.72641456, + "num_input_tokens_seen": 101009815, + "step": 4679, + "time_per_iteration": 2.6843934059143066 + }, + { + "auxiliary_loss_clip": 0.01081694, + "auxiliary_loss_mlp": 0.01042067, + "balance_loss_clip": 1.03124332, + "balance_loss_mlp": 1.02597356, + "epoch": 0.2813768224860965, + "flos": 21760753438080.0, + "grad_norm": 2.3873429670809165, + "language_loss": 0.74710709, + "learning_rate": 3.3729448548855422e-06, + "loss": 0.7683447, + "num_input_tokens_seen": 101026780, + "step": 4680, + "time_per_iteration": 2.6060922145843506 + }, + { + "auxiliary_loss_clip": 0.01095062, + "auxiliary_loss_mlp": 0.01034579, + "balance_loss_clip": 1.03371429, + "balance_loss_mlp": 1.02038121, + "epoch": 0.2814369457387645, + "flos": 24316803204480.0, + "grad_norm": 1.6192419712991466, + "language_loss": 0.76931983, + "learning_rate": 3.3726616295225774e-06, + "loss": 0.79061627, + "num_input_tokens_seen": 101046215, + "step": 4681, + "time_per_iteration": 2.672208786010742 + }, + { + "auxiliary_loss_clip": 0.01084102, + "auxiliary_loss_mlp": 0.01036437, + "balance_loss_clip": 1.03292131, + "balance_loss_mlp": 1.02094615, + "epoch": 0.28149706899143245, + "flos": 18515326872960.0, + "grad_norm": 1.9721697157627376, + "language_loss": 0.740502, + "learning_rate": 3.372378352108146e-06, + "loss": 0.76170743, + "num_input_tokens_seen": 101063365, + "step": 4682, + "time_per_iteration": 2.617894172668457 + }, + { + "auxiliary_loss_clip": 0.01091025, + "auxiliary_loss_mlp": 0.01039106, + "balance_loss_clip": 1.03268921, + "balance_loss_mlp": 1.02450299, + "epoch": 0.2815571922441004, + "flos": 24863255786880.0, + "grad_norm": 1.4675796807275903, + "language_loss": 0.80747819, + "learning_rate": 3.3720950226529894e-06, + "loss": 0.82877946, + "num_input_tokens_seen": 101083835, + "step": 4683, + "time_per_iteration": 2.5656611919403076 + }, + { + "auxiliary_loss_clip": 0.01045498, + "auxiliary_loss_mlp": 0.0104453, + "balance_loss_clip": 1.03394532, + "balance_loss_mlp": 1.02779305, + "epoch": 0.2816173154967684, + "flos": 19901622326400.0, + "grad_norm": 1.850373886534596, + "language_loss": 0.76182675, + "learning_rate": 3.371811641167852e-06, + "loss": 0.78272712, + "num_input_tokens_seen": 101101740, + "step": 4684, + "time_per_iteration": 2.750044822692871 + }, + { + "auxiliary_loss_clip": 0.01037564, + "auxiliary_loss_mlp": 0.01034008, + "balance_loss_clip": 1.02686191, + "balance_loss_mlp": 1.01968515, + "epoch": 0.28167743874943635, + "flos": 17491333950720.0, + "grad_norm": 1.9663327057818079, + "language_loss": 0.76691341, + "learning_rate": 3.3715282076634807e-06, + "loss": 0.78762913, + "num_input_tokens_seen": 101120480, + "step": 4685, + "time_per_iteration": 2.7722389698028564 + }, + { + "auxiliary_loss_clip": 0.01066414, + "auxiliary_loss_mlp": 0.01039945, + "balance_loss_clip": 1.03093851, + "balance_loss_mlp": 1.02525866, + "epoch": 0.2817375620021043, + "flos": 25302120157440.0, + "grad_norm": 1.4243098675892043, + "language_loss": 0.75554222, + "learning_rate": 3.3712447221506218e-06, + "loss": 0.77660578, + "num_input_tokens_seen": 101142910, + "step": 4686, + "time_per_iteration": 2.7217657566070557 + }, + { + "auxiliary_loss_clip": 0.0106023, + "auxiliary_loss_mlp": 0.01043021, + "balance_loss_clip": 1.02829063, + "balance_loss_mlp": 1.02642739, + "epoch": 0.2817976852547723, + "flos": 18693227957760.0, + "grad_norm": 2.596100711621553, + "language_loss": 0.62921727, + "learning_rate": 3.370961184640025e-06, + "loss": 0.65024984, + "num_input_tokens_seen": 101160030, + "step": 4687, + "time_per_iteration": 2.6590216159820557 + }, + { + "auxiliary_loss_clip": 0.01073357, + "auxiliary_loss_mlp": 0.01044968, + "balance_loss_clip": 1.03296471, + "balance_loss_mlp": 1.03007865, + "epoch": 0.28185780850744024, + "flos": 22742263549440.0, + "grad_norm": 2.4919171379516243, + "language_loss": 0.76283205, + "learning_rate": 3.3706775951424433e-06, + "loss": 0.78401524, + "num_input_tokens_seen": 101177675, + "step": 4688, + "time_per_iteration": 2.6269659996032715 + }, + { + "auxiliary_loss_clip": 0.01058278, + "auxiliary_loss_mlp": 0.01032225, + "balance_loss_clip": 1.03108788, + "balance_loss_mlp": 1.01787257, + "epoch": 0.2819179317601082, + "flos": 14933919467520.0, + "grad_norm": 9.43340726761338, + "language_loss": 0.78596383, + "learning_rate": 3.37039395366863e-06, + "loss": 0.80686885, + "num_input_tokens_seen": 101192225, + "step": 4689, + "time_per_iteration": 2.582127809524536 + }, + { + "auxiliary_loss_clip": 0.0104808, + "auxiliary_loss_mlp": 0.01040214, + "balance_loss_clip": 1.02786636, + "balance_loss_mlp": 1.02496755, + "epoch": 0.2819780550127762, + "flos": 23145325038720.0, + "grad_norm": 1.6298608460075845, + "language_loss": 0.77993572, + "learning_rate": 3.37011026022934e-06, + "loss": 0.80081868, + "num_input_tokens_seen": 101210870, + "step": 4690, + "time_per_iteration": 2.6585891246795654 + }, + { + "auxiliary_loss_clip": 0.01090893, + "auxiliary_loss_mlp": 0.00748704, + "balance_loss_clip": 1.03077173, + "balance_loss_mlp": 1.00139654, + "epoch": 0.28203817826544414, + "flos": 21616356764160.0, + "grad_norm": 1.8953239180336392, + "language_loss": 0.87648726, + "learning_rate": 3.369826514835332e-06, + "loss": 0.89488328, + "num_input_tokens_seen": 101229965, + "step": 4691, + "time_per_iteration": 2.566755771636963 + }, + { + "auxiliary_loss_clip": 0.01068322, + "auxiliary_loss_mlp": 0.01049609, + "balance_loss_clip": 1.03153515, + "balance_loss_mlp": 1.03291965, + "epoch": 0.2820983015181121, + "flos": 24026788794240.0, + "grad_norm": 2.210880639910713, + "language_loss": 0.81585336, + "learning_rate": 3.3695427174973654e-06, + "loss": 0.83703262, + "num_input_tokens_seen": 101250980, + "step": 4692, + "time_per_iteration": 2.7213404178619385 + }, + { + "auxiliary_loss_clip": 0.01057783, + "auxiliary_loss_mlp": 0.01038249, + "balance_loss_clip": 1.02977788, + "balance_loss_mlp": 1.0228771, + "epoch": 0.2821584247707801, + "flos": 30007925976960.0, + "grad_norm": 1.6928932943958694, + "language_loss": 0.74243814, + "learning_rate": 3.3692588682262022e-06, + "loss": 0.76339841, + "num_input_tokens_seen": 101273335, + "step": 4693, + "time_per_iteration": 2.7524101734161377 + }, + { + "auxiliary_loss_clip": 0.0105554, + "auxiliary_loss_mlp": 0.01034948, + "balance_loss_clip": 1.02768266, + "balance_loss_mlp": 1.01952243, + "epoch": 0.2822185480234481, + "flos": 21396762967680.0, + "grad_norm": 1.6111321356530521, + "language_loss": 0.77934927, + "learning_rate": 3.3689749670326046e-06, + "loss": 0.80025411, + "num_input_tokens_seen": 101292110, + "step": 4694, + "time_per_iteration": 2.7867751121520996 + }, + { + "auxiliary_loss_clip": 0.01078797, + "auxiliary_loss_mlp": 0.01036993, + "balance_loss_clip": 1.03073049, + "balance_loss_mlp": 1.02268791, + "epoch": 0.28227867127611606, + "flos": 27452809964160.0, + "grad_norm": 1.8963356011091064, + "language_loss": 0.67115724, + "learning_rate": 3.3686910139273392e-06, + "loss": 0.69231516, + "num_input_tokens_seen": 101312815, + "step": 4695, + "time_per_iteration": 2.6065666675567627 + }, + { + "auxiliary_loss_clip": 0.01074635, + "auxiliary_loss_mlp": 0.01046237, + "balance_loss_clip": 1.03135383, + "balance_loss_mlp": 1.02920222, + "epoch": 0.282338794528784, + "flos": 22593736811520.0, + "grad_norm": 1.9670867118980082, + "language_loss": 0.75522125, + "learning_rate": 3.3684070089211736e-06, + "loss": 0.77643001, + "num_input_tokens_seen": 101329045, + "step": 4696, + "time_per_iteration": 2.6929385662078857 + }, + { + "auxiliary_loss_clip": 0.01051003, + "auxiliary_loss_mlp": 0.0104148, + "balance_loss_clip": 1.02930975, + "balance_loss_mlp": 1.02673376, + "epoch": 0.282398917781452, + "flos": 42010923386880.0, + "grad_norm": 1.761953251486386, + "language_loss": 0.62094402, + "learning_rate": 3.368122952024877e-06, + "loss": 0.64186883, + "num_input_tokens_seen": 101352715, + "step": 4697, + "time_per_iteration": 2.991518259048462 + }, + { + "auxiliary_loss_clip": 0.01046392, + "auxiliary_loss_mlp": 0.01035581, + "balance_loss_clip": 1.02790475, + "balance_loss_mlp": 1.02147818, + "epoch": 0.28245904103411995, + "flos": 23224724052480.0, + "grad_norm": 1.3271783674117419, + "language_loss": 0.728567, + "learning_rate": 3.3678388432492214e-06, + "loss": 0.74938673, + "num_input_tokens_seen": 101374640, + "step": 4698, + "time_per_iteration": 2.9038848876953125 + }, + { + "auxiliary_loss_clip": 0.01086468, + "auxiliary_loss_mlp": 0.01038028, + "balance_loss_clip": 1.02929652, + "balance_loss_mlp": 1.02357984, + "epoch": 0.2825191642867879, + "flos": 25374623760000.0, + "grad_norm": 1.6810674663536644, + "language_loss": 0.74993229, + "learning_rate": 3.3675546826049788e-06, + "loss": 0.77117729, + "num_input_tokens_seen": 101393595, + "step": 4699, + "time_per_iteration": 2.570033073425293 + }, + { + "auxiliary_loss_clip": 0.01077498, + "auxiliary_loss_mlp": 0.01036212, + "balance_loss_clip": 1.02923882, + "balance_loss_mlp": 1.0199275, + "epoch": 0.2825792875394559, + "flos": 17236799199360.0, + "grad_norm": 3.4225618925465318, + "language_loss": 0.80367124, + "learning_rate": 3.3672704701029265e-06, + "loss": 0.8248083, + "num_input_tokens_seen": 101409265, + "step": 4700, + "time_per_iteration": 2.5287046432495117 + }, + { + "auxiliary_loss_clip": 0.0106656, + "auxiliary_loss_mlp": 0.01040531, + "balance_loss_clip": 1.03092742, + "balance_loss_mlp": 1.02775145, + "epoch": 0.28263941079212385, + "flos": 26723967096960.0, + "grad_norm": 2.6229520240291, + "language_loss": 0.81472051, + "learning_rate": 3.3669862057538402e-06, + "loss": 0.83579147, + "num_input_tokens_seen": 101428365, + "step": 4701, + "time_per_iteration": 2.6079142093658447 + }, + { + "auxiliary_loss_clip": 0.01025128, + "auxiliary_loss_mlp": 0.01042645, + "balance_loss_clip": 1.03028989, + "balance_loss_mlp": 1.02760077, + "epoch": 0.2826995340447918, + "flos": 25921327737600.0, + "grad_norm": 2.8740014049287277, + "language_loss": 0.72794056, + "learning_rate": 3.3667018895685004e-06, + "loss": 0.7486183, + "num_input_tokens_seen": 101447280, + "step": 4702, + "time_per_iteration": 2.7860865592956543 + }, + { + "auxiliary_loss_clip": 0.01088523, + "auxiliary_loss_mlp": 0.01031428, + "balance_loss_clip": 1.03091645, + "balance_loss_mlp": 1.01641977, + "epoch": 0.2827596572974598, + "flos": 22379709623040.0, + "grad_norm": 2.222365511826863, + "language_loss": 0.7841382, + "learning_rate": 3.3664175215576886e-06, + "loss": 0.80533767, + "num_input_tokens_seen": 101465435, + "step": 4703, + "time_per_iteration": 2.564483404159546 + }, + { + "auxiliary_loss_clip": 0.0106016, + "auxiliary_loss_mlp": 0.01041087, + "balance_loss_clip": 1.02747679, + "balance_loss_mlp": 1.02542305, + "epoch": 0.28281978055012774, + "flos": 33547137880320.0, + "grad_norm": 1.9292728916404307, + "language_loss": 0.69352853, + "learning_rate": 3.3661331017321867e-06, + "loss": 0.71454102, + "num_input_tokens_seen": 101486355, + "step": 4704, + "time_per_iteration": 2.7829911708831787 + }, + { + "auxiliary_loss_clip": 0.01058593, + "auxiliary_loss_mlp": 0.01037243, + "balance_loss_clip": 1.03129721, + "balance_loss_mlp": 1.02173412, + "epoch": 0.2828799038027957, + "flos": 23440870143360.0, + "grad_norm": 2.5496786695777276, + "language_loss": 0.70375687, + "learning_rate": 3.3658486301027807e-06, + "loss": 0.72471523, + "num_input_tokens_seen": 101505875, + "step": 4705, + "time_per_iteration": 2.8561947345733643 + }, + { + "auxiliary_loss_clip": 0.0101753, + "auxiliary_loss_mlp": 0.01002793, + "balance_loss_clip": 1.00972247, + "balance_loss_mlp": 1.00098121, + "epoch": 0.2829400270554637, + "flos": 69873690251520.0, + "grad_norm": 0.7422720960748825, + "language_loss": 0.59301537, + "learning_rate": 3.3655641066802577e-06, + "loss": 0.61321861, + "num_input_tokens_seen": 101565045, + "step": 4706, + "time_per_iteration": 3.2435648441314697 + }, + { + "auxiliary_loss_clip": 0.01060957, + "auxiliary_loss_mlp": 0.01035515, + "balance_loss_clip": 1.0278331, + "balance_loss_mlp": 1.02150226, + "epoch": 0.2830001503081317, + "flos": 24789028331520.0, + "grad_norm": 1.4890909013863036, + "language_loss": 0.81996381, + "learning_rate": 3.365279531475407e-06, + "loss": 0.84092855, + "num_input_tokens_seen": 101585825, + "step": 4707, + "time_per_iteration": 2.7047171592712402 + }, + { + "auxiliary_loss_clip": 0.01069165, + "auxiliary_loss_mlp": 0.01038581, + "balance_loss_clip": 1.02868891, + "balance_loss_mlp": 1.02232051, + "epoch": 0.28306027356079966, + "flos": 27669387018240.0, + "grad_norm": 1.5293087510826864, + "language_loss": 0.8056761, + "learning_rate": 3.36499490449902e-06, + "loss": 0.82675356, + "num_input_tokens_seen": 101606105, + "step": 4708, + "time_per_iteration": 2.6511178016662598 + }, + { + "auxiliary_loss_clip": 0.01003581, + "auxiliary_loss_mlp": 0.01006094, + "balance_loss_clip": 1.00697839, + "balance_loss_mlp": 1.00424671, + "epoch": 0.2831203968134676, + "flos": 60527938199040.0, + "grad_norm": 0.8840823319438372, + "language_loss": 0.62867391, + "learning_rate": 3.3647102257618895e-06, + "loss": 0.64877069, + "num_input_tokens_seen": 101656875, + "step": 4709, + "time_per_iteration": 4.618484020233154 + }, + { + "auxiliary_loss_clip": 0.01062471, + "auxiliary_loss_mlp": 0.01038198, + "balance_loss_clip": 1.03135896, + "balance_loss_mlp": 1.02181864, + "epoch": 0.2831805200661356, + "flos": 22054790171520.0, + "grad_norm": 1.456578660571654, + "language_loss": 0.73831856, + "learning_rate": 3.3644254952748103e-06, + "loss": 0.75932527, + "num_input_tokens_seen": 101676225, + "step": 4710, + "time_per_iteration": 4.239165306091309 + }, + { + "auxiliary_loss_clip": 0.01054344, + "auxiliary_loss_mlp": 0.01050843, + "balance_loss_clip": 1.02831316, + "balance_loss_mlp": 1.03457665, + "epoch": 0.28324064331880355, + "flos": 22600668136320.0, + "grad_norm": 1.9907443914757037, + "language_loss": 0.78755105, + "learning_rate": 3.364140713048579e-06, + "loss": 0.80860293, + "num_input_tokens_seen": 101693710, + "step": 4711, + "time_per_iteration": 2.83598256111145 + }, + { + "auxiliary_loss_clip": 0.0108518, + "auxiliary_loss_mlp": 0.00748903, + "balance_loss_clip": 1.03554857, + "balance_loss_mlp": 1.00168765, + "epoch": 0.2833007665714715, + "flos": 30404127968640.0, + "grad_norm": 5.508895308085215, + "language_loss": 0.70952833, + "learning_rate": 3.363855879093996e-06, + "loss": 0.72786915, + "num_input_tokens_seen": 101714010, + "step": 4712, + "time_per_iteration": 2.715571165084839 + }, + { + "auxiliary_loss_clip": 0.01095542, + "auxiliary_loss_mlp": 0.0104379, + "balance_loss_clip": 1.03506529, + "balance_loss_mlp": 1.02714801, + "epoch": 0.2833608898241395, + "flos": 23549499849600.0, + "grad_norm": 1.897881216542277, + "language_loss": 0.81645036, + "learning_rate": 3.3635709934218605e-06, + "loss": 0.83784366, + "num_input_tokens_seen": 101732995, + "step": 4713, + "time_per_iteration": 2.6487019062042236 + }, + { + "auxiliary_loss_clip": 0.01072408, + "auxiliary_loss_mlp": 0.01039254, + "balance_loss_clip": 1.03477609, + "balance_loss_mlp": 1.0237211, + "epoch": 0.28342101307680745, + "flos": 20266726118400.0, + "grad_norm": 2.2191251350642265, + "language_loss": 0.75237, + "learning_rate": 3.3632860560429766e-06, + "loss": 0.77348661, + "num_input_tokens_seen": 101751385, + "step": 4714, + "time_per_iteration": 2.610409736633301 + }, + { + "auxiliary_loss_clip": 0.01081588, + "auxiliary_loss_mlp": 0.0104908, + "balance_loss_clip": 1.03360343, + "balance_loss_mlp": 1.03391671, + "epoch": 0.2834811363294754, + "flos": 30847050576000.0, + "grad_norm": 1.5567450593675585, + "language_loss": 0.78280079, + "learning_rate": 3.3630010669681494e-06, + "loss": 0.80410743, + "num_input_tokens_seen": 101773825, + "step": 4715, + "time_per_iteration": 2.703599214553833 + }, + { + "auxiliary_loss_clip": 0.01068512, + "auxiliary_loss_mlp": 0.01040191, + "balance_loss_clip": 1.03147006, + "balance_loss_mlp": 1.02477169, + "epoch": 0.2835412595821434, + "flos": 22711021695360.0, + "grad_norm": 2.490394728727751, + "language_loss": 0.73845327, + "learning_rate": 3.3627160262081845e-06, + "loss": 0.75954032, + "num_input_tokens_seen": 101791920, + "step": 4716, + "time_per_iteration": 2.608894109725952 + }, + { + "auxiliary_loss_clip": 0.01064229, + "auxiliary_loss_mlp": 0.01046965, + "balance_loss_clip": 1.02828407, + "balance_loss_mlp": 1.0289166, + "epoch": 0.28360138283481134, + "flos": 18077719478400.0, + "grad_norm": 3.0887303438721325, + "language_loss": 0.74528229, + "learning_rate": 3.3624309337738917e-06, + "loss": 0.76639426, + "num_input_tokens_seen": 101809515, + "step": 4717, + "time_per_iteration": 2.5518875122070312 + }, + { + "auxiliary_loss_clip": 0.01059197, + "auxiliary_loss_mlp": 0.01043127, + "balance_loss_clip": 1.02864695, + "balance_loss_mlp": 1.0280118, + "epoch": 0.2836615060874793, + "flos": 17854785717120.0, + "grad_norm": 1.9092315772799575, + "language_loss": 0.67191666, + "learning_rate": 3.3621457896760813e-06, + "loss": 0.69293988, + "num_input_tokens_seen": 101827735, + "step": 4718, + "time_per_iteration": 4.2703611850738525 + }, + { + "auxiliary_loss_clip": 0.01073111, + "auxiliary_loss_mlp": 0.01040488, + "balance_loss_clip": 1.03107619, + "balance_loss_mlp": 1.0242697, + "epoch": 0.2837216293401473, + "flos": 25740302169600.0, + "grad_norm": 1.6645966246225115, + "language_loss": 0.72243118, + "learning_rate": 3.361860593925566e-06, + "loss": 0.74356717, + "num_input_tokens_seen": 101845970, + "step": 4719, + "time_per_iteration": 4.226760625839233 + }, + { + "auxiliary_loss_clip": 0.01079941, + "auxiliary_loss_mlp": 0.01042699, + "balance_loss_clip": 1.03146744, + "balance_loss_mlp": 1.02754772, + "epoch": 0.2837817525928153, + "flos": 20923532259840.0, + "grad_norm": 2.8504673183659994, + "language_loss": 0.80435026, + "learning_rate": 3.3615753465331605e-06, + "loss": 0.82557672, + "num_input_tokens_seen": 101865040, + "step": 4720, + "time_per_iteration": 2.6541664600372314 + }, + { + "auxiliary_loss_clip": 0.01077118, + "auxiliary_loss_mlp": 0.01040705, + "balance_loss_clip": 1.03021622, + "balance_loss_mlp": 1.02393246, + "epoch": 0.28384187584548326, + "flos": 18916700423040.0, + "grad_norm": 1.9720996493622567, + "language_loss": 0.79263335, + "learning_rate": 3.3612900475096817e-06, + "loss": 0.8138116, + "num_input_tokens_seen": 101883735, + "step": 4721, + "time_per_iteration": 2.5811612606048584 + }, + { + "auxiliary_loss_clip": 0.01041661, + "auxiliary_loss_mlp": 0.00748907, + "balance_loss_clip": 1.02902806, + "balance_loss_mlp": 1.00167644, + "epoch": 0.2839019990981512, + "flos": 27343964776320.0, + "grad_norm": 2.577087073394462, + "language_loss": 0.82611465, + "learning_rate": 3.3610046968659474e-06, + "loss": 0.84402025, + "num_input_tokens_seen": 101903025, + "step": 4722, + "time_per_iteration": 2.8690054416656494 + }, + { + "auxiliary_loss_clip": 0.01093194, + "auxiliary_loss_mlp": 0.01034517, + "balance_loss_clip": 1.0335393, + "balance_loss_mlp": 1.02011609, + "epoch": 0.2839621223508192, + "flos": 18114312458880.0, + "grad_norm": 1.7829390131788978, + "language_loss": 0.70114756, + "learning_rate": 3.3607192946127785e-06, + "loss": 0.72242463, + "num_input_tokens_seen": 101922255, + "step": 4723, + "time_per_iteration": 2.5509824752807617 + }, + { + "auxiliary_loss_clip": 0.01063854, + "auxiliary_loss_mlp": 0.01039765, + "balance_loss_clip": 1.02900982, + "balance_loss_mlp": 1.023314, + "epoch": 0.28402224560348716, + "flos": 26358360514560.0, + "grad_norm": 3.9445161571072975, + "language_loss": 0.78571373, + "learning_rate": 3.360433840760998e-06, + "loss": 0.80674994, + "num_input_tokens_seen": 101943100, + "step": 4724, + "time_per_iteration": 2.749691963195801 + }, + { + "auxiliary_loss_clip": 0.01063403, + "auxiliary_loss_mlp": 0.0104685, + "balance_loss_clip": 1.02987218, + "balance_loss_mlp": 1.0301131, + "epoch": 0.2840823688561551, + "flos": 24060795995520.0, + "grad_norm": 1.713620385278694, + "language_loss": 0.92245704, + "learning_rate": 3.36014833532143e-06, + "loss": 0.94355953, + "num_input_tokens_seen": 101963160, + "step": 4725, + "time_per_iteration": 2.9541571140289307 + }, + { + "auxiliary_loss_clip": 0.01083122, + "auxiliary_loss_mlp": 0.01039333, + "balance_loss_clip": 1.03179288, + "balance_loss_mlp": 1.02332306, + "epoch": 0.2841424921088231, + "flos": 29459821368960.0, + "grad_norm": 1.5785818462360588, + "language_loss": 0.88873589, + "learning_rate": 3.3598627783049e-06, + "loss": 0.90996039, + "num_input_tokens_seen": 101984300, + "step": 4726, + "time_per_iteration": 2.734349012374878 + }, + { + "auxiliary_loss_clip": 0.01083811, + "auxiliary_loss_mlp": 0.01046333, + "balance_loss_clip": 1.03298283, + "balance_loss_mlp": 1.03045976, + "epoch": 0.28420261536149105, + "flos": 48100367053440.0, + "grad_norm": 2.788924018994635, + "language_loss": 0.78650403, + "learning_rate": 3.359577169722238e-06, + "loss": 0.80780542, + "num_input_tokens_seen": 102005765, + "step": 4727, + "time_per_iteration": 2.867701530456543 + }, + { + "auxiliary_loss_clip": 0.01078098, + "auxiliary_loss_mlp": 0.01035956, + "balance_loss_clip": 1.0305388, + "balance_loss_mlp": 1.02206266, + "epoch": 0.284262738614159, + "flos": 25666146541440.0, + "grad_norm": 2.8864934159812714, + "language_loss": 0.67109251, + "learning_rate": 3.3592915095842733e-06, + "loss": 0.69223309, + "num_input_tokens_seen": 102022755, + "step": 4728, + "time_per_iteration": 2.6684255599975586 + }, + { + "auxiliary_loss_clip": 0.01058839, + "auxiliary_loss_mlp": 0.0104413, + "balance_loss_clip": 1.02960372, + "balance_loss_mlp": 1.02819169, + "epoch": 0.284322861866827, + "flos": 19718980646400.0, + "grad_norm": 1.9449631285852556, + "language_loss": 0.76595902, + "learning_rate": 3.3590057979018386e-06, + "loss": 0.78698868, + "num_input_tokens_seen": 102041850, + "step": 4729, + "time_per_iteration": 2.6604204177856445 + }, + { + "auxiliary_loss_clip": 0.01072772, + "auxiliary_loss_mlp": 0.01044171, + "balance_loss_clip": 1.03253162, + "balance_loss_mlp": 1.02876925, + "epoch": 0.28438298511949495, + "flos": 23915250086400.0, + "grad_norm": 1.824160332325582, + "language_loss": 0.66658521, + "learning_rate": 3.3587200346857674e-06, + "loss": 0.68775463, + "num_input_tokens_seen": 102059500, + "step": 4730, + "time_per_iteration": 2.7348122596740723 + }, + { + "auxiliary_loss_clip": 0.01070172, + "auxiliary_loss_mlp": 0.01034044, + "balance_loss_clip": 1.03154349, + "balance_loss_mlp": 1.01817679, + "epoch": 0.2844431083721629, + "flos": 26067340523520.0, + "grad_norm": 1.954788012609253, + "language_loss": 0.74175447, + "learning_rate": 3.3584342199468965e-06, + "loss": 0.76279664, + "num_input_tokens_seen": 102080460, + "step": 4731, + "time_per_iteration": 2.7249560356140137 + }, + { + "auxiliary_loss_clip": 0.01061036, + "auxiliary_loss_mlp": 0.01034824, + "balance_loss_clip": 1.03425002, + "balance_loss_mlp": 1.01902914, + "epoch": 0.2845032316248309, + "flos": 25810435474560.0, + "grad_norm": 1.611734162444224, + "language_loss": 0.8337388, + "learning_rate": 3.3581483536960638e-06, + "loss": 0.85469741, + "num_input_tokens_seen": 102100950, + "step": 4732, + "time_per_iteration": 2.8168649673461914 + }, + { + "auxiliary_loss_clip": 0.010836, + "auxiliary_loss_mlp": 0.01045063, + "balance_loss_clip": 1.0319984, + "balance_loss_mlp": 1.02808738, + "epoch": 0.2845633548774989, + "flos": 19823192979840.0, + "grad_norm": 1.9511563300154686, + "language_loss": 0.78496814, + "learning_rate": 3.357862435944109e-06, + "loss": 0.8062548, + "num_input_tokens_seen": 102119345, + "step": 4733, + "time_per_iteration": 2.6318626403808594 + }, + { + "auxiliary_loss_clip": 0.01100353, + "auxiliary_loss_mlp": 0.0104432, + "balance_loss_clip": 1.03494906, + "balance_loss_mlp": 1.02752328, + "epoch": 0.28462347813016686, + "flos": 23182815859200.0, + "grad_norm": 2.700206228586547, + "language_loss": 0.7157954, + "learning_rate": 3.357576466701875e-06, + "loss": 0.7372421, + "num_input_tokens_seen": 102139050, + "step": 4734, + "time_per_iteration": 2.6039535999298096 + }, + { + "auxiliary_loss_clip": 0.01068735, + "auxiliary_loss_mlp": 0.01030251, + "balance_loss_clip": 1.02898884, + "balance_loss_mlp": 1.01515269, + "epoch": 0.2846836013828348, + "flos": 18660477732480.0, + "grad_norm": 1.8750962429114877, + "language_loss": 0.74148571, + "learning_rate": 3.3572904459802056e-06, + "loss": 0.76247549, + "num_input_tokens_seen": 102157935, + "step": 4735, + "time_per_iteration": 2.658438205718994 + }, + { + "auxiliary_loss_clip": 0.01071811, + "auxiliary_loss_mlp": 0.01039871, + "balance_loss_clip": 1.03195429, + "balance_loss_mlp": 1.02501726, + "epoch": 0.2847437246355028, + "flos": 14173511523840.0, + "grad_norm": 1.7305520273767416, + "language_loss": 0.79844975, + "learning_rate": 3.357004373789946e-06, + "loss": 0.81956661, + "num_input_tokens_seen": 102175325, + "step": 4736, + "time_per_iteration": 2.7378995418548584 + }, + { + "auxiliary_loss_clip": 0.01094106, + "auxiliary_loss_mlp": 0.01040588, + "balance_loss_clip": 1.03292227, + "balance_loss_mlp": 1.02499497, + "epoch": 0.28480384788817076, + "flos": 29278364837760.0, + "grad_norm": 1.99788910395378, + "language_loss": 0.5963372, + "learning_rate": 3.3567182501419453e-06, + "loss": 0.61768413, + "num_input_tokens_seen": 102196625, + "step": 4737, + "time_per_iteration": 2.7382874488830566 + }, + { + "auxiliary_loss_clip": 0.01072704, + "auxiliary_loss_mlp": 0.01032904, + "balance_loss_clip": 1.02795875, + "balance_loss_mlp": 1.01803827, + "epoch": 0.2848639711408387, + "flos": 22601314581120.0, + "grad_norm": 1.824385432232638, + "language_loss": 0.86533141, + "learning_rate": 3.356432075047052e-06, + "loss": 0.88638753, + "num_input_tokens_seen": 102214975, + "step": 4738, + "time_per_iteration": 2.644739866256714 + }, + { + "auxiliary_loss_clip": 0.01069177, + "auxiliary_loss_mlp": 0.01048005, + "balance_loss_clip": 1.03362846, + "balance_loss_mlp": 1.03020668, + "epoch": 0.2849240943935067, + "flos": 17599460866560.0, + "grad_norm": 2.835161764993899, + "language_loss": 0.89167434, + "learning_rate": 3.356145848516118e-06, + "loss": 0.91284615, + "num_input_tokens_seen": 102231885, + "step": 4739, + "time_per_iteration": 2.7096574306488037 + }, + { + "auxiliary_loss_clip": 0.01085056, + "auxiliary_loss_mlp": 0.01035952, + "balance_loss_clip": 1.03580403, + "balance_loss_mlp": 1.0210036, + "epoch": 0.28498421764617465, + "flos": 24862573428480.0, + "grad_norm": 1.4491200001433775, + "language_loss": 0.72144204, + "learning_rate": 3.355859570559998e-06, + "loss": 0.74265218, + "num_input_tokens_seen": 102252725, + "step": 4740, + "time_per_iteration": 2.6615993976593018 + }, + { + "auxiliary_loss_clip": 0.01072594, + "auxiliary_loss_mlp": 0.01033712, + "balance_loss_clip": 1.033319, + "balance_loss_mlp": 1.01906109, + "epoch": 0.2850443408988426, + "flos": 22782555630720.0, + "grad_norm": 1.5356585279539514, + "language_loss": 0.77753752, + "learning_rate": 3.3555732411895477e-06, + "loss": 0.79860055, + "num_input_tokens_seen": 102271730, + "step": 4741, + "time_per_iteration": 2.662226438522339 + }, + { + "auxiliary_loss_clip": 0.01064019, + "auxiliary_loss_mlp": 0.0104695, + "balance_loss_clip": 1.03225386, + "balance_loss_mlp": 1.03012919, + "epoch": 0.2851044641515106, + "flos": 18844053166080.0, + "grad_norm": 1.7771925859316682, + "language_loss": 0.76024294, + "learning_rate": 3.3552868604156235e-06, + "loss": 0.78135258, + "num_input_tokens_seen": 102291325, + "step": 4742, + "time_per_iteration": 2.800201892852783 + }, + { + "auxiliary_loss_clip": 0.01097314, + "auxiliary_loss_mlp": 0.01045385, + "balance_loss_clip": 1.03299797, + "balance_loss_mlp": 1.0268836, + "epoch": 0.28516458740417855, + "flos": 18880502492160.0, + "grad_norm": 2.591395593515692, + "language_loss": 0.57394719, + "learning_rate": 3.355000428249086e-06, + "loss": 0.59537417, + "num_input_tokens_seen": 102309000, + "step": 4743, + "time_per_iteration": 2.6583445072174072 + }, + { + "auxiliary_loss_clip": 0.01061424, + "auxiliary_loss_mlp": 0.01048123, + "balance_loss_clip": 1.03105736, + "balance_loss_mlp": 1.03208995, + "epoch": 0.2852247106568465, + "flos": 25299821687040.0, + "grad_norm": 1.7457595712421463, + "language_loss": 0.74304175, + "learning_rate": 3.354713944700797e-06, + "loss": 0.76413727, + "num_input_tokens_seen": 102329240, + "step": 4744, + "time_per_iteration": 2.732440710067749 + }, + { + "auxiliary_loss_clip": 0.01079957, + "auxiliary_loss_mlp": 0.01042255, + "balance_loss_clip": 1.03192127, + "balance_loss_mlp": 1.02688932, + "epoch": 0.2852848339095145, + "flos": 11655383541120.0, + "grad_norm": 2.0886470767733822, + "language_loss": 0.77429861, + "learning_rate": 3.3544274097816185e-06, + "loss": 0.79552072, + "num_input_tokens_seen": 102344440, + "step": 4745, + "time_per_iteration": 2.6180450916290283 + }, + { + "auxiliary_loss_clip": 0.0107921, + "auxiliary_loss_mlp": 0.01039363, + "balance_loss_clip": 1.03633308, + "balance_loss_mlp": 1.02433085, + "epoch": 0.2853449571621825, + "flos": 12933228856320.0, + "grad_norm": 1.8032775635318212, + "language_loss": 0.82744241, + "learning_rate": 3.3541408235024173e-06, + "loss": 0.8486281, + "num_input_tokens_seen": 102360985, + "step": 4746, + "time_per_iteration": 2.6230897903442383 + }, + { + "auxiliary_loss_clip": 0.01046961, + "auxiliary_loss_mlp": 0.01039919, + "balance_loss_clip": 1.0286355, + "balance_loss_mlp": 1.02307498, + "epoch": 0.28540508041485046, + "flos": 20010575255040.0, + "grad_norm": 1.807574753047142, + "language_loss": 0.79655993, + "learning_rate": 3.3538541858740604e-06, + "loss": 0.81742871, + "num_input_tokens_seen": 102380320, + "step": 4747, + "time_per_iteration": 2.812603712081909 + }, + { + "auxiliary_loss_clip": 0.01015294, + "auxiliary_loss_mlp": 0.01011526, + "balance_loss_clip": 1.00940204, + "balance_loss_mlp": 1.0093329, + "epoch": 0.28546520366751843, + "flos": 68139349966080.0, + "grad_norm": 0.7819921710715008, + "language_loss": 0.60488403, + "learning_rate": 3.3535674969074173e-06, + "loss": 0.62515217, + "num_input_tokens_seen": 102439140, + "step": 4748, + "time_per_iteration": 3.1857657432556152 + }, + { + "auxiliary_loss_clip": 0.01092688, + "auxiliary_loss_mlp": 0.01041779, + "balance_loss_clip": 1.03155684, + "balance_loss_mlp": 1.02593589, + "epoch": 0.2855253269201864, + "flos": 13251540205440.0, + "grad_norm": 2.4980131757132376, + "language_loss": 0.80339432, + "learning_rate": 3.3532807566133592e-06, + "loss": 0.82473898, + "num_input_tokens_seen": 102450990, + "step": 4749, + "time_per_iteration": 2.6130950450897217 + }, + { + "auxiliary_loss_clip": 0.01080855, + "auxiliary_loss_mlp": 0.01036313, + "balance_loss_clip": 1.0314436, + "balance_loss_mlp": 1.02088165, + "epoch": 0.28558545017285436, + "flos": 28620876337920.0, + "grad_norm": 1.9161440688890408, + "language_loss": 0.70199168, + "learning_rate": 3.3529939650027587e-06, + "loss": 0.72316337, + "num_input_tokens_seen": 102471820, + "step": 4750, + "time_per_iteration": 2.697007417678833 + }, + { + "auxiliary_loss_clip": 0.01080375, + "auxiliary_loss_mlp": 0.01032924, + "balance_loss_clip": 1.03345823, + "balance_loss_mlp": 1.01792121, + "epoch": 0.2856455734255223, + "flos": 34130470752000.0, + "grad_norm": 1.7355278596364392, + "language_loss": 0.81893802, + "learning_rate": 3.3527071220864917e-06, + "loss": 0.84007096, + "num_input_tokens_seen": 102492625, + "step": 4751, + "time_per_iteration": 2.7810933589935303 + }, + { + "auxiliary_loss_clip": 0.01092504, + "auxiliary_loss_mlp": 0.01041265, + "balance_loss_clip": 1.03301489, + "balance_loss_mlp": 1.02604198, + "epoch": 0.2857056966781903, + "flos": 39786149779200.0, + "grad_norm": 1.9812469318661075, + "language_loss": 0.79975492, + "learning_rate": 3.3524202278754353e-06, + "loss": 0.82109261, + "num_input_tokens_seen": 102514145, + "step": 4752, + "time_per_iteration": 2.840853452682495 + }, + { + "auxiliary_loss_clip": 0.01073841, + "auxiliary_loss_mlp": 0.01039123, + "balance_loss_clip": 1.02979386, + "balance_loss_mlp": 1.02375674, + "epoch": 0.28576581993085826, + "flos": 21872292145920.0, + "grad_norm": 1.7015462511881065, + "language_loss": 0.78903484, + "learning_rate": 3.3521332823804676e-06, + "loss": 0.81016445, + "num_input_tokens_seen": 102532365, + "step": 4753, + "time_per_iteration": 2.626991033554077 + }, + { + "auxiliary_loss_clip": 0.01095102, + "auxiliary_loss_mlp": 0.01039238, + "balance_loss_clip": 1.0332073, + "balance_loss_mlp": 1.02208972, + "epoch": 0.2858259431835262, + "flos": 19091656592640.0, + "grad_norm": 2.299780559752782, + "language_loss": 0.89624643, + "learning_rate": 3.3518462856124704e-06, + "loss": 0.9175899, + "num_input_tokens_seen": 102548425, + "step": 4754, + "time_per_iteration": 2.635629415512085 + }, + { + "auxiliary_loss_clip": 0.01080327, + "auxiliary_loss_mlp": 0.01040613, + "balance_loss_clip": 1.03166127, + "balance_loss_mlp": 1.02587867, + "epoch": 0.2858860664361942, + "flos": 20334309557760.0, + "grad_norm": 2.8610883539341994, + "language_loss": 0.82254124, + "learning_rate": 3.3515592375823267e-06, + "loss": 0.84375066, + "num_input_tokens_seen": 102566370, + "step": 4755, + "time_per_iteration": 2.691964626312256 + }, + { + "auxiliary_loss_clip": 0.01048773, + "auxiliary_loss_mlp": 0.01039072, + "balance_loss_clip": 1.03008997, + "balance_loss_mlp": 1.02444482, + "epoch": 0.28594618968886215, + "flos": 24461738582400.0, + "grad_norm": 1.4745792456892337, + "language_loss": 0.83713967, + "learning_rate": 3.351272138300922e-06, + "loss": 0.85801804, + "num_input_tokens_seen": 102588715, + "step": 4756, + "time_per_iteration": 2.7726540565490723 + }, + { + "auxiliary_loss_clip": 0.00989885, + "auxiliary_loss_mlp": 0.01026584, + "balance_loss_clip": 1.0050652, + "balance_loss_mlp": 1.0243187, + "epoch": 0.2860063129415301, + "flos": 71652850709760.0, + "grad_norm": 0.8729125704304191, + "language_loss": 0.60999858, + "learning_rate": 3.350984987779142e-06, + "loss": 0.63016331, + "num_input_tokens_seen": 102656715, + "step": 4757, + "time_per_iteration": 4.9027488231658936 + }, + { + "auxiliary_loss_clip": 0.01093967, + "auxiliary_loss_mlp": 0.01029883, + "balance_loss_clip": 1.03452682, + "balance_loss_mlp": 1.01496375, + "epoch": 0.2860664361941981, + "flos": 20558679863040.0, + "grad_norm": 2.6195411522238365, + "language_loss": 0.65503716, + "learning_rate": 3.3506977860278756e-06, + "loss": 0.67627567, + "num_input_tokens_seen": 102676545, + "step": 4758, + "time_per_iteration": 4.108074188232422 + }, + { + "auxiliary_loss_clip": 0.0108272, + "auxiliary_loss_mlp": 0.01032753, + "balance_loss_clip": 1.03194213, + "balance_loss_mlp": 1.01797163, + "epoch": 0.2861265594468661, + "flos": 35996389534080.0, + "grad_norm": 1.3753636072344715, + "language_loss": 0.62794405, + "learning_rate": 3.3504105330580143e-06, + "loss": 0.64909875, + "num_input_tokens_seen": 102702875, + "step": 4759, + "time_per_iteration": 2.86691951751709 + }, + { + "auxiliary_loss_clip": 0.01075486, + "auxiliary_loss_mlp": 0.00748757, + "balance_loss_clip": 1.03198647, + "balance_loss_mlp": 1.00140214, + "epoch": 0.28618668269953407, + "flos": 20047419630720.0, + "grad_norm": 1.6965737875979245, + "language_loss": 0.74205804, + "learning_rate": 3.3501232288804496e-06, + "loss": 0.7603004, + "num_input_tokens_seen": 102723160, + "step": 4760, + "time_per_iteration": 2.79647159576416 + }, + { + "auxiliary_loss_clip": 0.01076526, + "auxiliary_loss_mlp": 0.01038393, + "balance_loss_clip": 1.03878748, + "balance_loss_mlp": 1.02454138, + "epoch": 0.28624680595220203, + "flos": 24971849579520.0, + "grad_norm": 1.9169398815448218, + "language_loss": 0.72222555, + "learning_rate": 3.3498358735060773e-06, + "loss": 0.74337471, + "num_input_tokens_seen": 102743855, + "step": 4761, + "time_per_iteration": 2.877396821975708 + }, + { + "auxiliary_loss_clip": 0.01028159, + "auxiliary_loss_mlp": 0.01046791, + "balance_loss_clip": 1.03145432, + "balance_loss_mlp": 1.03150797, + "epoch": 0.28630692920487, + "flos": 22492253911680.0, + "grad_norm": 2.0772167073296366, + "language_loss": 0.7445932, + "learning_rate": 3.349548466945793e-06, + "loss": 0.76534271, + "num_input_tokens_seen": 102761370, + "step": 4762, + "time_per_iteration": 2.8268232345581055 + }, + { + "auxiliary_loss_clip": 0.01059883, + "auxiliary_loss_mlp": 0.01039517, + "balance_loss_clip": 1.03363442, + "balance_loss_mlp": 1.02453792, + "epoch": 0.28636705245753796, + "flos": 21249888255360.0, + "grad_norm": 1.6277439387346004, + "language_loss": 0.76293212, + "learning_rate": 3.349261009210496e-06, + "loss": 0.78392613, + "num_input_tokens_seen": 102780885, + "step": 4763, + "time_per_iteration": 2.7115414142608643 + }, + { + "auxiliary_loss_clip": 0.01050413, + "auxiliary_loss_mlp": 0.01038814, + "balance_loss_clip": 1.02849782, + "balance_loss_mlp": 1.02242315, + "epoch": 0.28642717571020593, + "flos": 24095772864000.0, + "grad_norm": 1.5686643138785663, + "language_loss": 0.76939613, + "learning_rate": 3.348973500311086e-06, + "loss": 0.79028839, + "num_input_tokens_seen": 102801000, + "step": 4764, + "time_per_iteration": 2.8073925971984863 + }, + { + "auxiliary_loss_clip": 0.01057358, + "auxiliary_loss_mlp": 0.01041935, + "balance_loss_clip": 1.03016305, + "balance_loss_mlp": 1.02473319, + "epoch": 0.2864872989628739, + "flos": 22601386408320.0, + "grad_norm": 1.8386514095579727, + "language_loss": 0.70505977, + "learning_rate": 3.348685940258466e-06, + "loss": 0.72605264, + "num_input_tokens_seen": 102820230, + "step": 4765, + "time_per_iteration": 2.7391395568847656 + }, + { + "auxiliary_loss_clip": 0.01077529, + "auxiliary_loss_mlp": 0.01032023, + "balance_loss_clip": 1.03083348, + "balance_loss_mlp": 1.01698518, + "epoch": 0.28654742221554186, + "flos": 32745073138560.0, + "grad_norm": 1.5527674750257483, + "language_loss": 0.75828326, + "learning_rate": 3.3483983290635395e-06, + "loss": 0.77937877, + "num_input_tokens_seen": 102842670, + "step": 4766, + "time_per_iteration": 5.825960636138916 + }, + { + "auxiliary_loss_clip": 0.01081656, + "auxiliary_loss_mlp": 0.01035414, + "balance_loss_clip": 1.03270912, + "balance_loss_mlp": 1.02052474, + "epoch": 0.2866075454682098, + "flos": 26981626331520.0, + "grad_norm": 1.7003019619863047, + "language_loss": 0.77566707, + "learning_rate": 3.348110666737214e-06, + "loss": 0.79683775, + "num_input_tokens_seen": 102864480, + "step": 4767, + "time_per_iteration": 2.674572706222534 + }, + { + "auxiliary_loss_clip": 0.01093057, + "auxiliary_loss_mlp": 0.01044455, + "balance_loss_clip": 1.0332377, + "balance_loss_mlp": 1.02904105, + "epoch": 0.2866676687208778, + "flos": 23253847004160.0, + "grad_norm": 2.1207043457786865, + "language_loss": 0.65328479, + "learning_rate": 3.3478229532903956e-06, + "loss": 0.67465997, + "num_input_tokens_seen": 102883740, + "step": 4768, + "time_per_iteration": 2.626314640045166 + }, + { + "auxiliary_loss_clip": 0.01070408, + "auxiliary_loss_mlp": 0.01041296, + "balance_loss_clip": 1.03126979, + "balance_loss_mlp": 1.02529764, + "epoch": 0.28672779197354575, + "flos": 21579727870080.0, + "grad_norm": 1.5404393473136004, + "language_loss": 0.70599687, + "learning_rate": 3.3475351887339967e-06, + "loss": 0.72711384, + "num_input_tokens_seen": 102902945, + "step": 4769, + "time_per_iteration": 2.75024151802063 + }, + { + "auxiliary_loss_clip": 0.01039899, + "auxiliary_loss_mlp": 0.01039696, + "balance_loss_clip": 1.03309655, + "balance_loss_mlp": 1.02446151, + "epoch": 0.2867879152262137, + "flos": 19865568049920.0, + "grad_norm": 1.6165500485855708, + "language_loss": 0.74745452, + "learning_rate": 3.3472473730789288e-06, + "loss": 0.76825047, + "num_input_tokens_seen": 102922405, + "step": 4770, + "time_per_iteration": 2.777221918106079 + }, + { + "auxiliary_loss_clip": 0.01042821, + "auxiliary_loss_mlp": 0.01041277, + "balance_loss_clip": 1.02889824, + "balance_loss_mlp": 1.02557683, + "epoch": 0.2868480384788817, + "flos": 28213325648640.0, + "grad_norm": 2.175440136984641, + "language_loss": 0.67756081, + "learning_rate": 3.3469595063361045e-06, + "loss": 0.69840181, + "num_input_tokens_seen": 102938980, + "step": 4771, + "time_per_iteration": 2.8753671646118164 + }, + { + "auxiliary_loss_clip": 0.01015275, + "auxiliary_loss_mlp": 0.01011086, + "balance_loss_clip": 1.00849891, + "balance_loss_mlp": 1.00891614, + "epoch": 0.2869081617315497, + "flos": 65424286690560.0, + "grad_norm": 0.7841094511480321, + "language_loss": 0.56869304, + "learning_rate": 3.3466715885164414e-06, + "loss": 0.58895671, + "num_input_tokens_seen": 103000405, + "step": 4772, + "time_per_iteration": 3.1401188373565674 + }, + { + "auxiliary_loss_clip": 0.01029671, + "auxiliary_loss_mlp": 0.00748985, + "balance_loss_clip": 1.03073573, + "balance_loss_mlp": 1.0015614, + "epoch": 0.28696828498421767, + "flos": 18660729127680.0, + "grad_norm": 2.232620083151222, + "language_loss": 0.82685339, + "learning_rate": 3.346383619630856e-06, + "loss": 0.84463996, + "num_input_tokens_seen": 103017970, + "step": 4773, + "time_per_iteration": 2.8258814811706543 + }, + { + "auxiliary_loss_clip": 0.01092114, + "auxiliary_loss_mlp": 0.01038048, + "balance_loss_clip": 1.03106284, + "balance_loss_mlp": 1.02170408, + "epoch": 0.28702840823688563, + "flos": 23659745667840.0, + "grad_norm": 2.270847740830089, + "language_loss": 0.77316087, + "learning_rate": 3.34609559969027e-06, + "loss": 0.79446244, + "num_input_tokens_seen": 103036385, + "step": 4774, + "time_per_iteration": 2.6787960529327393 + }, + { + "auxiliary_loss_clip": 0.01069146, + "auxiliary_loss_mlp": 0.01039281, + "balance_loss_clip": 1.031178, + "balance_loss_mlp": 1.02308083, + "epoch": 0.2870885314895536, + "flos": 13804744544640.0, + "grad_norm": 1.7184667595837233, + "language_loss": 0.73213112, + "learning_rate": 3.3458075287056034e-06, + "loss": 0.75321537, + "num_input_tokens_seen": 103052170, + "step": 4775, + "time_per_iteration": 2.653763771057129 + }, + { + "auxiliary_loss_clip": 0.01081811, + "auxiliary_loss_mlp": 0.0103995, + "balance_loss_clip": 1.03220487, + "balance_loss_mlp": 1.02498293, + "epoch": 0.28714865474222157, + "flos": 17786771314560.0, + "grad_norm": 2.005036623764946, + "language_loss": 0.88213158, + "learning_rate": 3.34551940668778e-06, + "loss": 0.90334922, + "num_input_tokens_seen": 103070510, + "step": 4776, + "time_per_iteration": 2.8192601203918457 + }, + { + "auxiliary_loss_clip": 0.01082273, + "auxiliary_loss_mlp": 0.01041487, + "balance_loss_clip": 1.03415179, + "balance_loss_mlp": 1.02632403, + "epoch": 0.28720877799488953, + "flos": 15997486199040.0, + "grad_norm": 1.7584565257225675, + "language_loss": 0.7415489, + "learning_rate": 3.345231233647726e-06, + "loss": 0.76278651, + "num_input_tokens_seen": 103089590, + "step": 4777, + "time_per_iteration": 2.771841287612915 + }, + { + "auxiliary_loss_clip": 0.01082729, + "auxiliary_loss_mlp": 0.01048577, + "balance_loss_clip": 1.03813386, + "balance_loss_mlp": 1.03213751, + "epoch": 0.2872689012475575, + "flos": 20923137210240.0, + "grad_norm": 2.088504727743761, + "language_loss": 0.79914218, + "learning_rate": 3.3449430095963696e-06, + "loss": 0.82045525, + "num_input_tokens_seen": 103109080, + "step": 4778, + "time_per_iteration": 2.8704681396484375 + }, + { + "auxiliary_loss_clip": 0.01075125, + "auxiliary_loss_mlp": 0.01043682, + "balance_loss_clip": 1.03588796, + "balance_loss_mlp": 1.02741003, + "epoch": 0.28732902450022546, + "flos": 21325121291520.0, + "grad_norm": 1.7466652394259645, + "language_loss": 0.73703098, + "learning_rate": 3.3446547345446386e-06, + "loss": 0.75821906, + "num_input_tokens_seen": 103127755, + "step": 4779, + "time_per_iteration": 2.8840014934539795 + }, + { + "auxiliary_loss_clip": 0.01070145, + "auxiliary_loss_mlp": 0.01039327, + "balance_loss_clip": 1.03115916, + "balance_loss_mlp": 1.02230358, + "epoch": 0.2873891477528934, + "flos": 20850382212480.0, + "grad_norm": 1.4772423851356533, + "language_loss": 0.76372051, + "learning_rate": 3.3443664085034656e-06, + "loss": 0.78481525, + "num_input_tokens_seen": 103147035, + "step": 4780, + "time_per_iteration": 2.731010675430298 + }, + { + "auxiliary_loss_clip": 0.01052763, + "auxiliary_loss_mlp": 0.01041495, + "balance_loss_clip": 1.02919149, + "balance_loss_mlp": 1.02623081, + "epoch": 0.2874492710055614, + "flos": 17420051410560.0, + "grad_norm": 1.6520114544705793, + "language_loss": 0.8127141, + "learning_rate": 3.344078031483784e-06, + "loss": 0.83365667, + "num_input_tokens_seen": 103165410, + "step": 4781, + "time_per_iteration": 2.6545310020446777 + }, + { + "auxiliary_loss_clip": 0.01054407, + "auxiliary_loss_mlp": 0.01042093, + "balance_loss_clip": 1.03230262, + "balance_loss_mlp": 1.02485549, + "epoch": 0.28750939425822936, + "flos": 13406818700160.0, + "grad_norm": 1.7918753702387142, + "language_loss": 0.86423302, + "learning_rate": 3.3437896034965283e-06, + "loss": 0.885198, + "num_input_tokens_seen": 103183710, + "step": 4782, + "time_per_iteration": 2.6760141849517822 + }, + { + "auxiliary_loss_clip": 0.0106689, + "auxiliary_loss_mlp": 0.01044707, + "balance_loss_clip": 1.03620934, + "balance_loss_mlp": 1.02872658, + "epoch": 0.2875695175108973, + "flos": 21870029589120.0, + "grad_norm": 1.4979606953334121, + "language_loss": 0.71518528, + "learning_rate": 3.3435011245526357e-06, + "loss": 0.73630124, + "num_input_tokens_seen": 103203790, + "step": 4783, + "time_per_iteration": 2.6936166286468506 + }, + { + "auxiliary_loss_clip": 0.0107579, + "auxiliary_loss_mlp": 0.01040506, + "balance_loss_clip": 1.03582656, + "balance_loss_mlp": 1.02485371, + "epoch": 0.2876296407635653, + "flos": 26245457089920.0, + "grad_norm": 1.6392750464703112, + "language_loss": 0.76581407, + "learning_rate": 3.343212594663047e-06, + "loss": 0.78697705, + "num_input_tokens_seen": 103223925, + "step": 4784, + "time_per_iteration": 2.683645725250244 + }, + { + "auxiliary_loss_clip": 0.01050917, + "auxiliary_loss_mlp": 0.01046061, + "balance_loss_clip": 1.03090191, + "balance_loss_mlp": 1.02976489, + "epoch": 0.28768976401623325, + "flos": 25373654092800.0, + "grad_norm": 1.4306438342174679, + "language_loss": 0.75582504, + "learning_rate": 3.3429240138387015e-06, + "loss": 0.77679485, + "num_input_tokens_seen": 103244760, + "step": 4785, + "time_per_iteration": 2.8603086471557617 + }, + { + "auxiliary_loss_clip": 0.01096231, + "auxiliary_loss_mlp": 0.01049085, + "balance_loss_clip": 1.03550529, + "balance_loss_mlp": 1.03320622, + "epoch": 0.28774988726890127, + "flos": 30664372982400.0, + "grad_norm": 1.9420157835825311, + "language_loss": 0.8279494, + "learning_rate": 3.3426353820905425e-06, + "loss": 0.84940255, + "num_input_tokens_seen": 103261995, + "step": 4786, + "time_per_iteration": 2.685192823410034 + }, + { + "auxiliary_loss_clip": 0.01068396, + "auxiliary_loss_mlp": 0.00748966, + "balance_loss_clip": 1.0359447, + "balance_loss_mlp": 1.00184894, + "epoch": 0.28781001052156924, + "flos": 20595452411520.0, + "grad_norm": 1.577811975715651, + "language_loss": 0.80244982, + "learning_rate": 3.342346699429516e-06, + "loss": 0.82062346, + "num_input_tokens_seen": 103279780, + "step": 4787, + "time_per_iteration": 2.826174020767212 + }, + { + "auxiliary_loss_clip": 0.01072808, + "auxiliary_loss_mlp": 0.01035849, + "balance_loss_clip": 1.0326978, + "balance_loss_mlp": 1.02020884, + "epoch": 0.2878701337742372, + "flos": 26542330997760.0, + "grad_norm": 1.7090798505156728, + "language_loss": 0.83588725, + "learning_rate": 3.3420579658665677e-06, + "loss": 0.85697389, + "num_input_tokens_seen": 103300580, + "step": 4788, + "time_per_iteration": 2.7023160457611084 + }, + { + "auxiliary_loss_clip": 0.01058994, + "auxiliary_loss_mlp": 0.01044721, + "balance_loss_clip": 1.03546715, + "balance_loss_mlp": 1.02886665, + "epoch": 0.28793025702690517, + "flos": 28146855530880.0, + "grad_norm": 2.03916158553712, + "language_loss": 0.73764414, + "learning_rate": 3.3417691814126468e-06, + "loss": 0.7586813, + "num_input_tokens_seen": 103320430, + "step": 4789, + "time_per_iteration": 3.017040729522705 + }, + { + "auxiliary_loss_clip": 0.01078069, + "auxiliary_loss_mlp": 0.01039767, + "balance_loss_clip": 1.03121495, + "balance_loss_mlp": 1.0248065, + "epoch": 0.28799038027957313, + "flos": 23805471144960.0, + "grad_norm": 1.6245110779627583, + "language_loss": 0.83572912, + "learning_rate": 3.341480346078704e-06, + "loss": 0.85690749, + "num_input_tokens_seen": 103337695, + "step": 4790, + "time_per_iteration": 2.6398587226867676 + }, + { + "auxiliary_loss_clip": 0.01083555, + "auxiliary_loss_mlp": 0.01039643, + "balance_loss_clip": 1.03298926, + "balance_loss_mlp": 1.02394271, + "epoch": 0.2880505035322411, + "flos": 22344122223360.0, + "grad_norm": 1.6947343132159065, + "language_loss": 0.77862239, + "learning_rate": 3.3411914598756922e-06, + "loss": 0.7998544, + "num_input_tokens_seen": 103357010, + "step": 4791, + "time_per_iteration": 2.7249248027801514 + }, + { + "auxiliary_loss_clip": 0.0107443, + "auxiliary_loss_mlp": 0.0103645, + "balance_loss_clip": 1.03321338, + "balance_loss_mlp": 1.02069068, + "epoch": 0.28811062678490906, + "flos": 18004246208640.0, + "grad_norm": 1.666830592065681, + "language_loss": 0.70352006, + "learning_rate": 3.3409025228145654e-06, + "loss": 0.72462887, + "num_input_tokens_seen": 103375600, + "step": 4792, + "time_per_iteration": 2.736982822418213 + }, + { + "auxiliary_loss_clip": 0.01058385, + "auxiliary_loss_mlp": 0.01035958, + "balance_loss_clip": 1.03658819, + "balance_loss_mlp": 1.02018654, + "epoch": 0.28817075003757703, + "flos": 22090880361600.0, + "grad_norm": 2.709510690313526, + "language_loss": 0.79228431, + "learning_rate": 3.3406135349062812e-06, + "loss": 0.81322777, + "num_input_tokens_seen": 103395225, + "step": 4793, + "time_per_iteration": 2.885607957839966 + }, + { + "auxiliary_loss_clip": 0.01070853, + "auxiliary_loss_mlp": 0.01036237, + "balance_loss_clip": 1.0335412, + "balance_loss_mlp": 1.02217066, + "epoch": 0.288230873290245, + "flos": 41683130847360.0, + "grad_norm": 1.657643559018263, + "language_loss": 0.78009975, + "learning_rate": 3.340324496161797e-06, + "loss": 0.80117065, + "num_input_tokens_seen": 103417245, + "step": 4794, + "time_per_iteration": 2.836700677871704 + }, + { + "auxiliary_loss_clip": 0.01082271, + "auxiliary_loss_mlp": 0.01042321, + "balance_loss_clip": 1.03369248, + "balance_loss_mlp": 1.02684176, + "epoch": 0.28829099654291296, + "flos": 18624423456000.0, + "grad_norm": 2.110162789127578, + "language_loss": 0.83052242, + "learning_rate": 3.340035406592074e-06, + "loss": 0.85176831, + "num_input_tokens_seen": 103435500, + "step": 4795, + "time_per_iteration": 2.6271607875823975 + }, + { + "auxiliary_loss_clip": 0.01079758, + "auxiliary_loss_mlp": 0.01040763, + "balance_loss_clip": 1.03356552, + "balance_loss_mlp": 1.02654719, + "epoch": 0.2883511197955809, + "flos": 24674832017280.0, + "grad_norm": 2.1367564962653165, + "language_loss": 0.74361867, + "learning_rate": 3.339746266208074e-06, + "loss": 0.76482391, + "num_input_tokens_seen": 103451040, + "step": 4796, + "time_per_iteration": 2.761896848678589 + }, + { + "auxiliary_loss_clip": 0.01088443, + "auxiliary_loss_mlp": 0.01046368, + "balance_loss_clip": 1.03429937, + "balance_loss_mlp": 1.02928483, + "epoch": 0.2884112430482489, + "flos": 23112143850240.0, + "grad_norm": 1.7689150742037196, + "language_loss": 0.72836041, + "learning_rate": 3.3394570750207614e-06, + "loss": 0.74970853, + "num_input_tokens_seen": 103471330, + "step": 4797, + "time_per_iteration": 2.6759114265441895 + }, + { + "auxiliary_loss_clip": 0.01054052, + "auxiliary_loss_mlp": 0.00748924, + "balance_loss_clip": 1.02981746, + "balance_loss_mlp": 1.00189412, + "epoch": 0.28847136630091685, + "flos": 16873347432960.0, + "grad_norm": 2.43467012154709, + "language_loss": 0.74221218, + "learning_rate": 3.3391678330411017e-06, + "loss": 0.76024199, + "num_input_tokens_seen": 103488060, + "step": 4798, + "time_per_iteration": 2.6981821060180664 + }, + { + "auxiliary_loss_clip": 0.01083506, + "auxiliary_loss_mlp": 0.01039094, + "balance_loss_clip": 1.0309763, + "balance_loss_mlp": 1.02159441, + "epoch": 0.2885314895535849, + "flos": 25657527277440.0, + "grad_norm": 5.722622610402179, + "language_loss": 0.64462733, + "learning_rate": 3.3388785402800642e-06, + "loss": 0.6658532, + "num_input_tokens_seen": 103503600, + "step": 4799, + "time_per_iteration": 2.608215093612671 + }, + { + "auxiliary_loss_clip": 0.01093667, + "auxiliary_loss_mlp": 0.01046289, + "balance_loss_clip": 1.03367996, + "balance_loss_mlp": 1.03082728, + "epoch": 0.28859161280625284, + "flos": 21107251347840.0, + "grad_norm": 1.7795715260768046, + "language_loss": 0.82127601, + "learning_rate": 3.3385891967486178e-06, + "loss": 0.84267557, + "num_input_tokens_seen": 103524195, + "step": 4800, + "time_per_iteration": 2.554352283477783 + }, + { + "auxiliary_loss_clip": 0.01050693, + "auxiliary_loss_mlp": 0.01037723, + "balance_loss_clip": 1.030339, + "balance_loss_mlp": 1.02258325, + "epoch": 0.2886517360589208, + "flos": 26469540086400.0, + "grad_norm": 1.8230173280315987, + "language_loss": 0.91123044, + "learning_rate": 3.3382998024577347e-06, + "loss": 0.93211454, + "num_input_tokens_seen": 103545235, + "step": 4801, + "time_per_iteration": 2.6872074604034424 + }, + { + "auxiliary_loss_clip": 0.01067708, + "auxiliary_loss_mlp": 0.00748966, + "balance_loss_clip": 1.03187346, + "balance_loss_mlp": 1.00188303, + "epoch": 0.28871185931158877, + "flos": 25265275781760.0, + "grad_norm": 2.0061887588774394, + "language_loss": 0.74182665, + "learning_rate": 3.33801035741839e-06, + "loss": 0.75999337, + "num_input_tokens_seen": 103563305, + "step": 4802, + "time_per_iteration": 2.6610753536224365 + }, + { + "auxiliary_loss_clip": 0.00996505, + "auxiliary_loss_mlp": 0.01011003, + "balance_loss_clip": 1.01097965, + "balance_loss_mlp": 1.00888097, + "epoch": 0.28877198256425674, + "flos": 66665431284480.0, + "grad_norm": 0.7833506269779982, + "language_loss": 0.63000888, + "learning_rate": 3.337720861641558e-06, + "loss": 0.65008396, + "num_input_tokens_seen": 103625025, + "step": 4803, + "time_per_iteration": 3.212311029434204 + }, + { + "auxiliary_loss_clip": 0.01034512, + "auxiliary_loss_mlp": 0.01044209, + "balance_loss_clip": 1.02622068, + "balance_loss_mlp": 1.02923644, + "epoch": 0.2888321058169247, + "flos": 20303031790080.0, + "grad_norm": 1.6418856766193624, + "language_loss": 0.70732474, + "learning_rate": 3.3374313151382165e-06, + "loss": 0.72811198, + "num_input_tokens_seen": 103644235, + "step": 4804, + "time_per_iteration": 4.309911489486694 + }, + { + "auxiliary_loss_clip": 0.01082883, + "auxiliary_loss_mlp": 0.0103754, + "balance_loss_clip": 1.03125584, + "balance_loss_mlp": 1.02122068, + "epoch": 0.28889222906959267, + "flos": 25516721963520.0, + "grad_norm": 2.0324858701592468, + "language_loss": 0.6816135, + "learning_rate": 3.337141717919346e-06, + "loss": 0.70281774, + "num_input_tokens_seen": 103664700, + "step": 4805, + "time_per_iteration": 4.378906011581421 + }, + { + "auxiliary_loss_clip": 0.01083227, + "auxiliary_loss_mlp": 0.01039235, + "balance_loss_clip": 1.03333843, + "balance_loss_mlp": 1.02402961, + "epoch": 0.28895235232226063, + "flos": 32671312560000.0, + "grad_norm": 1.5645839984940966, + "language_loss": 0.69385093, + "learning_rate": 3.3368520699959272e-06, + "loss": 0.71507555, + "num_input_tokens_seen": 103686595, + "step": 4806, + "time_per_iteration": 2.7086427211761475 + }, + { + "auxiliary_loss_clip": 0.01067781, + "auxiliary_loss_mlp": 0.01039257, + "balance_loss_clip": 1.03127456, + "balance_loss_mlp": 1.02411771, + "epoch": 0.2890124755749286, + "flos": 29714679342720.0, + "grad_norm": 1.6179261906939304, + "language_loss": 0.71362776, + "learning_rate": 3.3365623713789443e-06, + "loss": 0.73469812, + "num_input_tokens_seen": 103707525, + "step": 4807, + "time_per_iteration": 2.8546323776245117 + }, + { + "auxiliary_loss_clip": 0.01059756, + "auxiliary_loss_mlp": 0.01035488, + "balance_loss_clip": 1.03254533, + "balance_loss_mlp": 1.01995468, + "epoch": 0.28907259882759656, + "flos": 22674464628480.0, + "grad_norm": 1.692421906570613, + "language_loss": 0.81489205, + "learning_rate": 3.336272622079382e-06, + "loss": 0.83584452, + "num_input_tokens_seen": 103727905, + "step": 4808, + "time_per_iteration": 2.715538263320923 + }, + { + "auxiliary_loss_clip": 0.01048625, + "auxiliary_loss_mlp": 0.01041726, + "balance_loss_clip": 1.03050292, + "balance_loss_mlp": 1.02592492, + "epoch": 0.2891327220802645, + "flos": 22566050403840.0, + "grad_norm": 1.744248300855932, + "language_loss": 0.78636086, + "learning_rate": 3.3359828221082276e-06, + "loss": 0.80726445, + "num_input_tokens_seen": 103748335, + "step": 4809, + "time_per_iteration": 2.7081363201141357 + }, + { + "auxiliary_loss_clip": 0.01041352, + "auxiliary_loss_mlp": 0.01043643, + "balance_loss_clip": 1.02772582, + "balance_loss_mlp": 1.02645278, + "epoch": 0.2891928453329325, + "flos": 21652806090240.0, + "grad_norm": 1.8288922530728595, + "language_loss": 0.78849745, + "learning_rate": 3.3356929714764714e-06, + "loss": 0.80934739, + "num_input_tokens_seen": 103767020, + "step": 4810, + "time_per_iteration": 2.7660465240478516 + }, + { + "auxiliary_loss_clip": 0.01040741, + "auxiliary_loss_mlp": 0.01036988, + "balance_loss_clip": 1.02890062, + "balance_loss_mlp": 1.02232528, + "epoch": 0.28925296858560046, + "flos": 23222102359680.0, + "grad_norm": 1.7438639976172814, + "language_loss": 0.76963288, + "learning_rate": 3.3354030701951032e-06, + "loss": 0.79041016, + "num_input_tokens_seen": 103786355, + "step": 4811, + "time_per_iteration": 2.7552945613861084 + }, + { + "auxiliary_loss_clip": 0.01081336, + "auxiliary_loss_mlp": 0.01041988, + "balance_loss_clip": 1.03344131, + "balance_loss_mlp": 1.02634752, + "epoch": 0.2893130918382685, + "flos": 28621666437120.0, + "grad_norm": 1.3610763487215638, + "language_loss": 0.77402502, + "learning_rate": 3.335113118275117e-06, + "loss": 0.79525828, + "num_input_tokens_seen": 103809345, + "step": 4812, + "time_per_iteration": 2.7419629096984863 + }, + { + "auxiliary_loss_clip": 0.0101348, + "auxiliary_loss_mlp": 0.01028344, + "balance_loss_clip": 1.03076267, + "balance_loss_mlp": 1.02522027, + "epoch": 0.28937321509093644, + "flos": 72301288982400.0, + "grad_norm": 0.8649027017364089, + "language_loss": 0.60185558, + "learning_rate": 3.3348231157275085e-06, + "loss": 0.6222738, + "num_input_tokens_seen": 103871180, + "step": 4813, + "time_per_iteration": 6.6170573234558105 + }, + { + "auxiliary_loss_clip": 0.01047653, + "auxiliary_loss_mlp": 0.01039643, + "balance_loss_clip": 1.03061795, + "balance_loss_mlp": 1.02384806, + "epoch": 0.2894333383436044, + "flos": 16216397637120.0, + "grad_norm": 1.926811954257924, + "language_loss": 0.81909293, + "learning_rate": 3.3345330625632725e-06, + "loss": 0.83996582, + "num_input_tokens_seen": 103889040, + "step": 4814, + "time_per_iteration": 2.6569392681121826 + }, + { + "auxiliary_loss_clip": 0.01054546, + "auxiliary_loss_mlp": 0.01047353, + "balance_loss_clip": 1.03362274, + "balance_loss_mlp": 1.03199899, + "epoch": 0.2894934615962724, + "flos": 24828278918400.0, + "grad_norm": 1.5889161900370568, + "language_loss": 0.72433096, + "learning_rate": 3.3342429587934094e-06, + "loss": 0.74535, + "num_input_tokens_seen": 103910380, + "step": 4815, + "time_per_iteration": 2.732254981994629 + }, + { + "auxiliary_loss_clip": 0.01077991, + "auxiliary_loss_mlp": 0.01046885, + "balance_loss_clip": 1.03231478, + "balance_loss_mlp": 1.03313434, + "epoch": 0.28955358484894034, + "flos": 20449978329600.0, + "grad_norm": 1.9401032476279858, + "language_loss": 0.70237863, + "learning_rate": 3.3339528044289198e-06, + "loss": 0.72362745, + "num_input_tokens_seen": 103929955, + "step": 4816, + "time_per_iteration": 2.6800966262817383 + }, + { + "auxiliary_loss_clip": 0.01072448, + "auxiliary_loss_mlp": 0.01052018, + "balance_loss_clip": 1.03450179, + "balance_loss_mlp": 1.03541815, + "epoch": 0.2896137081016083, + "flos": 22565188477440.0, + "grad_norm": 1.9672128209793176, + "language_loss": 0.74161804, + "learning_rate": 3.3336625994808055e-06, + "loss": 0.76286268, + "num_input_tokens_seen": 103948020, + "step": 4817, + "time_per_iteration": 2.681483745574951 + }, + { + "auxiliary_loss_clip": 0.01062505, + "auxiliary_loss_mlp": 0.0105042, + "balance_loss_clip": 1.03347147, + "balance_loss_mlp": 1.03420734, + "epoch": 0.28967383135427627, + "flos": 26687948734080.0, + "grad_norm": 1.7479355536326973, + "language_loss": 0.76296216, + "learning_rate": 3.3333723439600723e-06, + "loss": 0.78409147, + "num_input_tokens_seen": 103968740, + "step": 4818, + "time_per_iteration": 2.7224464416503906 + }, + { + "auxiliary_loss_clip": 0.01037439, + "auxiliary_loss_mlp": 0.01041786, + "balance_loss_clip": 1.03404689, + "balance_loss_mlp": 1.02609825, + "epoch": 0.28973395460694423, + "flos": 15558262692480.0, + "grad_norm": 1.6897803237087188, + "language_loss": 0.795991, + "learning_rate": 3.3330820378777263e-06, + "loss": 0.81678331, + "num_input_tokens_seen": 103986005, + "step": 4819, + "time_per_iteration": 2.6817736625671387 + }, + { + "auxiliary_loss_clip": 0.01067775, + "auxiliary_loss_mlp": 0.01046343, + "balance_loss_clip": 1.03568029, + "balance_loss_mlp": 1.02971339, + "epoch": 0.2897940778596122, + "flos": 18697465762560.0, + "grad_norm": 1.7867263987259459, + "language_loss": 0.79057479, + "learning_rate": 3.332791681244776e-06, + "loss": 0.81171596, + "num_input_tokens_seen": 104005070, + "step": 4820, + "time_per_iteration": 2.596700668334961 + }, + { + "auxiliary_loss_clip": 0.01047949, + "auxiliary_loss_mlp": 0.01040818, + "balance_loss_clip": 1.03051817, + "balance_loss_mlp": 1.02517819, + "epoch": 0.28985420111228016, + "flos": 18770292587520.0, + "grad_norm": 1.8911866124502286, + "language_loss": 0.72462511, + "learning_rate": 3.332501274072231e-06, + "loss": 0.74551278, + "num_input_tokens_seen": 104022945, + "step": 4821, + "time_per_iteration": 2.694695234298706 + }, + { + "auxiliary_loss_clip": 0.01080411, + "auxiliary_loss_mlp": 0.01040969, + "balance_loss_clip": 1.03190541, + "balance_loss_mlp": 1.02524519, + "epoch": 0.28991432436494813, + "flos": 23069840607360.0, + "grad_norm": 1.7601906566631282, + "language_loss": 0.72637045, + "learning_rate": 3.332210816371104e-06, + "loss": 0.74758422, + "num_input_tokens_seen": 104042080, + "step": 4822, + "time_per_iteration": 2.748534679412842 + }, + { + "auxiliary_loss_clip": 0.01081558, + "auxiliary_loss_mlp": 0.01046628, + "balance_loss_clip": 1.03429103, + "balance_loss_mlp": 1.03139329, + "epoch": 0.2899744476176161, + "flos": 17603195880960.0, + "grad_norm": 2.7176270411898806, + "language_loss": 0.66607189, + "learning_rate": 3.3319203081524102e-06, + "loss": 0.68735373, + "num_input_tokens_seen": 104060975, + "step": 4823, + "time_per_iteration": 2.769700527191162 + }, + { + "auxiliary_loss_clip": 0.01055943, + "auxiliary_loss_mlp": 0.01039879, + "balance_loss_clip": 1.02811384, + "balance_loss_mlp": 1.024966, + "epoch": 0.29003457087028406, + "flos": 22309360836480.0, + "grad_norm": 2.0507777453230664, + "language_loss": 0.80888152, + "learning_rate": 3.331629749427164e-06, + "loss": 0.82983977, + "num_input_tokens_seen": 104081395, + "step": 4824, + "time_per_iteration": 2.877429246902466 + }, + { + "auxiliary_loss_clip": 0.01093346, + "auxiliary_loss_mlp": 0.01044672, + "balance_loss_clip": 1.03322053, + "balance_loss_mlp": 1.02890658, + "epoch": 0.2900946941229521, + "flos": 21944975316480.0, + "grad_norm": 3.642285967067947, + "language_loss": 0.72450912, + "learning_rate": 3.331339140206385e-06, + "loss": 0.74588925, + "num_input_tokens_seen": 104099995, + "step": 4825, + "time_per_iteration": 2.696948528289795 + }, + { + "auxiliary_loss_clip": 0.01096265, + "auxiliary_loss_mlp": 0.01039181, + "balance_loss_clip": 1.03617895, + "balance_loss_mlp": 1.02352929, + "epoch": 0.29015481737562004, + "flos": 17932173569280.0, + "grad_norm": 2.0750947140009894, + "language_loss": 0.73419285, + "learning_rate": 3.331048480501092e-06, + "loss": 0.75554734, + "num_input_tokens_seen": 104118930, + "step": 4826, + "time_per_iteration": 2.705106735229492 + }, + { + "auxiliary_loss_clip": 0.01080132, + "auxiliary_loss_mlp": 0.01039731, + "balance_loss_clip": 1.03194308, + "balance_loss_mlp": 1.02537787, + "epoch": 0.290214940628288, + "flos": 22783525297920.0, + "grad_norm": 1.8870855499633261, + "language_loss": 0.68773609, + "learning_rate": 3.3307577703223073e-06, + "loss": 0.70893466, + "num_input_tokens_seen": 104136940, + "step": 4827, + "time_per_iteration": 2.6996257305145264 + }, + { + "auxiliary_loss_clip": 0.01071886, + "auxiliary_loss_mlp": 0.01040236, + "balance_loss_clip": 1.03196764, + "balance_loss_mlp": 1.02380919, + "epoch": 0.290275063880956, + "flos": 20006481104640.0, + "grad_norm": 2.108760883507074, + "language_loss": 0.8028602, + "learning_rate": 3.3304670096810545e-06, + "loss": 0.8239814, + "num_input_tokens_seen": 104154280, + "step": 4828, + "time_per_iteration": 2.6755049228668213 + }, + { + "auxiliary_loss_clip": 0.01090671, + "auxiliary_loss_mlp": 0.01044802, + "balance_loss_clip": 1.03419685, + "balance_loss_mlp": 1.02989507, + "epoch": 0.29033518713362394, + "flos": 22053605022720.0, + "grad_norm": 1.8221471067649502, + "language_loss": 0.8045187, + "learning_rate": 3.33017619858836e-06, + "loss": 0.82587349, + "num_input_tokens_seen": 104172605, + "step": 4829, + "time_per_iteration": 2.8154327869415283 + }, + { + "auxiliary_loss_clip": 0.01063158, + "auxiliary_loss_mlp": 0.01045276, + "balance_loss_clip": 1.03247261, + "balance_loss_mlp": 1.03017783, + "epoch": 0.2903953103862919, + "flos": 25630056351360.0, + "grad_norm": 1.6641119761657104, + "language_loss": 0.82849914, + "learning_rate": 3.329885337055249e-06, + "loss": 0.84958345, + "num_input_tokens_seen": 104194120, + "step": 4830, + "time_per_iteration": 2.7990639209747314 + }, + { + "auxiliary_loss_clip": 0.0108177, + "auxiliary_loss_mlp": 0.01045459, + "balance_loss_clip": 1.03237164, + "balance_loss_mlp": 1.0304271, + "epoch": 0.29045543363895987, + "flos": 16945851035520.0, + "grad_norm": 2.482450500478426, + "language_loss": 0.7922585, + "learning_rate": 3.3295944250927546e-06, + "loss": 0.8135308, + "num_input_tokens_seen": 104210875, + "step": 4831, + "time_per_iteration": 2.7463557720184326 + }, + { + "auxiliary_loss_clip": 0.01087876, + "auxiliary_loss_mlp": 0.01045412, + "balance_loss_clip": 1.03228855, + "balance_loss_mlp": 1.03099394, + "epoch": 0.29051555689162784, + "flos": 26395492199040.0, + "grad_norm": 1.735016935591219, + "language_loss": 0.74452144, + "learning_rate": 3.3293034627119055e-06, + "loss": 0.76585436, + "num_input_tokens_seen": 104229875, + "step": 4832, + "time_per_iteration": 2.60066819190979 + }, + { + "auxiliary_loss_clip": 0.01066881, + "auxiliary_loss_mlp": 0.01031121, + "balance_loss_clip": 1.03016448, + "balance_loss_mlp": 1.01836598, + "epoch": 0.2905756801442958, + "flos": 21103875469440.0, + "grad_norm": 1.9212796151015652, + "language_loss": 0.75786901, + "learning_rate": 3.329012449923736e-06, + "loss": 0.77884901, + "num_input_tokens_seen": 104250405, + "step": 4833, + "time_per_iteration": 2.7960727214813232 + }, + { + "auxiliary_loss_clip": 0.01055647, + "auxiliary_loss_mlp": 0.01036837, + "balance_loss_clip": 1.02912748, + "balance_loss_mlp": 1.0221144, + "epoch": 0.29063580339696377, + "flos": 15706071158400.0, + "grad_norm": 2.4498528088162774, + "language_loss": 0.64659166, + "learning_rate": 3.3287213867392813e-06, + "loss": 0.66751653, + "num_input_tokens_seen": 104269185, + "step": 4834, + "time_per_iteration": 2.6408979892730713 + }, + { + "auxiliary_loss_clip": 0.01063717, + "auxiliary_loss_mlp": 0.01029763, + "balance_loss_clip": 1.03113472, + "balance_loss_mlp": 1.01648271, + "epoch": 0.29069592664963173, + "flos": 24644990793600.0, + "grad_norm": 1.5146478204510319, + "language_loss": 0.71697307, + "learning_rate": 3.3284302731695783e-06, + "loss": 0.73790789, + "num_input_tokens_seen": 104289400, + "step": 4835, + "time_per_iteration": 2.699023723602295 + }, + { + "auxiliary_loss_clip": 0.01059138, + "auxiliary_loss_mlp": 0.01034925, + "balance_loss_clip": 1.0288403, + "balance_loss_mlp": 1.02106047, + "epoch": 0.2907560499022997, + "flos": 24973753000320.0, + "grad_norm": 2.033276118277871, + "language_loss": 0.79689145, + "learning_rate": 3.3281391092256668e-06, + "loss": 0.81783211, + "num_input_tokens_seen": 104310485, + "step": 4836, + "time_per_iteration": 2.7608871459960938 + }, + { + "auxiliary_loss_clip": 0.01049028, + "auxiliary_loss_mlp": 0.01043678, + "balance_loss_clip": 1.02901578, + "balance_loss_mlp": 1.02776361, + "epoch": 0.29081617315496766, + "flos": 18657496903680.0, + "grad_norm": 1.7613168339864942, + "language_loss": 0.80865657, + "learning_rate": 3.3278478949185865e-06, + "loss": 0.82958359, + "num_input_tokens_seen": 104327330, + "step": 4837, + "time_per_iteration": 2.6905314922332764 + }, + { + "auxiliary_loss_clip": 0.0106772, + "auxiliary_loss_mlp": 0.0103569, + "balance_loss_clip": 1.02982736, + "balance_loss_mlp": 1.02099764, + "epoch": 0.2908762964076356, + "flos": 35331035955840.0, + "grad_norm": 2.3237228561127523, + "language_loss": 0.67451984, + "learning_rate": 3.327556630259381e-06, + "loss": 0.69555396, + "num_input_tokens_seen": 104350350, + "step": 4838, + "time_per_iteration": 2.749419689178467 + }, + { + "auxiliary_loss_clip": 0.01095116, + "auxiliary_loss_mlp": 0.00748917, + "balance_loss_clip": 1.03505754, + "balance_loss_mlp": 1.00184536, + "epoch": 0.29093641966030365, + "flos": 23076305055360.0, + "grad_norm": 1.7144371533954803, + "language_loss": 0.71298873, + "learning_rate": 3.327265315259095e-06, + "loss": 0.7314291, + "num_input_tokens_seen": 104369995, + "step": 4839, + "time_per_iteration": 2.68534255027771 + }, + { + "auxiliary_loss_clip": 0.01087619, + "auxiliary_loss_mlp": 0.01033993, + "balance_loss_clip": 1.03053892, + "balance_loss_mlp": 1.02021277, + "epoch": 0.2909965429129716, + "flos": 35955415094400.0, + "grad_norm": 1.9445448404465526, + "language_loss": 0.75788295, + "learning_rate": 3.326973949928776e-06, + "loss": 0.77909911, + "num_input_tokens_seen": 104392285, + "step": 4840, + "time_per_iteration": 2.6032447814941406 + }, + { + "auxiliary_loss_clip": 0.01045449, + "auxiliary_loss_mlp": 0.01038844, + "balance_loss_clip": 1.02924347, + "balance_loss_mlp": 1.02411008, + "epoch": 0.2910566661656396, + "flos": 30880231764480.0, + "grad_norm": 2.450437548119884, + "language_loss": 0.60557413, + "learning_rate": 3.326682534279471e-06, + "loss": 0.6264171, + "num_input_tokens_seen": 104412640, + "step": 4841, + "time_per_iteration": 2.7213101387023926 + }, + { + "auxiliary_loss_clip": 0.0106868, + "auxiliary_loss_mlp": 0.01034886, + "balance_loss_clip": 1.03232598, + "balance_loss_mlp": 1.0197587, + "epoch": 0.29111678941830754, + "flos": 30010188533760.0, + "grad_norm": 2.184830938170319, + "language_loss": 0.71449769, + "learning_rate": 3.326391068322232e-06, + "loss": 0.73553336, + "num_input_tokens_seen": 104435245, + "step": 4842, + "time_per_iteration": 2.766378879547119 + }, + { + "auxiliary_loss_clip": 0.01074517, + "auxiliary_loss_mlp": 0.01032854, + "balance_loss_clip": 1.0298506, + "balance_loss_mlp": 1.01937175, + "epoch": 0.2911769126709755, + "flos": 22857393617280.0, + "grad_norm": 1.653958687920935, + "language_loss": 0.73171723, + "learning_rate": 3.3260995520681098e-06, + "loss": 0.75279093, + "num_input_tokens_seen": 104455395, + "step": 4843, + "time_per_iteration": 2.606640338897705 + }, + { + "auxiliary_loss_clip": 0.0104406, + "auxiliary_loss_mlp": 0.01039363, + "balance_loss_clip": 1.03219533, + "balance_loss_mlp": 1.02458155, + "epoch": 0.2912370359236435, + "flos": 21650507619840.0, + "grad_norm": 2.423805126699606, + "language_loss": 0.5836798, + "learning_rate": 3.3258079855281602e-06, + "loss": 0.604514, + "num_input_tokens_seen": 104473350, + "step": 4844, + "time_per_iteration": 2.678447723388672 + }, + { + "auxiliary_loss_clip": 0.01083576, + "auxiliary_loss_mlp": 0.01037517, + "balance_loss_clip": 1.03495824, + "balance_loss_mlp": 1.0214479, + "epoch": 0.29129715917631144, + "flos": 22893340152960.0, + "grad_norm": 2.003595441055099, + "language_loss": 0.86606181, + "learning_rate": 3.3255163687134396e-06, + "loss": 0.88727272, + "num_input_tokens_seen": 104492265, + "step": 4845, + "time_per_iteration": 2.6312012672424316 + }, + { + "auxiliary_loss_clip": 0.01071607, + "auxiliary_loss_mlp": 0.01054918, + "balance_loss_clip": 1.03416634, + "balance_loss_mlp": 1.03901589, + "epoch": 0.2913572824289794, + "flos": 22674464628480.0, + "grad_norm": 1.6625122287044527, + "language_loss": 0.67178154, + "learning_rate": 3.3252247016350046e-06, + "loss": 0.69304681, + "num_input_tokens_seen": 104510755, + "step": 4846, + "time_per_iteration": 2.7009317874908447 + }, + { + "auxiliary_loss_clip": 0.01066045, + "auxiliary_loss_mlp": 0.01036657, + "balance_loss_clip": 1.03210926, + "balance_loss_mlp": 1.02256036, + "epoch": 0.29141740568164737, + "flos": 23107403255040.0, + "grad_norm": 1.9042813723585836, + "language_loss": 0.70275784, + "learning_rate": 3.3249329843039166e-06, + "loss": 0.7237848, + "num_input_tokens_seen": 104530830, + "step": 4847, + "time_per_iteration": 2.7776763439178467 + }, + { + "auxiliary_loss_clip": 0.01078283, + "auxiliary_loss_mlp": 0.01032824, + "balance_loss_clip": 1.03208494, + "balance_loss_mlp": 1.01806569, + "epoch": 0.29147752893431533, + "flos": 23587026583680.0, + "grad_norm": 1.4537952642221166, + "language_loss": 0.73825318, + "learning_rate": 3.324641216731237e-06, + "loss": 0.75936425, + "num_input_tokens_seen": 104550115, + "step": 4848, + "time_per_iteration": 2.7736823558807373 + }, + { + "auxiliary_loss_clip": 0.0107089, + "auxiliary_loss_mlp": 0.01041978, + "balance_loss_clip": 1.02971148, + "balance_loss_mlp": 1.02558661, + "epoch": 0.2915376521869833, + "flos": 20591968792320.0, + "grad_norm": 1.9857340878809906, + "language_loss": 0.76955462, + "learning_rate": 3.3243493989280295e-06, + "loss": 0.79068327, + "num_input_tokens_seen": 104566255, + "step": 4849, + "time_per_iteration": 2.699848175048828 + }, + { + "auxiliary_loss_clip": 0.01070512, + "auxiliary_loss_mlp": 0.01039415, + "balance_loss_clip": 1.03082108, + "balance_loss_mlp": 1.02378654, + "epoch": 0.29159777543965126, + "flos": 20811490761600.0, + "grad_norm": 2.073275687598651, + "language_loss": 0.78352249, + "learning_rate": 3.3240575309053596e-06, + "loss": 0.80462176, + "num_input_tokens_seen": 104585235, + "step": 4850, + "time_per_iteration": 2.8594655990600586 + }, + { + "auxiliary_loss_clip": 0.01066863, + "auxiliary_loss_mlp": 0.01034809, + "balance_loss_clip": 1.03137529, + "balance_loss_mlp": 1.01938367, + "epoch": 0.29165789869231923, + "flos": 24244155947520.0, + "grad_norm": 1.686447911336226, + "language_loss": 0.76262426, + "learning_rate": 3.323765612674296e-06, + "loss": 0.78364098, + "num_input_tokens_seen": 104605315, + "step": 4851, + "time_per_iteration": 4.283161401748657 + }, + { + "auxiliary_loss_clip": 0.01075546, + "auxiliary_loss_mlp": 0.01043636, + "balance_loss_clip": 1.03107631, + "balance_loss_mlp": 1.02983189, + "epoch": 0.29171802194498725, + "flos": 28949925853440.0, + "grad_norm": 3.5124029511374855, + "language_loss": 0.77407789, + "learning_rate": 3.3234736442459078e-06, + "loss": 0.79526973, + "num_input_tokens_seen": 104626055, + "step": 4852, + "time_per_iteration": 4.237766981124878 + }, + { + "auxiliary_loss_clip": 0.01068057, + "auxiliary_loss_mlp": 0.01039099, + "balance_loss_clip": 1.03125072, + "balance_loss_mlp": 1.02397168, + "epoch": 0.2917781451976552, + "flos": 22598226011520.0, + "grad_norm": 2.0068192038608044, + "language_loss": 0.78426164, + "learning_rate": 3.3231816256312665e-06, + "loss": 0.80533314, + "num_input_tokens_seen": 104646005, + "step": 4853, + "time_per_iteration": 2.8271021842956543 + }, + { + "auxiliary_loss_clip": 0.01061002, + "auxiliary_loss_mlp": 0.01035062, + "balance_loss_clip": 1.03256178, + "balance_loss_mlp": 1.02045929, + "epoch": 0.2918382684503232, + "flos": 21574448570880.0, + "grad_norm": 2.14017682055484, + "language_loss": 0.88299024, + "learning_rate": 3.322889556841445e-06, + "loss": 0.90395093, + "num_input_tokens_seen": 104661620, + "step": 4854, + "time_per_iteration": 2.8715994358062744 + }, + { + "auxiliary_loss_clip": 0.0107949, + "auxiliary_loss_mlp": 0.0104724, + "balance_loss_clip": 1.03231001, + "balance_loss_mlp": 1.02944219, + "epoch": 0.29189839170299114, + "flos": 24353503925760.0, + "grad_norm": 1.9769469059364386, + "language_loss": 0.86762369, + "learning_rate": 3.322597437887519e-06, + "loss": 0.88889098, + "num_input_tokens_seen": 104681445, + "step": 4855, + "time_per_iteration": 2.7254574298858643 + }, + { + "auxiliary_loss_clip": 0.0100555, + "auxiliary_loss_mlp": 0.01028364, + "balance_loss_clip": 1.00536919, + "balance_loss_mlp": 1.0265044, + "epoch": 0.2919585149556591, + "flos": 71316726215040.0, + "grad_norm": 0.8075472869707984, + "language_loss": 0.60251611, + "learning_rate": 3.322305268780566e-06, + "loss": 0.62285525, + "num_input_tokens_seen": 104747945, + "step": 4856, + "time_per_iteration": 3.338771343231201 + }, + { + "auxiliary_loss_clip": 0.010625, + "auxiliary_loss_mlp": 0.00748975, + "balance_loss_clip": 1.0296545, + "balance_loss_mlp": 1.00193644, + "epoch": 0.2920186382083271, + "flos": 15633208419840.0, + "grad_norm": 1.7202434549270136, + "language_loss": 0.68375862, + "learning_rate": 3.322013049531664e-06, + "loss": 0.7018733, + "num_input_tokens_seen": 104766225, + "step": 4857, + "time_per_iteration": 2.6163034439086914 + }, + { + "auxiliary_loss_clip": 0.01073829, + "auxiliary_loss_mlp": 0.0074878, + "balance_loss_clip": 1.03070045, + "balance_loss_mlp": 1.00188053, + "epoch": 0.29207876146099504, + "flos": 28366018364160.0, + "grad_norm": 1.989044472920746, + "language_loss": 0.83580351, + "learning_rate": 3.321720780151895e-06, + "loss": 0.85402966, + "num_input_tokens_seen": 104785345, + "step": 4858, + "time_per_iteration": 2.7875590324401855 + }, + { + "auxiliary_loss_clip": 0.01089088, + "auxiliary_loss_mlp": 0.0103946, + "balance_loss_clip": 1.03254783, + "balance_loss_mlp": 1.02477324, + "epoch": 0.292138884713663, + "flos": 21870963342720.0, + "grad_norm": 1.9108408513406843, + "language_loss": 0.77351922, + "learning_rate": 3.321428460652342e-06, + "loss": 0.79480469, + "num_input_tokens_seen": 104804560, + "step": 4859, + "time_per_iteration": 2.6140241622924805 + }, + { + "auxiliary_loss_clip": 0.01053438, + "auxiliary_loss_mlp": 0.0103794, + "balance_loss_clip": 1.03243494, + "balance_loss_mlp": 1.02184701, + "epoch": 0.29219900796633097, + "flos": 20992552243200.0, + "grad_norm": 1.999780363856297, + "language_loss": 0.68476462, + "learning_rate": 3.3211360910440885e-06, + "loss": 0.7056784, + "num_input_tokens_seen": 104821105, + "step": 4860, + "time_per_iteration": 4.406907320022583 + }, + { + "auxiliary_loss_clip": 0.01067492, + "auxiliary_loss_mlp": 0.01040806, + "balance_loss_clip": 1.03284431, + "balance_loss_mlp": 1.02759719, + "epoch": 0.29225913121899894, + "flos": 35004608133120.0, + "grad_norm": 2.166049961085204, + "language_loss": 0.75286233, + "learning_rate": 3.320843671338222e-06, + "loss": 0.77394533, + "num_input_tokens_seen": 104841440, + "step": 4861, + "time_per_iteration": 4.2695887088775635 + }, + { + "auxiliary_loss_clip": 0.01074403, + "auxiliary_loss_mlp": 0.01040768, + "balance_loss_clip": 1.03035724, + "balance_loss_mlp": 1.02734494, + "epoch": 0.2923192544716669, + "flos": 13515663888000.0, + "grad_norm": 1.768295980816518, + "language_loss": 0.91378373, + "learning_rate": 3.320551201545832e-06, + "loss": 0.93493539, + "num_input_tokens_seen": 104858210, + "step": 4862, + "time_per_iteration": 2.5616345405578613 + }, + { + "auxiliary_loss_clip": 0.01077973, + "auxiliary_loss_mlp": 0.01035155, + "balance_loss_clip": 1.03152418, + "balance_loss_mlp": 1.02156532, + "epoch": 0.29237937772433487, + "flos": 19463512141440.0, + "grad_norm": 1.9806349191096353, + "language_loss": 0.73497766, + "learning_rate": 3.320258681678008e-06, + "loss": 0.75610894, + "num_input_tokens_seen": 104875620, + "step": 4863, + "time_per_iteration": 2.554689407348633 + }, + { + "auxiliary_loss_clip": 0.01010327, + "auxiliary_loss_mlp": 0.01036038, + "balance_loss_clip": 1.02735472, + "balance_loss_mlp": 1.02248418, + "epoch": 0.29243950097700283, + "flos": 20850597694080.0, + "grad_norm": 1.839005504218704, + "language_loss": 0.77822506, + "learning_rate": 3.319966111745842e-06, + "loss": 0.79868871, + "num_input_tokens_seen": 104894600, + "step": 4864, + "time_per_iteration": 2.933803081512451 + }, + { + "auxiliary_loss_clip": 0.01046815, + "auxiliary_loss_mlp": 0.01041978, + "balance_loss_clip": 1.02981663, + "balance_loss_mlp": 1.02575338, + "epoch": 0.29249962422967085, + "flos": 23584225322880.0, + "grad_norm": 1.5004078171119732, + "language_loss": 0.82016098, + "learning_rate": 3.319673491760429e-06, + "loss": 0.84104896, + "num_input_tokens_seen": 104914530, + "step": 4865, + "time_per_iteration": 2.6886444091796875 + }, + { + "auxiliary_loss_clip": 0.01036412, + "auxiliary_loss_mlp": 0.01041469, + "balance_loss_clip": 1.03282452, + "balance_loss_mlp": 1.02572155, + "epoch": 0.2925597474823388, + "flos": 22273342473600.0, + "grad_norm": 1.9051502470974508, + "language_loss": 0.85134941, + "learning_rate": 3.3193808217328645e-06, + "loss": 0.87212825, + "num_input_tokens_seen": 104933460, + "step": 4866, + "time_per_iteration": 2.790393829345703 + }, + { + "auxiliary_loss_clip": 0.01055021, + "auxiliary_loss_mlp": 0.01031072, + "balance_loss_clip": 1.02827764, + "balance_loss_mlp": 1.01727927, + "epoch": 0.2926198707350068, + "flos": 34456108475520.0, + "grad_norm": 1.732524719504866, + "language_loss": 0.75587702, + "learning_rate": 3.3190881016742476e-06, + "loss": 0.77673805, + "num_input_tokens_seen": 104954495, + "step": 4867, + "time_per_iteration": 2.7707602977752686 + }, + { + "auxiliary_loss_clip": 0.0103566, + "auxiliary_loss_mlp": 0.01039225, + "balance_loss_clip": 1.02948022, + "balance_loss_mlp": 1.02443111, + "epoch": 0.29267999398767475, + "flos": 20704153944960.0, + "grad_norm": 1.800763684666031, + "language_loss": 0.73130333, + "learning_rate": 3.3187953315956776e-06, + "loss": 0.75205219, + "num_input_tokens_seen": 104971915, + "step": 4868, + "time_per_iteration": 2.834205150604248 + }, + { + "auxiliary_loss_clip": 0.01036156, + "auxiliary_loss_mlp": 0.01034879, + "balance_loss_clip": 1.02772844, + "balance_loss_mlp": 1.02026999, + "epoch": 0.2927401172403427, + "flos": 18368667642240.0, + "grad_norm": 1.325938463681747, + "language_loss": 0.74864662, + "learning_rate": 3.3185025115082566e-06, + "loss": 0.76935697, + "num_input_tokens_seen": 104991335, + "step": 4869, + "time_per_iteration": 2.728809118270874 + }, + { + "auxiliary_loss_clip": 0.01058861, + "auxiliary_loss_mlp": 0.01033256, + "balance_loss_clip": 1.02934444, + "balance_loss_mlp": 1.01854587, + "epoch": 0.2928002404930107, + "flos": 26104041244800.0, + "grad_norm": 2.5049044758541474, + "language_loss": 0.76498175, + "learning_rate": 3.318209641423088e-06, + "loss": 0.78590292, + "num_input_tokens_seen": 105012015, + "step": 4870, + "time_per_iteration": 2.9737939834594727 + }, + { + "auxiliary_loss_clip": 0.01077309, + "auxiliary_loss_mlp": 0.01041696, + "balance_loss_clip": 1.03242946, + "balance_loss_mlp": 1.02634168, + "epoch": 0.29286036374567864, + "flos": 21324726241920.0, + "grad_norm": 2.10341763486318, + "language_loss": 0.68003798, + "learning_rate": 3.3179167213512777e-06, + "loss": 0.70122802, + "num_input_tokens_seen": 105031460, + "step": 4871, + "time_per_iteration": 2.7335288524627686 + }, + { + "auxiliary_loss_clip": 0.01054591, + "auxiliary_loss_mlp": 0.01037312, + "balance_loss_clip": 1.0251925, + "balance_loss_mlp": 1.02252996, + "epoch": 0.2929204869983466, + "flos": 29569492569600.0, + "grad_norm": 2.304408087906331, + "language_loss": 0.77229637, + "learning_rate": 3.317623751303933e-06, + "loss": 0.79321539, + "num_input_tokens_seen": 105052965, + "step": 4872, + "time_per_iteration": 2.748267889022827 + }, + { + "auxiliary_loss_clip": 0.01027401, + "auxiliary_loss_mlp": 0.01038657, + "balance_loss_clip": 1.02774835, + "balance_loss_mlp": 1.0228852, + "epoch": 0.2929806102510146, + "flos": 19058259922560.0, + "grad_norm": 1.9882291587155454, + "language_loss": 0.72649086, + "learning_rate": 3.317330731292164e-06, + "loss": 0.74715137, + "num_input_tokens_seen": 105071840, + "step": 4873, + "time_per_iteration": 2.7530713081359863 + }, + { + "auxiliary_loss_clip": 0.01079506, + "auxiliary_loss_mlp": 0.01034144, + "balance_loss_clip": 1.03065479, + "balance_loss_mlp": 1.01918888, + "epoch": 0.29304073350368254, + "flos": 21944221130880.0, + "grad_norm": 1.8134230000989269, + "language_loss": 0.78098601, + "learning_rate": 3.3170376613270812e-06, + "loss": 0.80212247, + "num_input_tokens_seen": 105089445, + "step": 4874, + "time_per_iteration": 2.688023805618286 + }, + { + "auxiliary_loss_clip": 0.0104463, + "auxiliary_loss_mlp": 0.01043144, + "balance_loss_clip": 1.03289366, + "balance_loss_mlp": 1.0271579, + "epoch": 0.2931008567563505, + "flos": 15450818135040.0, + "grad_norm": 1.9356893578799892, + "language_loss": 0.77399701, + "learning_rate": 3.3167445414197985e-06, + "loss": 0.79487479, + "num_input_tokens_seen": 105106210, + "step": 4875, + "time_per_iteration": 2.744852066040039 + }, + { + "auxiliary_loss_clip": 0.01083497, + "auxiliary_loss_mlp": 0.01033209, + "balance_loss_clip": 1.03481102, + "balance_loss_mlp": 1.01855779, + "epoch": 0.29316098000901847, + "flos": 16983162288000.0, + "grad_norm": 1.5377198769729283, + "language_loss": 0.68837154, + "learning_rate": 3.316451371581431e-06, + "loss": 0.70953864, + "num_input_tokens_seen": 105124200, + "step": 4876, + "time_per_iteration": 2.5977165699005127 + }, + { + "auxiliary_loss_clip": 0.01066528, + "auxiliary_loss_mlp": 0.01040102, + "balance_loss_clip": 1.02958012, + "balance_loss_mlp": 1.02539778, + "epoch": 0.29322110326168643, + "flos": 16357705741440.0, + "grad_norm": 1.9627179834884065, + "language_loss": 0.8147639, + "learning_rate": 3.316158151823096e-06, + "loss": 0.83583021, + "num_input_tokens_seen": 105140400, + "step": 4877, + "time_per_iteration": 2.558040142059326 + }, + { + "auxiliary_loss_clip": 0.01081603, + "auxiliary_loss_mlp": 0.01040776, + "balance_loss_clip": 1.03278136, + "balance_loss_mlp": 1.02576756, + "epoch": 0.29328122651435445, + "flos": 13990869843840.0, + "grad_norm": 1.7681345344721318, + "language_loss": 0.67711997, + "learning_rate": 3.315864882155911e-06, + "loss": 0.69834375, + "num_input_tokens_seen": 105157535, + "step": 4878, + "time_per_iteration": 2.5678789615631104 + }, + { + "auxiliary_loss_clip": 0.01043663, + "auxiliary_loss_mlp": 0.01043501, + "balance_loss_clip": 1.02828407, + "balance_loss_mlp": 1.0285821, + "epoch": 0.2933413497670224, + "flos": 25264593423360.0, + "grad_norm": 1.8232567632583572, + "language_loss": 0.7359255, + "learning_rate": 3.3155715625909982e-06, + "loss": 0.75679708, + "num_input_tokens_seen": 105175185, + "step": 4879, + "time_per_iteration": 2.6901400089263916 + }, + { + "auxiliary_loss_clip": 0.01056115, + "auxiliary_loss_mlp": 0.00748961, + "balance_loss_clip": 1.03712702, + "balance_loss_mlp": 1.00182903, + "epoch": 0.2934014730196904, + "flos": 32123746656000.0, + "grad_norm": 1.9352686021195684, + "language_loss": 0.66404253, + "learning_rate": 3.3152781931394803e-06, + "loss": 0.68209332, + "num_input_tokens_seen": 105194540, + "step": 4880, + "time_per_iteration": 2.880929946899414 + }, + { + "auxiliary_loss_clip": 0.01073706, + "auxiliary_loss_mlp": 0.010503, + "balance_loss_clip": 1.02992082, + "balance_loss_mlp": 1.03511262, + "epoch": 0.29346159627235835, + "flos": 24352498344960.0, + "grad_norm": 1.9666073136811753, + "language_loss": 0.70346534, + "learning_rate": 3.314984773812481e-06, + "loss": 0.7247054, + "num_input_tokens_seen": 105213215, + "step": 4881, + "time_per_iteration": 2.669952630996704 + }, + { + "auxiliary_loss_clip": 0.0105805, + "auxiliary_loss_mlp": 0.00748907, + "balance_loss_clip": 1.03045392, + "balance_loss_mlp": 1.00198913, + "epoch": 0.2935217195250263, + "flos": 22746752749440.0, + "grad_norm": 1.7075444459803006, + "language_loss": 0.83518863, + "learning_rate": 3.314691304621127e-06, + "loss": 0.85325819, + "num_input_tokens_seen": 105231585, + "step": 4882, + "time_per_iteration": 2.65122652053833 + }, + { + "auxiliary_loss_clip": 0.01093964, + "auxiliary_loss_mlp": 0.01042737, + "balance_loss_clip": 1.03323555, + "balance_loss_mlp": 1.02734685, + "epoch": 0.2935818427776943, + "flos": 21725561088000.0, + "grad_norm": 2.6503493618404432, + "language_loss": 0.71686137, + "learning_rate": 3.314397785576548e-06, + "loss": 0.73822838, + "num_input_tokens_seen": 105250120, + "step": 4883, + "time_per_iteration": 2.544774055480957 + }, + { + "auxiliary_loss_clip": 0.01070606, + "auxiliary_loss_mlp": 0.01034032, + "balance_loss_clip": 1.03243351, + "balance_loss_mlp": 1.01889229, + "epoch": 0.29364196603036224, + "flos": 23804968354560.0, + "grad_norm": 2.1157306907290407, + "language_loss": 0.9226613, + "learning_rate": 3.3141042166898726e-06, + "loss": 0.94370764, + "num_input_tokens_seen": 105266065, + "step": 4884, + "time_per_iteration": 2.7795538902282715 + }, + { + "auxiliary_loss_clip": 0.01082055, + "auxiliary_loss_mlp": 0.01042336, + "balance_loss_clip": 1.03435802, + "balance_loss_mlp": 1.02767324, + "epoch": 0.2937020892830302, + "flos": 23470064922240.0, + "grad_norm": 2.103625101443611, + "language_loss": 0.7342577, + "learning_rate": 3.313810597972234e-06, + "loss": 0.75550163, + "num_input_tokens_seen": 105282155, + "step": 4885, + "time_per_iteration": 2.665827751159668 + }, + { + "auxiliary_loss_clip": 0.01066512, + "auxiliary_loss_mlp": 0.01041952, + "balance_loss_clip": 1.02941656, + "balance_loss_mlp": 1.02671707, + "epoch": 0.2937622125356982, + "flos": 24272740195200.0, + "grad_norm": 1.871534615833799, + "language_loss": 0.85241687, + "learning_rate": 3.3135169294347655e-06, + "loss": 0.87350154, + "num_input_tokens_seen": 105299225, + "step": 4886, + "time_per_iteration": 2.6565515995025635 + }, + { + "auxiliary_loss_clip": 0.01058609, + "auxiliary_loss_mlp": 0.01036473, + "balance_loss_clip": 1.02920341, + "balance_loss_mlp": 1.02248383, + "epoch": 0.29382233578836614, + "flos": 20662461233280.0, + "grad_norm": 2.3425755038007514, + "language_loss": 0.76883698, + "learning_rate": 3.313223211088603e-06, + "loss": 0.78978777, + "num_input_tokens_seen": 105315710, + "step": 4887, + "time_per_iteration": 2.7510833740234375 + }, + { + "auxiliary_loss_clip": 0.01070437, + "auxiliary_loss_mlp": 0.01045851, + "balance_loss_clip": 1.0326767, + "balance_loss_mlp": 1.03110445, + "epoch": 0.2938824590410341, + "flos": 16545052103040.0, + "grad_norm": 2.228335635825435, + "language_loss": 0.7973032, + "learning_rate": 3.3129294429448855e-06, + "loss": 0.81846607, + "num_input_tokens_seen": 105333505, + "step": 4888, + "time_per_iteration": 2.6146180629730225 + }, + { + "auxiliary_loss_clip": 0.01067057, + "auxiliary_loss_mlp": 0.01035388, + "balance_loss_clip": 1.03354561, + "balance_loss_mlp": 1.02087462, + "epoch": 0.29394258229370207, + "flos": 37925474382720.0, + "grad_norm": 1.490949253685954, + "language_loss": 0.55230534, + "learning_rate": 3.3126356250147517e-06, + "loss": 0.57332981, + "num_input_tokens_seen": 105355605, + "step": 4889, + "time_per_iteration": 2.811199903488159 + }, + { + "auxiliary_loss_clip": 0.01082726, + "auxiliary_loss_mlp": 0.01036596, + "balance_loss_clip": 1.03324127, + "balance_loss_mlp": 1.02057457, + "epoch": 0.29400270554637004, + "flos": 20044690197120.0, + "grad_norm": 2.512359850861105, + "language_loss": 0.84284687, + "learning_rate": 3.3123417573093434e-06, + "loss": 0.86404014, + "num_input_tokens_seen": 105374225, + "step": 4890, + "time_per_iteration": 2.733727216720581 + }, + { + "auxiliary_loss_clip": 0.01083525, + "auxiliary_loss_mlp": 0.01043704, + "balance_loss_clip": 1.03436255, + "balance_loss_mlp": 1.02834415, + "epoch": 0.294062828799038, + "flos": 15266380775040.0, + "grad_norm": 1.7626783002922288, + "language_loss": 0.7238782, + "learning_rate": 3.3120478398398046e-06, + "loss": 0.74515045, + "num_input_tokens_seen": 105391565, + "step": 4891, + "time_per_iteration": 2.6140947341918945 + }, + { + "auxiliary_loss_clip": 0.01092487, + "auxiliary_loss_mlp": 0.01043843, + "balance_loss_clip": 1.03369176, + "balance_loss_mlp": 1.02788067, + "epoch": 0.294122952051706, + "flos": 22747147799040.0, + "grad_norm": 1.6220332960919277, + "language_loss": 0.77216637, + "learning_rate": 3.3117538726172797e-06, + "loss": 0.79352969, + "num_input_tokens_seen": 105409840, + "step": 4892, + "time_per_iteration": 2.607042074203491 + }, + { + "auxiliary_loss_clip": 0.01088548, + "auxiliary_loss_mlp": 0.01034398, + "balance_loss_clip": 1.03248906, + "balance_loss_mlp": 1.01920521, + "epoch": 0.294183075304374, + "flos": 24972891073920.0, + "grad_norm": 6.407209825863595, + "language_loss": 0.78429258, + "learning_rate": 3.3114598556529164e-06, + "loss": 0.80552202, + "num_input_tokens_seen": 105428645, + "step": 4893, + "time_per_iteration": 2.6247663497924805 + }, + { + "auxiliary_loss_clip": 0.01057661, + "auxiliary_loss_mlp": 0.01038035, + "balance_loss_clip": 1.03061748, + "balance_loss_mlp": 1.0233959, + "epoch": 0.29424319855704195, + "flos": 30952986762240.0, + "grad_norm": 2.336548534885058, + "language_loss": 0.84773743, + "learning_rate": 3.311165788957864e-06, + "loss": 0.86869437, + "num_input_tokens_seen": 105447480, + "step": 4894, + "time_per_iteration": 2.730229616165161 + }, + { + "auxiliary_loss_clip": 0.01078346, + "auxiliary_loss_mlp": 0.01033941, + "balance_loss_clip": 1.03158665, + "balance_loss_mlp": 1.01943278, + "epoch": 0.2943033218097099, + "flos": 15231583474560.0, + "grad_norm": 2.422698420141688, + "language_loss": 0.90337741, + "learning_rate": 3.310871672543274e-06, + "loss": 0.92450023, + "num_input_tokens_seen": 105464600, + "step": 4895, + "time_per_iteration": 2.592874050140381 + }, + { + "auxiliary_loss_clip": 0.01081471, + "auxiliary_loss_mlp": 0.01041272, + "balance_loss_clip": 1.03194833, + "balance_loss_mlp": 1.02578688, + "epoch": 0.2943634450623779, + "flos": 21725884310400.0, + "grad_norm": 1.805678552109774, + "language_loss": 0.86652541, + "learning_rate": 3.3105775064202982e-06, + "loss": 0.88775289, + "num_input_tokens_seen": 105481510, + "step": 4896, + "time_per_iteration": 2.5931642055511475 + }, + { + "auxiliary_loss_clip": 0.0108699, + "auxiliary_loss_mlp": 0.01045368, + "balance_loss_clip": 1.03712404, + "balance_loss_mlp": 1.02978706, + "epoch": 0.29442356831504585, + "flos": 22602104680320.0, + "grad_norm": 1.9743254005489816, + "language_loss": 0.73509276, + "learning_rate": 3.3102832906000924e-06, + "loss": 0.75641632, + "num_input_tokens_seen": 105501390, + "step": 4897, + "time_per_iteration": 4.215045213699341 + }, + { + "auxiliary_loss_clip": 0.0107174, + "auxiliary_loss_mlp": 0.01043045, + "balance_loss_clip": 1.02783263, + "balance_loss_mlp": 1.02609301, + "epoch": 0.2944836915677138, + "flos": 20011401267840.0, + "grad_norm": 1.9882003752711486, + "language_loss": 0.73972368, + "learning_rate": 3.309989025093813e-06, + "loss": 0.76087153, + "num_input_tokens_seen": 105519600, + "step": 4898, + "time_per_iteration": 2.6196203231811523 + }, + { + "auxiliary_loss_clip": 0.01090845, + "auxiliary_loss_mlp": 0.0104366, + "balance_loss_clip": 1.03906977, + "balance_loss_mlp": 1.02545691, + "epoch": 0.2945438148203818, + "flos": 20045875345920.0, + "grad_norm": 2.7191585235428315, + "language_loss": 0.69692039, + "learning_rate": 3.309694709912618e-06, + "loss": 0.71826541, + "num_input_tokens_seen": 105535970, + "step": 4899, + "time_per_iteration": 2.5484941005706787 + }, + { + "auxiliary_loss_clip": 0.01067572, + "auxiliary_loss_mlp": 0.00749071, + "balance_loss_clip": 1.0302844, + "balance_loss_mlp": 1.00201654, + "epoch": 0.29460393807304974, + "flos": 23733542160000.0, + "grad_norm": 1.8331346314901213, + "language_loss": 0.79057163, + "learning_rate": 3.3094003450676685e-06, + "loss": 0.80873805, + "num_input_tokens_seen": 105556735, + "step": 4900, + "time_per_iteration": 4.213306427001953 + }, + { + "auxiliary_loss_clip": 0.01046315, + "auxiliary_loss_mlp": 0.01043358, + "balance_loss_clip": 1.02430177, + "balance_loss_mlp": 1.026824, + "epoch": 0.2946640613257177, + "flos": 14976079056000.0, + "grad_norm": 1.7298864299789405, + "language_loss": 0.80708736, + "learning_rate": 3.3091059305701268e-06, + "loss": 0.82798409, + "num_input_tokens_seen": 105574875, + "step": 4901, + "time_per_iteration": 2.553387403488159 + }, + { + "auxiliary_loss_clip": 0.0106546, + "auxiliary_loss_mlp": 0.01032223, + "balance_loss_clip": 1.03212214, + "balance_loss_mlp": 1.0183531, + "epoch": 0.2947241845783857, + "flos": 24243904552320.0, + "grad_norm": 2.0530638127114287, + "language_loss": 0.57655054, + "learning_rate": 3.308811466431157e-06, + "loss": 0.59752738, + "num_input_tokens_seen": 105594225, + "step": 4902, + "time_per_iteration": 2.6230340003967285 + }, + { + "auxiliary_loss_clip": 0.01068563, + "auxiliary_loss_mlp": 0.01033205, + "balance_loss_clip": 1.03124142, + "balance_loss_mlp": 1.01903117, + "epoch": 0.29478430783105364, + "flos": 19938394874880.0, + "grad_norm": 1.6074309031286766, + "language_loss": 0.75374031, + "learning_rate": 3.308516952661925e-06, + "loss": 0.77475798, + "num_input_tokens_seen": 105614000, + "step": 4903, + "time_per_iteration": 2.6469032764434814 + }, + { + "auxiliary_loss_clip": 0.01061013, + "auxiliary_loss_mlp": 0.01041408, + "balance_loss_clip": 1.03004885, + "balance_loss_mlp": 1.02449214, + "epoch": 0.2948444310837216, + "flos": 27381347856000.0, + "grad_norm": 1.910207293758782, + "language_loss": 0.62382656, + "learning_rate": 3.3082223892736e-06, + "loss": 0.64485073, + "num_input_tokens_seen": 105634575, + "step": 4904, + "time_per_iteration": 2.6445083618164062 + }, + { + "auxiliary_loss_clip": 0.01080623, + "auxiliary_loss_mlp": 0.01037585, + "balance_loss_clip": 1.03109896, + "balance_loss_mlp": 1.02312529, + "epoch": 0.2949045543363896, + "flos": 23405462311680.0, + "grad_norm": 1.780354884335991, + "language_loss": 0.73262495, + "learning_rate": 3.3079277762773496e-06, + "loss": 0.75380701, + "num_input_tokens_seen": 105654385, + "step": 4905, + "time_per_iteration": 2.650392770767212 + }, + { + "auxiliary_loss_clip": 0.01052996, + "auxiliary_loss_mlp": 0.01034249, + "balance_loss_clip": 1.03006852, + "balance_loss_mlp": 1.01951444, + "epoch": 0.2949646775890576, + "flos": 23951483930880.0, + "grad_norm": 1.6951440220710157, + "language_loss": 0.81449473, + "learning_rate": 3.3076331136843476e-06, + "loss": 0.8353672, + "num_input_tokens_seen": 105673570, + "step": 4906, + "time_per_iteration": 2.7162435054779053 + }, + { + "auxiliary_loss_clip": 0.01037171, + "auxiliary_loss_mlp": 0.01034379, + "balance_loss_clip": 1.02682543, + "balance_loss_mlp": 1.02005053, + "epoch": 0.29502480084172555, + "flos": 22784315397120.0, + "grad_norm": 1.9552652525715706, + "language_loss": 0.87581611, + "learning_rate": 3.3073384015057667e-06, + "loss": 0.89653158, + "num_input_tokens_seen": 105691940, + "step": 4907, + "time_per_iteration": 4.316766738891602 + }, + { + "auxiliary_loss_clip": 0.01091974, + "auxiliary_loss_mlp": 0.01037127, + "balance_loss_clip": 1.03267074, + "balance_loss_mlp": 1.02132034, + "epoch": 0.2950849240943935, + "flos": 19646656611840.0, + "grad_norm": 1.9802125994433653, + "language_loss": 0.81987202, + "learning_rate": 3.307043639752782e-06, + "loss": 0.84116304, + "num_input_tokens_seen": 105709825, + "step": 4908, + "time_per_iteration": 2.673287868499756 + }, + { + "auxiliary_loss_clip": 0.01021373, + "auxiliary_loss_mlp": 0.0100206, + "balance_loss_clip": 1.00623822, + "balance_loss_mlp": 1.00041509, + "epoch": 0.2951450473470615, + "flos": 71002829260800.0, + "grad_norm": 0.776142680902946, + "language_loss": 0.57206464, + "learning_rate": 3.3067488284365728e-06, + "loss": 0.59229898, + "num_input_tokens_seen": 105766880, + "step": 4909, + "time_per_iteration": 4.624423980712891 + }, + { + "auxiliary_loss_clip": 0.0107912, + "auxiliary_loss_mlp": 0.00748974, + "balance_loss_clip": 1.03397655, + "balance_loss_mlp": 1.00200152, + "epoch": 0.29520517059972945, + "flos": 22966310632320.0, + "grad_norm": 1.5188249122798074, + "language_loss": 0.86935329, + "learning_rate": 3.3064539675683163e-06, + "loss": 0.88763422, + "num_input_tokens_seen": 105786875, + "step": 4910, + "time_per_iteration": 2.6435108184814453 + }, + { + "auxiliary_loss_clip": 0.01074074, + "auxiliary_loss_mlp": 0.01038683, + "balance_loss_clip": 1.0310638, + "balance_loss_mlp": 1.02521241, + "epoch": 0.2952652938523974, + "flos": 20485673470080.0, + "grad_norm": 1.656092560717512, + "language_loss": 0.72456086, + "learning_rate": 3.3061590571591946e-06, + "loss": 0.74568844, + "num_input_tokens_seen": 105805315, + "step": 4911, + "time_per_iteration": 2.8017754554748535 + }, + { + "auxiliary_loss_clip": 0.01077418, + "auxiliary_loss_mlp": 0.01030014, + "balance_loss_clip": 1.03237557, + "balance_loss_mlp": 1.01589966, + "epoch": 0.2953254171050654, + "flos": 19646584784640.0, + "grad_norm": 1.8954842665116483, + "language_loss": 0.89828014, + "learning_rate": 3.3058640972203904e-06, + "loss": 0.91935444, + "num_input_tokens_seen": 105825125, + "step": 4912, + "time_per_iteration": 2.6383628845214844 + }, + { + "auxiliary_loss_clip": 0.01058599, + "auxiliary_loss_mlp": 0.01046668, + "balance_loss_clip": 1.03108382, + "balance_loss_mlp": 1.03102827, + "epoch": 0.29538554035773334, + "flos": 22747973811840.0, + "grad_norm": 1.4222022400577068, + "language_loss": 0.83088148, + "learning_rate": 3.3055690877630894e-06, + "loss": 0.85193413, + "num_input_tokens_seen": 105846085, + "step": 4913, + "time_per_iteration": 2.684272289276123 + }, + { + "auxiliary_loss_clip": 0.01088297, + "auxiliary_loss_mlp": 0.01038329, + "balance_loss_clip": 1.03136945, + "balance_loss_mlp": 1.0243578, + "epoch": 0.2954456636104013, + "flos": 21871861182720.0, + "grad_norm": 3.3369634090873284, + "language_loss": 0.7723124, + "learning_rate": 3.3052740287984765e-06, + "loss": 0.79357868, + "num_input_tokens_seen": 105865400, + "step": 4914, + "time_per_iteration": 2.6325864791870117 + }, + { + "auxiliary_loss_clip": 0.01066707, + "auxiliary_loss_mlp": 0.0103992, + "balance_loss_clip": 1.03146005, + "balance_loss_mlp": 1.0246489, + "epoch": 0.2955057868630693, + "flos": 40442560871040.0, + "grad_norm": 4.704675337808383, + "language_loss": 0.81253004, + "learning_rate": 3.3049789203377424e-06, + "loss": 0.83359635, + "num_input_tokens_seen": 105887920, + "step": 4915, + "time_per_iteration": 2.8515539169311523 + }, + { + "auxiliary_loss_clip": 0.01021525, + "auxiliary_loss_mlp": 0.01041587, + "balance_loss_clip": 1.03142929, + "balance_loss_mlp": 1.0264231, + "epoch": 0.29556591011573724, + "flos": 22564506119040.0, + "grad_norm": 1.807533896732677, + "language_loss": 0.84562719, + "learning_rate": 3.3046837623920772e-06, + "loss": 0.86625832, + "num_input_tokens_seen": 105904035, + "step": 4916, + "time_per_iteration": 2.840658187866211 + }, + { + "auxiliary_loss_clip": 0.01068704, + "auxiliary_loss_mlp": 0.01033088, + "balance_loss_clip": 1.02849412, + "balance_loss_mlp": 1.01903367, + "epoch": 0.2956260333684052, + "flos": 22089300163200.0, + "grad_norm": 3.401785500530931, + "language_loss": 0.69865358, + "learning_rate": 3.3043885549726723e-06, + "loss": 0.71967149, + "num_input_tokens_seen": 105922685, + "step": 4917, + "time_per_iteration": 2.6126110553741455 + }, + { + "auxiliary_loss_clip": 0.01068262, + "auxiliary_loss_mlp": 0.0103401, + "balance_loss_clip": 1.03186369, + "balance_loss_mlp": 1.02009225, + "epoch": 0.2956861566210732, + "flos": 16435488643200.0, + "grad_norm": 1.9804772150010979, + "language_loss": 0.909563, + "learning_rate": 3.3040932980907226e-06, + "loss": 0.93058574, + "num_input_tokens_seen": 105940425, + "step": 4918, + "time_per_iteration": 2.6047780513763428 + }, + { + "auxiliary_loss_clip": 0.01091011, + "auxiliary_loss_mlp": 0.01035625, + "balance_loss_clip": 1.03311849, + "balance_loss_mlp": 1.02117085, + "epoch": 0.2957462798737412, + "flos": 25812087500160.0, + "grad_norm": 2.3415609525349796, + "language_loss": 0.72944653, + "learning_rate": 3.303797991757425e-06, + "loss": 0.75071293, + "num_input_tokens_seen": 105960550, + "step": 4919, + "time_per_iteration": 2.6201930046081543 + }, + { + "auxiliary_loss_clip": 0.01063337, + "auxiliary_loss_mlp": 0.01039645, + "balance_loss_clip": 1.02887261, + "balance_loss_mlp": 1.0250895, + "epoch": 0.29580640312640916, + "flos": 16690849407360.0, + "grad_norm": 1.7238261974462536, + "language_loss": 0.76009786, + "learning_rate": 3.3035026359839763e-06, + "loss": 0.78112769, + "num_input_tokens_seen": 105978820, + "step": 4920, + "time_per_iteration": 2.6302075386047363 + }, + { + "auxiliary_loss_clip": 0.01072376, + "auxiliary_loss_mlp": 0.01048919, + "balance_loss_clip": 1.03659606, + "balance_loss_mlp": 1.03300452, + "epoch": 0.2958665263790771, + "flos": 23945594100480.0, + "grad_norm": 2.667430377373629, + "language_loss": 0.67994004, + "learning_rate": 3.3032072307815774e-06, + "loss": 0.70115298, + "num_input_tokens_seen": 105997545, + "step": 4921, + "time_per_iteration": 2.653665781021118 + }, + { + "auxiliary_loss_clip": 0.01069801, + "auxiliary_loss_mlp": 0.0104001, + "balance_loss_clip": 1.03254831, + "balance_loss_mlp": 1.02382123, + "epoch": 0.2959266496317451, + "flos": 18478410670080.0, + "grad_norm": 1.8659729788787744, + "language_loss": 0.74573016, + "learning_rate": 3.3029117761614298e-06, + "loss": 0.76682824, + "num_input_tokens_seen": 106015320, + "step": 4922, + "time_per_iteration": 2.6546952724456787 + }, + { + "auxiliary_loss_clip": 0.01092591, + "auxiliary_loss_mlp": 0.00749065, + "balance_loss_clip": 1.03209376, + "balance_loss_mlp": 1.00212026, + "epoch": 0.29598677288441305, + "flos": 25957489754880.0, + "grad_norm": 3.8685599550458383, + "language_loss": 0.76644766, + "learning_rate": 3.302616272134737e-06, + "loss": 0.78486425, + "num_input_tokens_seen": 106034555, + "step": 4923, + "time_per_iteration": 2.6176559925079346 + }, + { + "auxiliary_loss_clip": 0.01068864, + "auxiliary_loss_mlp": 0.01033991, + "balance_loss_clip": 1.03238928, + "balance_loss_mlp": 1.01941752, + "epoch": 0.296046896137081, + "flos": 25155999630720.0, + "grad_norm": 2.4301635647903894, + "language_loss": 0.86253369, + "learning_rate": 3.3023207187127042e-06, + "loss": 0.88356221, + "num_input_tokens_seen": 106054200, + "step": 4924, + "time_per_iteration": 2.637399196624756 + }, + { + "auxiliary_loss_clip": 0.01076875, + "auxiliary_loss_mlp": 0.01032791, + "balance_loss_clip": 1.03093576, + "balance_loss_mlp": 1.017735, + "epoch": 0.296107019389749, + "flos": 21761148487680.0, + "grad_norm": 1.4605491145156146, + "language_loss": 0.81996357, + "learning_rate": 3.3020251159065396e-06, + "loss": 0.84106016, + "num_input_tokens_seen": 106074700, + "step": 4925, + "time_per_iteration": 2.6261422634124756 + }, + { + "auxiliary_loss_clip": 0.0102006, + "auxiliary_loss_mlp": 0.01042983, + "balance_loss_clip": 1.02226305, + "balance_loss_mlp": 1.02673471, + "epoch": 0.29616714264241695, + "flos": 17960039544960.0, + "grad_norm": 11.406480608142878, + "language_loss": 0.86492276, + "learning_rate": 3.301729463727452e-06, + "loss": 0.88555324, + "num_input_tokens_seen": 106091415, + "step": 4926, + "time_per_iteration": 2.683067560195923 + }, + { + "auxiliary_loss_clip": 0.0105697, + "auxiliary_loss_mlp": 0.01032304, + "balance_loss_clip": 1.03017521, + "balance_loss_mlp": 1.01776659, + "epoch": 0.2962272658950849, + "flos": 15012779777280.0, + "grad_norm": 1.8457227638864964, + "language_loss": 0.85835123, + "learning_rate": 3.3014337621866527e-06, + "loss": 0.87924397, + "num_input_tokens_seen": 106109135, + "step": 4927, + "time_per_iteration": 2.6021361351013184 + }, + { + "auxiliary_loss_clip": 0.01076963, + "auxiliary_loss_mlp": 0.0103633, + "balance_loss_clip": 1.0316124, + "balance_loss_mlp": 1.0224061, + "epoch": 0.2962873891477529, + "flos": 14720861946240.0, + "grad_norm": 2.9319672420335805, + "language_loss": 0.80977523, + "learning_rate": 3.3011380112953553e-06, + "loss": 0.83090824, + "num_input_tokens_seen": 106125750, + "step": 4928, + "time_per_iteration": 2.5727570056915283 + }, + { + "auxiliary_loss_clip": 0.01071473, + "auxiliary_loss_mlp": 0.01041014, + "balance_loss_clip": 1.03105545, + "balance_loss_mlp": 1.02318025, + "epoch": 0.29634751240042084, + "flos": 26723787528960.0, + "grad_norm": 2.5075132241229188, + "language_loss": 0.72994465, + "learning_rate": 3.300842211064773e-06, + "loss": 0.75106949, + "num_input_tokens_seen": 106142835, + "step": 4929, + "time_per_iteration": 2.69631290435791 + }, + { + "auxiliary_loss_clip": 0.01060803, + "auxiliary_loss_mlp": 0.01048784, + "balance_loss_clip": 1.02858937, + "balance_loss_mlp": 1.03149819, + "epoch": 0.2964076356530888, + "flos": 14571293713920.0, + "grad_norm": 2.5007340882453972, + "language_loss": 0.7212075, + "learning_rate": 3.3005463615061246e-06, + "loss": 0.74230337, + "num_input_tokens_seen": 106160680, + "step": 4930, + "time_per_iteration": 2.628629207611084 + }, + { + "auxiliary_loss_clip": 0.01001487, + "auxiliary_loss_mlp": 0.01007642, + "balance_loss_clip": 1.01731408, + "balance_loss_mlp": 1.00406599, + "epoch": 0.29646775890575683, + "flos": 63104315063040.0, + "grad_norm": 0.8145238244883882, + "language_loss": 0.60669422, + "learning_rate": 3.3002504626306275e-06, + "loss": 0.62678552, + "num_input_tokens_seen": 106224415, + "step": 4931, + "time_per_iteration": 3.237462043762207 + }, + { + "auxiliary_loss_clip": 0.00984297, + "auxiliary_loss_mlp": 0.0100417, + "balance_loss_clip": 1.0171802, + "balance_loss_mlp": 1.00233448, + "epoch": 0.2965278821584248, + "flos": 63067686168960.0, + "grad_norm": 0.7386792040413516, + "language_loss": 0.52402127, + "learning_rate": 3.2999545144495023e-06, + "loss": 0.54390597, + "num_input_tokens_seen": 106279140, + "step": 4932, + "time_per_iteration": 3.264557361602783 + }, + { + "auxiliary_loss_clip": 0.01073791, + "auxiliary_loss_mlp": 0.01038066, + "balance_loss_clip": 1.02928209, + "balance_loss_mlp": 1.02344549, + "epoch": 0.29658800541109276, + "flos": 23768734510080.0, + "grad_norm": 1.639394464269314, + "language_loss": 0.81472665, + "learning_rate": 3.299658516973972e-06, + "loss": 0.83584523, + "num_input_tokens_seen": 106298190, + "step": 4933, + "time_per_iteration": 2.6692416667938232 + }, + { + "auxiliary_loss_clip": 0.01037738, + "auxiliary_loss_mlp": 0.01034875, + "balance_loss_clip": 1.02759051, + "balance_loss_mlp": 1.01930654, + "epoch": 0.2966481286637607, + "flos": 23988543788160.0, + "grad_norm": 2.006339712257759, + "language_loss": 0.75168025, + "learning_rate": 3.299362470215261e-06, + "loss": 0.77240634, + "num_input_tokens_seen": 106319065, + "step": 4934, + "time_per_iteration": 2.9247186183929443 + }, + { + "auxiliary_loss_clip": 0.01070022, + "auxiliary_loss_mlp": 0.01040306, + "balance_loss_clip": 1.03066802, + "balance_loss_mlp": 1.02458787, + "epoch": 0.2967082519164287, + "flos": 17165157523200.0, + "grad_norm": 1.7662111415088804, + "language_loss": 0.62084442, + "learning_rate": 3.299066374184594e-06, + "loss": 0.64194769, + "num_input_tokens_seen": 106338040, + "step": 4935, + "time_per_iteration": 2.7706172466278076 + }, + { + "auxiliary_loss_clip": 0.01074869, + "auxiliary_loss_mlp": 0.01038679, + "balance_loss_clip": 1.03266835, + "balance_loss_mlp": 1.02340889, + "epoch": 0.29676837516909665, + "flos": 29387712816000.0, + "grad_norm": 1.413201076315705, + "language_loss": 0.79767227, + "learning_rate": 3.2987702288932e-06, + "loss": 0.81880772, + "num_input_tokens_seen": 106358900, + "step": 4936, + "time_per_iteration": 2.8305094242095947 + }, + { + "auxiliary_loss_clip": 0.01053019, + "auxiliary_loss_mlp": 0.01037971, + "balance_loss_clip": 1.03435218, + "balance_loss_mlp": 1.02314115, + "epoch": 0.2968284984217646, + "flos": 34751222616960.0, + "grad_norm": 1.991537943987972, + "language_loss": 0.74093324, + "learning_rate": 3.298474034352309e-06, + "loss": 0.76184314, + "num_input_tokens_seen": 106381805, + "step": 4937, + "time_per_iteration": 2.9598562717437744 + }, + { + "auxiliary_loss_clip": 0.01047998, + "auxiliary_loss_mlp": 0.01039148, + "balance_loss_clip": 1.03346205, + "balance_loss_mlp": 1.02394879, + "epoch": 0.2968886216744326, + "flos": 21544104556800.0, + "grad_norm": 1.588810597993268, + "language_loss": 0.77851224, + "learning_rate": 3.2981777905731526e-06, + "loss": 0.79938364, + "num_input_tokens_seen": 106402365, + "step": 4938, + "time_per_iteration": 2.8542630672454834 + }, + { + "auxiliary_loss_clip": 0.01066094, + "auxiliary_loss_mlp": 0.01041983, + "balance_loss_clip": 1.03486085, + "balance_loss_mlp": 1.0257467, + "epoch": 0.29694874492710055, + "flos": 12787323811200.0, + "grad_norm": 1.9088447119683973, + "language_loss": 0.76416659, + "learning_rate": 3.297881497566964e-06, + "loss": 0.78524733, + "num_input_tokens_seen": 106419800, + "step": 4939, + "time_per_iteration": 2.7049808502197266 + }, + { + "auxiliary_loss_clip": 0.01049982, + "auxiliary_loss_mlp": 0.01036645, + "balance_loss_clip": 1.02892566, + "balance_loss_mlp": 1.02103424, + "epoch": 0.2970088681797685, + "flos": 24569973239040.0, + "grad_norm": 1.60593737585661, + "language_loss": 0.7805953, + "learning_rate": 3.297585155344979e-06, + "loss": 0.80146158, + "num_input_tokens_seen": 106440300, + "step": 4940, + "time_per_iteration": 2.8057820796966553 + }, + { + "auxiliary_loss_clip": 0.01071087, + "auxiliary_loss_mlp": 0.01038749, + "balance_loss_clip": 1.03575909, + "balance_loss_mlp": 1.02210712, + "epoch": 0.2970689914324365, + "flos": 23659171050240.0, + "grad_norm": 1.5308195706590906, + "language_loss": 0.75425607, + "learning_rate": 3.297288763918435e-06, + "loss": 0.77535444, + "num_input_tokens_seen": 106460035, + "step": 4941, + "time_per_iteration": 2.8101470470428467 + }, + { + "auxiliary_loss_clip": 0.01085844, + "auxiliary_loss_mlp": 0.01046077, + "balance_loss_clip": 1.03530693, + "balance_loss_mlp": 1.03015018, + "epoch": 0.29712911468510445, + "flos": 39670301439360.0, + "grad_norm": 2.9289683610955977, + "language_loss": 0.74004745, + "learning_rate": 3.2969923232985712e-06, + "loss": 0.76136661, + "num_input_tokens_seen": 106481095, + "step": 4942, + "time_per_iteration": 2.85404634475708 + }, + { + "auxiliary_loss_clip": 0.01063294, + "auxiliary_loss_mlp": 0.01040876, + "balance_loss_clip": 1.03376961, + "balance_loss_mlp": 1.02431762, + "epoch": 0.2971892379377724, + "flos": 26395312631040.0, + "grad_norm": 2.1914741591391307, + "language_loss": 0.70134139, + "learning_rate": 3.2966958334966287e-06, + "loss": 0.72238314, + "num_input_tokens_seen": 106501590, + "step": 4943, + "time_per_iteration": 2.798513174057007 + }, + { + "auxiliary_loss_clip": 0.01072439, + "auxiliary_loss_mlp": 0.0103759, + "balance_loss_clip": 1.03415012, + "balance_loss_mlp": 1.02178288, + "epoch": 0.2972493611904404, + "flos": 17603195880960.0, + "grad_norm": 1.9099430450844328, + "language_loss": 0.80301178, + "learning_rate": 3.2963992945238497e-06, + "loss": 0.82411206, + "num_input_tokens_seen": 106519430, + "step": 4944, + "time_per_iteration": 4.283116340637207 + }, + { + "auxiliary_loss_clip": 0.01064734, + "auxiliary_loss_mlp": 0.01036195, + "balance_loss_clip": 1.03113294, + "balance_loss_mlp": 1.02256358, + "epoch": 0.2973094844431084, + "flos": 20412774817920.0, + "grad_norm": 2.292728380769101, + "language_loss": 0.83033025, + "learning_rate": 3.2961027063914795e-06, + "loss": 0.85133958, + "num_input_tokens_seen": 106535870, + "step": 4945, + "time_per_iteration": 2.7343146800994873 + }, + { + "auxiliary_loss_clip": 0.01038474, + "auxiliary_loss_mlp": 0.01033747, + "balance_loss_clip": 1.02970123, + "balance_loss_mlp": 1.02050853, + "epoch": 0.29736960769577636, + "flos": 17493488766720.0, + "grad_norm": 2.9835074120877434, + "language_loss": 0.66691077, + "learning_rate": 3.2958060691107654e-06, + "loss": 0.68763304, + "num_input_tokens_seen": 106553560, + "step": 4946, + "time_per_iteration": 2.78899884223938 + }, + { + "auxiliary_loss_clip": 0.01074259, + "auxiliary_loss_mlp": 0.00749123, + "balance_loss_clip": 1.03383934, + "balance_loss_mlp": 1.00197303, + "epoch": 0.2974297309484443, + "flos": 26103969417600.0, + "grad_norm": 1.8387912310571164, + "language_loss": 0.73727697, + "learning_rate": 3.2955093826929547e-06, + "loss": 0.75551081, + "num_input_tokens_seen": 106574115, + "step": 4947, + "time_per_iteration": 4.3624587059021 + }, + { + "auxiliary_loss_clip": 0.01060482, + "auxiliary_loss_mlp": 0.0104128, + "balance_loss_clip": 1.03237009, + "balance_loss_mlp": 1.02556205, + "epoch": 0.2974898542011123, + "flos": 25666433850240.0, + "grad_norm": 2.4381100113780767, + "language_loss": 0.73234642, + "learning_rate": 3.2952126471492985e-06, + "loss": 0.75336403, + "num_input_tokens_seen": 106593070, + "step": 4948, + "time_per_iteration": 2.6904306411743164 + }, + { + "auxiliary_loss_clip": 0.01089061, + "auxiliary_loss_mlp": 0.01033623, + "balance_loss_clip": 1.03304243, + "balance_loss_mlp": 1.0194726, + "epoch": 0.29754997745378026, + "flos": 18661339658880.0, + "grad_norm": 2.0147071596238493, + "language_loss": 0.83616674, + "learning_rate": 3.2949158624910497e-06, + "loss": 0.85739356, + "num_input_tokens_seen": 106610695, + "step": 4949, + "time_per_iteration": 2.5782806873321533 + }, + { + "auxiliary_loss_clip": 0.01074153, + "auxiliary_loss_mlp": 0.01035663, + "balance_loss_clip": 1.02963471, + "balance_loss_mlp": 1.02118552, + "epoch": 0.2976101007064482, + "flos": 22274599449600.0, + "grad_norm": 2.2553180750797948, + "language_loss": 0.71145082, + "learning_rate": 3.2946190287294603e-06, + "loss": 0.73254895, + "num_input_tokens_seen": 106631300, + "step": 4950, + "time_per_iteration": 2.6972646713256836 + }, + { + "auxiliary_loss_clip": 0.01044467, + "auxiliary_loss_mlp": 0.01038006, + "balance_loss_clip": 1.03102744, + "balance_loss_mlp": 1.02421319, + "epoch": 0.2976702239591162, + "flos": 21945657674880.0, + "grad_norm": 2.524428061549558, + "language_loss": 0.82354534, + "learning_rate": 3.294322145875789e-06, + "loss": 0.84437007, + "num_input_tokens_seen": 106650065, + "step": 4951, + "time_per_iteration": 2.703251600265503 + }, + { + "auxiliary_loss_clip": 0.01061354, + "auxiliary_loss_mlp": 0.01032902, + "balance_loss_clip": 1.02669311, + "balance_loss_mlp": 1.01781571, + "epoch": 0.29773034721178415, + "flos": 24637197542400.0, + "grad_norm": 2.871416209932159, + "language_loss": 0.74047744, + "learning_rate": 3.2940252139412912e-06, + "loss": 0.76141989, + "num_input_tokens_seen": 106668230, + "step": 4952, + "time_per_iteration": 2.711794137954712 + }, + { + "auxiliary_loss_clip": 0.01011254, + "auxiliary_loss_mlp": 0.01045049, + "balance_loss_clip": 1.02597082, + "balance_loss_mlp": 1.02871704, + "epoch": 0.2977904704644521, + "flos": 20557566541440.0, + "grad_norm": 1.6642695480822478, + "language_loss": 0.83650076, + "learning_rate": 3.293728232937228e-06, + "loss": 0.85706383, + "num_input_tokens_seen": 106687785, + "step": 4953, + "time_per_iteration": 2.7929370403289795 + }, + { + "auxiliary_loss_clip": 0.01063381, + "auxiliary_loss_mlp": 0.01034384, + "balance_loss_clip": 1.03077054, + "balance_loss_mlp": 1.02024555, + "epoch": 0.2978505937171201, + "flos": 18916449027840.0, + "grad_norm": 2.0874665799061405, + "language_loss": 0.7359792, + "learning_rate": 3.2934312028748597e-06, + "loss": 0.75695688, + "num_input_tokens_seen": 106706875, + "step": 4954, + "time_per_iteration": 2.6684186458587646 + }, + { + "auxiliary_loss_clip": 0.01085514, + "auxiliary_loss_mlp": 0.0103757, + "balance_loss_clip": 1.03078413, + "balance_loss_mlp": 1.02406991, + "epoch": 0.29791071696978805, + "flos": 19317750750720.0, + "grad_norm": 1.80361077517054, + "language_loss": 0.75447053, + "learning_rate": 3.293134123765452e-06, + "loss": 0.7757014, + "num_input_tokens_seen": 106725105, + "step": 4955, + "time_per_iteration": 4.2574546337127686 + }, + { + "auxiliary_loss_clip": 0.01040282, + "auxiliary_loss_mlp": 0.01035227, + "balance_loss_clip": 1.02943707, + "balance_loss_mlp": 1.0203557, + "epoch": 0.297970840222456, + "flos": 18806813740800.0, + "grad_norm": 2.044608826483173, + "language_loss": 0.72472119, + "learning_rate": 3.2928369956202684e-06, + "loss": 0.74547637, + "num_input_tokens_seen": 106744780, + "step": 4956, + "time_per_iteration": 4.298493385314941 + }, + { + "auxiliary_loss_clip": 0.01080236, + "auxiliary_loss_mlp": 0.01041265, + "balance_loss_clip": 1.03064096, + "balance_loss_mlp": 1.02526116, + "epoch": 0.298030963475124, + "flos": 22852760762880.0, + "grad_norm": 1.7689236674870061, + "language_loss": 0.79161811, + "learning_rate": 3.2925398184505754e-06, + "loss": 0.81283307, + "num_input_tokens_seen": 106764670, + "step": 4957, + "time_per_iteration": 2.675459861755371 + }, + { + "auxiliary_loss_clip": 0.01080159, + "auxiliary_loss_mlp": 0.01037043, + "balance_loss_clip": 1.03161168, + "balance_loss_mlp": 1.02158761, + "epoch": 0.298091086727792, + "flos": 21868485304320.0, + "grad_norm": 1.5714179369069174, + "language_loss": 0.70175254, + "learning_rate": 3.2922425922676437e-06, + "loss": 0.72292459, + "num_input_tokens_seen": 106783695, + "step": 4958, + "time_per_iteration": 2.653001546859741 + }, + { + "auxiliary_loss_clip": 0.01056686, + "auxiliary_loss_mlp": 0.01043154, + "balance_loss_clip": 1.03184986, + "balance_loss_mlp": 1.02841926, + "epoch": 0.29815120998045996, + "flos": 21175014355200.0, + "grad_norm": 1.9461515039008055, + "language_loss": 0.78890729, + "learning_rate": 3.291945317082743e-06, + "loss": 0.80990571, + "num_input_tokens_seen": 106803150, + "step": 4959, + "time_per_iteration": 2.668548822402954 + }, + { + "auxiliary_loss_clip": 0.01072609, + "auxiliary_loss_mlp": 0.0104581, + "balance_loss_clip": 1.02855444, + "balance_loss_mlp": 1.03137398, + "epoch": 0.29821133323312793, + "flos": 19896271200000.0, + "grad_norm": 1.7705512596803683, + "language_loss": 0.79594254, + "learning_rate": 3.291647992907147e-06, + "loss": 0.81712675, + "num_input_tokens_seen": 106820705, + "step": 4960, + "time_per_iteration": 2.6221625804901123 + }, + { + "auxiliary_loss_clip": 0.01053864, + "auxiliary_loss_mlp": 0.01038087, + "balance_loss_clip": 1.02856684, + "balance_loss_mlp": 1.02288795, + "epoch": 0.2982714564857959, + "flos": 12750766744320.0, + "grad_norm": 2.295607797090458, + "language_loss": 0.73845255, + "learning_rate": 3.291350619752129e-06, + "loss": 0.75937206, + "num_input_tokens_seen": 106837335, + "step": 4961, + "time_per_iteration": 2.6196577548980713 + }, + { + "auxiliary_loss_clip": 0.01073963, + "auxiliary_loss_mlp": 0.01038762, + "balance_loss_clip": 1.03030133, + "balance_loss_mlp": 1.024791, + "epoch": 0.29833157973846386, + "flos": 22271905929600.0, + "grad_norm": 2.1568545262416987, + "language_loss": 0.61950845, + "learning_rate": 3.291053197628967e-06, + "loss": 0.64063573, + "num_input_tokens_seen": 106856250, + "step": 4962, + "time_per_iteration": 2.613046169281006 + }, + { + "auxiliary_loss_clip": 0.01076588, + "auxiliary_loss_mlp": 0.01040308, + "balance_loss_clip": 1.03180242, + "balance_loss_mlp": 1.02559733, + "epoch": 0.2983917029911318, + "flos": 15372999319680.0, + "grad_norm": 2.207620470750912, + "language_loss": 0.82648796, + "learning_rate": 3.2907557265489375e-06, + "loss": 0.84765697, + "num_input_tokens_seen": 106873370, + "step": 4963, + "time_per_iteration": 2.532336711883545 + }, + { + "auxiliary_loss_clip": 0.01057986, + "auxiliary_loss_mlp": 0.01031248, + "balance_loss_clip": 1.03276777, + "balance_loss_mlp": 1.01654339, + "epoch": 0.2984518262437998, + "flos": 15377632174080.0, + "grad_norm": 2.7133044965630724, + "language_loss": 0.66444021, + "learning_rate": 3.290458206523322e-06, + "loss": 0.68533254, + "num_input_tokens_seen": 106890330, + "step": 4964, + "time_per_iteration": 2.614319324493408 + }, + { + "auxiliary_loss_clip": 0.01072034, + "auxiliary_loss_mlp": 0.01034216, + "balance_loss_clip": 1.02824044, + "balance_loss_mlp": 1.02096009, + "epoch": 0.29851194949646775, + "flos": 18108458542080.0, + "grad_norm": 1.6544745335159932, + "language_loss": 0.71187103, + "learning_rate": 3.2901606375634015e-06, + "loss": 0.73293352, + "num_input_tokens_seen": 106909190, + "step": 4965, + "time_per_iteration": 2.5610859394073486 + }, + { + "auxiliary_loss_clip": 0.01091991, + "auxiliary_loss_mlp": 0.01046674, + "balance_loss_clip": 1.0350256, + "balance_loss_mlp": 1.03196979, + "epoch": 0.2985720727491357, + "flos": 22018233104640.0, + "grad_norm": 1.976433838930395, + "language_loss": 0.66256803, + "learning_rate": 3.289863019680461e-06, + "loss": 0.68395472, + "num_input_tokens_seen": 106927825, + "step": 4966, + "time_per_iteration": 2.5804455280303955 + }, + { + "auxiliary_loss_clip": 0.01090656, + "auxiliary_loss_mlp": 0.01036561, + "balance_loss_clip": 1.03421807, + "balance_loss_mlp": 1.02245808, + "epoch": 0.2986321960018037, + "flos": 13041355772160.0, + "grad_norm": 4.701091892787994, + "language_loss": 0.74266207, + "learning_rate": 3.289565352885785e-06, + "loss": 0.76393425, + "num_input_tokens_seen": 106943155, + "step": 4967, + "time_per_iteration": 2.5188803672790527 + }, + { + "auxiliary_loss_clip": 0.01051458, + "auxiliary_loss_mlp": 0.01033391, + "balance_loss_clip": 1.02576876, + "balance_loss_mlp": 1.01953292, + "epoch": 0.29869231925447165, + "flos": 14465034305280.0, + "grad_norm": 1.906624888237815, + "language_loss": 0.7149533, + "learning_rate": 3.2892676371906614e-06, + "loss": 0.73580182, + "num_input_tokens_seen": 106960295, + "step": 4968, + "time_per_iteration": 2.6237199306488037 + }, + { + "auxiliary_loss_clip": 0.01064877, + "auxiliary_loss_mlp": 0.01029282, + "balance_loss_clip": 1.02785158, + "balance_loss_mlp": 1.01406479, + "epoch": 0.2987524425071396, + "flos": 31650228639360.0, + "grad_norm": 1.8972341487253486, + "language_loss": 0.76650679, + "learning_rate": 3.2889698726063805e-06, + "loss": 0.78744841, + "num_input_tokens_seen": 106982870, + "step": 4969, + "time_per_iteration": 2.7248971462249756 + }, + { + "auxiliary_loss_clip": 0.01087162, + "auxiliary_loss_mlp": 0.01032284, + "balance_loss_clip": 1.03168964, + "balance_loss_mlp": 1.01896834, + "epoch": 0.2988125657598076, + "flos": 21433427775360.0, + "grad_norm": 1.66986849127607, + "language_loss": 0.70019329, + "learning_rate": 3.2886720591442327e-06, + "loss": 0.72138774, + "num_input_tokens_seen": 107002405, + "step": 4970, + "time_per_iteration": 2.583527088165283 + }, + { + "auxiliary_loss_clip": 0.01078995, + "auxiliary_loss_mlp": 0.01038761, + "balance_loss_clip": 1.03008354, + "balance_loss_mlp": 1.02265537, + "epoch": 0.2988726890124756, + "flos": 18076965292800.0, + "grad_norm": 2.2203338527942087, + "language_loss": 0.84815466, + "learning_rate": 3.2883741968155103e-06, + "loss": 0.86933219, + "num_input_tokens_seen": 107017310, + "step": 4971, + "time_per_iteration": 2.533015012741089 + }, + { + "auxiliary_loss_clip": 0.01056706, + "auxiliary_loss_mlp": 0.01042229, + "balance_loss_clip": 1.02870715, + "balance_loss_mlp": 1.0264461, + "epoch": 0.29893281226514357, + "flos": 21755653706880.0, + "grad_norm": 1.697677505652734, + "language_loss": 0.79584908, + "learning_rate": 3.2880762856315107e-06, + "loss": 0.8168385, + "num_input_tokens_seen": 107034645, + "step": 4972, + "time_per_iteration": 2.6595778465270996 + }, + { + "auxiliary_loss_clip": 0.01088195, + "auxiliary_loss_mlp": 0.01041214, + "balance_loss_clip": 1.03136909, + "balance_loss_mlp": 1.02611029, + "epoch": 0.29899293551781153, + "flos": 16836718538880.0, + "grad_norm": 10.524367581645208, + "language_loss": 0.85013592, + "learning_rate": 3.2877783256035285e-06, + "loss": 0.87143004, + "num_input_tokens_seen": 107051125, + "step": 4973, + "time_per_iteration": 2.6408658027648926 + }, + { + "auxiliary_loss_clip": 0.0105706, + "auxiliary_loss_mlp": 0.01032195, + "balance_loss_clip": 1.03007662, + "balance_loss_mlp": 1.01797938, + "epoch": 0.2990530587704795, + "flos": 11729215946880.0, + "grad_norm": 1.5968510820098558, + "language_loss": 0.77138245, + "learning_rate": 3.287480316742863e-06, + "loss": 0.79227507, + "num_input_tokens_seen": 107068815, + "step": 4974, + "time_per_iteration": 2.6988346576690674 + }, + { + "auxiliary_loss_clip": 0.01060415, + "auxiliary_loss_mlp": 0.00748959, + "balance_loss_clip": 1.0293982, + "balance_loss_mlp": 1.00204885, + "epoch": 0.29911318202314746, + "flos": 28039877850240.0, + "grad_norm": 1.6831101826088637, + "language_loss": 0.72151554, + "learning_rate": 3.287182259060815e-06, + "loss": 0.73960924, + "num_input_tokens_seen": 107090420, + "step": 4975, + "time_per_iteration": 2.720832109451294 + }, + { + "auxiliary_loss_clip": 0.0108185, + "auxiliary_loss_mlp": 0.01036862, + "balance_loss_clip": 1.03584099, + "balance_loss_mlp": 1.02199686, + "epoch": 0.2991733052758154, + "flos": 18733555952640.0, + "grad_norm": 2.9054497832594297, + "language_loss": 0.76104832, + "learning_rate": 3.286884152568687e-06, + "loss": 0.78223538, + "num_input_tokens_seen": 107107255, + "step": 4976, + "time_per_iteration": 2.7758917808532715 + }, + { + "auxiliary_loss_clip": 0.01074572, + "auxiliary_loss_mlp": 0.01037852, + "balance_loss_clip": 1.03084803, + "balance_loss_mlp": 1.02397013, + "epoch": 0.2992334285284834, + "flos": 15559160532480.0, + "grad_norm": 2.1362300248924715, + "language_loss": 0.86178088, + "learning_rate": 3.2865859972777827e-06, + "loss": 0.88290507, + "num_input_tokens_seen": 107123840, + "step": 4977, + "time_per_iteration": 2.6745309829711914 + }, + { + "auxiliary_loss_clip": 0.01067176, + "auxiliary_loss_mlp": 0.01036363, + "balance_loss_clip": 1.03174376, + "balance_loss_mlp": 1.02195621, + "epoch": 0.29929355178115136, + "flos": 21797561900160.0, + "grad_norm": 1.6930735479835728, + "language_loss": 0.68925464, + "learning_rate": 3.2862877931994088e-06, + "loss": 0.71029001, + "num_input_tokens_seen": 107143475, + "step": 4978, + "time_per_iteration": 2.7631494998931885 + }, + { + "auxiliary_loss_clip": 0.0107234, + "auxiliary_loss_mlp": 0.01035036, + "balance_loss_clip": 1.03476381, + "balance_loss_mlp": 1.01943159, + "epoch": 0.2993536750338193, + "flos": 21178533888000.0, + "grad_norm": 2.072964222271676, + "language_loss": 0.76288521, + "learning_rate": 3.2859895403448726e-06, + "loss": 0.78395891, + "num_input_tokens_seen": 107161725, + "step": 4979, + "time_per_iteration": 2.7493298053741455 + }, + { + "auxiliary_loss_clip": 0.0103183, + "auxiliary_loss_mlp": 0.01041954, + "balance_loss_clip": 1.0244019, + "balance_loss_mlp": 1.02571774, + "epoch": 0.2994137982864873, + "flos": 32122130544000.0, + "grad_norm": 1.815083776010309, + "language_loss": 0.68671095, + "learning_rate": 3.285691238725484e-06, + "loss": 0.70744878, + "num_input_tokens_seen": 107183935, + "step": 4980, + "time_per_iteration": 2.7930829524993896 + }, + { + "auxiliary_loss_clip": 0.01076188, + "auxiliary_loss_mlp": 0.00748993, + "balance_loss_clip": 1.03301859, + "balance_loss_mlp": 1.00199628, + "epoch": 0.29947392153915525, + "flos": 21105419754240.0, + "grad_norm": 2.129286137417993, + "language_loss": 0.73273301, + "learning_rate": 3.285392888352555e-06, + "loss": 0.75098479, + "num_input_tokens_seen": 107204285, + "step": 4981, + "time_per_iteration": 2.7182087898254395 + }, + { + "auxiliary_loss_clip": 0.01079934, + "auxiliary_loss_mlp": 0.01042062, + "balance_loss_clip": 1.0310874, + "balance_loss_mlp": 1.02743554, + "epoch": 0.2995340447918232, + "flos": 21542632099200.0, + "grad_norm": 1.5287046437821439, + "language_loss": 0.86242425, + "learning_rate": 3.2850944892373987e-06, + "loss": 0.88364422, + "num_input_tokens_seen": 107225265, + "step": 4982, + "time_per_iteration": 2.5847055912017822 + }, + { + "auxiliary_loss_clip": 0.01068161, + "auxiliary_loss_mlp": 0.01035982, + "balance_loss_clip": 1.03179085, + "balance_loss_mlp": 1.01972175, + "epoch": 0.2995941680444912, + "flos": 16725143917440.0, + "grad_norm": 2.231598150594055, + "language_loss": 0.85960454, + "learning_rate": 3.2847960413913307e-06, + "loss": 0.88064599, + "num_input_tokens_seen": 107241335, + "step": 4983, + "time_per_iteration": 2.6265745162963867 + }, + { + "auxiliary_loss_clip": 0.0108086, + "auxiliary_loss_mlp": 0.01043253, + "balance_loss_clip": 1.03668118, + "balance_loss_mlp": 1.02914441, + "epoch": 0.2996542912971592, + "flos": 20923496346240.0, + "grad_norm": 2.5477583885762582, + "language_loss": 0.78751642, + "learning_rate": 3.284497544825668e-06, + "loss": 0.80875754, + "num_input_tokens_seen": 107259375, + "step": 4984, + "time_per_iteration": 2.5727505683898926 + }, + { + "auxiliary_loss_clip": 0.01056744, + "auxiliary_loss_mlp": 0.01042385, + "balance_loss_clip": 1.02986026, + "balance_loss_mlp": 1.02586246, + "epoch": 0.29971441454982717, + "flos": 25079868754560.0, + "grad_norm": 1.632740434001135, + "language_loss": 0.78259248, + "learning_rate": 3.2841989995517303e-06, + "loss": 0.8035838, + "num_input_tokens_seen": 107279890, + "step": 4985, + "time_per_iteration": 2.6524298191070557 + }, + { + "auxiliary_loss_clip": 0.01027428, + "auxiliary_loss_mlp": 0.01042699, + "balance_loss_clip": 1.02545369, + "balance_loss_mlp": 1.02459073, + "epoch": 0.29977453780249513, + "flos": 52555911840000.0, + "grad_norm": 1.991370296533481, + "language_loss": 0.71703184, + "learning_rate": 3.283900405580837e-06, + "loss": 0.73773313, + "num_input_tokens_seen": 107303430, + "step": 4986, + "time_per_iteration": 2.9838643074035645 + }, + { + "auxiliary_loss_clip": 0.0107036, + "auxiliary_loss_mlp": 0.01041087, + "balance_loss_clip": 1.03113508, + "balance_loss_mlp": 1.02569699, + "epoch": 0.2998346610551631, + "flos": 22237144542720.0, + "grad_norm": 1.9103285795733014, + "language_loss": 0.73203409, + "learning_rate": 3.283601762924312e-06, + "loss": 0.75314856, + "num_input_tokens_seen": 107323700, + "step": 4987, + "time_per_iteration": 2.6529948711395264 + }, + { + "auxiliary_loss_clip": 0.01062835, + "auxiliary_loss_mlp": 0.0103807, + "balance_loss_clip": 1.0326283, + "balance_loss_mlp": 1.02347302, + "epoch": 0.29989478430783106, + "flos": 16873203778560.0, + "grad_norm": 1.9315142208821756, + "language_loss": 0.80079532, + "learning_rate": 3.2833030715934793e-06, + "loss": 0.8218044, + "num_input_tokens_seen": 107341965, + "step": 4988, + "time_per_iteration": 2.584357261657715 + }, + { + "auxiliary_loss_clip": 0.01057278, + "auxiliary_loss_mlp": 0.00749132, + "balance_loss_clip": 1.02948534, + "balance_loss_mlp": 1.0021584, + "epoch": 0.29995490756049903, + "flos": 23768878164480.0, + "grad_norm": 1.4614990977980165, + "language_loss": 0.70617998, + "learning_rate": 3.2830043315996658e-06, + "loss": 0.72424406, + "num_input_tokens_seen": 107362615, + "step": 4989, + "time_per_iteration": 2.69024395942688 + }, + { + "auxiliary_loss_clip": 0.01061375, + "auxiliary_loss_mlp": 0.01038979, + "balance_loss_clip": 1.035043, + "balance_loss_mlp": 1.02286232, + "epoch": 0.300015030813167, + "flos": 14465321614080.0, + "grad_norm": 1.9221435529705102, + "language_loss": 0.85429156, + "learning_rate": 3.282705542954199e-06, + "loss": 0.87529504, + "num_input_tokens_seen": 107378980, + "step": 4990, + "time_per_iteration": 2.7112648487091064 + }, + { + "auxiliary_loss_clip": 0.01078827, + "auxiliary_loss_mlp": 0.01036394, + "balance_loss_clip": 1.03115225, + "balance_loss_mlp": 1.02025342, + "epoch": 0.30007515406583496, + "flos": 25191982080000.0, + "grad_norm": 1.8173121984596958, + "language_loss": 0.66949391, + "learning_rate": 3.28240670566841e-06, + "loss": 0.69064617, + "num_input_tokens_seen": 107397640, + "step": 4991, + "time_per_iteration": 4.244549751281738 + }, + { + "auxiliary_loss_clip": 0.01064279, + "auxiliary_loss_mlp": 0.01037477, + "balance_loss_clip": 1.02848864, + "balance_loss_mlp": 1.0205133, + "epoch": 0.3001352773185029, + "flos": 19391188106880.0, + "grad_norm": 2.4788523549188213, + "language_loss": 0.79012394, + "learning_rate": 3.28210781975363e-06, + "loss": 0.81114149, + "num_input_tokens_seen": 107416020, + "step": 4992, + "time_per_iteration": 2.766953945159912 + }, + { + "auxiliary_loss_clip": 0.01088983, + "auxiliary_loss_mlp": 0.01039828, + "balance_loss_clip": 1.03251195, + "balance_loss_mlp": 1.02493894, + "epoch": 0.3001954005711709, + "flos": 21543853161600.0, + "grad_norm": 1.9941497802695323, + "language_loss": 0.82599235, + "learning_rate": 3.281808885221193e-06, + "loss": 0.84728044, + "num_input_tokens_seen": 107436340, + "step": 4993, + "time_per_iteration": 2.5853781700134277 + }, + { + "auxiliary_loss_clip": 0.01046073, + "auxiliary_loss_mlp": 0.01044264, + "balance_loss_clip": 1.02808571, + "balance_loss_mlp": 1.02753878, + "epoch": 0.30025552382383885, + "flos": 17384320356480.0, + "grad_norm": 2.109933507090898, + "language_loss": 0.85820484, + "learning_rate": 3.2815099020824345e-06, + "loss": 0.87910819, + "num_input_tokens_seen": 107454585, + "step": 4994, + "time_per_iteration": 4.319659948348999 + }, + { + "auxiliary_loss_clip": 0.01063616, + "auxiliary_loss_mlp": 0.01039514, + "balance_loss_clip": 1.03563583, + "balance_loss_mlp": 1.02423096, + "epoch": 0.3003156470765068, + "flos": 29533330552320.0, + "grad_norm": 1.9406581452653642, + "language_loss": 0.80766439, + "learning_rate": 3.2812108703486924e-06, + "loss": 0.82869571, + "num_input_tokens_seen": 107477180, + "step": 4995, + "time_per_iteration": 2.7279911041259766 + }, + { + "auxiliary_loss_clip": 0.01062889, + "auxiliary_loss_mlp": 0.01034631, + "balance_loss_clip": 1.03084731, + "balance_loss_mlp": 1.01995015, + "epoch": 0.3003757703291748, + "flos": 43646402465280.0, + "grad_norm": 1.6781585780403518, + "language_loss": 0.67367828, + "learning_rate": 3.2809117900313055e-06, + "loss": 0.69465351, + "num_input_tokens_seen": 107500250, + "step": 4996, + "time_per_iteration": 2.9691550731658936 + }, + { + "auxiliary_loss_clip": 0.01066677, + "auxiliary_loss_mlp": 0.010354, + "balance_loss_clip": 1.03046584, + "balance_loss_mlp": 1.02080917, + "epoch": 0.30043589358184275, + "flos": 22528380015360.0, + "grad_norm": 1.7999797889231466, + "language_loss": 0.75369459, + "learning_rate": 3.280612661141615e-06, + "loss": 0.7747153, + "num_input_tokens_seen": 107520070, + "step": 4997, + "time_per_iteration": 2.6624016761779785 + }, + { + "auxiliary_loss_clip": 0.01078188, + "auxiliary_loss_mlp": 0.01041693, + "balance_loss_clip": 1.03169918, + "balance_loss_mlp": 1.02822208, + "epoch": 0.30049601683451077, + "flos": 20995892208000.0, + "grad_norm": 1.84400275589647, + "language_loss": 0.77758819, + "learning_rate": 3.2803134836909646e-06, + "loss": 0.798787, + "num_input_tokens_seen": 107539285, + "step": 4998, + "time_per_iteration": 2.6665589809417725 + }, + { + "auxiliary_loss_clip": 0.01086101, + "auxiliary_loss_mlp": 0.01038233, + "balance_loss_clip": 1.03198791, + "balance_loss_mlp": 1.02434564, + "epoch": 0.30055614008717874, + "flos": 23916004272000.0, + "grad_norm": 2.5886124209540466, + "language_loss": 0.73188174, + "learning_rate": 3.2800142576906985e-06, + "loss": 0.75312507, + "num_input_tokens_seen": 107560260, + "step": 4999, + "time_per_iteration": 2.62945294380188 + }, + { + "auxiliary_loss_clip": 0.01076537, + "auxiliary_loss_mlp": 0.01038819, + "balance_loss_clip": 1.03056145, + "balance_loss_mlp": 1.02432907, + "epoch": 0.3006162633398467, + "flos": 19169798630400.0, + "grad_norm": 1.601114900592911, + "language_loss": 0.75693345, + "learning_rate": 3.2797149831521626e-06, + "loss": 0.77808696, + "num_input_tokens_seen": 107579260, + "step": 5000, + "time_per_iteration": 2.643503189086914 + }, + { + "auxiliary_loss_clip": 0.01087647, + "auxiliary_loss_mlp": 0.0103495, + "balance_loss_clip": 1.03300762, + "balance_loss_mlp": 1.02169359, + "epoch": 0.30067638659251467, + "flos": 14679241061760.0, + "grad_norm": 1.8245933202787432, + "language_loss": 0.82513189, + "learning_rate": 3.2794156600867073e-06, + "loss": 0.84635782, + "num_input_tokens_seen": 107595245, + "step": 5001, + "time_per_iteration": 2.5010592937469482 + }, + { + "auxiliary_loss_clip": 0.01077703, + "auxiliary_loss_mlp": 0.01042801, + "balance_loss_clip": 1.03160095, + "balance_loss_mlp": 1.02741694, + "epoch": 0.30073650984518263, + "flos": 23368007404800.0, + "grad_norm": 1.5407634419615437, + "language_loss": 0.80771339, + "learning_rate": 3.2791162885056815e-06, + "loss": 0.82891846, + "num_input_tokens_seen": 107613985, + "step": 5002, + "time_per_iteration": 5.848532438278198 + }, + { + "auxiliary_loss_clip": 0.01041906, + "auxiliary_loss_mlp": 0.01038439, + "balance_loss_clip": 1.03009462, + "balance_loss_mlp": 1.02251291, + "epoch": 0.3007966330978506, + "flos": 22966633854720.0, + "grad_norm": 1.8172001927426997, + "language_loss": 0.71000719, + "learning_rate": 3.2788168684204376e-06, + "loss": 0.73081064, + "num_input_tokens_seen": 107631435, + "step": 5003, + "time_per_iteration": 2.788020372390747 + }, + { + "auxiliary_loss_clip": 0.01060504, + "auxiliary_loss_mlp": 0.01043379, + "balance_loss_clip": 1.03182626, + "balance_loss_mlp": 1.02831078, + "epoch": 0.30085675635051856, + "flos": 27818452460160.0, + "grad_norm": 1.810836673976906, + "language_loss": 0.70122349, + "learning_rate": 3.27851739984233e-06, + "loss": 0.72226232, + "num_input_tokens_seen": 107650530, + "step": 5004, + "time_per_iteration": 2.8093667030334473 + }, + { + "auxiliary_loss_clip": 0.01071327, + "auxiliary_loss_mlp": 0.01044212, + "balance_loss_clip": 1.0328629, + "balance_loss_mlp": 1.02847064, + "epoch": 0.3009168796031865, + "flos": 10882729059840.0, + "grad_norm": 4.776053673550562, + "language_loss": 0.81549126, + "learning_rate": 3.278217882782715e-06, + "loss": 0.83664668, + "num_input_tokens_seen": 107662240, + "step": 5005, + "time_per_iteration": 2.7045552730560303 + }, + { + "auxiliary_loss_clip": 0.01077653, + "auxiliary_loss_mlp": 0.01037043, + "balance_loss_clip": 1.0322001, + "balance_loss_mlp": 1.02282774, + "epoch": 0.3009770028558545, + "flos": 23805399317760.0, + "grad_norm": 2.8600633276772376, + "language_loss": 0.74955553, + "learning_rate": 3.2779183172529497e-06, + "loss": 0.77070254, + "num_input_tokens_seen": 107680330, + "step": 5006, + "time_per_iteration": 2.650881290435791 + }, + { + "auxiliary_loss_clip": 0.01054729, + "auxiliary_loss_mlp": 0.00748772, + "balance_loss_clip": 1.03081036, + "balance_loss_mlp": 1.00164485, + "epoch": 0.30103712610852246, + "flos": 26468211283200.0, + "grad_norm": 1.8944125466817059, + "language_loss": 0.71556973, + "learning_rate": 3.2776187032643932e-06, + "loss": 0.73360479, + "num_input_tokens_seen": 107700020, + "step": 5007, + "time_per_iteration": 2.817601442337036 + }, + { + "auxiliary_loss_clip": 0.01077668, + "auxiliary_loss_mlp": 0.01041432, + "balance_loss_clip": 1.03174067, + "balance_loss_mlp": 1.02520776, + "epoch": 0.3010972493611904, + "flos": 22856459863680.0, + "grad_norm": 2.1567293726280057, + "language_loss": 0.76140499, + "learning_rate": 3.2773190408284075e-06, + "loss": 0.78259593, + "num_input_tokens_seen": 107718575, + "step": 5008, + "time_per_iteration": 2.7434914112091064 + }, + { + "auxiliary_loss_clip": 0.01079286, + "auxiliary_loss_mlp": 0.01036612, + "balance_loss_clip": 1.03293324, + "balance_loss_mlp": 1.02184176, + "epoch": 0.3011573726138584, + "flos": 24053685102720.0, + "grad_norm": 1.8379658240750014, + "language_loss": 0.84337938, + "learning_rate": 3.2770193299563564e-06, + "loss": 0.86453843, + "num_input_tokens_seen": 107738635, + "step": 5009, + "time_per_iteration": 2.682504177093506 + }, + { + "auxiliary_loss_clip": 0.01081921, + "auxiliary_loss_mlp": 0.01038924, + "balance_loss_clip": 1.03149223, + "balance_loss_mlp": 1.02204418, + "epoch": 0.30121749586652635, + "flos": 20259687052800.0, + "grad_norm": 2.158528351657574, + "language_loss": 0.83286667, + "learning_rate": 3.276719570659604e-06, + "loss": 0.85407513, + "num_input_tokens_seen": 107753415, + "step": 5010, + "time_per_iteration": 2.643378734588623 + }, + { + "auxiliary_loss_clip": 0.01058621, + "auxiliary_loss_mlp": 0.01037483, + "balance_loss_clip": 1.03069139, + "balance_loss_mlp": 1.02276015, + "epoch": 0.3012776191191944, + "flos": 26943058103040.0, + "grad_norm": 2.0629454428438962, + "language_loss": 0.85506672, + "learning_rate": 3.2764197629495176e-06, + "loss": 0.8760277, + "num_input_tokens_seen": 107773840, + "step": 5011, + "time_per_iteration": 2.8659026622772217 + }, + { + "auxiliary_loss_clip": 0.01066197, + "auxiliary_loss_mlp": 0.01041309, + "balance_loss_clip": 1.02910006, + "balance_loss_mlp": 1.02532268, + "epoch": 0.30133774237186234, + "flos": 20412307941120.0, + "grad_norm": 1.9235790868953873, + "language_loss": 0.72159797, + "learning_rate": 3.2761199068374656e-06, + "loss": 0.74267304, + "num_input_tokens_seen": 107792020, + "step": 5012, + "time_per_iteration": 2.7224137783050537 + }, + { + "auxiliary_loss_clip": 0.01079545, + "auxiliary_loss_mlp": 0.01039529, + "balance_loss_clip": 1.03234267, + "balance_loss_mlp": 1.02427602, + "epoch": 0.3013978656245303, + "flos": 19792453916160.0, + "grad_norm": 2.12297233968524, + "language_loss": 0.87624037, + "learning_rate": 3.275820002334819e-06, + "loss": 0.89743114, + "num_input_tokens_seen": 107809595, + "step": 5013, + "time_per_iteration": 2.621962785720825 + }, + { + "auxiliary_loss_clip": 0.01055335, + "auxiliary_loss_mlp": 0.01044457, + "balance_loss_clip": 1.02720511, + "balance_loss_mlp": 1.02596796, + "epoch": 0.30145798887719827, + "flos": 16249650652800.0, + "grad_norm": 2.5867139929977827, + "language_loss": 0.83082289, + "learning_rate": 3.2755200494529496e-06, + "loss": 0.85182083, + "num_input_tokens_seen": 107827230, + "step": 5014, + "time_per_iteration": 2.69513201713562 + }, + { + "auxiliary_loss_clip": 0.01045837, + "auxiliary_loss_mlp": 0.01042002, + "balance_loss_clip": 1.02808785, + "balance_loss_mlp": 1.02590859, + "epoch": 0.30151811212986623, + "flos": 24571733005440.0, + "grad_norm": 1.6198066572627432, + "language_loss": 0.68427336, + "learning_rate": 3.2752200482032323e-06, + "loss": 0.7051518, + "num_input_tokens_seen": 107847195, + "step": 5015, + "time_per_iteration": 2.8561787605285645 + }, + { + "auxiliary_loss_clip": 0.01065835, + "auxiliary_loss_mlp": 0.01042268, + "balance_loss_clip": 1.03016269, + "balance_loss_mlp": 1.02725351, + "epoch": 0.3015782353825342, + "flos": 21872076664320.0, + "grad_norm": 2.4773376913475293, + "language_loss": 0.74648166, + "learning_rate": 3.2749199985970436e-06, + "loss": 0.76756263, + "num_input_tokens_seen": 107866420, + "step": 5016, + "time_per_iteration": 2.6756770610809326 + }, + { + "auxiliary_loss_clip": 0.01079551, + "auxiliary_loss_mlp": 0.01040072, + "balance_loss_clip": 1.03132081, + "balance_loss_mlp": 1.02450943, + "epoch": 0.30163835863520216, + "flos": 28769331248640.0, + "grad_norm": 1.483716555838727, + "language_loss": 0.65546292, + "learning_rate": 3.2746199006457603e-06, + "loss": 0.67665911, + "num_input_tokens_seen": 107889090, + "step": 5017, + "time_per_iteration": 2.7968618869781494 + }, + { + "auxiliary_loss_clip": 0.01035737, + "auxiliary_loss_mlp": 0.01050003, + "balance_loss_clip": 1.02671349, + "balance_loss_mlp": 1.03330207, + "epoch": 0.30169848188787013, + "flos": 22966202891520.0, + "grad_norm": 2.3000503468448925, + "language_loss": 0.69174254, + "learning_rate": 3.2743197543607628e-06, + "loss": 0.71259999, + "num_input_tokens_seen": 107907520, + "step": 5018, + "time_per_iteration": 2.674398183822632 + }, + { + "auxiliary_loss_clip": 0.01085253, + "auxiliary_loss_mlp": 0.01040486, + "balance_loss_clip": 1.03126776, + "balance_loss_mlp": 1.02680087, + "epoch": 0.3017586051405381, + "flos": 21835268202240.0, + "grad_norm": 2.2654537690632024, + "language_loss": 0.78947866, + "learning_rate": 3.2740195597534327e-06, + "loss": 0.81073606, + "num_input_tokens_seen": 107925650, + "step": 5019, + "time_per_iteration": 2.5684094429016113 + }, + { + "auxiliary_loss_clip": 0.01069344, + "auxiliary_loss_mlp": 0.0103853, + "balance_loss_clip": 1.03336966, + "balance_loss_mlp": 1.02406406, + "epoch": 0.30181872839320606, + "flos": 22160403135360.0, + "grad_norm": 2.013298080333651, + "language_loss": 0.69777435, + "learning_rate": 3.2737193168351527e-06, + "loss": 0.71885312, + "num_input_tokens_seen": 107943975, + "step": 5020, + "time_per_iteration": 2.712982416152954 + }, + { + "auxiliary_loss_clip": 0.01092684, + "auxiliary_loss_mlp": 0.01043436, + "balance_loss_clip": 1.03264236, + "balance_loss_mlp": 1.02849865, + "epoch": 0.301878851645874, + "flos": 18114168804480.0, + "grad_norm": 2.0032374020660075, + "language_loss": 0.78549719, + "learning_rate": 3.2734190256173085e-06, + "loss": 0.80685842, + "num_input_tokens_seen": 107962950, + "step": 5021, + "time_per_iteration": 2.6435067653656006 + }, + { + "auxiliary_loss_clip": 0.01075987, + "auxiliary_loss_mlp": 0.01032833, + "balance_loss_clip": 1.02875245, + "balance_loss_mlp": 1.01851606, + "epoch": 0.301938974898542, + "flos": 17602226213760.0, + "grad_norm": 2.3210569222607917, + "language_loss": 0.75749683, + "learning_rate": 3.2731186861112877e-06, + "loss": 0.77858508, + "num_input_tokens_seen": 107979700, + "step": 5022, + "time_per_iteration": 2.5518462657928467 + }, + { + "auxiliary_loss_clip": 0.01089562, + "auxiliary_loss_mlp": 0.01041882, + "balance_loss_clip": 1.03120017, + "balance_loss_mlp": 1.02649188, + "epoch": 0.30199909815120995, + "flos": 11181219079680.0, + "grad_norm": 1.6882110618998012, + "language_loss": 0.69702983, + "learning_rate": 3.2728182983284793e-06, + "loss": 0.71834433, + "num_input_tokens_seen": 107996645, + "step": 5023, + "time_per_iteration": 2.5881316661834717 + }, + { + "auxiliary_loss_clip": 0.0106022, + "auxiliary_loss_mlp": 0.01035704, + "balance_loss_clip": 1.02858424, + "balance_loss_mlp": 1.02113616, + "epoch": 0.302059221403878, + "flos": 21907843632000.0, + "grad_norm": 1.9820550278692424, + "language_loss": 0.71750224, + "learning_rate": 3.2725178622802724e-06, + "loss": 0.73846143, + "num_input_tokens_seen": 108015020, + "step": 5024, + "time_per_iteration": 2.6356678009033203 + }, + { + "auxiliary_loss_clip": 0.01077379, + "auxiliary_loss_mlp": 0.0104371, + "balance_loss_clip": 1.03249168, + "balance_loss_mlp": 1.02867186, + "epoch": 0.30211934465654594, + "flos": 26396390039040.0, + "grad_norm": 1.6467667802438308, + "language_loss": 0.74342555, + "learning_rate": 3.272217377978061e-06, + "loss": 0.7646364, + "num_input_tokens_seen": 108036430, + "step": 5025, + "time_per_iteration": 2.70235276222229 + }, + { + "auxiliary_loss_clip": 0.01078555, + "auxiliary_loss_mlp": 0.01040377, + "balance_loss_clip": 1.03311419, + "balance_loss_mlp": 1.02626908, + "epoch": 0.3021794679092139, + "flos": 23400470321280.0, + "grad_norm": 1.5723754582598763, + "language_loss": 0.67458552, + "learning_rate": 3.2719168454332387e-06, + "loss": 0.69577479, + "num_input_tokens_seen": 108054250, + "step": 5026, + "time_per_iteration": 2.746708393096924 + }, + { + "auxiliary_loss_clip": 0.010794, + "auxiliary_loss_mlp": 0.01040366, + "balance_loss_clip": 1.03324544, + "balance_loss_mlp": 1.02575064, + "epoch": 0.30223959116188187, + "flos": 20260979942400.0, + "grad_norm": 1.7364023479494484, + "language_loss": 0.84982425, + "learning_rate": 3.2716162646572034e-06, + "loss": 0.87102187, + "num_input_tokens_seen": 108071495, + "step": 5027, + "time_per_iteration": 2.5817174911499023 + }, + { + "auxiliary_loss_clip": 0.01067952, + "auxiliary_loss_mlp": 0.0103959, + "balance_loss_clip": 1.03553855, + "balance_loss_mlp": 1.0254283, + "epoch": 0.30229971441454984, + "flos": 26687840993280.0, + "grad_norm": 1.5016850835833393, + "language_loss": 0.78442192, + "learning_rate": 3.271315635661351e-06, + "loss": 0.80549735, + "num_input_tokens_seen": 108092135, + "step": 5028, + "time_per_iteration": 2.7219247817993164 + }, + { + "auxiliary_loss_clip": 0.01063902, + "auxiliary_loss_mlp": 0.01043233, + "balance_loss_clip": 1.03003573, + "balance_loss_mlp": 1.0273068, + "epoch": 0.3023598376672178, + "flos": 34345323953280.0, + "grad_norm": 1.7587403117853935, + "language_loss": 0.76811975, + "learning_rate": 3.2710149584570826e-06, + "loss": 0.78919113, + "num_input_tokens_seen": 108112945, + "step": 5029, + "time_per_iteration": 2.9248948097229004 + }, + { + "auxiliary_loss_clip": 0.0105777, + "auxiliary_loss_mlp": 0.0103993, + "balance_loss_clip": 1.03048491, + "balance_loss_mlp": 1.02360404, + "epoch": 0.30241996091988577, + "flos": 23112143850240.0, + "grad_norm": 4.958043344181292, + "language_loss": 0.82390344, + "learning_rate": 3.2707142330557993e-06, + "loss": 0.84488046, + "num_input_tokens_seen": 108130325, + "step": 5030, + "time_per_iteration": 2.682079315185547 + }, + { + "auxiliary_loss_clip": 0.01041107, + "auxiliary_loss_mlp": 0.00748726, + "balance_loss_clip": 1.02937376, + "balance_loss_mlp": 1.00159669, + "epoch": 0.30248008417255373, + "flos": 19390002958080.0, + "grad_norm": 1.9036398205401974, + "language_loss": 0.69589871, + "learning_rate": 3.270413459468905e-06, + "loss": 0.71379703, + "num_input_tokens_seen": 108150300, + "step": 5031, + "time_per_iteration": 2.7681081295013428 + }, + { + "auxiliary_loss_clip": 0.01068413, + "auxiliary_loss_mlp": 0.01036484, + "balance_loss_clip": 1.02914548, + "balance_loss_mlp": 1.02142835, + "epoch": 0.3025402074252217, + "flos": 23769704177280.0, + "grad_norm": 1.6851174690194508, + "language_loss": 0.82292461, + "learning_rate": 3.2701126377078047e-06, + "loss": 0.84397364, + "num_input_tokens_seen": 108170330, + "step": 5032, + "time_per_iteration": 2.6151461601257324 + }, + { + "auxiliary_loss_clip": 0.01052307, + "auxiliary_loss_mlp": 0.0104475, + "balance_loss_clip": 1.03244019, + "balance_loss_mlp": 1.02769136, + "epoch": 0.30260033067788966, + "flos": 25994118648960.0, + "grad_norm": 2.3028008551513524, + "language_loss": 0.73215926, + "learning_rate": 3.269811767783906e-06, + "loss": 0.7531299, + "num_input_tokens_seen": 108191265, + "step": 5033, + "time_per_iteration": 2.7618823051452637 + }, + { + "auxiliary_loss_clip": 0.01078373, + "auxiliary_loss_mlp": 0.01046389, + "balance_loss_clip": 1.03212714, + "balance_loss_mlp": 1.03054643, + "epoch": 0.3026604539305576, + "flos": 25374551932800.0, + "grad_norm": 1.3657990570123573, + "language_loss": 0.73896635, + "learning_rate": 3.2695108497086185e-06, + "loss": 0.76021391, + "num_input_tokens_seen": 108211615, + "step": 5034, + "time_per_iteration": 2.6732802391052246 + }, + { + "auxiliary_loss_clip": 0.01089263, + "auxiliary_loss_mlp": 0.01033561, + "balance_loss_clip": 1.03196073, + "balance_loss_mlp": 1.01867175, + "epoch": 0.3027205771832256, + "flos": 25812733944960.0, + "grad_norm": 1.8122382872328442, + "language_loss": 0.71894419, + "learning_rate": 3.269209883493352e-06, + "loss": 0.74017251, + "num_input_tokens_seen": 108231080, + "step": 5035, + "time_per_iteration": 2.689959764480591 + }, + { + "auxiliary_loss_clip": 0.01071735, + "auxiliary_loss_mlp": 0.01037245, + "balance_loss_clip": 1.02874148, + "balance_loss_mlp": 1.02328515, + "epoch": 0.30278070043589356, + "flos": 27344539393920.0, + "grad_norm": 2.403746742298767, + "language_loss": 0.87371254, + "learning_rate": 3.2689088691495196e-06, + "loss": 0.89480233, + "num_input_tokens_seen": 108251125, + "step": 5036, + "time_per_iteration": 2.664863109588623 + }, + { + "auxiliary_loss_clip": 0.01053034, + "auxiliary_loss_mlp": 0.01048471, + "balance_loss_clip": 1.02933121, + "balance_loss_mlp": 1.03230667, + "epoch": 0.3028408236885616, + "flos": 24786227070720.0, + "grad_norm": 3.6661213252194673, + "language_loss": 0.77463448, + "learning_rate": 3.268607806688536e-06, + "loss": 0.79564953, + "num_input_tokens_seen": 108272545, + "step": 5037, + "time_per_iteration": 2.794593334197998 + }, + { + "auxiliary_loss_clip": 0.01045292, + "auxiliary_loss_mlp": 0.01040832, + "balance_loss_clip": 1.02698088, + "balance_loss_mlp": 1.02461958, + "epoch": 0.30290094694122954, + "flos": 12932474670720.0, + "grad_norm": 3.666521635964655, + "language_loss": 0.77472103, + "learning_rate": 3.268306696121816e-06, + "loss": 0.79558218, + "num_input_tokens_seen": 108289725, + "step": 5038, + "time_per_iteration": 4.383882761001587 + }, + { + "auxiliary_loss_clip": 0.01062504, + "auxiliary_loss_mlp": 0.01036115, + "balance_loss_clip": 1.02954578, + "balance_loss_mlp": 1.02205455, + "epoch": 0.3029610701938975, + "flos": 25916443488000.0, + "grad_norm": 1.683568159027928, + "language_loss": 0.73755151, + "learning_rate": 3.2680055374607804e-06, + "loss": 0.75853765, + "num_input_tokens_seen": 108310690, + "step": 5039, + "time_per_iteration": 2.6897895336151123 + }, + { + "auxiliary_loss_clip": 0.01086275, + "auxiliary_loss_mlp": 0.00748572, + "balance_loss_clip": 1.03257918, + "balance_loss_mlp": 1.00150287, + "epoch": 0.3030211934465655, + "flos": 21980993679360.0, + "grad_norm": 12.376332156323887, + "language_loss": 0.79919815, + "learning_rate": 3.267704330716847e-06, + "loss": 0.81754661, + "num_input_tokens_seen": 108328905, + "step": 5040, + "time_per_iteration": 2.597628116607666 + }, + { + "auxiliary_loss_clip": 0.01061674, + "auxiliary_loss_mlp": 0.01036626, + "balance_loss_clip": 1.03035593, + "balance_loss_mlp": 1.0222311, + "epoch": 0.30308131669923344, + "flos": 20991977625600.0, + "grad_norm": 2.0069408475201205, + "language_loss": 0.81927013, + "learning_rate": 3.267403075901438e-06, + "loss": 0.84025311, + "num_input_tokens_seen": 108346680, + "step": 5041, + "time_per_iteration": 4.199628114700317 + }, + { + "auxiliary_loss_clip": 0.0099393, + "auxiliary_loss_mlp": 0.01012099, + "balance_loss_clip": 1.01772296, + "balance_loss_mlp": 1.01015556, + "epoch": 0.3031414399519014, + "flos": 60548875827840.0, + "grad_norm": 0.7641504635822757, + "language_loss": 0.59453797, + "learning_rate": 3.267101773025978e-06, + "loss": 0.61459827, + "num_input_tokens_seen": 108413885, + "step": 5042, + "time_per_iteration": 3.4268124103546143 + }, + { + "auxiliary_loss_clip": 0.01089866, + "auxiliary_loss_mlp": 0.01033701, + "balance_loss_clip": 1.03265667, + "balance_loss_mlp": 1.01919377, + "epoch": 0.30320156320456937, + "flos": 21907664064000.0, + "grad_norm": 1.68478831501279, + "language_loss": 0.71300292, + "learning_rate": 3.266800422101892e-06, + "loss": 0.73423862, + "num_input_tokens_seen": 108433640, + "step": 5043, + "time_per_iteration": 2.573274612426758 + }, + { + "auxiliary_loss_clip": 0.01048125, + "auxiliary_loss_mlp": 0.01031065, + "balance_loss_clip": 1.0317775, + "balance_loss_mlp": 1.01714146, + "epoch": 0.30326168645723733, + "flos": 21652770176640.0, + "grad_norm": 2.3503952086228477, + "language_loss": 0.69256222, + "learning_rate": 3.266499023140606e-06, + "loss": 0.71335405, + "num_input_tokens_seen": 108452640, + "step": 5044, + "time_per_iteration": 2.688424825668335 + }, + { + "auxiliary_loss_clip": 0.01078664, + "auxiliary_loss_mlp": 0.0103611, + "balance_loss_clip": 1.03312421, + "balance_loss_mlp": 1.02209115, + "epoch": 0.3033218097099053, + "flos": 21871286565120.0, + "grad_norm": 1.3549696600770866, + "language_loss": 0.77239549, + "learning_rate": 3.2661975761535513e-06, + "loss": 0.79354322, + "num_input_tokens_seen": 108472470, + "step": 5045, + "time_per_iteration": 2.608705759048462 + }, + { + "auxiliary_loss_clip": 0.01090364, + "auxiliary_loss_mlp": 0.00748671, + "balance_loss_clip": 1.03291821, + "balance_loss_mlp": 1.00166357, + "epoch": 0.30338193296257326, + "flos": 27089717333760.0, + "grad_norm": 1.6136050266710302, + "language_loss": 0.72324616, + "learning_rate": 3.2658960811521564e-06, + "loss": 0.74163646, + "num_input_tokens_seen": 108493025, + "step": 5046, + "time_per_iteration": 2.5908312797546387 + }, + { + "auxiliary_loss_clip": 0.01077468, + "auxiliary_loss_mlp": 0.01035974, + "balance_loss_clip": 1.03189874, + "balance_loss_mlp": 1.0189209, + "epoch": 0.30344205621524123, + "flos": 19534363718400.0, + "grad_norm": 1.5052349431236793, + "language_loss": 0.80864203, + "learning_rate": 3.2655945381478564e-06, + "loss": 0.82977653, + "num_input_tokens_seen": 108513480, + "step": 5047, + "time_per_iteration": 2.602854013442993 + }, + { + "auxiliary_loss_clip": 0.01028958, + "auxiliary_loss_mlp": 0.01039774, + "balance_loss_clip": 1.02623534, + "balance_loss_mlp": 1.02500415, + "epoch": 0.3035021794679092, + "flos": 23910976368000.0, + "grad_norm": 1.7812297763251033, + "language_loss": 0.72066212, + "learning_rate": 3.265292947152084e-06, + "loss": 0.74134946, + "num_input_tokens_seen": 108533155, + "step": 5048, + "time_per_iteration": 2.830446481704712 + }, + { + "auxiliary_loss_clip": 0.0106744, + "auxiliary_loss_mlp": 0.01031476, + "balance_loss_clip": 1.03091466, + "balance_loss_mlp": 1.01752806, + "epoch": 0.30356230272057716, + "flos": 16143606725760.0, + "grad_norm": 1.8328162635457037, + "language_loss": 0.75588363, + "learning_rate": 3.2649913081762763e-06, + "loss": 0.77687281, + "num_input_tokens_seen": 108551900, + "step": 5049, + "time_per_iteration": 4.228721618652344 + }, + { + "auxiliary_loss_clip": 0.01080112, + "auxiliary_loss_mlp": 0.01036267, + "balance_loss_clip": 1.03175735, + "balance_loss_mlp": 1.02162862, + "epoch": 0.3036224259732452, + "flos": 28914697589760.0, + "grad_norm": 1.7979218167377085, + "language_loss": 0.82045507, + "learning_rate": 3.2646896212318717e-06, + "loss": 0.8416189, + "num_input_tokens_seen": 108574005, + "step": 5050, + "time_per_iteration": 4.246530055999756 + }, + { + "auxiliary_loss_clip": 0.01060892, + "auxiliary_loss_mlp": 0.01039917, + "balance_loss_clip": 1.03296304, + "balance_loss_mlp": 1.02396655, + "epoch": 0.30368254922591315, + "flos": 21105599322240.0, + "grad_norm": 2.3315671738281694, + "language_loss": 0.736696, + "learning_rate": 3.2643878863303106e-06, + "loss": 0.75770414, + "num_input_tokens_seen": 108592715, + "step": 5051, + "time_per_iteration": 2.674319267272949 + }, + { + "auxiliary_loss_clip": 0.01028084, + "auxiliary_loss_mlp": 0.00748625, + "balance_loss_clip": 1.02751243, + "balance_loss_mlp": 1.00158763, + "epoch": 0.3037426724785811, + "flos": 23002293081600.0, + "grad_norm": 2.6769693245781325, + "language_loss": 0.76693958, + "learning_rate": 3.264086103483033e-06, + "loss": 0.78470671, + "num_input_tokens_seen": 108611770, + "step": 5052, + "time_per_iteration": 2.7695846557617188 + }, + { + "auxiliary_loss_clip": 0.01092087, + "auxiliary_loss_mlp": 0.01037047, + "balance_loss_clip": 1.03317988, + "balance_loss_mlp": 1.0223906, + "epoch": 0.3038027957312491, + "flos": 15632705629440.0, + "grad_norm": 1.9563660852203526, + "language_loss": 0.83091724, + "learning_rate": 3.2637842727014836e-06, + "loss": 0.85220861, + "num_input_tokens_seen": 108629070, + "step": 5053, + "time_per_iteration": 2.607168674468994 + }, + { + "auxiliary_loss_clip": 0.01067092, + "auxiliary_loss_mlp": 0.01037716, + "balance_loss_clip": 1.03138912, + "balance_loss_mlp": 1.02271914, + "epoch": 0.30386291898391704, + "flos": 12713994195840.0, + "grad_norm": 1.5827935759604976, + "language_loss": 0.70920646, + "learning_rate": 3.2634823939971083e-06, + "loss": 0.73025459, + "num_input_tokens_seen": 108646315, + "step": 5054, + "time_per_iteration": 2.6163060665130615 + }, + { + "auxiliary_loss_clip": 0.0109187, + "auxiliary_loss_mlp": 0.01039773, + "balance_loss_clip": 1.03455973, + "balance_loss_mlp": 1.02475846, + "epoch": 0.303923042236585, + "flos": 26359437922560.0, + "grad_norm": 3.3727680264013133, + "language_loss": 0.69286484, + "learning_rate": 3.2631804673813545e-06, + "loss": 0.7141813, + "num_input_tokens_seen": 108665920, + "step": 5055, + "time_per_iteration": 2.631932497024536 + }, + { + "auxiliary_loss_clip": 0.01071891, + "auxiliary_loss_mlp": 0.01041556, + "balance_loss_clip": 1.03609157, + "balance_loss_mlp": 1.02574921, + "epoch": 0.30398316548925297, + "flos": 19719232041600.0, + "grad_norm": 2.0438352112728535, + "language_loss": 0.67404568, + "learning_rate": 3.2628784928656707e-06, + "loss": 0.69518018, + "num_input_tokens_seen": 108683485, + "step": 5056, + "time_per_iteration": 2.6654129028320312 + }, + { + "auxiliary_loss_clip": 0.01061103, + "auxiliary_loss_mlp": 0.01038794, + "balance_loss_clip": 1.0302273, + "balance_loss_mlp": 1.02426219, + "epoch": 0.30404328874192094, + "flos": 24239846315520.0, + "grad_norm": 1.6049483946554113, + "language_loss": 0.82119453, + "learning_rate": 3.262576470461507e-06, + "loss": 0.84219348, + "num_input_tokens_seen": 108702700, + "step": 5057, + "time_per_iteration": 2.6968555450439453 + }, + { + "auxiliary_loss_clip": 0.01065253, + "auxiliary_loss_mlp": 0.01034884, + "balance_loss_clip": 1.02968454, + "balance_loss_mlp": 1.01965463, + "epoch": 0.3041034119945889, + "flos": 24498942094080.0, + "grad_norm": 1.9581993811483316, + "language_loss": 0.89028692, + "learning_rate": 3.2622744001803176e-06, + "loss": 0.91128832, + "num_input_tokens_seen": 108721860, + "step": 5058, + "time_per_iteration": 2.699871301651001 + }, + { + "auxiliary_loss_clip": 0.01052089, + "auxiliary_loss_mlp": 0.01039639, + "balance_loss_clip": 1.02897525, + "balance_loss_mlp": 1.02463067, + "epoch": 0.30416353524725687, + "flos": 28288881907200.0, + "grad_norm": 2.477826632083382, + "language_loss": 0.71107972, + "learning_rate": 3.2619722820335564e-06, + "loss": 0.73199695, + "num_input_tokens_seen": 108743215, + "step": 5059, + "time_per_iteration": 2.7245333194732666 + }, + { + "auxiliary_loss_clip": 0.01028768, + "auxiliary_loss_mlp": 0.01036633, + "balance_loss_clip": 1.02649164, + "balance_loss_mlp": 1.02251852, + "epoch": 0.30422365849992483, + "flos": 23660392112640.0, + "grad_norm": 1.6270282644856042, + "language_loss": 0.72882497, + "learning_rate": 3.26167011603268e-06, + "loss": 0.749479, + "num_input_tokens_seen": 108765505, + "step": 5060, + "time_per_iteration": 2.880765676498413 + }, + { + "auxiliary_loss_clip": 0.01091069, + "auxiliary_loss_mlp": 0.01038499, + "balance_loss_clip": 1.03420007, + "balance_loss_mlp": 1.0246942, + "epoch": 0.3042837817525928, + "flos": 22998773548800.0, + "grad_norm": 2.5337610567155404, + "language_loss": 0.76794457, + "learning_rate": 3.2613679021891463e-06, + "loss": 0.78924024, + "num_input_tokens_seen": 108783370, + "step": 5061, + "time_per_iteration": 2.564784049987793 + }, + { + "auxiliary_loss_clip": 0.01050426, + "auxiliary_loss_mlp": 0.01040557, + "balance_loss_clip": 1.03162861, + "balance_loss_mlp": 1.02466595, + "epoch": 0.30434390500526076, + "flos": 22082332924800.0, + "grad_norm": 2.360806705466592, + "language_loss": 0.81977034, + "learning_rate": 3.261065640514415e-06, + "loss": 0.84068012, + "num_input_tokens_seen": 108797430, + "step": 5062, + "time_per_iteration": 2.6822214126586914 + }, + { + "auxiliary_loss_clip": 0.01083198, + "auxiliary_loss_mlp": 0.01035836, + "balance_loss_clip": 1.02882528, + "balance_loss_mlp": 1.02236557, + "epoch": 0.3044040282579287, + "flos": 25483504861440.0, + "grad_norm": 1.797991732737994, + "language_loss": 0.74597478, + "learning_rate": 3.2607633310199483e-06, + "loss": 0.76716506, + "num_input_tokens_seen": 108816945, + "step": 5063, + "time_per_iteration": 2.720654249191284 + }, + { + "auxiliary_loss_clip": 0.01075454, + "auxiliary_loss_mlp": 0.00748651, + "balance_loss_clip": 1.03048396, + "balance_loss_mlp": 1.00185454, + "epoch": 0.30446415151059675, + "flos": 21945478106880.0, + "grad_norm": 1.945224015316786, + "language_loss": 0.84234262, + "learning_rate": 3.26046097371721e-06, + "loss": 0.86058366, + "num_input_tokens_seen": 108836615, + "step": 5064, + "time_per_iteration": 2.670095443725586 + }, + { + "auxiliary_loss_clip": 0.01077298, + "auxiliary_loss_mlp": 0.01034832, + "balance_loss_clip": 1.03169918, + "balance_loss_mlp": 1.01930463, + "epoch": 0.3045242747632647, + "flos": 16435416816000.0, + "grad_norm": 2.5443980676265623, + "language_loss": 0.75349224, + "learning_rate": 3.2601585686176655e-06, + "loss": 0.7746135, + "num_input_tokens_seen": 108855165, + "step": 5065, + "time_per_iteration": 2.5757882595062256 + }, + { + "auxiliary_loss_clip": 0.01066302, + "auxiliary_loss_mlp": 0.01045654, + "balance_loss_clip": 1.03105378, + "balance_loss_mlp": 1.02916694, + "epoch": 0.3045843980159327, + "flos": 31540341957120.0, + "grad_norm": 1.798313644190224, + "language_loss": 0.62265694, + "learning_rate": 3.2598561157327814e-06, + "loss": 0.64377654, + "num_input_tokens_seen": 108874690, + "step": 5066, + "time_per_iteration": 2.6828935146331787 + }, + { + "auxiliary_loss_clip": 0.01063017, + "auxiliary_loss_mlp": 0.01045693, + "balance_loss_clip": 1.03346062, + "balance_loss_mlp": 1.02908683, + "epoch": 0.30464452126860064, + "flos": 17853636481920.0, + "grad_norm": 2.0663367276046976, + "language_loss": 0.82882249, + "learning_rate": 3.2595536150740265e-06, + "loss": 0.84990954, + "num_input_tokens_seen": 108893140, + "step": 5067, + "time_per_iteration": 2.620223045349121 + }, + { + "auxiliary_loss_clip": 0.01089233, + "auxiliary_loss_mlp": 0.01040073, + "balance_loss_clip": 1.03412867, + "balance_loss_mlp": 1.02571964, + "epoch": 0.3047046445212686, + "flos": 20631398947200.0, + "grad_norm": 1.987264344274208, + "language_loss": 0.6310522, + "learning_rate": 3.259251066652873e-06, + "loss": 0.65234518, + "num_input_tokens_seen": 108911880, + "step": 5068, + "time_per_iteration": 2.539015769958496 + }, + { + "auxiliary_loss_clip": 0.01076294, + "auxiliary_loss_mlp": 0.01029969, + "balance_loss_clip": 1.02994728, + "balance_loss_mlp": 1.01546717, + "epoch": 0.3047647677739366, + "flos": 21287594557440.0, + "grad_norm": 1.9632471749512042, + "language_loss": 0.74783266, + "learning_rate": 3.258948470480793e-06, + "loss": 0.76889521, + "num_input_tokens_seen": 108930440, + "step": 5069, + "time_per_iteration": 2.6006181240081787 + }, + { + "auxiliary_loss_clip": 0.01047042, + "auxiliary_loss_mlp": 0.01035629, + "balance_loss_clip": 1.027318, + "balance_loss_mlp": 1.0211153, + "epoch": 0.30482489102660454, + "flos": 20995928121600.0, + "grad_norm": 1.978745504584536, + "language_loss": 0.75648767, + "learning_rate": 3.258645826569261e-06, + "loss": 0.77731442, + "num_input_tokens_seen": 108949125, + "step": 5070, + "time_per_iteration": 2.8809380531311035 + }, + { + "auxiliary_loss_clip": 0.01092879, + "auxiliary_loss_mlp": 0.00748794, + "balance_loss_clip": 1.03345704, + "balance_loss_mlp": 1.00194204, + "epoch": 0.3048850142792725, + "flos": 26290812988800.0, + "grad_norm": 1.6589159175419657, + "language_loss": 0.81481689, + "learning_rate": 3.2583431349297527e-06, + "loss": 0.83323359, + "num_input_tokens_seen": 108972190, + "step": 5071, + "time_per_iteration": 2.667909622192383 + }, + { + "auxiliary_loss_clip": 0.01054869, + "auxiliary_loss_mlp": 0.01040846, + "balance_loss_clip": 1.02722383, + "balance_loss_mlp": 1.02465773, + "epoch": 0.30494513753194047, + "flos": 22346241125760.0, + "grad_norm": 1.6981928960881703, + "language_loss": 0.76123023, + "learning_rate": 3.2580403955737467e-06, + "loss": 0.78218734, + "num_input_tokens_seen": 108990325, + "step": 5072, + "time_per_iteration": 2.817490339279175 + }, + { + "auxiliary_loss_clip": 0.01054492, + "auxiliary_loss_mlp": 0.01039978, + "balance_loss_clip": 1.03068256, + "balance_loss_mlp": 1.02464771, + "epoch": 0.30500526078460843, + "flos": 19537667769600.0, + "grad_norm": 1.9956798727229341, + "language_loss": 0.7112664, + "learning_rate": 3.257737608512723e-06, + "loss": 0.73221111, + "num_input_tokens_seen": 109009505, + "step": 5073, + "time_per_iteration": 2.790942668914795 + }, + { + "auxiliary_loss_clip": 0.01084651, + "auxiliary_loss_mlp": 0.01043875, + "balance_loss_clip": 1.03432941, + "balance_loss_mlp": 1.0279963, + "epoch": 0.3050653840372764, + "flos": 14465321614080.0, + "grad_norm": 2.272073009272379, + "language_loss": 0.7630657, + "learning_rate": 3.257434773758163e-06, + "loss": 0.78435099, + "num_input_tokens_seen": 109026350, + "step": 5074, + "time_per_iteration": 2.7615251541137695 + }, + { + "auxiliary_loss_clip": 0.01067695, + "auxiliary_loss_mlp": 0.01039482, + "balance_loss_clip": 1.03232169, + "balance_loss_mlp": 1.02543283, + "epoch": 0.30512550728994436, + "flos": 24243796811520.0, + "grad_norm": 2.061890616080411, + "language_loss": 0.74793911, + "learning_rate": 3.25713189132155e-06, + "loss": 0.7690109, + "num_input_tokens_seen": 109044165, + "step": 5075, + "time_per_iteration": 2.681156635284424 + }, + { + "auxiliary_loss_clip": 0.01093776, + "auxiliary_loss_mlp": 0.01040645, + "balance_loss_clip": 1.03421843, + "balance_loss_mlp": 1.02346694, + "epoch": 0.30518563054261233, + "flos": 16360542915840.0, + "grad_norm": 2.1185682510206707, + "language_loss": 0.75547332, + "learning_rate": 3.2568289612143703e-06, + "loss": 0.77681756, + "num_input_tokens_seen": 109060665, + "step": 5076, + "time_per_iteration": 2.5480852127075195 + }, + { + "auxiliary_loss_clip": 0.01066465, + "auxiliary_loss_mlp": 0.01036733, + "balance_loss_clip": 1.03275228, + "balance_loss_mlp": 1.02215385, + "epoch": 0.30524575379528035, + "flos": 21579584215680.0, + "grad_norm": 1.5776879403914574, + "language_loss": 0.7949965, + "learning_rate": 3.25652598344811e-06, + "loss": 0.81602848, + "num_input_tokens_seen": 109080035, + "step": 5077, + "time_per_iteration": 2.677797794342041 + }, + { + "auxiliary_loss_clip": 0.01039747, + "auxiliary_loss_mlp": 0.01033026, + "balance_loss_clip": 1.02872002, + "balance_loss_mlp": 1.01958561, + "epoch": 0.3053058770479483, + "flos": 16545231671040.0, + "grad_norm": 2.0390072459906587, + "language_loss": 0.75005412, + "learning_rate": 3.256222958034259e-06, + "loss": 0.77078187, + "num_input_tokens_seen": 109097385, + "step": 5078, + "time_per_iteration": 2.7139573097229004 + }, + { + "auxiliary_loss_clip": 0.01034309, + "auxiliary_loss_mlp": 0.0104364, + "balance_loss_clip": 1.02762556, + "balance_loss_mlp": 1.02861357, + "epoch": 0.3053660003006163, + "flos": 12312907954560.0, + "grad_norm": 1.7696385396050562, + "language_loss": 0.66748703, + "learning_rate": 3.255919884984307e-06, + "loss": 0.68826652, + "num_input_tokens_seen": 109115495, + "step": 5079, + "time_per_iteration": 2.7315127849578857 + }, + { + "auxiliary_loss_clip": 0.01080909, + "auxiliary_loss_mlp": 0.01034456, + "balance_loss_clip": 1.03407633, + "balance_loss_mlp": 1.02035403, + "epoch": 0.30542612355328425, + "flos": 23112287504640.0, + "grad_norm": 1.856781059878208, + "language_loss": 0.80077052, + "learning_rate": 3.2556167643097477e-06, + "loss": 0.82192421, + "num_input_tokens_seen": 109134235, + "step": 5080, + "time_per_iteration": 2.6090123653411865 + }, + { + "auxiliary_loss_clip": 0.01077337, + "auxiliary_loss_mlp": 0.00748658, + "balance_loss_clip": 1.03206992, + "balance_loss_mlp": 1.0016973, + "epoch": 0.3054862468059522, + "flos": 24389450461440.0, + "grad_norm": 3.0376206561894454, + "language_loss": 0.81137013, + "learning_rate": 3.255313596022074e-06, + "loss": 0.82963002, + "num_input_tokens_seen": 109152760, + "step": 5081, + "time_per_iteration": 2.632762908935547 + }, + { + "auxiliary_loss_clip": 0.01076289, + "auxiliary_loss_mlp": 0.01034895, + "balance_loss_clip": 1.03133607, + "balance_loss_mlp": 1.02032804, + "epoch": 0.3055463700586202, + "flos": 29386096704000.0, + "grad_norm": 1.6831999597365326, + "language_loss": 0.71904504, + "learning_rate": 3.255010380132783e-06, + "loss": 0.74015689, + "num_input_tokens_seen": 109173925, + "step": 5082, + "time_per_iteration": 2.665003538131714 + }, + { + "auxiliary_loss_clip": 0.01082006, + "auxiliary_loss_mlp": 0.01038983, + "balance_loss_clip": 1.03254843, + "balance_loss_mlp": 1.0229733, + "epoch": 0.30560649331128814, + "flos": 25591775431680.0, + "grad_norm": 1.9254314880895973, + "language_loss": 0.72990382, + "learning_rate": 3.2547071166533736e-06, + "loss": 0.75111377, + "num_input_tokens_seen": 109192510, + "step": 5083, + "time_per_iteration": 2.6084742546081543 + }, + { + "auxiliary_loss_clip": 0.01058384, + "auxiliary_loss_mlp": 0.00748743, + "balance_loss_clip": 1.02897143, + "balance_loss_mlp": 1.00180125, + "epoch": 0.3056666165639561, + "flos": 19128321400320.0, + "grad_norm": 1.970792343771881, + "language_loss": 0.71277475, + "learning_rate": 3.254403805595344e-06, + "loss": 0.73084605, + "num_input_tokens_seen": 109210885, + "step": 5084, + "time_per_iteration": 2.6144986152648926 + }, + { + "auxiliary_loss_clip": 0.01045027, + "auxiliary_loss_mlp": 0.0103657, + "balance_loss_clip": 1.02696466, + "balance_loss_mlp": 1.01958323, + "epoch": 0.30572673981662407, + "flos": 15523860441600.0, + "grad_norm": 1.899252256218733, + "language_loss": 0.78333175, + "learning_rate": 3.2541004469701962e-06, + "loss": 0.80414778, + "num_input_tokens_seen": 109229180, + "step": 5085, + "time_per_iteration": 4.248273849487305 + }, + { + "auxiliary_loss_clip": 0.01085798, + "auxiliary_loss_mlp": 0.0103617, + "balance_loss_clip": 1.0316422, + "balance_loss_mlp": 1.02193618, + "epoch": 0.30578686306929204, + "flos": 21506541909120.0, + "grad_norm": 1.6429650617474438, + "language_loss": 0.78144509, + "learning_rate": 3.2537970407894342e-06, + "loss": 0.80266476, + "num_input_tokens_seen": 109249510, + "step": 5086, + "time_per_iteration": 2.5717694759368896 + }, + { + "auxiliary_loss_clip": 0.01049791, + "auxiliary_loss_mlp": 0.01043559, + "balance_loss_clip": 1.02774048, + "balance_loss_mlp": 1.02713227, + "epoch": 0.30584698632196, + "flos": 20954271323520.0, + "grad_norm": 1.8010372737549933, + "language_loss": 0.76610005, + "learning_rate": 3.253493587064563e-06, + "loss": 0.78703356, + "num_input_tokens_seen": 109268200, + "step": 5087, + "time_per_iteration": 2.6928648948669434 + }, + { + "auxiliary_loss_clip": 0.01081176, + "auxiliary_loss_mlp": 0.01038817, + "balance_loss_clip": 1.03194332, + "balance_loss_mlp": 1.02324796, + "epoch": 0.30590710957462797, + "flos": 24681116897280.0, + "grad_norm": 1.7843054512724053, + "language_loss": 0.72272301, + "learning_rate": 3.2531900858070885e-06, + "loss": 0.74392289, + "num_input_tokens_seen": 109288370, + "step": 5088, + "time_per_iteration": 4.3503806591033936 + }, + { + "auxiliary_loss_clip": 0.01076085, + "auxiliary_loss_mlp": 0.01036868, + "balance_loss_clip": 1.03056991, + "balance_loss_mlp": 1.02129912, + "epoch": 0.30596723282729593, + "flos": 17086907744640.0, + "grad_norm": 2.2500339889631733, + "language_loss": 0.79009879, + "learning_rate": 3.252886537028521e-06, + "loss": 0.81122828, + "num_input_tokens_seen": 109306730, + "step": 5089, + "time_per_iteration": 2.678375005722046 + }, + { + "auxiliary_loss_clip": 0.01069429, + "auxiliary_loss_mlp": 0.01044373, + "balance_loss_clip": 1.03268969, + "balance_loss_mlp": 1.0288763, + "epoch": 0.30602735607996395, + "flos": 22857106308480.0, + "grad_norm": 1.714996042195951, + "language_loss": 0.77163577, + "learning_rate": 3.2525829407403703e-06, + "loss": 0.79277384, + "num_input_tokens_seen": 109327360, + "step": 5090, + "time_per_iteration": 2.67586350440979 + }, + { + "auxiliary_loss_clip": 0.01070071, + "auxiliary_loss_mlp": 0.01049556, + "balance_loss_clip": 1.03178716, + "balance_loss_mlp": 1.03352189, + "epoch": 0.3060874793326319, + "flos": 29861482227840.0, + "grad_norm": 1.6913629275002344, + "language_loss": 0.76194692, + "learning_rate": 3.2522792969541488e-06, + "loss": 0.78314316, + "num_input_tokens_seen": 109348135, + "step": 5091, + "time_per_iteration": 2.688086986541748 + }, + { + "auxiliary_loss_clip": 0.01027063, + "auxiliary_loss_mlp": 0.01043846, + "balance_loss_clip": 1.03502226, + "balance_loss_mlp": 1.02628648, + "epoch": 0.3061476025852999, + "flos": 20448577699200.0, + "grad_norm": 2.0208264443313495, + "language_loss": 0.71647668, + "learning_rate": 3.2519756056813705e-06, + "loss": 0.73718572, + "num_input_tokens_seen": 109366220, + "step": 5092, + "time_per_iteration": 2.8207545280456543 + }, + { + "auxiliary_loss_clip": 0.01069384, + "auxiliary_loss_mlp": 0.01037036, + "balance_loss_clip": 1.03270841, + "balance_loss_mlp": 1.02253461, + "epoch": 0.30620772583796785, + "flos": 19391475415680.0, + "grad_norm": 2.366458102300884, + "language_loss": 0.82277161, + "learning_rate": 3.2516718669335522e-06, + "loss": 0.84383583, + "num_input_tokens_seen": 109385260, + "step": 5093, + "time_per_iteration": 2.751612663269043 + }, + { + "auxiliary_loss_clip": 0.01089752, + "auxiliary_loss_mlp": 0.00748867, + "balance_loss_clip": 1.03311324, + "balance_loss_mlp": 1.00176334, + "epoch": 0.3062678490906358, + "flos": 24024562151040.0, + "grad_norm": 2.3410896789736637, + "language_loss": 0.74907935, + "learning_rate": 3.2513680807222114e-06, + "loss": 0.76746553, + "num_input_tokens_seen": 109405025, + "step": 5094, + "time_per_iteration": 2.7335739135742188 + }, + { + "auxiliary_loss_clip": 0.01066056, + "auxiliary_loss_mlp": 0.01037936, + "balance_loss_clip": 1.03096557, + "balance_loss_mlp": 1.02355361, + "epoch": 0.3063279723433038, + "flos": 19754639873280.0, + "grad_norm": 1.8007993525353352, + "language_loss": 0.76129889, + "learning_rate": 3.251064247058868e-06, + "loss": 0.78233886, + "num_input_tokens_seen": 109422465, + "step": 5095, + "time_per_iteration": 2.6432547569274902 + }, + { + "auxiliary_loss_clip": 0.01076164, + "auxiliary_loss_mlp": 0.01037282, + "balance_loss_clip": 1.03192973, + "balance_loss_mlp": 1.02293515, + "epoch": 0.30638809559597174, + "flos": 22450022496000.0, + "grad_norm": 1.6745115345911972, + "language_loss": 0.80915797, + "learning_rate": 3.250760365955042e-06, + "loss": 0.8302924, + "num_input_tokens_seen": 109440575, + "step": 5096, + "time_per_iteration": 4.208844184875488 + }, + { + "auxiliary_loss_clip": 0.01077509, + "auxiliary_loss_mlp": 0.01035401, + "balance_loss_clip": 1.03059113, + "balance_loss_mlp": 1.02029121, + "epoch": 0.3064482188486397, + "flos": 17165157523200.0, + "grad_norm": 2.2563832276481817, + "language_loss": 0.81739664, + "learning_rate": 3.250456437422258e-06, + "loss": 0.83852565, + "num_input_tokens_seen": 109459050, + "step": 5097, + "time_per_iteration": 2.670351982116699 + }, + { + "auxiliary_loss_clip": 0.01090081, + "auxiliary_loss_mlp": 0.0104106, + "balance_loss_clip": 1.03266168, + "balance_loss_mlp": 1.02478862, + "epoch": 0.3065083421013077, + "flos": 23768483114880.0, + "grad_norm": 1.9850716867561278, + "language_loss": 0.77955264, + "learning_rate": 3.250152461472041e-06, + "loss": 0.8008641, + "num_input_tokens_seen": 109475860, + "step": 5098, + "time_per_iteration": 4.197249174118042 + }, + { + "auxiliary_loss_clip": 0.01047536, + "auxiliary_loss_mlp": 0.01041898, + "balance_loss_clip": 1.03331685, + "balance_loss_mlp": 1.02683616, + "epoch": 0.30656846535397564, + "flos": 26431833784320.0, + "grad_norm": 2.002995155641832, + "language_loss": 0.83452272, + "learning_rate": 3.249848438115917e-06, + "loss": 0.85541707, + "num_input_tokens_seen": 109494760, + "step": 5099, + "time_per_iteration": 2.8516852855682373 + }, + { + "auxiliary_loss_clip": 0.0108956, + "auxiliary_loss_mlp": 0.01044327, + "balance_loss_clip": 1.03104067, + "balance_loss_mlp": 1.02822804, + "epoch": 0.3066285886066436, + "flos": 26651786716800.0, + "grad_norm": 1.6654202132711524, + "language_loss": 0.85575771, + "learning_rate": 3.2495443673654148e-06, + "loss": 0.87709653, + "num_input_tokens_seen": 109516480, + "step": 5100, + "time_per_iteration": 2.6030349731445312 + }, + { + "auxiliary_loss_clip": 0.01051151, + "auxiliary_loss_mlp": 0.01037561, + "balance_loss_clip": 1.02821445, + "balance_loss_mlp": 1.0213958, + "epoch": 0.30668871185931157, + "flos": 15049947375360.0, + "grad_norm": 1.7757362223814006, + "language_loss": 0.79233265, + "learning_rate": 3.249240249232065e-06, + "loss": 0.81321979, + "num_input_tokens_seen": 109534615, + "step": 5101, + "time_per_iteration": 2.602760076522827 + }, + { + "auxiliary_loss_clip": 0.01053399, + "auxiliary_loss_mlp": 0.01042904, + "balance_loss_clip": 1.03141475, + "balance_loss_mlp": 1.02572596, + "epoch": 0.30674883511197953, + "flos": 20082109190400.0, + "grad_norm": 1.7805620630551313, + "language_loss": 0.80120766, + "learning_rate": 3.2489360837273998e-06, + "loss": 0.82217073, + "num_input_tokens_seen": 109554040, + "step": 5102, + "time_per_iteration": 2.676713466644287 + }, + { + "auxiliary_loss_clip": 0.01093964, + "auxiliary_loss_mlp": 0.01041102, + "balance_loss_clip": 1.03570926, + "balance_loss_mlp": 1.02350664, + "epoch": 0.30680895836464755, + "flos": 22893807029760.0, + "grad_norm": 1.8120757551826627, + "language_loss": 0.89012516, + "learning_rate": 3.2486318708629532e-06, + "loss": 0.91147578, + "num_input_tokens_seen": 109574345, + "step": 5103, + "time_per_iteration": 2.5925540924072266 + }, + { + "auxiliary_loss_clip": 0.01066549, + "auxiliary_loss_mlp": 0.01048597, + "balance_loss_clip": 1.02906966, + "balance_loss_mlp": 1.03299832, + "epoch": 0.3068690816173155, + "flos": 23696159080320.0, + "grad_norm": 2.38285437404797, + "language_loss": 0.73846966, + "learning_rate": 3.2483276106502607e-06, + "loss": 0.75962114, + "num_input_tokens_seen": 109593670, + "step": 5104, + "time_per_iteration": 2.6569557189941406 + }, + { + "auxiliary_loss_clip": 0.01073093, + "auxiliary_loss_mlp": 0.00748827, + "balance_loss_clip": 1.02956605, + "balance_loss_mlp": 1.00168943, + "epoch": 0.3069292048699835, + "flos": 23551044134400.0, + "grad_norm": 4.012725557937885, + "language_loss": 0.72577894, + "learning_rate": 3.2480233031008605e-06, + "loss": 0.74399817, + "num_input_tokens_seen": 109613385, + "step": 5105, + "time_per_iteration": 2.596071243286133 + }, + { + "auxiliary_loss_clip": 0.01070839, + "auxiliary_loss_mlp": 0.01041345, + "balance_loss_clip": 1.03289795, + "balance_loss_mlp": 1.02525806, + "epoch": 0.30698932812265145, + "flos": 24531656405760.0, + "grad_norm": 1.8189236522928225, + "language_loss": 0.87430084, + "learning_rate": 3.2477189482262916e-06, + "loss": 0.89542264, + "num_input_tokens_seen": 109632395, + "step": 5106, + "time_per_iteration": 2.6471312046051025 + }, + { + "auxiliary_loss_clip": 0.01056671, + "auxiliary_loss_mlp": 0.01047631, + "balance_loss_clip": 1.02933514, + "balance_loss_mlp": 1.03089404, + "epoch": 0.3070494513753194, + "flos": 20996430912000.0, + "grad_norm": 2.1255451939574863, + "language_loss": 0.71065164, + "learning_rate": 3.2474145460380945e-06, + "loss": 0.7316947, + "num_input_tokens_seen": 109651380, + "step": 5107, + "time_per_iteration": 2.6317384243011475 + }, + { + "auxiliary_loss_clip": 0.01055606, + "auxiliary_loss_mlp": 0.01045665, + "balance_loss_clip": 1.02977514, + "balance_loss_mlp": 1.0298878, + "epoch": 0.3071095746279874, + "flos": 19025940660480.0, + "grad_norm": 2.3033917467304232, + "language_loss": 0.7219243, + "learning_rate": 3.247110096547814e-06, + "loss": 0.74293697, + "num_input_tokens_seen": 109670240, + "step": 5108, + "time_per_iteration": 2.6418039798736572 + }, + { + "auxiliary_loss_clip": 0.01059422, + "auxiliary_loss_mlp": 0.01036059, + "balance_loss_clip": 1.02956522, + "balance_loss_mlp": 1.02093697, + "epoch": 0.30716969788065535, + "flos": 21215521918080.0, + "grad_norm": 1.4181551109672548, + "language_loss": 0.8543359, + "learning_rate": 3.2468055997669926e-06, + "loss": 0.87529069, + "num_input_tokens_seen": 109690810, + "step": 5109, + "time_per_iteration": 2.640207529067993 + }, + { + "auxiliary_loss_clip": 0.01058958, + "auxiliary_loss_mlp": 0.0103531, + "balance_loss_clip": 1.02880609, + "balance_loss_mlp": 1.02077246, + "epoch": 0.3072298211333233, + "flos": 25772765086080.0, + "grad_norm": 1.5747771396328247, + "language_loss": 0.67169863, + "learning_rate": 3.2465010557071788e-06, + "loss": 0.69264126, + "num_input_tokens_seen": 109711145, + "step": 5110, + "time_per_iteration": 2.704080104827881 + }, + { + "auxiliary_loss_clip": 0.01075905, + "auxiliary_loss_mlp": 0.01031094, + "balance_loss_clip": 1.03159189, + "balance_loss_mlp": 1.01757622, + "epoch": 0.3072899443859913, + "flos": 25848931875840.0, + "grad_norm": 1.6353016900155926, + "language_loss": 0.7675246, + "learning_rate": 3.246196464379919e-06, + "loss": 0.7885946, + "num_input_tokens_seen": 109731425, + "step": 5111, + "time_per_iteration": 2.6938741207122803 + }, + { + "auxiliary_loss_clip": 0.01087874, + "auxiliary_loss_mlp": 0.01035101, + "balance_loss_clip": 1.03208041, + "balance_loss_mlp": 1.02055717, + "epoch": 0.30735006763865924, + "flos": 25922800195200.0, + "grad_norm": 1.8434406660670262, + "language_loss": 0.67649901, + "learning_rate": 3.245891825796765e-06, + "loss": 0.69772875, + "num_input_tokens_seen": 109752720, + "step": 5112, + "time_per_iteration": 2.6783573627471924 + }, + { + "auxiliary_loss_clip": 0.01081772, + "auxiliary_loss_mlp": 0.01039679, + "balance_loss_clip": 1.03178644, + "balance_loss_mlp": 1.02272749, + "epoch": 0.3074101908913272, + "flos": 30917004312960.0, + "grad_norm": 1.9970308351577168, + "language_loss": 0.79373986, + "learning_rate": 3.2455871399692678e-06, + "loss": 0.81495428, + "num_input_tokens_seen": 109772840, + "step": 5113, + "time_per_iteration": 2.711441993713379 + }, + { + "auxiliary_loss_clip": 0.01054314, + "auxiliary_loss_mlp": 0.00748676, + "balance_loss_clip": 1.02940118, + "balance_loss_mlp": 1.00165379, + "epoch": 0.30747031414399517, + "flos": 18401058731520.0, + "grad_norm": 1.9219147045475875, + "language_loss": 0.77401978, + "learning_rate": 3.2452824069089815e-06, + "loss": 0.79204971, + "num_input_tokens_seen": 109790150, + "step": 5114, + "time_per_iteration": 2.840886116027832 + }, + { + "auxiliary_loss_clip": 0.01059487, + "auxiliary_loss_mlp": 0.01037863, + "balance_loss_clip": 1.03378117, + "balance_loss_mlp": 1.02030337, + "epoch": 0.30753043739666314, + "flos": 22633166966400.0, + "grad_norm": 1.9184615467165178, + "language_loss": 0.62553722, + "learning_rate": 3.2449776266274623e-06, + "loss": 0.64651072, + "num_input_tokens_seen": 109807985, + "step": 5115, + "time_per_iteration": 2.9524033069610596 + }, + { + "auxiliary_loss_clip": 0.0108267, + "auxiliary_loss_mlp": 0.01038243, + "balance_loss_clip": 1.03283679, + "balance_loss_mlp": 1.0234611, + "epoch": 0.3075905606493311, + "flos": 27344072517120.0, + "grad_norm": 2.300241319595623, + "language_loss": 0.82805914, + "learning_rate": 3.2446727991362657e-06, + "loss": 0.84926832, + "num_input_tokens_seen": 109825920, + "step": 5116, + "time_per_iteration": 2.7386796474456787 + }, + { + "auxiliary_loss_clip": 0.0106693, + "auxiliary_loss_mlp": 0.01048819, + "balance_loss_clip": 1.03211617, + "balance_loss_mlp": 1.03344107, + "epoch": 0.3076506839019991, + "flos": 22090808534400.0, + "grad_norm": 3.237778714159977, + "language_loss": 0.75768036, + "learning_rate": 3.244367924446952e-06, + "loss": 0.77883792, + "num_input_tokens_seen": 109846220, + "step": 5117, + "time_per_iteration": 2.8163058757781982 + }, + { + "auxiliary_loss_clip": 0.01050754, + "auxiliary_loss_mlp": 0.01042543, + "balance_loss_clip": 1.03057206, + "balance_loss_mlp": 1.02553141, + "epoch": 0.3077108071546671, + "flos": 21289533891840.0, + "grad_norm": 3.0638256609892416, + "language_loss": 0.71425658, + "learning_rate": 3.2440630025710826e-06, + "loss": 0.7351895, + "num_input_tokens_seen": 109863870, + "step": 5118, + "time_per_iteration": 2.7582764625549316 + }, + { + "auxiliary_loss_clip": 0.01058303, + "auxiliary_loss_mlp": 0.010391, + "balance_loss_clip": 1.03887141, + "balance_loss_mlp": 1.02452683, + "epoch": 0.30777093040733505, + "flos": 21430985650560.0, + "grad_norm": 1.740299991851238, + "language_loss": 0.74300086, + "learning_rate": 3.243758033520219e-06, + "loss": 0.76397491, + "num_input_tokens_seen": 109883500, + "step": 5119, + "time_per_iteration": 2.8253188133239746 + }, + { + "auxiliary_loss_clip": 0.01080525, + "auxiliary_loss_mlp": 0.01052423, + "balance_loss_clip": 1.03180385, + "balance_loss_mlp": 1.03585923, + "epoch": 0.307831053660003, + "flos": 23149275534720.0, + "grad_norm": 1.8945531570369796, + "language_loss": 0.80134124, + "learning_rate": 3.243453017305926e-06, + "loss": 0.82267076, + "num_input_tokens_seen": 109904620, + "step": 5120, + "time_per_iteration": 2.740234136581421 + }, + { + "auxiliary_loss_clip": 0.01075815, + "auxiliary_loss_mlp": 0.01044423, + "balance_loss_clip": 1.02932656, + "balance_loss_mlp": 1.02980804, + "epoch": 0.307891176912671, + "flos": 17019755268480.0, + "grad_norm": 1.5545599359072335, + "language_loss": 0.79763246, + "learning_rate": 3.24314795393977e-06, + "loss": 0.81883478, + "num_input_tokens_seen": 109922275, + "step": 5121, + "time_per_iteration": 2.5878918170928955 + }, + { + "auxiliary_loss_clip": 0.01055889, + "auxiliary_loss_mlp": 0.01037931, + "balance_loss_clip": 1.03025413, + "balance_loss_mlp": 1.02316093, + "epoch": 0.30795130016533895, + "flos": 27705046245120.0, + "grad_norm": 1.720975192996036, + "language_loss": 0.82635939, + "learning_rate": 3.242842843433319e-06, + "loss": 0.84729755, + "num_input_tokens_seen": 109944265, + "step": 5122, + "time_per_iteration": 2.8417391777038574 + }, + { + "auxiliary_loss_clip": 0.01013304, + "auxiliary_loss_mlp": 0.010099, + "balance_loss_clip": 1.00849855, + "balance_loss_mlp": 1.00769448, + "epoch": 0.3080114234180069, + "flos": 69058699591680.0, + "grad_norm": 0.7539398413915881, + "language_loss": 0.58629042, + "learning_rate": 3.242537685798143e-06, + "loss": 0.6065225, + "num_input_tokens_seen": 110014160, + "step": 5123, + "time_per_iteration": 3.3729476928710938 + }, + { + "auxiliary_loss_clip": 0.01080811, + "auxiliary_loss_mlp": 0.0074868, + "balance_loss_clip": 1.0306524, + "balance_loss_mlp": 1.0016557, + "epoch": 0.3080715466706749, + "flos": 24060221377920.0, + "grad_norm": 1.5685411502617055, + "language_loss": 0.83284336, + "learning_rate": 3.242232481045813e-06, + "loss": 0.85113823, + "num_input_tokens_seen": 110034865, + "step": 5124, + "time_per_iteration": 2.75791335105896 + }, + { + "auxiliary_loss_clip": 0.01094986, + "auxiliary_loss_mlp": 0.01042933, + "balance_loss_clip": 1.0355134, + "balance_loss_mlp": 1.02744782, + "epoch": 0.30813166992334284, + "flos": 25848680480640.0, + "grad_norm": 1.8991667450959273, + "language_loss": 0.79255903, + "learning_rate": 3.2419272291879035e-06, + "loss": 0.81393826, + "num_input_tokens_seen": 110052930, + "step": 5125, + "time_per_iteration": 2.6058969497680664 + }, + { + "auxiliary_loss_clip": 0.01070736, + "auxiliary_loss_mlp": 0.01038693, + "balance_loss_clip": 1.02835119, + "balance_loss_mlp": 1.02151513, + "epoch": 0.3081917931760108, + "flos": 20449619193600.0, + "grad_norm": 2.0521726783950878, + "language_loss": 0.64216244, + "learning_rate": 3.241621930235989e-06, + "loss": 0.6632567, + "num_input_tokens_seen": 110071765, + "step": 5126, + "time_per_iteration": 2.688950300216675 + }, + { + "auxiliary_loss_clip": 0.01051451, + "auxiliary_loss_mlp": 0.01040438, + "balance_loss_clip": 1.03522682, + "balance_loss_mlp": 1.02569199, + "epoch": 0.3082519164286788, + "flos": 22166257052160.0, + "grad_norm": 1.6990336585699934, + "language_loss": 0.86647999, + "learning_rate": 3.241316584201646e-06, + "loss": 0.88739884, + "num_input_tokens_seen": 110092660, + "step": 5127, + "time_per_iteration": 2.8068225383758545 + }, + { + "auxiliary_loss_clip": 0.01041452, + "auxiliary_loss_mlp": 0.01037377, + "balance_loss_clip": 1.02842486, + "balance_loss_mlp": 1.02181971, + "epoch": 0.30831203968134674, + "flos": 28913404700160.0, + "grad_norm": 1.5797522181989538, + "language_loss": 0.68909693, + "learning_rate": 3.2410111910964538e-06, + "loss": 0.70988524, + "num_input_tokens_seen": 110114960, + "step": 5128, + "time_per_iteration": 2.8625354766845703 + }, + { + "auxiliary_loss_clip": 0.01081036, + "auxiliary_loss_mlp": 0.00748665, + "balance_loss_clip": 1.03261018, + "balance_loss_mlp": 1.00165737, + "epoch": 0.3083721629340147, + "flos": 25667726739840.0, + "grad_norm": 2.0519793473710406, + "language_loss": 0.70927429, + "learning_rate": 3.240705750931993e-06, + "loss": 0.72757125, + "num_input_tokens_seen": 110135750, + "step": 5129, + "time_per_iteration": 2.647402048110962 + }, + { + "auxiliary_loss_clip": 0.00990663, + "auxiliary_loss_mlp": 0.01003163, + "balance_loss_clip": 1.00521183, + "balance_loss_mlp": 1.00112462, + "epoch": 0.3084322861866827, + "flos": 68212679581440.0, + "grad_norm": 0.8330532016783395, + "language_loss": 0.59183538, + "learning_rate": 3.240400263719846e-06, + "loss": 0.61177361, + "num_input_tokens_seen": 110189480, + "step": 5130, + "time_per_iteration": 3.199782609939575 + }, + { + "auxiliary_loss_clip": 0.01066488, + "auxiliary_loss_mlp": 0.01039844, + "balance_loss_clip": 1.02968884, + "balance_loss_mlp": 1.02458572, + "epoch": 0.3084924094393507, + "flos": 20296495514880.0, + "grad_norm": 2.2469758760663474, + "language_loss": 0.72723603, + "learning_rate": 3.2400947294715957e-06, + "loss": 0.74829936, + "num_input_tokens_seen": 110206445, + "step": 5131, + "time_per_iteration": 2.6756105422973633 + }, + { + "auxiliary_loss_clip": 0.01050852, + "auxiliary_loss_mlp": 0.01036157, + "balance_loss_clip": 1.02910972, + "balance_loss_mlp": 1.02225125, + "epoch": 0.30855253269201866, + "flos": 23949831905280.0, + "grad_norm": 1.4583932607175627, + "language_loss": 0.70940924, + "learning_rate": 3.2397891481988303e-06, + "loss": 0.73027933, + "num_input_tokens_seen": 110226845, + "step": 5132, + "time_per_iteration": 4.250192880630493 + }, + { + "auxiliary_loss_clip": 0.01086293, + "auxiliary_loss_mlp": 0.00748613, + "balance_loss_clip": 1.03350735, + "balance_loss_mlp": 1.00160241, + "epoch": 0.3086126559446866, + "flos": 19281876042240.0, + "grad_norm": 1.7380993798741644, + "language_loss": 0.90160787, + "learning_rate": 3.239483519913136e-06, + "loss": 0.91995698, + "num_input_tokens_seen": 110244095, + "step": 5133, + "time_per_iteration": 2.620492696762085 + }, + { + "auxiliary_loss_clip": 0.01070479, + "auxiliary_loss_mlp": 0.01045775, + "balance_loss_clip": 1.03076792, + "balance_loss_mlp": 1.03071868, + "epoch": 0.3086727791973546, + "flos": 33760770019200.0, + "grad_norm": 2.1134592070580576, + "language_loss": 0.67380196, + "learning_rate": 3.239177844626102e-06, + "loss": 0.69496453, + "num_input_tokens_seen": 110264240, + "step": 5134, + "time_per_iteration": 2.8242709636688232 + }, + { + "auxiliary_loss_clip": 0.0108187, + "auxiliary_loss_mlp": 0.01050561, + "balance_loss_clip": 1.03405356, + "balance_loss_mlp": 1.03457475, + "epoch": 0.30873290245002255, + "flos": 16034151006720.0, + "grad_norm": 2.7453681413716553, + "language_loss": 0.8263706, + "learning_rate": 3.2388721223493197e-06, + "loss": 0.84769487, + "num_input_tokens_seen": 110282450, + "step": 5135, + "time_per_iteration": 4.348390579223633 + }, + { + "auxiliary_loss_clip": 0.00993019, + "auxiliary_loss_mlp": 0.01004545, + "balance_loss_clip": 1.00792646, + "balance_loss_mlp": 1.00249481, + "epoch": 0.3087930257026905, + "flos": 65048304055680.0, + "grad_norm": 0.6959954527116143, + "language_loss": 0.5526005, + "learning_rate": 3.2385663530943824e-06, + "loss": 0.57257617, + "num_input_tokens_seen": 110343715, + "step": 5136, + "time_per_iteration": 3.3447036743164062 + }, + { + "auxiliary_loss_clip": 0.01069425, + "auxiliary_loss_mlp": 0.00748673, + "balance_loss_clip": 1.03251791, + "balance_loss_mlp": 1.00165451, + "epoch": 0.3088531489553585, + "flos": 74738829824640.0, + "grad_norm": 2.00631221488005, + "language_loss": 0.75921082, + "learning_rate": 3.2382605368728852e-06, + "loss": 0.77739179, + "num_input_tokens_seen": 110368430, + "step": 5137, + "time_per_iteration": 3.1126670837402344 + }, + { + "auxiliary_loss_clip": 0.01055196, + "auxiliary_loss_mlp": 0.01032771, + "balance_loss_clip": 1.02972126, + "balance_loss_mlp": 1.01971161, + "epoch": 0.30891327220802645, + "flos": 21142300043520.0, + "grad_norm": 1.802354087778123, + "language_loss": 0.80033404, + "learning_rate": 3.237954673696424e-06, + "loss": 0.82121372, + "num_input_tokens_seen": 110386735, + "step": 5138, + "time_per_iteration": 2.7181804180145264 + }, + { + "auxiliary_loss_clip": 0.01028862, + "auxiliary_loss_mlp": 0.01041825, + "balance_loss_clip": 1.02739906, + "balance_loss_mlp": 1.02477801, + "epoch": 0.3089733954606944, + "flos": 25664494515840.0, + "grad_norm": 1.447716193407214, + "language_loss": 0.81297708, + "learning_rate": 3.2376487635765983e-06, + "loss": 0.83368397, + "num_input_tokens_seen": 110406820, + "step": 5139, + "time_per_iteration": 2.7988173961639404 + }, + { + "auxiliary_loss_clip": 0.01082285, + "auxiliary_loss_mlp": 0.01036344, + "balance_loss_clip": 1.03225172, + "balance_loss_mlp": 1.02026892, + "epoch": 0.3090335187133624, + "flos": 19427350124160.0, + "grad_norm": 2.364438680970901, + "language_loss": 0.77264965, + "learning_rate": 3.2373428065250067e-06, + "loss": 0.79383594, + "num_input_tokens_seen": 110424225, + "step": 5140, + "time_per_iteration": 2.69576358795166 + }, + { + "auxiliary_loss_clip": 0.01051859, + "auxiliary_loss_mlp": 0.01043283, + "balance_loss_clip": 1.02909887, + "balance_loss_mlp": 1.0295856, + "epoch": 0.30909364196603034, + "flos": 20011329440640.0, + "grad_norm": 1.6715372143537912, + "language_loss": 0.78584802, + "learning_rate": 3.237036802553252e-06, + "loss": 0.80679941, + "num_input_tokens_seen": 110443310, + "step": 5141, + "time_per_iteration": 2.6754074096679688 + }, + { + "auxiliary_loss_clip": 0.01068416, + "auxiliary_loss_mlp": 0.01039314, + "balance_loss_clip": 1.03138995, + "balance_loss_mlp": 1.02409136, + "epoch": 0.3091537652186983, + "flos": 19677575243520.0, + "grad_norm": 2.000872113188923, + "language_loss": 0.87227082, + "learning_rate": 3.2367307516729377e-06, + "loss": 0.8933481, + "num_input_tokens_seen": 110460215, + "step": 5142, + "time_per_iteration": 2.643965244293213 + }, + { + "auxiliary_loss_clip": 0.01077632, + "auxiliary_loss_mlp": 0.0103918, + "balance_loss_clip": 1.03039265, + "balance_loss_mlp": 1.02477932, + "epoch": 0.3092138884713663, + "flos": 17020042577280.0, + "grad_norm": 15.4410238295801, + "language_loss": 0.79186219, + "learning_rate": 3.23642465389567e-06, + "loss": 0.8130303, + "num_input_tokens_seen": 110479385, + "step": 5143, + "time_per_iteration": 4.313833236694336 + }, + { + "auxiliary_loss_clip": 0.01058995, + "auxiliary_loss_mlp": 0.01035942, + "balance_loss_clip": 1.03088927, + "balance_loss_mlp": 1.02118409, + "epoch": 0.3092740117240343, + "flos": 25009986844800.0, + "grad_norm": 1.837465586720204, + "language_loss": 0.72257733, + "learning_rate": 3.236118509233055e-06, + "loss": 0.7435267, + "num_input_tokens_seen": 110499885, + "step": 5144, + "time_per_iteration": 2.727257251739502 + }, + { + "auxiliary_loss_clip": 0.01076353, + "auxiliary_loss_mlp": 0.01038714, + "balance_loss_clip": 1.02798343, + "balance_loss_mlp": 1.02349091, + "epoch": 0.30933413497670226, + "flos": 25590410714880.0, + "grad_norm": 2.835397623933764, + "language_loss": 0.74100924, + "learning_rate": 3.235812317696702e-06, + "loss": 0.76215994, + "num_input_tokens_seen": 110519690, + "step": 5145, + "time_per_iteration": 4.336535692214966 + }, + { + "auxiliary_loss_clip": 0.01055747, + "auxiliary_loss_mlp": 0.01041488, + "balance_loss_clip": 1.02810526, + "balance_loss_mlp": 1.0262053, + "epoch": 0.3093942582293702, + "flos": 24389665943040.0, + "grad_norm": 1.5508633497497835, + "language_loss": 0.75895822, + "learning_rate": 3.2355060792982224e-06, + "loss": 0.77993059, + "num_input_tokens_seen": 110540520, + "step": 5146, + "time_per_iteration": 2.793313503265381 + }, + { + "auxiliary_loss_clip": 0.01058243, + "auxiliary_loss_mlp": 0.01035437, + "balance_loss_clip": 1.02798426, + "balance_loss_mlp": 1.02165008, + "epoch": 0.3094543814820382, + "flos": 19646441130240.0, + "grad_norm": 1.6357103527227714, + "language_loss": 0.66745031, + "learning_rate": 3.2351997940492286e-06, + "loss": 0.68838704, + "num_input_tokens_seen": 110557950, + "step": 5147, + "time_per_iteration": 2.7115557193756104 + }, + { + "auxiliary_loss_clip": 0.01081097, + "auxiliary_loss_mlp": 0.01036819, + "balance_loss_clip": 1.03323984, + "balance_loss_mlp": 1.02261484, + "epoch": 0.30951450473470615, + "flos": 25663812157440.0, + "grad_norm": 6.186423592033332, + "language_loss": 0.75233287, + "learning_rate": 3.2348934619613346e-06, + "loss": 0.77351201, + "num_input_tokens_seen": 110578215, + "step": 5148, + "time_per_iteration": 2.627990961074829 + }, + { + "auxiliary_loss_clip": 0.01084748, + "auxiliary_loss_mlp": 0.01046738, + "balance_loss_clip": 1.03358054, + "balance_loss_mlp": 1.03106165, + "epoch": 0.3095746279873741, + "flos": 12020415505920.0, + "grad_norm": 2.4490688177600024, + "language_loss": 0.72832119, + "learning_rate": 3.2345870830461567e-06, + "loss": 0.74963605, + "num_input_tokens_seen": 110592990, + "step": 5149, + "time_per_iteration": 2.540264129638672 + }, + { + "auxiliary_loss_clip": 0.01046921, + "auxiliary_loss_mlp": 0.01043527, + "balance_loss_clip": 1.02865553, + "balance_loss_mlp": 1.02731478, + "epoch": 0.3096347512400421, + "flos": 23623044946560.0, + "grad_norm": 1.7846111085339824, + "language_loss": 0.84990942, + "learning_rate": 3.2342806573153132e-06, + "loss": 0.87081391, + "num_input_tokens_seen": 110612130, + "step": 5150, + "time_per_iteration": 2.782589912414551 + }, + { + "auxiliary_loss_clip": 0.01031982, + "auxiliary_loss_mlp": 0.01043268, + "balance_loss_clip": 1.02646756, + "balance_loss_mlp": 1.02790785, + "epoch": 0.30969487449271005, + "flos": 22529313768960.0, + "grad_norm": 1.6788660784380827, + "language_loss": 0.78711087, + "learning_rate": 3.233974184780424e-06, + "loss": 0.80786335, + "num_input_tokens_seen": 110632045, + "step": 5151, + "time_per_iteration": 2.727437973022461 + }, + { + "auxiliary_loss_clip": 0.01079682, + "auxiliary_loss_mlp": 0.01037638, + "balance_loss_clip": 1.03210735, + "balance_loss_mlp": 1.02200949, + "epoch": 0.309754997745378, + "flos": 15267925059840.0, + "grad_norm": 1.9309078795034864, + "language_loss": 0.67306614, + "learning_rate": 3.2336676654531084e-06, + "loss": 0.69423938, + "num_input_tokens_seen": 110649340, + "step": 5152, + "time_per_iteration": 2.6510584354400635 + }, + { + "auxiliary_loss_clip": 0.01034851, + "auxiliary_loss_mlp": 0.01041348, + "balance_loss_clip": 1.02876985, + "balance_loss_mlp": 1.02651882, + "epoch": 0.309815120998046, + "flos": 26979291947520.0, + "grad_norm": 1.8343366925694533, + "language_loss": 0.82443988, + "learning_rate": 3.2333610993449926e-06, + "loss": 0.84520197, + "num_input_tokens_seen": 110668450, + "step": 5153, + "time_per_iteration": 2.8353850841522217 + }, + { + "auxiliary_loss_clip": 0.01067046, + "auxiliary_loss_mlp": 0.00748604, + "balance_loss_clip": 1.03104258, + "balance_loss_mlp": 1.00167882, + "epoch": 0.30987524425071394, + "flos": 21143161969920.0, + "grad_norm": 2.1657381056634843, + "language_loss": 0.73920375, + "learning_rate": 3.2330544864676997e-06, + "loss": 0.75736022, + "num_input_tokens_seen": 110689410, + "step": 5154, + "time_per_iteration": 2.7695844173431396 + }, + { + "auxiliary_loss_clip": 0.01078577, + "auxiliary_loss_mlp": 0.01036347, + "balance_loss_clip": 1.03157735, + "balance_loss_mlp": 1.02190506, + "epoch": 0.3099353675033819, + "flos": 15268284195840.0, + "grad_norm": 2.062818934122229, + "language_loss": 0.76431262, + "learning_rate": 3.232747826832858e-06, + "loss": 0.78546184, + "num_input_tokens_seen": 110707350, + "step": 5155, + "time_per_iteration": 2.635324001312256 + }, + { + "auxiliary_loss_clip": 0.01078042, + "auxiliary_loss_mlp": 0.0103924, + "balance_loss_clip": 1.03601623, + "balance_loss_mlp": 1.02338493, + "epoch": 0.30999549075604993, + "flos": 15413794191360.0, + "grad_norm": 1.8922909297709385, + "language_loss": 0.78880274, + "learning_rate": 3.232441120452094e-06, + "loss": 0.80997562, + "num_input_tokens_seen": 110724910, + "step": 5156, + "time_per_iteration": 2.6927151679992676 + }, + { + "auxiliary_loss_clip": 0.01084545, + "auxiliary_loss_mlp": 0.01049263, + "balance_loss_clip": 1.03579473, + "balance_loss_mlp": 1.0320251, + "epoch": 0.3100556140087179, + "flos": 23184539712000.0, + "grad_norm": 2.0759336299328663, + "language_loss": 0.75058448, + "learning_rate": 3.23213436733704e-06, + "loss": 0.77192259, + "num_input_tokens_seen": 110744010, + "step": 5157, + "time_per_iteration": 2.6072826385498047 + }, + { + "auxiliary_loss_clip": 0.01047557, + "auxiliary_loss_mlp": 0.01033997, + "balance_loss_clip": 1.02750957, + "balance_loss_mlp": 1.02013314, + "epoch": 0.31011573726138586, + "flos": 25742169676800.0, + "grad_norm": 1.5990172212491898, + "language_loss": 0.69213867, + "learning_rate": 3.231827567499327e-06, + "loss": 0.71295428, + "num_input_tokens_seen": 110765835, + "step": 5158, + "time_per_iteration": 2.8127663135528564 + }, + { + "auxiliary_loss_clip": 0.01044473, + "auxiliary_loss_mlp": 0.01038443, + "balance_loss_clip": 1.02858078, + "balance_loss_mlp": 1.02531219, + "epoch": 0.3101758605140538, + "flos": 20011329440640.0, + "grad_norm": 1.7839898126753517, + "language_loss": 0.84382588, + "learning_rate": 3.2315207209505896e-06, + "loss": 0.86465502, + "num_input_tokens_seen": 110784655, + "step": 5159, + "time_per_iteration": 2.7699849605560303 + }, + { + "auxiliary_loss_clip": 0.01065136, + "auxiliary_loss_mlp": 0.01038989, + "balance_loss_clip": 1.02879632, + "balance_loss_mlp": 1.02378368, + "epoch": 0.3102359837667218, + "flos": 19135683688320.0, + "grad_norm": 1.7103018175408164, + "language_loss": 0.85029745, + "learning_rate": 3.231213827702462e-06, + "loss": 0.87133873, + "num_input_tokens_seen": 110802545, + "step": 5160, + "time_per_iteration": 2.6594622135162354 + }, + { + "auxiliary_loss_clip": 0.01076356, + "auxiliary_loss_mlp": 0.01036859, + "balance_loss_clip": 1.03043008, + "balance_loss_mlp": 1.02264905, + "epoch": 0.31029610701938976, + "flos": 22265405568000.0, + "grad_norm": 1.8426891199477022, + "language_loss": 0.7575264, + "learning_rate": 3.230906887766584e-06, + "loss": 0.77865857, + "num_input_tokens_seen": 110820265, + "step": 5161, + "time_per_iteration": 2.6258270740509033 + }, + { + "auxiliary_loss_clip": 0.01078161, + "auxiliary_loss_mlp": 0.01036348, + "balance_loss_clip": 1.02947712, + "balance_loss_mlp": 1.02200687, + "epoch": 0.3103562302720577, + "flos": 20805349536000.0, + "grad_norm": 1.8656070425728297, + "language_loss": 0.81520176, + "learning_rate": 3.2305999011545924e-06, + "loss": 0.83634686, + "num_input_tokens_seen": 110836195, + "step": 5162, + "time_per_iteration": 2.605989456176758 + }, + { + "auxiliary_loss_clip": 0.01075436, + "auxiliary_loss_mlp": 0.01036565, + "balance_loss_clip": 1.03067064, + "balance_loss_mlp": 1.02360058, + "epoch": 0.3104163535247257, + "flos": 22344158136960.0, + "grad_norm": 2.2834009009253595, + "language_loss": 0.82949406, + "learning_rate": 3.2302928678781295e-06, + "loss": 0.85061407, + "num_input_tokens_seen": 110856420, + "step": 5163, + "time_per_iteration": 2.600607395172119 + }, + { + "auxiliary_loss_clip": 0.01092381, + "auxiliary_loss_mlp": 0.01043, + "balance_loss_clip": 1.03442025, + "balance_loss_mlp": 1.02782512, + "epoch": 0.31047647677739365, + "flos": 21689363157120.0, + "grad_norm": 1.647630562781693, + "language_loss": 0.76439536, + "learning_rate": 3.2299857879488376e-06, + "loss": 0.78574914, + "num_input_tokens_seen": 110876650, + "step": 5164, + "time_per_iteration": 2.588411808013916 + }, + { + "auxiliary_loss_clip": 0.01045083, + "auxiliary_loss_mlp": 0.01040142, + "balance_loss_clip": 1.0306623, + "balance_loss_mlp": 1.02535415, + "epoch": 0.3105366000300616, + "flos": 18917275040640.0, + "grad_norm": 1.7655404607417498, + "language_loss": 0.74808991, + "learning_rate": 3.2296786613783626e-06, + "loss": 0.76894218, + "num_input_tokens_seen": 110894445, + "step": 5165, + "time_per_iteration": 2.711355209350586 + }, + { + "auxiliary_loss_clip": 0.01051911, + "auxiliary_loss_mlp": 0.01045871, + "balance_loss_clip": 1.03201437, + "balance_loss_mlp": 1.02980137, + "epoch": 0.3105967232827296, + "flos": 18260397072000.0, + "grad_norm": 1.476089232184947, + "language_loss": 0.75737023, + "learning_rate": 3.229371488178348e-06, + "loss": 0.77834797, + "num_input_tokens_seen": 110912855, + "step": 5166, + "time_per_iteration": 2.692127227783203 + }, + { + "auxiliary_loss_clip": 0.01067187, + "auxiliary_loss_mlp": 0.01037205, + "balance_loss_clip": 1.02994275, + "balance_loss_mlp": 1.0218029, + "epoch": 0.31065684653539755, + "flos": 17672144037120.0, + "grad_norm": 2.066794972745593, + "language_loss": 0.73241651, + "learning_rate": 3.229064268360444e-06, + "loss": 0.75346041, + "num_input_tokens_seen": 110928025, + "step": 5167, + "time_per_iteration": 2.5973384380340576 + }, + { + "auxiliary_loss_clip": 0.00984461, + "auxiliary_loss_mlp": 0.01017777, + "balance_loss_clip": 1.01138878, + "balance_loss_mlp": 1.01514232, + "epoch": 0.3107169697880655, + "flos": 68531996511360.0, + "grad_norm": 0.7174842554109343, + "language_loss": 0.52979386, + "learning_rate": 3.2287570019362997e-06, + "loss": 0.54981625, + "num_input_tokens_seen": 110992215, + "step": 5168, + "time_per_iteration": 3.329366683959961 + }, + { + "auxiliary_loss_clip": 0.01080731, + "auxiliary_loss_mlp": 0.01038186, + "balance_loss_clip": 1.0324986, + "balance_loss_mlp": 1.02220035, + "epoch": 0.3107770930407335, + "flos": 13188733274880.0, + "grad_norm": 2.079676909645861, + "language_loss": 0.78509736, + "learning_rate": 3.2284496889175668e-06, + "loss": 0.80628651, + "num_input_tokens_seen": 111010400, + "step": 5169, + "time_per_iteration": 2.7009048461914062 + }, + { + "auxiliary_loss_clip": 0.01064416, + "auxiliary_loss_mlp": 0.01035865, + "balance_loss_clip": 1.02852178, + "balance_loss_mlp": 1.02147079, + "epoch": 0.3108372162934015, + "flos": 31580849520000.0, + "grad_norm": 1.559701783085579, + "language_loss": 0.640733, + "learning_rate": 3.2281423293158986e-06, + "loss": 0.66173577, + "num_input_tokens_seen": 111033960, + "step": 5170, + "time_per_iteration": 2.7778866291046143 + }, + { + "auxiliary_loss_clip": 0.01057437, + "auxiliary_loss_mlp": 0.00748838, + "balance_loss_clip": 1.03402913, + "balance_loss_mlp": 1.0015254, + "epoch": 0.31089733954606946, + "flos": 28729829266560.0, + "grad_norm": 2.361713313194787, + "language_loss": 0.77867234, + "learning_rate": 3.22783492314295e-06, + "loss": 0.79673511, + "num_input_tokens_seen": 111053265, + "step": 5171, + "time_per_iteration": 2.835206985473633 + }, + { + "auxiliary_loss_clip": 0.01050246, + "auxiliary_loss_mlp": 0.01047622, + "balance_loss_clip": 1.0340538, + "balance_loss_mlp": 1.03231561, + "epoch": 0.3109574627987374, + "flos": 19683249592320.0, + "grad_norm": 1.994517348397462, + "language_loss": 0.83390266, + "learning_rate": 3.2275274704103785e-06, + "loss": 0.85488141, + "num_input_tokens_seen": 111071130, + "step": 5172, + "time_per_iteration": 2.9693148136138916 + }, + { + "auxiliary_loss_clip": 0.01045549, + "auxiliary_loss_mlp": 0.0104595, + "balance_loss_clip": 1.03370285, + "balance_loss_mlp": 1.02999949, + "epoch": 0.3110175860514054, + "flos": 14683981656960.0, + "grad_norm": 2.5931248665951188, + "language_loss": 0.84048891, + "learning_rate": 3.227219971129842e-06, + "loss": 0.86140382, + "num_input_tokens_seen": 111089560, + "step": 5173, + "time_per_iteration": 2.7402455806732178 + }, + { + "auxiliary_loss_clip": 0.01086384, + "auxiliary_loss_mlp": 0.01032317, + "balance_loss_clip": 1.03272772, + "balance_loss_mlp": 1.01866746, + "epoch": 0.31107770930407336, + "flos": 25739655724800.0, + "grad_norm": 1.5742684364754371, + "language_loss": 0.83481151, + "learning_rate": 3.226912425313001e-06, + "loss": 0.85599852, + "num_input_tokens_seen": 111109960, + "step": 5174, + "time_per_iteration": 2.674372673034668 + }, + { + "auxiliary_loss_clip": 0.01067813, + "auxiliary_loss_mlp": 0.01041936, + "balance_loss_clip": 1.03249264, + "balance_loss_mlp": 1.02713597, + "epoch": 0.3111378325567413, + "flos": 19208259118080.0, + "grad_norm": 2.0434179299637494, + "language_loss": 0.85347867, + "learning_rate": 3.2266048329715183e-06, + "loss": 0.87457615, + "num_input_tokens_seen": 111127960, + "step": 5175, + "time_per_iteration": 2.7028918266296387 + }, + { + "auxiliary_loss_clip": 0.01025582, + "auxiliary_loss_mlp": 0.01039301, + "balance_loss_clip": 1.02783239, + "balance_loss_mlp": 1.02332735, + "epoch": 0.3111979558094093, + "flos": 23696374561920.0, + "grad_norm": 1.759368910666937, + "language_loss": 0.83878684, + "learning_rate": 3.2262971941170575e-06, + "loss": 0.85943568, + "num_input_tokens_seen": 111146730, + "step": 5176, + "time_per_iteration": 2.8232905864715576 + }, + { + "auxiliary_loss_clip": 0.01068272, + "auxiliary_loss_mlp": 0.01041403, + "balance_loss_clip": 1.02661955, + "balance_loss_mlp": 1.02563155, + "epoch": 0.31125807906207725, + "flos": 21033023892480.0, + "grad_norm": 1.9101294640789372, + "language_loss": 0.80975181, + "learning_rate": 3.2259895087612837e-06, + "loss": 0.83084857, + "num_input_tokens_seen": 111166295, + "step": 5177, + "time_per_iteration": 2.6528635025024414 + }, + { + "auxiliary_loss_clip": 0.01072173, + "auxiliary_loss_mlp": 0.00748526, + "balance_loss_clip": 1.03099537, + "balance_loss_mlp": 1.00147593, + "epoch": 0.3113182023147452, + "flos": 23076628277760.0, + "grad_norm": 1.5875175294618655, + "language_loss": 0.81059766, + "learning_rate": 3.2256817769158657e-06, + "loss": 0.82880461, + "num_input_tokens_seen": 111185665, + "step": 5178, + "time_per_iteration": 2.711883783340454 + }, + { + "auxiliary_loss_clip": 0.01071442, + "auxiliary_loss_mlp": 0.01040458, + "balance_loss_clip": 1.0354104, + "balance_loss_mlp": 1.02599156, + "epoch": 0.3113783255674132, + "flos": 11838994888320.0, + "grad_norm": 2.0429788134030167, + "language_loss": 0.81263286, + "learning_rate": 3.225373998592471e-06, + "loss": 0.8337518, + "num_input_tokens_seen": 111201615, + "step": 5179, + "time_per_iteration": 4.252765417098999 + }, + { + "auxiliary_loss_clip": 0.01055315, + "auxiliary_loss_mlp": 0.01044393, + "balance_loss_clip": 1.03112328, + "balance_loss_mlp": 1.02964675, + "epoch": 0.31143844882008115, + "flos": 16289547684480.0, + "grad_norm": 1.6821113715665197, + "language_loss": 0.78592998, + "learning_rate": 3.2250661738027715e-06, + "loss": 0.80692708, + "num_input_tokens_seen": 111220515, + "step": 5180, + "time_per_iteration": 2.8721535205841064 + }, + { + "auxiliary_loss_clip": 0.0105489, + "auxiliary_loss_mlp": 0.01032674, + "balance_loss_clip": 1.03534865, + "balance_loss_mlp": 1.01842809, + "epoch": 0.3114985720727491, + "flos": 23217792727680.0, + "grad_norm": 1.7164959310668746, + "language_loss": 0.83336538, + "learning_rate": 3.22475830255844e-06, + "loss": 0.85424101, + "num_input_tokens_seen": 111240395, + "step": 5181, + "time_per_iteration": 2.9345169067382812 + }, + { + "auxiliary_loss_clip": 0.01055709, + "auxiliary_loss_mlp": 0.0103826, + "balance_loss_clip": 1.03038549, + "balance_loss_mlp": 1.02440214, + "epoch": 0.3115586953254171, + "flos": 30044626698240.0, + "grad_norm": 1.5641641465890237, + "language_loss": 0.73908037, + "learning_rate": 3.2244503848711516e-06, + "loss": 0.76002002, + "num_input_tokens_seen": 111261100, + "step": 5182, + "time_per_iteration": 4.459765195846558 + }, + { + "auxiliary_loss_clip": 0.01040131, + "auxiliary_loss_mlp": 0.007486, + "balance_loss_clip": 1.02976513, + "balance_loss_mlp": 1.00156987, + "epoch": 0.3116188185780851, + "flos": 25666326109440.0, + "grad_norm": 1.867276263111633, + "language_loss": 0.70127457, + "learning_rate": 3.2241424207525815e-06, + "loss": 0.71916187, + "num_input_tokens_seen": 111281320, + "step": 5183, + "time_per_iteration": 2.9073708057403564 + }, + { + "auxiliary_loss_clip": 0.00991966, + "auxiliary_loss_mlp": 0.01004012, + "balance_loss_clip": 1.0067277, + "balance_loss_mlp": 1.00193775, + "epoch": 0.31167894183075306, + "flos": 69510058917120.0, + "grad_norm": 0.9539824506074622, + "language_loss": 0.59617531, + "learning_rate": 3.223834410214408e-06, + "loss": 0.61613512, + "num_input_tokens_seen": 111341405, + "step": 5184, + "time_per_iteration": 3.4232394695281982 + }, + { + "auxiliary_loss_clip": 0.01058676, + "auxiliary_loss_mlp": 0.01040811, + "balance_loss_clip": 1.02787614, + "balance_loss_mlp": 1.02680933, + "epoch": 0.31173906508342103, + "flos": 14939845211520.0, + "grad_norm": 2.641881146184396, + "language_loss": 0.70082712, + "learning_rate": 3.223526353268311e-06, + "loss": 0.7218219, + "num_input_tokens_seen": 111358975, + "step": 5185, + "time_per_iteration": 2.824226140975952 + }, + { + "auxiliary_loss_clip": 0.01067909, + "auxiliary_loss_mlp": 0.01045847, + "balance_loss_clip": 1.0369575, + "balance_loss_mlp": 1.03038585, + "epoch": 0.311799188336089, + "flos": 16176033728640.0, + "grad_norm": 2.6900776903821964, + "language_loss": 0.63898623, + "learning_rate": 3.2232182499259725e-06, + "loss": 0.66012377, + "num_input_tokens_seen": 111375845, + "step": 5186, + "time_per_iteration": 2.866290330886841 + }, + { + "auxiliary_loss_clip": 0.0106883, + "auxiliary_loss_mlp": 0.01043428, + "balance_loss_clip": 1.0308311, + "balance_loss_mlp": 1.02745426, + "epoch": 0.31185931158875696, + "flos": 25009627708800.0, + "grad_norm": 2.0637403714235956, + "language_loss": 0.86825079, + "learning_rate": 3.2229101001990747e-06, + "loss": 0.88937342, + "num_input_tokens_seen": 111394150, + "step": 5187, + "time_per_iteration": 2.664266347885132 + }, + { + "auxiliary_loss_clip": 0.01089827, + "auxiliary_loss_mlp": 0.00748523, + "balance_loss_clip": 1.03365815, + "balance_loss_mlp": 1.00142956, + "epoch": 0.3119194348414249, + "flos": 37232901273600.0, + "grad_norm": 1.4113013286474725, + "language_loss": 0.62921154, + "learning_rate": 3.2226019040993036e-06, + "loss": 0.64759505, + "num_input_tokens_seen": 111418355, + "step": 5188, + "time_per_iteration": 2.708641767501831 + }, + { + "auxiliary_loss_clip": 0.01063169, + "auxiliary_loss_mlp": 0.0104595, + "balance_loss_clip": 1.03844094, + "balance_loss_mlp": 1.03094125, + "epoch": 0.3119795580940929, + "flos": 15012779777280.0, + "grad_norm": 2.1769240879645673, + "language_loss": 0.82552159, + "learning_rate": 3.222293661638346e-06, + "loss": 0.84661269, + "num_input_tokens_seen": 111435445, + "step": 5189, + "time_per_iteration": 2.6864426136016846 + }, + { + "auxiliary_loss_clip": 0.00974477, + "auxiliary_loss_mlp": 0.01031726, + "balance_loss_clip": 1.02006888, + "balance_loss_mlp": 1.01625264, + "epoch": 0.31203968134676086, + "flos": 15998168557440.0, + "grad_norm": 1.668882051859127, + "language_loss": 0.79070675, + "learning_rate": 3.22198537282789e-06, + "loss": 0.81076872, + "num_input_tokens_seen": 111453430, + "step": 5190, + "time_per_iteration": 4.601499080657959 + }, + { + "auxiliary_loss_clip": 0.01024311, + "auxiliary_loss_mlp": 0.01047327, + "balance_loss_clip": 1.02383578, + "balance_loss_mlp": 1.02995229, + "epoch": 0.3120998045994288, + "flos": 23837359443840.0, + "grad_norm": 1.8835839317480558, + "language_loss": 0.75391901, + "learning_rate": 3.2216770376796262e-06, + "loss": 0.77463537, + "num_input_tokens_seen": 111475325, + "step": 5191, + "time_per_iteration": 3.0882625579833984 + }, + { + "auxiliary_loss_clip": 0.01018458, + "auxiliary_loss_mlp": 0.00747637, + "balance_loss_clip": 1.01439285, + "balance_loss_mlp": 1.00190568, + "epoch": 0.3121599278520968, + "flos": 69184205712000.0, + "grad_norm": 0.8356672101595308, + "language_loss": 0.63911098, + "learning_rate": 3.221368656205247e-06, + "loss": 0.65677196, + "num_input_tokens_seen": 111533960, + "step": 5192, + "time_per_iteration": 4.847533226013184 + }, + { + "auxiliary_loss_clip": 0.01079147, + "auxiliary_loss_mlp": 0.01043222, + "balance_loss_clip": 1.03194368, + "balance_loss_mlp": 1.02724814, + "epoch": 0.31222005110476475, + "flos": 23806368984960.0, + "grad_norm": 1.7896428200604673, + "language_loss": 0.80084234, + "learning_rate": 3.221060228416446e-06, + "loss": 0.82206607, + "num_input_tokens_seen": 111554055, + "step": 5193, + "time_per_iteration": 2.714265823364258 + }, + { + "auxiliary_loss_clip": 0.01060493, + "auxiliary_loss_mlp": 0.01048652, + "balance_loss_clip": 1.02947617, + "balance_loss_mlp": 1.03154516, + "epoch": 0.3122801743574327, + "flos": 25226132935680.0, + "grad_norm": 2.2158240367346833, + "language_loss": 0.72330093, + "learning_rate": 3.2207517543249183e-06, + "loss": 0.7443924, + "num_input_tokens_seen": 111574305, + "step": 5194, + "time_per_iteration": 2.706084966659546 + }, + { + "auxiliary_loss_clip": 0.0108953, + "auxiliary_loss_mlp": 0.01037524, + "balance_loss_clip": 1.03389025, + "balance_loss_mlp": 1.02345145, + "epoch": 0.3123402976101007, + "flos": 22966490200320.0, + "grad_norm": 1.4485106734791517, + "language_loss": 0.76837867, + "learning_rate": 3.2204432339423616e-06, + "loss": 0.78964925, + "num_input_tokens_seen": 111595680, + "step": 5195, + "time_per_iteration": 2.6571314334869385 + }, + { + "auxiliary_loss_clip": 0.01087479, + "auxiliary_loss_mlp": 0.01040474, + "balance_loss_clip": 1.03024602, + "balance_loss_mlp": 1.02593064, + "epoch": 0.3124004208627687, + "flos": 25192089820800.0, + "grad_norm": 1.4614210643279246, + "language_loss": 0.78009528, + "learning_rate": 3.220134667280476e-06, + "loss": 0.80137479, + "num_input_tokens_seen": 111618135, + "step": 5196, + "time_per_iteration": 2.6503515243530273 + }, + { + "auxiliary_loss_clip": 0.00998058, + "auxiliary_loss_mlp": 0.00747566, + "balance_loss_clip": 1.00475764, + "balance_loss_mlp": 1.00195444, + "epoch": 0.31246054411543667, + "flos": 67485165517440.0, + "grad_norm": 0.7698923589407127, + "language_loss": 0.54764557, + "learning_rate": 3.2198260543509613e-06, + "loss": 0.56510186, + "num_input_tokens_seen": 111682220, + "step": 5197, + "time_per_iteration": 3.2613236904144287 + }, + { + "auxiliary_loss_clip": 0.0108776, + "auxiliary_loss_mlp": 0.01034489, + "balance_loss_clip": 1.03265655, + "balance_loss_mlp": 1.02061927, + "epoch": 0.31252066736810463, + "flos": 17858520731520.0, + "grad_norm": 1.6820222150886244, + "language_loss": 0.66361833, + "learning_rate": 3.21951739516552e-06, + "loss": 0.6848408, + "num_input_tokens_seen": 111700815, + "step": 5198, + "time_per_iteration": 2.7021231651306152 + }, + { + "auxiliary_loss_clip": 0.0104438, + "auxiliary_loss_mlp": 0.01036238, + "balance_loss_clip": 1.02737677, + "balance_loss_mlp": 1.02001333, + "epoch": 0.3125807906207726, + "flos": 18475034791680.0, + "grad_norm": 2.462273618242146, + "language_loss": 0.6934216, + "learning_rate": 3.219208689735857e-06, + "loss": 0.7142278, + "num_input_tokens_seen": 111718195, + "step": 5199, + "time_per_iteration": 2.619532585144043 + }, + { + "auxiliary_loss_clip": 0.01077323, + "auxiliary_loss_mlp": 0.01044201, + "balance_loss_clip": 1.03069687, + "balance_loss_mlp": 1.02908504, + "epoch": 0.31264091387344056, + "flos": 18946541646720.0, + "grad_norm": 1.7958661165194967, + "language_loss": 0.78673923, + "learning_rate": 3.2188999380736785e-06, + "loss": 0.80795443, + "num_input_tokens_seen": 111734440, + "step": 5200, + "time_per_iteration": 2.562922716140747 + }, + { + "auxiliary_loss_clip": 0.01076978, + "auxiliary_loss_mlp": 0.01031492, + "balance_loss_clip": 1.03245318, + "balance_loss_mlp": 1.01693702, + "epoch": 0.3127010371261085, + "flos": 21468512384640.0, + "grad_norm": 1.9303074369080166, + "language_loss": 0.83862996, + "learning_rate": 3.2185911401906917e-06, + "loss": 0.85971469, + "num_input_tokens_seen": 111751960, + "step": 5201, + "time_per_iteration": 2.608668804168701 + }, + { + "auxiliary_loss_clip": 0.01090393, + "auxiliary_loss_mlp": 0.01041116, + "balance_loss_clip": 1.03381419, + "balance_loss_mlp": 1.02602422, + "epoch": 0.3127611603787765, + "flos": 15336047203200.0, + "grad_norm": 1.915678374808205, + "language_loss": 0.69455254, + "learning_rate": 3.2182822960986072e-06, + "loss": 0.71586758, + "num_input_tokens_seen": 111769585, + "step": 5202, + "time_per_iteration": 2.616429090499878 + }, + { + "auxiliary_loss_clip": 0.01089727, + "auxiliary_loss_mlp": 0.01040274, + "balance_loss_clip": 1.03253913, + "balance_loss_mlp": 1.02639794, + "epoch": 0.31282128363144446, + "flos": 17602980399360.0, + "grad_norm": 1.7232330107358917, + "language_loss": 0.83754766, + "learning_rate": 3.2179734058091358e-06, + "loss": 0.85884774, + "num_input_tokens_seen": 111787880, + "step": 5203, + "time_per_iteration": 2.7906556129455566 + }, + { + "auxiliary_loss_clip": 0.01039749, + "auxiliary_loss_mlp": 0.01043511, + "balance_loss_clip": 1.03145313, + "balance_loss_mlp": 1.02824605, + "epoch": 0.3128814068841124, + "flos": 26756753235840.0, + "grad_norm": 1.8979719553315058, + "language_loss": 0.60871583, + "learning_rate": 3.2176644693339913e-06, + "loss": 0.62954843, + "num_input_tokens_seen": 111805950, + "step": 5204, + "time_per_iteration": 2.782787799835205 + }, + { + "auxiliary_loss_clip": 0.01053003, + "auxiliary_loss_mlp": 0.01035744, + "balance_loss_clip": 1.02859378, + "balance_loss_mlp": 1.02242243, + "epoch": 0.3129415301367804, + "flos": 22272372806400.0, + "grad_norm": 1.9054875443299018, + "language_loss": 0.65798855, + "learning_rate": 3.217355486684887e-06, + "loss": 0.67887598, + "num_input_tokens_seen": 111826135, + "step": 5205, + "time_per_iteration": 2.667116165161133 + }, + { + "auxiliary_loss_clip": 0.01078653, + "auxiliary_loss_mlp": 0.01038946, + "balance_loss_clip": 1.03194642, + "balance_loss_mlp": 1.02373481, + "epoch": 0.31300165338944835, + "flos": 26464907232000.0, + "grad_norm": 1.7741656057756838, + "language_loss": 0.76679689, + "learning_rate": 3.2170464578735414e-06, + "loss": 0.78797293, + "num_input_tokens_seen": 111844700, + "step": 5206, + "time_per_iteration": 2.671170949935913 + }, + { + "auxiliary_loss_clip": 0.01085314, + "auxiliary_loss_mlp": 0.01036038, + "balance_loss_clip": 1.030967, + "balance_loss_mlp": 1.0219121, + "epoch": 0.3130617766421163, + "flos": 21944652094080.0, + "grad_norm": 3.117853154155397, + "language_loss": 0.83440524, + "learning_rate": 3.216737382911672e-06, + "loss": 0.85561883, + "num_input_tokens_seen": 111861585, + "step": 5207, + "time_per_iteration": 2.5491061210632324 + }, + { + "auxiliary_loss_clip": 0.01073938, + "auxiliary_loss_mlp": 0.0103959, + "balance_loss_clip": 1.03161097, + "balance_loss_mlp": 1.02672184, + "epoch": 0.3131218998947843, + "flos": 23292774368640.0, + "grad_norm": 1.5522003423411137, + "language_loss": 0.7117449, + "learning_rate": 3.216428261810999e-06, + "loss": 0.73288018, + "num_input_tokens_seen": 111882950, + "step": 5208, + "time_per_iteration": 2.7529067993164062 + }, + { + "auxiliary_loss_clip": 0.01070389, + "auxiliary_loss_mlp": 0.01043303, + "balance_loss_clip": 1.03470433, + "balance_loss_mlp": 1.02853942, + "epoch": 0.3131820231474523, + "flos": 21139642437120.0, + "grad_norm": 2.1255657135471204, + "language_loss": 0.74336791, + "learning_rate": 3.2161190945832445e-06, + "loss": 0.76450479, + "num_input_tokens_seen": 111901640, + "step": 5209, + "time_per_iteration": 2.8693010807037354 + }, + { + "auxiliary_loss_clip": 0.01087135, + "auxiliary_loss_mlp": 0.01034403, + "balance_loss_clip": 1.03080726, + "balance_loss_mlp": 1.02131987, + "epoch": 0.31324214640012027, + "flos": 23909863046400.0, + "grad_norm": 2.052106892499828, + "language_loss": 0.77456725, + "learning_rate": 3.2158098812401325e-06, + "loss": 0.79578257, + "num_input_tokens_seen": 111919615, + "step": 5210, + "time_per_iteration": 2.6792056560516357 + }, + { + "auxiliary_loss_clip": 0.01073894, + "auxiliary_loss_mlp": 0.01034979, + "balance_loss_clip": 1.03141475, + "balance_loss_mlp": 1.02151394, + "epoch": 0.31330226965278823, + "flos": 22236929061120.0, + "grad_norm": 1.7751947435404893, + "language_loss": 0.79378378, + "learning_rate": 3.2155006217933874e-06, + "loss": 0.8148725, + "num_input_tokens_seen": 111938485, + "step": 5211, + "time_per_iteration": 2.593966484069824 + }, + { + "auxiliary_loss_clip": 0.01076009, + "auxiliary_loss_mlp": 0.01032805, + "balance_loss_clip": 1.03240657, + "balance_loss_mlp": 1.01982355, + "epoch": 0.3133623929054562, + "flos": 19753993428480.0, + "grad_norm": 2.3064669329758996, + "language_loss": 0.79206121, + "learning_rate": 3.2151913162547367e-06, + "loss": 0.81314939, + "num_input_tokens_seen": 111956425, + "step": 5212, + "time_per_iteration": 2.593515396118164 + }, + { + "auxiliary_loss_clip": 0.01068026, + "auxiliary_loss_mlp": 0.01045121, + "balance_loss_clip": 1.0311259, + "balance_loss_mlp": 1.03048778, + "epoch": 0.31342251615812416, + "flos": 27162256849920.0, + "grad_norm": 1.8401338986991211, + "language_loss": 0.70681286, + "learning_rate": 3.2148819646359097e-06, + "loss": 0.72794431, + "num_input_tokens_seen": 111975915, + "step": 5213, + "time_per_iteration": 2.845623254776001 + }, + { + "auxiliary_loss_clip": 0.01078756, + "auxiliary_loss_mlp": 0.01038859, + "balance_loss_clip": 1.03334808, + "balance_loss_mlp": 1.02460718, + "epoch": 0.31348263941079213, + "flos": 20229809915520.0, + "grad_norm": 1.7734919031470844, + "language_loss": 0.77591562, + "learning_rate": 3.2145725669486374e-06, + "loss": 0.79709172, + "num_input_tokens_seen": 111995055, + "step": 5214, + "time_per_iteration": 2.7052361965179443 + }, + { + "auxiliary_loss_clip": 0.01048572, + "auxiliary_loss_mlp": 0.01034273, + "balance_loss_clip": 1.0347507, + "balance_loss_mlp": 1.02078485, + "epoch": 0.3135427626634601, + "flos": 24607643627520.0, + "grad_norm": 2.2606836433576687, + "language_loss": 0.82680643, + "learning_rate": 3.2142631232046517e-06, + "loss": 0.84763491, + "num_input_tokens_seen": 112015830, + "step": 5215, + "time_per_iteration": 2.7613508701324463 + }, + { + "auxiliary_loss_clip": 0.01078179, + "auxiliary_loss_mlp": 0.01033791, + "balance_loss_clip": 1.0325073, + "balance_loss_mlp": 1.0194267, + "epoch": 0.31360288591612806, + "flos": 20959873845120.0, + "grad_norm": 2.121583995114013, + "language_loss": 0.79947591, + "learning_rate": 3.213953633415686e-06, + "loss": 0.82059562, + "num_input_tokens_seen": 112035065, + "step": 5216, + "time_per_iteration": 2.6053307056427 + }, + { + "auxiliary_loss_clip": 0.01062192, + "auxiliary_loss_mlp": 0.01045374, + "balance_loss_clip": 1.02867126, + "balance_loss_mlp": 1.02968609, + "epoch": 0.313663009168796, + "flos": 26980513009920.0, + "grad_norm": 1.9016914487878682, + "language_loss": 0.68634021, + "learning_rate": 3.213644097593477e-06, + "loss": 0.70741594, + "num_input_tokens_seen": 112058405, + "step": 5217, + "time_per_iteration": 2.7115015983581543 + }, + { + "auxiliary_loss_clip": 0.01062868, + "auxiliary_loss_mlp": 0.01032098, + "balance_loss_clip": 1.02873111, + "balance_loss_mlp": 1.01832938, + "epoch": 0.313723132421464, + "flos": 18040911016320.0, + "grad_norm": 1.524165390797523, + "language_loss": 0.81062317, + "learning_rate": 3.2133345157497624e-06, + "loss": 0.83157283, + "num_input_tokens_seen": 112076420, + "step": 5218, + "time_per_iteration": 2.6125869750976562 + }, + { + "auxiliary_loss_clip": 0.01085416, + "auxiliary_loss_mlp": 0.01035177, + "balance_loss_clip": 1.02911425, + "balance_loss_mlp": 1.02062142, + "epoch": 0.31378325567413196, + "flos": 22488913946880.0, + "grad_norm": 2.1738587057815746, + "language_loss": 0.6925143, + "learning_rate": 3.2130248878962813e-06, + "loss": 0.71372026, + "num_input_tokens_seen": 112090775, + "step": 5219, + "time_per_iteration": 2.813204526901245 + }, + { + "auxiliary_loss_clip": 0.01063943, + "auxiliary_loss_mlp": 0.01038135, + "balance_loss_clip": 1.02886367, + "balance_loss_mlp": 1.02478337, + "epoch": 0.3138433789267999, + "flos": 22419247518720.0, + "grad_norm": 2.303947178155899, + "language_loss": 0.79318094, + "learning_rate": 3.2127152140447747e-06, + "loss": 0.81420171, + "num_input_tokens_seen": 112110980, + "step": 5220, + "time_per_iteration": 2.879429817199707 + }, + { + "auxiliary_loss_clip": 0.01076845, + "auxiliary_loss_mlp": 0.01033723, + "balance_loss_clip": 1.03175688, + "balance_loss_mlp": 1.02065122, + "epoch": 0.3139035021794679, + "flos": 13005912026880.0, + "grad_norm": 1.687400672694976, + "language_loss": 0.72983342, + "learning_rate": 3.212405494206986e-06, + "loss": 0.75093913, + "num_input_tokens_seen": 112129020, + "step": 5221, + "time_per_iteration": 2.645259380340576 + }, + { + "auxiliary_loss_clip": 0.01052609, + "auxiliary_loss_mlp": 0.01033754, + "balance_loss_clip": 1.02806234, + "balance_loss_mlp": 1.02022421, + "epoch": 0.31396362543213585, + "flos": 16945994689920.0, + "grad_norm": 1.628550796350182, + "language_loss": 0.81788254, + "learning_rate": 3.2120957283946588e-06, + "loss": 0.83874619, + "num_input_tokens_seen": 112147865, + "step": 5222, + "time_per_iteration": 2.6936416625976562 + }, + { + "auxiliary_loss_clip": 0.01075767, + "auxiliary_loss_mlp": 0.01039253, + "balance_loss_clip": 1.02921724, + "balance_loss_mlp": 1.02438712, + "epoch": 0.31402374868480387, + "flos": 20156731695360.0, + "grad_norm": 2.057399172884289, + "language_loss": 0.69582498, + "learning_rate": 3.2117859166195407e-06, + "loss": 0.71697521, + "num_input_tokens_seen": 112166745, + "step": 5223, + "time_per_iteration": 2.5797626972198486 + }, + { + "auxiliary_loss_clip": 0.01063884, + "auxiliary_loss_mlp": 0.00748193, + "balance_loss_clip": 1.02643561, + "balance_loss_mlp": 1.00119376, + "epoch": 0.31408387193747184, + "flos": 21251073404160.0, + "grad_norm": 1.5385440394311931, + "language_loss": 0.8050971, + "learning_rate": 3.211476058893379e-06, + "loss": 0.82321781, + "num_input_tokens_seen": 112185895, + "step": 5224, + "time_per_iteration": 2.5984582901000977 + }, + { + "auxiliary_loss_clip": 0.01085024, + "auxiliary_loss_mlp": 0.01041222, + "balance_loss_clip": 1.03603733, + "balance_loss_mlp": 1.0266428, + "epoch": 0.3141439951901398, + "flos": 27484267299840.0, + "grad_norm": 2.1177005410445675, + "language_loss": 0.58021396, + "learning_rate": 3.2111661552279243e-06, + "loss": 0.60147637, + "num_input_tokens_seen": 112204465, + "step": 5225, + "time_per_iteration": 2.7487142086029053 + }, + { + "auxiliary_loss_clip": 0.0103454, + "auxiliary_loss_mlp": 0.01030202, + "balance_loss_clip": 1.02592993, + "balance_loss_mlp": 1.01800084, + "epoch": 0.31420411844280777, + "flos": 17852235851520.0, + "grad_norm": 1.8352617668378637, + "language_loss": 0.81735194, + "learning_rate": 3.2108562056349273e-06, + "loss": 0.83799934, + "num_input_tokens_seen": 112221635, + "step": 5226, + "time_per_iteration": 4.255790710449219 + }, + { + "auxiliary_loss_clip": 0.01069556, + "auxiliary_loss_mlp": 0.01043231, + "balance_loss_clip": 1.03047502, + "balance_loss_mlp": 1.02821052, + "epoch": 0.31426424169547573, + "flos": 21616967295360.0, + "grad_norm": 1.9738062930150269, + "language_loss": 0.73969299, + "learning_rate": 3.210546210126141e-06, + "loss": 0.76082087, + "num_input_tokens_seen": 112241240, + "step": 5227, + "time_per_iteration": 2.604792594909668 + }, + { + "auxiliary_loss_clip": 0.01081597, + "auxiliary_loss_mlp": 0.01039969, + "balance_loss_clip": 1.03683543, + "balance_loss_mlp": 1.02531803, + "epoch": 0.3143243649481437, + "flos": 30920631586560.0, + "grad_norm": 1.6900618055413779, + "language_loss": 0.67971218, + "learning_rate": 3.2102361687133213e-06, + "loss": 0.70092785, + "num_input_tokens_seen": 112262350, + "step": 5228, + "time_per_iteration": 2.8907244205474854 + }, + { + "auxiliary_loss_clip": 0.01065993, + "auxiliary_loss_mlp": 0.01038453, + "balance_loss_clip": 1.03065944, + "balance_loss_mlp": 1.02522099, + "epoch": 0.31438448820081166, + "flos": 22821411168000.0, + "grad_norm": 1.853407405437719, + "language_loss": 0.7976464, + "learning_rate": 3.2099260814082254e-06, + "loss": 0.8186909, + "num_input_tokens_seen": 112283710, + "step": 5229, + "time_per_iteration": 4.239457845687866 + }, + { + "auxiliary_loss_clip": 0.01065172, + "auxiliary_loss_mlp": 0.01031867, + "balance_loss_clip": 1.03110504, + "balance_loss_mlp": 1.01807451, + "epoch": 0.3144446114534796, + "flos": 23292127923840.0, + "grad_norm": 1.7429432811926038, + "language_loss": 0.696495, + "learning_rate": 3.209615948222611e-06, + "loss": 0.7174654, + "num_input_tokens_seen": 112304285, + "step": 5230, + "time_per_iteration": 2.6676249504089355 + }, + { + "auxiliary_loss_clip": 0.01046885, + "auxiliary_loss_mlp": 0.01041579, + "balance_loss_clip": 1.02812922, + "balance_loss_mlp": 1.02599859, + "epoch": 0.3145047347061476, + "flos": 31355976424320.0, + "grad_norm": 1.6083892004824523, + "language_loss": 0.79667836, + "learning_rate": 3.209305769168239e-06, + "loss": 0.81756294, + "num_input_tokens_seen": 112325110, + "step": 5231, + "time_per_iteration": 2.7699692249298096 + }, + { + "auxiliary_loss_clip": 0.01067112, + "auxiliary_loss_mlp": 0.01043813, + "balance_loss_clip": 1.03593946, + "balance_loss_mlp": 1.0290308, + "epoch": 0.31456485795881556, + "flos": 10889552643840.0, + "grad_norm": 2.0391928307348093, + "language_loss": 0.85033977, + "learning_rate": 3.2089955442568704e-06, + "loss": 0.87144899, + "num_input_tokens_seen": 112339855, + "step": 5232, + "time_per_iteration": 2.655205011367798 + }, + { + "auxiliary_loss_clip": 0.01029363, + "auxiliary_loss_mlp": 0.01047132, + "balance_loss_clip": 1.02572, + "balance_loss_mlp": 1.03218317, + "epoch": 0.3146249812114835, + "flos": 17092438439040.0, + "grad_norm": 1.9295178155071522, + "language_loss": 0.80071497, + "learning_rate": 3.2086852735002692e-06, + "loss": 0.82147998, + "num_input_tokens_seen": 112358480, + "step": 5233, + "time_per_iteration": 2.809293508529663 + }, + { + "auxiliary_loss_clip": 0.01040792, + "auxiliary_loss_mlp": 0.01035616, + "balance_loss_clip": 1.02885282, + "balance_loss_mlp": 1.0217104, + "epoch": 0.3146851044641515, + "flos": 55291442889600.0, + "grad_norm": 1.703956832088978, + "language_loss": 0.70921141, + "learning_rate": 3.2083749569102024e-06, + "loss": 0.72997546, + "num_input_tokens_seen": 112382350, + "step": 5234, + "time_per_iteration": 3.1621041297912598 + }, + { + "auxiliary_loss_clip": 0.01057198, + "auxiliary_loss_mlp": 0.01032679, + "balance_loss_clip": 1.03363347, + "balance_loss_mlp": 1.01871371, + "epoch": 0.31474522771681945, + "flos": 27015884928000.0, + "grad_norm": 1.845731379839688, + "language_loss": 0.72626615, + "learning_rate": 3.2080645944984356e-06, + "loss": 0.74716491, + "num_input_tokens_seen": 112400260, + "step": 5235, + "time_per_iteration": 2.9259438514709473 + }, + { + "auxiliary_loss_clip": 0.01072951, + "auxiliary_loss_mlp": 0.0103147, + "balance_loss_clip": 1.02848089, + "balance_loss_mlp": 1.01851761, + "epoch": 0.3148053509694875, + "flos": 21251935330560.0, + "grad_norm": 1.7716688711632294, + "language_loss": 0.78329837, + "learning_rate": 3.2077541862767384e-06, + "loss": 0.80434263, + "num_input_tokens_seen": 112419400, + "step": 5236, + "time_per_iteration": 2.84287166595459 + }, + { + "auxiliary_loss_clip": 0.01088428, + "auxiliary_loss_mlp": 0.01040422, + "balance_loss_clip": 1.03121543, + "balance_loss_mlp": 1.02602792, + "epoch": 0.31486547422215544, + "flos": 31248675521280.0, + "grad_norm": 1.4956019656971797, + "language_loss": 0.75912529, + "learning_rate": 3.207443732256881e-06, + "loss": 0.78041375, + "num_input_tokens_seen": 112440825, + "step": 5237, + "time_per_iteration": 2.717376708984375 + }, + { + "auxiliary_loss_clip": 0.01080377, + "auxiliary_loss_mlp": 0.01032639, + "balance_loss_clip": 1.02904296, + "balance_loss_mlp": 1.02040839, + "epoch": 0.3149255974748234, + "flos": 19828615933440.0, + "grad_norm": 1.9099713721210831, + "language_loss": 0.79668152, + "learning_rate": 3.2071332324506372e-06, + "loss": 0.81781173, + "num_input_tokens_seen": 112459180, + "step": 5238, + "time_per_iteration": 4.117889165878296 + }, + { + "auxiliary_loss_clip": 0.01010182, + "auxiliary_loss_mlp": 0.01014579, + "balance_loss_clip": 1.0060519, + "balance_loss_mlp": 1.01292229, + "epoch": 0.31498572072749137, + "flos": 67683965339520.0, + "grad_norm": 0.8358639041421478, + "language_loss": 0.67858267, + "learning_rate": 3.2068226868697795e-06, + "loss": 0.69883031, + "num_input_tokens_seen": 112516680, + "step": 5239, + "time_per_iteration": 4.751955509185791 + }, + { + "auxiliary_loss_clip": 0.01069742, + "auxiliary_loss_mlp": 0.01035359, + "balance_loss_clip": 1.0317657, + "balance_loss_mlp": 1.01992118, + "epoch": 0.31504584398015933, + "flos": 19793136274560.0, + "grad_norm": 2.8084504809155235, + "language_loss": 0.82392776, + "learning_rate": 3.2065120955260846e-06, + "loss": 0.84497875, + "num_input_tokens_seen": 112535895, + "step": 5240, + "time_per_iteration": 2.6267940998077393 + }, + { + "auxiliary_loss_clip": 0.01059383, + "auxiliary_loss_mlp": 0.00748208, + "balance_loss_clip": 1.03018641, + "balance_loss_mlp": 1.00106919, + "epoch": 0.3151059672328273, + "flos": 26615409217920.0, + "grad_norm": 2.0484402405174458, + "language_loss": 0.81293786, + "learning_rate": 3.2062014584313302e-06, + "loss": 0.8310138, + "num_input_tokens_seen": 112557490, + "step": 5241, + "time_per_iteration": 2.656621217727661 + }, + { + "auxiliary_loss_clip": 0.0108573, + "auxiliary_loss_mlp": 0.01037928, + "balance_loss_clip": 1.03331065, + "balance_loss_mlp": 1.02463579, + "epoch": 0.31516609048549526, + "flos": 24204438483840.0, + "grad_norm": 1.667894218098288, + "language_loss": 0.74241167, + "learning_rate": 3.2058907755972956e-06, + "loss": 0.76364821, + "num_input_tokens_seen": 112577075, + "step": 5242, + "time_per_iteration": 2.734187364578247 + }, + { + "auxiliary_loss_clip": 0.0104919, + "auxiliary_loss_mlp": 0.01031725, + "balance_loss_clip": 1.02649879, + "balance_loss_mlp": 1.01710987, + "epoch": 0.31522621373816323, + "flos": 25958710817280.0, + "grad_norm": 2.3734914356053824, + "language_loss": 0.73723745, + "learning_rate": 3.2055800470357626e-06, + "loss": 0.75804663, + "num_input_tokens_seen": 112597620, + "step": 5243, + "time_per_iteration": 2.6877243518829346 + }, + { + "auxiliary_loss_clip": 0.010743, + "auxiliary_loss_mlp": 0.01034249, + "balance_loss_clip": 1.03031564, + "balance_loss_mlp": 1.02058196, + "epoch": 0.3152863369908312, + "flos": 21908813299200.0, + "grad_norm": 2.3854638568928657, + "language_loss": 0.64175606, + "learning_rate": 3.205269272758513e-06, + "loss": 0.6628415, + "num_input_tokens_seen": 112617150, + "step": 5244, + "time_per_iteration": 2.595390796661377 + }, + { + "auxiliary_loss_clip": 0.01043741, + "auxiliary_loss_mlp": 0.01037944, + "balance_loss_clip": 1.02999473, + "balance_loss_mlp": 1.02427721, + "epoch": 0.31534646024349916, + "flos": 16281072074880.0, + "grad_norm": 2.398268560145086, + "language_loss": 0.91905195, + "learning_rate": 3.2049584527773313e-06, + "loss": 0.93986881, + "num_input_tokens_seen": 112631090, + "step": 5245, + "time_per_iteration": 2.7403063774108887 + }, + { + "auxiliary_loss_clip": 0.01077192, + "auxiliary_loss_mlp": 0.01040903, + "balance_loss_clip": 1.03068697, + "balance_loss_mlp": 1.02643108, + "epoch": 0.3154065834961671, + "flos": 24717243000960.0, + "grad_norm": 1.591415284699614, + "language_loss": 0.75308293, + "learning_rate": 3.2046475871040048e-06, + "loss": 0.77426398, + "num_input_tokens_seen": 112651220, + "step": 5246, + "time_per_iteration": 2.6489689350128174 + }, + { + "auxiliary_loss_clip": 0.01085634, + "auxiliary_loss_mlp": 0.01038831, + "balance_loss_clip": 1.0296737, + "balance_loss_mlp": 1.0252707, + "epoch": 0.3154667067488351, + "flos": 35371148469120.0, + "grad_norm": 1.6412741906766524, + "language_loss": 0.61834061, + "learning_rate": 3.204336675750321e-06, + "loss": 0.63958526, + "num_input_tokens_seen": 112671560, + "step": 5247, + "time_per_iteration": 2.645871639251709 + }, + { + "auxiliary_loss_clip": 0.01077532, + "auxiliary_loss_mlp": 0.01038873, + "balance_loss_clip": 1.03118849, + "balance_loss_mlp": 1.02445507, + "epoch": 0.31552683000150306, + "flos": 17456464823040.0, + "grad_norm": 2.422258109540647, + "language_loss": 0.82281244, + "learning_rate": 3.2040257187280693e-06, + "loss": 0.84397644, + "num_input_tokens_seen": 112689790, + "step": 5248, + "time_per_iteration": 2.5470895767211914 + }, + { + "auxiliary_loss_clip": 0.01067738, + "auxiliary_loss_mlp": 0.01050107, + "balance_loss_clip": 1.03155661, + "balance_loss_mlp": 1.03528917, + "epoch": 0.3155869532541711, + "flos": 18405763413120.0, + "grad_norm": 1.7323775936934862, + "language_loss": 0.85430485, + "learning_rate": 3.2037147160490423e-06, + "loss": 0.87548327, + "num_input_tokens_seen": 112708265, + "step": 5249, + "time_per_iteration": 2.651092767715454 + }, + { + "auxiliary_loss_clip": 0.01051127, + "auxiliary_loss_mlp": 0.01036271, + "balance_loss_clip": 1.02921486, + "balance_loss_mlp": 1.02183437, + "epoch": 0.31564707650683904, + "flos": 21579763783680.0, + "grad_norm": 2.225002098587925, + "language_loss": 0.8532272, + "learning_rate": 3.2034036677250322e-06, + "loss": 0.87410116, + "num_input_tokens_seen": 112727820, + "step": 5250, + "time_per_iteration": 2.6824698448181152 + }, + { + "auxiliary_loss_clip": 0.0106186, + "auxiliary_loss_mlp": 0.01037825, + "balance_loss_clip": 1.02878249, + "balance_loss_mlp": 1.02304888, + "epoch": 0.315707199759507, + "flos": 21030976817280.0, + "grad_norm": 2.4330245027897424, + "language_loss": 0.68050325, + "learning_rate": 3.203092573767835e-06, + "loss": 0.70150018, + "num_input_tokens_seen": 112743140, + "step": 5251, + "time_per_iteration": 2.676107883453369 + }, + { + "auxiliary_loss_clip": 0.01086223, + "auxiliary_loss_mlp": 0.01037789, + "balance_loss_clip": 1.03150344, + "balance_loss_mlp": 1.02386546, + "epoch": 0.31576732301217497, + "flos": 26828861788800.0, + "grad_norm": 1.7703590329111956, + "language_loss": 0.78889, + "learning_rate": 3.202781434189246e-06, + "loss": 0.81013012, + "num_input_tokens_seen": 112764705, + "step": 5252, + "time_per_iteration": 2.6073625087738037 + }, + { + "auxiliary_loss_clip": 0.01065667, + "auxiliary_loss_mlp": 0.0104057, + "balance_loss_clip": 1.02884197, + "balance_loss_mlp": 1.02603245, + "epoch": 0.31582744626484294, + "flos": 22711165349760.0, + "grad_norm": 1.7617884695690769, + "language_loss": 0.74388742, + "learning_rate": 3.202470249001066e-06, + "loss": 0.7649498, + "num_input_tokens_seen": 112785310, + "step": 5253, + "time_per_iteration": 2.6689813137054443 + }, + { + "auxiliary_loss_clip": 0.01062373, + "auxiliary_loss_mlp": 0.01035038, + "balance_loss_clip": 1.03013492, + "balance_loss_mlp": 1.02078617, + "epoch": 0.3158875695175109, + "flos": 23951914894080.0, + "grad_norm": 1.8111581896820912, + "language_loss": 0.73381066, + "learning_rate": 3.2021590182150924e-06, + "loss": 0.75478476, + "num_input_tokens_seen": 112802905, + "step": 5254, + "time_per_iteration": 2.6586053371429443 + }, + { + "auxiliary_loss_clip": 0.01075099, + "auxiliary_loss_mlp": 0.01038183, + "balance_loss_clip": 1.03018332, + "balance_loss_mlp": 1.02383661, + "epoch": 0.31594769277017887, + "flos": 13261883322240.0, + "grad_norm": 2.055474743399487, + "language_loss": 0.77905399, + "learning_rate": 3.201847741843128e-06, + "loss": 0.80018681, + "num_input_tokens_seen": 112820305, + "step": 5255, + "time_per_iteration": 2.6554267406463623 + }, + { + "auxiliary_loss_clip": 0.01062042, + "auxiliary_loss_mlp": 0.01038629, + "balance_loss_clip": 1.02836549, + "balance_loss_mlp": 1.02301264, + "epoch": 0.31600781602284683, + "flos": 23368258800000.0, + "grad_norm": 3.476151709654491, + "language_loss": 0.782767, + "learning_rate": 3.2015364198969772e-06, + "loss": 0.80377376, + "num_input_tokens_seen": 112841185, + "step": 5256, + "time_per_iteration": 2.6430182456970215 + }, + { + "auxiliary_loss_clip": 0.01048903, + "auxiliary_loss_mlp": 0.0103505, + "balance_loss_clip": 1.02961636, + "balance_loss_mlp": 1.02296829, + "epoch": 0.3160679392755148, + "flos": 19828580019840.0, + "grad_norm": 1.9017513821313572, + "language_loss": 0.71294856, + "learning_rate": 3.2012250523884453e-06, + "loss": 0.73378807, + "num_input_tokens_seen": 112860570, + "step": 5257, + "time_per_iteration": 2.7897636890411377 + }, + { + "auxiliary_loss_clip": 0.01077254, + "auxiliary_loss_mlp": 0.01037693, + "balance_loss_clip": 1.03121758, + "balance_loss_mlp": 1.02280974, + "epoch": 0.31612806252818276, + "flos": 20193216935040.0, + "grad_norm": 1.962849762935447, + "language_loss": 0.76560467, + "learning_rate": 3.2009136393293393e-06, + "loss": 0.78675413, + "num_input_tokens_seen": 112877975, + "step": 5258, + "time_per_iteration": 2.7059824466705322 + }, + { + "auxiliary_loss_clip": 0.01052515, + "auxiliary_loss_mlp": 0.01040266, + "balance_loss_clip": 1.02723455, + "balance_loss_mlp": 1.02529359, + "epoch": 0.31618818578085073, + "flos": 24235967646720.0, + "grad_norm": 3.41323212371278, + "language_loss": 0.72756153, + "learning_rate": 3.200602180731467e-06, + "loss": 0.74848938, + "num_input_tokens_seen": 112896170, + "step": 5259, + "time_per_iteration": 2.7023532390594482 + }, + { + "auxiliary_loss_clip": 0.01055743, + "auxiliary_loss_mlp": 0.0074807, + "balance_loss_clip": 1.02826893, + "balance_loss_mlp": 1.0007962, + "epoch": 0.3162483090335187, + "flos": 25081844002560.0, + "grad_norm": 1.9133513819806454, + "language_loss": 0.66011786, + "learning_rate": 3.20029067660664e-06, + "loss": 0.67815596, + "num_input_tokens_seen": 112916180, + "step": 5260, + "time_per_iteration": 2.7198903560638428 + }, + { + "auxiliary_loss_clip": 0.01071634, + "auxiliary_loss_mlp": 0.01028343, + "balance_loss_clip": 1.0272851, + "balance_loss_mlp": 1.01478291, + "epoch": 0.31630843228618666, + "flos": 26323383646080.0, + "grad_norm": 2.5943035871111246, + "language_loss": 0.72428548, + "learning_rate": 3.1999791269666706e-06, + "loss": 0.74528527, + "num_input_tokens_seen": 112936745, + "step": 5261, + "time_per_iteration": 2.7285923957824707 + }, + { + "auxiliary_loss_clip": 0.0100699, + "auxiliary_loss_mlp": 0.01004083, + "balance_loss_clip": 1.00354242, + "balance_loss_mlp": 1.00258112, + "epoch": 0.3163685555388547, + "flos": 66758441552640.0, + "grad_norm": 0.7396279197979293, + "language_loss": 0.50623047, + "learning_rate": 3.1996675318233716e-06, + "loss": 0.5263412, + "num_input_tokens_seen": 112994845, + "step": 5262, + "time_per_iteration": 3.2363905906677246 + }, + { + "auxiliary_loss_clip": 0.01078247, + "auxiliary_loss_mlp": 0.01039805, + "balance_loss_clip": 1.03212905, + "balance_loss_mlp": 1.02593517, + "epoch": 0.31642867879152264, + "flos": 25995662933760.0, + "grad_norm": 1.4432712096693894, + "language_loss": 0.85125279, + "learning_rate": 3.19935589118856e-06, + "loss": 0.8724333, + "num_input_tokens_seen": 113015125, + "step": 5263, + "time_per_iteration": 2.701873779296875 + }, + { + "auxiliary_loss_clip": 0.01059611, + "auxiliary_loss_mlp": 0.01039177, + "balance_loss_clip": 1.02883959, + "balance_loss_mlp": 1.02655268, + "epoch": 0.3164888020441906, + "flos": 25774955815680.0, + "grad_norm": 1.4954820567278242, + "language_loss": 0.81729221, + "learning_rate": 3.1990442050740535e-06, + "loss": 0.83828008, + "num_input_tokens_seen": 113035535, + "step": 5264, + "time_per_iteration": 2.7335586547851562 + }, + { + "auxiliary_loss_clip": 0.01057637, + "auxiliary_loss_mlp": 0.01037467, + "balance_loss_clip": 1.02768207, + "balance_loss_mlp": 1.02219629, + "epoch": 0.3165489252968586, + "flos": 19756220071680.0, + "grad_norm": 1.7292551099511781, + "language_loss": 0.79447031, + "learning_rate": 3.19873247349167e-06, + "loss": 0.81542134, + "num_input_tokens_seen": 113052720, + "step": 5265, + "time_per_iteration": 2.593454599380493 + }, + { + "auxiliary_loss_clip": 0.01077286, + "auxiliary_loss_mlp": 0.01034269, + "balance_loss_clip": 1.03098142, + "balance_loss_mlp": 1.0196656, + "epoch": 0.31660904854952654, + "flos": 23183929180800.0, + "grad_norm": 1.5735763285103206, + "language_loss": 0.74728185, + "learning_rate": 3.1984206964532307e-06, + "loss": 0.76839739, + "num_input_tokens_seen": 113071435, + "step": 5266, + "time_per_iteration": 2.6241259574890137 + }, + { + "auxiliary_loss_clip": 0.01049018, + "auxiliary_loss_mlp": 0.01034634, + "balance_loss_clip": 1.02725339, + "balance_loss_mlp": 1.02056742, + "epoch": 0.3166691718021945, + "flos": 20408501099520.0, + "grad_norm": 2.006207801500955, + "language_loss": 0.79319715, + "learning_rate": 3.1981088739705585e-06, + "loss": 0.81403369, + "num_input_tokens_seen": 113088645, + "step": 5267, + "time_per_iteration": 2.602092742919922 + }, + { + "auxiliary_loss_clip": 0.0099894, + "auxiliary_loss_mlp": 0.01007077, + "balance_loss_clip": 1.00549245, + "balance_loss_mlp": 1.00549185, + "epoch": 0.31672929505486247, + "flos": 70144781172480.0, + "grad_norm": 0.7311929522976118, + "language_loss": 0.5781045, + "learning_rate": 3.197797006055478e-06, + "loss": 0.59816468, + "num_input_tokens_seen": 113152775, + "step": 5268, + "time_per_iteration": 3.147493362426758 + }, + { + "auxiliary_loss_clip": 0.01085457, + "auxiliary_loss_mlp": 0.01032008, + "balance_loss_clip": 1.02948499, + "balance_loss_mlp": 1.01806021, + "epoch": 0.31678941830753043, + "flos": 14355758154240.0, + "grad_norm": 2.2050888205576613, + "language_loss": 0.73131067, + "learning_rate": 3.197485092719815e-06, + "loss": 0.75248528, + "num_input_tokens_seen": 113171410, + "step": 5269, + "time_per_iteration": 2.583320140838623 + }, + { + "auxiliary_loss_clip": 0.0105306, + "auxiliary_loss_mlp": 0.01039481, + "balance_loss_clip": 1.02867377, + "balance_loss_mlp": 1.02522349, + "epoch": 0.3168495415601984, + "flos": 22747722416640.0, + "grad_norm": 2.0876386632519894, + "language_loss": 0.79589057, + "learning_rate": 3.1971731339753973e-06, + "loss": 0.81681597, + "num_input_tokens_seen": 113189965, + "step": 5270, + "time_per_iteration": 2.6759679317474365 + }, + { + "auxiliary_loss_clip": 0.01089147, + "auxiliary_loss_mlp": 0.01043749, + "balance_loss_clip": 1.03134656, + "balance_loss_mlp": 1.02801371, + "epoch": 0.31690966481286637, + "flos": 20115254465280.0, + "grad_norm": 2.1942377806654108, + "language_loss": 0.79073197, + "learning_rate": 3.1968611298340545e-06, + "loss": 0.81206095, + "num_input_tokens_seen": 113206355, + "step": 5271, + "time_per_iteration": 2.751970052719116 + }, + { + "auxiliary_loss_clip": 0.0108569, + "auxiliary_loss_mlp": 0.01033405, + "balance_loss_clip": 1.03007495, + "balance_loss_mlp": 1.01853991, + "epoch": 0.31696978806553433, + "flos": 21178928937600.0, + "grad_norm": 1.7989961745655254, + "language_loss": 0.7297048, + "learning_rate": 3.1965490803076173e-06, + "loss": 0.75089574, + "num_input_tokens_seen": 113225440, + "step": 5272, + "time_per_iteration": 2.899637460708618 + }, + { + "auxiliary_loss_clip": 0.01066001, + "auxiliary_loss_mlp": 0.01041042, + "balance_loss_clip": 1.02921414, + "balance_loss_mlp": 1.02424526, + "epoch": 0.3170299113182023, + "flos": 42997030439040.0, + "grad_norm": 2.0406301412187706, + "language_loss": 0.69080949, + "learning_rate": 3.1962369854079194e-06, + "loss": 0.71187997, + "num_input_tokens_seen": 113248840, + "step": 5273, + "time_per_iteration": 3.0592682361602783 + }, + { + "auxiliary_loss_clip": 0.01075855, + "auxiliary_loss_mlp": 0.0074806, + "balance_loss_clip": 1.03099728, + "balance_loss_mlp": 1.00087905, + "epoch": 0.31709003457087026, + "flos": 24460158384000.0, + "grad_norm": 1.6293196789778142, + "language_loss": 0.68042833, + "learning_rate": 3.195924845146795e-06, + "loss": 0.69866753, + "num_input_tokens_seen": 113269630, + "step": 5274, + "time_per_iteration": 4.296051979064941 + }, + { + "auxiliary_loss_clip": 0.01042788, + "auxiliary_loss_mlp": 0.01044049, + "balance_loss_clip": 1.02746916, + "balance_loss_mlp": 1.02952337, + "epoch": 0.3171501578235382, + "flos": 24135310759680.0, + "grad_norm": 1.532263504904686, + "language_loss": 0.80722106, + "learning_rate": 3.195612659536081e-06, + "loss": 0.82808936, + "num_input_tokens_seen": 113291200, + "step": 5275, + "time_per_iteration": 2.7716615200042725 + }, + { + "auxiliary_loss_clip": 0.01073997, + "auxiliary_loss_mlp": 0.01044138, + "balance_loss_clip": 1.02883101, + "balance_loss_mlp": 1.02940404, + "epoch": 0.31721028107620625, + "flos": 18879712392960.0, + "grad_norm": 2.0999393395249735, + "language_loss": 0.72642398, + "learning_rate": 3.1953004285876147e-06, + "loss": 0.74760538, + "num_input_tokens_seen": 113310170, + "step": 5276, + "time_per_iteration": 2.6944127082824707 + }, + { + "auxiliary_loss_clip": 0.01067013, + "auxiliary_loss_mlp": 0.01034261, + "balance_loss_clip": 1.03327584, + "balance_loss_mlp": 1.02069545, + "epoch": 0.3172704043288742, + "flos": 23147874904320.0, + "grad_norm": 2.040134667550094, + "language_loss": 0.77966326, + "learning_rate": 3.194988152313236e-06, + "loss": 0.80067599, + "num_input_tokens_seen": 113331140, + "step": 5277, + "time_per_iteration": 4.353730201721191 + }, + { + "auxiliary_loss_clip": 0.01056745, + "auxiliary_loss_mlp": 0.01045255, + "balance_loss_clip": 1.02741909, + "balance_loss_mlp": 1.02813637, + "epoch": 0.3173305275815422, + "flos": 17858520731520.0, + "grad_norm": 1.7025660724328937, + "language_loss": 0.78697467, + "learning_rate": 3.1946758307247878e-06, + "loss": 0.8079946, + "num_input_tokens_seen": 113350030, + "step": 5278, + "time_per_iteration": 2.728573799133301 + }, + { + "auxiliary_loss_clip": 0.01007871, + "auxiliary_loss_mlp": 0.01002321, + "balance_loss_clip": 1.00514174, + "balance_loss_mlp": 1.00077176, + "epoch": 0.31739065083421014, + "flos": 59973476883840.0, + "grad_norm": 0.8778004261413767, + "language_loss": 0.62795341, + "learning_rate": 3.1943634638341114e-06, + "loss": 0.64805543, + "num_input_tokens_seen": 113395820, + "step": 5279, + "time_per_iteration": 3.0646328926086426 + }, + { + "auxiliary_loss_clip": 0.01090095, + "auxiliary_loss_mlp": 0.01042721, + "balance_loss_clip": 1.03069067, + "balance_loss_mlp": 1.02685404, + "epoch": 0.3174507740868781, + "flos": 23800981944960.0, + "grad_norm": 1.575773413632872, + "language_loss": 0.81198466, + "learning_rate": 3.194051051653053e-06, + "loss": 0.83331275, + "num_input_tokens_seen": 113416835, + "step": 5280, + "time_per_iteration": 2.5907413959503174 + }, + { + "auxiliary_loss_clip": 0.0105678, + "auxiliary_loss_mlp": 0.01047383, + "balance_loss_clip": 1.03149605, + "balance_loss_mlp": 1.03344786, + "epoch": 0.31751089733954607, + "flos": 27638899349760.0, + "grad_norm": 1.585342071364139, + "language_loss": 0.78158456, + "learning_rate": 3.19373859419346e-06, + "loss": 0.80262619, + "num_input_tokens_seen": 113440850, + "step": 5281, + "time_per_iteration": 2.8331706523895264 + }, + { + "auxiliary_loss_clip": 0.01063106, + "auxiliary_loss_mlp": 0.01039232, + "balance_loss_clip": 1.02879477, + "balance_loss_mlp": 1.0244205, + "epoch": 0.31757102059221404, + "flos": 23769273214080.0, + "grad_norm": 2.8664593856917904, + "language_loss": 0.78202635, + "learning_rate": 3.193426091467179e-06, + "loss": 0.80304968, + "num_input_tokens_seen": 113461000, + "step": 5282, + "time_per_iteration": 2.7637898921966553 + }, + { + "auxiliary_loss_clip": 0.01063245, + "auxiliary_loss_mlp": 0.01045794, + "balance_loss_clip": 1.03104389, + "balance_loss_mlp": 1.02971268, + "epoch": 0.317631143844882, + "flos": 25264521596160.0, + "grad_norm": 1.9514843867689775, + "language_loss": 0.66707146, + "learning_rate": 3.193113543486061e-06, + "loss": 0.68816185, + "num_input_tokens_seen": 113480820, + "step": 5283, + "time_per_iteration": 2.7116858959198 + }, + { + "auxiliary_loss_clip": 0.01007947, + "auxiliary_loss_mlp": 0.01003768, + "balance_loss_clip": 1.00535417, + "balance_loss_mlp": 1.00218284, + "epoch": 0.31769126709754997, + "flos": 55825939221120.0, + "grad_norm": 1.2797480193369495, + "language_loss": 0.52823955, + "learning_rate": 3.192800950261958e-06, + "loss": 0.54835665, + "num_input_tokens_seen": 113536910, + "step": 5284, + "time_per_iteration": 3.268143653869629 + }, + { + "auxiliary_loss_clip": 0.0107186, + "auxiliary_loss_mlp": 0.01037388, + "balance_loss_clip": 1.03458345, + "balance_loss_mlp": 1.02341044, + "epoch": 0.31775139035021793, + "flos": 16690562098560.0, + "grad_norm": 1.9980720488847645, + "language_loss": 0.70228398, + "learning_rate": 3.1924883118067235e-06, + "loss": 0.72337645, + "num_input_tokens_seen": 113555480, + "step": 5285, + "time_per_iteration": 4.26698112487793 + }, + { + "auxiliary_loss_clip": 0.01015799, + "auxiliary_loss_mlp": 0.01004033, + "balance_loss_clip": 1.00265121, + "balance_loss_mlp": 1.00242412, + "epoch": 0.3178115136028859, + "flos": 64227241019520.0, + "grad_norm": 0.8162061043906976, + "language_loss": 0.60512525, + "learning_rate": 3.1921756281322123e-06, + "loss": 0.62532359, + "num_input_tokens_seen": 113616790, + "step": 5286, + "time_per_iteration": 4.781310081481934 + }, + { + "auxiliary_loss_clip": 0.01088032, + "auxiliary_loss_mlp": 0.01046502, + "balance_loss_clip": 1.03160036, + "balance_loss_mlp": 1.03155887, + "epoch": 0.31787163685555386, + "flos": 18697465762560.0, + "grad_norm": 1.7682917473718305, + "language_loss": 0.72232139, + "learning_rate": 3.1918628992502826e-06, + "loss": 0.74366671, + "num_input_tokens_seen": 113635320, + "step": 5287, + "time_per_iteration": 2.6076033115386963 + }, + { + "auxiliary_loss_clip": 0.01076438, + "auxiliary_loss_mlp": 0.01043397, + "balance_loss_clip": 1.02918601, + "balance_loss_mlp": 1.02703547, + "epoch": 0.31793176010822183, + "flos": 21324762155520.0, + "grad_norm": 3.6601029817012076, + "language_loss": 0.75370067, + "learning_rate": 3.191550125172792e-06, + "loss": 0.77489907, + "num_input_tokens_seen": 113654000, + "step": 5288, + "time_per_iteration": 2.6736645698547363 + }, + { + "auxiliary_loss_clip": 0.01072731, + "auxiliary_loss_mlp": 0.01033106, + "balance_loss_clip": 1.02871025, + "balance_loss_mlp": 1.02027297, + "epoch": 0.31799188336088985, + "flos": 20958688696320.0, + "grad_norm": 1.6722721793722577, + "language_loss": 0.87432158, + "learning_rate": 3.1912373059116007e-06, + "loss": 0.8953799, + "num_input_tokens_seen": 113672375, + "step": 5289, + "time_per_iteration": 2.716305732727051 + }, + { + "auxiliary_loss_clip": 0.0107488, + "auxiliary_loss_mlp": 0.0103524, + "balance_loss_clip": 1.03197479, + "balance_loss_mlp": 1.02161443, + "epoch": 0.3180520066135578, + "flos": 22491930689280.0, + "grad_norm": 1.481549213387103, + "language_loss": 0.68041122, + "learning_rate": 3.190924441478572e-06, + "loss": 0.7015124, + "num_input_tokens_seen": 113692385, + "step": 5290, + "time_per_iteration": 2.5901596546173096 + }, + { + "auxiliary_loss_clip": 0.0106637, + "auxiliary_loss_mlp": 0.01038141, + "balance_loss_clip": 1.02955842, + "balance_loss_mlp": 1.02352571, + "epoch": 0.3181121298662258, + "flos": 27235335070080.0, + "grad_norm": 1.7287425811331627, + "language_loss": 0.79655802, + "learning_rate": 3.1906115318855687e-06, + "loss": 0.81760317, + "num_input_tokens_seen": 113712145, + "step": 5291, + "time_per_iteration": 2.7935330867767334 + }, + { + "auxiliary_loss_clip": 0.01050825, + "auxiliary_loss_mlp": 0.01037555, + "balance_loss_clip": 1.03282237, + "balance_loss_mlp": 1.02179587, + "epoch": 0.31817225311889374, + "flos": 23180158252800.0, + "grad_norm": 2.3129977144627727, + "language_loss": 0.7972306, + "learning_rate": 3.1902985771444577e-06, + "loss": 0.8181144, + "num_input_tokens_seen": 113731435, + "step": 5292, + "time_per_iteration": 2.7720210552215576 + }, + { + "auxiliary_loss_clip": 0.01067377, + "auxiliary_loss_mlp": 0.01032502, + "balance_loss_clip": 1.02808785, + "balance_loss_mlp": 1.01968098, + "epoch": 0.3182323763715617, + "flos": 23258803080960.0, + "grad_norm": 1.537015622758522, + "language_loss": 0.75034416, + "learning_rate": 3.1899855772671043e-06, + "loss": 0.77134293, + "num_input_tokens_seen": 113750825, + "step": 5293, + "time_per_iteration": 2.6186563968658447 + }, + { + "auxiliary_loss_clip": 0.01073335, + "auxiliary_loss_mlp": 0.01037765, + "balance_loss_clip": 1.03061509, + "balance_loss_mlp": 1.02461636, + "epoch": 0.3182924996242297, + "flos": 29016683280000.0, + "grad_norm": 2.897266553610382, + "language_loss": 0.73929286, + "learning_rate": 3.189672532265379e-06, + "loss": 0.76040387, + "num_input_tokens_seen": 113770010, + "step": 5294, + "time_per_iteration": 2.6949045658111572 + }, + { + "auxiliary_loss_clip": 0.01088802, + "auxiliary_loss_mlp": 0.01032278, + "balance_loss_clip": 1.0315063, + "balance_loss_mlp": 1.01689386, + "epoch": 0.31835262287689764, + "flos": 20449188230400.0, + "grad_norm": 1.9238643342798092, + "language_loss": 0.75654209, + "learning_rate": 3.189359442151152e-06, + "loss": 0.77775288, + "num_input_tokens_seen": 113788640, + "step": 5295, + "time_per_iteration": 2.588836669921875 + }, + { + "auxiliary_loss_clip": 0.01049667, + "auxiliary_loss_mlp": 0.01035604, + "balance_loss_clip": 1.02989364, + "balance_loss_mlp": 1.02139401, + "epoch": 0.3184127461295656, + "flos": 25119478477440.0, + "grad_norm": 1.6541865496247214, + "language_loss": 0.69614697, + "learning_rate": 3.189046306936296e-06, + "loss": 0.71699965, + "num_input_tokens_seen": 113809515, + "step": 5296, + "time_per_iteration": 2.7708191871643066 + }, + { + "auxiliary_loss_clip": 0.01065389, + "auxiliary_loss_mlp": 0.01035755, + "balance_loss_clip": 1.03058338, + "balance_loss_mlp": 1.02195621, + "epoch": 0.31847286938223357, + "flos": 25551231955200.0, + "grad_norm": 1.7389729125381375, + "language_loss": 0.77696425, + "learning_rate": 3.1887331266326846e-06, + "loss": 0.79797566, + "num_input_tokens_seen": 113829770, + "step": 5297, + "time_per_iteration": 2.7626595497131348 + }, + { + "auxiliary_loss_clip": 0.01052927, + "auxiliary_loss_mlp": 0.01027391, + "balance_loss_clip": 1.03013587, + "balance_loss_mlp": 1.0136168, + "epoch": 0.31853299263490154, + "flos": 27782470010880.0, + "grad_norm": 1.8655248201349268, + "language_loss": 0.79274005, + "learning_rate": 3.1884199012521942e-06, + "loss": 0.8135432, + "num_input_tokens_seen": 113849320, + "step": 5298, + "time_per_iteration": 2.8994131088256836 + }, + { + "auxiliary_loss_clip": 0.01066517, + "auxiliary_loss_mlp": 0.01033105, + "balance_loss_clip": 1.02975535, + "balance_loss_mlp": 1.01941931, + "epoch": 0.3185931158875695, + "flos": 22706747976960.0, + "grad_norm": 4.394119545913123, + "language_loss": 0.74106371, + "learning_rate": 3.1881066308067016e-06, + "loss": 0.76205993, + "num_input_tokens_seen": 113867860, + "step": 5299, + "time_per_iteration": 3.06076717376709 + }, + { + "auxiliary_loss_clip": 0.01075143, + "auxiliary_loss_mlp": 0.01041423, + "balance_loss_clip": 1.03474593, + "balance_loss_mlp": 1.02723694, + "epoch": 0.31865323914023747, + "flos": 24571517523840.0, + "grad_norm": 2.2524942077181107, + "language_loss": 0.78297067, + "learning_rate": 3.1877933153080873e-06, + "loss": 0.80413628, + "num_input_tokens_seen": 113886375, + "step": 5300, + "time_per_iteration": 2.909160852432251 + }, + { + "auxiliary_loss_clip": 0.01062115, + "auxiliary_loss_mlp": 0.01038469, + "balance_loss_clip": 1.02697754, + "balance_loss_mlp": 1.02357435, + "epoch": 0.31871336239290543, + "flos": 18186564666240.0, + "grad_norm": 2.046636067845213, + "language_loss": 0.84191692, + "learning_rate": 3.1874799547682304e-06, + "loss": 0.86292279, + "num_input_tokens_seen": 113904065, + "step": 5301, + "time_per_iteration": 2.777217149734497 + }, + { + "auxiliary_loss_clip": 0.01074322, + "auxiliary_loss_mlp": 0.01038571, + "balance_loss_clip": 1.03156233, + "balance_loss_mlp": 1.02457571, + "epoch": 0.31877348564557345, + "flos": 21826756679040.0, + "grad_norm": 2.157913892602298, + "language_loss": 0.77241009, + "learning_rate": 3.187166549199015e-06, + "loss": 0.79353905, + "num_input_tokens_seen": 113918415, + "step": 5302, + "time_per_iteration": 2.565497636795044 + }, + { + "auxiliary_loss_clip": 0.01080108, + "auxiliary_loss_mlp": 0.01037582, + "balance_loss_clip": 1.02821612, + "balance_loss_mlp": 1.02327657, + "epoch": 0.3188336088982414, + "flos": 22015252275840.0, + "grad_norm": 1.6255327282299226, + "language_loss": 0.79611874, + "learning_rate": 3.1868530986123255e-06, + "loss": 0.81729567, + "num_input_tokens_seen": 113938135, + "step": 5303, + "time_per_iteration": 2.5848777294158936 + }, + { + "auxiliary_loss_clip": 0.01080658, + "auxiliary_loss_mlp": 0.01039685, + "balance_loss_clip": 1.03285587, + "balance_loss_mlp": 1.02425981, + "epoch": 0.3188937321509094, + "flos": 20047886507520.0, + "grad_norm": 2.389171565902683, + "language_loss": 0.73090893, + "learning_rate": 3.186539603020047e-06, + "loss": 0.75211239, + "num_input_tokens_seen": 113957125, + "step": 5304, + "time_per_iteration": 2.5782506465911865 + }, + { + "auxiliary_loss_clip": 0.01043623, + "auxiliary_loss_mlp": 0.01039376, + "balance_loss_clip": 1.02464652, + "balance_loss_mlp": 1.02617955, + "epoch": 0.31895385540357735, + "flos": 25848105863040.0, + "grad_norm": 1.9975917671970067, + "language_loss": 0.7223236, + "learning_rate": 3.186226062434068e-06, + "loss": 0.74315363, + "num_input_tokens_seen": 113974875, + "step": 5305, + "time_per_iteration": 2.6499204635620117 + }, + { + "auxiliary_loss_clip": 0.01064219, + "auxiliary_loss_mlp": 0.01034235, + "balance_loss_clip": 1.02924573, + "balance_loss_mlp": 1.02128911, + "epoch": 0.3190139786562453, + "flos": 23477714519040.0, + "grad_norm": 1.8119222277319167, + "language_loss": 0.64212, + "learning_rate": 3.1859124768662778e-06, + "loss": 0.66310459, + "num_input_tokens_seen": 113994450, + "step": 5306, + "time_per_iteration": 2.640360116958618 + }, + { + "auxiliary_loss_clip": 0.01055637, + "auxiliary_loss_mlp": 0.01039806, + "balance_loss_clip": 1.02820659, + "balance_loss_mlp": 1.02519083, + "epoch": 0.3190741019089133, + "flos": 29095543589760.0, + "grad_norm": 2.302616711543337, + "language_loss": 0.79430819, + "learning_rate": 3.1855988463285678e-06, + "loss": 0.81526256, + "num_input_tokens_seen": 114013945, + "step": 5307, + "time_per_iteration": 2.736556053161621 + }, + { + "auxiliary_loss_clip": 0.01057966, + "auxiliary_loss_mlp": 0.01036664, + "balance_loss_clip": 1.02743745, + "balance_loss_mlp": 1.02217984, + "epoch": 0.31913422516158124, + "flos": 17129534209920.0, + "grad_norm": 1.7075804356665685, + "language_loss": 0.77201509, + "learning_rate": 3.1852851708328308e-06, + "loss": 0.79296142, + "num_input_tokens_seen": 114031375, + "step": 5308, + "time_per_iteration": 2.6104846000671387 + }, + { + "auxiliary_loss_clip": 0.01082664, + "auxiliary_loss_mlp": 0.01045729, + "balance_loss_clip": 1.0317626, + "balance_loss_mlp": 1.02940905, + "epoch": 0.3191943484142492, + "flos": 16069846147200.0, + "grad_norm": 5.030988263867782, + "language_loss": 0.74381447, + "learning_rate": 3.184971450390961e-06, + "loss": 0.76509845, + "num_input_tokens_seen": 114048465, + "step": 5309, + "time_per_iteration": 2.620589017868042 + }, + { + "auxiliary_loss_clip": 0.01072645, + "auxiliary_loss_mlp": 0.01032593, + "balance_loss_clip": 1.02833247, + "balance_loss_mlp": 1.01910508, + "epoch": 0.3192544716669172, + "flos": 22966166977920.0, + "grad_norm": 1.849336208565519, + "language_loss": 0.82744443, + "learning_rate": 3.184657685014856e-06, + "loss": 0.84849685, + "num_input_tokens_seen": 114068415, + "step": 5310, + "time_per_iteration": 2.7110466957092285 + }, + { + "auxiliary_loss_clip": 0.01063104, + "auxiliary_loss_mlp": 0.01036643, + "balance_loss_clip": 1.02899051, + "balance_loss_mlp": 1.02356613, + "epoch": 0.31931459491958514, + "flos": 26870339018880.0, + "grad_norm": 1.6363804444041643, + "language_loss": 0.78490549, + "learning_rate": 3.184343874716412e-06, + "loss": 0.80590296, + "num_input_tokens_seen": 114088565, + "step": 5311, + "time_per_iteration": 2.705573081970215 + }, + { + "auxiliary_loss_clip": 0.01046123, + "auxiliary_loss_mlp": 0.0103703, + "balance_loss_clip": 1.02651405, + "balance_loss_mlp": 1.02301693, + "epoch": 0.3193747181722531, + "flos": 21836525178240.0, + "grad_norm": 1.7605768910127666, + "language_loss": 0.84448409, + "learning_rate": 3.1840300195075295e-06, + "loss": 0.86531556, + "num_input_tokens_seen": 114107160, + "step": 5312, + "time_per_iteration": 2.843048095703125 + }, + { + "auxiliary_loss_clip": 0.01035819, + "auxiliary_loss_mlp": 0.0104497, + "balance_loss_clip": 1.02534842, + "balance_loss_mlp": 1.02948475, + "epoch": 0.31943484142492107, + "flos": 18324999682560.0, + "grad_norm": 2.3740508556517774, + "language_loss": 0.79298663, + "learning_rate": 3.1837161194001102e-06, + "loss": 0.81379449, + "num_input_tokens_seen": 114123420, + "step": 5313, + "time_per_iteration": 2.808224678039551 + }, + { + "auxiliary_loss_clip": 0.01072508, + "auxiliary_loss_mlp": 0.01033302, + "balance_loss_clip": 1.02983606, + "balance_loss_mlp": 1.01931882, + "epoch": 0.31949496467758903, + "flos": 21615818060160.0, + "grad_norm": 6.548870844345392, + "language_loss": 0.85562217, + "learning_rate": 3.183402174406057e-06, + "loss": 0.87668025, + "num_input_tokens_seen": 114139230, + "step": 5314, + "time_per_iteration": 2.8366544246673584 + }, + { + "auxiliary_loss_clip": 0.01053254, + "auxiliary_loss_mlp": 0.01043274, + "balance_loss_clip": 1.02591705, + "balance_loss_mlp": 1.0278362, + "epoch": 0.31955508793025705, + "flos": 21760214734080.0, + "grad_norm": 1.6944323635156928, + "language_loss": 0.79659629, + "learning_rate": 3.1830881845372747e-06, + "loss": 0.81756157, + "num_input_tokens_seen": 114159290, + "step": 5315, + "time_per_iteration": 2.700770616531372 + }, + { + "auxiliary_loss_clip": 0.01048299, + "auxiliary_loss_mlp": 0.01047297, + "balance_loss_clip": 1.02717376, + "balance_loss_mlp": 1.0310843, + "epoch": 0.319615211182925, + "flos": 17164331510400.0, + "grad_norm": 3.4104564072025947, + "language_loss": 0.67185855, + "learning_rate": 3.18277414980567e-06, + "loss": 0.69281447, + "num_input_tokens_seen": 114177655, + "step": 5316, + "time_per_iteration": 2.9234728813171387 + }, + { + "auxiliary_loss_clip": 0.0107074, + "auxiliary_loss_mlp": 0.01035356, + "balance_loss_clip": 1.02811956, + "balance_loss_mlp": 1.02256465, + "epoch": 0.319675334435593, + "flos": 28112812416000.0, + "grad_norm": 1.546041789758843, + "language_loss": 0.6956144, + "learning_rate": 3.1824600702231515e-06, + "loss": 0.71667534, + "num_input_tokens_seen": 114200880, + "step": 5317, + "time_per_iteration": 2.694715738296509 + }, + { + "auxiliary_loss_clip": 0.00999534, + "auxiliary_loss_mlp": 0.01004695, + "balance_loss_clip": 1.00661051, + "balance_loss_mlp": 1.00301385, + "epoch": 0.31973545768826095, + "flos": 69501119408640.0, + "grad_norm": 0.7330519046869465, + "language_loss": 0.53090501, + "learning_rate": 3.182145945801628e-06, + "loss": 0.55094731, + "num_input_tokens_seen": 114267145, + "step": 5318, + "time_per_iteration": 3.4657742977142334 + }, + { + "auxiliary_loss_clip": 0.01083805, + "auxiliary_loss_mlp": 0.01035449, + "balance_loss_clip": 1.03119802, + "balance_loss_mlp": 1.02194262, + "epoch": 0.3197955809409289, + "flos": 13699203408000.0, + "grad_norm": 1.7371905061397328, + "language_loss": 0.84290212, + "learning_rate": 3.181831776553012e-06, + "loss": 0.86409467, + "num_input_tokens_seen": 114284630, + "step": 5319, + "time_per_iteration": 2.702204465866089 + }, + { + "auxiliary_loss_clip": 0.01067704, + "auxiliary_loss_mlp": 0.01035649, + "balance_loss_clip": 1.02793753, + "balance_loss_mlp": 1.02136171, + "epoch": 0.3198557041935969, + "flos": 33218124278400.0, + "grad_norm": 1.592208262399936, + "language_loss": 0.6377008, + "learning_rate": 3.1815175624892165e-06, + "loss": 0.65873432, + "num_input_tokens_seen": 114305830, + "step": 5320, + "time_per_iteration": 2.8586480617523193 + }, + { + "auxiliary_loss_clip": 0.01057701, + "auxiliary_loss_mlp": 0.010354, + "balance_loss_clip": 1.02909613, + "balance_loss_mlp": 1.02140498, + "epoch": 0.31991582744626484, + "flos": 23732033788800.0, + "grad_norm": 1.8297508063593138, + "language_loss": 0.70378816, + "learning_rate": 3.1812033036221567e-06, + "loss": 0.72471917, + "num_input_tokens_seen": 114325165, + "step": 5321, + "time_per_iteration": 4.287619590759277 + }, + { + "auxiliary_loss_clip": 0.01091256, + "auxiliary_loss_mlp": 0.00748149, + "balance_loss_clip": 1.03194761, + "balance_loss_mlp": 1.0005672, + "epoch": 0.3199759506989328, + "flos": 18550842445440.0, + "grad_norm": 2.4003299852609223, + "language_loss": 0.86504734, + "learning_rate": 3.180888999963749e-06, + "loss": 0.88344145, + "num_input_tokens_seen": 114341310, + "step": 5322, + "time_per_iteration": 2.56679630279541 + }, + { + "auxiliary_loss_clip": 0.01057562, + "auxiliary_loss_mlp": 0.01036238, + "balance_loss_clip": 1.02645671, + "balance_loss_mlp": 1.02190852, + "epoch": 0.3200360739516008, + "flos": 22418888382720.0, + "grad_norm": 1.6990611274412462, + "language_loss": 0.83356172, + "learning_rate": 3.1805746515259123e-06, + "loss": 0.8544997, + "num_input_tokens_seen": 114360355, + "step": 5323, + "time_per_iteration": 2.698240280151367 + }, + { + "auxiliary_loss_clip": 0.01068557, + "auxiliary_loss_mlp": 0.01033262, + "balance_loss_clip": 1.02884126, + "balance_loss_mlp": 1.01812255, + "epoch": 0.32009619720426874, + "flos": 20595236929920.0, + "grad_norm": 1.8264769493339994, + "language_loss": 0.78045017, + "learning_rate": 3.1802602583205663e-06, + "loss": 0.80146837, + "num_input_tokens_seen": 114379220, + "step": 5324, + "time_per_iteration": 4.140656232833862 + }, + { + "auxiliary_loss_clip": 0.01059119, + "auxiliary_loss_mlp": 0.01031197, + "balance_loss_clip": 1.02894425, + "balance_loss_mlp": 1.01652241, + "epoch": 0.3201563204569367, + "flos": 18147637301760.0, + "grad_norm": 1.8965201412428923, + "language_loss": 0.80089891, + "learning_rate": 3.1799458203596333e-06, + "loss": 0.82180208, + "num_input_tokens_seen": 114396365, + "step": 5325, + "time_per_iteration": 2.6786892414093018 + }, + { + "auxiliary_loss_clip": 0.01076385, + "auxiliary_loss_mlp": 0.01035549, + "balance_loss_clip": 1.03124928, + "balance_loss_mlp": 1.02123785, + "epoch": 0.32021644370960467, + "flos": 31684235840640.0, + "grad_norm": 1.5739544018371643, + "language_loss": 0.74728543, + "learning_rate": 3.179631337655037e-06, + "loss": 0.76840478, + "num_input_tokens_seen": 114416780, + "step": 5326, + "time_per_iteration": 2.747849702835083 + }, + { + "auxiliary_loss_clip": 0.01042548, + "auxiliary_loss_mlp": 0.01036918, + "balance_loss_clip": 1.02794623, + "balance_loss_mlp": 1.02321506, + "epoch": 0.32027656696227264, + "flos": 26865921646080.0, + "grad_norm": 1.6516506697581446, + "language_loss": 0.80851591, + "learning_rate": 3.179316810218701e-06, + "loss": 0.8293106, + "num_input_tokens_seen": 114437405, + "step": 5327, + "time_per_iteration": 2.8079047203063965 + }, + { + "auxiliary_loss_clip": 0.01058197, + "auxiliary_loss_mlp": 0.01033943, + "balance_loss_clip": 1.03135371, + "balance_loss_mlp": 1.01990604, + "epoch": 0.32033669021494066, + "flos": 24169928492160.0, + "grad_norm": 1.477430377053737, + "language_loss": 0.77584702, + "learning_rate": 3.179002238062554e-06, + "loss": 0.79676843, + "num_input_tokens_seen": 114458505, + "step": 5328, + "time_per_iteration": 2.6914055347442627 + }, + { + "auxiliary_loss_clip": 0.01037792, + "auxiliary_loss_mlp": 0.01037679, + "balance_loss_clip": 1.0296762, + "balance_loss_mlp": 1.02177668, + "epoch": 0.3203968134676086, + "flos": 24460768915200.0, + "grad_norm": 1.6025911480924353, + "language_loss": 0.74092978, + "learning_rate": 3.178687621198524e-06, + "loss": 0.76168454, + "num_input_tokens_seen": 114479050, + "step": 5329, + "time_per_iteration": 2.8419692516326904 + }, + { + "auxiliary_loss_clip": 0.0105403, + "auxiliary_loss_mlp": 0.01032436, + "balance_loss_clip": 1.02693677, + "balance_loss_mlp": 1.01947165, + "epoch": 0.3204569367202766, + "flos": 18004713085440.0, + "grad_norm": 1.5722485170350824, + "language_loss": 0.7078383, + "learning_rate": 3.1783729596385415e-06, + "loss": 0.7287029, + "num_input_tokens_seen": 114497415, + "step": 5330, + "time_per_iteration": 2.6795833110809326 + }, + { + "auxiliary_loss_clip": 0.01045013, + "auxiliary_loss_mlp": 0.01052317, + "balance_loss_clip": 1.03366756, + "balance_loss_mlp": 1.0337677, + "epoch": 0.32051705997294455, + "flos": 30589678650240.0, + "grad_norm": 1.6223751256324028, + "language_loss": 0.80153477, + "learning_rate": 3.1780582533945376e-06, + "loss": 0.8225081, + "num_input_tokens_seen": 114518785, + "step": 5331, + "time_per_iteration": 2.765634298324585 + }, + { + "auxiliary_loss_clip": 0.0100923, + "auxiliary_loss_mlp": 0.01006795, + "balance_loss_clip": 1.00632429, + "balance_loss_mlp": 1.00500679, + "epoch": 0.3205771832256125, + "flos": 68417979765120.0, + "grad_norm": 0.8555627610794241, + "language_loss": 0.57874227, + "learning_rate": 3.177743502478447e-06, + "loss": 0.59890246, + "num_input_tokens_seen": 114577710, + "step": 5332, + "time_per_iteration": 3.16279673576355 + }, + { + "auxiliary_loss_clip": 0.01043796, + "auxiliary_loss_mlp": 0.01032859, + "balance_loss_clip": 1.02659476, + "balance_loss_mlp": 1.01862526, + "epoch": 0.3206373064782805, + "flos": 30443953173120.0, + "grad_norm": 1.7018400201628967, + "language_loss": 0.73426783, + "learning_rate": 3.177428706902205e-06, + "loss": 0.75503433, + "num_input_tokens_seen": 114598640, + "step": 5333, + "time_per_iteration": 6.022827863693237 + }, + { + "auxiliary_loss_clip": 0.01063722, + "auxiliary_loss_mlp": 0.01043779, + "balance_loss_clip": 1.02876687, + "balance_loss_mlp": 1.02881265, + "epoch": 0.32069742973094845, + "flos": 22054502862720.0, + "grad_norm": 1.5939602191159645, + "language_loss": 0.70322442, + "learning_rate": 3.1771138666777485e-06, + "loss": 0.72429943, + "num_input_tokens_seen": 114618780, + "step": 5334, + "time_per_iteration": 2.7432503700256348 + }, + { + "auxiliary_loss_clip": 0.01045758, + "auxiliary_loss_mlp": 0.01039362, + "balance_loss_clip": 1.02851725, + "balance_loss_mlp": 1.02483058, + "epoch": 0.3207575529836164, + "flos": 22054000072320.0, + "grad_norm": 1.9417443422079887, + "language_loss": 0.77248621, + "learning_rate": 3.1767989818170156e-06, + "loss": 0.79333735, + "num_input_tokens_seen": 114637525, + "step": 5335, + "time_per_iteration": 2.8496217727661133 + }, + { + "auxiliary_loss_clip": 0.0107573, + "auxiliary_loss_mlp": 0.01038867, + "balance_loss_clip": 1.0313673, + "balance_loss_mlp": 1.02490723, + "epoch": 0.3208176762362844, + "flos": 34057536186240.0, + "grad_norm": 1.4827869933770568, + "language_loss": 0.68183506, + "learning_rate": 3.1764840523319477e-06, + "loss": 0.70298105, + "num_input_tokens_seen": 114659705, + "step": 5336, + "time_per_iteration": 2.7595067024230957 + }, + { + "auxiliary_loss_clip": 0.01043138, + "auxiliary_loss_mlp": 0.01039357, + "balance_loss_clip": 1.02625418, + "balance_loss_mlp": 1.02446198, + "epoch": 0.32087779948895234, + "flos": 21798711135360.0, + "grad_norm": 2.01572169992676, + "language_loss": 0.79014933, + "learning_rate": 3.176169078234487e-06, + "loss": 0.8109743, + "num_input_tokens_seen": 114678340, + "step": 5337, + "time_per_iteration": 2.776158094406128 + }, + { + "auxiliary_loss_clip": 0.01069491, + "auxiliary_loss_mlp": 0.0103287, + "balance_loss_clip": 1.02886653, + "balance_loss_mlp": 1.01968002, + "epoch": 0.3209379227416203, + "flos": 21434110133760.0, + "grad_norm": 1.9950012015165803, + "language_loss": 0.74200529, + "learning_rate": 3.1758540595365766e-06, + "loss": 0.76302886, + "num_input_tokens_seen": 114696980, + "step": 5338, + "time_per_iteration": 2.620725154876709 + }, + { + "auxiliary_loss_clip": 0.01061161, + "auxiliary_loss_mlp": 0.01035857, + "balance_loss_clip": 1.02704477, + "balance_loss_mlp": 1.02102733, + "epoch": 0.3209980459942883, + "flos": 25849075530240.0, + "grad_norm": 1.8578858089886314, + "language_loss": 0.63057053, + "learning_rate": 3.1755389962501626e-06, + "loss": 0.6515407, + "num_input_tokens_seen": 114717330, + "step": 5339, + "time_per_iteration": 2.854825735092163 + }, + { + "auxiliary_loss_clip": 0.01084779, + "auxiliary_loss_mlp": 0.01036861, + "balance_loss_clip": 1.03067183, + "balance_loss_mlp": 1.02227008, + "epoch": 0.32105816924695624, + "flos": 19099162535040.0, + "grad_norm": 2.5467729199003224, + "language_loss": 0.81904888, + "learning_rate": 3.175223888387192e-06, + "loss": 0.84026527, + "num_input_tokens_seen": 114736320, + "step": 5340, + "time_per_iteration": 2.5088424682617188 + }, + { + "auxiliary_loss_clip": 0.01051101, + "auxiliary_loss_mlp": 0.01037761, + "balance_loss_clip": 1.02694154, + "balance_loss_mlp": 1.02364647, + "epoch": 0.3211182924996242, + "flos": 16581860565120.0, + "grad_norm": 1.9354903370875192, + "language_loss": 0.76438349, + "learning_rate": 3.1749087359596137e-06, + "loss": 0.78527212, + "num_input_tokens_seen": 114754575, + "step": 5341, + "time_per_iteration": 2.6841115951538086 + }, + { + "auxiliary_loss_clip": 0.01051631, + "auxiliary_loss_mlp": 0.01035322, + "balance_loss_clip": 1.03043389, + "balance_loss_mlp": 1.02202415, + "epoch": 0.3211784157522922, + "flos": 22672202071680.0, + "grad_norm": 3.4926832181649985, + "language_loss": 0.79070854, + "learning_rate": 3.1745935389793786e-06, + "loss": 0.81157804, + "num_input_tokens_seen": 114773590, + "step": 5342, + "time_per_iteration": 2.804446220397949 + }, + { + "auxiliary_loss_clip": 0.01065859, + "auxiliary_loss_mlp": 0.01035546, + "balance_loss_clip": 1.03035343, + "balance_loss_mlp": 1.02014446, + "epoch": 0.3212385390049602, + "flos": 20558787603840.0, + "grad_norm": 2.2264541286879367, + "language_loss": 0.74802887, + "learning_rate": 3.174278297458438e-06, + "loss": 0.76904285, + "num_input_tokens_seen": 114790775, + "step": 5343, + "time_per_iteration": 2.7424073219299316 + }, + { + "auxiliary_loss_clip": 0.01024351, + "auxiliary_loss_mlp": 0.0103706, + "balance_loss_clip": 1.02618897, + "balance_loss_mlp": 1.021909, + "epoch": 0.32129866225762815, + "flos": 24791147233920.0, + "grad_norm": 1.5174952415535155, + "language_loss": 0.82691133, + "learning_rate": 3.173963011408748e-06, + "loss": 0.84752542, + "num_input_tokens_seen": 114809835, + "step": 5344, + "time_per_iteration": 2.8615763187408447 + }, + { + "auxiliary_loss_clip": 0.01045119, + "auxiliary_loss_mlp": 0.0103796, + "balance_loss_clip": 1.03001833, + "balance_loss_mlp": 1.0233686, + "epoch": 0.3213587855102961, + "flos": 18366871962240.0, + "grad_norm": 1.8716334432086261, + "language_loss": 0.79554451, + "learning_rate": 3.173647680842262e-06, + "loss": 0.81637526, + "num_input_tokens_seen": 114826505, + "step": 5345, + "time_per_iteration": 2.7668368816375732 + }, + { + "auxiliary_loss_clip": 0.01064475, + "auxiliary_loss_mlp": 0.01036173, + "balance_loss_clip": 1.02881467, + "balance_loss_mlp": 1.02187347, + "epoch": 0.3214189087629641, + "flos": 27015992668800.0, + "grad_norm": 1.7284259438693996, + "language_loss": 0.82971281, + "learning_rate": 3.1733323057709384e-06, + "loss": 0.85071933, + "num_input_tokens_seen": 114846140, + "step": 5346, + "time_per_iteration": 2.769064426422119 + }, + { + "auxiliary_loss_clip": 0.01054052, + "auxiliary_loss_mlp": 0.01035102, + "balance_loss_clip": 1.0288285, + "balance_loss_mlp": 1.01992655, + "epoch": 0.32147903201563205, + "flos": 23148269953920.0, + "grad_norm": 1.5922220859215312, + "language_loss": 0.81565058, + "learning_rate": 3.1730168862067366e-06, + "loss": 0.83654213, + "num_input_tokens_seen": 114866660, + "step": 5347, + "time_per_iteration": 2.715725898742676 + }, + { + "auxiliary_loss_clip": 0.01070277, + "auxiliary_loss_mlp": 0.01037451, + "balance_loss_clip": 1.02853084, + "balance_loss_mlp": 1.02226329, + "epoch": 0.3215391552683, + "flos": 16580747243520.0, + "grad_norm": 4.135632435972338, + "language_loss": 0.79864538, + "learning_rate": 3.1727014221616164e-06, + "loss": 0.81972265, + "num_input_tokens_seen": 114882820, + "step": 5348, + "time_per_iteration": 2.5787553787231445 + }, + { + "auxiliary_loss_clip": 0.01058558, + "auxiliary_loss_mlp": 0.01051651, + "balance_loss_clip": 1.02922785, + "balance_loss_mlp": 1.0369761, + "epoch": 0.321599278520968, + "flos": 17821820010240.0, + "grad_norm": 2.12953340815988, + "language_loss": 0.85180485, + "learning_rate": 3.172385913647542e-06, + "loss": 0.87290692, + "num_input_tokens_seen": 114900745, + "step": 5349, + "time_per_iteration": 2.7333858013153076 + }, + { + "auxiliary_loss_clip": 0.0105449, + "auxiliary_loss_mlp": 0.0104331, + "balance_loss_clip": 1.02851367, + "balance_loss_mlp": 1.02803957, + "epoch": 0.32165940177363594, + "flos": 16251769555200.0, + "grad_norm": 2.148765739154167, + "language_loss": 0.80338138, + "learning_rate": 3.172070360676475e-06, + "loss": 0.82435942, + "num_input_tokens_seen": 114917940, + "step": 5350, + "time_per_iteration": 2.6285219192504883 + }, + { + "auxiliary_loss_clip": 0.01074461, + "auxiliary_loss_mlp": 0.01041299, + "balance_loss_clip": 1.03052771, + "balance_loss_mlp": 1.02770925, + "epoch": 0.3217195250263039, + "flos": 27599900158080.0, + "grad_norm": 1.692383486632074, + "language_loss": 0.79996234, + "learning_rate": 3.1717547632603828e-06, + "loss": 0.8211199, + "num_input_tokens_seen": 114937735, + "step": 5351, + "time_per_iteration": 2.750847101211548 + }, + { + "auxiliary_loss_clip": 0.01054535, + "auxiliary_loss_mlp": 0.01043515, + "balance_loss_clip": 1.03052378, + "balance_loss_mlp": 1.02839899, + "epoch": 0.3217796482789719, + "flos": 21470595373440.0, + "grad_norm": 1.573481316535104, + "language_loss": 0.75810903, + "learning_rate": 3.1714391214112326e-06, + "loss": 0.77908957, + "num_input_tokens_seen": 114956630, + "step": 5352, + "time_per_iteration": 2.7529070377349854 + }, + { + "auxiliary_loss_clip": 0.01045447, + "auxiliary_loss_mlp": 0.01037882, + "balance_loss_clip": 1.02949226, + "balance_loss_mlp": 1.02321315, + "epoch": 0.32183977153163984, + "flos": 21215593745280.0, + "grad_norm": 1.9465051564571385, + "language_loss": 0.81670654, + "learning_rate": 3.1711234351409933e-06, + "loss": 0.83753985, + "num_input_tokens_seen": 114976470, + "step": 5353, + "time_per_iteration": 2.741781234741211 + }, + { + "auxiliary_loss_clip": 0.01024478, + "auxiliary_loss_mlp": 0.01040208, + "balance_loss_clip": 1.031672, + "balance_loss_mlp": 1.02560425, + "epoch": 0.3218998947843078, + "flos": 24608182331520.0, + "grad_norm": 1.6403219086868042, + "language_loss": 0.72911799, + "learning_rate": 3.1708077044616365e-06, + "loss": 0.7497648, + "num_input_tokens_seen": 114996710, + "step": 5354, + "time_per_iteration": 2.8954732418060303 + }, + { + "auxiliary_loss_clip": 0.01050404, + "auxiliary_loss_mlp": 0.01029875, + "balance_loss_clip": 1.02688718, + "balance_loss_mlp": 1.0168575, + "epoch": 0.3219600180369758, + "flos": 22270577126400.0, + "grad_norm": 1.707224595354063, + "language_loss": 0.83478498, + "learning_rate": 3.1704919293851334e-06, + "loss": 0.85558778, + "num_input_tokens_seen": 115015775, + "step": 5355, + "time_per_iteration": 2.7591376304626465 + }, + { + "auxiliary_loss_clip": 0.01088132, + "auxiliary_loss_mlp": 0.01043717, + "balance_loss_clip": 1.03279984, + "balance_loss_mlp": 1.02925646, + "epoch": 0.3220201412896438, + "flos": 14939126939520.0, + "grad_norm": 2.1116597723305635, + "language_loss": 0.71423054, + "learning_rate": 3.1701761099234597e-06, + "loss": 0.73554897, + "num_input_tokens_seen": 115034265, + "step": 5356, + "time_per_iteration": 2.672753095626831 + }, + { + "auxiliary_loss_clip": 0.01052116, + "auxiliary_loss_mlp": 0.01044864, + "balance_loss_clip": 1.03319407, + "balance_loss_mlp": 1.02917635, + "epoch": 0.32208026454231176, + "flos": 22667389649280.0, + "grad_norm": 3.242925766052079, + "language_loss": 0.67968053, + "learning_rate": 3.1698602460885903e-06, + "loss": 0.70065033, + "num_input_tokens_seen": 115051945, + "step": 5357, + "time_per_iteration": 2.800363779067993 + }, + { + "auxiliary_loss_clip": 0.0099064, + "auxiliary_loss_mlp": 0.01002445, + "balance_loss_clip": 1.00554276, + "balance_loss_mlp": 1.00082326, + "epoch": 0.3221403877949797, + "flos": 64605130053120.0, + "grad_norm": 0.712606451701601, + "language_loss": 0.58218622, + "learning_rate": 3.1695443378925035e-06, + "loss": 0.60211706, + "num_input_tokens_seen": 115119090, + "step": 5358, + "time_per_iteration": 3.3195109367370605 + }, + { + "auxiliary_loss_clip": 0.01028142, + "auxiliary_loss_mlp": 0.01035096, + "balance_loss_clip": 1.02641511, + "balance_loss_mlp": 1.02050424, + "epoch": 0.3222005110476477, + "flos": 20157019004160.0, + "grad_norm": 1.9949303022399703, + "language_loss": 0.83407706, + "learning_rate": 3.1692283853471777e-06, + "loss": 0.85470939, + "num_input_tokens_seen": 115137755, + "step": 5359, + "time_per_iteration": 2.872443675994873 + }, + { + "auxiliary_loss_clip": 0.01074671, + "auxiliary_loss_mlp": 0.01032951, + "balance_loss_clip": 1.02951217, + "balance_loss_mlp": 1.01887834, + "epoch": 0.32226063430031565, + "flos": 22674177319680.0, + "grad_norm": 1.6321677035242363, + "language_loss": 0.79998368, + "learning_rate": 3.168912388464595e-06, + "loss": 0.82105994, + "num_input_tokens_seen": 115158150, + "step": 5360, + "time_per_iteration": 2.742905616760254 + }, + { + "auxiliary_loss_clip": 0.01006418, + "auxiliary_loss_mlp": 0.01002111, + "balance_loss_clip": 1.00405014, + "balance_loss_mlp": 1.00050187, + "epoch": 0.3223207575529836, + "flos": 63828525075840.0, + "grad_norm": 0.6569150737098535, + "language_loss": 0.57034016, + "learning_rate": 3.168596347256737e-06, + "loss": 0.59042537, + "num_input_tokens_seen": 115212755, + "step": 5361, + "time_per_iteration": 3.0889623165130615 + }, + { + "auxiliary_loss_clip": 0.01027973, + "auxiliary_loss_mlp": 0.01041436, + "balance_loss_clip": 1.02672648, + "balance_loss_mlp": 1.02656436, + "epoch": 0.3223808808056516, + "flos": 26870123537280.0, + "grad_norm": 1.990370992821854, + "language_loss": 0.71170789, + "learning_rate": 3.168280261735588e-06, + "loss": 0.73240197, + "num_input_tokens_seen": 115233090, + "step": 5362, + "time_per_iteration": 2.7855238914489746 + }, + { + "auxiliary_loss_clip": 0.01065966, + "auxiliary_loss_mlp": 0.01043897, + "balance_loss_clip": 1.02844357, + "balance_loss_mlp": 1.02950311, + "epoch": 0.32244100405831955, + "flos": 26761350176640.0, + "grad_norm": 1.7577914746174919, + "language_loss": 0.73793936, + "learning_rate": 3.167964131913135e-06, + "loss": 0.75903797, + "num_input_tokens_seen": 115252645, + "step": 5363, + "time_per_iteration": 2.7030327320098877 + }, + { + "auxiliary_loss_clip": 0.01077629, + "auxiliary_loss_mlp": 0.01039318, + "balance_loss_clip": 1.0294733, + "balance_loss_mlp": 1.02466762, + "epoch": 0.3225011273109875, + "flos": 23803029020160.0, + "grad_norm": 2.177743941238767, + "language_loss": 0.77095008, + "learning_rate": 3.167647957801365e-06, + "loss": 0.79211956, + "num_input_tokens_seen": 115269085, + "step": 5364, + "time_per_iteration": 2.612412452697754 + }, + { + "auxiliary_loss_clip": 0.01064252, + "auxiliary_loss_mlp": 0.01036616, + "balance_loss_clip": 1.0291779, + "balance_loss_mlp": 1.02252507, + "epoch": 0.3225612505636555, + "flos": 17274505501440.0, + "grad_norm": 3.16830255662953, + "language_loss": 0.77358353, + "learning_rate": 3.1673317394122672e-06, + "loss": 0.7945922, + "num_input_tokens_seen": 115286470, + "step": 5365, + "time_per_iteration": 2.609830617904663 + }, + { + "auxiliary_loss_clip": 0.01070477, + "auxiliary_loss_mlp": 0.01044655, + "balance_loss_clip": 1.03452182, + "balance_loss_mlp": 1.03020096, + "epoch": 0.32262137381632344, + "flos": 23366247638400.0, + "grad_norm": 1.9083938713891349, + "language_loss": 0.76669723, + "learning_rate": 3.1670154767578333e-06, + "loss": 0.78784847, + "num_input_tokens_seen": 115307000, + "step": 5366, + "time_per_iteration": 2.651801109313965 + }, + { + "auxiliary_loss_clip": 0.01058176, + "auxiliary_loss_mlp": 0.01036805, + "balance_loss_clip": 1.02908874, + "balance_loss_mlp": 1.02204108, + "epoch": 0.3226814970689914, + "flos": 23258803080960.0, + "grad_norm": 1.7310380483691743, + "language_loss": 0.71969646, + "learning_rate": 3.166699169850055e-06, + "loss": 0.74064624, + "num_input_tokens_seen": 115325925, + "step": 5367, + "time_per_iteration": 2.685802936553955 + }, + { + "auxiliary_loss_clip": 0.01081565, + "auxiliary_loss_mlp": 0.0103643, + "balance_loss_clip": 1.02999985, + "balance_loss_mlp": 1.02360308, + "epoch": 0.32274162032165943, + "flos": 16395196561920.0, + "grad_norm": 1.9356804537437542, + "language_loss": 0.74473727, + "learning_rate": 3.1663828187009274e-06, + "loss": 0.76591724, + "num_input_tokens_seen": 115343705, + "step": 5368, + "time_per_iteration": 2.625120162963867 + }, + { + "auxiliary_loss_clip": 0.01043244, + "auxiliary_loss_mlp": 0.01036221, + "balance_loss_clip": 1.02488923, + "balance_loss_mlp": 1.02204132, + "epoch": 0.3228017435743274, + "flos": 27855081354240.0, + "grad_norm": 1.9019069885347895, + "language_loss": 0.78725451, + "learning_rate": 3.1660664233224467e-06, + "loss": 0.80804914, + "num_input_tokens_seen": 115364170, + "step": 5369, + "time_per_iteration": 4.2203285694122314 + }, + { + "auxiliary_loss_clip": 0.01053503, + "auxiliary_loss_mlp": 0.01031426, + "balance_loss_clip": 1.03304648, + "balance_loss_mlp": 1.01787782, + "epoch": 0.32286186682699536, + "flos": 19608770741760.0, + "grad_norm": 2.0174733811564947, + "language_loss": 0.83052737, + "learning_rate": 3.16574998372661e-06, + "loss": 0.85137665, + "num_input_tokens_seen": 115382495, + "step": 5370, + "time_per_iteration": 2.7019646167755127 + }, + { + "auxiliary_loss_clip": 0.01083537, + "auxiliary_loss_mlp": 0.01030729, + "balance_loss_clip": 1.0306499, + "balance_loss_mlp": 1.01747251, + "epoch": 0.3229219900796633, + "flos": 24134017870080.0, + "grad_norm": 2.043459522672347, + "language_loss": 0.82642424, + "learning_rate": 3.1654334999254177e-06, + "loss": 0.84756684, + "num_input_tokens_seen": 115399450, + "step": 5371, + "time_per_iteration": 4.107529163360596 + }, + { + "auxiliary_loss_clip": 0.01076956, + "auxiliary_loss_mlp": 0.00747766, + "balance_loss_clip": 1.03114021, + "balance_loss_mlp": 1.0004319, + "epoch": 0.3229821133323313, + "flos": 17748705876480.0, + "grad_norm": 3.0512163440754128, + "language_loss": 0.88333714, + "learning_rate": 3.1651169719308695e-06, + "loss": 0.90158439, + "num_input_tokens_seen": 115417700, + "step": 5372, + "time_per_iteration": 2.6474227905273438 + }, + { + "auxiliary_loss_clip": 0.01085054, + "auxiliary_loss_mlp": 0.01041354, + "balance_loss_clip": 1.0324682, + "balance_loss_mlp": 1.02744818, + "epoch": 0.32304223658499925, + "flos": 22346025644160.0, + "grad_norm": 1.9321045723982588, + "language_loss": 0.72550273, + "learning_rate": 3.1648003997549694e-06, + "loss": 0.74676681, + "num_input_tokens_seen": 115435840, + "step": 5373, + "time_per_iteration": 2.597795248031616 + }, + { + "auxiliary_loss_clip": 0.01054734, + "auxiliary_loss_mlp": 0.01033197, + "balance_loss_clip": 1.02857196, + "balance_loss_mlp": 1.01945245, + "epoch": 0.3231023598376672, + "flos": 18478302929280.0, + "grad_norm": 2.125587462515991, + "language_loss": 0.80867708, + "learning_rate": 3.1644837834097214e-06, + "loss": 0.82955641, + "num_input_tokens_seen": 115454210, + "step": 5374, + "time_per_iteration": 2.64886474609375 + }, + { + "auxiliary_loss_clip": 0.01045304, + "auxiliary_loss_mlp": 0.01032071, + "balance_loss_clip": 1.02789831, + "balance_loss_mlp": 1.01880872, + "epoch": 0.3231624830903352, + "flos": 27636313570560.0, + "grad_norm": 2.3245027333643473, + "language_loss": 0.87872994, + "learning_rate": 3.1641671229071317e-06, + "loss": 0.89950371, + "num_input_tokens_seen": 115471785, + "step": 5375, + "time_per_iteration": 2.8670456409454346 + }, + { + "auxiliary_loss_clip": 0.01083871, + "auxiliary_loss_mlp": 0.01031999, + "balance_loss_clip": 1.02809, + "balance_loss_mlp": 1.01759291, + "epoch": 0.32322260634300315, + "flos": 21726423014400.0, + "grad_norm": 1.9232816211168855, + "language_loss": 0.75512266, + "learning_rate": 3.1638504182592076e-06, + "loss": 0.77628136, + "num_input_tokens_seen": 115491405, + "step": 5376, + "time_per_iteration": 2.7295498847961426 + }, + { + "auxiliary_loss_clip": 0.01047508, + "auxiliary_loss_mlp": 0.01031855, + "balance_loss_clip": 1.03218591, + "balance_loss_mlp": 1.01893866, + "epoch": 0.3232827295956711, + "flos": 22637656166400.0, + "grad_norm": 2.512742914405771, + "language_loss": 0.67322969, + "learning_rate": 3.1635336694779594e-06, + "loss": 0.69402337, + "num_input_tokens_seen": 115511555, + "step": 5377, + "time_per_iteration": 2.9370276927948 + }, + { + "auxiliary_loss_clip": 0.01047274, + "auxiliary_loss_mlp": 0.01043401, + "balance_loss_clip": 1.02832508, + "balance_loss_mlp": 1.02757001, + "epoch": 0.3233428528483391, + "flos": 26322593546880.0, + "grad_norm": 1.3909949949654943, + "language_loss": 0.72557259, + "learning_rate": 3.1632168765753982e-06, + "loss": 0.74647933, + "num_input_tokens_seen": 115532860, + "step": 5378, + "time_per_iteration": 2.7871687412261963 + }, + { + "auxiliary_loss_clip": 0.0107534, + "auxiliary_loss_mlp": 0.01036136, + "balance_loss_clip": 1.03293371, + "balance_loss_mlp": 1.0223434, + "epoch": 0.32340297610100704, + "flos": 28585217111040.0, + "grad_norm": 2.722424729033325, + "language_loss": 0.81642497, + "learning_rate": 3.1629000395635357e-06, + "loss": 0.83753973, + "num_input_tokens_seen": 115553850, + "step": 5379, + "time_per_iteration": 4.369482040405273 + }, + { + "auxiliary_loss_clip": 0.01072703, + "auxiliary_loss_mlp": 0.01033573, + "balance_loss_clip": 1.02884722, + "balance_loss_mlp": 1.02039456, + "epoch": 0.323463099353675, + "flos": 30773792787840.0, + "grad_norm": 1.854665505460358, + "language_loss": 0.78932214, + "learning_rate": 3.162583158454388e-06, + "loss": 0.81038493, + "num_input_tokens_seen": 115575530, + "step": 5380, + "time_per_iteration": 2.7778537273406982 + }, + { + "auxiliary_loss_clip": 0.01074245, + "auxiliary_loss_mlp": 0.01037983, + "balance_loss_clip": 1.03139555, + "balance_loss_mlp": 1.02473843, + "epoch": 0.32352322260634303, + "flos": 25228610974080.0, + "grad_norm": 1.7675142568120679, + "language_loss": 0.77151573, + "learning_rate": 3.1622662332599697e-06, + "loss": 0.79263794, + "num_input_tokens_seen": 115594885, + "step": 5381, + "time_per_iteration": 4.256155967712402 + }, + { + "auxiliary_loss_clip": 0.01070401, + "auxiliary_loss_mlp": 0.01034276, + "balance_loss_clip": 1.03022492, + "balance_loss_mlp": 1.02178288, + "epoch": 0.323583345859011, + "flos": 23330480670720.0, + "grad_norm": 2.0013015661825064, + "language_loss": 0.71778566, + "learning_rate": 3.1619492639922998e-06, + "loss": 0.73883241, + "num_input_tokens_seen": 115614080, + "step": 5382, + "time_per_iteration": 2.645012617111206 + }, + { + "auxiliary_loss_clip": 0.010545, + "auxiliary_loss_mlp": 0.01042474, + "balance_loss_clip": 1.02668464, + "balance_loss_mlp": 1.02760816, + "epoch": 0.32364346911167896, + "flos": 26207499392640.0, + "grad_norm": 2.285982344802164, + "language_loss": 0.70495117, + "learning_rate": 3.1616322506633964e-06, + "loss": 0.72592092, + "num_input_tokens_seen": 115632820, + "step": 5383, + "time_per_iteration": 2.6305694580078125 + }, + { + "auxiliary_loss_clip": 0.01069852, + "auxiliary_loss_mlp": 0.0103256, + "balance_loss_clip": 1.02879345, + "balance_loss_mlp": 1.02057934, + "epoch": 0.3237035923643469, + "flos": 23695764030720.0, + "grad_norm": 1.6542609456637567, + "language_loss": 0.78645825, + "learning_rate": 3.161315193285283e-06, + "loss": 0.8074823, + "num_input_tokens_seen": 115652860, + "step": 5384, + "time_per_iteration": 2.7014968395233154 + }, + { + "auxiliary_loss_clip": 0.0102698, + "auxiliary_loss_mlp": 0.01038032, + "balance_loss_clip": 1.02809107, + "balance_loss_mlp": 1.02290404, + "epoch": 0.3237637156170149, + "flos": 14428728633600.0, + "grad_norm": 1.9449987048415975, + "language_loss": 0.7490263, + "learning_rate": 3.16099809186998e-06, + "loss": 0.76967645, + "num_input_tokens_seen": 115670940, + "step": 5385, + "time_per_iteration": 2.6819729804992676 + }, + { + "auxiliary_loss_clip": 0.0106411, + "auxiliary_loss_mlp": 0.01037822, + "balance_loss_clip": 1.03163409, + "balance_loss_mlp": 1.02414918, + "epoch": 0.32382383886968286, + "flos": 31062981185280.0, + "grad_norm": 1.9452114980303996, + "language_loss": 0.71701127, + "learning_rate": 3.1606809464295145e-06, + "loss": 0.73803055, + "num_input_tokens_seen": 115691155, + "step": 5386, + "time_per_iteration": 2.764266014099121 + }, + { + "auxiliary_loss_clip": 0.01084489, + "auxiliary_loss_mlp": 0.01038346, + "balance_loss_clip": 1.02857661, + "balance_loss_mlp": 1.02400541, + "epoch": 0.3238839621223508, + "flos": 23256935573760.0, + "grad_norm": 1.724567268875505, + "language_loss": 0.94594049, + "learning_rate": 3.1603637569759095e-06, + "loss": 0.96716887, + "num_input_tokens_seen": 115710340, + "step": 5387, + "time_per_iteration": 2.693568468093872 + }, + { + "auxiliary_loss_clip": 0.01075723, + "auxiliary_loss_mlp": 0.01040977, + "balance_loss_clip": 1.03026271, + "balance_loss_mlp": 1.02633822, + "epoch": 0.3239440853750188, + "flos": 22964658606720.0, + "grad_norm": 1.9760610780638506, + "language_loss": 0.77539325, + "learning_rate": 3.1600465235211956e-06, + "loss": 0.79656023, + "num_input_tokens_seen": 115726745, + "step": 5388, + "time_per_iteration": 2.764206647872925 + }, + { + "auxiliary_loss_clip": 0.01055599, + "auxiliary_loss_mlp": 0.01030297, + "balance_loss_clip": 1.0271939, + "balance_loss_mlp": 1.01611698, + "epoch": 0.32400420862768675, + "flos": 36246614653440.0, + "grad_norm": 2.036478403059954, + "language_loss": 0.71484858, + "learning_rate": 3.1597292460774006e-06, + "loss": 0.73570752, + "num_input_tokens_seen": 115749385, + "step": 5389, + "time_per_iteration": 2.791447877883911 + }, + { + "auxiliary_loss_clip": 0.01054369, + "auxiliary_loss_mlp": 0.01035878, + "balance_loss_clip": 1.03159714, + "balance_loss_mlp": 1.02175212, + "epoch": 0.3240643318803547, + "flos": 21616500418560.0, + "grad_norm": 3.4371414187035434, + "language_loss": 0.80911535, + "learning_rate": 3.159411924656557e-06, + "loss": 0.83001781, + "num_input_tokens_seen": 115768105, + "step": 5390, + "time_per_iteration": 2.786661148071289 + }, + { + "auxiliary_loss_clip": 0.01053789, + "auxiliary_loss_mlp": 0.01043, + "balance_loss_clip": 1.02847672, + "balance_loss_mlp": 1.02913022, + "epoch": 0.3241244551330227, + "flos": 23295611543040.0, + "grad_norm": 1.8446003321553706, + "language_loss": 0.72696829, + "learning_rate": 3.1590945592706967e-06, + "loss": 0.74793613, + "num_input_tokens_seen": 115787340, + "step": 5391, + "time_per_iteration": 2.631291389465332 + }, + { + "auxiliary_loss_clip": 0.01055624, + "auxiliary_loss_mlp": 0.01037533, + "balance_loss_clip": 1.02618861, + "balance_loss_mlp": 1.02469468, + "epoch": 0.32418457838569065, + "flos": 14097236993280.0, + "grad_norm": 1.4891392177601388, + "language_loss": 0.76751876, + "learning_rate": 3.158777149931855e-06, + "loss": 0.78845036, + "num_input_tokens_seen": 115805565, + "step": 5392, + "time_per_iteration": 2.703986644744873 + }, + { + "auxiliary_loss_clip": 0.01057034, + "auxiliary_loss_mlp": 0.01040633, + "balance_loss_clip": 1.02761483, + "balance_loss_mlp": 1.0252192, + "epoch": 0.3242447016383586, + "flos": 29752672953600.0, + "grad_norm": 1.7257502992382603, + "language_loss": 0.61806089, + "learning_rate": 3.158459696652067e-06, + "loss": 0.63903761, + "num_input_tokens_seen": 115826725, + "step": 5393, + "time_per_iteration": 2.7225682735443115 + }, + { + "auxiliary_loss_clip": 0.01073471, + "auxiliary_loss_mlp": 0.01035865, + "balance_loss_clip": 1.03133476, + "balance_loss_mlp": 1.02220333, + "epoch": 0.3243048248910266, + "flos": 24351205455360.0, + "grad_norm": 1.8313694013527098, + "language_loss": 0.82659191, + "learning_rate": 3.158142199443371e-06, + "loss": 0.84768528, + "num_input_tokens_seen": 115846955, + "step": 5394, + "time_per_iteration": 2.79075026512146 + }, + { + "auxiliary_loss_clip": 0.01057162, + "auxiliary_loss_mlp": 0.01040835, + "balance_loss_clip": 1.02797127, + "balance_loss_mlp": 1.02869368, + "epoch": 0.3243649481436946, + "flos": 24353037048960.0, + "grad_norm": 1.6756694459628378, + "language_loss": 0.81669432, + "learning_rate": 3.1578246583178076e-06, + "loss": 0.83767432, + "num_input_tokens_seen": 115865975, + "step": 5395, + "time_per_iteration": 2.693779945373535 + }, + { + "auxiliary_loss_clip": 0.01072143, + "auxiliary_loss_mlp": 0.01036774, + "balance_loss_clip": 1.03144455, + "balance_loss_mlp": 1.02392292, + "epoch": 0.32442507139636256, + "flos": 22925228451840.0, + "grad_norm": 1.6270976627865137, + "language_loss": 0.83478212, + "learning_rate": 3.157507073287417e-06, + "loss": 0.85587126, + "num_input_tokens_seen": 115884950, + "step": 5396, + "time_per_iteration": 2.583415985107422 + }, + { + "auxiliary_loss_clip": 0.01049293, + "auxiliary_loss_mlp": 0.01044689, + "balance_loss_clip": 1.02958894, + "balance_loss_mlp": 1.02855992, + "epoch": 0.32448519464903053, + "flos": 22200192426240.0, + "grad_norm": 3.776496705002305, + "language_loss": 0.7552129, + "learning_rate": 3.1571894443642414e-06, + "loss": 0.77615267, + "num_input_tokens_seen": 115904170, + "step": 5397, + "time_per_iteration": 2.7063868045806885 + }, + { + "auxiliary_loss_clip": 0.01052589, + "auxiliary_loss_mlp": 0.01032945, + "balance_loss_clip": 1.03085017, + "balance_loss_mlp": 1.01961184, + "epoch": 0.3245453179016985, + "flos": 18838450644480.0, + "grad_norm": 2.005087272745594, + "language_loss": 0.67196178, + "learning_rate": 3.1568717715603263e-06, + "loss": 0.69281715, + "num_input_tokens_seen": 115919255, + "step": 5398, + "time_per_iteration": 2.623178482055664 + }, + { + "auxiliary_loss_clip": 0.01063949, + "auxiliary_loss_mlp": 0.01035527, + "balance_loss_clip": 1.03158808, + "balance_loss_mlp": 1.02159107, + "epoch": 0.32460544115436646, + "flos": 21178390233600.0, + "grad_norm": 1.409973453225752, + "language_loss": 0.72589201, + "learning_rate": 3.156554054887718e-06, + "loss": 0.74688673, + "num_input_tokens_seen": 115938535, + "step": 5399, + "time_per_iteration": 2.6376123428344727 + }, + { + "auxiliary_loss_clip": 0.01047145, + "auxiliary_loss_mlp": 0.01034921, + "balance_loss_clip": 1.02673686, + "balance_loss_mlp": 1.02071154, + "epoch": 0.3246655644070344, + "flos": 21981137333760.0, + "grad_norm": 2.2316910771673193, + "language_loss": 0.7137875, + "learning_rate": 3.1562362943584645e-06, + "loss": 0.73460811, + "num_input_tokens_seen": 115955005, + "step": 5400, + "time_per_iteration": 2.785125970840454 + }, + { + "auxiliary_loss_clip": 0.01073686, + "auxiliary_loss_mlp": 0.01035283, + "balance_loss_clip": 1.02981424, + "balance_loss_mlp": 1.02193737, + "epoch": 0.3247256876597024, + "flos": 32159729105280.0, + "grad_norm": 1.903412956452341, + "language_loss": 0.79413551, + "learning_rate": 3.155918489984614e-06, + "loss": 0.81522518, + "num_input_tokens_seen": 115975305, + "step": 5401, + "time_per_iteration": 2.78175687789917 + }, + { + "auxiliary_loss_clip": 0.01049474, + "auxiliary_loss_mlp": 0.01036664, + "balance_loss_clip": 1.02496767, + "balance_loss_mlp": 1.02124989, + "epoch": 0.32478581091237035, + "flos": 20997544233600.0, + "grad_norm": 1.4124132885862788, + "language_loss": 0.8766225, + "learning_rate": 3.1556006417782196e-06, + "loss": 0.89748394, + "num_input_tokens_seen": 115994810, + "step": 5402, + "time_per_iteration": 2.689087390899658 + }, + { + "auxiliary_loss_clip": 0.010233, + "auxiliary_loss_mlp": 0.01038835, + "balance_loss_clip": 1.02261448, + "balance_loss_mlp": 1.02402353, + "epoch": 0.3248459341650383, + "flos": 17924990849280.0, + "grad_norm": 1.9473124761680756, + "language_loss": 0.84356403, + "learning_rate": 3.155282749751332e-06, + "loss": 0.86418539, + "num_input_tokens_seen": 116011095, + "step": 5403, + "time_per_iteration": 2.6407787799835205 + }, + { + "auxiliary_loss_clip": 0.01048287, + "auxiliary_loss_mlp": 0.01040497, + "balance_loss_clip": 1.02846885, + "balance_loss_mlp": 1.02772987, + "epoch": 0.3249060574177063, + "flos": 24535606901760.0, + "grad_norm": 2.053722692387445, + "language_loss": 0.87411737, + "learning_rate": 3.154964813916007e-06, + "loss": 0.89500523, + "num_input_tokens_seen": 116028805, + "step": 5404, + "time_per_iteration": 2.6497278213500977 + }, + { + "auxiliary_loss_clip": 0.01073066, + "auxiliary_loss_mlp": 0.01034621, + "balance_loss_clip": 1.03120685, + "balance_loss_mlp": 1.02132916, + "epoch": 0.32496618067037425, + "flos": 25994765093760.0, + "grad_norm": 1.5841673357891384, + "language_loss": 0.7237317, + "learning_rate": 3.1546468342843008e-06, + "loss": 0.74480855, + "num_input_tokens_seen": 116047765, + "step": 5405, + "time_per_iteration": 2.5888874530792236 + }, + { + "auxiliary_loss_clip": 0.01055177, + "auxiliary_loss_mlp": 0.01041229, + "balance_loss_clip": 1.0347259, + "balance_loss_mlp": 1.02742469, + "epoch": 0.3250263039230422, + "flos": 19573757959680.0, + "grad_norm": 1.8860170061744144, + "language_loss": 0.83077419, + "learning_rate": 3.1543288108682707e-06, + "loss": 0.85173827, + "num_input_tokens_seen": 116068385, + "step": 5406, + "time_per_iteration": 2.6639726161956787 + }, + { + "auxiliary_loss_clip": 0.01081786, + "auxiliary_loss_mlp": 0.01031407, + "balance_loss_clip": 1.03077376, + "balance_loss_mlp": 1.01847327, + "epoch": 0.3250864271757102, + "flos": 16763640318720.0, + "grad_norm": 2.260594027061076, + "language_loss": 0.87590557, + "learning_rate": 3.1540107436799764e-06, + "loss": 0.89703751, + "num_input_tokens_seen": 116085350, + "step": 5407, + "time_per_iteration": 2.564234733581543 + }, + { + "auxiliary_loss_clip": 0.0105813, + "auxiliary_loss_mlp": 0.01033648, + "balance_loss_clip": 1.02697754, + "balance_loss_mlp": 1.02052271, + "epoch": 0.3251465504283782, + "flos": 27819458040960.0, + "grad_norm": 1.48108206687377, + "language_loss": 0.6969074, + "learning_rate": 3.153692632731479e-06, + "loss": 0.71782517, + "num_input_tokens_seen": 116107560, + "step": 5408, + "time_per_iteration": 2.704299211502075 + }, + { + "auxiliary_loss_clip": 0.0107868, + "auxiliary_loss_mlp": 0.01032552, + "balance_loss_clip": 1.03171992, + "balance_loss_mlp": 1.01852703, + "epoch": 0.32520667368104617, + "flos": 19063144172160.0, + "grad_norm": 2.4538280517811795, + "language_loss": 0.77225035, + "learning_rate": 3.153374478034841e-06, + "loss": 0.79336268, + "num_input_tokens_seen": 116125980, + "step": 5409, + "time_per_iteration": 2.5754640102386475 + }, + { + "auxiliary_loss_clip": 0.01026024, + "auxiliary_loss_mlp": 0.01042449, + "balance_loss_clip": 1.02483547, + "balance_loss_mlp": 1.02874637, + "epoch": 0.32526679693371413, + "flos": 29382146208000.0, + "grad_norm": 1.9050975372307015, + "language_loss": 0.83371651, + "learning_rate": 3.1530562796021285e-06, + "loss": 0.85440123, + "num_input_tokens_seen": 116146530, + "step": 5410, + "time_per_iteration": 2.7340950965881348 + }, + { + "auxiliary_loss_clip": 0.01042477, + "auxiliary_loss_mlp": 0.01036839, + "balance_loss_clip": 1.0320375, + "balance_loss_mlp": 1.02356529, + "epoch": 0.3253269201863821, + "flos": 20704513080960.0, + "grad_norm": 1.6170640222698094, + "language_loss": 0.71231794, + "learning_rate": 3.152738037445405e-06, + "loss": 0.73311114, + "num_input_tokens_seen": 116165695, + "step": 5411, + "time_per_iteration": 2.69596791267395 + }, + { + "auxiliary_loss_clip": 0.01033881, + "auxiliary_loss_mlp": 0.01035822, + "balance_loss_clip": 1.02838063, + "balance_loss_mlp": 1.02361536, + "epoch": 0.32538704343905006, + "flos": 29094142959360.0, + "grad_norm": 1.6861177628030595, + "language_loss": 0.83281416, + "learning_rate": 3.1524197515767403e-06, + "loss": 0.85351121, + "num_input_tokens_seen": 116185375, + "step": 5412, + "time_per_iteration": 2.7541608810424805 + }, + { + "auxiliary_loss_clip": 0.01054354, + "auxiliary_loss_mlp": 0.01036743, + "balance_loss_clip": 1.03076601, + "balance_loss_mlp": 1.02199066, + "epoch": 0.325447166691718, + "flos": 24676124906880.0, + "grad_norm": 1.6597969361160587, + "language_loss": 0.80620122, + "learning_rate": 3.152101422008203e-06, + "loss": 0.8271122, + "num_input_tokens_seen": 116204335, + "step": 5413, + "time_per_iteration": 2.695772409439087 + }, + { + "auxiliary_loss_clip": 0.01060337, + "auxiliary_loss_mlp": 0.01035708, + "balance_loss_clip": 1.02907228, + "balance_loss_mlp": 1.02122378, + "epoch": 0.325507289944386, + "flos": 21543134889600.0, + "grad_norm": 1.6284143690539967, + "language_loss": 0.76483297, + "learning_rate": 3.151783048751864e-06, + "loss": 0.78579354, + "num_input_tokens_seen": 116222840, + "step": 5414, + "time_per_iteration": 2.708162546157837 + }, + { + "auxiliary_loss_clip": 0.00988035, + "auxiliary_loss_mlp": 0.01007262, + "balance_loss_clip": 1.00635183, + "balance_loss_mlp": 1.00560462, + "epoch": 0.32556741319705396, + "flos": 71518722347520.0, + "grad_norm": 0.9105488751435172, + "language_loss": 0.64065593, + "learning_rate": 3.1514646318197965e-06, + "loss": 0.66060895, + "num_input_tokens_seen": 116274940, + "step": 5415, + "time_per_iteration": 3.1641733646392822 + }, + { + "auxiliary_loss_clip": 0.01037108, + "auxiliary_loss_mlp": 0.01034115, + "balance_loss_clip": 1.02568388, + "balance_loss_mlp": 1.02029848, + "epoch": 0.3256275364497219, + "flos": 23732428838400.0, + "grad_norm": 1.4771268837464908, + "language_loss": 0.73971838, + "learning_rate": 3.151146171224075e-06, + "loss": 0.76043069, + "num_input_tokens_seen": 116297300, + "step": 5416, + "time_per_iteration": 2.8413314819335938 + }, + { + "auxiliary_loss_clip": 0.01016287, + "auxiliary_loss_mlp": 0.01002719, + "balance_loss_clip": 1.00432205, + "balance_loss_mlp": 1.00079942, + "epoch": 0.3256876597023899, + "flos": 67289199891840.0, + "grad_norm": 0.7787068755317291, + "language_loss": 0.57982385, + "learning_rate": 3.1508276669767757e-06, + "loss": 0.60001385, + "num_input_tokens_seen": 116362370, + "step": 5417, + "time_per_iteration": 4.8820860385894775 + }, + { + "auxiliary_loss_clip": 0.00998828, + "auxiliary_loss_mlp": 0.0100347, + "balance_loss_clip": 1.00654209, + "balance_loss_mlp": 1.00162244, + "epoch": 0.32574778295505785, + "flos": 71282323964160.0, + "grad_norm": 0.83333588982146, + "language_loss": 0.6342355, + "learning_rate": 3.150509119089975e-06, + "loss": 0.65425843, + "num_input_tokens_seen": 116430365, + "step": 5418, + "time_per_iteration": 4.867122650146484 + }, + { + "auxiliary_loss_clip": 0.01061294, + "auxiliary_loss_mlp": 0.01042306, + "balance_loss_clip": 1.03203177, + "balance_loss_mlp": 1.02897286, + "epoch": 0.3258079062077258, + "flos": 20776370238720.0, + "grad_norm": 2.017990407143761, + "language_loss": 0.69475514, + "learning_rate": 3.1501905275757537e-06, + "loss": 0.71579117, + "num_input_tokens_seen": 116447525, + "step": 5419, + "time_per_iteration": 2.7156500816345215 + }, + { + "auxiliary_loss_clip": 0.01072254, + "auxiliary_loss_mlp": 0.01035479, + "balance_loss_clip": 1.03063726, + "balance_loss_mlp": 1.02138901, + "epoch": 0.3258680294603938, + "flos": 22235456603520.0, + "grad_norm": 1.6565956004089675, + "language_loss": 0.76876944, + "learning_rate": 3.1498718924461926e-06, + "loss": 0.78984678, + "num_input_tokens_seen": 116466310, + "step": 5420, + "time_per_iteration": 2.6037538051605225 + }, + { + "auxiliary_loss_clip": 0.01070858, + "auxiliary_loss_mlp": 0.00747815, + "balance_loss_clip": 1.02877796, + "balance_loss_mlp": 1.00039327, + "epoch": 0.3259281527130618, + "flos": 26979974305920.0, + "grad_norm": 1.6141677722477121, + "language_loss": 0.80400491, + "learning_rate": 3.1495532137133736e-06, + "loss": 0.8221916, + "num_input_tokens_seen": 116487825, + "step": 5421, + "time_per_iteration": 2.709484338760376 + }, + { + "auxiliary_loss_clip": 0.0108223, + "auxiliary_loss_mlp": 0.01033626, + "balance_loss_clip": 1.03074288, + "balance_loss_mlp": 1.02085853, + "epoch": 0.32598827596572977, + "flos": 26214251149440.0, + "grad_norm": 4.9260578391009835, + "language_loss": 0.75622648, + "learning_rate": 3.149234491389381e-06, + "loss": 0.77738506, + "num_input_tokens_seen": 116509950, + "step": 5422, + "time_per_iteration": 2.629570484161377 + }, + { + "auxiliary_loss_clip": 0.01048633, + "auxiliary_loss_mlp": 0.00747722, + "balance_loss_clip": 1.02891469, + "balance_loss_mlp": 1.00034928, + "epoch": 0.32604839921839773, + "flos": 17639752947840.0, + "grad_norm": 1.8643674782986572, + "language_loss": 0.63071191, + "learning_rate": 3.1489157254863026e-06, + "loss": 0.6486755, + "num_input_tokens_seen": 116527695, + "step": 5423, + "time_per_iteration": 2.6085331439971924 + }, + { + "auxiliary_loss_clip": 0.01052728, + "auxiliary_loss_mlp": 0.01033133, + "balance_loss_clip": 1.02715516, + "balance_loss_mlp": 1.02096152, + "epoch": 0.3261085224710657, + "flos": 23622721724160.0, + "grad_norm": 7.2652815033358875, + "language_loss": 0.74645227, + "learning_rate": 3.148596916016224e-06, + "loss": 0.76731086, + "num_input_tokens_seen": 116547800, + "step": 5424, + "time_per_iteration": 2.6381924152374268 + }, + { + "auxiliary_loss_clip": 0.01061814, + "auxiliary_loss_mlp": 0.01036156, + "balance_loss_clip": 1.0311439, + "balance_loss_mlp": 1.02339435, + "epoch": 0.32616864572373366, + "flos": 23260455106560.0, + "grad_norm": 1.559359068478769, + "language_loss": 0.76823789, + "learning_rate": 3.1482780629912355e-06, + "loss": 0.78921753, + "num_input_tokens_seen": 116568460, + "step": 5425, + "time_per_iteration": 2.6414451599121094 + }, + { + "auxiliary_loss_clip": 0.01054681, + "auxiliary_loss_mlp": 0.01039945, + "balance_loss_clip": 1.0292002, + "balance_loss_mlp": 1.0239476, + "epoch": 0.32622876897640163, + "flos": 25593427457280.0, + "grad_norm": 2.515273150475215, + "language_loss": 0.7790345, + "learning_rate": 3.147959166423428e-06, + "loss": 0.79998076, + "num_input_tokens_seen": 116588705, + "step": 5426, + "time_per_iteration": 4.39985203742981 + }, + { + "auxiliary_loss_clip": 0.01038395, + "auxiliary_loss_mlp": 0.01035542, + "balance_loss_clip": 1.02932525, + "balance_loss_mlp": 1.02067637, + "epoch": 0.3262888922290696, + "flos": 22418996123520.0, + "grad_norm": 1.634836246379635, + "language_loss": 0.74295712, + "learning_rate": 3.147640226324893e-06, + "loss": 0.76369655, + "num_input_tokens_seen": 116608845, + "step": 5427, + "time_per_iteration": 2.690882444381714 + }, + { + "auxiliary_loss_clip": 0.01051123, + "auxiliary_loss_mlp": 0.01039553, + "balance_loss_clip": 1.02818966, + "balance_loss_mlp": 1.02469897, + "epoch": 0.32634901548173756, + "flos": 19718908819200.0, + "grad_norm": 1.6340720710992367, + "language_loss": 0.79360843, + "learning_rate": 3.1473212427077266e-06, + "loss": 0.81451517, + "num_input_tokens_seen": 116628145, + "step": 5428, + "time_per_iteration": 4.338188171386719 + }, + { + "auxiliary_loss_clip": 0.01073419, + "auxiliary_loss_mlp": 0.01039403, + "balance_loss_clip": 1.03060532, + "balance_loss_mlp": 1.02583146, + "epoch": 0.3264091387344055, + "flos": 16142924367360.0, + "grad_norm": 1.5776907399445148, + "language_loss": 0.71218777, + "learning_rate": 3.147002215584023e-06, + "loss": 0.73331606, + "num_input_tokens_seen": 116646920, + "step": 5429, + "time_per_iteration": 2.5956215858459473 + }, + { + "auxiliary_loss_clip": 0.01053284, + "auxiliary_loss_mlp": 0.01036158, + "balance_loss_clip": 1.03147101, + "balance_loss_mlp": 1.02364755, + "epoch": 0.3264692619870735, + "flos": 16399075230720.0, + "grad_norm": 1.7435005762777196, + "language_loss": 0.78395629, + "learning_rate": 3.146683144965881e-06, + "loss": 0.80485076, + "num_input_tokens_seen": 116665100, + "step": 5430, + "time_per_iteration": 2.63311505317688 + }, + { + "auxiliary_loss_clip": 0.01044386, + "auxiliary_loss_mlp": 0.01041497, + "balance_loss_clip": 1.03235829, + "balance_loss_mlp": 1.0261364, + "epoch": 0.32652938523974145, + "flos": 22382331315840.0, + "grad_norm": 5.38893970076297, + "language_loss": 0.84303737, + "learning_rate": 3.146364030865399e-06, + "loss": 0.86389619, + "num_input_tokens_seen": 116682205, + "step": 5431, + "time_per_iteration": 2.7644734382629395 + }, + { + "auxiliary_loss_clip": 0.01071289, + "auxiliary_loss_mlp": 0.01034347, + "balance_loss_clip": 1.02943218, + "balance_loss_mlp": 1.02109039, + "epoch": 0.3265895084924094, + "flos": 21908059113600.0, + "grad_norm": 1.7177979763213727, + "language_loss": 0.70260715, + "learning_rate": 3.146044873294678e-06, + "loss": 0.72366351, + "num_input_tokens_seen": 116702575, + "step": 5432, + "time_per_iteration": 2.723576784133911 + }, + { + "auxiliary_loss_clip": 0.01025359, + "auxiliary_loss_mlp": 0.01038857, + "balance_loss_clip": 1.02522254, + "balance_loss_mlp": 1.02457619, + "epoch": 0.3266496317450774, + "flos": 16067152627200.0, + "grad_norm": 1.4705487533625035, + "language_loss": 0.84172976, + "learning_rate": 3.1457256722658203e-06, + "loss": 0.86237192, + "num_input_tokens_seen": 116720885, + "step": 5433, + "time_per_iteration": 2.7342116832733154 + }, + { + "auxiliary_loss_clip": 0.01061411, + "auxiliary_loss_mlp": 0.01034255, + "balance_loss_clip": 1.03107178, + "balance_loss_mlp": 1.02142191, + "epoch": 0.3267097549977454, + "flos": 22528236360960.0, + "grad_norm": 1.6566368845498292, + "language_loss": 0.85696208, + "learning_rate": 3.145406427790931e-06, + "loss": 0.87791872, + "num_input_tokens_seen": 116740395, + "step": 5434, + "time_per_iteration": 2.6633684635162354 + }, + { + "auxiliary_loss_clip": 0.01059984, + "auxiliary_loss_mlp": 0.01036362, + "balance_loss_clip": 1.02906632, + "balance_loss_mlp": 1.0218606, + "epoch": 0.32676987825041337, + "flos": 27270419679360.0, + "grad_norm": 1.7380568262218277, + "language_loss": 0.88014442, + "learning_rate": 3.1450871398821147e-06, + "loss": 0.90110785, + "num_input_tokens_seen": 116758870, + "step": 5435, + "time_per_iteration": 2.7505338191986084 + }, + { + "auxiliary_loss_clip": 0.01084494, + "auxiliary_loss_mlp": 0.01031378, + "balance_loss_clip": 1.03157783, + "balance_loss_mlp": 1.01802039, + "epoch": 0.32683000150308134, + "flos": 11508257433600.0, + "grad_norm": 2.434142413388363, + "language_loss": 0.76981354, + "learning_rate": 3.144767808551479e-06, + "loss": 0.79097223, + "num_input_tokens_seen": 116773440, + "step": 5436, + "time_per_iteration": 2.571539878845215 + }, + { + "auxiliary_loss_clip": 0.01084213, + "auxiliary_loss_mlp": 0.01030392, + "balance_loss_clip": 1.03252792, + "balance_loss_mlp": 1.01734495, + "epoch": 0.3268901247557493, + "flos": 25630200005760.0, + "grad_norm": 1.7614781163303477, + "language_loss": 0.72158927, + "learning_rate": 3.144448433811134e-06, + "loss": 0.74273527, + "num_input_tokens_seen": 116794375, + "step": 5437, + "time_per_iteration": 2.6511545181274414 + }, + { + "auxiliary_loss_clip": 0.01040131, + "auxiliary_loss_mlp": 0.01040375, + "balance_loss_clip": 1.02676439, + "balance_loss_mlp": 1.02485979, + "epoch": 0.32695024800841727, + "flos": 24860849575680.0, + "grad_norm": 1.5425522054863308, + "language_loss": 0.63634777, + "learning_rate": 3.144129015673189e-06, + "loss": 0.65715277, + "num_input_tokens_seen": 116815095, + "step": 5438, + "time_per_iteration": 2.708176612854004 + }, + { + "auxiliary_loss_clip": 0.01075377, + "auxiliary_loss_mlp": 0.01032674, + "balance_loss_clip": 1.03268433, + "balance_loss_mlp": 1.01861882, + "epoch": 0.32701037126108523, + "flos": 28839249072000.0, + "grad_norm": 2.3622113538116247, + "language_loss": 0.7469846, + "learning_rate": 3.1438095541497576e-06, + "loss": 0.76806515, + "num_input_tokens_seen": 116836630, + "step": 5439, + "time_per_iteration": 2.647458553314209 + }, + { + "auxiliary_loss_clip": 0.01075227, + "auxiliary_loss_mlp": 0.01040462, + "balance_loss_clip": 1.03237188, + "balance_loss_mlp": 1.02616262, + "epoch": 0.3270704945137532, + "flos": 27965075777280.0, + "grad_norm": 2.344614778835889, + "language_loss": 0.75082386, + "learning_rate": 3.1434900492529527e-06, + "loss": 0.77198076, + "num_input_tokens_seen": 116856880, + "step": 5440, + "time_per_iteration": 2.6506683826446533 + }, + { + "auxiliary_loss_clip": 0.01072405, + "auxiliary_loss_mlp": 0.00747902, + "balance_loss_clip": 1.03120387, + "balance_loss_mlp": 1.00025022, + "epoch": 0.32713061776642116, + "flos": 23690700213120.0, + "grad_norm": 100.2623527548657, + "language_loss": 0.84714985, + "learning_rate": 3.1431705009948914e-06, + "loss": 0.86535293, + "num_input_tokens_seen": 116873770, + "step": 5441, + "time_per_iteration": 2.657148599624634 + }, + { + "auxiliary_loss_clip": 0.01067396, + "auxiliary_loss_mlp": 0.0103885, + "balance_loss_clip": 1.02750194, + "balance_loss_mlp": 1.02445567, + "epoch": 0.3271907410190891, + "flos": 22455625017600.0, + "grad_norm": 1.8613223427219294, + "language_loss": 0.86474323, + "learning_rate": 3.1428509093876897e-06, + "loss": 0.88580567, + "num_input_tokens_seen": 116891225, + "step": 5442, + "time_per_iteration": 2.589137077331543 + }, + { + "auxiliary_loss_clip": 0.01049045, + "auxiliary_loss_mlp": 0.01040674, + "balance_loss_clip": 1.03026342, + "balance_loss_mlp": 1.02527177, + "epoch": 0.3272508642717571, + "flos": 22820118278400.0, + "grad_norm": 1.863328603364177, + "language_loss": 0.77556336, + "learning_rate": 3.1425312744434668e-06, + "loss": 0.79646063, + "num_input_tokens_seen": 116912300, + "step": 5443, + "time_per_iteration": 2.7448203563690186 + }, + { + "auxiliary_loss_clip": 0.01056519, + "auxiliary_loss_mlp": 0.00747797, + "balance_loss_clip": 1.03087854, + "balance_loss_mlp": 1.00024986, + "epoch": 0.32731098752442506, + "flos": 11801360413440.0, + "grad_norm": 2.90074152840356, + "language_loss": 0.8154887, + "learning_rate": 3.142211596174343e-06, + "loss": 0.83353198, + "num_input_tokens_seen": 116929425, + "step": 5444, + "time_per_iteration": 2.810279607772827 + }, + { + "auxiliary_loss_clip": 0.01040433, + "auxiliary_loss_mlp": 0.01033625, + "balance_loss_clip": 1.02888918, + "balance_loss_mlp": 1.01986241, + "epoch": 0.327371110777093, + "flos": 21027780506880.0, + "grad_norm": 2.4471337136037317, + "language_loss": 0.58991086, + "learning_rate": 3.1418918745924423e-06, + "loss": 0.61065143, + "num_input_tokens_seen": 116948255, + "step": 5445, + "time_per_iteration": 2.7482354640960693 + }, + { + "auxiliary_loss_clip": 0.01077478, + "auxiliary_loss_mlp": 0.01037968, + "balance_loss_clip": 1.03253651, + "balance_loss_mlp": 1.02362728, + "epoch": 0.327431234029761, + "flos": 19062102677760.0, + "grad_norm": 2.339884523329886, + "language_loss": 0.88603604, + "learning_rate": 3.1415721097098865e-06, + "loss": 0.9071905, + "num_input_tokens_seen": 116964905, + "step": 5446, + "time_per_iteration": 2.5608487129211426 + }, + { + "auxiliary_loss_clip": 0.01072347, + "auxiliary_loss_mlp": 0.01042386, + "balance_loss_clip": 1.03264165, + "balance_loss_mlp": 1.02648342, + "epoch": 0.32749135728242895, + "flos": 25849219184640.0, + "grad_norm": 1.647050815878882, + "language_loss": 0.78865296, + "learning_rate": 3.141252301538802e-06, + "loss": 0.80980027, + "num_input_tokens_seen": 116983650, + "step": 5447, + "time_per_iteration": 2.6711840629577637 + }, + { + "auxiliary_loss_clip": 0.0105257, + "auxiliary_loss_mlp": 0.00747982, + "balance_loss_clip": 1.02742314, + "balance_loss_mlp": 1.00031805, + "epoch": 0.327551480535097, + "flos": 20120533764480.0, + "grad_norm": 2.109656022153761, + "language_loss": 0.72793078, + "learning_rate": 3.1409324500913157e-06, + "loss": 0.74593627, + "num_input_tokens_seen": 117003265, + "step": 5448, + "time_per_iteration": 2.606689691543579 + }, + { + "auxiliary_loss_clip": 0.01085557, + "auxiliary_loss_mlp": 0.01040455, + "balance_loss_clip": 1.03172457, + "balance_loss_mlp": 1.02663898, + "epoch": 0.32761160378776494, + "flos": 28803553931520.0, + "grad_norm": 1.4251095641015232, + "language_loss": 0.67023259, + "learning_rate": 3.1406125553795567e-06, + "loss": 0.69149268, + "num_input_tokens_seen": 117025370, + "step": 5449, + "time_per_iteration": 2.6492273807525635 + }, + { + "auxiliary_loss_clip": 0.01053646, + "auxiliary_loss_mlp": 0.01038151, + "balance_loss_clip": 1.03015924, + "balance_loss_mlp": 1.02522254, + "epoch": 0.3276717270404329, + "flos": 26937778803840.0, + "grad_norm": 1.8290064243412218, + "language_loss": 0.65989101, + "learning_rate": 3.1402926174156556e-06, + "loss": 0.68080902, + "num_input_tokens_seen": 117044350, + "step": 5450, + "time_per_iteration": 2.727917432785034 + }, + { + "auxiliary_loss_clip": 0.01072704, + "auxiliary_loss_mlp": 0.0103822, + "balance_loss_clip": 1.03140914, + "balance_loss_mlp": 1.02428997, + "epoch": 0.32773185029310087, + "flos": 25338425829120.0, + "grad_norm": 1.5614811027182163, + "language_loss": 0.77598888, + "learning_rate": 3.1399726362117437e-06, + "loss": 0.79709816, + "num_input_tokens_seen": 117064450, + "step": 5451, + "time_per_iteration": 2.6725008487701416 + }, + { + "auxiliary_loss_clip": 0.01078035, + "auxiliary_loss_mlp": 0.01041026, + "balance_loss_clip": 1.03204823, + "balance_loss_mlp": 1.02588606, + "epoch": 0.32779197354576883, + "flos": 26391721271040.0, + "grad_norm": 2.1987152295649697, + "language_loss": 0.70390105, + "learning_rate": 3.1396526117799555e-06, + "loss": 0.7250917, + "num_input_tokens_seen": 117083060, + "step": 5452, + "time_per_iteration": 2.640834093093872 + }, + { + "auxiliary_loss_clip": 0.01059615, + "auxiliary_loss_mlp": 0.01035973, + "balance_loss_clip": 1.02898932, + "balance_loss_mlp": 1.02280641, + "epoch": 0.3278520967984368, + "flos": 24899381890560.0, + "grad_norm": 1.8608457743571645, + "language_loss": 0.78465831, + "learning_rate": 3.1393325441324256e-06, + "loss": 0.80561417, + "num_input_tokens_seen": 117101860, + "step": 5453, + "time_per_iteration": 2.670259714126587 + }, + { + "auxiliary_loss_clip": 0.01075083, + "auxiliary_loss_mlp": 0.01032111, + "balance_loss_clip": 1.0307467, + "balance_loss_mlp": 1.01880121, + "epoch": 0.32791222005110476, + "flos": 29752996176000.0, + "grad_norm": 1.8700996240730743, + "language_loss": 0.7529642, + "learning_rate": 3.1390124332812916e-06, + "loss": 0.77403617, + "num_input_tokens_seen": 117123100, + "step": 5454, + "time_per_iteration": 2.689103126525879 + }, + { + "auxiliary_loss_clip": 0.01014535, + "auxiliary_loss_mlp": 0.01036153, + "balance_loss_clip": 1.02236724, + "balance_loss_mlp": 1.02326012, + "epoch": 0.32797234330377273, + "flos": 16508064072960.0, + "grad_norm": 3.551162557094289, + "language_loss": 0.76787126, + "learning_rate": 3.1386922792386924e-06, + "loss": 0.78837812, + "num_input_tokens_seen": 117140515, + "step": 5455, + "time_per_iteration": 2.796151876449585 + }, + { + "auxiliary_loss_clip": 0.01073482, + "auxiliary_loss_mlp": 0.01041169, + "balance_loss_clip": 1.03046525, + "balance_loss_mlp": 1.02600586, + "epoch": 0.3280324665564407, + "flos": 26577918397440.0, + "grad_norm": 1.817949962755801, + "language_loss": 0.73704326, + "learning_rate": 3.138372082016768e-06, + "loss": 0.7581898, + "num_input_tokens_seen": 117161485, + "step": 5456, + "time_per_iteration": 2.676584482192993 + }, + { + "auxiliary_loss_clip": 0.01087237, + "auxiliary_loss_mlp": 0.01044392, + "balance_loss_clip": 1.03210366, + "balance_loss_mlp": 1.03030121, + "epoch": 0.32809258980910866, + "flos": 22929969047040.0, + "grad_norm": 1.6572837912608298, + "language_loss": 0.78243816, + "learning_rate": 3.1380518416276596e-06, + "loss": 0.80375445, + "num_input_tokens_seen": 117181870, + "step": 5457, + "time_per_iteration": 2.6634693145751953 + }, + { + "auxiliary_loss_clip": 0.01055097, + "auxiliary_loss_mlp": 0.01037225, + "balance_loss_clip": 1.02909517, + "balance_loss_mlp": 1.02361178, + "epoch": 0.3281527130617766, + "flos": 22783848520320.0, + "grad_norm": 4.38094583402175, + "language_loss": 0.78918272, + "learning_rate": 3.1377315580835115e-06, + "loss": 0.81010592, + "num_input_tokens_seen": 117201380, + "step": 5458, + "time_per_iteration": 2.760049343109131 + }, + { + "auxiliary_loss_clip": 0.01073826, + "auxiliary_loss_mlp": 0.01042444, + "balance_loss_clip": 1.0328083, + "balance_loss_mlp": 1.02854979, + "epoch": 0.3282128363144446, + "flos": 21250678354560.0, + "grad_norm": 4.069554049828606, + "language_loss": 0.73057717, + "learning_rate": 3.1374112313964686e-06, + "loss": 0.75173986, + "num_input_tokens_seen": 117221040, + "step": 5459, + "time_per_iteration": 2.665492057800293 + }, + { + "auxiliary_loss_clip": 0.01065982, + "auxiliary_loss_mlp": 0.01037427, + "balance_loss_clip": 1.03192782, + "balance_loss_mlp": 1.0242722, + "epoch": 0.32827295956711255, + "flos": 30843064166400.0, + "grad_norm": 1.993537179561115, + "language_loss": 0.84357154, + "learning_rate": 3.1370908615786783e-06, + "loss": 0.86460561, + "num_input_tokens_seen": 117241395, + "step": 5460, + "time_per_iteration": 2.699869394302368 + }, + { + "auxiliary_loss_clip": 0.01083666, + "auxiliary_loss_mlp": 0.01031921, + "balance_loss_clip": 1.02973676, + "balance_loss_mlp": 1.01914811, + "epoch": 0.3283330828197806, + "flos": 25915006944000.0, + "grad_norm": 2.389444327594609, + "language_loss": 0.76308388, + "learning_rate": 3.136770448642288e-06, + "loss": 0.78423977, + "num_input_tokens_seen": 117259340, + "step": 5461, + "time_per_iteration": 2.6144111156463623 + }, + { + "auxiliary_loss_clip": 0.01064407, + "auxiliary_loss_mlp": 0.01036095, + "balance_loss_clip": 1.02939367, + "balance_loss_mlp": 1.02037776, + "epoch": 0.32839320607244854, + "flos": 38582065042560.0, + "grad_norm": 1.7395517756741403, + "language_loss": 0.62564713, + "learning_rate": 3.1364499925994484e-06, + "loss": 0.64665216, + "num_input_tokens_seen": 117282375, + "step": 5462, + "time_per_iteration": 2.725968360900879 + }, + { + "auxiliary_loss_clip": 0.01082099, + "auxiliary_loss_mlp": 0.00747854, + "balance_loss_clip": 1.03006601, + "balance_loss_mlp": 1.00026262, + "epoch": 0.3284533293251165, + "flos": 26650888876800.0, + "grad_norm": 2.469981836339072, + "language_loss": 0.7826643, + "learning_rate": 3.1361294934623115e-06, + "loss": 0.80096376, + "num_input_tokens_seen": 117303830, + "step": 5463, + "time_per_iteration": 2.7847766876220703 + }, + { + "auxiliary_loss_clip": 0.01056727, + "auxiliary_loss_mlp": 0.01042491, + "balance_loss_clip": 1.03138685, + "balance_loss_mlp": 1.02803123, + "epoch": 0.32851345257778447, + "flos": 15304158904320.0, + "grad_norm": 2.8635492673125587, + "language_loss": 0.6978302, + "learning_rate": 3.1358089512430303e-06, + "loss": 0.71882236, + "num_input_tokens_seen": 117320665, + "step": 5464, + "time_per_iteration": 2.6273319721221924 + }, + { + "auxiliary_loss_clip": 0.01074725, + "auxiliary_loss_mlp": 0.01038621, + "balance_loss_clip": 1.03231096, + "balance_loss_mlp": 1.02442884, + "epoch": 0.32857357583045244, + "flos": 23513732881920.0, + "grad_norm": 1.610136829607299, + "language_loss": 0.72248393, + "learning_rate": 3.1354883659537594e-06, + "loss": 0.74361742, + "num_input_tokens_seen": 117339795, + "step": 5465, + "time_per_iteration": 4.1776323318481445 + }, + { + "auxiliary_loss_clip": 0.01058409, + "auxiliary_loss_mlp": 0.01043613, + "balance_loss_clip": 1.03083348, + "balance_loss_mlp": 1.02927232, + "epoch": 0.3286336990831204, + "flos": 20995209849600.0, + "grad_norm": 10.38024942624597, + "language_loss": 0.82926917, + "learning_rate": 3.1351677376066567e-06, + "loss": 0.8502894, + "num_input_tokens_seen": 117359525, + "step": 5466, + "time_per_iteration": 4.221863269805908 + }, + { + "auxiliary_loss_clip": 0.01065589, + "auxiliary_loss_mlp": 0.0103634, + "balance_loss_clip": 1.03109205, + "balance_loss_mlp": 1.02279234, + "epoch": 0.32869382233578837, + "flos": 23658811914240.0, + "grad_norm": 1.7909040182979215, + "language_loss": 0.79807377, + "learning_rate": 3.134847066213879e-06, + "loss": 0.81909311, + "num_input_tokens_seen": 117380320, + "step": 5467, + "time_per_iteration": 2.6223058700561523 + }, + { + "auxiliary_loss_clip": 0.01063304, + "auxiliary_loss_mlp": 0.01030401, + "balance_loss_clip": 1.02947545, + "balance_loss_mlp": 1.01713908, + "epoch": 0.32875394558845633, + "flos": 25336522408320.0, + "grad_norm": 1.778380768780634, + "language_loss": 0.74384362, + "learning_rate": 3.134526351787587e-06, + "loss": 0.76478064, + "num_input_tokens_seen": 117400695, + "step": 5468, + "time_per_iteration": 2.659271717071533 + }, + { + "auxiliary_loss_clip": 0.01056109, + "auxiliary_loss_mlp": 0.01041608, + "balance_loss_clip": 1.02823853, + "balance_loss_mlp": 1.02597976, + "epoch": 0.3288140688411243, + "flos": 14903108576640.0, + "grad_norm": 1.971217631370473, + "language_loss": 0.78542101, + "learning_rate": 3.134205594339942e-06, + "loss": 0.80639815, + "num_input_tokens_seen": 117418800, + "step": 5469, + "time_per_iteration": 2.7397208213806152 + }, + { + "auxiliary_loss_clip": 0.01047962, + "auxiliary_loss_mlp": 0.01032498, + "balance_loss_clip": 1.02828765, + "balance_loss_mlp": 1.01921153, + "epoch": 0.32887419209379226, + "flos": 18551345235840.0, + "grad_norm": 2.612585205843501, + "language_loss": 0.81767839, + "learning_rate": 3.133884793883107e-06, + "loss": 0.83848298, + "num_input_tokens_seen": 117438220, + "step": 5470, + "time_per_iteration": 2.6890249252319336 + }, + { + "auxiliary_loss_clip": 0.01084107, + "auxiliary_loss_mlp": 0.01036255, + "balance_loss_clip": 1.02968884, + "balance_loss_mlp": 1.02243233, + "epoch": 0.3289343153464602, + "flos": 48105610439040.0, + "grad_norm": 1.7496648782610293, + "language_loss": 0.67523879, + "learning_rate": 3.1335639504292478e-06, + "loss": 0.69644243, + "num_input_tokens_seen": 117462560, + "step": 5471, + "time_per_iteration": 2.749318838119507 + }, + { + "auxiliary_loss_clip": 0.0108848, + "auxiliary_loss_mlp": 0.01037554, + "balance_loss_clip": 1.0320698, + "balance_loss_mlp": 1.02157378, + "epoch": 0.3289944385991282, + "flos": 27600295207680.0, + "grad_norm": 1.649963651330893, + "language_loss": 0.6514042, + "learning_rate": 3.1332430639905288e-06, + "loss": 0.67266452, + "num_input_tokens_seen": 117483665, + "step": 5472, + "time_per_iteration": 2.578450918197632 + }, + { + "auxiliary_loss_clip": 0.0107707, + "auxiliary_loss_mlp": 0.01044102, + "balance_loss_clip": 1.03238845, + "balance_loss_mlp": 1.02894473, + "epoch": 0.32905456185179616, + "flos": 20120318282880.0, + "grad_norm": 1.653561346346052, + "language_loss": 0.88005614, + "learning_rate": 3.13292213457912e-06, + "loss": 0.90126789, + "num_input_tokens_seen": 117503565, + "step": 5473, + "time_per_iteration": 4.156154632568359 + }, + { + "auxiliary_loss_clip": 0.0104662, + "auxiliary_loss_mlp": 0.01041083, + "balance_loss_clip": 1.02827406, + "balance_loss_mlp": 1.02514505, + "epoch": 0.3291146851044642, + "flos": 23180230080000.0, + "grad_norm": 1.9339182783389275, + "language_loss": 0.78543687, + "learning_rate": 3.1326011622071903e-06, + "loss": 0.80631387, + "num_input_tokens_seen": 117521460, + "step": 5474, + "time_per_iteration": 4.264777183532715 + }, + { + "auxiliary_loss_clip": 0.01006701, + "auxiliary_loss_mlp": 0.01021241, + "balance_loss_clip": 1.0145731, + "balance_loss_mlp": 1.01898789, + "epoch": 0.32917480835713214, + "flos": 67621912594560.0, + "grad_norm": 0.8096650435201366, + "language_loss": 0.60228419, + "learning_rate": 3.132280146886911e-06, + "loss": 0.6225636, + "num_input_tokens_seen": 117580550, + "step": 5475, + "time_per_iteration": 3.2321670055389404 + }, + { + "auxiliary_loss_clip": 0.01036697, + "auxiliary_loss_mlp": 0.01055681, + "balance_loss_clip": 1.02633667, + "balance_loss_mlp": 1.03759694, + "epoch": 0.3292349316098001, + "flos": 27964537073280.0, + "grad_norm": 4.008912379713856, + "language_loss": 0.77074325, + "learning_rate": 3.131959088630455e-06, + "loss": 0.79166698, + "num_input_tokens_seen": 117600645, + "step": 5476, + "time_per_iteration": 2.801748037338257 + }, + { + "auxiliary_loss_clip": 0.01046361, + "auxiliary_loss_mlp": 0.0104354, + "balance_loss_clip": 1.0291822, + "balance_loss_mlp": 1.02929449, + "epoch": 0.3292950548624681, + "flos": 20263673462400.0, + "grad_norm": 1.8619101022832856, + "language_loss": 0.74867493, + "learning_rate": 3.131637987449997e-06, + "loss": 0.76957399, + "num_input_tokens_seen": 117618880, + "step": 5477, + "time_per_iteration": 2.8474066257476807 + }, + { + "auxiliary_loss_clip": 0.01081645, + "auxiliary_loss_mlp": 0.01035179, + "balance_loss_clip": 1.03034163, + "balance_loss_mlp": 1.02250755, + "epoch": 0.32935517811513604, + "flos": 20812999132800.0, + "grad_norm": 2.243040282530106, + "language_loss": 0.75239438, + "learning_rate": 3.131316843357713e-06, + "loss": 0.77356261, + "num_input_tokens_seen": 117636445, + "step": 5478, + "time_per_iteration": 2.6151797771453857 + }, + { + "auxiliary_loss_clip": 0.01072445, + "auxiliary_loss_mlp": 0.01042099, + "balance_loss_clip": 1.03126264, + "balance_loss_mlp": 1.02879572, + "epoch": 0.329415301367804, + "flos": 18441853603200.0, + "grad_norm": 2.209275620142526, + "language_loss": 0.80302602, + "learning_rate": 3.1309956563657807e-06, + "loss": 0.82417142, + "num_input_tokens_seen": 117653105, + "step": 5479, + "time_per_iteration": 2.74871826171875 + }, + { + "auxiliary_loss_clip": 0.01004282, + "auxiliary_loss_mlp": 0.01004725, + "balance_loss_clip": 1.01134515, + "balance_loss_mlp": 1.0029254, + "epoch": 0.32947542462047197, + "flos": 66323024887680.0, + "grad_norm": 0.7482974290309935, + "language_loss": 0.56515312, + "learning_rate": 3.1306744264863804e-06, + "loss": 0.58524323, + "num_input_tokens_seen": 117719225, + "step": 5480, + "time_per_iteration": 3.259993076324463 + }, + { + "auxiliary_loss_clip": 0.01068581, + "auxiliary_loss_mlp": 0.0074783, + "balance_loss_clip": 1.0295372, + "balance_loss_mlp": 1.00020862, + "epoch": 0.32953554787313993, + "flos": 23221599569280.0, + "grad_norm": 1.7915927585875182, + "language_loss": 0.76932478, + "learning_rate": 3.1303531537316915e-06, + "loss": 0.78748882, + "num_input_tokens_seen": 117738725, + "step": 5481, + "time_per_iteration": 2.7434134483337402 + }, + { + "auxiliary_loss_clip": 0.01069639, + "auxiliary_loss_mlp": 0.01035295, + "balance_loss_clip": 1.03468347, + "balance_loss_mlp": 1.02194333, + "epoch": 0.3295956711258079, + "flos": 27009492307200.0, + "grad_norm": 1.5894828141774364, + "language_loss": 0.78480637, + "learning_rate": 3.130031838113899e-06, + "loss": 0.80585563, + "num_input_tokens_seen": 117757765, + "step": 5482, + "time_per_iteration": 2.7369112968444824 + }, + { + "auxiliary_loss_clip": 0.01076938, + "auxiliary_loss_mlp": 0.01044615, + "balance_loss_clip": 1.03174686, + "balance_loss_mlp": 1.02999377, + "epoch": 0.32965579437847586, + "flos": 19171702051200.0, + "grad_norm": 2.561929296961136, + "language_loss": 0.73710418, + "learning_rate": 3.129710479645185e-06, + "loss": 0.75831962, + "num_input_tokens_seen": 117776810, + "step": 5483, + "time_per_iteration": 2.6701014041900635 + }, + { + "auxiliary_loss_clip": 0.0107122, + "auxiliary_loss_mlp": 0.0104458, + "balance_loss_clip": 1.03328753, + "balance_loss_mlp": 1.03076327, + "epoch": 0.32971591763114383, + "flos": 30482521401600.0, + "grad_norm": 1.4937589396782804, + "language_loss": 0.75755429, + "learning_rate": 3.1293890783377366e-06, + "loss": 0.77871227, + "num_input_tokens_seen": 117797730, + "step": 5484, + "time_per_iteration": 2.6334218978881836 + }, + { + "auxiliary_loss_clip": 0.01089863, + "auxiliary_loss_mlp": 0.01044202, + "balance_loss_clip": 1.03629541, + "balance_loss_mlp": 1.02989066, + "epoch": 0.3297760408838118, + "flos": 16289583598080.0, + "grad_norm": 2.0090415987889805, + "language_loss": 0.71353322, + "learning_rate": 3.129067634203742e-06, + "loss": 0.73487389, + "num_input_tokens_seen": 117815365, + "step": 5485, + "time_per_iteration": 2.5297350883483887 + }, + { + "auxiliary_loss_clip": 0.01036699, + "auxiliary_loss_mlp": 0.01041455, + "balance_loss_clip": 1.03393567, + "balance_loss_mlp": 1.02841389, + "epoch": 0.32983616413647976, + "flos": 29530924341120.0, + "grad_norm": 1.5733368420874143, + "language_loss": 0.80602413, + "learning_rate": 3.128746147255388e-06, + "loss": 0.82680571, + "num_input_tokens_seen": 117836095, + "step": 5486, + "time_per_iteration": 2.752624034881592 + }, + { + "auxiliary_loss_clip": 0.01057252, + "auxiliary_loss_mlp": 0.01040994, + "balance_loss_clip": 1.03106737, + "balance_loss_mlp": 1.02634335, + "epoch": 0.3298962873891478, + "flos": 20631398947200.0, + "grad_norm": 2.0251528081679235, + "language_loss": 0.8457877, + "learning_rate": 3.1284246175048683e-06, + "loss": 0.86677015, + "num_input_tokens_seen": 117854655, + "step": 5487, + "time_per_iteration": 2.6383416652679443 + }, + { + "auxiliary_loss_clip": 0.01030466, + "auxiliary_loss_mlp": 0.01037311, + "balance_loss_clip": 1.02617836, + "balance_loss_mlp": 1.02147985, + "epoch": 0.32995641064181574, + "flos": 14976007228800.0, + "grad_norm": 2.5739994104245327, + "language_loss": 0.74240142, + "learning_rate": 3.1281030449643735e-06, + "loss": 0.76307917, + "num_input_tokens_seen": 117873300, + "step": 5488, + "time_per_iteration": 2.7776315212249756 + }, + { + "auxiliary_loss_clip": 0.01087903, + "auxiliary_loss_mlp": 0.01039642, + "balance_loss_clip": 1.034127, + "balance_loss_mlp": 1.02564716, + "epoch": 0.3300165338944837, + "flos": 18661447399680.0, + "grad_norm": 2.5614353009282205, + "language_loss": 0.7197392, + "learning_rate": 3.127781429646098e-06, + "loss": 0.7410146, + "num_input_tokens_seen": 117891540, + "step": 5489, + "time_per_iteration": 2.537320375442505 + }, + { + "auxiliary_loss_clip": 0.01084426, + "auxiliary_loss_mlp": 0.01035799, + "balance_loss_clip": 1.03143418, + "balance_loss_mlp": 1.02231026, + "epoch": 0.3300766571471517, + "flos": 25583730785280.0, + "grad_norm": 2.8391819558044666, + "language_loss": 0.88273138, + "learning_rate": 3.127459771562238e-06, + "loss": 0.90393358, + "num_input_tokens_seen": 117907690, + "step": 5490, + "time_per_iteration": 2.5445444583892822 + }, + { + "auxiliary_loss_clip": 0.01074897, + "auxiliary_loss_mlp": 0.01033382, + "balance_loss_clip": 1.03117108, + "balance_loss_mlp": 1.02020955, + "epoch": 0.33013678039981964, + "flos": 11363501623680.0, + "grad_norm": 2.0524513658819097, + "language_loss": 0.8310163, + "learning_rate": 3.1271380707249907e-06, + "loss": 0.85209906, + "num_input_tokens_seen": 117925640, + "step": 5491, + "time_per_iteration": 2.552114248275757 + }, + { + "auxiliary_loss_clip": 0.0106805, + "auxiliary_loss_mlp": 0.01039811, + "balance_loss_clip": 1.03466082, + "balance_loss_mlp": 1.0262208, + "epoch": 0.3301969036524876, + "flos": 24821203939200.0, + "grad_norm": 1.78165996299874, + "language_loss": 0.77322829, + "learning_rate": 3.126816327146554e-06, + "loss": 0.79430687, + "num_input_tokens_seen": 117944525, + "step": 5492, + "time_per_iteration": 2.6735377311706543 + }, + { + "auxiliary_loss_clip": 0.01093007, + "auxiliary_loss_mlp": 0.01044875, + "balance_loss_clip": 1.0362916, + "balance_loss_mlp": 1.03002119, + "epoch": 0.33025702690515557, + "flos": 15961144613760.0, + "grad_norm": 2.2554228953539104, + "language_loss": 0.74617589, + "learning_rate": 3.12649454083913e-06, + "loss": 0.76755464, + "num_input_tokens_seen": 117962515, + "step": 5493, + "time_per_iteration": 2.5337231159210205 + }, + { + "auxiliary_loss_clip": 0.00982477, + "auxiliary_loss_mlp": 0.01006606, + "balance_loss_clip": 1.00970769, + "balance_loss_mlp": 1.00432956, + "epoch": 0.33031715015782354, + "flos": 59416755989760.0, + "grad_norm": 0.7839794962043005, + "language_loss": 0.53928959, + "learning_rate": 3.12617271181492e-06, + "loss": 0.55918044, + "num_input_tokens_seen": 118018780, + "step": 5494, + "time_per_iteration": 3.2438101768493652 + }, + { + "auxiliary_loss_clip": 0.01062372, + "auxiliary_loss_mlp": 0.01037805, + "balance_loss_clip": 1.03011477, + "balance_loss_mlp": 1.02389312, + "epoch": 0.3303772734104915, + "flos": 23184360144000.0, + "grad_norm": 2.50207640524128, + "language_loss": 0.87041509, + "learning_rate": 3.1258508400861276e-06, + "loss": 0.89141691, + "num_input_tokens_seen": 118038610, + "step": 5495, + "time_per_iteration": 2.6730751991271973 + }, + { + "auxiliary_loss_clip": 0.01059324, + "auxiliary_loss_mlp": 0.01045317, + "balance_loss_clip": 1.03355074, + "balance_loss_mlp": 1.03016567, + "epoch": 0.33043739666315947, + "flos": 33071896010880.0, + "grad_norm": 1.9352836177162973, + "language_loss": 0.7339344, + "learning_rate": 3.1255289256649587e-06, + "loss": 0.7549808, + "num_input_tokens_seen": 118055905, + "step": 5496, + "time_per_iteration": 2.8192195892333984 + }, + { + "auxiliary_loss_clip": 0.01052995, + "auxiliary_loss_mlp": 0.01032111, + "balance_loss_clip": 1.02892494, + "balance_loss_mlp": 1.01858079, + "epoch": 0.33049751991582743, + "flos": 24895431394560.0, + "grad_norm": 5.772944121442944, + "language_loss": 0.72018182, + "learning_rate": 3.1252069685636196e-06, + "loss": 0.74103284, + "num_input_tokens_seen": 118073695, + "step": 5497, + "time_per_iteration": 2.672367572784424 + }, + { + "auxiliary_loss_clip": 0.01063367, + "auxiliary_loss_mlp": 0.01037492, + "balance_loss_clip": 1.03159678, + "balance_loss_mlp": 1.02383649, + "epoch": 0.3305576431684954, + "flos": 29460575554560.0, + "grad_norm": 2.077587497387801, + "language_loss": 0.8002317, + "learning_rate": 3.124884968794321e-06, + "loss": 0.82124031, + "num_input_tokens_seen": 118094030, + "step": 5498, + "time_per_iteration": 2.6887547969818115 + }, + { + "auxiliary_loss_clip": 0.01064137, + "auxiliary_loss_mlp": 0.01034401, + "balance_loss_clip": 1.02677691, + "balance_loss_mlp": 1.02035844, + "epoch": 0.33061776642116336, + "flos": 22632305040000.0, + "grad_norm": 2.2412785115333738, + "language_loss": 0.75827557, + "learning_rate": 3.12456292636927e-06, + "loss": 0.77926093, + "num_input_tokens_seen": 118111665, + "step": 5499, + "time_per_iteration": 2.7882704734802246 + }, + { + "auxiliary_loss_clip": 0.01069964, + "auxiliary_loss_mlp": 0.01034575, + "balance_loss_clip": 1.03531909, + "balance_loss_mlp": 1.02071142, + "epoch": 0.3306778896738313, + "flos": 25776320532480.0, + "grad_norm": 1.4769654282288602, + "language_loss": 0.79229426, + "learning_rate": 3.124240841300681e-06, + "loss": 0.81333959, + "num_input_tokens_seen": 118132435, + "step": 5500, + "time_per_iteration": 2.856182098388672 + }, + { + "auxiliary_loss_clip": 0.0107785, + "auxiliary_loss_mlp": 0.0103245, + "balance_loss_clip": 1.033077, + "balance_loss_mlp": 1.01781678, + "epoch": 0.33073801292649935, + "flos": 36940552479360.0, + "grad_norm": 1.968907912558561, + "language_loss": 0.66144061, + "learning_rate": 3.1239187136007665e-06, + "loss": 0.68254364, + "num_input_tokens_seen": 118155255, + "step": 5501, + "time_per_iteration": 2.9100711345672607 + }, + { + "auxiliary_loss_clip": 0.01075844, + "auxiliary_loss_mlp": 0.01045667, + "balance_loss_clip": 1.03096056, + "balance_loss_mlp": 1.03059268, + "epoch": 0.3307981361791673, + "flos": 12967738848000.0, + "grad_norm": 2.2813733990119505, + "language_loss": 0.7732985, + "learning_rate": 3.1235965432817417e-06, + "loss": 0.79451358, + "num_input_tokens_seen": 118169865, + "step": 5502, + "time_per_iteration": 2.7559995651245117 + }, + { + "auxiliary_loss_clip": 0.01070266, + "auxiliary_loss_mlp": 0.01037457, + "balance_loss_clip": 1.03633857, + "balance_loss_mlp": 1.02257967, + "epoch": 0.3308582594318353, + "flos": 25374372364800.0, + "grad_norm": 1.5594632602218792, + "language_loss": 0.72261083, + "learning_rate": 3.123274330355824e-06, + "loss": 0.74368805, + "num_input_tokens_seen": 118190760, + "step": 5503, + "time_per_iteration": 2.667071580886841 + }, + { + "auxiliary_loss_clip": 0.01054864, + "auxiliary_loss_mlp": 0.0103934, + "balance_loss_clip": 1.02665186, + "balance_loss_mlp": 1.02403402, + "epoch": 0.33091838268450324, + "flos": 26468570419200.0, + "grad_norm": 1.6377941023268703, + "language_loss": 0.74966562, + "learning_rate": 3.12295207483523e-06, + "loss": 0.77060759, + "num_input_tokens_seen": 118213620, + "step": 5504, + "time_per_iteration": 2.7220020294189453 + }, + { + "auxiliary_loss_clip": 0.0106152, + "auxiliary_loss_mlp": 0.01037005, + "balance_loss_clip": 1.03063273, + "balance_loss_mlp": 1.02316487, + "epoch": 0.3309785059371712, + "flos": 24971167221120.0, + "grad_norm": 1.6149693970667174, + "language_loss": 0.69797355, + "learning_rate": 3.1226297767321816e-06, + "loss": 0.7189588, + "num_input_tokens_seen": 118235010, + "step": 5505, + "time_per_iteration": 2.833284378051758 + }, + { + "auxiliary_loss_clip": 0.01066799, + "auxiliary_loss_mlp": 0.01041085, + "balance_loss_clip": 1.03095293, + "balance_loss_mlp": 1.02754903, + "epoch": 0.3310386291898392, + "flos": 20446710192000.0, + "grad_norm": 1.741201481342828, + "language_loss": 0.82315266, + "learning_rate": 3.122307436058899e-06, + "loss": 0.84423149, + "num_input_tokens_seen": 118255820, + "step": 5506, + "time_per_iteration": 2.78818416595459 + }, + { + "auxiliary_loss_clip": 0.01071598, + "auxiliary_loss_mlp": 0.01034417, + "balance_loss_clip": 1.03118515, + "balance_loss_mlp": 1.02009964, + "epoch": 0.33109875244250714, + "flos": 23182672204800.0, + "grad_norm": 2.2670026763592217, + "language_loss": 0.79260868, + "learning_rate": 3.121985052827606e-06, + "loss": 0.81366885, + "num_input_tokens_seen": 118274160, + "step": 5507, + "time_per_iteration": 2.7718217372894287 + }, + { + "auxiliary_loss_clip": 0.01062917, + "auxiliary_loss_mlp": 0.01046745, + "balance_loss_clip": 1.03000426, + "balance_loss_mlp": 1.0324223, + "epoch": 0.3311588756951751, + "flos": 24168384207360.0, + "grad_norm": 1.5905957899198817, + "language_loss": 0.71439654, + "learning_rate": 3.1216626270505274e-06, + "loss": 0.73549318, + "num_input_tokens_seen": 118294385, + "step": 5508, + "time_per_iteration": 2.8385918140411377 + }, + { + "auxiliary_loss_clip": 0.01051403, + "auxiliary_loss_mlp": 0.01034963, + "balance_loss_clip": 1.02908015, + "balance_loss_mlp": 1.02171242, + "epoch": 0.33121899894784307, + "flos": 28145742209280.0, + "grad_norm": 1.9267166544892005, + "language_loss": 0.72205228, + "learning_rate": 3.12134015873989e-06, + "loss": 0.74291587, + "num_input_tokens_seen": 118313105, + "step": 5509, + "time_per_iteration": 2.8720004558563232 + }, + { + "auxiliary_loss_clip": 0.01075608, + "auxiliary_loss_mlp": 0.01038776, + "balance_loss_clip": 1.03402245, + "balance_loss_mlp": 1.02472675, + "epoch": 0.33127912220051103, + "flos": 29567660976000.0, + "grad_norm": 1.4779905016832247, + "language_loss": 0.73176348, + "learning_rate": 3.121017647907921e-06, + "loss": 0.75290734, + "num_input_tokens_seen": 118335250, + "step": 5510, + "time_per_iteration": 2.8261630535125732 + }, + { + "auxiliary_loss_clip": 0.01037992, + "auxiliary_loss_mlp": 0.01040255, + "balance_loss_clip": 1.02672887, + "balance_loss_mlp": 1.02674317, + "epoch": 0.331339245453179, + "flos": 14428836374400.0, + "grad_norm": 2.043887745234093, + "language_loss": 0.87882853, + "learning_rate": 3.1206950945668508e-06, + "loss": 0.89961106, + "num_input_tokens_seen": 118351470, + "step": 5511, + "time_per_iteration": 4.426092624664307 + }, + { + "auxiliary_loss_clip": 0.0103148, + "auxiliary_loss_mlp": 0.01042057, + "balance_loss_clip": 1.02833891, + "balance_loss_mlp": 1.02855694, + "epoch": 0.33139936870584696, + "flos": 20887118847360.0, + "grad_norm": 1.6416421395817913, + "language_loss": 0.72799432, + "learning_rate": 3.12037249872891e-06, + "loss": 0.74872971, + "num_input_tokens_seen": 118370970, + "step": 5512, + "time_per_iteration": 2.873342752456665 + }, + { + "auxiliary_loss_clip": 0.0104645, + "auxiliary_loss_mlp": 0.01036472, + "balance_loss_clip": 1.02875817, + "balance_loss_mlp": 1.02313209, + "epoch": 0.33145949195851493, + "flos": 36284356869120.0, + "grad_norm": 1.7543352980883358, + "language_loss": 0.71874058, + "learning_rate": 3.1200498604063317e-06, + "loss": 0.73956978, + "num_input_tokens_seen": 118393125, + "step": 5513, + "time_per_iteration": 2.756683826446533 + }, + { + "auxiliary_loss_clip": 0.01056852, + "auxiliary_loss_mlp": 0.01035029, + "balance_loss_clip": 1.03151715, + "balance_loss_mlp": 1.02014589, + "epoch": 0.33151961521118295, + "flos": 14279735018880.0, + "grad_norm": 1.882980237109665, + "language_loss": 0.68109643, + "learning_rate": 3.1197271796113507e-06, + "loss": 0.70201528, + "num_input_tokens_seen": 118410860, + "step": 5514, + "time_per_iteration": 4.793079853057861 + }, + { + "auxiliary_loss_clip": 0.01055836, + "auxiliary_loss_mlp": 0.01046361, + "balance_loss_clip": 1.0283848, + "balance_loss_mlp": 1.02988017, + "epoch": 0.3315797384638509, + "flos": 20774323163520.0, + "grad_norm": 2.8039341377384526, + "language_loss": 0.65865105, + "learning_rate": 3.1194044563562026e-06, + "loss": 0.67967308, + "num_input_tokens_seen": 118429570, + "step": 5515, + "time_per_iteration": 2.8536224365234375 + }, + { + "auxiliary_loss_clip": 0.01073922, + "auxiliary_loss_mlp": 0.0103385, + "balance_loss_clip": 1.03063369, + "balance_loss_mlp": 1.01983118, + "epoch": 0.3316398617165189, + "flos": 24679464871680.0, + "grad_norm": 1.6917815327189365, + "language_loss": 0.69403815, + "learning_rate": 3.1190816906531257e-06, + "loss": 0.7151159, + "num_input_tokens_seen": 118450285, + "step": 5516, + "time_per_iteration": 2.863345146179199 + }, + { + "auxiliary_loss_clip": 0.01074225, + "auxiliary_loss_mlp": 0.01038406, + "balance_loss_clip": 1.02896309, + "balance_loss_mlp": 1.02458942, + "epoch": 0.33169998496918685, + "flos": 18587974129920.0, + "grad_norm": 2.1440508373113403, + "language_loss": 0.80267406, + "learning_rate": 3.118758882514359e-06, + "loss": 0.82380038, + "num_input_tokens_seen": 118468270, + "step": 5517, + "time_per_iteration": 2.5949599742889404 + }, + { + "auxiliary_loss_clip": 0.0106261, + "auxiliary_loss_mlp": 0.01032184, + "balance_loss_clip": 1.02723217, + "balance_loss_mlp": 1.01823592, + "epoch": 0.3317601082218548, + "flos": 20193647898240.0, + "grad_norm": 1.6769783830029787, + "language_loss": 0.74487877, + "learning_rate": 3.118436031952143e-06, + "loss": 0.7658267, + "num_input_tokens_seen": 118486615, + "step": 5518, + "time_per_iteration": 2.7054224014282227 + }, + { + "auxiliary_loss_clip": 0.01000482, + "auxiliary_loss_mlp": 0.01009511, + "balance_loss_clip": 1.00910985, + "balance_loss_mlp": 1.00775898, + "epoch": 0.3318202314745228, + "flos": 68974703637120.0, + "grad_norm": 0.6122389519161642, + "language_loss": 0.54339129, + "learning_rate": 3.1181131389787206e-06, + "loss": 0.56349123, + "num_input_tokens_seen": 118553580, + "step": 5519, + "time_per_iteration": 3.3372225761413574 + }, + { + "auxiliary_loss_clip": 0.01072993, + "auxiliary_loss_mlp": 0.0103498, + "balance_loss_clip": 1.03070343, + "balance_loss_mlp": 1.02089548, + "epoch": 0.33188035472719074, + "flos": 21500113374720.0, + "grad_norm": 2.4724288998692505, + "language_loss": 0.78735197, + "learning_rate": 3.117790203606336e-06, + "loss": 0.80843168, + "num_input_tokens_seen": 118570280, + "step": 5520, + "time_per_iteration": 2.6298751831054688 + }, + { + "auxiliary_loss_clip": 0.01058734, + "auxiliary_loss_mlp": 0.01035157, + "balance_loss_clip": 1.02961111, + "balance_loss_mlp": 1.02240753, + "epoch": 0.3319404779798587, + "flos": 28870490926080.0, + "grad_norm": 1.6953790795123227, + "language_loss": 0.76126099, + "learning_rate": 3.1174672258472344e-06, + "loss": 0.78219986, + "num_input_tokens_seen": 118590455, + "step": 5521, + "time_per_iteration": 5.83334755897522 + }, + { + "auxiliary_loss_clip": 0.01074531, + "auxiliary_loss_mlp": 0.01039909, + "balance_loss_clip": 1.02978218, + "balance_loss_mlp": 1.02543104, + "epoch": 0.33200060123252667, + "flos": 23076915586560.0, + "grad_norm": 1.7835825808095536, + "language_loss": 0.70091027, + "learning_rate": 3.117144205713664e-06, + "loss": 0.72205466, + "num_input_tokens_seen": 118609495, + "step": 5522, + "time_per_iteration": 2.63396954536438 + }, + { + "auxiliary_loss_clip": 0.01060234, + "auxiliary_loss_mlp": 0.0103206, + "balance_loss_clip": 1.02965748, + "balance_loss_mlp": 1.01928639, + "epoch": 0.33206072448519464, + "flos": 21142479611520.0, + "grad_norm": 1.7446691142433193, + "language_loss": 0.73822629, + "learning_rate": 3.1168211432178735e-06, + "loss": 0.75914925, + "num_input_tokens_seen": 118628720, + "step": 5523, + "time_per_iteration": 2.6199963092803955 + }, + { + "auxiliary_loss_clip": 0.01052292, + "auxiliary_loss_mlp": 0.0103552, + "balance_loss_clip": 1.02641201, + "balance_loss_mlp": 1.02176309, + "epoch": 0.3321208477378626, + "flos": 13079097987840.0, + "grad_norm": 1.7559742738496558, + "language_loss": 0.81703651, + "learning_rate": 3.116498038372114e-06, + "loss": 0.83791459, + "num_input_tokens_seen": 118645955, + "step": 5524, + "time_per_iteration": 2.730022668838501 + }, + { + "auxiliary_loss_clip": 0.01049459, + "auxiliary_loss_mlp": 0.00747764, + "balance_loss_clip": 1.02958465, + "balance_loss_mlp": 1.00036132, + "epoch": 0.33218097099053057, + "flos": 21215414177280.0, + "grad_norm": 1.63585467760344, + "language_loss": 0.82945758, + "learning_rate": 3.116174891188636e-06, + "loss": 0.84742981, + "num_input_tokens_seen": 118665605, + "step": 5525, + "time_per_iteration": 2.75248122215271 + }, + { + "auxiliary_loss_clip": 0.01018879, + "auxiliary_loss_mlp": 0.01002683, + "balance_loss_clip": 1.00702333, + "balance_loss_mlp": 1.00084686, + "epoch": 0.33224109424319853, + "flos": 64348979189760.0, + "grad_norm": 0.7642490838590513, + "language_loss": 0.52647513, + "learning_rate": 3.1158517016796945e-06, + "loss": 0.54669076, + "num_input_tokens_seen": 118728155, + "step": 5526, + "time_per_iteration": 3.118210792541504 + }, + { + "auxiliary_loss_clip": 0.01047698, + "auxiliary_loss_mlp": 0.00747889, + "balance_loss_clip": 1.03099597, + "balance_loss_mlp": 1.00037718, + "epoch": 0.33230121749586655, + "flos": 17346003523200.0, + "grad_norm": 2.2510552232400114, + "language_loss": 0.77690423, + "learning_rate": 3.1155284698575445e-06, + "loss": 0.79486012, + "num_input_tokens_seen": 118743955, + "step": 5527, + "time_per_iteration": 2.629044532775879 + }, + { + "auxiliary_loss_clip": 0.01050084, + "auxiliary_loss_mlp": 0.01043388, + "balance_loss_clip": 1.03620028, + "balance_loss_mlp": 1.0296607, + "epoch": 0.3323613407485345, + "flos": 20997041443200.0, + "grad_norm": 1.6524983610097017, + "language_loss": 0.72278529, + "learning_rate": 3.1152051957344434e-06, + "loss": 0.74372005, + "num_input_tokens_seen": 118763275, + "step": 5528, + "time_per_iteration": 2.671114444732666 + }, + { + "auxiliary_loss_clip": 0.0106098, + "auxiliary_loss_mlp": 0.01036094, + "balance_loss_clip": 1.02873588, + "balance_loss_mlp": 1.02279055, + "epoch": 0.3324214640012025, + "flos": 13152535344000.0, + "grad_norm": 2.650060848052893, + "language_loss": 0.82796633, + "learning_rate": 3.1148818793226497e-06, + "loss": 0.84893709, + "num_input_tokens_seen": 118781110, + "step": 5529, + "time_per_iteration": 2.622262716293335 + }, + { + "auxiliary_loss_clip": 0.01057357, + "auxiliary_loss_mlp": 0.00747839, + "balance_loss_clip": 1.02973282, + "balance_loss_mlp": 1.00026822, + "epoch": 0.33248158725387045, + "flos": 22273522041600.0, + "grad_norm": 1.7732235842136583, + "language_loss": 0.69528008, + "learning_rate": 3.114558520634423e-06, + "loss": 0.713332, + "num_input_tokens_seen": 118800620, + "step": 5530, + "time_per_iteration": 2.6174533367156982 + }, + { + "auxiliary_loss_clip": 0.01070333, + "auxiliary_loss_mlp": 0.01040686, + "balance_loss_clip": 1.02996564, + "balance_loss_mlp": 1.02597499, + "epoch": 0.3325417105065384, + "flos": 20740998320640.0, + "grad_norm": 3.1785782451471696, + "language_loss": 0.7684828, + "learning_rate": 3.1142351196820256e-06, + "loss": 0.78959292, + "num_input_tokens_seen": 118818725, + "step": 5531, + "time_per_iteration": 2.6651523113250732 + }, + { + "auxiliary_loss_clip": 0.01067373, + "auxiliary_loss_mlp": 0.01036015, + "balance_loss_clip": 1.0329771, + "balance_loss_mlp": 1.02184105, + "epoch": 0.3326018337592064, + "flos": 24790536702720.0, + "grad_norm": 1.7186737533072858, + "language_loss": 0.73097342, + "learning_rate": 3.1139116764777206e-06, + "loss": 0.75200731, + "num_input_tokens_seen": 118839390, + "step": 5532, + "time_per_iteration": 2.685269594192505 + }, + { + "auxiliary_loss_clip": 0.0106699, + "auxiliary_loss_mlp": 0.0103234, + "balance_loss_clip": 1.03458118, + "balance_loss_mlp": 1.01936996, + "epoch": 0.33266195701187434, + "flos": 14501699112960.0, + "grad_norm": 1.8554892385114299, + "language_loss": 0.66286469, + "learning_rate": 3.1135881910337735e-06, + "loss": 0.68385804, + "num_input_tokens_seen": 118856275, + "step": 5533, + "time_per_iteration": 2.5811967849731445 + }, + { + "auxiliary_loss_clip": 0.01032121, + "auxiliary_loss_mlp": 0.01034216, + "balance_loss_clip": 1.02881813, + "balance_loss_mlp": 1.0199821, + "epoch": 0.3327220802645423, + "flos": 15304410299520.0, + "grad_norm": 1.9717998400852816, + "language_loss": 0.71310455, + "learning_rate": 3.113264663362451e-06, + "loss": 0.73376787, + "num_input_tokens_seen": 118873830, + "step": 5534, + "time_per_iteration": 2.7140326499938965 + }, + { + "auxiliary_loss_clip": 0.01036523, + "auxiliary_loss_mlp": 0.01035069, + "balance_loss_clip": 1.02796292, + "balance_loss_mlp": 1.02145505, + "epoch": 0.3327822035172103, + "flos": 23477534951040.0, + "grad_norm": 1.4814988349559524, + "language_loss": 0.67435431, + "learning_rate": 3.1129410934760204e-06, + "loss": 0.69507021, + "num_input_tokens_seen": 118891560, + "step": 5535, + "time_per_iteration": 2.7197587490081787 + }, + { + "auxiliary_loss_clip": 0.01074453, + "auxiliary_loss_mlp": 0.00747855, + "balance_loss_clip": 1.03064561, + "balance_loss_mlp": 1.00033474, + "epoch": 0.33284232676987824, + "flos": 25374516019200.0, + "grad_norm": 2.2486236673652464, + "language_loss": 0.72541833, + "learning_rate": 3.1126174813867517e-06, + "loss": 0.74364138, + "num_input_tokens_seen": 118910260, + "step": 5536, + "time_per_iteration": 2.6218667030334473 + }, + { + "auxiliary_loss_clip": 0.01073507, + "auxiliary_loss_mlp": 0.01037258, + "balance_loss_clip": 1.030545, + "balance_loss_mlp": 1.02453256, + "epoch": 0.3329024500225462, + "flos": 23694363400320.0, + "grad_norm": 1.5896236484986974, + "language_loss": 0.81861174, + "learning_rate": 3.112293827106917e-06, + "loss": 0.83971941, + "num_input_tokens_seen": 118929985, + "step": 5537, + "time_per_iteration": 2.709322690963745 + }, + { + "auxiliary_loss_clip": 0.0107999, + "auxiliary_loss_mlp": 0.01035939, + "balance_loss_clip": 1.03494072, + "balance_loss_mlp": 1.02181911, + "epoch": 0.33296257327521417, + "flos": 31723163205120.0, + "grad_norm": 1.8436555138215995, + "language_loss": 0.71470743, + "learning_rate": 3.111970130648789e-06, + "loss": 0.73586667, + "num_input_tokens_seen": 118951355, + "step": 5538, + "time_per_iteration": 2.7719826698303223 + }, + { + "auxiliary_loss_clip": 0.01070662, + "auxiliary_loss_mlp": 0.01031745, + "balance_loss_clip": 1.02958465, + "balance_loss_mlp": 1.01837575, + "epoch": 0.33302269652788213, + "flos": 22744705674240.0, + "grad_norm": 1.9237840597440916, + "language_loss": 0.74492109, + "learning_rate": 3.1116463920246424e-06, + "loss": 0.76594514, + "num_input_tokens_seen": 118970910, + "step": 5539, + "time_per_iteration": 2.7861740589141846 + }, + { + "auxiliary_loss_clip": 0.01088705, + "auxiliary_loss_mlp": 0.01045861, + "balance_loss_clip": 1.0314647, + "balance_loss_mlp": 1.03150833, + "epoch": 0.33308281978055015, + "flos": 11473747441920.0, + "grad_norm": 1.9784912977629183, + "language_loss": 0.71008795, + "learning_rate": 3.1113226112467527e-06, + "loss": 0.73143363, + "num_input_tokens_seen": 118989200, + "step": 5540, + "time_per_iteration": 2.5586025714874268 + }, + { + "auxiliary_loss_clip": 0.01071782, + "auxiliary_loss_mlp": 0.01033764, + "balance_loss_clip": 1.02819014, + "balance_loss_mlp": 1.02050209, + "epoch": 0.3331429430332181, + "flos": 38213693112960.0, + "grad_norm": 1.710957242532363, + "language_loss": 0.60667193, + "learning_rate": 3.1109987883273983e-06, + "loss": 0.62772739, + "num_input_tokens_seen": 119011030, + "step": 5541, + "time_per_iteration": 2.73740553855896 + }, + { + "auxiliary_loss_clip": 0.01068281, + "auxiliary_loss_mlp": 0.0104017, + "balance_loss_clip": 1.03206325, + "balance_loss_mlp": 1.02580559, + "epoch": 0.3332030662858861, + "flos": 22528667324160.0, + "grad_norm": 1.5561238800884767, + "language_loss": 0.68495989, + "learning_rate": 3.1106749232788584e-06, + "loss": 0.70604444, + "num_input_tokens_seen": 119030620, + "step": 5542, + "time_per_iteration": 2.6991374492645264 + }, + { + "auxiliary_loss_clip": 0.01071778, + "auxiliary_loss_mlp": 0.01035181, + "balance_loss_clip": 1.02983487, + "balance_loss_mlp": 1.02201438, + "epoch": 0.33326318953855405, + "flos": 15997773507840.0, + "grad_norm": 1.611256494102914, + "language_loss": 0.75469941, + "learning_rate": 3.110351016113414e-06, + "loss": 0.775769, + "num_input_tokens_seen": 119048015, + "step": 5543, + "time_per_iteration": 2.6879591941833496 + }, + { + "auxiliary_loss_clip": 0.01010354, + "auxiliary_loss_mlp": 0.0103952, + "balance_loss_clip": 1.02902889, + "balance_loss_mlp": 1.02454686, + "epoch": 0.333323312791222, + "flos": 25593535198080.0, + "grad_norm": 1.5702389249180844, + "language_loss": 0.75264555, + "learning_rate": 3.110027066843348e-06, + "loss": 0.77314436, + "num_input_tokens_seen": 119066280, + "step": 5544, + "time_per_iteration": 2.8444838523864746 + }, + { + "auxiliary_loss_clip": 0.01080326, + "auxiliary_loss_mlp": 0.01030194, + "balance_loss_clip": 1.02918541, + "balance_loss_mlp": 1.01724148, + "epoch": 0.33338343604389, + "flos": 25119550304640.0, + "grad_norm": 1.52310551547973, + "language_loss": 0.7080127, + "learning_rate": 3.1097030754809456e-06, + "loss": 0.72911787, + "num_input_tokens_seen": 119087680, + "step": 5545, + "time_per_iteration": 2.6775970458984375 + }, + { + "auxiliary_loss_clip": 0.01048936, + "auxiliary_loss_mlp": 0.01036231, + "balance_loss_clip": 1.03040385, + "balance_loss_mlp": 1.02292109, + "epoch": 0.33344355929655795, + "flos": 16947287579520.0, + "grad_norm": 1.680308584589662, + "language_loss": 0.69136012, + "learning_rate": 3.1093790420384894e-06, + "loss": 0.71221173, + "num_input_tokens_seen": 119105820, + "step": 5546, + "time_per_iteration": 2.8328421115875244 + }, + { + "auxiliary_loss_clip": 0.01048957, + "auxiliary_loss_mlp": 0.01033294, + "balance_loss_clip": 1.02718782, + "balance_loss_mlp": 1.01915526, + "epoch": 0.3335036825492259, + "flos": 27889591345920.0, + "grad_norm": 1.563781229623156, + "language_loss": 0.64706266, + "learning_rate": 3.1090549665282702e-06, + "loss": 0.66788507, + "num_input_tokens_seen": 119126630, + "step": 5547, + "time_per_iteration": 2.8237645626068115 + }, + { + "auxiliary_loss_clip": 0.01060933, + "auxiliary_loss_mlp": 0.01029281, + "balance_loss_clip": 1.03130448, + "balance_loss_mlp": 1.01665688, + "epoch": 0.3335638058018939, + "flos": 16179553261440.0, + "grad_norm": 2.5954376855577213, + "language_loss": 0.85864604, + "learning_rate": 3.1087308489625742e-06, + "loss": 0.87954819, + "num_input_tokens_seen": 119143375, + "step": 5548, + "time_per_iteration": 2.7264130115509033 + }, + { + "auxiliary_loss_clip": 0.01072712, + "auxiliary_loss_mlp": 0.01035764, + "balance_loss_clip": 1.02909529, + "balance_loss_mlp": 1.02090454, + "epoch": 0.33362392905456184, + "flos": 39896108288640.0, + "grad_norm": 1.8260730525861986, + "language_loss": 0.74304372, + "learning_rate": 3.1084066893536945e-06, + "loss": 0.76412857, + "num_input_tokens_seen": 119166450, + "step": 5549, + "time_per_iteration": 2.754384756088257 + }, + { + "auxiliary_loss_clip": 0.01075155, + "auxiliary_loss_mlp": 0.0103679, + "balance_loss_clip": 1.029755, + "balance_loss_mlp": 1.02206755, + "epoch": 0.3336840523072298, + "flos": 44271212567040.0, + "grad_norm": 1.7841889855386968, + "language_loss": 0.68230206, + "learning_rate": 3.108082487713921e-06, + "loss": 0.70342147, + "num_input_tokens_seen": 119189645, + "step": 5550, + "time_per_iteration": 2.840506076812744 + }, + { + "auxiliary_loss_clip": 0.01043121, + "auxiliary_loss_mlp": 0.01048173, + "balance_loss_clip": 1.02863407, + "balance_loss_mlp": 1.03355169, + "epoch": 0.33374417555989777, + "flos": 15085678429440.0, + "grad_norm": 2.0187793304256614, + "language_loss": 0.60394824, + "learning_rate": 3.1077582440555495e-06, + "loss": 0.62486118, + "num_input_tokens_seen": 119208045, + "step": 5551, + "time_per_iteration": 2.6955621242523193 + }, + { + "auxiliary_loss_clip": 0.01045679, + "auxiliary_loss_mlp": 0.01038407, + "balance_loss_clip": 1.0292294, + "balance_loss_mlp": 1.02379143, + "epoch": 0.33380429881256574, + "flos": 15849174942720.0, + "grad_norm": 1.8570982408372276, + "language_loss": 0.70415586, + "learning_rate": 3.1074339583908746e-06, + "loss": 0.72499675, + "num_input_tokens_seen": 119224910, + "step": 5552, + "time_per_iteration": 2.6914403438568115 + }, + { + "auxiliary_loss_clip": 0.01044945, + "auxiliary_loss_mlp": 0.01032708, + "balance_loss_clip": 1.02591872, + "balance_loss_mlp": 1.01919591, + "epoch": 0.33386442206523376, + "flos": 13480327883520.0, + "grad_norm": 2.0453965188508234, + "language_loss": 0.82578588, + "learning_rate": 3.107109630732192e-06, + "loss": 0.84656239, + "num_input_tokens_seen": 119243290, + "step": 5553, + "time_per_iteration": 2.6376595497131348 + }, + { + "auxiliary_loss_clip": 0.0106412, + "auxiliary_loss_mlp": 0.00747974, + "balance_loss_clip": 1.03085911, + "balance_loss_mlp": 1.00055265, + "epoch": 0.3339245453179017, + "flos": 16690669839360.0, + "grad_norm": 2.5599308620192303, + "language_loss": 0.80609596, + "learning_rate": 3.1067852610918017e-06, + "loss": 0.82421684, + "num_input_tokens_seen": 119261195, + "step": 5554, + "time_per_iteration": 2.7047553062438965 + }, + { + "auxiliary_loss_clip": 0.01073305, + "auxiliary_loss_mlp": 0.01042516, + "balance_loss_clip": 1.03028023, + "balance_loss_mlp": 1.02839541, + "epoch": 0.3339846685705697, + "flos": 24610624456320.0, + "grad_norm": 1.8018984050261013, + "language_loss": 0.81286573, + "learning_rate": 3.1064608494820032e-06, + "loss": 0.83402395, + "num_input_tokens_seen": 119282845, + "step": 5555, + "time_per_iteration": 2.687069892883301 + }, + { + "auxiliary_loss_clip": 0.01066844, + "auxiliary_loss_mlp": 0.0103718, + "balance_loss_clip": 1.02737761, + "balance_loss_mlp": 1.02391839, + "epoch": 0.33404479182323765, + "flos": 30953812775040.0, + "grad_norm": 1.8131130294619875, + "language_loss": 0.74430084, + "learning_rate": 3.106136395915099e-06, + "loss": 0.76534104, + "num_input_tokens_seen": 119304430, + "step": 5556, + "time_per_iteration": 2.7625339031219482 + }, + { + "auxiliary_loss_clip": 0.01071321, + "auxiliary_loss_mlp": 0.01035453, + "balance_loss_clip": 1.02927279, + "balance_loss_mlp": 1.02269197, + "epoch": 0.3341049150759056, + "flos": 23513301918720.0, + "grad_norm": 1.5177490899319772, + "language_loss": 0.82277513, + "learning_rate": 3.105811900403391e-06, + "loss": 0.84384286, + "num_input_tokens_seen": 119323830, + "step": 5557, + "time_per_iteration": 2.67626690864563 + }, + { + "auxiliary_loss_clip": 0.01064731, + "auxiliary_loss_mlp": 0.01038763, + "balance_loss_clip": 1.03027856, + "balance_loss_mlp": 1.02464914, + "epoch": 0.3341650383285736, + "flos": 24026824707840.0, + "grad_norm": 1.5639173428969269, + "language_loss": 0.80223334, + "learning_rate": 3.1054873629591855e-06, + "loss": 0.82326829, + "num_input_tokens_seen": 119346340, + "step": 5558, + "time_per_iteration": 4.239088773727417 + }, + { + "auxiliary_loss_clip": 0.01055161, + "auxiliary_loss_mlp": 0.01034395, + "balance_loss_clip": 1.02948129, + "balance_loss_mlp": 1.02081144, + "epoch": 0.33422516158124155, + "flos": 24901967669760.0, + "grad_norm": 1.6261135955814512, + "language_loss": 0.81677127, + "learning_rate": 3.105162783594788e-06, + "loss": 0.83766687, + "num_input_tokens_seen": 119367285, + "step": 5559, + "time_per_iteration": 2.678651809692383 + }, + { + "auxiliary_loss_clip": 0.01048523, + "auxiliary_loss_mlp": 0.01035482, + "balance_loss_clip": 1.02837658, + "balance_loss_mlp": 1.02192819, + "epoch": 0.3342852848339095, + "flos": 18333403464960.0, + "grad_norm": 2.05993274246224, + "language_loss": 0.72036862, + "learning_rate": 3.1048381623225074e-06, + "loss": 0.74120867, + "num_input_tokens_seen": 119385370, + "step": 5560, + "time_per_iteration": 4.424911260604858 + }, + { + "auxiliary_loss_clip": 0.0106495, + "auxiliary_loss_mlp": 0.01043225, + "balance_loss_clip": 1.02976036, + "balance_loss_mlp": 1.02810907, + "epoch": 0.3343454080865775, + "flos": 30046530119040.0, + "grad_norm": 1.3650852966084381, + "language_loss": 0.74660569, + "learning_rate": 3.1045134991546526e-06, + "loss": 0.76768744, + "num_input_tokens_seen": 119409150, + "step": 5561, + "time_per_iteration": 2.76486873626709 + }, + { + "auxiliary_loss_clip": 0.01066352, + "auxiliary_loss_mlp": 0.01040274, + "balance_loss_clip": 1.03199959, + "balance_loss_mlp": 1.02627313, + "epoch": 0.33440553133924544, + "flos": 16398823835520.0, + "grad_norm": 2.599402625329966, + "language_loss": 0.69314831, + "learning_rate": 3.1041887941035355e-06, + "loss": 0.71421456, + "num_input_tokens_seen": 119426475, + "step": 5562, + "time_per_iteration": 2.60024094581604 + }, + { + "auxiliary_loss_clip": 0.01071386, + "auxiliary_loss_mlp": 0.01036857, + "balance_loss_clip": 1.02987242, + "balance_loss_mlp": 1.02442372, + "epoch": 0.3344656545919134, + "flos": 24242072958720.0, + "grad_norm": 1.916796410349362, + "language_loss": 0.64792681, + "learning_rate": 3.1038640471814685e-06, + "loss": 0.66900927, + "num_input_tokens_seen": 119446900, + "step": 5563, + "time_per_iteration": 2.644627809524536 + }, + { + "auxiliary_loss_clip": 0.01036441, + "auxiliary_loss_mlp": 0.01041263, + "balance_loss_clip": 1.03175831, + "balance_loss_mlp": 1.02556276, + "epoch": 0.3345257778445814, + "flos": 52118843149440.0, + "grad_norm": 1.3880945070476165, + "language_loss": 0.74317861, + "learning_rate": 3.103539258400766e-06, + "loss": 0.76395559, + "num_input_tokens_seen": 119470945, + "step": 5564, + "time_per_iteration": 3.063382863998413 + }, + { + "auxiliary_loss_clip": 0.00992126, + "auxiliary_loss_mlp": 0.01002314, + "balance_loss_clip": 1.00982368, + "balance_loss_mlp": 1.00080049, + "epoch": 0.33458590109724934, + "flos": 68048602254720.0, + "grad_norm": 0.7784701427820393, + "language_loss": 0.55538177, + "learning_rate": 3.103214427773745e-06, + "loss": 0.57532614, + "num_input_tokens_seen": 119529925, + "step": 5565, + "time_per_iteration": 3.2720794677734375 + }, + { + "auxiliary_loss_clip": 0.01084504, + "auxiliary_loss_mlp": 0.01034914, + "balance_loss_clip": 1.03261137, + "balance_loss_mlp": 1.02190232, + "epoch": 0.3346460243499173, + "flos": 37414788768000.0, + "grad_norm": 1.7692016300048126, + "language_loss": 0.64871442, + "learning_rate": 3.102889555312721e-06, + "loss": 0.66990864, + "num_input_tokens_seen": 119550700, + "step": 5566, + "time_per_iteration": 2.652985095977783 + }, + { + "auxiliary_loss_clip": 0.01065031, + "auxiliary_loss_mlp": 0.01038243, + "balance_loss_clip": 1.0311358, + "balance_loss_mlp": 1.0241642, + "epoch": 0.3347061476025853, + "flos": 18697358021760.0, + "grad_norm": 10.444426091075952, + "language_loss": 0.77366519, + "learning_rate": 3.102564641030016e-06, + "loss": 0.79469788, + "num_input_tokens_seen": 119569295, + "step": 5567, + "time_per_iteration": 2.5904223918914795 + }, + { + "auxiliary_loss_clip": 0.01064644, + "auxiliary_loss_mlp": 0.01035233, + "balance_loss_clip": 1.03163624, + "balance_loss_mlp": 1.02090967, + "epoch": 0.3347662708552533, + "flos": 13917827537280.0, + "grad_norm": 1.6773223347164952, + "language_loss": 0.76434559, + "learning_rate": 3.102239684937949e-06, + "loss": 0.78534436, + "num_input_tokens_seen": 119587375, + "step": 5568, + "time_per_iteration": 4.358225107192993 + }, + { + "auxiliary_loss_clip": 0.01055694, + "auxiliary_loss_mlp": 0.01040808, + "balance_loss_clip": 1.03256679, + "balance_loss_mlp": 1.02640772, + "epoch": 0.33482639410792125, + "flos": 19750402068480.0, + "grad_norm": 1.95012220514394, + "language_loss": 0.70677066, + "learning_rate": 3.101914687048842e-06, + "loss": 0.7277357, + "num_input_tokens_seen": 119604530, + "step": 5569, + "time_per_iteration": 4.302316904067993 + }, + { + "auxiliary_loss_clip": 0.01053699, + "auxiliary_loss_mlp": 0.0103507, + "balance_loss_clip": 1.03081107, + "balance_loss_mlp": 1.01920295, + "epoch": 0.3348865173605892, + "flos": 16102991422080.0, + "grad_norm": 1.923648368877557, + "language_loss": 0.89531279, + "learning_rate": 3.10158964737502e-06, + "loss": 0.9162004, + "num_input_tokens_seen": 119621025, + "step": 5570, + "time_per_iteration": 2.6404995918273926 + }, + { + "auxiliary_loss_clip": 0.01049046, + "auxiliary_loss_mlp": 0.01030028, + "balance_loss_clip": 1.0278939, + "balance_loss_mlp": 1.01638436, + "epoch": 0.3349466406132572, + "flos": 25008945350400.0, + "grad_norm": 1.4908616283209526, + "language_loss": 0.79979038, + "learning_rate": 3.101264565928808e-06, + "loss": 0.82058108, + "num_input_tokens_seen": 119641725, + "step": 5571, + "time_per_iteration": 2.8079144954681396 + }, + { + "auxiliary_loss_clip": 0.01018924, + "auxiliary_loss_mlp": 0.00747103, + "balance_loss_clip": 1.00759506, + "balance_loss_mlp": 1.00055826, + "epoch": 0.33500676386592515, + "flos": 54319991564160.0, + "grad_norm": 0.8950088702517398, + "language_loss": 0.55966532, + "learning_rate": 3.1009394427225335e-06, + "loss": 0.57732558, + "num_input_tokens_seen": 119693560, + "step": 5572, + "time_per_iteration": 3.16666579246521 + }, + { + "auxiliary_loss_clip": 0.01088964, + "auxiliary_loss_mlp": 0.01044377, + "balance_loss_clip": 1.03605032, + "balance_loss_mlp": 1.03037596, + "epoch": 0.3350668871185931, + "flos": 26797332625920.0, + "grad_norm": 1.8101063262933006, + "language_loss": 0.78443444, + "learning_rate": 3.1006142777685257e-06, + "loss": 0.80576789, + "num_input_tokens_seen": 119712935, + "step": 5573, + "time_per_iteration": 2.6048309803009033 + }, + { + "auxiliary_loss_clip": 0.01059179, + "auxiliary_loss_mlp": 0.01042916, + "balance_loss_clip": 1.03365982, + "balance_loss_mlp": 1.02723432, + "epoch": 0.3351270103712611, + "flos": 33510508986240.0, + "grad_norm": 2.1019150453090805, + "language_loss": 0.72519743, + "learning_rate": 3.1002890710791133e-06, + "loss": 0.74621832, + "num_input_tokens_seen": 119731680, + "step": 5574, + "time_per_iteration": 2.8319387435913086 + }, + { + "auxiliary_loss_clip": 0.01071404, + "auxiliary_loss_mlp": 0.01033256, + "balance_loss_clip": 1.02979517, + "balance_loss_mlp": 1.02008367, + "epoch": 0.33518713362392905, + "flos": 26506240807680.0, + "grad_norm": 1.7741448068375911, + "language_loss": 0.87764716, + "learning_rate": 3.0999638226666287e-06, + "loss": 0.8986938, + "num_input_tokens_seen": 119752155, + "step": 5575, + "time_per_iteration": 2.6686434745788574 + }, + { + "auxiliary_loss_clip": 0.01067545, + "auxiliary_loss_mlp": 0.01039805, + "balance_loss_clip": 1.03148997, + "balance_loss_mlp": 1.02461731, + "epoch": 0.335247256876597, + "flos": 17232345912960.0, + "grad_norm": 2.317872465623539, + "language_loss": 0.828125, + "learning_rate": 3.0996385325434063e-06, + "loss": 0.84919846, + "num_input_tokens_seen": 119769195, + "step": 5576, + "time_per_iteration": 2.68319034576416 + }, + { + "auxiliary_loss_clip": 0.01074841, + "auxiliary_loss_mlp": 0.01041712, + "balance_loss_clip": 1.03063285, + "balance_loss_mlp": 1.02713239, + "epoch": 0.335307380129265, + "flos": 25629373992960.0, + "grad_norm": 2.301096419103968, + "language_loss": 0.73147166, + "learning_rate": 3.0993132007217806e-06, + "loss": 0.75263727, + "num_input_tokens_seen": 119786810, + "step": 5577, + "time_per_iteration": 2.5979597568511963 + }, + { + "auxiliary_loss_clip": 0.0105476, + "auxiliary_loss_mlp": 0.01033257, + "balance_loss_clip": 1.03428292, + "balance_loss_mlp": 1.01911855, + "epoch": 0.33536750338193294, + "flos": 19680089195520.0, + "grad_norm": 1.6604529815179145, + "language_loss": 0.8161298, + "learning_rate": 3.0989878272140883e-06, + "loss": 0.83701003, + "num_input_tokens_seen": 119805395, + "step": 5578, + "time_per_iteration": 2.6781492233276367 + }, + { + "auxiliary_loss_clip": 0.0102641, + "auxiliary_loss_mlp": 0.00747925, + "balance_loss_clip": 1.03005695, + "balance_loss_mlp": 1.00042009, + "epoch": 0.3354276266346009, + "flos": 18332613365760.0, + "grad_norm": 1.8885228439173873, + "language_loss": 0.71479452, + "learning_rate": 3.0986624120326676e-06, + "loss": 0.73253787, + "num_input_tokens_seen": 119823135, + "step": 5579, + "time_per_iteration": 2.8552637100219727 + }, + { + "auxiliary_loss_clip": 0.0103425, + "auxiliary_loss_mlp": 0.01038674, + "balance_loss_clip": 1.02994251, + "balance_loss_mlp": 1.02420235, + "epoch": 0.3354877498872689, + "flos": 17858556645120.0, + "grad_norm": 1.7670044663875193, + "language_loss": 0.81564844, + "learning_rate": 3.0983369551898573e-06, + "loss": 0.83637768, + "num_input_tokens_seen": 119842265, + "step": 5580, + "time_per_iteration": 2.696922540664673 + }, + { + "auxiliary_loss_clip": 0.01060029, + "auxiliary_loss_mlp": 0.01032912, + "balance_loss_clip": 1.03001332, + "balance_loss_mlp": 1.01894069, + "epoch": 0.3355478731399369, + "flos": 24717745791360.0, + "grad_norm": 1.5993893568205815, + "language_loss": 0.78099585, + "learning_rate": 3.0980114566980003e-06, + "loss": 0.80192524, + "num_input_tokens_seen": 119862500, + "step": 5581, + "time_per_iteration": 2.6428747177124023 + }, + { + "auxiliary_loss_clip": 0.01054261, + "auxiliary_loss_mlp": 0.01039206, + "balance_loss_clip": 1.0297364, + "balance_loss_mlp": 1.02366138, + "epoch": 0.33560799639260486, + "flos": 16873886136960.0, + "grad_norm": 2.7213294366632947, + "language_loss": 0.7470091, + "learning_rate": 3.0976859165694384e-06, + "loss": 0.7679438, + "num_input_tokens_seen": 119880160, + "step": 5582, + "time_per_iteration": 2.6101906299591064 + }, + { + "auxiliary_loss_clip": 0.01057616, + "auxiliary_loss_mlp": 0.01043613, + "balance_loss_clip": 1.02851725, + "balance_loss_mlp": 1.02877188, + "epoch": 0.3356681196452728, + "flos": 18333511205760.0, + "grad_norm": 1.4538525231678205, + "language_loss": 0.81774437, + "learning_rate": 3.0973603348165166e-06, + "loss": 0.83875662, + "num_input_tokens_seen": 119899040, + "step": 5583, + "time_per_iteration": 2.5787441730499268 + }, + { + "auxiliary_loss_clip": 0.01061403, + "auxiliary_loss_mlp": 0.01042707, + "balance_loss_clip": 1.02948248, + "balance_loss_mlp": 1.02957618, + "epoch": 0.3357282428979408, + "flos": 34750612085760.0, + "grad_norm": 1.9407253174453187, + "language_loss": 0.77734172, + "learning_rate": 3.097034711451581e-06, + "loss": 0.79838288, + "num_input_tokens_seen": 119921120, + "step": 5584, + "time_per_iteration": 2.6893150806427 + }, + { + "auxiliary_loss_clip": 0.01063217, + "auxiliary_loss_mlp": 0.01040578, + "balance_loss_clip": 1.02924883, + "balance_loss_mlp": 1.02674425, + "epoch": 0.33578836615060875, + "flos": 21580087006080.0, + "grad_norm": 1.588211958005569, + "language_loss": 0.76053268, + "learning_rate": 3.0967090464869795e-06, + "loss": 0.78157061, + "num_input_tokens_seen": 119940165, + "step": 5585, + "time_per_iteration": 2.607802391052246 + }, + { + "auxiliary_loss_clip": 0.01067227, + "auxiliary_loss_mlp": 0.01031892, + "balance_loss_clip": 1.02714956, + "balance_loss_mlp": 1.0180757, + "epoch": 0.3358484894032767, + "flos": 24530291688960.0, + "grad_norm": 1.999779691950157, + "language_loss": 0.777659, + "learning_rate": 3.0963833399350608e-06, + "loss": 0.79865021, + "num_input_tokens_seen": 119959730, + "step": 5586, + "time_per_iteration": 2.6264569759368896 + }, + { + "auxiliary_loss_clip": 0.01050102, + "auxiliary_loss_mlp": 0.01044992, + "balance_loss_clip": 1.03100514, + "balance_loss_mlp": 1.02739668, + "epoch": 0.3359086126559447, + "flos": 22455589104000.0, + "grad_norm": 1.839368082675769, + "language_loss": 0.80594736, + "learning_rate": 3.0960575918081756e-06, + "loss": 0.82689834, + "num_input_tokens_seen": 119979315, + "step": 5587, + "time_per_iteration": 2.6659467220306396 + }, + { + "auxiliary_loss_clip": 0.0107928, + "auxiliary_loss_mlp": 0.01034008, + "balance_loss_clip": 1.03020954, + "balance_loss_mlp": 1.02174103, + "epoch": 0.33596873590861265, + "flos": 16543687386240.0, + "grad_norm": 1.766277254647746, + "language_loss": 0.67356813, + "learning_rate": 3.095731802118677e-06, + "loss": 0.69470096, + "num_input_tokens_seen": 119996140, + "step": 5588, + "time_per_iteration": 2.66542387008667 + }, + { + "auxiliary_loss_clip": 0.01059206, + "auxiliary_loss_mlp": 0.00747994, + "balance_loss_clip": 1.02932215, + "balance_loss_mlp": 1.00046492, + "epoch": 0.3360288591612806, + "flos": 31175812782720.0, + "grad_norm": 1.894097885507203, + "language_loss": 0.70162785, + "learning_rate": 3.095405970878919e-06, + "loss": 0.71969986, + "num_input_tokens_seen": 120017720, + "step": 5589, + "time_per_iteration": 2.7750985622406006 + }, + { + "auxiliary_loss_clip": 0.01056657, + "auxiliary_loss_mlp": 0.01040471, + "balance_loss_clip": 1.02598047, + "balance_loss_mlp": 1.02523613, + "epoch": 0.3360889824139486, + "flos": 23696913265920.0, + "grad_norm": 2.097006256096246, + "language_loss": 0.67241275, + "learning_rate": 3.0950800981012567e-06, + "loss": 0.69338405, + "num_input_tokens_seen": 120036335, + "step": 5590, + "time_per_iteration": 2.727940320968628 + }, + { + "auxiliary_loss_clip": 0.01060233, + "auxiliary_loss_mlp": 0.01043484, + "balance_loss_clip": 1.03710866, + "balance_loss_mlp": 1.02866638, + "epoch": 0.33614910566661654, + "flos": 19318109886720.0, + "grad_norm": 2.2116722065243573, + "language_loss": 0.73236549, + "learning_rate": 3.094754183798047e-06, + "loss": 0.75340271, + "num_input_tokens_seen": 120056120, + "step": 5591, + "time_per_iteration": 2.7253360748291016 + }, + { + "auxiliary_loss_clip": 0.0108169, + "auxiliary_loss_mlp": 0.01037005, + "balance_loss_clip": 1.03000462, + "balance_loss_mlp": 1.02347505, + "epoch": 0.3362092289192845, + "flos": 16472261191680.0, + "grad_norm": 1.907692333731897, + "language_loss": 0.70395899, + "learning_rate": 3.0944282279816493e-06, + "loss": 0.72514594, + "num_input_tokens_seen": 120073650, + "step": 5592, + "time_per_iteration": 2.546774387359619 + }, + { + "auxiliary_loss_clip": 0.01057543, + "auxiliary_loss_mlp": 0.01037943, + "balance_loss_clip": 1.02834964, + "balance_loss_mlp": 1.02530122, + "epoch": 0.33626935217195253, + "flos": 24243581329920.0, + "grad_norm": 2.220779207903772, + "language_loss": 0.76917577, + "learning_rate": 3.094102230664423e-06, + "loss": 0.79013062, + "num_input_tokens_seen": 120093260, + "step": 5593, + "time_per_iteration": 2.8525848388671875 + }, + { + "auxiliary_loss_clip": 0.01045265, + "auxiliary_loss_mlp": 0.0074804, + "balance_loss_clip": 1.02536809, + "balance_loss_mlp": 1.00043631, + "epoch": 0.3363294754246205, + "flos": 19718765164800.0, + "grad_norm": 2.279223308554861, + "language_loss": 0.71749252, + "learning_rate": 3.093776191858731e-06, + "loss": 0.73542559, + "num_input_tokens_seen": 120111830, + "step": 5594, + "time_per_iteration": 2.6913270950317383 + }, + { + "auxiliary_loss_clip": 0.01031024, + "auxiliary_loss_mlp": 0.00748253, + "balance_loss_clip": 1.02665544, + "balance_loss_mlp": 1.00059295, + "epoch": 0.33638959867728846, + "flos": 22596286677120.0, + "grad_norm": 1.5706276219475994, + "language_loss": 0.80080175, + "learning_rate": 3.0934501115769363e-06, + "loss": 0.81859457, + "num_input_tokens_seen": 120130470, + "step": 5595, + "time_per_iteration": 2.738020181655884 + }, + { + "auxiliary_loss_clip": 0.01062946, + "auxiliary_loss_mlp": 0.01033995, + "balance_loss_clip": 1.03001952, + "balance_loss_mlp": 1.0215199, + "epoch": 0.3364497219299564, + "flos": 20994742972800.0, + "grad_norm": 1.9983448902279848, + "language_loss": 0.81336254, + "learning_rate": 3.0931239898314037e-06, + "loss": 0.83433193, + "num_input_tokens_seen": 120150735, + "step": 5596, + "time_per_iteration": 2.6494009494781494 + }, + { + "auxiliary_loss_clip": 0.0106221, + "auxiliary_loss_mlp": 0.01037539, + "balance_loss_clip": 1.02982116, + "balance_loss_mlp": 1.02515364, + "epoch": 0.3365098451826244, + "flos": 25228610974080.0, + "grad_norm": 1.7355407501367446, + "language_loss": 0.75785601, + "learning_rate": 3.0927978266344995e-06, + "loss": 0.77885354, + "num_input_tokens_seen": 120173230, + "step": 5597, + "time_per_iteration": 2.747830629348755 + }, + { + "auxiliary_loss_clip": 0.01068177, + "auxiliary_loss_mlp": 0.01034619, + "balance_loss_clip": 1.02761281, + "balance_loss_mlp": 1.02126813, + "epoch": 0.33656996843529235, + "flos": 24571697091840.0, + "grad_norm": 7.601579840443, + "language_loss": 0.78695303, + "learning_rate": 3.0924716219985916e-06, + "loss": 0.80798101, + "num_input_tokens_seen": 120191860, + "step": 5598, + "time_per_iteration": 2.6509666442871094 + }, + { + "auxiliary_loss_clip": 0.0108518, + "auxiliary_loss_mlp": 0.01037773, + "balance_loss_clip": 1.02975142, + "balance_loss_mlp": 1.02337313, + "epoch": 0.3366300916879603, + "flos": 44091120752640.0, + "grad_norm": 1.4274816982237761, + "language_loss": 0.64371192, + "learning_rate": 3.0921453759360514e-06, + "loss": 0.66494149, + "num_input_tokens_seen": 120219195, + "step": 5599, + "time_per_iteration": 2.7687506675720215 + }, + { + "auxiliary_loss_clip": 0.01048984, + "auxiliary_loss_mlp": 0.01053109, + "balance_loss_clip": 1.02818286, + "balance_loss_mlp": 1.03502512, + "epoch": 0.3366902149406283, + "flos": 13879869840000.0, + "grad_norm": 2.517876842608336, + "language_loss": 0.82126725, + "learning_rate": 3.091819088459249e-06, + "loss": 0.84228814, + "num_input_tokens_seen": 120232950, + "step": 5600, + "time_per_iteration": 2.771923780441284 + }, + { + "auxiliary_loss_clip": 0.01076817, + "auxiliary_loss_mlp": 0.0104288, + "balance_loss_clip": 1.03085756, + "balance_loss_mlp": 1.02752542, + "epoch": 0.33675033819329625, + "flos": 16253098358400.0, + "grad_norm": 2.2948328900753565, + "language_loss": 0.83183837, + "learning_rate": 3.0914927595805573e-06, + "loss": 0.85303539, + "num_input_tokens_seen": 120248865, + "step": 5601, + "time_per_iteration": 2.6435019969940186 + }, + { + "auxiliary_loss_clip": 0.0107557, + "auxiliary_loss_mlp": 0.01032872, + "balance_loss_clip": 1.03564334, + "balance_loss_mlp": 1.01998603, + "epoch": 0.3368104614459642, + "flos": 17055809544960.0, + "grad_norm": 1.660416138028964, + "language_loss": 0.83191019, + "learning_rate": 3.0911663893123507e-06, + "loss": 0.85299456, + "num_input_tokens_seen": 120267820, + "step": 5602, + "time_per_iteration": 2.6256134510040283 + }, + { + "auxiliary_loss_clip": 0.01084388, + "auxiliary_loss_mlp": 0.01045576, + "balance_loss_clip": 1.03120697, + "balance_loss_mlp": 1.03171825, + "epoch": 0.3368705846986322, + "flos": 17858628472320.0, + "grad_norm": 1.6747507739539074, + "language_loss": 0.69803715, + "learning_rate": 3.0908399776670048e-06, + "loss": 0.71933681, + "num_input_tokens_seen": 120286540, + "step": 5603, + "time_per_iteration": 2.5125200748443604 + }, + { + "auxiliary_loss_clip": 0.01068095, + "auxiliary_loss_mlp": 0.01039391, + "balance_loss_clip": 1.03237033, + "balance_loss_mlp": 1.02460361, + "epoch": 0.33693070795130015, + "flos": 22929502170240.0, + "grad_norm": 1.639821262088139, + "language_loss": 0.83262223, + "learning_rate": 3.090513524656898e-06, + "loss": 0.85369706, + "num_input_tokens_seen": 120307305, + "step": 5604, + "time_per_iteration": 2.647670030593872 + }, + { + "auxiliary_loss_clip": 0.01046581, + "auxiliary_loss_mlp": 0.01038126, + "balance_loss_clip": 1.02729988, + "balance_loss_mlp": 1.02458358, + "epoch": 0.3369908312039681, + "flos": 22017443005440.0, + "grad_norm": 1.3155634977196085, + "language_loss": 0.73763114, + "learning_rate": 3.090187030294409e-06, + "loss": 0.75847816, + "num_input_tokens_seen": 120327845, + "step": 5605, + "time_per_iteration": 4.289697647094727 + }, + { + "auxiliary_loss_clip": 0.01065992, + "auxiliary_loss_mlp": 0.01040118, + "balance_loss_clip": 1.03030765, + "balance_loss_mlp": 1.0254488, + "epoch": 0.33705095445663613, + "flos": 11801970944640.0, + "grad_norm": 2.7978338073760156, + "language_loss": 0.83918935, + "learning_rate": 3.089860494591919e-06, + "loss": 0.86025041, + "num_input_tokens_seen": 120343255, + "step": 5606, + "time_per_iteration": 2.7175376415252686 + }, + { + "auxiliary_loss_clip": 0.01056125, + "auxiliary_loss_mlp": 0.01037483, + "balance_loss_clip": 1.02679443, + "balance_loss_mlp": 1.02448916, + "epoch": 0.3371110777093041, + "flos": 25046400257280.0, + "grad_norm": 2.0795070498705797, + "language_loss": 0.68069285, + "learning_rate": 3.089533917561809e-06, + "loss": 0.70162892, + "num_input_tokens_seen": 120361745, + "step": 5607, + "time_per_iteration": 2.6734557151794434 + }, + { + "auxiliary_loss_clip": 0.0106674, + "auxiliary_loss_mlp": 0.0104663, + "balance_loss_clip": 1.02906096, + "balance_loss_mlp": 1.0308882, + "epoch": 0.33717120096197206, + "flos": 26579031719040.0, + "grad_norm": 2.686366351878259, + "language_loss": 0.70978135, + "learning_rate": 3.089207299216464e-06, + "loss": 0.73091507, + "num_input_tokens_seen": 120380565, + "step": 5608, + "time_per_iteration": 4.28315806388855 + }, + { + "auxiliary_loss_clip": 0.01007671, + "auxiliary_loss_mlp": 0.01038401, + "balance_loss_clip": 1.0256834, + "balance_loss_mlp": 1.02511537, + "epoch": 0.33723132421464, + "flos": 15158541168000.0, + "grad_norm": 1.8687843896674878, + "language_loss": 0.79117525, + "learning_rate": 3.088880639568269e-06, + "loss": 0.81163603, + "num_input_tokens_seen": 120399235, + "step": 5609, + "time_per_iteration": 2.746311664581299 + }, + { + "auxiliary_loss_clip": 0.01073278, + "auxiliary_loss_mlp": 0.01037766, + "balance_loss_clip": 1.03028083, + "balance_loss_mlp": 1.02302575, + "epoch": 0.337291447467308, + "flos": 23436093634560.0, + "grad_norm": 1.7190730893669832, + "language_loss": 0.82465649, + "learning_rate": 3.0885539386296114e-06, + "loss": 0.84576696, + "num_input_tokens_seen": 120420095, + "step": 5610, + "time_per_iteration": 2.648715019226074 + }, + { + "auxiliary_loss_clip": 0.01069747, + "auxiliary_loss_mlp": 0.01036408, + "balance_loss_clip": 1.02885902, + "balance_loss_mlp": 1.02210307, + "epoch": 0.33735157071997596, + "flos": 17238163916160.0, + "grad_norm": 1.8625061060303536, + "language_loss": 0.81932127, + "learning_rate": 3.088227196412879e-06, + "loss": 0.84038287, + "num_input_tokens_seen": 120437690, + "step": 5611, + "time_per_iteration": 2.583718776702881 + }, + { + "auxiliary_loss_clip": 0.01066427, + "auxiliary_loss_mlp": 0.01037088, + "balance_loss_clip": 1.03243184, + "balance_loss_mlp": 1.02207923, + "epoch": 0.3374116939726439, + "flos": 28257388657920.0, + "grad_norm": 2.2864341770444363, + "language_loss": 0.79939854, + "learning_rate": 3.0879004129304626e-06, + "loss": 0.82043374, + "num_input_tokens_seen": 120459240, + "step": 5612, + "time_per_iteration": 2.7586288452148438 + }, + { + "auxiliary_loss_clip": 0.01028085, + "auxiliary_loss_mlp": 0.01031939, + "balance_loss_clip": 1.02565598, + "balance_loss_mlp": 1.01811039, + "epoch": 0.3374718172253119, + "flos": 35919396731520.0, + "grad_norm": 3.1771255396767613, + "language_loss": 0.69994068, + "learning_rate": 3.087573588194753e-06, + "loss": 0.72054094, + "num_input_tokens_seen": 120481090, + "step": 5613, + "time_per_iteration": 3.004119873046875 + }, + { + "auxiliary_loss_clip": 0.01064963, + "auxiliary_loss_mlp": 0.01033488, + "balance_loss_clip": 1.02972817, + "balance_loss_mlp": 1.01876569, + "epoch": 0.33753194047797985, + "flos": 18186672407040.0, + "grad_norm": 2.231586136940485, + "language_loss": 0.79684067, + "learning_rate": 3.087246722218144e-06, + "loss": 0.81782514, + "num_input_tokens_seen": 120500045, + "step": 5614, + "time_per_iteration": 2.8774611949920654 + }, + { + "auxiliary_loss_clip": 0.01052125, + "auxiliary_loss_mlp": 0.01043313, + "balance_loss_clip": 1.02736115, + "balance_loss_mlp": 1.02707648, + "epoch": 0.3375920637306478, + "flos": 23148916398720.0, + "grad_norm": 1.8127162319376395, + "language_loss": 0.91064155, + "learning_rate": 3.086919815013031e-06, + "loss": 0.93159592, + "num_input_tokens_seen": 120521125, + "step": 5615, + "time_per_iteration": 2.6731669902801514 + }, + { + "auxiliary_loss_clip": 0.01069956, + "auxiliary_loss_mlp": 0.01034167, + "balance_loss_clip": 1.02925169, + "balance_loss_mlp": 1.0209173, + "epoch": 0.3376521869833158, + "flos": 23112215677440.0, + "grad_norm": 1.6898913768281065, + "language_loss": 0.80991489, + "learning_rate": 3.086592866591809e-06, + "loss": 0.8309561, + "num_input_tokens_seen": 120539180, + "step": 5616, + "time_per_iteration": 4.228132009506226 + }, + { + "auxiliary_loss_clip": 0.0108026, + "auxiliary_loss_mlp": 0.00748094, + "balance_loss_clip": 1.03229165, + "balance_loss_mlp": 1.00037503, + "epoch": 0.33771231023598375, + "flos": 19274585581440.0, + "grad_norm": 2.474919671727445, + "language_loss": 0.84060991, + "learning_rate": 3.0862658769668774e-06, + "loss": 0.85889345, + "num_input_tokens_seen": 120556280, + "step": 5617, + "time_per_iteration": 4.187256336212158 + }, + { + "auxiliary_loss_clip": 0.01012367, + "auxiliary_loss_mlp": 0.01036872, + "balance_loss_clip": 1.025913, + "balance_loss_mlp": 1.02254283, + "epoch": 0.3377724334886517, + "flos": 18150187167360.0, + "grad_norm": 1.4542647953940269, + "language_loss": 0.79976809, + "learning_rate": 3.0859388461506343e-06, + "loss": 0.82026041, + "num_input_tokens_seen": 120575395, + "step": 5618, + "time_per_iteration": 2.7997913360595703 + }, + { + "auxiliary_loss_clip": 0.01045636, + "auxiliary_loss_mlp": 0.01036104, + "balance_loss_clip": 1.02961504, + "balance_loss_mlp": 1.02231109, + "epoch": 0.3378325567413197, + "flos": 25775997310080.0, + "grad_norm": 1.6032321808361225, + "language_loss": 0.71126223, + "learning_rate": 3.085611774155481e-06, + "loss": 0.73207963, + "num_input_tokens_seen": 120596075, + "step": 5619, + "time_per_iteration": 2.9572317600250244 + }, + { + "auxiliary_loss_clip": 0.01060585, + "auxiliary_loss_mlp": 0.01042242, + "balance_loss_clip": 1.02740014, + "balance_loss_mlp": 1.02846766, + "epoch": 0.3378926799939877, + "flos": 21317112558720.0, + "grad_norm": 2.8886762102191557, + "language_loss": 0.69420469, + "learning_rate": 3.085284660993821e-06, + "loss": 0.71523297, + "num_input_tokens_seen": 120614195, + "step": 5620, + "time_per_iteration": 2.909689426422119 + }, + { + "auxiliary_loss_clip": 0.01085739, + "auxiliary_loss_mlp": 0.01040606, + "balance_loss_clip": 1.03287554, + "balance_loss_mlp": 1.02672458, + "epoch": 0.33795280324665566, + "flos": 24900028335360.0, + "grad_norm": 1.6342560654234015, + "language_loss": 0.67747295, + "learning_rate": 3.084957506678058e-06, + "loss": 0.69873637, + "num_input_tokens_seen": 120634475, + "step": 5621, + "time_per_iteration": 2.782803535461426 + }, + { + "auxiliary_loss_clip": 0.01061981, + "auxiliary_loss_mlp": 0.010392, + "balance_loss_clip": 1.03164148, + "balance_loss_mlp": 1.02591991, + "epoch": 0.33801292649932363, + "flos": 24753943722240.0, + "grad_norm": 1.5850742053866105, + "language_loss": 0.82596874, + "learning_rate": 3.0846303112205975e-06, + "loss": 0.84698057, + "num_input_tokens_seen": 120654980, + "step": 5622, + "time_per_iteration": 2.8075549602508545 + }, + { + "auxiliary_loss_clip": 0.01047031, + "auxiliary_loss_mlp": 0.01032672, + "balance_loss_clip": 1.02622342, + "balance_loss_mlp": 1.01940989, + "epoch": 0.3380730497519916, + "flos": 26723967096960.0, + "grad_norm": 1.6858773842086328, + "language_loss": 0.73344493, + "learning_rate": 3.0843030746338464e-06, + "loss": 0.75424194, + "num_input_tokens_seen": 120676245, + "step": 5623, + "time_per_iteration": 2.6843063831329346 + }, + { + "auxiliary_loss_clip": 0.01004259, + "auxiliary_loss_mlp": 0.01006429, + "balance_loss_clip": 1.01181841, + "balance_loss_mlp": 1.00461721, + "epoch": 0.33813317300465956, + "flos": 70035756416640.0, + "grad_norm": 0.8076582222839396, + "language_loss": 0.54897219, + "learning_rate": 3.083975796930215e-06, + "loss": 0.5690791, + "num_input_tokens_seen": 120741965, + "step": 5624, + "time_per_iteration": 3.3315110206604004 + }, + { + "auxiliary_loss_clip": 0.01046026, + "auxiliary_loss_mlp": 0.0104744, + "balance_loss_clip": 1.02945876, + "balance_loss_mlp": 1.0308466, + "epoch": 0.3381932962573275, + "flos": 24097317148800.0, + "grad_norm": 2.0788357237824386, + "language_loss": 0.72956699, + "learning_rate": 3.083648478122111e-06, + "loss": 0.75050163, + "num_input_tokens_seen": 120760410, + "step": 5625, + "time_per_iteration": 2.8069510459899902 + }, + { + "auxiliary_loss_clip": 0.01076709, + "auxiliary_loss_mlp": 0.0103713, + "balance_loss_clip": 1.03247499, + "balance_loss_mlp": 1.02308095, + "epoch": 0.3382534195099955, + "flos": 19278248768640.0, + "grad_norm": 2.2124606080838882, + "language_loss": 0.70544177, + "learning_rate": 3.0833211182219497e-06, + "loss": 0.72658014, + "num_input_tokens_seen": 120777705, + "step": 5626, + "time_per_iteration": 2.734499931335449 + }, + { + "auxiliary_loss_clip": 0.01058991, + "auxiliary_loss_mlp": 0.01034927, + "balance_loss_clip": 1.02960896, + "balance_loss_mlp": 1.02105665, + "epoch": 0.33831354276266346, + "flos": 25226240676480.0, + "grad_norm": 2.416203438215232, + "language_loss": 0.80815184, + "learning_rate": 3.0829937172421425e-06, + "loss": 0.82909107, + "num_input_tokens_seen": 120798660, + "step": 5627, + "time_per_iteration": 2.8988208770751953 + }, + { + "auxiliary_loss_clip": 0.01083385, + "auxiliary_loss_mlp": 0.00748188, + "balance_loss_clip": 1.03691041, + "balance_loss_mlp": 1.00041938, + "epoch": 0.3383736660153314, + "flos": 23112000195840.0, + "grad_norm": 1.654151461268085, + "language_loss": 0.80091149, + "learning_rate": 3.0826662751951055e-06, + "loss": 0.81922722, + "num_input_tokens_seen": 120816705, + "step": 5628, + "time_per_iteration": 2.757965326309204 + }, + { + "auxiliary_loss_clip": 0.01034526, + "auxiliary_loss_mlp": 0.01035839, + "balance_loss_clip": 1.02781725, + "balance_loss_mlp": 1.02029359, + "epoch": 0.3384337892679994, + "flos": 23477139901440.0, + "grad_norm": 1.9178142488141523, + "language_loss": 0.77549738, + "learning_rate": 3.082338792093254e-06, + "loss": 0.79620099, + "num_input_tokens_seen": 120835375, + "step": 5629, + "time_per_iteration": 2.7640862464904785 + }, + { + "auxiliary_loss_clip": 0.01067264, + "auxiliary_loss_mlp": 0.01039679, + "balance_loss_clip": 1.03097796, + "balance_loss_mlp": 1.02349663, + "epoch": 0.33849391252066735, + "flos": 19425805839360.0, + "grad_norm": 2.153657977458943, + "language_loss": 0.84703559, + "learning_rate": 3.0820112679490074e-06, + "loss": 0.86810499, + "num_input_tokens_seen": 120854260, + "step": 5630, + "time_per_iteration": 2.697206974029541 + }, + { + "auxiliary_loss_clip": 0.01037089, + "auxiliary_loss_mlp": 0.01046881, + "balance_loss_clip": 1.03033853, + "balance_loss_mlp": 1.0320394, + "epoch": 0.3385540357733353, + "flos": 21064840364160.0, + "grad_norm": 1.9095127622186905, + "language_loss": 0.71588755, + "learning_rate": 3.0816837027747857e-06, + "loss": 0.73672724, + "num_input_tokens_seen": 120871590, + "step": 5631, + "time_per_iteration": 2.830249547958374 + }, + { + "auxiliary_loss_clip": 0.01003804, + "auxiliary_loss_mlp": 0.01008507, + "balance_loss_clip": 1.01224589, + "balance_loss_mlp": 1.00683773, + "epoch": 0.3386141590260033, + "flos": 69208013450880.0, + "grad_norm": 0.8576244663304492, + "language_loss": 0.56150222, + "learning_rate": 3.0813560965830084e-06, + "loss": 0.58162534, + "num_input_tokens_seen": 120925550, + "step": 5632, + "time_per_iteration": 3.2794082164764404 + }, + { + "auxiliary_loss_clip": 0.01076188, + "auxiliary_loss_mlp": 0.01035878, + "balance_loss_clip": 1.03358579, + "balance_loss_mlp": 1.02186525, + "epoch": 0.3386742822786713, + "flos": 25519487310720.0, + "grad_norm": 1.5541330819285568, + "language_loss": 0.80538511, + "learning_rate": 3.0810284493861005e-06, + "loss": 0.82650572, + "num_input_tokens_seen": 120947620, + "step": 5633, + "time_per_iteration": 2.731227159500122 + }, + { + "auxiliary_loss_clip": 0.01043739, + "auxiliary_loss_mlp": 0.01035147, + "balance_loss_clip": 1.02590609, + "balance_loss_mlp": 1.02107441, + "epoch": 0.33873440553133927, + "flos": 23623116773760.0, + "grad_norm": 1.899829314963836, + "language_loss": 0.59035408, + "learning_rate": 3.0807007611964855e-06, + "loss": 0.61114299, + "num_input_tokens_seen": 120965205, + "step": 5634, + "time_per_iteration": 2.7014360427856445 + }, + { + "auxiliary_loss_clip": 0.01063692, + "auxiliary_loss_mlp": 0.01036859, + "balance_loss_clip": 1.03331614, + "balance_loss_mlp": 1.02263165, + "epoch": 0.33879452878400723, + "flos": 17088882992640.0, + "grad_norm": 1.5932209354932487, + "language_loss": 0.92522615, + "learning_rate": 3.080373032026589e-06, + "loss": 0.94623166, + "num_input_tokens_seen": 120983560, + "step": 5635, + "time_per_iteration": 2.5960869789123535 + }, + { + "auxiliary_loss_clip": 0.01043724, + "auxiliary_loss_mlp": 0.0103532, + "balance_loss_clip": 1.03242052, + "balance_loss_mlp": 1.02049661, + "epoch": 0.3388546520366752, + "flos": 15742053607680.0, + "grad_norm": 1.7596598467490963, + "language_loss": 0.74926066, + "learning_rate": 3.0800452618888386e-06, + "loss": 0.77005112, + "num_input_tokens_seen": 121001400, + "step": 5636, + "time_per_iteration": 2.742568254470825 + }, + { + "auxiliary_loss_clip": 0.01070421, + "auxiliary_loss_mlp": 0.01041117, + "balance_loss_clip": 1.02912247, + "balance_loss_mlp": 1.02627587, + "epoch": 0.33891477528934316, + "flos": 22418744728320.0, + "grad_norm": 1.5036019381170767, + "language_loss": 0.83247507, + "learning_rate": 3.0797174507956637e-06, + "loss": 0.85359049, + "num_input_tokens_seen": 121021760, + "step": 5637, + "time_per_iteration": 2.6219139099121094 + }, + { + "auxiliary_loss_clip": 0.01038443, + "auxiliary_loss_mlp": 0.01045357, + "balance_loss_clip": 1.02914977, + "balance_loss_mlp": 1.02827406, + "epoch": 0.3389748985420111, + "flos": 17274828723840.0, + "grad_norm": 1.7240829255020418, + "language_loss": 0.69422871, + "learning_rate": 3.079389598759495e-06, + "loss": 0.71506667, + "num_input_tokens_seen": 121041070, + "step": 5638, + "time_per_iteration": 2.7031290531158447 + }, + { + "auxiliary_loss_clip": 0.01052039, + "auxiliary_loss_mlp": 0.01043078, + "balance_loss_clip": 1.02858043, + "balance_loss_mlp": 1.02839768, + "epoch": 0.3390350217946791, + "flos": 27744979190400.0, + "grad_norm": 1.5942921130435872, + "language_loss": 0.80595517, + "learning_rate": 3.079061705792765e-06, + "loss": 0.82690638, + "num_input_tokens_seen": 121060890, + "step": 5639, + "time_per_iteration": 2.6965627670288086 + }, + { + "auxiliary_loss_clip": 0.01087179, + "auxiliary_loss_mlp": 0.01046076, + "balance_loss_clip": 1.03226435, + "balance_loss_mlp": 1.03033447, + "epoch": 0.33909514504734706, + "flos": 20339804338560.0, + "grad_norm": 2.176264519914417, + "language_loss": 0.6739397, + "learning_rate": 3.078733771907907e-06, + "loss": 0.69527233, + "num_input_tokens_seen": 121079135, + "step": 5640, + "time_per_iteration": 2.6096761226654053 + }, + { + "auxiliary_loss_clip": 0.01062284, + "auxiliary_loss_mlp": 0.01035353, + "balance_loss_clip": 1.03018963, + "balance_loss_mlp": 1.02048159, + "epoch": 0.339155268300015, + "flos": 14830030356480.0, + "grad_norm": 1.7000815434294743, + "language_loss": 0.6956315, + "learning_rate": 3.0784057971173554e-06, + "loss": 0.71660781, + "num_input_tokens_seen": 121097685, + "step": 5641, + "time_per_iteration": 2.592804431915283 + }, + { + "auxiliary_loss_clip": 0.0108777, + "auxiliary_loss_mlp": 0.01043588, + "balance_loss_clip": 1.03270185, + "balance_loss_mlp": 1.02909207, + "epoch": 0.339215391552683, + "flos": 26067951054720.0, + "grad_norm": 1.801942582037569, + "language_loss": 0.87435943, + "learning_rate": 3.0780777814335483e-06, + "loss": 0.89567304, + "num_input_tokens_seen": 121115640, + "step": 5642, + "time_per_iteration": 2.652190685272217 + }, + { + "auxiliary_loss_clip": 0.0106628, + "auxiliary_loss_mlp": 0.01030937, + "balance_loss_clip": 1.02887797, + "balance_loss_mlp": 1.01899827, + "epoch": 0.33927551480535095, + "flos": 14574705505920.0, + "grad_norm": 1.9386742743136838, + "language_loss": 0.83501172, + "learning_rate": 3.077749724868924e-06, + "loss": 0.85598379, + "num_input_tokens_seen": 121132485, + "step": 5643, + "time_per_iteration": 2.6066620349884033 + }, + { + "auxiliary_loss_clip": 0.01057059, + "auxiliary_loss_mlp": 0.01043669, + "balance_loss_clip": 1.02964282, + "balance_loss_mlp": 1.02969193, + "epoch": 0.3393356380580189, + "flos": 23805578885760.0, + "grad_norm": 2.1016566074167264, + "language_loss": 0.76934236, + "learning_rate": 3.077421627435922e-06, + "loss": 0.7903496, + "num_input_tokens_seen": 121152935, + "step": 5644, + "time_per_iteration": 2.6448416709899902 + }, + { + "auxiliary_loss_clip": 0.01074508, + "auxiliary_loss_mlp": 0.01045174, + "balance_loss_clip": 1.03066158, + "balance_loss_mlp": 1.03024936, + "epoch": 0.3393957613106869, + "flos": 17347871030400.0, + "grad_norm": 2.5892442198595838, + "language_loss": 0.63253129, + "learning_rate": 3.0770934891469832e-06, + "loss": 0.65372813, + "num_input_tokens_seen": 121169835, + "step": 5645, + "time_per_iteration": 2.5362255573272705 + }, + { + "auxiliary_loss_clip": 0.0107143, + "auxiliary_loss_mlp": 0.01037872, + "balance_loss_clip": 1.0293988, + "balance_loss_mlp": 1.02425253, + "epoch": 0.3394558845633549, + "flos": 28433960939520.0, + "grad_norm": 1.9319061178012695, + "language_loss": 0.761011, + "learning_rate": 3.076765310014552e-06, + "loss": 0.78210402, + "num_input_tokens_seen": 121190290, + "step": 5646, + "time_per_iteration": 2.6376569271087646 + }, + { + "auxiliary_loss_clip": 0.01079044, + "auxiliary_loss_mlp": 0.01040537, + "balance_loss_clip": 1.03303504, + "balance_loss_mlp": 1.02530837, + "epoch": 0.33951600781602287, + "flos": 22086929865600.0, + "grad_norm": 2.146095098832994, + "language_loss": 0.79385173, + "learning_rate": 3.0764370900510727e-06, + "loss": 0.81504756, + "num_input_tokens_seen": 121209060, + "step": 5647, + "time_per_iteration": 2.5904228687286377 + }, + { + "auxiliary_loss_clip": 0.01067384, + "auxiliary_loss_mlp": 0.00748044, + "balance_loss_clip": 1.03355777, + "balance_loss_mlp": 1.00030375, + "epoch": 0.33957613106869083, + "flos": 23878262056320.0, + "grad_norm": 2.1416292850110716, + "language_loss": 0.77286839, + "learning_rate": 3.0761088292689904e-06, + "loss": 0.79102266, + "num_input_tokens_seen": 121227480, + "step": 5648, + "time_per_iteration": 2.600623369216919 + }, + { + "auxiliary_loss_clip": 0.0096518, + "auxiliary_loss_mlp": 0.01025913, + "balance_loss_clip": 1.02009499, + "balance_loss_mlp": 1.02346957, + "epoch": 0.3396362543213588, + "flos": 71242642414080.0, + "grad_norm": 0.7870371228607687, + "language_loss": 0.56351566, + "learning_rate": 3.075780527680754e-06, + "loss": 0.58342659, + "num_input_tokens_seen": 121291305, + "step": 5649, + "time_per_iteration": 3.319164514541626 + }, + { + "auxiliary_loss_clip": 0.0105527, + "auxiliary_loss_mlp": 0.00748053, + "balance_loss_clip": 1.02789629, + "balance_loss_mlp": 1.00038218, + "epoch": 0.33969637757402676, + "flos": 25921615046400.0, + "grad_norm": 1.8635915756136652, + "language_loss": 0.85267895, + "learning_rate": 3.0754521852988117e-06, + "loss": 0.87071216, + "num_input_tokens_seen": 121312740, + "step": 5650, + "time_per_iteration": 2.6177663803100586 + }, + { + "auxiliary_loss_clip": 0.0107646, + "auxiliary_loss_mlp": 0.0103115, + "balance_loss_clip": 1.03306615, + "balance_loss_mlp": 1.01706505, + "epoch": 0.33975650082669473, + "flos": 35261728663680.0, + "grad_norm": 1.5979958535113368, + "language_loss": 0.70983124, + "learning_rate": 3.0751238021356152e-06, + "loss": 0.73090732, + "num_input_tokens_seen": 121334220, + "step": 5651, + "time_per_iteration": 2.724022150039673 + }, + { + "auxiliary_loss_clip": 0.01035797, + "auxiliary_loss_mlp": 0.01037409, + "balance_loss_clip": 1.02843559, + "balance_loss_mlp": 1.02317524, + "epoch": 0.3398166240793627, + "flos": 16647001879680.0, + "grad_norm": 2.4087969082072203, + "language_loss": 0.81240928, + "learning_rate": 3.074795378203616e-06, + "loss": 0.83314133, + "num_input_tokens_seen": 121351870, + "step": 5652, + "time_per_iteration": 4.29085373878479 + }, + { + "auxiliary_loss_clip": 0.01090861, + "auxiliary_loss_mlp": 0.01041937, + "balance_loss_clip": 1.03447723, + "balance_loss_mlp": 1.02683306, + "epoch": 0.33987674733203066, + "flos": 24062196625920.0, + "grad_norm": 1.73116538296118, + "language_loss": 0.7746287, + "learning_rate": 3.0744669135152685e-06, + "loss": 0.79595667, + "num_input_tokens_seen": 121373400, + "step": 5653, + "time_per_iteration": 2.6002678871154785 + }, + { + "auxiliary_loss_clip": 0.01066824, + "auxiliary_loss_mlp": 0.01043749, + "balance_loss_clip": 1.02957535, + "balance_loss_mlp": 1.02831781, + "epoch": 0.3399368705846986, + "flos": 13250678279040.0, + "grad_norm": 2.438295335873668, + "language_loss": 0.85839075, + "learning_rate": 3.0741384080830278e-06, + "loss": 0.87949651, + "num_input_tokens_seen": 121385225, + "step": 5654, + "time_per_iteration": 2.653050661087036 + }, + { + "auxiliary_loss_clip": 0.0107363, + "auxiliary_loss_mlp": 0.01043148, + "balance_loss_clip": 1.03008735, + "balance_loss_mlp": 1.0290513, + "epoch": 0.3399969938373666, + "flos": 27012832272000.0, + "grad_norm": 1.9320036671684462, + "language_loss": 0.64822525, + "learning_rate": 3.073809861919351e-06, + "loss": 0.66939306, + "num_input_tokens_seen": 121404735, + "step": 5655, + "time_per_iteration": 4.424251079559326 + }, + { + "auxiliary_loss_clip": 0.01076279, + "auxiliary_loss_mlp": 0.01041341, + "balance_loss_clip": 1.03303766, + "balance_loss_mlp": 1.02792358, + "epoch": 0.34005711709003456, + "flos": 28550096588160.0, + "grad_norm": 1.492302003345987, + "language_loss": 0.76432395, + "learning_rate": 3.073481275036697e-06, + "loss": 0.78550017, + "num_input_tokens_seen": 121426780, + "step": 5656, + "time_per_iteration": 2.6758716106414795 + }, + { + "auxiliary_loss_clip": 0.01052822, + "auxiliary_loss_mlp": 0.01043713, + "balance_loss_clip": 1.03045964, + "balance_loss_mlp": 1.02933025, + "epoch": 0.3401172403427025, + "flos": 21617003208960.0, + "grad_norm": 1.6305478491706697, + "language_loss": 0.83163297, + "learning_rate": 3.073152647447525e-06, + "loss": 0.85259837, + "num_input_tokens_seen": 121447245, + "step": 5657, + "time_per_iteration": 2.7051568031311035 + }, + { + "auxiliary_loss_clip": 0.01064747, + "auxiliary_loss_mlp": 0.01042085, + "balance_loss_clip": 1.03287554, + "balance_loss_mlp": 1.02873397, + "epoch": 0.3401773635953705, + "flos": 25885776251520.0, + "grad_norm": 1.9890526279697534, + "language_loss": 0.85499889, + "learning_rate": 3.0728239791642976e-06, + "loss": 0.87606722, + "num_input_tokens_seen": 121468165, + "step": 5658, + "time_per_iteration": 2.897653818130493 + }, + { + "auxiliary_loss_clip": 0.01006915, + "auxiliary_loss_mlp": 0.01011665, + "balance_loss_clip": 1.00526333, + "balance_loss_mlp": 1.01019323, + "epoch": 0.3402374868480385, + "flos": 65507995336320.0, + "grad_norm": 0.8184094379996406, + "language_loss": 0.60039449, + "learning_rate": 3.072495270199477e-06, + "loss": 0.62058032, + "num_input_tokens_seen": 121523795, + "step": 5659, + "time_per_iteration": 3.1649811267852783 + }, + { + "auxiliary_loss_clip": 0.01085507, + "auxiliary_loss_mlp": 0.01034849, + "balance_loss_clip": 1.03457785, + "balance_loss_mlp": 1.02163482, + "epoch": 0.34029761010070647, + "flos": 24060580513920.0, + "grad_norm": 2.426748287435539, + "language_loss": 0.67975783, + "learning_rate": 3.0721665205655284e-06, + "loss": 0.70096147, + "num_input_tokens_seen": 121542950, + "step": 5660, + "time_per_iteration": 2.565734624862671 + }, + { + "auxiliary_loss_clip": 0.01089728, + "auxiliary_loss_mlp": 0.01043843, + "balance_loss_clip": 1.03564322, + "balance_loss_mlp": 1.0288887, + "epoch": 0.34035773335337444, + "flos": 27599720590080.0, + "grad_norm": 2.2973961222675596, + "language_loss": 0.67359799, + "learning_rate": 3.071837730274918e-06, + "loss": 0.69493371, + "num_input_tokens_seen": 121562765, + "step": 5661, + "time_per_iteration": 2.5989694595336914 + }, + { + "auxiliary_loss_clip": 0.01067594, + "auxiliary_loss_mlp": 0.01043455, + "balance_loss_clip": 1.03369439, + "balance_loss_mlp": 1.0297935, + "epoch": 0.3404178566060424, + "flos": 20812783651200.0, + "grad_norm": 2.3194792017061054, + "language_loss": 0.79126346, + "learning_rate": 3.071508899340113e-06, + "loss": 0.812374, + "num_input_tokens_seen": 121581610, + "step": 5662, + "time_per_iteration": 2.7336270809173584 + }, + { + "auxiliary_loss_clip": 0.01058626, + "auxiliary_loss_mlp": 0.01041525, + "balance_loss_clip": 1.03358889, + "balance_loss_mlp": 1.02613568, + "epoch": 0.34047797985871037, + "flos": 26833566470400.0, + "grad_norm": 2.8681332401402155, + "language_loss": 0.7324183, + "learning_rate": 3.0711800277735833e-06, + "loss": 0.75341982, + "num_input_tokens_seen": 121601885, + "step": 5663, + "time_per_iteration": 4.391671895980835 + }, + { + "auxiliary_loss_clip": 0.01044871, + "auxiliary_loss_mlp": 0.01038112, + "balance_loss_clip": 1.02829766, + "balance_loss_mlp": 1.02532125, + "epoch": 0.34053810311137833, + "flos": 19682639061120.0, + "grad_norm": 2.0809986806268426, + "language_loss": 0.86080289, + "learning_rate": 3.0708511155877997e-06, + "loss": 0.88163269, + "num_input_tokens_seen": 121621335, + "step": 5664, + "time_per_iteration": 4.572909116744995 + }, + { + "auxiliary_loss_clip": 0.01087072, + "auxiliary_loss_mlp": 0.01041792, + "balance_loss_clip": 1.03336024, + "balance_loss_mlp": 1.02870834, + "epoch": 0.3405982263640463, + "flos": 21725740656000.0, + "grad_norm": 2.1123571731228004, + "language_loss": 0.68525022, + "learning_rate": 3.070522162795235e-06, + "loss": 0.70653886, + "num_input_tokens_seen": 121641310, + "step": 5665, + "time_per_iteration": 2.618621349334717 + }, + { + "auxiliary_loss_clip": 0.01085513, + "auxiliary_loss_mlp": 0.01033951, + "balance_loss_clip": 1.03173232, + "balance_loss_mlp": 1.0192647, + "epoch": 0.34065834961671426, + "flos": 18041629288320.0, + "grad_norm": 3.1011837403022726, + "language_loss": 0.72878981, + "learning_rate": 3.0701931694083626e-06, + "loss": 0.7499845, + "num_input_tokens_seen": 121659625, + "step": 5666, + "time_per_iteration": 2.59718656539917 + }, + { + "auxiliary_loss_clip": 0.01078392, + "auxiliary_loss_mlp": 0.01038093, + "balance_loss_clip": 1.03276694, + "balance_loss_mlp": 1.02369308, + "epoch": 0.3407184728693822, + "flos": 21397337585280.0, + "grad_norm": 1.5522599261646262, + "language_loss": 0.733697, + "learning_rate": 3.0698641354396576e-06, + "loss": 0.75486183, + "num_input_tokens_seen": 121679205, + "step": 5667, + "time_per_iteration": 2.6788899898529053 + }, + { + "auxiliary_loss_clip": 0.01006029, + "auxiliary_loss_mlp": 0.01001868, + "balance_loss_clip": 1.0045836, + "balance_loss_mlp": 1.00008035, + "epoch": 0.3407785961220502, + "flos": 68688101018880.0, + "grad_norm": 0.8299480796481916, + "language_loss": 0.63343197, + "learning_rate": 3.069535060901597e-06, + "loss": 0.65351099, + "num_input_tokens_seen": 121751085, + "step": 5668, + "time_per_iteration": 3.3710410594940186 + }, + { + "auxiliary_loss_clip": 0.00990842, + "auxiliary_loss_mlp": 0.01045092, + "balance_loss_clip": 1.02568543, + "balance_loss_mlp": 1.03039324, + "epoch": 0.34083871937471816, + "flos": 14064379027200.0, + "grad_norm": 1.9381674283630728, + "language_loss": 0.72124481, + "learning_rate": 3.0692059458066596e-06, + "loss": 0.74160409, + "num_input_tokens_seen": 121768565, + "step": 5669, + "time_per_iteration": 3.0706052780151367 + }, + { + "auxiliary_loss_clip": 0.01057036, + "auxiliary_loss_mlp": 0.00747878, + "balance_loss_clip": 1.03284359, + "balance_loss_mlp": 1.00023305, + "epoch": 0.3408988426273861, + "flos": 17085435287040.0, + "grad_norm": 2.232882058472012, + "language_loss": 0.80471349, + "learning_rate": 3.0688767901673265e-06, + "loss": 0.82276273, + "num_input_tokens_seen": 121784925, + "step": 5670, + "time_per_iteration": 3.2329792976379395 + }, + { + "auxiliary_loss_clip": 0.01037802, + "auxiliary_loss_mlp": 0.01036702, + "balance_loss_clip": 1.02721786, + "balance_loss_mlp": 1.02218819, + "epoch": 0.3409589658800541, + "flos": 24024562151040.0, + "grad_norm": 1.730752689122073, + "language_loss": 0.77045876, + "learning_rate": 3.068547593996078e-06, + "loss": 0.79120374, + "num_input_tokens_seen": 121804425, + "step": 5671, + "time_per_iteration": 2.7193315029144287 + }, + { + "auxiliary_loss_clip": 0.01088874, + "auxiliary_loss_mlp": 0.00748015, + "balance_loss_clip": 1.03395998, + "balance_loss_mlp": 1.00025737, + "epoch": 0.34101908913272205, + "flos": 21142012734720.0, + "grad_norm": 1.8804355497785878, + "language_loss": 0.7368983, + "learning_rate": 3.0682183573053974e-06, + "loss": 0.7552672, + "num_input_tokens_seen": 121825145, + "step": 5672, + "time_per_iteration": 2.589621067047119 + }, + { + "auxiliary_loss_clip": 0.01069086, + "auxiliary_loss_mlp": 0.0103987, + "balance_loss_clip": 1.03032804, + "balance_loss_mlp": 1.02481973, + "epoch": 0.3410792123853901, + "flos": 15702012921600.0, + "grad_norm": 1.8300523977332077, + "language_loss": 0.73747969, + "learning_rate": 3.06788908010777e-06, + "loss": 0.75856924, + "num_input_tokens_seen": 121842185, + "step": 5673, + "time_per_iteration": 2.566671848297119 + }, + { + "auxiliary_loss_clip": 0.01076806, + "auxiliary_loss_mlp": 0.01037892, + "balance_loss_clip": 1.03328586, + "balance_loss_mlp": 1.02382541, + "epoch": 0.34113933563805804, + "flos": 23036012974080.0, + "grad_norm": 1.8432952352473089, + "language_loss": 0.79736578, + "learning_rate": 3.067559762415682e-06, + "loss": 0.81851286, + "num_input_tokens_seen": 121862260, + "step": 5674, + "time_per_iteration": 2.681011915206909 + }, + { + "auxiliary_loss_clip": 0.01016039, + "auxiliary_loss_mlp": 0.01002485, + "balance_loss_clip": 1.00472188, + "balance_loss_mlp": 1.00091195, + "epoch": 0.341199458890726, + "flos": 69614235336960.0, + "grad_norm": 0.7891585509662818, + "language_loss": 0.56060255, + "learning_rate": 3.0672304042416198e-06, + "loss": 0.58078778, + "num_input_tokens_seen": 121923560, + "step": 5675, + "time_per_iteration": 3.33091402053833 + }, + { + "auxiliary_loss_clip": 0.01067333, + "auxiliary_loss_mlp": 0.00747904, + "balance_loss_clip": 1.03423309, + "balance_loss_mlp": 1.00021613, + "epoch": 0.34125958214339397, + "flos": 22346348866560.0, + "grad_norm": 1.7161534723730791, + "language_loss": 0.79027283, + "learning_rate": 3.0669010055980734e-06, + "loss": 0.80842525, + "num_input_tokens_seen": 121943515, + "step": 5676, + "time_per_iteration": 2.6326398849487305 + }, + { + "auxiliary_loss_clip": 0.01066544, + "auxiliary_loss_mlp": 0.01036053, + "balance_loss_clip": 1.02755105, + "balance_loss_mlp": 1.02149725, + "epoch": 0.34131970539606193, + "flos": 21871933009920.0, + "grad_norm": 1.7583013774690999, + "language_loss": 0.85928464, + "learning_rate": 3.0665715664975357e-06, + "loss": 0.88031054, + "num_input_tokens_seen": 121962540, + "step": 5677, + "time_per_iteration": 2.6524038314819336 + }, + { + "auxiliary_loss_clip": 0.01062928, + "auxiliary_loss_mlp": 0.01042854, + "balance_loss_clip": 1.03001261, + "balance_loss_mlp": 1.02797639, + "epoch": 0.3413798286487299, + "flos": 24935723475840.0, + "grad_norm": 2.0275871176878244, + "language_loss": 0.78960192, + "learning_rate": 3.0662420869524966e-06, + "loss": 0.81065977, + "num_input_tokens_seen": 121979830, + "step": 5678, + "time_per_iteration": 2.7014248371124268 + }, + { + "auxiliary_loss_clip": 0.01075146, + "auxiliary_loss_mlp": 0.01033389, + "balance_loss_clip": 1.02938962, + "balance_loss_mlp": 1.01921499, + "epoch": 0.34143995190139786, + "flos": 25374372364800.0, + "grad_norm": 1.68776630434586, + "language_loss": 0.74787545, + "learning_rate": 3.0659125669754506e-06, + "loss": 0.76896077, + "num_input_tokens_seen": 121999055, + "step": 5679, + "time_per_iteration": 2.9971866607666016 + }, + { + "auxiliary_loss_clip": 0.01002979, + "auxiliary_loss_mlp": 0.01003213, + "balance_loss_clip": 1.00176275, + "balance_loss_mlp": 1.00154388, + "epoch": 0.34150007515406583, + "flos": 67782578129280.0, + "grad_norm": 0.7667189097874879, + "language_loss": 0.59454852, + "learning_rate": 3.0655830065788923e-06, + "loss": 0.61461043, + "num_input_tokens_seen": 122067015, + "step": 5680, + "time_per_iteration": 3.2534019947052 + }, + { + "auxiliary_loss_clip": 0.01060899, + "auxiliary_loss_mlp": 0.01033384, + "balance_loss_clip": 1.029989, + "balance_loss_mlp": 1.02022946, + "epoch": 0.3415601984067338, + "flos": 20302421258880.0, + "grad_norm": 1.7636940583551255, + "language_loss": 0.71870887, + "learning_rate": 3.0652534057753206e-06, + "loss": 0.73965174, + "num_input_tokens_seen": 122085295, + "step": 5681, + "time_per_iteration": 2.7261269092559814 + }, + { + "auxiliary_loss_clip": 0.01064474, + "auxiliary_loss_mlp": 0.01042346, + "balance_loss_clip": 1.03165615, + "balance_loss_mlp": 1.02837455, + "epoch": 0.34162032165940176, + "flos": 26031178506240.0, + "grad_norm": 2.9022672742785205, + "language_loss": 0.71007407, + "learning_rate": 3.064923764577233e-06, + "loss": 0.73114228, + "num_input_tokens_seen": 122104020, + "step": 5682, + "time_per_iteration": 2.654106378555298 + }, + { + "auxiliary_loss_clip": 0.01085224, + "auxiliary_loss_mlp": 0.01038123, + "balance_loss_clip": 1.03069758, + "balance_loss_mlp": 1.02337134, + "epoch": 0.3416804449120697, + "flos": 28803338449920.0, + "grad_norm": 2.549192144173338, + "language_loss": 0.84096193, + "learning_rate": 3.0645940829971295e-06, + "loss": 0.86219537, + "num_input_tokens_seen": 122125080, + "step": 5683, + "time_per_iteration": 2.5687899589538574 + }, + { + "auxiliary_loss_clip": 0.01064699, + "auxiliary_loss_mlp": 0.01047961, + "balance_loss_clip": 1.03032005, + "balance_loss_mlp": 1.03266609, + "epoch": 0.3417405681647377, + "flos": 22601601889920.0, + "grad_norm": 1.581397163585015, + "language_loss": 0.70543098, + "learning_rate": 3.0642643610475116e-06, + "loss": 0.72655761, + "num_input_tokens_seen": 122146350, + "step": 5684, + "time_per_iteration": 2.6286256313323975 + }, + { + "auxiliary_loss_clip": 0.01082848, + "auxiliary_loss_mlp": 0.01034686, + "balance_loss_clip": 1.03112948, + "balance_loss_mlp": 1.02135849, + "epoch": 0.34180069141740566, + "flos": 24716237420160.0, + "grad_norm": 1.3922148222746418, + "language_loss": 0.75201201, + "learning_rate": 3.0639345987408823e-06, + "loss": 0.77318734, + "num_input_tokens_seen": 122168085, + "step": 5685, + "time_per_iteration": 2.8105366230010986 + }, + { + "auxiliary_loss_clip": 0.01065034, + "auxiliary_loss_mlp": 0.01042868, + "balance_loss_clip": 1.02777171, + "balance_loss_mlp": 1.02855706, + "epoch": 0.3418608146700737, + "flos": 30518755246080.0, + "grad_norm": 1.6138489888009946, + "language_loss": 0.70517403, + "learning_rate": 3.0636047960897468e-06, + "loss": 0.72625309, + "num_input_tokens_seen": 122191040, + "step": 5686, + "time_per_iteration": 2.683544158935547 + }, + { + "auxiliary_loss_clip": 0.01073143, + "auxiliary_loss_mlp": 0.01042195, + "balance_loss_clip": 1.02968383, + "balance_loss_mlp": 1.02816367, + "epoch": 0.34192093792274164, + "flos": 15122343237120.0, + "grad_norm": 2.2171678076871117, + "language_loss": 0.77886689, + "learning_rate": 3.06327495310661e-06, + "loss": 0.80002022, + "num_input_tokens_seen": 122209225, + "step": 5687, + "time_per_iteration": 2.5606019496917725 + }, + { + "auxiliary_loss_clip": 0.01060874, + "auxiliary_loss_mlp": 0.01037857, + "balance_loss_clip": 1.03001595, + "balance_loss_mlp": 1.02355826, + "epoch": 0.3419810611754096, + "flos": 13187799521280.0, + "grad_norm": 1.8821848944556614, + "language_loss": 0.86758351, + "learning_rate": 3.062945069803981e-06, + "loss": 0.88857079, + "num_input_tokens_seen": 122226160, + "step": 5688, + "time_per_iteration": 2.563873291015625 + }, + { + "auxiliary_loss_clip": 0.0107247, + "auxiliary_loss_mlp": 0.01044128, + "balance_loss_clip": 1.03464532, + "balance_loss_mlp": 1.02840424, + "epoch": 0.34204118442807757, + "flos": 19536267139200.0, + "grad_norm": 1.8647544058880599, + "language_loss": 0.79347646, + "learning_rate": 3.0626151461943684e-06, + "loss": 0.81464243, + "num_input_tokens_seen": 122243115, + "step": 5689, + "time_per_iteration": 2.600770950317383 + }, + { + "auxiliary_loss_clip": 0.01078284, + "auxiliary_loss_mlp": 0.01039572, + "balance_loss_clip": 1.03180408, + "balance_loss_mlp": 1.02493894, + "epoch": 0.34210130768074554, + "flos": 15194846839680.0, + "grad_norm": 1.743698309513882, + "language_loss": 0.73548102, + "learning_rate": 3.0622851822902834e-06, + "loss": 0.75665957, + "num_input_tokens_seen": 122261105, + "step": 5690, + "time_per_iteration": 2.6023991107940674 + }, + { + "auxiliary_loss_clip": 0.01064426, + "auxiliary_loss_mlp": 0.01038448, + "balance_loss_clip": 1.02686381, + "balance_loss_mlp": 1.0238626, + "epoch": 0.3421614309334135, + "flos": 24936226266240.0, + "grad_norm": 1.9311110870195503, + "language_loss": 0.75558901, + "learning_rate": 3.061955178104237e-06, + "loss": 0.77661771, + "num_input_tokens_seen": 122279995, + "step": 5691, + "time_per_iteration": 2.725592613220215 + }, + { + "auxiliary_loss_clip": 0.01071586, + "auxiliary_loss_mlp": 0.01037162, + "balance_loss_clip": 1.03036499, + "balance_loss_mlp": 1.02448416, + "epoch": 0.34222155418608147, + "flos": 21908633731200.0, + "grad_norm": 1.586837255656538, + "language_loss": 0.68322319, + "learning_rate": 3.0616251336487447e-06, + "loss": 0.70431066, + "num_input_tokens_seen": 122299070, + "step": 5692, + "time_per_iteration": 2.83530330657959 + }, + { + "auxiliary_loss_clip": 0.01075717, + "auxiliary_loss_mlp": 0.01041993, + "balance_loss_clip": 1.03104532, + "balance_loss_mlp": 1.02657318, + "epoch": 0.34228167743874943, + "flos": 18114061063680.0, + "grad_norm": 2.399310361971391, + "language_loss": 0.7286126, + "learning_rate": 3.06129504893632e-06, + "loss": 0.74978971, + "num_input_tokens_seen": 122316800, + "step": 5693, + "time_per_iteration": 2.5944998264312744 + }, + { + "auxiliary_loss_clip": 0.01044548, + "auxiliary_loss_mlp": 0.01036979, + "balance_loss_clip": 1.02951705, + "balance_loss_mlp": 1.02322829, + "epoch": 0.3423418006914174, + "flos": 21288600138240.0, + "grad_norm": 1.784083113766595, + "language_loss": 0.75942445, + "learning_rate": 3.0609649239794813e-06, + "loss": 0.7802397, + "num_input_tokens_seen": 122335275, + "step": 5694, + "time_per_iteration": 2.753573417663574 + }, + { + "auxiliary_loss_clip": 0.0105019, + "auxiliary_loss_mlp": 0.01042335, + "balance_loss_clip": 1.02998495, + "balance_loss_mlp": 1.02940083, + "epoch": 0.34240192394408536, + "flos": 19823480288640.0, + "grad_norm": 1.762749252346713, + "language_loss": 0.79396415, + "learning_rate": 3.060634758790747e-06, + "loss": 0.81488943, + "num_input_tokens_seen": 122353215, + "step": 5695, + "time_per_iteration": 2.6877334117889404 + }, + { + "auxiliary_loss_clip": 0.01028757, + "auxiliary_loss_mlp": 0.01037347, + "balance_loss_clip": 1.02630973, + "balance_loss_mlp": 1.02384639, + "epoch": 0.3424620471967533, + "flos": 24535535074560.0, + "grad_norm": 2.2088608323820735, + "language_loss": 0.7359376, + "learning_rate": 3.060304553382635e-06, + "loss": 0.75659865, + "num_input_tokens_seen": 122372495, + "step": 5696, + "time_per_iteration": 2.785856008529663 + }, + { + "auxiliary_loss_clip": 0.01031478, + "auxiliary_loss_mlp": 0.01050174, + "balance_loss_clip": 1.0244298, + "balance_loss_mlp": 1.03440273, + "epoch": 0.3425221704494213, + "flos": 25848895962240.0, + "grad_norm": 1.7020473591964613, + "language_loss": 0.70820773, + "learning_rate": 3.0599743077676685e-06, + "loss": 0.72902429, + "num_input_tokens_seen": 122394600, + "step": 5697, + "time_per_iteration": 2.7620041370391846 + }, + { + "auxiliary_loss_clip": 0.01056992, + "auxiliary_loss_mlp": 0.01029043, + "balance_loss_clip": 1.02932882, + "balance_loss_mlp": 1.0155009, + "epoch": 0.34258229370208926, + "flos": 21540513196800.0, + "grad_norm": 1.8806435883463684, + "language_loss": 0.82117462, + "learning_rate": 3.05964402195837e-06, + "loss": 0.84203506, + "num_input_tokens_seen": 122414700, + "step": 5698, + "time_per_iteration": 2.7226221561431885 + }, + { + "auxiliary_loss_clip": 0.01024289, + "auxiliary_loss_mlp": 0.01052272, + "balance_loss_clip": 1.02754807, + "balance_loss_mlp": 1.03493941, + "epoch": 0.3426424169547573, + "flos": 23652778429440.0, + "grad_norm": 2.1308795962772358, + "language_loss": 0.68718863, + "learning_rate": 3.0593136959672645e-06, + "loss": 0.70795429, + "num_input_tokens_seen": 122432760, + "step": 5699, + "time_per_iteration": 4.427539348602295 + }, + { + "auxiliary_loss_clip": 0.01062043, + "auxiliary_loss_mlp": 0.01034409, + "balance_loss_clip": 1.03005576, + "balance_loss_mlp": 1.02107537, + "epoch": 0.34270254020742524, + "flos": 24644883052800.0, + "grad_norm": 1.9907164759524878, + "language_loss": 0.72375327, + "learning_rate": 3.058983329806877e-06, + "loss": 0.74471778, + "num_input_tokens_seen": 122449105, + "step": 5700, + "time_per_iteration": 2.6250851154327393 + }, + { + "auxiliary_loss_clip": 0.01062384, + "auxiliary_loss_mlp": 0.01035334, + "balance_loss_clip": 1.03224802, + "balance_loss_mlp": 1.02167249, + "epoch": 0.3427626634600932, + "flos": 20996754134400.0, + "grad_norm": 2.360168557426399, + "language_loss": 0.81821507, + "learning_rate": 3.0586529234897354e-06, + "loss": 0.83919227, + "num_input_tokens_seen": 122468700, + "step": 5701, + "time_per_iteration": 2.6043994426727295 + }, + { + "auxiliary_loss_clip": 0.01074602, + "auxiliary_loss_mlp": 0.01035174, + "balance_loss_clip": 1.03341711, + "balance_loss_mlp": 1.02169716, + "epoch": 0.3428227867127612, + "flos": 21433786911360.0, + "grad_norm": 1.8214764039317164, + "language_loss": 0.70921493, + "learning_rate": 3.0583224770283694e-06, + "loss": 0.73031265, + "num_input_tokens_seen": 122488160, + "step": 5702, + "time_per_iteration": 2.566957950592041 + }, + { + "auxiliary_loss_clip": 0.00995804, + "auxiliary_loss_mlp": 0.01008556, + "balance_loss_clip": 1.00517356, + "balance_loss_mlp": 1.00708342, + "epoch": 0.34288290996542914, + "flos": 55731782695680.0, + "grad_norm": 0.7756310804316039, + "language_loss": 0.57439536, + "learning_rate": 3.057991990435309e-06, + "loss": 0.59443897, + "num_input_tokens_seen": 122542890, + "step": 5703, + "time_per_iteration": 4.963779449462891 + }, + { + "auxiliary_loss_clip": 0.01068757, + "auxiliary_loss_mlp": 0.01038655, + "balance_loss_clip": 1.02979708, + "balance_loss_mlp": 1.0235033, + "epoch": 0.3429430332180971, + "flos": 20156803522560.0, + "grad_norm": 1.7260367077340473, + "language_loss": 0.74849832, + "learning_rate": 3.057661463723086e-06, + "loss": 0.76957238, + "num_input_tokens_seen": 122561770, + "step": 5704, + "time_per_iteration": 2.749267339706421 + }, + { + "auxiliary_loss_clip": 0.01048273, + "auxiliary_loss_mlp": 0.01033064, + "balance_loss_clip": 1.02905869, + "balance_loss_mlp": 1.02051759, + "epoch": 0.34300315647076507, + "flos": 17965857548160.0, + "grad_norm": 2.1558144518712474, + "language_loss": 0.72986239, + "learning_rate": 3.0573308969042346e-06, + "loss": 0.75067574, + "num_input_tokens_seen": 122580580, + "step": 5705, + "time_per_iteration": 2.62605357170105 + }, + { + "auxiliary_loss_clip": 0.01043981, + "auxiliary_loss_mlp": 0.01032647, + "balance_loss_clip": 1.03083897, + "balance_loss_mlp": 1.01898527, + "epoch": 0.34306327972343303, + "flos": 22086822124800.0, + "grad_norm": 2.0522158015546497, + "language_loss": 0.79638261, + "learning_rate": 3.057000289991289e-06, + "loss": 0.81714892, + "num_input_tokens_seen": 122599810, + "step": 5706, + "time_per_iteration": 2.681013584136963 + }, + { + "auxiliary_loss_clip": 0.01070145, + "auxiliary_loss_mlp": 0.010359, + "balance_loss_clip": 1.03347015, + "balance_loss_mlp": 1.02097487, + "epoch": 0.343123402976101, + "flos": 18442679616000.0, + "grad_norm": 2.0990605854120994, + "language_loss": 0.82668638, + "learning_rate": 3.056669642996787e-06, + "loss": 0.84774685, + "num_input_tokens_seen": 122616035, + "step": 5707, + "time_per_iteration": 2.768002986907959 + }, + { + "auxiliary_loss_clip": 0.01075475, + "auxiliary_loss_mlp": 0.01036107, + "balance_loss_clip": 1.03183031, + "balance_loss_mlp": 1.02193916, + "epoch": 0.34318352622876896, + "flos": 17163685065600.0, + "grad_norm": 1.6083716098772702, + "language_loss": 0.75103176, + "learning_rate": 3.056338955933266e-06, + "loss": 0.77214754, + "num_input_tokens_seen": 122633785, + "step": 5708, + "time_per_iteration": 2.621302604675293 + }, + { + "auxiliary_loss_clip": 0.01052917, + "auxiliary_loss_mlp": 0.01037935, + "balance_loss_clip": 1.02823329, + "balance_loss_mlp": 1.02380824, + "epoch": 0.34324364948143693, + "flos": 26688164215680.0, + "grad_norm": 1.6437244322460522, + "language_loss": 0.81030703, + "learning_rate": 3.0560082288132662e-06, + "loss": 0.83121562, + "num_input_tokens_seen": 122652100, + "step": 5709, + "time_per_iteration": 2.738079071044922 + }, + { + "auxiliary_loss_clip": 0.01064814, + "auxiliary_loss_mlp": 0.01039022, + "balance_loss_clip": 1.03224802, + "balance_loss_mlp": 1.02392435, + "epoch": 0.3433037727341049, + "flos": 21251576194560.0, + "grad_norm": 3.0081327077167197, + "language_loss": 0.78853381, + "learning_rate": 3.055677461649329e-06, + "loss": 0.80957216, + "num_input_tokens_seen": 122669720, + "step": 5710, + "time_per_iteration": 2.6534194946289062 + }, + { + "auxiliary_loss_clip": 0.01074838, + "auxiliary_loss_mlp": 0.01037714, + "balance_loss_clip": 1.02945495, + "balance_loss_mlp": 1.0224967, + "epoch": 0.34336389598677286, + "flos": 20629423699200.0, + "grad_norm": 2.054694018632251, + "language_loss": 0.7004205, + "learning_rate": 3.055346654453996e-06, + "loss": 0.72154593, + "num_input_tokens_seen": 122688715, + "step": 5711, + "time_per_iteration": 5.7852067947387695 + }, + { + "auxiliary_loss_clip": 0.01045169, + "auxiliary_loss_mlp": 0.00748088, + "balance_loss_clip": 1.02650917, + "balance_loss_mlp": 1.00018835, + "epoch": 0.3434240192394409, + "flos": 14538579402240.0, + "grad_norm": 1.917619403550593, + "language_loss": 0.67466533, + "learning_rate": 3.055015807239812e-06, + "loss": 0.69259793, + "num_input_tokens_seen": 122706970, + "step": 5712, + "time_per_iteration": 2.6271374225616455 + }, + { + "auxiliary_loss_clip": 0.00984465, + "auxiliary_loss_mlp": 0.01004586, + "balance_loss_clip": 1.00395513, + "balance_loss_mlp": 1.00298822, + "epoch": 0.34348414249210885, + "flos": 58051538841600.0, + "grad_norm": 0.8520019438545742, + "language_loss": 0.58091706, + "learning_rate": 3.0546849200193226e-06, + "loss": 0.60080761, + "num_input_tokens_seen": 122758095, + "step": 5713, + "time_per_iteration": 3.183643102645874 + }, + { + "auxiliary_loss_clip": 0.01083867, + "auxiliary_loss_mlp": 0.01039793, + "balance_loss_clip": 1.03134656, + "balance_loss_mlp": 1.02626836, + "epoch": 0.3435442657447768, + "flos": 20704441253760.0, + "grad_norm": 1.6369951845369164, + "language_loss": 0.80670369, + "learning_rate": 3.054353992805076e-06, + "loss": 0.82794029, + "num_input_tokens_seen": 122777815, + "step": 5714, + "time_per_iteration": 2.5778467655181885 + }, + { + "auxiliary_loss_clip": 0.01084772, + "auxiliary_loss_mlp": 0.0103803, + "balance_loss_clip": 1.03223419, + "balance_loss_mlp": 1.02348638, + "epoch": 0.3436043889974448, + "flos": 22930256355840.0, + "grad_norm": 1.8115500844423462, + "language_loss": 0.72199535, + "learning_rate": 3.05402302560962e-06, + "loss": 0.74322331, + "num_input_tokens_seen": 122797555, + "step": 5715, + "time_per_iteration": 2.6393039226531982 + }, + { + "auxiliary_loss_clip": 0.0101232, + "auxiliary_loss_mlp": 0.01004846, + "balance_loss_clip": 1.01202345, + "balance_loss_mlp": 1.00272417, + "epoch": 0.34366451225011274, + "flos": 58403285752320.0, + "grad_norm": 0.9005879423336698, + "language_loss": 0.65920699, + "learning_rate": 3.053692018445505e-06, + "loss": 0.67937863, + "num_input_tokens_seen": 122863955, + "step": 5716, + "time_per_iteration": 3.3017849922180176 + }, + { + "auxiliary_loss_clip": 0.01067727, + "auxiliary_loss_mlp": 0.01040569, + "balance_loss_clip": 1.03169227, + "balance_loss_mlp": 1.02668071, + "epoch": 0.3437246355027807, + "flos": 15596292216960.0, + "grad_norm": 1.9505904650088581, + "language_loss": 0.74914491, + "learning_rate": 3.0533609713252838e-06, + "loss": 0.77022779, + "num_input_tokens_seen": 122883000, + "step": 5717, + "time_per_iteration": 2.793792963027954 + }, + { + "auxiliary_loss_clip": 0.01042658, + "auxiliary_loss_mlp": 0.01041893, + "balance_loss_clip": 1.03216588, + "balance_loss_mlp": 1.02825594, + "epoch": 0.34378475875544867, + "flos": 27672260106240.0, + "grad_norm": 2.2005739424116237, + "language_loss": 0.75187051, + "learning_rate": 3.0530298842615077e-06, + "loss": 0.77271599, + "num_input_tokens_seen": 122903265, + "step": 5718, + "time_per_iteration": 2.9570093154907227 + }, + { + "auxiliary_loss_clip": 0.0105443, + "auxiliary_loss_mlp": 0.01044898, + "balance_loss_clip": 1.03211415, + "balance_loss_mlp": 1.02989531, + "epoch": 0.34384488200811664, + "flos": 31431496769280.0, + "grad_norm": 1.809684457774984, + "language_loss": 0.64198339, + "learning_rate": 3.052698757266734e-06, + "loss": 0.66297662, + "num_input_tokens_seen": 122923860, + "step": 5719, + "time_per_iteration": 2.8499677181243896 + }, + { + "auxiliary_loss_clip": 0.01039555, + "auxiliary_loss_mlp": 0.01037435, + "balance_loss_clip": 1.0285368, + "balance_loss_mlp": 1.02149689, + "epoch": 0.3439050052607846, + "flos": 24899920594560.0, + "grad_norm": 1.8465658286302167, + "language_loss": 0.73785311, + "learning_rate": 3.0523675903535183e-06, + "loss": 0.758623, + "num_input_tokens_seen": 122945305, + "step": 5720, + "time_per_iteration": 2.9997925758361816 + }, + { + "auxiliary_loss_clip": 0.01065958, + "auxiliary_loss_mlp": 0.01043384, + "balance_loss_clip": 1.02928722, + "balance_loss_mlp": 1.02700496, + "epoch": 0.34396512851345257, + "flos": 18150079426560.0, + "grad_norm": 1.7502298189711722, + "language_loss": 0.74147737, + "learning_rate": 3.0520363835344173e-06, + "loss": 0.76257074, + "num_input_tokens_seen": 122962535, + "step": 5721, + "time_per_iteration": 2.764139175415039 + }, + { + "auxiliary_loss_clip": 0.01057737, + "auxiliary_loss_mlp": 0.00748049, + "balance_loss_clip": 1.02986407, + "balance_loss_mlp": 1.00030446, + "epoch": 0.34402525176612053, + "flos": 16034438315520.0, + "grad_norm": 2.3722892746480495, + "language_loss": 0.80462217, + "learning_rate": 3.051705136821992e-06, + "loss": 0.82268, + "num_input_tokens_seen": 122979750, + "step": 5722, + "time_per_iteration": 2.7914884090423584 + }, + { + "auxiliary_loss_clip": 0.01037003, + "auxiliary_loss_mlp": 0.0103602, + "balance_loss_clip": 1.02908945, + "balance_loss_mlp": 1.02280593, + "epoch": 0.3440853750187885, + "flos": 21178641628800.0, + "grad_norm": 1.5357090417133163, + "language_loss": 0.81710488, + "learning_rate": 3.051373850228801e-06, + "loss": 0.83783507, + "num_input_tokens_seen": 122998955, + "step": 5723, + "time_per_iteration": 2.8570146560668945 + }, + { + "auxiliary_loss_clip": 0.01044548, + "auxiliary_loss_mlp": 0.01048786, + "balance_loss_clip": 1.0276382, + "balance_loss_mlp": 1.03349137, + "epoch": 0.34414549827145646, + "flos": 12677868092160.0, + "grad_norm": 1.9084855825822322, + "language_loss": 0.81081235, + "learning_rate": 3.0510425237674096e-06, + "loss": 0.83174574, + "num_input_tokens_seen": 123016165, + "step": 5724, + "time_per_iteration": 2.821989059448242 + }, + { + "auxiliary_loss_clip": 0.01054584, + "auxiliary_loss_mlp": 0.01037749, + "balance_loss_clip": 1.02825558, + "balance_loss_mlp": 1.02239442, + "epoch": 0.3442056215241244, + "flos": 31284514316160.0, + "grad_norm": 3.661706149596834, + "language_loss": 0.69184816, + "learning_rate": 3.05071115745038e-06, + "loss": 0.71277148, + "num_input_tokens_seen": 123036900, + "step": 5725, + "time_per_iteration": 2.831087350845337 + }, + { + "auxiliary_loss_clip": 0.01078826, + "auxiliary_loss_mlp": 0.01041584, + "balance_loss_clip": 1.0320996, + "balance_loss_mlp": 1.02572322, + "epoch": 0.34426574477679245, + "flos": 23367289132800.0, + "grad_norm": 1.5963129081428327, + "language_loss": 0.69233346, + "learning_rate": 3.0503797512902773e-06, + "loss": 0.71353757, + "num_input_tokens_seen": 123057480, + "step": 5726, + "time_per_iteration": 2.7649765014648438 + }, + { + "auxiliary_loss_clip": 0.01058626, + "auxiliary_loss_mlp": 0.01033732, + "balance_loss_clip": 1.03478622, + "balance_loss_mlp": 1.02051818, + "epoch": 0.3443258680294604, + "flos": 24535427333760.0, + "grad_norm": 1.6292393757131827, + "language_loss": 0.73380363, + "learning_rate": 3.0500483052996703e-06, + "loss": 0.75472713, + "num_input_tokens_seen": 123076890, + "step": 5727, + "time_per_iteration": 2.846245527267456 + }, + { + "auxiliary_loss_clip": 0.01041302, + "auxiliary_loss_mlp": 0.01041026, + "balance_loss_clip": 1.02730179, + "balance_loss_mlp": 1.0259223, + "epoch": 0.3443859912821284, + "flos": 20230133137920.0, + "grad_norm": 1.7880509648359435, + "language_loss": 0.88231146, + "learning_rate": 3.0497168194911257e-06, + "loss": 0.90313476, + "num_input_tokens_seen": 123092530, + "step": 5728, + "time_per_iteration": 2.7021090984344482 + }, + { + "auxiliary_loss_clip": 0.01030209, + "auxiliary_loss_mlp": 0.01040275, + "balance_loss_clip": 1.02591157, + "balance_loss_mlp": 1.0262922, + "epoch": 0.34444611453479634, + "flos": 24316515895680.0, + "grad_norm": 2.124380769540587, + "language_loss": 0.6994797, + "learning_rate": 3.0493852938772143e-06, + "loss": 0.7201845, + "num_input_tokens_seen": 123110560, + "step": 5729, + "time_per_iteration": 2.782843828201294 + }, + { + "auxiliary_loss_clip": 0.01072339, + "auxiliary_loss_mlp": 0.01031735, + "balance_loss_clip": 1.02995229, + "balance_loss_mlp": 1.01729906, + "epoch": 0.3445062377874643, + "flos": 16983413683200.0, + "grad_norm": 1.8714219708696993, + "language_loss": 0.74129486, + "learning_rate": 3.0490537284705078e-06, + "loss": 0.76233554, + "num_input_tokens_seen": 123128655, + "step": 5730, + "time_per_iteration": 2.728522539138794 + }, + { + "auxiliary_loss_clip": 0.01042155, + "auxiliary_loss_mlp": 0.01044797, + "balance_loss_clip": 1.0261786, + "balance_loss_mlp": 1.02937126, + "epoch": 0.3445663610401323, + "flos": 20302708567680.0, + "grad_norm": 3.000486445210636, + "language_loss": 0.80118316, + "learning_rate": 3.048722123283578e-06, + "loss": 0.82205272, + "num_input_tokens_seen": 123145130, + "step": 5731, + "time_per_iteration": 2.654306411743164 + }, + { + "auxiliary_loss_clip": 0.01072673, + "auxiliary_loss_mlp": 0.01037958, + "balance_loss_clip": 1.03007436, + "balance_loss_mlp": 1.02401102, + "epoch": 0.34462648429280024, + "flos": 15888102307200.0, + "grad_norm": 1.876105089204821, + "language_loss": 0.78532881, + "learning_rate": 3.0483904783290006e-06, + "loss": 0.80643511, + "num_input_tokens_seen": 123162265, + "step": 5732, + "time_per_iteration": 2.607564926147461 + }, + { + "auxiliary_loss_clip": 0.00984693, + "auxiliary_loss_mlp": 0.01006678, + "balance_loss_clip": 1.00425267, + "balance_loss_mlp": 1.00497341, + "epoch": 0.3446866075454682, + "flos": 59311035285120.0, + "grad_norm": 0.7425058567348111, + "language_loss": 0.53510702, + "learning_rate": 3.0480587936193505e-06, + "loss": 0.55502069, + "num_input_tokens_seen": 123218620, + "step": 5733, + "time_per_iteration": 3.248748779296875 + }, + { + "auxiliary_loss_clip": 0.0106418, + "auxiliary_loss_mlp": 0.01038571, + "balance_loss_clip": 1.03012192, + "balance_loss_mlp": 1.0239197, + "epoch": 0.34474673079813617, + "flos": 22343799000960.0, + "grad_norm": 2.0654859410578195, + "language_loss": 0.83336931, + "learning_rate": 3.047727069167207e-06, + "loss": 0.85439688, + "num_input_tokens_seen": 123237325, + "step": 5734, + "time_per_iteration": 2.6531968116760254 + }, + { + "auxiliary_loss_clip": 0.01062311, + "auxiliary_loss_mlp": 0.01032159, + "balance_loss_clip": 1.02954364, + "balance_loss_mlp": 1.01830697, + "epoch": 0.34480685405080413, + "flos": 27670141203840.0, + "grad_norm": 1.9447744101602276, + "language_loss": 0.92465389, + "learning_rate": 3.0473953049851478e-06, + "loss": 0.94559854, + "num_input_tokens_seen": 123258650, + "step": 5735, + "time_per_iteration": 2.703562021255493 + }, + { + "auxiliary_loss_clip": 0.01053644, + "auxiliary_loss_mlp": 0.01037409, + "balance_loss_clip": 1.03701639, + "balance_loss_mlp": 1.02205503, + "epoch": 0.3448669773034721, + "flos": 22456020067200.0, + "grad_norm": 1.6420124957717552, + "language_loss": 0.76528102, + "learning_rate": 3.0470635010857533e-06, + "loss": 0.78619146, + "num_input_tokens_seen": 123277155, + "step": 5736, + "time_per_iteration": 2.963242292404175 + }, + { + "auxiliary_loss_clip": 0.01068889, + "auxiliary_loss_mlp": 0.01039924, + "balance_loss_clip": 1.03353822, + "balance_loss_mlp": 1.02518964, + "epoch": 0.34492710055614006, + "flos": 24936190352640.0, + "grad_norm": 1.518060993599022, + "language_loss": 0.78417838, + "learning_rate": 3.0467316574816064e-06, + "loss": 0.8052665, + "num_input_tokens_seen": 123297640, + "step": 5737, + "time_per_iteration": 2.7570364475250244 + }, + { + "auxiliary_loss_clip": 0.01024788, + "auxiliary_loss_mlp": 0.01042254, + "balance_loss_clip": 1.02477193, + "balance_loss_mlp": 1.02457476, + "epoch": 0.34498722380880803, + "flos": 20120821073280.0, + "grad_norm": 2.040863148285123, + "language_loss": 0.71508873, + "learning_rate": 3.0463997741852893e-06, + "loss": 0.73575914, + "num_input_tokens_seen": 123314370, + "step": 5738, + "time_per_iteration": 2.9265940189361572 + }, + { + "auxiliary_loss_clip": 0.01041987, + "auxiliary_loss_mlp": 0.0103959, + "balance_loss_clip": 1.02624476, + "balance_loss_mlp": 1.02386642, + "epoch": 0.34504734706147605, + "flos": 28438126917120.0, + "grad_norm": 1.8319983358263312, + "language_loss": 0.82036507, + "learning_rate": 3.046067851209389e-06, + "loss": 0.8411808, + "num_input_tokens_seen": 123336085, + "step": 5739, + "time_per_iteration": 2.834165096282959 + }, + { + "auxiliary_loss_clip": 0.01053059, + "auxiliary_loss_mlp": 0.01038194, + "balance_loss_clip": 1.0309639, + "balance_loss_mlp": 1.02316141, + "epoch": 0.345107470314144, + "flos": 22674464628480.0, + "grad_norm": 1.9515397200486664, + "language_loss": 0.82474691, + "learning_rate": 3.0457358885664898e-06, + "loss": 0.84565949, + "num_input_tokens_seen": 123354460, + "step": 5740, + "time_per_iteration": 2.7079617977142334 + }, + { + "auxiliary_loss_clip": 0.0107511, + "auxiliary_loss_mlp": 0.01035822, + "balance_loss_clip": 1.03201151, + "balance_loss_mlp": 1.02022886, + "epoch": 0.345167593566812, + "flos": 20630716588800.0, + "grad_norm": 2.183479035481436, + "language_loss": 0.7680614, + "learning_rate": 3.045403886269181e-06, + "loss": 0.78917074, + "num_input_tokens_seen": 123373420, + "step": 5741, + "time_per_iteration": 2.694580078125 + }, + { + "auxiliary_loss_clip": 0.01063339, + "auxiliary_loss_mlp": 0.01034699, + "balance_loss_clip": 1.02893507, + "balance_loss_mlp": 1.01981592, + "epoch": 0.34522771681947995, + "flos": 26214358890240.0, + "grad_norm": 1.6951817996883305, + "language_loss": 0.76970398, + "learning_rate": 3.045071844330053e-06, + "loss": 0.79068434, + "num_input_tokens_seen": 123394730, + "step": 5742, + "time_per_iteration": 2.677931308746338 + }, + { + "auxiliary_loss_clip": 0.01074679, + "auxiliary_loss_mlp": 0.01039583, + "balance_loss_clip": 1.03017116, + "balance_loss_mlp": 1.02483702, + "epoch": 0.3452878400721479, + "flos": 19062354072960.0, + "grad_norm": 1.842368771230934, + "language_loss": 0.76089716, + "learning_rate": 3.0447397627616955e-06, + "loss": 0.78203976, + "num_input_tokens_seen": 123412895, + "step": 5743, + "time_per_iteration": 2.614016532897949 + }, + { + "auxiliary_loss_clip": 0.01072462, + "auxiliary_loss_mlp": 0.01039061, + "balance_loss_clip": 1.03018975, + "balance_loss_mlp": 1.02523851, + "epoch": 0.3453479633248159, + "flos": 27929739772800.0, + "grad_norm": 1.690366876959169, + "language_loss": 0.70344293, + "learning_rate": 3.0444076415767016e-06, + "loss": 0.72455812, + "num_input_tokens_seen": 123432320, + "step": 5744, + "time_per_iteration": 2.64705228805542 + }, + { + "auxiliary_loss_clip": 0.01081598, + "auxiliary_loss_mlp": 0.01038608, + "balance_loss_clip": 1.03052568, + "balance_loss_mlp": 1.024261, + "epoch": 0.34540808657748384, + "flos": 19606113135360.0, + "grad_norm": 3.6012304442972476, + "language_loss": 0.79710805, + "learning_rate": 3.044075480787665e-06, + "loss": 0.81831014, + "num_input_tokens_seen": 123450980, + "step": 5745, + "time_per_iteration": 2.598642110824585 + }, + { + "auxiliary_loss_clip": 0.01044624, + "auxiliary_loss_mlp": 0.01039099, + "balance_loss_clip": 1.0306778, + "balance_loss_mlp": 1.02411449, + "epoch": 0.3454682098301518, + "flos": 20411661496320.0, + "grad_norm": 1.6784908676874686, + "language_loss": 0.88998449, + "learning_rate": 3.043743280407182e-06, + "loss": 0.91082168, + "num_input_tokens_seen": 123469365, + "step": 5746, + "time_per_iteration": 4.20418381690979 + }, + { + "auxiliary_loss_clip": 0.01075765, + "auxiliary_loss_mlp": 0.01040776, + "balance_loss_clip": 1.03003168, + "balance_loss_mlp": 1.02533877, + "epoch": 0.34552833308281977, + "flos": 21325121291520.0, + "grad_norm": 1.8670729922991274, + "language_loss": 0.64489555, + "learning_rate": 3.043411040447849e-06, + "loss": 0.66606092, + "num_input_tokens_seen": 123489425, + "step": 5747, + "time_per_iteration": 2.685582160949707 + }, + { + "auxiliary_loss_clip": 0.01062784, + "auxiliary_loss_mlp": 0.01036569, + "balance_loss_clip": 1.02986169, + "balance_loss_mlp": 1.02305079, + "epoch": 0.34558845633548774, + "flos": 36243633824640.0, + "grad_norm": 1.5318102711658501, + "language_loss": 0.72574842, + "learning_rate": 3.043078760922264e-06, + "loss": 0.74674189, + "num_input_tokens_seen": 123509970, + "step": 5748, + "time_per_iteration": 2.7386295795440674 + }, + { + "auxiliary_loss_clip": 0.0103482, + "auxiliary_loss_mlp": 0.01035444, + "balance_loss_clip": 1.02922726, + "balance_loss_mlp": 1.0223012, + "epoch": 0.3456485795881557, + "flos": 22450561200000.0, + "grad_norm": 1.6361747142172398, + "language_loss": 0.75692219, + "learning_rate": 3.042746441843029e-06, + "loss": 0.77762485, + "num_input_tokens_seen": 123531055, + "step": 5749, + "time_per_iteration": 2.783949613571167 + }, + { + "auxiliary_loss_clip": 0.01004314, + "auxiliary_loss_mlp": 0.01002464, + "balance_loss_clip": 1.01272154, + "balance_loss_mlp": 1.00069988, + "epoch": 0.34570870284082367, + "flos": 62004299005440.0, + "grad_norm": 0.8724567201060678, + "language_loss": 0.62661129, + "learning_rate": 3.0424140832227437e-06, + "loss": 0.6466791, + "num_input_tokens_seen": 123584720, + "step": 5750, + "time_per_iteration": 4.763587713241577 + }, + { + "auxiliary_loss_clip": 0.01060232, + "auxiliary_loss_mlp": 0.01030035, + "balance_loss_clip": 1.03049457, + "balance_loss_mlp": 1.01650441, + "epoch": 0.34576882609349163, + "flos": 22782196494720.0, + "grad_norm": 1.541816698775158, + "language_loss": 0.807818, + "learning_rate": 3.042081685074012e-06, + "loss": 0.82872069, + "num_input_tokens_seen": 123604465, + "step": 5751, + "time_per_iteration": 2.8207504749298096 + }, + { + "auxiliary_loss_clip": 0.01081742, + "auxiliary_loss_mlp": 0.01043797, + "balance_loss_clip": 1.03114319, + "balance_loss_mlp": 1.03035021, + "epoch": 0.34582894934615965, + "flos": 12348818576640.0, + "grad_norm": 2.1780065221274936, + "language_loss": 0.83969939, + "learning_rate": 3.041749247409439e-06, + "loss": 0.86095476, + "num_input_tokens_seen": 123622320, + "step": 5752, + "time_per_iteration": 2.8346810340881348 + }, + { + "auxiliary_loss_clip": 0.00995102, + "auxiliary_loss_mlp": 0.00746923, + "balance_loss_clip": 1.00439608, + "balance_loss_mlp": 1.00046849, + "epoch": 0.3458890725988276, + "flos": 70167691071360.0, + "grad_norm": 0.7352041479404574, + "language_loss": 0.63157547, + "learning_rate": 3.0414167702416296e-06, + "loss": 0.64899576, + "num_input_tokens_seen": 123678010, + "step": 5753, + "time_per_iteration": 3.2836062908172607 + }, + { + "auxiliary_loss_clip": 0.01062769, + "auxiliary_loss_mlp": 0.01037185, + "balance_loss_clip": 1.03100252, + "balance_loss_mlp": 1.02309442, + "epoch": 0.3459491958514956, + "flos": 17092582093440.0, + "grad_norm": 1.7077923858602273, + "language_loss": 0.70951009, + "learning_rate": 3.0410842535831914e-06, + "loss": 0.73050964, + "num_input_tokens_seen": 123696830, + "step": 5754, + "time_per_iteration": 2.723325729370117 + }, + { + "auxiliary_loss_clip": 0.01075426, + "auxiliary_loss_mlp": 0.01033763, + "balance_loss_clip": 1.03246117, + "balance_loss_mlp": 1.01920116, + "epoch": 0.34600931910416355, + "flos": 16650952375680.0, + "grad_norm": 1.6880276465687931, + "language_loss": 0.73100621, + "learning_rate": 3.0407516974467343e-06, + "loss": 0.75209808, + "num_input_tokens_seen": 123714360, + "step": 5755, + "time_per_iteration": 2.756197214126587 + }, + { + "auxiliary_loss_clip": 0.0106894, + "auxiliary_loss_mlp": 0.01033959, + "balance_loss_clip": 1.02833319, + "balance_loss_mlp": 1.0199337, + "epoch": 0.3460694423568315, + "flos": 38546190334080.0, + "grad_norm": 2.0057211366214664, + "language_loss": 0.72433192, + "learning_rate": 3.040419101844869e-06, + "loss": 0.74536091, + "num_input_tokens_seen": 123739250, + "step": 5756, + "time_per_iteration": 2.7449429035186768 + }, + { + "auxiliary_loss_clip": 0.01005075, + "auxiliary_loss_mlp": 0.01012635, + "balance_loss_clip": 1.00453389, + "balance_loss_mlp": 1.01099038, + "epoch": 0.3461295656094995, + "flos": 72081479704320.0, + "grad_norm": 0.727604262844275, + "language_loss": 0.62568629, + "learning_rate": 3.040086466790207e-06, + "loss": 0.64586341, + "num_input_tokens_seen": 123802845, + "step": 5757, + "time_per_iteration": 3.2009594440460205 + }, + { + "auxiliary_loss_clip": 0.00986088, + "auxiliary_loss_mlp": 0.00746901, + "balance_loss_clip": 1.00477481, + "balance_loss_mlp": 1.00042975, + "epoch": 0.34618968886216744, + "flos": 65460089571840.0, + "grad_norm": 0.826445835300437, + "language_loss": 0.59278071, + "learning_rate": 3.039753792295362e-06, + "loss": 0.61011064, + "num_input_tokens_seen": 123861805, + "step": 5758, + "time_per_iteration": 6.380641222000122 + }, + { + "auxiliary_loss_clip": 0.01071526, + "auxiliary_loss_mlp": 0.01038208, + "balance_loss_clip": 1.03660798, + "balance_loss_mlp": 1.02473128, + "epoch": 0.3462498121148354, + "flos": 23472542960640.0, + "grad_norm": 1.8599691446053306, + "language_loss": 0.71823967, + "learning_rate": 3.0394210783729487e-06, + "loss": 0.73933697, + "num_input_tokens_seen": 123881820, + "step": 5759, + "time_per_iteration": 2.6818201541900635 + }, + { + "auxiliary_loss_clip": 0.01031801, + "auxiliary_loss_mlp": 0.0104942, + "balance_loss_clip": 1.02711248, + "balance_loss_mlp": 1.03364825, + "epoch": 0.3463099353675034, + "flos": 24170790418560.0, + "grad_norm": 2.1512393249601756, + "language_loss": 0.83183694, + "learning_rate": 3.0390883250355836e-06, + "loss": 0.85264915, + "num_input_tokens_seen": 123903700, + "step": 5760, + "time_per_iteration": 2.875976800918579 + }, + { + "auxiliary_loss_clip": 0.00984338, + "auxiliary_loss_mlp": 0.01001528, + "balance_loss_clip": 1.00321484, + "balance_loss_mlp": 0.99983484, + "epoch": 0.34637005862017134, + "flos": 63700609766400.0, + "grad_norm": 0.8167255368596201, + "language_loss": 0.56500494, + "learning_rate": 3.0387555322958865e-06, + "loss": 0.5848636, + "num_input_tokens_seen": 123960075, + "step": 5761, + "time_per_iteration": 3.300360918045044 + }, + { + "auxiliary_loss_clip": 0.01067773, + "auxiliary_loss_mlp": 0.00748043, + "balance_loss_clip": 1.0274477, + "balance_loss_mlp": 1.00058413, + "epoch": 0.3464301818728393, + "flos": 13145532192000.0, + "grad_norm": 1.9940087166344864, + "language_loss": 0.95296288, + "learning_rate": 3.038422700166474e-06, + "loss": 0.97112107, + "num_input_tokens_seen": 123975805, + "step": 5762, + "time_per_iteration": 2.5333850383758545 + }, + { + "auxiliary_loss_clip": 0.01048673, + "auxiliary_loss_mlp": 0.01039005, + "balance_loss_clip": 1.02799296, + "balance_loss_mlp": 1.02448511, + "epoch": 0.34649030512550727, + "flos": 29315173299840.0, + "grad_norm": 1.6519193589793661, + "language_loss": 0.69555491, + "learning_rate": 3.0380898286599692e-06, + "loss": 0.71643174, + "num_input_tokens_seen": 123997530, + "step": 5763, + "time_per_iteration": 2.7235498428344727 + }, + { + "auxiliary_loss_clip": 0.01071968, + "auxiliary_loss_mlp": 0.01044996, + "balance_loss_clip": 1.03101826, + "balance_loss_mlp": 1.02854466, + "epoch": 0.34655042837817523, + "flos": 23730884553600.0, + "grad_norm": 1.8992554898798077, + "language_loss": 0.83829468, + "learning_rate": 3.0377569177889945e-06, + "loss": 0.85946435, + "num_input_tokens_seen": 124016375, + "step": 5764, + "time_per_iteration": 2.623737335205078 + }, + { + "auxiliary_loss_clip": 0.01052123, + "auxiliary_loss_mlp": 0.01038954, + "balance_loss_clip": 1.02827311, + "balance_loss_mlp": 1.02450013, + "epoch": 0.34661055163084326, + "flos": 22054215553920.0, + "grad_norm": 2.7374421839593523, + "language_loss": 0.68146884, + "learning_rate": 3.0374239675661722e-06, + "loss": 0.70237958, + "num_input_tokens_seen": 124033975, + "step": 5765, + "time_per_iteration": 2.864553689956665 + }, + { + "auxiliary_loss_clip": 0.01058639, + "auxiliary_loss_mlp": 0.01046487, + "balance_loss_clip": 1.03246665, + "balance_loss_mlp": 1.03091836, + "epoch": 0.3466706748835112, + "flos": 21799213925760.0, + "grad_norm": 1.7995922393940937, + "language_loss": 0.77030516, + "learning_rate": 3.03709097800413e-06, + "loss": 0.79135644, + "num_input_tokens_seen": 124051930, + "step": 5766, + "time_per_iteration": 2.6212687492370605 + }, + { + "auxiliary_loss_clip": 0.01032914, + "auxiliary_loss_mlp": 0.01037077, + "balance_loss_clip": 1.02866137, + "balance_loss_mlp": 1.02343905, + "epoch": 0.3467307981361792, + "flos": 19461680547840.0, + "grad_norm": 1.626943951191586, + "language_loss": 0.73441917, + "learning_rate": 3.0367579491154943e-06, + "loss": 0.75511903, + "num_input_tokens_seen": 124071220, + "step": 5767, + "time_per_iteration": 2.869791030883789 + }, + { + "auxiliary_loss_clip": 0.01064986, + "auxiliary_loss_mlp": 0.01045585, + "balance_loss_clip": 1.03423285, + "balance_loss_mlp": 1.03032589, + "epoch": 0.34679092138884715, + "flos": 24827452905600.0, + "grad_norm": 1.6801545898769312, + "language_loss": 0.77963245, + "learning_rate": 3.036424880912893e-06, + "loss": 0.8007381, + "num_input_tokens_seen": 124090140, + "step": 5768, + "time_per_iteration": 2.716320514678955 + }, + { + "auxiliary_loss_clip": 0.01004306, + "auxiliary_loss_mlp": 0.01003009, + "balance_loss_clip": 1.00386989, + "balance_loss_mlp": 1.00130403, + "epoch": 0.3468510446415151, + "flos": 63236070149760.0, + "grad_norm": 0.7652300677689902, + "language_loss": 0.57493794, + "learning_rate": 3.036091773408956e-06, + "loss": 0.59501112, + "num_input_tokens_seen": 124152025, + "step": 5769, + "time_per_iteration": 3.2131686210632324 + }, + { + "auxiliary_loss_clip": 0.01060848, + "auxiliary_loss_mlp": 0.01039232, + "balance_loss_clip": 1.03200388, + "balance_loss_mlp": 1.02256656, + "epoch": 0.3469111678941831, + "flos": 12120713256960.0, + "grad_norm": 2.1523973185756122, + "language_loss": 0.85792208, + "learning_rate": 3.0357586266163154e-06, + "loss": 0.87892294, + "num_input_tokens_seen": 124165795, + "step": 5770, + "time_per_iteration": 2.630507230758667 + }, + { + "auxiliary_loss_clip": 0.00997417, + "auxiliary_loss_mlp": 0.01058389, + "balance_loss_clip": 1.00624132, + "balance_loss_mlp": 1.05626678, + "epoch": 0.34697129114685105, + "flos": 65934110378880.0, + "grad_norm": 0.777073935815057, + "language_loss": 0.59787637, + "learning_rate": 3.0354254405476036e-06, + "loss": 0.61843443, + "num_input_tokens_seen": 124222925, + "step": 5771, + "time_per_iteration": 2.9852969646453857 + }, + { + "auxiliary_loss_clip": 0.01076756, + "auxiliary_loss_mlp": 0.01046035, + "balance_loss_clip": 1.03249502, + "balance_loss_mlp": 1.03100228, + "epoch": 0.347031414399519, + "flos": 34454205054720.0, + "grad_norm": 1.776458734207006, + "language_loss": 0.7161389, + "learning_rate": 3.0350922152154557e-06, + "loss": 0.7373668, + "num_input_tokens_seen": 124240915, + "step": 5772, + "time_per_iteration": 2.7849860191345215 + }, + { + "auxiliary_loss_clip": 0.01061799, + "auxiliary_loss_mlp": 0.0074813, + "balance_loss_clip": 1.03501689, + "balance_loss_mlp": 1.00049555, + "epoch": 0.347091537652187, + "flos": 26944135511040.0, + "grad_norm": 1.426573494978839, + "language_loss": 0.76424021, + "learning_rate": 3.034758950632507e-06, + "loss": 0.78233951, + "num_input_tokens_seen": 124262770, + "step": 5773, + "time_per_iteration": 2.822598695755005 + }, + { + "auxiliary_loss_clip": 0.01077086, + "auxiliary_loss_mlp": 0.0104022, + "balance_loss_clip": 1.03172576, + "balance_loss_mlp": 1.02490199, + "epoch": 0.34715166090485494, + "flos": 21142228216320.0, + "grad_norm": 2.292550918918001, + "language_loss": 0.7087149, + "learning_rate": 3.034425646811396e-06, + "loss": 0.72988796, + "num_input_tokens_seen": 124280950, + "step": 5774, + "time_per_iteration": 2.696484327316284 + }, + { + "auxiliary_loss_clip": 0.01063272, + "auxiliary_loss_mlp": 0.00748018, + "balance_loss_clip": 1.03182864, + "balance_loss_mlp": 1.00043392, + "epoch": 0.3472117841575229, + "flos": 23478001827840.0, + "grad_norm": 1.5696184358460183, + "language_loss": 0.76385868, + "learning_rate": 3.0340923037647602e-06, + "loss": 0.78197157, + "num_input_tokens_seen": 124299540, + "step": 5775, + "time_per_iteration": 2.78125262260437 + }, + { + "auxiliary_loss_clip": 0.01061946, + "auxiliary_loss_mlp": 0.01046854, + "balance_loss_clip": 1.02994275, + "balance_loss_mlp": 1.03032541, + "epoch": 0.34727190741019087, + "flos": 17492806408320.0, + "grad_norm": 1.911081434820443, + "language_loss": 0.7741518, + "learning_rate": 3.0337589215052404e-06, + "loss": 0.79523975, + "num_input_tokens_seen": 124316285, + "step": 5776, + "time_per_iteration": 2.7272746562957764 + }, + { + "auxiliary_loss_clip": 0.00999561, + "auxiliary_loss_mlp": 0.01006881, + "balance_loss_clip": 1.00835729, + "balance_loss_mlp": 1.00529575, + "epoch": 0.34733203066285884, + "flos": 65265491640960.0, + "grad_norm": 0.8401308527193052, + "language_loss": 0.63342011, + "learning_rate": 3.033425500045478e-06, + "loss": 0.65348458, + "num_input_tokens_seen": 124376650, + "step": 5777, + "time_per_iteration": 3.3261494636535645 + }, + { + "auxiliary_loss_clip": 0.01053367, + "auxiliary_loss_mlp": 0.01039046, + "balance_loss_clip": 1.0304544, + "balance_loss_mlp": 1.02378726, + "epoch": 0.3473921539155268, + "flos": 28658726294400.0, + "grad_norm": 1.8416455172324764, + "language_loss": 0.64600217, + "learning_rate": 3.033092039398119e-06, + "loss": 0.66692626, + "num_input_tokens_seen": 124396475, + "step": 5778, + "time_per_iteration": 2.7535202503204346 + }, + { + "auxiliary_loss_clip": 0.01066774, + "auxiliary_loss_mlp": 0.01048797, + "balance_loss_clip": 1.03009272, + "balance_loss_mlp": 1.03397298, + "epoch": 0.3474522771681948, + "flos": 40836895355520.0, + "grad_norm": 1.7968149944309375, + "language_loss": 0.71219325, + "learning_rate": 3.0327585395758046e-06, + "loss": 0.73334897, + "num_input_tokens_seen": 124416480, + "step": 5779, + "time_per_iteration": 2.8406896591186523 + }, + { + "auxiliary_loss_clip": 0.01090579, + "auxiliary_loss_mlp": 0.01043689, + "balance_loss_clip": 1.03363538, + "balance_loss_mlp": 1.02893066, + "epoch": 0.3475124004208628, + "flos": 24608577381120.0, + "grad_norm": 1.77903282719424, + "language_loss": 0.62461823, + "learning_rate": 3.0324250005911837e-06, + "loss": 0.64596087, + "num_input_tokens_seen": 124435950, + "step": 5780, + "time_per_iteration": 2.690727949142456 + }, + { + "auxiliary_loss_clip": 0.01042482, + "auxiliary_loss_mlp": 0.01044101, + "balance_loss_clip": 1.02794516, + "balance_loss_mlp": 1.02956963, + "epoch": 0.34757252367353075, + "flos": 22711309004160.0, + "grad_norm": 1.577564079255901, + "language_loss": 0.72295868, + "learning_rate": 3.0320914224569033e-06, + "loss": 0.7438246, + "num_input_tokens_seen": 124455410, + "step": 5781, + "time_per_iteration": 2.800727605819702 + }, + { + "auxiliary_loss_clip": 0.01021397, + "auxiliary_loss_mlp": 0.01051578, + "balance_loss_clip": 1.02583194, + "balance_loss_mlp": 1.0358839, + "epoch": 0.3476326469261987, + "flos": 19828184970240.0, + "grad_norm": 1.868396017562576, + "language_loss": 0.76578116, + "learning_rate": 3.031757805185612e-06, + "loss": 0.78651083, + "num_input_tokens_seen": 124474870, + "step": 5782, + "time_per_iteration": 2.812627077102661 + }, + { + "auxiliary_loss_clip": 0.01066551, + "auxiliary_loss_mlp": 0.01038753, + "balance_loss_clip": 1.03274131, + "balance_loss_mlp": 1.02392364, + "epoch": 0.3476927701788667, + "flos": 19938107566080.0, + "grad_norm": 3.0971338821074674, + "language_loss": 0.6238001, + "learning_rate": 3.0314241487899622e-06, + "loss": 0.64485317, + "num_input_tokens_seen": 124494105, + "step": 5783, + "time_per_iteration": 2.701307535171509 + }, + { + "auxiliary_loss_clip": 0.0104566, + "auxiliary_loss_mlp": 0.01031777, + "balance_loss_clip": 1.03356743, + "balance_loss_mlp": 1.01849735, + "epoch": 0.34775289343153465, + "flos": 20735108490240.0, + "grad_norm": 1.612617888932647, + "language_loss": 0.88353193, + "learning_rate": 3.031090453282605e-06, + "loss": 0.90430629, + "num_input_tokens_seen": 124512030, + "step": 5784, + "time_per_iteration": 2.7522478103637695 + }, + { + "auxiliary_loss_clip": 0.01046678, + "auxiliary_loss_mlp": 0.01036806, + "balance_loss_clip": 1.0350101, + "balance_loss_mlp": 1.02244163, + "epoch": 0.3478130166842026, + "flos": 19354846521600.0, + "grad_norm": 1.807249595513624, + "language_loss": 0.82296073, + "learning_rate": 3.0307567186761946e-06, + "loss": 0.84379554, + "num_input_tokens_seen": 124530980, + "step": 5785, + "time_per_iteration": 2.7678980827331543 + }, + { + "auxiliary_loss_clip": 0.01053251, + "auxiliary_loss_mlp": 0.01043833, + "balance_loss_clip": 1.03002644, + "balance_loss_mlp": 1.02951622, + "epoch": 0.3478731399368706, + "flos": 22051198811520.0, + "grad_norm": 2.0863356390565038, + "language_loss": 0.80666482, + "learning_rate": 3.0304229449833862e-06, + "loss": 0.82763571, + "num_input_tokens_seen": 124549330, + "step": 5786, + "time_per_iteration": 2.6561052799224854 + }, + { + "auxiliary_loss_clip": 0.01088725, + "auxiliary_loss_mlp": 0.00748106, + "balance_loss_clip": 1.03581452, + "balance_loss_mlp": 1.00039887, + "epoch": 0.34793326318953854, + "flos": 18041449720320.0, + "grad_norm": 1.6783863578963256, + "language_loss": 0.75082731, + "learning_rate": 3.030089132216836e-06, + "loss": 0.76919568, + "num_input_tokens_seen": 124567200, + "step": 5787, + "time_per_iteration": 2.5726706981658936 + }, + { + "auxiliary_loss_clip": 0.01051663, + "auxiliary_loss_mlp": 0.00748251, + "balance_loss_clip": 1.02824569, + "balance_loss_mlp": 1.00044012, + "epoch": 0.3479933864422065, + "flos": 29314670509440.0, + "grad_norm": 1.5164063475152914, + "language_loss": 0.8101899, + "learning_rate": 3.029755280389203e-06, + "loss": 0.82818902, + "num_input_tokens_seen": 124587025, + "step": 5788, + "time_per_iteration": 2.719019651412964 + }, + { + "auxiliary_loss_clip": 0.01092816, + "auxiliary_loss_mlp": 0.01036413, + "balance_loss_clip": 1.03645682, + "balance_loss_mlp": 1.02095103, + "epoch": 0.3480535096948745, + "flos": 20120713332480.0, + "grad_norm": 2.0825708860794996, + "language_loss": 0.85889226, + "learning_rate": 3.029421389513147e-06, + "loss": 0.88018453, + "num_input_tokens_seen": 124605860, + "step": 5789, + "time_per_iteration": 2.6346802711486816 + }, + { + "auxiliary_loss_clip": 0.01082565, + "auxiliary_loss_mlp": 0.01059455, + "balance_loss_clip": 1.03687203, + "balance_loss_mlp": 1.0443933, + "epoch": 0.34811363294754244, + "flos": 18548974938240.0, + "grad_norm": 1.8161085336413856, + "language_loss": 0.84893513, + "learning_rate": 3.029087459601328e-06, + "loss": 0.87035537, + "num_input_tokens_seen": 124624270, + "step": 5790, + "time_per_iteration": 2.664874315261841 + }, + { + "auxiliary_loss_clip": 0.0108265, + "auxiliary_loss_mlp": 0.01044708, + "balance_loss_clip": 1.03605175, + "balance_loss_mlp": 1.02931798, + "epoch": 0.3481737562002104, + "flos": 26870303105280.0, + "grad_norm": 2.2340581417199292, + "language_loss": 0.81324822, + "learning_rate": 3.0287534906664097e-06, + "loss": 0.83452183, + "num_input_tokens_seen": 124644005, + "step": 5791, + "time_per_iteration": 2.767833709716797 + }, + { + "auxiliary_loss_clip": 0.01078543, + "auxiliary_loss_mlp": 0.01042404, + "balance_loss_clip": 1.03346419, + "balance_loss_mlp": 1.02721071, + "epoch": 0.3482338794528784, + "flos": 28908664104960.0, + "grad_norm": 1.6883767161146146, + "language_loss": 0.77577269, + "learning_rate": 3.028419482721056e-06, + "loss": 0.79698217, + "num_input_tokens_seen": 124663020, + "step": 5792, + "time_per_iteration": 2.662040948867798 + }, + { + "auxiliary_loss_clip": 0.0105745, + "auxiliary_loss_mlp": 0.01032011, + "balance_loss_clip": 1.0273124, + "balance_loss_mlp": 1.01786685, + "epoch": 0.3482940027055464, + "flos": 22200767043840.0, + "grad_norm": 1.6987446799259214, + "language_loss": 0.81558621, + "learning_rate": 3.0280854357779325e-06, + "loss": 0.83648086, + "num_input_tokens_seen": 124682975, + "step": 5793, + "time_per_iteration": 4.273141384124756 + }, + { + "auxiliary_loss_clip": 0.01079927, + "auxiliary_loss_mlp": 0.0104834, + "balance_loss_clip": 1.03522742, + "balance_loss_mlp": 1.0329442, + "epoch": 0.34835412595821436, + "flos": 20302708567680.0, + "grad_norm": 1.7371900715915651, + "language_loss": 0.76067483, + "learning_rate": 3.027751349849706e-06, + "loss": 0.78195751, + "num_input_tokens_seen": 124701340, + "step": 5794, + "time_per_iteration": 2.6970582008361816 + }, + { + "auxiliary_loss_clip": 0.01073888, + "auxiliary_loss_mlp": 0.01041538, + "balance_loss_clip": 1.03153229, + "balance_loss_mlp": 1.02715588, + "epoch": 0.3484142492108823, + "flos": 20449691020800.0, + "grad_norm": 3.958928531628266, + "language_loss": 0.57136285, + "learning_rate": 3.0274172249490456e-06, + "loss": 0.59251714, + "num_input_tokens_seen": 124719165, + "step": 5795, + "time_per_iteration": 2.727074384689331 + }, + { + "auxiliary_loss_clip": 0.01064118, + "auxiliary_loss_mlp": 0.01041025, + "balance_loss_clip": 1.03256023, + "balance_loss_mlp": 1.02699459, + "epoch": 0.3484743724635503, + "flos": 24352929308160.0, + "grad_norm": 1.7488052113311432, + "language_loss": 0.82650512, + "learning_rate": 3.0270830610886213e-06, + "loss": 0.84755653, + "num_input_tokens_seen": 124738670, + "step": 5796, + "time_per_iteration": 4.371414422988892 + }, + { + "auxiliary_loss_clip": 0.01074944, + "auxiliary_loss_mlp": 0.01030846, + "balance_loss_clip": 1.03421378, + "balance_loss_mlp": 1.01741147, + "epoch": 0.34853449571621825, + "flos": 24353001135360.0, + "grad_norm": 1.5081618018038234, + "language_loss": 0.83640987, + "learning_rate": 3.0267488582811033e-06, + "loss": 0.85746771, + "num_input_tokens_seen": 124758760, + "step": 5797, + "time_per_iteration": 2.727858066558838 + }, + { + "auxiliary_loss_clip": 0.01084697, + "auxiliary_loss_mlp": 0.01036923, + "balance_loss_clip": 1.03333127, + "balance_loss_mlp": 1.02211106, + "epoch": 0.3485946189688862, + "flos": 27267690245760.0, + "grad_norm": 1.5308116354542736, + "language_loss": 0.73291141, + "learning_rate": 3.026414616539167e-06, + "loss": 0.75412762, + "num_input_tokens_seen": 124777765, + "step": 5798, + "time_per_iteration": 2.611560106277466 + }, + { + "auxiliary_loss_clip": 0.01084572, + "auxiliary_loss_mlp": 0.01041266, + "balance_loss_clip": 1.03101039, + "balance_loss_mlp": 1.0260545, + "epoch": 0.3486547422215542, + "flos": 20156695781760.0, + "grad_norm": 1.9962272796697407, + "language_loss": 0.75683033, + "learning_rate": 3.026080335875485e-06, + "loss": 0.77808869, + "num_input_tokens_seen": 124796775, + "step": 5799, + "time_per_iteration": 2.6684231758117676 + }, + { + "auxiliary_loss_clip": 0.01028272, + "auxiliary_loss_mlp": 0.01041465, + "balance_loss_clip": 1.03594697, + "balance_loss_mlp": 1.0272795, + "epoch": 0.34871486547422215, + "flos": 20230348619520.0, + "grad_norm": 1.9631831327850557, + "language_loss": 0.75980914, + "learning_rate": 3.025746016302734e-06, + "loss": 0.78050649, + "num_input_tokens_seen": 124815825, + "step": 5800, + "time_per_iteration": 3.094053030014038 + }, + { + "auxiliary_loss_clip": 0.01062547, + "auxiliary_loss_mlp": 0.0074802, + "balance_loss_clip": 1.03018332, + "balance_loss_mlp": 1.00037313, + "epoch": 0.3487749887268901, + "flos": 44053234882560.0, + "grad_norm": 1.785845700118741, + "language_loss": 0.67520988, + "learning_rate": 3.025411657833591e-06, + "loss": 0.69331557, + "num_input_tokens_seen": 124838420, + "step": 5801, + "time_per_iteration": 3.033412218093872 + }, + { + "auxiliary_loss_clip": 0.01048086, + "auxiliary_loss_mlp": 0.01040428, + "balance_loss_clip": 1.0261519, + "balance_loss_mlp": 1.02510977, + "epoch": 0.3488351119795581, + "flos": 23295144666240.0, + "grad_norm": 1.6976498806922105, + "language_loss": 0.76160371, + "learning_rate": 3.025077260480735e-06, + "loss": 0.78248888, + "num_input_tokens_seen": 124857320, + "step": 5802, + "time_per_iteration": 2.703303337097168 + }, + { + "auxiliary_loss_clip": 0.01012845, + "auxiliary_loss_mlp": 0.010371, + "balance_loss_clip": 1.02717495, + "balance_loss_mlp": 1.02293789, + "epoch": 0.34889523523222604, + "flos": 19934839428480.0, + "grad_norm": 1.6080168709176936, + "language_loss": 0.78710967, + "learning_rate": 3.0247428242568474e-06, + "loss": 0.80760908, + "num_input_tokens_seen": 124875685, + "step": 5803, + "time_per_iteration": 2.8143367767333984 + }, + { + "auxiliary_loss_clip": 0.01060386, + "auxiliary_loss_mlp": 0.00748154, + "balance_loss_clip": 1.02775407, + "balance_loss_mlp": 1.0004096, + "epoch": 0.348955358484894, + "flos": 30446179816320.0, + "grad_norm": 1.925939056587641, + "language_loss": 0.67729771, + "learning_rate": 3.0244083491746085e-06, + "loss": 0.69538307, + "num_input_tokens_seen": 124895960, + "step": 5804, + "time_per_iteration": 2.6610682010650635 + }, + { + "auxiliary_loss_clip": 0.01061674, + "auxiliary_loss_mlp": 0.0104383, + "balance_loss_clip": 1.03244972, + "balance_loss_mlp": 1.02933455, + "epoch": 0.349015481737562, + "flos": 17999972490240.0, + "grad_norm": 1.7254722159503442, + "language_loss": 0.76231194, + "learning_rate": 3.024073835246702e-06, + "loss": 0.78336704, + "num_input_tokens_seen": 124914140, + "step": 5805, + "time_per_iteration": 4.583614349365234 + }, + { + "auxiliary_loss_clip": 0.01038685, + "auxiliary_loss_mlp": 0.0103684, + "balance_loss_clip": 1.02802205, + "balance_loss_mlp": 1.02184904, + "epoch": 0.34907560499023, + "flos": 27198490694400.0, + "grad_norm": 3.9731675483747764, + "language_loss": 0.67214751, + "learning_rate": 3.023739282485814e-06, + "loss": 0.6929028, + "num_input_tokens_seen": 124934180, + "step": 5806, + "time_per_iteration": 2.73447847366333 + }, + { + "auxiliary_loss_clip": 0.01076719, + "auxiliary_loss_mlp": 0.01043466, + "balance_loss_clip": 1.03331995, + "balance_loss_mlp": 1.02870846, + "epoch": 0.34913572824289796, + "flos": 30226873328640.0, + "grad_norm": 34.61726936825756, + "language_loss": 0.71991134, + "learning_rate": 3.023404690904629e-06, + "loss": 0.74111319, + "num_input_tokens_seen": 124956060, + "step": 5807, + "time_per_iteration": 2.6642844676971436 + }, + { + "auxiliary_loss_clip": 0.0108392, + "auxiliary_loss_mlp": 0.01040358, + "balance_loss_clip": 1.0296793, + "balance_loss_mlp": 1.02524197, + "epoch": 0.3491958514955659, + "flos": 29971907614080.0, + "grad_norm": 1.731959943707347, + "language_loss": 0.73998427, + "learning_rate": 3.0230700605158364e-06, + "loss": 0.76122707, + "num_input_tokens_seen": 124976070, + "step": 5808, + "time_per_iteration": 2.6208486557006836 + }, + { + "auxiliary_loss_clip": 0.01083776, + "auxiliary_loss_mlp": 0.01042777, + "balance_loss_clip": 1.03305912, + "balance_loss_mlp": 1.02852595, + "epoch": 0.3492559747482339, + "flos": 22783273902720.0, + "grad_norm": 1.3467811099494493, + "language_loss": 0.84163421, + "learning_rate": 3.0227353913321238e-06, + "loss": 0.86289978, + "num_input_tokens_seen": 124996995, + "step": 5809, + "time_per_iteration": 2.58686900138855 + }, + { + "auxiliary_loss_clip": 0.01056315, + "auxiliary_loss_mlp": 0.01039934, + "balance_loss_clip": 1.0282557, + "balance_loss_mlp": 1.02697563, + "epoch": 0.34931609800090185, + "flos": 26068022881920.0, + "grad_norm": 2.0543872411768214, + "language_loss": 0.8044802, + "learning_rate": 3.0224006833661835e-06, + "loss": 0.82544267, + "num_input_tokens_seen": 125015600, + "step": 5810, + "time_per_iteration": 2.640768527984619 + }, + { + "auxiliary_loss_clip": 0.01081399, + "auxiliary_loss_mlp": 0.01039016, + "balance_loss_clip": 1.02929008, + "balance_loss_mlp": 1.02547979, + "epoch": 0.3493762212535698, + "flos": 29242023252480.0, + "grad_norm": 1.661807641804037, + "language_loss": 0.75576895, + "learning_rate": 3.0220659366307057e-06, + "loss": 0.77697307, + "num_input_tokens_seen": 125035290, + "step": 5811, + "time_per_iteration": 2.6550450325012207 + }, + { + "auxiliary_loss_clip": 0.0106626, + "auxiliary_loss_mlp": 0.010377, + "balance_loss_clip": 1.03083265, + "balance_loss_mlp": 1.02338839, + "epoch": 0.3494363445062378, + "flos": 27126058919040.0, + "grad_norm": 1.6270509646564926, + "language_loss": 0.80144942, + "learning_rate": 3.021731151138386e-06, + "loss": 0.82248902, + "num_input_tokens_seen": 125057130, + "step": 5812, + "time_per_iteration": 2.68438458442688 + }, + { + "auxiliary_loss_clip": 0.01026367, + "auxiliary_loss_mlp": 0.01036173, + "balance_loss_clip": 1.02637362, + "balance_loss_mlp": 1.02186823, + "epoch": 0.34949646775890575, + "flos": 12276207233280.0, + "grad_norm": 1.7045363063641528, + "language_loss": 0.69233501, + "learning_rate": 3.021396326901918e-06, + "loss": 0.71296042, + "num_input_tokens_seen": 125073720, + "step": 5813, + "time_per_iteration": 2.7584598064422607 + }, + { + "auxiliary_loss_clip": 0.0104869, + "auxiliary_loss_mlp": 0.0074801, + "balance_loss_clip": 1.02523386, + "balance_loss_mlp": 1.00037551, + "epoch": 0.3495565910115737, + "flos": 17165516659200.0, + "grad_norm": 1.8026333706134154, + "language_loss": 0.76593769, + "learning_rate": 3.0210614639339998e-06, + "loss": 0.78390467, + "num_input_tokens_seen": 125090635, + "step": 5814, + "time_per_iteration": 2.713740110397339 + }, + { + "auxiliary_loss_clip": 0.01061, + "auxiliary_loss_mlp": 0.00748019, + "balance_loss_clip": 1.02993321, + "balance_loss_mlp": 1.00035226, + "epoch": 0.3496167142642417, + "flos": 26465661417600.0, + "grad_norm": 1.4842372976597369, + "language_loss": 0.84849042, + "learning_rate": 3.020726562247328e-06, + "loss": 0.86658061, + "num_input_tokens_seen": 125110070, + "step": 5815, + "time_per_iteration": 2.814723014831543 + }, + { + "auxiliary_loss_clip": 0.01069312, + "auxiliary_loss_mlp": 0.0103309, + "balance_loss_clip": 1.02881026, + "balance_loss_mlp": 1.02009034, + "epoch": 0.34967683751690964, + "flos": 17414843938560.0, + "grad_norm": 2.017383285477023, + "language_loss": 0.77366292, + "learning_rate": 3.0203916218546024e-06, + "loss": 0.79468697, + "num_input_tokens_seen": 125125730, + "step": 5816, + "time_per_iteration": 2.745656967163086 + }, + { + "auxiliary_loss_clip": 0.01076426, + "auxiliary_loss_mlp": 0.01042583, + "balance_loss_clip": 1.03334749, + "balance_loss_mlp": 1.02856445, + "epoch": 0.3497369607695776, + "flos": 22600021691520.0, + "grad_norm": 2.014864675245722, + "language_loss": 0.58650851, + "learning_rate": 3.0200566427685246e-06, + "loss": 0.60769868, + "num_input_tokens_seen": 125146195, + "step": 5817, + "time_per_iteration": 2.7840144634246826 + }, + { + "auxiliary_loss_clip": 0.01012752, + "auxiliary_loss_mlp": 0.01004126, + "balance_loss_clip": 1.00225353, + "balance_loss_mlp": 1.00255227, + "epoch": 0.34979708402224563, + "flos": 68529374818560.0, + "grad_norm": 0.8668543244834997, + "language_loss": 0.59877098, + "learning_rate": 3.0197216250017975e-06, + "loss": 0.61893976, + "num_input_tokens_seen": 125207790, + "step": 5818, + "time_per_iteration": 3.258140802383423 + }, + { + "auxiliary_loss_clip": 0.01042617, + "auxiliary_loss_mlp": 0.01036925, + "balance_loss_clip": 1.02819085, + "balance_loss_mlp": 1.02266216, + "epoch": 0.3498572072749136, + "flos": 18989634988800.0, + "grad_norm": 1.7944750114506085, + "language_loss": 0.83461171, + "learning_rate": 3.019386568567123e-06, + "loss": 0.85540718, + "num_input_tokens_seen": 125226220, + "step": 5819, + "time_per_iteration": 2.764033079147339 + }, + { + "auxiliary_loss_clip": 0.01055772, + "auxiliary_loss_mlp": 0.01031764, + "balance_loss_clip": 1.02719307, + "balance_loss_mlp": 1.01791155, + "epoch": 0.34991733052758156, + "flos": 27818883423360.0, + "grad_norm": 1.730027189258619, + "language_loss": 0.71385819, + "learning_rate": 3.0190514734772083e-06, + "loss": 0.73473358, + "num_input_tokens_seen": 125247485, + "step": 5820, + "time_per_iteration": 2.80053973197937 + }, + { + "auxiliary_loss_clip": 0.01069297, + "auxiliary_loss_mlp": 0.01038569, + "balance_loss_clip": 1.02828705, + "balance_loss_mlp": 1.02557528, + "epoch": 0.3499774537802495, + "flos": 33584197737600.0, + "grad_norm": 1.6969832454683271, + "language_loss": 0.7054168, + "learning_rate": 3.018716339744759e-06, + "loss": 0.7264955, + "num_input_tokens_seen": 125268625, + "step": 5821, + "time_per_iteration": 2.756004810333252 + }, + { + "auxiliary_loss_clip": 0.0107874, + "auxiliary_loss_mlp": 0.01045652, + "balance_loss_clip": 1.03301132, + "balance_loss_mlp": 1.03050113, + "epoch": 0.3500375770329175, + "flos": 23476744851840.0, + "grad_norm": 2.125088750969967, + "language_loss": 0.73674715, + "learning_rate": 3.0183811673824842e-06, + "loss": 0.75799108, + "num_input_tokens_seen": 125287530, + "step": 5822, + "time_per_iteration": 2.7063121795654297 + }, + { + "auxiliary_loss_clip": 0.01064321, + "auxiliary_loss_mlp": 0.01035843, + "balance_loss_clip": 1.03028119, + "balance_loss_mlp": 1.0210309, + "epoch": 0.35009770028558546, + "flos": 19026048401280.0, + "grad_norm": 1.5087010361794624, + "language_loss": 0.77793753, + "learning_rate": 3.018045956403094e-06, + "loss": 0.79893911, + "num_input_tokens_seen": 125307020, + "step": 5823, + "time_per_iteration": 2.704986333847046 + }, + { + "auxiliary_loss_clip": 0.01003427, + "auxiliary_loss_mlp": 0.01002913, + "balance_loss_clip": 1.00273705, + "balance_loss_mlp": 1.00135088, + "epoch": 0.3501578235382534, + "flos": 68351868783360.0, + "grad_norm": 0.7115822436293066, + "language_loss": 0.592188, + "learning_rate": 3.017710706819298e-06, + "loss": 0.6122514, + "num_input_tokens_seen": 125370445, + "step": 5824, + "time_per_iteration": 3.240204095840454 + }, + { + "auxiliary_loss_clip": 0.01063276, + "auxiliary_loss_mlp": 0.01033215, + "balance_loss_clip": 1.03022909, + "balance_loss_mlp": 1.01886821, + "epoch": 0.3502179467909214, + "flos": 21250893836160.0, + "grad_norm": 2.1005487181844624, + "language_loss": 0.84722614, + "learning_rate": 3.017375418643811e-06, + "loss": 0.868191, + "num_input_tokens_seen": 125388900, + "step": 5825, + "time_per_iteration": 2.594291925430298 + }, + { + "auxiliary_loss_clip": 0.0107401, + "auxiliary_loss_mlp": 0.00747938, + "balance_loss_clip": 1.0312407, + "balance_loss_mlp": 1.00052154, + "epoch": 0.35027807004358935, + "flos": 11942955826560.0, + "grad_norm": 2.843827140961011, + "language_loss": 0.82935715, + "learning_rate": 3.0170400918893464e-06, + "loss": 0.84757662, + "num_input_tokens_seen": 125402675, + "step": 5826, + "time_per_iteration": 2.5775837898254395 + }, + { + "auxiliary_loss_clip": 0.01066362, + "auxiliary_loss_mlp": 0.0104574, + "balance_loss_clip": 1.03319049, + "balance_loss_mlp": 1.03136373, + "epoch": 0.3503381932962573, + "flos": 21470918595840.0, + "grad_norm": 1.4741621799843578, + "language_loss": 0.80933595, + "learning_rate": 3.0167047265686186e-06, + "loss": 0.83045703, + "num_input_tokens_seen": 125421360, + "step": 5827, + "time_per_iteration": 2.6882221698760986 + }, + { + "auxiliary_loss_clip": 0.01028389, + "auxiliary_loss_mlp": 0.01035309, + "balance_loss_clip": 1.02641416, + "balance_loss_mlp": 1.02133775, + "epoch": 0.3503983165489253, + "flos": 21251109317760.0, + "grad_norm": 4.018727620320096, + "language_loss": 0.70478511, + "learning_rate": 3.0163693226943467e-06, + "loss": 0.72542214, + "num_input_tokens_seen": 125440000, + "step": 5828, + "time_per_iteration": 2.7709736824035645 + }, + { + "auxiliary_loss_clip": 0.01078546, + "auxiliary_loss_mlp": 0.01046298, + "balance_loss_clip": 1.03351533, + "balance_loss_mlp": 1.03009772, + "epoch": 0.35045843980159325, + "flos": 27815723026560.0, + "grad_norm": 1.8772802345761441, + "language_loss": 0.79427147, + "learning_rate": 3.016033880279248e-06, + "loss": 0.81551993, + "num_input_tokens_seen": 125460390, + "step": 5829, + "time_per_iteration": 2.615338087081909 + }, + { + "auxiliary_loss_clip": 0.01059741, + "auxiliary_loss_mlp": 0.01050292, + "balance_loss_clip": 1.03531575, + "balance_loss_mlp": 1.03475893, + "epoch": 0.3505185630542612, + "flos": 25921148169600.0, + "grad_norm": 1.770817717757365, + "language_loss": 0.72337556, + "learning_rate": 3.0156983993360417e-06, + "loss": 0.7444759, + "num_input_tokens_seen": 125478410, + "step": 5830, + "time_per_iteration": 2.7409582138061523 + }, + { + "auxiliary_loss_clip": 0.0103761, + "auxiliary_loss_mlp": 0.01038133, + "balance_loss_clip": 1.0258708, + "balance_loss_mlp": 1.02364874, + "epoch": 0.35057868630692923, + "flos": 20521763660160.0, + "grad_norm": 3.3729545540922614, + "language_loss": 0.88385218, + "learning_rate": 3.0153628798774513e-06, + "loss": 0.90460968, + "num_input_tokens_seen": 125495975, + "step": 5831, + "time_per_iteration": 2.6420631408691406 + }, + { + "auxiliary_loss_clip": 0.0103238, + "auxiliary_loss_mlp": 0.01046565, + "balance_loss_clip": 1.0290246, + "balance_loss_mlp": 1.03181887, + "epoch": 0.3506388095595972, + "flos": 20448649526400.0, + "grad_norm": 1.9514314826096542, + "language_loss": 0.7822051, + "learning_rate": 3.0150273219161985e-06, + "loss": 0.80299461, + "num_input_tokens_seen": 125515035, + "step": 5832, + "time_per_iteration": 2.7207562923431396 + }, + { + "auxiliary_loss_clip": 0.01047057, + "auxiliary_loss_mlp": 0.0104512, + "balance_loss_clip": 1.02877831, + "balance_loss_mlp": 1.02884805, + "epoch": 0.35069893281226516, + "flos": 23109665811840.0, + "grad_norm": 1.9415240622064596, + "language_loss": 0.70957202, + "learning_rate": 3.014691725465008e-06, + "loss": 0.73049378, + "num_input_tokens_seen": 125535555, + "step": 5833, + "time_per_iteration": 2.7316505908966064 + }, + { + "auxiliary_loss_clip": 0.01072027, + "auxiliary_loss_mlp": 0.01034065, + "balance_loss_clip": 1.03052032, + "balance_loss_mlp": 1.02066576, + "epoch": 0.35075905606493313, + "flos": 27271999877760.0, + "grad_norm": 1.3457518470139782, + "language_loss": 0.80722058, + "learning_rate": 3.014356090536606e-06, + "loss": 0.82828152, + "num_input_tokens_seen": 125558195, + "step": 5834, + "time_per_iteration": 2.7177443504333496 + }, + { + "auxiliary_loss_clip": 0.01050196, + "auxiliary_loss_mlp": 0.0104406, + "balance_loss_clip": 1.03835201, + "balance_loss_mlp": 1.02946877, + "epoch": 0.3508191793176011, + "flos": 19128608709120.0, + "grad_norm": 1.9729124411440675, + "language_loss": 0.83915913, + "learning_rate": 3.0140204171437183e-06, + "loss": 0.86010158, + "num_input_tokens_seen": 125575375, + "step": 5835, + "time_per_iteration": 2.702035903930664 + }, + { + "auxiliary_loss_clip": 0.01027393, + "auxiliary_loss_mlp": 0.01041324, + "balance_loss_clip": 1.02984285, + "balance_loss_mlp": 1.02711463, + "epoch": 0.35087930257026906, + "flos": 25557588662400.0, + "grad_norm": 1.4343410152220577, + "language_loss": 0.76733005, + "learning_rate": 3.0136847052990754e-06, + "loss": 0.78801715, + "num_input_tokens_seen": 125596745, + "step": 5836, + "time_per_iteration": 2.7646267414093018 + }, + { + "auxiliary_loss_clip": 0.0105059, + "auxiliary_loss_mlp": 0.01046082, + "balance_loss_clip": 1.03284121, + "balance_loss_mlp": 1.02989376, + "epoch": 0.350939425822937, + "flos": 18004246208640.0, + "grad_norm": 2.0965753055339977, + "language_loss": 0.7758984, + "learning_rate": 3.0133489550154074e-06, + "loss": 0.79686511, + "num_input_tokens_seen": 125613980, + "step": 5837, + "time_per_iteration": 2.682654619216919 + }, + { + "auxiliary_loss_clip": 0.01071644, + "auxiliary_loss_mlp": 0.01038634, + "balance_loss_clip": 1.03010714, + "balance_loss_mlp": 1.02478242, + "epoch": 0.350999549075605, + "flos": 22273198819200.0, + "grad_norm": 2.6114244598463023, + "language_loss": 0.68057472, + "learning_rate": 3.0130131663054442e-06, + "loss": 0.7016775, + "num_input_tokens_seen": 125632100, + "step": 5838, + "time_per_iteration": 2.5942111015319824 + }, + { + "auxiliary_loss_clip": 0.01085012, + "auxiliary_loss_mlp": 0.01036181, + "balance_loss_clip": 1.03198028, + "balance_loss_mlp": 1.02179813, + "epoch": 0.35105967232827295, + "flos": 14392279307520.0, + "grad_norm": 2.0558817385274555, + "language_loss": 0.82878006, + "learning_rate": 3.0126773391819215e-06, + "loss": 0.84999198, + "num_input_tokens_seen": 125649190, + "step": 5839, + "time_per_iteration": 2.562838554382324 + }, + { + "auxiliary_loss_clip": 0.01072179, + "auxiliary_loss_mlp": 0.01038405, + "balance_loss_clip": 1.03009748, + "balance_loss_mlp": 1.02379632, + "epoch": 0.3511197955809409, + "flos": 25082346792960.0, + "grad_norm": 2.1740522344686632, + "language_loss": 0.58728218, + "learning_rate": 3.012341473657572e-06, + "loss": 0.60838795, + "num_input_tokens_seen": 125668680, + "step": 5840, + "time_per_iteration": 2.743345260620117 + }, + { + "auxiliary_loss_clip": 0.01045132, + "auxiliary_loss_mlp": 0.01041503, + "balance_loss_clip": 1.02983713, + "balance_loss_mlp": 1.02645874, + "epoch": 0.3511799188336089, + "flos": 25884160139520.0, + "grad_norm": 2.1913311032743232, + "language_loss": 0.87060779, + "learning_rate": 3.0120055697451322e-06, + "loss": 0.89147413, + "num_input_tokens_seen": 125686935, + "step": 5841, + "time_per_iteration": 4.254394054412842 + }, + { + "auxiliary_loss_clip": 0.01064476, + "auxiliary_loss_mlp": 0.0104077, + "balance_loss_clip": 1.03262556, + "balance_loss_mlp": 1.02466512, + "epoch": 0.35124004208627685, + "flos": 20083725302400.0, + "grad_norm": 1.8563866383863594, + "language_loss": 0.75009811, + "learning_rate": 3.0116696274573406e-06, + "loss": 0.77115047, + "num_input_tokens_seen": 125707180, + "step": 5842, + "time_per_iteration": 2.6848466396331787 + }, + { + "auxiliary_loss_clip": 0.0107025, + "auxiliary_loss_mlp": 0.01043091, + "balance_loss_clip": 1.02966475, + "balance_loss_mlp": 1.0287385, + "epoch": 0.3513001653389448, + "flos": 17783431349760.0, + "grad_norm": 2.1404781978151615, + "language_loss": 0.68707776, + "learning_rate": 3.0113336468069346e-06, + "loss": 0.70821118, + "num_input_tokens_seen": 125722780, + "step": 5843, + "time_per_iteration": 2.622763156890869 + }, + { + "auxiliary_loss_clip": 0.01086297, + "auxiliary_loss_mlp": 0.01043447, + "balance_loss_clip": 1.03295898, + "balance_loss_mlp": 1.02862358, + "epoch": 0.3513602885916128, + "flos": 29387138198400.0, + "grad_norm": 2.4072926902536356, + "language_loss": 0.65519845, + "learning_rate": 3.010997627806655e-06, + "loss": 0.67649591, + "num_input_tokens_seen": 125742110, + "step": 5844, + "time_per_iteration": 4.129117965698242 + }, + { + "auxiliary_loss_clip": 0.01075301, + "auxiliary_loss_mlp": 0.01044977, + "balance_loss_clip": 1.03221822, + "balance_loss_mlp": 1.0297364, + "epoch": 0.3514204118442808, + "flos": 16179876483840.0, + "grad_norm": 2.411403673872704, + "language_loss": 0.75322682, + "learning_rate": 3.010661570469245e-06, + "loss": 0.77442956, + "num_input_tokens_seen": 125759980, + "step": 5845, + "time_per_iteration": 2.730870008468628 + }, + { + "auxiliary_loss_clip": 0.01072312, + "auxiliary_loss_mlp": 0.01043309, + "balance_loss_clip": 1.03135812, + "balance_loss_mlp": 1.0289923, + "epoch": 0.35148053509694877, + "flos": 23834665923840.0, + "grad_norm": 3.2256271112334205, + "language_loss": 0.73226643, + "learning_rate": 3.0103254748074465e-06, + "loss": 0.75342262, + "num_input_tokens_seen": 125772660, + "step": 5846, + "time_per_iteration": 2.5934715270996094 + }, + { + "auxiliary_loss_clip": 0.01041227, + "auxiliary_loss_mlp": 0.01036192, + "balance_loss_clip": 1.02981126, + "balance_loss_mlp": 1.02163625, + "epoch": 0.35154065834961673, + "flos": 20991295267200.0, + "grad_norm": 1.6064830898335525, + "language_loss": 0.75170076, + "learning_rate": 3.0099893408340046e-06, + "loss": 0.77247494, + "num_input_tokens_seen": 125791935, + "step": 5847, + "time_per_iteration": 2.597343683242798 + }, + { + "auxiliary_loss_clip": 0.01057838, + "auxiliary_loss_mlp": 0.01036084, + "balance_loss_clip": 1.02856708, + "balance_loss_mlp": 1.02137375, + "epoch": 0.3516007816022847, + "flos": 33255471444480.0, + "grad_norm": 1.906958565092796, + "language_loss": 0.72603214, + "learning_rate": 3.009653168561666e-06, + "loss": 0.74697137, + "num_input_tokens_seen": 125813455, + "step": 5848, + "time_per_iteration": 2.7330517768859863 + }, + { + "auxiliary_loss_clip": 0.0106836, + "auxiliary_loss_mlp": 0.01050868, + "balance_loss_clip": 1.03218114, + "balance_loss_mlp": 1.03542411, + "epoch": 0.35166090485495266, + "flos": 11726953390080.0, + "grad_norm": 2.3231986017225883, + "language_loss": 0.89333844, + "learning_rate": 3.009316958003178e-06, + "loss": 0.91453075, + "num_input_tokens_seen": 125827660, + "step": 5849, + "time_per_iteration": 2.5956571102142334 + }, + { + "auxiliary_loss_clip": 0.01063215, + "auxiliary_loss_mlp": 0.01034084, + "balance_loss_clip": 1.02998149, + "balance_loss_mlp": 1.01952839, + "epoch": 0.3517210281076206, + "flos": 22638446265600.0, + "grad_norm": 1.733287107868346, + "language_loss": 0.74351656, + "learning_rate": 3.0089807091712897e-06, + "loss": 0.76448953, + "num_input_tokens_seen": 125846655, + "step": 5850, + "time_per_iteration": 2.7545320987701416 + }, + { + "auxiliary_loss_clip": 0.01073596, + "auxiliary_loss_mlp": 0.01038005, + "balance_loss_clip": 1.03247666, + "balance_loss_mlp": 1.02305007, + "epoch": 0.3517811513602886, + "flos": 21322750993920.0, + "grad_norm": 1.6646944118861444, + "language_loss": 0.75586927, + "learning_rate": 3.0086444220787515e-06, + "loss": 0.77698529, + "num_input_tokens_seen": 125866290, + "step": 5851, + "time_per_iteration": 2.6742935180664062 + }, + { + "auxiliary_loss_clip": 0.01060227, + "auxiliary_loss_mlp": 0.0104479, + "balance_loss_clip": 1.03152442, + "balance_loss_mlp": 1.02767181, + "epoch": 0.35184127461295656, + "flos": 21032880238080.0, + "grad_norm": 1.8381367479250983, + "language_loss": 0.87209809, + "learning_rate": 3.0083080967383165e-06, + "loss": 0.89314824, + "num_input_tokens_seen": 125884620, + "step": 5852, + "time_per_iteration": 5.795276165008545 + }, + { + "auxiliary_loss_clip": 0.01083057, + "auxiliary_loss_mlp": 0.01035002, + "balance_loss_clip": 1.03104353, + "balance_loss_mlp": 1.02175832, + "epoch": 0.3519013978656245, + "flos": 22455265881600.0, + "grad_norm": 1.971358023935169, + "language_loss": 0.68014753, + "learning_rate": 3.007971733162737e-06, + "loss": 0.7013281, + "num_input_tokens_seen": 125902430, + "step": 5853, + "time_per_iteration": 2.6238021850585938 + }, + { + "auxiliary_loss_clip": 0.01057517, + "auxiliary_loss_mlp": 0.01035532, + "balance_loss_clip": 1.02822208, + "balance_loss_mlp": 1.02066684, + "epoch": 0.3519615211182925, + "flos": 13115295918720.0, + "grad_norm": 1.534706696596489, + "language_loss": 0.80793405, + "learning_rate": 3.0076353313647686e-06, + "loss": 0.82886457, + "num_input_tokens_seen": 125920570, + "step": 5854, + "time_per_iteration": 2.6634223461151123 + }, + { + "auxiliary_loss_clip": 0.01072369, + "auxiliary_loss_mlp": 0.01035871, + "balance_loss_clip": 1.03903747, + "balance_loss_mlp": 1.02233529, + "epoch": 0.35202164437096045, + "flos": 19135144984320.0, + "grad_norm": 1.4617750638858693, + "language_loss": 0.7315439, + "learning_rate": 3.0072988913571666e-06, + "loss": 0.75262642, + "num_input_tokens_seen": 125939800, + "step": 5855, + "time_per_iteration": 2.6847991943359375 + }, + { + "auxiliary_loss_clip": 0.01082043, + "auxiliary_loss_mlp": 0.01039596, + "balance_loss_clip": 1.03017497, + "balance_loss_mlp": 1.02586913, + "epoch": 0.3520817676236284, + "flos": 26542187343360.0, + "grad_norm": 11.027033753639882, + "language_loss": 0.71028066, + "learning_rate": 3.006962413152691e-06, + "loss": 0.73149705, + "num_input_tokens_seen": 125958720, + "step": 5856, + "time_per_iteration": 2.6486973762512207 + }, + { + "auxiliary_loss_clip": 0.0106665, + "auxiliary_loss_mlp": 0.0104617, + "balance_loss_clip": 1.02930868, + "balance_loss_mlp": 1.03005314, + "epoch": 0.3521418908762964, + "flos": 44893472803200.0, + "grad_norm": 2.2772401099190716, + "language_loss": 0.60870862, + "learning_rate": 3.0066258967640987e-06, + "loss": 0.62983686, + "num_input_tokens_seen": 125984310, + "step": 5857, + "time_per_iteration": 2.8014190196990967 + }, + { + "auxiliary_loss_clip": 0.01075678, + "auxiliary_loss_mlp": 0.01041113, + "balance_loss_clip": 1.03216743, + "balance_loss_mlp": 1.02612233, + "epoch": 0.3522020141289644, + "flos": 20187398931840.0, + "grad_norm": 1.991125144557333, + "language_loss": 0.7373842, + "learning_rate": 3.006289342204152e-06, + "loss": 0.75855207, + "num_input_tokens_seen": 126002410, + "step": 5858, + "time_per_iteration": 2.627747058868408 + }, + { + "auxiliary_loss_clip": 0.01086965, + "auxiliary_loss_mlp": 0.01037879, + "balance_loss_clip": 1.03181899, + "balance_loss_mlp": 1.02379417, + "epoch": 0.35226213738163237, + "flos": 27563917708800.0, + "grad_norm": 1.816659939396009, + "language_loss": 0.76121122, + "learning_rate": 3.0059527494856126e-06, + "loss": 0.78245962, + "num_input_tokens_seen": 126022490, + "step": 5859, + "time_per_iteration": 2.7492282390594482 + }, + { + "auxiliary_loss_clip": 0.01073474, + "auxiliary_loss_mlp": 0.01045025, + "balance_loss_clip": 1.03653193, + "balance_loss_mlp": 1.02871668, + "epoch": 0.35232226063430033, + "flos": 22966310632320.0, + "grad_norm": 1.8934005522225603, + "language_loss": 0.71692836, + "learning_rate": 3.0056161186212435e-06, + "loss": 0.7381134, + "num_input_tokens_seen": 126042895, + "step": 5860, + "time_per_iteration": 2.6507952213287354 + }, + { + "auxiliary_loss_clip": 0.01058646, + "auxiliary_loss_mlp": 0.01042769, + "balance_loss_clip": 1.02914214, + "balance_loss_mlp": 1.02649736, + "epoch": 0.3523823838869683, + "flos": 19168290259200.0, + "grad_norm": 2.3346753853902036, + "language_loss": 0.66779554, + "learning_rate": 3.005279449623811e-06, + "loss": 0.68880963, + "num_input_tokens_seen": 126060130, + "step": 5861, + "time_per_iteration": 2.6734330654144287 + }, + { + "auxiliary_loss_clip": 0.01063451, + "auxiliary_loss_mlp": 0.01035268, + "balance_loss_clip": 1.0309732, + "balance_loss_mlp": 1.02086139, + "epoch": 0.35244250713963626, + "flos": 17930988420480.0, + "grad_norm": 2.0303249491360527, + "language_loss": 0.66292369, + "learning_rate": 3.0049427425060815e-06, + "loss": 0.68391085, + "num_input_tokens_seen": 126077850, + "step": 5862, + "time_per_iteration": 2.5857973098754883 + }, + { + "auxiliary_loss_clip": 0.01066593, + "auxiliary_loss_mlp": 0.010422, + "balance_loss_clip": 1.03312063, + "balance_loss_mlp": 1.0261786, + "epoch": 0.35250263039230423, + "flos": 21432529935360.0, + "grad_norm": 1.8606263602753106, + "language_loss": 0.77525759, + "learning_rate": 3.0046059972808215e-06, + "loss": 0.79634553, + "num_input_tokens_seen": 126095985, + "step": 5863, + "time_per_iteration": 2.788321018218994 + }, + { + "auxiliary_loss_clip": 0.01076944, + "auxiliary_loss_mlp": 0.01039455, + "balance_loss_clip": 1.0333426, + "balance_loss_mlp": 1.02531123, + "epoch": 0.3525627536449722, + "flos": 27416863428480.0, + "grad_norm": 1.8704686995113504, + "language_loss": 0.7542178, + "learning_rate": 3.0042692139608024e-06, + "loss": 0.7753818, + "num_input_tokens_seen": 126116070, + "step": 5864, + "time_per_iteration": 2.9184176921844482 + }, + { + "auxiliary_loss_clip": 0.01072387, + "auxiliary_loss_mlp": 0.01050854, + "balance_loss_clip": 1.03033054, + "balance_loss_mlp": 1.03656673, + "epoch": 0.35262287689764016, + "flos": 24789818430720.0, + "grad_norm": 2.155267140785009, + "language_loss": 0.79803419, + "learning_rate": 3.003932392558793e-06, + "loss": 0.81926662, + "num_input_tokens_seen": 126135205, + "step": 5865, + "time_per_iteration": 2.64909291267395 + }, + { + "auxiliary_loss_clip": 0.01076776, + "auxiliary_loss_mlp": 0.01045125, + "balance_loss_clip": 1.03300726, + "balance_loss_mlp": 1.02943134, + "epoch": 0.3526830001503081, + "flos": 17821604528640.0, + "grad_norm": 2.388182386970465, + "language_loss": 0.81604838, + "learning_rate": 3.0035955330875677e-06, + "loss": 0.8372674, + "num_input_tokens_seen": 126151895, + "step": 5866, + "time_per_iteration": 2.556631326675415 + }, + { + "auxiliary_loss_clip": 0.01048382, + "auxiliary_loss_mlp": 0.01039918, + "balance_loss_clip": 1.03087902, + "balance_loss_mlp": 1.02359819, + "epoch": 0.3527431234029761, + "flos": 18078114528000.0, + "grad_norm": 2.405798619599296, + "language_loss": 0.8404128, + "learning_rate": 3.0032586355598986e-06, + "loss": 0.86129582, + "num_input_tokens_seen": 126168515, + "step": 5867, + "time_per_iteration": 2.641813278198242 + }, + { + "auxiliary_loss_clip": 0.01088125, + "auxiliary_loss_mlp": 0.01045743, + "balance_loss_clip": 1.03330374, + "balance_loss_mlp": 1.03003144, + "epoch": 0.35280324665564405, + "flos": 19427350124160.0, + "grad_norm": 1.9313501824294725, + "language_loss": 0.74221581, + "learning_rate": 3.0029216999885613e-06, + "loss": 0.76355445, + "num_input_tokens_seen": 126186460, + "step": 5868, + "time_per_iteration": 2.5157792568206787 + }, + { + "auxiliary_loss_clip": 0.01076686, + "auxiliary_loss_mlp": 0.01039006, + "balance_loss_clip": 1.03129649, + "balance_loss_mlp": 1.02384233, + "epoch": 0.352863369908312, + "flos": 21504027957120.0, + "grad_norm": 1.710207388799679, + "language_loss": 0.61321437, + "learning_rate": 3.0025847263863327e-06, + "loss": 0.63437128, + "num_input_tokens_seen": 126206170, + "step": 5869, + "time_per_iteration": 2.5852208137512207 + }, + { + "auxiliary_loss_clip": 0.01074512, + "auxiliary_loss_mlp": 0.01040202, + "balance_loss_clip": 1.03047919, + "balance_loss_mlp": 1.02491331, + "epoch": 0.35292349316098, + "flos": 22309504490880.0, + "grad_norm": 1.9435864876331026, + "language_loss": 0.74262327, + "learning_rate": 3.0022477147659917e-06, + "loss": 0.76377034, + "num_input_tokens_seen": 126225605, + "step": 5870, + "time_per_iteration": 2.5973479747772217 + }, + { + "auxiliary_loss_clip": 0.01076036, + "auxiliary_loss_mlp": 0.01042731, + "balance_loss_clip": 1.03233242, + "balance_loss_mlp": 1.02748966, + "epoch": 0.352983616413648, + "flos": 33109745967360.0, + "grad_norm": 1.4181664740815274, + "language_loss": 0.72005546, + "learning_rate": 3.001910665140316e-06, + "loss": 0.74124312, + "num_input_tokens_seen": 126250230, + "step": 5871, + "time_per_iteration": 2.7525250911712646 + }, + { + "auxiliary_loss_clip": 0.01063674, + "auxiliary_loss_mlp": 0.01036303, + "balance_loss_clip": 1.02760315, + "balance_loss_mlp": 1.02289152, + "epoch": 0.35304373966631597, + "flos": 18696603836160.0, + "grad_norm": 1.8907169048942107, + "language_loss": 0.73876524, + "learning_rate": 3.0015735775220873e-06, + "loss": 0.75976497, + "num_input_tokens_seen": 126268315, + "step": 5872, + "time_per_iteration": 2.5921988487243652 + }, + { + "auxiliary_loss_clip": 0.01063539, + "auxiliary_loss_mlp": 0.00748045, + "balance_loss_clip": 1.03129625, + "balance_loss_mlp": 1.00036669, + "epoch": 0.35310386291898394, + "flos": 23364954748800.0, + "grad_norm": 1.6820542046354945, + "language_loss": 0.82981211, + "learning_rate": 3.001236451924089e-06, + "loss": 0.84792799, + "num_input_tokens_seen": 126288390, + "step": 5873, + "time_per_iteration": 2.7476000785827637 + }, + { + "auxiliary_loss_clip": 0.01063048, + "auxiliary_loss_mlp": 0.01045652, + "balance_loss_clip": 1.02954912, + "balance_loss_mlp": 1.02982092, + "epoch": 0.3531639861716519, + "flos": 24461954064000.0, + "grad_norm": 1.7506703567568438, + "language_loss": 0.65506482, + "learning_rate": 3.000899288359104e-06, + "loss": 0.67615175, + "num_input_tokens_seen": 126305750, + "step": 5874, + "time_per_iteration": 2.7036588191986084 + }, + { + "auxiliary_loss_clip": 0.01003725, + "auxiliary_loss_mlp": 0.01010372, + "balance_loss_clip": 1.00233877, + "balance_loss_mlp": 1.00866699, + "epoch": 0.35322410942431987, + "flos": 70312446881280.0, + "grad_norm": 0.7652917261082689, + "language_loss": 0.61513972, + "learning_rate": 3.000562086839917e-06, + "loss": 0.63528067, + "num_input_tokens_seen": 126362495, + "step": 5875, + "time_per_iteration": 3.1013925075531006 + }, + { + "auxiliary_loss_clip": 0.0101605, + "auxiliary_loss_mlp": 0.01046018, + "balance_loss_clip": 1.02601373, + "balance_loss_mlp": 1.03167164, + "epoch": 0.35328423267698783, + "flos": 19820894509440.0, + "grad_norm": 2.3822309612748174, + "language_loss": 0.80216533, + "learning_rate": 3.0002248473793163e-06, + "loss": 0.82278609, + "num_input_tokens_seen": 126378320, + "step": 5876, + "time_per_iteration": 2.7092113494873047 + }, + { + "auxiliary_loss_clip": 0.00984667, + "auxiliary_loss_mlp": 0.00746668, + "balance_loss_clip": 1.00426614, + "balance_loss_mlp": 0.9998517, + "epoch": 0.3533443559296558, + "flos": 60826356391680.0, + "grad_norm": 0.6704832111336382, + "language_loss": 0.5679425, + "learning_rate": 2.999887569990088e-06, + "loss": 0.58525586, + "num_input_tokens_seen": 126442735, + "step": 5877, + "time_per_iteration": 3.351252555847168 + }, + { + "auxiliary_loss_clip": 0.01053403, + "auxiliary_loss_mlp": 0.01034792, + "balance_loss_clip": 1.02856338, + "balance_loss_mlp": 1.01955104, + "epoch": 0.35340447918232376, + "flos": 24755775315840.0, + "grad_norm": 1.6413078720655785, + "language_loss": 0.71934241, + "learning_rate": 2.999550254685024e-06, + "loss": 0.74022436, + "num_input_tokens_seen": 126463090, + "step": 5878, + "time_per_iteration": 2.71840238571167 + }, + { + "auxiliary_loss_clip": 0.01062914, + "auxiliary_loss_mlp": 0.01039144, + "balance_loss_clip": 1.02952957, + "balance_loss_mlp": 1.02442169, + "epoch": 0.3534646024349917, + "flos": 21796304924160.0, + "grad_norm": 1.949659055335156, + "language_loss": 0.78617883, + "learning_rate": 2.9992129014769136e-06, + "loss": 0.80719936, + "num_input_tokens_seen": 126482105, + "step": 5879, + "time_per_iteration": 2.664520502090454 + }, + { + "auxiliary_loss_clip": 0.01059516, + "auxiliary_loss_mlp": 0.01050191, + "balance_loss_clip": 1.03280175, + "balance_loss_mlp": 1.03261971, + "epoch": 0.3535247256876597, + "flos": 20012119539840.0, + "grad_norm": 2.0351866034649433, + "language_loss": 0.63126636, + "learning_rate": 2.9988755103785493e-06, + "loss": 0.65236342, + "num_input_tokens_seen": 126502125, + "step": 5880, + "time_per_iteration": 2.716855525970459 + }, + { + "auxiliary_loss_clip": 0.01064418, + "auxiliary_loss_mlp": 0.01035542, + "balance_loss_clip": 1.03104448, + "balance_loss_mlp": 1.01991415, + "epoch": 0.35358484894032766, + "flos": 18187929383040.0, + "grad_norm": 2.478701253935224, + "language_loss": 0.65732598, + "learning_rate": 2.998538081402727e-06, + "loss": 0.67832553, + "num_input_tokens_seen": 126521950, + "step": 5881, + "time_per_iteration": 2.6319103240966797 + }, + { + "auxiliary_loss_clip": 0.01072402, + "auxiliary_loss_mlp": 0.01035717, + "balance_loss_clip": 1.0326786, + "balance_loss_mlp": 1.02156126, + "epoch": 0.3536449721929956, + "flos": 22820369673600.0, + "grad_norm": 1.3335222584553599, + "language_loss": 0.75574952, + "learning_rate": 2.998200614562239e-06, + "loss": 0.77683073, + "num_input_tokens_seen": 126542445, + "step": 5882, + "time_per_iteration": 2.6192705631256104 + }, + { + "auxiliary_loss_clip": 0.01059274, + "auxiliary_loss_mlp": 0.0105067, + "balance_loss_clip": 1.03058386, + "balance_loss_mlp": 1.03266895, + "epoch": 0.3537050954456636, + "flos": 26432336574720.0, + "grad_norm": 2.1953207941560184, + "language_loss": 0.70431447, + "learning_rate": 2.9978631098698847e-06, + "loss": 0.72541392, + "num_input_tokens_seen": 126560690, + "step": 5883, + "time_per_iteration": 2.6776628494262695 + }, + { + "auxiliary_loss_clip": 0.01060195, + "auxiliary_loss_mlp": 0.01038524, + "balance_loss_clip": 1.03299439, + "balance_loss_mlp": 1.02225196, + "epoch": 0.3537652186983316, + "flos": 17197153562880.0, + "grad_norm": 2.4625442887264106, + "language_loss": 0.78170764, + "learning_rate": 2.9975255673384614e-06, + "loss": 0.8026948, + "num_input_tokens_seen": 126577620, + "step": 5884, + "time_per_iteration": 2.703389883041382 + }, + { + "auxiliary_loss_clip": 0.01060526, + "auxiliary_loss_mlp": 0.01038382, + "balance_loss_clip": 1.03096509, + "balance_loss_mlp": 1.02399981, + "epoch": 0.3538253419509996, + "flos": 19536769929600.0, + "grad_norm": 3.1025471387598174, + "language_loss": 0.75283444, + "learning_rate": 2.9971879869807673e-06, + "loss": 0.7738235, + "num_input_tokens_seen": 126596235, + "step": 5885, + "time_per_iteration": 2.620246171951294 + }, + { + "auxiliary_loss_clip": 0.01032568, + "auxiliary_loss_mlp": 0.01045822, + "balance_loss_clip": 1.02559662, + "balance_loss_mlp": 1.02917993, + "epoch": 0.35388546520366754, + "flos": 12128578335360.0, + "grad_norm": 2.188211705922472, + "language_loss": 0.83377582, + "learning_rate": 2.996850368809606e-06, + "loss": 0.85455972, + "num_input_tokens_seen": 126612830, + "step": 5886, + "time_per_iteration": 2.69539737701416 + }, + { + "auxiliary_loss_clip": 0.01084999, + "auxiliary_loss_mlp": 0.01038456, + "balance_loss_clip": 1.03154337, + "balance_loss_mlp": 1.02220738, + "epoch": 0.3539455884563355, + "flos": 19678149861120.0, + "grad_norm": 7.271848820673805, + "language_loss": 0.78706229, + "learning_rate": 2.9965127128377787e-06, + "loss": 0.8082968, + "num_input_tokens_seen": 126630910, + "step": 5887, + "time_per_iteration": 2.5289433002471924 + }, + { + "auxiliary_loss_clip": 0.0102898, + "auxiliary_loss_mlp": 0.01045611, + "balance_loss_clip": 1.02698851, + "balance_loss_mlp": 1.03079343, + "epoch": 0.35400571170900347, + "flos": 18072045129600.0, + "grad_norm": 2.093505758614012, + "language_loss": 0.6586827, + "learning_rate": 2.996175019078089e-06, + "loss": 0.67942858, + "num_input_tokens_seen": 126648365, + "step": 5888, + "time_per_iteration": 4.250921726226807 + }, + { + "auxiliary_loss_clip": 0.01053732, + "auxiliary_loss_mlp": 0.01039152, + "balance_loss_clip": 1.02893901, + "balance_loss_mlp": 1.02419102, + "epoch": 0.35406583496167143, + "flos": 26068058795520.0, + "grad_norm": 1.7233515007258322, + "language_loss": 0.77050412, + "learning_rate": 2.9958372875433437e-06, + "loss": 0.79143298, + "num_input_tokens_seen": 126667500, + "step": 5889, + "time_per_iteration": 2.7190091609954834 + }, + { + "auxiliary_loss_clip": 0.01058895, + "auxiliary_loss_mlp": 0.01038075, + "balance_loss_clip": 1.03466225, + "balance_loss_mlp": 1.02309024, + "epoch": 0.3541259582143394, + "flos": 19792453916160.0, + "grad_norm": 1.8112169087482415, + "language_loss": 0.80669469, + "learning_rate": 2.9954995182463478e-06, + "loss": 0.82766438, + "num_input_tokens_seen": 126686820, + "step": 5890, + "time_per_iteration": 2.7259061336517334 + }, + { + "auxiliary_loss_clip": 0.01058077, + "auxiliary_loss_mlp": 0.01034102, + "balance_loss_clip": 1.02892339, + "balance_loss_mlp": 1.02055407, + "epoch": 0.35418608146700736, + "flos": 24022084112640.0, + "grad_norm": 1.5800411903910687, + "language_loss": 0.79674327, + "learning_rate": 2.99516171119991e-06, + "loss": 0.8176651, + "num_input_tokens_seen": 126706965, + "step": 5891, + "time_per_iteration": 4.3975138664245605 + }, + { + "auxiliary_loss_clip": 0.01045477, + "auxiliary_loss_mlp": 0.01043859, + "balance_loss_clip": 1.02772164, + "balance_loss_mlp": 1.02744341, + "epoch": 0.35424620471967533, + "flos": 12385770693120.0, + "grad_norm": 2.812082476927375, + "language_loss": 0.7307905, + "learning_rate": 2.9948238664168415e-06, + "loss": 0.75168389, + "num_input_tokens_seen": 126724015, + "step": 5892, + "time_per_iteration": 2.65199875831604 + }, + { + "auxiliary_loss_clip": 0.01059746, + "auxiliary_loss_mlp": 0.01039805, + "balance_loss_clip": 1.02956283, + "balance_loss_mlp": 1.02406991, + "epoch": 0.3543063279723433, + "flos": 19673624747520.0, + "grad_norm": 2.1211622685137668, + "language_loss": 0.67415303, + "learning_rate": 2.9944859839099518e-06, + "loss": 0.69514853, + "num_input_tokens_seen": 126737565, + "step": 5893, + "time_per_iteration": 2.886181354522705 + }, + { + "auxiliary_loss_clip": 0.01030608, + "auxiliary_loss_mlp": 0.0104203, + "balance_loss_clip": 1.02562284, + "balance_loss_mlp": 1.02549553, + "epoch": 0.35436645122501126, + "flos": 21909208348800.0, + "grad_norm": 2.0984485279562413, + "language_loss": 0.69636577, + "learning_rate": 2.9941480636920533e-06, + "loss": 0.71709216, + "num_input_tokens_seen": 126756095, + "step": 5894, + "time_per_iteration": 2.8614182472229004 + }, + { + "auxiliary_loss_clip": 0.01057164, + "auxiliary_loss_mlp": 0.00748145, + "balance_loss_clip": 1.03113747, + "balance_loss_mlp": 1.00045705, + "epoch": 0.3544265744776792, + "flos": 21719527603200.0, + "grad_norm": 1.5517596058844545, + "language_loss": 0.74573016, + "learning_rate": 2.9938101057759615e-06, + "loss": 0.76378322, + "num_input_tokens_seen": 126775455, + "step": 5895, + "time_per_iteration": 2.8698041439056396 + }, + { + "auxiliary_loss_clip": 0.01053676, + "auxiliary_loss_mlp": 0.01036205, + "balance_loss_clip": 1.02735734, + "balance_loss_mlp": 1.02077925, + "epoch": 0.3544866977303472, + "flos": 21213223447680.0, + "grad_norm": 1.8161775433809035, + "language_loss": 0.83678579, + "learning_rate": 2.993472110174491e-06, + "loss": 0.85768461, + "num_input_tokens_seen": 126792320, + "step": 5896, + "time_per_iteration": 2.7405645847320557 + }, + { + "auxiliary_loss_clip": 0.01061207, + "auxiliary_loss_mlp": 0.00748112, + "balance_loss_clip": 1.02985215, + "balance_loss_mlp": 1.00037539, + "epoch": 0.35454682098301515, + "flos": 29311402371840.0, + "grad_norm": 1.5786763918988842, + "language_loss": 0.70309627, + "learning_rate": 2.9931340769004576e-06, + "loss": 0.7211895, + "num_input_tokens_seen": 126813680, + "step": 5897, + "time_per_iteration": 2.70941162109375 + }, + { + "auxiliary_loss_clip": 0.01060167, + "auxiliary_loss_mlp": 0.01042457, + "balance_loss_clip": 1.03295875, + "balance_loss_mlp": 1.02588725, + "epoch": 0.3546069442356832, + "flos": 24316587722880.0, + "grad_norm": 4.684213568534083, + "language_loss": 0.81575668, + "learning_rate": 2.9927960059666816e-06, + "loss": 0.83678293, + "num_input_tokens_seen": 126834395, + "step": 5898, + "time_per_iteration": 2.6291451454162598 + }, + { + "auxiliary_loss_clip": 0.01081917, + "auxiliary_loss_mlp": 0.01039849, + "balance_loss_clip": 1.03096783, + "balance_loss_mlp": 1.02600908, + "epoch": 0.35466706748835114, + "flos": 22857285876480.0, + "grad_norm": 1.693005093502049, + "language_loss": 0.7421366, + "learning_rate": 2.9924578973859804e-06, + "loss": 0.76335418, + "num_input_tokens_seen": 126855145, + "step": 5899, + "time_per_iteration": 4.179685115814209 + }, + { + "auxiliary_loss_clip": 0.01083851, + "auxiliary_loss_mlp": 0.00748149, + "balance_loss_clip": 1.03075075, + "balance_loss_mlp": 1.00035453, + "epoch": 0.3547271907410191, + "flos": 28330107742080.0, + "grad_norm": 1.5822813437198928, + "language_loss": 0.79807353, + "learning_rate": 2.9921197511711763e-06, + "loss": 0.81639355, + "num_input_tokens_seen": 126873790, + "step": 5900, + "time_per_iteration": 4.190350294113159 + }, + { + "auxiliary_loss_clip": 0.01057196, + "auxiliary_loss_mlp": 0.01047108, + "balance_loss_clip": 1.02801096, + "balance_loss_mlp": 1.03074038, + "epoch": 0.35478731399368707, + "flos": 23514092017920.0, + "grad_norm": 1.6413076389746437, + "language_loss": 0.81631672, + "learning_rate": 2.991781567335093e-06, + "loss": 0.83735979, + "num_input_tokens_seen": 126892865, + "step": 5901, + "time_per_iteration": 2.6360085010528564 + }, + { + "auxiliary_loss_clip": 0.01078001, + "auxiliary_loss_mlp": 0.00748261, + "balance_loss_clip": 1.03421462, + "balance_loss_mlp": 1.00040603, + "epoch": 0.35484743724635504, + "flos": 18624315715200.0, + "grad_norm": 1.8019800203018743, + "language_loss": 0.75709426, + "learning_rate": 2.9914433458905525e-06, + "loss": 0.77535689, + "num_input_tokens_seen": 126911935, + "step": 5902, + "time_per_iteration": 2.676560878753662 + }, + { + "auxiliary_loss_clip": 0.0107434, + "auxiliary_loss_mlp": 0.0103542, + "balance_loss_clip": 1.03132975, + "balance_loss_mlp": 1.02123368, + "epoch": 0.354907560499023, + "flos": 17384499924480.0, + "grad_norm": 3.408445456058238, + "language_loss": 0.70856643, + "learning_rate": 2.991105086850381e-06, + "loss": 0.72966409, + "num_input_tokens_seen": 126930040, + "step": 5903, + "time_per_iteration": 2.5632777214050293 + }, + { + "auxiliary_loss_clip": 0.0107501, + "auxiliary_loss_mlp": 0.01039966, + "balance_loss_clip": 1.02996516, + "balance_loss_mlp": 1.02450466, + "epoch": 0.35496768375169097, + "flos": 19208546426880.0, + "grad_norm": 2.583183076451736, + "language_loss": 0.74767745, + "learning_rate": 2.9907667902274053e-06, + "loss": 0.7688272, + "num_input_tokens_seen": 126948390, + "step": 5904, + "time_per_iteration": 2.5988383293151855 + }, + { + "auxiliary_loss_clip": 0.010665, + "auxiliary_loss_mlp": 0.00748067, + "balance_loss_clip": 1.03261232, + "balance_loss_mlp": 1.00030661, + "epoch": 0.35502780700435893, + "flos": 18332792933760.0, + "grad_norm": 2.845608510739797, + "language_loss": 0.78581679, + "learning_rate": 2.9904284560344536e-06, + "loss": 0.80396247, + "num_input_tokens_seen": 126964905, + "step": 5905, + "time_per_iteration": 2.565783739089966 + }, + { + "auxiliary_loss_clip": 0.01044393, + "auxiliary_loss_mlp": 0.01034555, + "balance_loss_clip": 1.02764893, + "balance_loss_mlp": 1.02200198, + "epoch": 0.3550879302570269, + "flos": 15448555578240.0, + "grad_norm": 1.9498037649351432, + "language_loss": 0.7279911, + "learning_rate": 2.990090084284356e-06, + "loss": 0.74878061, + "num_input_tokens_seen": 126982000, + "step": 5906, + "time_per_iteration": 2.6615118980407715 + }, + { + "auxiliary_loss_clip": 0.01056639, + "auxiliary_loss_mlp": 0.01038395, + "balance_loss_clip": 1.0299952, + "balance_loss_mlp": 1.021348, + "epoch": 0.35514805350969486, + "flos": 21979197999360.0, + "grad_norm": 1.9107835376900049, + "language_loss": 0.7476145, + "learning_rate": 2.9897516749899426e-06, + "loss": 0.76856482, + "num_input_tokens_seen": 126998390, + "step": 5907, + "time_per_iteration": 2.596601724624634 + }, + { + "auxiliary_loss_clip": 0.01005336, + "auxiliary_loss_mlp": 0.01038365, + "balance_loss_clip": 1.0206151, + "balance_loss_mlp": 1.02138948, + "epoch": 0.3552081767623628, + "flos": 29861949104640.0, + "grad_norm": 1.7592323564811678, + "language_loss": 0.75710392, + "learning_rate": 2.989413228164047e-06, + "loss": 0.77754092, + "num_input_tokens_seen": 127020220, + "step": 5908, + "time_per_iteration": 2.803833246231079 + }, + { + "auxiliary_loss_clip": 0.01065496, + "auxiliary_loss_mlp": 0.01041403, + "balance_loss_clip": 1.03151345, + "balance_loss_mlp": 1.02697241, + "epoch": 0.3552683000150308, + "flos": 26432264747520.0, + "grad_norm": 3.1685064304621546, + "language_loss": 0.68310255, + "learning_rate": 2.989074743819502e-06, + "loss": 0.70417154, + "num_input_tokens_seen": 127038585, + "step": 5909, + "time_per_iteration": 2.6406426429748535 + }, + { + "auxiliary_loss_clip": 0.01070122, + "auxiliary_loss_mlp": 0.01036588, + "balance_loss_clip": 1.03054845, + "balance_loss_mlp": 1.02274227, + "epoch": 0.35532842326769876, + "flos": 19785989468160.0, + "grad_norm": 1.9816296917927971, + "language_loss": 0.78362381, + "learning_rate": 2.988736221969144e-06, + "loss": 0.80469096, + "num_input_tokens_seen": 127056215, + "step": 5910, + "time_per_iteration": 2.605187177658081 + }, + { + "auxiliary_loss_clip": 0.01057063, + "auxiliary_loss_mlp": 0.01044512, + "balance_loss_clip": 1.02748418, + "balance_loss_mlp": 1.02816856, + "epoch": 0.3553885465203668, + "flos": 17239277237760.0, + "grad_norm": 1.9072987046999517, + "language_loss": 0.71038222, + "learning_rate": 2.98839766262581e-06, + "loss": 0.73139799, + "num_input_tokens_seen": 127075825, + "step": 5911, + "time_per_iteration": 2.6307311058044434 + }, + { + "auxiliary_loss_clip": 0.01068689, + "auxiliary_loss_mlp": 0.01041545, + "balance_loss_clip": 1.02849674, + "balance_loss_mlp": 1.02700758, + "epoch": 0.35544866977303474, + "flos": 14934350430720.0, + "grad_norm": 3.2417246453624484, + "language_loss": 0.86469197, + "learning_rate": 2.9880590658023366e-06, + "loss": 0.88579428, + "num_input_tokens_seen": 127091205, + "step": 5912, + "time_per_iteration": 2.5729362964630127 + }, + { + "auxiliary_loss_clip": 0.01064229, + "auxiliary_loss_mlp": 0.01035928, + "balance_loss_clip": 1.03145111, + "balance_loss_mlp": 1.02130067, + "epoch": 0.3555087930257027, + "flos": 19756040503680.0, + "grad_norm": 1.932946132964006, + "language_loss": 0.77111036, + "learning_rate": 2.9877204315115646e-06, + "loss": 0.79211187, + "num_input_tokens_seen": 127109210, + "step": 5913, + "time_per_iteration": 2.6540300846099854 + }, + { + "auxiliary_loss_clip": 0.01050649, + "auxiliary_loss_mlp": 0.01039382, + "balance_loss_clip": 1.03211927, + "balance_loss_mlp": 1.02490449, + "epoch": 0.3555689162783707, + "flos": 21068252156160.0, + "grad_norm": 1.3330918029248513, + "language_loss": 0.8253054, + "learning_rate": 2.9873817597663353e-06, + "loss": 0.84620571, + "num_input_tokens_seen": 127128400, + "step": 5914, + "time_per_iteration": 2.852839946746826 + }, + { + "auxiliary_loss_clip": 0.01086834, + "auxiliary_loss_mlp": 0.01039448, + "balance_loss_clip": 1.03349757, + "balance_loss_mlp": 1.02463019, + "epoch": 0.35562903953103864, + "flos": 33069633454080.0, + "grad_norm": 2.5578482359257375, + "language_loss": 0.70191145, + "learning_rate": 2.98704305057949e-06, + "loss": 0.72317421, + "num_input_tokens_seen": 127149965, + "step": 5915, + "time_per_iteration": 2.7409892082214355 + }, + { + "auxiliary_loss_clip": 0.01071866, + "auxiliary_loss_mlp": 0.01038758, + "balance_loss_clip": 1.03015614, + "balance_loss_mlp": 1.02457786, + "epoch": 0.3556891627837066, + "flos": 20557853850240.0, + "grad_norm": 1.6075369184527852, + "language_loss": 0.76208574, + "learning_rate": 2.9867043039638737e-06, + "loss": 0.78319192, + "num_input_tokens_seen": 127169865, + "step": 5916, + "time_per_iteration": 2.6789121627807617 + }, + { + "auxiliary_loss_clip": 0.01046339, + "auxiliary_loss_mlp": 0.01038507, + "balance_loss_clip": 1.02818429, + "balance_loss_mlp": 1.02456546, + "epoch": 0.35574928603637457, + "flos": 20703327932160.0, + "grad_norm": 3.125042239163763, + "language_loss": 0.8850131, + "learning_rate": 2.986365519932332e-06, + "loss": 0.90586162, + "num_input_tokens_seen": 127188075, + "step": 5917, + "time_per_iteration": 2.8414804935455322 + }, + { + "auxiliary_loss_clip": 0.01002978, + "auxiliary_loss_mlp": 0.01038846, + "balance_loss_clip": 1.02341819, + "balance_loss_mlp": 1.02226424, + "epoch": 0.35580940928904253, + "flos": 15194595444480.0, + "grad_norm": 2.2298274329950734, + "language_loss": 0.74335718, + "learning_rate": 2.98602669849771e-06, + "loss": 0.76377547, + "num_input_tokens_seen": 127206065, + "step": 5918, + "time_per_iteration": 2.7460358142852783 + }, + { + "auxiliary_loss_clip": 0.01011461, + "auxiliary_loss_mlp": 0.01006422, + "balance_loss_clip": 1.01191676, + "balance_loss_mlp": 1.00347793, + "epoch": 0.3558695325417105, + "flos": 58639145431680.0, + "grad_norm": 0.9083329086715825, + "language_loss": 0.63812053, + "learning_rate": 2.985687839672857e-06, + "loss": 0.65829939, + "num_input_tokens_seen": 127257885, + "step": 5919, + "time_per_iteration": 2.957951068878174 + }, + { + "auxiliary_loss_clip": 0.01076785, + "auxiliary_loss_mlp": 0.01040435, + "balance_loss_clip": 1.0321027, + "balance_loss_mlp": 1.02556372, + "epoch": 0.35592965579437846, + "flos": 22018233104640.0, + "grad_norm": 1.9571967438422702, + "language_loss": 0.73289627, + "learning_rate": 2.9853489434706223e-06, + "loss": 0.75406849, + "num_input_tokens_seen": 127275550, + "step": 5920, + "time_per_iteration": 2.6749260425567627 + }, + { + "auxiliary_loss_clip": 0.01045118, + "auxiliary_loss_mlp": 0.0103637, + "balance_loss_clip": 1.02910852, + "balance_loss_mlp": 1.0215764, + "epoch": 0.35598977904704643, + "flos": 23367684182400.0, + "grad_norm": 2.154747262041148, + "language_loss": 0.77078211, + "learning_rate": 2.985010009903857e-06, + "loss": 0.79159701, + "num_input_tokens_seen": 127295110, + "step": 5921, + "time_per_iteration": 2.711354970932007 + }, + { + "auxiliary_loss_clip": 0.01062475, + "auxiliary_loss_mlp": 0.01034074, + "balance_loss_clip": 1.03062797, + "balance_loss_mlp": 1.02085984, + "epoch": 0.3560499022997144, + "flos": 17785334770560.0, + "grad_norm": 2.266350972723116, + "language_loss": 0.67659181, + "learning_rate": 2.9846710389854133e-06, + "loss": 0.69755727, + "num_input_tokens_seen": 127312865, + "step": 5922, + "time_per_iteration": 2.7527008056640625 + }, + { + "auxiliary_loss_clip": 0.0106699, + "auxiliary_loss_mlp": 0.01041537, + "balance_loss_clip": 1.03022623, + "balance_loss_mlp": 1.02717221, + "epoch": 0.35611002555238236, + "flos": 20740459616640.0, + "grad_norm": 2.0739917657089095, + "language_loss": 0.79001838, + "learning_rate": 2.9843320307281454e-06, + "loss": 0.8111037, + "num_input_tokens_seen": 127331710, + "step": 5923, + "time_per_iteration": 2.6112053394317627 + }, + { + "auxiliary_loss_clip": 0.01065567, + "auxiliary_loss_mlp": 0.01034716, + "balance_loss_clip": 1.0326519, + "balance_loss_mlp": 1.0204885, + "epoch": 0.3561701488050504, + "flos": 19462219251840.0, + "grad_norm": 1.6725505804726881, + "language_loss": 0.85215557, + "learning_rate": 2.983992985144908e-06, + "loss": 0.8731584, + "num_input_tokens_seen": 127350950, + "step": 5924, + "time_per_iteration": 2.679187297821045 + }, + { + "auxiliary_loss_clip": 0.01055335, + "auxiliary_loss_mlp": 0.01040896, + "balance_loss_clip": 1.02871454, + "balance_loss_mlp": 1.0255419, + "epoch": 0.35623027205771834, + "flos": 30774942023040.0, + "grad_norm": 2.0626508794332223, + "language_loss": 0.77616286, + "learning_rate": 2.9836539022485578e-06, + "loss": 0.79712522, + "num_input_tokens_seen": 127369385, + "step": 5925, + "time_per_iteration": 2.684023857116699 + }, + { + "auxiliary_loss_clip": 0.01033065, + "auxiliary_loss_mlp": 0.01041351, + "balance_loss_clip": 1.02995229, + "balance_loss_mlp": 1.02714169, + "epoch": 0.3562903953103863, + "flos": 16981079299200.0, + "grad_norm": 2.387582335010766, + "language_loss": 0.75847399, + "learning_rate": 2.9833147820519535e-06, + "loss": 0.7792182, + "num_input_tokens_seen": 127386965, + "step": 5926, + "time_per_iteration": 2.8164010047912598 + }, + { + "auxiliary_loss_clip": 0.01055451, + "auxiliary_loss_mlp": 0.00748062, + "balance_loss_clip": 1.03143716, + "balance_loss_mlp": 1.00024915, + "epoch": 0.3563505185630543, + "flos": 23839837482240.0, + "grad_norm": 2.210647371419425, + "language_loss": 0.70136547, + "learning_rate": 2.9829756245679544e-06, + "loss": 0.71940058, + "num_input_tokens_seen": 127406075, + "step": 5927, + "time_per_iteration": 2.882751226425171 + }, + { + "auxiliary_loss_clip": 0.01081178, + "auxiliary_loss_mlp": 0.01036037, + "balance_loss_clip": 1.03061616, + "balance_loss_mlp": 1.02272129, + "epoch": 0.35641064181572224, + "flos": 22273450214400.0, + "grad_norm": 2.4038824969596844, + "language_loss": 0.79498285, + "learning_rate": 2.9826364298094212e-06, + "loss": 0.81615496, + "num_input_tokens_seen": 127425350, + "step": 5928, + "time_per_iteration": 2.8361685276031494 + }, + { + "auxiliary_loss_clip": 0.01081985, + "auxiliary_loss_mlp": 0.01034643, + "balance_loss_clip": 1.03080678, + "balance_loss_mlp": 1.02083874, + "epoch": 0.3564707650683902, + "flos": 23001251587200.0, + "grad_norm": 1.3440048382344048, + "language_loss": 0.81797767, + "learning_rate": 2.982297197789215e-06, + "loss": 0.83914393, + "num_input_tokens_seen": 127446335, + "step": 5929, + "time_per_iteration": 2.784904718399048 + }, + { + "auxiliary_loss_clip": 0.01068985, + "auxiliary_loss_mlp": 0.0103142, + "balance_loss_clip": 1.02798831, + "balance_loss_mlp": 1.01786005, + "epoch": 0.35653088832105817, + "flos": 14684268965760.0, + "grad_norm": 1.5872506999310547, + "language_loss": 0.69997668, + "learning_rate": 2.981957928520201e-06, + "loss": 0.72098064, + "num_input_tokens_seen": 127462795, + "step": 5930, + "time_per_iteration": 2.869753837585449 + }, + { + "auxiliary_loss_clip": 0.01076172, + "auxiliary_loss_mlp": 0.01043881, + "balance_loss_clip": 1.03330493, + "balance_loss_mlp": 1.02953398, + "epoch": 0.35659101157372614, + "flos": 23477068074240.0, + "grad_norm": 2.09087135011957, + "language_loss": 0.6780116, + "learning_rate": 2.981618622015244e-06, + "loss": 0.69921213, + "num_input_tokens_seen": 127482675, + "step": 5931, + "time_per_iteration": 2.744408130645752 + }, + { + "auxiliary_loss_clip": 0.0107354, + "auxiliary_loss_mlp": 0.0103238, + "balance_loss_clip": 1.03186369, + "balance_loss_mlp": 1.01850986, + "epoch": 0.3566511348263941, + "flos": 26578672583040.0, + "grad_norm": 1.6242200888185838, + "language_loss": 0.67774236, + "learning_rate": 2.981279278287211e-06, + "loss": 0.69880152, + "num_input_tokens_seen": 127502275, + "step": 5932, + "time_per_iteration": 2.6395936012268066 + }, + { + "auxiliary_loss_clip": 0.01039938, + "auxiliary_loss_mlp": 0.01030415, + "balance_loss_clip": 1.03051066, + "balance_loss_mlp": 1.01653934, + "epoch": 0.35671125807906207, + "flos": 13115008609920.0, + "grad_norm": 2.8092936576918004, + "language_loss": 0.78704697, + "learning_rate": 2.980939897348969e-06, + "loss": 0.80775052, + "num_input_tokens_seen": 127520195, + "step": 5933, + "time_per_iteration": 2.66519832611084 + }, + { + "auxiliary_loss_clip": 0.01065914, + "auxiliary_loss_mlp": 0.01045237, + "balance_loss_clip": 1.0291127, + "balance_loss_mlp": 1.02969193, + "epoch": 0.35677138133173003, + "flos": 33000577557120.0, + "grad_norm": 2.3033338938711787, + "language_loss": 0.69127882, + "learning_rate": 2.980600479213388e-06, + "loss": 0.7123903, + "num_input_tokens_seen": 127544495, + "step": 5934, + "time_per_iteration": 2.703418254852295 + }, + { + "auxiliary_loss_clip": 0.01059802, + "auxiliary_loss_mlp": 0.0074806, + "balance_loss_clip": 1.03008711, + "balance_loss_mlp": 1.00021756, + "epoch": 0.356831504584398, + "flos": 20777842696320.0, + "grad_norm": 2.5490441870053653, + "language_loss": 0.71509612, + "learning_rate": 2.9802610238933384e-06, + "loss": 0.73317474, + "num_input_tokens_seen": 127563810, + "step": 5935, + "time_per_iteration": 4.302875757217407 + }, + { + "auxiliary_loss_clip": 0.01055394, + "auxiliary_loss_mlp": 0.01037667, + "balance_loss_clip": 1.03193855, + "balance_loss_mlp": 1.02314162, + "epoch": 0.35689162783706596, + "flos": 12165566365440.0, + "grad_norm": 2.0038995629083245, + "language_loss": 0.77857029, + "learning_rate": 2.979921531401692e-06, + "loss": 0.79950094, + "num_input_tokens_seen": 127579065, + "step": 5936, + "time_per_iteration": 2.724578619003296 + }, + { + "auxiliary_loss_clip": 0.01073253, + "auxiliary_loss_mlp": 0.00747791, + "balance_loss_clip": 1.03127599, + "balance_loss_mlp": 1.00014794, + "epoch": 0.356951751089734, + "flos": 23841489507840.0, + "grad_norm": 1.382704609970795, + "language_loss": 0.64118552, + "learning_rate": 2.9795820017513242e-06, + "loss": 0.65939593, + "num_input_tokens_seen": 127599105, + "step": 5937, + "time_per_iteration": 2.676368236541748 + }, + { + "auxiliary_loss_clip": 0.0108471, + "auxiliary_loss_mlp": 0.00747797, + "balance_loss_clip": 1.03175497, + "balance_loss_mlp": 1.00024128, + "epoch": 0.35701187434240195, + "flos": 11722176881280.0, + "grad_norm": 4.166062626348737, + "language_loss": 0.78555095, + "learning_rate": 2.9792424349551073e-06, + "loss": 0.80387592, + "num_input_tokens_seen": 127614940, + "step": 5938, + "time_per_iteration": 4.207992076873779 + }, + { + "auxiliary_loss_clip": 0.01053923, + "auxiliary_loss_mlp": 0.01042132, + "balance_loss_clip": 1.0335331, + "balance_loss_mlp": 1.02816069, + "epoch": 0.3570719975950699, + "flos": 24898879100160.0, + "grad_norm": 2.4451696704893875, + "language_loss": 0.80093694, + "learning_rate": 2.9789028310259202e-06, + "loss": 0.82189745, + "num_input_tokens_seen": 127634960, + "step": 5939, + "time_per_iteration": 2.708855152130127 + }, + { + "auxiliary_loss_clip": 0.0106222, + "auxiliary_loss_mlp": 0.01037666, + "balance_loss_clip": 1.02960873, + "balance_loss_mlp": 1.02318192, + "epoch": 0.3571321208477379, + "flos": 25994836920960.0, + "grad_norm": 1.7888640639045097, + "language_loss": 0.7930786, + "learning_rate": 2.9785631899766395e-06, + "loss": 0.81407744, + "num_input_tokens_seen": 127654545, + "step": 5940, + "time_per_iteration": 2.746220111846924 + }, + { + "auxiliary_loss_clip": 0.01066546, + "auxiliary_loss_mlp": 0.01035633, + "balance_loss_clip": 1.03225946, + "balance_loss_mlp": 1.02030873, + "epoch": 0.35719224410040584, + "flos": 14501663199360.0, + "grad_norm": 1.9165514506536476, + "language_loss": 0.72289944, + "learning_rate": 2.9782235118201443e-06, + "loss": 0.74392122, + "num_input_tokens_seen": 127672320, + "step": 5941, + "time_per_iteration": 2.7116987705230713 + }, + { + "auxiliary_loss_clip": 0.01081264, + "auxiliary_loss_mlp": 0.01042746, + "balance_loss_clip": 1.03671646, + "balance_loss_mlp": 1.02754641, + "epoch": 0.3572523673530738, + "flos": 31175453646720.0, + "grad_norm": 1.9002908488820012, + "language_loss": 0.63916498, + "learning_rate": 2.9778837965693154e-06, + "loss": 0.66040504, + "num_input_tokens_seen": 127693315, + "step": 5942, + "time_per_iteration": 2.7888307571411133 + }, + { + "auxiliary_loss_clip": 0.01072215, + "auxiliary_loss_mlp": 0.01042287, + "balance_loss_clip": 1.03070259, + "balance_loss_mlp": 1.02755845, + "epoch": 0.3573124906057418, + "flos": 15851976203520.0, + "grad_norm": 1.9138248550929213, + "language_loss": 0.73677027, + "learning_rate": 2.9775440442370354e-06, + "loss": 0.75791538, + "num_input_tokens_seen": 127711570, + "step": 5943, + "time_per_iteration": 2.5644612312316895 + }, + { + "auxiliary_loss_clip": 0.01014912, + "auxiliary_loss_mlp": 0.01003033, + "balance_loss_clip": 1.00378633, + "balance_loss_mlp": 1.00109029, + "epoch": 0.35737261385840974, + "flos": 60822729118080.0, + "grad_norm": 0.7868992931345075, + "language_loss": 0.60681349, + "learning_rate": 2.9772042548361867e-06, + "loss": 0.62699294, + "num_input_tokens_seen": 127772475, + "step": 5944, + "time_per_iteration": 3.2109293937683105 + }, + { + "auxiliary_loss_clip": 0.01061045, + "auxiliary_loss_mlp": 0.01036522, + "balance_loss_clip": 1.03013945, + "balance_loss_mlp": 1.02252054, + "epoch": 0.3574327371110777, + "flos": 18843765857280.0, + "grad_norm": 2.640776003050003, + "language_loss": 0.72293735, + "learning_rate": 2.976864428379655e-06, + "loss": 0.74391299, + "num_input_tokens_seen": 127790940, + "step": 5945, + "time_per_iteration": 2.607304811477661 + }, + { + "auxiliary_loss_clip": 0.01059033, + "auxiliary_loss_mlp": 0.00747782, + "balance_loss_clip": 1.02792442, + "balance_loss_mlp": 1.00023723, + "epoch": 0.35749286036374567, + "flos": 23549679417600.0, + "grad_norm": 2.045887288847123, + "language_loss": 0.81631476, + "learning_rate": 2.976524564880326e-06, + "loss": 0.83438289, + "num_input_tokens_seen": 127808275, + "step": 5946, + "time_per_iteration": 4.312964916229248 + }, + { + "auxiliary_loss_clip": 0.01084992, + "auxiliary_loss_mlp": 0.01043741, + "balance_loss_clip": 1.03226805, + "balance_loss_mlp": 1.02938819, + "epoch": 0.35755298361641363, + "flos": 21105491581440.0, + "grad_norm": 1.7129738008471465, + "language_loss": 0.6869486, + "learning_rate": 2.9761846643510882e-06, + "loss": 0.70823592, + "num_input_tokens_seen": 127828840, + "step": 5947, + "time_per_iteration": 2.5357799530029297 + }, + { + "auxiliary_loss_clip": 0.01054149, + "auxiliary_loss_mlp": 0.01037608, + "balance_loss_clip": 1.02856135, + "balance_loss_mlp": 1.02379191, + "epoch": 0.3576131068690816, + "flos": 19245031666560.0, + "grad_norm": 2.0698500326621985, + "language_loss": 0.75627542, + "learning_rate": 2.9758447268048297e-06, + "loss": 0.77719307, + "num_input_tokens_seen": 127846240, + "step": 5948, + "time_per_iteration": 4.188467502593994 + }, + { + "auxiliary_loss_clip": 0.01041918, + "auxiliary_loss_mlp": 0.0104114, + "balance_loss_clip": 1.03639507, + "balance_loss_mlp": 1.02789617, + "epoch": 0.35767323012174956, + "flos": 28654703971200.0, + "grad_norm": 2.1773455901863237, + "language_loss": 0.7071411, + "learning_rate": 2.9755047522544415e-06, + "loss": 0.72797167, + "num_input_tokens_seen": 127866880, + "step": 5949, + "time_per_iteration": 2.9680633544921875 + }, + { + "auxiliary_loss_clip": 0.01064148, + "auxiliary_loss_mlp": 0.01040269, + "balance_loss_clip": 1.03288817, + "balance_loss_mlp": 1.02723885, + "epoch": 0.35773335337441753, + "flos": 17085363459840.0, + "grad_norm": 1.8652901733703462, + "language_loss": 0.77067274, + "learning_rate": 2.9751647407128154e-06, + "loss": 0.79171693, + "num_input_tokens_seen": 127883560, + "step": 5950, + "time_per_iteration": 2.907633066177368 + }, + { + "auxiliary_loss_clip": 0.01074825, + "auxiliary_loss_mlp": 0.01040001, + "balance_loss_clip": 1.03117871, + "balance_loss_mlp": 1.02535665, + "epoch": 0.35779347662708555, + "flos": 15888605097600.0, + "grad_norm": 1.6708469639154462, + "language_loss": 0.7266506, + "learning_rate": 2.9748246921928445e-06, + "loss": 0.74779892, + "num_input_tokens_seen": 127902330, + "step": 5951, + "time_per_iteration": 2.604337692260742 + }, + { + "auxiliary_loss_clip": 0.01076991, + "auxiliary_loss_mlp": 0.01042678, + "balance_loss_clip": 1.0320611, + "balance_loss_mlp": 1.02772915, + "epoch": 0.3578535998797535, + "flos": 28658834035200.0, + "grad_norm": 1.84758941950843, + "language_loss": 0.6942451, + "learning_rate": 2.9744846067074236e-06, + "loss": 0.71544176, + "num_input_tokens_seen": 127922325, + "step": 5952, + "time_per_iteration": 2.7903552055358887 + }, + { + "auxiliary_loss_clip": 0.01031766, + "auxiliary_loss_mlp": 0.01053107, + "balance_loss_clip": 1.02615309, + "balance_loss_mlp": 1.037884, + "epoch": 0.3579137231324215, + "flos": 37852432076160.0, + "grad_norm": 1.6896392009480663, + "language_loss": 0.70111048, + "learning_rate": 2.974144484269449e-06, + "loss": 0.72195923, + "num_input_tokens_seen": 127942635, + "step": 5953, + "time_per_iteration": 2.9405863285064697 + }, + { + "auxiliary_loss_clip": 0.01061861, + "auxiliary_loss_mlp": 0.01031074, + "balance_loss_clip": 1.02918983, + "balance_loss_mlp": 1.01786613, + "epoch": 0.35797384638508944, + "flos": 22346851656960.0, + "grad_norm": 2.1174159397802454, + "language_loss": 0.6654681, + "learning_rate": 2.9738043248918175e-06, + "loss": 0.68639743, + "num_input_tokens_seen": 127962520, + "step": 5954, + "time_per_iteration": 2.709169864654541 + }, + { + "auxiliary_loss_clip": 0.01062547, + "auxiliary_loss_mlp": 0.0103666, + "balance_loss_clip": 1.03183222, + "balance_loss_mlp": 1.02313542, + "epoch": 0.3580339696377574, + "flos": 13589711775360.0, + "grad_norm": 1.9112977823329285, + "language_loss": 0.74745041, + "learning_rate": 2.9734641285874282e-06, + "loss": 0.76844245, + "num_input_tokens_seen": 127981180, + "step": 5955, + "time_per_iteration": 2.6267666816711426 + }, + { + "auxiliary_loss_clip": 0.01071038, + "auxiliary_loss_mlp": 0.01034706, + "balance_loss_clip": 1.03397036, + "balance_loss_mlp": 1.02172375, + "epoch": 0.3580940928904254, + "flos": 23768231719680.0, + "grad_norm": 1.4782574765882632, + "language_loss": 0.75962967, + "learning_rate": 2.973123895369182e-06, + "loss": 0.78068709, + "num_input_tokens_seen": 127999725, + "step": 5956, + "time_per_iteration": 2.650866746902466 + }, + { + "auxiliary_loss_clip": 0.01082527, + "auxiliary_loss_mlp": 0.01033695, + "balance_loss_clip": 1.03252208, + "balance_loss_mlp": 1.02076721, + "epoch": 0.35815421614309334, + "flos": 19463871277440.0, + "grad_norm": 1.8761265994108427, + "language_loss": 0.73186857, + "learning_rate": 2.9727836252499805e-06, + "loss": 0.75303078, + "num_input_tokens_seen": 128018885, + "step": 5957, + "time_per_iteration": 2.6725523471832275 + }, + { + "auxiliary_loss_clip": 0.0106277, + "auxiliary_loss_mlp": 0.01036994, + "balance_loss_clip": 1.03212309, + "balance_loss_mlp": 1.02347553, + "epoch": 0.3582143393957613, + "flos": 23368186972800.0, + "grad_norm": 1.8297142007271865, + "language_loss": 0.70987916, + "learning_rate": 2.972443318242726e-06, + "loss": 0.7308768, + "num_input_tokens_seen": 128037875, + "step": 5958, + "time_per_iteration": 2.6186606884002686 + }, + { + "auxiliary_loss_clip": 0.01047001, + "auxiliary_loss_mlp": 0.01031027, + "balance_loss_clip": 1.02883661, + "balance_loss_mlp": 1.01834857, + "epoch": 0.35827446264842927, + "flos": 26323275905280.0, + "grad_norm": 1.625602021362389, + "language_loss": 0.88437879, + "learning_rate": 2.972102974360324e-06, + "loss": 0.90515906, + "num_input_tokens_seen": 128056045, + "step": 5959, + "time_per_iteration": 2.701497793197632 + }, + { + "auxiliary_loss_clip": 0.01081099, + "auxiliary_loss_mlp": 0.01037588, + "balance_loss_clip": 1.03082788, + "balance_loss_mlp": 1.023772, + "epoch": 0.35833458590109724, + "flos": 30446610779520.0, + "grad_norm": 1.510724655581756, + "language_loss": 0.58044463, + "learning_rate": 2.971762593615679e-06, + "loss": 0.60163152, + "num_input_tokens_seen": 128077815, + "step": 5960, + "time_per_iteration": 2.680323362350464 + }, + { + "auxiliary_loss_clip": 0.01084257, + "auxiliary_loss_mlp": 0.01039748, + "balance_loss_clip": 1.03258669, + "balance_loss_mlp": 1.0247333, + "epoch": 0.3583947091537652, + "flos": 14829886702080.0, + "grad_norm": 2.071236821908717, + "language_loss": 0.76177251, + "learning_rate": 2.9714221760216993e-06, + "loss": 0.78301263, + "num_input_tokens_seen": 128095460, + "step": 5961, + "time_per_iteration": 2.5678670406341553 + }, + { + "auxiliary_loss_clip": 0.0105591, + "auxiliary_loss_mlp": 0.01032789, + "balance_loss_clip": 1.03410351, + "balance_loss_mlp": 1.01893711, + "epoch": 0.35845483240643317, + "flos": 34240644743040.0, + "grad_norm": 1.7740826367248976, + "language_loss": 0.70211834, + "learning_rate": 2.971081721591294e-06, + "loss": 0.72300529, + "num_input_tokens_seen": 128118605, + "step": 5962, + "time_per_iteration": 2.8112637996673584 + }, + { + "auxiliary_loss_clip": 0.01066963, + "auxiliary_loss_mlp": 0.01039391, + "balance_loss_clip": 1.035133, + "balance_loss_mlp": 1.02688622, + "epoch": 0.35851495565910113, + "flos": 20960089326720.0, + "grad_norm": 1.5494091824890965, + "language_loss": 0.74283206, + "learning_rate": 2.9707412303373716e-06, + "loss": 0.76389563, + "num_input_tokens_seen": 128139205, + "step": 5963, + "time_per_iteration": 2.6281089782714844 + }, + { + "auxiliary_loss_clip": 0.01084312, + "auxiliary_loss_mlp": 0.01038302, + "balance_loss_clip": 1.03377676, + "balance_loss_mlp": 1.02464008, + "epoch": 0.35857507891176915, + "flos": 22309863626880.0, + "grad_norm": 1.5266145320562725, + "language_loss": 0.78363276, + "learning_rate": 2.9704007022728447e-06, + "loss": 0.80485892, + "num_input_tokens_seen": 128158765, + "step": 5964, + "time_per_iteration": 2.543220043182373 + }, + { + "auxiliary_loss_clip": 0.01066894, + "auxiliary_loss_mlp": 0.01036105, + "balance_loss_clip": 1.03407919, + "balance_loss_mlp": 1.02030969, + "epoch": 0.3586352021644371, + "flos": 23367863750400.0, + "grad_norm": 2.3647815289053034, + "language_loss": 0.66717458, + "learning_rate": 2.970060137410626e-06, + "loss": 0.68820453, + "num_input_tokens_seen": 128177850, + "step": 5965, + "time_per_iteration": 2.662482500076294 + }, + { + "auxiliary_loss_clip": 0.01084011, + "auxiliary_loss_mlp": 0.00747941, + "balance_loss_clip": 1.03240132, + "balance_loss_mlp": 1.00022054, + "epoch": 0.3586953254171051, + "flos": 27849227437440.0, + "grad_norm": 1.5678386450722022, + "language_loss": 0.78991228, + "learning_rate": 2.9697195357636294e-06, + "loss": 0.80823183, + "num_input_tokens_seen": 128196925, + "step": 5966, + "time_per_iteration": 2.6302597522735596 + }, + { + "auxiliary_loss_clip": 0.01026217, + "auxiliary_loss_mlp": 0.01044764, + "balance_loss_clip": 1.02649593, + "balance_loss_mlp": 1.02918887, + "epoch": 0.35875544866977305, + "flos": 19500500171520.0, + "grad_norm": 1.9067106245184762, + "language_loss": 0.91152424, + "learning_rate": 2.9693788973447715e-06, + "loss": 0.93223405, + "num_input_tokens_seen": 128213955, + "step": 5967, + "time_per_iteration": 2.7028043270111084 + }, + { + "auxiliary_loss_clip": 0.01049845, + "auxiliary_loss_mlp": 0.0104756, + "balance_loss_clip": 1.03168869, + "balance_loss_mlp": 1.03026283, + "epoch": 0.358815571922441, + "flos": 21471134077440.0, + "grad_norm": 1.9068887483232977, + "language_loss": 0.80533683, + "learning_rate": 2.9690382221669682e-06, + "loss": 0.82631087, + "num_input_tokens_seen": 128232980, + "step": 5968, + "time_per_iteration": 2.701960325241089 + }, + { + "auxiliary_loss_clip": 0.01058805, + "auxiliary_loss_mlp": 0.01055487, + "balance_loss_clip": 1.02990842, + "balance_loss_mlp": 1.03959632, + "epoch": 0.358875695175109, + "flos": 21835411856640.0, + "grad_norm": 3.2628249199167922, + "language_loss": 0.8405652, + "learning_rate": 2.9686975102431384e-06, + "loss": 0.86170805, + "num_input_tokens_seen": 128252795, + "step": 5969, + "time_per_iteration": 2.7442128658294678 + }, + { + "auxiliary_loss_clip": 0.0104057, + "auxiliary_loss_mlp": 0.01033119, + "balance_loss_clip": 1.02825904, + "balance_loss_mlp": 1.01880777, + "epoch": 0.35893581842777694, + "flos": 32011633330560.0, + "grad_norm": 1.7510091742622582, + "language_loss": 0.71934104, + "learning_rate": 2.968356761586202e-06, + "loss": 0.74007791, + "num_input_tokens_seen": 128273115, + "step": 5970, + "time_per_iteration": 2.6966800689697266 + }, + { + "auxiliary_loss_clip": 0.01060804, + "auxiliary_loss_mlp": 0.0103337, + "balance_loss_clip": 1.0293591, + "balance_loss_mlp": 1.01961946, + "epoch": 0.3589959416804449, + "flos": 20485817124480.0, + "grad_norm": 2.2373050059908794, + "language_loss": 0.79134941, + "learning_rate": 2.9680159762090805e-06, + "loss": 0.81229115, + "num_input_tokens_seen": 128292220, + "step": 5971, + "time_per_iteration": 2.682384967803955 + }, + { + "auxiliary_loss_clip": 0.01050731, + "auxiliary_loss_mlp": 0.01039422, + "balance_loss_clip": 1.03304219, + "balance_loss_mlp": 1.0239011, + "epoch": 0.3590560649331129, + "flos": 16180666583040.0, + "grad_norm": 1.6813081069775173, + "language_loss": 0.78530288, + "learning_rate": 2.967675154124696e-06, + "loss": 0.80620444, + "num_input_tokens_seen": 128310305, + "step": 5972, + "time_per_iteration": 2.6860713958740234 + }, + { + "auxiliary_loss_clip": 0.01042596, + "auxiliary_loss_mlp": 0.01036535, + "balance_loss_clip": 1.02717352, + "balance_loss_mlp": 1.02205074, + "epoch": 0.35911618818578084, + "flos": 20375391738240.0, + "grad_norm": 7.241341931335326, + "language_loss": 0.81293511, + "learning_rate": 2.9673342953459722e-06, + "loss": 0.83372641, + "num_input_tokens_seen": 128328305, + "step": 5973, + "time_per_iteration": 2.639615535736084 + }, + { + "auxiliary_loss_clip": 0.00996944, + "auxiliary_loss_mlp": 0.01002079, + "balance_loss_clip": 1.00552702, + "balance_loss_mlp": 1.00002885, + "epoch": 0.3591763114384488, + "flos": 41236691685120.0, + "grad_norm": 0.9166585123343872, + "language_loss": 0.56672609, + "learning_rate": 2.9669933998858355e-06, + "loss": 0.58671635, + "num_input_tokens_seen": 128378380, + "step": 5974, + "time_per_iteration": 3.1643970012664795 + }, + { + "auxiliary_loss_clip": 0.01074853, + "auxiliary_loss_mlp": 0.0103762, + "balance_loss_clip": 1.03149438, + "balance_loss_mlp": 1.02366018, + "epoch": 0.35923643469111677, + "flos": 18695454600960.0, + "grad_norm": 1.6653280828366894, + "language_loss": 0.68777692, + "learning_rate": 2.9666524677572114e-06, + "loss": 0.70890164, + "num_input_tokens_seen": 128394315, + "step": 5975, + "time_per_iteration": 2.581453800201416 + }, + { + "auxiliary_loss_clip": 0.01084806, + "auxiliary_loss_mlp": 0.01037545, + "balance_loss_clip": 1.03366661, + "balance_loss_mlp": 1.02358007, + "epoch": 0.35929655794378473, + "flos": 25009950931200.0, + "grad_norm": 1.5984009021815393, + "language_loss": 0.8013711, + "learning_rate": 2.96631149897303e-06, + "loss": 0.82259458, + "num_input_tokens_seen": 128414515, + "step": 5976, + "time_per_iteration": 2.6280386447906494 + }, + { + "auxiliary_loss_clip": 0.01018714, + "auxiliary_loss_mlp": 0.01039655, + "balance_loss_clip": 1.02541387, + "balance_loss_mlp": 1.02446747, + "epoch": 0.35935668119645275, + "flos": 14975576265600.0, + "grad_norm": 2.0330008230847065, + "language_loss": 0.78922546, + "learning_rate": 2.9659704935462194e-06, + "loss": 0.80980915, + "num_input_tokens_seen": 128430615, + "step": 5977, + "time_per_iteration": 2.7168805599212646 + }, + { + "auxiliary_loss_clip": 0.01051285, + "auxiliary_loss_mlp": 0.0104072, + "balance_loss_clip": 1.03058672, + "balance_loss_mlp": 1.02756524, + "epoch": 0.3594168044491207, + "flos": 21178138838400.0, + "grad_norm": 1.7800835209969306, + "language_loss": 0.79856199, + "learning_rate": 2.9656294514897102e-06, + "loss": 0.81948209, + "num_input_tokens_seen": 128449480, + "step": 5978, + "time_per_iteration": 2.7292606830596924 + }, + { + "auxiliary_loss_clip": 0.01083192, + "auxiliary_loss_mlp": 0.00748009, + "balance_loss_clip": 1.03108704, + "balance_loss_mlp": 1.00025499, + "epoch": 0.3594769277017887, + "flos": 27672152365440.0, + "grad_norm": 1.4444128809996475, + "language_loss": 0.67309529, + "learning_rate": 2.965288372816436e-06, + "loss": 0.69140732, + "num_input_tokens_seen": 128471465, + "step": 5979, + "time_per_iteration": 2.6733603477478027 + }, + { + "auxiliary_loss_clip": 0.01052312, + "auxiliary_loss_mlp": 0.01037652, + "balance_loss_clip": 1.02864027, + "balance_loss_mlp": 1.02244043, + "epoch": 0.35953705095445665, + "flos": 23002328995200.0, + "grad_norm": 1.9912348793764993, + "language_loss": 0.67027152, + "learning_rate": 2.9649472575393296e-06, + "loss": 0.69117117, + "num_input_tokens_seen": 128490645, + "step": 5980, + "time_per_iteration": 2.6406073570251465 + }, + { + "auxiliary_loss_clip": 0.01064319, + "auxiliary_loss_mlp": 0.01041826, + "balance_loss_clip": 1.03034627, + "balance_loss_mlp": 1.02603102, + "epoch": 0.3595971742071246, + "flos": 25513992529920.0, + "grad_norm": 2.0396548026697396, + "language_loss": 0.71554029, + "learning_rate": 2.964606105671327e-06, + "loss": 0.73660171, + "num_input_tokens_seen": 128510225, + "step": 5981, + "time_per_iteration": 2.656475305557251 + }, + { + "auxiliary_loss_clip": 0.01067248, + "auxiliary_loss_mlp": 0.01043939, + "balance_loss_clip": 1.0351783, + "balance_loss_mlp": 1.02752411, + "epoch": 0.3596572974597926, + "flos": 29862559635840.0, + "grad_norm": 2.112880035496706, + "language_loss": 0.70991009, + "learning_rate": 2.9642649172253635e-06, + "loss": 0.731022, + "num_input_tokens_seen": 128530195, + "step": 5982, + "time_per_iteration": 2.6754133701324463 + }, + { + "auxiliary_loss_clip": 0.01063393, + "auxiliary_loss_mlp": 0.01045353, + "balance_loss_clip": 1.03010654, + "balance_loss_mlp": 1.0304637, + "epoch": 0.35971742071246054, + "flos": 23112538899840.0, + "grad_norm": 1.6465499925847857, + "language_loss": 0.75711209, + "learning_rate": 2.9639236922143786e-06, + "loss": 0.77819955, + "num_input_tokens_seen": 128549990, + "step": 5983, + "time_per_iteration": 4.202735662460327 + }, + { + "auxiliary_loss_clip": 0.01091152, + "auxiliary_loss_mlp": 0.01049795, + "balance_loss_clip": 1.03485739, + "balance_loss_mlp": 1.03348708, + "epoch": 0.3597775439651285, + "flos": 16725359399040.0, + "grad_norm": 2.0838761728393163, + "language_loss": 0.76632631, + "learning_rate": 2.96358243065131e-06, + "loss": 0.7877357, + "num_input_tokens_seen": 128567925, + "step": 5984, + "time_per_iteration": 2.6349689960479736 + }, + { + "auxiliary_loss_clip": 0.01072784, + "auxiliary_loss_mlp": 0.00747855, + "balance_loss_clip": 1.0325222, + "balance_loss_mlp": 1.00015295, + "epoch": 0.3598376672177965, + "flos": 19719483436800.0, + "grad_norm": 1.9569866038616655, + "language_loss": 0.86215723, + "learning_rate": 2.9632411325490993e-06, + "loss": 0.88036364, + "num_input_tokens_seen": 128585655, + "step": 5985, + "time_per_iteration": 4.2666099071502686 + }, + { + "auxiliary_loss_clip": 0.01070454, + "auxiliary_loss_mlp": 0.01043696, + "balance_loss_clip": 1.03036451, + "balance_loss_mlp": 1.02866411, + "epoch": 0.35989779047046444, + "flos": 17311529445120.0, + "grad_norm": 1.354389643958148, + "language_loss": 0.72607487, + "learning_rate": 2.9628997979206884e-06, + "loss": 0.7472164, + "num_input_tokens_seen": 128604820, + "step": 5986, + "time_per_iteration": 2.5715274810791016 + }, + { + "auxiliary_loss_clip": 0.01049516, + "auxiliary_loss_mlp": 0.01039077, + "balance_loss_clip": 1.0282228, + "balance_loss_mlp": 1.02422333, + "epoch": 0.3599579137231324, + "flos": 22711237176960.0, + "grad_norm": 2.180711501252997, + "language_loss": 0.73507202, + "learning_rate": 2.9625584267790204e-06, + "loss": 0.75595796, + "num_input_tokens_seen": 128623070, + "step": 5987, + "time_per_iteration": 2.722900629043579 + }, + { + "auxiliary_loss_clip": 0.01086013, + "auxiliary_loss_mlp": 0.01040146, + "balance_loss_clip": 1.03247654, + "balance_loss_mlp": 1.02481592, + "epoch": 0.36001803697580037, + "flos": 20959873845120.0, + "grad_norm": 2.260459531618945, + "language_loss": 0.69775307, + "learning_rate": 2.9622170191370404e-06, + "loss": 0.71901464, + "num_input_tokens_seen": 128642430, + "step": 5988, + "time_per_iteration": 2.5412440299987793 + }, + { + "auxiliary_loss_clip": 0.01076391, + "auxiliary_loss_mlp": 0.01040264, + "balance_loss_clip": 1.0320189, + "balance_loss_mlp": 1.02552938, + "epoch": 0.36007816022846834, + "flos": 20485565729280.0, + "grad_norm": 2.1395124194122355, + "language_loss": 0.72843015, + "learning_rate": 2.9618755750076953e-06, + "loss": 0.74959671, + "num_input_tokens_seen": 128661285, + "step": 5989, + "time_per_iteration": 2.5589113235473633 + }, + { + "auxiliary_loss_clip": 0.01035139, + "auxiliary_loss_mlp": 0.01037499, + "balance_loss_clip": 1.02544916, + "balance_loss_mlp": 1.02312279, + "epoch": 0.36013828348113636, + "flos": 28001237794560.0, + "grad_norm": 1.4682981774143995, + "language_loss": 0.79838812, + "learning_rate": 2.961534094403931e-06, + "loss": 0.81911457, + "num_input_tokens_seen": 128682210, + "step": 5990, + "time_per_iteration": 2.716391086578369 + }, + { + "auxiliary_loss_clip": 0.01072007, + "auxiliary_loss_mlp": 0.01031772, + "balance_loss_clip": 1.03011549, + "balance_loss_mlp": 1.01750255, + "epoch": 0.3601984067338043, + "flos": 20082181017600.0, + "grad_norm": 1.9559005772572164, + "language_loss": 0.83757758, + "learning_rate": 2.961192577338698e-06, + "loss": 0.85861534, + "num_input_tokens_seen": 128700445, + "step": 5991, + "time_per_iteration": 2.656526803970337 + }, + { + "auxiliary_loss_clip": 0.0106127, + "auxiliary_loss_mlp": 0.01048828, + "balance_loss_clip": 1.03153229, + "balance_loss_mlp": 1.03361702, + "epoch": 0.3602585299864723, + "flos": 18617599872000.0, + "grad_norm": 2.0968977548708287, + "language_loss": 0.75611246, + "learning_rate": 2.9608510238249463e-06, + "loss": 0.77721339, + "num_input_tokens_seen": 128716855, + "step": 5992, + "time_per_iteration": 2.6212470531463623 + }, + { + "auxiliary_loss_clip": 0.01083058, + "auxiliary_loss_mlp": 0.01039182, + "balance_loss_clip": 1.03144407, + "balance_loss_mlp": 1.0242455, + "epoch": 0.36031865323914025, + "flos": 19573003774080.0, + "grad_norm": 1.7940213342032696, + "language_loss": 0.77399278, + "learning_rate": 2.960509433875627e-06, + "loss": 0.79521513, + "num_input_tokens_seen": 128735835, + "step": 5993, + "time_per_iteration": 4.1277501583099365 + }, + { + "auxiliary_loss_clip": 0.01057504, + "auxiliary_loss_mlp": 0.01042264, + "balance_loss_clip": 1.02928042, + "balance_loss_mlp": 1.02576494, + "epoch": 0.3603787764918082, + "flos": 17490615678720.0, + "grad_norm": 1.782603382017473, + "language_loss": 0.74278355, + "learning_rate": 2.9601678075036943e-06, + "loss": 0.76378125, + "num_input_tokens_seen": 128752465, + "step": 5994, + "time_per_iteration": 2.650982618331909 + }, + { + "auxiliary_loss_clip": 0.01046575, + "auxiliary_loss_mlp": 0.01038245, + "balance_loss_clip": 1.0304234, + "balance_loss_mlp": 1.02395833, + "epoch": 0.3604388997444762, + "flos": 15523393564800.0, + "grad_norm": 1.8049141996156777, + "language_loss": 0.68788421, + "learning_rate": 2.9598261447221024e-06, + "loss": 0.70873243, + "num_input_tokens_seen": 128770865, + "step": 5995, + "time_per_iteration": 4.377038955688477 + }, + { + "auxiliary_loss_clip": 0.01053343, + "auxiliary_loss_mlp": 0.0104602, + "balance_loss_clip": 1.02804792, + "balance_loss_mlp": 1.03057611, + "epoch": 0.36049902299714415, + "flos": 17310883000320.0, + "grad_norm": 2.3399700717775676, + "language_loss": 0.82466739, + "learning_rate": 2.9594844455438057e-06, + "loss": 0.84566098, + "num_input_tokens_seen": 128789730, + "step": 5996, + "time_per_iteration": 2.649210214614868 + }, + { + "auxiliary_loss_clip": 0.01080869, + "auxiliary_loss_mlp": 0.01035666, + "balance_loss_clip": 1.02961373, + "balance_loss_mlp": 1.02124786, + "epoch": 0.3605591462498121, + "flos": 17056025026560.0, + "grad_norm": 4.521434843128666, + "language_loss": 0.73918265, + "learning_rate": 2.959142709981763e-06, + "loss": 0.76034796, + "num_input_tokens_seen": 128806610, + "step": 5997, + "time_per_iteration": 2.5534608364105225 + }, + { + "auxiliary_loss_clip": 0.01066875, + "auxiliary_loss_mlp": 0.01038334, + "balance_loss_clip": 1.02951813, + "balance_loss_mlp": 1.02479148, + "epoch": 0.3606192695024801, + "flos": 16836862193280.0, + "grad_norm": 2.1449352570151548, + "language_loss": 0.68919802, + "learning_rate": 2.9588009380489337e-06, + "loss": 0.71025014, + "num_input_tokens_seen": 128824830, + "step": 5998, + "time_per_iteration": 2.5567121505737305 + }, + { + "auxiliary_loss_clip": 0.01035287, + "auxiliary_loss_mlp": 0.01042728, + "balance_loss_clip": 1.02844477, + "balance_loss_mlp": 1.02658677, + "epoch": 0.36067939275514804, + "flos": 12129655743360.0, + "grad_norm": 2.150082126844253, + "language_loss": 0.76820815, + "learning_rate": 2.9584591297582758e-06, + "loss": 0.78898835, + "num_input_tokens_seen": 128838170, + "step": 5999, + "time_per_iteration": 2.699305295944214 + }, + { + "auxiliary_loss_clip": 0.01053683, + "auxiliary_loss_mlp": 0.01042849, + "balance_loss_clip": 1.03228593, + "balance_loss_mlp": 1.02833509, + "epoch": 0.360739516007816, + "flos": 18041449720320.0, + "grad_norm": 1.6679608448276517, + "language_loss": 0.78003955, + "learning_rate": 2.9581172851227516e-06, + "loss": 0.80100489, + "num_input_tokens_seen": 128855625, + "step": 6000, + "time_per_iteration": 2.676640748977661 + }, + { + "auxiliary_loss_clip": 0.01048847, + "auxiliary_loss_mlp": 0.01036968, + "balance_loss_clip": 1.03104377, + "balance_loss_mlp": 1.02282393, + "epoch": 0.360799639260484, + "flos": 18549800951040.0, + "grad_norm": 1.6106712961468101, + "language_loss": 0.78284049, + "learning_rate": 2.9577754041553243e-06, + "loss": 0.80369866, + "num_input_tokens_seen": 128873540, + "step": 6001, + "time_per_iteration": 2.768402576446533 + }, + { + "auxiliary_loss_clip": 0.01079888, + "auxiliary_loss_mlp": 0.00747896, + "balance_loss_clip": 1.03000379, + "balance_loss_mlp": 1.00023699, + "epoch": 0.36085976251315194, + "flos": 19682028529920.0, + "grad_norm": 2.054420893174295, + "language_loss": 0.83451581, + "learning_rate": 2.9574334868689575e-06, + "loss": 0.85279369, + "num_input_tokens_seen": 128889925, + "step": 6002, + "time_per_iteration": 2.527280807495117 + }, + { + "auxiliary_loss_clip": 0.01051356, + "auxiliary_loss_mlp": 0.01033824, + "balance_loss_clip": 1.02642179, + "balance_loss_mlp": 1.02071118, + "epoch": 0.3609198857658199, + "flos": 24198943703040.0, + "grad_norm": 1.9879009827478384, + "language_loss": 0.90369976, + "learning_rate": 2.9570915332766165e-06, + "loss": 0.92455155, + "num_input_tokens_seen": 128906890, + "step": 6003, + "time_per_iteration": 2.6863842010498047 + }, + { + "auxiliary_loss_clip": 0.00988321, + "auxiliary_loss_mlp": 0.01003978, + "balance_loss_clip": 1.0069077, + "balance_loss_mlp": 1.00214255, + "epoch": 0.3609800090184879, + "flos": 57115995160320.0, + "grad_norm": 0.8700694010795479, + "language_loss": 0.53382295, + "learning_rate": 2.9567495433912693e-06, + "loss": 0.55374599, + "num_input_tokens_seen": 128965940, + "step": 6004, + "time_per_iteration": 3.2110700607299805 + }, + { + "auxiliary_loss_clip": 0.01061868, + "auxiliary_loss_mlp": 0.00748013, + "balance_loss_clip": 1.02887261, + "balance_loss_mlp": 1.00017989, + "epoch": 0.3610401322711559, + "flos": 20811239366400.0, + "grad_norm": 1.6830008041182043, + "language_loss": 0.77766716, + "learning_rate": 2.956407517225883e-06, + "loss": 0.795766, + "num_input_tokens_seen": 128985835, + "step": 6005, + "time_per_iteration": 2.6266791820526123 + }, + { + "auxiliary_loss_clip": 0.01061276, + "auxiliary_loss_mlp": 0.01045635, + "balance_loss_clip": 1.02891731, + "balance_loss_mlp": 1.03118122, + "epoch": 0.36110025552382385, + "flos": 13699167494400.0, + "grad_norm": 1.8985570199006823, + "language_loss": 0.79145694, + "learning_rate": 2.956065454793429e-06, + "loss": 0.81252611, + "num_input_tokens_seen": 129003120, + "step": 6006, + "time_per_iteration": 2.527912139892578 + }, + { + "auxiliary_loss_clip": 0.01085427, + "auxiliary_loss_mlp": 0.01039411, + "balance_loss_clip": 1.03173625, + "balance_loss_mlp": 1.02403259, + "epoch": 0.3611603787764918, + "flos": 22455014486400.0, + "grad_norm": 2.383507470150857, + "language_loss": 0.84766209, + "learning_rate": 2.955723356106876e-06, + "loss": 0.86891049, + "num_input_tokens_seen": 129021645, + "step": 6007, + "time_per_iteration": 2.6335556507110596 + }, + { + "auxiliary_loss_clip": 0.01067543, + "auxiliary_loss_mlp": 0.01035443, + "balance_loss_clip": 1.03183043, + "balance_loss_mlp": 1.01937377, + "epoch": 0.3612205020291598, + "flos": 20886651970560.0, + "grad_norm": 2.500693009240786, + "language_loss": 0.72421962, + "learning_rate": 2.955381221179198e-06, + "loss": 0.74524951, + "num_input_tokens_seen": 129038375, + "step": 6008, + "time_per_iteration": 2.638486623764038 + }, + { + "auxiliary_loss_clip": 0.01065089, + "auxiliary_loss_mlp": 0.01037017, + "balance_loss_clip": 1.02670765, + "balance_loss_mlp": 1.02267599, + "epoch": 0.36128062528182775, + "flos": 15741981780480.0, + "grad_norm": 1.9131225862601258, + "language_loss": 0.83107769, + "learning_rate": 2.955039050023368e-06, + "loss": 0.8520987, + "num_input_tokens_seen": 129056235, + "step": 6009, + "time_per_iteration": 2.7236597537994385 + }, + { + "auxiliary_loss_clip": 0.0105537, + "auxiliary_loss_mlp": 0.01042632, + "balance_loss_clip": 1.03112566, + "balance_loss_mlp": 1.0282439, + "epoch": 0.3613407485344957, + "flos": 16764502245120.0, + "grad_norm": 1.7172303440732937, + "language_loss": 0.76358235, + "learning_rate": 2.954696842652362e-06, + "loss": 0.78456235, + "num_input_tokens_seen": 129072405, + "step": 6010, + "time_per_iteration": 2.735492467880249 + }, + { + "auxiliary_loss_clip": 0.01062558, + "auxiliary_loss_mlp": 0.01038994, + "balance_loss_clip": 1.03209603, + "balance_loss_mlp": 1.02569592, + "epoch": 0.3614008717871637, + "flos": 20371189847040.0, + "grad_norm": 1.4925661070094387, + "language_loss": 0.8301791, + "learning_rate": 2.9543545990791554e-06, + "loss": 0.85119462, + "num_input_tokens_seen": 129090225, + "step": 6011, + "time_per_iteration": 2.6351277828216553 + }, + { + "auxiliary_loss_clip": 0.01090723, + "auxiliary_loss_mlp": 0.01042093, + "balance_loss_clip": 1.03463054, + "balance_loss_mlp": 1.02678645, + "epoch": 0.36146099503983165, + "flos": 22776665800320.0, + "grad_norm": 1.8835996131367974, + "language_loss": 0.62143171, + "learning_rate": 2.954012319316727e-06, + "loss": 0.64275992, + "num_input_tokens_seen": 129107685, + "step": 6012, + "time_per_iteration": 2.6786348819732666 + }, + { + "auxiliary_loss_clip": 0.01057902, + "auxiliary_loss_mlp": 0.01037542, + "balance_loss_clip": 1.0294733, + "balance_loss_mlp": 1.02422595, + "epoch": 0.3615211182924996, + "flos": 22996654646400.0, + "grad_norm": 1.9090177234241887, + "language_loss": 0.84166247, + "learning_rate": 2.9536700033780565e-06, + "loss": 0.8626169, + "num_input_tokens_seen": 129125315, + "step": 6013, + "time_per_iteration": 2.7169511318206787 + }, + { + "auxiliary_loss_clip": 0.01084692, + "auxiliary_loss_mlp": 0.01041008, + "balance_loss_clip": 1.03235424, + "balance_loss_mlp": 1.02529597, + "epoch": 0.3615812415451676, + "flos": 16648079287680.0, + "grad_norm": 1.7217037846075756, + "language_loss": 0.91503191, + "learning_rate": 2.9533276512761228e-06, + "loss": 0.93628895, + "num_input_tokens_seen": 129141600, + "step": 6014, + "time_per_iteration": 2.5846805572509766 + }, + { + "auxiliary_loss_clip": 0.01082464, + "auxiliary_loss_mlp": 0.01043066, + "balance_loss_clip": 1.0304687, + "balance_loss_mlp": 1.0283612, + "epoch": 0.36164136479783554, + "flos": 21320093387520.0, + "grad_norm": 2.64700995987452, + "language_loss": 0.74012703, + "learning_rate": 2.95298526302391e-06, + "loss": 0.76138234, + "num_input_tokens_seen": 129160665, + "step": 6015, + "time_per_iteration": 2.6792151927948 + }, + { + "auxiliary_loss_clip": 0.01006398, + "auxiliary_loss_mlp": 0.01044237, + "balance_loss_clip": 1.0248332, + "balance_loss_mlp": 1.02729702, + "epoch": 0.3617014880505035, + "flos": 24169569356160.0, + "grad_norm": 1.7909123730905623, + "language_loss": 0.65187711, + "learning_rate": 2.9526428386344e-06, + "loss": 0.67238343, + "num_input_tokens_seen": 129179220, + "step": 6016, + "time_per_iteration": 3.060312509536743 + }, + { + "auxiliary_loss_clip": 0.0107724, + "auxiliary_loss_mlp": 0.0104572, + "balance_loss_clip": 1.03300047, + "balance_loss_mlp": 1.02934027, + "epoch": 0.3617616113031715, + "flos": 39014824101120.0, + "grad_norm": 1.603388967658685, + "language_loss": 0.71580434, + "learning_rate": 2.9523003781205785e-06, + "loss": 0.73703396, + "num_input_tokens_seen": 129200385, + "step": 6017, + "time_per_iteration": 2.9539451599121094 + }, + { + "auxiliary_loss_clip": 0.01075662, + "auxiliary_loss_mlp": 0.01039551, + "balance_loss_clip": 1.03014994, + "balance_loss_mlp": 1.02428615, + "epoch": 0.3618217345558395, + "flos": 12130840892160.0, + "grad_norm": 2.205640565412631, + "language_loss": 0.73634458, + "learning_rate": 2.9519578814954307e-06, + "loss": 0.75749671, + "num_input_tokens_seen": 129217395, + "step": 6018, + "time_per_iteration": 2.6504693031311035 + }, + { + "auxiliary_loss_clip": 0.01051698, + "auxiliary_loss_mlp": 0.01040534, + "balance_loss_clip": 1.03248715, + "balance_loss_mlp": 1.02570462, + "epoch": 0.36188185780850746, + "flos": 24935005203840.0, + "grad_norm": 1.8162848463891252, + "language_loss": 0.69170487, + "learning_rate": 2.9516153487719448e-06, + "loss": 0.71262717, + "num_input_tokens_seen": 129238940, + "step": 6019, + "time_per_iteration": 2.7667243480682373 + }, + { + "auxiliary_loss_clip": 0.01065046, + "auxiliary_loss_mlp": 0.01038915, + "balance_loss_clip": 1.03082573, + "balance_loss_mlp": 1.02290535, + "epoch": 0.3619419810611754, + "flos": 20958832350720.0, + "grad_norm": 1.5006110932700758, + "language_loss": 0.76457953, + "learning_rate": 2.95127277996311e-06, + "loss": 0.78561914, + "num_input_tokens_seen": 129258240, + "step": 6020, + "time_per_iteration": 2.650580883026123 + }, + { + "auxiliary_loss_clip": 0.01072147, + "auxiliary_loss_mlp": 0.01041716, + "balance_loss_clip": 1.03213716, + "balance_loss_mlp": 1.02571857, + "epoch": 0.3620021043138434, + "flos": 22528882805760.0, + "grad_norm": 2.3963155709767534, + "language_loss": 0.74051237, + "learning_rate": 2.9509301750819156e-06, + "loss": 0.76165104, + "num_input_tokens_seen": 129279040, + "step": 6021, + "time_per_iteration": 2.7536349296569824 + }, + { + "auxiliary_loss_clip": 0.01051707, + "auxiliary_loss_mlp": 0.01037831, + "balance_loss_clip": 1.03127515, + "balance_loss_mlp": 1.02363873, + "epoch": 0.36206222756651135, + "flos": 15596687266560.0, + "grad_norm": 1.8071270126172434, + "language_loss": 0.81249005, + "learning_rate": 2.9505875341413533e-06, + "loss": 0.83338547, + "num_input_tokens_seen": 129295415, + "step": 6022, + "time_per_iteration": 2.669023036956787 + }, + { + "auxiliary_loss_clip": 0.01073723, + "auxiliary_loss_mlp": 0.01037669, + "balance_loss_clip": 1.03323901, + "balance_loss_mlp": 1.02402592, + "epoch": 0.3621223508191793, + "flos": 23587170238080.0, + "grad_norm": 1.5911763895005004, + "language_loss": 0.81388849, + "learning_rate": 2.950244857154417e-06, + "loss": 0.83500242, + "num_input_tokens_seen": 129312620, + "step": 6023, + "time_per_iteration": 2.599637508392334 + }, + { + "auxiliary_loss_clip": 0.01064857, + "auxiliary_loss_mlp": 0.01036778, + "balance_loss_clip": 1.03075707, + "balance_loss_mlp": 1.02173376, + "epoch": 0.3621824740718473, + "flos": 22309899540480.0, + "grad_norm": 1.6571909781180896, + "language_loss": 0.79474956, + "learning_rate": 2.9499021441341e-06, + "loss": 0.81576586, + "num_input_tokens_seen": 129331825, + "step": 6024, + "time_per_iteration": 2.6011455059051514 + }, + { + "auxiliary_loss_clip": 0.01043511, + "auxiliary_loss_mlp": 0.01038218, + "balance_loss_clip": 1.02600563, + "balance_loss_mlp": 1.0232985, + "epoch": 0.36224259732451525, + "flos": 16763640318720.0, + "grad_norm": 1.8856877772133986, + "language_loss": 0.74726856, + "learning_rate": 2.9495593950933997e-06, + "loss": 0.76808584, + "num_input_tokens_seen": 129350400, + "step": 6025, + "time_per_iteration": 2.6567797660827637 + }, + { + "auxiliary_loss_clip": 0.01069327, + "auxiliary_loss_mlp": 0.00747988, + "balance_loss_clip": 1.02928519, + "balance_loss_mlp": 1.00020623, + "epoch": 0.3623027205771832, + "flos": 23149742411520.0, + "grad_norm": 1.5520218383672748, + "language_loss": 0.72278202, + "learning_rate": 2.9492166100453107e-06, + "loss": 0.74095523, + "num_input_tokens_seen": 129371155, + "step": 6026, + "time_per_iteration": 2.656792640686035 + }, + { + "auxiliary_loss_clip": 0.01077954, + "auxiliary_loss_mlp": 0.01049222, + "balance_loss_clip": 1.03184068, + "balance_loss_mlp": 1.0336411, + "epoch": 0.3623628438298512, + "flos": 28549162834560.0, + "grad_norm": 1.8946853012002511, + "language_loss": 0.78889704, + "learning_rate": 2.948873789002833e-06, + "loss": 0.81016874, + "num_input_tokens_seen": 129391230, + "step": 6027, + "time_per_iteration": 2.7262392044067383 + }, + { + "auxiliary_loss_clip": 0.01060451, + "auxiliary_loss_mlp": 0.01044599, + "balance_loss_clip": 1.030442, + "balance_loss_mlp": 1.02924466, + "epoch": 0.36242296708251914, + "flos": 25484941405440.0, + "grad_norm": 1.6665812445454893, + "language_loss": 0.67493284, + "learning_rate": 2.9485309319789667e-06, + "loss": 0.69598329, + "num_input_tokens_seen": 129410065, + "step": 6028, + "time_per_iteration": 2.722069501876831 + }, + { + "auxiliary_loss_clip": 0.01047215, + "auxiliary_loss_mlp": 0.01035529, + "balance_loss_clip": 1.0304842, + "balance_loss_mlp": 1.02194548, + "epoch": 0.3624830903351871, + "flos": 16290373697280.0, + "grad_norm": 5.905139657538027, + "language_loss": 0.85115939, + "learning_rate": 2.9481880389867117e-06, + "loss": 0.87198687, + "num_input_tokens_seen": 129428655, + "step": 6029, + "time_per_iteration": 2.7958312034606934 + }, + { + "auxiliary_loss_clip": 0.01049874, + "auxiliary_loss_mlp": 0.010386, + "balance_loss_clip": 1.02956641, + "balance_loss_mlp": 1.02459252, + "epoch": 0.36254321358785513, + "flos": 18296307694080.0, + "grad_norm": 1.7118290881096399, + "language_loss": 0.72754622, + "learning_rate": 2.9478451100390714e-06, + "loss": 0.74843097, + "num_input_tokens_seen": 129447845, + "step": 6030, + "time_per_iteration": 4.407292366027832 + }, + { + "auxiliary_loss_clip": 0.01063263, + "auxiliary_loss_mlp": 0.01043929, + "balance_loss_clip": 1.02896094, + "balance_loss_mlp": 1.02716792, + "epoch": 0.3626033368405231, + "flos": 14865294533760.0, + "grad_norm": 2.268015448848168, + "language_loss": 0.74672627, + "learning_rate": 2.94750214514905e-06, + "loss": 0.76779819, + "num_input_tokens_seen": 129463275, + "step": 6031, + "time_per_iteration": 2.7962498664855957 + }, + { + "auxiliary_loss_clip": 0.01036876, + "auxiliary_loss_mlp": 0.01038472, + "balance_loss_clip": 1.02520108, + "balance_loss_mlp": 1.02416134, + "epoch": 0.36266346009319106, + "flos": 22306595489280.0, + "grad_norm": 1.591494815780855, + "language_loss": 0.73149127, + "learning_rate": 2.9471591443296516e-06, + "loss": 0.75224471, + "num_input_tokens_seen": 129483205, + "step": 6032, + "time_per_iteration": 4.240494966506958 + }, + { + "auxiliary_loss_clip": 0.01034789, + "auxiliary_loss_mlp": 0.01036927, + "balance_loss_clip": 1.0264219, + "balance_loss_mlp": 1.02308095, + "epoch": 0.362723583345859, + "flos": 18222331633920.0, + "grad_norm": 3.3565292495722683, + "language_loss": 0.77461755, + "learning_rate": 2.946816107593884e-06, + "loss": 0.7953347, + "num_input_tokens_seen": 129499885, + "step": 6033, + "time_per_iteration": 2.7263834476470947 + }, + { + "auxiliary_loss_clip": 0.00978306, + "auxiliary_loss_mlp": 0.0101572, + "balance_loss_clip": 1.00679028, + "balance_loss_mlp": 1.01349115, + "epoch": 0.362783706598527, + "flos": 68499174458880.0, + "grad_norm": 0.7914464420617561, + "language_loss": 0.64772272, + "learning_rate": 2.9464730349547547e-06, + "loss": 0.66766298, + "num_input_tokens_seen": 129561885, + "step": 6034, + "time_per_iteration": 3.510500431060791 + }, + { + "auxiliary_loss_clip": 0.01062555, + "auxiliary_loss_mlp": 0.01036986, + "balance_loss_clip": 1.02751064, + "balance_loss_mlp": 1.02216232, + "epoch": 0.36284382985119495, + "flos": 26576589594240.0, + "grad_norm": 1.4224384648274604, + "language_loss": 0.89729595, + "learning_rate": 2.946129926425273e-06, + "loss": 0.91829133, + "num_input_tokens_seen": 129582325, + "step": 6035, + "time_per_iteration": 3.098928213119507 + }, + { + "auxiliary_loss_clip": 0.01063642, + "auxiliary_loss_mlp": 0.01035766, + "balance_loss_clip": 1.03038645, + "balance_loss_mlp": 1.02087736, + "epoch": 0.3629039531038629, + "flos": 20156767608960.0, + "grad_norm": 2.027408961690558, + "language_loss": 0.74114579, + "learning_rate": 2.9457867820184496e-06, + "loss": 0.76213986, + "num_input_tokens_seen": 129600350, + "step": 6036, + "time_per_iteration": 2.699125051498413 + }, + { + "auxiliary_loss_clip": 0.0106171, + "auxiliary_loss_mlp": 0.01029629, + "balance_loss_clip": 1.02866435, + "balance_loss_mlp": 1.01495409, + "epoch": 0.3629640763565309, + "flos": 18625716345600.0, + "grad_norm": 1.734641954376935, + "language_loss": 0.7606985, + "learning_rate": 2.945443601747297e-06, + "loss": 0.78161192, + "num_input_tokens_seen": 129618425, + "step": 6037, + "time_per_iteration": 2.601708173751831 + }, + { + "auxiliary_loss_clip": 0.01061388, + "auxiliary_loss_mlp": 0.01049594, + "balance_loss_clip": 1.02802706, + "balance_loss_mlp": 1.03288126, + "epoch": 0.36302419960919885, + "flos": 19571459489280.0, + "grad_norm": 1.5297144913194436, + "language_loss": 0.784621, + "learning_rate": 2.945100385624828e-06, + "loss": 0.80573082, + "num_input_tokens_seen": 129636750, + "step": 6038, + "time_per_iteration": 2.647500514984131 + }, + { + "auxiliary_loss_clip": 0.01008129, + "auxiliary_loss_mlp": 0.01017839, + "balance_loss_clip": 1.0075295, + "balance_loss_mlp": 1.01577687, + "epoch": 0.3630843228618668, + "flos": 63797606444160.0, + "grad_norm": 0.8429773793148918, + "language_loss": 0.63398349, + "learning_rate": 2.9447571336640573e-06, + "loss": 0.65424317, + "num_input_tokens_seen": 129699030, + "step": 6039, + "time_per_iteration": 3.2418532371520996 + }, + { + "auxiliary_loss_clip": 0.0105371, + "auxiliary_loss_mlp": 0.01044078, + "balance_loss_clip": 1.02834773, + "balance_loss_mlp": 1.02838969, + "epoch": 0.3631444461145348, + "flos": 21835160461440.0, + "grad_norm": 1.7841120920257831, + "language_loss": 0.71192724, + "learning_rate": 2.944413845878002e-06, + "loss": 0.73290509, + "num_input_tokens_seen": 129717135, + "step": 6040, + "time_per_iteration": 2.7692503929138184 + }, + { + "auxiliary_loss_clip": 0.01074849, + "auxiliary_loss_mlp": 0.01039304, + "balance_loss_clip": 1.031901, + "balance_loss_mlp": 1.02523756, + "epoch": 0.36320456936720275, + "flos": 21722041555200.0, + "grad_norm": 1.746021163603964, + "language_loss": 0.81167352, + "learning_rate": 2.9440705222796783e-06, + "loss": 0.83281499, + "num_input_tokens_seen": 129735940, + "step": 6041, + "time_per_iteration": 4.328850269317627 + }, + { + "auxiliary_loss_clip": 0.01059633, + "auxiliary_loss_mlp": 0.01031767, + "balance_loss_clip": 1.02755129, + "balance_loss_mlp": 1.01632905, + "epoch": 0.3632646926198707, + "flos": 17019072910080.0, + "grad_norm": 1.8188875570905274, + "language_loss": 0.83967346, + "learning_rate": 2.943727162882107e-06, + "loss": 0.86058748, + "num_input_tokens_seen": 129752790, + "step": 6042, + "time_per_iteration": 4.213420629501343 + }, + { + "auxiliary_loss_clip": 0.01056468, + "auxiliary_loss_mlp": 0.01046823, + "balance_loss_clip": 1.02985072, + "balance_loss_mlp": 1.03269029, + "epoch": 0.36332481587253873, + "flos": 23331163029120.0, + "grad_norm": 1.6117702271593868, + "language_loss": 0.78269887, + "learning_rate": 2.9433837676983064e-06, + "loss": 0.80373174, + "num_input_tokens_seen": 129773655, + "step": 6043, + "time_per_iteration": 2.580622911453247 + }, + { + "auxiliary_loss_clip": 0.01059528, + "auxiliary_loss_mlp": 0.0104088, + "balance_loss_clip": 1.03695512, + "balance_loss_mlp": 1.0256393, + "epoch": 0.3633849391252067, + "flos": 10743539857920.0, + "grad_norm": 1.80344097739143, + "language_loss": 0.65938509, + "learning_rate": 2.943040336741298e-06, + "loss": 0.68038917, + "num_input_tokens_seen": 129791605, + "step": 6044, + "time_per_iteration": 2.647965669631958 + }, + { + "auxiliary_loss_clip": 0.01057338, + "auxiliary_loss_mlp": 0.01034354, + "balance_loss_clip": 1.02969038, + "balance_loss_mlp": 1.02009022, + "epoch": 0.36344506237787466, + "flos": 25849147357440.0, + "grad_norm": 2.0646223943753985, + "language_loss": 0.81083345, + "learning_rate": 2.9426968700241066e-06, + "loss": 0.83175039, + "num_input_tokens_seen": 129811075, + "step": 6045, + "time_per_iteration": 2.611863851547241 + }, + { + "auxiliary_loss_clip": 0.01046717, + "auxiliary_loss_mlp": 0.01038333, + "balance_loss_clip": 1.02773154, + "balance_loss_mlp": 1.02414083, + "epoch": 0.3635051856305426, + "flos": 30154046503680.0, + "grad_norm": 3.4546489193909915, + "language_loss": 0.64560628, + "learning_rate": 2.942353367559755e-06, + "loss": 0.6664567, + "num_input_tokens_seen": 129833755, + "step": 6046, + "time_per_iteration": 2.7245428562164307 + }, + { + "auxiliary_loss_clip": 0.01048054, + "auxiliary_loss_mlp": 0.01038716, + "balance_loss_clip": 1.02875614, + "balance_loss_mlp": 1.02529275, + "epoch": 0.3635653088832106, + "flos": 22198396746240.0, + "grad_norm": 1.7298455353151192, + "language_loss": 0.77910638, + "learning_rate": 2.9420098293612692e-06, + "loss": 0.79997408, + "num_input_tokens_seen": 129854475, + "step": 6047, + "time_per_iteration": 2.6782379150390625 + }, + { + "auxiliary_loss_clip": 0.01080918, + "auxiliary_loss_mlp": 0.01044316, + "balance_loss_clip": 1.03194678, + "balance_loss_mlp": 1.02807963, + "epoch": 0.36362543213587856, + "flos": 24787053083520.0, + "grad_norm": 1.5618701075386483, + "language_loss": 0.79248935, + "learning_rate": 2.9416662554416767e-06, + "loss": 0.81374168, + "num_input_tokens_seen": 129873530, + "step": 6048, + "time_per_iteration": 2.7312631607055664 + }, + { + "auxiliary_loss_clip": 0.01004391, + "auxiliary_loss_mlp": 0.01002315, + "balance_loss_clip": 1.00429237, + "balance_loss_mlp": 1.00049138, + "epoch": 0.3636855553885465, + "flos": 62526369231360.0, + "grad_norm": 0.7552242628554717, + "language_loss": 0.52615035, + "learning_rate": 2.9413226458140054e-06, + "loss": 0.54621744, + "num_input_tokens_seen": 129940400, + "step": 6049, + "time_per_iteration": 3.2095179557800293 + }, + { + "auxiliary_loss_clip": 0.01047133, + "auxiliary_loss_mlp": 0.01037991, + "balance_loss_clip": 1.02992463, + "balance_loss_mlp": 1.02306604, + "epoch": 0.3637456786412145, + "flos": 24060652341120.0, + "grad_norm": 1.8189537383716838, + "language_loss": 0.86364448, + "learning_rate": 2.9409790004912845e-06, + "loss": 0.88449568, + "num_input_tokens_seen": 129958635, + "step": 6050, + "time_per_iteration": 2.7161669731140137 + }, + { + "auxiliary_loss_clip": 0.01071308, + "auxiliary_loss_mlp": 0.00747836, + "balance_loss_clip": 1.03157401, + "balance_loss_mlp": 1.00012672, + "epoch": 0.36380580189388245, + "flos": 16691495852160.0, + "grad_norm": 1.6310581410117178, + "language_loss": 0.78516942, + "learning_rate": 2.940635319486546e-06, + "loss": 0.80336088, + "num_input_tokens_seen": 129977685, + "step": 6051, + "time_per_iteration": 2.570223093032837 + }, + { + "auxiliary_loss_clip": 0.01070234, + "auxiliary_loss_mlp": 0.01036293, + "balance_loss_clip": 1.02871013, + "balance_loss_mlp": 1.02248859, + "epoch": 0.3638659251465504, + "flos": 25114091437440.0, + "grad_norm": 1.7451585059746324, + "language_loss": 0.82624459, + "learning_rate": 2.940291602812822e-06, + "loss": 0.84730983, + "num_input_tokens_seen": 129997530, + "step": 6052, + "time_per_iteration": 2.7084848880767822 + }, + { + "auxiliary_loss_clip": 0.01045884, + "auxiliary_loss_mlp": 0.01032706, + "balance_loss_clip": 1.0272944, + "balance_loss_mlp": 1.02007556, + "epoch": 0.3639260483992184, + "flos": 23003011353600.0, + "grad_norm": 3.712948640587416, + "language_loss": 0.72522289, + "learning_rate": 2.939947850483145e-06, + "loss": 0.74600875, + "num_input_tokens_seen": 130017955, + "step": 6053, + "time_per_iteration": 2.9256012439727783 + }, + { + "auxiliary_loss_clip": 0.00978088, + "auxiliary_loss_mlp": 0.01026619, + "balance_loss_clip": 1.01628876, + "balance_loss_mlp": 1.02483118, + "epoch": 0.36398617165188635, + "flos": 70716011160960.0, + "grad_norm": 0.8135694048556504, + "language_loss": 0.6121664, + "learning_rate": 2.9396040625105532e-06, + "loss": 0.63221347, + "num_input_tokens_seen": 130074275, + "step": 6054, + "time_per_iteration": 3.4076530933380127 + }, + { + "auxiliary_loss_clip": 0.01058592, + "auxiliary_loss_mlp": 0.01041578, + "balance_loss_clip": 1.0292182, + "balance_loss_mlp": 1.02649808, + "epoch": 0.3640462949045543, + "flos": 22235456603520.0, + "grad_norm": 1.9056179378653832, + "language_loss": 0.76502621, + "learning_rate": 2.9392602389080802e-06, + "loss": 0.78602791, + "num_input_tokens_seen": 130091375, + "step": 6055, + "time_per_iteration": 2.719726324081421 + }, + { + "auxiliary_loss_clip": 0.01082832, + "auxiliary_loss_mlp": 0.0103923, + "balance_loss_clip": 1.03073919, + "balance_loss_mlp": 1.02469885, + "epoch": 0.3641064181572223, + "flos": 21543529939200.0, + "grad_norm": 1.7097677080828009, + "language_loss": 0.75136852, + "learning_rate": 2.938916379688765e-06, + "loss": 0.77258909, + "num_input_tokens_seen": 130111595, + "step": 6056, + "time_per_iteration": 2.613837480545044 + }, + { + "auxiliary_loss_clip": 0.01063442, + "auxiliary_loss_mlp": 0.01037784, + "balance_loss_clip": 1.03190041, + "balance_loss_mlp": 1.02440906, + "epoch": 0.3641665414098903, + "flos": 22273306560000.0, + "grad_norm": 2.101903792913583, + "language_loss": 0.80620551, + "learning_rate": 2.9385724848656468e-06, + "loss": 0.82721782, + "num_input_tokens_seen": 130131440, + "step": 6057, + "time_per_iteration": 2.8224520683288574 + }, + { + "auxiliary_loss_clip": 0.0105001, + "auxiliary_loss_mlp": 0.01037094, + "balance_loss_clip": 1.0279814, + "balance_loss_mlp": 1.02313471, + "epoch": 0.36422666466255826, + "flos": 28329676778880.0, + "grad_norm": 1.8705769748907757, + "language_loss": 0.79737526, + "learning_rate": 2.9382285544517647e-06, + "loss": 0.81824625, + "num_input_tokens_seen": 130151375, + "step": 6058, + "time_per_iteration": 2.7274129390716553 + }, + { + "auxiliary_loss_clip": 0.01048246, + "auxiliary_loss_mlp": 0.00747689, + "balance_loss_clip": 1.02661967, + "balance_loss_mlp": 1.00004542, + "epoch": 0.36428678791522623, + "flos": 24170503109760.0, + "grad_norm": 1.7897840627247958, + "language_loss": 0.84775126, + "learning_rate": 2.9378845884601636e-06, + "loss": 0.86571062, + "num_input_tokens_seen": 130169960, + "step": 6059, + "time_per_iteration": 2.802773952484131 + }, + { + "auxiliary_loss_clip": 0.01040058, + "auxiliary_loss_mlp": 0.01036169, + "balance_loss_clip": 1.02718639, + "balance_loss_mlp": 1.02092266, + "epoch": 0.3643469111678942, + "flos": 22528451842560.0, + "grad_norm": 1.6000305304036864, + "language_loss": 0.87788647, + "learning_rate": 2.937540586903884e-06, + "loss": 0.8986488, + "num_input_tokens_seen": 130189800, + "step": 6060, + "time_per_iteration": 2.815985918045044 + }, + { + "auxiliary_loss_clip": 0.01072761, + "auxiliary_loss_mlp": 0.01038542, + "balance_loss_clip": 1.03090239, + "balance_loss_mlp": 1.02362919, + "epoch": 0.36440703442056216, + "flos": 19426595938560.0, + "grad_norm": 3.4084740038567074, + "language_loss": 0.66863382, + "learning_rate": 2.937196549795971e-06, + "loss": 0.68974686, + "num_input_tokens_seen": 130206370, + "step": 6061, + "time_per_iteration": 2.6486549377441406 + }, + { + "auxiliary_loss_clip": 0.01065231, + "auxiliary_loss_mlp": 0.01034868, + "balance_loss_clip": 1.03243005, + "balance_loss_mlp": 1.02042568, + "epoch": 0.3644671576732301, + "flos": 18040515966720.0, + "grad_norm": 2.2115664148379057, + "language_loss": 0.75768036, + "learning_rate": 2.9368524771494718e-06, + "loss": 0.7786814, + "num_input_tokens_seen": 130224445, + "step": 6062, + "time_per_iteration": 2.748258590698242 + }, + { + "auxiliary_loss_clip": 0.01062457, + "auxiliary_loss_mlp": 0.01032995, + "balance_loss_clip": 1.03280604, + "balance_loss_mlp": 1.01690125, + "epoch": 0.3645272809258981, + "flos": 21542811667200.0, + "grad_norm": 1.4856955728273544, + "language_loss": 0.72461194, + "learning_rate": 2.936508368977432e-06, + "loss": 0.74556643, + "num_input_tokens_seen": 130245380, + "step": 6063, + "time_per_iteration": 2.6689534187316895 + }, + { + "auxiliary_loss_clip": 0.01062823, + "auxiliary_loss_mlp": 0.01036077, + "balance_loss_clip": 1.02749944, + "balance_loss_mlp": 1.02257085, + "epoch": 0.36458740417856605, + "flos": 22746860490240.0, + "grad_norm": 1.8287864376260674, + "language_loss": 0.67595768, + "learning_rate": 2.936164225292901e-06, + "loss": 0.69694662, + "num_input_tokens_seen": 130265575, + "step": 6064, + "time_per_iteration": 2.6637771129608154 + }, + { + "auxiliary_loss_clip": 0.01059878, + "auxiliary_loss_mlp": 0.01043428, + "balance_loss_clip": 1.02865195, + "balance_loss_mlp": 1.0294503, + "epoch": 0.364647527431234, + "flos": 26140670138880.0, + "grad_norm": 1.8561080896777034, + "language_loss": 0.74583203, + "learning_rate": 2.9358200461089297e-06, + "loss": 0.76686507, + "num_input_tokens_seen": 130286195, + "step": 6065, + "time_per_iteration": 2.670266628265381 + }, + { + "auxiliary_loss_clip": 0.01063489, + "auxiliary_loss_mlp": 0.01040931, + "balance_loss_clip": 1.03009152, + "balance_loss_mlp": 1.02570772, + "epoch": 0.364707650683902, + "flos": 31029907737600.0, + "grad_norm": 1.7693683145224601, + "language_loss": 0.74964684, + "learning_rate": 2.9354758314385676e-06, + "loss": 0.77069104, + "num_input_tokens_seen": 130306095, + "step": 6066, + "time_per_iteration": 2.7208504676818848 + }, + { + "auxiliary_loss_clip": 0.0106493, + "auxiliary_loss_mlp": 0.01033919, + "balance_loss_clip": 1.02998614, + "balance_loss_mlp": 1.02116966, + "epoch": 0.36476777393656995, + "flos": 19572896033280.0, + "grad_norm": 2.652697526083542, + "language_loss": 0.76696384, + "learning_rate": 2.9351315812948684e-06, + "loss": 0.7879523, + "num_input_tokens_seen": 130324685, + "step": 6067, + "time_per_iteration": 2.533595561981201 + }, + { + "auxiliary_loss_clip": 0.01080945, + "auxiliary_loss_mlp": 0.01033413, + "balance_loss_clip": 1.03246069, + "balance_loss_mlp": 1.0209794, + "epoch": 0.3648278971892379, + "flos": 17748849530880.0, + "grad_norm": 1.9057197170790794, + "language_loss": 0.7085045, + "learning_rate": 2.934787295690886e-06, + "loss": 0.72964811, + "num_input_tokens_seen": 130343855, + "step": 6068, + "time_per_iteration": 2.5470032691955566 + }, + { + "auxiliary_loss_clip": 0.01063558, + "auxiliary_loss_mlp": 0.01040448, + "balance_loss_clip": 1.02658856, + "balance_loss_mlp": 1.02636898, + "epoch": 0.3648880204419059, + "flos": 17931167988480.0, + "grad_norm": 1.9135123716407532, + "language_loss": 0.73800921, + "learning_rate": 2.9344429746396755e-06, + "loss": 0.7590493, + "num_input_tokens_seen": 130362320, + "step": 6069, + "time_per_iteration": 2.51412034034729 + }, + { + "auxiliary_loss_clip": 0.01064589, + "auxiliary_loss_mlp": 0.01035547, + "balance_loss_clip": 1.03166819, + "balance_loss_mlp": 1.02177882, + "epoch": 0.3649481436945739, + "flos": 22638266697600.0, + "grad_norm": 1.7472290131053139, + "language_loss": 0.66063654, + "learning_rate": 2.9340986181542945e-06, + "loss": 0.68163794, + "num_input_tokens_seen": 130383165, + "step": 6070, + "time_per_iteration": 2.715418815612793 + }, + { + "auxiliary_loss_clip": 0.01068914, + "auxiliary_loss_mlp": 0.0102977, + "balance_loss_clip": 1.02981353, + "balance_loss_mlp": 1.01697898, + "epoch": 0.36500826694724187, + "flos": 21579656042880.0, + "grad_norm": 1.8352536156176866, + "language_loss": 0.74036527, + "learning_rate": 2.9337542262477994e-06, + "loss": 0.76135206, + "num_input_tokens_seen": 130402425, + "step": 6071, + "time_per_iteration": 2.595186471939087 + }, + { + "auxiliary_loss_clip": 0.01067409, + "auxiliary_loss_mlp": 0.01034115, + "balance_loss_clip": 1.02810764, + "balance_loss_mlp": 1.02020955, + "epoch": 0.36506839019990983, + "flos": 13772533023360.0, + "grad_norm": 1.790784982025239, + "language_loss": 0.88245445, + "learning_rate": 2.9334097989332506e-06, + "loss": 0.9034698, + "num_input_tokens_seen": 130419440, + "step": 6072, + "time_per_iteration": 2.6059677600860596 + }, + { + "auxiliary_loss_clip": 0.01070859, + "auxiliary_loss_mlp": 0.0103447, + "balance_loss_clip": 1.03140092, + "balance_loss_mlp": 1.02118444, + "epoch": 0.3651285134525778, + "flos": 17274972378240.0, + "grad_norm": 6.496115662695566, + "language_loss": 0.72313046, + "learning_rate": 2.9330653362237094e-06, + "loss": 0.74418378, + "num_input_tokens_seen": 130438495, + "step": 6073, + "time_per_iteration": 2.563995599746704 + }, + { + "auxiliary_loss_clip": 0.0102385, + "auxiliary_loss_mlp": 0.01039512, + "balance_loss_clip": 1.03055048, + "balance_loss_mlp": 1.02467084, + "epoch": 0.36518863670524576, + "flos": 21907987286400.0, + "grad_norm": 2.3207298029263113, + "language_loss": 0.66695267, + "learning_rate": 2.932720838132236e-06, + "loss": 0.68758631, + "num_input_tokens_seen": 130455575, + "step": 6074, + "time_per_iteration": 2.8897972106933594 + }, + { + "auxiliary_loss_clip": 0.01051028, + "auxiliary_loss_mlp": 0.01034592, + "balance_loss_clip": 1.0293082, + "balance_loss_mlp": 1.02089524, + "epoch": 0.3652487599579137, + "flos": 27122180250240.0, + "grad_norm": 1.4461542664045235, + "language_loss": 0.72882819, + "learning_rate": 2.9323763046718954e-06, + "loss": 0.74968445, + "num_input_tokens_seen": 130476385, + "step": 6075, + "time_per_iteration": 2.907045602798462 + }, + { + "auxiliary_loss_clip": 0.01050954, + "auxiliary_loss_mlp": 0.01041143, + "balance_loss_clip": 1.03029966, + "balance_loss_mlp": 1.02659988, + "epoch": 0.3653088832105817, + "flos": 19755573626880.0, + "grad_norm": 2.179477248201944, + "language_loss": 0.89176011, + "learning_rate": 2.9320317358557524e-06, + "loss": 0.9126811, + "num_input_tokens_seen": 130493630, + "step": 6076, + "time_per_iteration": 2.75842547416687 + }, + { + "auxiliary_loss_clip": 0.01069041, + "auxiliary_loss_mlp": 0.01040762, + "balance_loss_clip": 1.03012323, + "balance_loss_mlp": 1.02682662, + "epoch": 0.36536900646324966, + "flos": 13115008609920.0, + "grad_norm": 2.2046384852809147, + "language_loss": 0.69761348, + "learning_rate": 2.931687131696872e-06, + "loss": 0.7187115, + "num_input_tokens_seen": 130510735, + "step": 6077, + "time_per_iteration": 4.245877265930176 + }, + { + "auxiliary_loss_clip": 0.01017286, + "auxiliary_loss_mlp": 0.01007513, + "balance_loss_clip": 1.00681353, + "balance_loss_mlp": 1.00564182, + "epoch": 0.3654291297159176, + "flos": 71100472383360.0, + "grad_norm": 0.7504305571869576, + "language_loss": 0.61758542, + "learning_rate": 2.9313424922083224e-06, + "loss": 0.63783348, + "num_input_tokens_seen": 130577050, + "step": 6078, + "time_per_iteration": 3.2755963802337646 + }, + { + "auxiliary_loss_clip": 0.01048261, + "auxiliary_loss_mlp": 0.01048305, + "balance_loss_clip": 1.02462077, + "balance_loss_mlp": 1.0332253, + "epoch": 0.3654892529685856, + "flos": 23617478338560.0, + "grad_norm": 1.7746181877643121, + "language_loss": 0.77931631, + "learning_rate": 2.930997817403173e-06, + "loss": 0.80028194, + "num_input_tokens_seen": 130593780, + "step": 6079, + "time_per_iteration": 4.384251832962036 + }, + { + "auxiliary_loss_clip": 0.01072384, + "auxiliary_loss_mlp": 0.01037232, + "balance_loss_clip": 1.03050399, + "balance_loss_mlp": 1.02302873, + "epoch": 0.36554937622125355, + "flos": 43470799850880.0, + "grad_norm": 2.161786503820887, + "language_loss": 0.62624043, + "learning_rate": 2.9306531072944913e-06, + "loss": 0.6473366, + "num_input_tokens_seen": 130615510, + "step": 6080, + "time_per_iteration": 2.892975091934204 + }, + { + "auxiliary_loss_clip": 0.01054029, + "auxiliary_loss_mlp": 0.01041745, + "balance_loss_clip": 1.0307076, + "balance_loss_mlp": 1.02637935, + "epoch": 0.3656094994739215, + "flos": 23294641875840.0, + "grad_norm": 2.6228764541206435, + "language_loss": 0.6777308, + "learning_rate": 2.930308361895352e-06, + "loss": 0.69868863, + "num_input_tokens_seen": 130635410, + "step": 6081, + "time_per_iteration": 2.6963133811950684 + }, + { + "auxiliary_loss_clip": 0.01063074, + "auxiliary_loss_mlp": 0.0074781, + "balance_loss_clip": 1.03062356, + "balance_loss_mlp": 1.0001781, + "epoch": 0.3656696227265895, + "flos": 24571984400640.0, + "grad_norm": 1.9201989642424397, + "language_loss": 0.75051355, + "learning_rate": 2.9299635812188257e-06, + "loss": 0.7686224, + "num_input_tokens_seen": 130657725, + "step": 6082, + "time_per_iteration": 2.880406379699707 + }, + { + "auxiliary_loss_clip": 0.01021477, + "auxiliary_loss_mlp": 0.00747623, + "balance_loss_clip": 1.02791989, + "balance_loss_mlp": 1.00013399, + "epoch": 0.3657297459792575, + "flos": 27928375056000.0, + "grad_norm": 2.1848620468893403, + "language_loss": 0.82768023, + "learning_rate": 2.929618765277987e-06, + "loss": 0.84537125, + "num_input_tokens_seen": 130678360, + "step": 6083, + "time_per_iteration": 2.7804641723632812 + }, + { + "auxiliary_loss_clip": 0.00995856, + "auxiliary_loss_mlp": 0.01014146, + "balance_loss_clip": 1.00527024, + "balance_loss_mlp": 1.01262569, + "epoch": 0.36578986923192547, + "flos": 67392622126080.0, + "grad_norm": 0.814610117294888, + "language_loss": 0.593009, + "learning_rate": 2.9292739140859125e-06, + "loss": 0.61310899, + "num_input_tokens_seen": 130742110, + "step": 6084, + "time_per_iteration": 3.2945961952209473 + }, + { + "auxiliary_loss_clip": 0.01038182, + "auxiliary_loss_mlp": 0.01041085, + "balance_loss_clip": 1.02807879, + "balance_loss_mlp": 1.0266428, + "epoch": 0.36584999248459343, + "flos": 20227511445120.0, + "grad_norm": 1.7947498134398026, + "language_loss": 0.73247564, + "learning_rate": 2.9289290276556767e-06, + "loss": 0.75326824, + "num_input_tokens_seen": 130759870, + "step": 6085, + "time_per_iteration": 2.6361186504364014 + }, + { + "auxiliary_loss_clip": 0.0105324, + "auxiliary_loss_mlp": 0.01039958, + "balance_loss_clip": 1.03236902, + "balance_loss_mlp": 1.02707136, + "epoch": 0.3659101157372614, + "flos": 19062461813760.0, + "grad_norm": 1.7966885765052916, + "language_loss": 0.77884972, + "learning_rate": 2.9285841060003604e-06, + "loss": 0.79978168, + "num_input_tokens_seen": 130778510, + "step": 6086, + "time_per_iteration": 2.6478121280670166 + }, + { + "auxiliary_loss_clip": 0.01057465, + "auxiliary_loss_mlp": 0.01036087, + "balance_loss_clip": 1.02532756, + "balance_loss_mlp": 1.02255726, + "epoch": 0.36597023898992936, + "flos": 30810708990720.0, + "grad_norm": 2.468824097528126, + "language_loss": 0.7707088, + "learning_rate": 2.9282391491330416e-06, + "loss": 0.79164433, + "num_input_tokens_seen": 130798535, + "step": 6087, + "time_per_iteration": 2.8261361122131348 + }, + { + "auxiliary_loss_clip": 0.01031133, + "auxiliary_loss_mlp": 0.01039173, + "balance_loss_clip": 1.02803731, + "balance_loss_mlp": 1.0244565, + "epoch": 0.36603036224259733, + "flos": 20521799573760.0, + "grad_norm": 2.0264487423215685, + "language_loss": 0.70749068, + "learning_rate": 2.9278941570668002e-06, + "loss": 0.72819376, + "num_input_tokens_seen": 130816655, + "step": 6088, + "time_per_iteration": 4.258260250091553 + }, + { + "auxiliary_loss_clip": 0.01076785, + "auxiliary_loss_mlp": 0.01039361, + "balance_loss_clip": 1.03103471, + "balance_loss_mlp": 1.02396524, + "epoch": 0.3660904854952653, + "flos": 38329397798400.0, + "grad_norm": 1.4442572166586873, + "language_loss": 0.79473019, + "learning_rate": 2.92754912981472e-06, + "loss": 0.81589162, + "num_input_tokens_seen": 130841225, + "step": 6089, + "time_per_iteration": 4.440003156661987 + }, + { + "auxiliary_loss_clip": 0.01046679, + "auxiliary_loss_mlp": 0.01036369, + "balance_loss_clip": 1.02769375, + "balance_loss_mlp": 1.02321482, + "epoch": 0.36615060874793326, + "flos": 21835555511040.0, + "grad_norm": 2.217299759525964, + "language_loss": 0.71297693, + "learning_rate": 2.927204067389884e-06, + "loss": 0.73380738, + "num_input_tokens_seen": 130861050, + "step": 6090, + "time_per_iteration": 2.65632700920105 + }, + { + "auxiliary_loss_clip": 0.01058115, + "auxiliary_loss_mlp": 0.01053018, + "balance_loss_clip": 1.03203464, + "balance_loss_mlp": 1.03949356, + "epoch": 0.3662107320006012, + "flos": 16581537342720.0, + "grad_norm": 3.0402348338232272, + "language_loss": 0.74307269, + "learning_rate": 2.9268589698053763e-06, + "loss": 0.764184, + "num_input_tokens_seen": 130879775, + "step": 6091, + "time_per_iteration": 2.6552836894989014 + }, + { + "auxiliary_loss_clip": 0.01019276, + "auxiliary_loss_mlp": 0.01036932, + "balance_loss_clip": 1.03034687, + "balance_loss_mlp": 1.02307379, + "epoch": 0.3662708552532692, + "flos": 20958365473920.0, + "grad_norm": 1.9104720025672854, + "language_loss": 0.73216617, + "learning_rate": 2.926513837074284e-06, + "loss": 0.75272822, + "num_input_tokens_seen": 130898070, + "step": 6092, + "time_per_iteration": 2.7572262287139893 + }, + { + "auxiliary_loss_clip": 0.01067751, + "auxiliary_loss_mlp": 0.01045489, + "balance_loss_clip": 1.0282445, + "balance_loss_mlp": 1.03107655, + "epoch": 0.36633097850593715, + "flos": 21902707987200.0, + "grad_norm": 2.326803634358305, + "language_loss": 0.78087413, + "learning_rate": 2.9261686692096942e-06, + "loss": 0.80200648, + "num_input_tokens_seen": 130915250, + "step": 6093, + "time_per_iteration": 2.664398431777954 + }, + { + "auxiliary_loss_clip": 0.01069503, + "auxiliary_loss_mlp": 0.01038051, + "balance_loss_clip": 1.02849305, + "balance_loss_mlp": 1.02508676, + "epoch": 0.3663911017586051, + "flos": 32854133808000.0, + "grad_norm": 1.6650923278274334, + "language_loss": 0.74495077, + "learning_rate": 2.925823466224696e-06, + "loss": 0.76602626, + "num_input_tokens_seen": 130936995, + "step": 6094, + "time_per_iteration": 2.7038300037384033 + }, + { + "auxiliary_loss_clip": 0.01085161, + "auxiliary_loss_mlp": 0.01052929, + "balance_loss_clip": 1.03327727, + "balance_loss_mlp": 1.0390172, + "epoch": 0.3664512250112731, + "flos": 27271748482560.0, + "grad_norm": 1.6635318455168193, + "language_loss": 0.79275757, + "learning_rate": 2.9254782281323785e-06, + "loss": 0.81413847, + "num_input_tokens_seen": 130957970, + "step": 6095, + "time_per_iteration": 2.6339731216430664 + }, + { + "auxiliary_loss_clip": 0.01058274, + "auxiliary_loss_mlp": 0.00747863, + "balance_loss_clip": 1.02961731, + "balance_loss_mlp": 1.000211, + "epoch": 0.3665113482639411, + "flos": 17784436930560.0, + "grad_norm": 2.1518136409675104, + "language_loss": 0.73773289, + "learning_rate": 2.925132954945834e-06, + "loss": 0.75579417, + "num_input_tokens_seen": 130974915, + "step": 6096, + "time_per_iteration": 2.6647584438323975 + }, + { + "auxiliary_loss_clip": 0.01043287, + "auxiliary_loss_mlp": 0.01037299, + "balance_loss_clip": 1.02534449, + "balance_loss_mlp": 1.02396512, + "epoch": 0.36657147151660907, + "flos": 27854614477440.0, + "grad_norm": 1.8934794301595403, + "language_loss": 0.67680931, + "learning_rate": 2.924787646678155e-06, + "loss": 0.69761515, + "num_input_tokens_seen": 130995745, + "step": 6097, + "time_per_iteration": 2.8070883750915527 + }, + { + "auxiliary_loss_clip": 0.01028386, + "auxiliary_loss_mlp": 0.010388, + "balance_loss_clip": 1.02808547, + "balance_loss_mlp": 1.02529955, + "epoch": 0.36663159476927704, + "flos": 25374013228800.0, + "grad_norm": 1.4540374336744897, + "language_loss": 0.77735144, + "learning_rate": 2.9244423033424365e-06, + "loss": 0.79802322, + "num_input_tokens_seen": 131015545, + "step": 6098, + "time_per_iteration": 2.817105531692505 + }, + { + "auxiliary_loss_clip": 0.01068026, + "auxiliary_loss_mlp": 0.01043613, + "balance_loss_clip": 1.02997935, + "balance_loss_mlp": 1.02977252, + "epoch": 0.366691718021945, + "flos": 21357225072000.0, + "grad_norm": 1.6889385379524295, + "language_loss": 0.73256803, + "learning_rate": 2.9240969249517723e-06, + "loss": 0.75368446, + "num_input_tokens_seen": 131033990, + "step": 6099, + "time_per_iteration": 2.7512941360473633 + }, + { + "auxiliary_loss_clip": 0.01057731, + "auxiliary_loss_mlp": 0.01044928, + "balance_loss_clip": 1.02990198, + "balance_loss_mlp": 1.03183317, + "epoch": 0.36675184127461297, + "flos": 16800376953600.0, + "grad_norm": 1.7907175678379825, + "language_loss": 0.8463974, + "learning_rate": 2.9237515115192602e-06, + "loss": 0.86742395, + "num_input_tokens_seen": 131050710, + "step": 6100, + "time_per_iteration": 2.7982773780822754 + }, + { + "auxiliary_loss_clip": 0.01047477, + "auxiliary_loss_mlp": 0.01032653, + "balance_loss_clip": 1.02716613, + "balance_loss_mlp": 1.01834762, + "epoch": 0.36681196452728093, + "flos": 21906514828800.0, + "grad_norm": 1.6690309275435438, + "language_loss": 0.70839345, + "learning_rate": 2.9234060630579992e-06, + "loss": 0.72919476, + "num_input_tokens_seen": 131071435, + "step": 6101, + "time_per_iteration": 2.741945505142212 + }, + { + "auxiliary_loss_clip": 0.01061776, + "auxiliary_loss_mlp": 0.0105075, + "balance_loss_clip": 1.03231406, + "balance_loss_mlp": 1.03533602, + "epoch": 0.3668720877799489, + "flos": 17712436118400.0, + "grad_norm": 2.1807870681176005, + "language_loss": 0.76293343, + "learning_rate": 2.9230605795810865e-06, + "loss": 0.78405869, + "num_input_tokens_seen": 131088775, + "step": 6102, + "time_per_iteration": 2.73592209815979 + }, + { + "auxiliary_loss_clip": 0.01074148, + "auxiliary_loss_mlp": 0.01032668, + "balance_loss_clip": 1.03039908, + "balance_loss_mlp": 1.01693845, + "epoch": 0.36693221103261686, + "flos": 47045455499520.0, + "grad_norm": 1.5416145216057962, + "language_loss": 0.70357144, + "learning_rate": 2.922715061101625e-06, + "loss": 0.72463953, + "num_input_tokens_seen": 131112800, + "step": 6103, + "time_per_iteration": 2.7740256786346436 + }, + { + "auxiliary_loss_clip": 0.01043332, + "auxiliary_loss_mlp": 0.01036493, + "balance_loss_clip": 1.03074241, + "balance_loss_mlp": 1.02220631, + "epoch": 0.3669923342852848, + "flos": 15960929132160.0, + "grad_norm": 1.8368277514776206, + "language_loss": 0.71484733, + "learning_rate": 2.922369507632716e-06, + "loss": 0.73564553, + "num_input_tokens_seen": 131131150, + "step": 6104, + "time_per_iteration": 2.678431272506714 + }, + { + "auxiliary_loss_clip": 0.0107027, + "auxiliary_loss_mlp": 0.0103428, + "balance_loss_clip": 1.02926493, + "balance_loss_mlp": 1.01995683, + "epoch": 0.3670524575379528, + "flos": 19974485064960.0, + "grad_norm": 2.1427571218568127, + "language_loss": 0.81737149, + "learning_rate": 2.9220239191874617e-06, + "loss": 0.83841699, + "num_input_tokens_seen": 131150365, + "step": 6105, + "time_per_iteration": 2.6447415351867676 + }, + { + "auxiliary_loss_clip": 0.01085311, + "auxiliary_loss_mlp": 0.01039251, + "balance_loss_clip": 1.03140616, + "balance_loss_mlp": 1.02395618, + "epoch": 0.36711258079062076, + "flos": 25702955003520.0, + "grad_norm": 1.673409771218762, + "language_loss": 0.80844319, + "learning_rate": 2.9216782957789692e-06, + "loss": 0.82968879, + "num_input_tokens_seen": 131169310, + "step": 6106, + "time_per_iteration": 2.5771591663360596 + }, + { + "auxiliary_loss_clip": 0.00988075, + "auxiliary_loss_mlp": 0.00746727, + "balance_loss_clip": 1.00724673, + "balance_loss_mlp": 1.00017095, + "epoch": 0.3671727040432887, + "flos": 60772743342720.0, + "grad_norm": 0.7014420868630306, + "language_loss": 0.59235793, + "learning_rate": 2.9213326374203426e-06, + "loss": 0.60970592, + "num_input_tokens_seen": 131232900, + "step": 6107, + "time_per_iteration": 3.2736754417419434 + }, + { + "auxiliary_loss_clip": 0.01059502, + "auxiliary_loss_mlp": 0.01029792, + "balance_loss_clip": 1.02986217, + "balance_loss_mlp": 1.01627374, + "epoch": 0.3672328272959567, + "flos": 18661303745280.0, + "grad_norm": 1.7012320867627606, + "language_loss": 0.74678278, + "learning_rate": 2.92098694412469e-06, + "loss": 0.7676757, + "num_input_tokens_seen": 131250920, + "step": 6108, + "time_per_iteration": 2.6228384971618652 + }, + { + "auxiliary_loss_clip": 0.01071362, + "auxiliary_loss_mlp": 0.0103586, + "balance_loss_clip": 1.02933598, + "balance_loss_mlp": 1.02190685, + "epoch": 0.3672929505486247, + "flos": 15049049535360.0, + "grad_norm": 1.9253784180234577, + "language_loss": 0.73085737, + "learning_rate": 2.9206412159051213e-06, + "loss": 0.75192952, + "num_input_tokens_seen": 131267910, + "step": 6109, + "time_per_iteration": 2.65071177482605 + }, + { + "auxiliary_loss_clip": 0.01021875, + "auxiliary_loss_mlp": 0.01040118, + "balance_loss_clip": 1.02842736, + "balance_loss_mlp": 1.0252583, + "epoch": 0.3673530738012927, + "flos": 20589347099520.0, + "grad_norm": 1.503705246340114, + "language_loss": 0.52782249, + "learning_rate": 2.920295452774744e-06, + "loss": 0.54844236, + "num_input_tokens_seen": 131287150, + "step": 6110, + "time_per_iteration": 2.7537620067596436 + }, + { + "auxiliary_loss_clip": 0.01067112, + "auxiliary_loss_mlp": 0.0103621, + "balance_loss_clip": 1.02863288, + "balance_loss_mlp": 1.02186298, + "epoch": 0.36741319705396064, + "flos": 21689830033920.0, + "grad_norm": 1.4650926661803518, + "language_loss": 0.80523252, + "learning_rate": 2.919949654746672e-06, + "loss": 0.82626569, + "num_input_tokens_seen": 131308225, + "step": 6111, + "time_per_iteration": 2.676170587539673 + }, + { + "auxiliary_loss_clip": 0.01032318, + "auxiliary_loss_mlp": 0.01038049, + "balance_loss_clip": 1.02757502, + "balance_loss_mlp": 1.02449465, + "epoch": 0.3674733203066286, + "flos": 29862200499840.0, + "grad_norm": 2.278402783668879, + "language_loss": 0.72817272, + "learning_rate": 2.9196038218340163e-06, + "loss": 0.74887633, + "num_input_tokens_seen": 131332115, + "step": 6112, + "time_per_iteration": 2.7560067176818848 + }, + { + "auxiliary_loss_clip": 0.01071274, + "auxiliary_loss_mlp": 0.01046264, + "balance_loss_clip": 1.0299747, + "balance_loss_mlp": 1.0325129, + "epoch": 0.36753344355929657, + "flos": 18257021193600.0, + "grad_norm": 1.8701855372247962, + "language_loss": 0.85418969, + "learning_rate": 2.919257954049892e-06, + "loss": 0.87536508, + "num_input_tokens_seen": 131351885, + "step": 6113, + "time_per_iteration": 2.5576677322387695 + }, + { + "auxiliary_loss_clip": 0.01070173, + "auxiliary_loss_mlp": 0.01034989, + "balance_loss_clip": 1.02864242, + "balance_loss_mlp": 1.02005219, + "epoch": 0.36759356681196453, + "flos": 25301150490240.0, + "grad_norm": 1.7018639662560104, + "language_loss": 0.7863878, + "learning_rate": 2.918912051407413e-06, + "loss": 0.80743945, + "num_input_tokens_seen": 131370245, + "step": 6114, + "time_per_iteration": 2.702446460723877 + }, + { + "auxiliary_loss_clip": 0.01075842, + "auxiliary_loss_mlp": 0.01042252, + "balance_loss_clip": 1.03163362, + "balance_loss_mlp": 1.02587223, + "epoch": 0.3676536900646325, + "flos": 21032952065280.0, + "grad_norm": 2.1394241727170895, + "language_loss": 0.67297447, + "learning_rate": 2.918566113919698e-06, + "loss": 0.69415545, + "num_input_tokens_seen": 131388115, + "step": 6115, + "time_per_iteration": 2.734734058380127 + }, + { + "auxiliary_loss_clip": 0.01049956, + "auxiliary_loss_mlp": 0.01034767, + "balance_loss_clip": 1.02641439, + "balance_loss_mlp": 1.02168953, + "epoch": 0.36771381331730046, + "flos": 16288506190080.0, + "grad_norm": 2.31453584779807, + "language_loss": 0.76260042, + "learning_rate": 2.9182201415998636e-06, + "loss": 0.78344762, + "num_input_tokens_seen": 131404595, + "step": 6116, + "time_per_iteration": 2.729323148727417 + }, + { + "auxiliary_loss_clip": 0.01031914, + "auxiliary_loss_mlp": 0.01035865, + "balance_loss_clip": 1.02586508, + "balance_loss_mlp": 1.02307415, + "epoch": 0.36777393656996843, + "flos": 22309971367680.0, + "grad_norm": 1.7705640915560406, + "language_loss": 0.62856781, + "learning_rate": 2.9178741344610286e-06, + "loss": 0.64924562, + "num_input_tokens_seen": 131423760, + "step": 6117, + "time_per_iteration": 2.9046802520751953 + }, + { + "auxiliary_loss_clip": 0.01046876, + "auxiliary_loss_mlp": 0.01033427, + "balance_loss_clip": 1.02663445, + "balance_loss_mlp": 1.0190618, + "epoch": 0.3678340598226364, + "flos": 26834069260800.0, + "grad_norm": 2.5150431209597914, + "language_loss": 0.73534322, + "learning_rate": 2.9175280925163156e-06, + "loss": 0.75614619, + "num_input_tokens_seen": 131444955, + "step": 6118, + "time_per_iteration": 2.8248465061187744 + }, + { + "auxiliary_loss_clip": 0.01075365, + "auxiliary_loss_mlp": 0.01038354, + "balance_loss_clip": 1.03230357, + "balance_loss_mlp": 1.02320814, + "epoch": 0.36789418307530436, + "flos": 21761723105280.0, + "grad_norm": 2.0111268471103614, + "language_loss": 0.72659588, + "learning_rate": 2.9171820157788445e-06, + "loss": 0.74773306, + "num_input_tokens_seen": 131465720, + "step": 6119, + "time_per_iteration": 2.616112470626831 + }, + { + "auxiliary_loss_clip": 0.0105933, + "auxiliary_loss_mlp": 0.01034265, + "balance_loss_clip": 1.03126729, + "balance_loss_mlp": 1.01976323, + "epoch": 0.3679543063279723, + "flos": 15924192497280.0, + "grad_norm": 1.9983140235222658, + "language_loss": 0.80438137, + "learning_rate": 2.9168359042617404e-06, + "loss": 0.82531738, + "num_input_tokens_seen": 131483080, + "step": 6120, + "time_per_iteration": 2.728656768798828 + }, + { + "auxiliary_loss_clip": 0.01051216, + "auxiliary_loss_mlp": 0.01034418, + "balance_loss_clip": 1.03222144, + "balance_loss_mlp": 1.01986217, + "epoch": 0.3680144295806403, + "flos": 24275541456000.0, + "grad_norm": 1.9436268039524984, + "language_loss": 0.64900827, + "learning_rate": 2.916489757978126e-06, + "loss": 0.66986465, + "num_input_tokens_seen": 131502545, + "step": 6121, + "time_per_iteration": 2.801304578781128 + }, + { + "auxiliary_loss_clip": 0.01077117, + "auxiliary_loss_mlp": 0.01043411, + "balance_loss_clip": 1.03508592, + "balance_loss_mlp": 1.02873671, + "epoch": 0.36807455283330826, + "flos": 26104148985600.0, + "grad_norm": 2.274993493696634, + "language_loss": 0.70952904, + "learning_rate": 2.9161435769411286e-06, + "loss": 0.73073435, + "num_input_tokens_seen": 131522155, + "step": 6122, + "time_per_iteration": 2.805509090423584 + }, + { + "auxiliary_loss_clip": 0.0105029, + "auxiliary_loss_mlp": 0.01037632, + "balance_loss_clip": 1.02939498, + "balance_loss_mlp": 1.02357173, + "epoch": 0.3681346760859763, + "flos": 24644990793600.0, + "grad_norm": 1.7878193910935267, + "language_loss": 0.69168293, + "learning_rate": 2.915797361163875e-06, + "loss": 0.71256214, + "num_input_tokens_seen": 131543865, + "step": 6123, + "time_per_iteration": 2.734360694885254 + }, + { + "auxiliary_loss_clip": 0.01068063, + "auxiliary_loss_mlp": 0.0103547, + "balance_loss_clip": 1.02986073, + "balance_loss_mlp": 1.01956701, + "epoch": 0.36819479933864424, + "flos": 23878369797120.0, + "grad_norm": 2.191409147371437, + "language_loss": 0.7339772, + "learning_rate": 2.9154511106594933e-06, + "loss": 0.75501251, + "num_input_tokens_seen": 131562155, + "step": 6124, + "time_per_iteration": 4.296372413635254 + }, + { + "auxiliary_loss_clip": 0.01060392, + "auxiliary_loss_mlp": 0.01040827, + "balance_loss_clip": 1.03126693, + "balance_loss_mlp": 1.02553272, + "epoch": 0.3682549225913122, + "flos": 25553997302400.0, + "grad_norm": 2.1367648487509334, + "language_loss": 0.74158347, + "learning_rate": 2.915104825441114e-06, + "loss": 0.76259565, + "num_input_tokens_seen": 131581695, + "step": 6125, + "time_per_iteration": 2.746224880218506 + }, + { + "auxiliary_loss_clip": 0.01070557, + "auxiliary_loss_mlp": 0.01044249, + "balance_loss_clip": 1.02923405, + "balance_loss_mlp": 1.02847815, + "epoch": 0.36831504584398017, + "flos": 16946605221120.0, + "grad_norm": 1.997961226179949, + "language_loss": 0.78565061, + "learning_rate": 2.9147585055218686e-06, + "loss": 0.8067987, + "num_input_tokens_seen": 131599465, + "step": 6126, + "time_per_iteration": 4.314951658248901 + }, + { + "auxiliary_loss_clip": 0.01071462, + "auxiliary_loss_mlp": 0.01045012, + "balance_loss_clip": 1.02906251, + "balance_loss_mlp": 1.0288825, + "epoch": 0.36837516909664814, + "flos": 19865065259520.0, + "grad_norm": 2.35887354188114, + "language_loss": 0.66023815, + "learning_rate": 2.914412150914888e-06, + "loss": 0.68140292, + "num_input_tokens_seen": 131618330, + "step": 6127, + "time_per_iteration": 2.7858266830444336 + }, + { + "auxiliary_loss_clip": 0.0106597, + "auxiliary_loss_mlp": 0.01036572, + "balance_loss_clip": 1.0343051, + "balance_loss_mlp": 1.02202868, + "epoch": 0.3684352923493161, + "flos": 37626984362880.0, + "grad_norm": 1.8442373196644763, + "language_loss": 0.70273316, + "learning_rate": 2.9140657616333074e-06, + "loss": 0.72375858, + "num_input_tokens_seen": 131638960, + "step": 6128, + "time_per_iteration": 2.8644373416900635 + }, + { + "auxiliary_loss_clip": 0.01065276, + "auxiliary_loss_mlp": 0.01042291, + "balance_loss_clip": 1.03253102, + "balance_loss_mlp": 1.02780688, + "epoch": 0.36849541560198407, + "flos": 14465501182080.0, + "grad_norm": 2.224444737784583, + "language_loss": 0.75418317, + "learning_rate": 2.9137193376902614e-06, + "loss": 0.77525878, + "num_input_tokens_seen": 131657440, + "step": 6129, + "time_per_iteration": 2.713996410369873 + }, + { + "auxiliary_loss_clip": 0.0106274, + "auxiliary_loss_mlp": 0.01037259, + "balance_loss_clip": 1.02875948, + "balance_loss_mlp": 1.02250135, + "epoch": 0.36855553885465203, + "flos": 25770753924480.0, + "grad_norm": 1.70306028572811, + "language_loss": 0.84419096, + "learning_rate": 2.9133728790988868e-06, + "loss": 0.86519092, + "num_input_tokens_seen": 131678035, + "step": 6130, + "time_per_iteration": 2.659555673599243 + }, + { + "auxiliary_loss_clip": 0.00999402, + "auxiliary_loss_mlp": 0.01014037, + "balance_loss_clip": 1.00914037, + "balance_loss_mlp": 1.01236808, + "epoch": 0.36861566210732, + "flos": 65049417377280.0, + "grad_norm": 1.0381278431873637, + "language_loss": 0.60318106, + "learning_rate": 2.913026385872321e-06, + "loss": 0.62331545, + "num_input_tokens_seen": 131742470, + "step": 6131, + "time_per_iteration": 3.3217382431030273 + }, + { + "auxiliary_loss_clip": 0.01040227, + "auxiliary_loss_mlp": 0.01032081, + "balance_loss_clip": 1.02646065, + "balance_loss_mlp": 1.01810956, + "epoch": 0.36867578535998796, + "flos": 30954495133440.0, + "grad_norm": 1.6651166088635445, + "language_loss": 0.72912174, + "learning_rate": 2.9126798580237034e-06, + "loss": 0.74984479, + "num_input_tokens_seen": 131764570, + "step": 6132, + "time_per_iteration": 2.815469741821289 + }, + { + "auxiliary_loss_clip": 0.01065791, + "auxiliary_loss_mlp": 0.01036712, + "balance_loss_clip": 1.02785134, + "balance_loss_mlp": 1.02126288, + "epoch": 0.3687359086126559, + "flos": 28837956182400.0, + "grad_norm": 1.6858600826870085, + "language_loss": 0.73809034, + "learning_rate": 2.9123332955661736e-06, + "loss": 0.7591154, + "num_input_tokens_seen": 131785720, + "step": 6133, + "time_per_iteration": 2.7363650798797607 + }, + { + "auxiliary_loss_clip": 0.0101419, + "auxiliary_loss_mlp": 0.01043803, + "balance_loss_clip": 1.02645135, + "balance_loss_mlp": 1.02813911, + "epoch": 0.3687960318653239, + "flos": 21396798881280.0, + "grad_norm": 1.453764707815451, + "language_loss": 0.70935285, + "learning_rate": 2.911986698512874e-06, + "loss": 0.72993273, + "num_input_tokens_seen": 131804430, + "step": 6134, + "time_per_iteration": 2.878389358520508 + }, + { + "auxiliary_loss_clip": 0.01046492, + "auxiliary_loss_mlp": 0.01030602, + "balance_loss_clip": 1.02782202, + "balance_loss_mlp": 1.01682103, + "epoch": 0.36885615511799186, + "flos": 20266043760000.0, + "grad_norm": 1.655848928781872, + "language_loss": 0.75276768, + "learning_rate": 2.9116400668769477e-06, + "loss": 0.77353859, + "num_input_tokens_seen": 131822060, + "step": 6135, + "time_per_iteration": 2.7422165870666504 + }, + { + "auxiliary_loss_clip": 0.00986129, + "auxiliary_loss_mlp": 0.01000282, + "balance_loss_clip": 1.00597906, + "balance_loss_mlp": 0.99860114, + "epoch": 0.3689162783706599, + "flos": 63088836301440.0, + "grad_norm": 0.8180340210711335, + "language_loss": 0.58844233, + "learning_rate": 2.9112934006715376e-06, + "loss": 0.60830641, + "num_input_tokens_seen": 131880715, + "step": 6136, + "time_per_iteration": 4.773345232009888 + }, + { + "auxiliary_loss_clip": 0.01045133, + "auxiliary_loss_mlp": 0.01034527, + "balance_loss_clip": 1.0251888, + "balance_loss_mlp": 1.0199182, + "epoch": 0.36897640162332784, + "flos": 10961984419200.0, + "grad_norm": 1.8208596431470336, + "language_loss": 0.79224867, + "learning_rate": 2.9109466999097918e-06, + "loss": 0.81304526, + "num_input_tokens_seen": 131895850, + "step": 6137, + "time_per_iteration": 4.333690881729126 + }, + { + "auxiliary_loss_clip": 0.01069764, + "auxiliary_loss_mlp": 0.0104414, + "balance_loss_clip": 1.02974594, + "balance_loss_mlp": 1.02946508, + "epoch": 0.3690365248759958, + "flos": 20704297599360.0, + "grad_norm": 1.8637836106080659, + "language_loss": 0.74164498, + "learning_rate": 2.9105999646048552e-06, + "loss": 0.762784, + "num_input_tokens_seen": 131915775, + "step": 6138, + "time_per_iteration": 2.681020975112915 + }, + { + "auxiliary_loss_clip": 0.01030618, + "auxiliary_loss_mlp": 0.01034712, + "balance_loss_clip": 1.02611089, + "balance_loss_mlp": 1.02022791, + "epoch": 0.3690966481286638, + "flos": 31826369957760.0, + "grad_norm": 1.9438246695835926, + "language_loss": 0.65066046, + "learning_rate": 2.9102531947698764e-06, + "loss": 0.67131376, + "num_input_tokens_seen": 131935715, + "step": 6139, + "time_per_iteration": 2.8084959983825684 + }, + { + "auxiliary_loss_clip": 0.0104643, + "auxiliary_loss_mlp": 0.01035742, + "balance_loss_clip": 1.02790558, + "balance_loss_mlp": 1.02181792, + "epoch": 0.36915677138133174, + "flos": 13114936782720.0, + "grad_norm": 1.9276885936256887, + "language_loss": 0.71411562, + "learning_rate": 2.909906390418006e-06, + "loss": 0.73493737, + "num_input_tokens_seen": 131954120, + "step": 6140, + "time_per_iteration": 2.626166582107544 + }, + { + "auxiliary_loss_clip": 0.00990554, + "auxiliary_loss_mlp": 0.01004871, + "balance_loss_clip": 1.01050866, + "balance_loss_mlp": 1.00327396, + "epoch": 0.3692168946339997, + "flos": 68686879956480.0, + "grad_norm": 0.7694435519774836, + "language_loss": 0.59331763, + "learning_rate": 2.9095595515623934e-06, + "loss": 0.61327183, + "num_input_tokens_seen": 132017485, + "step": 6141, + "time_per_iteration": 3.3361408710479736 + }, + { + "auxiliary_loss_clip": 0.01066654, + "auxiliary_loss_mlp": 0.01035998, + "balance_loss_clip": 1.02785587, + "balance_loss_mlp": 1.02219915, + "epoch": 0.36927701788666767, + "flos": 22017873968640.0, + "grad_norm": 2.655866868346641, + "language_loss": 0.74968946, + "learning_rate": 2.909212678216192e-06, + "loss": 0.77071601, + "num_input_tokens_seen": 132036760, + "step": 6142, + "time_per_iteration": 2.6179511547088623 + }, + { + "auxiliary_loss_clip": 0.01063826, + "auxiliary_loss_mlp": 0.01030381, + "balance_loss_clip": 1.02662206, + "balance_loss_mlp": 1.01786995, + "epoch": 0.36933714113933563, + "flos": 21835591424640.0, + "grad_norm": 2.9532507058994493, + "language_loss": 0.77248633, + "learning_rate": 2.908865770392555e-06, + "loss": 0.79342842, + "num_input_tokens_seen": 132056935, + "step": 6143, + "time_per_iteration": 2.6555821895599365 + }, + { + "auxiliary_loss_clip": 0.01063207, + "auxiliary_loss_mlp": 0.01031824, + "balance_loss_clip": 1.0267539, + "balance_loss_mlp": 1.01950336, + "epoch": 0.3693972643920036, + "flos": 23691705793920.0, + "grad_norm": 1.5074492805353343, + "language_loss": 0.82059085, + "learning_rate": 2.9085188281046364e-06, + "loss": 0.84154117, + "num_input_tokens_seen": 132077285, + "step": 6144, + "time_per_iteration": 2.602226972579956 + }, + { + "auxiliary_loss_clip": 0.0106716, + "auxiliary_loss_mlp": 0.0103774, + "balance_loss_clip": 1.02737451, + "balance_loss_mlp": 1.02471054, + "epoch": 0.36945738764467156, + "flos": 22856747172480.0, + "grad_norm": 2.1789658473342945, + "language_loss": 0.77382934, + "learning_rate": 2.908171851365593e-06, + "loss": 0.79487836, + "num_input_tokens_seen": 132095520, + "step": 6145, + "time_per_iteration": 2.578784227371216 + }, + { + "auxiliary_loss_clip": 0.01060832, + "auxiliary_loss_mlp": 0.0103217, + "balance_loss_clip": 1.02890086, + "balance_loss_mlp": 1.01781774, + "epoch": 0.36951751089733953, + "flos": 16615939593600.0, + "grad_norm": 1.763254581311822, + "language_loss": 0.77144349, + "learning_rate": 2.9078248401885815e-06, + "loss": 0.79237354, + "num_input_tokens_seen": 132112810, + "step": 6146, + "time_per_iteration": 2.53865647315979 + }, + { + "auxiliary_loss_clip": 0.01057938, + "auxiliary_loss_mlp": 0.01043619, + "balance_loss_clip": 1.02938914, + "balance_loss_mlp": 1.0290103, + "epoch": 0.3695776341500075, + "flos": 18914545607040.0, + "grad_norm": 1.6894359307551812, + "language_loss": 0.80401546, + "learning_rate": 2.907477794586761e-06, + "loss": 0.82503098, + "num_input_tokens_seen": 132131615, + "step": 6147, + "time_per_iteration": 2.6166014671325684 + }, + { + "auxiliary_loss_clip": 0.01052055, + "auxiliary_loss_mlp": 0.00747757, + "balance_loss_clip": 1.03058004, + "balance_loss_mlp": 1.00007343, + "epoch": 0.36963775740267546, + "flos": 20808474019200.0, + "grad_norm": 1.8150707274214117, + "language_loss": 0.8326695, + "learning_rate": 2.9071307145732926e-06, + "loss": 0.8506676, + "num_input_tokens_seen": 132149585, + "step": 6148, + "time_per_iteration": 2.6845457553863525 + }, + { + "auxiliary_loss_clip": 0.0106456, + "auxiliary_loss_mlp": 0.01032302, + "balance_loss_clip": 1.02832961, + "balance_loss_mlp": 1.01888537, + "epoch": 0.3696978806553435, + "flos": 26061881656320.0, + "grad_norm": 2.0830863764029703, + "language_loss": 0.74255699, + "learning_rate": 2.9067836001613357e-06, + "loss": 0.76352561, + "num_input_tokens_seen": 132165555, + "step": 6149, + "time_per_iteration": 2.623361110687256 + }, + { + "auxiliary_loss_clip": 0.0108087, + "auxiliary_loss_mlp": 0.01038525, + "balance_loss_clip": 1.03089118, + "balance_loss_mlp": 1.02377272, + "epoch": 0.36975800390801145, + "flos": 26833925606400.0, + "grad_norm": 1.789492959322528, + "language_loss": 0.70617241, + "learning_rate": 2.906436451364054e-06, + "loss": 0.72736633, + "num_input_tokens_seen": 132185100, + "step": 6150, + "time_per_iteration": 2.6025593280792236 + }, + { + "auxiliary_loss_clip": 0.01052673, + "auxiliary_loss_mlp": 0.01041671, + "balance_loss_clip": 1.02760768, + "balance_loss_mlp": 1.02826023, + "epoch": 0.3698181271606794, + "flos": 21142623265920.0, + "grad_norm": 1.519360612005054, + "language_loss": 0.81749815, + "learning_rate": 2.906089268194611e-06, + "loss": 0.83844155, + "num_input_tokens_seen": 132203930, + "step": 6151, + "time_per_iteration": 2.6824519634246826 + }, + { + "auxiliary_loss_clip": 0.00996752, + "auxiliary_loss_mlp": 0.01012171, + "balance_loss_clip": 1.00651944, + "balance_loss_mlp": 1.01033497, + "epoch": 0.3698782504133474, + "flos": 66742639568640.0, + "grad_norm": 0.7822598417678048, + "language_loss": 0.63162112, + "learning_rate": 2.9057420506661726e-06, + "loss": 0.65171033, + "num_input_tokens_seen": 132263845, + "step": 6152, + "time_per_iteration": 3.2691104412078857 + }, + { + "auxiliary_loss_clip": 0.01023343, + "auxiliary_loss_mlp": 0.01036904, + "balance_loss_clip": 1.025491, + "balance_loss_mlp": 1.02363002, + "epoch": 0.36993837366601534, + "flos": 24311523905280.0, + "grad_norm": 2.0406449218286937, + "language_loss": 0.70445883, + "learning_rate": 2.9053947987919044e-06, + "loss": 0.72506124, + "num_input_tokens_seen": 132282350, + "step": 6153, + "time_per_iteration": 2.695420265197754 + }, + { + "auxiliary_loss_clip": 0.01070921, + "auxiliary_loss_mlp": 0.01036331, + "balance_loss_clip": 1.03091621, + "balance_loss_mlp": 1.02249718, + "epoch": 0.3699984969186833, + "flos": 24349194293760.0, + "grad_norm": 1.9206673809879677, + "language_loss": 0.72109634, + "learning_rate": 2.9050475125849755e-06, + "loss": 0.74216884, + "num_input_tokens_seen": 132301930, + "step": 6154, + "time_per_iteration": 2.631269931793213 + }, + { + "auxiliary_loss_clip": 0.01055614, + "auxiliary_loss_mlp": 0.01032274, + "balance_loss_clip": 1.03009892, + "balance_loss_mlp": 1.01949501, + "epoch": 0.37005862017135127, + "flos": 19829154637440.0, + "grad_norm": 1.7291774920584462, + "language_loss": 0.68397224, + "learning_rate": 2.9047001920585534e-06, + "loss": 0.70485115, + "num_input_tokens_seen": 132320915, + "step": 6155, + "time_per_iteration": 2.6596248149871826 + }, + { + "auxiliary_loss_clip": 0.01067189, + "auxiliary_loss_mlp": 0.01029049, + "balance_loss_clip": 1.02914071, + "balance_loss_mlp": 1.0159713, + "epoch": 0.37011874342401924, + "flos": 19573793873280.0, + "grad_norm": 1.626664065194107, + "language_loss": 0.6800006, + "learning_rate": 2.9043528372258097e-06, + "loss": 0.70096296, + "num_input_tokens_seen": 132340415, + "step": 6156, + "time_per_iteration": 2.6866841316223145 + }, + { + "auxiliary_loss_clip": 0.01048955, + "auxiliary_loss_mlp": 0.01033858, + "balance_loss_clip": 1.02689958, + "balance_loss_mlp": 1.02166295, + "epoch": 0.3701788666766872, + "flos": 20374350243840.0, + "grad_norm": 1.714796928047477, + "language_loss": 0.81979495, + "learning_rate": 2.904005448099916e-06, + "loss": 0.84062314, + "num_input_tokens_seen": 132358600, + "step": 6157, + "time_per_iteration": 2.6184475421905518 + }, + { + "auxiliary_loss_clip": 0.0103271, + "auxiliary_loss_mlp": 0.01036381, + "balance_loss_clip": 1.02628434, + "balance_loss_mlp": 1.02172399, + "epoch": 0.37023898992935517, + "flos": 15340931452800.0, + "grad_norm": 2.113421140424396, + "language_loss": 0.76911438, + "learning_rate": 2.9036580246940444e-06, + "loss": 0.78980529, + "num_input_tokens_seen": 132373160, + "step": 6158, + "time_per_iteration": 2.7743213176727295 + }, + { + "auxiliary_loss_clip": 0.01080721, + "auxiliary_loss_mlp": 0.0103588, + "balance_loss_clip": 1.02974176, + "balance_loss_mlp": 1.02205825, + "epoch": 0.37029911318202313, + "flos": 19573937527680.0, + "grad_norm": 2.0298292229159984, + "language_loss": 0.69172001, + "learning_rate": 2.9033105670213708e-06, + "loss": 0.71288598, + "num_input_tokens_seen": 132392345, + "step": 6159, + "time_per_iteration": 2.645735740661621 + }, + { + "auxiliary_loss_clip": 0.01056543, + "auxiliary_loss_mlp": 0.01037547, + "balance_loss_clip": 1.02843022, + "balance_loss_mlp": 1.02520251, + "epoch": 0.3703592364346911, + "flos": 26213353309440.0, + "grad_norm": 1.7715530034117382, + "language_loss": 0.70823914, + "learning_rate": 2.9029630750950697e-06, + "loss": 0.72917998, + "num_input_tokens_seen": 132412620, + "step": 6160, + "time_per_iteration": 2.8980424404144287 + }, + { + "auxiliary_loss_clip": 0.01053518, + "auxiliary_loss_mlp": 0.01030785, + "balance_loss_clip": 1.02752638, + "balance_loss_mlp": 1.01933527, + "epoch": 0.37041935968735906, + "flos": 20048317470720.0, + "grad_norm": 1.7373560778956474, + "language_loss": 0.78793961, + "learning_rate": 2.9026155489283176e-06, + "loss": 0.8087827, + "num_input_tokens_seen": 132431570, + "step": 6161, + "time_per_iteration": 2.7178850173950195 + }, + { + "auxiliary_loss_clip": 0.01077931, + "auxiliary_loss_mlp": 0.0103492, + "balance_loss_clip": 1.02970076, + "balance_loss_mlp": 1.02103829, + "epoch": 0.3704794829400271, + "flos": 24133802388480.0, + "grad_norm": 1.5221644020179845, + "language_loss": 0.79283702, + "learning_rate": 2.902267988534295e-06, + "loss": 0.81396556, + "num_input_tokens_seen": 132451525, + "step": 6162, + "time_per_iteration": 2.7376749515533447 + }, + { + "auxiliary_loss_clip": 0.01049391, + "auxiliary_loss_mlp": 0.00747651, + "balance_loss_clip": 1.02571404, + "balance_loss_mlp": 1.00009561, + "epoch": 0.37053960619269505, + "flos": 14866874732160.0, + "grad_norm": 2.093277856216641, + "language_loss": 0.80109566, + "learning_rate": 2.9019203939261783e-06, + "loss": 0.81906605, + "num_input_tokens_seen": 132469875, + "step": 6163, + "time_per_iteration": 2.7715647220611572 + }, + { + "auxiliary_loss_clip": 0.01069468, + "auxiliary_loss_mlp": 0.01037406, + "balance_loss_clip": 1.03014326, + "balance_loss_mlp": 1.02364922, + "epoch": 0.370599729445363, + "flos": 21361498790400.0, + "grad_norm": 1.8807124470526446, + "language_loss": 0.68429548, + "learning_rate": 2.9015727651171507e-06, + "loss": 0.70536423, + "num_input_tokens_seen": 132488360, + "step": 6164, + "time_per_iteration": 2.8832032680511475 + }, + { + "auxiliary_loss_clip": 0.01049559, + "auxiliary_loss_mlp": 0.01038535, + "balance_loss_clip": 1.02812457, + "balance_loss_mlp": 1.02340126, + "epoch": 0.370659852698031, + "flos": 26829041356800.0, + "grad_norm": 2.294430488395495, + "language_loss": 0.83326817, + "learning_rate": 2.9012251021203935e-06, + "loss": 0.85414916, + "num_input_tokens_seen": 132508630, + "step": 6165, + "time_per_iteration": 2.8201851844787598 + }, + { + "auxiliary_loss_clip": 0.01056396, + "auxiliary_loss_mlp": 0.01034632, + "balance_loss_clip": 1.02870798, + "balance_loss_mlp": 1.01979613, + "epoch": 0.37071997595069894, + "flos": 19099018880640.0, + "grad_norm": 2.2807345384152864, + "language_loss": 0.69409502, + "learning_rate": 2.9008774049490896e-06, + "loss": 0.71500528, + "num_input_tokens_seen": 132527465, + "step": 6166, + "time_per_iteration": 2.8724608421325684 + }, + { + "auxiliary_loss_clip": 0.00985668, + "auxiliary_loss_mlp": 0.01010495, + "balance_loss_clip": 1.00548315, + "balance_loss_mlp": 1.00874221, + "epoch": 0.3707800992033669, + "flos": 52178384920320.0, + "grad_norm": 0.7979631782947025, + "language_loss": 0.56894708, + "learning_rate": 2.9005296736164244e-06, + "loss": 0.58890879, + "num_input_tokens_seen": 132579940, + "step": 6167, + "time_per_iteration": 3.1151952743530273 + }, + { + "auxiliary_loss_clip": 0.01050565, + "auxiliary_loss_mlp": 0.01037225, + "balance_loss_clip": 1.02573645, + "balance_loss_mlp": 1.02523851, + "epoch": 0.3708402224560349, + "flos": 19901837808000.0, + "grad_norm": 1.8102186254828287, + "language_loss": 0.74654925, + "learning_rate": 2.900181908135584e-06, + "loss": 0.76742721, + "num_input_tokens_seen": 132598390, + "step": 6168, + "time_per_iteration": 2.8505167961120605 + }, + { + "auxiliary_loss_clip": 0.01059392, + "auxiliary_loss_mlp": 0.00747651, + "balance_loss_clip": 1.02593899, + "balance_loss_mlp": 1.00005555, + "epoch": 0.37090034570870284, + "flos": 20007630339840.0, + "grad_norm": 2.1490635100443307, + "language_loss": 0.74010134, + "learning_rate": 2.899834108519755e-06, + "loss": 0.75817168, + "num_input_tokens_seen": 132616920, + "step": 6169, + "time_per_iteration": 2.5711510181427 + }, + { + "auxiliary_loss_clip": 0.01075748, + "auxiliary_loss_mlp": 0.01031236, + "balance_loss_clip": 1.02937031, + "balance_loss_mlp": 1.01875496, + "epoch": 0.3709604689613708, + "flos": 24134700228480.0, + "grad_norm": 1.85021056088677, + "language_loss": 0.79635048, + "learning_rate": 2.899486274782127e-06, + "loss": 0.81742036, + "num_input_tokens_seen": 132637660, + "step": 6170, + "time_per_iteration": 2.6938493251800537 + }, + { + "auxiliary_loss_clip": 0.01070119, + "auxiliary_loss_mlp": 0.01043143, + "balance_loss_clip": 1.02993584, + "balance_loss_mlp": 1.02852213, + "epoch": 0.37102059221403877, + "flos": 23876071326720.0, + "grad_norm": 1.6944099752631057, + "language_loss": 0.76556671, + "learning_rate": 2.8991384069358885e-06, + "loss": 0.78669935, + "num_input_tokens_seen": 132657635, + "step": 6171, + "time_per_iteration": 2.6402840614318848 + }, + { + "auxiliary_loss_clip": 0.01058574, + "auxiliary_loss_mlp": 0.01031118, + "balance_loss_clip": 1.02990127, + "balance_loss_mlp": 1.01779604, + "epoch": 0.37108071546670673, + "flos": 14501268149760.0, + "grad_norm": 1.8483910007756426, + "language_loss": 0.80386865, + "learning_rate": 2.898790504994232e-06, + "loss": 0.82476556, + "num_input_tokens_seen": 132674455, + "step": 6172, + "time_per_iteration": 4.2283241748809814 + }, + { + "auxiliary_loss_clip": 0.01068443, + "auxiliary_loss_mlp": 0.01033925, + "balance_loss_clip": 1.02782142, + "balance_loss_mlp": 1.01957273, + "epoch": 0.3711408387193747, + "flos": 34562619279360.0, + "grad_norm": 2.3470297642372278, + "language_loss": 0.59766138, + "learning_rate": 2.89844256897035e-06, + "loss": 0.61868507, + "num_input_tokens_seen": 132695140, + "step": 6173, + "time_per_iteration": 4.320311784744263 + }, + { + "auxiliary_loss_clip": 0.0105604, + "auxiliary_loss_mlp": 0.01030715, + "balance_loss_clip": 1.02739477, + "balance_loss_mlp": 1.01725674, + "epoch": 0.37120096197204266, + "flos": 17310703432320.0, + "grad_norm": 2.0247459823893945, + "language_loss": 0.80391407, + "learning_rate": 2.898094598877435e-06, + "loss": 0.8247816, + "num_input_tokens_seen": 132712470, + "step": 6174, + "time_per_iteration": 2.724679946899414 + }, + { + "auxiliary_loss_clip": 0.01073718, + "auxiliary_loss_mlp": 0.01033031, + "balance_loss_clip": 1.02783394, + "balance_loss_mlp": 1.02023959, + "epoch": 0.37126108522471063, + "flos": 30664049760000.0, + "grad_norm": 1.7302933620061707, + "language_loss": 0.79739439, + "learning_rate": 2.8977465947286826e-06, + "loss": 0.81846189, + "num_input_tokens_seen": 132732945, + "step": 6175, + "time_per_iteration": 2.6819870471954346 + }, + { + "auxiliary_loss_clip": 0.01070012, + "auxiliary_loss_mlp": 0.01042652, + "balance_loss_clip": 1.03124785, + "balance_loss_mlp": 1.02902627, + "epoch": 0.37132120847737865, + "flos": 25155640494720.0, + "grad_norm": 2.112595906575088, + "language_loss": 0.88509351, + "learning_rate": 2.89739855653729e-06, + "loss": 0.9062202, + "num_input_tokens_seen": 132752470, + "step": 6176, + "time_per_iteration": 2.6837589740753174 + }, + { + "auxiliary_loss_clip": 0.01068793, + "auxiliary_loss_mlp": 0.01033244, + "balance_loss_clip": 1.0307045, + "balance_loss_mlp": 1.02052474, + "epoch": 0.3713813317300466, + "flos": 21213474842880.0, + "grad_norm": 1.671042598181194, + "language_loss": 0.73055738, + "learning_rate": 2.8970504843164546e-06, + "loss": 0.75157773, + "num_input_tokens_seen": 132771485, + "step": 6177, + "time_per_iteration": 2.667681932449341 + }, + { + "auxiliary_loss_clip": 0.01048724, + "auxiliary_loss_mlp": 0.01039462, + "balance_loss_clip": 1.03019238, + "balance_loss_mlp": 1.02597332, + "epoch": 0.3714414549827146, + "flos": 21616644072960.0, + "grad_norm": 1.8438052041030155, + "language_loss": 0.75610912, + "learning_rate": 2.896702378079374e-06, + "loss": 0.77699095, + "num_input_tokens_seen": 132791465, + "step": 6178, + "time_per_iteration": 2.6768531799316406 + }, + { + "auxiliary_loss_clip": 0.01013189, + "auxiliary_loss_mlp": 0.01037787, + "balance_loss_clip": 1.02511704, + "balance_loss_mlp": 1.02352929, + "epoch": 0.37150157823538255, + "flos": 19972294335360.0, + "grad_norm": 1.7734771234203062, + "language_loss": 0.72186279, + "learning_rate": 2.8963542378392502e-06, + "loss": 0.74237251, + "num_input_tokens_seen": 132810160, + "step": 6179, + "time_per_iteration": 2.7619376182556152 + }, + { + "auxiliary_loss_clip": 0.01080602, + "auxiliary_loss_mlp": 0.010353, + "balance_loss_clip": 1.03003371, + "balance_loss_mlp": 1.02082849, + "epoch": 0.3715617014880505, + "flos": 24860562266880.0, + "grad_norm": 2.4405514230437335, + "language_loss": 0.7008146, + "learning_rate": 2.896006063609283e-06, + "loss": 0.72197366, + "num_input_tokens_seen": 132831265, + "step": 6180, + "time_per_iteration": 2.819809675216675 + }, + { + "auxiliary_loss_clip": 0.01056039, + "auxiliary_loss_mlp": 0.01030164, + "balance_loss_clip": 1.02777624, + "balance_loss_mlp": 1.01747465, + "epoch": 0.3716218247407185, + "flos": 20449080489600.0, + "grad_norm": 1.6382661121545152, + "language_loss": 0.77919555, + "learning_rate": 2.8956578554026767e-06, + "loss": 0.80005759, + "num_input_tokens_seen": 132850005, + "step": 6181, + "time_per_iteration": 2.730163812637329 + }, + { + "auxiliary_loss_clip": 0.0106507, + "auxiliary_loss_mlp": 0.01035306, + "balance_loss_clip": 1.02837968, + "balance_loss_mlp": 1.02188301, + "epoch": 0.37168194799338644, + "flos": 24133479166080.0, + "grad_norm": 1.915449364352403, + "language_loss": 0.78318888, + "learning_rate": 2.8953096132326343e-06, + "loss": 0.80419266, + "num_input_tokens_seen": 132865790, + "step": 6182, + "time_per_iteration": 2.6760175228118896 + }, + { + "auxiliary_loss_clip": 0.00995172, + "auxiliary_loss_mlp": 0.01008074, + "balance_loss_clip": 1.00490069, + "balance_loss_mlp": 1.00669718, + "epoch": 0.3717420712460544, + "flos": 67408926900480.0, + "grad_norm": 0.7899735834264958, + "language_loss": 0.5750246, + "learning_rate": 2.894961337112362e-06, + "loss": 0.59505713, + "num_input_tokens_seen": 132921775, + "step": 6183, + "time_per_iteration": 4.794605731964111 + }, + { + "auxiliary_loss_clip": 0.01067315, + "auxiliary_loss_mlp": 0.00747735, + "balance_loss_clip": 1.02687573, + "balance_loss_mlp": 1.00013554, + "epoch": 0.37180219449872237, + "flos": 22376908362240.0, + "grad_norm": 2.103550910374872, + "language_loss": 0.77416456, + "learning_rate": 2.894613027055066e-06, + "loss": 0.79231513, + "num_input_tokens_seen": 132941060, + "step": 6184, + "time_per_iteration": 4.383945465087891 + }, + { + "auxiliary_loss_clip": 0.01037784, + "auxiliary_loss_mlp": 0.01036416, + "balance_loss_clip": 1.02631807, + "balance_loss_mlp": 1.02289808, + "epoch": 0.37186231775139034, + "flos": 21869885934720.0, + "grad_norm": 1.7276626272787892, + "language_loss": 0.72679234, + "learning_rate": 2.894264683073954e-06, + "loss": 0.74753439, + "num_input_tokens_seen": 132961850, + "step": 6185, + "time_per_iteration": 2.647170066833496 + }, + { + "auxiliary_loss_clip": 0.01023838, + "auxiliary_loss_mlp": 0.01032033, + "balance_loss_clip": 1.02477777, + "balance_loss_mlp": 1.01855028, + "epoch": 0.3719224410040583, + "flos": 22415225195520.0, + "grad_norm": 1.4778400280599522, + "language_loss": 0.77201998, + "learning_rate": 2.8939163051822363e-06, + "loss": 0.7925787, + "num_input_tokens_seen": 132981625, + "step": 6186, + "time_per_iteration": 2.788065195083618 + }, + { + "auxiliary_loss_clip": 0.01073496, + "auxiliary_loss_mlp": 0.01037059, + "balance_loss_clip": 1.03114104, + "balance_loss_mlp": 1.02296305, + "epoch": 0.37198256425672627, + "flos": 25151223121920.0, + "grad_norm": 1.6824909685121936, + "language_loss": 0.83419997, + "learning_rate": 2.8935678933931224e-06, + "loss": 0.85530555, + "num_input_tokens_seen": 133001225, + "step": 6187, + "time_per_iteration": 2.657761335372925 + }, + { + "auxiliary_loss_clip": 0.01065725, + "auxiliary_loss_mlp": 0.01033507, + "balance_loss_clip": 1.02698219, + "balance_loss_mlp": 1.02080548, + "epoch": 0.37204268750939423, + "flos": 21138313633920.0, + "grad_norm": 1.8384188755725026, + "language_loss": 0.84618044, + "learning_rate": 2.893219447719824e-06, + "loss": 0.86717284, + "num_input_tokens_seen": 133018820, + "step": 6188, + "time_per_iteration": 2.6013076305389404 + }, + { + "auxiliary_loss_clip": 0.01059332, + "auxiliary_loss_mlp": 0.01033413, + "balance_loss_clip": 1.03220725, + "balance_loss_mlp": 1.01949513, + "epoch": 0.37210281076206225, + "flos": 21506829217920.0, + "grad_norm": 9.792263472546537, + "language_loss": 0.65443861, + "learning_rate": 2.8928709681755548e-06, + "loss": 0.67536604, + "num_input_tokens_seen": 133040205, + "step": 6189, + "time_per_iteration": 2.6648616790771484 + }, + { + "auxiliary_loss_clip": 0.01052155, + "auxiliary_loss_mlp": 0.01035463, + "balance_loss_clip": 1.02720833, + "balance_loss_mlp": 1.02142, + "epoch": 0.3721629340147302, + "flos": 17347835116800.0, + "grad_norm": 1.7841608392065482, + "language_loss": 0.84273088, + "learning_rate": 2.8925224547735293e-06, + "loss": 0.86360705, + "num_input_tokens_seen": 133058095, + "step": 6190, + "time_per_iteration": 2.5834548473358154 + }, + { + "auxiliary_loss_clip": 0.01050518, + "auxiliary_loss_mlp": 0.01037625, + "balance_loss_clip": 1.02725708, + "balance_loss_mlp": 1.02358806, + "epoch": 0.3722230572673982, + "flos": 16432400073600.0, + "grad_norm": 2.553602766768726, + "language_loss": 0.88752222, + "learning_rate": 2.8921739075269633e-06, + "loss": 0.90840364, + "num_input_tokens_seen": 133071530, + "step": 6191, + "time_per_iteration": 2.5766820907592773 + }, + { + "auxiliary_loss_clip": 0.01036431, + "auxiliary_loss_mlp": 0.01035924, + "balance_loss_clip": 1.02690315, + "balance_loss_mlp": 1.02040935, + "epoch": 0.37228318052006615, + "flos": 22674716023680.0, + "grad_norm": 1.5980285028783177, + "language_loss": 0.73868763, + "learning_rate": 2.891825326449073e-06, + "loss": 0.75941122, + "num_input_tokens_seen": 133091410, + "step": 6192, + "time_per_iteration": 2.805877685546875 + }, + { + "auxiliary_loss_clip": 0.01076473, + "auxiliary_loss_mlp": 0.01032853, + "balance_loss_clip": 1.02816534, + "balance_loss_mlp": 1.02021074, + "epoch": 0.3723433037727341, + "flos": 25265491263360.0, + "grad_norm": 2.4212102041418255, + "language_loss": 0.8001585, + "learning_rate": 2.8914767115530766e-06, + "loss": 0.82125175, + "num_input_tokens_seen": 133110365, + "step": 6193, + "time_per_iteration": 2.5440139770507812 + }, + { + "auxiliary_loss_clip": 0.01045909, + "auxiliary_loss_mlp": 0.01038828, + "balance_loss_clip": 1.02704287, + "balance_loss_mlp": 1.02549994, + "epoch": 0.3724034270254021, + "flos": 10524664333440.0, + "grad_norm": 2.246308271930953, + "language_loss": 0.84312999, + "learning_rate": 2.891128062852194e-06, + "loss": 0.86397731, + "num_input_tokens_seen": 133128255, + "step": 6194, + "time_per_iteration": 2.6190879344940186 + }, + { + "auxiliary_loss_clip": 0.01055803, + "auxiliary_loss_mlp": 0.01033455, + "balance_loss_clip": 1.02802539, + "balance_loss_mlp": 1.02056861, + "epoch": 0.37246355027807004, + "flos": 20266223328000.0, + "grad_norm": 2.4969713701335214, + "language_loss": 0.76592577, + "learning_rate": 2.890779380359646e-06, + "loss": 0.78681839, + "num_input_tokens_seen": 133143975, + "step": 6195, + "time_per_iteration": 2.6709394454956055 + }, + { + "auxiliary_loss_clip": 0.01055527, + "auxiliary_loss_mlp": 0.01032914, + "balance_loss_clip": 1.02807593, + "balance_loss_mlp": 1.02009881, + "epoch": 0.372523673530738, + "flos": 19500571998720.0, + "grad_norm": 1.5419526742065819, + "language_loss": 0.79377395, + "learning_rate": 2.890430664088655e-06, + "loss": 0.8146584, + "num_input_tokens_seen": 133162935, + "step": 6196, + "time_per_iteration": 2.643578290939331 + }, + { + "auxiliary_loss_clip": 0.01066711, + "auxiliary_loss_mlp": 0.01035484, + "balance_loss_clip": 1.02906621, + "balance_loss_mlp": 1.02293181, + "epoch": 0.372583796783406, + "flos": 16764250849920.0, + "grad_norm": 1.961175302246692, + "language_loss": 0.83143198, + "learning_rate": 2.890081914052443e-06, + "loss": 0.85245395, + "num_input_tokens_seen": 133181180, + "step": 6197, + "time_per_iteration": 2.659167766571045 + }, + { + "auxiliary_loss_clip": 0.01075298, + "auxiliary_loss_mlp": 0.01032079, + "balance_loss_clip": 1.0281918, + "balance_loss_mlp": 1.01868546, + "epoch": 0.37264392003607394, + "flos": 22637979388800.0, + "grad_norm": 1.5155987194724854, + "language_loss": 0.64262819, + "learning_rate": 2.889733130264237e-06, + "loss": 0.66370201, + "num_input_tokens_seen": 133199615, + "step": 6198, + "time_per_iteration": 2.5673656463623047 + }, + { + "auxiliary_loss_clip": 0.01063662, + "auxiliary_loss_mlp": 0.01043164, + "balance_loss_clip": 1.02733052, + "balance_loss_mlp": 1.03088546, + "epoch": 0.3727040432887419, + "flos": 19973120348160.0, + "grad_norm": 1.7524133111053128, + "language_loss": 0.74115264, + "learning_rate": 2.889384312737261e-06, + "loss": 0.76222086, + "num_input_tokens_seen": 133219650, + "step": 6199, + "time_per_iteration": 2.5820324420928955 + }, + { + "auxiliary_loss_clip": 0.01054182, + "auxiliary_loss_mlp": 0.01037588, + "balance_loss_clip": 1.02724874, + "balance_loss_mlp": 1.02501702, + "epoch": 0.37276416654140987, + "flos": 63899122279680.0, + "grad_norm": 3.4566723672734794, + "language_loss": 0.80781668, + "learning_rate": 2.889035461484742e-06, + "loss": 0.8287344, + "num_input_tokens_seen": 133245675, + "step": 6200, + "time_per_iteration": 3.1031556129455566 + }, + { + "auxiliary_loss_clip": 0.0104503, + "auxiliary_loss_mlp": 0.01041904, + "balance_loss_clip": 1.02759004, + "balance_loss_mlp": 1.02849841, + "epoch": 0.37282428979407783, + "flos": 39785970211200.0, + "grad_norm": 1.9915623770751407, + "language_loss": 0.59901142, + "learning_rate": 2.88868657651991e-06, + "loss": 0.61988074, + "num_input_tokens_seen": 133266905, + "step": 6201, + "time_per_iteration": 2.818357467651367 + }, + { + "auxiliary_loss_clip": 0.01068986, + "auxiliary_loss_mlp": 0.01031939, + "balance_loss_clip": 1.03021455, + "balance_loss_mlp": 1.01834953, + "epoch": 0.37288441304674586, + "flos": 22709046447360.0, + "grad_norm": 1.6826124978001076, + "language_loss": 0.73054373, + "learning_rate": 2.8883376578559934e-06, + "loss": 0.75155306, + "num_input_tokens_seen": 133286865, + "step": 6202, + "time_per_iteration": 2.626985788345337 + }, + { + "auxiliary_loss_clip": 0.01052316, + "auxiliary_loss_mlp": 0.01034296, + "balance_loss_clip": 1.02657402, + "balance_loss_mlp": 1.02118325, + "epoch": 0.3729445362994138, + "flos": 18770292587520.0, + "grad_norm": 2.0277692734541306, + "language_loss": 0.73991978, + "learning_rate": 2.8879887055062243e-06, + "loss": 0.76078594, + "num_input_tokens_seen": 133305295, + "step": 6203, + "time_per_iteration": 2.7253923416137695 + }, + { + "auxiliary_loss_clip": 0.01051546, + "auxiliary_loss_mlp": 0.01031797, + "balance_loss_clip": 1.02646804, + "balance_loss_mlp": 1.02048421, + "epoch": 0.3730046595520818, + "flos": 22456199635200.0, + "grad_norm": 2.07344010957326, + "language_loss": 0.81549597, + "learning_rate": 2.8876397194838353e-06, + "loss": 0.83632946, + "num_input_tokens_seen": 133324625, + "step": 6204, + "time_per_iteration": 2.7294228076934814 + }, + { + "auxiliary_loss_clip": 0.01069143, + "auxiliary_loss_mlp": 0.01039361, + "balance_loss_clip": 1.02976823, + "balance_loss_mlp": 1.02569318, + "epoch": 0.37306478280474975, + "flos": 24316372241280.0, + "grad_norm": 2.722315948169586, + "language_loss": 0.75525641, + "learning_rate": 2.8872906998020577e-06, + "loss": 0.77634144, + "num_input_tokens_seen": 133344625, + "step": 6205, + "time_per_iteration": 2.5591437816619873 + }, + { + "auxiliary_loss_clip": 0.01058994, + "auxiliary_loss_mlp": 0.01038126, + "balance_loss_clip": 1.02584934, + "balance_loss_mlp": 1.02429128, + "epoch": 0.3731249060574177, + "flos": 15815167741440.0, + "grad_norm": 1.9312880044704706, + "language_loss": 0.78281218, + "learning_rate": 2.886941646474128e-06, + "loss": 0.80378336, + "num_input_tokens_seen": 133363605, + "step": 6206, + "time_per_iteration": 2.5323121547698975 + }, + { + "auxiliary_loss_clip": 0.01076675, + "auxiliary_loss_mlp": 0.01039342, + "balance_loss_clip": 1.02786231, + "balance_loss_mlp": 1.02579951, + "epoch": 0.3731850293100857, + "flos": 19828077229440.0, + "grad_norm": 2.50370121237732, + "language_loss": 0.93568414, + "learning_rate": 2.886592559513283e-06, + "loss": 0.95684427, + "num_input_tokens_seen": 133379405, + "step": 6207, + "time_per_iteration": 2.473033905029297 + }, + { + "auxiliary_loss_clip": 0.01045505, + "auxiliary_loss_mlp": 0.01028448, + "balance_loss_clip": 1.02786469, + "balance_loss_mlp": 1.01585913, + "epoch": 0.37324515256275365, + "flos": 19062354072960.0, + "grad_norm": 2.0142054672239493, + "language_loss": 0.82137728, + "learning_rate": 2.886243438932759e-06, + "loss": 0.84211677, + "num_input_tokens_seen": 133397585, + "step": 6208, + "time_per_iteration": 2.5794193744659424 + }, + { + "auxiliary_loss_clip": 0.01063988, + "auxiliary_loss_mlp": 0.01032402, + "balance_loss_clip": 1.0272119, + "balance_loss_mlp": 1.01812696, + "epoch": 0.3733052758154216, + "flos": 20704333512960.0, + "grad_norm": 2.4928416183189896, + "language_loss": 0.73305464, + "learning_rate": 2.8858942847457953e-06, + "loss": 0.75401855, + "num_input_tokens_seen": 133415365, + "step": 6209, + "time_per_iteration": 2.545201063156128 + }, + { + "auxiliary_loss_clip": 0.0104834, + "auxiliary_loss_mlp": 0.01034362, + "balance_loss_clip": 1.03079605, + "balance_loss_mlp": 1.0195322, + "epoch": 0.3733653990680896, + "flos": 20193504243840.0, + "grad_norm": 1.6996687674358149, + "language_loss": 0.69869792, + "learning_rate": 2.8855450969656305e-06, + "loss": 0.71952498, + "num_input_tokens_seen": 133435700, + "step": 6210, + "time_per_iteration": 2.7380075454711914 + }, + { + "auxiliary_loss_clip": 0.01012824, + "auxiliary_loss_mlp": 0.01037765, + "balance_loss_clip": 1.02172565, + "balance_loss_mlp": 1.02192855, + "epoch": 0.37342552232075754, + "flos": 20339660684160.0, + "grad_norm": 2.2908476992588467, + "language_loss": 0.78026295, + "learning_rate": 2.8851958756055073e-06, + "loss": 0.80076885, + "num_input_tokens_seen": 133455180, + "step": 6211, + "time_per_iteration": 2.6345396041870117 + }, + { + "auxiliary_loss_clip": 0.01068017, + "auxiliary_loss_mlp": 0.01034342, + "balance_loss_clip": 1.02857494, + "balance_loss_mlp": 1.02060962, + "epoch": 0.3734856455734255, + "flos": 35517879527040.0, + "grad_norm": 1.535615984169197, + "language_loss": 0.73267925, + "learning_rate": 2.884846620678668e-06, + "loss": 0.75370288, + "num_input_tokens_seen": 133476715, + "step": 6212, + "time_per_iteration": 2.7346606254577637 + }, + { + "auxiliary_loss_clip": 0.0107642, + "auxiliary_loss_mlp": 0.01046019, + "balance_loss_clip": 1.03206539, + "balance_loss_mlp": 1.03113019, + "epoch": 0.37354576882609347, + "flos": 21142300043520.0, + "grad_norm": 2.0410685074333164, + "language_loss": 0.82426476, + "learning_rate": 2.884497332198356e-06, + "loss": 0.84548914, + "num_input_tokens_seen": 133494550, + "step": 6213, + "time_per_iteration": 2.573385715484619 + }, + { + "auxiliary_loss_clip": 0.01038855, + "auxiliary_loss_mlp": 0.01039674, + "balance_loss_clip": 1.02590072, + "balance_loss_mlp": 1.02457023, + "epoch": 0.37360589207876144, + "flos": 21506793304320.0, + "grad_norm": 2.0740893589312543, + "language_loss": 0.78999758, + "learning_rate": 2.8841480101778167e-06, + "loss": 0.81078285, + "num_input_tokens_seen": 133512640, + "step": 6214, + "time_per_iteration": 2.645871639251709 + }, + { + "auxiliary_loss_clip": 0.01055704, + "auxiliary_loss_mlp": 0.01040158, + "balance_loss_clip": 1.02772927, + "balance_loss_mlp": 1.02690816, + "epoch": 0.37366601533142946, + "flos": 38435800861440.0, + "grad_norm": 1.6452082984644736, + "language_loss": 0.84699714, + "learning_rate": 2.883798654630296e-06, + "loss": 0.86795568, + "num_input_tokens_seen": 133535540, + "step": 6215, + "time_per_iteration": 3.0921154022216797 + }, + { + "auxiliary_loss_clip": 0.01047167, + "auxiliary_loss_mlp": 0.01038354, + "balance_loss_clip": 1.02643371, + "balance_loss_mlp": 1.02341688, + "epoch": 0.3737261385840974, + "flos": 18441171244800.0, + "grad_norm": 2.184954311937797, + "language_loss": 0.67500162, + "learning_rate": 2.8834492655690423e-06, + "loss": 0.69585687, + "num_input_tokens_seen": 133555795, + "step": 6216, + "time_per_iteration": 2.928114891052246 + }, + { + "auxiliary_loss_clip": 0.01055401, + "auxiliary_loss_mlp": 0.01037127, + "balance_loss_clip": 1.02800846, + "balance_loss_mlp": 1.02252972, + "epoch": 0.3737862618367654, + "flos": 22929861306240.0, + "grad_norm": 2.76152721032141, + "language_loss": 0.65728623, + "learning_rate": 2.883099843007303e-06, + "loss": 0.67821151, + "num_input_tokens_seen": 133575905, + "step": 6217, + "time_per_iteration": 2.852959394454956 + }, + { + "auxiliary_loss_clip": 0.0106236, + "auxiliary_loss_mlp": 0.01035542, + "balance_loss_clip": 1.03085279, + "balance_loss_mlp": 1.02146959, + "epoch": 0.37384638508943335, + "flos": 15409664127360.0, + "grad_norm": 1.789215681548243, + "language_loss": 0.80227083, + "learning_rate": 2.88275038695833e-06, + "loss": 0.82324982, + "num_input_tokens_seen": 133592585, + "step": 6218, + "time_per_iteration": 2.8819167613983154 + }, + { + "auxiliary_loss_clip": 0.01062671, + "auxiliary_loss_mlp": 0.01034407, + "balance_loss_clip": 1.02672553, + "balance_loss_mlp": 1.02121663, + "epoch": 0.3739065083421013, + "flos": 24280820755200.0, + "grad_norm": 1.5129510939720339, + "language_loss": 0.78848994, + "learning_rate": 2.8824008974353736e-06, + "loss": 0.80946076, + "num_input_tokens_seen": 133615070, + "step": 6219, + "time_per_iteration": 2.7766995429992676 + }, + { + "auxiliary_loss_clip": 0.01055506, + "auxiliary_loss_mlp": 0.01039588, + "balance_loss_clip": 1.02807808, + "balance_loss_mlp": 1.02630842, + "epoch": 0.3739666315947693, + "flos": 23002831785600.0, + "grad_norm": 1.66725157685284, + "language_loss": 0.77157688, + "learning_rate": 2.8820513744516866e-06, + "loss": 0.79252779, + "num_input_tokens_seen": 133633490, + "step": 6220, + "time_per_iteration": 5.8131537437438965 + }, + { + "auxiliary_loss_clip": 0.01049663, + "auxiliary_loss_mlp": 0.01038886, + "balance_loss_clip": 1.02903247, + "balance_loss_mlp": 1.02450895, + "epoch": 0.37402675484743725, + "flos": 19391116279680.0, + "grad_norm": 1.6754810696141518, + "language_loss": 0.82741404, + "learning_rate": 2.8817018180205235e-06, + "loss": 0.8482995, + "num_input_tokens_seen": 133653425, + "step": 6221, + "time_per_iteration": 2.749302625656128 + }, + { + "auxiliary_loss_clip": 0.01055863, + "auxiliary_loss_mlp": 0.01040615, + "balance_loss_clip": 1.02733135, + "balance_loss_mlp": 1.02682233, + "epoch": 0.3740868781001052, + "flos": 17126158331520.0, + "grad_norm": 1.8007700881993374, + "language_loss": 0.76375341, + "learning_rate": 2.8813522281551387e-06, + "loss": 0.78471822, + "num_input_tokens_seen": 133670220, + "step": 6222, + "time_per_iteration": 2.8104019165039062 + }, + { + "auxiliary_loss_clip": 0.01048138, + "auxiliary_loss_mlp": 0.00747822, + "balance_loss_clip": 1.0277102, + "balance_loss_mlp": 1.00019145, + "epoch": 0.3741470013527732, + "flos": 20043505048320.0, + "grad_norm": 1.791465791502813, + "language_loss": 0.70560449, + "learning_rate": 2.881002604868789e-06, + "loss": 0.72356415, + "num_input_tokens_seen": 133688910, + "step": 6223, + "time_per_iteration": 2.706021785736084 + }, + { + "auxiliary_loss_clip": 0.01052718, + "auxiliary_loss_mlp": 0.01032986, + "balance_loss_clip": 1.03332186, + "balance_loss_mlp": 1.01925361, + "epoch": 0.37420712460544114, + "flos": 36897279569280.0, + "grad_norm": 1.687286116981882, + "language_loss": 0.68501747, + "learning_rate": 2.8806529481747325e-06, + "loss": 0.7058745, + "num_input_tokens_seen": 133708690, + "step": 6224, + "time_per_iteration": 2.778937578201294 + }, + { + "auxiliary_loss_clip": 0.01047888, + "auxiliary_loss_mlp": 0.01036161, + "balance_loss_clip": 1.03204215, + "balance_loss_mlp": 1.02258921, + "epoch": 0.3742672478581091, + "flos": 22201198007040.0, + "grad_norm": 1.6603422006508914, + "language_loss": 0.69974208, + "learning_rate": 2.880303258086228e-06, + "loss": 0.7205826, + "num_input_tokens_seen": 133728095, + "step": 6225, + "time_per_iteration": 2.8274362087249756 + }, + { + "auxiliary_loss_clip": 0.01037813, + "auxiliary_loss_mlp": 0.01036583, + "balance_loss_clip": 1.02662802, + "balance_loss_mlp": 1.02157426, + "epoch": 0.3743273711107771, + "flos": 24681547860480.0, + "grad_norm": 1.9026155164354381, + "language_loss": 0.79043561, + "learning_rate": 2.879953534616536e-06, + "loss": 0.81117952, + "num_input_tokens_seen": 133745590, + "step": 6226, + "time_per_iteration": 2.7590155601501465 + }, + { + "auxiliary_loss_clip": 0.01056738, + "auxiliary_loss_mlp": 0.01037106, + "balance_loss_clip": 1.02832067, + "balance_loss_mlp": 1.02266407, + "epoch": 0.37438749436344504, + "flos": 24459619680000.0, + "grad_norm": 1.9170191744660185, + "language_loss": 0.67543709, + "learning_rate": 2.879603777778917e-06, + "loss": 0.69637561, + "num_input_tokens_seen": 133766155, + "step": 6227, + "time_per_iteration": 2.6497409343719482 + }, + { + "auxiliary_loss_clip": 0.01039978, + "auxiliary_loss_mlp": 0.01032301, + "balance_loss_clip": 1.02563179, + "balance_loss_mlp": 1.01857388, + "epoch": 0.374447617616113, + "flos": 21798747048960.0, + "grad_norm": 1.590100584580474, + "language_loss": 0.82769328, + "learning_rate": 2.879253987586635e-06, + "loss": 0.84841609, + "num_input_tokens_seen": 133783185, + "step": 6228, + "time_per_iteration": 2.6499342918395996 + }, + { + "auxiliary_loss_clip": 0.01038383, + "auxiliary_loss_mlp": 0.01043838, + "balance_loss_clip": 1.02673805, + "balance_loss_mlp": 1.02902675, + "epoch": 0.374507740868781, + "flos": 17968191932160.0, + "grad_norm": 1.4983753085955942, + "language_loss": 0.74710727, + "learning_rate": 2.8789041640529535e-06, + "loss": 0.76792955, + "num_input_tokens_seen": 133800975, + "step": 6229, + "time_per_iteration": 2.6126766204833984 + }, + { + "auxiliary_loss_clip": 0.01048786, + "auxiliary_loss_mlp": 0.01034325, + "balance_loss_clip": 1.02878189, + "balance_loss_mlp": 1.01917386, + "epoch": 0.374567864121449, + "flos": 16105828596480.0, + "grad_norm": 2.0400744736457836, + "language_loss": 0.83393264, + "learning_rate": 2.8785543071911383e-06, + "loss": 0.85476375, + "num_input_tokens_seen": 133818020, + "step": 6230, + "time_per_iteration": 2.5959970951080322 + }, + { + "auxiliary_loss_clip": 0.01070537, + "auxiliary_loss_mlp": 0.01038124, + "balance_loss_clip": 1.03036499, + "balance_loss_mlp": 1.02361596, + "epoch": 0.37462798737411696, + "flos": 25773160135680.0, + "grad_norm": 1.8615229235179123, + "language_loss": 0.73560792, + "learning_rate": 2.878204417014456e-06, + "loss": 0.7566945, + "num_input_tokens_seen": 133840690, + "step": 6231, + "time_per_iteration": 4.188444137573242 + }, + { + "auxiliary_loss_clip": 0.01073276, + "auxiliary_loss_mlp": 0.01044077, + "balance_loss_clip": 1.03267884, + "balance_loss_mlp": 1.02894306, + "epoch": 0.3746881106267849, + "flos": 16654507822080.0, + "grad_norm": 2.5834660531436264, + "language_loss": 0.73582363, + "learning_rate": 2.8778544935361735e-06, + "loss": 0.75699711, + "num_input_tokens_seen": 133858350, + "step": 6232, + "time_per_iteration": 4.163059949874878 + }, + { + "auxiliary_loss_clip": 0.01048078, + "auxiliary_loss_mlp": 0.01033341, + "balance_loss_clip": 1.02465272, + "balance_loss_mlp": 1.01863074, + "epoch": 0.3747482338794529, + "flos": 26177981391360.0, + "grad_norm": 1.8964791815523432, + "language_loss": 0.77101433, + "learning_rate": 2.877504536769561e-06, + "loss": 0.79182845, + "num_input_tokens_seen": 133879775, + "step": 6233, + "time_per_iteration": 2.635164260864258 + }, + { + "auxiliary_loss_clip": 0.01061942, + "auxiliary_loss_mlp": 0.01035826, + "balance_loss_clip": 1.03130984, + "balance_loss_mlp": 1.02141976, + "epoch": 0.37480835713212085, + "flos": 12021061950720.0, + "grad_norm": 1.618169917867244, + "language_loss": 0.69329756, + "learning_rate": 2.8771545467278883e-06, + "loss": 0.71427524, + "num_input_tokens_seen": 133898295, + "step": 6234, + "time_per_iteration": 2.6314218044281006 + }, + { + "auxiliary_loss_clip": 0.01067699, + "auxiliary_loss_mlp": 0.01045255, + "balance_loss_clip": 1.02904773, + "balance_loss_mlp": 1.03167701, + "epoch": 0.3748684803847888, + "flos": 19679263182720.0, + "grad_norm": 1.9492011043066795, + "language_loss": 0.82587034, + "learning_rate": 2.8768045234244276e-06, + "loss": 0.84699988, + "num_input_tokens_seen": 133915230, + "step": 6235, + "time_per_iteration": 2.597174882888794 + }, + { + "auxiliary_loss_clip": 0.01083893, + "auxiliary_loss_mlp": 0.01031152, + "balance_loss_clip": 1.03248096, + "balance_loss_mlp": 1.01712084, + "epoch": 0.3749286036374568, + "flos": 20521189042560.0, + "grad_norm": 1.8299582322883698, + "language_loss": 0.78214443, + "learning_rate": 2.8764544668724517e-06, + "loss": 0.80329484, + "num_input_tokens_seen": 133934110, + "step": 6236, + "time_per_iteration": 2.6693203449249268 + }, + { + "auxiliary_loss_clip": 0.01059842, + "auxiliary_loss_mlp": 0.01047114, + "balance_loss_clip": 1.02662611, + "balance_loss_mlp": 1.02987671, + "epoch": 0.37498872689012475, + "flos": 20704620821760.0, + "grad_norm": 1.953578809461663, + "language_loss": 0.73331606, + "learning_rate": 2.876104377085234e-06, + "loss": 0.75438565, + "num_input_tokens_seen": 133952395, + "step": 6237, + "time_per_iteration": 2.6450538635253906 + }, + { + "auxiliary_loss_clip": 0.01055237, + "auxiliary_loss_mlp": 0.00747807, + "balance_loss_clip": 1.02648377, + "balance_loss_mlp": 1.0000453, + "epoch": 0.3750488501427927, + "flos": 21574843620480.0, + "grad_norm": 1.8830876628260913, + "language_loss": 0.93045533, + "learning_rate": 2.8757542540760508e-06, + "loss": 0.94848579, + "num_input_tokens_seen": 133969635, + "step": 6238, + "time_per_iteration": 2.690028190612793 + }, + { + "auxiliary_loss_clip": 0.01078272, + "auxiliary_loss_mlp": 0.01036219, + "balance_loss_clip": 1.02825618, + "balance_loss_mlp": 1.0214963, + "epoch": 0.3751089733954607, + "flos": 15923869274880.0, + "grad_norm": 1.9799400910246814, + "language_loss": 0.70912349, + "learning_rate": 2.8754040978581777e-06, + "loss": 0.73026848, + "num_input_tokens_seen": 133987215, + "step": 6239, + "time_per_iteration": 2.693657875061035 + }, + { + "auxiliary_loss_clip": 0.01019453, + "auxiliary_loss_mlp": 0.01035855, + "balance_loss_clip": 1.0291481, + "balance_loss_mlp": 1.0207628, + "epoch": 0.37516909664812864, + "flos": 36284644177920.0, + "grad_norm": 2.131046123937327, + "language_loss": 0.65347439, + "learning_rate": 2.875053908444895e-06, + "loss": 0.67402744, + "num_input_tokens_seen": 134009250, + "step": 6240, + "time_per_iteration": 3.1027188301086426 + }, + { + "auxiliary_loss_clip": 0.01045054, + "auxiliary_loss_mlp": 0.00747697, + "balance_loss_clip": 1.02644491, + "balance_loss_mlp": 1.00000143, + "epoch": 0.3752292199007966, + "flos": 13515915283200.0, + "grad_norm": 1.916782827578268, + "language_loss": 0.75777483, + "learning_rate": 2.8747036858494795e-06, + "loss": 0.77570236, + "num_input_tokens_seen": 134026875, + "step": 6241, + "time_per_iteration": 2.888333320617676 + }, + { + "auxiliary_loss_clip": 0.01051098, + "auxiliary_loss_mlp": 0.01041702, + "balance_loss_clip": 1.02862692, + "balance_loss_mlp": 1.02649105, + "epoch": 0.3752893431534646, + "flos": 27198095644800.0, + "grad_norm": 1.8596343340533172, + "language_loss": 0.83601189, + "learning_rate": 2.874353430085213e-06, + "loss": 0.85693979, + "num_input_tokens_seen": 134047185, + "step": 6242, + "time_per_iteration": 2.7853450775146484 + }, + { + "auxiliary_loss_clip": 0.01061958, + "auxiliary_loss_mlp": 0.01044295, + "balance_loss_clip": 1.03073955, + "balance_loss_mlp": 1.03062761, + "epoch": 0.3753494664061326, + "flos": 30007674581760.0, + "grad_norm": 2.710902053390422, + "language_loss": 0.6814748, + "learning_rate": 2.8740031411653766e-06, + "loss": 0.7025373, + "num_input_tokens_seen": 134067330, + "step": 6243, + "time_per_iteration": 2.778000831604004 + }, + { + "auxiliary_loss_clip": 0.0100458, + "auxiliary_loss_mlp": 0.00747828, + "balance_loss_clip": 1.02528381, + "balance_loss_mlp": 1.00001001, + "epoch": 0.37540958965880056, + "flos": 24461954064000.0, + "grad_norm": 2.221010558679171, + "language_loss": 0.83549821, + "learning_rate": 2.8736528191032535e-06, + "loss": 0.85302234, + "num_input_tokens_seen": 134085525, + "step": 6244, + "time_per_iteration": 2.8384861946105957 + }, + { + "auxiliary_loss_clip": 0.01020359, + "auxiliary_loss_mlp": 0.01037839, + "balance_loss_clip": 1.02382016, + "balance_loss_mlp": 1.02319384, + "epoch": 0.3754697129114685, + "flos": 16508387295360.0, + "grad_norm": 3.1073190072427765, + "language_loss": 0.83122563, + "learning_rate": 2.8733024639121277e-06, + "loss": 0.85180765, + "num_input_tokens_seen": 134101855, + "step": 6245, + "time_per_iteration": 2.727219343185425 + }, + { + "auxiliary_loss_clip": 0.01039304, + "auxiliary_loss_mlp": 0.01042, + "balance_loss_clip": 1.02414346, + "balance_loss_mlp": 1.02603793, + "epoch": 0.3755298361641365, + "flos": 19390900798080.0, + "grad_norm": 2.2968678309299357, + "language_loss": 0.63984686, + "learning_rate": 2.8729520756052853e-06, + "loss": 0.66065991, + "num_input_tokens_seen": 134119360, + "step": 6246, + "time_per_iteration": 2.644104480743408 + }, + { + "auxiliary_loss_clip": 0.01057906, + "auxiliary_loss_mlp": 0.01040378, + "balance_loss_clip": 1.02988446, + "balance_loss_mlp": 1.02466607, + "epoch": 0.37558995941680445, + "flos": 14720395069440.0, + "grad_norm": 1.6247374166163775, + "language_loss": 0.74861425, + "learning_rate": 2.8726016541960124e-06, + "loss": 0.76959711, + "num_input_tokens_seen": 134137475, + "step": 6247, + "time_per_iteration": 2.661102533340454 + }, + { + "auxiliary_loss_clip": 0.01070125, + "auxiliary_loss_mlp": 0.01034226, + "balance_loss_clip": 1.0281899, + "balance_loss_mlp": 1.0200932, + "epoch": 0.3756500826694724, + "flos": 21689901861120.0, + "grad_norm": 2.985888156888676, + "language_loss": 0.5486356, + "learning_rate": 2.872251199697598e-06, + "loss": 0.56967914, + "num_input_tokens_seen": 134154580, + "step": 6248, + "time_per_iteration": 2.5885581970214844 + }, + { + "auxiliary_loss_clip": 0.01061034, + "auxiliary_loss_mlp": 0.01039288, + "balance_loss_clip": 1.0268445, + "balance_loss_mlp": 1.02431583, + "epoch": 0.3757102059221404, + "flos": 26505666190080.0, + "grad_norm": 1.7210597361812736, + "language_loss": 0.84374005, + "learning_rate": 2.8719007121233297e-06, + "loss": 0.86474329, + "num_input_tokens_seen": 134174285, + "step": 6249, + "time_per_iteration": 2.6065587997436523 + }, + { + "auxiliary_loss_clip": 0.01058892, + "auxiliary_loss_mlp": 0.01034067, + "balance_loss_clip": 1.02920818, + "balance_loss_mlp": 1.02002442, + "epoch": 0.37577032917480835, + "flos": 37338083274240.0, + "grad_norm": 1.8667818394401294, + "language_loss": 0.67745519, + "learning_rate": 2.8715501914864993e-06, + "loss": 0.6983847, + "num_input_tokens_seen": 134195940, + "step": 6250, + "time_per_iteration": 2.7733166217803955 + }, + { + "auxiliary_loss_clip": 0.01060749, + "auxiliary_loss_mlp": 0.01041258, + "balance_loss_clip": 1.02998686, + "balance_loss_mlp": 1.02752507, + "epoch": 0.3758304524274763, + "flos": 21908597817600.0, + "grad_norm": 1.8491331412493413, + "language_loss": 0.78144062, + "learning_rate": 2.8711996378003987e-06, + "loss": 0.80246073, + "num_input_tokens_seen": 134212235, + "step": 6251, + "time_per_iteration": 2.728468418121338 + }, + { + "auxiliary_loss_clip": 0.01071218, + "auxiliary_loss_mlp": 0.01036217, + "balance_loss_clip": 1.03064036, + "balance_loss_mlp": 1.02232265, + "epoch": 0.3758905756801443, + "flos": 36569343375360.0, + "grad_norm": 2.833264174282692, + "language_loss": 0.58418202, + "learning_rate": 2.8708490510783203e-06, + "loss": 0.60525638, + "num_input_tokens_seen": 134233810, + "step": 6252, + "time_per_iteration": 2.7187864780426025 + }, + { + "auxiliary_loss_clip": 0.01061912, + "auxiliary_loss_mlp": 0.01038732, + "balance_loss_clip": 1.02894473, + "balance_loss_mlp": 1.02329469, + "epoch": 0.37595069893281224, + "flos": 24528783317760.0, + "grad_norm": 1.6984135891009675, + "language_loss": 0.89577776, + "learning_rate": 2.8704984313335584e-06, + "loss": 0.91678417, + "num_input_tokens_seen": 134252020, + "step": 6253, + "time_per_iteration": 2.6081466674804688 + }, + { + "auxiliary_loss_clip": 0.01048914, + "auxiliary_loss_mlp": 0.01036151, + "balance_loss_clip": 1.02978456, + "balance_loss_mlp": 1.02268636, + "epoch": 0.3760108221854802, + "flos": 16435021766400.0, + "grad_norm": 1.836393794111943, + "language_loss": 0.76693094, + "learning_rate": 2.8701477785794097e-06, + "loss": 0.7877816, + "num_input_tokens_seen": 134269495, + "step": 6254, + "time_per_iteration": 2.6256964206695557 + }, + { + "auxiliary_loss_clip": 0.01042791, + "auxiliary_loss_mlp": 0.01044321, + "balance_loss_clip": 1.02681518, + "balance_loss_mlp": 1.0284785, + "epoch": 0.37607094543814823, + "flos": 13771742924160.0, + "grad_norm": 3.4840690961533833, + "language_loss": 0.62247115, + "learning_rate": 2.869797092829169e-06, + "loss": 0.64334226, + "num_input_tokens_seen": 134287035, + "step": 6255, + "time_per_iteration": 2.7357661724090576 + }, + { + "auxiliary_loss_clip": 0.01072167, + "auxiliary_loss_mlp": 0.01033437, + "balance_loss_clip": 1.02905822, + "balance_loss_mlp": 1.01820242, + "epoch": 0.3761310686908162, + "flos": 19857918453120.0, + "grad_norm": 5.536321536518941, + "language_loss": 0.74039942, + "learning_rate": 2.869446374096135e-06, + "loss": 0.76145548, + "num_input_tokens_seen": 134304840, + "step": 6256, + "time_per_iteration": 2.5729353427886963 + }, + { + "auxiliary_loss_clip": 0.01070462, + "auxiliary_loss_mlp": 0.01042524, + "balance_loss_clip": 1.02964258, + "balance_loss_mlp": 1.02712226, + "epoch": 0.37619119194348416, + "flos": 12750802657920.0, + "grad_norm": 1.7900072005509662, + "language_loss": 0.70428526, + "learning_rate": 2.8690956223936088e-06, + "loss": 0.72541511, + "num_input_tokens_seen": 134323180, + "step": 6257, + "time_per_iteration": 2.605339765548706 + }, + { + "auxiliary_loss_clip": 0.01052493, + "auxiliary_loss_mlp": 0.0103377, + "balance_loss_clip": 1.02701831, + "balance_loss_mlp": 1.0198462, + "epoch": 0.3762513151961521, + "flos": 17530548624000.0, + "grad_norm": 1.9908816695333924, + "language_loss": 0.84533709, + "learning_rate": 2.868744837734889e-06, + "loss": 0.86619973, + "num_input_tokens_seen": 134341390, + "step": 6258, + "time_per_iteration": 2.751380205154419 + }, + { + "auxiliary_loss_clip": 0.01042187, + "auxiliary_loss_mlp": 0.01040773, + "balance_loss_clip": 1.03026295, + "balance_loss_mlp": 1.0274806, + "epoch": 0.3763114384488201, + "flos": 23617406511360.0, + "grad_norm": 1.472032786623171, + "language_loss": 0.80632192, + "learning_rate": 2.868394020133277e-06, + "loss": 0.82715148, + "num_input_tokens_seen": 134360425, + "step": 6259, + "time_per_iteration": 2.7709786891937256 + }, + { + "auxiliary_loss_clip": 0.01040941, + "auxiliary_loss_mlp": 0.01043829, + "balance_loss_clip": 1.02931786, + "balance_loss_mlp": 1.02812898, + "epoch": 0.37637156170148806, + "flos": 25406978935680.0, + "grad_norm": 1.6905793128436548, + "language_loss": 0.71249753, + "learning_rate": 2.8680431696020783e-06, + "loss": 0.73334521, + "num_input_tokens_seen": 134379775, + "step": 6260, + "time_per_iteration": 2.8043692111968994 + }, + { + "auxiliary_loss_clip": 0.01051485, + "auxiliary_loss_mlp": 0.01033215, + "balance_loss_clip": 1.02629328, + "balance_loss_mlp": 1.01830208, + "epoch": 0.376431684954156, + "flos": 23440906056960.0, + "grad_norm": 2.1823188804263256, + "language_loss": 0.77987099, + "learning_rate": 2.867692286154594e-06, + "loss": 0.80071801, + "num_input_tokens_seen": 134400315, + "step": 6261, + "time_per_iteration": 2.717000961303711 + }, + { + "auxiliary_loss_clip": 0.01056287, + "auxiliary_loss_mlp": 0.01043103, + "balance_loss_clip": 1.02928495, + "balance_loss_mlp": 1.02705693, + "epoch": 0.376491808206824, + "flos": 34204482725760.0, + "grad_norm": 1.69504247732927, + "language_loss": 0.80792081, + "learning_rate": 2.867341369804132e-06, + "loss": 0.82891476, + "num_input_tokens_seen": 134422875, + "step": 6262, + "time_per_iteration": 2.8801019191741943 + }, + { + "auxiliary_loss_clip": 0.01058122, + "auxiliary_loss_mlp": 0.01036769, + "balance_loss_clip": 1.02695417, + "balance_loss_mlp": 1.02215385, + "epoch": 0.37655193145949195, + "flos": 35185669614720.0, + "grad_norm": 2.0915911143276906, + "language_loss": 0.80663544, + "learning_rate": 2.866990420563998e-06, + "loss": 0.82758427, + "num_input_tokens_seen": 134443025, + "step": 6263, + "time_per_iteration": 2.751941442489624 + }, + { + "auxiliary_loss_clip": 0.01084212, + "auxiliary_loss_mlp": 0.01042452, + "balance_loss_clip": 1.03168559, + "balance_loss_mlp": 1.02843261, + "epoch": 0.3766120547121599, + "flos": 16761844638720.0, + "grad_norm": 1.7115616508537168, + "language_loss": 0.79881883, + "learning_rate": 2.866639438447501e-06, + "loss": 0.82008547, + "num_input_tokens_seen": 134460945, + "step": 6264, + "time_per_iteration": 2.7810919284820557 + }, + { + "auxiliary_loss_clip": 0.01078751, + "auxiliary_loss_mlp": 0.01049299, + "balance_loss_clip": 1.02829075, + "balance_loss_mlp": 1.03506577, + "epoch": 0.3766721779648279, + "flos": 23550361776000.0, + "grad_norm": 2.1028028848542846, + "language_loss": 0.73547834, + "learning_rate": 2.8662884234679497e-06, + "loss": 0.75675881, + "num_input_tokens_seen": 134480440, + "step": 6265, + "time_per_iteration": 2.6112022399902344 + }, + { + "auxiliary_loss_clip": 0.01070821, + "auxiliary_loss_mlp": 0.01037791, + "balance_loss_clip": 1.03191936, + "balance_loss_mlp": 1.02569127, + "epoch": 0.37673230121749585, + "flos": 29129191655040.0, + "grad_norm": 1.7282545014836415, + "language_loss": 0.68814051, + "learning_rate": 2.865937375638654e-06, + "loss": 0.70922667, + "num_input_tokens_seen": 134501110, + "step": 6266, + "time_per_iteration": 2.6265487670898438 + }, + { + "auxiliary_loss_clip": 0.01074122, + "auxiliary_loss_mlp": 0.01038857, + "balance_loss_clip": 1.02972138, + "balance_loss_mlp": 1.02364612, + "epoch": 0.3767924244701638, + "flos": 28146783703680.0, + "grad_norm": 6.362673084785264, + "language_loss": 0.62875158, + "learning_rate": 2.8655862949729264e-06, + "loss": 0.64988136, + "num_input_tokens_seen": 134522460, + "step": 6267, + "time_per_iteration": 5.812572479248047 + }, + { + "auxiliary_loss_clip": 0.01005111, + "auxiliary_loss_mlp": 0.01044495, + "balance_loss_clip": 1.00550914, + "balance_loss_mlp": 1.04250455, + "epoch": 0.37685254772283183, + "flos": 60797197526400.0, + "grad_norm": 0.7312313092209486, + "language_loss": 0.58891952, + "learning_rate": 2.8652351814840795e-06, + "loss": 0.60941565, + "num_input_tokens_seen": 134589545, + "step": 6268, + "time_per_iteration": 3.2976245880126953 + }, + { + "auxiliary_loss_clip": 0.01082064, + "auxiliary_loss_mlp": 0.01041554, + "balance_loss_clip": 1.02997684, + "balance_loss_mlp": 1.02644992, + "epoch": 0.3769126709754998, + "flos": 26032543223040.0, + "grad_norm": 1.4437573333905975, + "language_loss": 0.65269578, + "learning_rate": 2.8648840351854283e-06, + "loss": 0.67393196, + "num_input_tokens_seen": 134610550, + "step": 6269, + "time_per_iteration": 2.6561741828918457 + }, + { + "auxiliary_loss_clip": 0.01049892, + "auxiliary_loss_mlp": 0.01033799, + "balance_loss_clip": 1.03136468, + "balance_loss_mlp": 1.01928544, + "epoch": 0.37697279422816776, + "flos": 23579879777280.0, + "grad_norm": 1.588443888361706, + "language_loss": 0.70664108, + "learning_rate": 2.8645328560902874e-06, + "loss": 0.72747797, + "num_input_tokens_seen": 134630485, + "step": 6270, + "time_per_iteration": 2.670687198638916 + }, + { + "auxiliary_loss_clip": 0.01013038, + "auxiliary_loss_mlp": 0.01004623, + "balance_loss_clip": 1.00284314, + "balance_loss_mlp": 1.00275159, + "epoch": 0.3770329174808357, + "flos": 64745935367040.0, + "grad_norm": 2.10239606148246, + "language_loss": 0.56105298, + "learning_rate": 2.8641816442119746e-06, + "loss": 0.58122963, + "num_input_tokens_seen": 134693510, + "step": 6271, + "time_per_iteration": 3.1096439361572266 + }, + { + "auxiliary_loss_clip": 0.01068811, + "auxiliary_loss_mlp": 0.01036435, + "balance_loss_clip": 1.02873874, + "balance_loss_mlp": 1.02180815, + "epoch": 0.3770930407335037, + "flos": 21835304115840.0, + "grad_norm": 1.6340159377467112, + "language_loss": 0.80033886, + "learning_rate": 2.8638303995638066e-06, + "loss": 0.82139134, + "num_input_tokens_seen": 134713115, + "step": 6272, + "time_per_iteration": 2.6810474395751953 + }, + { + "auxiliary_loss_clip": 0.01067165, + "auxiliary_loss_mlp": 0.01029046, + "balance_loss_clip": 1.02921224, + "balance_loss_mlp": 1.01579046, + "epoch": 0.37715316398617166, + "flos": 22747901984640.0, + "grad_norm": 1.5680433010927297, + "language_loss": 0.73744214, + "learning_rate": 2.863479122159103e-06, + "loss": 0.75840425, + "num_input_tokens_seen": 134732635, + "step": 6273, + "time_per_iteration": 2.558950185775757 + }, + { + "auxiliary_loss_clip": 0.0106954, + "auxiliary_loss_mlp": 0.01048012, + "balance_loss_clip": 1.03026891, + "balance_loss_mlp": 1.03460133, + "epoch": 0.3772132872388396, + "flos": 18914581520640.0, + "grad_norm": 1.5568010409358968, + "language_loss": 0.71746063, + "learning_rate": 2.8631278120111858e-06, + "loss": 0.73863614, + "num_input_tokens_seen": 134750695, + "step": 6274, + "time_per_iteration": 2.5470025539398193 + }, + { + "auxiliary_loss_clip": 0.01050931, + "auxiliary_loss_mlp": 0.01037417, + "balance_loss_clip": 1.02866864, + "balance_loss_mlp": 1.02381516, + "epoch": 0.3772734104915076, + "flos": 17346219004800.0, + "grad_norm": 1.6910003088289594, + "language_loss": 0.83941126, + "learning_rate": 2.8627764691333742e-06, + "loss": 0.8602947, + "num_input_tokens_seen": 134768935, + "step": 6275, + "time_per_iteration": 2.5730865001678467 + }, + { + "auxiliary_loss_clip": 0.01024865, + "auxiliary_loss_mlp": 0.01031353, + "balance_loss_clip": 1.02618337, + "balance_loss_mlp": 1.0193305, + "epoch": 0.37733353374417555, + "flos": 32342370785280.0, + "grad_norm": 1.376451980356907, + "language_loss": 0.75672078, + "learning_rate": 2.8624250935389935e-06, + "loss": 0.77728295, + "num_input_tokens_seen": 134791260, + "step": 6276, + "time_per_iteration": 2.8040707111358643 + }, + { + "auxiliary_loss_clip": 0.01056383, + "auxiliary_loss_mlp": 0.01042502, + "balance_loss_clip": 1.02832937, + "balance_loss_mlp": 1.02757132, + "epoch": 0.3773936569968435, + "flos": 23360681030400.0, + "grad_norm": 5.134741886260662, + "language_loss": 0.86227643, + "learning_rate": 2.862073685241366e-06, + "loss": 0.88326526, + "num_input_tokens_seen": 134808350, + "step": 6277, + "time_per_iteration": 2.93776535987854 + }, + { + "auxiliary_loss_clip": 0.01071907, + "auxiliary_loss_mlp": 0.01034202, + "balance_loss_clip": 1.03258824, + "balance_loss_mlp": 1.02182198, + "epoch": 0.3774537802495115, + "flos": 21466788531840.0, + "grad_norm": 2.0474150693927413, + "language_loss": 0.78065157, + "learning_rate": 2.861722244253818e-06, + "loss": 0.80171263, + "num_input_tokens_seen": 134826005, + "step": 6278, + "time_per_iteration": 4.342893838882446 + }, + { + "auxiliary_loss_clip": 0.01056744, + "auxiliary_loss_mlp": 0.01045448, + "balance_loss_clip": 1.03180492, + "balance_loss_mlp": 1.02989078, + "epoch": 0.37751390350217945, + "flos": 24973717086720.0, + "grad_norm": 1.739422809621293, + "language_loss": 0.83079886, + "learning_rate": 2.8613707705896767e-06, + "loss": 0.85182083, + "num_input_tokens_seen": 134844995, + "step": 6279, + "time_per_iteration": 2.866804599761963 + }, + { + "auxiliary_loss_clip": 0.01058681, + "auxiliary_loss_mlp": 0.01033654, + "balance_loss_clip": 1.02898777, + "balance_loss_mlp": 1.02037382, + "epoch": 0.3775740267548474, + "flos": 27819098904960.0, + "grad_norm": 2.898089393658884, + "language_loss": 0.75087333, + "learning_rate": 2.861019264262269e-06, + "loss": 0.7717967, + "num_input_tokens_seen": 134865285, + "step": 6280, + "time_per_iteration": 4.488524436950684 + }, + { + "auxiliary_loss_clip": 0.01077294, + "auxiliary_loss_mlp": 0.01035931, + "balance_loss_clip": 1.02942371, + "balance_loss_mlp": 1.02379584, + "epoch": 0.3776341500075154, + "flos": 22565224391040.0, + "grad_norm": 1.3933624162071903, + "language_loss": 0.76312339, + "learning_rate": 2.8606677252849242e-06, + "loss": 0.78425562, + "num_input_tokens_seen": 134886535, + "step": 6281, + "time_per_iteration": 2.721909761428833 + }, + { + "auxiliary_loss_clip": 0.01048293, + "auxiliary_loss_mlp": 0.01035285, + "balance_loss_clip": 1.02711725, + "balance_loss_mlp": 1.02167106, + "epoch": 0.3776942732601834, + "flos": 23077238808960.0, + "grad_norm": 1.6134496409954857, + "language_loss": 0.84137225, + "learning_rate": 2.860316153670974e-06, + "loss": 0.86220801, + "num_input_tokens_seen": 134907435, + "step": 6282, + "time_per_iteration": 2.6700432300567627 + }, + { + "auxiliary_loss_clip": 0.01067613, + "auxiliary_loss_mlp": 0.01035174, + "balance_loss_clip": 1.02956092, + "balance_loss_mlp": 1.02171493, + "epoch": 0.37775439651285136, + "flos": 21724411852800.0, + "grad_norm": 2.893168625011359, + "language_loss": 0.69756383, + "learning_rate": 2.8599645494337484e-06, + "loss": 0.71859169, + "num_input_tokens_seen": 134925360, + "step": 6283, + "time_per_iteration": 2.577486276626587 + }, + { + "auxiliary_loss_clip": 0.0102225, + "auxiliary_loss_mlp": 0.01052815, + "balance_loss_clip": 1.03310764, + "balance_loss_mlp": 1.03693664, + "epoch": 0.37781451976551933, + "flos": 23987753688960.0, + "grad_norm": 1.720764622119206, + "language_loss": 0.76154417, + "learning_rate": 2.859612912586581e-06, + "loss": 0.78229481, + "num_input_tokens_seen": 134944205, + "step": 6284, + "time_per_iteration": 2.7623798847198486 + }, + { + "auxiliary_loss_clip": 0.01085169, + "auxiliary_loss_mlp": 0.01030141, + "balance_loss_clip": 1.03227258, + "balance_loss_mlp": 1.01622987, + "epoch": 0.3778746430181873, + "flos": 13727967223680.0, + "grad_norm": 2.2071355660962277, + "language_loss": 0.84734452, + "learning_rate": 2.8592612431428055e-06, + "loss": 0.86849761, + "num_input_tokens_seen": 134960255, + "step": 6285, + "time_per_iteration": 2.5225830078125 + }, + { + "auxiliary_loss_clip": 0.01060926, + "auxiliary_loss_mlp": 0.01038942, + "balance_loss_clip": 1.0303762, + "balance_loss_mlp": 1.02490497, + "epoch": 0.37793476627085526, + "flos": 19460495399040.0, + "grad_norm": 2.2430305771781245, + "language_loss": 0.84543049, + "learning_rate": 2.858909541115758e-06, + "loss": 0.86642921, + "num_input_tokens_seen": 134978605, + "step": 6286, + "time_per_iteration": 2.673295259475708 + }, + { + "auxiliary_loss_clip": 0.01068271, + "auxiliary_loss_mlp": 0.01040103, + "balance_loss_clip": 1.03119493, + "balance_loss_mlp": 1.02659011, + "epoch": 0.3779948895235232, + "flos": 10707018704640.0, + "grad_norm": 1.8495107825824944, + "language_loss": 0.81782663, + "learning_rate": 2.858557806518775e-06, + "loss": 0.83891034, + "num_input_tokens_seen": 134995020, + "step": 6287, + "time_per_iteration": 2.6133739948272705 + }, + { + "auxiliary_loss_clip": 0.01060332, + "auxiliary_loss_mlp": 0.01032307, + "balance_loss_clip": 1.02622569, + "balance_loss_mlp": 1.01828837, + "epoch": 0.3780550127761912, + "flos": 22310007281280.0, + "grad_norm": 2.1568721382604727, + "language_loss": 0.73309112, + "learning_rate": 2.8582060393651927e-06, + "loss": 0.75401753, + "num_input_tokens_seen": 135012620, + "step": 6288, + "time_per_iteration": 2.6171817779541016 + }, + { + "auxiliary_loss_clip": 0.01073137, + "auxiliary_loss_mlp": 0.0103433, + "balance_loss_clip": 1.03321528, + "balance_loss_mlp": 1.02034616, + "epoch": 0.37811513602885916, + "flos": 28950644125440.0, + "grad_norm": 1.825360364026772, + "language_loss": 0.75288117, + "learning_rate": 2.857854239668352e-06, + "loss": 0.77395582, + "num_input_tokens_seen": 135033365, + "step": 6289, + "time_per_iteration": 2.6929686069488525 + }, + { + "auxiliary_loss_clip": 0.01067301, + "auxiliary_loss_mlp": 0.01034218, + "balance_loss_clip": 1.02929187, + "balance_loss_mlp": 1.02133155, + "epoch": 0.3781752592815271, + "flos": 23112933949440.0, + "grad_norm": 1.6687263313643153, + "language_loss": 0.7304188, + "learning_rate": 2.857502407441593e-06, + "loss": 0.75143397, + "num_input_tokens_seen": 135052185, + "step": 6290, + "time_per_iteration": 2.658607006072998 + }, + { + "auxiliary_loss_clip": 0.01043974, + "auxiliary_loss_mlp": 0.01036496, + "balance_loss_clip": 1.02666426, + "balance_loss_mlp": 1.02098727, + "epoch": 0.3782353825341951, + "flos": 19755932762880.0, + "grad_norm": 2.915870963062707, + "language_loss": 0.79873025, + "learning_rate": 2.8571505426982566e-06, + "loss": 0.81953502, + "num_input_tokens_seen": 135070425, + "step": 6291, + "time_per_iteration": 2.798386335372925 + }, + { + "auxiliary_loss_clip": 0.01049739, + "auxiliary_loss_mlp": 0.01034813, + "balance_loss_clip": 1.02837586, + "balance_loss_mlp": 1.02054405, + "epoch": 0.37829550578686305, + "flos": 22050839675520.0, + "grad_norm": 1.9110241308684868, + "language_loss": 0.76077247, + "learning_rate": 2.8567986454516854e-06, + "loss": 0.781618, + "num_input_tokens_seen": 135090525, + "step": 6292, + "time_per_iteration": 2.8141157627105713 + }, + { + "auxiliary_loss_clip": 0.01058309, + "auxiliary_loss_mlp": 0.01042749, + "balance_loss_clip": 1.02708936, + "balance_loss_mlp": 1.02813363, + "epoch": 0.378355629039531, + "flos": 16470357770880.0, + "grad_norm": 1.829445787343224, + "language_loss": 0.69556522, + "learning_rate": 2.856446715715224e-06, + "loss": 0.71657574, + "num_input_tokens_seen": 135109575, + "step": 6293, + "time_per_iteration": 2.693983793258667 + }, + { + "auxiliary_loss_clip": 0.01079066, + "auxiliary_loss_mlp": 0.01036135, + "balance_loss_clip": 1.02977109, + "balance_loss_mlp": 1.02218795, + "epoch": 0.378415752292199, + "flos": 19974844200960.0, + "grad_norm": 2.0406508094005655, + "language_loss": 0.71445239, + "learning_rate": 2.8560947535022173e-06, + "loss": 0.73560441, + "num_input_tokens_seen": 135127000, + "step": 6294, + "time_per_iteration": 2.6072962284088135 + }, + { + "auxiliary_loss_clip": 0.01059689, + "auxiliary_loss_mlp": 0.01034096, + "balance_loss_clip": 1.03000546, + "balance_loss_mlp": 1.01885509, + "epoch": 0.378475875544867, + "flos": 14647388676480.0, + "grad_norm": 2.2505389312282693, + "language_loss": 0.82716745, + "learning_rate": 2.855742758826011e-06, + "loss": 0.84810531, + "num_input_tokens_seen": 135145285, + "step": 6295, + "time_per_iteration": 2.6472067832946777 + }, + { + "auxiliary_loss_clip": 0.01058139, + "auxiliary_loss_mlp": 0.01034657, + "balance_loss_clip": 1.02682889, + "balance_loss_mlp": 1.02056634, + "epoch": 0.37853599879753497, + "flos": 26650996617600.0, + "grad_norm": 1.764700175697416, + "language_loss": 0.71997249, + "learning_rate": 2.8553907316999547e-06, + "loss": 0.74090046, + "num_input_tokens_seen": 135165240, + "step": 6296, + "time_per_iteration": 2.75504469871521 + }, + { + "auxiliary_loss_clip": 0.01077343, + "auxiliary_loss_mlp": 0.01041656, + "balance_loss_clip": 1.0300864, + "balance_loss_mlp": 1.02872825, + "epoch": 0.37859612205020293, + "flos": 17311960408320.0, + "grad_norm": 1.9290182694323863, + "language_loss": 0.77352738, + "learning_rate": 2.855038672137396e-06, + "loss": 0.79471743, + "num_input_tokens_seen": 135184045, + "step": 6297, + "time_per_iteration": 2.566714286804199 + }, + { + "auxiliary_loss_clip": 0.01052686, + "auxiliary_loss_mlp": 0.01033047, + "balance_loss_clip": 1.02753282, + "balance_loss_mlp": 1.019719, + "epoch": 0.3786562453028709, + "flos": 18220392299520.0, + "grad_norm": 1.7928796147651198, + "language_loss": 0.79332328, + "learning_rate": 2.854686580151684e-06, + "loss": 0.81418061, + "num_input_tokens_seen": 135202365, + "step": 6298, + "time_per_iteration": 2.5854382514953613 + }, + { + "auxiliary_loss_clip": 0.01018243, + "auxiliary_loss_mlp": 0.01046413, + "balance_loss_clip": 1.02404225, + "balance_loss_mlp": 1.03116608, + "epoch": 0.37871636855553886, + "flos": 21214875473280.0, + "grad_norm": 1.5475758152905599, + "language_loss": 0.84298646, + "learning_rate": 2.8543344557561722e-06, + "loss": 0.86363304, + "num_input_tokens_seen": 135220955, + "step": 6299, + "time_per_iteration": 2.7120726108551025 + }, + { + "auxiliary_loss_clip": 0.01046314, + "auxiliary_loss_mlp": 0.01035482, + "balance_loss_clip": 1.02931583, + "balance_loss_mlp": 1.02137947, + "epoch": 0.3787764918082068, + "flos": 20952727038720.0, + "grad_norm": 2.2721389837980537, + "language_loss": 0.76454973, + "learning_rate": 2.8539822989642116e-06, + "loss": 0.78536773, + "num_input_tokens_seen": 135239715, + "step": 6300, + "time_per_iteration": 2.7369182109832764 + }, + { + "auxiliary_loss_clip": 0.01062522, + "auxiliary_loss_mlp": 0.01035759, + "balance_loss_clip": 1.02872598, + "balance_loss_mlp": 1.02027392, + "epoch": 0.3788366150608748, + "flos": 17308009912320.0, + "grad_norm": 1.9672371764750805, + "language_loss": 0.82378638, + "learning_rate": 2.8536301097891577e-06, + "loss": 0.84476924, + "num_input_tokens_seen": 135257035, + "step": 6301, + "time_per_iteration": 2.7837512493133545 + }, + { + "auxiliary_loss_clip": 0.01068213, + "auxiliary_loss_mlp": 0.01039694, + "balance_loss_clip": 1.02804923, + "balance_loss_mlp": 1.02558565, + "epoch": 0.37889673831354276, + "flos": 24311092942080.0, + "grad_norm": 1.8555690172845751, + "language_loss": 0.67741537, + "learning_rate": 2.8532778882443636e-06, + "loss": 0.69849449, + "num_input_tokens_seen": 135275720, + "step": 6302, + "time_per_iteration": 2.6092660427093506 + }, + { + "auxiliary_loss_clip": 0.01033224, + "auxiliary_loss_mlp": 0.01036619, + "balance_loss_clip": 1.02800274, + "balance_loss_mlp": 1.02347648, + "epoch": 0.3789568615662107, + "flos": 26683603188480.0, + "grad_norm": 1.7884680127619108, + "language_loss": 0.68485421, + "learning_rate": 2.8529256343431867e-06, + "loss": 0.70555264, + "num_input_tokens_seen": 135294140, + "step": 6303, + "time_per_iteration": 2.8145434856414795 + }, + { + "auxiliary_loss_clip": 0.01076719, + "auxiliary_loss_mlp": 0.0103428, + "balance_loss_clip": 1.02804089, + "balance_loss_mlp": 1.02141714, + "epoch": 0.3790169848188787, + "flos": 23585194990080.0, + "grad_norm": 1.6589276815459133, + "language_loss": 0.77753198, + "learning_rate": 2.8525733480989846e-06, + "loss": 0.79864204, + "num_input_tokens_seen": 135314845, + "step": 6304, + "time_per_iteration": 2.7018096446990967 + }, + { + "auxiliary_loss_clip": 0.01086341, + "auxiliary_loss_mlp": 0.01035767, + "balance_loss_clip": 1.03241301, + "balance_loss_mlp": 1.02032983, + "epoch": 0.37907710807154665, + "flos": 18437436230400.0, + "grad_norm": 2.0366559413152605, + "language_loss": 0.79717654, + "learning_rate": 2.8522210295251146e-06, + "loss": 0.81839764, + "num_input_tokens_seen": 135333055, + "step": 6305, + "time_per_iteration": 2.515092372894287 + }, + { + "auxiliary_loss_clip": 0.01006729, + "auxiliary_loss_mlp": 0.01008527, + "balance_loss_clip": 1.00622845, + "balance_loss_mlp": 1.00679827, + "epoch": 0.3791372313242146, + "flos": 50107165954560.0, + "grad_norm": 0.9714747025570976, + "language_loss": 0.64479822, + "learning_rate": 2.8518686786349387e-06, + "loss": 0.66495079, + "num_input_tokens_seen": 135387865, + "step": 6306, + "time_per_iteration": 3.0429394245147705 + }, + { + "auxiliary_loss_clip": 0.0105549, + "auxiliary_loss_mlp": 0.0105131, + "balance_loss_clip": 1.03018737, + "balance_loss_mlp": 1.03557444, + "epoch": 0.3791973545768826, + "flos": 24316551809280.0, + "grad_norm": 1.551275945742537, + "language_loss": 0.73233503, + "learning_rate": 2.851516295441817e-06, + "loss": 0.75340307, + "num_input_tokens_seen": 135409095, + "step": 6307, + "time_per_iteration": 2.708918333053589 + }, + { + "auxiliary_loss_clip": 0.01059382, + "auxiliary_loss_mlp": 0.01037993, + "balance_loss_clip": 1.02967298, + "balance_loss_mlp": 1.02330089, + "epoch": 0.3792574778295506, + "flos": 21579907438080.0, + "grad_norm": 4.509373801226174, + "language_loss": 0.78478545, + "learning_rate": 2.851163879959112e-06, + "loss": 0.80575919, + "num_input_tokens_seen": 135429585, + "step": 6308, + "time_per_iteration": 2.723421573638916 + }, + { + "auxiliary_loss_clip": 0.01045833, + "auxiliary_loss_mlp": 0.01042464, + "balance_loss_clip": 1.02758312, + "balance_loss_mlp": 1.02814651, + "epoch": 0.37931760108221857, + "flos": 22272731942400.0, + "grad_norm": 2.2725024034459786, + "language_loss": 0.7253952, + "learning_rate": 2.8508114322001876e-06, + "loss": 0.74627817, + "num_input_tokens_seen": 135446320, + "step": 6309, + "time_per_iteration": 2.645017147064209 + }, + { + "auxiliary_loss_clip": 0.01018923, + "auxiliary_loss_mlp": 0.01037918, + "balance_loss_clip": 1.02520669, + "balance_loss_mlp": 1.02313566, + "epoch": 0.37937772433488653, + "flos": 19682998197120.0, + "grad_norm": 1.7140733391448584, + "language_loss": 0.78638601, + "learning_rate": 2.8504589521784083e-06, + "loss": 0.80695438, + "num_input_tokens_seen": 135465720, + "step": 6310, + "time_per_iteration": 2.7545578479766846 + }, + { + "auxiliary_loss_clip": 0.01069661, + "auxiliary_loss_mlp": 0.00747655, + "balance_loss_clip": 1.02900708, + "balance_loss_mlp": 1.00000072, + "epoch": 0.3794378475875545, + "flos": 19099378016640.0, + "grad_norm": 2.1204456934241964, + "language_loss": 0.76029944, + "learning_rate": 2.8501064399071403e-06, + "loss": 0.7784726, + "num_input_tokens_seen": 135485155, + "step": 6311, + "time_per_iteration": 2.733278751373291 + }, + { + "auxiliary_loss_clip": 0.01056701, + "auxiliary_loss_mlp": 0.01040075, + "balance_loss_clip": 1.0308342, + "balance_loss_mlp": 1.02626419, + "epoch": 0.37949797084022246, + "flos": 20339660684160.0, + "grad_norm": 1.7428280227627584, + "language_loss": 0.70366323, + "learning_rate": 2.8497538953997504e-06, + "loss": 0.72463101, + "num_input_tokens_seen": 135502675, + "step": 6312, + "time_per_iteration": 2.604933023452759 + }, + { + "auxiliary_loss_clip": 0.00991481, + "auxiliary_loss_mlp": 0.0100213, + "balance_loss_clip": 1.01086283, + "balance_loss_mlp": 1.00050867, + "epoch": 0.37955809409289043, + "flos": 63972203477760.0, + "grad_norm": 0.7818871336526848, + "language_loss": 0.56079757, + "learning_rate": 2.849401318669608e-06, + "loss": 0.58073366, + "num_input_tokens_seen": 135562005, + "step": 6313, + "time_per_iteration": 3.353015184402466 + }, + { + "auxiliary_loss_clip": 0.0104196, + "auxiliary_loss_mlp": 0.0104269, + "balance_loss_clip": 1.02654088, + "balance_loss_mlp": 1.02864146, + "epoch": 0.3796182173455584, + "flos": 31540665179520.0, + "grad_norm": 1.9735441015699051, + "language_loss": 0.71630967, + "learning_rate": 2.849048709730083e-06, + "loss": 0.73715615, + "num_input_tokens_seen": 135582600, + "step": 6314, + "time_per_iteration": 2.8795385360717773 + }, + { + "auxiliary_loss_clip": 0.01075099, + "auxiliary_loss_mlp": 0.01041062, + "balance_loss_clip": 1.03210449, + "balance_loss_mlp": 1.02656031, + "epoch": 0.37967834059822636, + "flos": 12130804978560.0, + "grad_norm": 2.1618884777169862, + "language_loss": 0.7289027, + "learning_rate": 2.848696068594545e-06, + "loss": 0.75006431, + "num_input_tokens_seen": 135600280, + "step": 6315, + "time_per_iteration": 6.153602838516235 + }, + { + "auxiliary_loss_clip": 0.01065812, + "auxiliary_loss_mlp": 0.01038639, + "balance_loss_clip": 1.03030133, + "balance_loss_mlp": 1.02491784, + "epoch": 0.3797384638508943, + "flos": 39348578298240.0, + "grad_norm": 2.026223629527338, + "language_loss": 0.7072351, + "learning_rate": 2.8483433952763677e-06, + "loss": 0.72827959, + "num_input_tokens_seen": 135621560, + "step": 6316, + "time_per_iteration": 2.747326374053955 + }, + { + "auxiliary_loss_clip": 0.01049055, + "auxiliary_loss_mlp": 0.01034144, + "balance_loss_clip": 1.03056216, + "balance_loss_mlp": 1.0215497, + "epoch": 0.3797985871035623, + "flos": 34054016653440.0, + "grad_norm": 2.3194849486982796, + "language_loss": 0.65078294, + "learning_rate": 2.847990689788923e-06, + "loss": 0.67161494, + "num_input_tokens_seen": 135641745, + "step": 6317, + "time_per_iteration": 2.7667038440704346 + }, + { + "auxiliary_loss_clip": 0.01068327, + "auxiliary_loss_mlp": 0.01037622, + "balance_loss_clip": 1.02940583, + "balance_loss_mlp": 1.02501607, + "epoch": 0.37985871035623026, + "flos": 23222174186880.0, + "grad_norm": 2.0126251245675015, + "language_loss": 0.85487533, + "learning_rate": 2.8476379521455877e-06, + "loss": 0.87593484, + "num_input_tokens_seen": 135660650, + "step": 6318, + "time_per_iteration": 2.681173324584961 + }, + { + "auxiliary_loss_clip": 0.01059415, + "auxiliary_loss_mlp": 0.01041264, + "balance_loss_clip": 1.02890491, + "balance_loss_mlp": 1.02637446, + "epoch": 0.3799188336088982, + "flos": 18114958903680.0, + "grad_norm": 1.8670625389537752, + "language_loss": 0.75838637, + "learning_rate": 2.8472851823597354e-06, + "loss": 0.77939314, + "num_input_tokens_seen": 135679980, + "step": 6319, + "time_per_iteration": 2.6379411220550537 + }, + { + "auxiliary_loss_clip": 0.01083338, + "auxiliary_loss_mlp": 0.01040057, + "balance_loss_clip": 1.03280175, + "balance_loss_mlp": 1.02627587, + "epoch": 0.3799789568615662, + "flos": 21871897096320.0, + "grad_norm": 1.5509186487532, + "language_loss": 0.63761818, + "learning_rate": 2.846932380444744e-06, + "loss": 0.6588521, + "num_input_tokens_seen": 135699400, + "step": 6320, + "time_per_iteration": 2.6405279636383057 + }, + { + "auxiliary_loss_clip": 0.01044877, + "auxiliary_loss_mlp": 0.01038314, + "balance_loss_clip": 1.0329051, + "balance_loss_mlp": 1.0241642, + "epoch": 0.3800390801142342, + "flos": 32962943082240.0, + "grad_norm": 2.4665525700019195, + "language_loss": 0.71270561, + "learning_rate": 2.846579546413992e-06, + "loss": 0.73353755, + "num_input_tokens_seen": 135723455, + "step": 6321, + "time_per_iteration": 2.9718544483184814 + }, + { + "auxiliary_loss_clip": 0.01040714, + "auxiliary_loss_mlp": 0.01041984, + "balance_loss_clip": 1.0253787, + "balance_loss_mlp": 1.02818573, + "epoch": 0.38009920336690217, + "flos": 26907075653760.0, + "grad_norm": 3.4640091127552113, + "language_loss": 0.74547631, + "learning_rate": 2.846226680280859e-06, + "loss": 0.7663033, + "num_input_tokens_seen": 135744335, + "step": 6322, + "time_per_iteration": 2.823286533355713 + }, + { + "auxiliary_loss_clip": 0.0106965, + "auxiliary_loss_mlp": 0.01039542, + "balance_loss_clip": 1.02916479, + "balance_loss_mlp": 1.02539182, + "epoch": 0.38015932661957014, + "flos": 22488913946880.0, + "grad_norm": 2.1183556091386624, + "language_loss": 0.8496052, + "learning_rate": 2.845873782058725e-06, + "loss": 0.87069714, + "num_input_tokens_seen": 135761440, + "step": 6323, + "time_per_iteration": 2.665842294692993 + }, + { + "auxiliary_loss_clip": 0.01055979, + "auxiliary_loss_mlp": 0.01037488, + "balance_loss_clip": 1.02800608, + "balance_loss_mlp": 1.0218482, + "epoch": 0.3802194498722381, + "flos": 21980993679360.0, + "grad_norm": 2.5859700030129797, + "language_loss": 0.73386759, + "learning_rate": 2.845520851760973e-06, + "loss": 0.75480223, + "num_input_tokens_seen": 135779955, + "step": 6324, + "time_per_iteration": 2.61995530128479 + }, + { + "auxiliary_loss_clip": 0.01054735, + "auxiliary_loss_mlp": 0.01037912, + "balance_loss_clip": 1.03197193, + "balance_loss_mlp": 1.02274823, + "epoch": 0.38027957312490607, + "flos": 21324869896320.0, + "grad_norm": 1.73118297399414, + "language_loss": 0.84344363, + "learning_rate": 2.8451678894009847e-06, + "loss": 0.86437011, + "num_input_tokens_seen": 135799840, + "step": 6325, + "time_per_iteration": 4.3386390209198 + }, + { + "auxiliary_loss_clip": 0.01061294, + "auxiliary_loss_mlp": 0.01032933, + "balance_loss_clip": 1.03159642, + "balance_loss_mlp": 1.01993275, + "epoch": 0.38033969637757403, + "flos": 16691244456960.0, + "grad_norm": 1.8795913971079548, + "language_loss": 0.79280853, + "learning_rate": 2.8448148949921465e-06, + "loss": 0.81375086, + "num_input_tokens_seen": 135817880, + "step": 6326, + "time_per_iteration": 2.6833996772766113 + }, + { + "auxiliary_loss_clip": 0.01069735, + "auxiliary_loss_mlp": 0.01037482, + "balance_loss_clip": 1.02985644, + "balance_loss_mlp": 1.0241549, + "epoch": 0.380399819630242, + "flos": 36210847685760.0, + "grad_norm": 1.7896658224814803, + "language_loss": 0.72882581, + "learning_rate": 2.844461868547842e-06, + "loss": 0.74989796, + "num_input_tokens_seen": 135838940, + "step": 6327, + "time_per_iteration": 4.417211294174194 + }, + { + "auxiliary_loss_clip": 0.01081358, + "auxiliary_loss_mlp": 0.00747666, + "balance_loss_clip": 1.0316515, + "balance_loss_mlp": 1.00005865, + "epoch": 0.38045994288290996, + "flos": 21288851533440.0, + "grad_norm": 1.5822357643994345, + "language_loss": 0.82948923, + "learning_rate": 2.844108810081459e-06, + "loss": 0.84777945, + "num_input_tokens_seen": 135858325, + "step": 6328, + "time_per_iteration": 2.5649659633636475 + }, + { + "auxiliary_loss_clip": 0.01067408, + "auxiliary_loss_mlp": 0.01030469, + "balance_loss_clip": 1.02911174, + "balance_loss_mlp": 1.01699853, + "epoch": 0.38052006613557793, + "flos": 20922885815040.0, + "grad_norm": 1.4230143623711855, + "language_loss": 0.61257124, + "learning_rate": 2.843755719606385e-06, + "loss": 0.63355005, + "num_input_tokens_seen": 135878430, + "step": 6329, + "time_per_iteration": 2.570798873901367 + }, + { + "auxiliary_loss_clip": 0.01047433, + "auxiliary_loss_mlp": 0.01041367, + "balance_loss_clip": 1.02736509, + "balance_loss_mlp": 1.02734232, + "epoch": 0.3805801893882459, + "flos": 20990720649600.0, + "grad_norm": 1.988057703363336, + "language_loss": 0.55814147, + "learning_rate": 2.8434025971360104e-06, + "loss": 0.5790295, + "num_input_tokens_seen": 135894755, + "step": 6330, + "time_per_iteration": 2.5977365970611572 + }, + { + "auxiliary_loss_clip": 0.01050842, + "auxiliary_loss_mlp": 0.01036064, + "balance_loss_clip": 1.03524685, + "balance_loss_mlp": 1.02270079, + "epoch": 0.38064031264091386, + "flos": 25558594243200.0, + "grad_norm": 1.4580492481554943, + "language_loss": 0.66310745, + "learning_rate": 2.8430494426837243e-06, + "loss": 0.68397653, + "num_input_tokens_seen": 135918275, + "step": 6331, + "time_per_iteration": 2.7975263595581055 + }, + { + "auxiliary_loss_clip": 0.0107373, + "auxiliary_loss_mlp": 0.01042699, + "balance_loss_clip": 1.03464401, + "balance_loss_mlp": 1.02766061, + "epoch": 0.3807004358935818, + "flos": 15085857997440.0, + "grad_norm": 1.5939646726949193, + "language_loss": 0.75929725, + "learning_rate": 2.842696256262919e-06, + "loss": 0.78046149, + "num_input_tokens_seen": 135937430, + "step": 6332, + "time_per_iteration": 2.706604242324829 + }, + { + "auxiliary_loss_clip": 0.01026067, + "auxiliary_loss_mlp": 0.00747907, + "balance_loss_clip": 1.02891195, + "balance_loss_mlp": 1.00002599, + "epoch": 0.3807605591462498, + "flos": 16399398453120.0, + "grad_norm": 1.7141095510631637, + "language_loss": 0.81431448, + "learning_rate": 2.842343037886987e-06, + "loss": 0.83205426, + "num_input_tokens_seen": 135954210, + "step": 6333, + "time_per_iteration": 2.8336198329925537 + }, + { + "auxiliary_loss_clip": 0.0106821, + "auxiliary_loss_mlp": 0.0103316, + "balance_loss_clip": 1.02854228, + "balance_loss_mlp": 1.01951671, + "epoch": 0.3808206823989178, + "flos": 29057083102080.0, + "grad_norm": 1.820929181585814, + "language_loss": 0.86157244, + "learning_rate": 2.8419897875693226e-06, + "loss": 0.88258618, + "num_input_tokens_seen": 135974425, + "step": 6334, + "time_per_iteration": 2.7693827152252197 + }, + { + "auxiliary_loss_clip": 0.01071836, + "auxiliary_loss_mlp": 0.01034049, + "balance_loss_clip": 1.03016162, + "balance_loss_mlp": 1.02005994, + "epoch": 0.3808808056515858, + "flos": 15705855676800.0, + "grad_norm": 2.5841890949142403, + "language_loss": 0.78901672, + "learning_rate": 2.841636505323321e-06, + "loss": 0.81007564, + "num_input_tokens_seen": 135991985, + "step": 6335, + "time_per_iteration": 2.8596417903900146 + }, + { + "auxiliary_loss_clip": 0.01071304, + "auxiliary_loss_mlp": 0.01034404, + "balance_loss_clip": 1.03044486, + "balance_loss_mlp": 1.02002692, + "epoch": 0.38094092890425374, + "flos": 20704584908160.0, + "grad_norm": 1.7299524945209435, + "language_loss": 0.72833449, + "learning_rate": 2.8412831911623795e-06, + "loss": 0.74939156, + "num_input_tokens_seen": 136010015, + "step": 6336, + "time_per_iteration": 2.706953287124634 + }, + { + "auxiliary_loss_clip": 0.01067902, + "auxiliary_loss_mlp": 0.01028814, + "balance_loss_clip": 1.02919602, + "balance_loss_mlp": 1.01582062, + "epoch": 0.3810010521569217, + "flos": 20667956014080.0, + "grad_norm": 1.8621158060258605, + "language_loss": 0.6932919, + "learning_rate": 2.840929845099894e-06, + "loss": 0.71425903, + "num_input_tokens_seen": 136028440, + "step": 6337, + "time_per_iteration": 2.6289288997650146 + }, + { + "auxiliary_loss_clip": 0.01053685, + "auxiliary_loss_mlp": 0.01033287, + "balance_loss_clip": 1.02752948, + "balance_loss_mlp": 1.01906502, + "epoch": 0.38106117540958967, + "flos": 31827626933760.0, + "grad_norm": 1.552103371253741, + "language_loss": 0.63576281, + "learning_rate": 2.8405764671492652e-06, + "loss": 0.65663254, + "num_input_tokens_seen": 136048360, + "step": 6338, + "time_per_iteration": 2.7659432888031006 + }, + { + "auxiliary_loss_clip": 0.01058418, + "auxiliary_loss_mlp": 0.0104079, + "balance_loss_clip": 1.02951634, + "balance_loss_mlp": 1.02572191, + "epoch": 0.38112129866225763, + "flos": 16902757693440.0, + "grad_norm": 2.296528275169815, + "language_loss": 0.69281781, + "learning_rate": 2.8402230573238923e-06, + "loss": 0.71380997, + "num_input_tokens_seen": 136065500, + "step": 6339, + "time_per_iteration": 2.638472557067871 + }, + { + "auxiliary_loss_clip": 0.01052908, + "auxiliary_loss_mlp": 0.0103738, + "balance_loss_clip": 1.02846026, + "balance_loss_mlp": 1.02280068, + "epoch": 0.3811814219149256, + "flos": 20887226588160.0, + "grad_norm": 2.7065171650420194, + "language_loss": 0.68158579, + "learning_rate": 2.839869615637177e-06, + "loss": 0.70248872, + "num_input_tokens_seen": 136084060, + "step": 6340, + "time_per_iteration": 2.629420518875122 + }, + { + "auxiliary_loss_clip": 0.01052579, + "auxiliary_loss_mlp": 0.01038498, + "balance_loss_clip": 1.03050315, + "balance_loss_mlp": 1.02373362, + "epoch": 0.38124154516759357, + "flos": 16690813493760.0, + "grad_norm": 2.1294019687041796, + "language_loss": 0.89965993, + "learning_rate": 2.839516142102522e-06, + "loss": 0.92057067, + "num_input_tokens_seen": 136102310, + "step": 6341, + "time_per_iteration": 2.7632267475128174 + }, + { + "auxiliary_loss_clip": 0.01075239, + "auxiliary_loss_mlp": 0.01037048, + "balance_loss_clip": 1.03281736, + "balance_loss_mlp": 1.02215838, + "epoch": 0.38130166842026153, + "flos": 19681956702720.0, + "grad_norm": 1.6132945947623918, + "language_loss": 0.75033039, + "learning_rate": 2.83916263673333e-06, + "loss": 0.77145326, + "num_input_tokens_seen": 136120725, + "step": 6342, + "time_per_iteration": 2.5570054054260254 + }, + { + "auxiliary_loss_clip": 0.0105774, + "auxiliary_loss_mlp": 0.01032712, + "balance_loss_clip": 1.02844858, + "balance_loss_mlp": 1.01915836, + "epoch": 0.3813617916729295, + "flos": 22198432659840.0, + "grad_norm": 3.403727166095157, + "language_loss": 0.83399475, + "learning_rate": 2.838809099543007e-06, + "loss": 0.85489929, + "num_input_tokens_seen": 136139105, + "step": 6343, + "time_per_iteration": 2.5922868251800537 + }, + { + "auxiliary_loss_clip": 0.01023068, + "auxiliary_loss_mlp": 0.01041819, + "balance_loss_clip": 1.02924097, + "balance_loss_mlp": 1.02708459, + "epoch": 0.38142191492559746, + "flos": 19096899978240.0, + "grad_norm": 2.0815504437738626, + "language_loss": 0.76881748, + "learning_rate": 2.838455530544959e-06, + "loss": 0.78946632, + "num_input_tokens_seen": 136158265, + "step": 6344, + "time_per_iteration": 2.773075580596924 + }, + { + "auxiliary_loss_clip": 0.01047019, + "auxiliary_loss_mlp": 0.01046959, + "balance_loss_clip": 1.0308919, + "balance_loss_mlp": 1.03144979, + "epoch": 0.3814820381782654, + "flos": 24097748112000.0, + "grad_norm": 1.9797134888608265, + "language_loss": 0.7319324, + "learning_rate": 2.838101929752593e-06, + "loss": 0.75287223, + "num_input_tokens_seen": 136176100, + "step": 6345, + "time_per_iteration": 2.6831777095794678 + }, + { + "auxiliary_loss_clip": 0.01048067, + "auxiliary_loss_mlp": 0.00747664, + "balance_loss_clip": 1.02926588, + "balance_loss_mlp": 1.00006318, + "epoch": 0.3815421614309334, + "flos": 15778502933760.0, + "grad_norm": 1.7972610436910894, + "language_loss": 0.69682962, + "learning_rate": 2.8377482971793187e-06, + "loss": 0.71478689, + "num_input_tokens_seen": 136195125, + "step": 6346, + "time_per_iteration": 2.612992763519287 + }, + { + "auxiliary_loss_clip": 0.01065631, + "auxiliary_loss_mlp": 0.01036122, + "balance_loss_clip": 1.02854514, + "balance_loss_mlp": 1.0219357, + "epoch": 0.38160228468360136, + "flos": 19899754819200.0, + "grad_norm": 1.7663676871432779, + "language_loss": 0.75598526, + "learning_rate": 2.8373946328385437e-06, + "loss": 0.77700281, + "num_input_tokens_seen": 136213885, + "step": 6347, + "time_per_iteration": 2.6135494709014893 + }, + { + "auxiliary_loss_clip": 0.01071238, + "auxiliary_loss_mlp": 0.01038606, + "balance_loss_clip": 1.02989268, + "balance_loss_mlp": 1.02584505, + "epoch": 0.3816624079362694, + "flos": 19281050029440.0, + "grad_norm": 1.9783547437884899, + "language_loss": 0.74224627, + "learning_rate": 2.8370409367436813e-06, + "loss": 0.76334471, + "num_input_tokens_seen": 136232700, + "step": 6348, + "time_per_iteration": 2.8288981914520264 + }, + { + "auxiliary_loss_clip": 0.01060852, + "auxiliary_loss_mlp": 0.01033893, + "balance_loss_clip": 1.0308044, + "balance_loss_mlp": 1.01995707, + "epoch": 0.38172253118893734, + "flos": 21177564220800.0, + "grad_norm": 1.7926155653086564, + "language_loss": 0.87288523, + "learning_rate": 2.836687208908142e-06, + "loss": 0.89383256, + "num_input_tokens_seen": 136248975, + "step": 6349, + "time_per_iteration": 2.721256971359253 + }, + { + "auxiliary_loss_clip": 0.0106834, + "auxiliary_loss_mlp": 0.01035802, + "balance_loss_clip": 1.02898622, + "balance_loss_mlp": 1.02206361, + "epoch": 0.3817826544416053, + "flos": 17529219820800.0, + "grad_norm": 1.874440633936203, + "language_loss": 0.76746023, + "learning_rate": 2.836333449345341e-06, + "loss": 0.78850162, + "num_input_tokens_seen": 136266710, + "step": 6350, + "time_per_iteration": 2.5822691917419434 + }, + { + "auxiliary_loss_clip": 0.01050591, + "auxiliary_loss_mlp": 0.01034783, + "balance_loss_clip": 1.029338, + "balance_loss_mlp": 1.0200969, + "epoch": 0.38184277769427327, + "flos": 16326535714560.0, + "grad_norm": 2.3810301535031244, + "language_loss": 0.75717634, + "learning_rate": 2.8359796580686907e-06, + "loss": 0.7780301, + "num_input_tokens_seen": 136284445, + "step": 6351, + "time_per_iteration": 2.821012496948242 + }, + { + "auxiliary_loss_clip": 0.01064673, + "auxiliary_loss_mlp": 0.01037745, + "balance_loss_clip": 1.0269978, + "balance_loss_mlp": 1.02224815, + "epoch": 0.38190290094694124, + "flos": 30443450382720.0, + "grad_norm": 2.9181590601717318, + "language_loss": 0.74563122, + "learning_rate": 2.8356258350916085e-06, + "loss": 0.76665545, + "num_input_tokens_seen": 136305730, + "step": 6352, + "time_per_iteration": 2.745307445526123 + }, + { + "auxiliary_loss_clip": 0.01040477, + "auxiliary_loss_mlp": 0.01035333, + "balance_loss_clip": 1.02688956, + "balance_loss_mlp": 1.02258921, + "epoch": 0.3819630241996092, + "flos": 14209924936320.0, + "grad_norm": 1.7438892242366235, + "language_loss": 0.64252627, + "learning_rate": 2.8352719804275104e-06, + "loss": 0.66328442, + "num_input_tokens_seen": 136323850, + "step": 6353, + "time_per_iteration": 2.8199901580810547 + }, + { + "auxiliary_loss_clip": 0.01080467, + "auxiliary_loss_mlp": 0.01036591, + "balance_loss_clip": 1.03057694, + "balance_loss_mlp": 1.02267313, + "epoch": 0.38202314745227717, + "flos": 25009699536000.0, + "grad_norm": 3.128176020412398, + "language_loss": 0.83073181, + "learning_rate": 2.834918094089816e-06, + "loss": 0.85190237, + "num_input_tokens_seen": 136344880, + "step": 6354, + "time_per_iteration": 2.621009588241577 + }, + { + "auxiliary_loss_clip": 0.01078393, + "auxiliary_loss_mlp": 0.01032578, + "balance_loss_clip": 1.02983284, + "balance_loss_mlp": 1.01995325, + "epoch": 0.38208327070494513, + "flos": 20814507504000.0, + "grad_norm": 3.0853578884181587, + "language_loss": 0.80534464, + "learning_rate": 2.834564176091943e-06, + "loss": 0.82645434, + "num_input_tokens_seen": 136366060, + "step": 6355, + "time_per_iteration": 2.6091341972351074 + }, + { + "auxiliary_loss_clip": 0.01037995, + "auxiliary_loss_mlp": 0.01036168, + "balance_loss_clip": 1.02744079, + "balance_loss_mlp": 1.02261353, + "epoch": 0.3821433939576131, + "flos": 22637727993600.0, + "grad_norm": 1.8124294097118403, + "language_loss": 0.74925101, + "learning_rate": 2.8342102264473125e-06, + "loss": 0.76999265, + "num_input_tokens_seen": 136385625, + "step": 6356, + "time_per_iteration": 2.7650158405303955 + }, + { + "auxiliary_loss_clip": 0.01070378, + "auxiliary_loss_mlp": 0.00747793, + "balance_loss_clip": 1.03041625, + "balance_loss_mlp": 1.00010014, + "epoch": 0.38220351721028106, + "flos": 26869872142080.0, + "grad_norm": 2.041477672462455, + "language_loss": 0.81249034, + "learning_rate": 2.833856245169348e-06, + "loss": 0.83067209, + "num_input_tokens_seen": 136405750, + "step": 6357, + "time_per_iteration": 2.708040952682495 + }, + { + "auxiliary_loss_clip": 0.01068454, + "auxiliary_loss_mlp": 0.01048859, + "balance_loss_clip": 1.0368495, + "balance_loss_mlp": 1.03317153, + "epoch": 0.38226364046294903, + "flos": 23367468700800.0, + "grad_norm": 1.6063188026354493, + "language_loss": 0.77328402, + "learning_rate": 2.8335022322714695e-06, + "loss": 0.7944572, + "num_input_tokens_seen": 136426085, + "step": 6358, + "time_per_iteration": 2.642169952392578 + }, + { + "auxiliary_loss_clip": 0.0105935, + "auxiliary_loss_mlp": 0.01042246, + "balance_loss_clip": 1.02800477, + "balance_loss_mlp": 1.02789903, + "epoch": 0.382323763715617, + "flos": 19646225648640.0, + "grad_norm": 2.5975926691335154, + "language_loss": 0.79076868, + "learning_rate": 2.8331481877671036e-06, + "loss": 0.81178463, + "num_input_tokens_seen": 136442670, + "step": 6359, + "time_per_iteration": 2.7388429641723633 + }, + { + "auxiliary_loss_clip": 0.01012171, + "auxiliary_loss_mlp": 0.01047226, + "balance_loss_clip": 1.02512836, + "balance_loss_mlp": 1.03211033, + "epoch": 0.38238388696828496, + "flos": 54124741232640.0, + "grad_norm": 1.6463433974808361, + "language_loss": 0.69629812, + "learning_rate": 2.8327941116696754e-06, + "loss": 0.71689206, + "num_input_tokens_seen": 136465730, + "step": 6360, + "time_per_iteration": 3.045283555984497 + }, + { + "auxiliary_loss_clip": 0.01049458, + "auxiliary_loss_mlp": 0.01033272, + "balance_loss_clip": 1.02802324, + "balance_loss_mlp": 1.01828778, + "epoch": 0.382444010220953, + "flos": 24936190352640.0, + "grad_norm": 1.849686650804897, + "language_loss": 0.78954411, + "learning_rate": 2.83244000399261e-06, + "loss": 0.8103714, + "num_input_tokens_seen": 136487215, + "step": 6361, + "time_per_iteration": 2.7473440170288086 + }, + { + "auxiliary_loss_clip": 0.01056252, + "auxiliary_loss_mlp": 0.01038839, + "balance_loss_clip": 1.02796388, + "balance_loss_mlp": 1.02556467, + "epoch": 0.38250413347362094, + "flos": 42337351209600.0, + "grad_norm": 1.4185859436678459, + "language_loss": 0.65369809, + "learning_rate": 2.832085864749337e-06, + "loss": 0.674649, + "num_input_tokens_seen": 136510365, + "step": 6362, + "time_per_iteration": 5.975184440612793 + }, + { + "auxiliary_loss_clip": 0.01078439, + "auxiliary_loss_mlp": 0.01032242, + "balance_loss_clip": 1.02927351, + "balance_loss_mlp": 1.01753128, + "epoch": 0.3825642567262889, + "flos": 16289224462080.0, + "grad_norm": 1.6575166106331793, + "language_loss": 0.81793034, + "learning_rate": 2.8317316939532848e-06, + "loss": 0.83903718, + "num_input_tokens_seen": 136527100, + "step": 6363, + "time_per_iteration": 2.5311343669891357 + }, + { + "auxiliary_loss_clip": 0.01030545, + "auxiliary_loss_mlp": 0.01039605, + "balance_loss_clip": 1.02883768, + "balance_loss_mlp": 1.02585459, + "epoch": 0.3826243799789569, + "flos": 45654778586880.0, + "grad_norm": 1.652449992221416, + "language_loss": 0.58614177, + "learning_rate": 2.8313774916178825e-06, + "loss": 0.60684329, + "num_input_tokens_seen": 136550870, + "step": 6364, + "time_per_iteration": 2.8694405555725098 + }, + { + "auxiliary_loss_clip": 0.01062288, + "auxiliary_loss_mlp": 0.01037666, + "balance_loss_clip": 1.0302738, + "balance_loss_mlp": 1.02319396, + "epoch": 0.38268450323162484, + "flos": 25301581453440.0, + "grad_norm": 1.7430912062194803, + "language_loss": 0.68596804, + "learning_rate": 2.8310232577565635e-06, + "loss": 0.70696759, + "num_input_tokens_seen": 136569895, + "step": 6365, + "time_per_iteration": 2.6211516857147217 + }, + { + "auxiliary_loss_clip": 0.01072858, + "auxiliary_loss_mlp": 0.01037255, + "balance_loss_clip": 1.03108549, + "balance_loss_mlp": 1.02214479, + "epoch": 0.3827446264842928, + "flos": 21836022387840.0, + "grad_norm": 2.1998848311162993, + "language_loss": 0.73045552, + "learning_rate": 2.830668992382758e-06, + "loss": 0.75155663, + "num_input_tokens_seen": 136588585, + "step": 6366, + "time_per_iteration": 2.6241610050201416 + }, + { + "auxiliary_loss_clip": 0.01060179, + "auxiliary_loss_mlp": 0.0104047, + "balance_loss_clip": 1.02911973, + "balance_loss_mlp": 1.02542019, + "epoch": 0.38280474973696077, + "flos": 25734591907200.0, + "grad_norm": 1.9495612604984642, + "language_loss": 0.68746829, + "learning_rate": 2.830314695509902e-06, + "loss": 0.70847476, + "num_input_tokens_seen": 136606640, + "step": 6367, + "time_per_iteration": 2.6354684829711914 + }, + { + "auxiliary_loss_clip": 0.01070139, + "auxiliary_loss_mlp": 0.01034657, + "balance_loss_clip": 1.03077221, + "balance_loss_mlp": 1.02143669, + "epoch": 0.38286487298962874, + "flos": 24895934184960.0, + "grad_norm": 1.7944103744731947, + "language_loss": 0.63931918, + "learning_rate": 2.82996036715143e-06, + "loss": 0.66036713, + "num_input_tokens_seen": 136624940, + "step": 6368, + "time_per_iteration": 2.6078624725341797 + }, + { + "auxiliary_loss_clip": 0.0108131, + "auxiliary_loss_mlp": 0.01036275, + "balance_loss_clip": 1.03055608, + "balance_loss_mlp": 1.02233934, + "epoch": 0.3829249962422967, + "flos": 28543703967360.0, + "grad_norm": 1.3066009226896396, + "language_loss": 0.68170238, + "learning_rate": 2.8296060073207763e-06, + "loss": 0.70287824, + "num_input_tokens_seen": 136645540, + "step": 6369, + "time_per_iteration": 2.5788893699645996 + }, + { + "auxiliary_loss_clip": 0.01030501, + "auxiliary_loss_mlp": 0.01036402, + "balance_loss_clip": 1.0259378, + "balance_loss_mlp": 1.022192, + "epoch": 0.38298511949496467, + "flos": 21471205904640.0, + "grad_norm": 1.6453788778100857, + "language_loss": 0.78689814, + "learning_rate": 2.8292516160313804e-06, + "loss": 0.80756712, + "num_input_tokens_seen": 136664530, + "step": 6370, + "time_per_iteration": 2.8820114135742188 + }, + { + "auxiliary_loss_clip": 0.01062722, + "auxiliary_loss_mlp": 0.01041712, + "balance_loss_clip": 1.0296514, + "balance_loss_mlp": 1.02678728, + "epoch": 0.38304524274763263, + "flos": 31679998035840.0, + "grad_norm": 2.4961124424602463, + "language_loss": 0.64423323, + "learning_rate": 2.8288971932966805e-06, + "loss": 0.6652776, + "num_input_tokens_seen": 136682315, + "step": 6371, + "time_per_iteration": 2.6179986000061035 + }, + { + "auxiliary_loss_clip": 0.01054353, + "auxiliary_loss_mlp": 0.01036315, + "balance_loss_clip": 1.03043485, + "balance_loss_mlp": 1.02125835, + "epoch": 0.3831053660003006, + "flos": 25076816098560.0, + "grad_norm": 1.826088051809493, + "language_loss": 0.72807384, + "learning_rate": 2.8285427391301155e-06, + "loss": 0.74898046, + "num_input_tokens_seen": 136701185, + "step": 6372, + "time_per_iteration": 4.291127920150757 + }, + { + "auxiliary_loss_clip": 0.01071901, + "auxiliary_loss_mlp": 0.01035033, + "balance_loss_clip": 1.03041625, + "balance_loss_mlp": 1.02142572, + "epoch": 0.38316548925296856, + "flos": 23259018562560.0, + "grad_norm": 1.7496156997488408, + "language_loss": 0.84619528, + "learning_rate": 2.8281882535451266e-06, + "loss": 0.86726463, + "num_input_tokens_seen": 136721265, + "step": 6373, + "time_per_iteration": 4.3175249099731445 + }, + { + "auxiliary_loss_clip": 0.01028631, + "auxiliary_loss_mlp": 0.01042644, + "balance_loss_clip": 1.02548742, + "balance_loss_mlp": 1.02757609, + "epoch": 0.3832256125056366, + "flos": 34423465991040.0, + "grad_norm": 1.8832503062836383, + "language_loss": 0.74791956, + "learning_rate": 2.8278337365551567e-06, + "loss": 0.76863229, + "num_input_tokens_seen": 136741885, + "step": 6374, + "time_per_iteration": 2.896934747695923 + }, + { + "auxiliary_loss_clip": 0.01069945, + "auxiliary_loss_mlp": 0.01041189, + "balance_loss_clip": 1.03169966, + "balance_loss_mlp": 1.02640104, + "epoch": 0.38328573575830455, + "flos": 21762764599680.0, + "grad_norm": 2.423626129887849, + "language_loss": 0.76058549, + "learning_rate": 2.8274791881736485e-06, + "loss": 0.78169686, + "num_input_tokens_seen": 136760905, + "step": 6375, + "time_per_iteration": 2.8018345832824707 + }, + { + "auxiliary_loss_clip": 0.01070679, + "auxiliary_loss_mlp": 0.01036618, + "balance_loss_clip": 1.03041637, + "balance_loss_mlp": 1.02282584, + "epoch": 0.3833458590109725, + "flos": 17380010724480.0, + "grad_norm": 1.8684592125553188, + "language_loss": 0.7246362, + "learning_rate": 2.8271246084140457e-06, + "loss": 0.74570918, + "num_input_tokens_seen": 136777240, + "step": 6376, + "time_per_iteration": 2.6353797912597656 + }, + { + "auxiliary_loss_clip": 0.01070122, + "auxiliary_loss_mlp": 0.0103924, + "balance_loss_clip": 1.03062642, + "balance_loss_mlp": 1.02482152, + "epoch": 0.3834059822636405, + "flos": 29424557191680.0, + "grad_norm": 2.021761666000594, + "language_loss": 0.67690319, + "learning_rate": 2.826769997289796e-06, + "loss": 0.69799685, + "num_input_tokens_seen": 136801040, + "step": 6377, + "time_per_iteration": 2.8087456226348877 + }, + { + "auxiliary_loss_clip": 0.01054612, + "auxiliary_loss_mlp": 0.01041914, + "balance_loss_clip": 1.03215706, + "balance_loss_mlp": 1.02679253, + "epoch": 0.38346610551630844, + "flos": 21470739027840.0, + "grad_norm": 1.8828216788476906, + "language_loss": 0.73088121, + "learning_rate": 2.826415354814344e-06, + "loss": 0.75184655, + "num_input_tokens_seen": 136819495, + "step": 6378, + "time_per_iteration": 2.805037498474121 + }, + { + "auxiliary_loss_clip": 0.01035606, + "auxiliary_loss_mlp": 0.01039203, + "balance_loss_clip": 1.03027511, + "balance_loss_mlp": 1.02539217, + "epoch": 0.3835262287689764, + "flos": 27561224188800.0, + "grad_norm": 1.7124481938508636, + "language_loss": 0.69252539, + "learning_rate": 2.8260606810011396e-06, + "loss": 0.71327353, + "num_input_tokens_seen": 136838840, + "step": 6379, + "time_per_iteration": 2.885718822479248 + }, + { + "auxiliary_loss_clip": 0.01069939, + "auxiliary_loss_mlp": 0.01038718, + "balance_loss_clip": 1.03087115, + "balance_loss_mlp": 1.02460361, + "epoch": 0.3835863520216444, + "flos": 15523716787200.0, + "grad_norm": 1.7523851101285866, + "language_loss": 0.83193254, + "learning_rate": 2.8257059758636315e-06, + "loss": 0.85301912, + "num_input_tokens_seen": 136854425, + "step": 6380, + "time_per_iteration": 2.698878765106201 + }, + { + "auxiliary_loss_clip": 0.01078871, + "auxiliary_loss_mlp": 0.01031548, + "balance_loss_clip": 1.03090036, + "balance_loss_mlp": 1.01853669, + "epoch": 0.38364647527431234, + "flos": 21904934630400.0, + "grad_norm": 1.4643925217940128, + "language_loss": 0.81171811, + "learning_rate": 2.8253512394152697e-06, + "loss": 0.83282232, + "num_input_tokens_seen": 136874355, + "step": 6381, + "time_per_iteration": 2.650463819503784 + }, + { + "auxiliary_loss_clip": 0.01013435, + "auxiliary_loss_mlp": 0.01002981, + "balance_loss_clip": 1.00325751, + "balance_loss_mlp": 1.00134742, + "epoch": 0.3837065985269803, + "flos": 65534927558400.0, + "grad_norm": 0.7991219162067005, + "language_loss": 0.60406226, + "learning_rate": 2.8249964716695068e-06, + "loss": 0.62422639, + "num_input_tokens_seen": 136937475, + "step": 6382, + "time_per_iteration": 3.3000082969665527 + }, + { + "auxiliary_loss_clip": 0.01082665, + "auxiliary_loss_mlp": 0.01034734, + "balance_loss_clip": 1.02982783, + "balance_loss_mlp": 1.0197854, + "epoch": 0.38376672177964827, + "flos": 28256598558720.0, + "grad_norm": 3.1580215298069794, + "language_loss": 0.66212147, + "learning_rate": 2.824641672639794e-06, + "loss": 0.68329543, + "num_input_tokens_seen": 136955805, + "step": 6383, + "time_per_iteration": 2.5943658351898193 + }, + { + "auxiliary_loss_clip": 0.01045787, + "auxiliary_loss_mlp": 0.01034965, + "balance_loss_clip": 1.02925551, + "balance_loss_mlp": 1.02098215, + "epoch": 0.38382684503231623, + "flos": 20631363033600.0, + "grad_norm": 1.7093329878011543, + "language_loss": 0.75042683, + "learning_rate": 2.824286842339587e-06, + "loss": 0.77123439, + "num_input_tokens_seen": 136975240, + "step": 6384, + "time_per_iteration": 2.6558125019073486 + }, + { + "auxiliary_loss_clip": 0.01069319, + "auxiliary_loss_mlp": 0.01039085, + "balance_loss_clip": 1.03055429, + "balance_loss_mlp": 1.02523279, + "epoch": 0.3838869682849842, + "flos": 19605825826560.0, + "grad_norm": 1.4522954069903333, + "language_loss": 0.76214838, + "learning_rate": 2.823931980782341e-06, + "loss": 0.78323239, + "num_input_tokens_seen": 136994985, + "step": 6385, + "time_per_iteration": 2.6769518852233887 + }, + { + "auxiliary_loss_clip": 0.01004865, + "auxiliary_loss_mlp": 0.01001681, + "balance_loss_clip": 1.00407445, + "balance_loss_mlp": 1.00020838, + "epoch": 0.38394709153765216, + "flos": 56556110891520.0, + "grad_norm": 0.9268000359684692, + "language_loss": 0.67022479, + "learning_rate": 2.82357708798151e-06, + "loss": 0.69029021, + "num_input_tokens_seen": 137046290, + "step": 6386, + "time_per_iteration": 3.1067841053009033 + }, + { + "auxiliary_loss_clip": 0.01038405, + "auxiliary_loss_mlp": 0.01031391, + "balance_loss_clip": 1.02673769, + "balance_loss_mlp": 1.01802135, + "epoch": 0.3840072147903202, + "flos": 15888748752000.0, + "grad_norm": 1.7746665960766488, + "language_loss": 0.72197521, + "learning_rate": 2.8232221639505547e-06, + "loss": 0.74267316, + "num_input_tokens_seen": 137064725, + "step": 6387, + "time_per_iteration": 2.882861852645874 + }, + { + "auxiliary_loss_clip": 0.01082075, + "auxiliary_loss_mlp": 0.01043961, + "balance_loss_clip": 1.03357744, + "balance_loss_mlp": 1.03077674, + "epoch": 0.38406733804298815, + "flos": 28218030330240.0, + "grad_norm": 2.9256349851681076, + "language_loss": 0.81081241, + "learning_rate": 2.822867208702932e-06, + "loss": 0.83207285, + "num_input_tokens_seen": 137086030, + "step": 6388, + "time_per_iteration": 2.86861252784729 + }, + { + "auxiliary_loss_clip": 0.01050327, + "auxiliary_loss_mlp": 0.01037988, + "balance_loss_clip": 1.02794456, + "balance_loss_mlp": 1.02491105, + "epoch": 0.3841274612956561, + "flos": 18223588609920.0, + "grad_norm": 1.7471649825185347, + "language_loss": 0.76634514, + "learning_rate": 2.8225122222521026e-06, + "loss": 0.78722835, + "num_input_tokens_seen": 137105400, + "step": 6389, + "time_per_iteration": 2.620882749557495 + }, + { + "auxiliary_loss_clip": 0.0106604, + "auxiliary_loss_mlp": 0.01048131, + "balance_loss_clip": 1.03258634, + "balance_loss_mlp": 1.03339648, + "epoch": 0.3841875845483241, + "flos": 19792884879360.0, + "grad_norm": 1.6774902843621964, + "language_loss": 0.76325554, + "learning_rate": 2.8221572046115273e-06, + "loss": 0.78439724, + "num_input_tokens_seen": 137124985, + "step": 6390, + "time_per_iteration": 2.669286012649536 + }, + { + "auxiliary_loss_clip": 0.01031165, + "auxiliary_loss_mlp": 0.01042442, + "balance_loss_clip": 1.02517521, + "balance_loss_mlp": 1.02730191, + "epoch": 0.38424770780099204, + "flos": 29898829393920.0, + "grad_norm": 1.6687849422358105, + "language_loss": 0.70447493, + "learning_rate": 2.821802155794668e-06, + "loss": 0.72521102, + "num_input_tokens_seen": 137146745, + "step": 6391, + "time_per_iteration": 2.8435239791870117 + }, + { + "auxiliary_loss_clip": 0.01072925, + "auxiliary_loss_mlp": 0.01038741, + "balance_loss_clip": 1.03186929, + "balance_loss_mlp": 1.02472222, + "epoch": 0.38430783105366, + "flos": 20813717404800.0, + "grad_norm": 2.48066887288753, + "language_loss": 0.84090447, + "learning_rate": 2.8214470758149884e-06, + "loss": 0.86202109, + "num_input_tokens_seen": 137163195, + "step": 6392, + "time_per_iteration": 2.676807165145874 + }, + { + "auxiliary_loss_clip": 0.01064225, + "auxiliary_loss_mlp": 0.0103433, + "balance_loss_clip": 1.02815974, + "balance_loss_mlp": 1.02118087, + "epoch": 0.384367954306328, + "flos": 10998577399680.0, + "grad_norm": 2.49015372670264, + "language_loss": 0.61279434, + "learning_rate": 2.8210919646859536e-06, + "loss": 0.63377988, + "num_input_tokens_seen": 137179330, + "step": 6393, + "time_per_iteration": 2.6260409355163574 + }, + { + "auxiliary_loss_clip": 0.0105693, + "auxiliary_loss_mlp": 0.01036073, + "balance_loss_clip": 1.0311625, + "balance_loss_mlp": 1.02062297, + "epoch": 0.38442807755899594, + "flos": 25338030779520.0, + "grad_norm": 3.4529754929090863, + "language_loss": 0.71232653, + "learning_rate": 2.820736822421029e-06, + "loss": 0.73325664, + "num_input_tokens_seen": 137198655, + "step": 6394, + "time_per_iteration": 2.706446886062622 + }, + { + "auxiliary_loss_clip": 0.0106723, + "auxiliary_loss_mlp": 0.0103482, + "balance_loss_clip": 1.03053856, + "balance_loss_mlp": 1.01935244, + "epoch": 0.3844882008116639, + "flos": 21069760527360.0, + "grad_norm": 2.0029452224901565, + "language_loss": 0.81275272, + "learning_rate": 2.8203816490336822e-06, + "loss": 0.8337732, + "num_input_tokens_seen": 137217120, + "step": 6395, + "time_per_iteration": 2.738553047180176 + }, + { + "auxiliary_loss_clip": 0.0107185, + "auxiliary_loss_mlp": 0.01043068, + "balance_loss_clip": 1.03382897, + "balance_loss_mlp": 1.02948391, + "epoch": 0.38454832406433187, + "flos": 17963235855360.0, + "grad_norm": 1.9578260255039108, + "language_loss": 0.7105341, + "learning_rate": 2.8200264445373813e-06, + "loss": 0.73168331, + "num_input_tokens_seen": 137234410, + "step": 6396, + "time_per_iteration": 2.5838940143585205 + }, + { + "auxiliary_loss_clip": 0.01003838, + "auxiliary_loss_mlp": 0.01001248, + "balance_loss_clip": 1.00372767, + "balance_loss_mlp": 0.9996385, + "epoch": 0.38460844731699984, + "flos": 67924999555200.0, + "grad_norm": 0.8882083282058902, + "language_loss": 0.59738475, + "learning_rate": 2.8196712089455954e-06, + "loss": 0.61743557, + "num_input_tokens_seen": 137294940, + "step": 6397, + "time_per_iteration": 3.3445756435394287 + }, + { + "auxiliary_loss_clip": 0.0108226, + "auxiliary_loss_mlp": 0.01032666, + "balance_loss_clip": 1.03177452, + "balance_loss_mlp": 1.01918352, + "epoch": 0.3846685705696678, + "flos": 25849075530240.0, + "grad_norm": 1.6944398117489152, + "language_loss": 0.84526902, + "learning_rate": 2.819315942271794e-06, + "loss": 0.8664183, + "num_input_tokens_seen": 137315035, + "step": 6398, + "time_per_iteration": 2.8403007984161377 + }, + { + "auxiliary_loss_clip": 0.01081759, + "auxiliary_loss_mlp": 0.01033818, + "balance_loss_clip": 1.0319407, + "balance_loss_mlp": 1.02066934, + "epoch": 0.38472869382233577, + "flos": 16290194129280.0, + "grad_norm": 1.8193767189130143, + "language_loss": 0.80152661, + "learning_rate": 2.8189606445294515e-06, + "loss": 0.82268238, + "num_input_tokens_seen": 137333155, + "step": 6399, + "time_per_iteration": 2.675450325012207 + }, + { + "auxiliary_loss_clip": 0.01084507, + "auxiliary_loss_mlp": 0.00747892, + "balance_loss_clip": 1.03213704, + "balance_loss_mlp": 1.00014973, + "epoch": 0.38478881707500373, + "flos": 19353122668800.0, + "grad_norm": 1.871894069341867, + "language_loss": 0.6658957, + "learning_rate": 2.818605315732038e-06, + "loss": 0.68421972, + "num_input_tokens_seen": 137351515, + "step": 6400, + "time_per_iteration": 2.6656084060668945 + }, + { + "auxiliary_loss_clip": 0.01064312, + "auxiliary_loss_mlp": 0.01046638, + "balance_loss_clip": 1.03177142, + "balance_loss_mlp": 1.03223109, + "epoch": 0.38484894032767175, + "flos": 24860849575680.0, + "grad_norm": 1.8229413421995917, + "language_loss": 0.73375857, + "learning_rate": 2.81824995589303e-06, + "loss": 0.75486803, + "num_input_tokens_seen": 137371255, + "step": 6401, + "time_per_iteration": 2.697067975997925 + }, + { + "auxiliary_loss_clip": 0.01050934, + "auxiliary_loss_mlp": 0.01040572, + "balance_loss_clip": 1.03092504, + "balance_loss_mlp": 1.02673161, + "epoch": 0.3849090635803397, + "flos": 14501806853760.0, + "grad_norm": 1.9594077638350258, + "language_loss": 0.71691006, + "learning_rate": 2.8178945650259012e-06, + "loss": 0.73782516, + "num_input_tokens_seen": 137388980, + "step": 6402, + "time_per_iteration": 2.73332142829895 + }, + { + "auxiliary_loss_clip": 0.010777, + "auxiliary_loss_mlp": 0.01034456, + "balance_loss_clip": 1.02968407, + "balance_loss_mlp": 1.02161145, + "epoch": 0.3849691868330077, + "flos": 18515865576960.0, + "grad_norm": 1.8904941086425042, + "language_loss": 0.82779032, + "learning_rate": 2.817539143144128e-06, + "loss": 0.84891194, + "num_input_tokens_seen": 137406885, + "step": 6403, + "time_per_iteration": 2.6373438835144043 + }, + { + "auxiliary_loss_clip": 0.01024205, + "auxiliary_loss_mlp": 0.01038817, + "balance_loss_clip": 1.02418733, + "balance_loss_mlp": 1.02398181, + "epoch": 0.38502931008567565, + "flos": 21616392677760.0, + "grad_norm": 2.06879628011974, + "language_loss": 0.82643765, + "learning_rate": 2.817183690261189e-06, + "loss": 0.84706783, + "num_input_tokens_seen": 137425535, + "step": 6404, + "time_per_iteration": 2.64741849899292 + }, + { + "auxiliary_loss_clip": 0.01055748, + "auxiliary_loss_mlp": 0.01036197, + "balance_loss_clip": 1.02927291, + "balance_loss_mlp": 1.02286935, + "epoch": 0.3850894333383436, + "flos": 25415346804480.0, + "grad_norm": 1.6015540582458256, + "language_loss": 0.69754505, + "learning_rate": 2.816828206390563e-06, + "loss": 0.71846455, + "num_input_tokens_seen": 137447700, + "step": 6405, + "time_per_iteration": 2.7556591033935547 + }, + { + "auxiliary_loss_clip": 0.01049777, + "auxiliary_loss_mlp": 0.01039379, + "balance_loss_clip": 1.02771771, + "balance_loss_mlp": 1.02609861, + "epoch": 0.3851495565910116, + "flos": 20227870581120.0, + "grad_norm": 2.148433750062813, + "language_loss": 0.79062778, + "learning_rate": 2.816472691545729e-06, + "loss": 0.81151938, + "num_input_tokens_seen": 137462245, + "step": 6406, + "time_per_iteration": 2.595254421234131 + }, + { + "auxiliary_loss_clip": 0.01074669, + "auxiliary_loss_mlp": 0.01039069, + "balance_loss_clip": 1.03421736, + "balance_loss_mlp": 1.02441764, + "epoch": 0.38520967984367954, + "flos": 16508459122560.0, + "grad_norm": 2.1548754328575535, + "language_loss": 0.83920157, + "learning_rate": 2.8161171457401694e-06, + "loss": 0.86033893, + "num_input_tokens_seen": 137476455, + "step": 6407, + "time_per_iteration": 2.517838478088379 + }, + { + "auxiliary_loss_clip": 0.01002019, + "auxiliary_loss_mlp": 0.01003189, + "balance_loss_clip": 1.00231242, + "balance_loss_mlp": 1.00153184, + "epoch": 0.3852698030963475, + "flos": 61313772971520.0, + "grad_norm": 0.843825351032122, + "language_loss": 0.64929247, + "learning_rate": 2.815761568987365e-06, + "loss": 0.66934454, + "num_input_tokens_seen": 137539845, + "step": 6408, + "time_per_iteration": 3.199524164199829 + }, + { + "auxiliary_loss_clip": 0.01062137, + "auxiliary_loss_mlp": 0.01038378, + "balance_loss_clip": 1.03229547, + "balance_loss_mlp": 1.02322614, + "epoch": 0.3853299263490155, + "flos": 22893016930560.0, + "grad_norm": 1.624193329165545, + "language_loss": 0.7350117, + "learning_rate": 2.8154059613008e-06, + "loss": 0.75601685, + "num_input_tokens_seen": 137559880, + "step": 6409, + "time_per_iteration": 4.224771976470947 + }, + { + "auxiliary_loss_clip": 0.01042037, + "auxiliary_loss_mlp": 0.01049926, + "balance_loss_clip": 1.03161597, + "balance_loss_mlp": 1.03339195, + "epoch": 0.38539004960168344, + "flos": 20047491457920.0, + "grad_norm": 2.700813625045866, + "language_loss": 0.70957172, + "learning_rate": 2.81505032269396e-06, + "loss": 0.7304914, + "num_input_tokens_seen": 137578225, + "step": 6410, + "time_per_iteration": 2.6557869911193848 + }, + { + "auxiliary_loss_clip": 0.00980143, + "auxiliary_loss_mlp": 0.00746505, + "balance_loss_clip": 1.00960255, + "balance_loss_mlp": 0.9998309, + "epoch": 0.3854501728543514, + "flos": 68730691570560.0, + "grad_norm": 0.6931683375978334, + "language_loss": 0.60326284, + "learning_rate": 2.81469465318033e-06, + "loss": 0.62052935, + "num_input_tokens_seen": 137645770, + "step": 6411, + "time_per_iteration": 3.346054792404175 + }, + { + "auxiliary_loss_clip": 0.01036526, + "auxiliary_loss_mlp": 0.01031184, + "balance_loss_clip": 1.02770352, + "balance_loss_mlp": 1.01812458, + "epoch": 0.38551029610701937, + "flos": 20485027025280.0, + "grad_norm": 2.1641898356479548, + "language_loss": 0.77756393, + "learning_rate": 2.814338952773397e-06, + "loss": 0.79824102, + "num_input_tokens_seen": 137664090, + "step": 6412, + "time_per_iteration": 2.6782782077789307 + }, + { + "auxiliary_loss_clip": 0.0104439, + "auxiliary_loss_mlp": 0.01037848, + "balance_loss_clip": 1.02878249, + "balance_loss_mlp": 1.02160025, + "epoch": 0.38557041935968733, + "flos": 23471788775040.0, + "grad_norm": 2.048056416641373, + "language_loss": 0.77619898, + "learning_rate": 2.8139832214866493e-06, + "loss": 0.79702139, + "num_input_tokens_seen": 137683190, + "step": 6413, + "time_per_iteration": 2.764402151107788 + }, + { + "auxiliary_loss_clip": 0.01010694, + "auxiliary_loss_mlp": 0.01001378, + "balance_loss_clip": 1.00111318, + "balance_loss_mlp": 0.99984026, + "epoch": 0.38563054261235535, + "flos": 63966636869760.0, + "grad_norm": 0.8508592309077392, + "language_loss": 0.61256063, + "learning_rate": 2.813627459333576e-06, + "loss": 0.63268137, + "num_input_tokens_seen": 137737315, + "step": 6414, + "time_per_iteration": 2.979599714279175 + }, + { + "auxiliary_loss_clip": 0.0105371, + "auxiliary_loss_mlp": 0.01042103, + "balance_loss_clip": 1.03290379, + "balance_loss_mlp": 1.02765489, + "epoch": 0.3856906658650233, + "flos": 23987789602560.0, + "grad_norm": 2.01351476504322, + "language_loss": 0.77790236, + "learning_rate": 2.8132716663276685e-06, + "loss": 0.79886055, + "num_input_tokens_seen": 137753535, + "step": 6415, + "time_per_iteration": 2.6588032245635986 + }, + { + "auxiliary_loss_clip": 0.01054199, + "auxiliary_loss_mlp": 0.01032991, + "balance_loss_clip": 1.02885413, + "balance_loss_mlp": 1.02030683, + "epoch": 0.3857507891176913, + "flos": 25007436979200.0, + "grad_norm": 3.347659508144535, + "language_loss": 0.79801655, + "learning_rate": 2.8129158424824173e-06, + "loss": 0.81888843, + "num_input_tokens_seen": 137773405, + "step": 6416, + "time_per_iteration": 2.6988000869750977 + }, + { + "auxiliary_loss_clip": 0.01066022, + "auxiliary_loss_mlp": 0.00747789, + "balance_loss_clip": 1.03016663, + "balance_loss_mlp": 1.00014925, + "epoch": 0.38581091237035925, + "flos": 21536778182400.0, + "grad_norm": 1.7093536088012864, + "language_loss": 0.79249609, + "learning_rate": 2.8125599878113155e-06, + "loss": 0.81063414, + "num_input_tokens_seen": 137790810, + "step": 6417, + "time_per_iteration": 2.591799259185791 + }, + { + "auxiliary_loss_clip": 0.01057765, + "auxiliary_loss_mlp": 0.01034496, + "balance_loss_clip": 1.02896166, + "balance_loss_mlp": 1.02152014, + "epoch": 0.3858710356230272, + "flos": 17383889393280.0, + "grad_norm": 1.6693296673294324, + "language_loss": 0.80275136, + "learning_rate": 2.8122041023278583e-06, + "loss": 0.82367396, + "num_input_tokens_seen": 137810265, + "step": 6418, + "time_per_iteration": 2.667078971862793 + }, + { + "auxiliary_loss_clip": 0.01055513, + "auxiliary_loss_mlp": 0.01032888, + "balance_loss_clip": 1.02783096, + "balance_loss_mlp": 1.02032375, + "epoch": 0.3859311588756952, + "flos": 20339588856960.0, + "grad_norm": 1.775902847813845, + "language_loss": 0.79466295, + "learning_rate": 2.8118481860455407e-06, + "loss": 0.81554699, + "num_input_tokens_seen": 137828580, + "step": 6419, + "time_per_iteration": 2.6367995738983154 + }, + { + "auxiliary_loss_clip": 0.01058786, + "auxiliary_loss_mlp": 0.01036472, + "balance_loss_clip": 1.03045237, + "balance_loss_mlp": 1.02165377, + "epoch": 0.38599128212836314, + "flos": 26321157002880.0, + "grad_norm": 2.0253986997718005, + "language_loss": 0.67652553, + "learning_rate": 2.8114922389778573e-06, + "loss": 0.69747806, + "num_input_tokens_seen": 137846145, + "step": 6420, + "time_per_iteration": 4.279192686080933 + }, + { + "auxiliary_loss_clip": 0.01041537, + "auxiliary_loss_mlp": 0.01041844, + "balance_loss_clip": 1.02967107, + "balance_loss_mlp": 1.02740788, + "epoch": 0.3860514053810311, + "flos": 13553837066880.0, + "grad_norm": 1.945442777643193, + "language_loss": 0.81234908, + "learning_rate": 2.8111362611383076e-06, + "loss": 0.83318287, + "num_input_tokens_seen": 137863705, + "step": 6421, + "time_per_iteration": 4.295942068099976 + }, + { + "auxiliary_loss_clip": 0.0105905, + "auxiliary_loss_mlp": 0.01031211, + "balance_loss_clip": 1.02958143, + "balance_loss_mlp": 1.01708436, + "epoch": 0.3861115286336991, + "flos": 20954271323520.0, + "grad_norm": 2.0687678934563087, + "language_loss": 0.7170378, + "learning_rate": 2.8107802525403886e-06, + "loss": 0.73794043, + "num_input_tokens_seen": 137880285, + "step": 6422, + "time_per_iteration": 2.649515151977539 + }, + { + "auxiliary_loss_clip": 0.01056395, + "auxiliary_loss_mlp": 0.01037863, + "balance_loss_clip": 1.03053474, + "balance_loss_mlp": 1.0252564, + "epoch": 0.38617165188636704, + "flos": 16362697731840.0, + "grad_norm": 1.6869943805972147, + "language_loss": 0.66504663, + "learning_rate": 2.8104242131976025e-06, + "loss": 0.68598914, + "num_input_tokens_seen": 137898335, + "step": 6423, + "time_per_iteration": 2.6582489013671875 + }, + { + "auxiliary_loss_clip": 0.01075887, + "auxiliary_loss_mlp": 0.01038671, + "balance_loss_clip": 1.03465927, + "balance_loss_mlp": 1.02530158, + "epoch": 0.386231775139035, + "flos": 34787276893440.0, + "grad_norm": 2.0409285653327847, + "language_loss": 0.68620211, + "learning_rate": 2.810068143123449e-06, + "loss": 0.70734763, + "num_input_tokens_seen": 137918605, + "step": 6424, + "time_per_iteration": 2.7499334812164307 + }, + { + "auxiliary_loss_clip": 0.01047349, + "auxiliary_loss_mlp": 0.01036547, + "balance_loss_clip": 1.02920449, + "balance_loss_mlp": 1.02279007, + "epoch": 0.38629189839170297, + "flos": 21726171619200.0, + "grad_norm": 1.525181625907442, + "language_loss": 0.72506422, + "learning_rate": 2.809712042331429e-06, + "loss": 0.74590313, + "num_input_tokens_seen": 137938245, + "step": 6425, + "time_per_iteration": 2.710132122039795 + }, + { + "auxiliary_loss_clip": 0.0104564, + "auxiliary_loss_mlp": 0.0074794, + "balance_loss_clip": 1.02687085, + "balance_loss_mlp": 1.00017273, + "epoch": 0.38635202164437094, + "flos": 27923634460800.0, + "grad_norm": 2.0004964673398127, + "language_loss": 0.79986799, + "learning_rate": 2.8093559108350484e-06, + "loss": 0.8178038, + "num_input_tokens_seen": 137956770, + "step": 6426, + "time_per_iteration": 2.6640164852142334 + }, + { + "auxiliary_loss_clip": 0.01071621, + "auxiliary_loss_mlp": 0.01038685, + "balance_loss_clip": 1.03169155, + "balance_loss_mlp": 1.02465987, + "epoch": 0.38641214489703896, + "flos": 23586631534080.0, + "grad_norm": 1.7576801509318956, + "language_loss": 0.74154627, + "learning_rate": 2.80899974864781e-06, + "loss": 0.76264936, + "num_input_tokens_seen": 137977040, + "step": 6427, + "time_per_iteration": 2.5925276279449463 + }, + { + "auxiliary_loss_clip": 0.01018853, + "auxiliary_loss_mlp": 0.01047585, + "balance_loss_clip": 1.024387, + "balance_loss_mlp": 1.0318017, + "epoch": 0.3864722681497069, + "flos": 12641239198080.0, + "grad_norm": 1.9425369434442985, + "language_loss": 0.7035296, + "learning_rate": 2.8086435557832203e-06, + "loss": 0.72419393, + "num_input_tokens_seen": 137993545, + "step": 6428, + "time_per_iteration": 2.625044822692871 + }, + { + "auxiliary_loss_clip": 0.01063763, + "auxiliary_loss_mlp": 0.01044302, + "balance_loss_clip": 1.03321815, + "balance_loss_mlp": 1.03005695, + "epoch": 0.3865323914023749, + "flos": 17598922162560.0, + "grad_norm": 2.0480501147797043, + "language_loss": 0.84697074, + "learning_rate": 2.8082873322547863e-06, + "loss": 0.86805135, + "num_input_tokens_seen": 138010140, + "step": 6429, + "time_per_iteration": 2.564420461654663 + }, + { + "auxiliary_loss_clip": 0.0105854, + "auxiliary_loss_mlp": 0.01037562, + "balance_loss_clip": 1.0297749, + "balance_loss_mlp": 1.02337623, + "epoch": 0.38659251465504285, + "flos": 18478949374080.0, + "grad_norm": 2.530982658026525, + "language_loss": 0.80929279, + "learning_rate": 2.807931078076015e-06, + "loss": 0.83025384, + "num_input_tokens_seen": 138028880, + "step": 6430, + "time_per_iteration": 2.6153814792633057 + }, + { + "auxiliary_loss_clip": 0.00984484, + "auxiliary_loss_mlp": 0.01009216, + "balance_loss_clip": 1.00422573, + "balance_loss_mlp": 1.00774956, + "epoch": 0.3866526379077108, + "flos": 64165726978560.0, + "grad_norm": 0.7133454841888485, + "language_loss": 0.58792692, + "learning_rate": 2.807574793260416e-06, + "loss": 0.6078639, + "num_input_tokens_seen": 138098090, + "step": 6431, + "time_per_iteration": 3.2507827281951904 + }, + { + "auxiliary_loss_clip": 0.01026562, + "auxiliary_loss_mlp": 0.01040894, + "balance_loss_clip": 1.02719426, + "balance_loss_mlp": 1.02526009, + "epoch": 0.3867127611603788, + "flos": 14388292897920.0, + "grad_norm": 1.7296323847173156, + "language_loss": 0.78964913, + "learning_rate": 2.8072184778215004e-06, + "loss": 0.81032372, + "num_input_tokens_seen": 138114735, + "step": 6432, + "time_per_iteration": 2.8269641399383545 + }, + { + "auxiliary_loss_clip": 0.01069392, + "auxiliary_loss_mlp": 0.01046974, + "balance_loss_clip": 1.02848959, + "balance_loss_mlp": 1.03097641, + "epoch": 0.38677288441304675, + "flos": 20010754823040.0, + "grad_norm": 2.32343904001604, + "language_loss": 0.80659544, + "learning_rate": 2.806862131772779e-06, + "loss": 0.82775909, + "num_input_tokens_seen": 138130480, + "step": 6433, + "time_per_iteration": 2.5833046436309814 + }, + { + "auxiliary_loss_clip": 0.01063848, + "auxiliary_loss_mlp": 0.01033764, + "balance_loss_clip": 1.0324806, + "balance_loss_mlp": 1.01871967, + "epoch": 0.3868330076657147, + "flos": 22236893147520.0, + "grad_norm": 1.9569051982579346, + "language_loss": 0.70817685, + "learning_rate": 2.806505755127765e-06, + "loss": 0.72915292, + "num_input_tokens_seen": 138150640, + "step": 6434, + "time_per_iteration": 2.6324594020843506 + }, + { + "auxiliary_loss_clip": 0.01047732, + "auxiliary_loss_mlp": 0.01046543, + "balance_loss_clip": 1.02940631, + "balance_loss_mlp": 1.02973437, + "epoch": 0.3868931309183827, + "flos": 16727442387840.0, + "grad_norm": 3.8634676856476493, + "language_loss": 0.77410996, + "learning_rate": 2.806149347899972e-06, + "loss": 0.79505271, + "num_input_tokens_seen": 138169700, + "step": 6435, + "time_per_iteration": 2.672323703765869 + }, + { + "auxiliary_loss_clip": 0.01069713, + "auxiliary_loss_mlp": 0.01037807, + "balance_loss_clip": 1.03066707, + "balance_loss_mlp": 1.02364492, + "epoch": 0.38695325417105064, + "flos": 22674716023680.0, + "grad_norm": 1.6870007098152673, + "language_loss": 0.7929343, + "learning_rate": 2.805792910102915e-06, + "loss": 0.81400949, + "num_input_tokens_seen": 138185835, + "step": 6436, + "time_per_iteration": 2.78222393989563 + }, + { + "auxiliary_loss_clip": 0.01056273, + "auxiliary_loss_mlp": 0.0103171, + "balance_loss_clip": 1.029392, + "balance_loss_mlp": 1.01880574, + "epoch": 0.3870133774237186, + "flos": 23112036109440.0, + "grad_norm": 1.6975067569576134, + "language_loss": 0.76861167, + "learning_rate": 2.8054364417501093e-06, + "loss": 0.78949147, + "num_input_tokens_seen": 138204080, + "step": 6437, + "time_per_iteration": 2.6795108318328857 + }, + { + "auxiliary_loss_clip": 0.01060879, + "auxiliary_loss_mlp": 0.01037532, + "balance_loss_clip": 1.03068328, + "balance_loss_mlp": 1.024508, + "epoch": 0.3870735006763866, + "flos": 17675699483520.0, + "grad_norm": 1.998280615345762, + "language_loss": 0.8163631, + "learning_rate": 2.805079942855074e-06, + "loss": 0.83734727, + "num_input_tokens_seen": 138220710, + "step": 6438, + "time_per_iteration": 2.6317691802978516 + }, + { + "auxiliary_loss_clip": 0.01056189, + "auxiliary_loss_mlp": 0.00747804, + "balance_loss_clip": 1.02805781, + "balance_loss_mlp": 1.00026417, + "epoch": 0.38713362392905454, + "flos": 23295791111040.0, + "grad_norm": 1.3482788140228028, + "language_loss": 0.75395632, + "learning_rate": 2.804723413431326e-06, + "loss": 0.77199626, + "num_input_tokens_seen": 138241720, + "step": 6439, + "time_per_iteration": 2.687167167663574 + }, + { + "auxiliary_loss_clip": 0.01079415, + "auxiliary_loss_mlp": 0.0103186, + "balance_loss_clip": 1.03149378, + "balance_loss_mlp": 1.01829433, + "epoch": 0.38719374718172256, + "flos": 21031192298880.0, + "grad_norm": 1.357057209324222, + "language_loss": 0.74002254, + "learning_rate": 2.8043668534923855e-06, + "loss": 0.76113534, + "num_input_tokens_seen": 138261885, + "step": 6440, + "time_per_iteration": 2.5499346256256104 + }, + { + "auxiliary_loss_clip": 0.01071393, + "auxiliary_loss_mlp": 0.01038484, + "balance_loss_clip": 1.03002644, + "balance_loss_mlp": 1.02384543, + "epoch": 0.3872538704343905, + "flos": 19609776322560.0, + "grad_norm": 1.9938757116797765, + "language_loss": 0.82333612, + "learning_rate": 2.804010263051774e-06, + "loss": 0.84443492, + "num_input_tokens_seen": 138280255, + "step": 6441, + "time_per_iteration": 2.7515735626220703 + }, + { + "auxiliary_loss_clip": 0.01083172, + "auxiliary_loss_mlp": 0.01039844, + "balance_loss_clip": 1.03282523, + "balance_loss_mlp": 1.0262723, + "epoch": 0.3873139936870585, + "flos": 17530045833600.0, + "grad_norm": 2.0144071865739006, + "language_loss": 0.80959666, + "learning_rate": 2.8036536421230118e-06, + "loss": 0.83082688, + "num_input_tokens_seen": 138296675, + "step": 6442, + "time_per_iteration": 2.597320556640625 + }, + { + "auxiliary_loss_clip": 0.01038714, + "auxiliary_loss_mlp": 0.01033824, + "balance_loss_clip": 1.02751112, + "balance_loss_mlp": 1.01929808, + "epoch": 0.38737411693972645, + "flos": 17786555832960.0, + "grad_norm": 1.6553441751284532, + "language_loss": 0.84190112, + "learning_rate": 2.803296990719624e-06, + "loss": 0.86262649, + "num_input_tokens_seen": 138314985, + "step": 6443, + "time_per_iteration": 2.686497449874878 + }, + { + "auxiliary_loss_clip": 0.00994063, + "auxiliary_loss_mlp": 0.01006126, + "balance_loss_clip": 1.00495291, + "balance_loss_mlp": 1.00466621, + "epoch": 0.3874342401923944, + "flos": 58304637048960.0, + "grad_norm": 0.7712266775930398, + "language_loss": 0.50276017, + "learning_rate": 2.8029403088551327e-06, + "loss": 0.52276206, + "num_input_tokens_seen": 138373275, + "step": 6444, + "time_per_iteration": 3.25150465965271 + }, + { + "auxiliary_loss_clip": 0.01032905, + "auxiliary_loss_mlp": 0.00747727, + "balance_loss_clip": 1.02495575, + "balance_loss_mlp": 1.00021529, + "epoch": 0.3874943634450624, + "flos": 17711933328000.0, + "grad_norm": 1.6753272413219242, + "language_loss": 0.79111201, + "learning_rate": 2.802583596543065e-06, + "loss": 0.8089183, + "num_input_tokens_seen": 138391145, + "step": 6445, + "time_per_iteration": 2.755751848220825 + }, + { + "auxiliary_loss_clip": 0.01058231, + "auxiliary_loss_mlp": 0.01039648, + "balance_loss_clip": 1.02822089, + "balance_loss_mlp": 1.02578425, + "epoch": 0.38755448669773035, + "flos": 19244852098560.0, + "grad_norm": 1.7235305965599104, + "language_loss": 0.81094992, + "learning_rate": 2.8022268537969474e-06, + "loss": 0.83192873, + "num_input_tokens_seen": 138409875, + "step": 6446, + "time_per_iteration": 2.74999737739563 + }, + { + "auxiliary_loss_clip": 0.01060842, + "auxiliary_loss_mlp": 0.01039123, + "balance_loss_clip": 1.03106952, + "balance_loss_mlp": 1.02518761, + "epoch": 0.3876146099503983, + "flos": 20594267262720.0, + "grad_norm": 1.6945319589179602, + "language_loss": 0.77297354, + "learning_rate": 2.801870080630306e-06, + "loss": 0.79397321, + "num_input_tokens_seen": 138428965, + "step": 6447, + "time_per_iteration": 2.8995845317840576 + }, + { + "auxiliary_loss_clip": 0.01058997, + "auxiliary_loss_mlp": 0.01042609, + "balance_loss_clip": 1.02955818, + "balance_loss_mlp": 1.02932286, + "epoch": 0.3876747332030663, + "flos": 19281121856640.0, + "grad_norm": 1.4182701944519556, + "language_loss": 0.75903994, + "learning_rate": 2.801513277056671e-06, + "loss": 0.780056, + "num_input_tokens_seen": 138448090, + "step": 6448, + "time_per_iteration": 2.7749452590942383 + }, + { + "auxiliary_loss_clip": 0.01051769, + "auxiliary_loss_mlp": 0.01035809, + "balance_loss_clip": 1.02691531, + "balance_loss_mlp": 1.02220726, + "epoch": 0.38773485645573424, + "flos": 18945895201920.0, + "grad_norm": 1.8178762133974389, + "language_loss": 0.75927341, + "learning_rate": 2.8011564430895725e-06, + "loss": 0.78014922, + "num_input_tokens_seen": 138466105, + "step": 6449, + "time_per_iteration": 2.6098880767822266 + }, + { + "auxiliary_loss_clip": 0.01046659, + "auxiliary_loss_mlp": 0.00747821, + "balance_loss_clip": 1.02734447, + "balance_loss_mlp": 1.00016975, + "epoch": 0.3877949797084022, + "flos": 23071348978560.0, + "grad_norm": 1.490858923629565, + "language_loss": 0.77927685, + "learning_rate": 2.800799578742542e-06, + "loss": 0.79722166, + "num_input_tokens_seen": 138485160, + "step": 6450, + "time_per_iteration": 2.8176302909851074 + }, + { + "auxiliary_loss_clip": 0.01083644, + "auxiliary_loss_mlp": 0.01038836, + "balance_loss_clip": 1.03053236, + "balance_loss_mlp": 1.02402413, + "epoch": 0.3878551029610702, + "flos": 29095543589760.0, + "grad_norm": 2.3123769097169116, + "language_loss": 0.77785289, + "learning_rate": 2.8004426840291106e-06, + "loss": 0.79907775, + "num_input_tokens_seen": 138504135, + "step": 6451, + "time_per_iteration": 2.6494479179382324 + }, + { + "auxiliary_loss_clip": 0.0107544, + "auxiliary_loss_mlp": 0.01029497, + "balance_loss_clip": 1.02832413, + "balance_loss_mlp": 1.01622295, + "epoch": 0.38791522621373814, + "flos": 20996394998400.0, + "grad_norm": 1.6829695564437677, + "language_loss": 0.76642001, + "learning_rate": 2.800085758962812e-06, + "loss": 0.78746933, + "num_input_tokens_seen": 138523955, + "step": 6452, + "time_per_iteration": 2.600128412246704 + }, + { + "auxiliary_loss_clip": 0.01055181, + "auxiliary_loss_mlp": 0.01041471, + "balance_loss_clip": 1.0301882, + "balance_loss_mlp": 1.0276134, + "epoch": 0.3879753494664061, + "flos": 15486836497920.0, + "grad_norm": 2.198012867718325, + "language_loss": 0.79491532, + "learning_rate": 2.799728803557182e-06, + "loss": 0.81588185, + "num_input_tokens_seen": 138541655, + "step": 6453, + "time_per_iteration": 2.5883030891418457 + }, + { + "auxiliary_loss_clip": 0.010772, + "auxiliary_loss_mlp": 0.01038178, + "balance_loss_clip": 1.0331794, + "balance_loss_mlp": 1.02360439, + "epoch": 0.3880354727190741, + "flos": 22053964158720.0, + "grad_norm": 1.8856802242323294, + "language_loss": 0.71373951, + "learning_rate": 2.7993718178257555e-06, + "loss": 0.73489326, + "num_input_tokens_seen": 138560860, + "step": 6454, + "time_per_iteration": 2.6260440349578857 + }, + { + "auxiliary_loss_clip": 0.01083721, + "auxiliary_loss_mlp": 0.01040487, + "balance_loss_clip": 1.03121114, + "balance_loss_mlp": 1.02556217, + "epoch": 0.3880955959717421, + "flos": 20340307128960.0, + "grad_norm": 2.130926211057437, + "language_loss": 0.77639377, + "learning_rate": 2.7990148017820694e-06, + "loss": 0.79763591, + "num_input_tokens_seen": 138580200, + "step": 6455, + "time_per_iteration": 2.603090524673462 + }, + { + "auxiliary_loss_clip": 0.01076707, + "auxiliary_loss_mlp": 0.01031568, + "balance_loss_clip": 1.02886486, + "balance_loss_mlp": 1.01769257, + "epoch": 0.38815571922441006, + "flos": 23075407215360.0, + "grad_norm": 1.529295597119377, + "language_loss": 0.76260471, + "learning_rate": 2.798657755439662e-06, + "loss": 0.78368747, + "num_input_tokens_seen": 138598315, + "step": 6456, + "time_per_iteration": 4.181077480316162 + }, + { + "auxiliary_loss_clip": 0.01019069, + "auxiliary_loss_mlp": 0.01035728, + "balance_loss_clip": 1.02881479, + "balance_loss_mlp": 1.02122021, + "epoch": 0.388215842477078, + "flos": 20776944856320.0, + "grad_norm": 2.2004847539970074, + "language_loss": 0.61088103, + "learning_rate": 2.7983006788120726e-06, + "loss": 0.63142902, + "num_input_tokens_seen": 138615695, + "step": 6457, + "time_per_iteration": 4.367905855178833 + }, + { + "auxiliary_loss_clip": 0.01083132, + "auxiliary_loss_mlp": 0.01037644, + "balance_loss_clip": 1.03069162, + "balance_loss_mlp": 1.02198029, + "epoch": 0.388275965729746, + "flos": 20448182649600.0, + "grad_norm": 2.0607678136735554, + "language_loss": 0.79900038, + "learning_rate": 2.797943571912841e-06, + "loss": 0.82020819, + "num_input_tokens_seen": 138633180, + "step": 6458, + "time_per_iteration": 2.6738924980163574 + }, + { + "auxiliary_loss_clip": 0.0103579, + "auxiliary_loss_mlp": 0.01043501, + "balance_loss_clip": 1.03077638, + "balance_loss_mlp": 1.02774179, + "epoch": 0.38833608898241395, + "flos": 27892392606720.0, + "grad_norm": 1.7753401848362322, + "language_loss": 0.81508267, + "learning_rate": 2.797586434755509e-06, + "loss": 0.83587557, + "num_input_tokens_seen": 138654785, + "step": 6459, + "time_per_iteration": 2.7582294940948486 + }, + { + "auxiliary_loss_clip": 0.01051556, + "auxiliary_loss_mlp": 0.01037121, + "balance_loss_clip": 1.02734244, + "balance_loss_mlp": 1.02367413, + "epoch": 0.3883962122350819, + "flos": 18076390675200.0, + "grad_norm": 1.7228350701218418, + "language_loss": 0.61889291, + "learning_rate": 2.7972292673536202e-06, + "loss": 0.63977969, + "num_input_tokens_seen": 138673330, + "step": 6460, + "time_per_iteration": 2.620492935180664 + }, + { + "auxiliary_loss_clip": 0.01068174, + "auxiliary_loss_mlp": 0.01036612, + "balance_loss_clip": 1.02915883, + "balance_loss_mlp": 1.02410078, + "epoch": 0.3884563354877499, + "flos": 23622254847360.0, + "grad_norm": 1.7922079810411982, + "language_loss": 0.86397547, + "learning_rate": 2.796872069720717e-06, + "loss": 0.8850233, + "num_input_tokens_seen": 138694185, + "step": 6461, + "time_per_iteration": 2.6109559535980225 + }, + { + "auxiliary_loss_clip": 0.01071795, + "auxiliary_loss_mlp": 0.01036261, + "balance_loss_clip": 1.03051376, + "balance_loss_mlp": 1.02222395, + "epoch": 0.38851645874041785, + "flos": 27453528236160.0, + "grad_norm": 2.564446373812402, + "language_loss": 0.70827472, + "learning_rate": 2.7965148418703456e-06, + "loss": 0.72935528, + "num_input_tokens_seen": 138714625, + "step": 6462, + "time_per_iteration": 2.717424154281616 + }, + { + "auxiliary_loss_clip": 0.01029676, + "auxiliary_loss_mlp": 0.01036383, + "balance_loss_clip": 1.02433026, + "balance_loss_mlp": 1.02064729, + "epoch": 0.3885765819930858, + "flos": 25228072270080.0, + "grad_norm": 1.9926976384220307, + "language_loss": 0.75999701, + "learning_rate": 2.796157583816052e-06, + "loss": 0.78065759, + "num_input_tokens_seen": 138733585, + "step": 6463, + "time_per_iteration": 2.7293083667755127 + }, + { + "auxiliary_loss_clip": 0.01050232, + "auxiliary_loss_mlp": 0.01039377, + "balance_loss_clip": 1.03007197, + "balance_loss_mlp": 1.02387989, + "epoch": 0.3886367052457538, + "flos": 16946605221120.0, + "grad_norm": 3.067218507645986, + "language_loss": 0.70504951, + "learning_rate": 2.795800295571382e-06, + "loss": 0.72594565, + "num_input_tokens_seen": 138752335, + "step": 6464, + "time_per_iteration": 2.7528553009033203 + }, + { + "auxiliary_loss_clip": 0.0105931, + "auxiliary_loss_mlp": 0.0103612, + "balance_loss_clip": 1.03103781, + "balance_loss_mlp": 1.02191567, + "epoch": 0.38869682849842174, + "flos": 27154140376320.0, + "grad_norm": 1.8905054280033475, + "language_loss": 0.69677258, + "learning_rate": 2.7954429771498858e-06, + "loss": 0.71772683, + "num_input_tokens_seen": 138768450, + "step": 6465, + "time_per_iteration": 2.7277441024780273 + }, + { + "auxiliary_loss_clip": 0.01051613, + "auxiliary_loss_mlp": 0.01044877, + "balance_loss_clip": 1.03103602, + "balance_loss_mlp": 1.02940369, + "epoch": 0.3887569517510897, + "flos": 21063619301760.0, + "grad_norm": 1.9255340844526483, + "language_loss": 0.7842586, + "learning_rate": 2.7950856285651117e-06, + "loss": 0.80522352, + "num_input_tokens_seen": 138786775, + "step": 6466, + "time_per_iteration": 4.257962942123413 + }, + { + "auxiliary_loss_clip": 0.01046001, + "auxiliary_loss_mlp": 0.01039807, + "balance_loss_clip": 1.02902579, + "balance_loss_mlp": 1.02535844, + "epoch": 0.38881707500375773, + "flos": 29497384016640.0, + "grad_norm": 1.4524766656929897, + "language_loss": 0.69501716, + "learning_rate": 2.794728249830611e-06, + "loss": 0.71587527, + "num_input_tokens_seen": 138810100, + "step": 6467, + "time_per_iteration": 4.3300864696502686 + }, + { + "auxiliary_loss_clip": 0.01045215, + "auxiliary_loss_mlp": 0.01040679, + "balance_loss_clip": 1.0265305, + "balance_loss_mlp": 1.02568877, + "epoch": 0.3888771982564257, + "flos": 17488281294720.0, + "grad_norm": 3.9393050261100755, + "language_loss": 0.83495986, + "learning_rate": 2.794370840959936e-06, + "loss": 0.85581887, + "num_input_tokens_seen": 138825140, + "step": 6468, + "time_per_iteration": 2.5928001403808594 + }, + { + "auxiliary_loss_clip": 0.01055025, + "auxiliary_loss_mlp": 0.01035699, + "balance_loss_clip": 1.02973533, + "balance_loss_mlp": 1.02340853, + "epoch": 0.38893732150909366, + "flos": 21942425450880.0, + "grad_norm": 2.6068469988243788, + "language_loss": 0.84277999, + "learning_rate": 2.7940134019666383e-06, + "loss": 0.86368728, + "num_input_tokens_seen": 138844115, + "step": 6469, + "time_per_iteration": 2.7096588611602783 + }, + { + "auxiliary_loss_clip": 0.01040049, + "auxiliary_loss_mlp": 0.01036863, + "balance_loss_clip": 1.02776599, + "balance_loss_mlp": 1.0218364, + "epoch": 0.3889974447617616, + "flos": 24276367468800.0, + "grad_norm": 1.6262514070136578, + "language_loss": 0.74778426, + "learning_rate": 2.793655932864273e-06, + "loss": 0.76855338, + "num_input_tokens_seen": 138860860, + "step": 6470, + "time_per_iteration": 2.6405746936798096 + }, + { + "auxiliary_loss_clip": 0.01050986, + "auxiliary_loss_mlp": 0.00747782, + "balance_loss_clip": 1.03078508, + "balance_loss_mlp": 1.00005865, + "epoch": 0.3890575680144296, + "flos": 25667116208640.0, + "grad_norm": 1.5765029639914916, + "language_loss": 0.7449435, + "learning_rate": 2.7932984336663953e-06, + "loss": 0.76293123, + "num_input_tokens_seen": 138881910, + "step": 6471, + "time_per_iteration": 2.6840922832489014 + }, + { + "auxiliary_loss_clip": 0.0102208, + "auxiliary_loss_mlp": 0.01042038, + "balance_loss_clip": 1.02464414, + "balance_loss_mlp": 1.02731586, + "epoch": 0.38911769126709755, + "flos": 22855274714880.0, + "grad_norm": 1.603839546065824, + "language_loss": 0.6820876, + "learning_rate": 2.792940904386562e-06, + "loss": 0.70272875, + "num_input_tokens_seen": 138900975, + "step": 6472, + "time_per_iteration": 2.677400588989258 + }, + { + "auxiliary_loss_clip": 0.01051446, + "auxiliary_loss_mlp": 0.01043333, + "balance_loss_clip": 1.030599, + "balance_loss_mlp": 1.02996981, + "epoch": 0.3891778145197655, + "flos": 25447522412160.0, + "grad_norm": 1.804503441610956, + "language_loss": 0.76612353, + "learning_rate": 2.7925833450383293e-06, + "loss": 0.78707135, + "num_input_tokens_seen": 138920795, + "step": 6473, + "time_per_iteration": 2.720318078994751 + }, + { + "auxiliary_loss_clip": 0.0106086, + "auxiliary_loss_mlp": 0.01041519, + "balance_loss_clip": 1.03165507, + "balance_loss_mlp": 1.02716625, + "epoch": 0.3892379377724335, + "flos": 14027965614720.0, + "grad_norm": 2.3209770577784505, + "language_loss": 0.705158, + "learning_rate": 2.792225755635257e-06, + "loss": 0.72618181, + "num_input_tokens_seen": 138938770, + "step": 6474, + "time_per_iteration": 2.6710381507873535 + }, + { + "auxiliary_loss_clip": 0.0108034, + "auxiliary_loss_mlp": 0.01037197, + "balance_loss_clip": 1.03055668, + "balance_loss_mlp": 1.02417374, + "epoch": 0.38929806102510145, + "flos": 20157449967360.0, + "grad_norm": 1.6427787295871241, + "language_loss": 0.68889678, + "learning_rate": 2.7918681361909046e-06, + "loss": 0.71007216, + "num_input_tokens_seen": 138958880, + "step": 6475, + "time_per_iteration": 2.6756863594055176 + }, + { + "auxiliary_loss_clip": 0.01054639, + "auxiliary_loss_mlp": 0.01051797, + "balance_loss_clip": 1.02850103, + "balance_loss_mlp": 1.03582287, + "epoch": 0.3893581842777694, + "flos": 22163958581760.0, + "grad_norm": 1.7829211480170586, + "language_loss": 0.75545031, + "learning_rate": 2.7915104867188332e-06, + "loss": 0.77651465, + "num_input_tokens_seen": 138977240, + "step": 6476, + "time_per_iteration": 2.7672715187072754 + }, + { + "auxiliary_loss_clip": 0.00993975, + "auxiliary_loss_mlp": 0.01001712, + "balance_loss_clip": 1.00359631, + "balance_loss_mlp": 1.00026977, + "epoch": 0.3894183075304374, + "flos": 67301877392640.0, + "grad_norm": 0.772952565329658, + "language_loss": 0.58233827, + "learning_rate": 2.7911528072326055e-06, + "loss": 0.60229516, + "num_input_tokens_seen": 139039035, + "step": 6477, + "time_per_iteration": 3.2133028507232666 + }, + { + "auxiliary_loss_clip": 0.01039336, + "auxiliary_loss_mlp": 0.01036192, + "balance_loss_clip": 1.02862549, + "balance_loss_mlp": 1.02132058, + "epoch": 0.38947843078310534, + "flos": 18547502480640.0, + "grad_norm": 1.8209568575836101, + "language_loss": 0.78020859, + "learning_rate": 2.7907950977457832e-06, + "loss": 0.80096382, + "num_input_tokens_seen": 139055560, + "step": 6478, + "time_per_iteration": 2.738147020339966 + }, + { + "auxiliary_loss_clip": 0.0106128, + "auxiliary_loss_mlp": 0.01039134, + "balance_loss_clip": 1.02680397, + "balance_loss_mlp": 1.02569282, + "epoch": 0.3895385540357733, + "flos": 14605875532800.0, + "grad_norm": 2.1077653215107133, + "language_loss": 0.82341057, + "learning_rate": 2.7904373582719317e-06, + "loss": 0.84441471, + "num_input_tokens_seen": 139071865, + "step": 6479, + "time_per_iteration": 2.6395153999328613 + }, + { + "auxiliary_loss_clip": 0.01078403, + "auxiliary_loss_mlp": 0.01034771, + "balance_loss_clip": 1.03008389, + "balance_loss_mlp": 1.02080524, + "epoch": 0.38959867728844133, + "flos": 19975203336960.0, + "grad_norm": 1.6062585441968646, + "language_loss": 0.80168498, + "learning_rate": 2.790079588824617e-06, + "loss": 0.82281667, + "num_input_tokens_seen": 139089640, + "step": 6480, + "time_per_iteration": 2.6682887077331543 + }, + { + "auxiliary_loss_clip": 0.01047832, + "auxiliary_loss_mlp": 0.0102908, + "balance_loss_clip": 1.02561426, + "balance_loss_mlp": 1.01605093, + "epoch": 0.3896588005411093, + "flos": 22672130244480.0, + "grad_norm": 1.7833210863733198, + "language_loss": 0.83208597, + "learning_rate": 2.7897217894174038e-06, + "loss": 0.85285509, + "num_input_tokens_seen": 139109365, + "step": 6481, + "time_per_iteration": 2.6464688777923584 + }, + { + "auxiliary_loss_clip": 0.01049238, + "auxiliary_loss_mlp": 0.01036271, + "balance_loss_clip": 1.03011954, + "balance_loss_mlp": 1.02319384, + "epoch": 0.38971892379377726, + "flos": 20996035862400.0, + "grad_norm": 2.905681021608683, + "language_loss": 0.75497419, + "learning_rate": 2.789363960063863e-06, + "loss": 0.77582926, + "num_input_tokens_seen": 139128260, + "step": 6482, + "time_per_iteration": 2.6843953132629395 + }, + { + "auxiliary_loss_clip": 0.01048152, + "auxiliary_loss_mlp": 0.01032388, + "balance_loss_clip": 1.02901196, + "balance_loss_mlp": 1.01898861, + "epoch": 0.3897790470464452, + "flos": 22528487756160.0, + "grad_norm": 3.8634635772867547, + "language_loss": 0.78901458, + "learning_rate": 2.78900610077756e-06, + "loss": 0.80982006, + "num_input_tokens_seen": 139147315, + "step": 6483, + "time_per_iteration": 2.7356128692626953 + }, + { + "auxiliary_loss_clip": 0.0106785, + "auxiliary_loss_mlp": 0.01028879, + "balance_loss_clip": 1.02828991, + "balance_loss_mlp": 1.01342988, + "epoch": 0.3898391702991132, + "flos": 26209905603840.0, + "grad_norm": 1.4592676697291083, + "language_loss": 0.79990089, + "learning_rate": 2.788648211572067e-06, + "loss": 0.82086819, + "num_input_tokens_seen": 139167270, + "step": 6484, + "time_per_iteration": 2.6967527866363525 + }, + { + "auxiliary_loss_clip": 0.01061585, + "auxiliary_loss_mlp": 0.01051403, + "balance_loss_clip": 1.02938676, + "balance_loss_mlp": 1.03546476, + "epoch": 0.38989929355178116, + "flos": 21065558636160.0, + "grad_norm": 1.60782304324815, + "language_loss": 0.78156656, + "learning_rate": 2.7882902924609557e-06, + "loss": 0.80269647, + "num_input_tokens_seen": 139185970, + "step": 6485, + "time_per_iteration": 2.5701565742492676 + }, + { + "auxiliary_loss_clip": 0.01032525, + "auxiliary_loss_mlp": 0.01036131, + "balance_loss_clip": 1.02871382, + "balance_loss_mlp": 1.02217209, + "epoch": 0.3899594168044491, + "flos": 25484115392640.0, + "grad_norm": 3.004718429430567, + "language_loss": 0.85400391, + "learning_rate": 2.7879323434577965e-06, + "loss": 0.87469047, + "num_input_tokens_seen": 139203730, + "step": 6486, + "time_per_iteration": 2.717524766921997 + }, + { + "auxiliary_loss_clip": 0.0105654, + "auxiliary_loss_mlp": 0.01034089, + "balance_loss_clip": 1.02877569, + "balance_loss_mlp": 1.02043915, + "epoch": 0.3900195400571171, + "flos": 31139363456640.0, + "grad_norm": 2.0066364753674137, + "language_loss": 0.85509038, + "learning_rate": 2.7875743645761645e-06, + "loss": 0.87599671, + "num_input_tokens_seen": 139222560, + "step": 6487, + "time_per_iteration": 2.7568531036376953 + }, + { + "auxiliary_loss_clip": 0.0105093, + "auxiliary_loss_mlp": 0.01034503, + "balance_loss_clip": 1.02758873, + "balance_loss_mlp": 1.02038884, + "epoch": 0.39007966330978505, + "flos": 20229917656320.0, + "grad_norm": 1.5880749955148619, + "language_loss": 0.72966266, + "learning_rate": 2.787216355829633e-06, + "loss": 0.75051701, + "num_input_tokens_seen": 139242165, + "step": 6488, + "time_per_iteration": 2.6470658779144287 + }, + { + "auxiliary_loss_clip": 0.0104121, + "auxiliary_loss_mlp": 0.01037538, + "balance_loss_clip": 1.02732003, + "balance_loss_mlp": 1.02206469, + "epoch": 0.390139786562453, + "flos": 22528739151360.0, + "grad_norm": 1.6986042543877657, + "language_loss": 0.68522823, + "learning_rate": 2.786858317231779e-06, + "loss": 0.70601571, + "num_input_tokens_seen": 139262525, + "step": 6489, + "time_per_iteration": 2.6720058917999268 + }, + { + "auxiliary_loss_clip": 0.01055924, + "auxiliary_loss_mlp": 0.01041863, + "balance_loss_clip": 1.02973366, + "balance_loss_mlp": 1.02866101, + "epoch": 0.390199909815121, + "flos": 26432911192320.0, + "grad_norm": 1.6257479200287084, + "language_loss": 0.80638874, + "learning_rate": 2.7865002487961788e-06, + "loss": 0.82736665, + "num_input_tokens_seen": 139282835, + "step": 6490, + "time_per_iteration": 2.620354652404785 + }, + { + "auxiliary_loss_clip": 0.01070247, + "auxiliary_loss_mlp": 0.01033274, + "balance_loss_clip": 1.02995467, + "balance_loss_mlp": 1.01908851, + "epoch": 0.39026003306778895, + "flos": 17274577328640.0, + "grad_norm": 2.233438389992761, + "language_loss": 0.89035654, + "learning_rate": 2.7861421505364104e-06, + "loss": 0.91139185, + "num_input_tokens_seen": 139299490, + "step": 6491, + "time_per_iteration": 2.5532617568969727 + }, + { + "auxiliary_loss_clip": 0.01040615, + "auxiliary_loss_mlp": 0.01042997, + "balance_loss_clip": 1.02775645, + "balance_loss_mlp": 1.02761865, + "epoch": 0.3903201563204569, + "flos": 24532841554560.0, + "grad_norm": 2.7989275006478938, + "language_loss": 0.78843415, + "learning_rate": 2.7857840224660523e-06, + "loss": 0.80927026, + "num_input_tokens_seen": 139317865, + "step": 6492, + "time_per_iteration": 2.705451011657715 + }, + { + "auxiliary_loss_clip": 0.0105312, + "auxiliary_loss_mlp": 0.01039348, + "balance_loss_clip": 1.02728939, + "balance_loss_mlp": 1.02590096, + "epoch": 0.39038027957312493, + "flos": 23767944410880.0, + "grad_norm": 1.6454148029851472, + "language_loss": 0.74593699, + "learning_rate": 2.7854258645986857e-06, + "loss": 0.76686162, + "num_input_tokens_seen": 139339840, + "step": 6493, + "time_per_iteration": 2.7764949798583984 + }, + { + "auxiliary_loss_clip": 0.01040356, + "auxiliary_loss_mlp": 0.01039884, + "balance_loss_clip": 1.03056681, + "balance_loss_mlp": 1.02553105, + "epoch": 0.3904404028257929, + "flos": 14100612871680.0, + "grad_norm": 1.9834788512579207, + "language_loss": 0.76490957, + "learning_rate": 2.7850676769478916e-06, + "loss": 0.78571194, + "num_input_tokens_seen": 139357555, + "step": 6494, + "time_per_iteration": 2.687811851501465 + }, + { + "auxiliary_loss_clip": 0.01075885, + "auxiliary_loss_mlp": 0.01046006, + "balance_loss_clip": 1.03038621, + "balance_loss_mlp": 1.03038955, + "epoch": 0.39050052607846086, + "flos": 16910048154240.0, + "grad_norm": 1.6790609504981686, + "language_loss": 0.74312305, + "learning_rate": 2.7847094595272525e-06, + "loss": 0.76434195, + "num_input_tokens_seen": 139374455, + "step": 6495, + "time_per_iteration": 2.652390241622925 + }, + { + "auxiliary_loss_clip": 0.01081356, + "auxiliary_loss_mlp": 0.01040211, + "balance_loss_clip": 1.03149557, + "balance_loss_mlp": 1.02476192, + "epoch": 0.39056064933112883, + "flos": 25915761129600.0, + "grad_norm": 1.6763821502892435, + "language_loss": 0.68169224, + "learning_rate": 2.784351212350352e-06, + "loss": 0.70290792, + "num_input_tokens_seen": 139394770, + "step": 6496, + "time_per_iteration": 2.631645917892456 + }, + { + "auxiliary_loss_clip": 0.00984979, + "auxiliary_loss_mlp": 0.01001803, + "balance_loss_clip": 1.00546074, + "balance_loss_mlp": 1.00025308, + "epoch": 0.3906207725837968, + "flos": 60028421713920.0, + "grad_norm": 0.6635605267942113, + "language_loss": 0.53965676, + "learning_rate": 2.783992935430775e-06, + "loss": 0.55952454, + "num_input_tokens_seen": 139454760, + "step": 6497, + "time_per_iteration": 3.461648464202881 + }, + { + "auxiliary_loss_clip": 0.01047313, + "auxiliary_loss_mlp": 0.00747774, + "balance_loss_clip": 1.03103423, + "balance_loss_mlp": 1.00012016, + "epoch": 0.39068089583646476, + "flos": 21068683119360.0, + "grad_norm": 1.8101613531918699, + "language_loss": 0.69037974, + "learning_rate": 2.7836346287821068e-06, + "loss": 0.70833063, + "num_input_tokens_seen": 139472645, + "step": 6498, + "time_per_iteration": 2.7258188724517822 + }, + { + "auxiliary_loss_clip": 0.00986786, + "auxiliary_loss_mlp": 0.01003764, + "balance_loss_clip": 1.00637066, + "balance_loss_mlp": 1.00205898, + "epoch": 0.3907410190891327, + "flos": 70445677403520.0, + "grad_norm": 0.736329050433796, + "language_loss": 0.51762784, + "learning_rate": 2.783276292417936e-06, + "loss": 0.53753328, + "num_input_tokens_seen": 139536730, + "step": 6499, + "time_per_iteration": 3.285191774368286 + }, + { + "auxiliary_loss_clip": 0.01069576, + "auxiliary_loss_mlp": 0.01046178, + "balance_loss_clip": 1.02889121, + "balance_loss_mlp": 1.03127122, + "epoch": 0.3908011423418007, + "flos": 27962454084480.0, + "grad_norm": 1.7529046441993072, + "language_loss": 0.74218827, + "learning_rate": 2.7829179263518487e-06, + "loss": 0.76334578, + "num_input_tokens_seen": 139557540, + "step": 6500, + "time_per_iteration": 2.687408208847046 + }, + { + "auxiliary_loss_clip": 0.01069318, + "auxiliary_loss_mlp": 0.01035153, + "balance_loss_clip": 1.02944958, + "balance_loss_mlp": 1.02144361, + "epoch": 0.39086126559446865, + "flos": 24462097718400.0, + "grad_norm": 1.9974495021309766, + "language_loss": 0.68427563, + "learning_rate": 2.7825595305974354e-06, + "loss": 0.70532036, + "num_input_tokens_seen": 139576875, + "step": 6501, + "time_per_iteration": 2.622084617614746 + }, + { + "auxiliary_loss_clip": 0.01070245, + "auxiliary_loss_mlp": 0.01037418, + "balance_loss_clip": 1.03165209, + "balance_loss_mlp": 1.02384639, + "epoch": 0.3909213888471366, + "flos": 16941541403520.0, + "grad_norm": 2.355768556194631, + "language_loss": 0.78742534, + "learning_rate": 2.782201105168287e-06, + "loss": 0.80850196, + "num_input_tokens_seen": 139594295, + "step": 6502, + "time_per_iteration": 2.6008141040802 + }, + { + "auxiliary_loss_clip": 0.01060116, + "auxiliary_loss_mlp": 0.01039864, + "balance_loss_clip": 1.0325942, + "balance_loss_mlp": 1.02685821, + "epoch": 0.3909815120998046, + "flos": 29278400751360.0, + "grad_norm": 2.9602027311476484, + "language_loss": 0.80907428, + "learning_rate": 2.7818426500779932e-06, + "loss": 0.83007407, + "num_input_tokens_seen": 139614080, + "step": 6503, + "time_per_iteration": 4.34628963470459 + }, + { + "auxiliary_loss_clip": 0.01053305, + "auxiliary_loss_mlp": 0.01032576, + "balance_loss_clip": 1.02834451, + "balance_loss_mlp": 1.02013087, + "epoch": 0.39104163535247255, + "flos": 18951246328320.0, + "grad_norm": 1.7803658809225844, + "language_loss": 0.71314144, + "learning_rate": 2.7814841653401485e-06, + "loss": 0.73400033, + "num_input_tokens_seen": 139632755, + "step": 6504, + "time_per_iteration": 2.596327543258667 + }, + { + "auxiliary_loss_clip": 0.01076323, + "auxiliary_loss_mlp": 0.01032929, + "balance_loss_clip": 1.0289526, + "balance_loss_mlp": 1.01940513, + "epoch": 0.3911017586051405, + "flos": 26323347732480.0, + "grad_norm": 1.5330006457353378, + "language_loss": 0.83096004, + "learning_rate": 2.7811256509683454e-06, + "loss": 0.85205257, + "num_input_tokens_seen": 139654205, + "step": 6505, + "time_per_iteration": 4.228886365890503 + }, + { + "auxiliary_loss_clip": 0.01078718, + "auxiliary_loss_mlp": 0.01034249, + "balance_loss_clip": 1.03200018, + "balance_loss_mlp": 1.0198133, + "epoch": 0.3911618818578085, + "flos": 21835770992640.0, + "grad_norm": 1.8822829971521307, + "language_loss": 0.71212137, + "learning_rate": 2.7807671069761797e-06, + "loss": 0.73325109, + "num_input_tokens_seen": 139673595, + "step": 6506, + "time_per_iteration": 2.5422422885894775 + }, + { + "auxiliary_loss_clip": 0.01053288, + "auxiliary_loss_mlp": 0.01036714, + "balance_loss_clip": 1.02772188, + "balance_loss_mlp": 1.02459049, + "epoch": 0.3912220051104765, + "flos": 16359680989440.0, + "grad_norm": 2.0816320585925143, + "language_loss": 0.74985743, + "learning_rate": 2.7804085333772477e-06, + "loss": 0.77075744, + "num_input_tokens_seen": 139690565, + "step": 6507, + "time_per_iteration": 2.622040033340454 + }, + { + "auxiliary_loss_clip": 0.01014275, + "auxiliary_loss_mlp": 0.01002163, + "balance_loss_clip": 1.00499713, + "balance_loss_mlp": 1.00073266, + "epoch": 0.39128212836314447, + "flos": 71050986420480.0, + "grad_norm": 0.7608080187811954, + "language_loss": 0.56483823, + "learning_rate": 2.7800499301851446e-06, + "loss": 0.5850026, + "num_input_tokens_seen": 139749420, + "step": 6508, + "time_per_iteration": 3.2559947967529297 + }, + { + "auxiliary_loss_clip": 0.01067806, + "auxiliary_loss_mlp": 0.01036993, + "balance_loss_clip": 1.03042316, + "balance_loss_mlp": 1.0243448, + "epoch": 0.39134225161581243, + "flos": 20331975173760.0, + "grad_norm": 1.9416949948909736, + "language_loss": 0.76146585, + "learning_rate": 2.779691297413471e-06, + "loss": 0.7825138, + "num_input_tokens_seen": 139766265, + "step": 6509, + "time_per_iteration": 2.63840651512146 + }, + { + "auxiliary_loss_clip": 0.01044586, + "auxiliary_loss_mlp": 0.01044475, + "balance_loss_clip": 1.02650023, + "balance_loss_mlp": 1.02974057, + "epoch": 0.3914023748684804, + "flos": 17018390551680.0, + "grad_norm": 3.2164575434424325, + "language_loss": 0.83274853, + "learning_rate": 2.779332635075825e-06, + "loss": 0.85363919, + "num_input_tokens_seen": 139782400, + "step": 6510, + "time_per_iteration": 2.6433961391448975 + }, + { + "auxiliary_loss_clip": 0.01065963, + "auxiliary_loss_mlp": 0.01040087, + "balance_loss_clip": 1.02706993, + "balance_loss_mlp": 1.02673006, + "epoch": 0.39146249812114836, + "flos": 18405224709120.0, + "grad_norm": 1.7202121637542696, + "language_loss": 0.76895273, + "learning_rate": 2.7789739431858073e-06, + "loss": 0.79001331, + "num_input_tokens_seen": 139801435, + "step": 6511, + "time_per_iteration": 2.5834925174713135 + }, + { + "auxiliary_loss_clip": 0.00990517, + "auxiliary_loss_mlp": 0.01005009, + "balance_loss_clip": 1.00150776, + "balance_loss_mlp": 1.00350082, + "epoch": 0.3915226213738163, + "flos": 67637355442560.0, + "grad_norm": 0.7260431637565092, + "language_loss": 0.5772965, + "learning_rate": 2.7786152217570196e-06, + "loss": 0.59725177, + "num_input_tokens_seen": 139869700, + "step": 6512, + "time_per_iteration": 3.300577163696289 + }, + { + "auxiliary_loss_clip": 0.01081029, + "auxiliary_loss_mlp": 0.01032551, + "balance_loss_clip": 1.03144944, + "balance_loss_mlp": 1.01839495, + "epoch": 0.3915827446264843, + "flos": 26359330181760.0, + "grad_norm": 1.779248960328768, + "language_loss": 0.6962775, + "learning_rate": 2.7782564708030647e-06, + "loss": 0.71741331, + "num_input_tokens_seen": 139890140, + "step": 6513, + "time_per_iteration": 2.705587863922119 + }, + { + "auxiliary_loss_clip": 0.01045127, + "auxiliary_loss_mlp": 0.01040753, + "balance_loss_clip": 1.03220165, + "balance_loss_mlp": 1.02613211, + "epoch": 0.39164286787915226, + "flos": 21943897908480.0, + "grad_norm": 2.2972527983988713, + "language_loss": 0.75935733, + "learning_rate": 2.7778976903375464e-06, + "loss": 0.78021616, + "num_input_tokens_seen": 139908020, + "step": 6514, + "time_per_iteration": 4.367039918899536 + }, + { + "auxiliary_loss_clip": 0.01046049, + "auxiliary_loss_mlp": 0.01036942, + "balance_loss_clip": 1.03065586, + "balance_loss_mlp": 1.02397764, + "epoch": 0.3917029911318202, + "flos": 16399829416320.0, + "grad_norm": 1.6140526546422995, + "language_loss": 0.76722848, + "learning_rate": 2.7775388803740693e-06, + "loss": 0.7880584, + "num_input_tokens_seen": 139926180, + "step": 6515, + "time_per_iteration": 4.241411447525024 + }, + { + "auxiliary_loss_clip": 0.01032756, + "auxiliary_loss_mlp": 0.01045413, + "balance_loss_clip": 1.02483869, + "balance_loss_mlp": 1.03155518, + "epoch": 0.3917631143844882, + "flos": 26211701283840.0, + "grad_norm": 1.3728319886273666, + "language_loss": 0.79816687, + "learning_rate": 2.7771800409262406e-06, + "loss": 0.81894857, + "num_input_tokens_seen": 139947420, + "step": 6516, + "time_per_iteration": 2.7651889324188232 + }, + { + "auxiliary_loss_clip": 0.01044502, + "auxiliary_loss_mlp": 0.01038226, + "balance_loss_clip": 1.03277087, + "balance_loss_mlp": 1.0247612, + "epoch": 0.39182323763715615, + "flos": 18548364407040.0, + "grad_norm": 1.7712487537858495, + "language_loss": 0.69710279, + "learning_rate": 2.7768211720076665e-06, + "loss": 0.71793002, + "num_input_tokens_seen": 139965800, + "step": 6517, + "time_per_iteration": 2.733193874359131 + }, + { + "auxiliary_loss_clip": 0.01032566, + "auxiliary_loss_mlp": 0.0104669, + "balance_loss_clip": 1.0233258, + "balance_loss_mlp": 1.03262937, + "epoch": 0.3918833608898241, + "flos": 34313543395200.0, + "grad_norm": 1.7377506071864977, + "language_loss": 0.72194791, + "learning_rate": 2.776462273631956e-06, + "loss": 0.74274051, + "num_input_tokens_seen": 139988140, + "step": 6518, + "time_per_iteration": 2.8079020977020264 + }, + { + "auxiliary_loss_clip": 0.01070055, + "auxiliary_loss_mlp": 0.01032883, + "balance_loss_clip": 1.0307548, + "balance_loss_mlp": 1.01957917, + "epoch": 0.3919434841424921, + "flos": 36939582812160.0, + "grad_norm": 1.7026865756806107, + "language_loss": 0.61659324, + "learning_rate": 2.7761033458127177e-06, + "loss": 0.63762259, + "num_input_tokens_seen": 140010060, + "step": 6519, + "time_per_iteration": 2.7854206562042236 + }, + { + "auxiliary_loss_clip": 0.01083112, + "auxiliary_loss_mlp": 0.01041738, + "balance_loss_clip": 1.03078246, + "balance_loss_mlp": 1.0268544, + "epoch": 0.3920036073951601, + "flos": 23508956373120.0, + "grad_norm": 2.6375793353436445, + "language_loss": 0.67239994, + "learning_rate": 2.775744388563563e-06, + "loss": 0.6936484, + "num_input_tokens_seen": 140029400, + "step": 6520, + "time_per_iteration": 2.6550278663635254 + }, + { + "auxiliary_loss_clip": 0.01075416, + "auxiliary_loss_mlp": 0.01036678, + "balance_loss_clip": 1.02914715, + "balance_loss_mlp": 1.023386, + "epoch": 0.39206373064782807, + "flos": 18406086635520.0, + "grad_norm": 2.807422443120173, + "language_loss": 0.78729397, + "learning_rate": 2.775385401898104e-06, + "loss": 0.80841488, + "num_input_tokens_seen": 140048940, + "step": 6521, + "time_per_iteration": 2.6448793411254883 + }, + { + "auxiliary_loss_clip": 0.01072711, + "auxiliary_loss_mlp": 0.01037575, + "balance_loss_clip": 1.03070974, + "balance_loss_mlp": 1.02147007, + "epoch": 0.39212385390049603, + "flos": 12313051608960.0, + "grad_norm": 2.137661481174609, + "language_loss": 0.69150639, + "learning_rate": 2.775026385829952e-06, + "loss": 0.71260923, + "num_input_tokens_seen": 140066380, + "step": 6522, + "time_per_iteration": 2.6109471321105957 + }, + { + "auxiliary_loss_clip": 0.01058385, + "auxiliary_loss_mlp": 0.01039116, + "balance_loss_clip": 1.02835107, + "balance_loss_mlp": 1.0256393, + "epoch": 0.392183977153164, + "flos": 19719160214400.0, + "grad_norm": 2.2248146654860097, + "language_loss": 0.76440269, + "learning_rate": 2.774667340372722e-06, + "loss": 0.78537774, + "num_input_tokens_seen": 140085275, + "step": 6523, + "time_per_iteration": 2.6787607669830322 + }, + { + "auxiliary_loss_clip": 0.01055139, + "auxiliary_loss_mlp": 0.01041367, + "balance_loss_clip": 1.02664006, + "balance_loss_mlp": 1.02777672, + "epoch": 0.39224410040583196, + "flos": 33144902403840.0, + "grad_norm": 2.322019773940644, + "language_loss": 0.61958343, + "learning_rate": 2.7743082655400293e-06, + "loss": 0.64054847, + "num_input_tokens_seen": 140105105, + "step": 6524, + "time_per_iteration": 2.831129789352417 + }, + { + "auxiliary_loss_clip": 0.01077831, + "auxiliary_loss_mlp": 0.0103692, + "balance_loss_clip": 1.02861214, + "balance_loss_mlp": 1.02247787, + "epoch": 0.39230422365849993, + "flos": 27782434097280.0, + "grad_norm": 1.720499829030504, + "language_loss": 0.73874283, + "learning_rate": 2.773949161345489e-06, + "loss": 0.75989032, + "num_input_tokens_seen": 140125645, + "step": 6525, + "time_per_iteration": 2.666090726852417 + }, + { + "auxiliary_loss_clip": 0.01058785, + "auxiliary_loss_mlp": 0.01035648, + "balance_loss_clip": 1.02947283, + "balance_loss_mlp": 1.02297044, + "epoch": 0.3923643469111679, + "flos": 17931634865280.0, + "grad_norm": 1.8516882064537066, + "language_loss": 0.80871558, + "learning_rate": 2.773590027802719e-06, + "loss": 0.82965994, + "num_input_tokens_seen": 140141925, + "step": 6526, + "time_per_iteration": 2.588329792022705 + }, + { + "auxiliary_loss_clip": 0.0106602, + "auxiliary_loss_mlp": 0.0103588, + "balance_loss_clip": 1.02778959, + "balance_loss_mlp": 1.02305889, + "epoch": 0.39242447016383586, + "flos": 24059539019520.0, + "grad_norm": 1.6089219712737246, + "language_loss": 0.7013917, + "learning_rate": 2.7732308649253383e-06, + "loss": 0.72241068, + "num_input_tokens_seen": 140160965, + "step": 6527, + "time_per_iteration": 2.569084405899048 + }, + { + "auxiliary_loss_clip": 0.01045832, + "auxiliary_loss_mlp": 0.01033131, + "balance_loss_clip": 1.03014565, + "balance_loss_mlp": 1.01961303, + "epoch": 0.3924845934165038, + "flos": 10664069016960.0, + "grad_norm": 2.528528432034182, + "language_loss": 0.8185426, + "learning_rate": 2.772871672726965e-06, + "loss": 0.83933222, + "num_input_tokens_seen": 140177780, + "step": 6528, + "time_per_iteration": 2.600435733795166 + }, + { + "auxiliary_loss_clip": 0.01057584, + "auxiliary_loss_mlp": 0.01036619, + "balance_loss_clip": 1.03103697, + "balance_loss_mlp": 1.02345836, + "epoch": 0.3925447166691718, + "flos": 31245910174080.0, + "grad_norm": 1.8027696099785446, + "language_loss": 0.68525612, + "learning_rate": 2.7725124512212205e-06, + "loss": 0.70619816, + "num_input_tokens_seen": 140201660, + "step": 6529, + "time_per_iteration": 2.681307792663574 + }, + { + "auxiliary_loss_clip": 0.01053606, + "auxiliary_loss_mlp": 0.01038474, + "balance_loss_clip": 1.02640569, + "balance_loss_mlp": 1.02429414, + "epoch": 0.39260483992183975, + "flos": 29415040087680.0, + "grad_norm": 3.911440837603998, + "language_loss": 0.80437738, + "learning_rate": 2.7721532004217267e-06, + "loss": 0.82529813, + "num_input_tokens_seen": 140218585, + "step": 6530, + "time_per_iteration": 2.6560256481170654 + }, + { + "auxiliary_loss_clip": 0.0106592, + "auxiliary_loss_mlp": 0.01034107, + "balance_loss_clip": 1.02763224, + "balance_loss_mlp": 1.02109587, + "epoch": 0.3926649631745077, + "flos": 22857788666880.0, + "grad_norm": 1.4447606623570202, + "language_loss": 0.75617635, + "learning_rate": 2.7717939203421063e-06, + "loss": 0.77717662, + "num_input_tokens_seen": 140239905, + "step": 6531, + "time_per_iteration": 2.585514545440674 + }, + { + "auxiliary_loss_clip": 0.01011797, + "auxiliary_loss_mlp": 0.0100412, + "balance_loss_clip": 1.00247812, + "balance_loss_mlp": 1.00267756, + "epoch": 0.3927250864271757, + "flos": 63893881872000.0, + "grad_norm": 0.8143046847117394, + "language_loss": 0.60339433, + "learning_rate": 2.7714346109959822e-06, + "loss": 0.62355345, + "num_input_tokens_seen": 140293820, + "step": 6532, + "time_per_iteration": 3.059011936187744 + }, + { + "auxiliary_loss_clip": 0.00991044, + "auxiliary_loss_mlp": 0.01010047, + "balance_loss_clip": 1.0018363, + "balance_loss_mlp": 1.00842559, + "epoch": 0.3927852096798437, + "flos": 68909741890560.0, + "grad_norm": 0.7828511967076206, + "language_loss": 0.55498445, + "learning_rate": 2.771075272396981e-06, + "loss": 0.5749954, + "num_input_tokens_seen": 140360420, + "step": 6533, + "time_per_iteration": 3.2570817470550537 + }, + { + "auxiliary_loss_clip": 0.01056828, + "auxiliary_loss_mlp": 0.01036718, + "balance_loss_clip": 1.03028715, + "balance_loss_mlp": 1.02332449, + "epoch": 0.39284533293251167, + "flos": 29715972232320.0, + "grad_norm": 2.13344783434988, + "language_loss": 0.75878072, + "learning_rate": 2.7707159045587284e-06, + "loss": 0.77971619, + "num_input_tokens_seen": 140381950, + "step": 6534, + "time_per_iteration": 2.6692042350769043 + }, + { + "auxiliary_loss_clip": 0.01063346, + "auxiliary_loss_mlp": 0.01039078, + "balance_loss_clip": 1.02765441, + "balance_loss_mlp": 1.02399254, + "epoch": 0.39290545618517964, + "flos": 18552027594240.0, + "grad_norm": 1.988635796010378, + "language_loss": 0.78192234, + "learning_rate": 2.770356507494851e-06, + "loss": 0.80294663, + "num_input_tokens_seen": 140399410, + "step": 6535, + "time_per_iteration": 2.649031639099121 + }, + { + "auxiliary_loss_clip": 0.01046655, + "auxiliary_loss_mlp": 0.01032429, + "balance_loss_clip": 1.03122759, + "balance_loss_mlp": 1.02002573, + "epoch": 0.3929655794378476, + "flos": 26249479413120.0, + "grad_norm": 1.8191170006652975, + "language_loss": 0.68162608, + "learning_rate": 2.769997081218978e-06, + "loss": 0.7024169, + "num_input_tokens_seen": 140419055, + "step": 6536, + "time_per_iteration": 2.7387375831604004 + }, + { + "auxiliary_loss_clip": 0.01050807, + "auxiliary_loss_mlp": 0.0103186, + "balance_loss_clip": 1.02703047, + "balance_loss_mlp": 1.01975405, + "epoch": 0.39302570269051557, + "flos": 29277933874560.0, + "grad_norm": 2.160787099143129, + "language_loss": 0.68925583, + "learning_rate": 2.769637625744738e-06, + "loss": 0.71008247, + "num_input_tokens_seen": 140438800, + "step": 6537, + "time_per_iteration": 2.771695375442505 + }, + { + "auxiliary_loss_clip": 0.01070796, + "auxiliary_loss_mlp": 0.01033533, + "balance_loss_clip": 1.03116369, + "balance_loss_mlp": 1.01993108, + "epoch": 0.39308582594318353, + "flos": 17347440067200.0, + "grad_norm": 1.6487729427946678, + "language_loss": 0.78480726, + "learning_rate": 2.769278141085763e-06, + "loss": 0.80585051, + "num_input_tokens_seen": 140456880, + "step": 6538, + "time_per_iteration": 2.6156811714172363 + }, + { + "auxiliary_loss_clip": 0.00969165, + "auxiliary_loss_mlp": 0.0100992, + "balance_loss_clip": 1.00941753, + "balance_loss_mlp": 1.00848961, + "epoch": 0.3931459491958515, + "flos": 61007094650880.0, + "grad_norm": 1.0345561225490891, + "language_loss": 0.61787486, + "learning_rate": 2.768918627255683e-06, + "loss": 0.63766575, + "num_input_tokens_seen": 140507510, + "step": 6539, + "time_per_iteration": 3.0542569160461426 + }, + { + "auxiliary_loss_clip": 0.01056708, + "auxiliary_loss_mlp": 0.01033034, + "balance_loss_clip": 1.03003836, + "balance_loss_mlp": 1.01923525, + "epoch": 0.39320607244851946, + "flos": 39016009249920.0, + "grad_norm": 1.9681234321013235, + "language_loss": 0.68173909, + "learning_rate": 2.7685590842681315e-06, + "loss": 0.70263648, + "num_input_tokens_seen": 140528740, + "step": 6540, + "time_per_iteration": 2.897343397140503 + }, + { + "auxiliary_loss_clip": 0.01054693, + "auxiliary_loss_mlp": 0.01033301, + "balance_loss_clip": 1.02746975, + "balance_loss_mlp": 1.02087879, + "epoch": 0.3932661957011874, + "flos": 24679752180480.0, + "grad_norm": 1.7805127345092913, + "language_loss": 0.72662532, + "learning_rate": 2.7681995121367433e-06, + "loss": 0.74750525, + "num_input_tokens_seen": 140547560, + "step": 6541, + "time_per_iteration": 2.8008925914764404 + }, + { + "auxiliary_loss_clip": 0.01010163, + "auxiliary_loss_mlp": 0.0100751, + "balance_loss_clip": 1.00096798, + "balance_loss_mlp": 1.00587702, + "epoch": 0.3933263189538554, + "flos": 70096552185600.0, + "grad_norm": 0.8277553967705921, + "language_loss": 0.60390812, + "learning_rate": 2.7678399108751516e-06, + "loss": 0.62408483, + "num_input_tokens_seen": 140601175, + "step": 6542, + "time_per_iteration": 2.9458677768707275 + }, + { + "auxiliary_loss_clip": 0.01066814, + "auxiliary_loss_mlp": 0.01031416, + "balance_loss_clip": 1.02934575, + "balance_loss_mlp": 1.01894677, + "epoch": 0.39338644220652336, + "flos": 22929071207040.0, + "grad_norm": 1.6703119784387506, + "language_loss": 0.82446349, + "learning_rate": 2.7674802804969947e-06, + "loss": 0.84544581, + "num_input_tokens_seen": 140622200, + "step": 6543, + "time_per_iteration": 2.622912883758545 + }, + { + "auxiliary_loss_clip": 0.01051724, + "auxiliary_loss_mlp": 0.01036483, + "balance_loss_clip": 1.02491212, + "balance_loss_mlp": 1.023072, + "epoch": 0.3934465654591913, + "flos": 30848163897600.0, + "grad_norm": 1.6385109852781332, + "language_loss": 0.68859422, + "learning_rate": 2.767120621015908e-06, + "loss": 0.70947635, + "num_input_tokens_seen": 140643125, + "step": 6544, + "time_per_iteration": 2.68377947807312 + }, + { + "auxiliary_loss_clip": 0.01059479, + "auxiliary_loss_mlp": 0.01044662, + "balance_loss_clip": 1.028126, + "balance_loss_mlp": 1.03008246, + "epoch": 0.3935066887118593, + "flos": 29236528471680.0, + "grad_norm": 1.9997627247847822, + "language_loss": 0.75144458, + "learning_rate": 2.76676093244553e-06, + "loss": 0.77248597, + "num_input_tokens_seen": 140662500, + "step": 6545, + "time_per_iteration": 2.6949374675750732 + }, + { + "auxiliary_loss_clip": 0.01040807, + "auxiliary_loss_mlp": 0.01032866, + "balance_loss_clip": 1.02913713, + "balance_loss_mlp": 1.02178586, + "epoch": 0.3935668119645273, + "flos": 19135288638720.0, + "grad_norm": 1.381125918576341, + "language_loss": 0.7488125, + "learning_rate": 2.7664012147995015e-06, + "loss": 0.76954925, + "num_input_tokens_seen": 140681960, + "step": 6546, + "time_per_iteration": 2.7187211513519287 + }, + { + "auxiliary_loss_clip": 0.010619, + "auxiliary_loss_mlp": 0.01037172, + "balance_loss_clip": 1.03138375, + "balance_loss_mlp": 1.02334392, + "epoch": 0.3936269352171953, + "flos": 18516116972160.0, + "grad_norm": 1.6036402423676632, + "language_loss": 0.81345218, + "learning_rate": 2.7660414680914617e-06, + "loss": 0.83444297, + "num_input_tokens_seen": 140699170, + "step": 6547, + "time_per_iteration": 2.6027395725250244 + }, + { + "auxiliary_loss_clip": 0.01059758, + "auxiliary_loss_mlp": 0.00747463, + "balance_loss_clip": 1.02637124, + "balance_loss_mlp": 1.00008464, + "epoch": 0.39368705846986324, + "flos": 15632813370240.0, + "grad_norm": 2.077396726708155, + "language_loss": 0.84660041, + "learning_rate": 2.7656816923350525e-06, + "loss": 0.86467266, + "num_input_tokens_seen": 140714920, + "step": 6548, + "time_per_iteration": 2.6430914402008057 + }, + { + "auxiliary_loss_clip": 0.01067106, + "auxiliary_loss_mlp": 0.00747371, + "balance_loss_clip": 1.03016663, + "balance_loss_mlp": 0.99999213, + "epoch": 0.3937471817225312, + "flos": 21325839563520.0, + "grad_norm": 1.554864461198093, + "language_loss": 0.72756648, + "learning_rate": 2.7653218875439174e-06, + "loss": 0.74571121, + "num_input_tokens_seen": 140734595, + "step": 6549, + "time_per_iteration": 2.666177749633789 + }, + { + "auxiliary_loss_clip": 0.01020652, + "auxiliary_loss_mlp": 0.01038542, + "balance_loss_clip": 1.02801979, + "balance_loss_mlp": 1.02373648, + "epoch": 0.39380730497519917, + "flos": 20776693461120.0, + "grad_norm": 1.765439341167977, + "language_loss": 0.77680552, + "learning_rate": 2.764962053731699e-06, + "loss": 0.79739749, + "num_input_tokens_seen": 140754050, + "step": 6550, + "time_per_iteration": 2.7716119289398193 + }, + { + "auxiliary_loss_clip": 0.01043706, + "auxiliary_loss_mlp": 0.01026314, + "balance_loss_clip": 1.02885747, + "balance_loss_mlp": 1.01351047, + "epoch": 0.39386742822786713, + "flos": 21609784575360.0, + "grad_norm": 1.6133066959785398, + "language_loss": 0.81581032, + "learning_rate": 2.7646021909120434e-06, + "loss": 0.83651054, + "num_input_tokens_seen": 140771440, + "step": 6551, + "time_per_iteration": 4.345073461532593 + }, + { + "auxiliary_loss_clip": 0.01067999, + "auxiliary_loss_mlp": 0.0104, + "balance_loss_clip": 1.0287739, + "balance_loss_mlp": 1.02680981, + "epoch": 0.3939275514805351, + "flos": 12414642249600.0, + "grad_norm": 2.9243898266527153, + "language_loss": 0.80296361, + "learning_rate": 2.764242299098596e-06, + "loss": 0.82404363, + "num_input_tokens_seen": 140786715, + "step": 6552, + "time_per_iteration": 4.226542711257935 + }, + { + "auxiliary_loss_clip": 0.01079464, + "auxiliary_loss_mlp": 0.01037535, + "balance_loss_clip": 1.0310204, + "balance_loss_mlp": 1.0246191, + "epoch": 0.39398767473320306, + "flos": 18552027594240.0, + "grad_norm": 1.6890022343664077, + "language_loss": 0.71177292, + "learning_rate": 2.763882378305003e-06, + "loss": 0.73294294, + "num_input_tokens_seen": 140804950, + "step": 6553, + "time_per_iteration": 2.625480890274048 + }, + { + "auxiliary_loss_clip": 0.0106758, + "auxiliary_loss_mlp": 0.00747415, + "balance_loss_clip": 1.03045988, + "balance_loss_mlp": 1.0000273, + "epoch": 0.39404779798587103, + "flos": 29308888419840.0, + "grad_norm": 1.9320496431753247, + "language_loss": 0.64194232, + "learning_rate": 2.7635224285449144e-06, + "loss": 0.66009229, + "num_input_tokens_seen": 140822800, + "step": 6554, + "time_per_iteration": 2.815072774887085 + }, + { + "auxiliary_loss_clip": 0.01055825, + "auxiliary_loss_mlp": 0.01036469, + "balance_loss_clip": 1.02990556, + "balance_loss_mlp": 1.02450657, + "epoch": 0.394107921238539, + "flos": 34897055834880.0, + "grad_norm": 1.9303927256441498, + "language_loss": 0.79743594, + "learning_rate": 2.7631624498319796e-06, + "loss": 0.81835884, + "num_input_tokens_seen": 140842940, + "step": 6555, + "time_per_iteration": 2.8014588356018066 + }, + { + "auxiliary_loss_clip": 0.01048074, + "auxiliary_loss_mlp": 0.01036468, + "balance_loss_clip": 1.02781296, + "balance_loss_mlp": 1.02185297, + "epoch": 0.39416804449120696, + "flos": 25081413039360.0, + "grad_norm": 1.691593699020956, + "language_loss": 0.72067195, + "learning_rate": 2.7628024421798473e-06, + "loss": 0.74151742, + "num_input_tokens_seen": 140863060, + "step": 6556, + "time_per_iteration": 2.6223835945129395 + }, + { + "auxiliary_loss_clip": 0.01076944, + "auxiliary_loss_mlp": 0.0103272, + "balance_loss_clip": 1.02858758, + "balance_loss_mlp": 1.02042925, + "epoch": 0.3942281677438749, + "flos": 32306639731200.0, + "grad_norm": 1.6984168333491514, + "language_loss": 0.83698416, + "learning_rate": 2.7624424056021705e-06, + "loss": 0.85808086, + "num_input_tokens_seen": 140883795, + "step": 6557, + "time_per_iteration": 2.8497488498687744 + }, + { + "auxiliary_loss_clip": 0.01067062, + "auxiliary_loss_mlp": 0.01034555, + "balance_loss_clip": 1.02946675, + "balance_loss_mlp": 1.02223492, + "epoch": 0.3942882909965429, + "flos": 24936621315840.0, + "grad_norm": 2.305220941758675, + "language_loss": 0.8056891, + "learning_rate": 2.7620823401126004e-06, + "loss": 0.82670534, + "num_input_tokens_seen": 140903055, + "step": 6558, + "time_per_iteration": 2.660276412963867 + }, + { + "auxiliary_loss_clip": 0.01076261, + "auxiliary_loss_mlp": 0.01033818, + "balance_loss_clip": 1.02989972, + "balance_loss_mlp": 1.02189064, + "epoch": 0.39434841424921085, + "flos": 11874797769600.0, + "grad_norm": 1.6551789404177704, + "language_loss": 0.71385312, + "learning_rate": 2.761722245724792e-06, + "loss": 0.73495394, + "num_input_tokens_seen": 140920685, + "step": 6559, + "time_per_iteration": 2.5922188758850098 + }, + { + "auxiliary_loss_clip": 0.01059404, + "auxiliary_loss_mlp": 0.01039386, + "balance_loss_clip": 1.02848387, + "balance_loss_mlp": 1.02512288, + "epoch": 0.3944085375018789, + "flos": 16361620323840.0, + "grad_norm": 2.486469967435072, + "language_loss": 0.80516279, + "learning_rate": 2.7613621224524003e-06, + "loss": 0.82615066, + "num_input_tokens_seen": 140937320, + "step": 6560, + "time_per_iteration": 2.687978744506836 + }, + { + "auxiliary_loss_clip": 0.01055353, + "auxiliary_loss_mlp": 0.01036372, + "balance_loss_clip": 1.02916527, + "balance_loss_mlp": 1.02254415, + "epoch": 0.39446866075454684, + "flos": 10633365866880.0, + "grad_norm": 2.792152344064107, + "language_loss": 0.82478774, + "learning_rate": 2.7610019703090803e-06, + "loss": 0.84570503, + "num_input_tokens_seen": 140954855, + "step": 6561, + "time_per_iteration": 4.314976215362549 + }, + { + "auxiliary_loss_clip": 0.01066248, + "auxiliary_loss_mlp": 0.01037663, + "balance_loss_clip": 1.02805507, + "balance_loss_mlp": 1.02521145, + "epoch": 0.3945287840072148, + "flos": 18187498419840.0, + "grad_norm": 4.274287180848111, + "language_loss": 0.80103838, + "learning_rate": 2.7606417893084887e-06, + "loss": 0.82207739, + "num_input_tokens_seen": 140973250, + "step": 6562, + "time_per_iteration": 2.6408088207244873 + }, + { + "auxiliary_loss_clip": 0.0104564, + "auxiliary_loss_mlp": 0.01035023, + "balance_loss_clip": 1.02547872, + "balance_loss_mlp": 1.02232707, + "epoch": 0.39458890725988277, + "flos": 23039891642880.0, + "grad_norm": 1.4023429516446775, + "language_loss": 0.81366628, + "learning_rate": 2.7602815794642853e-06, + "loss": 0.83447295, + "num_input_tokens_seen": 140993050, + "step": 6563, + "time_per_iteration": 4.362712383270264 + }, + { + "auxiliary_loss_clip": 0.01027572, + "auxiliary_loss_mlp": 0.01043074, + "balance_loss_clip": 1.02384639, + "balance_loss_mlp": 1.02736187, + "epoch": 0.39464903051255074, + "flos": 17159052211200.0, + "grad_norm": 6.142125198589638, + "language_loss": 0.69813502, + "learning_rate": 2.759921340790127e-06, + "loss": 0.71884143, + "num_input_tokens_seen": 141010815, + "step": 6564, + "time_per_iteration": 2.658205986022949 + }, + { + "auxiliary_loss_clip": 0.01062743, + "auxiliary_loss_mlp": 0.01037558, + "balance_loss_clip": 1.02716243, + "balance_loss_mlp": 1.02464795, + "epoch": 0.3947091537652187, + "flos": 15889000147200.0, + "grad_norm": 2.153442297528756, + "language_loss": 0.83186996, + "learning_rate": 2.759561073299676e-06, + "loss": 0.85287297, + "num_input_tokens_seen": 141028720, + "step": 6565, + "time_per_iteration": 2.623481035232544 + }, + { + "auxiliary_loss_clip": 0.01035192, + "auxiliary_loss_mlp": 0.01045585, + "balance_loss_clip": 1.02636576, + "balance_loss_mlp": 1.03209686, + "epoch": 0.39476927701788667, + "flos": 18545491319040.0, + "grad_norm": 1.871773210499955, + "language_loss": 0.83699083, + "learning_rate": 2.7592007770065937e-06, + "loss": 0.85779858, + "num_input_tokens_seen": 141046025, + "step": 6566, + "time_per_iteration": 2.6053435802459717 + }, + { + "auxiliary_loss_clip": 0.01081159, + "auxiliary_loss_mlp": 0.01036253, + "balance_loss_clip": 1.02911997, + "balance_loss_mlp": 1.0231936, + "epoch": 0.39482940027055463, + "flos": 22275712771200.0, + "grad_norm": 1.9476283653176312, + "language_loss": 0.77419901, + "learning_rate": 2.7588404519245403e-06, + "loss": 0.79537308, + "num_input_tokens_seen": 141066865, + "step": 6567, + "time_per_iteration": 2.577641487121582 + }, + { + "auxiliary_loss_clip": 0.01061778, + "auxiliary_loss_mlp": 0.0103706, + "balance_loss_clip": 1.02727914, + "balance_loss_mlp": 1.02501369, + "epoch": 0.3948895235232226, + "flos": 14757634494720.0, + "grad_norm": 1.7774517376995602, + "language_loss": 0.80288267, + "learning_rate": 2.758480098067182e-06, + "loss": 0.82387102, + "num_input_tokens_seen": 141084210, + "step": 6568, + "time_per_iteration": 2.6272494792938232 + }, + { + "auxiliary_loss_clip": 0.01049732, + "auxiliary_loss_mlp": 0.01037489, + "balance_loss_clip": 1.03205621, + "balance_loss_mlp": 1.02537155, + "epoch": 0.39494964677589056, + "flos": 22565763095040.0, + "grad_norm": 1.7226610732110799, + "language_loss": 0.84943843, + "learning_rate": 2.7581197154481816e-06, + "loss": 0.8703106, + "num_input_tokens_seen": 141103895, + "step": 6569, + "time_per_iteration": 2.665173292160034 + }, + { + "auxiliary_loss_clip": 0.01027054, + "auxiliary_loss_mlp": 0.01037992, + "balance_loss_clip": 1.0337435, + "balance_loss_mlp": 1.02500367, + "epoch": 0.3950097700285585, + "flos": 22963186149120.0, + "grad_norm": 1.8278532249638064, + "language_loss": 0.74455392, + "learning_rate": 2.7577593040812066e-06, + "loss": 0.76520431, + "num_input_tokens_seen": 141124000, + "step": 6570, + "time_per_iteration": 2.7944271564483643 + }, + { + "auxiliary_loss_clip": 0.01037482, + "auxiliary_loss_mlp": 0.01035549, + "balance_loss_clip": 1.02724123, + "balance_loss_mlp": 1.0224719, + "epoch": 0.3950698932812265, + "flos": 20595236929920.0, + "grad_norm": 2.3307282244132925, + "language_loss": 0.800376, + "learning_rate": 2.757398863979922e-06, + "loss": 0.82110631, + "num_input_tokens_seen": 141142535, + "step": 6571, + "time_per_iteration": 2.6424176692962646 + }, + { + "auxiliary_loss_clip": 0.01044871, + "auxiliary_loss_mlp": 0.01041813, + "balance_loss_clip": 1.02685046, + "balance_loss_mlp": 1.02874804, + "epoch": 0.39513001653389446, + "flos": 20375786787840.0, + "grad_norm": 1.6087623018900672, + "language_loss": 0.77849722, + "learning_rate": 2.757038395157997e-06, + "loss": 0.79936415, + "num_input_tokens_seen": 141161575, + "step": 6572, + "time_per_iteration": 2.599177598953247 + }, + { + "auxiliary_loss_clip": 0.01046915, + "auxiliary_loss_mlp": 0.01034513, + "balance_loss_clip": 1.02859902, + "balance_loss_mlp": 1.02117348, + "epoch": 0.3951901397865625, + "flos": 26463650256000.0, + "grad_norm": 1.701696607240659, + "language_loss": 0.7490139, + "learning_rate": 2.7566778976291002e-06, + "loss": 0.7698282, + "num_input_tokens_seen": 141181150, + "step": 6573, + "time_per_iteration": 2.720818519592285 + }, + { + "auxiliary_loss_clip": 0.01061153, + "auxiliary_loss_mlp": 0.01031243, + "balance_loss_clip": 1.02666724, + "balance_loss_mlp": 1.0198226, + "epoch": 0.39525026303923044, + "flos": 43838345767680.0, + "grad_norm": 1.3989524069446346, + "language_loss": 0.67726147, + "learning_rate": 2.7563173714069017e-06, + "loss": 0.69818544, + "num_input_tokens_seen": 141206310, + "step": 6574, + "time_per_iteration": 2.826510429382324 + }, + { + "auxiliary_loss_clip": 0.01012308, + "auxiliary_loss_mlp": 0.01038819, + "balance_loss_clip": 1.02316809, + "balance_loss_mlp": 1.0239954, + "epoch": 0.3953103862918984, + "flos": 18040803275520.0, + "grad_norm": 2.230735469811324, + "language_loss": 0.71593702, + "learning_rate": 2.755956816505072e-06, + "loss": 0.73644823, + "num_input_tokens_seen": 141223925, + "step": 6575, + "time_per_iteration": 2.7255992889404297 + }, + { + "auxiliary_loss_clip": 0.0104867, + "auxiliary_loss_mlp": 0.01040918, + "balance_loss_clip": 1.02668071, + "balance_loss_mlp": 1.02770329, + "epoch": 0.3953705095445664, + "flos": 16976015481600.0, + "grad_norm": 2.048202951307479, + "language_loss": 0.7385118, + "learning_rate": 2.7555962329372845e-06, + "loss": 0.75940776, + "num_input_tokens_seen": 141239010, + "step": 6576, + "time_per_iteration": 2.614071846008301 + }, + { + "auxiliary_loss_clip": 0.01077777, + "auxiliary_loss_mlp": 0.01038091, + "balance_loss_clip": 1.02935898, + "balance_loss_mlp": 1.02639067, + "epoch": 0.39543063279723434, + "flos": 17411144837760.0, + "grad_norm": 2.6078445493161904, + "language_loss": 0.83715904, + "learning_rate": 2.7552356207172124e-06, + "loss": 0.85831767, + "num_input_tokens_seen": 141252255, + "step": 6577, + "time_per_iteration": 2.560817003250122 + }, + { + "auxiliary_loss_clip": 0.01052199, + "auxiliary_loss_mlp": 0.01032212, + "balance_loss_clip": 1.02742314, + "balance_loss_mlp": 1.0191524, + "epoch": 0.3954907560499023, + "flos": 22784207656320.0, + "grad_norm": 2.2980574286548823, + "language_loss": 0.90283549, + "learning_rate": 2.75487497985853e-06, + "loss": 0.92367959, + "num_input_tokens_seen": 141269325, + "step": 6578, + "time_per_iteration": 2.6801421642303467 + }, + { + "auxiliary_loss_clip": 0.01061348, + "auxiliary_loss_mlp": 0.01032477, + "balance_loss_clip": 1.03074908, + "balance_loss_mlp": 1.01781392, + "epoch": 0.39555087930257027, + "flos": 21944400698880.0, + "grad_norm": 1.8011011190833224, + "language_loss": 0.77520329, + "learning_rate": 2.7545143103749117e-06, + "loss": 0.79614156, + "num_input_tokens_seen": 141288505, + "step": 6579, + "time_per_iteration": 2.6661977767944336 + }, + { + "auxiliary_loss_clip": 0.01027134, + "auxiliary_loss_mlp": 0.01035917, + "balance_loss_clip": 1.02597928, + "balance_loss_mlp": 1.0219872, + "epoch": 0.39561100255523823, + "flos": 20404622430720.0, + "grad_norm": 2.0316273450533338, + "language_loss": 0.68478203, + "learning_rate": 2.754153612280037e-06, + "loss": 0.70541251, + "num_input_tokens_seen": 141303680, + "step": 6580, + "time_per_iteration": 2.8326568603515625 + }, + { + "auxiliary_loss_clip": 0.01066899, + "auxiliary_loss_mlp": 0.01027949, + "balance_loss_clip": 1.02975214, + "balance_loss_mlp": 1.01565862, + "epoch": 0.3956711258079062, + "flos": 27964572986880.0, + "grad_norm": 1.6794507153043257, + "language_loss": 0.58516604, + "learning_rate": 2.7537928855875797e-06, + "loss": 0.60611451, + "num_input_tokens_seen": 141324090, + "step": 6581, + "time_per_iteration": 2.6456637382507324 + }, + { + "auxiliary_loss_clip": 0.01050983, + "auxiliary_loss_mlp": 0.01046325, + "balance_loss_clip": 1.02722633, + "balance_loss_mlp": 1.03103006, + "epoch": 0.39573124906057416, + "flos": 14428297670400.0, + "grad_norm": 2.1166420842766502, + "language_loss": 0.69844162, + "learning_rate": 2.7534321303112224e-06, + "loss": 0.71941471, + "num_input_tokens_seen": 141342235, + "step": 6582, + "time_per_iteration": 2.6388471126556396 + }, + { + "auxiliary_loss_clip": 0.0107864, + "auxiliary_loss_mlp": 0.00747502, + "balance_loss_clip": 1.02941132, + "balance_loss_mlp": 1.00008178, + "epoch": 0.39579137231324213, + "flos": 18733699607040.0, + "grad_norm": 2.145881371397103, + "language_loss": 0.76232034, + "learning_rate": 2.753071346464642e-06, + "loss": 0.78058177, + "num_input_tokens_seen": 141361195, + "step": 6583, + "time_per_iteration": 2.5503857135772705 + }, + { + "auxiliary_loss_clip": 0.0102997, + "auxiliary_loss_mlp": 0.00747496, + "balance_loss_clip": 1.02629185, + "balance_loss_mlp": 1.00011659, + "epoch": 0.3958514955659101, + "flos": 17676417755520.0, + "grad_norm": 1.4838348776843007, + "language_loss": 0.65808666, + "learning_rate": 2.7527105340615207e-06, + "loss": 0.6758613, + "num_input_tokens_seen": 141378275, + "step": 6584, + "time_per_iteration": 2.6852707862854004 + }, + { + "auxiliary_loss_clip": 0.0105331, + "auxiliary_loss_mlp": 0.0103867, + "balance_loss_clip": 1.03169119, + "balance_loss_mlp": 1.02411461, + "epoch": 0.39591161881857806, + "flos": 29309103901440.0, + "grad_norm": 2.3230342773341905, + "language_loss": 0.7237376, + "learning_rate": 2.7523496931155413e-06, + "loss": 0.7446574, + "num_input_tokens_seen": 141396960, + "step": 6585, + "time_per_iteration": 2.752469778060913 + }, + { + "auxiliary_loss_clip": 0.01045434, + "auxiliary_loss_mlp": 0.01031632, + "balance_loss_clip": 1.02755725, + "balance_loss_mlp": 1.01848304, + "epoch": 0.3959717420712461, + "flos": 25771831332480.0, + "grad_norm": 1.7202794610169998, + "language_loss": 0.73189139, + "learning_rate": 2.7519888236403856e-06, + "loss": 0.75266206, + "num_input_tokens_seen": 141417320, + "step": 6586, + "time_per_iteration": 2.8451857566833496 + }, + { + "auxiliary_loss_clip": 0.01045337, + "auxiliary_loss_mlp": 0.01033045, + "balance_loss_clip": 1.02660728, + "balance_loss_mlp": 1.01950884, + "epoch": 0.39603186532391405, + "flos": 20923783655040.0, + "grad_norm": 1.6364686248024327, + "language_loss": 0.71482813, + "learning_rate": 2.7516279256497382e-06, + "loss": 0.73561198, + "num_input_tokens_seen": 141435985, + "step": 6587, + "time_per_iteration": 2.7020068168640137 + }, + { + "auxiliary_loss_clip": 0.00968927, + "auxiliary_loss_mlp": 0.01006997, + "balance_loss_clip": 1.00756407, + "balance_loss_mlp": 1.00547111, + "epoch": 0.396091988576582, + "flos": 54880986176640.0, + "grad_norm": 0.8910269439750926, + "language_loss": 0.61225128, + "learning_rate": 2.751266999157285e-06, + "loss": 0.63201052, + "num_input_tokens_seen": 141486075, + "step": 6588, + "time_per_iteration": 3.1596508026123047 + }, + { + "auxiliary_loss_clip": 0.01057464, + "auxiliary_loss_mlp": 0.00747646, + "balance_loss_clip": 1.02828074, + "balance_loss_mlp": 1.00015354, + "epoch": 0.39615211182925, + "flos": 20702896968960.0, + "grad_norm": 1.8176500243186458, + "language_loss": 0.8135035, + "learning_rate": 2.7509060441767115e-06, + "loss": 0.83155453, + "num_input_tokens_seen": 141505280, + "step": 6589, + "time_per_iteration": 2.670724630355835 + }, + { + "auxiliary_loss_clip": 0.01058181, + "auxiliary_loss_mlp": 0.01033957, + "balance_loss_clip": 1.0300734, + "balance_loss_mlp": 1.0206176, + "epoch": 0.39621223508191794, + "flos": 20994312009600.0, + "grad_norm": 2.159114203124612, + "language_loss": 0.70419645, + "learning_rate": 2.7505450607217057e-06, + "loss": 0.7251178, + "num_input_tokens_seen": 141523930, + "step": 6590, + "time_per_iteration": 2.575563907623291 + }, + { + "auxiliary_loss_clip": 0.0106189, + "auxiliary_loss_mlp": 0.01045312, + "balance_loss_clip": 1.0283556, + "balance_loss_mlp": 1.03103042, + "epoch": 0.3962723583345859, + "flos": 23368833417600.0, + "grad_norm": 1.7155543132693298, + "language_loss": 0.7559725, + "learning_rate": 2.750184048805956e-06, + "loss": 0.77704453, + "num_input_tokens_seen": 141541320, + "step": 6591, + "time_per_iteration": 2.7002902030944824 + }, + { + "auxiliary_loss_clip": 0.00994451, + "auxiliary_loss_mlp": 0.01041204, + "balance_loss_clip": 1.02619314, + "balance_loss_mlp": 1.02698791, + "epoch": 0.39633248158725387, + "flos": 25115599808640.0, + "grad_norm": 1.614242874420428, + "language_loss": 0.78265703, + "learning_rate": 2.749823008443152e-06, + "loss": 0.80301362, + "num_input_tokens_seen": 141561880, + "step": 6592, + "time_per_iteration": 3.112431287765503 + }, + { + "auxiliary_loss_clip": 0.01014044, + "auxiliary_loss_mlp": 0.01036106, + "balance_loss_clip": 1.02654004, + "balance_loss_mlp": 1.02237868, + "epoch": 0.39639260483992184, + "flos": 39787622236800.0, + "grad_norm": 1.8522358985388263, + "language_loss": 0.69639343, + "learning_rate": 2.7494619396469843e-06, + "loss": 0.71689498, + "num_input_tokens_seen": 141586460, + "step": 6593, + "time_per_iteration": 3.2934014797210693 + }, + { + "auxiliary_loss_clip": 0.01008871, + "auxiliary_loss_mlp": 0.01038904, + "balance_loss_clip": 1.02558422, + "balance_loss_mlp": 1.02384853, + "epoch": 0.3964527280925898, + "flos": 17347045017600.0, + "grad_norm": 1.9495293202359365, + "language_loss": 0.78167629, + "learning_rate": 2.7491008424311452e-06, + "loss": 0.80215406, + "num_input_tokens_seen": 141605955, + "step": 6594, + "time_per_iteration": 2.821594476699829 + }, + { + "auxiliary_loss_clip": 0.00984355, + "auxiliary_loss_mlp": 0.01002054, + "balance_loss_clip": 1.005193, + "balance_loss_mlp": 1.00044429, + "epoch": 0.39651285134525777, + "flos": 71717848369920.0, + "grad_norm": 0.9432298783347066, + "language_loss": 0.6293388, + "learning_rate": 2.7487397168093265e-06, + "loss": 0.64920288, + "num_input_tokens_seen": 141673140, + "step": 6595, + "time_per_iteration": 3.254349946975708 + }, + { + "auxiliary_loss_clip": 0.0104338, + "auxiliary_loss_mlp": 0.01047064, + "balance_loss_clip": 1.02808714, + "balance_loss_mlp": 1.03157902, + "epoch": 0.39657297459792573, + "flos": 25775710001280.0, + "grad_norm": 2.5483319916877036, + "language_loss": 0.63331628, + "learning_rate": 2.748378562795223e-06, + "loss": 0.65422076, + "num_input_tokens_seen": 141692955, + "step": 6596, + "time_per_iteration": 2.6403422355651855 + }, + { + "auxiliary_loss_clip": 0.01068115, + "auxiliary_loss_mlp": 0.01036826, + "balance_loss_clip": 1.03022218, + "balance_loss_mlp": 1.02316427, + "epoch": 0.3966330978505937, + "flos": 20266115587200.0, + "grad_norm": 2.0144275019845583, + "language_loss": 0.78612494, + "learning_rate": 2.7480173804025293e-06, + "loss": 0.80717432, + "num_input_tokens_seen": 141710680, + "step": 6597, + "time_per_iteration": 4.334571838378906 + }, + { + "auxiliary_loss_clip": 0.01047119, + "auxiliary_loss_mlp": 0.00747672, + "balance_loss_clip": 1.02863646, + "balance_loss_mlp": 1.00015306, + "epoch": 0.39669322110326166, + "flos": 20631183465600.0, + "grad_norm": 2.0743568738501486, + "language_loss": 0.67179394, + "learning_rate": 2.747656169644941e-06, + "loss": 0.68974185, + "num_input_tokens_seen": 141729860, + "step": 6598, + "time_per_iteration": 2.7575109004974365 + }, + { + "auxiliary_loss_clip": 0.01079816, + "auxiliary_loss_mlp": 0.01040414, + "balance_loss_clip": 1.03011262, + "balance_loss_mlp": 1.02750337, + "epoch": 0.3967533443559297, + "flos": 21726063878400.0, + "grad_norm": 1.6452573058238547, + "language_loss": 0.7894792, + "learning_rate": 2.747294930536157e-06, + "loss": 0.81068146, + "num_input_tokens_seen": 141749060, + "step": 6599, + "time_per_iteration": 4.3766303062438965 + }, + { + "auxiliary_loss_clip": 0.01038302, + "auxiliary_loss_mlp": 0.0103712, + "balance_loss_clip": 1.02893186, + "balance_loss_mlp": 1.02174211, + "epoch": 0.39681346760859765, + "flos": 25484151306240.0, + "grad_norm": 1.7178482662028298, + "language_loss": 0.72718751, + "learning_rate": 2.7469336630898737e-06, + "loss": 0.74794173, + "num_input_tokens_seen": 141769860, + "step": 6600, + "time_per_iteration": 2.866241455078125 + }, + { + "auxiliary_loss_clip": 0.01031816, + "auxiliary_loss_mlp": 0.01035779, + "balance_loss_clip": 1.02372098, + "balance_loss_mlp": 1.02178955, + "epoch": 0.3968735908612656, + "flos": 20959586536320.0, + "grad_norm": 1.9640007944791866, + "language_loss": 0.85509586, + "learning_rate": 2.746572367319791e-06, + "loss": 0.87577188, + "num_input_tokens_seen": 141788465, + "step": 6601, + "time_per_iteration": 2.806796073913574 + }, + { + "auxiliary_loss_clip": 0.01048983, + "auxiliary_loss_mlp": 0.01039921, + "balance_loss_clip": 1.02920175, + "balance_loss_mlp": 1.02432799, + "epoch": 0.3969337141139336, + "flos": 10707090531840.0, + "grad_norm": 2.617899703164351, + "language_loss": 0.69775474, + "learning_rate": 2.7462110432396095e-06, + "loss": 0.71864378, + "num_input_tokens_seen": 141804955, + "step": 6602, + "time_per_iteration": 2.7464704513549805 + }, + { + "auxiliary_loss_clip": 0.01079218, + "auxiliary_loss_mlp": 0.01039519, + "balance_loss_clip": 1.02927887, + "balance_loss_mlp": 1.02616763, + "epoch": 0.39699383736660154, + "flos": 17593714690560.0, + "grad_norm": 2.1865401490793106, + "language_loss": 0.83975506, + "learning_rate": 2.7458496908630305e-06, + "loss": 0.86094248, + "num_input_tokens_seen": 141820025, + "step": 6603, + "time_per_iteration": 2.8663582801818848 + }, + { + "auxiliary_loss_clip": 0.01061856, + "auxiliary_loss_mlp": 0.01034622, + "balance_loss_clip": 1.03157806, + "balance_loss_mlp": 1.02144337, + "epoch": 0.3970539606192695, + "flos": 17785945301760.0, + "grad_norm": 1.5824168730429873, + "language_loss": 0.72760665, + "learning_rate": 2.7454883102037563e-06, + "loss": 0.7485714, + "num_input_tokens_seen": 141838735, + "step": 6604, + "time_per_iteration": 2.802523374557495 + }, + { + "auxiliary_loss_clip": 0.01053631, + "auxiliary_loss_mlp": 0.01036298, + "balance_loss_clip": 1.02773571, + "balance_loss_mlp": 1.02281559, + "epoch": 0.3971140838719375, + "flos": 24789495208320.0, + "grad_norm": 1.5667598867763588, + "language_loss": 0.82598364, + "learning_rate": 2.745126901275491e-06, + "loss": 0.84688294, + "num_input_tokens_seen": 141858090, + "step": 6605, + "time_per_iteration": 2.7944447994232178 + }, + { + "auxiliary_loss_clip": 0.01075212, + "auxiliary_loss_mlp": 0.01030647, + "balance_loss_clip": 1.02837205, + "balance_loss_mlp": 1.01894665, + "epoch": 0.39717420712460544, + "flos": 24243581329920.0, + "grad_norm": 1.5033602499673475, + "language_loss": 0.7347579, + "learning_rate": 2.7447654640919383e-06, + "loss": 0.75581646, + "num_input_tokens_seen": 141877540, + "step": 6606, + "time_per_iteration": 2.6619932651519775 + }, + { + "auxiliary_loss_clip": 0.01043356, + "auxiliary_loss_mlp": 0.01042176, + "balance_loss_clip": 1.02982378, + "balance_loss_mlp": 1.02704811, + "epoch": 0.3972343303772734, + "flos": 25884698843520.0, + "grad_norm": 1.8913376440015188, + "language_loss": 0.73775911, + "learning_rate": 2.744403998666805e-06, + "loss": 0.75861442, + "num_input_tokens_seen": 141897315, + "step": 6607, + "time_per_iteration": 2.7248833179473877 + }, + { + "auxiliary_loss_clip": 0.01072034, + "auxiliary_loss_mlp": 0.0103559, + "balance_loss_clip": 1.03171492, + "balance_loss_mlp": 1.02222061, + "epoch": 0.39729445362994137, + "flos": 45623716300800.0, + "grad_norm": 1.4662281904437835, + "language_loss": 0.67511725, + "learning_rate": 2.744042505013797e-06, + "loss": 0.69619358, + "num_input_tokens_seen": 141919580, + "step": 6608, + "time_per_iteration": 4.384977340698242 + }, + { + "auxiliary_loss_clip": 0.01038938, + "auxiliary_loss_mlp": 0.01049582, + "balance_loss_clip": 1.0252322, + "balance_loss_mlp": 1.03340578, + "epoch": 0.39735457688260933, + "flos": 20193971120640.0, + "grad_norm": 1.8087458869700372, + "language_loss": 0.74264151, + "learning_rate": 2.7436809831466233e-06, + "loss": 0.76352674, + "num_input_tokens_seen": 141937045, + "step": 6609, + "time_per_iteration": 4.219418048858643 + }, + { + "auxiliary_loss_clip": 0.01057333, + "auxiliary_loss_mlp": 0.01032978, + "balance_loss_clip": 1.03085709, + "balance_loss_mlp": 1.01931071, + "epoch": 0.3974147001352773, + "flos": 23331163029120.0, + "grad_norm": 1.5650500094741222, + "language_loss": 0.71232915, + "learning_rate": 2.7433194330789927e-06, + "loss": 0.7332322, + "num_input_tokens_seen": 141956695, + "step": 6610, + "time_per_iteration": 2.6415505409240723 + }, + { + "auxiliary_loss_clip": 0.01055695, + "auxiliary_loss_mlp": 0.01032828, + "balance_loss_clip": 1.02514052, + "balance_loss_mlp": 1.01944685, + "epoch": 0.39747482338794526, + "flos": 21688644885120.0, + "grad_norm": 1.6262867149236613, + "language_loss": 0.78176141, + "learning_rate": 2.7429578548246133e-06, + "loss": 0.80264664, + "num_input_tokens_seen": 141975935, + "step": 6611, + "time_per_iteration": 2.6906638145446777 + }, + { + "auxiliary_loss_clip": 0.01069542, + "auxiliary_loss_mlp": 0.01036807, + "balance_loss_clip": 1.03083849, + "balance_loss_mlp": 1.02354503, + "epoch": 0.3975349466406133, + "flos": 30988717816320.0, + "grad_norm": 1.8454086456729941, + "language_loss": 0.78925067, + "learning_rate": 2.7425962483971985e-06, + "loss": 0.81031418, + "num_input_tokens_seen": 141995750, + "step": 6612, + "time_per_iteration": 2.7220427989959717 + }, + { + "auxiliary_loss_clip": 0.00985532, + "auxiliary_loss_mlp": 0.01009609, + "balance_loss_clip": 1.00594664, + "balance_loss_mlp": 1.00810742, + "epoch": 0.39759506989328125, + "flos": 63683948833920.0, + "grad_norm": 0.8717813391244091, + "language_loss": 0.65006518, + "learning_rate": 2.742234613810459e-06, + "loss": 0.67001665, + "num_input_tokens_seen": 142057655, + "step": 6613, + "time_per_iteration": 3.118068218231201 + }, + { + "auxiliary_loss_clip": 0.01043555, + "auxiliary_loss_mlp": 0.01039516, + "balance_loss_clip": 1.02492416, + "balance_loss_mlp": 1.02414989, + "epoch": 0.3976551931459492, + "flos": 23695835857920.0, + "grad_norm": 2.3354657988557923, + "language_loss": 0.71500933, + "learning_rate": 2.741872951078109e-06, + "loss": 0.73584002, + "num_input_tokens_seen": 142076020, + "step": 6614, + "time_per_iteration": 2.626044511795044 + }, + { + "auxiliary_loss_clip": 0.01069967, + "auxiliary_loss_mlp": 0.01032352, + "balance_loss_clip": 1.03105533, + "balance_loss_mlp": 1.01855922, + "epoch": 0.3977153163986172, + "flos": 15669657745920.0, + "grad_norm": 1.7448069014629384, + "language_loss": 0.81306589, + "learning_rate": 2.741511260213862e-06, + "loss": 0.8340891, + "num_input_tokens_seen": 142093790, + "step": 6615, + "time_per_iteration": 2.542980194091797 + }, + { + "auxiliary_loss_clip": 0.010434, + "auxiliary_loss_mlp": 0.01032485, + "balance_loss_clip": 1.02936244, + "balance_loss_mlp": 1.0197767, + "epoch": 0.39777543965128515, + "flos": 14064702249600.0, + "grad_norm": 2.9851373289872, + "language_loss": 0.66770256, + "learning_rate": 2.741149541231434e-06, + "loss": 0.68846136, + "num_input_tokens_seen": 142110545, + "step": 6616, + "time_per_iteration": 2.5578839778900146 + }, + { + "auxiliary_loss_clip": 0.0108217, + "auxiliary_loss_mlp": 0.01040377, + "balance_loss_clip": 1.03179419, + "balance_loss_mlp": 1.02687705, + "epoch": 0.3978355629039531, + "flos": 23367468700800.0, + "grad_norm": 2.188077269500368, + "language_loss": 0.8379482, + "learning_rate": 2.740787794144541e-06, + "loss": 0.85917372, + "num_input_tokens_seen": 142128695, + "step": 6617, + "time_per_iteration": 2.533710479736328 + }, + { + "auxiliary_loss_clip": 0.01074865, + "auxiliary_loss_mlp": 0.0103713, + "balance_loss_clip": 1.02976489, + "balance_loss_mlp": 1.02483964, + "epoch": 0.3978956861566211, + "flos": 19062785036160.0, + "grad_norm": 1.5766609994035412, + "language_loss": 0.72246814, + "learning_rate": 2.7404260189669e-06, + "loss": 0.74358809, + "num_input_tokens_seen": 142148375, + "step": 6618, + "time_per_iteration": 2.5511343479156494 + }, + { + "auxiliary_loss_clip": 0.01059297, + "auxiliary_loss_mlp": 0.01040397, + "balance_loss_clip": 1.03023958, + "balance_loss_mlp": 1.02521527, + "epoch": 0.39795580940928904, + "flos": 30227699341440.0, + "grad_norm": 1.664694858960158, + "language_loss": 0.65150511, + "learning_rate": 2.740064215712231e-06, + "loss": 0.6725021, + "num_input_tokens_seen": 142169735, + "step": 6619, + "time_per_iteration": 2.6816675662994385 + }, + { + "auxiliary_loss_clip": 0.01011235, + "auxiliary_loss_mlp": 0.01002633, + "balance_loss_clip": 1.00232804, + "balance_loss_mlp": 1.00101173, + "epoch": 0.398015932661957, + "flos": 69847224906240.0, + "grad_norm": 0.7697595232234348, + "language_loss": 0.58267659, + "learning_rate": 2.7397023843942527e-06, + "loss": 0.60281527, + "num_input_tokens_seen": 142229520, + "step": 6620, + "time_per_iteration": 3.1395387649536133 + }, + { + "auxiliary_loss_clip": 0.01063444, + "auxiliary_loss_mlp": 0.01037529, + "balance_loss_clip": 1.03431499, + "balance_loss_mlp": 1.02542973, + "epoch": 0.39807605591462497, + "flos": 20157773189760.0, + "grad_norm": 1.6685251837782882, + "language_loss": 0.79092109, + "learning_rate": 2.739340525026686e-06, + "loss": 0.81193078, + "num_input_tokens_seen": 142247660, + "step": 6621, + "time_per_iteration": 2.934566020965576 + }, + { + "auxiliary_loss_clip": 0.0105922, + "auxiliary_loss_mlp": 0.01031889, + "balance_loss_clip": 1.03050685, + "balance_loss_mlp": 1.0190618, + "epoch": 0.39813617916729294, + "flos": 21141761339520.0, + "grad_norm": 2.3765749696181797, + "language_loss": 0.78000915, + "learning_rate": 2.738978637623252e-06, + "loss": 0.80092025, + "num_input_tokens_seen": 142266990, + "step": 6622, + "time_per_iteration": 2.709491729736328 + }, + { + "auxiliary_loss_clip": 0.01049792, + "auxiliary_loss_mlp": 0.01034516, + "balance_loss_clip": 1.02590537, + "balance_loss_mlp": 1.02164102, + "epoch": 0.3981963024199609, + "flos": 18988485753600.0, + "grad_norm": 1.4462597236577512, + "language_loss": 0.74919319, + "learning_rate": 2.738616722197674e-06, + "loss": 0.77003622, + "num_input_tokens_seen": 142287170, + "step": 6623, + "time_per_iteration": 2.737523317337036 + }, + { + "auxiliary_loss_clip": 0.01041538, + "auxiliary_loss_mlp": 0.01039488, + "balance_loss_clip": 1.02789307, + "balance_loss_mlp": 1.02574897, + "epoch": 0.39825642567262887, + "flos": 16575108808320.0, + "grad_norm": 1.965248348624921, + "language_loss": 0.79520494, + "learning_rate": 2.7382547787636766e-06, + "loss": 0.81601524, + "num_input_tokens_seen": 142305405, + "step": 6624, + "time_per_iteration": 2.716139793395996 + }, + { + "auxiliary_loss_clip": 0.01082697, + "auxiliary_loss_mlp": 0.01043571, + "balance_loss_clip": 1.03151011, + "balance_loss_mlp": 1.02840137, + "epoch": 0.39831654892529683, + "flos": 22199833290240.0, + "grad_norm": 2.113524486688082, + "language_loss": 0.83477664, + "learning_rate": 2.7378928073349832e-06, + "loss": 0.85603929, + "num_input_tokens_seen": 142322710, + "step": 6625, + "time_per_iteration": 2.5451276302337646 + }, + { + "auxiliary_loss_clip": 0.01065144, + "auxiliary_loss_mlp": 0.01040194, + "balance_loss_clip": 1.02772295, + "balance_loss_mlp": 1.02650261, + "epoch": 0.39837667217796485, + "flos": 10487963612160.0, + "grad_norm": 1.9189380830398461, + "language_loss": 0.86541939, + "learning_rate": 2.737530807925321e-06, + "loss": 0.8864727, + "num_input_tokens_seen": 142338535, + "step": 6626, + "time_per_iteration": 2.554368495941162 + }, + { + "auxiliary_loss_clip": 0.01014629, + "auxiliary_loss_mlp": 0.00747626, + "balance_loss_clip": 1.02595699, + "balance_loss_mlp": 1.00013149, + "epoch": 0.3984367954306328, + "flos": 17965282930560.0, + "grad_norm": 2.2153781200186153, + "language_loss": 0.83388513, + "learning_rate": 2.737168780548417e-06, + "loss": 0.85150766, + "num_input_tokens_seen": 142354570, + "step": 6627, + "time_per_iteration": 2.783371686935425 + }, + { + "auxiliary_loss_clip": 0.01033161, + "auxiliary_loss_mlp": 0.00747437, + "balance_loss_clip": 1.02493048, + "balance_loss_mlp": 1.00008011, + "epoch": 0.3984969186833008, + "flos": 22711057608960.0, + "grad_norm": 1.427898918587423, + "language_loss": 0.82874763, + "learning_rate": 2.736806725217998e-06, + "loss": 0.84655356, + "num_input_tokens_seen": 142374395, + "step": 6628, + "time_per_iteration": 2.8011856079101562 + }, + { + "auxiliary_loss_clip": 0.01041113, + "auxiliary_loss_mlp": 0.01049261, + "balance_loss_clip": 1.02763951, + "balance_loss_mlp": 1.03446758, + "epoch": 0.39855704193596875, + "flos": 23405785534080.0, + "grad_norm": 2.0095545884432013, + "language_loss": 0.7102477, + "learning_rate": 2.7364446419477945e-06, + "loss": 0.7311514, + "num_input_tokens_seen": 142396040, + "step": 6629, + "time_per_iteration": 2.693955659866333 + }, + { + "auxiliary_loss_clip": 0.01044437, + "auxiliary_loss_mlp": 0.01035349, + "balance_loss_clip": 1.02906549, + "balance_loss_mlp": 1.02274871, + "epoch": 0.3986171651886367, + "flos": 21251935330560.0, + "grad_norm": 1.7540261721830495, + "language_loss": 0.81150043, + "learning_rate": 2.7360825307515366e-06, + "loss": 0.83229828, + "num_input_tokens_seen": 142415495, + "step": 6630, + "time_per_iteration": 2.662734270095825 + }, + { + "auxiliary_loss_clip": 0.01029941, + "auxiliary_loss_mlp": 0.01032716, + "balance_loss_clip": 1.03186274, + "balance_loss_mlp": 1.0198772, + "epoch": 0.3986772884413047, + "flos": 12458705258880.0, + "grad_norm": 2.517262422779269, + "language_loss": 0.7489323, + "learning_rate": 2.7357203916429555e-06, + "loss": 0.76955891, + "num_input_tokens_seen": 142431865, + "step": 6631, + "time_per_iteration": 2.6932902336120605 + }, + { + "auxiliary_loss_clip": 0.01038656, + "auxiliary_loss_mlp": 0.01039203, + "balance_loss_clip": 1.02559483, + "balance_loss_mlp": 1.02569699, + "epoch": 0.39873741169397264, + "flos": 19646117907840.0, + "grad_norm": 1.986665637837692, + "language_loss": 0.71232021, + "learning_rate": 2.735358224635783e-06, + "loss": 0.73309886, + "num_input_tokens_seen": 142450595, + "step": 6632, + "time_per_iteration": 2.6267011165618896 + }, + { + "auxiliary_loss_clip": 0.01025344, + "auxiliary_loss_mlp": 0.00747525, + "balance_loss_clip": 1.0298506, + "balance_loss_mlp": 1.00007033, + "epoch": 0.3987975349466406, + "flos": 21684766216320.0, + "grad_norm": 1.7545925433939396, + "language_loss": 0.74910122, + "learning_rate": 2.7349960297437533e-06, + "loss": 0.76682991, + "num_input_tokens_seen": 142466650, + "step": 6633, + "time_per_iteration": 2.709890365600586 + }, + { + "auxiliary_loss_clip": 0.01058139, + "auxiliary_loss_mlp": 0.01029757, + "balance_loss_clip": 1.03001535, + "balance_loss_mlp": 1.01664376, + "epoch": 0.3988576581993086, + "flos": 23914064937600.0, + "grad_norm": 1.804978194863514, + "language_loss": 0.80935311, + "learning_rate": 2.7346338069806e-06, + "loss": 0.83023208, + "num_input_tokens_seen": 142486165, + "step": 6634, + "time_per_iteration": 2.6222448348999023 + }, + { + "auxiliary_loss_clip": 0.01052191, + "auxiliary_loss_mlp": 0.01033586, + "balance_loss_clip": 1.02865076, + "balance_loss_mlp": 1.01985347, + "epoch": 0.39891778145197654, + "flos": 18149899858560.0, + "grad_norm": 1.868710459435321, + "language_loss": 0.75006998, + "learning_rate": 2.7342715563600597e-06, + "loss": 0.77092773, + "num_input_tokens_seen": 142505035, + "step": 6635, + "time_per_iteration": 2.7028768062591553 + }, + { + "auxiliary_loss_clip": 0.01048659, + "auxiliary_loss_mlp": 0.01040815, + "balance_loss_clip": 1.02958512, + "balance_loss_mlp": 1.02584243, + "epoch": 0.3989779047046445, + "flos": 22595281096320.0, + "grad_norm": 2.0948809878135797, + "language_loss": 0.65923363, + "learning_rate": 2.733909277895868e-06, + "loss": 0.68012834, + "num_input_tokens_seen": 142521870, + "step": 6636, + "time_per_iteration": 2.684572219848633 + }, + { + "auxiliary_loss_clip": 0.01066072, + "auxiliary_loss_mlp": 0.01031186, + "balance_loss_clip": 1.02901542, + "balance_loss_mlp": 1.0185318, + "epoch": 0.39903802795731247, + "flos": 18077216688000.0, + "grad_norm": 1.8238369777255048, + "language_loss": 0.81178844, + "learning_rate": 2.733546971601763e-06, + "loss": 0.83276105, + "num_input_tokens_seen": 142540455, + "step": 6637, + "time_per_iteration": 2.5895497798919678 + }, + { + "auxiliary_loss_clip": 0.00982639, + "auxiliary_loss_mlp": 0.01005326, + "balance_loss_clip": 1.0032835, + "balance_loss_mlp": 1.00352561, + "epoch": 0.39909815120998043, + "flos": 70441367771520.0, + "grad_norm": 0.729321032768094, + "language_loss": 0.53186369, + "learning_rate": 2.733184637491484e-06, + "loss": 0.55174327, + "num_input_tokens_seen": 142599665, + "step": 6638, + "time_per_iteration": 3.256880521774292 + }, + { + "auxiliary_loss_clip": 0.010605, + "auxiliary_loss_mlp": 0.00747714, + "balance_loss_clip": 1.03183734, + "balance_loss_mlp": 1.00015879, + "epoch": 0.39915827446264845, + "flos": 18549262247040.0, + "grad_norm": 1.4874553043108045, + "language_loss": 0.75127459, + "learning_rate": 2.732822275578769e-06, + "loss": 0.76935673, + "num_input_tokens_seen": 142618845, + "step": 6639, + "time_per_iteration": 2.6609768867492676 + }, + { + "auxiliary_loss_clip": 0.01001473, + "auxiliary_loss_mlp": 0.01036366, + "balance_loss_clip": 1.02159977, + "balance_loss_mlp": 1.02287734, + "epoch": 0.3992183977153164, + "flos": 29897249195520.0, + "grad_norm": 2.0477785921557583, + "language_loss": 0.76124465, + "learning_rate": 2.7324598858773603e-06, + "loss": 0.78162313, + "num_input_tokens_seen": 142640885, + "step": 6640, + "time_per_iteration": 2.8523240089416504 + }, + { + "auxiliary_loss_clip": 0.01051612, + "auxiliary_loss_mlp": 0.01038029, + "balance_loss_clip": 1.03065252, + "balance_loss_mlp": 1.02426028, + "epoch": 0.3992785209679844, + "flos": 22565080736640.0, + "grad_norm": 2.0707795816000956, + "language_loss": 0.82313585, + "learning_rate": 2.7320974684009996e-06, + "loss": 0.84403229, + "num_input_tokens_seen": 142659340, + "step": 6641, + "time_per_iteration": 2.701085329055786 + }, + { + "auxiliary_loss_clip": 0.01079438, + "auxiliary_loss_mlp": 0.01031457, + "balance_loss_clip": 1.03054738, + "balance_loss_mlp": 1.01792049, + "epoch": 0.39933864422065235, + "flos": 19682674974720.0, + "grad_norm": 1.9029056627373007, + "language_loss": 0.76806593, + "learning_rate": 2.7317350231634288e-06, + "loss": 0.78917491, + "num_input_tokens_seen": 142677085, + "step": 6642, + "time_per_iteration": 2.537104845046997 + }, + { + "auxiliary_loss_clip": 0.01057198, + "auxiliary_loss_mlp": 0.01033764, + "balance_loss_clip": 1.02904058, + "balance_loss_mlp": 1.02039433, + "epoch": 0.3993987674733203, + "flos": 23038491012480.0, + "grad_norm": 1.9714614891560476, + "language_loss": 0.7227295, + "learning_rate": 2.731372550178393e-06, + "loss": 0.74363911, + "num_input_tokens_seen": 142694595, + "step": 6643, + "time_per_iteration": 2.7289891242980957 + }, + { + "auxiliary_loss_clip": 0.01067702, + "auxiliary_loss_mlp": 0.01033088, + "balance_loss_clip": 1.0288986, + "balance_loss_mlp": 1.01968253, + "epoch": 0.3994588907259883, + "flos": 19390828970880.0, + "grad_norm": 1.8206066673058685, + "language_loss": 0.66579968, + "learning_rate": 2.7310100494596375e-06, + "loss": 0.68680757, + "num_input_tokens_seen": 142714175, + "step": 6644, + "time_per_iteration": 2.688251256942749 + }, + { + "auxiliary_loss_clip": 0.01075497, + "auxiliary_loss_mlp": 0.01042135, + "balance_loss_clip": 1.0266794, + "balance_loss_mlp": 1.0287056, + "epoch": 0.39951901397865625, + "flos": 13734395758080.0, + "grad_norm": 1.8797263490387297, + "language_loss": 0.78091425, + "learning_rate": 2.730647521020907e-06, + "loss": 0.80209059, + "num_input_tokens_seen": 142730955, + "step": 6645, + "time_per_iteration": 4.19232964515686 + }, + { + "auxiliary_loss_clip": 0.01068702, + "auxiliary_loss_mlp": 0.01035021, + "balance_loss_clip": 1.0295167, + "balance_loss_mlp": 1.02198541, + "epoch": 0.3995791372313242, + "flos": 23586451966080.0, + "grad_norm": 1.5940271173679252, + "language_loss": 0.69822681, + "learning_rate": 2.73028496487595e-06, + "loss": 0.71926403, + "num_input_tokens_seen": 142751200, + "step": 6646, + "time_per_iteration": 4.337158203125 + }, + { + "auxiliary_loss_clip": 0.01024693, + "auxiliary_loss_mlp": 0.01041093, + "balance_loss_clip": 1.0235014, + "balance_loss_mlp": 1.02721667, + "epoch": 0.3996392604839922, + "flos": 21355896268800.0, + "grad_norm": 1.8429847251509923, + "language_loss": 0.7176488, + "learning_rate": 2.729922381038513e-06, + "loss": 0.73830664, + "num_input_tokens_seen": 142770170, + "step": 6647, + "time_per_iteration": 2.7932543754577637 + }, + { + "auxiliary_loss_clip": 0.01047135, + "auxiliary_loss_mlp": 0.01037086, + "balance_loss_clip": 1.030007, + "balance_loss_mlp": 1.02446783, + "epoch": 0.39969938373666014, + "flos": 26032255914240.0, + "grad_norm": 1.5868703708983738, + "language_loss": 0.74299395, + "learning_rate": 2.7295597695223463e-06, + "loss": 0.76383615, + "num_input_tokens_seen": 142792680, + "step": 6648, + "time_per_iteration": 2.8049614429473877 + }, + { + "auxiliary_loss_clip": 0.01079444, + "auxiliary_loss_mlp": 0.01032539, + "balance_loss_clip": 1.02985907, + "balance_loss_mlp": 1.01881182, + "epoch": 0.3997595069893281, + "flos": 20116367786880.0, + "grad_norm": 1.936685737054462, + "language_loss": 0.66043723, + "learning_rate": 2.7291971303412006e-06, + "loss": 0.681557, + "num_input_tokens_seen": 142810510, + "step": 6649, + "time_per_iteration": 2.6082816123962402 + }, + { + "auxiliary_loss_clip": 0.01053261, + "auxiliary_loss_mlp": 0.0103751, + "balance_loss_clip": 1.03188705, + "balance_loss_mlp": 1.02407503, + "epoch": 0.39981963024199607, + "flos": 27783403764480.0, + "grad_norm": 1.878703294270772, + "language_loss": 0.75404102, + "learning_rate": 2.728834463508826e-06, + "loss": 0.77494872, + "num_input_tokens_seen": 142832455, + "step": 6650, + "time_per_iteration": 2.7173619270324707 + }, + { + "auxiliary_loss_clip": 0.01078877, + "auxiliary_loss_mlp": 0.01039391, + "balance_loss_clip": 1.03050172, + "balance_loss_mlp": 1.02588439, + "epoch": 0.39987975349466404, + "flos": 21944436612480.0, + "grad_norm": 1.8185217002164258, + "language_loss": 0.71725798, + "learning_rate": 2.728471769038975e-06, + "loss": 0.73844069, + "num_input_tokens_seen": 142852590, + "step": 6651, + "time_per_iteration": 2.578097343444824 + }, + { + "auxiliary_loss_clip": 0.01076715, + "auxiliary_loss_mlp": 0.0103787, + "balance_loss_clip": 1.02808797, + "balance_loss_mlp": 1.02441716, + "epoch": 0.39993987674733206, + "flos": 20704405340160.0, + "grad_norm": 2.715962505437041, + "language_loss": 0.73037291, + "learning_rate": 2.728109046945403e-06, + "loss": 0.75151879, + "num_input_tokens_seen": 142870595, + "step": 6652, + "time_per_iteration": 2.528451919555664 + }, + { + "auxiliary_loss_clip": 0.00984973, + "auxiliary_loss_mlp": 0.01002891, + "balance_loss_clip": 1.00460863, + "balance_loss_mlp": 1.00125813, + "epoch": 0.4, + "flos": 61525429862400.0, + "grad_norm": 0.8637054612749491, + "language_loss": 0.6073159, + "learning_rate": 2.727746297241862e-06, + "loss": 0.62719452, + "num_input_tokens_seen": 142925805, + "step": 6653, + "time_per_iteration": 3.1786584854125977 + }, + { + "auxiliary_loss_clip": 0.01044861, + "auxiliary_loss_mlp": 0.01038394, + "balance_loss_clip": 1.02952409, + "balance_loss_mlp": 1.02612162, + "epoch": 0.400060123252668, + "flos": 14502309644160.0, + "grad_norm": 2.0289626247911854, + "language_loss": 0.66860098, + "learning_rate": 2.7273835199421085e-06, + "loss": 0.68943357, + "num_input_tokens_seen": 142943145, + "step": 6654, + "time_per_iteration": 2.775888204574585 + }, + { + "auxiliary_loss_clip": 0.01065943, + "auxiliary_loss_mlp": 0.01037407, + "balance_loss_clip": 1.02867568, + "balance_loss_mlp": 1.02522969, + "epoch": 0.40012024650533595, + "flos": 19093308618240.0, + "grad_norm": 2.353499649259721, + "language_loss": 0.89926863, + "learning_rate": 2.7270207150599e-06, + "loss": 0.92030215, + "num_input_tokens_seen": 142956925, + "step": 6655, + "time_per_iteration": 4.170480966567993 + }, + { + "auxiliary_loss_clip": 0.01046487, + "auxiliary_loss_mlp": 0.01037899, + "balance_loss_clip": 1.02641559, + "balance_loss_mlp": 1.0253284, + "epoch": 0.4001803697580039, + "flos": 29351012094720.0, + "grad_norm": 1.5839050034525446, + "language_loss": 0.73186827, + "learning_rate": 2.7266578826089917e-06, + "loss": 0.75271213, + "num_input_tokens_seen": 142978040, + "step": 6656, + "time_per_iteration": 4.295807123184204 + }, + { + "auxiliary_loss_clip": 0.0107914, + "auxiliary_loss_mlp": 0.01044601, + "balance_loss_clip": 1.02998996, + "balance_loss_mlp": 1.03048062, + "epoch": 0.4002404930106719, + "flos": 20920048640640.0, + "grad_norm": 1.4555889529227573, + "language_loss": 0.73440701, + "learning_rate": 2.726295022603144e-06, + "loss": 0.75564444, + "num_input_tokens_seen": 142998390, + "step": 6657, + "time_per_iteration": 2.6674087047576904 + }, + { + "auxiliary_loss_clip": 0.01079099, + "auxiliary_loss_mlp": 0.01042941, + "balance_loss_clip": 1.02968526, + "balance_loss_mlp": 1.02873111, + "epoch": 0.40030061626333985, + "flos": 28405735827840.0, + "grad_norm": 1.4564499821326524, + "language_loss": 0.79644281, + "learning_rate": 2.725932135056117e-06, + "loss": 0.81766319, + "num_input_tokens_seen": 143021505, + "step": 6658, + "time_per_iteration": 2.7330353260040283 + }, + { + "auxiliary_loss_clip": 0.01062285, + "auxiliary_loss_mlp": 0.010412, + "balance_loss_clip": 1.02630985, + "balance_loss_mlp": 1.02770543, + "epoch": 0.4003607395160078, + "flos": 25921615046400.0, + "grad_norm": 1.5978530515806344, + "language_loss": 0.77096879, + "learning_rate": 2.72556921998167e-06, + "loss": 0.79200363, + "num_input_tokens_seen": 143041375, + "step": 6659, + "time_per_iteration": 2.698415994644165 + }, + { + "auxiliary_loss_clip": 0.01070213, + "auxiliary_loss_mlp": 0.01030793, + "balance_loss_clip": 1.02744913, + "balance_loss_mlp": 1.01977801, + "epoch": 0.4004208627686758, + "flos": 20768648814720.0, + "grad_norm": 1.7081752082017565, + "language_loss": 0.7265358, + "learning_rate": 2.7252062773935662e-06, + "loss": 0.74754584, + "num_input_tokens_seen": 143058725, + "step": 6660, + "time_per_iteration": 2.6373579502105713 + }, + { + "auxiliary_loss_clip": 0.01047187, + "auxiliary_loss_mlp": 0.01039957, + "balance_loss_clip": 1.02557433, + "balance_loss_mlp": 1.02729094, + "epoch": 0.40048098602134374, + "flos": 24681224638080.0, + "grad_norm": 1.7376772282795874, + "language_loss": 0.70889109, + "learning_rate": 2.7248433073055674e-06, + "loss": 0.72976249, + "num_input_tokens_seen": 143076995, + "step": 6661, + "time_per_iteration": 2.68101167678833 + }, + { + "auxiliary_loss_clip": 0.01081221, + "auxiliary_loss_mlp": 0.01045542, + "balance_loss_clip": 1.0320034, + "balance_loss_mlp": 1.03207135, + "epoch": 0.4005411092740117, + "flos": 23185688947200.0, + "grad_norm": 1.561101805080747, + "language_loss": 0.7539537, + "learning_rate": 2.724480309731437e-06, + "loss": 0.77522135, + "num_input_tokens_seen": 143096780, + "step": 6662, + "time_per_iteration": 2.7245774269104004 + }, + { + "auxiliary_loss_clip": 0.01061109, + "auxiliary_loss_mlp": 0.01034673, + "balance_loss_clip": 1.02787232, + "balance_loss_mlp": 1.02078533, + "epoch": 0.4006012325266797, + "flos": 17522324409600.0, + "grad_norm": 1.9031437554520458, + "language_loss": 0.66310644, + "learning_rate": 2.7241172846849417e-06, + "loss": 0.68406421, + "num_input_tokens_seen": 143112590, + "step": 6663, + "time_per_iteration": 2.5776596069335938 + }, + { + "auxiliary_loss_clip": 0.0105779, + "auxiliary_loss_mlp": 0.01036241, + "balance_loss_clip": 1.02552915, + "balance_loss_mlp": 1.02268672, + "epoch": 0.40066135577934764, + "flos": 19857200181120.0, + "grad_norm": 2.546592795817381, + "language_loss": 0.85649651, + "learning_rate": 2.7237542321798455e-06, + "loss": 0.87743688, + "num_input_tokens_seen": 143130220, + "step": 6664, + "time_per_iteration": 2.671921968460083 + }, + { + "auxiliary_loss_clip": 0.01068984, + "auxiliary_loss_mlp": 0.01039388, + "balance_loss_clip": 1.03072786, + "balance_loss_mlp": 1.0264653, + "epoch": 0.40072147903201566, + "flos": 18150007599360.0, + "grad_norm": 2.5502505412874505, + "language_loss": 0.84659588, + "learning_rate": 2.723391152229917e-06, + "loss": 0.8676796, + "num_input_tokens_seen": 143147160, + "step": 6665, + "time_per_iteration": 2.695042371749878 + }, + { + "auxiliary_loss_clip": 0.01071349, + "auxiliary_loss_mlp": 0.01038381, + "balance_loss_clip": 1.03165793, + "balance_loss_mlp": 1.02453446, + "epoch": 0.4007816022846836, + "flos": 18661267831680.0, + "grad_norm": 2.081613162224829, + "language_loss": 0.78761983, + "learning_rate": 2.7230280448489236e-06, + "loss": 0.80871713, + "num_input_tokens_seen": 143164605, + "step": 6666, + "time_per_iteration": 2.584653377532959 + }, + { + "auxiliary_loss_clip": 0.01068606, + "auxiliary_loss_mlp": 0.01034308, + "balance_loss_clip": 1.02998233, + "balance_loss_mlp": 1.02100432, + "epoch": 0.4008417255373516, + "flos": 25703170485120.0, + "grad_norm": 1.853497177811489, + "language_loss": 0.73299706, + "learning_rate": 2.7226649100506333e-06, + "loss": 0.75402611, + "num_input_tokens_seen": 143183965, + "step": 6667, + "time_per_iteration": 2.6146910190582275 + }, + { + "auxiliary_loss_clip": 0.01059583, + "auxiliary_loss_mlp": 0.01048734, + "balance_loss_clip": 1.02802849, + "balance_loss_mlp": 1.03358245, + "epoch": 0.40090184879001955, + "flos": 22858614679680.0, + "grad_norm": 1.466354145876335, + "language_loss": 0.75737417, + "learning_rate": 2.7223017478488183e-06, + "loss": 0.77845728, + "num_input_tokens_seen": 143204965, + "step": 6668, + "time_per_iteration": 2.642413854598999 + }, + { + "auxiliary_loss_clip": 0.01044612, + "auxiliary_loss_mlp": 0.01032769, + "balance_loss_clip": 1.03119111, + "balance_loss_mlp": 1.01955521, + "epoch": 0.4009619720426875, + "flos": 29059848449280.0, + "grad_norm": 1.862498052818914, + "language_loss": 0.81680018, + "learning_rate": 2.721938558257248e-06, + "loss": 0.83757395, + "num_input_tokens_seen": 143225015, + "step": 6669, + "time_per_iteration": 2.7739977836608887 + }, + { + "auxiliary_loss_clip": 0.00987523, + "auxiliary_loss_mlp": 0.01005804, + "balance_loss_clip": 1.00643945, + "balance_loss_mlp": 1.00408769, + "epoch": 0.4010220952953555, + "flos": 66059763131520.0, + "grad_norm": 0.7025239639398504, + "language_loss": 0.53409153, + "learning_rate": 2.721575341289695e-06, + "loss": 0.55402482, + "num_input_tokens_seen": 143294925, + "step": 6670, + "time_per_iteration": 3.3798882961273193 + }, + { + "auxiliary_loss_clip": 0.01033512, + "auxiliary_loss_mlp": 0.01035126, + "balance_loss_clip": 1.02739525, + "balance_loss_mlp": 1.02254379, + "epoch": 0.40108221854802345, + "flos": 29642822184960.0, + "grad_norm": 1.6386260782171078, + "language_loss": 0.88583362, + "learning_rate": 2.7212120969599333e-06, + "loss": 0.90651995, + "num_input_tokens_seen": 143314170, + "step": 6671, + "time_per_iteration": 2.7734806537628174 + }, + { + "auxiliary_loss_clip": 0.0106809, + "auxiliary_loss_mlp": 0.0103376, + "balance_loss_clip": 1.02949286, + "balance_loss_mlp": 1.02020574, + "epoch": 0.4011423418006914, + "flos": 19929560129280.0, + "grad_norm": 1.8058045302120591, + "language_loss": 0.79163784, + "learning_rate": 2.720848825281736e-06, + "loss": 0.81265634, + "num_input_tokens_seen": 143330050, + "step": 6672, + "time_per_iteration": 2.6596808433532715 + }, + { + "auxiliary_loss_clip": 0.01040864, + "auxiliary_loss_mlp": 0.01035806, + "balance_loss_clip": 1.02885926, + "balance_loss_mlp": 1.02201927, + "epoch": 0.4012024650533594, + "flos": 20084299920000.0, + "grad_norm": 2.162358038525119, + "language_loss": 0.63247204, + "learning_rate": 2.72048552626888e-06, + "loss": 0.65323877, + "num_input_tokens_seen": 143348650, + "step": 6673, + "time_per_iteration": 2.8326494693756104 + }, + { + "auxiliary_loss_clip": 0.0105264, + "auxiliary_loss_mlp": 0.00747606, + "balance_loss_clip": 1.0267092, + "balance_loss_mlp": 1.00034261, + "epoch": 0.40126258830602735, + "flos": 21695719864320.0, + "grad_norm": 1.45455037284028, + "language_loss": 0.80328834, + "learning_rate": 2.7201221999351402e-06, + "loss": 0.82129085, + "num_input_tokens_seen": 143370275, + "step": 6674, + "time_per_iteration": 2.791621208190918 + }, + { + "auxiliary_loss_clip": 0.01044862, + "auxiliary_loss_mlp": 0.01031519, + "balance_loss_clip": 1.03471375, + "balance_loss_mlp": 1.01839387, + "epoch": 0.4013227115586953, + "flos": 12020379592320.0, + "grad_norm": 2.259619081142894, + "language_loss": 0.82453227, + "learning_rate": 2.719758846294294e-06, + "loss": 0.84529614, + "num_input_tokens_seen": 143385390, + "step": 6675, + "time_per_iteration": 2.72318172454834 + }, + { + "auxiliary_loss_clip": 0.01064366, + "auxiliary_loss_mlp": 0.01033108, + "balance_loss_clip": 1.02664471, + "balance_loss_mlp": 1.0193274, + "epoch": 0.4013828348113633, + "flos": 25447522412160.0, + "grad_norm": 1.7524416977811645, + "language_loss": 0.93353468, + "learning_rate": 2.71939546536012e-06, + "loss": 0.95450944, + "num_input_tokens_seen": 143404215, + "step": 6676, + "time_per_iteration": 2.69166898727417 + }, + { + "auxiliary_loss_clip": 0.01072457, + "auxiliary_loss_mlp": 0.01041028, + "balance_loss_clip": 1.03055263, + "balance_loss_mlp": 1.02632308, + "epoch": 0.40144295806403124, + "flos": 18582946225920.0, + "grad_norm": 2.423033167996546, + "language_loss": 0.79466045, + "learning_rate": 2.719032057146399e-06, + "loss": 0.8157953, + "num_input_tokens_seen": 143422245, + "step": 6677, + "time_per_iteration": 2.659494638442993 + }, + { + "auxiliary_loss_clip": 0.01055043, + "auxiliary_loss_mlp": 0.01033533, + "balance_loss_clip": 1.02971709, + "balance_loss_mlp": 1.02070594, + "epoch": 0.4015030813166992, + "flos": 22930220442240.0, + "grad_norm": 1.964456613798694, + "language_loss": 0.83693504, + "learning_rate": 2.71866862166691e-06, + "loss": 0.85782087, + "num_input_tokens_seen": 143443130, + "step": 6678, + "time_per_iteration": 2.6468000411987305 + }, + { + "auxiliary_loss_clip": 0.01075822, + "auxiliary_loss_mlp": 0.0103586, + "balance_loss_clip": 1.03021264, + "balance_loss_mlp": 1.02313399, + "epoch": 0.4015632045693672, + "flos": 20595057361920.0, + "grad_norm": 2.0942795582357263, + "language_loss": 0.6439184, + "learning_rate": 2.718305158935434e-06, + "loss": 0.66503519, + "num_input_tokens_seen": 143461385, + "step": 6679, + "time_per_iteration": 2.6449930667877197 + }, + { + "auxiliary_loss_clip": 0.01047662, + "auxiliary_loss_mlp": 0.01032454, + "balance_loss_clip": 1.02701879, + "balance_loss_mlp": 1.0196507, + "epoch": 0.4016233278220352, + "flos": 23438930808960.0, + "grad_norm": 1.482184758184274, + "language_loss": 0.78741175, + "learning_rate": 2.7179416689657554e-06, + "loss": 0.80821288, + "num_input_tokens_seen": 143481750, + "step": 6680, + "time_per_iteration": 2.7328412532806396 + }, + { + "auxiliary_loss_clip": 0.01049352, + "auxiliary_loss_mlp": 0.0074777, + "balance_loss_clip": 1.02884388, + "balance_loss_mlp": 1.00035942, + "epoch": 0.40168345107470316, + "flos": 21431057477760.0, + "grad_norm": 1.5852107361694936, + "language_loss": 0.75854659, + "learning_rate": 2.7175781517716556e-06, + "loss": 0.77651781, + "num_input_tokens_seen": 143501540, + "step": 6681, + "time_per_iteration": 2.6834561824798584 + }, + { + "auxiliary_loss_clip": 0.01039251, + "auxiliary_loss_mlp": 0.01030309, + "balance_loss_clip": 1.03125167, + "balance_loss_mlp": 1.01788151, + "epoch": 0.4017435743273711, + "flos": 22857214049280.0, + "grad_norm": 1.8846767462430585, + "language_loss": 0.64023733, + "learning_rate": 2.7172146073669213e-06, + "loss": 0.66093302, + "num_input_tokens_seen": 143520530, + "step": 6682, + "time_per_iteration": 2.795029640197754 + }, + { + "auxiliary_loss_clip": 0.01032257, + "auxiliary_loss_mlp": 0.01037507, + "balance_loss_clip": 1.0263145, + "balance_loss_mlp": 1.02431023, + "epoch": 0.4018036975800391, + "flos": 28622312881920.0, + "grad_norm": 1.6807509092699566, + "language_loss": 0.73014343, + "learning_rate": 2.716851035765337e-06, + "loss": 0.75084108, + "num_input_tokens_seen": 143540210, + "step": 6683, + "time_per_iteration": 2.770695924758911 + }, + { + "auxiliary_loss_clip": 0.01064984, + "auxiliary_loss_mlp": 0.01039382, + "balance_loss_clip": 1.02816534, + "balance_loss_mlp": 1.02636433, + "epoch": 0.40186382083270705, + "flos": 26651212099200.0, + "grad_norm": 1.7242294874481752, + "language_loss": 0.73342013, + "learning_rate": 2.7164874369806896e-06, + "loss": 0.75446373, + "num_input_tokens_seen": 143560940, + "step": 6684, + "time_per_iteration": 2.6840341091156006 + }, + { + "auxiliary_loss_clip": 0.00996102, + "auxiliary_loss_mlp": 0.01006739, + "balance_loss_clip": 1.0011766, + "balance_loss_mlp": 1.00509346, + "epoch": 0.401923944085375, + "flos": 59259969123840.0, + "grad_norm": 0.8061597052292658, + "language_loss": 0.603966, + "learning_rate": 2.716123811026767e-06, + "loss": 0.62399435, + "num_input_tokens_seen": 143624015, + "step": 6685, + "time_per_iteration": 3.297964334487915 + }, + { + "auxiliary_loss_clip": 0.01065845, + "auxiliary_loss_mlp": 0.01030573, + "balance_loss_clip": 1.02875268, + "balance_loss_mlp": 1.01802039, + "epoch": 0.401984067338043, + "flos": 16982803152000.0, + "grad_norm": 1.6951888725014463, + "language_loss": 0.70069599, + "learning_rate": 2.715760157917357e-06, + "loss": 0.72166014, + "num_input_tokens_seen": 143642750, + "step": 6686, + "time_per_iteration": 2.6570513248443604 + }, + { + "auxiliary_loss_clip": 0.01054397, + "auxiliary_loss_mlp": 0.01034691, + "balance_loss_clip": 1.02724755, + "balance_loss_mlp": 1.02227545, + "epoch": 0.40204419059071095, + "flos": 24972496024320.0, + "grad_norm": 1.4270120069308585, + "language_loss": 0.74873579, + "learning_rate": 2.7153964776662504e-06, + "loss": 0.76962674, + "num_input_tokens_seen": 143664515, + "step": 6687, + "time_per_iteration": 2.675095558166504 + }, + { + "auxiliary_loss_clip": 0.01060074, + "auxiliary_loss_mlp": 0.01035663, + "balance_loss_clip": 1.03200769, + "balance_loss_mlp": 1.02260375, + "epoch": 0.4021043138433789, + "flos": 23477463123840.0, + "grad_norm": 1.707978419977421, + "language_loss": 0.70654738, + "learning_rate": 2.7150327702872385e-06, + "loss": 0.72750473, + "num_input_tokens_seen": 143683135, + "step": 6688, + "time_per_iteration": 2.642618179321289 + }, + { + "auxiliary_loss_clip": 0.01050038, + "auxiliary_loss_mlp": 0.01038043, + "balance_loss_clip": 1.026546, + "balance_loss_mlp": 1.02363062, + "epoch": 0.4021644370960469, + "flos": 25995806588160.0, + "grad_norm": 1.8649832598668865, + "language_loss": 0.64439142, + "learning_rate": 2.7146690357941112e-06, + "loss": 0.66527224, + "num_input_tokens_seen": 143703985, + "step": 6689, + "time_per_iteration": 2.6372299194335938 + }, + { + "auxiliary_loss_clip": 0.01065803, + "auxiliary_loss_mlp": 0.01031189, + "balance_loss_clip": 1.02716374, + "balance_loss_mlp": 1.01848125, + "epoch": 0.40222456034871484, + "flos": 13587987922560.0, + "grad_norm": 2.036611386616659, + "language_loss": 0.73291087, + "learning_rate": 2.7143052742006632e-06, + "loss": 0.75388074, + "num_input_tokens_seen": 143719245, + "step": 6690, + "time_per_iteration": 2.5372471809387207 + }, + { + "auxiliary_loss_clip": 0.01043062, + "auxiliary_loss_mlp": 0.01033631, + "balance_loss_clip": 1.0266757, + "balance_loss_mlp": 1.02109063, + "epoch": 0.4022846836013828, + "flos": 24278019494400.0, + "grad_norm": 1.6196079447315346, + "language_loss": 0.74718153, + "learning_rate": 2.7139414855206872e-06, + "loss": 0.76794851, + "num_input_tokens_seen": 143739575, + "step": 6691, + "time_per_iteration": 2.7185513973236084 + }, + { + "auxiliary_loss_clip": 0.010611, + "auxiliary_loss_mlp": 0.01035078, + "balance_loss_clip": 1.03064799, + "balance_loss_mlp": 1.02182221, + "epoch": 0.40234480685405083, + "flos": 20151596050560.0, + "grad_norm": 1.5526618443322677, + "language_loss": 0.72459549, + "learning_rate": 2.7135776697679785e-06, + "loss": 0.74555725, + "num_input_tokens_seen": 143758515, + "step": 6692, + "time_per_iteration": 2.674380302429199 + }, + { + "auxiliary_loss_clip": 0.01029294, + "auxiliary_loss_mlp": 0.01034168, + "balance_loss_clip": 1.02512622, + "balance_loss_mlp": 1.02143598, + "epoch": 0.4024049301067188, + "flos": 22930220442240.0, + "grad_norm": 2.152609697827635, + "language_loss": 0.83997023, + "learning_rate": 2.7132138269563333e-06, + "loss": 0.86060488, + "num_input_tokens_seen": 143776770, + "step": 6693, + "time_per_iteration": 5.902799606323242 + }, + { + "auxiliary_loss_clip": 0.01041454, + "auxiliary_loss_mlp": 0.01042031, + "balance_loss_clip": 1.02899086, + "balance_loss_mlp": 1.02805936, + "epoch": 0.40246505335938676, + "flos": 36028421487360.0, + "grad_norm": 1.6633802123394905, + "language_loss": 0.70839494, + "learning_rate": 2.7128499570995483e-06, + "loss": 0.72922981, + "num_input_tokens_seen": 143798450, + "step": 6694, + "time_per_iteration": 2.9477782249450684 + }, + { + "auxiliary_loss_clip": 0.01056062, + "auxiliary_loss_mlp": 0.01040718, + "balance_loss_clip": 1.02864778, + "balance_loss_mlp": 1.02735496, + "epoch": 0.4025251766120547, + "flos": 20594303176320.0, + "grad_norm": 4.5320974834204675, + "language_loss": 0.68008769, + "learning_rate": 2.7124860602114212e-06, + "loss": 0.70105553, + "num_input_tokens_seen": 143816995, + "step": 6695, + "time_per_iteration": 2.8182809352874756 + }, + { + "auxiliary_loss_clip": 0.01046043, + "auxiliary_loss_mlp": 0.01034685, + "balance_loss_clip": 1.02483082, + "balance_loss_mlp": 1.02104712, + "epoch": 0.4025852998647227, + "flos": 64523932381440.0, + "grad_norm": 2.3002350785632797, + "language_loss": 0.79550207, + "learning_rate": 2.7121221363057515e-06, + "loss": 0.81630933, + "num_input_tokens_seen": 143842090, + "step": 6696, + "time_per_iteration": 3.0322189331054688 + }, + { + "auxiliary_loss_clip": 0.01050128, + "auxiliary_loss_mlp": 0.0103759, + "balance_loss_clip": 1.0280757, + "balance_loss_mlp": 1.0234158, + "epoch": 0.40264542311739066, + "flos": 20886292834560.0, + "grad_norm": 1.7841183502142135, + "language_loss": 0.70787215, + "learning_rate": 2.7117581853963393e-06, + "loss": 0.72874928, + "num_input_tokens_seen": 143860800, + "step": 6697, + "time_per_iteration": 2.692523717880249 + }, + { + "auxiliary_loss_clip": 0.01062673, + "auxiliary_loss_mlp": 0.01041771, + "balance_loss_clip": 1.02773821, + "balance_loss_mlp": 1.029338, + "epoch": 0.4027055463700586, + "flos": 26250197685120.0, + "grad_norm": 2.2104253243550605, + "language_loss": 0.6160506, + "learning_rate": 2.711394207496984e-06, + "loss": 0.63709503, + "num_input_tokens_seen": 143878950, + "step": 6698, + "time_per_iteration": 2.634744644165039 + }, + { + "auxiliary_loss_clip": 0.01066955, + "auxiliary_loss_mlp": 0.01034268, + "balance_loss_clip": 1.02939522, + "balance_loss_mlp": 1.02128005, + "epoch": 0.4027656696227266, + "flos": 20631398947200.0, + "grad_norm": 1.9964451278765265, + "language_loss": 0.76433599, + "learning_rate": 2.711030202621491e-06, + "loss": 0.78534818, + "num_input_tokens_seen": 143898385, + "step": 6699, + "time_per_iteration": 2.684258460998535 + }, + { + "auxiliary_loss_clip": 0.01043886, + "auxiliary_loss_mlp": 0.01032583, + "balance_loss_clip": 1.02790046, + "balance_loss_mlp": 1.02002406, + "epoch": 0.40282579287539455, + "flos": 22346277039360.0, + "grad_norm": 1.898295220451947, + "language_loss": 0.80274922, + "learning_rate": 2.7106661707836605e-06, + "loss": 0.82351387, + "num_input_tokens_seen": 143918795, + "step": 6700, + "time_per_iteration": 2.780998706817627 + }, + { + "auxiliary_loss_clip": 0.0106061, + "auxiliary_loss_mlp": 0.0104193, + "balance_loss_clip": 1.03051376, + "balance_loss_mlp": 1.02754736, + "epoch": 0.4028859161280625, + "flos": 29274988959360.0, + "grad_norm": 1.8096565482868692, + "language_loss": 0.74683279, + "learning_rate": 2.7103021119972977e-06, + "loss": 0.76785821, + "num_input_tokens_seen": 143938245, + "step": 6701, + "time_per_iteration": 2.648895263671875 + }, + { + "auxiliary_loss_clip": 0.01049643, + "auxiliary_loss_mlp": 0.01038675, + "balance_loss_clip": 1.02808809, + "balance_loss_mlp": 1.02614629, + "epoch": 0.4029460393807305, + "flos": 28622312881920.0, + "grad_norm": 1.5093425889115553, + "language_loss": 0.65977603, + "learning_rate": 2.709938026276208e-06, + "loss": 0.68065917, + "num_input_tokens_seen": 143960995, + "step": 6702, + "time_per_iteration": 4.214357376098633 + }, + { + "auxiliary_loss_clip": 0.01046544, + "auxiliary_loss_mlp": 0.01040387, + "balance_loss_clip": 1.02646661, + "balance_loss_mlp": 1.02590859, + "epoch": 0.40300616263339845, + "flos": 22601925112320.0, + "grad_norm": 2.4688785198997567, + "language_loss": 0.65850693, + "learning_rate": 2.7095739136341964e-06, + "loss": 0.67937618, + "num_input_tokens_seen": 143979910, + "step": 6703, + "time_per_iteration": 4.4479146003723145 + }, + { + "auxiliary_loss_clip": 0.01006559, + "auxiliary_loss_mlp": 0.01036028, + "balance_loss_clip": 1.02488256, + "balance_loss_mlp": 1.02180672, + "epoch": 0.4030662858860664, + "flos": 25520313323520.0, + "grad_norm": 2.0098882410796706, + "language_loss": 0.81900156, + "learning_rate": 2.709209774085071e-06, + "loss": 0.83942747, + "num_input_tokens_seen": 144000095, + "step": 6704, + "time_per_iteration": 2.9177279472351074 + }, + { + "auxiliary_loss_clip": 0.01051965, + "auxiliary_loss_mlp": 0.0103287, + "balance_loss_clip": 1.0284847, + "balance_loss_mlp": 1.01991236, + "epoch": 0.40312640913873443, + "flos": 23586703361280.0, + "grad_norm": 1.6481866865102814, + "language_loss": 0.73226953, + "learning_rate": 2.7088456076426407e-06, + "loss": 0.75311786, + "num_input_tokens_seen": 144019695, + "step": 6705, + "time_per_iteration": 3.0701117515563965 + }, + { + "auxiliary_loss_clip": 0.01065554, + "auxiliary_loss_mlp": 0.01034581, + "balance_loss_clip": 1.02932334, + "balance_loss_mlp": 1.02239144, + "epoch": 0.4031865323914024, + "flos": 20011042131840.0, + "grad_norm": 1.711185352812789, + "language_loss": 0.66366851, + "learning_rate": 2.708481414320713e-06, + "loss": 0.68466985, + "num_input_tokens_seen": 144038525, + "step": 6706, + "time_per_iteration": 2.632545232772827 + }, + { + "auxiliary_loss_clip": 0.01068138, + "auxiliary_loss_mlp": 0.0103701, + "balance_loss_clip": 1.02998304, + "balance_loss_mlp": 1.02352738, + "epoch": 0.40324665564407036, + "flos": 21871430219520.0, + "grad_norm": 1.6085516828377138, + "language_loss": 0.71428955, + "learning_rate": 2.7081171941330992e-06, + "loss": 0.73534107, + "num_input_tokens_seen": 144059485, + "step": 6707, + "time_per_iteration": 2.6388397216796875 + }, + { + "auxiliary_loss_clip": 0.01051914, + "auxiliary_loss_mlp": 0.01032339, + "balance_loss_clip": 1.02709639, + "balance_loss_mlp": 1.01973248, + "epoch": 0.4033067788967383, + "flos": 23878728933120.0, + "grad_norm": 1.5447283266166674, + "language_loss": 0.80220342, + "learning_rate": 2.707752947093611e-06, + "loss": 0.82304597, + "num_input_tokens_seen": 144080265, + "step": 6708, + "time_per_iteration": 2.706132411956787 + }, + { + "auxiliary_loss_clip": 0.01027151, + "auxiliary_loss_mlp": 0.01037452, + "balance_loss_clip": 1.02555287, + "balance_loss_mlp": 1.02404058, + "epoch": 0.4033669021494063, + "flos": 17419907756160.0, + "grad_norm": 2.144742874805986, + "language_loss": 0.82354122, + "learning_rate": 2.70738867321606e-06, + "loss": 0.84418726, + "num_input_tokens_seen": 144098040, + "step": 6709, + "time_per_iteration": 2.6701130867004395 + }, + { + "auxiliary_loss_clip": 0.01068627, + "auxiliary_loss_mlp": 0.01036724, + "balance_loss_clip": 1.03001714, + "balance_loss_mlp": 1.02349162, + "epoch": 0.40342702540207426, + "flos": 29600554855680.0, + "grad_norm": 1.5073187603747313, + "language_loss": 0.71425319, + "learning_rate": 2.70702437251426e-06, + "loss": 0.73530674, + "num_input_tokens_seen": 144118265, + "step": 6710, + "time_per_iteration": 2.670880079269409 + }, + { + "auxiliary_loss_clip": 0.0104628, + "auxiliary_loss_mlp": 0.01036226, + "balance_loss_clip": 1.02686977, + "balance_loss_mlp": 1.02261198, + "epoch": 0.4034871486547422, + "flos": 11284605400320.0, + "grad_norm": 2.195490835311863, + "language_loss": 0.84915936, + "learning_rate": 2.7066600450020236e-06, + "loss": 0.86998445, + "num_input_tokens_seen": 144133865, + "step": 6711, + "time_per_iteration": 2.828665256500244 + }, + { + "auxiliary_loss_clip": 0.01066727, + "auxiliary_loss_mlp": 0.010326, + "balance_loss_clip": 1.02806044, + "balance_loss_mlp": 1.0197674, + "epoch": 0.4035472719074102, + "flos": 15552839738880.0, + "grad_norm": 1.9448558058012544, + "language_loss": 0.75703299, + "learning_rate": 2.706295690693168e-06, + "loss": 0.77802622, + "num_input_tokens_seen": 144150125, + "step": 6712, + "time_per_iteration": 2.5813207626342773 + }, + { + "auxiliary_loss_clip": 0.01042961, + "auxiliary_loss_mlp": 0.01037751, + "balance_loss_clip": 1.02822518, + "balance_loss_mlp": 1.02433944, + "epoch": 0.40360739516007815, + "flos": 24674365140480.0, + "grad_norm": 1.9322526586528264, + "language_loss": 0.7845577, + "learning_rate": 2.7059313096015096e-06, + "loss": 0.80536479, + "num_input_tokens_seen": 144169295, + "step": 6713, + "time_per_iteration": 2.736513137817383 + }, + { + "auxiliary_loss_clip": 0.01039857, + "auxiliary_loss_mlp": 0.01036941, + "balance_loss_clip": 1.02599216, + "balance_loss_mlp": 1.02244473, + "epoch": 0.4036675184127461, + "flos": 17304095329920.0, + "grad_norm": 1.7781811885371883, + "language_loss": 0.88211316, + "learning_rate": 2.705566901740865e-06, + "loss": 0.90288115, + "num_input_tokens_seen": 144185790, + "step": 6714, + "time_per_iteration": 2.7622947692871094 + }, + { + "auxiliary_loss_clip": 0.01064094, + "auxiliary_loss_mlp": 0.01036557, + "balance_loss_clip": 1.02710557, + "balance_loss_mlp": 1.02421856, + "epoch": 0.4037276416654141, + "flos": 19864023765120.0, + "grad_norm": 1.4613638843636414, + "language_loss": 0.69175178, + "learning_rate": 2.7052024671250527e-06, + "loss": 0.7127583, + "num_input_tokens_seen": 144205190, + "step": 6715, + "time_per_iteration": 2.700171947479248 + }, + { + "auxiliary_loss_clip": 0.01031199, + "auxiliary_loss_mlp": 0.01032514, + "balance_loss_clip": 1.02477837, + "balance_loss_mlp": 1.01929343, + "epoch": 0.40378776491808205, + "flos": 18296271780480.0, + "grad_norm": 2.0503788741046813, + "language_loss": 0.77242529, + "learning_rate": 2.704838005767892e-06, + "loss": 0.79306245, + "num_input_tokens_seen": 144222705, + "step": 6716, + "time_per_iteration": 2.949108600616455 + }, + { + "auxiliary_loss_clip": 0.01033559, + "auxiliary_loss_mlp": 0.01034409, + "balance_loss_clip": 1.02754366, + "balance_loss_mlp": 1.02239883, + "epoch": 0.40384788817075, + "flos": 15049372757760.0, + "grad_norm": 1.9249622988762982, + "language_loss": 0.76206005, + "learning_rate": 2.7044735176832037e-06, + "loss": 0.78273976, + "num_input_tokens_seen": 144239545, + "step": 6717, + "time_per_iteration": 2.9909567832946777 + }, + { + "auxiliary_loss_clip": 0.00984296, + "auxiliary_loss_mlp": 0.01005907, + "balance_loss_clip": 1.00362456, + "balance_loss_mlp": 1.00447619, + "epoch": 0.40390801142341803, + "flos": 61929927895680.0, + "grad_norm": 0.9381468877847206, + "language_loss": 0.60757238, + "learning_rate": 2.7041090028848084e-06, + "loss": 0.62747437, + "num_input_tokens_seen": 144288145, + "step": 6718, + "time_per_iteration": 3.251357078552246 + }, + { + "auxiliary_loss_clip": 0.01080295, + "auxiliary_loss_mlp": 0.01039752, + "balance_loss_clip": 1.02994812, + "balance_loss_mlp": 1.02563739, + "epoch": 0.403968134676086, + "flos": 22738779930240.0, + "grad_norm": 2.059391280545663, + "language_loss": 0.74723285, + "learning_rate": 2.7037444613865306e-06, + "loss": 0.76843333, + "num_input_tokens_seen": 144302315, + "step": 6719, + "time_per_iteration": 2.768097162246704 + }, + { + "auxiliary_loss_clip": 0.01067397, + "auxiliary_loss_mlp": 0.01041368, + "balance_loss_clip": 1.02821565, + "balance_loss_mlp": 1.0274148, + "epoch": 0.40402825792875396, + "flos": 19784409269760.0, + "grad_norm": 2.9626476109624855, + "language_loss": 0.81265664, + "learning_rate": 2.7033798932021906e-06, + "loss": 0.83374429, + "num_input_tokens_seen": 144318990, + "step": 6720, + "time_per_iteration": 2.862332344055176 + }, + { + "auxiliary_loss_clip": 0.01053412, + "auxiliary_loss_mlp": 0.0103063, + "balance_loss_clip": 1.0258193, + "balance_loss_mlp": 1.01784468, + "epoch": 0.40408838118142193, + "flos": 19609273532160.0, + "grad_norm": 2.1237190738153733, + "language_loss": 0.76916808, + "learning_rate": 2.7030152983456153e-06, + "loss": 0.79000854, + "num_input_tokens_seen": 144335765, + "step": 6721, + "time_per_iteration": 2.7427175045013428 + }, + { + "auxiliary_loss_clip": 0.01041535, + "auxiliary_loss_mlp": 0.01026501, + "balance_loss_clip": 1.02688122, + "balance_loss_mlp": 1.01533079, + "epoch": 0.4041485044340899, + "flos": 24426043441920.0, + "grad_norm": 1.6692287990147467, + "language_loss": 0.72604299, + "learning_rate": 2.7026506768306304e-06, + "loss": 0.74672329, + "num_input_tokens_seen": 144355825, + "step": 6722, + "time_per_iteration": 2.689972400665283 + }, + { + "auxiliary_loss_clip": 0.01063974, + "auxiliary_loss_mlp": 0.01029335, + "balance_loss_clip": 1.02771223, + "balance_loss_mlp": 1.01736689, + "epoch": 0.40420862768675786, + "flos": 16760192613120.0, + "grad_norm": 1.8702926081125932, + "language_loss": 0.65860987, + "learning_rate": 2.7022860286710602e-06, + "loss": 0.67954296, + "num_input_tokens_seen": 144374320, + "step": 6723, + "time_per_iteration": 2.5542690753936768 + }, + { + "auxiliary_loss_clip": 0.01068748, + "auxiliary_loss_mlp": 0.01046723, + "balance_loss_clip": 1.03076458, + "balance_loss_mlp": 1.03253663, + "epoch": 0.4042687509394258, + "flos": 22491571553280.0, + "grad_norm": 1.43881077212799, + "language_loss": 0.7390421, + "learning_rate": 2.701921353880734e-06, + "loss": 0.76019681, + "num_input_tokens_seen": 144394325, + "step": 6724, + "time_per_iteration": 2.581246852874756 + }, + { + "auxiliary_loss_clip": 0.01050603, + "auxiliary_loss_mlp": 0.01035883, + "balance_loss_clip": 1.02784276, + "balance_loss_mlp": 1.02378953, + "epoch": 0.4043288741920938, + "flos": 30336149479680.0, + "grad_norm": 1.6990738177159421, + "language_loss": 0.74876881, + "learning_rate": 2.7015566524734787e-06, + "loss": 0.76963365, + "num_input_tokens_seen": 144412765, + "step": 6725, + "time_per_iteration": 2.659768581390381 + }, + { + "auxiliary_loss_clip": 0.01061872, + "auxiliary_loss_mlp": 0.01036034, + "balance_loss_clip": 1.02601671, + "balance_loss_mlp": 1.02212214, + "epoch": 0.40438899744476176, + "flos": 46348321363200.0, + "grad_norm": 1.6084369226482864, + "language_loss": 0.76849377, + "learning_rate": 2.701191924463126e-06, + "loss": 0.78947282, + "num_input_tokens_seen": 144435400, + "step": 6726, + "time_per_iteration": 2.8478260040283203 + }, + { + "auxiliary_loss_clip": 0.01047072, + "auxiliary_loss_mlp": 0.00747729, + "balance_loss_clip": 1.02442622, + "balance_loss_mlp": 1.00042558, + "epoch": 0.4044491206974297, + "flos": 13333524998400.0, + "grad_norm": 2.0475521969644985, + "language_loss": 0.81448185, + "learning_rate": 2.7008271698635054e-06, + "loss": 0.83242983, + "num_input_tokens_seen": 144452925, + "step": 6727, + "time_per_iteration": 2.6469919681549072 + }, + { + "auxiliary_loss_clip": 0.0107533, + "auxiliary_loss_mlp": 0.01032342, + "balance_loss_clip": 1.02886403, + "balance_loss_mlp": 1.01997972, + "epoch": 0.4045092439500977, + "flos": 12093745121280.0, + "grad_norm": 2.1207935419446513, + "language_loss": 0.84893906, + "learning_rate": 2.700462388688447e-06, + "loss": 0.87001574, + "num_input_tokens_seen": 144470195, + "step": 6728, + "time_per_iteration": 2.6914734840393066 + }, + { + "auxiliary_loss_clip": 0.01041379, + "auxiliary_loss_mlp": 0.01034822, + "balance_loss_clip": 1.02820063, + "balance_loss_mlp": 1.02209091, + "epoch": 0.40456936720276565, + "flos": 21179683123200.0, + "grad_norm": 6.273127726449955, + "language_loss": 0.81384987, + "learning_rate": 2.700097580951786e-06, + "loss": 0.83461189, + "num_input_tokens_seen": 144490320, + "step": 6729, + "time_per_iteration": 2.854842185974121 + }, + { + "auxiliary_loss_clip": 0.01050773, + "auxiliary_loss_mlp": 0.01038747, + "balance_loss_clip": 1.02599931, + "balance_loss_mlp": 1.02565813, + "epoch": 0.4046294904554336, + "flos": 23915286000000.0, + "grad_norm": 2.0702242606846215, + "language_loss": 0.74051791, + "learning_rate": 2.6997327466673533e-06, + "loss": 0.7614131, + "num_input_tokens_seen": 144508990, + "step": 6730, + "time_per_iteration": 2.645047187805176 + }, + { + "auxiliary_loss_clip": 0.01058497, + "auxiliary_loss_mlp": 0.01037605, + "balance_loss_clip": 1.02467752, + "balance_loss_mlp": 1.02448034, + "epoch": 0.4046896137081016, + "flos": 38071235773440.0, + "grad_norm": 2.2701541538228382, + "language_loss": 0.6737808, + "learning_rate": 2.699367885848985e-06, + "loss": 0.69474179, + "num_input_tokens_seen": 144529550, + "step": 6731, + "time_per_iteration": 2.726780414581299 + }, + { + "auxiliary_loss_clip": 0.0107365, + "auxiliary_loss_mlp": 0.01031527, + "balance_loss_clip": 1.02771425, + "balance_loss_mlp": 1.01940298, + "epoch": 0.4047497369607696, + "flos": 23617262856960.0, + "grad_norm": 1.5948641796531209, + "language_loss": 0.73633111, + "learning_rate": 2.699002998510517e-06, + "loss": 0.75738293, + "num_input_tokens_seen": 144549310, + "step": 6732, + "time_per_iteration": 2.6013777256011963 + }, + { + "auxiliary_loss_clip": 0.01053473, + "auxiliary_loss_mlp": 0.0074753, + "balance_loss_clip": 1.02802515, + "balance_loss_mlp": 1.00038433, + "epoch": 0.40480986021343757, + "flos": 12823593569280.0, + "grad_norm": 1.6180746501627887, + "language_loss": 0.77269125, + "learning_rate": 2.6986380846657852e-06, + "loss": 0.79070127, + "num_input_tokens_seen": 144567430, + "step": 6733, + "time_per_iteration": 2.67744779586792 + }, + { + "auxiliary_loss_clip": 0.01045188, + "auxiliary_loss_mlp": 0.01037945, + "balance_loss_clip": 1.02443814, + "balance_loss_mlp": 1.02357459, + "epoch": 0.40486998346610553, + "flos": 23768770423680.0, + "grad_norm": 1.7380566773295012, + "language_loss": 0.77050161, + "learning_rate": 2.698273144328627e-06, + "loss": 0.79133296, + "num_input_tokens_seen": 144585975, + "step": 6734, + "time_per_iteration": 2.640300750732422 + }, + { + "auxiliary_loss_clip": 0.01058287, + "auxiliary_loss_mlp": 0.01036155, + "balance_loss_clip": 1.02946746, + "balance_loss_mlp": 1.02345288, + "epoch": 0.4049301067187735, + "flos": 22856818999680.0, + "grad_norm": 1.9462010658984763, + "language_loss": 0.652637, + "learning_rate": 2.6979081775128805e-06, + "loss": 0.67358142, + "num_input_tokens_seen": 144605225, + "step": 6735, + "time_per_iteration": 2.6827683448791504 + }, + { + "auxiliary_loss_clip": 0.01032482, + "auxiliary_loss_mlp": 0.01037228, + "balance_loss_clip": 1.02278352, + "balance_loss_mlp": 1.02458608, + "epoch": 0.40499022997144146, + "flos": 22783992174720.0, + "grad_norm": 1.7511722481925251, + "language_loss": 0.83387363, + "learning_rate": 2.697543184232387e-06, + "loss": 0.85457069, + "num_input_tokens_seen": 144624145, + "step": 6736, + "time_per_iteration": 2.8124759197235107 + }, + { + "auxiliary_loss_clip": 0.01049309, + "auxiliary_loss_mlp": 0.00747711, + "balance_loss_clip": 1.02955842, + "balance_loss_mlp": 1.00033355, + "epoch": 0.4050503532241094, + "flos": 23039352938880.0, + "grad_norm": 1.788288604270356, + "language_loss": 0.75369263, + "learning_rate": 2.6971781645009863e-06, + "loss": 0.77166283, + "num_input_tokens_seen": 144644470, + "step": 6737, + "time_per_iteration": 2.7548632621765137 + }, + { + "auxiliary_loss_clip": 0.01062765, + "auxiliary_loss_mlp": 0.01039746, + "balance_loss_clip": 1.02713513, + "balance_loss_mlp": 1.02742624, + "epoch": 0.4051104764767774, + "flos": 16647756065280.0, + "grad_norm": 2.0718507719460195, + "language_loss": 0.71751064, + "learning_rate": 2.696813118332519e-06, + "loss": 0.7385357, + "num_input_tokens_seen": 144661055, + "step": 6738, + "time_per_iteration": 2.6406216621398926 + }, + { + "auxiliary_loss_clip": 0.01042068, + "auxiliary_loss_mlp": 0.01032431, + "balance_loss_clip": 1.02648318, + "balance_loss_mlp": 1.02118993, + "epoch": 0.40517059972944536, + "flos": 16358962717440.0, + "grad_norm": 1.9800751101408482, + "language_loss": 0.74889499, + "learning_rate": 2.696448045740828e-06, + "loss": 0.76964003, + "num_input_tokens_seen": 144677935, + "step": 6739, + "time_per_iteration": 2.6716763973236084 + }, + { + "auxiliary_loss_clip": 0.01041471, + "auxiliary_loss_mlp": 0.01035397, + "balance_loss_clip": 1.0267899, + "balance_loss_mlp": 1.02239132, + "epoch": 0.4052307229821133, + "flos": 28803374363520.0, + "grad_norm": 1.6588591812874114, + "language_loss": 0.74297535, + "learning_rate": 2.6960829467397576e-06, + "loss": 0.76374412, + "num_input_tokens_seen": 144697725, + "step": 6740, + "time_per_iteration": 4.387643814086914 + }, + { + "auxiliary_loss_clip": 0.01055318, + "auxiliary_loss_mlp": 0.01032712, + "balance_loss_clip": 1.02585554, + "balance_loss_mlp": 1.02056432, + "epoch": 0.4052908462347813, + "flos": 21397876289280.0, + "grad_norm": 1.6315744452650525, + "language_loss": 0.7692169, + "learning_rate": 2.695717821343153e-06, + "loss": 0.79009712, + "num_input_tokens_seen": 144718805, + "step": 6741, + "time_per_iteration": 4.15992283821106 + }, + { + "auxiliary_loss_clip": 0.01076361, + "auxiliary_loss_mlp": 0.01034253, + "balance_loss_clip": 1.0284549, + "balance_loss_mlp": 1.02078831, + "epoch": 0.40535096948744925, + "flos": 22419067950720.0, + "grad_norm": 1.8422004359217203, + "language_loss": 0.71480608, + "learning_rate": 2.6953526695648577e-06, + "loss": 0.7359122, + "num_input_tokens_seen": 144737105, + "step": 6742, + "time_per_iteration": 2.5575921535491943 + }, + { + "auxiliary_loss_clip": 0.01076435, + "auxiliary_loss_mlp": 0.01031299, + "balance_loss_clip": 1.02854943, + "balance_loss_mlp": 1.01840019, + "epoch": 0.4054110927401172, + "flos": 17010776868480.0, + "grad_norm": 2.2396459944026903, + "language_loss": 0.72574896, + "learning_rate": 2.6949874914187202e-06, + "loss": 0.74682635, + "num_input_tokens_seen": 144751350, + "step": 6743, + "time_per_iteration": 2.485283136367798 + }, + { + "auxiliary_loss_clip": 0.01046914, + "auxiliary_loss_mlp": 0.01035815, + "balance_loss_clip": 1.02401304, + "balance_loss_mlp": 1.02198672, + "epoch": 0.4054712159927852, + "flos": 21614848392960.0, + "grad_norm": 2.110458888554228, + "language_loss": 0.70339364, + "learning_rate": 2.694622286918588e-06, + "loss": 0.72422093, + "num_input_tokens_seen": 144770030, + "step": 6744, + "time_per_iteration": 2.7207131385803223 + }, + { + "auxiliary_loss_clip": 0.01063513, + "auxiliary_loss_mlp": 0.01035452, + "balance_loss_clip": 1.02768874, + "balance_loss_mlp": 1.02385879, + "epoch": 0.4055313392454532, + "flos": 25812554376960.0, + "grad_norm": 3.0233060867972923, + "language_loss": 0.79911894, + "learning_rate": 2.6942570560783076e-06, + "loss": 0.82010853, + "num_input_tokens_seen": 144790965, + "step": 6745, + "time_per_iteration": 2.676395893096924 + }, + { + "auxiliary_loss_clip": 0.01052951, + "auxiliary_loss_mlp": 0.01036432, + "balance_loss_clip": 1.02855229, + "balance_loss_mlp": 1.02320027, + "epoch": 0.40559146249812117, + "flos": 14137098111360.0, + "grad_norm": 2.0409553435368886, + "language_loss": 0.67087078, + "learning_rate": 2.693891798911731e-06, + "loss": 0.69176459, + "num_input_tokens_seen": 144807755, + "step": 6746, + "time_per_iteration": 2.5731635093688965 + }, + { + "auxiliary_loss_clip": 0.01035593, + "auxiliary_loss_mlp": 0.01027712, + "balance_loss_clip": 1.02566612, + "balance_loss_mlp": 1.01524854, + "epoch": 0.40565158575078913, + "flos": 41355481962240.0, + "grad_norm": 1.4306134103454333, + "language_loss": 0.57154381, + "learning_rate": 2.6935265154327075e-06, + "loss": 0.59217685, + "num_input_tokens_seen": 144832405, + "step": 6747, + "time_per_iteration": 2.836632013320923 + }, + { + "auxiliary_loss_clip": 0.0104598, + "auxiliary_loss_mlp": 0.01039223, + "balance_loss_clip": 1.0299412, + "balance_loss_mlp": 1.02708149, + "epoch": 0.4057117090034571, + "flos": 28544529980160.0, + "grad_norm": 1.9202797489393022, + "language_loss": 0.84652692, + "learning_rate": 2.693161205655089e-06, + "loss": 0.86737895, + "num_input_tokens_seen": 144853890, + "step": 6748, + "time_per_iteration": 2.717632293701172 + }, + { + "auxiliary_loss_clip": 0.01057778, + "auxiliary_loss_mlp": 0.01035475, + "balance_loss_clip": 1.03167331, + "balance_loss_mlp": 1.02223706, + "epoch": 0.40577183225612506, + "flos": 18004066640640.0, + "grad_norm": 1.8573745343604582, + "language_loss": 0.81049764, + "learning_rate": 2.6927958695927287e-06, + "loss": 0.83143014, + "num_input_tokens_seen": 144871395, + "step": 6749, + "time_per_iteration": 4.2199320793151855 + }, + { + "auxiliary_loss_clip": 0.01059511, + "auxiliary_loss_mlp": 0.00747714, + "balance_loss_clip": 1.02694273, + "balance_loss_mlp": 1.00019217, + "epoch": 0.40583195550879303, + "flos": 19536734016000.0, + "grad_norm": 1.6524016456491348, + "language_loss": 0.75721449, + "learning_rate": 2.6924305072594784e-06, + "loss": 0.77528673, + "num_input_tokens_seen": 144890975, + "step": 6750, + "time_per_iteration": 2.671117067337036 + }, + { + "auxiliary_loss_clip": 0.0105715, + "auxiliary_loss_mlp": 0.01034807, + "balance_loss_clip": 1.02659726, + "balance_loss_mlp": 1.02096105, + "epoch": 0.405892078761461, + "flos": 22309468577280.0, + "grad_norm": 2.979270408353393, + "language_loss": 0.73501861, + "learning_rate": 2.692065118669195e-06, + "loss": 0.75593817, + "num_input_tokens_seen": 144908170, + "step": 6751, + "time_per_iteration": 4.280811548233032 + }, + { + "auxiliary_loss_clip": 0.01028389, + "auxiliary_loss_mlp": 0.0103862, + "balance_loss_clip": 1.02763915, + "balance_loss_mlp": 1.02384388, + "epoch": 0.40595220201412896, + "flos": 25484402701440.0, + "grad_norm": 1.628907519632758, + "language_loss": 0.66704375, + "learning_rate": 2.6916997038357326e-06, + "loss": 0.68771386, + "num_input_tokens_seen": 144928020, + "step": 6752, + "time_per_iteration": 2.7588558197021484 + }, + { + "auxiliary_loss_clip": 0.0103359, + "auxiliary_loss_mlp": 0.01039693, + "balance_loss_clip": 1.02712595, + "balance_loss_mlp": 1.02504241, + "epoch": 0.4060123252667969, + "flos": 49856004103680.0, + "grad_norm": 1.7958019714785818, + "language_loss": 0.7076208, + "learning_rate": 2.691334262772948e-06, + "loss": 0.72835362, + "num_input_tokens_seen": 144951240, + "step": 6753, + "time_per_iteration": 3.113940477371216 + }, + { + "auxiliary_loss_clip": 0.01050222, + "auxiliary_loss_mlp": 0.01036405, + "balance_loss_clip": 1.02654219, + "balance_loss_mlp": 1.02226138, + "epoch": 0.4060724485194649, + "flos": 21135476459520.0, + "grad_norm": 2.2217050032362287, + "language_loss": 0.7193414, + "learning_rate": 2.690968795494699e-06, + "loss": 0.74020767, + "num_input_tokens_seen": 144969100, + "step": 6754, + "time_per_iteration": 2.6429567337036133 + }, + { + "auxiliary_loss_clip": 0.01039189, + "auxiliary_loss_mlp": 0.01041078, + "balance_loss_clip": 1.02498758, + "balance_loss_mlp": 1.02757144, + "epoch": 0.40613257177213286, + "flos": 21758059918080.0, + "grad_norm": 1.748975922733633, + "language_loss": 0.82875049, + "learning_rate": 2.690603302014844e-06, + "loss": 0.84955323, + "num_input_tokens_seen": 144987065, + "step": 6755, + "time_per_iteration": 2.795685291290283 + }, + { + "auxiliary_loss_clip": 0.01034624, + "auxiliary_loss_mlp": 0.01040054, + "balance_loss_clip": 1.02915382, + "balance_loss_mlp": 1.02570152, + "epoch": 0.4061926950248008, + "flos": 25555074710400.0, + "grad_norm": 1.9175490892791918, + "language_loss": 0.7035898, + "learning_rate": 2.6902377823472426e-06, + "loss": 0.72433662, + "num_input_tokens_seen": 145007310, + "step": 6756, + "time_per_iteration": 2.978855609893799 + }, + { + "auxiliary_loss_clip": 0.01012985, + "auxiliary_loss_mlp": 0.00747732, + "balance_loss_clip": 1.02311993, + "balance_loss_mlp": 1.00011563, + "epoch": 0.4062528182774688, + "flos": 23695799944320.0, + "grad_norm": 1.8156730954617684, + "language_loss": 0.7916522, + "learning_rate": 2.689872236505755e-06, + "loss": 0.80925941, + "num_input_tokens_seen": 145026210, + "step": 6757, + "time_per_iteration": 2.9767444133758545 + }, + { + "auxiliary_loss_clip": 0.01056561, + "auxiliary_loss_mlp": 0.01028157, + "balance_loss_clip": 1.02921939, + "balance_loss_mlp": 1.01581919, + "epoch": 0.4063129415301368, + "flos": 21726027964800.0, + "grad_norm": 1.7630548647928812, + "language_loss": 0.78292704, + "learning_rate": 2.6895066645042437e-06, + "loss": 0.80377424, + "num_input_tokens_seen": 145045475, + "step": 6758, + "time_per_iteration": 2.8420469760894775 + }, + { + "auxiliary_loss_clip": 0.01047813, + "auxiliary_loss_mlp": 0.01036564, + "balance_loss_clip": 1.03155327, + "balance_loss_mlp": 1.02330792, + "epoch": 0.40637306478280477, + "flos": 12787575206400.0, + "grad_norm": 1.970245534368122, + "language_loss": 0.88801849, + "learning_rate": 2.6891410663565703e-06, + "loss": 0.90886229, + "num_input_tokens_seen": 145062260, + "step": 6759, + "time_per_iteration": 2.822305202484131 + }, + { + "auxiliary_loss_clip": 0.01040847, + "auxiliary_loss_mlp": 0.01034035, + "balance_loss_clip": 1.0285058, + "balance_loss_mlp": 1.02130914, + "epoch": 0.40643318803547274, + "flos": 24024490323840.0, + "grad_norm": 1.788523544510184, + "language_loss": 0.64069533, + "learning_rate": 2.688775442076598e-06, + "loss": 0.66144413, + "num_input_tokens_seen": 145082470, + "step": 6760, + "time_per_iteration": 2.8286192417144775 + }, + { + "auxiliary_loss_clip": 0.01063314, + "auxiliary_loss_mlp": 0.01031167, + "balance_loss_clip": 1.02714586, + "balance_loss_mlp": 1.01705813, + "epoch": 0.4064933112881407, + "flos": 25592421876480.0, + "grad_norm": 1.4836465208803948, + "language_loss": 0.74952465, + "learning_rate": 2.688409791678193e-06, + "loss": 0.77046943, + "num_input_tokens_seen": 145105685, + "step": 6761, + "time_per_iteration": 2.725855588912964 + }, + { + "auxiliary_loss_clip": 0.01052214, + "auxiliary_loss_mlp": 0.01037235, + "balance_loss_clip": 1.02958632, + "balance_loss_mlp": 1.02465868, + "epoch": 0.40655343454080867, + "flos": 22054323294720.0, + "grad_norm": 1.4504897620962467, + "language_loss": 0.69780409, + "learning_rate": 2.6880441151752185e-06, + "loss": 0.71869856, + "num_input_tokens_seen": 145125590, + "step": 6762, + "time_per_iteration": 2.671278953552246 + }, + { + "auxiliary_loss_clip": 0.01066484, + "auxiliary_loss_mlp": 0.01037595, + "balance_loss_clip": 1.03130293, + "balance_loss_mlp": 1.02526331, + "epoch": 0.40661355779347663, + "flos": 26468893641600.0, + "grad_norm": 1.6993832931523059, + "language_loss": 0.73462707, + "learning_rate": 2.6876784125815433e-06, + "loss": 0.75566787, + "num_input_tokens_seen": 145146810, + "step": 6763, + "time_per_iteration": 2.8182015419006348 + }, + { + "auxiliary_loss_clip": 0.01027553, + "auxiliary_loss_mlp": 0.0103599, + "balance_loss_clip": 1.02210021, + "balance_loss_mlp": 1.02206635, + "epoch": 0.4066736810461446, + "flos": 13261129136640.0, + "grad_norm": 1.7730823418461021, + "language_loss": 0.68946886, + "learning_rate": 2.687312683911033e-06, + "loss": 0.71010435, + "num_input_tokens_seen": 145163130, + "step": 6764, + "time_per_iteration": 2.8738129138946533 + }, + { + "auxiliary_loss_clip": 0.01042917, + "auxiliary_loss_mlp": 0.01039696, + "balance_loss_clip": 1.02729702, + "balance_loss_mlp": 1.02447319, + "epoch": 0.40673380429881256, + "flos": 28803625758720.0, + "grad_norm": 2.3723934677334535, + "language_loss": 0.91122389, + "learning_rate": 2.686946929177557e-06, + "loss": 0.93205005, + "num_input_tokens_seen": 145181420, + "step": 6765, + "time_per_iteration": 2.7585034370422363 + }, + { + "auxiliary_loss_clip": 0.01060294, + "auxiliary_loss_mlp": 0.0103837, + "balance_loss_clip": 1.02618515, + "balance_loss_mlp": 1.02383828, + "epoch": 0.4067939275514805, + "flos": 12495334152960.0, + "grad_norm": 2.4605484718908044, + "language_loss": 0.78355938, + "learning_rate": 2.6865811483949855e-06, + "loss": 0.804546, + "num_input_tokens_seen": 145198545, + "step": 6766, + "time_per_iteration": 2.770329475402832 + }, + { + "auxiliary_loss_clip": 0.01076851, + "auxiliary_loss_mlp": 0.01035319, + "balance_loss_clip": 1.02750909, + "balance_loss_mlp": 1.02231979, + "epoch": 0.4068540508041485, + "flos": 18770508069120.0, + "grad_norm": 1.8347439890772659, + "language_loss": 0.7700671, + "learning_rate": 2.6862153415771867e-06, + "loss": 0.79118872, + "num_input_tokens_seen": 145215835, + "step": 6767, + "time_per_iteration": 2.742994785308838 + }, + { + "auxiliary_loss_clip": 0.01067619, + "auxiliary_loss_mlp": 0.01031765, + "balance_loss_clip": 1.03001094, + "balance_loss_mlp": 1.01915264, + "epoch": 0.40691417405681646, + "flos": 28512821249280.0, + "grad_norm": 1.8888311284068746, + "language_loss": 0.77480161, + "learning_rate": 2.685849508738034e-06, + "loss": 0.79579544, + "num_input_tokens_seen": 145236555, + "step": 6768, + "time_per_iteration": 2.8901207447052 + }, + { + "auxiliary_loss_clip": 0.01076031, + "auxiliary_loss_mlp": 0.01030429, + "balance_loss_clip": 1.02916694, + "balance_loss_mlp": 1.01807249, + "epoch": 0.4069742973094844, + "flos": 20814040627200.0, + "grad_norm": 2.1906587974947467, + "language_loss": 0.87511683, + "learning_rate": 2.6854836498913995e-06, + "loss": 0.8961814, + "num_input_tokens_seen": 145254595, + "step": 6769, + "time_per_iteration": 2.751753330230713 + }, + { + "auxiliary_loss_clip": 0.0104867, + "auxiliary_loss_mlp": 0.01036233, + "balance_loss_clip": 1.02908599, + "balance_loss_mlp": 1.02370405, + "epoch": 0.4070344205621524, + "flos": 21470272151040.0, + "grad_norm": 1.9784187183282318, + "language_loss": 0.8103385, + "learning_rate": 2.685117765051156e-06, + "loss": 0.83118749, + "num_input_tokens_seen": 145274005, + "step": 6770, + "time_per_iteration": 2.7267727851867676 + }, + { + "auxiliary_loss_clip": 0.01078246, + "auxiliary_loss_mlp": 0.01030572, + "balance_loss_clip": 1.02899563, + "balance_loss_mlp": 1.01694667, + "epoch": 0.4070945438148204, + "flos": 26830046937600.0, + "grad_norm": 1.729432876750853, + "language_loss": 0.8016156, + "learning_rate": 2.6847518542311783e-06, + "loss": 0.82270372, + "num_input_tokens_seen": 145294850, + "step": 6771, + "time_per_iteration": 2.6266064643859863 + }, + { + "auxiliary_loss_clip": 0.01039109, + "auxiliary_loss_mlp": 0.01035882, + "balance_loss_clip": 1.02624893, + "balance_loss_mlp": 1.02306068, + "epoch": 0.4071546670674884, + "flos": 26354158623360.0, + "grad_norm": 1.4896898963398983, + "language_loss": 0.7633605, + "learning_rate": 2.6843859174453417e-06, + "loss": 0.78411043, + "num_input_tokens_seen": 145317050, + "step": 6772, + "time_per_iteration": 2.8400936126708984 + }, + { + "auxiliary_loss_clip": 0.0105369, + "auxiliary_loss_mlp": 0.01034516, + "balance_loss_clip": 1.02632737, + "balance_loss_mlp": 1.02198696, + "epoch": 0.40721479032015634, + "flos": 17895401020800.0, + "grad_norm": 2.1249440200434067, + "language_loss": 0.81385481, + "learning_rate": 2.6840199547075218e-06, + "loss": 0.83473688, + "num_input_tokens_seen": 145334480, + "step": 6773, + "time_per_iteration": 2.643035888671875 + }, + { + "auxiliary_loss_clip": 0.00993833, + "auxiliary_loss_mlp": 0.01007425, + "balance_loss_clip": 1.00391984, + "balance_loss_mlp": 1.00570834, + "epoch": 0.4072749135728243, + "flos": 49854570537600.0, + "grad_norm": 0.8559572255721745, + "language_loss": 0.64346659, + "learning_rate": 2.683653966031597e-06, + "loss": 0.66347909, + "num_input_tokens_seen": 145388695, + "step": 6774, + "time_per_iteration": 3.1132261753082275 + }, + { + "auxiliary_loss_clip": 0.01038959, + "auxiliary_loss_mlp": 0.01033877, + "balance_loss_clip": 1.02939105, + "balance_loss_mlp": 1.02085364, + "epoch": 0.40733503682549227, + "flos": 27563630400000.0, + "grad_norm": 2.031772824239096, + "language_loss": 0.7245183, + "learning_rate": 2.683287951431446e-06, + "loss": 0.74524665, + "num_input_tokens_seen": 145408240, + "step": 6775, + "time_per_iteration": 2.7713141441345215 + }, + { + "auxiliary_loss_clip": 0.0104958, + "auxiliary_loss_mlp": 0.00747813, + "balance_loss_clip": 1.02966607, + "balance_loss_mlp": 1.00027144, + "epoch": 0.40739516007816023, + "flos": 22126970551680.0, + "grad_norm": 1.6642435065311698, + "language_loss": 0.77968496, + "learning_rate": 2.6829219109209474e-06, + "loss": 0.79765892, + "num_input_tokens_seen": 145428395, + "step": 6776, + "time_per_iteration": 2.6892600059509277 + }, + { + "auxiliary_loss_clip": 0.01068366, + "auxiliary_loss_mlp": 0.0103867, + "balance_loss_clip": 1.02856874, + "balance_loss_mlp": 1.02573037, + "epoch": 0.4074552833308282, + "flos": 23842243693440.0, + "grad_norm": 2.4928630224882897, + "language_loss": 0.79205149, + "learning_rate": 2.682555844513981e-06, + "loss": 0.81312191, + "num_input_tokens_seen": 145448290, + "step": 6777, + "time_per_iteration": 2.633769989013672 + }, + { + "auxiliary_loss_clip": 0.01011509, + "auxiliary_loss_mlp": 0.01002153, + "balance_loss_clip": 1.00222397, + "balance_loss_mlp": 1.000615, + "epoch": 0.40751540658349616, + "flos": 58000008781440.0, + "grad_norm": 0.702036971628076, + "language_loss": 0.53155535, + "learning_rate": 2.6821897522244286e-06, + "loss": 0.55169195, + "num_input_tokens_seen": 145509785, + "step": 6778, + "time_per_iteration": 3.14203143119812 + }, + { + "auxiliary_loss_clip": 0.01077371, + "auxiliary_loss_mlp": 0.0074761, + "balance_loss_clip": 1.03007066, + "balance_loss_mlp": 1.00029337, + "epoch": 0.40757552983616413, + "flos": 21214659991680.0, + "grad_norm": 2.0435602226454064, + "language_loss": 0.82641625, + "learning_rate": 2.6818236340661718e-06, + "loss": 0.84466612, + "num_input_tokens_seen": 145528620, + "step": 6779, + "time_per_iteration": 2.621321201324463 + }, + { + "auxiliary_loss_clip": 0.01064698, + "auxiliary_loss_mlp": 0.01034728, + "balance_loss_clip": 1.027233, + "balance_loss_mlp": 1.02164459, + "epoch": 0.4076356530888321, + "flos": 26833530556800.0, + "grad_norm": 1.459480633095451, + "language_loss": 0.75990069, + "learning_rate": 2.6814574900530957e-06, + "loss": 0.78089488, + "num_input_tokens_seen": 145547775, + "step": 6780, + "time_per_iteration": 2.6240334510803223 + }, + { + "auxiliary_loss_clip": 0.01063638, + "auxiliary_loss_mlp": 0.01030798, + "balance_loss_clip": 1.02820754, + "balance_loss_mlp": 1.01908576, + "epoch": 0.40769577634150006, + "flos": 12203021272320.0, + "grad_norm": 2.019774269955229, + "language_loss": 0.66364491, + "learning_rate": 2.6810913201990827e-06, + "loss": 0.68458927, + "num_input_tokens_seen": 145564465, + "step": 6781, + "time_per_iteration": 2.58736252784729 + }, + { + "auxiliary_loss_clip": 0.01049582, + "auxiliary_loss_mlp": 0.01035887, + "balance_loss_clip": 1.02493405, + "balance_loss_mlp": 1.02227366, + "epoch": 0.407755899594168, + "flos": 33655264796160.0, + "grad_norm": 1.685806175168607, + "language_loss": 0.71444106, + "learning_rate": 2.6807251245180183e-06, + "loss": 0.73529577, + "num_input_tokens_seen": 145585965, + "step": 6782, + "time_per_iteration": 2.7542436122894287 + }, + { + "auxiliary_loss_clip": 0.01067058, + "auxiliary_loss_mlp": 0.01032199, + "balance_loss_clip": 1.02861595, + "balance_loss_mlp": 1.01959896, + "epoch": 0.407816022846836, + "flos": 20157342226560.0, + "grad_norm": 1.9982794916737456, + "language_loss": 0.8181693, + "learning_rate": 2.6803589030237897e-06, + "loss": 0.83916187, + "num_input_tokens_seen": 145605000, + "step": 6783, + "time_per_iteration": 2.602292776107788 + }, + { + "auxiliary_loss_clip": 0.01066766, + "auxiliary_loss_mlp": 0.01036205, + "balance_loss_clip": 1.030478, + "balance_loss_mlp": 1.02301455, + "epoch": 0.40787614609950396, + "flos": 21178821196800.0, + "grad_norm": 1.6679992338948944, + "language_loss": 0.80631828, + "learning_rate": 2.679992655730283e-06, + "loss": 0.82734811, + "num_input_tokens_seen": 145623740, + "step": 6784, + "time_per_iteration": 2.7118499279022217 + }, + { + "auxiliary_loss_clip": 0.01052677, + "auxiliary_loss_mlp": 0.01037761, + "balance_loss_clip": 1.0304482, + "balance_loss_mlp": 1.02346146, + "epoch": 0.407936269352172, + "flos": 20520650338560.0, + "grad_norm": 1.6128760503809945, + "language_loss": 0.65596163, + "learning_rate": 2.679626382651386e-06, + "loss": 0.67686599, + "num_input_tokens_seen": 145643515, + "step": 6785, + "time_per_iteration": 2.701462745666504 + }, + { + "auxiliary_loss_clip": 0.01065301, + "auxiliary_loss_mlp": 0.01032864, + "balance_loss_clip": 1.03037727, + "balance_loss_mlp": 1.02097344, + "epoch": 0.40799639260483994, + "flos": 20118809911680.0, + "grad_norm": 2.308855424559058, + "language_loss": 0.80016196, + "learning_rate": 2.679260083800989e-06, + "loss": 0.82114363, + "num_input_tokens_seen": 145660890, + "step": 6786, + "time_per_iteration": 2.5593578815460205 + }, + { + "auxiliary_loss_clip": 0.01073743, + "auxiliary_loss_mlp": 0.01033682, + "balance_loss_clip": 1.02839112, + "balance_loss_mlp": 1.02199399, + "epoch": 0.4080565158575079, + "flos": 20997328752000.0, + "grad_norm": 1.5641126986775453, + "language_loss": 0.81119007, + "learning_rate": 2.678893759192982e-06, + "loss": 0.83226436, + "num_input_tokens_seen": 145680070, + "step": 6787, + "time_per_iteration": 4.097415208816528 + }, + { + "auxiliary_loss_clip": 0.01061342, + "auxiliary_loss_mlp": 0.01026329, + "balance_loss_clip": 1.02661383, + "balance_loss_mlp": 1.01419973, + "epoch": 0.40811663911017587, + "flos": 19317714837120.0, + "grad_norm": 1.6185067987105815, + "language_loss": 0.67387795, + "learning_rate": 2.678527408841255e-06, + "loss": 0.69475466, + "num_input_tokens_seen": 145698010, + "step": 6788, + "time_per_iteration": 4.1276633739471436 + }, + { + "auxiliary_loss_clip": 0.01043031, + "auxiliary_loss_mlp": 0.01041068, + "balance_loss_clip": 1.0224421, + "balance_loss_mlp": 1.02741277, + "epoch": 0.40817676236284384, + "flos": 40625382119040.0, + "grad_norm": 1.7934496990150102, + "language_loss": 0.65749645, + "learning_rate": 2.678161032759701e-06, + "loss": 0.67833745, + "num_input_tokens_seen": 145722215, + "step": 6789, + "time_per_iteration": 2.7649741172790527 + }, + { + "auxiliary_loss_clip": 0.01026421, + "auxiliary_loss_mlp": 0.01030821, + "balance_loss_clip": 1.02464461, + "balance_loss_mlp": 1.01830399, + "epoch": 0.4082368856155118, + "flos": 20522086882560.0, + "grad_norm": 2.0408185487134842, + "language_loss": 0.60486126, + "learning_rate": 2.6777946309622123e-06, + "loss": 0.62543368, + "num_input_tokens_seen": 145741090, + "step": 6790, + "time_per_iteration": 2.6657299995422363 + }, + { + "auxiliary_loss_clip": 0.01055301, + "auxiliary_loss_mlp": 0.01035338, + "balance_loss_clip": 1.02683496, + "balance_loss_mlp": 1.02199864, + "epoch": 0.40829700886817977, + "flos": 11427745098240.0, + "grad_norm": 3.953019904429752, + "language_loss": 0.69846034, + "learning_rate": 2.677428203462683e-06, + "loss": 0.71936673, + "num_input_tokens_seen": 145754985, + "step": 6791, + "time_per_iteration": 2.576195240020752 + }, + { + "auxiliary_loss_clip": 0.01001536, + "auxiliary_loss_mlp": 0.01006968, + "balance_loss_clip": 1.0026325, + "balance_loss_mlp": 1.00540662, + "epoch": 0.40835713212084773, + "flos": 67330677121920.0, + "grad_norm": 0.7970598173294049, + "language_loss": 0.59679008, + "learning_rate": 2.6770617502750093e-06, + "loss": 0.61687511, + "num_input_tokens_seen": 145815260, + "step": 6792, + "time_per_iteration": 3.1357383728027344 + }, + { + "auxiliary_loss_clip": 0.01080827, + "auxiliary_loss_mlp": 0.01037555, + "balance_loss_clip": 1.03111267, + "balance_loss_mlp": 1.02414942, + "epoch": 0.4084172553735157, + "flos": 21762010414080.0, + "grad_norm": 1.7063637106387815, + "language_loss": 0.80019534, + "learning_rate": 2.6766952714130857e-06, + "loss": 0.82137913, + "num_input_tokens_seen": 145832665, + "step": 6793, + "time_per_iteration": 2.638779640197754 + }, + { + "auxiliary_loss_clip": 0.01068356, + "auxiliary_loss_mlp": 0.01029789, + "balance_loss_clip": 1.03043234, + "balance_loss_mlp": 1.01684296, + "epoch": 0.40847737862618366, + "flos": 27417258478080.0, + "grad_norm": 1.720646346543944, + "language_loss": 0.84823763, + "learning_rate": 2.6763287668908094e-06, + "loss": 0.86921906, + "num_input_tokens_seen": 145850240, + "step": 6794, + "time_per_iteration": 2.684906005859375 + }, + { + "auxiliary_loss_clip": 0.01046735, + "auxiliary_loss_mlp": 0.01034621, + "balance_loss_clip": 1.02833974, + "balance_loss_mlp": 1.0218358, + "epoch": 0.4085375018788516, + "flos": 18587255857920.0, + "grad_norm": 1.5223140472968586, + "language_loss": 0.80200702, + "learning_rate": 2.6759622367220788e-06, + "loss": 0.82282054, + "num_input_tokens_seen": 145869545, + "step": 6795, + "time_per_iteration": 2.7837836742401123 + }, + { + "auxiliary_loss_clip": 0.01069907, + "auxiliary_loss_mlp": 0.01038676, + "balance_loss_clip": 1.02932608, + "balance_loss_mlp": 1.02472901, + "epoch": 0.4085976251315196, + "flos": 15411783029760.0, + "grad_norm": 2.5693763488729533, + "language_loss": 0.70108795, + "learning_rate": 2.675595680920792e-06, + "loss": 0.72217381, + "num_input_tokens_seen": 145884025, + "step": 6796, + "time_per_iteration": 4.185283422470093 + }, + { + "auxiliary_loss_clip": 0.01057466, + "auxiliary_loss_mlp": 0.00747755, + "balance_loss_clip": 1.02661681, + "balance_loss_mlp": 1.00024152, + "epoch": 0.40865774838418756, + "flos": 21252222639360.0, + "grad_norm": 1.699375553816643, + "language_loss": 0.78126884, + "learning_rate": 2.6752290995008498e-06, + "loss": 0.799321, + "num_input_tokens_seen": 145903210, + "step": 6797, + "time_per_iteration": 2.6416940689086914 + }, + { + "auxiliary_loss_clip": 0.01065603, + "auxiliary_loss_mlp": 0.01040057, + "balance_loss_clip": 1.02748859, + "balance_loss_mlp": 1.02741468, + "epoch": 0.4087178716368556, + "flos": 13772245714560.0, + "grad_norm": 1.8085841683318462, + "language_loss": 0.85632586, + "learning_rate": 2.6748624924761523e-06, + "loss": 0.87738246, + "num_input_tokens_seen": 145920985, + "step": 6798, + "time_per_iteration": 2.566796064376831 + }, + { + "auxiliary_loss_clip": 0.01072257, + "auxiliary_loss_mlp": 0.01030598, + "balance_loss_clip": 1.02748775, + "balance_loss_mlp": 1.01932704, + "epoch": 0.40877799488952354, + "flos": 23621752056960.0, + "grad_norm": 1.5539809029615073, + "language_loss": 0.84248662, + "learning_rate": 2.674495859860601e-06, + "loss": 0.86351526, + "num_input_tokens_seen": 145940350, + "step": 6799, + "time_per_iteration": 4.26988673210144 + }, + { + "auxiliary_loss_clip": 0.01030285, + "auxiliary_loss_mlp": 0.01043034, + "balance_loss_clip": 1.02447987, + "balance_loss_mlp": 1.02798939, + "epoch": 0.4088381181421915, + "flos": 20918791664640.0, + "grad_norm": 2.1876946602271072, + "language_loss": 0.83340877, + "learning_rate": 2.6741292016681e-06, + "loss": 0.85414195, + "num_input_tokens_seen": 145957460, + "step": 6800, + "time_per_iteration": 2.6448686122894287 + }, + { + "auxiliary_loss_clip": 0.01066709, + "auxiliary_loss_mlp": 0.01036026, + "balance_loss_clip": 1.02861965, + "balance_loss_mlp": 1.02283573, + "epoch": 0.4088982413948595, + "flos": 13297578462720.0, + "grad_norm": 1.9694718104850002, + "language_loss": 0.74643254, + "learning_rate": 2.6737625179125514e-06, + "loss": 0.76745993, + "num_input_tokens_seen": 145975285, + "step": 6801, + "time_per_iteration": 2.567237377166748 + }, + { + "auxiliary_loss_clip": 0.01066098, + "auxiliary_loss_mlp": 0.01033688, + "balance_loss_clip": 1.02787423, + "balance_loss_mlp": 1.02092099, + "epoch": 0.40895836464752744, + "flos": 15267673664640.0, + "grad_norm": 1.9508727409391635, + "language_loss": 0.8017835, + "learning_rate": 2.673395808607861e-06, + "loss": 0.82278132, + "num_input_tokens_seen": 145989150, + "step": 6802, + "time_per_iteration": 2.6981306076049805 + }, + { + "auxiliary_loss_clip": 0.01069103, + "auxiliary_loss_mlp": 0.01038607, + "balance_loss_clip": 1.03075254, + "balance_loss_mlp": 1.02362227, + "epoch": 0.4090184879001954, + "flos": 14501411804160.0, + "grad_norm": 2.083995759766081, + "language_loss": 0.75514627, + "learning_rate": 2.673029073767934e-06, + "loss": 0.77622336, + "num_input_tokens_seen": 146006980, + "step": 6803, + "time_per_iteration": 2.5542685985565186 + }, + { + "auxiliary_loss_clip": 0.01017303, + "auxiliary_loss_mlp": 0.00747703, + "balance_loss_clip": 1.02658606, + "balance_loss_mlp": 1.00021005, + "epoch": 0.40907861115286337, + "flos": 13881593692800.0, + "grad_norm": 1.6713962564325253, + "language_loss": 0.7897886, + "learning_rate": 2.6726623134066764e-06, + "loss": 0.80743873, + "num_input_tokens_seen": 146025125, + "step": 6804, + "time_per_iteration": 2.6661603450775146 + }, + { + "auxiliary_loss_clip": 0.01080388, + "auxiliary_loss_mlp": 0.01037689, + "balance_loss_clip": 1.0302012, + "balance_loss_mlp": 1.02506423, + "epoch": 0.40913873440553133, + "flos": 28037615293440.0, + "grad_norm": 2.3008469672380896, + "language_loss": 0.7488479, + "learning_rate": 2.672295527537998e-06, + "loss": 0.77002865, + "num_input_tokens_seen": 146044990, + "step": 6805, + "time_per_iteration": 2.6385915279388428 + }, + { + "auxiliary_loss_clip": 0.01030058, + "auxiliary_loss_mlp": 0.01040175, + "balance_loss_clip": 1.02585816, + "balance_loss_mlp": 1.02722287, + "epoch": 0.4091988576581993, + "flos": 21618188357760.0, + "grad_norm": 2.296742218922855, + "language_loss": 0.79387486, + "learning_rate": 2.671928716175804e-06, + "loss": 0.81457722, + "num_input_tokens_seen": 146066045, + "step": 6806, + "time_per_iteration": 2.822075366973877 + }, + { + "auxiliary_loss_clip": 0.01067692, + "auxiliary_loss_mlp": 0.01031597, + "balance_loss_clip": 1.02859342, + "balance_loss_mlp": 1.01873434, + "epoch": 0.40925898091086726, + "flos": 25224085860480.0, + "grad_norm": 1.7882283690695386, + "language_loss": 0.71788228, + "learning_rate": 2.671561879334007e-06, + "loss": 0.73887515, + "num_input_tokens_seen": 146086280, + "step": 6807, + "time_per_iteration": 2.770665407180786 + }, + { + "auxiliary_loss_clip": 0.00988627, + "auxiliary_loss_mlp": 0.0100262, + "balance_loss_clip": 1.00926924, + "balance_loss_mlp": 1.0007956, + "epoch": 0.40931910416353523, + "flos": 68930568800640.0, + "grad_norm": 0.8314593016650572, + "language_loss": 0.58810085, + "learning_rate": 2.6711950170265155e-06, + "loss": 0.60801327, + "num_input_tokens_seen": 146148840, + "step": 6808, + "time_per_iteration": 3.3852949142456055 + }, + { + "auxiliary_loss_clip": 0.01056836, + "auxiliary_loss_mlp": 0.01033403, + "balance_loss_clip": 1.02855301, + "balance_loss_mlp": 1.02167833, + "epoch": 0.4093792274162032, + "flos": 20189553747840.0, + "grad_norm": 1.636029645442891, + "language_loss": 0.54357541, + "learning_rate": 2.670828129267242e-06, + "loss": 0.56447774, + "num_input_tokens_seen": 146166195, + "step": 6809, + "time_per_iteration": 2.7704272270202637 + }, + { + "auxiliary_loss_clip": 0.01048945, + "auxiliary_loss_mlp": 0.01026749, + "balance_loss_clip": 1.02576804, + "balance_loss_mlp": 1.01472712, + "epoch": 0.40943935066887116, + "flos": 25228754628480.0, + "grad_norm": 1.7567371399192409, + "language_loss": 0.83132726, + "learning_rate": 2.6704612160700983e-06, + "loss": 0.85208422, + "num_input_tokens_seen": 146185045, + "step": 6810, + "time_per_iteration": 2.7139320373535156 + }, + { + "auxiliary_loss_clip": 0.01049479, + "auxiliary_loss_mlp": 0.0103906, + "balance_loss_clip": 1.02714694, + "balance_loss_mlp": 1.02387238, + "epoch": 0.4094994739215392, + "flos": 23255319461760.0, + "grad_norm": 3.778654432301892, + "language_loss": 0.77158368, + "learning_rate": 2.670094277448999e-06, + "loss": 0.79246902, + "num_input_tokens_seen": 146204655, + "step": 6811, + "time_per_iteration": 2.7976253032684326 + }, + { + "auxiliary_loss_clip": 0.0107683, + "auxiliary_loss_mlp": 0.01035822, + "balance_loss_clip": 1.02889466, + "balance_loss_mlp": 1.0222199, + "epoch": 0.40955959717420715, + "flos": 17382165540480.0, + "grad_norm": 1.5808656617874493, + "language_loss": 0.70183861, + "learning_rate": 2.669727313417857e-06, + "loss": 0.72296518, + "num_input_tokens_seen": 146222000, + "step": 6812, + "time_per_iteration": 2.6449570655822754 + }, + { + "auxiliary_loss_clip": 0.0107538, + "auxiliary_loss_mlp": 0.01039284, + "balance_loss_clip": 1.02897239, + "balance_loss_mlp": 1.02586746, + "epoch": 0.4096197204268751, + "flos": 25082418620160.0, + "grad_norm": 1.7454591219703082, + "language_loss": 0.66488469, + "learning_rate": 2.6693603239905872e-06, + "loss": 0.68603134, + "num_input_tokens_seen": 146242630, + "step": 6813, + "time_per_iteration": 2.7296364307403564 + }, + { + "auxiliary_loss_clip": 0.01055336, + "auxiliary_loss_mlp": 0.00747677, + "balance_loss_clip": 1.02640557, + "balance_loss_mlp": 1.00020576, + "epoch": 0.4096798436795431, + "flos": 30586769648640.0, + "grad_norm": 3.7447848859640804, + "language_loss": 0.73929757, + "learning_rate": 2.6689933091811087e-06, + "loss": 0.75732768, + "num_input_tokens_seen": 146263070, + "step": 6814, + "time_per_iteration": 2.8164191246032715 + }, + { + "auxiliary_loss_clip": 0.01031418, + "auxiliary_loss_mlp": 0.01032813, + "balance_loss_clip": 1.02569437, + "balance_loss_mlp": 1.01927662, + "epoch": 0.40973996693221104, + "flos": 24133622820480.0, + "grad_norm": 1.9317783409000169, + "language_loss": 0.65889943, + "learning_rate": 2.6686262690033357e-06, + "loss": 0.67954177, + "num_input_tokens_seen": 146282890, + "step": 6815, + "time_per_iteration": 2.8738763332366943 + }, + { + "auxiliary_loss_clip": 0.01064592, + "auxiliary_loss_mlp": 0.01036897, + "balance_loss_clip": 1.03028548, + "balance_loss_mlp": 1.02436781, + "epoch": 0.409800090184879, + "flos": 23988974751360.0, + "grad_norm": 2.3091823918298084, + "language_loss": 0.77127337, + "learning_rate": 2.668259203471188e-06, + "loss": 0.79228824, + "num_input_tokens_seen": 146301755, + "step": 6816, + "time_per_iteration": 2.6280436515808105 + }, + { + "auxiliary_loss_clip": 0.01059308, + "auxiliary_loss_mlp": 0.01037285, + "balance_loss_clip": 1.03052115, + "balance_loss_mlp": 1.02426112, + "epoch": 0.40986021343754697, + "flos": 16143678552960.0, + "grad_norm": 2.105638819938238, + "language_loss": 0.81496447, + "learning_rate": 2.6678921125985843e-06, + "loss": 0.83593035, + "num_input_tokens_seen": 146316835, + "step": 6817, + "time_per_iteration": 2.6933376789093018 + }, + { + "auxiliary_loss_clip": 0.01053223, + "auxiliary_loss_mlp": 0.01038194, + "balance_loss_clip": 1.02649426, + "balance_loss_mlp": 1.0240078, + "epoch": 0.40992033669021494, + "flos": 24790824011520.0, + "grad_norm": 1.8238557182958242, + "language_loss": 0.79955488, + "learning_rate": 2.667524996399444e-06, + "loss": 0.82046896, + "num_input_tokens_seen": 146336650, + "step": 6818, + "time_per_iteration": 2.664473056793213 + }, + { + "auxiliary_loss_clip": 0.01052313, + "auxiliary_loss_mlp": 0.0103581, + "balance_loss_clip": 1.03278744, + "balance_loss_mlp": 1.02372766, + "epoch": 0.4099804599428829, + "flos": 29641888431360.0, + "grad_norm": 1.7285127207818236, + "language_loss": 0.66139591, + "learning_rate": 2.66715785488769e-06, + "loss": 0.68227714, + "num_input_tokens_seen": 146357640, + "step": 6819, + "time_per_iteration": 2.851916551589966 + }, + { + "auxiliary_loss_clip": 0.01055346, + "auxiliary_loss_mlp": 0.01039714, + "balance_loss_clip": 1.02937305, + "balance_loss_mlp": 1.02470553, + "epoch": 0.41004058319555087, + "flos": 24826590979200.0, + "grad_norm": 2.093318470285641, + "language_loss": 0.8523699, + "learning_rate": 2.6667906880772428e-06, + "loss": 0.87332058, + "num_input_tokens_seen": 146379325, + "step": 6820, + "time_per_iteration": 2.7894744873046875 + }, + { + "auxiliary_loss_clip": 0.01065209, + "auxiliary_loss_mlp": 0.01029543, + "balance_loss_clip": 1.0290904, + "balance_loss_mlp": 1.01676416, + "epoch": 0.41010070644821883, + "flos": 25737464995200.0, + "grad_norm": 1.652965949574804, + "language_loss": 0.71279007, + "learning_rate": 2.6664234959820256e-06, + "loss": 0.73373759, + "num_input_tokens_seen": 146398635, + "step": 6821, + "time_per_iteration": 2.629281520843506 + }, + { + "auxiliary_loss_clip": 0.0106642, + "auxiliary_loss_mlp": 0.01033923, + "balance_loss_clip": 1.0298245, + "balance_loss_mlp": 1.02119172, + "epoch": 0.4101608297008868, + "flos": 22346061557760.0, + "grad_norm": 1.970027605467526, + "language_loss": 0.74321568, + "learning_rate": 2.6660562786159634e-06, + "loss": 0.76421916, + "num_input_tokens_seen": 146417585, + "step": 6822, + "time_per_iteration": 2.622727870941162 + }, + { + "auxiliary_loss_clip": 0.01055385, + "auxiliary_loss_mlp": 0.01031142, + "balance_loss_clip": 1.02760041, + "balance_loss_mlp": 1.01856506, + "epoch": 0.41022095295355476, + "flos": 21945083057280.0, + "grad_norm": 2.153381945369247, + "language_loss": 0.75248218, + "learning_rate": 2.6656890359929796e-06, + "loss": 0.7733475, + "num_input_tokens_seen": 146437035, + "step": 6823, + "time_per_iteration": 2.691349983215332 + }, + { + "auxiliary_loss_clip": 0.01040867, + "auxiliary_loss_mlp": 0.01038083, + "balance_loss_clip": 1.03154898, + "balance_loss_mlp": 1.0239743, + "epoch": 0.4102810762062228, + "flos": 27450511493760.0, + "grad_norm": 1.8420814905546286, + "language_loss": 0.73445356, + "learning_rate": 2.665321768127001e-06, + "loss": 0.75524306, + "num_input_tokens_seen": 146457370, + "step": 6824, + "time_per_iteration": 2.8116495609283447 + }, + { + "auxiliary_loss_clip": 0.01050217, + "auxiliary_loss_mlp": 0.01033413, + "balance_loss_clip": 1.02967286, + "balance_loss_mlp": 1.02008533, + "epoch": 0.41034119945889075, + "flos": 24499265316480.0, + "grad_norm": 1.8472158215392287, + "language_loss": 0.72120714, + "learning_rate": 2.6649544750319548e-06, + "loss": 0.7420435, + "num_input_tokens_seen": 146478105, + "step": 6825, + "time_per_iteration": 2.6767497062683105 + }, + { + "auxiliary_loss_clip": 0.01037207, + "auxiliary_loss_mlp": 0.01038353, + "balance_loss_clip": 1.02557838, + "balance_loss_mlp": 1.02611613, + "epoch": 0.4104013227115587, + "flos": 24352641999360.0, + "grad_norm": 1.9618759567427084, + "language_loss": 0.84869605, + "learning_rate": 2.664587156721768e-06, + "loss": 0.86945164, + "num_input_tokens_seen": 146497835, + "step": 6826, + "time_per_iteration": 2.6579203605651855 + }, + { + "auxiliary_loss_clip": 0.01053308, + "auxiliary_loss_mlp": 0.00747578, + "balance_loss_clip": 1.02785671, + "balance_loss_mlp": 1.00014269, + "epoch": 0.4104614459642267, + "flos": 23729340268800.0, + "grad_norm": 1.843350924322446, + "language_loss": 0.66881096, + "learning_rate": 2.6642198132103696e-06, + "loss": 0.68681985, + "num_input_tokens_seen": 146517735, + "step": 6827, + "time_per_iteration": 2.639582872390747 + }, + { + "auxiliary_loss_clip": 0.01053878, + "auxiliary_loss_mlp": 0.01031826, + "balance_loss_clip": 1.02755582, + "balance_loss_mlp": 1.01987565, + "epoch": 0.41052156921689464, + "flos": 22127976132480.0, + "grad_norm": 1.5204438893766783, + "language_loss": 0.71924657, + "learning_rate": 2.663852444511689e-06, + "loss": 0.7401036, + "num_input_tokens_seen": 146537640, + "step": 6828, + "time_per_iteration": 2.6243696212768555 + }, + { + "auxiliary_loss_clip": 0.01047672, + "auxiliary_loss_mlp": 0.0104, + "balance_loss_clip": 1.02662587, + "balance_loss_mlp": 1.02532542, + "epoch": 0.4105816924695626, + "flos": 20084371747200.0, + "grad_norm": 2.0022021717521237, + "language_loss": 0.8352567, + "learning_rate": 2.6634850506396574e-06, + "loss": 0.85613346, + "num_input_tokens_seen": 146554695, + "step": 6829, + "time_per_iteration": 2.676046133041382 + }, + { + "auxiliary_loss_clip": 0.01063149, + "auxiliary_loss_mlp": 0.01031471, + "balance_loss_clip": 1.02833581, + "balance_loss_mlp": 1.01943111, + "epoch": 0.4106418157222306, + "flos": 18076785724800.0, + "grad_norm": 1.4170737640463071, + "language_loss": 0.89681786, + "learning_rate": 2.663117631608206e-06, + "loss": 0.91776413, + "num_input_tokens_seen": 146573740, + "step": 6830, + "time_per_iteration": 2.573052167892456 + }, + { + "auxiliary_loss_clip": 0.01036697, + "auxiliary_loss_mlp": 0.01029729, + "balance_loss_clip": 1.02722549, + "balance_loss_mlp": 1.01697338, + "epoch": 0.41070193897489854, + "flos": 21647850013440.0, + "grad_norm": 1.7153946051080216, + "language_loss": 0.65221131, + "learning_rate": 2.662750187431268e-06, + "loss": 0.67287558, + "num_input_tokens_seen": 146592885, + "step": 6831, + "time_per_iteration": 2.6568655967712402 + }, + { + "auxiliary_loss_clip": 0.01074266, + "auxiliary_loss_mlp": 0.01034489, + "balance_loss_clip": 1.02838945, + "balance_loss_mlp": 1.02226448, + "epoch": 0.4107620622275665, + "flos": 26648195356800.0, + "grad_norm": 1.9218102632874772, + "language_loss": 0.69227934, + "learning_rate": 2.662382718122776e-06, + "loss": 0.71336687, + "num_input_tokens_seen": 146611995, + "step": 6832, + "time_per_iteration": 2.6558351516723633 + }, + { + "auxiliary_loss_clip": 0.01030949, + "auxiliary_loss_mlp": 0.01031111, + "balance_loss_clip": 1.02723527, + "balance_loss_mlp": 1.01892793, + "epoch": 0.41082218548023447, + "flos": 18734310138240.0, + "grad_norm": 2.0732682589178637, + "language_loss": 0.73472697, + "learning_rate": 2.662015223696666e-06, + "loss": 0.75534755, + "num_input_tokens_seen": 146628045, + "step": 6833, + "time_per_iteration": 2.718132495880127 + }, + { + "auxiliary_loss_clip": 0.01020149, + "auxiliary_loss_mlp": 0.0103288, + "balance_loss_clip": 1.02286613, + "balance_loss_mlp": 1.01836061, + "epoch": 0.41088230873290243, + "flos": 22893771116160.0, + "grad_norm": 1.7527005983937853, + "language_loss": 0.72804409, + "learning_rate": 2.6616477041668713e-06, + "loss": 0.74857438, + "num_input_tokens_seen": 146648355, + "step": 6834, + "time_per_iteration": 2.710798978805542 + }, + { + "auxiliary_loss_clip": 0.01066991, + "auxiliary_loss_mlp": 0.01045331, + "balance_loss_clip": 1.02844572, + "balance_loss_mlp": 1.03221798, + "epoch": 0.4109424319855704, + "flos": 24276978000000.0, + "grad_norm": 2.2282297913442135, + "language_loss": 0.71064818, + "learning_rate": 2.661280159547329e-06, + "loss": 0.73177135, + "num_input_tokens_seen": 146668370, + "step": 6835, + "time_per_iteration": 4.213528394699097 + }, + { + "auxiliary_loss_clip": 0.01064535, + "auxiliary_loss_mlp": 0.01035632, + "balance_loss_clip": 1.02738881, + "balance_loss_mlp": 1.02129149, + "epoch": 0.41100255523823837, + "flos": 12969139478400.0, + "grad_norm": 2.9028350082573913, + "language_loss": 0.86883235, + "learning_rate": 2.660912589851978e-06, + "loss": 0.88983405, + "num_input_tokens_seen": 146686665, + "step": 6836, + "time_per_iteration": 2.5659775733947754 + }, + { + "auxiliary_loss_clip": 0.01063281, + "auxiliary_loss_mlp": 0.01033639, + "balance_loss_clip": 1.02807748, + "balance_loss_mlp": 1.02102101, + "epoch": 0.4110626784909064, + "flos": 23145648261120.0, + "grad_norm": 2.061014976024437, + "language_loss": 0.68641663, + "learning_rate": 2.6605449950947547e-06, + "loss": 0.70738578, + "num_input_tokens_seen": 146706570, + "step": 6837, + "time_per_iteration": 2.619462013244629 + }, + { + "auxiliary_loss_clip": 0.01077697, + "auxiliary_loss_mlp": 0.01036779, + "balance_loss_clip": 1.02939105, + "balance_loss_mlp": 1.02363038, + "epoch": 0.41112280174357435, + "flos": 22747399194240.0, + "grad_norm": 2.960897645352544, + "language_loss": 0.7512393, + "learning_rate": 2.660177375289599e-06, + "loss": 0.77238405, + "num_input_tokens_seen": 146723425, + "step": 6838, + "time_per_iteration": 2.543489694595337 + }, + { + "auxiliary_loss_clip": 0.0104691, + "auxiliary_loss_mlp": 0.01035965, + "balance_loss_clip": 1.03038049, + "balance_loss_mlp": 1.02209473, + "epoch": 0.4111829249962423, + "flos": 21102403011840.0, + "grad_norm": 1.7567423392112314, + "language_loss": 0.82285208, + "learning_rate": 2.659809730450451e-06, + "loss": 0.84368086, + "num_input_tokens_seen": 146741640, + "step": 6839, + "time_per_iteration": 2.6977052688598633 + }, + { + "auxiliary_loss_clip": 0.01072607, + "auxiliary_loss_mlp": 0.0103197, + "balance_loss_clip": 1.02667069, + "balance_loss_mlp": 1.01932204, + "epoch": 0.4112430482489103, + "flos": 21505787723520.0, + "grad_norm": 1.8229510724228544, + "language_loss": 0.80266124, + "learning_rate": 2.6594420605912523e-06, + "loss": 0.82370698, + "num_input_tokens_seen": 146759195, + "step": 6840, + "time_per_iteration": 2.567166328430176 + }, + { + "auxiliary_loss_clip": 0.0106071, + "auxiliary_loss_mlp": 0.01029726, + "balance_loss_clip": 1.02636838, + "balance_loss_mlp": 1.01795959, + "epoch": 0.41130317150157825, + "flos": 19570022945280.0, + "grad_norm": 2.022256953177167, + "language_loss": 0.67973506, + "learning_rate": 2.6590743657259442e-06, + "loss": 0.70063937, + "num_input_tokens_seen": 146774990, + "step": 6841, + "time_per_iteration": 2.6389122009277344 + }, + { + "auxiliary_loss_clip": 0.01001803, + "auxiliary_loss_mlp": 0.01005016, + "balance_loss_clip": 1.0028342, + "balance_loss_mlp": 1.00340641, + "epoch": 0.4113632947542462, + "flos": 62383157706240.0, + "grad_norm": 0.7590831734730723, + "language_loss": 0.59658688, + "learning_rate": 2.65870664586847e-06, + "loss": 0.61665505, + "num_input_tokens_seen": 146839610, + "step": 6842, + "time_per_iteration": 3.2583441734313965 + }, + { + "auxiliary_loss_clip": 0.01062342, + "auxiliary_loss_mlp": 0.01035324, + "balance_loss_clip": 1.02825236, + "balance_loss_mlp": 1.02355158, + "epoch": 0.4114234180069142, + "flos": 13918617636480.0, + "grad_norm": 2.080811195789771, + "language_loss": 0.69239265, + "learning_rate": 2.6583389010327742e-06, + "loss": 0.71336931, + "num_input_tokens_seen": 146857360, + "step": 6843, + "time_per_iteration": 4.174192905426025 + }, + { + "auxiliary_loss_clip": 0.00990937, + "auxiliary_loss_mlp": 0.01004004, + "balance_loss_clip": 1.01148391, + "balance_loss_mlp": 1.00241876, + "epoch": 0.41148354125958214, + "flos": 64928505219840.0, + "grad_norm": 0.7447431932514355, + "language_loss": 0.53621578, + "learning_rate": 2.6579711312328013e-06, + "loss": 0.55616522, + "num_input_tokens_seen": 146917055, + "step": 6844, + "time_per_iteration": 3.2060186862945557 + }, + { + "auxiliary_loss_clip": 0.01061337, + "auxiliary_loss_mlp": 0.01031379, + "balance_loss_clip": 1.02683091, + "balance_loss_mlp": 1.01969671, + "epoch": 0.4115436645122501, + "flos": 18728779443840.0, + "grad_norm": 1.9894016282009155, + "language_loss": 0.66109753, + "learning_rate": 2.6576033364824967e-06, + "loss": 0.68202472, + "num_input_tokens_seen": 146935215, + "step": 6845, + "time_per_iteration": 2.5321831703186035 + }, + { + "auxiliary_loss_clip": 0.01073025, + "auxiliary_loss_mlp": 0.01030582, + "balance_loss_clip": 1.02940893, + "balance_loss_mlp": 1.01891744, + "epoch": 0.41160378776491807, + "flos": 16252918790400.0, + "grad_norm": 1.865288531271581, + "language_loss": 0.70325541, + "learning_rate": 2.657235516795808e-06, + "loss": 0.7242915, + "num_input_tokens_seen": 146951970, + "step": 6846, + "time_per_iteration": 4.127108573913574 + }, + { + "auxiliary_loss_clip": 0.01048477, + "auxiliary_loss_mlp": 0.01032152, + "balance_loss_clip": 1.02643144, + "balance_loss_mlp": 1.01995754, + "epoch": 0.41166391101758604, + "flos": 27970031854080.0, + "grad_norm": 1.5120826073152152, + "language_loss": 0.65130216, + "learning_rate": 2.6568676721866826e-06, + "loss": 0.67210841, + "num_input_tokens_seen": 146975615, + "step": 6847, + "time_per_iteration": 2.698140859603882 + }, + { + "auxiliary_loss_clip": 0.01050199, + "auxiliary_loss_mlp": 0.01038611, + "balance_loss_clip": 1.0264461, + "balance_loss_mlp": 1.02624869, + "epoch": 0.411724034270254, + "flos": 34131296764800.0, + "grad_norm": 1.43098657350057, + "language_loss": 0.70084041, + "learning_rate": 2.656499802669069e-06, + "loss": 0.7217285, + "num_input_tokens_seen": 146998855, + "step": 6848, + "time_per_iteration": 2.7769289016723633 + }, + { + "auxiliary_loss_clip": 0.0099357, + "auxiliary_loss_mlp": 0.00746073, + "balance_loss_clip": 1.00508809, + "balance_loss_mlp": 0.99976462, + "epoch": 0.41178415752292197, + "flos": 67923670752000.0, + "grad_norm": 0.8830147471498827, + "language_loss": 0.56234044, + "learning_rate": 2.6561319082569174e-06, + "loss": 0.57973683, + "num_input_tokens_seen": 147062710, + "step": 6849, + "time_per_iteration": 3.2460639476776123 + }, + { + "auxiliary_loss_clip": 0.0105143, + "auxiliary_loss_mlp": 0.0103534, + "balance_loss_clip": 1.0271641, + "balance_loss_mlp": 1.0229007, + "epoch": 0.41184428077558993, + "flos": 34313938444800.0, + "grad_norm": 1.5209306783473289, + "language_loss": 0.76052374, + "learning_rate": 2.6557639889641783e-06, + "loss": 0.78139144, + "num_input_tokens_seen": 147086075, + "step": 6850, + "time_per_iteration": 2.73081111907959 + }, + { + "auxiliary_loss_clip": 0.01028814, + "auxiliary_loss_mlp": 0.01030164, + "balance_loss_clip": 1.02503264, + "balance_loss_mlp": 1.01855302, + "epoch": 0.41190440402825795, + "flos": 35444118948480.0, + "grad_norm": 1.609306684420108, + "language_loss": 0.68161154, + "learning_rate": 2.6553960448048025e-06, + "loss": 0.70220131, + "num_input_tokens_seen": 147107590, + "step": 6851, + "time_per_iteration": 2.808781862258911 + }, + { + "auxiliary_loss_clip": 0.01048801, + "auxiliary_loss_mlp": 0.0103949, + "balance_loss_clip": 1.03047502, + "balance_loss_mlp": 1.0248158, + "epoch": 0.4119645272809259, + "flos": 20849879422080.0, + "grad_norm": 2.007141570961754, + "language_loss": 0.79348779, + "learning_rate": 2.655028075792743e-06, + "loss": 0.81437075, + "num_input_tokens_seen": 147123715, + "step": 6852, + "time_per_iteration": 2.7927749156951904 + }, + { + "auxiliary_loss_clip": 0.01077963, + "auxiliary_loss_mlp": 0.01033436, + "balance_loss_clip": 1.02942157, + "balance_loss_mlp": 1.01968479, + "epoch": 0.4120246505335939, + "flos": 27562050201600.0, + "grad_norm": 2.1186059411964617, + "language_loss": 0.77801043, + "learning_rate": 2.6546600819419537e-06, + "loss": 0.79912436, + "num_input_tokens_seen": 147144290, + "step": 6853, + "time_per_iteration": 2.5747551918029785 + }, + { + "auxiliary_loss_clip": 0.01064413, + "auxiliary_loss_mlp": 0.01040044, + "balance_loss_clip": 1.02775478, + "balance_loss_mlp": 1.02665734, + "epoch": 0.41208477378626185, + "flos": 37815444046080.0, + "grad_norm": 1.7017802223738787, + "language_loss": 0.65934753, + "learning_rate": 2.6542920632663883e-06, + "loss": 0.68039215, + "num_input_tokens_seen": 147166340, + "step": 6854, + "time_per_iteration": 2.7368807792663574 + }, + { + "auxiliary_loss_clip": 0.01045521, + "auxiliary_loss_mlp": 0.01038896, + "balance_loss_clip": 1.02543879, + "balance_loss_mlp": 1.02586043, + "epoch": 0.4121448970389298, + "flos": 23440762402560.0, + "grad_norm": 1.7720324987147105, + "language_loss": 0.8378281, + "learning_rate": 2.6539240197800023e-06, + "loss": 0.85867226, + "num_input_tokens_seen": 147184025, + "step": 6855, + "time_per_iteration": 2.6088147163391113 + }, + { + "auxiliary_loss_clip": 0.0106224, + "auxiliary_loss_mlp": 0.01039117, + "balance_loss_clip": 1.02766562, + "balance_loss_mlp": 1.0270772, + "epoch": 0.4122050202915978, + "flos": 21325300859520.0, + "grad_norm": 1.5925426957719586, + "language_loss": 0.78876042, + "learning_rate": 2.6535559514967517e-06, + "loss": 0.80977398, + "num_input_tokens_seen": 147202730, + "step": 6856, + "time_per_iteration": 2.6546332836151123 + }, + { + "auxiliary_loss_clip": 0.01040556, + "auxiliary_loss_mlp": 0.01034372, + "balance_loss_clip": 1.02753925, + "balance_loss_mlp": 1.02158689, + "epoch": 0.41226514354426574, + "flos": 17306286059520.0, + "grad_norm": 2.368689508855021, + "language_loss": 0.80272841, + "learning_rate": 2.6531878584305935e-06, + "loss": 0.82347769, + "num_input_tokens_seen": 147215315, + "step": 6857, + "time_per_iteration": 2.6358914375305176 + }, + { + "auxiliary_loss_clip": 0.01064399, + "auxiliary_loss_mlp": 0.00747693, + "balance_loss_clip": 1.02705717, + "balance_loss_mlp": 1.00023937, + "epoch": 0.4123252667969337, + "flos": 17638855107840.0, + "grad_norm": 3.5184454301284296, + "language_loss": 0.70465946, + "learning_rate": 2.6528197405954873e-06, + "loss": 0.72278041, + "num_input_tokens_seen": 147233330, + "step": 6858, + "time_per_iteration": 2.7027175426483154 + }, + { + "auxiliary_loss_clip": 0.01063081, + "auxiliary_loss_mlp": 0.01040099, + "balance_loss_clip": 1.02765536, + "balance_loss_mlp": 1.02698588, + "epoch": 0.4123853900496017, + "flos": 46424811375360.0, + "grad_norm": 1.5001561232979745, + "language_loss": 0.59347522, + "learning_rate": 2.652451598005391e-06, + "loss": 0.61450708, + "num_input_tokens_seen": 147257780, + "step": 6859, + "time_per_iteration": 2.7997004985809326 + }, + { + "auxiliary_loss_clip": 0.01074092, + "auxiliary_loss_mlp": 0.01033971, + "balance_loss_clip": 1.02647424, + "balance_loss_mlp": 1.02110863, + "epoch": 0.41244551330226964, + "flos": 17675160779520.0, + "grad_norm": 2.2245586707113345, + "language_loss": 0.734568, + "learning_rate": 2.652083430674264e-06, + "loss": 0.75564855, + "num_input_tokens_seen": 147276055, + "step": 6860, + "time_per_iteration": 2.5370118618011475 + }, + { + "auxiliary_loss_clip": 0.01009034, + "auxiliary_loss_mlp": 0.01032296, + "balance_loss_clip": 1.02994871, + "balance_loss_mlp": 1.0198741, + "epoch": 0.4125056365549376, + "flos": 18693730748160.0, + "grad_norm": 1.6122975118368554, + "language_loss": 0.7424742, + "learning_rate": 2.651715238616068e-06, + "loss": 0.76288748, + "num_input_tokens_seen": 147293200, + "step": 6861, + "time_per_iteration": 2.8164548873901367 + }, + { + "auxiliary_loss_clip": 0.01051925, + "auxiliary_loss_mlp": 0.01031897, + "balance_loss_clip": 1.0278753, + "balance_loss_mlp": 1.02032197, + "epoch": 0.41256575980760557, + "flos": 17895293280000.0, + "grad_norm": 2.0348942679210973, + "language_loss": 0.79771984, + "learning_rate": 2.651347021844765e-06, + "loss": 0.81855804, + "num_input_tokens_seen": 147310640, + "step": 6862, + "time_per_iteration": 2.602332592010498 + }, + { + "auxiliary_loss_clip": 0.01042438, + "auxiliary_loss_mlp": 0.01031505, + "balance_loss_clip": 1.02473712, + "balance_loss_mlp": 1.01954198, + "epoch": 0.41262588306027354, + "flos": 21981316901760.0, + "grad_norm": 1.7432043514173194, + "language_loss": 0.75683635, + "learning_rate": 2.650978780374318e-06, + "loss": 0.77757573, + "num_input_tokens_seen": 147329435, + "step": 6863, + "time_per_iteration": 2.6448957920074463 + }, + { + "auxiliary_loss_clip": 0.01000013, + "auxiliary_loss_mlp": 0.0101776, + "balance_loss_clip": 1.00144386, + "balance_loss_mlp": 1.01616275, + "epoch": 0.41268600631294156, + "flos": 53350006740480.0, + "grad_norm": 0.7063155466739196, + "language_loss": 0.52752578, + "learning_rate": 2.650610514218691e-06, + "loss": 0.5477035, + "num_input_tokens_seen": 147385805, + "step": 6864, + "time_per_iteration": 3.095719575881958 + }, + { + "auxiliary_loss_clip": 0.01077072, + "auxiliary_loss_mlp": 0.01033312, + "balance_loss_clip": 1.02831483, + "balance_loss_mlp": 1.02009177, + "epoch": 0.4127461295656095, + "flos": 24385356311040.0, + "grad_norm": 1.7188689818547567, + "language_loss": 0.72140229, + "learning_rate": 2.6502422233918468e-06, + "loss": 0.74250609, + "num_input_tokens_seen": 147405160, + "step": 6865, + "time_per_iteration": 2.6253178119659424 + }, + { + "auxiliary_loss_clip": 0.01010353, + "auxiliary_loss_mlp": 0.0100295, + "balance_loss_clip": 1.00155282, + "balance_loss_mlp": 1.00142992, + "epoch": 0.4128062528182775, + "flos": 71705242696320.0, + "grad_norm": 0.928706381737734, + "language_loss": 0.66613525, + "learning_rate": 2.649873907907753e-06, + "loss": 0.68626827, + "num_input_tokens_seen": 147460245, + "step": 6866, + "time_per_iteration": 2.9888458251953125 + }, + { + "auxiliary_loss_clip": 0.01071921, + "auxiliary_loss_mlp": 0.01031146, + "balance_loss_clip": 1.02566266, + "balance_loss_mlp": 1.01892734, + "epoch": 0.41286637607094545, + "flos": 17849111368320.0, + "grad_norm": 2.0013573696454015, + "language_loss": 0.81324112, + "learning_rate": 2.649505567780375e-06, + "loss": 0.83427179, + "num_input_tokens_seen": 147476200, + "step": 6867, + "time_per_iteration": 2.5036866664886475 + }, + { + "auxiliary_loss_clip": 0.01056576, + "auxiliary_loss_mlp": 0.01032147, + "balance_loss_clip": 1.0283339, + "balance_loss_mlp": 1.01985073, + "epoch": 0.4129264993236134, + "flos": 25549544016000.0, + "grad_norm": 2.5594363095956836, + "language_loss": 0.77770317, + "learning_rate": 2.6491372030236815e-06, + "loss": 0.79859042, + "num_input_tokens_seen": 147494315, + "step": 6868, + "time_per_iteration": 2.6261963844299316 + }, + { + "auxiliary_loss_clip": 0.01001219, + "auxiliary_loss_mlp": 0.01004573, + "balance_loss_clip": 1.00248408, + "balance_loss_mlp": 1.00289249, + "epoch": 0.4129866225762814, + "flos": 65414446364160.0, + "grad_norm": 0.8396494316858611, + "language_loss": 0.57799411, + "learning_rate": 2.64876881365164e-06, + "loss": 0.59805202, + "num_input_tokens_seen": 147543665, + "step": 6869, + "time_per_iteration": 2.894294500350952 + }, + { + "auxiliary_loss_clip": 0.01062006, + "auxiliary_loss_mlp": 0.01030865, + "balance_loss_clip": 1.02829409, + "balance_loss_mlp": 1.01842558, + "epoch": 0.41304674582894935, + "flos": 28876991287680.0, + "grad_norm": 1.6387430087532031, + "language_loss": 0.75247115, + "learning_rate": 2.64840039967822e-06, + "loss": 0.77339989, + "num_input_tokens_seen": 147564870, + "step": 6870, + "time_per_iteration": 2.734445810317993 + }, + { + "auxiliary_loss_clip": 0.01047559, + "auxiliary_loss_mlp": 0.01033782, + "balance_loss_clip": 1.03008437, + "balance_loss_mlp": 1.02083576, + "epoch": 0.4131068690816173, + "flos": 22891975436160.0, + "grad_norm": 1.612388728726288, + "language_loss": 0.83389139, + "learning_rate": 2.6480319611173912e-06, + "loss": 0.8547048, + "num_input_tokens_seen": 147584840, + "step": 6871, + "time_per_iteration": 2.712310552597046 + }, + { + "auxiliary_loss_clip": 0.01048227, + "auxiliary_loss_mlp": 0.01038728, + "balance_loss_clip": 1.0306623, + "balance_loss_mlp": 1.02607965, + "epoch": 0.4131669923342853, + "flos": 26065185707520.0, + "grad_norm": 2.5661858593116005, + "language_loss": 0.68252724, + "learning_rate": 2.6476634979831263e-06, + "loss": 0.7033968, + "num_input_tokens_seen": 147604635, + "step": 6872, + "time_per_iteration": 2.66509747505188 + }, + { + "auxiliary_loss_clip": 0.01061011, + "auxiliary_loss_mlp": 0.01032976, + "balance_loss_clip": 1.03229356, + "balance_loss_mlp": 1.02041733, + "epoch": 0.41322711558695324, + "flos": 19244564789760.0, + "grad_norm": 1.8135486873467943, + "language_loss": 0.76073855, + "learning_rate": 2.6472950102893964e-06, + "loss": 0.78167832, + "num_input_tokens_seen": 147620700, + "step": 6873, + "time_per_iteration": 2.6136205196380615 + }, + { + "auxiliary_loss_clip": 0.01047666, + "auxiliary_loss_mlp": 0.01035575, + "balance_loss_clip": 1.02664089, + "balance_loss_mlp": 1.02252781, + "epoch": 0.4132872388396212, + "flos": 22674464628480.0, + "grad_norm": 2.1106947736406267, + "language_loss": 0.83332062, + "learning_rate": 2.6469264980501746e-06, + "loss": 0.85415304, + "num_input_tokens_seen": 147639490, + "step": 6874, + "time_per_iteration": 2.6990089416503906 + }, + { + "auxiliary_loss_clip": 0.0104326, + "auxiliary_loss_mlp": 0.01033345, + "balance_loss_clip": 1.02641511, + "balance_loss_mlp": 1.02028608, + "epoch": 0.4133473620922892, + "flos": 20150195420160.0, + "grad_norm": 1.8057007551336621, + "language_loss": 0.71352738, + "learning_rate": 2.646557961279436e-06, + "loss": 0.73429346, + "num_input_tokens_seen": 147657205, + "step": 6875, + "time_per_iteration": 2.7472786903381348 + }, + { + "auxiliary_loss_clip": 0.01043797, + "auxiliary_loss_mlp": 0.01033178, + "balance_loss_clip": 1.0272646, + "balance_loss_mlp": 1.02100635, + "epoch": 0.41340748534495714, + "flos": 24242755317120.0, + "grad_norm": 1.5567979870235058, + "language_loss": 0.82555425, + "learning_rate": 2.646189399991154e-06, + "loss": 0.84632397, + "num_input_tokens_seen": 147677005, + "step": 6876, + "time_per_iteration": 2.680429458618164 + }, + { + "auxiliary_loss_clip": 0.01068777, + "auxiliary_loss_mlp": 0.01037956, + "balance_loss_clip": 1.02981544, + "balance_loss_mlp": 1.02403224, + "epoch": 0.41346760859762516, + "flos": 14392171566720.0, + "grad_norm": 2.2373589153947924, + "language_loss": 0.65665901, + "learning_rate": 2.6458208141993048e-06, + "loss": 0.67772639, + "num_input_tokens_seen": 147693435, + "step": 6877, + "time_per_iteration": 2.6509499549865723 + }, + { + "auxiliary_loss_clip": 0.01063043, + "auxiliary_loss_mlp": 0.01029355, + "balance_loss_clip": 1.02769232, + "balance_loss_mlp": 1.01714826, + "epoch": 0.4135277318502931, + "flos": 22492002516480.0, + "grad_norm": 2.1722927796996334, + "language_loss": 0.76522171, + "learning_rate": 2.6454522039178668e-06, + "loss": 0.78614575, + "num_input_tokens_seen": 147714000, + "step": 6878, + "time_per_iteration": 2.6163692474365234 + }, + { + "auxiliary_loss_clip": 0.01061437, + "auxiliary_loss_mlp": 0.00747516, + "balance_loss_clip": 1.02676094, + "balance_loss_mlp": 1.00020027, + "epoch": 0.4135878551029611, + "flos": 22418744728320.0, + "grad_norm": 1.908793356849353, + "language_loss": 0.80414784, + "learning_rate": 2.6450835691608154e-06, + "loss": 0.82223749, + "num_input_tokens_seen": 147731010, + "step": 6879, + "time_per_iteration": 2.5969064235687256 + }, + { + "auxiliary_loss_clip": 0.01073981, + "auxiliary_loss_mlp": 0.01033169, + "balance_loss_clip": 1.02885377, + "balance_loss_mlp": 1.02044356, + "epoch": 0.41364797835562905, + "flos": 27053232094080.0, + "grad_norm": 1.6244001136155164, + "language_loss": 0.84614736, + "learning_rate": 2.6447149099421315e-06, + "loss": 0.86721885, + "num_input_tokens_seen": 147750880, + "step": 6880, + "time_per_iteration": 2.6395204067230225 + }, + { + "auxiliary_loss_clip": 0.01054928, + "auxiliary_loss_mlp": 0.0102634, + "balance_loss_clip": 1.02778125, + "balance_loss_mlp": 1.0137279, + "epoch": 0.413708101608297, + "flos": 22967603521920.0, + "grad_norm": 1.6652769205251825, + "language_loss": 0.7048049, + "learning_rate": 2.6443462262757927e-06, + "loss": 0.72561765, + "num_input_tokens_seen": 147771360, + "step": 6881, + "time_per_iteration": 2.6456100940704346 + }, + { + "auxiliary_loss_clip": 0.01073234, + "auxiliary_loss_mlp": 0.01037172, + "balance_loss_clip": 1.029199, + "balance_loss_mlp": 1.02603745, + "epoch": 0.413768224860965, + "flos": 13333991875200.0, + "grad_norm": 1.756367238417933, + "language_loss": 0.80867535, + "learning_rate": 2.6439775181757805e-06, + "loss": 0.82977939, + "num_input_tokens_seen": 147787440, + "step": 6882, + "time_per_iteration": 2.756465435028076 + }, + { + "auxiliary_loss_clip": 0.01049792, + "auxiliary_loss_mlp": 0.01040642, + "balance_loss_clip": 1.0261867, + "balance_loss_mlp": 1.02618217, + "epoch": 0.41382834811363295, + "flos": 20813968800000.0, + "grad_norm": 2.0454719775186705, + "language_loss": 0.69311482, + "learning_rate": 2.643608785656077e-06, + "loss": 0.71401918, + "num_input_tokens_seen": 147805720, + "step": 6883, + "time_per_iteration": 5.852871417999268 + }, + { + "auxiliary_loss_clip": 0.01059566, + "auxiliary_loss_mlp": 0.0103053, + "balance_loss_clip": 1.02575564, + "balance_loss_mlp": 1.01871061, + "epoch": 0.4138884713663009, + "flos": 20667130001280.0, + "grad_norm": 3.331249298834347, + "language_loss": 0.75515223, + "learning_rate": 2.643240028730663e-06, + "loss": 0.77605325, + "num_input_tokens_seen": 147824605, + "step": 6884, + "time_per_iteration": 2.6174466609954834 + }, + { + "auxiliary_loss_clip": 0.01036013, + "auxiliary_loss_mlp": 0.01033586, + "balance_loss_clip": 1.02667904, + "balance_loss_mlp": 1.02138495, + "epoch": 0.4139485946189689, + "flos": 29056616225280.0, + "grad_norm": 1.7870125664142167, + "language_loss": 0.75874186, + "learning_rate": 2.642871247413523e-06, + "loss": 0.77943784, + "num_input_tokens_seen": 147845445, + "step": 6885, + "time_per_iteration": 2.793703556060791 + }, + { + "auxiliary_loss_clip": 0.01077129, + "auxiliary_loss_mlp": 0.01038652, + "balance_loss_clip": 1.0298301, + "balance_loss_mlp": 1.02572966, + "epoch": 0.41400871787163684, + "flos": 24425720219520.0, + "grad_norm": 2.047810822067501, + "language_loss": 0.70047295, + "learning_rate": 2.6425024417186414e-06, + "loss": 0.72163075, + "num_input_tokens_seen": 147865580, + "step": 6886, + "time_per_iteration": 2.6015219688415527 + }, + { + "auxiliary_loss_clip": 0.01077306, + "auxiliary_loss_mlp": 0.00747615, + "balance_loss_clip": 1.03042126, + "balance_loss_mlp": 1.00023615, + "epoch": 0.4140688411243048, + "flos": 19464050845440.0, + "grad_norm": 1.839990197073595, + "language_loss": 0.7522561, + "learning_rate": 2.642133611660002e-06, + "loss": 0.77050537, + "num_input_tokens_seen": 147885230, + "step": 6887, + "time_per_iteration": 2.5758512020111084 + }, + { + "auxiliary_loss_clip": 0.01064002, + "auxiliary_loss_mlp": 0.01028188, + "balance_loss_clip": 1.02817476, + "balance_loss_mlp": 1.01578414, + "epoch": 0.4141289643769728, + "flos": 19313656600320.0, + "grad_norm": 2.024157371640442, + "language_loss": 0.69726908, + "learning_rate": 2.641764757251592e-06, + "loss": 0.71819097, + "num_input_tokens_seen": 147903035, + "step": 6888, + "time_per_iteration": 2.530017375946045 + }, + { + "auxiliary_loss_clip": 0.01070469, + "auxiliary_loss_mlp": 0.01031394, + "balance_loss_clip": 1.02583861, + "balance_loss_mlp": 1.01911545, + "epoch": 0.41418908762964074, + "flos": 16726903683840.0, + "grad_norm": 1.7587964307861697, + "language_loss": 0.76001382, + "learning_rate": 2.6413958785073976e-06, + "loss": 0.78103244, + "num_input_tokens_seen": 147918745, + "step": 6889, + "time_per_iteration": 2.518556833267212 + }, + { + "auxiliary_loss_clip": 0.01047914, + "auxiliary_loss_mlp": 0.00747564, + "balance_loss_clip": 1.03196335, + "balance_loss_mlp": 1.00024486, + "epoch": 0.41424921088230876, + "flos": 25296840858240.0, + "grad_norm": 2.1570000189272314, + "language_loss": 0.8025378, + "learning_rate": 2.6410269754414074e-06, + "loss": 0.82049263, + "num_input_tokens_seen": 147938265, + "step": 6890, + "time_per_iteration": 2.710170030593872 + }, + { + "auxiliary_loss_clip": 0.01073293, + "auxiliary_loss_mlp": 0.01034687, + "balance_loss_clip": 1.02858424, + "balance_loss_mlp": 1.02194333, + "epoch": 0.4143093341349767, + "flos": 20960520289920.0, + "grad_norm": 1.6638490337452698, + "language_loss": 0.74335295, + "learning_rate": 2.6406580480676113e-06, + "loss": 0.76443279, + "num_input_tokens_seen": 147957320, + "step": 6891, + "time_per_iteration": 4.148912191390991 + }, + { + "auxiliary_loss_clip": 0.01038293, + "auxiliary_loss_mlp": 0.0103922, + "balance_loss_clip": 1.03022861, + "balance_loss_mlp": 1.02511764, + "epoch": 0.4143694573876447, + "flos": 22017694400640.0, + "grad_norm": 1.5627970254344141, + "language_loss": 0.8403213, + "learning_rate": 2.6402890963999963e-06, + "loss": 0.8610965, + "num_input_tokens_seen": 147977045, + "step": 6892, + "time_per_iteration": 2.7310683727264404 + }, + { + "auxiliary_loss_clip": 0.01023927, + "auxiliary_loss_mlp": 0.00747512, + "balance_loss_clip": 1.02247822, + "balance_loss_mlp": 1.00018907, + "epoch": 0.41442958064031266, + "flos": 35697396723840.0, + "grad_norm": 1.7023065795361938, + "language_loss": 0.70314175, + "learning_rate": 2.6399201204525554e-06, + "loss": 0.72085613, + "num_input_tokens_seen": 147996905, + "step": 6893, + "time_per_iteration": 2.795485496520996 + }, + { + "auxiliary_loss_clip": 0.01074674, + "auxiliary_loss_mlp": 0.01030104, + "balance_loss_clip": 1.02970123, + "balance_loss_mlp": 1.01796269, + "epoch": 0.4144897038929806, + "flos": 28293766156800.0, + "grad_norm": 2.72640510072164, + "language_loss": 0.72845924, + "learning_rate": 2.639551120239279e-06, + "loss": 0.74950707, + "num_input_tokens_seen": 148017875, + "step": 6894, + "time_per_iteration": 4.501348257064819 + }, + { + "auxiliary_loss_clip": 0.01065269, + "auxiliary_loss_mlp": 0.01034015, + "balance_loss_clip": 1.02772832, + "balance_loss_mlp": 1.02103949, + "epoch": 0.4145498271456486, + "flos": 11648093080320.0, + "grad_norm": 4.15040222189506, + "language_loss": 0.6251449, + "learning_rate": 2.63918209577416e-06, + "loss": 0.64613771, + "num_input_tokens_seen": 148032300, + "step": 6895, + "time_per_iteration": 2.8215365409851074 + }, + { + "auxiliary_loss_clip": 0.01035756, + "auxiliary_loss_mlp": 0.01036773, + "balance_loss_clip": 1.02673388, + "balance_loss_mlp": 1.02282584, + "epoch": 0.41460995039831655, + "flos": 27235622378880.0, + "grad_norm": 1.6845311331216004, + "language_loss": 0.70850706, + "learning_rate": 2.638813047071192e-06, + "loss": 0.72923237, + "num_input_tokens_seen": 148053260, + "step": 6896, + "time_per_iteration": 2.9101181030273438 + }, + { + "auxiliary_loss_clip": 0.01074132, + "auxiliary_loss_mlp": 0.01034991, + "balance_loss_clip": 1.02682447, + "balance_loss_mlp": 1.02137685, + "epoch": 0.4146700736509845, + "flos": 25922369232000.0, + "grad_norm": 2.551181350405432, + "language_loss": 0.72698867, + "learning_rate": 2.6384439741443696e-06, + "loss": 0.7480799, + "num_input_tokens_seen": 148072965, + "step": 6897, + "time_per_iteration": 2.6136910915374756 + }, + { + "auxiliary_loss_clip": 0.01062007, + "auxiliary_loss_mlp": 0.01034852, + "balance_loss_clip": 1.02820301, + "balance_loss_mlp": 1.02249599, + "epoch": 0.4147301969036525, + "flos": 26833243248000.0, + "grad_norm": 1.7618586857107408, + "language_loss": 0.84623778, + "learning_rate": 2.6380748770076873e-06, + "loss": 0.86720634, + "num_input_tokens_seen": 148093240, + "step": 6898, + "time_per_iteration": 2.672402858734131 + }, + { + "auxiliary_loss_clip": 0.01031641, + "auxiliary_loss_mlp": 0.01030833, + "balance_loss_clip": 1.02495134, + "balance_loss_mlp": 1.01807797, + "epoch": 0.41479032015632045, + "flos": 20298291194880.0, + "grad_norm": 1.7554849454162664, + "language_loss": 0.74289411, + "learning_rate": 2.6377057556751416e-06, + "loss": 0.76351887, + "num_input_tokens_seen": 148110925, + "step": 6899, + "time_per_iteration": 2.741920232772827 + }, + { + "auxiliary_loss_clip": 0.01042098, + "auxiliary_loss_mlp": 0.01030681, + "balance_loss_clip": 1.02463996, + "balance_loss_mlp": 1.01659632, + "epoch": 0.4148504434089884, + "flos": 25264988472960.0, + "grad_norm": 1.651987552899948, + "language_loss": 0.75859237, + "learning_rate": 2.6373366101607306e-06, + "loss": 0.77932012, + "num_input_tokens_seen": 148130670, + "step": 6900, + "time_per_iteration": 2.742903709411621 + }, + { + "auxiliary_loss_clip": 0.0106554, + "auxiliary_loss_mlp": 0.01034331, + "balance_loss_clip": 1.02852225, + "balance_loss_mlp": 1.02043736, + "epoch": 0.4149105666616564, + "flos": 12822300679680.0, + "grad_norm": 2.0881685151830167, + "language_loss": 0.8034966, + "learning_rate": 2.6369674404784503e-06, + "loss": 0.82449532, + "num_input_tokens_seen": 148148350, + "step": 6901, + "time_per_iteration": 2.6444966793060303 + }, + { + "auxiliary_loss_clip": 0.01034774, + "auxiliary_loss_mlp": 0.01036855, + "balance_loss_clip": 1.02414489, + "balance_loss_mlp": 1.02317619, + "epoch": 0.41497068991432434, + "flos": 16763891713920.0, + "grad_norm": 1.7217270385193149, + "language_loss": 0.69684249, + "learning_rate": 2.6365982466423014e-06, + "loss": 0.71755874, + "num_input_tokens_seen": 148167550, + "step": 6902, + "time_per_iteration": 2.7230372428894043 + }, + { + "auxiliary_loss_clip": 0.01051971, + "auxiliary_loss_mlp": 0.00747523, + "balance_loss_clip": 1.02822292, + "balance_loss_mlp": 1.00023794, + "epoch": 0.4150308131669923, + "flos": 18000906243840.0, + "grad_norm": 2.2699828073874775, + "language_loss": 0.83713144, + "learning_rate": 2.6362290286662834e-06, + "loss": 0.85512638, + "num_input_tokens_seen": 148184740, + "step": 6903, + "time_per_iteration": 2.6271796226501465 + }, + { + "auxiliary_loss_clip": 0.01077424, + "auxiliary_loss_mlp": 0.01034649, + "balance_loss_clip": 1.02810776, + "balance_loss_mlp": 1.02070141, + "epoch": 0.41509093641966033, + "flos": 30044770352640.0, + "grad_norm": 1.8841471079554801, + "language_loss": 0.68328321, + "learning_rate": 2.6358597865643968e-06, + "loss": 0.70440394, + "num_input_tokens_seen": 148204605, + "step": 6904, + "time_per_iteration": 2.614449977874756 + }, + { + "auxiliary_loss_clip": 0.0107607, + "auxiliary_loss_mlp": 0.00747666, + "balance_loss_clip": 1.02864885, + "balance_loss_mlp": 1.00017297, + "epoch": 0.4151510596723283, + "flos": 24279994742400.0, + "grad_norm": 1.6523670295717283, + "language_loss": 0.77755415, + "learning_rate": 2.635490520350643e-06, + "loss": 0.79579151, + "num_input_tokens_seen": 148224675, + "step": 6905, + "time_per_iteration": 2.60030460357666 + }, + { + "auxiliary_loss_clip": 0.01074168, + "auxiliary_loss_mlp": 0.01029442, + "balance_loss_clip": 1.02727675, + "balance_loss_mlp": 1.01663327, + "epoch": 0.41521118292499626, + "flos": 23476206147840.0, + "grad_norm": 1.542002954739475, + "language_loss": 0.68363422, + "learning_rate": 2.635121230039025e-06, + "loss": 0.70467031, + "num_input_tokens_seen": 148243375, + "step": 6906, + "time_per_iteration": 2.5589451789855957 + }, + { + "auxiliary_loss_clip": 0.01050731, + "auxiliary_loss_mlp": 0.01028873, + "balance_loss_clip": 1.02602518, + "balance_loss_mlp": 1.01642132, + "epoch": 0.4152713061776642, + "flos": 22125498094080.0, + "grad_norm": 2.8232350737231506, + "language_loss": 0.67371845, + "learning_rate": 2.6347519156435467e-06, + "loss": 0.69451451, + "num_input_tokens_seen": 148261140, + "step": 6907, + "time_per_iteration": 2.6477441787719727 + }, + { + "auxiliary_loss_clip": 0.01048504, + "auxiliary_loss_mlp": 0.01031563, + "balance_loss_clip": 1.03062367, + "balance_loss_mlp": 1.0194397, + "epoch": 0.4153314294303322, + "flos": 21251396626560.0, + "grad_norm": 1.9924511423657343, + "language_loss": 0.77321482, + "learning_rate": 2.6343825771782123e-06, + "loss": 0.79401553, + "num_input_tokens_seen": 148279655, + "step": 6908, + "time_per_iteration": 2.77107310295105 + }, + { + "auxiliary_loss_clip": 0.00995475, + "auxiliary_loss_mlp": 0.01004916, + "balance_loss_clip": 1.00600672, + "balance_loss_mlp": 1.00322294, + "epoch": 0.41539155268300015, + "flos": 57920681594880.0, + "grad_norm": 0.7790478881760413, + "language_loss": 0.64815378, + "learning_rate": 2.634013214657026e-06, + "loss": 0.6681577, + "num_input_tokens_seen": 148339005, + "step": 6909, + "time_per_iteration": 3.1688966751098633 + }, + { + "auxiliary_loss_clip": 0.01040492, + "auxiliary_loss_mlp": 0.01034139, + "balance_loss_clip": 1.02816176, + "balance_loss_mlp": 1.02173543, + "epoch": 0.4154516759356681, + "flos": 21903677654400.0, + "grad_norm": 1.836322714826997, + "language_loss": 0.86873043, + "learning_rate": 2.633643828093996e-06, + "loss": 0.88947678, + "num_input_tokens_seen": 148358715, + "step": 6910, + "time_per_iteration": 2.701630115509033 + }, + { + "auxiliary_loss_clip": 0.01003917, + "auxiliary_loss_mlp": 0.01004091, + "balance_loss_clip": 1.00445068, + "balance_loss_mlp": 1.00232625, + "epoch": 0.4155117991883361, + "flos": 67833677226240.0, + "grad_norm": 0.8113204221407808, + "language_loss": 0.62106216, + "learning_rate": 2.633274417503128e-06, + "loss": 0.64114219, + "num_input_tokens_seen": 148417280, + "step": 6911, + "time_per_iteration": 3.1740901470184326 + }, + { + "auxiliary_loss_clip": 0.01079066, + "auxiliary_loss_mlp": 0.01034307, + "balance_loss_clip": 1.02944493, + "balance_loss_mlp": 1.02037096, + "epoch": 0.41557192244100405, + "flos": 14282679934080.0, + "grad_norm": 2.7422433641610184, + "language_loss": 0.88211769, + "learning_rate": 2.6329049828984312e-06, + "loss": 0.90325147, + "num_input_tokens_seen": 148432610, + "step": 6912, + "time_per_iteration": 2.6151390075683594 + }, + { + "auxiliary_loss_clip": 0.01064763, + "auxiliary_loss_mlp": 0.01028593, + "balance_loss_clip": 1.02945125, + "balance_loss_mlp": 1.01698196, + "epoch": 0.415632045693672, + "flos": 24461954064000.0, + "grad_norm": 2.3686319650352705, + "language_loss": 0.63381004, + "learning_rate": 2.632535524293914e-06, + "loss": 0.65474355, + "num_input_tokens_seen": 148451510, + "step": 6913, + "time_per_iteration": 2.6859710216522217 + }, + { + "auxiliary_loss_clip": 0.01048757, + "auxiliary_loss_mlp": 0.00747629, + "balance_loss_clip": 1.02643394, + "balance_loss_mlp": 1.00025249, + "epoch": 0.41569216894634, + "flos": 20115290378880.0, + "grad_norm": 1.776862231530764, + "language_loss": 0.75513875, + "learning_rate": 2.632166041703586e-06, + "loss": 0.77310264, + "num_input_tokens_seen": 148469945, + "step": 6914, + "time_per_iteration": 2.7011098861694336 + }, + { + "auxiliary_loss_clip": 0.0103118, + "auxiliary_loss_mlp": 0.01036658, + "balance_loss_clip": 1.02745342, + "balance_loss_mlp": 1.02261543, + "epoch": 0.41575229219900794, + "flos": 23798827128960.0, + "grad_norm": 1.8653763813191635, + "language_loss": 0.87398946, + "learning_rate": 2.631796535141458e-06, + "loss": 0.89466786, + "num_input_tokens_seen": 148486655, + "step": 6915, + "time_per_iteration": 2.8788492679595947 + }, + { + "auxiliary_loss_clip": 0.01041099, + "auxiliary_loss_mlp": 0.01039467, + "balance_loss_clip": 1.02738428, + "balance_loss_mlp": 1.02617538, + "epoch": 0.4158124154516759, + "flos": 23108229267840.0, + "grad_norm": 2.8376053256021074, + "language_loss": 0.70911908, + "learning_rate": 2.6314270046215426e-06, + "loss": 0.72992474, + "num_input_tokens_seen": 148505035, + "step": 6916, + "time_per_iteration": 2.683919906616211 + }, + { + "auxiliary_loss_clip": 0.01080629, + "auxiliary_loss_mlp": 0.01032282, + "balance_loss_clip": 1.03165138, + "balance_loss_mlp": 1.01873994, + "epoch": 0.41587253870434393, + "flos": 24242970798720.0, + "grad_norm": 1.3816214731323642, + "language_loss": 0.71527982, + "learning_rate": 2.631057450157852e-06, + "loss": 0.73640883, + "num_input_tokens_seen": 148525575, + "step": 6917, + "time_per_iteration": 2.6533868312835693 + }, + { + "auxiliary_loss_clip": 0.01048491, + "auxiliary_loss_mlp": 0.01028237, + "balance_loss_clip": 1.02733386, + "balance_loss_mlp": 1.01627445, + "epoch": 0.4159326619570119, + "flos": 23881602021120.0, + "grad_norm": 1.4366862591483331, + "language_loss": 0.80931652, + "learning_rate": 2.6306878717643988e-06, + "loss": 0.83008379, + "num_input_tokens_seen": 148547270, + "step": 6918, + "time_per_iteration": 2.6522536277770996 + }, + { + "auxiliary_loss_clip": 0.01069874, + "auxiliary_loss_mlp": 0.01034465, + "balance_loss_clip": 1.03214645, + "balance_loss_mlp": 1.02112484, + "epoch": 0.41599278520967986, + "flos": 40626531354240.0, + "grad_norm": 1.5718467916257957, + "language_loss": 0.70002091, + "learning_rate": 2.6303182694551995e-06, + "loss": 0.72106433, + "num_input_tokens_seen": 148572100, + "step": 6919, + "time_per_iteration": 2.816941976547241 + }, + { + "auxiliary_loss_clip": 0.01053723, + "auxiliary_loss_mlp": 0.01031421, + "balance_loss_clip": 1.02915978, + "balance_loss_mlp": 1.01789653, + "epoch": 0.4160529084623478, + "flos": 18222942165120.0, + "grad_norm": 2.2122289831849287, + "language_loss": 0.81038505, + "learning_rate": 2.6299486432442677e-06, + "loss": 0.83123648, + "num_input_tokens_seen": 148591245, + "step": 6920, + "time_per_iteration": 2.787379741668701 + }, + { + "auxiliary_loss_clip": 0.01058805, + "auxiliary_loss_mlp": 0.01032579, + "balance_loss_clip": 1.03056884, + "balance_loss_mlp": 1.01912045, + "epoch": 0.4161130317150158, + "flos": 13661963982720.0, + "grad_norm": 2.1224869346616977, + "language_loss": 0.65674275, + "learning_rate": 2.6295789931456195e-06, + "loss": 0.67765653, + "num_input_tokens_seen": 148607980, + "step": 6921, + "time_per_iteration": 2.6812593936920166 + }, + { + "auxiliary_loss_clip": 0.01052936, + "auxiliary_loss_mlp": 0.0103724, + "balance_loss_clip": 1.02732229, + "balance_loss_mlp": 1.02400184, + "epoch": 0.41617315496768376, + "flos": 16178511767040.0, + "grad_norm": 1.8953665302165352, + "language_loss": 0.80882448, + "learning_rate": 2.629209319173274e-06, + "loss": 0.82972622, + "num_input_tokens_seen": 148624490, + "step": 6922, + "time_per_iteration": 2.646336793899536 + }, + { + "auxiliary_loss_clip": 0.0105627, + "auxiliary_loss_mlp": 0.01032966, + "balance_loss_clip": 1.02877498, + "balance_loss_mlp": 1.02015722, + "epoch": 0.4162332782203517, + "flos": 26213317395840.0, + "grad_norm": 1.6431316236272049, + "language_loss": 0.6751343, + "learning_rate": 2.628839621341247e-06, + "loss": 0.69602668, + "num_input_tokens_seen": 148646490, + "step": 6923, + "time_per_iteration": 2.649576425552368 + }, + { + "auxiliary_loss_clip": 0.01046137, + "auxiliary_loss_mlp": 0.01051056, + "balance_loss_clip": 1.02673101, + "balance_loss_mlp": 1.03687048, + "epoch": 0.4162934014730197, + "flos": 28183987215360.0, + "grad_norm": 1.8459906298330642, + "language_loss": 0.76074868, + "learning_rate": 2.6284698996635593e-06, + "loss": 0.78172064, + "num_input_tokens_seen": 148668580, + "step": 6924, + "time_per_iteration": 2.625964403152466 + }, + { + "auxiliary_loss_clip": 0.01078423, + "auxiliary_loss_mlp": 0.01034104, + "balance_loss_clip": 1.02973938, + "balance_loss_mlp": 1.02088404, + "epoch": 0.41635352472568765, + "flos": 19865316654720.0, + "grad_norm": 2.6417773544647534, + "language_loss": 0.73016804, + "learning_rate": 2.62810015415423e-06, + "loss": 0.7512933, + "num_input_tokens_seen": 148688410, + "step": 6925, + "time_per_iteration": 2.512378215789795 + }, + { + "auxiliary_loss_clip": 0.01050659, + "auxiliary_loss_mlp": 0.01031718, + "balance_loss_clip": 1.02463055, + "balance_loss_mlp": 1.01966608, + "epoch": 0.4164136479783556, + "flos": 14935356011520.0, + "grad_norm": 2.6980099428965314, + "language_loss": 0.84144479, + "learning_rate": 2.6277303848272792e-06, + "loss": 0.86226857, + "num_input_tokens_seen": 148704855, + "step": 6926, + "time_per_iteration": 2.753518581390381 + }, + { + "auxiliary_loss_clip": 0.01051849, + "auxiliary_loss_mlp": 0.01038351, + "balance_loss_clip": 1.02836049, + "balance_loss_mlp": 1.02688956, + "epoch": 0.4164737712310236, + "flos": 21757593041280.0, + "grad_norm": 1.7311159508452072, + "language_loss": 0.86490315, + "learning_rate": 2.6273605916967302e-06, + "loss": 0.88580513, + "num_input_tokens_seen": 148723065, + "step": 6927, + "time_per_iteration": 2.598550796508789 + }, + { + "auxiliary_loss_clip": 0.01066403, + "auxiliary_loss_mlp": 0.01041374, + "balance_loss_clip": 1.02878428, + "balance_loss_mlp": 1.02782607, + "epoch": 0.41653389448369155, + "flos": 20740136394240.0, + "grad_norm": 2.1009006618372514, + "language_loss": 0.72055054, + "learning_rate": 2.626990774776604e-06, + "loss": 0.74162829, + "num_input_tokens_seen": 148741780, + "step": 6928, + "time_per_iteration": 2.556544780731201 + }, + { + "auxiliary_loss_clip": 0.01052934, + "auxiliary_loss_mlp": 0.01037739, + "balance_loss_clip": 1.02662265, + "balance_loss_mlp": 1.02511489, + "epoch": 0.4165940177363595, + "flos": 24972891073920.0, + "grad_norm": 2.156567147846025, + "language_loss": 0.77989107, + "learning_rate": 2.6266209340809254e-06, + "loss": 0.80079782, + "num_input_tokens_seen": 148759795, + "step": 6929, + "time_per_iteration": 2.7139806747436523 + }, + { + "auxiliary_loss_clip": 0.01076109, + "auxiliary_loss_mlp": 0.01032498, + "balance_loss_clip": 1.02940154, + "balance_loss_mlp": 1.01998115, + "epoch": 0.41665414098902753, + "flos": 20521727746560.0, + "grad_norm": 3.1438614785887045, + "language_loss": 0.70727921, + "learning_rate": 2.6262510696237182e-06, + "loss": 0.7283653, + "num_input_tokens_seen": 148778680, + "step": 6930, + "time_per_iteration": 2.526806592941284 + }, + { + "auxiliary_loss_clip": 0.01054079, + "auxiliary_loss_mlp": 0.01035804, + "balance_loss_clip": 1.0263418, + "balance_loss_mlp": 1.0235908, + "epoch": 0.4167142642416955, + "flos": 19682926369920.0, + "grad_norm": 2.1857644016319306, + "language_loss": 0.81563264, + "learning_rate": 2.625881181419007e-06, + "loss": 0.83653146, + "num_input_tokens_seen": 148796470, + "step": 6931, + "time_per_iteration": 5.727591514587402 + }, + { + "auxiliary_loss_clip": 0.0101866, + "auxiliary_loss_mlp": 0.01033486, + "balance_loss_clip": 1.02191281, + "balance_loss_mlp": 1.02036691, + "epoch": 0.41677438749436346, + "flos": 23763742519680.0, + "grad_norm": 1.8104013087754653, + "language_loss": 0.79132754, + "learning_rate": 2.6255112694808193e-06, + "loss": 0.811849, + "num_input_tokens_seen": 148815300, + "step": 6932, + "time_per_iteration": 2.7364416122436523 + }, + { + "auxiliary_loss_clip": 0.01052403, + "auxiliary_loss_mlp": 0.00747625, + "balance_loss_clip": 1.02589047, + "balance_loss_mlp": 1.0002408, + "epoch": 0.41683451074703143, + "flos": 30410053712640.0, + "grad_norm": 1.9104713848990131, + "language_loss": 0.81651688, + "learning_rate": 2.6251413338231813e-06, + "loss": 0.83451712, + "num_input_tokens_seen": 148834315, + "step": 6933, + "time_per_iteration": 2.7120156288146973 + }, + { + "auxiliary_loss_clip": 0.01076828, + "auxiliary_loss_mlp": 0.01031551, + "balance_loss_clip": 1.02799809, + "balance_loss_mlp": 1.01767528, + "epoch": 0.4168946339996994, + "flos": 21506757390720.0, + "grad_norm": 1.6925330805428902, + "language_loss": 0.76998532, + "learning_rate": 2.624771374460121e-06, + "loss": 0.79106915, + "num_input_tokens_seen": 148852420, + "step": 6934, + "time_per_iteration": 2.5984296798706055 + }, + { + "auxiliary_loss_clip": 0.01067952, + "auxiliary_loss_mlp": 0.01033862, + "balance_loss_clip": 1.03010809, + "balance_loss_mlp": 1.02091002, + "epoch": 0.41695475725236736, + "flos": 17638675539840.0, + "grad_norm": 1.9404728981049348, + "language_loss": 0.67248166, + "learning_rate": 2.624401391405668e-06, + "loss": 0.6934998, + "num_input_tokens_seen": 148869305, + "step": 6935, + "time_per_iteration": 2.570472478866577 + }, + { + "auxiliary_loss_clip": 0.01055207, + "auxiliary_loss_mlp": 0.01035497, + "balance_loss_clip": 1.02859294, + "balance_loss_mlp": 1.02262831, + "epoch": 0.4170148805050353, + "flos": 15668903560320.0, + "grad_norm": 2.053744611060456, + "language_loss": 0.73212415, + "learning_rate": 2.6240313846738513e-06, + "loss": 0.75303125, + "num_input_tokens_seen": 148886395, + "step": 6936, + "time_per_iteration": 2.6070210933685303 + }, + { + "auxiliary_loss_clip": 0.010651, + "auxiliary_loss_mlp": 0.01036646, + "balance_loss_clip": 1.02911878, + "balance_loss_mlp": 1.02422428, + "epoch": 0.4170750037577033, + "flos": 15159151699200.0, + "grad_norm": 2.164488066231058, + "language_loss": 0.74383992, + "learning_rate": 2.6236613542787024e-06, + "loss": 0.76485741, + "num_input_tokens_seen": 148905235, + "step": 6937, + "time_per_iteration": 2.588503360748291 + }, + { + "auxiliary_loss_clip": 0.01051202, + "auxiliary_loss_mlp": 0.01031408, + "balance_loss_clip": 1.02604532, + "balance_loss_mlp": 1.01951671, + "epoch": 0.41713512701037125, + "flos": 28768289754240.0, + "grad_norm": 1.525910776713736, + "language_loss": 0.84419239, + "learning_rate": 2.6232913002342518e-06, + "loss": 0.86501849, + "num_input_tokens_seen": 148928130, + "step": 6938, + "time_per_iteration": 4.454598426818848 + }, + { + "auxiliary_loss_clip": 0.01051441, + "auxiliary_loss_mlp": 0.01036969, + "balance_loss_clip": 1.02686262, + "balance_loss_mlp": 1.02330196, + "epoch": 0.4171952502630392, + "flos": 28256993608320.0, + "grad_norm": 2.6677252918826873, + "language_loss": 0.73865753, + "learning_rate": 2.6229212225545334e-06, + "loss": 0.75954163, + "num_input_tokens_seen": 148948790, + "step": 6939, + "time_per_iteration": 2.6628153324127197 + }, + { + "auxiliary_loss_clip": 0.01064642, + "auxiliary_loss_mlp": 0.01034881, + "balance_loss_clip": 1.02749491, + "balance_loss_mlp": 1.02148151, + "epoch": 0.4172553735157072, + "flos": 24571697091840.0, + "grad_norm": 1.5349777852654964, + "language_loss": 0.74715734, + "learning_rate": 2.622551121253579e-06, + "loss": 0.76815259, + "num_input_tokens_seen": 148967690, + "step": 6940, + "time_per_iteration": 2.6430413722991943 + }, + { + "auxiliary_loss_clip": 0.01073604, + "auxiliary_loss_mlp": 0.01037544, + "balance_loss_clip": 1.02771473, + "balance_loss_mlp": 1.02531934, + "epoch": 0.41731549676837515, + "flos": 27045797978880.0, + "grad_norm": 1.846930331400562, + "language_loss": 0.71524328, + "learning_rate": 2.622180996345424e-06, + "loss": 0.73635471, + "num_input_tokens_seen": 148987150, + "step": 6941, + "time_per_iteration": 2.7174878120422363 + }, + { + "auxiliary_loss_clip": 0.0106651, + "auxiliary_loss_mlp": 0.01039205, + "balance_loss_clip": 1.029217, + "balance_loss_mlp": 1.02554917, + "epoch": 0.4173756200210431, + "flos": 28394063907840.0, + "grad_norm": 2.0667833399662143, + "language_loss": 0.73399472, + "learning_rate": 2.621810847844104e-06, + "loss": 0.75505185, + "num_input_tokens_seen": 149004895, + "step": 6942, + "time_per_iteration": 4.256747484207153 + }, + { + "auxiliary_loss_clip": 0.01037331, + "auxiliary_loss_mlp": 0.01041634, + "balance_loss_clip": 1.02629483, + "balance_loss_mlp": 1.02766919, + "epoch": 0.41743574327371114, + "flos": 22521556431360.0, + "grad_norm": 2.2060134278689256, + "language_loss": 0.72620356, + "learning_rate": 2.6214406757636534e-06, + "loss": 0.74699318, + "num_input_tokens_seen": 149020970, + "step": 6943, + "time_per_iteration": 2.629711627960205 + }, + { + "auxiliary_loss_clip": 0.01043458, + "auxiliary_loss_mlp": 0.00747707, + "balance_loss_clip": 1.02687383, + "balance_loss_mlp": 1.00024152, + "epoch": 0.4174958665263791, + "flos": 30113431200000.0, + "grad_norm": 1.801509388431173, + "language_loss": 0.64083624, + "learning_rate": 2.621070480118111e-06, + "loss": 0.65874791, + "num_input_tokens_seen": 149041795, + "step": 6944, + "time_per_iteration": 2.7828831672668457 + }, + { + "auxiliary_loss_clip": 0.01040541, + "auxiliary_loss_mlp": 0.010325, + "balance_loss_clip": 1.02386832, + "balance_loss_mlp": 1.01903522, + "epoch": 0.41755598977904707, + "flos": 25263444188160.0, + "grad_norm": 1.424438177399382, + "language_loss": 0.70153046, + "learning_rate": 2.620700260921513e-06, + "loss": 0.72226083, + "num_input_tokens_seen": 149063700, + "step": 6945, + "time_per_iteration": 2.70424485206604 + }, + { + "auxiliary_loss_clip": 0.01029651, + "auxiliary_loss_mlp": 0.01043697, + "balance_loss_clip": 1.02200687, + "balance_loss_mlp": 1.02800345, + "epoch": 0.41761611303171503, + "flos": 19828580019840.0, + "grad_norm": 1.7516245754710835, + "language_loss": 0.80921197, + "learning_rate": 2.620330018187899e-06, + "loss": 0.82994545, + "num_input_tokens_seen": 149082410, + "step": 6946, + "time_per_iteration": 2.654836416244507 + }, + { + "auxiliary_loss_clip": 0.01059288, + "auxiliary_loss_mlp": 0.01033561, + "balance_loss_clip": 1.0272342, + "balance_loss_mlp": 1.02073383, + "epoch": 0.417676236284383, + "flos": 15523249910400.0, + "grad_norm": 2.2077709023473484, + "language_loss": 0.78171241, + "learning_rate": 2.6199597519313086e-06, + "loss": 0.80264091, + "num_input_tokens_seen": 149098745, + "step": 6947, + "time_per_iteration": 2.573246717453003 + }, + { + "auxiliary_loss_clip": 0.01075785, + "auxiliary_loss_mlp": 0.01033096, + "balance_loss_clip": 1.02799428, + "balance_loss_mlp": 1.01947594, + "epoch": 0.41773635953705096, + "flos": 32524473761280.0, + "grad_norm": 2.1893051904868184, + "language_loss": 0.71506238, + "learning_rate": 2.6195894621657825e-06, + "loss": 0.73615122, + "num_input_tokens_seen": 149122255, + "step": 6948, + "time_per_iteration": 2.623352289199829 + }, + { + "auxiliary_loss_clip": 0.01060058, + "auxiliary_loss_mlp": 0.01027199, + "balance_loss_clip": 1.02506602, + "balance_loss_mlp": 1.01533771, + "epoch": 0.4177964827897189, + "flos": 23440941970560.0, + "grad_norm": 1.4680130253508046, + "language_loss": 0.76669884, + "learning_rate": 2.619219148905362e-06, + "loss": 0.78757143, + "num_input_tokens_seen": 149142845, + "step": 6949, + "time_per_iteration": 2.6371071338653564 + }, + { + "auxiliary_loss_clip": 0.01062585, + "auxiliary_loss_mlp": 0.01036907, + "balance_loss_clip": 1.03175533, + "balance_loss_mlp": 1.02363324, + "epoch": 0.4178566060423869, + "flos": 22748907565440.0, + "grad_norm": 1.563937359273841, + "language_loss": 0.8203733, + "learning_rate": 2.6188488121640888e-06, + "loss": 0.84136832, + "num_input_tokens_seen": 149163375, + "step": 6950, + "time_per_iteration": 2.6541407108306885 + }, + { + "auxiliary_loss_clip": 0.0105372, + "auxiliary_loss_mlp": 0.00747443, + "balance_loss_clip": 1.03040147, + "balance_loss_mlp": 1.00026357, + "epoch": 0.41791672929505486, + "flos": 26032794618240.0, + "grad_norm": 1.355869860752245, + "language_loss": 0.76197362, + "learning_rate": 2.618478451956007e-06, + "loss": 0.77998531, + "num_input_tokens_seen": 149185610, + "step": 6951, + "time_per_iteration": 2.6716363430023193 + }, + { + "auxiliary_loss_clip": 0.01033587, + "auxiliary_loss_mlp": 0.01031863, + "balance_loss_clip": 1.02596915, + "balance_loss_mlp": 1.01811242, + "epoch": 0.4179768525477228, + "flos": 19568694142080.0, + "grad_norm": 1.724256968545165, + "language_loss": 0.73124957, + "learning_rate": 2.61810806829516e-06, + "loss": 0.75190407, + "num_input_tokens_seen": 149203990, + "step": 6952, + "time_per_iteration": 2.7396843433380127 + }, + { + "auxiliary_loss_clip": 0.01066909, + "auxiliary_loss_mlp": 0.01033148, + "balance_loss_clip": 1.03158379, + "balance_loss_mlp": 1.02005291, + "epoch": 0.4180369758003908, + "flos": 17783826399360.0, + "grad_norm": 2.508668304236151, + "language_loss": 0.71719062, + "learning_rate": 2.617737661195593e-06, + "loss": 0.73819125, + "num_input_tokens_seen": 149221385, + "step": 6953, + "time_per_iteration": 2.537952423095703 + }, + { + "auxiliary_loss_clip": 0.01060025, + "auxiliary_loss_mlp": 0.01031482, + "balance_loss_clip": 1.02619696, + "balance_loss_mlp": 1.01828575, + "epoch": 0.41809709905305875, + "flos": 20960663944320.0, + "grad_norm": 1.612087671065326, + "language_loss": 0.75995481, + "learning_rate": 2.617367230671353e-06, + "loss": 0.78086984, + "num_input_tokens_seen": 149241175, + "step": 6954, + "time_per_iteration": 2.6040356159210205 + }, + { + "auxiliary_loss_clip": 0.01036807, + "auxiliary_loss_mlp": 0.01035732, + "balance_loss_clip": 1.02837181, + "balance_loss_mlp": 1.02121806, + "epoch": 0.4181572223057267, + "flos": 22017622573440.0, + "grad_norm": 2.215465877174121, + "language_loss": 0.84552097, + "learning_rate": 2.616996776736485e-06, + "loss": 0.86624634, + "num_input_tokens_seen": 149259115, + "step": 6955, + "time_per_iteration": 2.7587621212005615 + }, + { + "auxiliary_loss_clip": 0.01066256, + "auxiliary_loss_mlp": 0.01035244, + "balance_loss_clip": 1.02965188, + "balance_loss_mlp": 1.02294183, + "epoch": 0.4182173455583947, + "flos": 26245528917120.0, + "grad_norm": 1.908025789876227, + "language_loss": 0.83016074, + "learning_rate": 2.616626299405037e-06, + "loss": 0.85117567, + "num_input_tokens_seen": 149278705, + "step": 6956, + "time_per_iteration": 2.6753556728363037 + }, + { + "auxiliary_loss_clip": 0.01036579, + "auxiliary_loss_mlp": 0.01037237, + "balance_loss_clip": 1.02528334, + "balance_loss_mlp": 1.02358174, + "epoch": 0.4182774688110627, + "flos": 14791605782400.0, + "grad_norm": 1.9808271939229514, + "language_loss": 0.71148539, + "learning_rate": 2.616255798691059e-06, + "loss": 0.73222351, + "num_input_tokens_seen": 149294040, + "step": 6957, + "time_per_iteration": 2.7622153759002686 + }, + { + "auxiliary_loss_clip": 0.01045744, + "auxiliary_loss_mlp": 0.01037706, + "balance_loss_clip": 1.02802396, + "balance_loss_mlp": 1.02532005, + "epoch": 0.41833759206373067, + "flos": 20412020632320.0, + "grad_norm": 2.3491522516657124, + "language_loss": 0.7531234, + "learning_rate": 2.6158852746085982e-06, + "loss": 0.77395797, + "num_input_tokens_seen": 149310385, + "step": 6958, + "time_per_iteration": 2.723883628845215 + }, + { + "auxiliary_loss_clip": 0.0102255, + "auxiliary_loss_mlp": 0.00747553, + "balance_loss_clip": 1.02204657, + "balance_loss_mlp": 1.00025451, + "epoch": 0.41839771531639863, + "flos": 23656333875840.0, + "grad_norm": 1.659619725667721, + "language_loss": 0.77532661, + "learning_rate": 2.6155147271717066e-06, + "loss": 0.79302764, + "num_input_tokens_seen": 149328235, + "step": 6959, + "time_per_iteration": 2.8618905544281006 + }, + { + "auxiliary_loss_clip": 0.01034133, + "auxiliary_loss_mlp": 0.00747601, + "balance_loss_clip": 1.02519083, + "balance_loss_mlp": 1.00024629, + "epoch": 0.4184578385690666, + "flos": 19754137082880.0, + "grad_norm": 1.7142981357904996, + "language_loss": 0.7677685, + "learning_rate": 2.6151441563944347e-06, + "loss": 0.78558582, + "num_input_tokens_seen": 149347465, + "step": 6960, + "time_per_iteration": 2.7073729038238525 + }, + { + "auxiliary_loss_clip": 0.01049986, + "auxiliary_loss_mlp": 0.01029628, + "balance_loss_clip": 1.02767146, + "balance_loss_mlp": 1.01755834, + "epoch": 0.41851796182173456, + "flos": 20193396503040.0, + "grad_norm": 2.6224549791978493, + "language_loss": 0.75594372, + "learning_rate": 2.614773562290835e-06, + "loss": 0.7767399, + "num_input_tokens_seen": 149366685, + "step": 6961, + "time_per_iteration": 2.6304211616516113 + }, + { + "auxiliary_loss_clip": 0.0098391, + "auxiliary_loss_mlp": 0.01007485, + "balance_loss_clip": 1.00394845, + "balance_loss_mlp": 1.00597131, + "epoch": 0.41857808507440253, + "flos": 59018794231680.0, + "grad_norm": 0.7822659521675122, + "language_loss": 0.54637277, + "learning_rate": 2.61440294487496e-06, + "loss": 0.5662868, + "num_input_tokens_seen": 149422925, + "step": 6962, + "time_per_iteration": 3.1493351459503174 + }, + { + "auxiliary_loss_clip": 0.01067745, + "auxiliary_loss_mlp": 0.01034946, + "balance_loss_clip": 1.03009462, + "balance_loss_mlp": 1.02180934, + "epoch": 0.4186382083270705, + "flos": 18478805719680.0, + "grad_norm": 1.7522949339771523, + "language_loss": 0.85910946, + "learning_rate": 2.614032304160864e-06, + "loss": 0.88013637, + "num_input_tokens_seen": 149440820, + "step": 6963, + "time_per_iteration": 2.6640233993530273 + }, + { + "auxiliary_loss_clip": 0.01053437, + "auxiliary_loss_mlp": 0.01030866, + "balance_loss_clip": 1.02798975, + "balance_loss_mlp": 1.01855755, + "epoch": 0.41869833157973846, + "flos": 21578758202880.0, + "grad_norm": 1.4739887076400506, + "language_loss": 0.70342714, + "learning_rate": 2.6136616401626014e-06, + "loss": 0.72427016, + "num_input_tokens_seen": 149461060, + "step": 6964, + "time_per_iteration": 2.786905527114868 + }, + { + "auxiliary_loss_clip": 0.01071896, + "auxiliary_loss_mlp": 0.01031889, + "balance_loss_clip": 1.02765155, + "balance_loss_mlp": 1.0198015, + "epoch": 0.4187584548324064, + "flos": 35517412650240.0, + "grad_norm": 1.453306949049259, + "language_loss": 0.71433854, + "learning_rate": 2.6132909528942273e-06, + "loss": 0.73537636, + "num_input_tokens_seen": 149483115, + "step": 6965, + "time_per_iteration": 2.6487910747528076 + }, + { + "auxiliary_loss_clip": 0.01026997, + "auxiliary_loss_mlp": 0.0103251, + "balance_loss_clip": 1.02613163, + "balance_loss_mlp": 1.02106595, + "epoch": 0.4188185780850744, + "flos": 18655880791680.0, + "grad_norm": 1.486656405511036, + "language_loss": 0.72046572, + "learning_rate": 2.6129202423697997e-06, + "loss": 0.74106073, + "num_input_tokens_seen": 149501495, + "step": 6966, + "time_per_iteration": 2.6613547801971436 + }, + { + "auxiliary_loss_clip": 0.01066752, + "auxiliary_loss_mlp": 0.01033352, + "balance_loss_clip": 1.02704763, + "balance_loss_mlp": 1.01905859, + "epoch": 0.41887870133774235, + "flos": 40333428374400.0, + "grad_norm": 1.9918978216735597, + "language_loss": 0.7102378, + "learning_rate": 2.612549508603375e-06, + "loss": 0.73123884, + "num_input_tokens_seen": 149523170, + "step": 6967, + "time_per_iteration": 2.7391014099121094 + }, + { + "auxiliary_loss_clip": 0.01003366, + "auxiliary_loss_mlp": 0.01004488, + "balance_loss_clip": 1.00391746, + "balance_loss_mlp": 1.00264025, + "epoch": 0.4189388245904103, + "flos": 61371336516480.0, + "grad_norm": 0.753685139565028, + "language_loss": 0.46213254, + "learning_rate": 2.612178751609011e-06, + "loss": 0.48221108, + "num_input_tokens_seen": 149583955, + "step": 6968, + "time_per_iteration": 3.102529287338257 + }, + { + "auxiliary_loss_clip": 0.01069264, + "auxiliary_loss_mlp": 0.01039187, + "balance_loss_clip": 1.02936149, + "balance_loss_mlp": 1.02592468, + "epoch": 0.4189989478430783, + "flos": 28215624119040.0, + "grad_norm": 1.7860359484989763, + "language_loss": 0.74833816, + "learning_rate": 2.6118079714007685e-06, + "loss": 0.76942265, + "num_input_tokens_seen": 149604440, + "step": 6969, + "time_per_iteration": 2.6265389919281006 + }, + { + "auxiliary_loss_clip": 0.01054185, + "auxiliary_loss_mlp": 0.01035429, + "balance_loss_clip": 1.02747345, + "balance_loss_mlp": 1.0238533, + "epoch": 0.4190590710957463, + "flos": 24565879088640.0, + "grad_norm": 2.236580287821887, + "language_loss": 0.81061214, + "learning_rate": 2.611437167992705e-06, + "loss": 0.83150834, + "num_input_tokens_seen": 149623745, + "step": 6970, + "time_per_iteration": 2.6368935108184814 + }, + { + "auxiliary_loss_clip": 0.01062988, + "auxiliary_loss_mlp": 0.01033915, + "balance_loss_clip": 1.02826846, + "balance_loss_mlp": 1.02130294, + "epoch": 0.41911919434841427, + "flos": 21726027964800.0, + "grad_norm": 2.8983709774271214, + "language_loss": 0.8331548, + "learning_rate": 2.6110663413988835e-06, + "loss": 0.85412389, + "num_input_tokens_seen": 149643025, + "step": 6971, + "time_per_iteration": 2.57991361618042 + }, + { + "auxiliary_loss_clip": 0.01054166, + "auxiliary_loss_mlp": 0.01033487, + "balance_loss_clip": 1.02898526, + "balance_loss_mlp": 1.02019489, + "epoch": 0.41917931760108224, + "flos": 17601543855360.0, + "grad_norm": 1.6775380796740658, + "language_loss": 0.74582607, + "learning_rate": 2.6106954916333648e-06, + "loss": 0.76670259, + "num_input_tokens_seen": 149660695, + "step": 6972, + "time_per_iteration": 2.708550453186035 + }, + { + "auxiliary_loss_clip": 0.01042241, + "auxiliary_loss_mlp": 0.01033129, + "balance_loss_clip": 1.02360916, + "balance_loss_mlp": 1.0204215, + "epoch": 0.4192394408537502, + "flos": 37816701022080.0, + "grad_norm": 1.413275991054933, + "language_loss": 0.72644371, + "learning_rate": 2.610324618710212e-06, + "loss": 0.74719739, + "num_input_tokens_seen": 149682040, + "step": 6973, + "time_per_iteration": 2.7732584476470947 + }, + { + "auxiliary_loss_clip": 0.01051532, + "auxiliary_loss_mlp": 0.01043316, + "balance_loss_clip": 1.03167868, + "balance_loss_mlp": 1.03021526, + "epoch": 0.41929956410641817, + "flos": 23107726477440.0, + "grad_norm": 1.8674892158974417, + "language_loss": 0.74597657, + "learning_rate": 2.609953722643489e-06, + "loss": 0.7669251, + "num_input_tokens_seen": 149700855, + "step": 6974, + "time_per_iteration": 2.6721248626708984 + }, + { + "auxiliary_loss_clip": 0.01061431, + "auxiliary_loss_mlp": 0.0102935, + "balance_loss_clip": 1.02620566, + "balance_loss_mlp": 1.01726842, + "epoch": 0.41935968735908613, + "flos": 22524537260160.0, + "grad_norm": 2.0430421149014424, + "language_loss": 0.72490817, + "learning_rate": 2.609582803447259e-06, + "loss": 0.74581599, + "num_input_tokens_seen": 149717360, + "step": 6975, + "time_per_iteration": 2.679788827896118 + }, + { + "auxiliary_loss_clip": 0.01053736, + "auxiliary_loss_mlp": 0.01031063, + "balance_loss_clip": 1.02532816, + "balance_loss_mlp": 1.01801586, + "epoch": 0.4194198106117541, + "flos": 26870446759680.0, + "grad_norm": 1.626033175636416, + "language_loss": 0.80825984, + "learning_rate": 2.6092118611355885e-06, + "loss": 0.82910776, + "num_input_tokens_seen": 149738975, + "step": 6976, + "time_per_iteration": 2.6521835327148438 + }, + { + "auxiliary_loss_clip": 0.0103959, + "auxiliary_loss_mlp": 0.01031795, + "balance_loss_clip": 1.02393699, + "balance_loss_mlp": 1.01834786, + "epoch": 0.41947993386442206, + "flos": 19902412425600.0, + "grad_norm": 1.9821005111600494, + "language_loss": 0.67348206, + "learning_rate": 2.6088408957225425e-06, + "loss": 0.69419593, + "num_input_tokens_seen": 149757055, + "step": 6977, + "time_per_iteration": 4.284152030944824 + }, + { + "auxiliary_loss_clip": 0.01066486, + "auxiliary_loss_mlp": 0.01035409, + "balance_loss_clip": 1.02899861, + "balance_loss_mlp": 1.02312446, + "epoch": 0.41954005711709, + "flos": 17383889393280.0, + "grad_norm": 2.246125276004894, + "language_loss": 0.80608571, + "learning_rate": 2.6084699072221898e-06, + "loss": 0.82710469, + "num_input_tokens_seen": 149772885, + "step": 6978, + "time_per_iteration": 4.374671697616577 + }, + { + "auxiliary_loss_clip": 0.01075052, + "auxiliary_loss_mlp": 0.01036402, + "balance_loss_clip": 1.02661669, + "balance_loss_mlp": 1.0230093, + "epoch": 0.419600180369758, + "flos": 25003306915200.0, + "grad_norm": 1.8618000534991188, + "language_loss": 0.82728922, + "learning_rate": 2.6080988956485964e-06, + "loss": 0.84840369, + "num_input_tokens_seen": 149791515, + "step": 6979, + "time_per_iteration": 2.633162260055542 + }, + { + "auxiliary_loss_clip": 0.01073071, + "auxiliary_loss_mlp": 0.01032474, + "balance_loss_clip": 1.02719212, + "balance_loss_mlp": 1.02015924, + "epoch": 0.41966030362242596, + "flos": 17383781652480.0, + "grad_norm": 1.7313470215612266, + "language_loss": 0.83578807, + "learning_rate": 2.6077278610158325e-06, + "loss": 0.85684353, + "num_input_tokens_seen": 149807250, + "step": 6980, + "time_per_iteration": 2.520451068878174 + }, + { + "auxiliary_loss_clip": 0.01075231, + "auxiliary_loss_mlp": 0.01035257, + "balance_loss_clip": 1.0273068, + "balance_loss_mlp": 1.02289534, + "epoch": 0.4197204268750939, + "flos": 22156165330560.0, + "grad_norm": 2.980713722934616, + "language_loss": 0.79025924, + "learning_rate": 2.6073568033379665e-06, + "loss": 0.81136411, + "num_input_tokens_seen": 149821640, + "step": 6981, + "time_per_iteration": 2.5475616455078125 + }, + { + "auxiliary_loss_clip": 0.01042985, + "auxiliary_loss_mlp": 0.01031498, + "balance_loss_clip": 1.02758479, + "balance_loss_mlp": 1.0192728, + "epoch": 0.4197805501277619, + "flos": 22084128604800.0, + "grad_norm": 1.9337279106500502, + "language_loss": 0.84303498, + "learning_rate": 2.6069857226290696e-06, + "loss": 0.86377984, + "num_input_tokens_seen": 149840545, + "step": 6982, + "time_per_iteration": 2.773463726043701 + }, + { + "auxiliary_loss_clip": 0.01061507, + "auxiliary_loss_mlp": 0.0103829, + "balance_loss_clip": 1.02775812, + "balance_loss_mlp": 1.02471197, + "epoch": 0.4198406733804299, + "flos": 26432192920320.0, + "grad_norm": 3.8365210321492182, + "language_loss": 0.56884646, + "learning_rate": 2.606614618903214e-06, + "loss": 0.58984447, + "num_input_tokens_seen": 149860375, + "step": 6983, + "time_per_iteration": 2.6486217975616455 + }, + { + "auxiliary_loss_clip": 0.01063409, + "auxiliary_loss_mlp": 0.01033133, + "balance_loss_clip": 1.02847362, + "balance_loss_mlp": 1.02128959, + "epoch": 0.4199007966330979, + "flos": 12531029293440.0, + "grad_norm": 2.1765245763850656, + "language_loss": 0.81901181, + "learning_rate": 2.606243492174471e-06, + "loss": 0.83997726, + "num_input_tokens_seen": 149877850, + "step": 6984, + "time_per_iteration": 2.6423633098602295 + }, + { + "auxiliary_loss_clip": 0.01053365, + "auxiliary_loss_mlp": 0.01029544, + "balance_loss_clip": 1.02545869, + "balance_loss_mlp": 1.01629376, + "epoch": 0.41996091988576584, + "flos": 21762944167680.0, + "grad_norm": 1.985023040223349, + "language_loss": 0.78926873, + "learning_rate": 2.605872342456914e-06, + "loss": 0.81009781, + "num_input_tokens_seen": 149896110, + "step": 6985, + "time_per_iteration": 2.7351958751678467 + }, + { + "auxiliary_loss_clip": 0.01077243, + "auxiliary_loss_mlp": 0.01032524, + "balance_loss_clip": 1.02710676, + "balance_loss_mlp": 1.01883292, + "epoch": 0.4200210431384338, + "flos": 26541935948160.0, + "grad_norm": 1.7609537416433145, + "language_loss": 0.78073621, + "learning_rate": 2.6055011697646173e-06, + "loss": 0.80183387, + "num_input_tokens_seen": 149916495, + "step": 6986, + "time_per_iteration": 4.191808462142944 + }, + { + "auxiliary_loss_clip": 0.01052534, + "auxiliary_loss_mlp": 0.01031116, + "balance_loss_clip": 1.02820277, + "balance_loss_mlp": 1.0195111, + "epoch": 0.42008116639110177, + "flos": 26795824254720.0, + "grad_norm": 1.760149028739, + "language_loss": 0.71812481, + "learning_rate": 2.605129974111655e-06, + "loss": 0.73896128, + "num_input_tokens_seen": 149936445, + "step": 6987, + "time_per_iteration": 2.9491143226623535 + }, + { + "auxiliary_loss_clip": 0.01048363, + "auxiliary_loss_mlp": 0.00747661, + "balance_loss_clip": 1.02642679, + "balance_loss_mlp": 1.00038946, + "epoch": 0.42014128964376973, + "flos": 32087333243520.0, + "grad_norm": 1.4403423268506592, + "language_loss": 0.75045174, + "learning_rate": 2.604758755512104e-06, + "loss": 0.76841199, + "num_input_tokens_seen": 149959430, + "step": 6988, + "time_per_iteration": 2.7360596656799316 + }, + { + "auxiliary_loss_clip": 0.01067054, + "auxiliary_loss_mlp": 0.01032268, + "balance_loss_clip": 1.0277431, + "balance_loss_mlp": 1.01901245, + "epoch": 0.4202014128964377, + "flos": 26467133875200.0, + "grad_norm": 1.5613168351704032, + "language_loss": 0.7394464, + "learning_rate": 2.60438751398004e-06, + "loss": 0.76043963, + "num_input_tokens_seen": 149980365, + "step": 6989, + "time_per_iteration": 4.282067775726318 + }, + { + "auxiliary_loss_clip": 0.01053975, + "auxiliary_loss_mlp": 0.01031147, + "balance_loss_clip": 1.02608132, + "balance_loss_mlp": 1.0187906, + "epoch": 0.42026153614910566, + "flos": 13401216178560.0, + "grad_norm": 1.987893451517959, + "language_loss": 0.71191585, + "learning_rate": 2.6040162495295404e-06, + "loss": 0.73276711, + "num_input_tokens_seen": 149997375, + "step": 6990, + "time_per_iteration": 2.764486312866211 + }, + { + "auxiliary_loss_clip": 0.01015148, + "auxiliary_loss_mlp": 0.00746427, + "balance_loss_clip": 1.01659596, + "balance_loss_mlp": 1.00004387, + "epoch": 0.42032165940177363, + "flos": 60250457635200.0, + "grad_norm": 0.8300451758856352, + "language_loss": 0.60455954, + "learning_rate": 2.603644962174685e-06, + "loss": 0.62217534, + "num_input_tokens_seen": 150051230, + "step": 6991, + "time_per_iteration": 3.077690362930298 + }, + { + "auxiliary_loss_clip": 0.01077143, + "auxiliary_loss_mlp": 0.0103786, + "balance_loss_clip": 1.02878785, + "balance_loss_mlp": 1.02463424, + "epoch": 0.4203817826544416, + "flos": 24535211852160.0, + "grad_norm": 1.5595862035081867, + "language_loss": 0.83008832, + "learning_rate": 2.6032736519295517e-06, + "loss": 0.85123837, + "num_input_tokens_seen": 150071135, + "step": 6992, + "time_per_iteration": 2.6257448196411133 + }, + { + "auxiliary_loss_clip": 0.01011147, + "auxiliary_loss_mlp": 0.01002714, + "balance_loss_clip": 1.00197005, + "balance_loss_mlp": 1.00117588, + "epoch": 0.42044190590710956, + "flos": 58820781530880.0, + "grad_norm": 0.952399219677022, + "language_loss": 0.65541506, + "learning_rate": 2.6029023188082217e-06, + "loss": 0.67555368, + "num_input_tokens_seen": 150125220, + "step": 6993, + "time_per_iteration": 3.1419553756713867 + }, + { + "auxiliary_loss_clip": 0.01078863, + "auxiliary_loss_mlp": 0.01035748, + "balance_loss_clip": 1.02860141, + "balance_loss_mlp": 1.02064466, + "epoch": 0.4205020291597775, + "flos": 16436063260800.0, + "grad_norm": 1.9325829214450165, + "language_loss": 0.83426869, + "learning_rate": 2.6025309628247746e-06, + "loss": 0.85541481, + "num_input_tokens_seen": 150142300, + "step": 6994, + "time_per_iteration": 2.638015031814575 + }, + { + "auxiliary_loss_clip": 0.01066471, + "auxiliary_loss_mlp": 0.00747741, + "balance_loss_clip": 1.02994919, + "balance_loss_mlp": 1.00036895, + "epoch": 0.4205621524124455, + "flos": 18405655672320.0, + "grad_norm": 1.665683482795245, + "language_loss": 0.78034419, + "learning_rate": 2.6021595839932934e-06, + "loss": 0.79848635, + "num_input_tokens_seen": 150161345, + "step": 6995, + "time_per_iteration": 2.6521823406219482 + }, + { + "auxiliary_loss_clip": 0.01036513, + "auxiliary_loss_mlp": 0.01028772, + "balance_loss_clip": 1.02603197, + "balance_loss_mlp": 1.01676714, + "epoch": 0.4206222756651135, + "flos": 25520097841920.0, + "grad_norm": 1.407068665682573, + "language_loss": 0.80202824, + "learning_rate": 2.60178818232786e-06, + "loss": 0.82268107, + "num_input_tokens_seen": 150182420, + "step": 6996, + "time_per_iteration": 2.9116153717041016 + }, + { + "auxiliary_loss_clip": 0.0105921, + "auxiliary_loss_mlp": 0.00747796, + "balance_loss_clip": 1.02988982, + "balance_loss_mlp": 1.00040436, + "epoch": 0.4206823989177815, + "flos": 15304338472320.0, + "grad_norm": 2.1359360295923038, + "language_loss": 0.75688803, + "learning_rate": 2.601416757842559e-06, + "loss": 0.77495813, + "num_input_tokens_seen": 150200175, + "step": 6997, + "time_per_iteration": 2.779381036758423 + }, + { + "auxiliary_loss_clip": 0.01074585, + "auxiliary_loss_mlp": 0.01040576, + "balance_loss_clip": 1.02704835, + "balance_loss_mlp": 1.02747464, + "epoch": 0.42074252217044944, + "flos": 15554096714880.0, + "grad_norm": 1.9332427297067085, + "language_loss": 0.75863516, + "learning_rate": 2.6010453105514743e-06, + "loss": 0.77978671, + "num_input_tokens_seen": 150217100, + "step": 6998, + "time_per_iteration": 2.5287816524505615 + }, + { + "auxiliary_loss_clip": 0.01079609, + "auxiliary_loss_mlp": 0.01036391, + "balance_loss_clip": 1.03065777, + "balance_loss_mlp": 1.02303362, + "epoch": 0.4208026454231174, + "flos": 26145877610880.0, + "grad_norm": 1.581588871758225, + "language_loss": 0.76074332, + "learning_rate": 2.60067384046869e-06, + "loss": 0.78190327, + "num_input_tokens_seen": 150239830, + "step": 6999, + "time_per_iteration": 2.7333478927612305 + }, + { + "auxiliary_loss_clip": 0.01030283, + "auxiliary_loss_mlp": 0.01041039, + "balance_loss_clip": 1.02563429, + "balance_loss_mlp": 1.0265727, + "epoch": 0.42086276867578537, + "flos": 23550110380800.0, + "grad_norm": 2.2395760533835514, + "language_loss": 0.64121103, + "learning_rate": 2.600302347608295e-06, + "loss": 0.66192424, + "num_input_tokens_seen": 150260690, + "step": 7000, + "time_per_iteration": 2.853625774383545 + }, + { + "auxiliary_loss_clip": 0.01038325, + "auxiliary_loss_mlp": 0.01039336, + "balance_loss_clip": 1.0272752, + "balance_loss_mlp": 1.02569842, + "epoch": 0.42092289192845334, + "flos": 18113414618880.0, + "grad_norm": 1.5362580996152884, + "language_loss": 0.76299691, + "learning_rate": 2.5999308319843743e-06, + "loss": 0.78377348, + "num_input_tokens_seen": 150279885, + "step": 7001, + "time_per_iteration": 2.6821563243865967 + }, + { + "auxiliary_loss_clip": 0.01038711, + "auxiliary_loss_mlp": 0.00747675, + "balance_loss_clip": 1.02753615, + "balance_loss_mlp": 1.00044096, + "epoch": 0.4209830151811213, + "flos": 20006588845440.0, + "grad_norm": 1.5094885321633562, + "language_loss": 0.86475503, + "learning_rate": 2.5995592936110154e-06, + "loss": 0.88261896, + "num_input_tokens_seen": 150297390, + "step": 7002, + "time_per_iteration": 2.6784703731536865 + }, + { + "auxiliary_loss_clip": 0.0104419, + "auxiliary_loss_mlp": 0.01038483, + "balance_loss_clip": 1.02758431, + "balance_loss_mlp": 1.02676511, + "epoch": 0.42104313843378927, + "flos": 21978946604160.0, + "grad_norm": 1.8985705228763012, + "language_loss": 0.67649055, + "learning_rate": 2.5991877325023096e-06, + "loss": 0.69731736, + "num_input_tokens_seen": 150317390, + "step": 7003, + "time_per_iteration": 2.7640650272369385 + }, + { + "auxiliary_loss_clip": 0.01077625, + "auxiliary_loss_mlp": 0.01035912, + "balance_loss_clip": 1.0287931, + "balance_loss_mlp": 1.02222121, + "epoch": 0.42110326168645723, + "flos": 25443966965760.0, + "grad_norm": 1.8474137472015137, + "language_loss": 0.77258635, + "learning_rate": 2.598816148672344e-06, + "loss": 0.79372168, + "num_input_tokens_seen": 150337455, + "step": 7004, + "time_per_iteration": 2.602440357208252 + }, + { + "auxiliary_loss_clip": 0.0107532, + "auxiliary_loss_mlp": 0.01033486, + "balance_loss_clip": 1.02976155, + "balance_loss_mlp": 1.02015889, + "epoch": 0.4211633849391252, + "flos": 17822574195840.0, + "grad_norm": 2.9361400101428976, + "language_loss": 0.68474889, + "learning_rate": 2.59844454213521e-06, + "loss": 0.70583701, + "num_input_tokens_seen": 150355385, + "step": 7005, + "time_per_iteration": 2.6812961101531982 + }, + { + "auxiliary_loss_clip": 0.01065519, + "auxiliary_loss_mlp": 0.01037598, + "balance_loss_clip": 1.02752841, + "balance_loss_mlp": 1.02472329, + "epoch": 0.42122350819179316, + "flos": 16282436791680.0, + "grad_norm": 2.1680153930650956, + "language_loss": 0.72558057, + "learning_rate": 2.5980729129049994e-06, + "loss": 0.74661177, + "num_input_tokens_seen": 150371750, + "step": 7006, + "time_per_iteration": 2.5746123790740967 + }, + { + "auxiliary_loss_clip": 0.0107648, + "auxiliary_loss_mlp": 0.01031724, + "balance_loss_clip": 1.02886939, + "balance_loss_mlp": 1.01898038, + "epoch": 0.4212836314444611, + "flos": 19645866512640.0, + "grad_norm": 1.5374592479804432, + "language_loss": 0.70463836, + "learning_rate": 2.5977012609958033e-06, + "loss": 0.72572041, + "num_input_tokens_seen": 150389955, + "step": 7007, + "time_per_iteration": 2.5930609703063965 + }, + { + "auxiliary_loss_clip": 0.01051898, + "auxiliary_loss_mlp": 0.00747731, + "balance_loss_clip": 1.02624679, + "balance_loss_mlp": 1.0005033, + "epoch": 0.4213437546971291, + "flos": 18369026778240.0, + "grad_norm": 1.7780380459619405, + "language_loss": 0.82699883, + "learning_rate": 2.5973295864217166e-06, + "loss": 0.84499514, + "num_input_tokens_seen": 150405780, + "step": 7008, + "time_per_iteration": 2.5456082820892334 + }, + { + "auxiliary_loss_clip": 0.01043634, + "auxiliary_loss_mlp": 0.01036478, + "balance_loss_clip": 1.02705204, + "balance_loss_mlp": 1.02329314, + "epoch": 0.42140387794979706, + "flos": 27704507541120.0, + "grad_norm": 1.783732425132501, + "language_loss": 0.71851373, + "learning_rate": 2.596957889196831e-06, + "loss": 0.73931479, + "num_input_tokens_seen": 150425615, + "step": 7009, + "time_per_iteration": 2.7023117542266846 + }, + { + "auxiliary_loss_clip": 0.01076167, + "auxiliary_loss_mlp": 0.01028463, + "balance_loss_clip": 1.02899277, + "balance_loss_mlp": 1.015553, + "epoch": 0.4214640012024651, + "flos": 28147071012480.0, + "grad_norm": 2.8725393345246015, + "language_loss": 0.66222405, + "learning_rate": 2.596586169335243e-06, + "loss": 0.68327034, + "num_input_tokens_seen": 150445765, + "step": 7010, + "time_per_iteration": 2.5838820934295654 + }, + { + "auxiliary_loss_clip": 0.01040466, + "auxiliary_loss_mlp": 0.01029983, + "balance_loss_clip": 1.02573609, + "balance_loss_mlp": 1.01695299, + "epoch": 0.42152412445513304, + "flos": 22997265177600.0, + "grad_norm": 1.5119782359760336, + "language_loss": 0.72372711, + "learning_rate": 2.5962144268510477e-06, + "loss": 0.74443161, + "num_input_tokens_seen": 150464405, + "step": 7011, + "time_per_iteration": 2.7377352714538574 + }, + { + "auxiliary_loss_clip": 0.01007106, + "auxiliary_loss_mlp": 0.01003405, + "balance_loss_clip": 1.00708091, + "balance_loss_mlp": 1.00184381, + "epoch": 0.421584247707801, + "flos": 63749592938880.0, + "grad_norm": 0.7989371339847453, + "language_loss": 0.54330426, + "learning_rate": 2.5958426617583417e-06, + "loss": 0.56340939, + "num_input_tokens_seen": 150520430, + "step": 7012, + "time_per_iteration": 3.0425705909729004 + }, + { + "auxiliary_loss_clip": 0.01067016, + "auxiliary_loss_mlp": 0.01034128, + "balance_loss_clip": 1.02832437, + "balance_loss_mlp": 1.02054381, + "epoch": 0.421644370960469, + "flos": 24314612474880.0, + "grad_norm": 1.786469764483999, + "language_loss": 0.78702694, + "learning_rate": 2.5954708740712215e-06, + "loss": 0.80803847, + "num_input_tokens_seen": 150542610, + "step": 7013, + "time_per_iteration": 2.6139943599700928 + }, + { + "auxiliary_loss_clip": 0.01075571, + "auxiliary_loss_mlp": 0.01033245, + "balance_loss_clip": 1.02755725, + "balance_loss_mlp": 1.0193274, + "epoch": 0.42170449421313694, + "flos": 23440690575360.0, + "grad_norm": 1.8830042473666122, + "language_loss": 0.81474316, + "learning_rate": 2.595099063803787e-06, + "loss": 0.83583128, + "num_input_tokens_seen": 150560970, + "step": 7014, + "time_per_iteration": 2.5634610652923584 + }, + { + "auxiliary_loss_clip": 0.01059946, + "auxiliary_loss_mlp": 0.01032383, + "balance_loss_clip": 1.02473366, + "balance_loss_mlp": 1.01883483, + "epoch": 0.4217646174658049, + "flos": 23695476721920.0, + "grad_norm": 1.581384334160082, + "language_loss": 0.77754408, + "learning_rate": 2.5947272309701354e-06, + "loss": 0.7984674, + "num_input_tokens_seen": 150582615, + "step": 7015, + "time_per_iteration": 2.680536985397339 + }, + { + "auxiliary_loss_clip": 0.01075225, + "auxiliary_loss_mlp": 0.01036218, + "balance_loss_clip": 1.02734637, + "balance_loss_mlp": 1.02275956, + "epoch": 0.42182474071847287, + "flos": 24971562270720.0, + "grad_norm": 1.3492506190779023, + "language_loss": 0.82163489, + "learning_rate": 2.594355375584368e-06, + "loss": 0.8427493, + "num_input_tokens_seen": 150603640, + "step": 7016, + "time_per_iteration": 2.645228147506714 + }, + { + "auxiliary_loss_clip": 0.01036048, + "auxiliary_loss_mlp": 0.01031755, + "balance_loss_clip": 1.02510643, + "balance_loss_mlp": 1.01864195, + "epoch": 0.42188486397114083, + "flos": 22856639431680.0, + "grad_norm": 2.4996254949738144, + "language_loss": 0.68054271, + "learning_rate": 2.593983497660586e-06, + "loss": 0.70122075, + "num_input_tokens_seen": 150622490, + "step": 7017, + "time_per_iteration": 2.743821144104004 + }, + { + "auxiliary_loss_clip": 0.01003343, + "auxiliary_loss_mlp": 0.01002612, + "balance_loss_clip": 1.00378966, + "balance_loss_mlp": 1.00085974, + "epoch": 0.4219449872238088, + "flos": 66975700965120.0, + "grad_norm": 0.6772865143584262, + "language_loss": 0.59481114, + "learning_rate": 2.5936115972128895e-06, + "loss": 0.61487067, + "num_input_tokens_seen": 150689545, + "step": 7018, + "time_per_iteration": 3.2740049362182617 + }, + { + "auxiliary_loss_clip": 0.01057668, + "auxiliary_loss_mlp": 0.01035853, + "balance_loss_clip": 1.02492046, + "balance_loss_mlp": 1.02250719, + "epoch": 0.42200511047647676, + "flos": 13115367745920.0, + "grad_norm": 1.7286583130599678, + "language_loss": 0.74909472, + "learning_rate": 2.593239674255382e-06, + "loss": 0.77002996, + "num_input_tokens_seen": 150707610, + "step": 7019, + "time_per_iteration": 2.6978931427001953 + }, + { + "auxiliary_loss_clip": 0.01053824, + "auxiliary_loss_mlp": 0.01032831, + "balance_loss_clip": 1.0300976, + "balance_loss_mlp": 1.01894283, + "epoch": 0.42206523372914473, + "flos": 13991193066240.0, + "grad_norm": 1.8092780552138428, + "language_loss": 0.68802804, + "learning_rate": 2.592867728802166e-06, + "loss": 0.70889461, + "num_input_tokens_seen": 150724530, + "step": 7020, + "time_per_iteration": 2.6187126636505127 + }, + { + "auxiliary_loss_clip": 0.01054386, + "auxiliary_loss_mlp": 0.00747751, + "balance_loss_clip": 1.02890348, + "balance_loss_mlp": 1.00043678, + "epoch": 0.4221253569818127, + "flos": 21942317710080.0, + "grad_norm": 1.5473154610082895, + "language_loss": 0.80799353, + "learning_rate": 2.592495760867347e-06, + "loss": 0.82601488, + "num_input_tokens_seen": 150742870, + "step": 7021, + "time_per_iteration": 2.7508349418640137 + }, + { + "auxiliary_loss_clip": 0.00999592, + "auxiliary_loss_mlp": 0.01048538, + "balance_loss_clip": 1.02112103, + "balance_loss_mlp": 1.03356576, + "epoch": 0.42218548023448066, + "flos": 32192587071360.0, + "grad_norm": 1.9932822990840529, + "language_loss": 0.69754404, + "learning_rate": 2.5921237704650293e-06, + "loss": 0.71802533, + "num_input_tokens_seen": 150765500, + "step": 7022, + "time_per_iteration": 2.8822617530822754 + }, + { + "auxiliary_loss_clip": 0.01060385, + "auxiliary_loss_mlp": 0.0102848, + "balance_loss_clip": 1.02781725, + "balance_loss_mlp": 1.01751852, + "epoch": 0.4222456034871487, + "flos": 30118961894400.0, + "grad_norm": 1.621340258791183, + "language_loss": 0.67238772, + "learning_rate": 2.5917517576093188e-06, + "loss": 0.69327629, + "num_input_tokens_seen": 150784945, + "step": 7023, + "time_per_iteration": 2.776442527770996 + }, + { + "auxiliary_loss_clip": 0.01047048, + "auxiliary_loss_mlp": 0.01041014, + "balance_loss_clip": 1.02754664, + "balance_loss_mlp": 1.02618396, + "epoch": 0.42230572673981664, + "flos": 22127904305280.0, + "grad_norm": 2.6326977142923833, + "language_loss": 0.69571161, + "learning_rate": 2.591379722314322e-06, + "loss": 0.71659219, + "num_input_tokens_seen": 150803120, + "step": 7024, + "time_per_iteration": 4.273206949234009 + }, + { + "auxiliary_loss_clip": 0.01075468, + "auxiliary_loss_mlp": 0.01034538, + "balance_loss_clip": 1.02943516, + "balance_loss_mlp": 1.02172303, + "epoch": 0.4223658499924846, + "flos": 22055077480320.0, + "grad_norm": 1.5699671739037548, + "language_loss": 0.76805794, + "learning_rate": 2.591007664594147e-06, + "loss": 0.78915799, + "num_input_tokens_seen": 150823135, + "step": 7025, + "time_per_iteration": 4.4758827686309814 + }, + { + "auxiliary_loss_clip": 0.01042105, + "auxiliary_loss_mlp": 0.01035297, + "balance_loss_clip": 1.02537525, + "balance_loss_mlp": 1.02263737, + "epoch": 0.4224259732451526, + "flos": 20410727742720.0, + "grad_norm": 1.769455312233292, + "language_loss": 0.79432738, + "learning_rate": 2.5906355844629024e-06, + "loss": 0.81510139, + "num_input_tokens_seen": 150842070, + "step": 7026, + "time_per_iteration": 2.7358858585357666 + }, + { + "auxiliary_loss_clip": 0.01011638, + "auxiliary_loss_mlp": 0.01003279, + "balance_loss_clip": 1.00236177, + "balance_loss_mlp": 1.00165796, + "epoch": 0.42248609649782054, + "flos": 62846655828480.0, + "grad_norm": 0.7278964550768179, + "language_loss": 0.61938089, + "learning_rate": 2.5902634819346966e-06, + "loss": 0.63953006, + "num_input_tokens_seen": 150907450, + "step": 7027, + "time_per_iteration": 3.2410404682159424 + }, + { + "auxiliary_loss_clip": 0.01074507, + "auxiliary_loss_mlp": 0.01034272, + "balance_loss_clip": 1.02927017, + "balance_loss_mlp": 1.02192795, + "epoch": 0.4225462197504885, + "flos": 26249946289920.0, + "grad_norm": 2.447107388406867, + "language_loss": 0.71324182, + "learning_rate": 2.5898913570236414e-06, + "loss": 0.73432958, + "num_input_tokens_seen": 150928040, + "step": 7028, + "time_per_iteration": 2.61008620262146 + }, + { + "auxiliary_loss_clip": 0.01049025, + "auxiliary_loss_mlp": 0.01038523, + "balance_loss_clip": 1.02635574, + "balance_loss_mlp": 1.02607179, + "epoch": 0.42260634300315647, + "flos": 20521943228160.0, + "grad_norm": 1.9353049949616166, + "language_loss": 0.82618535, + "learning_rate": 2.589519209743846e-06, + "loss": 0.8470608, + "num_input_tokens_seen": 150945760, + "step": 7029, + "time_per_iteration": 2.8027899265289307 + }, + { + "auxiliary_loss_clip": 0.01034612, + "auxiliary_loss_mlp": 0.01035802, + "balance_loss_clip": 1.02778947, + "balance_loss_mlp": 1.02219486, + "epoch": 0.42266646625582444, + "flos": 24316731377280.0, + "grad_norm": 1.91352305064604, + "language_loss": 0.75347912, + "learning_rate": 2.589147040109424e-06, + "loss": 0.77418327, + "num_input_tokens_seen": 150965665, + "step": 7030, + "time_per_iteration": 2.6926681995391846 + }, + { + "auxiliary_loss_clip": 0.010722, + "auxiliary_loss_mlp": 0.01034004, + "balance_loss_clip": 1.0258714, + "balance_loss_mlp": 1.02098012, + "epoch": 0.4227265895084924, + "flos": 24204151175040.0, + "grad_norm": 2.2535283404802184, + "language_loss": 0.86578089, + "learning_rate": 2.588774848134486e-06, + "loss": 0.88684297, + "num_input_tokens_seen": 150982260, + "step": 7031, + "time_per_iteration": 2.541271209716797 + }, + { + "auxiliary_loss_clip": 0.01067653, + "auxiliary_loss_mlp": 0.01036354, + "balance_loss_clip": 1.0312494, + "balance_loss_mlp": 1.02271688, + "epoch": 0.42278671276116037, + "flos": 16909760845440.0, + "grad_norm": 2.7069849456186876, + "language_loss": 0.73174864, + "learning_rate": 2.5884026338331473e-06, + "loss": 0.75278878, + "num_input_tokens_seen": 150999990, + "step": 7032, + "time_per_iteration": 2.5395021438598633 + }, + { + "auxiliary_loss_clip": 0.01043359, + "auxiliary_loss_mlp": 0.01042401, + "balance_loss_clip": 1.02455783, + "balance_loss_mlp": 1.02859032, + "epoch": 0.42284683601382833, + "flos": 25411073086080.0, + "grad_norm": 1.9278186186615933, + "language_loss": 0.70500886, + "learning_rate": 2.5880303972195222e-06, + "loss": 0.72586644, + "num_input_tokens_seen": 151021105, + "step": 7033, + "time_per_iteration": 4.2685136795043945 + }, + { + "auxiliary_loss_clip": 0.0104988, + "auxiliary_loss_mlp": 0.00747827, + "balance_loss_clip": 1.02788174, + "balance_loss_mlp": 1.0005188, + "epoch": 0.4229069592664963, + "flos": 23040322606080.0, + "grad_norm": 8.281802155791016, + "language_loss": 0.90208828, + "learning_rate": 2.5876581383077256e-06, + "loss": 0.92006528, + "num_input_tokens_seen": 151040665, + "step": 7034, + "time_per_iteration": 2.6617891788482666 + }, + { + "auxiliary_loss_clip": 0.0105457, + "auxiliary_loss_mlp": 0.01035153, + "balance_loss_clip": 1.02850175, + "balance_loss_mlp": 1.02286863, + "epoch": 0.42296708251916426, + "flos": 26067448264320.0, + "grad_norm": 1.492391138138911, + "language_loss": 0.77179426, + "learning_rate": 2.5872858571118723e-06, + "loss": 0.79269147, + "num_input_tokens_seen": 151061240, + "step": 7035, + "time_per_iteration": 2.6718757152557373 + }, + { + "auxiliary_loss_clip": 0.01067619, + "auxiliary_loss_mlp": 0.0103907, + "balance_loss_clip": 1.03020918, + "balance_loss_mlp": 1.02592731, + "epoch": 0.4230272057718323, + "flos": 19458376496640.0, + "grad_norm": 1.7982734777673137, + "language_loss": 0.8267104, + "learning_rate": 2.5869135536460817e-06, + "loss": 0.84777725, + "num_input_tokens_seen": 151076870, + "step": 7036, + "time_per_iteration": 2.611799955368042 + }, + { + "auxiliary_loss_clip": 0.01052774, + "auxiliary_loss_mlp": 0.01030043, + "balance_loss_clip": 1.02901399, + "balance_loss_mlp": 1.01758599, + "epoch": 0.42308732902450025, + "flos": 22383300983040.0, + "grad_norm": 1.6969846031667997, + "language_loss": 0.70167708, + "learning_rate": 2.58654122792447e-06, + "loss": 0.72250527, + "num_input_tokens_seen": 151095110, + "step": 7037, + "time_per_iteration": 4.275037050247192 + }, + { + "auxiliary_loss_clip": 0.0104298, + "auxiliary_loss_mlp": 0.00747844, + "balance_loss_clip": 1.0271244, + "balance_loss_mlp": 1.00044215, + "epoch": 0.4231474522771682, + "flos": 20995425331200.0, + "grad_norm": 1.563895936158533, + "language_loss": 0.7815268, + "learning_rate": 2.586168879961155e-06, + "loss": 0.79943508, + "num_input_tokens_seen": 151114355, + "step": 7038, + "time_per_iteration": 2.701900005340576 + }, + { + "auxiliary_loss_clip": 0.01037307, + "auxiliary_loss_mlp": 0.01044339, + "balance_loss_clip": 1.02918744, + "balance_loss_mlp": 1.02985477, + "epoch": 0.4232075755298362, + "flos": 14975863574400.0, + "grad_norm": 22.636856462861562, + "language_loss": 0.66518795, + "learning_rate": 2.585796509770259e-06, + "loss": 0.68600446, + "num_input_tokens_seen": 151131505, + "step": 7039, + "time_per_iteration": 2.705490827560425 + }, + { + "auxiliary_loss_clip": 0.0106735, + "auxiliary_loss_mlp": 0.01035026, + "balance_loss_clip": 1.02776814, + "balance_loss_mlp": 1.02117395, + "epoch": 0.42326769878250414, + "flos": 24532661986560.0, + "grad_norm": 1.7937783590915273, + "language_loss": 0.75837314, + "learning_rate": 2.5854241173658996e-06, + "loss": 0.77939689, + "num_input_tokens_seen": 151151555, + "step": 7040, + "time_per_iteration": 2.9046895503997803 + }, + { + "auxiliary_loss_clip": 0.01064309, + "auxiliary_loss_mlp": 0.01031062, + "balance_loss_clip": 1.0270381, + "balance_loss_mlp": 1.0179553, + "epoch": 0.4233278220351721, + "flos": 26870303105280.0, + "grad_norm": 1.5538065620648813, + "language_loss": 0.65229487, + "learning_rate": 2.5850517027621996e-06, + "loss": 0.67324865, + "num_input_tokens_seen": 151172385, + "step": 7041, + "time_per_iteration": 2.6609199047088623 + }, + { + "auxiliary_loss_clip": 0.01046039, + "auxiliary_loss_mlp": 0.01033815, + "balance_loss_clip": 1.02532077, + "balance_loss_mlp": 1.01983166, + "epoch": 0.4233879452878401, + "flos": 42814927463040.0, + "grad_norm": 1.6829902081275685, + "language_loss": 0.73930413, + "learning_rate": 2.5846792659732803e-06, + "loss": 0.76010263, + "num_input_tokens_seen": 151194930, + "step": 7042, + "time_per_iteration": 2.832458257675171 + }, + { + "auxiliary_loss_clip": 0.01061825, + "auxiliary_loss_mlp": 0.01027064, + "balance_loss_clip": 1.02802575, + "balance_loss_mlp": 1.01543486, + "epoch": 0.42344806854050804, + "flos": 25229006023680.0, + "grad_norm": 1.3449761560622702, + "language_loss": 0.82036972, + "learning_rate": 2.5843068070132643e-06, + "loss": 0.84125865, + "num_input_tokens_seen": 151217905, + "step": 7043, + "time_per_iteration": 2.693683385848999 + }, + { + "auxiliary_loss_clip": 0.01056419, + "auxiliary_loss_mlp": 0.01039623, + "balance_loss_clip": 1.02953649, + "balance_loss_mlp": 1.02511549, + "epoch": 0.423508191793176, + "flos": 22778820616320.0, + "grad_norm": 2.594188813323731, + "language_loss": 0.64783365, + "learning_rate": 2.5839343258962763e-06, + "loss": 0.66879404, + "num_input_tokens_seen": 151234580, + "step": 7044, + "time_per_iteration": 2.8934288024902344 + }, + { + "auxiliary_loss_clip": 0.01058145, + "auxiliary_loss_mlp": 0.01046325, + "balance_loss_clip": 1.02723384, + "balance_loss_mlp": 1.03075635, + "epoch": 0.42356831504584397, + "flos": 34637493179520.0, + "grad_norm": 1.8228892796761487, + "language_loss": 0.75158924, + "learning_rate": 2.5835618226364393e-06, + "loss": 0.77263391, + "num_input_tokens_seen": 151254765, + "step": 7045, + "time_per_iteration": 2.8304519653320312 + }, + { + "auxiliary_loss_clip": 0.01041455, + "auxiliary_loss_mlp": 0.01042596, + "balance_loss_clip": 1.02931738, + "balance_loss_mlp": 1.02844524, + "epoch": 0.42362843829851193, + "flos": 17596767346560.0, + "grad_norm": 2.276490684181688, + "language_loss": 0.80916572, + "learning_rate": 2.5831892972478797e-06, + "loss": 0.83000624, + "num_input_tokens_seen": 151269045, + "step": 7046, + "time_per_iteration": 2.814026117324829 + }, + { + "auxiliary_loss_clip": 0.00994337, + "auxiliary_loss_mlp": 0.01034228, + "balance_loss_clip": 1.02910531, + "balance_loss_mlp": 1.01992261, + "epoch": 0.4236885615511799, + "flos": 22565691267840.0, + "grad_norm": 1.5436989348813097, + "language_loss": 0.76404095, + "learning_rate": 2.5828167497447242e-06, + "loss": 0.78432655, + "num_input_tokens_seen": 151287530, + "step": 7047, + "time_per_iteration": 3.005788803100586 + }, + { + "auxiliary_loss_clip": 0.01075512, + "auxiliary_loss_mlp": 0.01034257, + "balance_loss_clip": 1.03015196, + "balance_loss_mlp": 1.02191257, + "epoch": 0.42374868480384786, + "flos": 26469216864000.0, + "grad_norm": 1.6579229075982567, + "language_loss": 0.67784238, + "learning_rate": 2.582444180141098e-06, + "loss": 0.69894004, + "num_input_tokens_seen": 151308905, + "step": 7048, + "time_per_iteration": 2.8067643642425537 + }, + { + "auxiliary_loss_clip": 0.01059526, + "auxiliary_loss_mlp": 0.01032706, + "balance_loss_clip": 1.02630317, + "balance_loss_mlp": 1.01903903, + "epoch": 0.4238088080565159, + "flos": 20370220179840.0, + "grad_norm": 1.9272155546018312, + "language_loss": 0.78058815, + "learning_rate": 2.5820715884511307e-06, + "loss": 0.80151051, + "num_input_tokens_seen": 151326525, + "step": 7049, + "time_per_iteration": 2.6410322189331055 + }, + { + "auxiliary_loss_clip": 0.0106893, + "auxiliary_loss_mlp": 0.01037977, + "balance_loss_clip": 1.031039, + "balance_loss_mlp": 1.02422643, + "epoch": 0.42386893130918385, + "flos": 21172105353600.0, + "grad_norm": 1.748292710397693, + "language_loss": 0.82454431, + "learning_rate": 2.5816989746889504e-06, + "loss": 0.84561342, + "num_input_tokens_seen": 151344675, + "step": 7050, + "time_per_iteration": 2.6344292163848877 + }, + { + "auxiliary_loss_clip": 0.01074456, + "auxiliary_loss_mlp": 0.0102959, + "balance_loss_clip": 1.02669263, + "balance_loss_mlp": 1.01629269, + "epoch": 0.4239290545618518, + "flos": 17675627656320.0, + "grad_norm": 2.1245255703576804, + "language_loss": 0.73537838, + "learning_rate": 2.581326338868687e-06, + "loss": 0.75641882, + "num_input_tokens_seen": 151360730, + "step": 7051, + "time_per_iteration": 2.7022879123687744 + }, + { + "auxiliary_loss_clip": 0.01045435, + "auxiliary_loss_mlp": 0.01032481, + "balance_loss_clip": 1.02770472, + "balance_loss_mlp": 1.01959491, + "epoch": 0.4239891778145198, + "flos": 24314504734080.0, + "grad_norm": 1.4717967590973555, + "language_loss": 0.86051285, + "learning_rate": 2.5809536810044706e-06, + "loss": 0.88129199, + "num_input_tokens_seen": 151380445, + "step": 7052, + "time_per_iteration": 2.651831865310669 + }, + { + "auxiliary_loss_clip": 0.01056455, + "auxiliary_loss_mlp": 0.01042754, + "balance_loss_clip": 1.02900302, + "balance_loss_mlp": 1.02943277, + "epoch": 0.42404930106718774, + "flos": 20558428467840.0, + "grad_norm": 1.8029535834660277, + "language_loss": 0.72207642, + "learning_rate": 2.5805810011104323e-06, + "loss": 0.74306846, + "num_input_tokens_seen": 151399325, + "step": 7053, + "time_per_iteration": 2.6165664196014404 + }, + { + "auxiliary_loss_clip": 0.01041046, + "auxiliary_loss_mlp": 0.00747851, + "balance_loss_clip": 1.02874994, + "balance_loss_mlp": 1.0006094, + "epoch": 0.4241094243198557, + "flos": 22308067946880.0, + "grad_norm": 1.58057608329976, + "language_loss": 0.82177299, + "learning_rate": 2.580208299200704e-06, + "loss": 0.83966196, + "num_input_tokens_seen": 151417240, + "step": 7054, + "time_per_iteration": 2.6699132919311523 + }, + { + "auxiliary_loss_clip": 0.01003474, + "auxiliary_loss_mlp": 0.01011875, + "balance_loss_clip": 1.00453758, + "balance_loss_mlp": 1.01028919, + "epoch": 0.4241695475725237, + "flos": 70612445272320.0, + "grad_norm": 0.7863604178198947, + "language_loss": 0.60330552, + "learning_rate": 2.5798355752894183e-06, + "loss": 0.62345898, + "num_input_tokens_seen": 151476015, + "step": 7055, + "time_per_iteration": 3.1064584255218506 + }, + { + "auxiliary_loss_clip": 0.01076673, + "auxiliary_loss_mlp": 0.01041112, + "balance_loss_clip": 1.02899289, + "balance_loss_mlp": 1.02721834, + "epoch": 0.42422967082519164, + "flos": 14027462824320.0, + "grad_norm": 2.4077162847694455, + "language_loss": 0.76455897, + "learning_rate": 2.5794628293907107e-06, + "loss": 0.78573686, + "num_input_tokens_seen": 151492035, + "step": 7056, + "time_per_iteration": 2.5432581901550293 + }, + { + "auxiliary_loss_clip": 0.01070519, + "auxiliary_loss_mlp": 0.01037404, + "balance_loss_clip": 1.02964997, + "balance_loss_mlp": 1.02209735, + "epoch": 0.4242897940778596, + "flos": 22345522853760.0, + "grad_norm": 3.5088914294397053, + "language_loss": 0.84204507, + "learning_rate": 2.579090061518714e-06, + "loss": 0.86312425, + "num_input_tokens_seen": 151508970, + "step": 7057, + "time_per_iteration": 2.5713202953338623 + }, + { + "auxiliary_loss_clip": 0.01048036, + "auxiliary_loss_mlp": 0.01034518, + "balance_loss_clip": 1.02966213, + "balance_loss_mlp": 1.02049899, + "epoch": 0.42434991733052757, + "flos": 22595855713920.0, + "grad_norm": 2.2882811064327577, + "language_loss": 0.83269733, + "learning_rate": 2.5787172716875642e-06, + "loss": 0.8535229, + "num_input_tokens_seen": 151525295, + "step": 7058, + "time_per_iteration": 2.7147858142852783 + }, + { + "auxiliary_loss_clip": 0.01056076, + "auxiliary_loss_mlp": 0.00747765, + "balance_loss_clip": 1.03120148, + "balance_loss_mlp": 1.00052321, + "epoch": 0.42441004058319554, + "flos": 20011437181440.0, + "grad_norm": 7.589886421610288, + "language_loss": 0.80294484, + "learning_rate": 2.5783444599113973e-06, + "loss": 0.82098323, + "num_input_tokens_seen": 151544435, + "step": 7059, + "time_per_iteration": 2.6864027976989746 + }, + { + "auxiliary_loss_clip": 0.01078189, + "auxiliary_loss_mlp": 0.01038763, + "balance_loss_clip": 1.03084874, + "balance_loss_mlp": 1.02430916, + "epoch": 0.4244701638358635, + "flos": 11144985235200.0, + "grad_norm": 1.9033423011284545, + "language_loss": 0.70396948, + "learning_rate": 2.57797162620435e-06, + "loss": 0.72513902, + "num_input_tokens_seen": 151559520, + "step": 7060, + "time_per_iteration": 2.621873617172241 + }, + { + "auxiliary_loss_clip": 0.01067089, + "auxiliary_loss_mlp": 0.01033309, + "balance_loss_clip": 1.02888632, + "balance_loss_mlp": 1.02005315, + "epoch": 0.42453028708853147, + "flos": 23987753688960.0, + "grad_norm": 1.6829414953494986, + "language_loss": 0.75995445, + "learning_rate": 2.577598770580562e-06, + "loss": 0.78095841, + "num_input_tokens_seen": 151579790, + "step": 7061, + "time_per_iteration": 2.606947660446167 + }, + { + "auxiliary_loss_clip": 0.01071109, + "auxiliary_loss_mlp": 0.01040529, + "balance_loss_clip": 1.03206122, + "balance_loss_mlp": 1.02606273, + "epoch": 0.42459041034119943, + "flos": 18406338030720.0, + "grad_norm": 1.982159170340676, + "language_loss": 0.72993374, + "learning_rate": 2.5772258930541693e-06, + "loss": 0.75105011, + "num_input_tokens_seen": 151598285, + "step": 7062, + "time_per_iteration": 2.6239306926727295 + }, + { + "auxiliary_loss_clip": 0.01058548, + "auxiliary_loss_mlp": 0.01040034, + "balance_loss_clip": 1.03147173, + "balance_loss_mlp": 1.02641416, + "epoch": 0.42465053359386745, + "flos": 20958006337920.0, + "grad_norm": 1.7165816137993242, + "language_loss": 0.66242254, + "learning_rate": 2.5768529936393137e-06, + "loss": 0.68340832, + "num_input_tokens_seen": 151615430, + "step": 7063, + "time_per_iteration": 2.7543392181396484 + }, + { + "auxiliary_loss_clip": 0.010416, + "auxiliary_loss_mlp": 0.00747805, + "balance_loss_clip": 1.0250808, + "balance_loss_mlp": 1.00052786, + "epoch": 0.4247106568465354, + "flos": 33106190520960.0, + "grad_norm": 1.4787100261483237, + "language_loss": 0.78849864, + "learning_rate": 2.5764800723501354e-06, + "loss": 0.80639267, + "num_input_tokens_seen": 151637030, + "step": 7064, + "time_per_iteration": 2.752210855484009 + }, + { + "auxiliary_loss_clip": 0.01075037, + "auxiliary_loss_mlp": 0.01035919, + "balance_loss_clip": 1.02781892, + "balance_loss_mlp": 1.02210832, + "epoch": 0.4247707800992034, + "flos": 20046916840320.0, + "grad_norm": 2.037856019655122, + "language_loss": 0.75185335, + "learning_rate": 2.5761071292007736e-06, + "loss": 0.77296287, + "num_input_tokens_seen": 151655745, + "step": 7065, + "time_per_iteration": 2.7630629539489746 + }, + { + "auxiliary_loss_clip": 0.01066163, + "auxiliary_loss_mlp": 0.01036045, + "balance_loss_clip": 1.02967834, + "balance_loss_mlp": 1.0225625, + "epoch": 0.42483090335187135, + "flos": 22385132576640.0, + "grad_norm": 1.4090387589583209, + "language_loss": 0.72647351, + "learning_rate": 2.5757341642053725e-06, + "loss": 0.74749559, + "num_input_tokens_seen": 151678040, + "step": 7066, + "time_per_iteration": 2.79107666015625 + }, + { + "auxiliary_loss_clip": 0.01045707, + "auxiliary_loss_mlp": 0.01037589, + "balance_loss_clip": 1.02706659, + "balance_loss_mlp": 1.02304578, + "epoch": 0.4248910266045393, + "flos": 21356830022400.0, + "grad_norm": 2.081241019250159, + "language_loss": 0.79876947, + "learning_rate": 2.5753611773780745e-06, + "loss": 0.81960243, + "num_input_tokens_seen": 151696410, + "step": 7067, + "time_per_iteration": 2.7071733474731445 + }, + { + "auxiliary_loss_clip": 0.01012259, + "auxiliary_loss_mlp": 0.0100115, + "balance_loss_clip": 1.00360799, + "balance_loss_mlp": 0.99965346, + "epoch": 0.4249511498572073, + "flos": 64008114099840.0, + "grad_norm": 0.9174027876825195, + "language_loss": 0.63479018, + "learning_rate": 2.574988168733022e-06, + "loss": 0.65492427, + "num_input_tokens_seen": 151756365, + "step": 7068, + "time_per_iteration": 3.1207189559936523 + }, + { + "auxiliary_loss_clip": 0.01076613, + "auxiliary_loss_mlp": 0.01034645, + "balance_loss_clip": 1.02944088, + "balance_loss_mlp": 1.02034545, + "epoch": 0.42501127310987524, + "flos": 19607046888960.0, + "grad_norm": 1.8962749349419759, + "language_loss": 0.7207346, + "learning_rate": 2.574615138284361e-06, + "loss": 0.74184722, + "num_input_tokens_seen": 151775165, + "step": 7069, + "time_per_iteration": 2.5912935733795166 + }, + { + "auxiliary_loss_clip": 0.01077575, + "auxiliary_loss_mlp": 0.01034696, + "balance_loss_clip": 1.02916133, + "balance_loss_mlp": 1.02008116, + "epoch": 0.4250713963625432, + "flos": 19462326992640.0, + "grad_norm": 1.86753721369607, + "language_loss": 0.79014337, + "learning_rate": 2.5742420860462364e-06, + "loss": 0.81126606, + "num_input_tokens_seen": 151792620, + "step": 7070, + "time_per_iteration": 2.8348405361175537 + }, + { + "auxiliary_loss_clip": 0.01065864, + "auxiliary_loss_mlp": 0.01032771, + "balance_loss_clip": 1.0288384, + "balance_loss_mlp": 1.01891327, + "epoch": 0.4251315196152112, + "flos": 25337707557120.0, + "grad_norm": 1.7737094936318856, + "language_loss": 0.70622343, + "learning_rate": 2.573869012032795e-06, + "loss": 0.72720981, + "num_input_tokens_seen": 151812850, + "step": 7071, + "time_per_iteration": 4.3985278606414795 + }, + { + "auxiliary_loss_clip": 0.01075157, + "auxiliary_loss_mlp": 0.01034946, + "balance_loss_clip": 1.02810204, + "balance_loss_mlp": 1.02158213, + "epoch": 0.42519164286787914, + "flos": 26359186527360.0, + "grad_norm": 2.2329631492325137, + "language_loss": 0.71888191, + "learning_rate": 2.5734959162581824e-06, + "loss": 0.73998296, + "num_input_tokens_seen": 151831785, + "step": 7072, + "time_per_iteration": 4.2892351150512695 + }, + { + "auxiliary_loss_clip": 0.01034272, + "auxiliary_loss_mlp": 0.01036584, + "balance_loss_clip": 1.02644479, + "balance_loss_mlp": 1.02294087, + "epoch": 0.4252517661205471, + "flos": 26031070765440.0, + "grad_norm": 1.5363681469788257, + "language_loss": 0.81656897, + "learning_rate": 2.5731227987365475e-06, + "loss": 0.83727753, + "num_input_tokens_seen": 151853885, + "step": 7073, + "time_per_iteration": 2.7573494911193848 + }, + { + "auxiliary_loss_clip": 0.01065144, + "auxiliary_loss_mlp": 0.01034921, + "balance_loss_clip": 1.02930415, + "balance_loss_mlp": 1.02276778, + "epoch": 0.42531188937321507, + "flos": 12713635059840.0, + "grad_norm": 2.112090704161367, + "language_loss": 0.90617716, + "learning_rate": 2.5727496594820386e-06, + "loss": 0.92717779, + "num_input_tokens_seen": 151871780, + "step": 7074, + "time_per_iteration": 2.5456714630126953 + }, + { + "auxiliary_loss_clip": 0.01069965, + "auxiliary_loss_mlp": 0.00747864, + "balance_loss_clip": 1.02923298, + "balance_loss_mlp": 1.00055349, + "epoch": 0.42537201262588303, + "flos": 22091670460800.0, + "grad_norm": 1.6174483277039147, + "language_loss": 0.64191341, + "learning_rate": 2.572376498508805e-06, + "loss": 0.66009176, + "num_input_tokens_seen": 151891600, + "step": 7075, + "time_per_iteration": 2.5760037899017334 + }, + { + "auxiliary_loss_clip": 0.01041235, + "auxiliary_loss_mlp": 0.01029304, + "balance_loss_clip": 1.02760601, + "balance_loss_mlp": 1.01717472, + "epoch": 0.42543213587855105, + "flos": 23003119094400.0, + "grad_norm": 1.8331132694537282, + "language_loss": 0.73480904, + "learning_rate": 2.5720033158309973e-06, + "loss": 0.75551438, + "num_input_tokens_seen": 151911330, + "step": 7076, + "time_per_iteration": 2.6374568939208984 + }, + { + "auxiliary_loss_clip": 0.01048251, + "auxiliary_loss_mlp": 0.0103911, + "balance_loss_clip": 1.02718413, + "balance_loss_mlp": 1.02507937, + "epoch": 0.425492259131219, + "flos": 25082454533760.0, + "grad_norm": 1.925863802341982, + "language_loss": 0.78877807, + "learning_rate": 2.571630111462766e-06, + "loss": 0.80965167, + "num_input_tokens_seen": 151930355, + "step": 7077, + "time_per_iteration": 2.599297285079956 + }, + { + "auxiliary_loss_clip": 0.01051104, + "auxiliary_loss_mlp": 0.01032552, + "balance_loss_clip": 1.02821624, + "balance_loss_mlp": 1.02089334, + "epoch": 0.425552382383887, + "flos": 22816850140800.0, + "grad_norm": 1.6814748419807586, + "language_loss": 0.72808033, + "learning_rate": 2.571256885418265e-06, + "loss": 0.74891686, + "num_input_tokens_seen": 151949695, + "step": 7078, + "time_per_iteration": 2.638366937637329 + }, + { + "auxiliary_loss_clip": 0.0105501, + "auxiliary_loss_mlp": 0.01038213, + "balance_loss_clip": 1.02982926, + "balance_loss_mlp": 1.02574968, + "epoch": 0.42561250563655495, + "flos": 13553585671680.0, + "grad_norm": 1.7850141152341261, + "language_loss": 0.79559797, + "learning_rate": 2.5708836377116445e-06, + "loss": 0.81653023, + "num_input_tokens_seen": 151967640, + "step": 7079, + "time_per_iteration": 2.803492546081543 + }, + { + "auxiliary_loss_clip": 0.01067201, + "auxiliary_loss_mlp": 0.01031034, + "balance_loss_clip": 1.03204882, + "balance_loss_mlp": 1.01884472, + "epoch": 0.4256726288892229, + "flos": 46978303023360.0, + "grad_norm": 1.4673530489321052, + "language_loss": 0.7213974, + "learning_rate": 2.5705103683570592e-06, + "loss": 0.74237978, + "num_input_tokens_seen": 151994020, + "step": 7080, + "time_per_iteration": 4.45255708694458 + }, + { + "auxiliary_loss_clip": 0.01073395, + "auxiliary_loss_mlp": 0.01033826, + "balance_loss_clip": 1.02786994, + "balance_loss_mlp": 1.0214994, + "epoch": 0.4257327521418909, + "flos": 23586451966080.0, + "grad_norm": 2.2672951010520124, + "language_loss": 0.80443293, + "learning_rate": 2.5701370773686646e-06, + "loss": 0.82550508, + "num_input_tokens_seen": 152013415, + "step": 7081, + "time_per_iteration": 2.593864679336548 + }, + { + "auxiliary_loss_clip": 0.01044765, + "auxiliary_loss_mlp": 0.01031461, + "balance_loss_clip": 1.02737021, + "balance_loss_mlp": 1.01881337, + "epoch": 0.42579287539455885, + "flos": 18989994124800.0, + "grad_norm": 1.8937114723822799, + "language_loss": 0.81666797, + "learning_rate": 2.5697637647606138e-06, + "loss": 0.83743024, + "num_input_tokens_seen": 152030860, + "step": 7082, + "time_per_iteration": 2.6648335456848145 + }, + { + "auxiliary_loss_clip": 0.01065163, + "auxiliary_loss_mlp": 0.01040268, + "balance_loss_clip": 1.02912951, + "balance_loss_mlp": 1.02694607, + "epoch": 0.4258529986472268, + "flos": 25191910252800.0, + "grad_norm": 1.754165962666913, + "language_loss": 0.69819355, + "learning_rate": 2.569390430547065e-06, + "loss": 0.71924788, + "num_input_tokens_seen": 152050395, + "step": 7083, + "time_per_iteration": 2.831695556640625 + }, + { + "auxiliary_loss_clip": 0.01005793, + "auxiliary_loss_mlp": 0.0100549, + "balance_loss_clip": 1.00650668, + "balance_loss_mlp": 1.0038445, + "epoch": 0.4259131218998948, + "flos": 69968280718080.0, + "grad_norm": 0.8749509744829268, + "language_loss": 0.67151278, + "learning_rate": 2.569017074742173e-06, + "loss": 0.6916256, + "num_input_tokens_seen": 152113555, + "step": 7084, + "time_per_iteration": 3.3596811294555664 + }, + { + "auxiliary_loss_clip": 0.01064641, + "auxiliary_loss_mlp": 0.01040019, + "balance_loss_clip": 1.02928233, + "balance_loss_mlp": 1.02628636, + "epoch": 0.42597324515256274, + "flos": 18004964480640.0, + "grad_norm": 2.043924214558876, + "language_loss": 0.78772914, + "learning_rate": 2.5686436973600964e-06, + "loss": 0.80877572, + "num_input_tokens_seen": 152131575, + "step": 7085, + "time_per_iteration": 4.2086358070373535 + }, + { + "auxiliary_loss_clip": 0.01062861, + "auxiliary_loss_mlp": 0.01043901, + "balance_loss_clip": 1.02841008, + "balance_loss_mlp": 1.02919102, + "epoch": 0.4260333684052307, + "flos": 15158792563200.0, + "grad_norm": 1.8798678811813827, + "language_loss": 0.75964123, + "learning_rate": 2.568270298414995e-06, + "loss": 0.78070885, + "num_input_tokens_seen": 152149435, + "step": 7086, + "time_per_iteration": 2.614806890487671 + }, + { + "auxiliary_loss_clip": 0.01052205, + "auxiliary_loss_mlp": 0.01034529, + "balance_loss_clip": 1.02700377, + "balance_loss_mlp": 1.02099276, + "epoch": 0.42609349165789867, + "flos": 14939342421120.0, + "grad_norm": 2.069209900280641, + "language_loss": 0.80302215, + "learning_rate": 2.5678968779210255e-06, + "loss": 0.82388943, + "num_input_tokens_seen": 152166860, + "step": 7087, + "time_per_iteration": 2.693373918533325 + }, + { + "auxiliary_loss_clip": 0.01057368, + "auxiliary_loss_mlp": 0.01031675, + "balance_loss_clip": 1.02956057, + "balance_loss_mlp": 1.01846099, + "epoch": 0.42615361491056664, + "flos": 23731961961600.0, + "grad_norm": 1.643476492115386, + "language_loss": 0.65648127, + "learning_rate": 2.5675234358923505e-06, + "loss": 0.67737174, + "num_input_tokens_seen": 152187475, + "step": 7088, + "time_per_iteration": 2.634047508239746 + }, + { + "auxiliary_loss_clip": 0.0101629, + "auxiliary_loss_mlp": 0.01036953, + "balance_loss_clip": 1.02423179, + "balance_loss_mlp": 1.02334559, + "epoch": 0.42621373816323466, + "flos": 24936441747840.0, + "grad_norm": 1.93619514051127, + "language_loss": 0.68705225, + "learning_rate": 2.56714997234313e-06, + "loss": 0.70758474, + "num_input_tokens_seen": 152207235, + "step": 7089, + "time_per_iteration": 2.6934659481048584 + }, + { + "auxiliary_loss_clip": 0.01032393, + "auxiliary_loss_mlp": 0.01029622, + "balance_loss_clip": 1.02616119, + "balance_loss_mlp": 1.01636028, + "epoch": 0.4262738614159026, + "flos": 13552975140480.0, + "grad_norm": 2.613306347222143, + "language_loss": 0.73366022, + "learning_rate": 2.566776487287525e-06, + "loss": 0.75428033, + "num_input_tokens_seen": 152224240, + "step": 7090, + "time_per_iteration": 2.6450517177581787 + }, + { + "auxiliary_loss_clip": 0.01053996, + "auxiliary_loss_mlp": 0.01040786, + "balance_loss_clip": 1.02728963, + "balance_loss_mlp": 1.0276376, + "epoch": 0.4263339846685706, + "flos": 29748794284800.0, + "grad_norm": 1.7009087934061409, + "language_loss": 0.74628824, + "learning_rate": 2.5664029807396994e-06, + "loss": 0.76723605, + "num_input_tokens_seen": 152242595, + "step": 7091, + "time_per_iteration": 2.655144691467285 + }, + { + "auxiliary_loss_clip": 0.01032021, + "auxiliary_loss_mlp": 0.0102842, + "balance_loss_clip": 1.02869654, + "balance_loss_mlp": 1.01730943, + "epoch": 0.42639410792123855, + "flos": 16834204586880.0, + "grad_norm": 1.8290622972630173, + "language_loss": 0.82938325, + "learning_rate": 2.5660294527138156e-06, + "loss": 0.84998763, + "num_input_tokens_seen": 152260840, + "step": 7092, + "time_per_iteration": 2.881558895111084 + }, + { + "auxiliary_loss_clip": 0.01053445, + "auxiliary_loss_mlp": 0.01035049, + "balance_loss_clip": 1.02710271, + "balance_loss_mlp": 1.02189422, + "epoch": 0.4264542311739065, + "flos": 28763118195840.0, + "grad_norm": 1.6846913966903239, + "language_loss": 0.7367034, + "learning_rate": 2.565655903224038e-06, + "loss": 0.75758839, + "num_input_tokens_seen": 152280580, + "step": 7093, + "time_per_iteration": 2.692171812057495 + }, + { + "auxiliary_loss_clip": 0.01065575, + "auxiliary_loss_mlp": 0.01032876, + "balance_loss_clip": 1.02860498, + "balance_loss_mlp": 1.01962614, + "epoch": 0.4265143544265745, + "flos": 24713615727360.0, + "grad_norm": 2.2938419492929762, + "language_loss": 0.7007432, + "learning_rate": 2.565282332284532e-06, + "loss": 0.72172767, + "num_input_tokens_seen": 152298455, + "step": 7094, + "time_per_iteration": 2.641434907913208 + }, + { + "auxiliary_loss_clip": 0.01044779, + "auxiliary_loss_mlp": 0.01032995, + "balance_loss_clip": 1.02887118, + "balance_loss_mlp": 1.01993585, + "epoch": 0.42657447767924245, + "flos": 21865971352320.0, + "grad_norm": 1.8197607220597105, + "language_loss": 0.81865668, + "learning_rate": 2.564908739909464e-06, + "loss": 0.8394345, + "num_input_tokens_seen": 152316995, + "step": 7095, + "time_per_iteration": 2.76128888130188 + }, + { + "auxiliary_loss_clip": 0.01075758, + "auxiliary_loss_mlp": 0.01038185, + "balance_loss_clip": 1.02925563, + "balance_loss_mlp": 1.02535772, + "epoch": 0.4266346009319104, + "flos": 21470236237440.0, + "grad_norm": 2.2763734240033577, + "language_loss": 0.80732143, + "learning_rate": 2.5645351261129996e-06, + "loss": 0.82846081, + "num_input_tokens_seen": 152334800, + "step": 7096, + "time_per_iteration": 2.6304404735565186 + }, + { + "auxiliary_loss_clip": 0.01070925, + "auxiliary_loss_mlp": 0.01034608, + "balance_loss_clip": 1.03184855, + "balance_loss_mlp": 1.02197134, + "epoch": 0.4266947241845784, + "flos": 25519379569920.0, + "grad_norm": 2.1304890822992237, + "language_loss": 0.65283155, + "learning_rate": 2.5641614909093066e-06, + "loss": 0.6738869, + "num_input_tokens_seen": 152355175, + "step": 7097, + "time_per_iteration": 2.6691081523895264 + }, + { + "auxiliary_loss_clip": 0.01049453, + "auxiliary_loss_mlp": 0.0103169, + "balance_loss_clip": 1.03026938, + "balance_loss_mlp": 1.01889265, + "epoch": 0.42675484743724634, + "flos": 26541217676160.0, + "grad_norm": 1.7825735352523924, + "language_loss": 0.74221247, + "learning_rate": 2.5637878343125535e-06, + "loss": 0.76302391, + "num_input_tokens_seen": 152377245, + "step": 7098, + "time_per_iteration": 2.65750789642334 + }, + { + "auxiliary_loss_clip": 0.01066508, + "auxiliary_loss_mlp": 0.01029711, + "balance_loss_clip": 1.02971625, + "balance_loss_mlp": 1.01711106, + "epoch": 0.4268149706899143, + "flos": 23112718467840.0, + "grad_norm": 1.6314329255092344, + "language_loss": 0.74608767, + "learning_rate": 2.5634141563369086e-06, + "loss": 0.76704985, + "num_input_tokens_seen": 152396985, + "step": 7099, + "time_per_iteration": 2.6452620029449463 + }, + { + "auxiliary_loss_clip": 0.01053066, + "auxiliary_loss_mlp": 0.01037242, + "balance_loss_clip": 1.02701497, + "balance_loss_mlp": 1.02408123, + "epoch": 0.4268750939425823, + "flos": 22706532495360.0, + "grad_norm": 2.5962241730222764, + "language_loss": 0.82810175, + "learning_rate": 2.5630404569965432e-06, + "loss": 0.84900486, + "num_input_tokens_seen": 152415590, + "step": 7100, + "time_per_iteration": 2.677889347076416 + }, + { + "auxiliary_loss_clip": 0.01057568, + "auxiliary_loss_mlp": 0.01029699, + "balance_loss_clip": 1.02975106, + "balance_loss_mlp": 1.01736736, + "epoch": 0.42693521719525024, + "flos": 25374875155200.0, + "grad_norm": 1.506853157862758, + "language_loss": 0.82333827, + "learning_rate": 2.562666736305627e-06, + "loss": 0.84421092, + "num_input_tokens_seen": 152436735, + "step": 7101, + "time_per_iteration": 2.6912715435028076 + }, + { + "auxiliary_loss_clip": 0.01078086, + "auxiliary_loss_mlp": 0.01029713, + "balance_loss_clip": 1.03014994, + "balance_loss_mlp": 1.01607609, + "epoch": 0.42699534044791826, + "flos": 18150689957760.0, + "grad_norm": 2.4255822036677066, + "language_loss": 0.72860491, + "learning_rate": 2.5622929942783314e-06, + "loss": 0.74968296, + "num_input_tokens_seen": 152455685, + "step": 7102, + "time_per_iteration": 2.5622611045837402 + }, + { + "auxiliary_loss_clip": 0.01063205, + "auxiliary_loss_mlp": 0.01028605, + "balance_loss_clip": 1.02822447, + "balance_loss_mlp": 1.01652932, + "epoch": 0.4270554637005862, + "flos": 13698413308800.0, + "grad_norm": 1.7625522795167219, + "language_loss": 0.82844543, + "learning_rate": 2.5619192309288297e-06, + "loss": 0.84936351, + "num_input_tokens_seen": 152473500, + "step": 7103, + "time_per_iteration": 2.635308265686035 + }, + { + "auxiliary_loss_clip": 0.01045188, + "auxiliary_loss_mlp": 0.01037099, + "balance_loss_clip": 1.02387214, + "balance_loss_mlp": 1.02240658, + "epoch": 0.4271155869532542, + "flos": 17493596507520.0, + "grad_norm": 2.301742059116411, + "language_loss": 0.73725557, + "learning_rate": 2.561545446271294e-06, + "loss": 0.75807846, + "num_input_tokens_seen": 152491320, + "step": 7104, + "time_per_iteration": 2.6324174404144287 + }, + { + "auxiliary_loss_clip": 0.01057208, + "auxiliary_loss_mlp": 0.01033332, + "balance_loss_clip": 1.02691305, + "balance_loss_mlp": 1.02105367, + "epoch": 0.42717571020592215, + "flos": 32452293381120.0, + "grad_norm": 1.9008080663571583, + "language_loss": 0.74634635, + "learning_rate": 2.5611716403198987e-06, + "loss": 0.76725179, + "num_input_tokens_seen": 152511970, + "step": 7105, + "time_per_iteration": 2.6788954734802246 + }, + { + "auxiliary_loss_clip": 0.01077592, + "auxiliary_loss_mlp": 0.01035588, + "balance_loss_clip": 1.0300225, + "balance_loss_mlp": 1.02307701, + "epoch": 0.4272358334585901, + "flos": 16253062444800.0, + "grad_norm": 2.019973524617587, + "language_loss": 0.76665515, + "learning_rate": 2.560797813088819e-06, + "loss": 0.78778696, + "num_input_tokens_seen": 152530515, + "step": 7106, + "time_per_iteration": 2.547654628753662 + }, + { + "auxiliary_loss_clip": 0.01054714, + "auxiliary_loss_mlp": 0.01029761, + "balance_loss_clip": 1.02774394, + "balance_loss_mlp": 1.01768482, + "epoch": 0.4272959567112581, + "flos": 24200092938240.0, + "grad_norm": 1.8052224186620076, + "language_loss": 0.80234754, + "learning_rate": 2.560423964592229e-06, + "loss": 0.82319236, + "num_input_tokens_seen": 152549295, + "step": 7107, + "time_per_iteration": 2.7633326053619385 + }, + { + "auxiliary_loss_clip": 0.0102286, + "auxiliary_loss_mlp": 0.01032723, + "balance_loss_clip": 1.02556944, + "balance_loss_mlp": 1.02016425, + "epoch": 0.42735607996392605, + "flos": 27963495578880.0, + "grad_norm": 1.400483982228746, + "language_loss": 0.68025887, + "learning_rate": 2.5600500948443075e-06, + "loss": 0.70081472, + "num_input_tokens_seen": 152570725, + "step": 7108, + "time_per_iteration": 2.9758119583129883 + }, + { + "auxiliary_loss_clip": 0.01050417, + "auxiliary_loss_mlp": 0.01031559, + "balance_loss_clip": 1.02900362, + "balance_loss_mlp": 1.01936388, + "epoch": 0.427416203216594, + "flos": 20295597674880.0, + "grad_norm": 1.598815902276494, + "language_loss": 0.71633327, + "learning_rate": 2.5596762038592294e-06, + "loss": 0.73715305, + "num_input_tokens_seen": 152588950, + "step": 7109, + "time_per_iteration": 2.7284505367279053 + }, + { + "auxiliary_loss_clip": 0.0105958, + "auxiliary_loss_mlp": 0.01033283, + "balance_loss_clip": 1.02687526, + "balance_loss_mlp": 1.01883459, + "epoch": 0.427476326469262, + "flos": 26943955943040.0, + "grad_norm": 2.2216231531554547, + "language_loss": 0.64924592, + "learning_rate": 2.559302291651174e-06, + "loss": 0.67017448, + "num_input_tokens_seen": 152608965, + "step": 7110, + "time_per_iteration": 2.7124533653259277 + }, + { + "auxiliary_loss_clip": 0.01075956, + "auxiliary_loss_mlp": 0.00747617, + "balance_loss_clip": 1.02904356, + "balance_loss_mlp": 1.00040817, + "epoch": 0.42753644972192995, + "flos": 25702847262720.0, + "grad_norm": 1.5781262011911261, + "language_loss": 0.76262099, + "learning_rate": 2.5589283582343197e-06, + "loss": 0.78085673, + "num_input_tokens_seen": 152630220, + "step": 7111, + "time_per_iteration": 2.8130667209625244 + }, + { + "auxiliary_loss_clip": 0.01044677, + "auxiliary_loss_mlp": 0.01031162, + "balance_loss_clip": 1.02807832, + "balance_loss_mlp": 1.01826918, + "epoch": 0.4275965729745979, + "flos": 18767419499520.0, + "grad_norm": 2.5744735019887552, + "language_loss": 0.73016691, + "learning_rate": 2.558554403622845e-06, + "loss": 0.75092524, + "num_input_tokens_seen": 152648835, + "step": 7112, + "time_per_iteration": 2.852858781814575 + }, + { + "auxiliary_loss_clip": 0.01044189, + "auxiliary_loss_mlp": 0.01033177, + "balance_loss_clip": 1.02389193, + "balance_loss_mlp": 1.02140522, + "epoch": 0.4276566962272659, + "flos": 23764424878080.0, + "grad_norm": 1.6157427477480402, + "language_loss": 0.7140795, + "learning_rate": 2.5581804278309323e-06, + "loss": 0.73485315, + "num_input_tokens_seen": 152668375, + "step": 7113, + "time_per_iteration": 2.659632921218872 + }, + { + "auxiliary_loss_clip": 0.01066634, + "auxiliary_loss_mlp": 0.01037957, + "balance_loss_clip": 1.03000081, + "balance_loss_mlp": 1.0249933, + "epoch": 0.42771681947993384, + "flos": 22492505306880.0, + "grad_norm": 1.6934288363544863, + "language_loss": 0.61304617, + "learning_rate": 2.5578064308727617e-06, + "loss": 0.63409209, + "num_input_tokens_seen": 152689725, + "step": 7114, + "time_per_iteration": 2.5891127586364746 + }, + { + "auxiliary_loss_clip": 0.01070229, + "auxiliary_loss_mlp": 0.01043285, + "balance_loss_clip": 1.03001833, + "balance_loss_mlp": 1.02827621, + "epoch": 0.42777694273260186, + "flos": 25044712318080.0, + "grad_norm": 1.7892739572903398, + "language_loss": 0.6468761, + "learning_rate": 2.5574324127625153e-06, + "loss": 0.66801131, + "num_input_tokens_seen": 152709375, + "step": 7115, + "time_per_iteration": 2.5887699127197266 + }, + { + "auxiliary_loss_clip": 0.01046189, + "auxiliary_loss_mlp": 0.01033528, + "balance_loss_clip": 1.02396727, + "balance_loss_mlp": 1.02173209, + "epoch": 0.4278370659852698, + "flos": 18661519226880.0, + "grad_norm": 1.599920687730213, + "language_loss": 0.73749703, + "learning_rate": 2.5570583735143753e-06, + "loss": 0.75829422, + "num_input_tokens_seen": 152727510, + "step": 7116, + "time_per_iteration": 2.5755913257598877 + }, + { + "auxiliary_loss_clip": 0.01042322, + "auxiliary_loss_mlp": 0.0103693, + "balance_loss_clip": 1.02349353, + "balance_loss_mlp": 1.02477634, + "epoch": 0.4278971892379378, + "flos": 27308269635840.0, + "grad_norm": 1.5791804627071848, + "language_loss": 0.69257408, + "learning_rate": 2.5566843131425275e-06, + "loss": 0.71336663, + "num_input_tokens_seen": 152746670, + "step": 7117, + "time_per_iteration": 2.649864673614502 + }, + { + "auxiliary_loss_clip": 0.01051892, + "auxiliary_loss_mlp": 0.01037123, + "balance_loss_clip": 1.02741098, + "balance_loss_mlp": 1.02489185, + "epoch": 0.42795731249060576, + "flos": 12888698970240.0, + "grad_norm": 3.8007828003400066, + "language_loss": 0.70040631, + "learning_rate": 2.5563102316611536e-06, + "loss": 0.72129643, + "num_input_tokens_seen": 152760545, + "step": 7118, + "time_per_iteration": 2.7227413654327393 + }, + { + "auxiliary_loss_clip": 0.01030997, + "auxiliary_loss_mlp": 0.01041177, + "balance_loss_clip": 1.02428532, + "balance_loss_mlp": 1.02788496, + "epoch": 0.4280174357432737, + "flos": 33401448316800.0, + "grad_norm": 2.1012048889573247, + "language_loss": 0.7452805, + "learning_rate": 2.55593612908444e-06, + "loss": 0.76600224, + "num_input_tokens_seen": 152780970, + "step": 7119, + "time_per_iteration": 5.900202512741089 + }, + { + "auxiliary_loss_clip": 0.01012265, + "auxiliary_loss_mlp": 0.01034816, + "balance_loss_clip": 1.02415717, + "balance_loss_mlp": 1.02175641, + "epoch": 0.4280775589959417, + "flos": 18259104182400.0, + "grad_norm": 1.8279350845881641, + "language_loss": 0.74596667, + "learning_rate": 2.555562005426573e-06, + "loss": 0.76643741, + "num_input_tokens_seen": 152798475, + "step": 7120, + "time_per_iteration": 2.724541187286377 + }, + { + "auxiliary_loss_clip": 0.01051947, + "auxiliary_loss_mlp": 0.00747437, + "balance_loss_clip": 1.02771783, + "balance_loss_mlp": 1.00036359, + "epoch": 0.42813768224860965, + "flos": 21471277731840.0, + "grad_norm": 1.6422198398553798, + "language_loss": 0.76692885, + "learning_rate": 2.5551878607017385e-06, + "loss": 0.78492266, + "num_input_tokens_seen": 152817555, + "step": 7121, + "time_per_iteration": 2.651597499847412 + }, + { + "auxiliary_loss_clip": 0.01054888, + "auxiliary_loss_mlp": 0.0103446, + "balance_loss_clip": 1.02934337, + "balance_loss_mlp": 1.02345073, + "epoch": 0.4281978055012776, + "flos": 15669262696320.0, + "grad_norm": 2.76348166987502, + "language_loss": 0.85551691, + "learning_rate": 2.554813694924126e-06, + "loss": 0.87641025, + "num_input_tokens_seen": 152836295, + "step": 7122, + "time_per_iteration": 2.631580114364624 + }, + { + "auxiliary_loss_clip": 0.01020925, + "auxiliary_loss_mlp": 0.01033971, + "balance_loss_clip": 1.02488172, + "balance_loss_mlp": 1.02153742, + "epoch": 0.4282579287539456, + "flos": 17712005155200.0, + "grad_norm": 2.1808853532831627, + "language_loss": 0.81587917, + "learning_rate": 2.554439508107921e-06, + "loss": 0.83642811, + "num_input_tokens_seen": 152854950, + "step": 7123, + "time_per_iteration": 2.8857173919677734 + }, + { + "auxiliary_loss_clip": 0.01041762, + "auxiliary_loss_mlp": 0.01035329, + "balance_loss_clip": 1.02916813, + "balance_loss_mlp": 1.02300835, + "epoch": 0.42831805200661355, + "flos": 19281157770240.0, + "grad_norm": 3.368173692692415, + "language_loss": 0.80924249, + "learning_rate": 2.5540653002673153e-06, + "loss": 0.83001339, + "num_input_tokens_seen": 152873995, + "step": 7124, + "time_per_iteration": 2.6563594341278076 + }, + { + "auxiliary_loss_clip": 0.01061223, + "auxiliary_loss_mlp": 0.01034112, + "balance_loss_clip": 1.02528214, + "balance_loss_mlp": 1.02147031, + "epoch": 0.4283781752592815, + "flos": 19792633484160.0, + "grad_norm": 1.6884635102111505, + "language_loss": 0.80543166, + "learning_rate": 2.553691071416498e-06, + "loss": 0.82638502, + "num_input_tokens_seen": 152892925, + "step": 7125, + "time_per_iteration": 2.660780429840088 + }, + { + "auxiliary_loss_clip": 0.0107203, + "auxiliary_loss_mlp": 0.00747375, + "balance_loss_clip": 1.02803779, + "balance_loss_mlp": 1.00034833, + "epoch": 0.4284382985119495, + "flos": 16508064072960.0, + "grad_norm": 2.257742512589376, + "language_loss": 0.74998885, + "learning_rate": 2.553316821569659e-06, + "loss": 0.76818293, + "num_input_tokens_seen": 152910935, + "step": 7126, + "time_per_iteration": 2.6353201866149902 + }, + { + "auxiliary_loss_clip": 0.01061007, + "auxiliary_loss_mlp": 0.01029065, + "balance_loss_clip": 1.0263623, + "balance_loss_mlp": 1.01625037, + "epoch": 0.42849842176461744, + "flos": 23330767979520.0, + "grad_norm": 1.631999947789232, + "language_loss": 0.81165475, + "learning_rate": 2.5529425507409913e-06, + "loss": 0.83255547, + "num_input_tokens_seen": 152931030, + "step": 7127, + "time_per_iteration": 4.22074294090271 + }, + { + "auxiliary_loss_clip": 0.01030312, + "auxiliary_loss_mlp": 0.01035514, + "balance_loss_clip": 1.02552748, + "balance_loss_mlp": 1.02298546, + "epoch": 0.4285585450172854, + "flos": 17274433674240.0, + "grad_norm": 1.7795604322171308, + "language_loss": 0.76226079, + "learning_rate": 2.5525682589446867e-06, + "loss": 0.78291911, + "num_input_tokens_seen": 152948085, + "step": 7128, + "time_per_iteration": 2.891850471496582 + }, + { + "auxiliary_loss_clip": 0.01032452, + "auxiliary_loss_mlp": 0.01030596, + "balance_loss_clip": 1.02594137, + "balance_loss_mlp": 1.01840067, + "epoch": 0.42861866826995343, + "flos": 24279599692800.0, + "grad_norm": 1.9720737998224802, + "language_loss": 0.74098837, + "learning_rate": 2.552193946194937e-06, + "loss": 0.76161885, + "num_input_tokens_seen": 152966265, + "step": 7129, + "time_per_iteration": 2.6997172832489014 + }, + { + "auxiliary_loss_clip": 0.01063152, + "auxiliary_loss_mlp": 0.00747555, + "balance_loss_clip": 1.02763617, + "balance_loss_mlp": 1.00039971, + "epoch": 0.4286787915226214, + "flos": 24353108876160.0, + "grad_norm": 1.8661499784013411, + "language_loss": 0.77741039, + "learning_rate": 2.5518196125059394e-06, + "loss": 0.79551744, + "num_input_tokens_seen": 152986775, + "step": 7130, + "time_per_iteration": 2.597470283508301 + }, + { + "auxiliary_loss_clip": 0.01057644, + "auxiliary_loss_mlp": 0.01036355, + "balance_loss_clip": 1.03084493, + "balance_loss_mlp": 1.02302158, + "epoch": 0.42873891477528936, + "flos": 15449992122240.0, + "grad_norm": 1.748882819149843, + "language_loss": 0.72898394, + "learning_rate": 2.551445257891886e-06, + "loss": 0.74992394, + "num_input_tokens_seen": 153003595, + "step": 7131, + "time_per_iteration": 2.578397512435913 + }, + { + "auxiliary_loss_clip": 0.01052426, + "auxiliary_loss_mlp": 0.01035876, + "balance_loss_clip": 1.02673578, + "balance_loss_mlp": 1.02328181, + "epoch": 0.4287990380279573, + "flos": 17639573379840.0, + "grad_norm": 2.134079875096774, + "language_loss": 0.7701714, + "learning_rate": 2.551070882366973e-06, + "loss": 0.79105443, + "num_input_tokens_seen": 153021960, + "step": 7132, + "time_per_iteration": 2.5863754749298096 + }, + { + "auxiliary_loss_clip": 0.0103896, + "auxiliary_loss_mlp": 0.00747659, + "balance_loss_clip": 1.02742171, + "balance_loss_mlp": 1.00037909, + "epoch": 0.4288591612806253, + "flos": 27162328677120.0, + "grad_norm": 2.940803430744399, + "language_loss": 0.78406787, + "learning_rate": 2.550696485945397e-06, + "loss": 0.801934, + "num_input_tokens_seen": 153042110, + "step": 7133, + "time_per_iteration": 4.355731725692749 + }, + { + "auxiliary_loss_clip": 0.01053222, + "auxiliary_loss_mlp": 0.01031473, + "balance_loss_clip": 1.02638936, + "balance_loss_mlp": 1.01926017, + "epoch": 0.42891928453329325, + "flos": 17163182275200.0, + "grad_norm": 2.453460780229915, + "language_loss": 0.75024021, + "learning_rate": 2.550322068641355e-06, + "loss": 0.77108717, + "num_input_tokens_seen": 153058925, + "step": 7134, + "time_per_iteration": 2.61184024810791 + }, + { + "auxiliary_loss_clip": 0.01052113, + "auxiliary_loss_mlp": 0.01033362, + "balance_loss_clip": 1.02349937, + "balance_loss_mlp": 1.02114892, + "epoch": 0.4289794077859612, + "flos": 18187031543040.0, + "grad_norm": 1.7962551931362076, + "language_loss": 0.8371135, + "learning_rate": 2.5499476304690455e-06, + "loss": 0.85796821, + "num_input_tokens_seen": 153078070, + "step": 7135, + "time_per_iteration": 2.5790891647338867 + }, + { + "auxiliary_loss_clip": 0.01008572, + "auxiliary_loss_mlp": 0.01041631, + "balance_loss_clip": 1.02359462, + "balance_loss_mlp": 1.02735531, + "epoch": 0.4290395310386292, + "flos": 28256885867520.0, + "grad_norm": 3.845250345181187, + "language_loss": 0.74811125, + "learning_rate": 2.549573171442666e-06, + "loss": 0.76861322, + "num_input_tokens_seen": 153096680, + "step": 7136, + "time_per_iteration": 2.783048629760742 + }, + { + "auxiliary_loss_clip": 0.01061748, + "auxiliary_loss_mlp": 0.01034856, + "balance_loss_clip": 1.02566123, + "balance_loss_mlp": 1.02247596, + "epoch": 0.42909965429129715, + "flos": 16216074414720.0, + "grad_norm": 2.7588824578311355, + "language_loss": 0.79069245, + "learning_rate": 2.5491986915764175e-06, + "loss": 0.81165856, + "num_input_tokens_seen": 153113305, + "step": 7137, + "time_per_iteration": 2.654364585876465 + }, + { + "auxiliary_loss_clip": 0.01076699, + "auxiliary_loss_mlp": 0.01034248, + "balance_loss_clip": 1.02867007, + "balance_loss_mlp": 1.02105784, + "epoch": 0.4291597775439651, + "flos": 23112862122240.0, + "grad_norm": 2.6924687102810885, + "language_loss": 0.76363736, + "learning_rate": 2.548824190884499e-06, + "loss": 0.78474689, + "num_input_tokens_seen": 153132735, + "step": 7138, + "time_per_iteration": 2.668273448944092 + }, + { + "auxiliary_loss_clip": 0.00997932, + "auxiliary_loss_mlp": 0.0100768, + "balance_loss_clip": 1.00812864, + "balance_loss_mlp": 1.00621343, + "epoch": 0.4292199007966331, + "flos": 67546212681600.0, + "grad_norm": 0.7775582256482676, + "language_loss": 0.56221914, + "learning_rate": 2.548449669381113e-06, + "loss": 0.58227527, + "num_input_tokens_seen": 153187925, + "step": 7139, + "time_per_iteration": 3.13248610496521 + }, + { + "auxiliary_loss_clip": 0.01068937, + "auxiliary_loss_mlp": 0.0074738, + "balance_loss_clip": 1.0263207, + "balance_loss_mlp": 1.00028741, + "epoch": 0.42928002404930105, + "flos": 22999850956800.0, + "grad_norm": 2.079246761387863, + "language_loss": 0.80526471, + "learning_rate": 2.5480751270804595e-06, + "loss": 0.82342792, + "num_input_tokens_seen": 153206990, + "step": 7140, + "time_per_iteration": 2.5565152168273926 + }, + { + "auxiliary_loss_clip": 0.01061967, + "auxiliary_loss_mlp": 0.0102978, + "balance_loss_clip": 1.02621663, + "balance_loss_mlp": 1.01730478, + "epoch": 0.429340147301969, + "flos": 11544922241280.0, + "grad_norm": 1.6565055733958407, + "language_loss": 0.81923562, + "learning_rate": 2.5477005639967424e-06, + "loss": 0.8401531, + "num_input_tokens_seen": 153222345, + "step": 7141, + "time_per_iteration": 2.5628714561462402 + }, + { + "auxiliary_loss_clip": 0.01063224, + "auxiliary_loss_mlp": 0.01038472, + "balance_loss_clip": 1.02982438, + "balance_loss_mlp": 1.02516198, + "epoch": 0.42940027055463703, + "flos": 25264988472960.0, + "grad_norm": 2.460248592206615, + "language_loss": 0.86676395, + "learning_rate": 2.547325980144166e-06, + "loss": 0.8877809, + "num_input_tokens_seen": 153240570, + "step": 7142, + "time_per_iteration": 2.604677677154541 + }, + { + "auxiliary_loss_clip": 0.01054207, + "auxiliary_loss_mlp": 0.0102855, + "balance_loss_clip": 1.03066456, + "balance_loss_mlp": 1.01653409, + "epoch": 0.429460393807305, + "flos": 23805004268160.0, + "grad_norm": 2.6086912703255565, + "language_loss": 0.77927244, + "learning_rate": 2.5469513755369323e-06, + "loss": 0.80009997, + "num_input_tokens_seen": 153259575, + "step": 7143, + "time_per_iteration": 2.668347120285034 + }, + { + "auxiliary_loss_clip": 0.01020855, + "auxiliary_loss_mlp": 0.01040344, + "balance_loss_clip": 1.02461362, + "balance_loss_mlp": 1.02710009, + "epoch": 0.42952051705997296, + "flos": 13918294414080.0, + "grad_norm": 2.038168323646822, + "language_loss": 0.76555848, + "learning_rate": 2.5465767501892484e-06, + "loss": 0.78617042, + "num_input_tokens_seen": 153276650, + "step": 7144, + "time_per_iteration": 2.6641275882720947 + }, + { + "auxiliary_loss_clip": 0.01039523, + "auxiliary_loss_mlp": 0.01030067, + "balance_loss_clip": 1.02502346, + "balance_loss_mlp": 1.01753831, + "epoch": 0.4295806403126409, + "flos": 26760380509440.0, + "grad_norm": 1.6813663350360988, + "language_loss": 0.73525184, + "learning_rate": 2.54620210411532e-06, + "loss": 0.75594771, + "num_input_tokens_seen": 153298025, + "step": 7145, + "time_per_iteration": 2.761701822280884 + }, + { + "auxiliary_loss_clip": 0.01066653, + "auxiliary_loss_mlp": 0.0103346, + "balance_loss_clip": 1.03028333, + "balance_loss_mlp": 1.02065706, + "epoch": 0.4296407635653089, + "flos": 20952619297920.0, + "grad_norm": 9.242573451239423, + "language_loss": 0.79118377, + "learning_rate": 2.545827437329352e-06, + "loss": 0.81218493, + "num_input_tokens_seen": 153315775, + "step": 7146, + "time_per_iteration": 2.6816978454589844 + }, + { + "auxiliary_loss_clip": 0.01062234, + "auxiliary_loss_mlp": 0.01031593, + "balance_loss_clip": 1.02837181, + "balance_loss_mlp": 1.01958239, + "epoch": 0.42970088681797686, + "flos": 15852335339520.0, + "grad_norm": 2.141022898102775, + "language_loss": 0.82786125, + "learning_rate": 2.5454527498455532e-06, + "loss": 0.84879953, + "num_input_tokens_seen": 153332765, + "step": 7147, + "time_per_iteration": 2.605098247528076 + }, + { + "auxiliary_loss_clip": 0.01071435, + "auxiliary_loss_mlp": 0.01038278, + "balance_loss_clip": 1.03517628, + "balance_loss_mlp": 1.02389538, + "epoch": 0.4297610100706448, + "flos": 22382618624640.0, + "grad_norm": 1.8757907928627473, + "language_loss": 0.87306893, + "learning_rate": 2.545078041678131e-06, + "loss": 0.89416611, + "num_input_tokens_seen": 153350760, + "step": 7148, + "time_per_iteration": 2.6231963634490967 + }, + { + "auxiliary_loss_clip": 0.01045986, + "auxiliary_loss_mlp": 0.01031261, + "balance_loss_clip": 1.02725458, + "balance_loss_mlp": 1.0188694, + "epoch": 0.4298211333233128, + "flos": 27925681536000.0, + "grad_norm": 1.8645926431078022, + "language_loss": 0.77917528, + "learning_rate": 2.5447033128412957e-06, + "loss": 0.79994774, + "num_input_tokens_seen": 153370765, + "step": 7149, + "time_per_iteration": 2.709197998046875 + }, + { + "auxiliary_loss_clip": 0.01027011, + "auxiliary_loss_mlp": 0.01034041, + "balance_loss_clip": 1.02243364, + "balance_loss_mlp": 1.02100539, + "epoch": 0.42988125657598075, + "flos": 24425612478720.0, + "grad_norm": 1.6565589931847733, + "language_loss": 0.80072778, + "learning_rate": 2.544328563349256e-06, + "loss": 0.82133836, + "num_input_tokens_seen": 153390725, + "step": 7150, + "time_per_iteration": 2.70820689201355 + }, + { + "auxiliary_loss_clip": 0.01067671, + "auxiliary_loss_mlp": 0.01037867, + "balance_loss_clip": 1.03032792, + "balance_loss_mlp": 1.02347803, + "epoch": 0.4299413798286487, + "flos": 15850180523520.0, + "grad_norm": 1.7137727668862628, + "language_loss": 0.74805778, + "learning_rate": 2.5439537932162222e-06, + "loss": 0.76911312, + "num_input_tokens_seen": 153408010, + "step": 7151, + "time_per_iteration": 2.5726845264434814 + }, + { + "auxiliary_loss_clip": 0.01030243, + "auxiliary_loss_mlp": 0.01033788, + "balance_loss_clip": 1.02451575, + "balance_loss_mlp": 1.02003729, + "epoch": 0.4300015030813167, + "flos": 22309504490880.0, + "grad_norm": 1.9205587246360485, + "language_loss": 0.70428336, + "learning_rate": 2.543579002456406e-06, + "loss": 0.72492367, + "num_input_tokens_seen": 153426865, + "step": 7152, + "time_per_iteration": 2.6587588787078857 + }, + { + "auxiliary_loss_clip": 0.01050537, + "auxiliary_loss_mlp": 0.0103447, + "balance_loss_clip": 1.02799249, + "balance_loss_mlp": 1.02194095, + "epoch": 0.43006162633398465, + "flos": 34897666366080.0, + "grad_norm": 2.792808516499356, + "language_loss": 0.70966554, + "learning_rate": 2.54320419108402e-06, + "loss": 0.7305156, + "num_input_tokens_seen": 153449410, + "step": 7153, + "time_per_iteration": 2.7127294540405273 + }, + { + "auxiliary_loss_clip": 0.0105788, + "auxiliary_loss_mlp": 0.01032496, + "balance_loss_clip": 1.02655804, + "balance_loss_mlp": 1.01933503, + "epoch": 0.4301217495866526, + "flos": 15961575576960.0, + "grad_norm": 2.011367563749514, + "language_loss": 0.78178179, + "learning_rate": 2.542829359113276e-06, + "loss": 0.80268556, + "num_input_tokens_seen": 153467910, + "step": 7154, + "time_per_iteration": 2.5791175365448 + }, + { + "auxiliary_loss_clip": 0.01044469, + "auxiliary_loss_mlp": 0.0103231, + "balance_loss_clip": 1.02603626, + "balance_loss_mlp": 1.01910746, + "epoch": 0.43018187283932063, + "flos": 18770364414720.0, + "grad_norm": 1.8473631916441515, + "language_loss": 0.78600401, + "learning_rate": 2.542454506558389e-06, + "loss": 0.80677176, + "num_input_tokens_seen": 153487100, + "step": 7155, + "time_per_iteration": 2.6159493923187256 + }, + { + "auxiliary_loss_clip": 0.01052659, + "auxiliary_loss_mlp": 0.01029037, + "balance_loss_clip": 1.02768207, + "balance_loss_mlp": 1.01758683, + "epoch": 0.4302419960919886, + "flos": 20151703791360.0, + "grad_norm": 2.091183334504468, + "language_loss": 0.88725686, + "learning_rate": 2.5420796334335723e-06, + "loss": 0.90807378, + "num_input_tokens_seen": 153505565, + "step": 7156, + "time_per_iteration": 2.5884909629821777 + }, + { + "auxiliary_loss_clip": 0.01072979, + "auxiliary_loss_mlp": 0.01030533, + "balance_loss_clip": 1.02616131, + "balance_loss_mlp": 1.01750362, + "epoch": 0.43030211934465656, + "flos": 26432731624320.0, + "grad_norm": 1.797835808593719, + "language_loss": 0.82808834, + "learning_rate": 2.541704739753042e-06, + "loss": 0.84912348, + "num_input_tokens_seen": 153526130, + "step": 7157, + "time_per_iteration": 2.5676486492156982 + }, + { + "auxiliary_loss_clip": 0.01078325, + "auxiliary_loss_mlp": 0.0103666, + "balance_loss_clip": 1.0296123, + "balance_loss_mlp": 1.02339172, + "epoch": 0.43036224259732453, + "flos": 24389234979840.0, + "grad_norm": 2.296645889362557, + "language_loss": 0.71632981, + "learning_rate": 2.5413298255310132e-06, + "loss": 0.73747969, + "num_input_tokens_seen": 153546370, + "step": 7158, + "time_per_iteration": 2.5321097373962402 + }, + { + "auxiliary_loss_clip": 0.01064615, + "auxiliary_loss_mlp": 0.01032536, + "balance_loss_clip": 1.02824068, + "balance_loss_mlp": 1.02028084, + "epoch": 0.4304223658499925, + "flos": 17201714590080.0, + "grad_norm": 2.329036735418159, + "language_loss": 0.82658553, + "learning_rate": 2.5409548907817034e-06, + "loss": 0.84755701, + "num_input_tokens_seen": 153562800, + "step": 7159, + "time_per_iteration": 2.5540196895599365 + }, + { + "auxiliary_loss_clip": 0.0104528, + "auxiliary_loss_mlp": 0.01033909, + "balance_loss_clip": 1.02519596, + "balance_loss_mlp": 1.02118397, + "epoch": 0.43048248910266046, + "flos": 14903000835840.0, + "grad_norm": 2.1312547753740083, + "language_loss": 0.83244741, + "learning_rate": 2.54057993551933e-06, + "loss": 0.8532393, + "num_input_tokens_seen": 153578395, + "step": 7160, + "time_per_iteration": 2.5614380836486816 + }, + { + "auxiliary_loss_clip": 0.01067892, + "auxiliary_loss_mlp": 0.01040267, + "balance_loss_clip": 1.02925837, + "balance_loss_mlp": 1.02527022, + "epoch": 0.4305426123553284, + "flos": 21579835610880.0, + "grad_norm": 2.022518167611109, + "language_loss": 0.77157819, + "learning_rate": 2.5402049597581116e-06, + "loss": 0.79265976, + "num_input_tokens_seen": 153596880, + "step": 7161, + "time_per_iteration": 2.6258037090301514 + }, + { + "auxiliary_loss_clip": 0.01058266, + "auxiliary_loss_mlp": 0.01037095, + "balance_loss_clip": 1.02596617, + "balance_loss_mlp": 1.02428031, + "epoch": 0.4306027356079964, + "flos": 22601278667520.0, + "grad_norm": 1.969662485039603, + "language_loss": 0.73110402, + "learning_rate": 2.5398299635122662e-06, + "loss": 0.75205761, + "num_input_tokens_seen": 153616570, + "step": 7162, + "time_per_iteration": 2.641470432281494 + }, + { + "auxiliary_loss_clip": 0.0098495, + "auxiliary_loss_mlp": 0.00746204, + "balance_loss_clip": 1.00660813, + "balance_loss_mlp": 1.00009823, + "epoch": 0.43066285886066435, + "flos": 70672091806080.0, + "grad_norm": 0.7900124570556963, + "language_loss": 0.58986998, + "learning_rate": 2.5394549467960147e-06, + "loss": 0.60718155, + "num_input_tokens_seen": 153671450, + "step": 7163, + "time_per_iteration": 3.188142776489258 + }, + { + "auxiliary_loss_clip": 0.0104606, + "auxiliary_loss_mlp": 0.01039655, + "balance_loss_clip": 1.02541733, + "balance_loss_mlp": 1.0263629, + "epoch": 0.4307229821133323, + "flos": 26720591218560.0, + "grad_norm": 1.6777287410971566, + "language_loss": 0.79249257, + "learning_rate": 2.5390799096235783e-06, + "loss": 0.81334972, + "num_input_tokens_seen": 153691405, + "step": 7164, + "time_per_iteration": 2.672053337097168 + }, + { + "auxiliary_loss_clip": 0.01076492, + "auxiliary_loss_mlp": 0.0104058, + "balance_loss_clip": 1.02728021, + "balance_loss_mlp": 1.02717531, + "epoch": 0.4307831053660003, + "flos": 26177119464960.0, + "grad_norm": 1.8006177351391577, + "language_loss": 0.66955936, + "learning_rate": 2.538704852009177e-06, + "loss": 0.69073009, + "num_input_tokens_seen": 153711555, + "step": 7165, + "time_per_iteration": 2.7472870349884033 + }, + { + "auxiliary_loss_clip": 0.01053663, + "auxiliary_loss_mlp": 0.00747748, + "balance_loss_clip": 1.02851224, + "balance_loss_mlp": 1.00036824, + "epoch": 0.43084322861866825, + "flos": 18910343715840.0, + "grad_norm": 1.840033049722262, + "language_loss": 0.74908972, + "learning_rate": 2.538329773967034e-06, + "loss": 0.76710379, + "num_input_tokens_seen": 153730095, + "step": 7166, + "time_per_iteration": 4.292490005493164 + }, + { + "auxiliary_loss_clip": 0.01063844, + "auxiliary_loss_mlp": 0.01033438, + "balance_loss_clip": 1.02899146, + "balance_loss_mlp": 1.02223206, + "epoch": 0.4309033518713362, + "flos": 26432911192320.0, + "grad_norm": 1.6119628611149732, + "language_loss": 0.71812594, + "learning_rate": 2.537954675511372e-06, + "loss": 0.73909867, + "num_input_tokens_seen": 153749320, + "step": 7167, + "time_per_iteration": 4.2427380084991455 + }, + { + "auxiliary_loss_clip": 0.01048863, + "auxiliary_loss_mlp": 0.00747557, + "balance_loss_clip": 1.02597642, + "balance_loss_mlp": 1.00034809, + "epoch": 0.43096347512400424, + "flos": 21213295274880.0, + "grad_norm": 1.622602766804497, + "language_loss": 0.78397822, + "learning_rate": 2.537579556656414e-06, + "loss": 0.80194247, + "num_input_tokens_seen": 153767825, + "step": 7168, + "time_per_iteration": 2.6536972522735596 + }, + { + "auxiliary_loss_clip": 0.01055515, + "auxiliary_loss_mlp": 0.01042601, + "balance_loss_clip": 1.02882302, + "balance_loss_mlp": 1.02955961, + "epoch": 0.4310235983766722, + "flos": 16540131939840.0, + "grad_norm": 2.7461504551652074, + "language_loss": 0.82188356, + "learning_rate": 2.537204417416387e-06, + "loss": 0.84286469, + "num_input_tokens_seen": 153785350, + "step": 7169, + "time_per_iteration": 2.657914876937866 + }, + { + "auxiliary_loss_clip": 0.00991795, + "auxiliary_loss_mlp": 0.01004478, + "balance_loss_clip": 1.00232255, + "balance_loss_mlp": 1.00278497, + "epoch": 0.43108372162934017, + "flos": 64775704763520.0, + "grad_norm": 0.6733425378381809, + "language_loss": 0.60759407, + "learning_rate": 2.5368292578055132e-06, + "loss": 0.6275568, + "num_input_tokens_seen": 153856400, + "step": 7170, + "time_per_iteration": 3.321725845336914 + }, + { + "auxiliary_loss_clip": 0.01075253, + "auxiliary_loss_mlp": 0.01031212, + "balance_loss_clip": 1.02804649, + "balance_loss_mlp": 1.01933265, + "epoch": 0.43114384488200813, + "flos": 13444094039040.0, + "grad_norm": 1.780012110706677, + "language_loss": 0.75906336, + "learning_rate": 2.536454077838021e-06, + "loss": 0.780128, + "num_input_tokens_seen": 153875230, + "step": 7171, + "time_per_iteration": 2.715906858444214 + }, + { + "auxiliary_loss_clip": 0.01063999, + "auxiliary_loss_mlp": 0.01033794, + "balance_loss_clip": 1.02808785, + "balance_loss_mlp": 1.0215807, + "epoch": 0.4312039681346761, + "flos": 26286682924800.0, + "grad_norm": 1.6113714439134499, + "language_loss": 0.77320588, + "learning_rate": 2.5360788775281357e-06, + "loss": 0.79418379, + "num_input_tokens_seen": 153894740, + "step": 7172, + "time_per_iteration": 2.740382194519043 + }, + { + "auxiliary_loss_clip": 0.01047251, + "auxiliary_loss_mlp": 0.01043001, + "balance_loss_clip": 1.02591908, + "balance_loss_mlp": 1.02844, + "epoch": 0.43126409138734406, + "flos": 20376684627840.0, + "grad_norm": 1.9860306866863493, + "language_loss": 0.76593351, + "learning_rate": 2.535703656890086e-06, + "loss": 0.78683609, + "num_input_tokens_seen": 153913230, + "step": 7173, + "time_per_iteration": 2.7339820861816406 + }, + { + "auxiliary_loss_clip": 0.0107357, + "auxiliary_loss_mlp": 0.00747706, + "balance_loss_clip": 1.02788019, + "balance_loss_mlp": 1.00030947, + "epoch": 0.431324214640012, + "flos": 22123091882880.0, + "grad_norm": 1.9881783482197768, + "language_loss": 0.76827902, + "learning_rate": 2.5353284159381e-06, + "loss": 0.78649175, + "num_input_tokens_seen": 153933250, + "step": 7174, + "time_per_iteration": 2.609156370162964 + }, + { + "auxiliary_loss_clip": 0.01076249, + "auxiliary_loss_mlp": 0.01033454, + "balance_loss_clip": 1.02853417, + "balance_loss_mlp": 1.01928008, + "epoch": 0.43138433789268, + "flos": 15231008856960.0, + "grad_norm": 1.5936170224299533, + "language_loss": 0.82120311, + "learning_rate": 2.534953154686407e-06, + "loss": 0.84230006, + "num_input_tokens_seen": 153951325, + "step": 7175, + "time_per_iteration": 4.143289089202881 + }, + { + "auxiliary_loss_clip": 0.01034955, + "auxiliary_loss_mlp": 0.01053056, + "balance_loss_clip": 1.02666485, + "balance_loss_mlp": 1.03720737, + "epoch": 0.43144446114534796, + "flos": 18150294908160.0, + "grad_norm": 2.4970824052915805, + "language_loss": 0.74459809, + "learning_rate": 2.5345778731492366e-06, + "loss": 0.76547813, + "num_input_tokens_seen": 153966975, + "step": 7176, + "time_per_iteration": 2.7754178047180176 + }, + { + "auxiliary_loss_clip": 0.01065638, + "auxiliary_loss_mlp": 0.01033689, + "balance_loss_clip": 1.02682912, + "balance_loss_mlp": 1.02045107, + "epoch": 0.4315045843980159, + "flos": 22929861306240.0, + "grad_norm": 1.7938749990485046, + "language_loss": 0.73789597, + "learning_rate": 2.534202571340819e-06, + "loss": 0.75888926, + "num_input_tokens_seen": 153986695, + "step": 7177, + "time_per_iteration": 2.656311273574829 + }, + { + "auxiliary_loss_clip": 0.01062672, + "auxiliary_loss_mlp": 0.01037695, + "balance_loss_clip": 1.02847314, + "balance_loss_mlp": 1.02281749, + "epoch": 0.4315647076506839, + "flos": 22126862810880.0, + "grad_norm": 1.9420030434087237, + "language_loss": 0.81328046, + "learning_rate": 2.533827249275387e-06, + "loss": 0.83428419, + "num_input_tokens_seen": 154004710, + "step": 7178, + "time_per_iteration": 2.6163339614868164 + }, + { + "auxiliary_loss_clip": 0.01055666, + "auxiliary_loss_mlp": 0.01035587, + "balance_loss_clip": 1.03027642, + "balance_loss_mlp": 1.02307057, + "epoch": 0.43162483090335185, + "flos": 26871129118080.0, + "grad_norm": 1.4829021995062073, + "language_loss": 0.83954525, + "learning_rate": 2.5334519069671725e-06, + "loss": 0.86045778, + "num_input_tokens_seen": 154024320, + "step": 7179, + "time_per_iteration": 2.8967316150665283 + }, + { + "auxiliary_loss_clip": 0.01052779, + "auxiliary_loss_mlp": 0.01032296, + "balance_loss_clip": 1.02714491, + "balance_loss_mlp": 1.0193795, + "epoch": 0.4316849541560198, + "flos": 13913122855680.0, + "grad_norm": 1.6353908146933895, + "language_loss": 0.75049746, + "learning_rate": 2.5330765444304075e-06, + "loss": 0.77134824, + "num_input_tokens_seen": 154041755, + "step": 7180, + "time_per_iteration": 2.5926575660705566 + }, + { + "auxiliary_loss_clip": 0.01045928, + "auxiliary_loss_mlp": 0.00747949, + "balance_loss_clip": 1.02328789, + "balance_loss_mlp": 1.00035095, + "epoch": 0.4317450774086878, + "flos": 16435165420800.0, + "grad_norm": 1.6386790546534127, + "language_loss": 0.82172084, + "learning_rate": 2.5327011616793274e-06, + "loss": 0.83965963, + "num_input_tokens_seen": 154056775, + "step": 7181, + "time_per_iteration": 4.168341159820557 + }, + { + "auxiliary_loss_clip": 0.01051273, + "auxiliary_loss_mlp": 0.01034602, + "balance_loss_clip": 1.02781057, + "balance_loss_mlp": 1.02106535, + "epoch": 0.4318052006613558, + "flos": 20554980762240.0, + "grad_norm": 1.9147360522813985, + "language_loss": 0.88658667, + "learning_rate": 2.532325758728165e-06, + "loss": 0.90744537, + "num_input_tokens_seen": 154075015, + "step": 7182, + "time_per_iteration": 2.608844041824341 + }, + { + "auxiliary_loss_clip": 0.01064907, + "auxiliary_loss_mlp": 0.00747504, + "balance_loss_clip": 1.02932501, + "balance_loss_mlp": 1.00037992, + "epoch": 0.43186532391402377, + "flos": 22820046451200.0, + "grad_norm": 1.7439507931276192, + "language_loss": 0.75607562, + "learning_rate": 2.5319503355911566e-06, + "loss": 0.77419984, + "num_input_tokens_seen": 154095170, + "step": 7183, + "time_per_iteration": 2.720599412918091 + }, + { + "auxiliary_loss_clip": 0.01063816, + "auxiliary_loss_mlp": 0.01032321, + "balance_loss_clip": 1.0267365, + "balance_loss_mlp": 1.01953542, + "epoch": 0.43192544716669173, + "flos": 25556583081600.0, + "grad_norm": 1.7272616719971217, + "language_loss": 0.77543253, + "learning_rate": 2.5315748922825393e-06, + "loss": 0.79639387, + "num_input_tokens_seen": 154116895, + "step": 7184, + "time_per_iteration": 2.6402127742767334 + }, + { + "auxiliary_loss_clip": 0.01045733, + "auxiliary_loss_mlp": 0.01033254, + "balance_loss_clip": 1.02547598, + "balance_loss_mlp": 1.02112496, + "epoch": 0.4319855704193597, + "flos": 30954674701440.0, + "grad_norm": 2.154817740777848, + "language_loss": 0.73617411, + "learning_rate": 2.5311994288165474e-06, + "loss": 0.75696397, + "num_input_tokens_seen": 154138395, + "step": 7185, + "time_per_iteration": 2.746087074279785 + }, + { + "auxiliary_loss_clip": 0.01060201, + "auxiliary_loss_mlp": 0.01036772, + "balance_loss_clip": 1.02948952, + "balance_loss_mlp": 1.02305067, + "epoch": 0.43204569367202766, + "flos": 24238732993920.0, + "grad_norm": 2.257912709236391, + "language_loss": 0.75931388, + "learning_rate": 2.530823945207421e-06, + "loss": 0.78028363, + "num_input_tokens_seen": 154156775, + "step": 7186, + "time_per_iteration": 2.6806488037109375 + }, + { + "auxiliary_loss_clip": 0.01041386, + "auxiliary_loss_mlp": 0.01034995, + "balance_loss_clip": 1.02581942, + "balance_loss_mlp": 1.02233529, + "epoch": 0.43210581692469563, + "flos": 18406948561920.0, + "grad_norm": 3.204793793439647, + "language_loss": 0.76672101, + "learning_rate": 2.5304484414693962e-06, + "loss": 0.78748482, + "num_input_tokens_seen": 154177500, + "step": 7187, + "time_per_iteration": 2.7693028450012207 + }, + { + "auxiliary_loss_clip": 0.00989929, + "auxiliary_loss_mlp": 0.01001755, + "balance_loss_clip": 1.01128721, + "balance_loss_mlp": 1.00011027, + "epoch": 0.4321659401773636, + "flos": 49832378910720.0, + "grad_norm": 0.8640114959151766, + "language_loss": 0.68239981, + "learning_rate": 2.530072917616714e-06, + "loss": 0.7023167, + "num_input_tokens_seen": 154237110, + "step": 7188, + "time_per_iteration": 3.2730116844177246 + }, + { + "auxiliary_loss_clip": 0.01043444, + "auxiliary_loss_mlp": 0.01033216, + "balance_loss_clip": 1.02545714, + "balance_loss_mlp": 1.02096128, + "epoch": 0.43222606343003156, + "flos": 17128564542720.0, + "grad_norm": 1.9620152161519688, + "language_loss": 0.7808857, + "learning_rate": 2.529697373663614e-06, + "loss": 0.80165231, + "num_input_tokens_seen": 154253910, + "step": 7189, + "time_per_iteration": 2.6099491119384766 + }, + { + "auxiliary_loss_clip": 0.01030412, + "auxiliary_loss_mlp": 0.01042836, + "balance_loss_clip": 1.02675128, + "balance_loss_mlp": 1.02858412, + "epoch": 0.4322861866826995, + "flos": 22749949059840.0, + "grad_norm": 1.8800397716285984, + "language_loss": 0.71722102, + "learning_rate": 2.5293218096243364e-06, + "loss": 0.73795354, + "num_input_tokens_seen": 154274770, + "step": 7190, + "time_per_iteration": 2.7043182849884033 + }, + { + "auxiliary_loss_clip": 0.01047074, + "auxiliary_loss_mlp": 0.01033946, + "balance_loss_clip": 1.02405524, + "balance_loss_mlp": 1.02162623, + "epoch": 0.4323463099353675, + "flos": 27891925729920.0, + "grad_norm": 1.8665608977759585, + "language_loss": 0.79412282, + "learning_rate": 2.5289462255131223e-06, + "loss": 0.81493306, + "num_input_tokens_seen": 154295035, + "step": 7191, + "time_per_iteration": 2.7693865299224854 + }, + { + "auxiliary_loss_clip": 0.01033675, + "auxiliary_loss_mlp": 0.01028999, + "balance_loss_clip": 1.02859044, + "balance_loss_mlp": 1.01700115, + "epoch": 0.43240643318803546, + "flos": 21614740652160.0, + "grad_norm": 1.6004004477427658, + "language_loss": 0.75011563, + "learning_rate": 2.5285706213442146e-06, + "loss": 0.77074236, + "num_input_tokens_seen": 154314905, + "step": 7192, + "time_per_iteration": 2.681833505630493 + }, + { + "auxiliary_loss_clip": 0.01023802, + "auxiliary_loss_mlp": 0.01039685, + "balance_loss_clip": 1.02448058, + "balance_loss_mlp": 1.02561212, + "epoch": 0.4324665564407034, + "flos": 17558378686080.0, + "grad_norm": 1.8013820045548539, + "language_loss": 0.79062217, + "learning_rate": 2.5281949971318557e-06, + "loss": 0.81125706, + "num_input_tokens_seen": 154331740, + "step": 7193, + "time_per_iteration": 2.6754539012908936 + }, + { + "auxiliary_loss_clip": 0.01054562, + "auxiliary_loss_mlp": 0.01037144, + "balance_loss_clip": 1.02704716, + "balance_loss_mlp": 1.02415037, + "epoch": 0.4325266796933714, + "flos": 18402423448320.0, + "grad_norm": 1.7921026740769839, + "language_loss": 0.7595464, + "learning_rate": 2.5278193528902897e-06, + "loss": 0.7804634, + "num_input_tokens_seen": 154348740, + "step": 7194, + "time_per_iteration": 2.5922956466674805 + }, + { + "auxiliary_loss_clip": 0.01073641, + "auxiliary_loss_mlp": 0.0103641, + "balance_loss_clip": 1.02765274, + "balance_loss_mlp": 1.02355969, + "epoch": 0.4325868029460394, + "flos": 22564793427840.0, + "grad_norm": 3.1127166959714936, + "language_loss": 0.59466505, + "learning_rate": 2.5274436886337613e-06, + "loss": 0.61576557, + "num_input_tokens_seen": 154368835, + "step": 7195, + "time_per_iteration": 2.6853463649749756 + }, + { + "auxiliary_loss_clip": 0.01051849, + "auxiliary_loss_mlp": 0.01034085, + "balance_loss_clip": 1.02544987, + "balance_loss_mlp": 1.02008379, + "epoch": 0.43264692619870737, + "flos": 14605516396800.0, + "grad_norm": 1.8740143862260006, + "language_loss": 0.65144229, + "learning_rate": 2.527068004376515e-06, + "loss": 0.67230159, + "num_input_tokens_seen": 154384620, + "step": 7196, + "time_per_iteration": 2.709789276123047 + }, + { + "auxiliary_loss_clip": 0.01078665, + "auxiliary_loss_mlp": 0.01036814, + "balance_loss_clip": 1.0290271, + "balance_loss_mlp": 1.02329004, + "epoch": 0.43270704945137534, + "flos": 21501657659520.0, + "grad_norm": 2.1280914964582593, + "language_loss": 0.72206116, + "learning_rate": 2.526692300132797e-06, + "loss": 0.74321592, + "num_input_tokens_seen": 154402865, + "step": 7197, + "time_per_iteration": 2.637166976928711 + }, + { + "auxiliary_loss_clip": 0.01062215, + "auxiliary_loss_mlp": 0.01033873, + "balance_loss_clip": 1.02836406, + "balance_loss_mlp": 1.02134407, + "epoch": 0.4327671727040433, + "flos": 25155891889920.0, + "grad_norm": 1.5061812241730703, + "language_loss": 0.72613811, + "learning_rate": 2.5263165759168547e-06, + "loss": 0.74709898, + "num_input_tokens_seen": 154423625, + "step": 7198, + "time_per_iteration": 2.7251250743865967 + }, + { + "auxiliary_loss_clip": 0.01040833, + "auxiliary_loss_mlp": 0.01026473, + "balance_loss_clip": 1.02617431, + "balance_loss_mlp": 1.01465321, + "epoch": 0.43282729595671127, + "flos": 25447163276160.0, + "grad_norm": 1.3787111145978121, + "language_loss": 0.81029236, + "learning_rate": 2.525940831742934e-06, + "loss": 0.8309654, + "num_input_tokens_seen": 154444775, + "step": 7199, + "time_per_iteration": 2.814837694168091 + }, + { + "auxiliary_loss_clip": 0.01052181, + "auxiliary_loss_mlp": 0.01029344, + "balance_loss_clip": 1.02711511, + "balance_loss_mlp": 1.01667821, + "epoch": 0.43288741920937923, + "flos": 24126116878080.0, + "grad_norm": 5.540623690782354, + "language_loss": 0.68201816, + "learning_rate": 2.525565067625286e-06, + "loss": 0.70283341, + "num_input_tokens_seen": 154460815, + "step": 7200, + "time_per_iteration": 2.7188258171081543 + }, + { + "auxiliary_loss_clip": 0.01053466, + "auxiliary_loss_mlp": 0.00747569, + "balance_loss_clip": 1.02751279, + "balance_loss_mlp": 1.00036728, + "epoch": 0.4329475424620472, + "flos": 19204955066880.0, + "grad_norm": 1.6795378192684154, + "language_loss": 0.87257838, + "learning_rate": 2.525189283578157e-06, + "loss": 0.89058876, + "num_input_tokens_seen": 154479145, + "step": 7201, + "time_per_iteration": 2.757439613342285 + }, + { + "auxiliary_loss_clip": 0.01028889, + "auxiliary_loss_mlp": 0.01038467, + "balance_loss_clip": 1.03006148, + "balance_loss_mlp": 1.02310681, + "epoch": 0.43300766571471516, + "flos": 22638374438400.0, + "grad_norm": 3.6617566745565893, + "language_loss": 0.64513093, + "learning_rate": 2.5248134796157974e-06, + "loss": 0.66580451, + "num_input_tokens_seen": 154498905, + "step": 7202, + "time_per_iteration": 2.8828701972961426 + }, + { + "auxiliary_loss_clip": 0.0102701, + "auxiliary_loss_mlp": 0.01028539, + "balance_loss_clip": 1.02538931, + "balance_loss_mlp": 1.01662445, + "epoch": 0.4330677889673831, + "flos": 22121080721280.0, + "grad_norm": 1.7920516521845562, + "language_loss": 0.81908488, + "learning_rate": 2.5244376557524586e-06, + "loss": 0.83964038, + "num_input_tokens_seen": 154517270, + "step": 7203, + "time_per_iteration": 2.7720131874084473 + }, + { + "auxiliary_loss_clip": 0.01040645, + "auxiliary_loss_mlp": 0.0103939, + "balance_loss_clip": 1.02517986, + "balance_loss_mlp": 1.02655089, + "epoch": 0.4331279122200511, + "flos": 23221527742080.0, + "grad_norm": 1.829669185070496, + "language_loss": 0.81229925, + "learning_rate": 2.5240618120023912e-06, + "loss": 0.8330996, + "num_input_tokens_seen": 154535945, + "step": 7204, + "time_per_iteration": 2.854576349258423 + }, + { + "auxiliary_loss_clip": 0.01051558, + "auxiliary_loss_mlp": 0.01029345, + "balance_loss_clip": 1.02587235, + "balance_loss_mlp": 1.01752532, + "epoch": 0.43318803547271906, + "flos": 18259750627200.0, + "grad_norm": 2.1599520438901822, + "language_loss": 0.73821497, + "learning_rate": 2.5236859483798468e-06, + "loss": 0.75902402, + "num_input_tokens_seen": 154554935, + "step": 7205, + "time_per_iteration": 2.6886441707611084 + }, + { + "auxiliary_loss_clip": 0.01075738, + "auxiliary_loss_mlp": 0.00747416, + "balance_loss_clip": 1.03115571, + "balance_loss_mlp": 1.00024879, + "epoch": 0.433248158725387, + "flos": 27418407713280.0, + "grad_norm": 2.040573707874433, + "language_loss": 0.75225282, + "learning_rate": 2.5233100648990803e-06, + "loss": 0.77048433, + "num_input_tokens_seen": 154576065, + "step": 7206, + "time_per_iteration": 2.712442636489868 + }, + { + "auxiliary_loss_clip": 0.01027974, + "auxiliary_loss_mlp": 0.01031668, + "balance_loss_clip": 1.02654612, + "balance_loss_mlp": 1.01808977, + "epoch": 0.433308281978055, + "flos": 23218008209280.0, + "grad_norm": 1.9773394232966128, + "language_loss": 0.78715634, + "learning_rate": 2.522934161574342e-06, + "loss": 0.80775273, + "num_input_tokens_seen": 154595110, + "step": 7207, + "time_per_iteration": 2.823516845703125 + }, + { + "auxiliary_loss_clip": 0.01043952, + "auxiliary_loss_mlp": 0.01035369, + "balance_loss_clip": 1.02765012, + "balance_loss_mlp": 1.02153516, + "epoch": 0.433368405230723, + "flos": 15852407166720.0, + "grad_norm": 2.029198277691325, + "language_loss": 0.80785316, + "learning_rate": 2.5225582384198888e-06, + "loss": 0.82864642, + "num_input_tokens_seen": 154612255, + "step": 7208, + "time_per_iteration": 2.689211130142212 + }, + { + "auxiliary_loss_clip": 0.0105291, + "auxiliary_loss_mlp": 0.01033978, + "balance_loss_clip": 1.02741981, + "balance_loss_mlp": 1.02149057, + "epoch": 0.433428528483391, + "flos": 19026084314880.0, + "grad_norm": 2.1482542197240515, + "language_loss": 0.70519489, + "learning_rate": 2.5221822954499744e-06, + "loss": 0.72606373, + "num_input_tokens_seen": 154630440, + "step": 7209, + "time_per_iteration": 2.75677490234375 + }, + { + "auxiliary_loss_clip": 0.01059925, + "auxiliary_loss_mlp": 0.01031651, + "balance_loss_clip": 1.02623188, + "balance_loss_mlp": 1.01859772, + "epoch": 0.43348865173605894, + "flos": 24718248581760.0, + "grad_norm": 1.3610146547787598, + "language_loss": 0.81385505, + "learning_rate": 2.5218063326788557e-06, + "loss": 0.8347708, + "num_input_tokens_seen": 154652515, + "step": 7210, + "time_per_iteration": 2.6169676780700684 + }, + { + "auxiliary_loss_clip": 0.01050961, + "auxiliary_loss_mlp": 0.01033558, + "balance_loss_clip": 1.02677488, + "balance_loss_mlp": 1.02162504, + "epoch": 0.4335487749887269, + "flos": 22090664880000.0, + "grad_norm": 1.8494368658698448, + "language_loss": 0.82011306, + "learning_rate": 2.5214303501207885e-06, + "loss": 0.84095824, + "num_input_tokens_seen": 154670965, + "step": 7211, + "time_per_iteration": 2.6411361694335938 + }, + { + "auxiliary_loss_clip": 0.01059422, + "auxiliary_loss_mlp": 0.01035148, + "balance_loss_clip": 1.02484143, + "balance_loss_mlp": 1.02362633, + "epoch": 0.43360889824139487, + "flos": 22382941847040.0, + "grad_norm": 2.163106227191535, + "language_loss": 0.74656713, + "learning_rate": 2.521054347790029e-06, + "loss": 0.7675128, + "num_input_tokens_seen": 154689980, + "step": 7212, + "time_per_iteration": 2.710538625717163 + }, + { + "auxiliary_loss_clip": 0.01048536, + "auxiliary_loss_mlp": 0.01032053, + "balance_loss_clip": 1.02787232, + "balance_loss_mlp": 1.02041841, + "epoch": 0.43366902149406283, + "flos": 17528286067200.0, + "grad_norm": 1.8120202006017152, + "language_loss": 0.76424348, + "learning_rate": 2.5206783257008375e-06, + "loss": 0.78504944, + "num_input_tokens_seen": 154706570, + "step": 7213, + "time_per_iteration": 2.643815040588379 + }, + { + "auxiliary_loss_clip": 0.01064724, + "auxiliary_loss_mlp": 0.01031868, + "balance_loss_clip": 1.02862954, + "balance_loss_mlp": 1.01985717, + "epoch": 0.4337291447467308, + "flos": 19022672522880.0, + "grad_norm": 1.5708536460189477, + "language_loss": 0.64824384, + "learning_rate": 2.520302283867471e-06, + "loss": 0.66920972, + "num_input_tokens_seen": 154725210, + "step": 7214, + "time_per_iteration": 5.7594687938690186 + }, + { + "auxiliary_loss_clip": 0.01047092, + "auxiliary_loss_mlp": 0.01033103, + "balance_loss_clip": 1.02418661, + "balance_loss_mlp": 1.02180755, + "epoch": 0.43378926799939876, + "flos": 27234042180480.0, + "grad_norm": 1.6805996567914396, + "language_loss": 0.71498793, + "learning_rate": 2.519926222304191e-06, + "loss": 0.7357899, + "num_input_tokens_seen": 154745945, + "step": 7215, + "time_per_iteration": 2.679529905319214 + }, + { + "auxiliary_loss_clip": 0.01044831, + "auxiliary_loss_mlp": 0.01033577, + "balance_loss_clip": 1.02632904, + "balance_loss_mlp": 1.02001071, + "epoch": 0.43384939125206673, + "flos": 15961108700160.0, + "grad_norm": 1.7045784051327242, + "language_loss": 0.75278306, + "learning_rate": 2.519550141025255e-06, + "loss": 0.77356714, + "num_input_tokens_seen": 154763580, + "step": 7216, + "time_per_iteration": 2.539588689804077 + }, + { + "auxiliary_loss_clip": 0.01056817, + "auxiliary_loss_mlp": 0.01044133, + "balance_loss_clip": 1.0275898, + "balance_loss_mlp": 1.02951169, + "epoch": 0.4339095145047347, + "flos": 21793216354560.0, + "grad_norm": 4.317939572624773, + "language_loss": 0.75826412, + "learning_rate": 2.519174040044927e-06, + "loss": 0.77927363, + "num_input_tokens_seen": 154776825, + "step": 7217, + "time_per_iteration": 2.640099048614502 + }, + { + "auxiliary_loss_clip": 0.01034112, + "auxiliary_loss_mlp": 0.01038059, + "balance_loss_clip": 1.02485573, + "balance_loss_mlp": 1.02513123, + "epoch": 0.43396963775740266, + "flos": 14209853109120.0, + "grad_norm": 1.8020002887127011, + "language_loss": 0.73899084, + "learning_rate": 2.5187979193774664e-06, + "loss": 0.75971258, + "num_input_tokens_seen": 154794025, + "step": 7218, + "time_per_iteration": 2.7995970249176025 + }, + { + "auxiliary_loss_clip": 0.01052452, + "auxiliary_loss_mlp": 0.0103034, + "balance_loss_clip": 1.02977347, + "balance_loss_mlp": 1.01806712, + "epoch": 0.4340297610100706, + "flos": 19719052473600.0, + "grad_norm": 1.7272245666438597, + "language_loss": 0.69019723, + "learning_rate": 2.5184217790371367e-06, + "loss": 0.71102512, + "num_input_tokens_seen": 154813105, + "step": 7219, + "time_per_iteration": 2.6365468502044678 + }, + { + "auxiliary_loss_clip": 0.01043473, + "auxiliary_loss_mlp": 0.01034493, + "balance_loss_clip": 1.02657294, + "balance_loss_mlp": 1.02216721, + "epoch": 0.4340898842627386, + "flos": 18953508885120.0, + "grad_norm": 1.4623224935369665, + "language_loss": 0.77297461, + "learning_rate": 2.518045619038202e-06, + "loss": 0.79375422, + "num_input_tokens_seen": 154833525, + "step": 7220, + "time_per_iteration": 2.635136604309082 + }, + { + "auxiliary_loss_clip": 0.01016368, + "auxiliary_loss_mlp": 0.01033953, + "balance_loss_clip": 1.02731574, + "balance_loss_mlp": 1.02079201, + "epoch": 0.4341500075154066, + "flos": 22018304931840.0, + "grad_norm": 1.718626309677747, + "language_loss": 0.69430375, + "learning_rate": 2.5176694393949243e-06, + "loss": 0.71480703, + "num_input_tokens_seen": 154853090, + "step": 7221, + "time_per_iteration": 2.6965880393981934 + }, + { + "auxiliary_loss_clip": 0.01064656, + "auxiliary_loss_mlp": 0.01034148, + "balance_loss_clip": 1.02721119, + "balance_loss_mlp": 1.0220598, + "epoch": 0.4342101307680746, + "flos": 23582465556480.0, + "grad_norm": 1.6532939624548575, + "language_loss": 0.65327799, + "learning_rate": 2.51729324012157e-06, + "loss": 0.6742661, + "num_input_tokens_seen": 154872055, + "step": 7222, + "time_per_iteration": 4.22076416015625 + }, + { + "auxiliary_loss_clip": 0.0103968, + "auxiliary_loss_mlp": 0.01029053, + "balance_loss_clip": 1.02454436, + "balance_loss_mlp": 1.01600552, + "epoch": 0.43427025402074254, + "flos": 17967976450560.0, + "grad_norm": 1.906696132250825, + "language_loss": 0.7272718, + "learning_rate": 2.5169170212324053e-06, + "loss": 0.74795914, + "num_input_tokens_seen": 154886645, + "step": 7223, + "time_per_iteration": 2.671513557434082 + }, + { + "auxiliary_loss_clip": 0.01072946, + "auxiliary_loss_mlp": 0.01027879, + "balance_loss_clip": 1.02604723, + "balance_loss_mlp": 1.01542139, + "epoch": 0.4343303772734105, + "flos": 26286395616000.0, + "grad_norm": 1.8539594092954972, + "language_loss": 0.937105, + "learning_rate": 2.516540782741694e-06, + "loss": 0.95811319, + "num_input_tokens_seen": 154906775, + "step": 7224, + "time_per_iteration": 2.725059747695923 + }, + { + "auxiliary_loss_clip": 0.01032221, + "auxiliary_loss_mlp": 0.01033042, + "balance_loss_clip": 1.02392805, + "balance_loss_mlp": 1.02069187, + "epoch": 0.43439050052607847, + "flos": 26833961520000.0, + "grad_norm": 1.4476081070970557, + "language_loss": 0.60998738, + "learning_rate": 2.5161645246637056e-06, + "loss": 0.63064003, + "num_input_tokens_seen": 154926990, + "step": 7225, + "time_per_iteration": 2.7348530292510986 + }, + { + "auxiliary_loss_clip": 0.01041245, + "auxiliary_loss_mlp": 0.00747506, + "balance_loss_clip": 1.02482343, + "balance_loss_mlp": 1.00027871, + "epoch": 0.43445062377874644, + "flos": 21397660807680.0, + "grad_norm": 2.3053782504467057, + "language_loss": 0.78004646, + "learning_rate": 2.5157882470127054e-06, + "loss": 0.79793394, + "num_input_tokens_seen": 154946210, + "step": 7226, + "time_per_iteration": 2.6416752338409424 + }, + { + "auxiliary_loss_clip": 0.0106414, + "auxiliary_loss_mlp": 0.01029211, + "balance_loss_clip": 1.02896762, + "balance_loss_mlp": 1.01649141, + "epoch": 0.4345107470314144, + "flos": 19901945548800.0, + "grad_norm": 1.7630566584870049, + "language_loss": 0.84561849, + "learning_rate": 2.515411949802964e-06, + "loss": 0.866552, + "num_input_tokens_seen": 154964995, + "step": 7227, + "time_per_iteration": 2.6015641689300537 + }, + { + "auxiliary_loss_clip": 0.01056772, + "auxiliary_loss_mlp": 0.01034074, + "balance_loss_clip": 1.02600408, + "balance_loss_mlp": 1.02093661, + "epoch": 0.43457087028408237, + "flos": 26432623883520.0, + "grad_norm": 2.764061243066675, + "language_loss": 0.76783592, + "learning_rate": 2.5150356330487498e-06, + "loss": 0.78874439, + "num_input_tokens_seen": 154984775, + "step": 7228, + "time_per_iteration": 4.324065208435059 + }, + { + "auxiliary_loss_clip": 0.01037944, + "auxiliary_loss_mlp": 0.01034841, + "balance_loss_clip": 1.03100085, + "balance_loss_mlp": 1.02194834, + "epoch": 0.43463099353675033, + "flos": 31868816855040.0, + "grad_norm": 1.5791939866545681, + "language_loss": 0.80109435, + "learning_rate": 2.5146592967643324e-06, + "loss": 0.82182229, + "num_input_tokens_seen": 155008125, + "step": 7229, + "time_per_iteration": 2.8148205280303955 + }, + { + "auxiliary_loss_clip": 0.01058082, + "auxiliary_loss_mlp": 0.01035407, + "balance_loss_clip": 1.02497554, + "balance_loss_mlp": 1.022717, + "epoch": 0.4346911167894183, + "flos": 24571266128640.0, + "grad_norm": 1.8215126460321653, + "language_loss": 0.81724524, + "learning_rate": 2.5142829409639834e-06, + "loss": 0.83818007, + "num_input_tokens_seen": 155027885, + "step": 7230, + "time_per_iteration": 2.643314838409424 + }, + { + "auxiliary_loss_clip": 0.01058195, + "auxiliary_loss_mlp": 0.01039652, + "balance_loss_clip": 1.02761865, + "balance_loss_mlp": 1.02610397, + "epoch": 0.43475124004208626, + "flos": 17090678672640.0, + "grad_norm": 2.230677644070349, + "language_loss": 0.7732451, + "learning_rate": 2.513906565661973e-06, + "loss": 0.79422355, + "num_input_tokens_seen": 155043375, + "step": 7231, + "time_per_iteration": 2.594108819961548 + }, + { + "auxiliary_loss_clip": 0.01027101, + "auxiliary_loss_mlp": 0.01033355, + "balance_loss_clip": 1.02560949, + "balance_loss_mlp": 1.02174437, + "epoch": 0.4348113632947542, + "flos": 26104615862400.0, + "grad_norm": 1.678558126901942, + "language_loss": 0.68722439, + "learning_rate": 2.513530170872575e-06, + "loss": 0.707829, + "num_input_tokens_seen": 155062930, + "step": 7232, + "time_per_iteration": 2.7189228534698486 + }, + { + "auxiliary_loss_clip": 0.01044107, + "auxiliary_loss_mlp": 0.01032443, + "balance_loss_clip": 1.02761507, + "balance_loss_mlp": 1.01934171, + "epoch": 0.4348714865474222, + "flos": 34200496316160.0, + "grad_norm": 1.535053291446651, + "language_loss": 0.71823722, + "learning_rate": 2.5131537566100605e-06, + "loss": 0.7390027, + "num_input_tokens_seen": 155084980, + "step": 7233, + "time_per_iteration": 2.737421751022339 + }, + { + "auxiliary_loss_clip": 0.01018817, + "auxiliary_loss_mlp": 0.01040649, + "balance_loss_clip": 1.02708626, + "balance_loss_mlp": 1.02721381, + "epoch": 0.43493160980009016, + "flos": 31537468869120.0, + "grad_norm": 1.6704376217311714, + "language_loss": 0.74239242, + "learning_rate": 2.5127773228887053e-06, + "loss": 0.76298714, + "num_input_tokens_seen": 155107260, + "step": 7234, + "time_per_iteration": 2.768752098083496 + }, + { + "auxiliary_loss_clip": 0.01053704, + "auxiliary_loss_mlp": 0.01039629, + "balance_loss_clip": 1.02664232, + "balance_loss_mlp": 1.0261879, + "epoch": 0.4349917330527582, + "flos": 24061334699520.0, + "grad_norm": 2.12057729267492, + "language_loss": 0.59420419, + "learning_rate": 2.512400869722782e-06, + "loss": 0.61513758, + "num_input_tokens_seen": 155126720, + "step": 7235, + "time_per_iteration": 2.631439685821533 + }, + { + "auxiliary_loss_clip": 0.01006129, + "auxiliary_loss_mlp": 0.01036908, + "balance_loss_clip": 1.0199337, + "balance_loss_mlp": 1.02223396, + "epoch": 0.43505185630542614, + "flos": 30519329863680.0, + "grad_norm": 2.4398286310194517, + "language_loss": 0.77625412, + "learning_rate": 2.512024397126566e-06, + "loss": 0.79668444, + "num_input_tokens_seen": 155148640, + "step": 7236, + "time_per_iteration": 2.7934858798980713 + }, + { + "auxiliary_loss_clip": 0.01069613, + "auxiliary_loss_mlp": 0.0102673, + "balance_loss_clip": 1.02585363, + "balance_loss_mlp": 1.01427257, + "epoch": 0.4351119795580941, + "flos": 15735158196480.0, + "grad_norm": 1.7023795018076795, + "language_loss": 0.81049371, + "learning_rate": 2.5116479051143345e-06, + "loss": 0.83145714, + "num_input_tokens_seen": 155165870, + "step": 7237, + "time_per_iteration": 2.56142520904541 + }, + { + "auxiliary_loss_clip": 0.01058196, + "auxiliary_loss_mlp": 0.01032699, + "balance_loss_clip": 1.02639651, + "balance_loss_mlp": 1.02020574, + "epoch": 0.4351721028107621, + "flos": 18731760272640.0, + "grad_norm": 1.6299662907427892, + "language_loss": 0.6325264, + "learning_rate": 2.5112713937003623e-06, + "loss": 0.65343535, + "num_input_tokens_seen": 155185315, + "step": 7238, + "time_per_iteration": 2.655984878540039 + }, + { + "auxiliary_loss_clip": 0.01040369, + "auxiliary_loss_mlp": 0.00747524, + "balance_loss_clip": 1.0262152, + "balance_loss_mlp": 1.00035262, + "epoch": 0.43523222606343004, + "flos": 25226887121280.0, + "grad_norm": 1.8387084146090718, + "language_loss": 0.85952687, + "learning_rate": 2.510894862898928e-06, + "loss": 0.87740588, + "num_input_tokens_seen": 155205790, + "step": 7239, + "time_per_iteration": 2.7915585041046143 + }, + { + "auxiliary_loss_clip": 0.01051864, + "auxiliary_loss_mlp": 0.01028503, + "balance_loss_clip": 1.02628589, + "balance_loss_mlp": 1.0161531, + "epoch": 0.435292349316098, + "flos": 22709190101760.0, + "grad_norm": 1.5086228096564676, + "language_loss": 0.72521377, + "learning_rate": 2.510518312724309e-06, + "loss": 0.74601746, + "num_input_tokens_seen": 155226475, + "step": 7240, + "time_per_iteration": 2.792226552963257 + }, + { + "auxiliary_loss_clip": 0.0104569, + "auxiliary_loss_mlp": 0.01030121, + "balance_loss_clip": 1.03025293, + "balance_loss_mlp": 1.01762199, + "epoch": 0.43535247256876597, + "flos": 25775889569280.0, + "grad_norm": 2.8744345340509043, + "language_loss": 0.81603473, + "learning_rate": 2.5101417431907842e-06, + "loss": 0.83679283, + "num_input_tokens_seen": 155247110, + "step": 7241, + "time_per_iteration": 2.7584543228149414 + }, + { + "auxiliary_loss_clip": 0.01046161, + "auxiliary_loss_mlp": 0.00747684, + "balance_loss_clip": 1.02777421, + "balance_loss_mlp": 1.00030077, + "epoch": 0.43541259582143393, + "flos": 17528142412800.0, + "grad_norm": 2.6208281018876813, + "language_loss": 0.79476607, + "learning_rate": 2.5097651543126345e-06, + "loss": 0.81270456, + "num_input_tokens_seen": 155261335, + "step": 7242, + "time_per_iteration": 2.729440450668335 + }, + { + "auxiliary_loss_clip": 0.01043934, + "auxiliary_loss_mlp": 0.01032624, + "balance_loss_clip": 1.0234828, + "balance_loss_mlp": 1.01922464, + "epoch": 0.4354727190741019, + "flos": 15195205975680.0, + "grad_norm": 2.2604912944141518, + "language_loss": 0.68116289, + "learning_rate": 2.509388546104138e-06, + "loss": 0.70192844, + "num_input_tokens_seen": 155278510, + "step": 7243, + "time_per_iteration": 2.849616765975952 + }, + { + "auxiliary_loss_clip": 0.01014369, + "auxiliary_loss_mlp": 0.01032922, + "balance_loss_clip": 1.02468157, + "balance_loss_mlp": 1.02099478, + "epoch": 0.43553284232676986, + "flos": 16649264436480.0, + "grad_norm": 1.6076400610539678, + "language_loss": 0.81621659, + "learning_rate": 2.5090119185795766e-06, + "loss": 0.83668947, + "num_input_tokens_seen": 155296450, + "step": 7244, + "time_per_iteration": 2.9174633026123047 + }, + { + "auxiliary_loss_clip": 0.01022533, + "auxiliary_loss_mlp": 0.01028385, + "balance_loss_clip": 1.0275377, + "balance_loss_mlp": 1.01653528, + "epoch": 0.43559296557943783, + "flos": 23400865370880.0, + "grad_norm": 2.1397214284689645, + "language_loss": 0.73304939, + "learning_rate": 2.508635271753234e-06, + "loss": 0.75355864, + "num_input_tokens_seen": 155316080, + "step": 7245, + "time_per_iteration": 2.8031868934631348 + }, + { + "auxiliary_loss_clip": 0.01028451, + "auxiliary_loss_mlp": 0.01037406, + "balance_loss_clip": 1.02719152, + "balance_loss_mlp": 1.02532458, + "epoch": 0.4356530888321058, + "flos": 22419067950720.0, + "grad_norm": 1.7694120337858572, + "language_loss": 0.76865375, + "learning_rate": 2.508258605639389e-06, + "loss": 0.78931236, + "num_input_tokens_seen": 155336765, + "step": 7246, + "time_per_iteration": 2.7117321491241455 + }, + { + "auxiliary_loss_clip": 0.01059112, + "auxiliary_loss_mlp": 0.01035703, + "balance_loss_clip": 1.0257678, + "balance_loss_mlp": 1.02290034, + "epoch": 0.43571321208477376, + "flos": 21616141282560.0, + "grad_norm": 2.0720639045076097, + "language_loss": 0.85391998, + "learning_rate": 2.5078819202523275e-06, + "loss": 0.87486809, + "num_input_tokens_seen": 155356440, + "step": 7247, + "time_per_iteration": 2.614734172821045 + }, + { + "auxiliary_loss_clip": 0.01072384, + "auxiliary_loss_mlp": 0.0103663, + "balance_loss_clip": 1.02693915, + "balance_loss_mlp": 1.02468491, + "epoch": 0.4357733353374418, + "flos": 23987358639360.0, + "grad_norm": 1.6280434890620175, + "language_loss": 0.72427964, + "learning_rate": 2.507505215606333e-06, + "loss": 0.74536985, + "num_input_tokens_seen": 155377070, + "step": 7248, + "time_per_iteration": 2.6096951961517334 + }, + { + "auxiliary_loss_clip": 0.01061571, + "auxiliary_loss_mlp": 0.01031091, + "balance_loss_clip": 1.02680814, + "balance_loss_mlp": 1.01938498, + "epoch": 0.43583345859010975, + "flos": 25264737077760.0, + "grad_norm": 3.944235226637293, + "language_loss": 0.87171489, + "learning_rate": 2.5071284917156893e-06, + "loss": 0.89264154, + "num_input_tokens_seen": 155398415, + "step": 7249, + "time_per_iteration": 2.671626567840576 + }, + { + "auxiliary_loss_clip": 0.01056091, + "auxiliary_loss_mlp": 0.01038824, + "balance_loss_clip": 1.0287137, + "balance_loss_mlp": 1.02705216, + "epoch": 0.4358935818427777, + "flos": 23696302734720.0, + "grad_norm": 1.728051761764531, + "language_loss": 0.82216018, + "learning_rate": 2.506751748594683e-06, + "loss": 0.84310931, + "num_input_tokens_seen": 155415625, + "step": 7250, + "time_per_iteration": 2.7448198795318604 + }, + { + "auxiliary_loss_clip": 0.01066298, + "auxiliary_loss_mlp": 0.01032811, + "balance_loss_clip": 1.03041315, + "balance_loss_mlp": 1.0204972, + "epoch": 0.4359537050954457, + "flos": 29532827761920.0, + "grad_norm": 2.122888245375835, + "language_loss": 0.84773946, + "learning_rate": 2.5063749862575988e-06, + "loss": 0.8687306, + "num_input_tokens_seen": 155435505, + "step": 7251, + "time_per_iteration": 2.690171718597412 + }, + { + "auxiliary_loss_clip": 0.01049829, + "auxiliary_loss_mlp": 0.01033588, + "balance_loss_clip": 1.02304304, + "balance_loss_mlp": 1.02078509, + "epoch": 0.43601382834811364, + "flos": 22711273090560.0, + "grad_norm": 1.740743347395012, + "language_loss": 0.6938529, + "learning_rate": 2.5059982047187245e-06, + "loss": 0.71468711, + "num_input_tokens_seen": 155455425, + "step": 7252, + "time_per_iteration": 2.651076078414917 + }, + { + "auxiliary_loss_clip": 0.01045765, + "auxiliary_loss_mlp": 0.01033996, + "balance_loss_clip": 1.02613497, + "balance_loss_mlp": 1.02085364, + "epoch": 0.4360739516007816, + "flos": 19098731571840.0, + "grad_norm": 1.646656152301196, + "language_loss": 0.84084475, + "learning_rate": 2.505621403992348e-06, + "loss": 0.86164236, + "num_input_tokens_seen": 155474250, + "step": 7253, + "time_per_iteration": 2.6604950428009033 + }, + { + "auxiliary_loss_clip": 0.01060819, + "auxiliary_loss_mlp": 0.01031948, + "balance_loss_clip": 1.02708352, + "balance_loss_mlp": 1.01941299, + "epoch": 0.43613407485344957, + "flos": 23404420817280.0, + "grad_norm": 1.4181409939916392, + "language_loss": 0.7015909, + "learning_rate": 2.505244584092757e-06, + "loss": 0.72251856, + "num_input_tokens_seen": 155494685, + "step": 7254, + "time_per_iteration": 2.559210777282715 + }, + { + "auxiliary_loss_clip": 0.01051389, + "auxiliary_loss_mlp": 0.01030823, + "balance_loss_clip": 1.02684569, + "balance_loss_mlp": 1.01877105, + "epoch": 0.43619419810611754, + "flos": 22637799820800.0, + "grad_norm": 1.9097539695489487, + "language_loss": 0.81255484, + "learning_rate": 2.5048677450342406e-06, + "loss": 0.83337694, + "num_input_tokens_seen": 155513040, + "step": 7255, + "time_per_iteration": 2.589813709259033 + }, + { + "auxiliary_loss_clip": 0.01071865, + "auxiliary_loss_mlp": 0.01030526, + "balance_loss_clip": 1.0265255, + "balance_loss_mlp": 1.01870036, + "epoch": 0.4362543213587855, + "flos": 20047958334720.0, + "grad_norm": 1.7059831405529813, + "language_loss": 0.7757951, + "learning_rate": 2.504490886831089e-06, + "loss": 0.79681903, + "num_input_tokens_seen": 155530100, + "step": 7256, + "time_per_iteration": 2.4863617420196533 + }, + { + "auxiliary_loss_clip": 0.01073216, + "auxiliary_loss_mlp": 0.01029187, + "balance_loss_clip": 1.02931905, + "balance_loss_mlp": 1.01720643, + "epoch": 0.43631444461145347, + "flos": 21361319222400.0, + "grad_norm": 1.662844661040789, + "language_loss": 0.76245165, + "learning_rate": 2.5041140094975922e-06, + "loss": 0.78347564, + "num_input_tokens_seen": 155549375, + "step": 7257, + "time_per_iteration": 2.490659475326538 + }, + { + "auxiliary_loss_clip": 0.01062515, + "auxiliary_loss_mlp": 0.01033, + "balance_loss_clip": 1.02626872, + "balance_loss_mlp": 1.01985717, + "epoch": 0.43637456786412143, + "flos": 22418529246720.0, + "grad_norm": 1.6793815413084605, + "language_loss": 0.72755337, + "learning_rate": 2.5037371130480417e-06, + "loss": 0.74850857, + "num_input_tokens_seen": 155569395, + "step": 7258, + "time_per_iteration": 2.676974058151245 + }, + { + "auxiliary_loss_clip": 0.01051623, + "auxiliary_loss_mlp": 0.01030977, + "balance_loss_clip": 1.02670515, + "balance_loss_mlp": 1.01871586, + "epoch": 0.4364346911167894, + "flos": 28548839612160.0, + "grad_norm": 1.965738075512679, + "language_loss": 0.76758677, + "learning_rate": 2.5033601974967297e-06, + "loss": 0.78841281, + "num_input_tokens_seen": 155589090, + "step": 7259, + "time_per_iteration": 2.6692705154418945 + }, + { + "auxiliary_loss_clip": 0.01000831, + "auxiliary_loss_mlp": 0.01003027, + "balance_loss_clip": 1.01183844, + "balance_loss_mlp": 1.00139427, + "epoch": 0.43649481436945736, + "flos": 62659345380480.0, + "grad_norm": 0.7864711932155708, + "language_loss": 0.57007182, + "learning_rate": 2.5029832628579483e-06, + "loss": 0.59011042, + "num_input_tokens_seen": 155648660, + "step": 7260, + "time_per_iteration": 3.134399652481079 + }, + { + "auxiliary_loss_clip": 0.01049029, + "auxiliary_loss_mlp": 0.01038639, + "balance_loss_clip": 1.02475095, + "balance_loss_mlp": 1.0252341, + "epoch": 0.4365549376221254, + "flos": 30592120775040.0, + "grad_norm": 3.6047305976026682, + "language_loss": 0.70720184, + "learning_rate": 2.5026063091459907e-06, + "loss": 0.72807854, + "num_input_tokens_seen": 155669945, + "step": 7261, + "time_per_iteration": 5.9524219036102295 + }, + { + "auxiliary_loss_clip": 0.01024185, + "auxiliary_loss_mlp": 0.01047128, + "balance_loss_clip": 1.02382779, + "balance_loss_mlp": 1.03353262, + "epoch": 0.43661506087479335, + "flos": 17165875795200.0, + "grad_norm": 6.506460750381026, + "language_loss": 0.69011658, + "learning_rate": 2.5022293363751522e-06, + "loss": 0.71082973, + "num_input_tokens_seen": 155688555, + "step": 7262, + "time_per_iteration": 2.757291078567505 + }, + { + "auxiliary_loss_clip": 0.01013579, + "auxiliary_loss_mlp": 0.01031631, + "balance_loss_clip": 1.02704287, + "balance_loss_mlp": 1.0207355, + "epoch": 0.4366751841274613, + "flos": 22047499710720.0, + "grad_norm": 1.7623286115524104, + "language_loss": 0.79696423, + "learning_rate": 2.501852344559726e-06, + "loss": 0.81741637, + "num_input_tokens_seen": 155705370, + "step": 7263, + "time_per_iteration": 2.8557021617889404 + }, + { + "auxiliary_loss_clip": 0.01043222, + "auxiliary_loss_mlp": 0.01041401, + "balance_loss_clip": 1.02978897, + "balance_loss_mlp": 1.02874708, + "epoch": 0.4367353073801293, + "flos": 15997306631040.0, + "grad_norm": 2.280295546443318, + "language_loss": 0.75496376, + "learning_rate": 2.50147533371401e-06, + "loss": 0.77581, + "num_input_tokens_seen": 155721890, + "step": 7264, + "time_per_iteration": 2.8378429412841797 + }, + { + "auxiliary_loss_clip": 0.01023809, + "auxiliary_loss_mlp": 0.01031246, + "balance_loss_clip": 1.02596879, + "balance_loss_mlp": 1.01856816, + "epoch": 0.43679543063279724, + "flos": 38217535868160.0, + "grad_norm": 1.9563226575301988, + "language_loss": 0.61902344, + "learning_rate": 2.501098303852298e-06, + "loss": 0.63957393, + "num_input_tokens_seen": 155743970, + "step": 7265, + "time_per_iteration": 2.8699679374694824 + }, + { + "auxiliary_loss_clip": 0.01049562, + "auxiliary_loss_mlp": 0.01028883, + "balance_loss_clip": 1.02609515, + "balance_loss_mlp": 1.01732564, + "epoch": 0.4368555538854652, + "flos": 15193230727680.0, + "grad_norm": 2.568427062450808, + "language_loss": 0.72629386, + "learning_rate": 2.5007212549888884e-06, + "loss": 0.7470783, + "num_input_tokens_seen": 155761830, + "step": 7266, + "time_per_iteration": 2.7495760917663574 + }, + { + "auxiliary_loss_clip": 0.01055694, + "auxiliary_loss_mlp": 0.01031723, + "balance_loss_clip": 1.03066313, + "balance_loss_mlp": 1.01881838, + "epoch": 0.4369156771381332, + "flos": 23069086421760.0, + "grad_norm": 2.039896349993557, + "language_loss": 0.82211924, + "learning_rate": 2.5003441871380794e-06, + "loss": 0.84299338, + "num_input_tokens_seen": 155779610, + "step": 7267, + "time_per_iteration": 2.6296873092651367 + }, + { + "auxiliary_loss_clip": 0.01069475, + "auxiliary_loss_mlp": 0.01028171, + "balance_loss_clip": 1.02600789, + "balance_loss_mlp": 1.01676822, + "epoch": 0.43697580039080114, + "flos": 23441085624960.0, + "grad_norm": 2.2015057649348844, + "language_loss": 0.74488521, + "learning_rate": 2.4999671003141674e-06, + "loss": 0.76586169, + "num_input_tokens_seen": 155798765, + "step": 7268, + "time_per_iteration": 2.576510429382324 + }, + { + "auxiliary_loss_clip": 0.0107668, + "auxiliary_loss_mlp": 0.01033655, + "balance_loss_clip": 1.02843213, + "balance_loss_mlp": 1.02023196, + "epoch": 0.4370359236434691, + "flos": 18514680428160.0, + "grad_norm": 3.355187056109419, + "language_loss": 0.79431152, + "learning_rate": 2.499589994531454e-06, + "loss": 0.81541479, + "num_input_tokens_seen": 155817750, + "step": 7269, + "time_per_iteration": 4.2960474491119385 + }, + { + "auxiliary_loss_clip": 0.01054781, + "auxiliary_loss_mlp": 0.0103133, + "balance_loss_clip": 1.02879453, + "balance_loss_mlp": 1.0191586, + "epoch": 0.43709604689613707, + "flos": 23222497409280.0, + "grad_norm": 1.6700587742169197, + "language_loss": 0.75135851, + "learning_rate": 2.499212869804237e-06, + "loss": 0.77221966, + "num_input_tokens_seen": 155836490, + "step": 7270, + "time_per_iteration": 2.695298194885254 + }, + { + "auxiliary_loss_clip": 0.01011171, + "auxiliary_loss_mlp": 0.01040636, + "balance_loss_clip": 1.02361512, + "balance_loss_mlp": 1.02689767, + "epoch": 0.43715617014880503, + "flos": 23803711378560.0, + "grad_norm": 2.1584174156511686, + "language_loss": 0.79404104, + "learning_rate": 2.4988357261468182e-06, + "loss": 0.8145591, + "num_input_tokens_seen": 155856225, + "step": 7271, + "time_per_iteration": 2.843376636505127 + }, + { + "auxiliary_loss_clip": 0.01006042, + "auxiliary_loss_mlp": 0.0100245, + "balance_loss_clip": 1.00719094, + "balance_loss_mlp": 1.00096023, + "epoch": 0.437216293401473, + "flos": 61941204766080.0, + "grad_norm": 0.6957778942000568, + "language_loss": 0.54943538, + "learning_rate": 2.4984585635734993e-06, + "loss": 0.56952029, + "num_input_tokens_seen": 155916770, + "step": 7272, + "time_per_iteration": 3.2403550148010254 + }, + { + "auxiliary_loss_clip": 0.01075488, + "auxiliary_loss_mlp": 0.01036533, + "balance_loss_clip": 1.02874267, + "balance_loss_mlp": 1.02326536, + "epoch": 0.43727641665414096, + "flos": 21982250655360.0, + "grad_norm": 1.9071999574470968, + "language_loss": 0.69746476, + "learning_rate": 2.498081382098581e-06, + "loss": 0.71858501, + "num_input_tokens_seen": 155936490, + "step": 7273, + "time_per_iteration": 2.626997947692871 + }, + { + "auxiliary_loss_clip": 0.01048606, + "auxiliary_loss_mlp": 0.01040853, + "balance_loss_clip": 1.02607155, + "balance_loss_mlp": 1.0267024, + "epoch": 0.437336539906809, + "flos": 39530860842240.0, + "grad_norm": 2.1772282055326135, + "language_loss": 0.75074697, + "learning_rate": 2.497704181736367e-06, + "loss": 0.77164149, + "num_input_tokens_seen": 155957595, + "step": 7274, + "time_per_iteration": 2.7716352939605713 + }, + { + "auxiliary_loss_clip": 0.01060202, + "auxiliary_loss_mlp": 0.01026636, + "balance_loss_clip": 1.02662647, + "balance_loss_mlp": 1.01606846, + "epoch": 0.43739666315947695, + "flos": 17457147181440.0, + "grad_norm": 1.622796739983895, + "language_loss": 0.8044129, + "learning_rate": 2.49732696250116e-06, + "loss": 0.82528126, + "num_input_tokens_seen": 155975710, + "step": 7275, + "time_per_iteration": 4.165989637374878 + }, + { + "auxiliary_loss_clip": 0.01053038, + "auxiliary_loss_mlp": 0.01031474, + "balance_loss_clip": 1.02809978, + "balance_loss_mlp": 1.01990461, + "epoch": 0.4374567864121449, + "flos": 16358747235840.0, + "grad_norm": 2.080260278932457, + "language_loss": 0.80387765, + "learning_rate": 2.496949724407266e-06, + "loss": 0.82472277, + "num_input_tokens_seen": 155993090, + "step": 7276, + "time_per_iteration": 2.5907368659973145 + }, + { + "auxiliary_loss_clip": 0.01053919, + "auxiliary_loss_mlp": 0.01030296, + "balance_loss_clip": 1.02909303, + "balance_loss_mlp": 1.01727843, + "epoch": 0.4375169096648129, + "flos": 30587523834240.0, + "grad_norm": 2.146728400913487, + "language_loss": 0.72827721, + "learning_rate": 2.496572467468988e-06, + "loss": 0.74911928, + "num_input_tokens_seen": 156013685, + "step": 7277, + "time_per_iteration": 2.6759071350097656 + }, + { + "auxiliary_loss_clip": 0.01050803, + "auxiliary_loss_mlp": 0.00747474, + "balance_loss_clip": 1.02659452, + "balance_loss_mlp": 1.00045967, + "epoch": 0.43757703291748085, + "flos": 30555599621760.0, + "grad_norm": 2.0573685621979463, + "language_loss": 0.72856915, + "learning_rate": 2.4961951917006317e-06, + "loss": 0.74655193, + "num_input_tokens_seen": 156034300, + "step": 7278, + "time_per_iteration": 2.7173256874084473 + }, + { + "auxiliary_loss_clip": 0.01034197, + "auxiliary_loss_mlp": 0.01032384, + "balance_loss_clip": 1.02558231, + "balance_loss_mlp": 1.02091026, + "epoch": 0.4376371561701488, + "flos": 21397373498880.0, + "grad_norm": 1.546603566995408, + "language_loss": 0.65933919, + "learning_rate": 2.4958178971165046e-06, + "loss": 0.68000501, + "num_input_tokens_seen": 156053805, + "step": 7279, + "time_per_iteration": 2.657717227935791 + }, + { + "auxiliary_loss_clip": 0.01077942, + "auxiliary_loss_mlp": 0.01033322, + "balance_loss_clip": 1.02955902, + "balance_loss_mlp": 1.01991689, + "epoch": 0.4376972794228168, + "flos": 23404384903680.0, + "grad_norm": 1.6968593004945614, + "language_loss": 0.81779003, + "learning_rate": 2.4954405837309126e-06, + "loss": 0.83890265, + "num_input_tokens_seen": 156073295, + "step": 7280, + "time_per_iteration": 2.570404529571533 + }, + { + "auxiliary_loss_clip": 0.01048963, + "auxiliary_loss_mlp": 0.0103096, + "balance_loss_clip": 1.0255425, + "balance_loss_mlp": 1.01920044, + "epoch": 0.43775740267548474, + "flos": 22892945103360.0, + "grad_norm": 1.475456312648758, + "language_loss": 0.76958764, + "learning_rate": 2.4950632515581653e-06, + "loss": 0.79038692, + "num_input_tokens_seen": 156094540, + "step": 7281, + "time_per_iteration": 2.6077334880828857 + }, + { + "auxiliary_loss_clip": 0.01046116, + "auxiliary_loss_mlp": 0.01038913, + "balance_loss_clip": 1.02480698, + "balance_loss_mlp": 1.0269624, + "epoch": 0.4378175259281527, + "flos": 23294390480640.0, + "grad_norm": 1.9041919118255073, + "language_loss": 0.76051062, + "learning_rate": 2.494685900612569e-06, + "loss": 0.78136086, + "num_input_tokens_seen": 156114070, + "step": 7282, + "time_per_iteration": 2.603834867477417 + }, + { + "auxiliary_loss_clip": 0.01032023, + "auxiliary_loss_mlp": 0.01032606, + "balance_loss_clip": 1.0245676, + "balance_loss_mlp": 1.0201664, + "epoch": 0.43787764918082067, + "flos": 23876897339520.0, + "grad_norm": 1.7085924737340519, + "language_loss": 0.85031182, + "learning_rate": 2.4943085309084333e-06, + "loss": 0.87095815, + "num_input_tokens_seen": 156132130, + "step": 7283, + "time_per_iteration": 2.767652988433838 + }, + { + "auxiliary_loss_clip": 0.01049998, + "auxiliary_loss_mlp": 0.01033861, + "balance_loss_clip": 1.02673602, + "balance_loss_mlp": 1.02145743, + "epoch": 0.43793777243348864, + "flos": 23988148738560.0, + "grad_norm": 2.593961342937183, + "language_loss": 0.80429888, + "learning_rate": 2.49393114246007e-06, + "loss": 0.82513744, + "num_input_tokens_seen": 156150820, + "step": 7284, + "time_per_iteration": 2.6461219787597656 + }, + { + "auxiliary_loss_clip": 0.01062891, + "auxiliary_loss_mlp": 0.01034326, + "balance_loss_clip": 1.02781141, + "balance_loss_mlp": 1.02301931, + "epoch": 0.4379978956861566, + "flos": 18624064320000.0, + "grad_norm": 1.7784460696922981, + "language_loss": 0.8042289, + "learning_rate": 2.493553735281787e-06, + "loss": 0.82520109, + "num_input_tokens_seen": 156170125, + "step": 7285, + "time_per_iteration": 2.7291970252990723 + }, + { + "auxiliary_loss_clip": 0.01062176, + "auxiliary_loss_mlp": 0.01028429, + "balance_loss_clip": 1.02717257, + "balance_loss_mlp": 1.01679432, + "epoch": 0.43805801893882457, + "flos": 21981388728960.0, + "grad_norm": 1.9792523476538348, + "language_loss": 0.75078452, + "learning_rate": 2.493176309387897e-06, + "loss": 0.77169061, + "num_input_tokens_seen": 156187320, + "step": 7286, + "time_per_iteration": 2.6742312908172607 + }, + { + "auxiliary_loss_clip": 0.01036574, + "auxiliary_loss_mlp": 0.01027964, + "balance_loss_clip": 1.02358913, + "balance_loss_mlp": 1.01575696, + "epoch": 0.43811814219149253, + "flos": 26393337383040.0, + "grad_norm": 1.6718498946615183, + "language_loss": 0.73538041, + "learning_rate": 2.492798864792712e-06, + "loss": 0.75602579, + "num_input_tokens_seen": 156207455, + "step": 7287, + "time_per_iteration": 2.7868707180023193 + }, + { + "auxiliary_loss_clip": 0.01054616, + "auxiliary_loss_mlp": 0.01038931, + "balance_loss_clip": 1.02842498, + "balance_loss_mlp": 1.0262357, + "epoch": 0.43817826544416055, + "flos": 17493309198720.0, + "grad_norm": 1.7506365409599884, + "language_loss": 0.82209837, + "learning_rate": 2.492421401510545e-06, + "loss": 0.84303379, + "num_input_tokens_seen": 156226560, + "step": 7288, + "time_per_iteration": 2.708408832550049 + }, + { + "auxiliary_loss_clip": 0.010418, + "auxiliary_loss_mlp": 0.01030797, + "balance_loss_clip": 1.02416515, + "balance_loss_mlp": 1.01814282, + "epoch": 0.4382383886968285, + "flos": 21581020759680.0, + "grad_norm": 1.5044357225810872, + "language_loss": 0.84281915, + "learning_rate": 2.4920439195557093e-06, + "loss": 0.86354512, + "num_input_tokens_seen": 156246740, + "step": 7289, + "time_per_iteration": 2.723182201385498 + }, + { + "auxiliary_loss_clip": 0.01037185, + "auxiliary_loss_mlp": 0.01045176, + "balance_loss_clip": 1.02307534, + "balance_loss_mlp": 1.0309962, + "epoch": 0.4382985119494965, + "flos": 27923742201600.0, + "grad_norm": 1.6019517536348828, + "language_loss": 0.78065944, + "learning_rate": 2.4916664189425183e-06, + "loss": 0.80148298, + "num_input_tokens_seen": 156266440, + "step": 7290, + "time_per_iteration": 2.6634018421173096 + }, + { + "auxiliary_loss_clip": 0.010732, + "auxiliary_loss_mlp": 0.01036911, + "balance_loss_clip": 1.02797508, + "balance_loss_mlp": 1.02512145, + "epoch": 0.43835863520216445, + "flos": 24936836797440.0, + "grad_norm": 2.030317418678796, + "language_loss": 0.78266156, + "learning_rate": 2.491288899685288e-06, + "loss": 0.80376267, + "num_input_tokens_seen": 156286900, + "step": 7291, + "time_per_iteration": 2.744706869125366 + }, + { + "auxiliary_loss_clip": 0.01035886, + "auxiliary_loss_mlp": 0.01028076, + "balance_loss_clip": 1.02416825, + "balance_loss_mlp": 1.01571441, + "epoch": 0.4384187584548324, + "flos": 33510293504640.0, + "grad_norm": 1.6237896266022183, + "language_loss": 0.65074742, + "learning_rate": 2.4909113617983325e-06, + "loss": 0.67138708, + "num_input_tokens_seen": 156307690, + "step": 7292, + "time_per_iteration": 2.8647310733795166 + }, + { + "auxiliary_loss_clip": 0.01055545, + "auxiliary_loss_mlp": 0.01033027, + "balance_loss_clip": 1.02493072, + "balance_loss_mlp": 1.02000368, + "epoch": 0.4384788817075004, + "flos": 23951052967680.0, + "grad_norm": 1.6317315533047583, + "language_loss": 0.74170148, + "learning_rate": 2.49053380529597e-06, + "loss": 0.76258725, + "num_input_tokens_seen": 156326620, + "step": 7293, + "time_per_iteration": 2.768296480178833 + }, + { + "auxiliary_loss_clip": 0.01038206, + "auxiliary_loss_mlp": 0.01037152, + "balance_loss_clip": 1.02662778, + "balance_loss_mlp": 1.02470672, + "epoch": 0.43853900496016834, + "flos": 19098516090240.0, + "grad_norm": 2.0524908787678764, + "language_loss": 0.78765601, + "learning_rate": 2.490156230192516e-06, + "loss": 0.80840957, + "num_input_tokens_seen": 156345495, + "step": 7294, + "time_per_iteration": 2.704141139984131 + }, + { + "auxiliary_loss_clip": 0.01032233, + "auxiliary_loss_mlp": 0.01037576, + "balance_loss_clip": 1.0245595, + "balance_loss_mlp": 1.02498806, + "epoch": 0.4385991282128363, + "flos": 13225362168960.0, + "grad_norm": 1.561387103216548, + "language_loss": 0.72961092, + "learning_rate": 2.4897786365022883e-06, + "loss": 0.75030899, + "num_input_tokens_seen": 156363155, + "step": 7295, + "time_per_iteration": 2.8052539825439453 + }, + { + "auxiliary_loss_clip": 0.01036335, + "auxiliary_loss_mlp": 0.01039902, + "balance_loss_clip": 1.02497005, + "balance_loss_mlp": 1.02515578, + "epoch": 0.4386592514655043, + "flos": 14319883445760.0, + "grad_norm": 2.0652589356022024, + "language_loss": 0.74961579, + "learning_rate": 2.4894010242396063e-06, + "loss": 0.77037817, + "num_input_tokens_seen": 156380940, + "step": 7296, + "time_per_iteration": 2.89923357963562 + }, + { + "auxiliary_loss_clip": 0.01062338, + "auxiliary_loss_mlp": 0.01032166, + "balance_loss_clip": 1.02703547, + "balance_loss_mlp": 1.01990497, + "epoch": 0.43871937471817224, + "flos": 22784423137920.0, + "grad_norm": 1.7503608959284416, + "language_loss": 0.69057947, + "learning_rate": 2.4890233934187873e-06, + "loss": 0.71152449, + "num_input_tokens_seen": 156400415, + "step": 7297, + "time_per_iteration": 2.7472939491271973 + }, + { + "auxiliary_loss_clip": 0.01050878, + "auxiliary_loss_mlp": 0.01029903, + "balance_loss_clip": 1.02453375, + "balance_loss_mlp": 1.01772559, + "epoch": 0.4387794979708402, + "flos": 28072304853120.0, + "grad_norm": 1.4412810936141764, + "language_loss": 0.69528657, + "learning_rate": 2.4886457440541535e-06, + "loss": 0.71609437, + "num_input_tokens_seen": 156421120, + "step": 7298, + "time_per_iteration": 2.686915636062622 + }, + { + "auxiliary_loss_clip": 0.01062394, + "auxiliary_loss_mlp": 0.01025401, + "balance_loss_clip": 1.02820802, + "balance_loss_mlp": 1.01340246, + "epoch": 0.43883962122350817, + "flos": 26249551240320.0, + "grad_norm": 4.629199233673995, + "language_loss": 0.72638571, + "learning_rate": 2.4882680761600238e-06, + "loss": 0.74726361, + "num_input_tokens_seen": 156441535, + "step": 7299, + "time_per_iteration": 2.6127970218658447 + }, + { + "auxiliary_loss_clip": 0.01038344, + "auxiliary_loss_mlp": 0.00747729, + "balance_loss_clip": 1.02412438, + "balance_loss_mlp": 1.00038505, + "epoch": 0.43889974447617613, + "flos": 25883765089920.0, + "grad_norm": 1.6232789282466662, + "language_loss": 0.76584089, + "learning_rate": 2.487890389750719e-06, + "loss": 0.7837016, + "num_input_tokens_seen": 156462015, + "step": 7300, + "time_per_iteration": 2.693422555923462 + }, + { + "auxiliary_loss_clip": 0.01051253, + "auxiliary_loss_mlp": 0.01031829, + "balance_loss_clip": 1.02613568, + "balance_loss_mlp": 1.01930571, + "epoch": 0.43895986772884416, + "flos": 25046615738880.0, + "grad_norm": 2.0685113041910426, + "language_loss": 0.70923328, + "learning_rate": 2.4875126848405626e-06, + "loss": 0.73006415, + "num_input_tokens_seen": 156482165, + "step": 7301, + "time_per_iteration": 2.672818183898926 + }, + { + "auxiliary_loss_clip": 0.01030843, + "auxiliary_loss_mlp": 0.01040885, + "balance_loss_clip": 1.02854562, + "balance_loss_mlp": 1.02667522, + "epoch": 0.4390199909815121, + "flos": 25994585525760.0, + "grad_norm": 1.8941791108715766, + "language_loss": 0.70832515, + "learning_rate": 2.4871349614438757e-06, + "loss": 0.72904247, + "num_input_tokens_seen": 156503170, + "step": 7302, + "time_per_iteration": 2.711277723312378 + }, + { + "auxiliary_loss_clip": 0.01053038, + "auxiliary_loss_mlp": 0.0103465, + "balance_loss_clip": 1.0281682, + "balance_loss_mlp": 1.02318215, + "epoch": 0.4390801142341801, + "flos": 29022249888000.0, + "grad_norm": 1.6865170373842273, + "language_loss": 0.81942123, + "learning_rate": 2.486757219574983e-06, + "loss": 0.84029806, + "num_input_tokens_seen": 156523005, + "step": 7303, + "time_per_iteration": 2.6826517581939697 + }, + { + "auxiliary_loss_clip": 0.01056807, + "auxiliary_loss_mlp": 0.01043307, + "balance_loss_clip": 1.02569389, + "balance_loss_mlp": 1.02816141, + "epoch": 0.43914023748684805, + "flos": 33438544087680.0, + "grad_norm": 7.794293477154555, + "language_loss": 0.68162948, + "learning_rate": 2.4863794592482067e-06, + "loss": 0.70263064, + "num_input_tokens_seen": 156544440, + "step": 7304, + "time_per_iteration": 2.6503303050994873 + }, + { + "auxiliary_loss_clip": 0.01054076, + "auxiliary_loss_mlp": 0.00747533, + "balance_loss_clip": 1.02948737, + "balance_loss_mlp": 1.00038385, + "epoch": 0.439200360739516, + "flos": 34531844302080.0, + "grad_norm": 1.6164995870963381, + "language_loss": 0.78153086, + "learning_rate": 2.486001680477873e-06, + "loss": 0.79954696, + "num_input_tokens_seen": 156565410, + "step": 7305, + "time_per_iteration": 2.8547074794769287 + }, + { + "auxiliary_loss_clip": 0.01051056, + "auxiliary_loss_mlp": 0.01031364, + "balance_loss_clip": 1.02616644, + "balance_loss_mlp": 1.01884687, + "epoch": 0.439260483992184, + "flos": 21907843632000.0, + "grad_norm": 1.7339159862729743, + "language_loss": 0.68794405, + "learning_rate": 2.485623883278308e-06, + "loss": 0.70876819, + "num_input_tokens_seen": 156584210, + "step": 7306, + "time_per_iteration": 2.655503988265991 + }, + { + "auxiliary_loss_clip": 0.01031446, + "auxiliary_loss_mlp": 0.0103024, + "balance_loss_clip": 1.02384579, + "balance_loss_mlp": 1.01710308, + "epoch": 0.43932060724485195, + "flos": 20996430912000.0, + "grad_norm": 1.480118935313809, + "language_loss": 0.62766778, + "learning_rate": 2.4852460676638344e-06, + "loss": 0.64828467, + "num_input_tokens_seen": 156602730, + "step": 7307, + "time_per_iteration": 2.71437668800354 + }, + { + "auxiliary_loss_clip": 0.0107488, + "auxiliary_loss_mlp": 0.0103263, + "balance_loss_clip": 1.0275197, + "balance_loss_mlp": 1.01971352, + "epoch": 0.4393807304975199, + "flos": 17747053850880.0, + "grad_norm": 2.346605462578042, + "language_loss": 0.71828091, + "learning_rate": 2.4848682336487828e-06, + "loss": 0.73935604, + "num_input_tokens_seen": 156619405, + "step": 7308, + "time_per_iteration": 4.190759897232056 + }, + { + "auxiliary_loss_clip": 0.01054514, + "auxiliary_loss_mlp": 0.01032309, + "balance_loss_clip": 1.0259304, + "balance_loss_mlp": 1.01977384, + "epoch": 0.4394408537501879, + "flos": 22528523669760.0, + "grad_norm": 1.8052533049451145, + "language_loss": 0.76861167, + "learning_rate": 2.4844903812474787e-06, + "loss": 0.78947991, + "num_input_tokens_seen": 156638165, + "step": 7309, + "time_per_iteration": 4.3667261600494385 + }, + { + "auxiliary_loss_clip": 0.01058645, + "auxiliary_loss_mlp": 0.01027681, + "balance_loss_clip": 1.02617228, + "balance_loss_mlp": 1.01675558, + "epoch": 0.43950097700285584, + "flos": 23440654661760.0, + "grad_norm": 1.600346115252127, + "language_loss": 0.70849895, + "learning_rate": 2.484112510474251e-06, + "loss": 0.72936225, + "num_input_tokens_seen": 156658845, + "step": 7310, + "time_per_iteration": 2.714742422103882 + }, + { + "auxiliary_loss_clip": 0.01044579, + "auxiliary_loss_mlp": 0.00747526, + "balance_loss_clip": 1.02555156, + "balance_loss_mlp": 1.00044441, + "epoch": 0.4395611002555238, + "flos": 23180696956800.0, + "grad_norm": 2.051230156120594, + "language_loss": 0.76162302, + "learning_rate": 2.483734621343429e-06, + "loss": 0.77954412, + "num_input_tokens_seen": 156677275, + "step": 7311, + "time_per_iteration": 2.6591742038726807 + }, + { + "auxiliary_loss_clip": 0.01062187, + "auxiliary_loss_mlp": 0.0103053, + "balance_loss_clip": 1.02611315, + "balance_loss_mlp": 1.0188477, + "epoch": 0.43962122350819177, + "flos": 22127365601280.0, + "grad_norm": 4.78987789985246, + "language_loss": 0.81153989, + "learning_rate": 2.483356713869341e-06, + "loss": 0.83246708, + "num_input_tokens_seen": 156695815, + "step": 7312, + "time_per_iteration": 2.6848886013031006 + }, + { + "auxiliary_loss_clip": 0.01042601, + "auxiliary_loss_mlp": 0.01034147, + "balance_loss_clip": 1.02605391, + "balance_loss_mlp": 1.02168393, + "epoch": 0.43968134676085974, + "flos": 17420554200960.0, + "grad_norm": 2.057361506878266, + "language_loss": 0.84993184, + "learning_rate": 2.482978788066318e-06, + "loss": 0.87069929, + "num_input_tokens_seen": 156714385, + "step": 7313, + "time_per_iteration": 2.697185516357422 + }, + { + "auxiliary_loss_clip": 0.01050028, + "auxiliary_loss_mlp": 0.01029663, + "balance_loss_clip": 1.02468002, + "balance_loss_mlp": 1.01705098, + "epoch": 0.43974147001352776, + "flos": 18952646958720.0, + "grad_norm": 2.0648213174397236, + "language_loss": 0.67596734, + "learning_rate": 2.4826008439486904e-06, + "loss": 0.69676423, + "num_input_tokens_seen": 156732615, + "step": 7314, + "time_per_iteration": 2.5944766998291016 + }, + { + "auxiliary_loss_clip": 0.01055503, + "auxiliary_loss_mlp": 0.01030753, + "balance_loss_clip": 1.0275805, + "balance_loss_mlp": 1.01804543, + "epoch": 0.4398015932661957, + "flos": 18953508885120.0, + "grad_norm": 2.2589989028658075, + "language_loss": 0.76840019, + "learning_rate": 2.4822228815307915e-06, + "loss": 0.78926277, + "num_input_tokens_seen": 156750920, + "step": 7315, + "time_per_iteration": 2.5836007595062256 + }, + { + "auxiliary_loss_clip": 0.01052277, + "auxiliary_loss_mlp": 0.01030439, + "balance_loss_clip": 1.02662289, + "balance_loss_mlp": 1.01784444, + "epoch": 0.4398617165188637, + "flos": 24199913370240.0, + "grad_norm": 2.4037941479149407, + "language_loss": 0.74390125, + "learning_rate": 2.4818449008269523e-06, + "loss": 0.76472843, + "num_input_tokens_seen": 156768520, + "step": 7316, + "time_per_iteration": 2.6242892742156982 + }, + { + "auxiliary_loss_clip": 0.01045907, + "auxiliary_loss_mlp": 0.01035976, + "balance_loss_clip": 1.02992833, + "balance_loss_mlp": 1.02398372, + "epoch": 0.43992183977153165, + "flos": 22236677665920.0, + "grad_norm": 2.3450894742372475, + "language_loss": 0.64672768, + "learning_rate": 2.481466901851506e-06, + "loss": 0.66754651, + "num_input_tokens_seen": 156788700, + "step": 7317, + "time_per_iteration": 4.195641994476318 + }, + { + "auxiliary_loss_clip": 0.01051346, + "auxiliary_loss_mlp": 0.01033061, + "balance_loss_clip": 1.02673745, + "balance_loss_mlp": 1.02059793, + "epoch": 0.4399819630241996, + "flos": 18697465762560.0, + "grad_norm": 1.9213966993471818, + "language_loss": 0.79912448, + "learning_rate": 2.4810888846187865e-06, + "loss": 0.81996852, + "num_input_tokens_seen": 156806470, + "step": 7318, + "time_per_iteration": 2.6290671825408936 + }, + { + "auxiliary_loss_clip": 0.01024735, + "auxiliary_loss_mlp": 0.0103696, + "balance_loss_clip": 1.02238488, + "balance_loss_mlp": 1.02328718, + "epoch": 0.4400420862768676, + "flos": 23879375377920.0, + "grad_norm": 2.0101164177011173, + "language_loss": 0.79844409, + "learning_rate": 2.4807108491431283e-06, + "loss": 0.81906104, + "num_input_tokens_seen": 156825895, + "step": 7319, + "time_per_iteration": 2.799356460571289 + }, + { + "auxiliary_loss_clip": 0.01056493, + "auxiliary_loss_mlp": 0.01039902, + "balance_loss_clip": 1.02580929, + "balance_loss_mlp": 1.02624702, + "epoch": 0.44010220952953555, + "flos": 28037615293440.0, + "grad_norm": 1.6277002854398346, + "language_loss": 0.78990567, + "learning_rate": 2.4803327954388667e-06, + "loss": 0.81086963, + "num_input_tokens_seen": 156845990, + "step": 7320, + "time_per_iteration": 2.6776511669158936 + }, + { + "auxiliary_loss_clip": 0.01040186, + "auxiliary_loss_mlp": 0.0103792, + "balance_loss_clip": 1.02692938, + "balance_loss_mlp": 1.02619553, + "epoch": 0.4401623327822035, + "flos": 23768985905280.0, + "grad_norm": 1.6876849417312998, + "language_loss": 0.69622374, + "learning_rate": 2.4799547235203376e-06, + "loss": 0.71700478, + "num_input_tokens_seen": 156866685, + "step": 7321, + "time_per_iteration": 4.335397481918335 + }, + { + "auxiliary_loss_clip": 0.00975378, + "auxiliary_loss_mlp": 0.01007652, + "balance_loss_clip": 1.00733209, + "balance_loss_mlp": 1.00592315, + "epoch": 0.4402224560348715, + "flos": 70774583264640.0, + "grad_norm": 0.8879717562640609, + "language_loss": 0.5693233, + "learning_rate": 2.4795766334018763e-06, + "loss": 0.58915359, + "num_input_tokens_seen": 156923450, + "step": 7322, + "time_per_iteration": 3.316718339920044 + }, + { + "auxiliary_loss_clip": 0.01017778, + "auxiliary_loss_mlp": 0.0103988, + "balance_loss_clip": 1.02211905, + "balance_loss_mlp": 1.02782202, + "epoch": 0.44028257928753944, + "flos": 22891795868160.0, + "grad_norm": 1.527919369622499, + "language_loss": 0.76228547, + "learning_rate": 2.479198525097822e-06, + "loss": 0.78286207, + "num_input_tokens_seen": 156944795, + "step": 7323, + "time_per_iteration": 2.878824472427368 + }, + { + "auxiliary_loss_clip": 0.01066731, + "auxiliary_loss_mlp": 0.01035638, + "balance_loss_clip": 1.02875006, + "balance_loss_mlp": 1.02247715, + "epoch": 0.4403427025402074, + "flos": 17895760156800.0, + "grad_norm": 2.554773916436197, + "language_loss": 0.80590785, + "learning_rate": 2.478820398622511e-06, + "loss": 0.82693148, + "num_input_tokens_seen": 156962755, + "step": 7324, + "time_per_iteration": 2.642392873764038 + }, + { + "auxiliary_loss_clip": 0.00983307, + "auxiliary_loss_mlp": 0.01003237, + "balance_loss_clip": 1.00373936, + "balance_loss_mlp": 1.00174737, + "epoch": 0.4404028257928754, + "flos": 69562525708800.0, + "grad_norm": 0.6692604387250617, + "language_loss": 0.54581815, + "learning_rate": 2.478442253990283e-06, + "loss": 0.5656836, + "num_input_tokens_seen": 157028095, + "step": 7325, + "time_per_iteration": 3.2435708045959473 + }, + { + "auxiliary_loss_clip": 0.01071658, + "auxiliary_loss_mlp": 0.01023655, + "balance_loss_clip": 1.02823341, + "balance_loss_mlp": 1.0125978, + "epoch": 0.44046294904554334, + "flos": 20923675914240.0, + "grad_norm": 1.3943280888837735, + "language_loss": 0.69659352, + "learning_rate": 2.4780640912154766e-06, + "loss": 0.71754664, + "num_input_tokens_seen": 157048365, + "step": 7326, + "time_per_iteration": 2.579068422317505 + }, + { + "auxiliary_loss_clip": 0.01033457, + "auxiliary_loss_mlp": 0.01026561, + "balance_loss_clip": 1.02440274, + "balance_loss_mlp": 1.01480746, + "epoch": 0.44052307229821136, + "flos": 23623475909760.0, + "grad_norm": 1.6264934584265442, + "language_loss": 0.76597822, + "learning_rate": 2.477685910312432e-06, + "loss": 0.78657848, + "num_input_tokens_seen": 157069130, + "step": 7327, + "time_per_iteration": 2.7010862827301025 + }, + { + "auxiliary_loss_clip": 0.01045975, + "auxiliary_loss_mlp": 0.01031741, + "balance_loss_clip": 1.02410245, + "balance_loss_mlp": 1.01920629, + "epoch": 0.4405831955508793, + "flos": 17597665186560.0, + "grad_norm": 2.166320212213843, + "language_loss": 0.83830631, + "learning_rate": 2.4773077112954897e-06, + "loss": 0.85908341, + "num_input_tokens_seen": 157084940, + "step": 7328, + "time_per_iteration": 2.7152183055877686 + }, + { + "auxiliary_loss_clip": 0.01051825, + "auxiliary_loss_mlp": 0.01028182, + "balance_loss_clip": 1.02780223, + "balance_loss_mlp": 1.01652956, + "epoch": 0.4406433188035473, + "flos": 21463376739840.0, + "grad_norm": 2.429041077881789, + "language_loss": 0.77972054, + "learning_rate": 2.4769294941789908e-06, + "loss": 0.8005206, + "num_input_tokens_seen": 157102770, + "step": 7329, + "time_per_iteration": 2.683837413787842 + }, + { + "auxiliary_loss_clip": 0.01056964, + "auxiliary_loss_mlp": 0.0103312, + "balance_loss_clip": 1.02577853, + "balance_loss_mlp": 1.02045989, + "epoch": 0.44070344205621526, + "flos": 22673566788480.0, + "grad_norm": 1.7271727071077463, + "language_loss": 0.73314393, + "learning_rate": 2.476551258977278e-06, + "loss": 0.75404477, + "num_input_tokens_seen": 157122035, + "step": 7330, + "time_per_iteration": 2.6385130882263184 + }, + { + "auxiliary_loss_clip": 0.01054354, + "auxiliary_loss_mlp": 0.01035651, + "balance_loss_clip": 1.02851701, + "balance_loss_mlp": 1.02371216, + "epoch": 0.4407635653088832, + "flos": 23441193365760.0, + "grad_norm": 1.9424013423891813, + "language_loss": 0.74406576, + "learning_rate": 2.4761730057046936e-06, + "loss": 0.76496583, + "num_input_tokens_seen": 157142800, + "step": 7331, + "time_per_iteration": 2.6935439109802246 + }, + { + "auxiliary_loss_clip": 0.01022645, + "auxiliary_loss_mlp": 0.01032298, + "balance_loss_clip": 1.02411199, + "balance_loss_mlp": 1.02017999, + "epoch": 0.4408236885615512, + "flos": 24021294013440.0, + "grad_norm": 1.7413786157325424, + "language_loss": 0.76229835, + "learning_rate": 2.475794734375581e-06, + "loss": 0.78284776, + "num_input_tokens_seen": 157163295, + "step": 7332, + "time_per_iteration": 2.85632061958313 + }, + { + "auxiliary_loss_clip": 0.01052083, + "auxiliary_loss_mlp": 0.01036209, + "balance_loss_clip": 1.02732289, + "balance_loss_mlp": 1.02478921, + "epoch": 0.44088381181421915, + "flos": 12676826597760.0, + "grad_norm": 1.747727449357699, + "language_loss": 0.73378468, + "learning_rate": 2.475416445004285e-06, + "loss": 0.75466764, + "num_input_tokens_seen": 157180890, + "step": 7333, + "time_per_iteration": 2.567409038543701 + }, + { + "auxiliary_loss_clip": 0.01042918, + "auxiliary_loss_mlp": 0.01031063, + "balance_loss_clip": 1.0285871, + "balance_loss_mlp": 1.01908851, + "epoch": 0.4409439350668871, + "flos": 24569865498240.0, + "grad_norm": 1.6144547293148415, + "language_loss": 0.79837167, + "learning_rate": 2.4750381376051493e-06, + "loss": 0.81911147, + "num_input_tokens_seen": 157200580, + "step": 7334, + "time_per_iteration": 2.756932258605957 + }, + { + "auxiliary_loss_clip": 0.01048172, + "auxiliary_loss_mlp": 0.0103437, + "balance_loss_clip": 1.02677047, + "balance_loss_mlp": 1.01907587, + "epoch": 0.4410040583195551, + "flos": 22668574798080.0, + "grad_norm": 2.0763797524847303, + "language_loss": 0.75540137, + "learning_rate": 2.47465981219252e-06, + "loss": 0.77622676, + "num_input_tokens_seen": 157218345, + "step": 7335, + "time_per_iteration": 2.642106771469116 + }, + { + "auxiliary_loss_clip": 0.01048878, + "auxiliary_loss_mlp": 0.01034334, + "balance_loss_clip": 1.02795959, + "balance_loss_mlp": 1.0217272, + "epoch": 0.44106418157222305, + "flos": 10852528700160.0, + "grad_norm": 2.2494731111614814, + "language_loss": 0.72701126, + "learning_rate": 2.4742814687807423e-06, + "loss": 0.74784338, + "num_input_tokens_seen": 157234395, + "step": 7336, + "time_per_iteration": 2.5760550498962402 + }, + { + "auxiliary_loss_clip": 0.01066742, + "auxiliary_loss_mlp": 0.01041445, + "balance_loss_clip": 1.027385, + "balance_loss_mlp": 1.02784348, + "epoch": 0.441124304824891, + "flos": 21726710323200.0, + "grad_norm": 2.4095110846476784, + "language_loss": 0.63497388, + "learning_rate": 2.473903107384165e-06, + "loss": 0.65605575, + "num_input_tokens_seen": 157254805, + "step": 7337, + "time_per_iteration": 2.6239709854125977 + }, + { + "auxiliary_loss_clip": 0.00994183, + "auxiliary_loss_mlp": 0.00746116, + "balance_loss_clip": 1.00518727, + "balance_loss_mlp": 0.99994701, + "epoch": 0.441184428077559, + "flos": 63220486625280.0, + "grad_norm": 0.7478844031254931, + "language_loss": 0.5263896, + "learning_rate": 2.473524728017134e-06, + "loss": 0.54379261, + "num_input_tokens_seen": 157317870, + "step": 7338, + "time_per_iteration": 3.2447144985198975 + }, + { + "auxiliary_loss_clip": 0.01052026, + "auxiliary_loss_mlp": 0.01041354, + "balance_loss_clip": 1.02498829, + "balance_loss_mlp": 1.0272038, + "epoch": 0.44124455133022694, + "flos": 21177959270400.0, + "grad_norm": 2.090181164143848, + "language_loss": 0.70463252, + "learning_rate": 2.473146330693997e-06, + "loss": 0.72556633, + "num_input_tokens_seen": 157336505, + "step": 7339, + "time_per_iteration": 2.6459879875183105 + }, + { + "auxiliary_loss_clip": 0.01006899, + "auxiliary_loss_mlp": 0.01044628, + "balance_loss_clip": 1.02303386, + "balance_loss_mlp": 1.03238559, + "epoch": 0.4413046745828949, + "flos": 17457865453440.0, + "grad_norm": 1.451933035557406, + "language_loss": 0.69466919, + "learning_rate": 2.472767915429105e-06, + "loss": 0.71518445, + "num_input_tokens_seen": 157354995, + "step": 7340, + "time_per_iteration": 2.7774176597595215 + }, + { + "auxiliary_loss_clip": 0.0099393, + "auxiliary_loss_mlp": 0.01008436, + "balance_loss_clip": 1.00496387, + "balance_loss_mlp": 1.00687468, + "epoch": 0.4413647978355629, + "flos": 61586153804160.0, + "grad_norm": 0.8915176585074417, + "language_loss": 0.6404714, + "learning_rate": 2.4723894822368054e-06, + "loss": 0.6604951, + "num_input_tokens_seen": 157404260, + "step": 7341, + "time_per_iteration": 3.0007994174957275 + }, + { + "auxiliary_loss_clip": 0.01040544, + "auxiliary_loss_mlp": 0.01034839, + "balance_loss_clip": 1.02548122, + "balance_loss_mlp": 1.02214956, + "epoch": 0.4414249210882309, + "flos": 27527001505920.0, + "grad_norm": 2.421575469019761, + "language_loss": 0.73086327, + "learning_rate": 2.47201103113145e-06, + "loss": 0.75161713, + "num_input_tokens_seen": 157423045, + "step": 7342, + "time_per_iteration": 2.717416524887085 + }, + { + "auxiliary_loss_clip": 0.01072518, + "auxiliary_loss_mlp": 0.01038731, + "balance_loss_clip": 1.02592778, + "balance_loss_mlp": 1.02494478, + "epoch": 0.44148504434089886, + "flos": 23513984277120.0, + "grad_norm": 1.843767730257491, + "language_loss": 0.7972393, + "learning_rate": 2.4716325621273886e-06, + "loss": 0.81835175, + "num_input_tokens_seen": 157441815, + "step": 7343, + "time_per_iteration": 2.6099133491516113 + }, + { + "auxiliary_loss_clip": 0.01041629, + "auxiliary_loss_mlp": 0.01032387, + "balance_loss_clip": 1.02535129, + "balance_loss_mlp": 1.01968527, + "epoch": 0.4415451675935668, + "flos": 21580589796480.0, + "grad_norm": 2.0557272980361843, + "language_loss": 0.76138002, + "learning_rate": 2.4712540752389725e-06, + "loss": 0.78212023, + "num_input_tokens_seen": 157460470, + "step": 7344, + "time_per_iteration": 2.710235595703125 + }, + { + "auxiliary_loss_clip": 0.00990797, + "auxiliary_loss_mlp": 0.01003237, + "balance_loss_clip": 1.00263619, + "balance_loss_mlp": 1.00159192, + "epoch": 0.4416052908462348, + "flos": 59006368126080.0, + "grad_norm": 0.7918366149097147, + "language_loss": 0.63790661, + "learning_rate": 2.470875570480556e-06, + "loss": 0.65784693, + "num_input_tokens_seen": 157512655, + "step": 7345, + "time_per_iteration": 2.9383697509765625 + }, + { + "auxiliary_loss_clip": 0.01078549, + "auxiliary_loss_mlp": 0.01037993, + "balance_loss_clip": 1.03007436, + "balance_loss_mlp": 1.02499318, + "epoch": 0.44166541409890275, + "flos": 26357642242560.0, + "grad_norm": 1.6070677439295777, + "language_loss": 0.85943496, + "learning_rate": 2.470497047866489e-06, + "loss": 0.88060039, + "num_input_tokens_seen": 157533700, + "step": 7346, + "time_per_iteration": 2.6807801723480225 + }, + { + "auxiliary_loss_clip": 0.01065975, + "auxiliary_loss_mlp": 0.01040043, + "balance_loss_clip": 1.02834749, + "balance_loss_mlp": 1.02617359, + "epoch": 0.4417255373515707, + "flos": 20192678231040.0, + "grad_norm": 1.6633892546396454, + "language_loss": 0.80329853, + "learning_rate": 2.470118507411128e-06, + "loss": 0.8243587, + "num_input_tokens_seen": 157551105, + "step": 7347, + "time_per_iteration": 2.707098960876465 + }, + { + "auxiliary_loss_clip": 0.01056547, + "auxiliary_loss_mlp": 0.01035499, + "balance_loss_clip": 1.02969873, + "balance_loss_mlp": 1.02206421, + "epoch": 0.4417856606042387, + "flos": 17887895078400.0, + "grad_norm": 1.9751768983136277, + "language_loss": 0.83147311, + "learning_rate": 2.4697399491288263e-06, + "loss": 0.85239357, + "num_input_tokens_seen": 157568285, + "step": 7348, + "time_per_iteration": 2.6232948303222656 + }, + { + "auxiliary_loss_clip": 0.01067972, + "auxiliary_loss_mlp": 0.01034955, + "balance_loss_clip": 1.03063548, + "balance_loss_mlp": 1.02190709, + "epoch": 0.44184578385690665, + "flos": 27964034282880.0, + "grad_norm": 1.6638090162798835, + "language_loss": 0.69876415, + "learning_rate": 2.469361373033938e-06, + "loss": 0.71979344, + "num_input_tokens_seen": 157590405, + "step": 7349, + "time_per_iteration": 2.650296926498413 + }, + { + "auxiliary_loss_clip": 0.01041942, + "auxiliary_loss_mlp": 0.01035928, + "balance_loss_clip": 1.02545834, + "balance_loss_mlp": 1.02220702, + "epoch": 0.4419059071095746, + "flos": 23367899664000.0, + "grad_norm": 1.8735267416898607, + "language_loss": 0.74513042, + "learning_rate": 2.468982779140819e-06, + "loss": 0.76590908, + "num_input_tokens_seen": 157607420, + "step": 7350, + "time_per_iteration": 2.668039083480835 + }, + { + "auxiliary_loss_clip": 0.01073815, + "auxiliary_loss_mlp": 0.0103873, + "balance_loss_clip": 1.02653575, + "balance_loss_mlp": 1.02546191, + "epoch": 0.4419660303622426, + "flos": 15012169246080.0, + "grad_norm": 2.014948765720043, + "language_loss": 0.80338502, + "learning_rate": 2.468604167463827e-06, + "loss": 0.82451046, + "num_input_tokens_seen": 157624990, + "step": 7351, + "time_per_iteration": 2.5881235599517822 + }, + { + "auxiliary_loss_clip": 0.0102, + "auxiliary_loss_mlp": 0.00747383, + "balance_loss_clip": 1.02097988, + "balance_loss_mlp": 1.00025916, + "epoch": 0.44202615361491054, + "flos": 25371750672000.0, + "grad_norm": 1.5938276281069117, + "language_loss": 0.72794026, + "learning_rate": 2.4682255380173176e-06, + "loss": 0.74561411, + "num_input_tokens_seen": 157645300, + "step": 7352, + "time_per_iteration": 2.8028414249420166 + }, + { + "auxiliary_loss_clip": 0.01054258, + "auxiliary_loss_mlp": 0.01031468, + "balance_loss_clip": 1.02824116, + "balance_loss_mlp": 1.01847386, + "epoch": 0.4420862768675785, + "flos": 24681116897280.0, + "grad_norm": 2.119060483779935, + "language_loss": 0.87264776, + "learning_rate": 2.467846890815649e-06, + "loss": 0.89350504, + "num_input_tokens_seen": 157664060, + "step": 7353, + "time_per_iteration": 2.6715612411499023 + }, + { + "auxiliary_loss_clip": 0.01076074, + "auxiliary_loss_mlp": 0.01036463, + "balance_loss_clip": 1.02867484, + "balance_loss_mlp": 1.02457833, + "epoch": 0.44214640012024653, + "flos": 19528437974400.0, + "grad_norm": 1.902243535588449, + "language_loss": 0.75681257, + "learning_rate": 2.4674682258731795e-06, + "loss": 0.77793789, + "num_input_tokens_seen": 157680905, + "step": 7354, + "time_per_iteration": 2.560403347015381 + }, + { + "auxiliary_loss_clip": 0.01042081, + "auxiliary_loss_mlp": 0.01038056, + "balance_loss_clip": 1.02718055, + "balance_loss_mlp": 1.02667141, + "epoch": 0.4422065233729145, + "flos": 47557434003840.0, + "grad_norm": 1.7079850448002538, + "language_loss": 0.64346713, + "learning_rate": 2.467089543204268e-06, + "loss": 0.66426855, + "num_input_tokens_seen": 157701980, + "step": 7355, + "time_per_iteration": 4.512641429901123 + }, + { + "auxiliary_loss_clip": 0.010787, + "auxiliary_loss_mlp": 0.01040113, + "balance_loss_clip": 1.02886844, + "balance_loss_mlp": 1.02602875, + "epoch": 0.44226664662558246, + "flos": 19281050029440.0, + "grad_norm": 1.8266445540774803, + "language_loss": 0.78186578, + "learning_rate": 2.466710842823274e-06, + "loss": 0.80305392, + "num_input_tokens_seen": 157720555, + "step": 7356, + "time_per_iteration": 4.162431955337524 + }, + { + "auxiliary_loss_clip": 0.0105552, + "auxiliary_loss_mlp": 0.00747622, + "balance_loss_clip": 1.02779198, + "balance_loss_mlp": 1.00026464, + "epoch": 0.4423267698782504, + "flos": 17821820010240.0, + "grad_norm": 1.5558560770628658, + "language_loss": 0.77360755, + "learning_rate": 2.4663321247445577e-06, + "loss": 0.79163897, + "num_input_tokens_seen": 157739160, + "step": 7357, + "time_per_iteration": 2.599419355392456 + }, + { + "auxiliary_loss_clip": 0.01049215, + "auxiliary_loss_mlp": 0.01039962, + "balance_loss_clip": 1.02732635, + "balance_loss_mlp": 1.02656841, + "epoch": 0.4423868931309184, + "flos": 29204424691200.0, + "grad_norm": 1.7959197219509044, + "language_loss": 0.73099732, + "learning_rate": 2.465953388982481e-06, + "loss": 0.75188911, + "num_input_tokens_seen": 157760020, + "step": 7358, + "time_per_iteration": 2.700713872909546 + }, + { + "auxiliary_loss_clip": 0.01057252, + "auxiliary_loss_mlp": 0.01035831, + "balance_loss_clip": 1.029989, + "balance_loss_mlp": 1.02352309, + "epoch": 0.44244701638358636, + "flos": 29713135057920.0, + "grad_norm": 2.5274437786174766, + "language_loss": 0.75780499, + "learning_rate": 2.465574635551405e-06, + "loss": 0.77873588, + "num_input_tokens_seen": 157780435, + "step": 7359, + "time_per_iteration": 2.6272311210632324 + }, + { + "auxiliary_loss_clip": 0.01053432, + "auxiliary_loss_mlp": 0.01040167, + "balance_loss_clip": 1.02670491, + "balance_loss_mlp": 1.02670217, + "epoch": 0.4425071396362543, + "flos": 22930040874240.0, + "grad_norm": 2.5542372913644598, + "language_loss": 0.69878501, + "learning_rate": 2.4651958644656923e-06, + "loss": 0.71972096, + "num_input_tokens_seen": 157799420, + "step": 7360, + "time_per_iteration": 2.5841150283813477 + }, + { + "auxiliary_loss_clip": 0.01054456, + "auxiliary_loss_mlp": 0.01037169, + "balance_loss_clip": 1.02813005, + "balance_loss_mlp": 1.02447319, + "epoch": 0.4425672628889223, + "flos": 19792346175360.0, + "grad_norm": 2.7585708522221073, + "language_loss": 0.69522166, + "learning_rate": 2.4648170757397053e-06, + "loss": 0.71613789, + "num_input_tokens_seen": 157817025, + "step": 7361, + "time_per_iteration": 2.5733582973480225 + }, + { + "auxiliary_loss_clip": 0.01048587, + "auxiliary_loss_mlp": 0.0103968, + "balance_loss_clip": 1.0257237, + "balance_loss_mlp": 1.02553535, + "epoch": 0.44262738614159025, + "flos": 13662215377920.0, + "grad_norm": 1.9644641027580392, + "language_loss": 0.82263935, + "learning_rate": 2.464438269387809e-06, + "loss": 0.84352201, + "num_input_tokens_seen": 157834345, + "step": 7362, + "time_per_iteration": 2.655383825302124 + }, + { + "auxiliary_loss_clip": 0.01049592, + "auxiliary_loss_mlp": 0.01044265, + "balance_loss_clip": 1.02875638, + "balance_loss_mlp": 1.0302223, + "epoch": 0.4426875093942582, + "flos": 14210212245120.0, + "grad_norm": 1.7339168789907562, + "language_loss": 0.74617219, + "learning_rate": 2.464059445424366e-06, + "loss": 0.76711071, + "num_input_tokens_seen": 157852290, + "step": 7363, + "time_per_iteration": 2.662853240966797 + }, + { + "auxiliary_loss_clip": 0.00973348, + "auxiliary_loss_mlp": 0.01012682, + "balance_loss_clip": 1.00479698, + "balance_loss_mlp": 1.01119196, + "epoch": 0.4427476326469262, + "flos": 70117525728000.0, + "grad_norm": 0.6825629482349033, + "language_loss": 0.556422, + "learning_rate": 2.463680603863743e-06, + "loss": 0.57628226, + "num_input_tokens_seen": 157923060, + "step": 7364, + "time_per_iteration": 4.9524900913238525 + }, + { + "auxiliary_loss_clip": 0.01053244, + "auxiliary_loss_mlp": 0.01035937, + "balance_loss_clip": 1.02746677, + "balance_loss_mlp": 1.02350926, + "epoch": 0.44280775589959415, + "flos": 25445080287360.0, + "grad_norm": 1.6938087535907307, + "language_loss": 0.74340022, + "learning_rate": 2.463301744720305e-06, + "loss": 0.764292, + "num_input_tokens_seen": 157944110, + "step": 7365, + "time_per_iteration": 2.9475395679473877 + }, + { + "auxiliary_loss_clip": 0.01044632, + "auxiliary_loss_mlp": 0.01040635, + "balance_loss_clip": 1.02568936, + "balance_loss_mlp": 1.0271579, + "epoch": 0.4428678791522621, + "flos": 22857214049280.0, + "grad_norm": 2.247245977087458, + "language_loss": 0.73915267, + "learning_rate": 2.4629228680084184e-06, + "loss": 0.76000535, + "num_input_tokens_seen": 157964295, + "step": 7366, + "time_per_iteration": 2.6479859352111816 + }, + { + "auxiliary_loss_clip": 0.01054531, + "auxiliary_loss_mlp": 0.01028721, + "balance_loss_clip": 1.02822089, + "balance_loss_mlp": 1.01555419, + "epoch": 0.44292800240493013, + "flos": 25812446636160.0, + "grad_norm": 2.64264363893694, + "language_loss": 0.731466, + "learning_rate": 2.46254397374245e-06, + "loss": 0.75229853, + "num_input_tokens_seen": 157983970, + "step": 7367, + "time_per_iteration": 2.6607069969177246 + }, + { + "auxiliary_loss_clip": 0.010773, + "auxiliary_loss_mlp": 0.01038316, + "balance_loss_clip": 1.03001022, + "balance_loss_mlp": 1.02549469, + "epoch": 0.4429881256575981, + "flos": 32416885549440.0, + "grad_norm": 1.4241397899645931, + "language_loss": 0.74015093, + "learning_rate": 2.4621650619367677e-06, + "loss": 0.76130712, + "num_input_tokens_seen": 158006515, + "step": 7368, + "time_per_iteration": 2.6971709728240967 + }, + { + "auxiliary_loss_clip": 0.01045069, + "auxiliary_loss_mlp": 0.01036505, + "balance_loss_clip": 1.02552974, + "balance_loss_mlp": 1.02368414, + "epoch": 0.44304824891026606, + "flos": 22163707186560.0, + "grad_norm": 1.7592939724890966, + "language_loss": 0.79902172, + "learning_rate": 2.4617861326057403e-06, + "loss": 0.81983751, + "num_input_tokens_seen": 158025565, + "step": 7369, + "time_per_iteration": 4.236829042434692 + }, + { + "auxiliary_loss_clip": 0.01039058, + "auxiliary_loss_mlp": 0.0102925, + "balance_loss_clip": 1.02736831, + "balance_loss_mlp": 1.01691151, + "epoch": 0.443108372162934, + "flos": 25338569483520.0, + "grad_norm": 2.151828520608499, + "language_loss": 0.72122353, + "learning_rate": 2.461407185763737e-06, + "loss": 0.74190658, + "num_input_tokens_seen": 158045620, + "step": 7370, + "time_per_iteration": 2.6811106204986572 + }, + { + "auxiliary_loss_clip": 0.01075575, + "auxiliary_loss_mlp": 0.01033111, + "balance_loss_clip": 1.02827859, + "balance_loss_mlp": 1.0202657, + "epoch": 0.443168495415602, + "flos": 23330947547520.0, + "grad_norm": 2.677226040896859, + "language_loss": 0.70744258, + "learning_rate": 2.461028221425126e-06, + "loss": 0.72852945, + "num_input_tokens_seen": 158063505, + "step": 7371, + "time_per_iteration": 2.5741348266601562 + }, + { + "auxiliary_loss_clip": 0.01063143, + "auxiliary_loss_mlp": 0.01030003, + "balance_loss_clip": 1.02774322, + "balance_loss_mlp": 1.01836848, + "epoch": 0.44322861866826996, + "flos": 21871502046720.0, + "grad_norm": 2.109255134511257, + "language_loss": 0.6835531, + "learning_rate": 2.4606492396042786e-06, + "loss": 0.70448458, + "num_input_tokens_seen": 158080335, + "step": 7372, + "time_per_iteration": 2.6188833713531494 + }, + { + "auxiliary_loss_clip": 0.010417, + "auxiliary_loss_mlp": 0.01035297, + "balance_loss_clip": 1.0256114, + "balance_loss_mlp": 1.02136731, + "epoch": 0.4432887419209379, + "flos": 20084407660800.0, + "grad_norm": 7.298959505721236, + "language_loss": 0.83432317, + "learning_rate": 2.4602702403155664e-06, + "loss": 0.85509312, + "num_input_tokens_seen": 158098955, + "step": 7373, + "time_per_iteration": 2.6442675590515137 + }, + { + "auxiliary_loss_clip": 0.0100285, + "auxiliary_loss_mlp": 0.01002808, + "balance_loss_clip": 1.00384092, + "balance_loss_mlp": 1.00113893, + "epoch": 0.4433488651736059, + "flos": 70035540935040.0, + "grad_norm": 0.7559725798865873, + "language_loss": 0.55275112, + "learning_rate": 2.4598912235733604e-06, + "loss": 0.57280761, + "num_input_tokens_seen": 158164110, + "step": 7374, + "time_per_iteration": 3.254215955734253 + }, + { + "auxiliary_loss_clip": 0.01032509, + "auxiliary_loss_mlp": 0.01043856, + "balance_loss_clip": 1.02732861, + "balance_loss_mlp": 1.03024817, + "epoch": 0.44340898842627385, + "flos": 16282472705280.0, + "grad_norm": 4.340311439918872, + "language_loss": 0.82238811, + "learning_rate": 2.4595121893920327e-06, + "loss": 0.84315169, + "num_input_tokens_seen": 158179850, + "step": 7375, + "time_per_iteration": 2.667696237564087 + }, + { + "auxiliary_loss_clip": 0.01074237, + "auxiliary_loss_mlp": 0.01030644, + "balance_loss_clip": 1.02733612, + "balance_loss_mlp": 1.01789451, + "epoch": 0.4434691116789418, + "flos": 16611989097600.0, + "grad_norm": 2.2752084358732727, + "language_loss": 0.8451739, + "learning_rate": 2.4591331377859578e-06, + "loss": 0.86622268, + "num_input_tokens_seen": 158196590, + "step": 7376, + "time_per_iteration": 2.548731565475464 + }, + { + "auxiliary_loss_clip": 0.01054038, + "auxiliary_loss_mlp": 0.01032721, + "balance_loss_clip": 1.02732456, + "balance_loss_mlp": 1.02031088, + "epoch": 0.4435292349316098, + "flos": 19063251912960.0, + "grad_norm": 1.7997472512629813, + "language_loss": 0.77341402, + "learning_rate": 2.4587540687695077e-06, + "loss": 0.7942816, + "num_input_tokens_seen": 158216355, + "step": 7377, + "time_per_iteration": 2.6048312187194824 + }, + { + "auxiliary_loss_clip": 0.01063552, + "auxiliary_loss_mlp": 0.010311, + "balance_loss_clip": 1.02896023, + "balance_loss_mlp": 1.01905394, + "epoch": 0.44358935818427775, + "flos": 21251324799360.0, + "grad_norm": 1.9467744614209732, + "language_loss": 0.76219708, + "learning_rate": 2.458374982357057e-06, + "loss": 0.78314352, + "num_input_tokens_seen": 158235825, + "step": 7378, + "time_per_iteration": 2.5826971530914307 + }, + { + "auxiliary_loss_clip": 0.01048208, + "auxiliary_loss_mlp": 0.0104389, + "balance_loss_clip": 1.02656996, + "balance_loss_mlp": 1.02930474, + "epoch": 0.4436494814369457, + "flos": 12495298239360.0, + "grad_norm": 2.9529662276998634, + "language_loss": 0.68606532, + "learning_rate": 2.457995878562982e-06, + "loss": 0.70698631, + "num_input_tokens_seen": 158254230, + "step": 7379, + "time_per_iteration": 2.5988245010375977 + }, + { + "auxiliary_loss_clip": 0.01005336, + "auxiliary_loss_mlp": 0.0104067, + "balance_loss_clip": 1.02152085, + "balance_loss_mlp": 1.02622187, + "epoch": 0.44370960468961373, + "flos": 23659853408640.0, + "grad_norm": 1.8254864649741132, + "language_loss": 0.72847569, + "learning_rate": 2.457616757401656e-06, + "loss": 0.7489357, + "num_input_tokens_seen": 158273400, + "step": 7380, + "time_per_iteration": 2.788511037826538 + }, + { + "auxiliary_loss_clip": 0.0105593, + "auxiliary_loss_mlp": 0.01034288, + "balance_loss_clip": 1.02864408, + "balance_loss_mlp": 1.02196729, + "epoch": 0.4437697279422817, + "flos": 32416849635840.0, + "grad_norm": 1.704504890926331, + "language_loss": 0.6474207, + "learning_rate": 2.457237618887458e-06, + "loss": 0.66832286, + "num_input_tokens_seen": 158296840, + "step": 7381, + "time_per_iteration": 2.6659786701202393 + }, + { + "auxiliary_loss_clip": 0.01066509, + "auxiliary_loss_mlp": 0.01036012, + "balance_loss_clip": 1.03041565, + "balance_loss_mlp": 1.02310157, + "epoch": 0.44382985119494966, + "flos": 18112875914880.0, + "grad_norm": 2.0945453778984304, + "language_loss": 0.79864061, + "learning_rate": 2.456858463034763e-06, + "loss": 0.81966579, + "num_input_tokens_seen": 158314935, + "step": 7382, + "time_per_iteration": 2.555737018585205 + }, + { + "auxiliary_loss_clip": 0.01069252, + "auxiliary_loss_mlp": 0.01040623, + "balance_loss_clip": 1.03078413, + "balance_loss_mlp": 1.02774847, + "epoch": 0.44388997444761763, + "flos": 30774151923840.0, + "grad_norm": 1.7264926709207105, + "language_loss": 0.65607417, + "learning_rate": 2.456479289857949e-06, + "loss": 0.6771729, + "num_input_tokens_seen": 158334620, + "step": 7383, + "time_per_iteration": 2.7363171577453613 + }, + { + "auxiliary_loss_clip": 0.01050544, + "auxiliary_loss_mlp": 0.01034462, + "balance_loss_clip": 1.02769089, + "balance_loss_mlp": 1.02070546, + "epoch": 0.4439500977002856, + "flos": 20339157893760.0, + "grad_norm": 2.3492365322755524, + "language_loss": 0.75658822, + "learning_rate": 2.4561000993713953e-06, + "loss": 0.77743828, + "num_input_tokens_seen": 158350550, + "step": 7384, + "time_per_iteration": 2.6871180534362793 + }, + { + "auxiliary_loss_clip": 0.01078453, + "auxiliary_loss_mlp": 0.01038816, + "balance_loss_clip": 1.02971935, + "balance_loss_mlp": 1.02589345, + "epoch": 0.44401022095295356, + "flos": 20371225760640.0, + "grad_norm": 1.913559578684235, + "language_loss": 0.81180459, + "learning_rate": 2.4557208915894796e-06, + "loss": 0.83297729, + "num_input_tokens_seen": 158369555, + "step": 7385, + "time_per_iteration": 2.5266361236572266 + }, + { + "auxiliary_loss_clip": 0.01029276, + "auxiliary_loss_mlp": 0.01035227, + "balance_loss_clip": 1.02549994, + "balance_loss_mlp": 1.02076662, + "epoch": 0.4440703442056215, + "flos": 20230635928320.0, + "grad_norm": 1.661164422057976, + "language_loss": 0.81372803, + "learning_rate": 2.455341666526582e-06, + "loss": 0.83437306, + "num_input_tokens_seen": 158388045, + "step": 7386, + "time_per_iteration": 2.7385780811309814 + }, + { + "auxiliary_loss_clip": 0.01042575, + "auxiliary_loss_mlp": 0.0104382, + "balance_loss_clip": 1.02786624, + "balance_loss_mlp": 1.02865636, + "epoch": 0.4441304674582895, + "flos": 39494698824960.0, + "grad_norm": 1.7719608777284988, + "language_loss": 0.6961903, + "learning_rate": 2.4549624241970832e-06, + "loss": 0.71705425, + "num_input_tokens_seen": 158410115, + "step": 7387, + "time_per_iteration": 2.84328031539917 + }, + { + "auxiliary_loss_clip": 0.01025766, + "auxiliary_loss_mlp": 0.01047182, + "balance_loss_clip": 1.02887797, + "balance_loss_mlp": 1.03294206, + "epoch": 0.44419059071095746, + "flos": 14829671220480.0, + "grad_norm": 2.416143813941997, + "language_loss": 0.7166217, + "learning_rate": 2.4545831646153628e-06, + "loss": 0.73735118, + "num_input_tokens_seen": 158427765, + "step": 7388, + "time_per_iteration": 2.779928684234619 + }, + { + "auxiliary_loss_clip": 0.01066392, + "auxiliary_loss_mlp": 0.01037249, + "balance_loss_clip": 1.02781987, + "balance_loss_mlp": 1.0239985, + "epoch": 0.4442507139636254, + "flos": 22637835734400.0, + "grad_norm": 1.5738400056518194, + "language_loss": 0.6878618, + "learning_rate": 2.4542038877958044e-06, + "loss": 0.70889819, + "num_input_tokens_seen": 158446375, + "step": 7389, + "time_per_iteration": 2.6370325088500977 + }, + { + "auxiliary_loss_clip": 0.01066043, + "auxiliary_loss_mlp": 0.01035785, + "balance_loss_clip": 1.02879953, + "balance_loss_mlp": 1.02263653, + "epoch": 0.4443108372162934, + "flos": 38290721829120.0, + "grad_norm": 1.9554042110268024, + "language_loss": 0.74636662, + "learning_rate": 2.453824593752788e-06, + "loss": 0.76738489, + "num_input_tokens_seen": 158467260, + "step": 7390, + "time_per_iteration": 2.7406904697418213 + }, + { + "auxiliary_loss_clip": 0.0105709, + "auxiliary_loss_mlp": 0.01035482, + "balance_loss_clip": 1.02602363, + "balance_loss_mlp": 1.02156377, + "epoch": 0.44437096046896135, + "flos": 17748993185280.0, + "grad_norm": 2.0115365358958717, + "language_loss": 0.8138777, + "learning_rate": 2.4534452825006988e-06, + "loss": 0.8348034, + "num_input_tokens_seen": 158486720, + "step": 7391, + "time_per_iteration": 2.6725831031799316 + }, + { + "auxiliary_loss_clip": 0.01049919, + "auxiliary_loss_mlp": 0.01038965, + "balance_loss_clip": 1.02907205, + "balance_loss_mlp": 1.025244, + "epoch": 0.4444310837216293, + "flos": 13732348682880.0, + "grad_norm": 1.725096723001453, + "language_loss": 0.73538399, + "learning_rate": 2.4530659540539185e-06, + "loss": 0.75627279, + "num_input_tokens_seen": 158502530, + "step": 7392, + "time_per_iteration": 2.6854491233825684 + }, + { + "auxiliary_loss_clip": 0.01062512, + "auxiliary_loss_mlp": 0.01034165, + "balance_loss_clip": 1.02642071, + "balance_loss_mlp": 1.02217841, + "epoch": 0.44449120697429734, + "flos": 25010238240000.0, + "grad_norm": 1.57124609479393, + "language_loss": 0.79763794, + "learning_rate": 2.4526866084268313e-06, + "loss": 0.81860471, + "num_input_tokens_seen": 158522715, + "step": 7393, + "time_per_iteration": 2.6731483936309814 + }, + { + "auxiliary_loss_clip": 0.01065815, + "auxiliary_loss_mlp": 0.01034134, + "balance_loss_clip": 1.02731431, + "balance_loss_mlp": 1.02068734, + "epoch": 0.4445513302269653, + "flos": 32671707609600.0, + "grad_norm": 1.8321117970589542, + "language_loss": 0.80401868, + "learning_rate": 2.4523072456338226e-06, + "loss": 0.82501817, + "num_input_tokens_seen": 158543615, + "step": 7394, + "time_per_iteration": 2.6925923824310303 + }, + { + "auxiliary_loss_clip": 0.0104661, + "auxiliary_loss_mlp": 0.01037028, + "balance_loss_clip": 1.02516842, + "balance_loss_mlp": 1.02533937, + "epoch": 0.44461145347963327, + "flos": 11655814504320.0, + "grad_norm": 1.892045392105673, + "language_loss": 0.79230016, + "learning_rate": 2.4519278656892785e-06, + "loss": 0.81313658, + "num_input_tokens_seen": 158560330, + "step": 7395, + "time_per_iteration": 2.596520185470581 + }, + { + "auxiliary_loss_clip": 0.01048716, + "auxiliary_loss_mlp": 0.01034528, + "balance_loss_clip": 1.02573955, + "balance_loss_mlp": 1.02182055, + "epoch": 0.44467157673230123, + "flos": 20886759711360.0, + "grad_norm": 6.440598604557875, + "language_loss": 0.67979318, + "learning_rate": 2.451548468607584e-06, + "loss": 0.7006256, + "num_input_tokens_seen": 158579735, + "step": 7396, + "time_per_iteration": 2.676689863204956 + }, + { + "auxiliary_loss_clip": 0.01057739, + "auxiliary_loss_mlp": 0.00747601, + "balance_loss_clip": 1.02596307, + "balance_loss_mlp": 1.0002867, + "epoch": 0.4447316999849692, + "flos": 18546137763840.0, + "grad_norm": 1.7446615807685317, + "language_loss": 0.80993199, + "learning_rate": 2.451169054403126e-06, + "loss": 0.82798541, + "num_input_tokens_seen": 158597075, + "step": 7397, + "time_per_iteration": 2.670534372329712 + }, + { + "auxiliary_loss_clip": 0.01064513, + "auxiliary_loss_mlp": 0.01035577, + "balance_loss_clip": 1.02857471, + "balance_loss_mlp": 1.02314329, + "epoch": 0.44479182323763716, + "flos": 23769057732480.0, + "grad_norm": 1.5853909196861844, + "language_loss": 0.67609978, + "learning_rate": 2.450789623090293e-06, + "loss": 0.6971007, + "num_input_tokens_seen": 158616650, + "step": 7398, + "time_per_iteration": 2.586679220199585 + }, + { + "auxiliary_loss_clip": 0.01042156, + "auxiliary_loss_mlp": 0.0103887, + "balance_loss_clip": 1.02714872, + "balance_loss_mlp": 1.026824, + "epoch": 0.44485194649030513, + "flos": 16543831040640.0, + "grad_norm": 2.120988520080226, + "language_loss": 0.69518495, + "learning_rate": 2.450410174683472e-06, + "loss": 0.71599519, + "num_input_tokens_seen": 158634515, + "step": 7399, + "time_per_iteration": 2.616452693939209 + }, + { + "auxiliary_loss_clip": 0.01041484, + "auxiliary_loss_mlp": 0.01032549, + "balance_loss_clip": 1.02747536, + "balance_loss_mlp": 1.02016914, + "epoch": 0.4449120697429731, + "flos": 22600955445120.0, + "grad_norm": 2.198436044997554, + "language_loss": 0.7212382, + "learning_rate": 2.4500307091970514e-06, + "loss": 0.74197853, + "num_input_tokens_seen": 158653760, + "step": 7400, + "time_per_iteration": 2.646585702896118 + }, + { + "auxiliary_loss_clip": 0.01032901, + "auxiliary_loss_mlp": 0.00747287, + "balance_loss_clip": 1.02805972, + "balance_loss_mlp": 1.00027215, + "epoch": 0.44497219299564106, + "flos": 20004864992640.0, + "grad_norm": 1.8910227550566145, + "language_loss": 0.84492624, + "learning_rate": 2.449651226645422e-06, + "loss": 0.86272806, + "num_input_tokens_seen": 158672190, + "step": 7401, + "time_per_iteration": 2.717031478881836 + }, + { + "auxiliary_loss_clip": 0.01051928, + "auxiliary_loss_mlp": 0.01034121, + "balance_loss_clip": 1.02776909, + "balance_loss_mlp": 1.02269483, + "epoch": 0.445032316248309, + "flos": 25594253470080.0, + "grad_norm": 4.784319162453119, + "language_loss": 0.83624196, + "learning_rate": 2.449271727042973e-06, + "loss": 0.85710251, + "num_input_tokens_seen": 158694115, + "step": 7402, + "time_per_iteration": 4.293980598449707 + }, + { + "auxiliary_loss_clip": 0.010549, + "auxiliary_loss_mlp": 0.01035778, + "balance_loss_clip": 1.02811074, + "balance_loss_mlp": 1.02284968, + "epoch": 0.445092439500977, + "flos": 21250426959360.0, + "grad_norm": 7.32186640159141, + "language_loss": 0.77210945, + "learning_rate": 2.4488922104040947e-06, + "loss": 0.7930162, + "num_input_tokens_seen": 158711000, + "step": 7403, + "time_per_iteration": 4.138664245605469 + }, + { + "auxiliary_loss_clip": 0.00994341, + "auxiliary_loss_mlp": 0.01007492, + "balance_loss_clip": 1.00596976, + "balance_loss_mlp": 1.00573921, + "epoch": 0.44515256275364495, + "flos": 57764900309760.0, + "grad_norm": 0.7679598659727541, + "language_loss": 0.600389, + "learning_rate": 2.4485126767431793e-06, + "loss": 0.62040734, + "num_input_tokens_seen": 158769675, + "step": 7404, + "time_per_iteration": 3.1673104763031006 + }, + { + "auxiliary_loss_clip": 0.01049297, + "auxiliary_loss_mlp": 0.01041826, + "balance_loss_clip": 1.02652121, + "balance_loss_mlp": 1.02728212, + "epoch": 0.4452126860063129, + "flos": 15596004908160.0, + "grad_norm": 5.021825089458091, + "language_loss": 0.81848264, + "learning_rate": 2.4481331260746177e-06, + "loss": 0.83939385, + "num_input_tokens_seen": 158788215, + "step": 7405, + "time_per_iteration": 2.613321542739868 + }, + { + "auxiliary_loss_clip": 0.01053912, + "auxiliary_loss_mlp": 0.01028656, + "balance_loss_clip": 1.0279218, + "balance_loss_mlp": 1.01693761, + "epoch": 0.4452728092589809, + "flos": 21617398258560.0, + "grad_norm": 1.604969110424724, + "language_loss": 0.7483753, + "learning_rate": 2.4477535584128036e-06, + "loss": 0.76920104, + "num_input_tokens_seen": 158809090, + "step": 7406, + "time_per_iteration": 2.6691627502441406 + }, + { + "auxiliary_loss_clip": 0.01043497, + "auxiliary_loss_mlp": 0.01028651, + "balance_loss_clip": 1.02795839, + "balance_loss_mlp": 1.01663482, + "epoch": 0.4453329325116489, + "flos": 29497491757440.0, + "grad_norm": 1.6341492493636338, + "language_loss": 0.65306652, + "learning_rate": 2.447373973772129e-06, + "loss": 0.67378801, + "num_input_tokens_seen": 158828320, + "step": 7407, + "time_per_iteration": 2.7895381450653076 + }, + { + "auxiliary_loss_clip": 0.01049367, + "auxiliary_loss_mlp": 0.01030748, + "balance_loss_clip": 1.02772713, + "balance_loss_mlp": 1.018291, + "epoch": 0.44539305576431687, + "flos": 21361139654400.0, + "grad_norm": 1.583585728935061, + "language_loss": 0.67510277, + "learning_rate": 2.4469943721669887e-06, + "loss": 0.69590396, + "num_input_tokens_seen": 158847040, + "step": 7408, + "time_per_iteration": 2.6154532432556152 + }, + { + "auxiliary_loss_clip": 0.01073356, + "auxiliary_loss_mlp": 0.01033772, + "balance_loss_clip": 1.02672601, + "balance_loss_mlp": 1.02086163, + "epoch": 0.44545317901698483, + "flos": 41427626428800.0, + "grad_norm": 1.6447670596906658, + "language_loss": 0.71773505, + "learning_rate": 2.4466147536117776e-06, + "loss": 0.73880637, + "num_input_tokens_seen": 158870490, + "step": 7409, + "time_per_iteration": 2.76969313621521 + }, + { + "auxiliary_loss_clip": 0.01054162, + "auxiliary_loss_mlp": 0.01036651, + "balance_loss_clip": 1.02707887, + "balance_loss_mlp": 1.0233655, + "epoch": 0.4455133022696528, + "flos": 22055005653120.0, + "grad_norm": 2.0734767896311164, + "language_loss": 0.65527642, + "learning_rate": 2.4462351181208895e-06, + "loss": 0.67618454, + "num_input_tokens_seen": 158889920, + "step": 7410, + "time_per_iteration": 2.644745349884033 + }, + { + "auxiliary_loss_clip": 0.01053232, + "auxiliary_loss_mlp": 0.01035341, + "balance_loss_clip": 1.02609229, + "balance_loss_mlp": 1.02190661, + "epoch": 0.44557342552232077, + "flos": 23476960333440.0, + "grad_norm": 2.005576005323613, + "language_loss": 0.740381, + "learning_rate": 2.4458554657087217e-06, + "loss": 0.76126677, + "num_input_tokens_seen": 158909580, + "step": 7411, + "time_per_iteration": 4.215239524841309 + }, + { + "auxiliary_loss_clip": 0.0102342, + "auxiliary_loss_mlp": 0.01033417, + "balance_loss_clip": 1.03027916, + "balance_loss_mlp": 1.02129936, + "epoch": 0.44563354877498873, + "flos": 19134678107520.0, + "grad_norm": 2.2389592179507862, + "language_loss": 0.79098004, + "learning_rate": 2.4454757963896695e-06, + "loss": 0.81154841, + "num_input_tokens_seen": 158924600, + "step": 7412, + "time_per_iteration": 2.7300803661346436 + }, + { + "auxiliary_loss_clip": 0.01057008, + "auxiliary_loss_mlp": 0.01033881, + "balance_loss_clip": 1.02783108, + "balance_loss_mlp": 1.02144134, + "epoch": 0.4456936720276567, + "flos": 13621420506240.0, + "grad_norm": 2.168922954345238, + "language_loss": 0.80268121, + "learning_rate": 2.4450961101781304e-06, + "loss": 0.8235901, + "num_input_tokens_seen": 158939345, + "step": 7413, + "time_per_iteration": 2.7169601917266846 + }, + { + "auxiliary_loss_clip": 0.01062279, + "auxiliary_loss_mlp": 0.0102985, + "balance_loss_clip": 1.02770531, + "balance_loss_mlp": 1.01834631, + "epoch": 0.44575379528032466, + "flos": 14713715139840.0, + "grad_norm": 2.588521515107106, + "language_loss": 0.76277339, + "learning_rate": 2.4447164070885026e-06, + "loss": 0.78369468, + "num_input_tokens_seen": 158955855, + "step": 7414, + "time_per_iteration": 2.5965895652770996 + }, + { + "auxiliary_loss_clip": 0.01044283, + "auxiliary_loss_mlp": 0.01037978, + "balance_loss_clip": 1.02428555, + "balance_loss_mlp": 1.02456725, + "epoch": 0.4458139185329926, + "flos": 24170682677760.0, + "grad_norm": 1.5111326523267847, + "language_loss": 0.83365893, + "learning_rate": 2.4443366871351837e-06, + "loss": 0.85448158, + "num_input_tokens_seen": 158976315, + "step": 7415, + "time_per_iteration": 2.622981309890747 + }, + { + "auxiliary_loss_clip": 0.01070952, + "auxiliary_loss_mlp": 0.01037666, + "balance_loss_clip": 1.02577376, + "balance_loss_mlp": 1.02547073, + "epoch": 0.4458740417856606, + "flos": 21762225895680.0, + "grad_norm": 1.5186744710107225, + "language_loss": 0.8389982, + "learning_rate": 2.4439569503325732e-06, + "loss": 0.86008441, + "num_input_tokens_seen": 158996725, + "step": 7416, + "time_per_iteration": 4.122832298278809 + }, + { + "auxiliary_loss_clip": 0.01043257, + "auxiliary_loss_mlp": 0.01035366, + "balance_loss_clip": 1.02612889, + "balance_loss_mlp": 1.02244413, + "epoch": 0.44593416503832856, + "flos": 21068790860160.0, + "grad_norm": 4.822759760917808, + "language_loss": 0.81142682, + "learning_rate": 2.4435771966950706e-06, + "loss": 0.83221304, + "num_input_tokens_seen": 159017255, + "step": 7417, + "time_per_iteration": 2.645709991455078 + }, + { + "auxiliary_loss_clip": 0.0105161, + "auxiliary_loss_mlp": 0.01036412, + "balance_loss_clip": 1.02580905, + "balance_loss_mlp": 1.02356744, + "epoch": 0.4459942882909965, + "flos": 22600488568320.0, + "grad_norm": 2.0518270325157753, + "language_loss": 0.81108093, + "learning_rate": 2.443197426237077e-06, + "loss": 0.83196115, + "num_input_tokens_seen": 159035010, + "step": 7418, + "time_per_iteration": 2.615760326385498 + }, + { + "auxiliary_loss_clip": 0.01063675, + "auxiliary_loss_mlp": 0.0074761, + "balance_loss_clip": 1.02669978, + "balance_loss_mlp": 1.00016344, + "epoch": 0.4460544115436645, + "flos": 26505486622080.0, + "grad_norm": 1.599267660436335, + "language_loss": 0.77096975, + "learning_rate": 2.442817638972991e-06, + "loss": 0.78908259, + "num_input_tokens_seen": 159055345, + "step": 7419, + "time_per_iteration": 2.6184823513031006 + }, + { + "auxiliary_loss_clip": 0.01040069, + "auxiliary_loss_mlp": 0.01039534, + "balance_loss_clip": 1.02566862, + "balance_loss_mlp": 1.02719545, + "epoch": 0.4461145347963325, + "flos": 17604021893760.0, + "grad_norm": 1.949739956579391, + "language_loss": 0.72242481, + "learning_rate": 2.4424378349172176e-06, + "loss": 0.74322087, + "num_input_tokens_seen": 159074225, + "step": 7420, + "time_per_iteration": 2.5999865531921387 + }, + { + "auxiliary_loss_clip": 0.01052683, + "auxiliary_loss_mlp": 0.01030998, + "balance_loss_clip": 1.02702832, + "balance_loss_mlp": 1.01822472, + "epoch": 0.44617465804900047, + "flos": 27268193036160.0, + "grad_norm": 1.5275333155942155, + "language_loss": 0.74839962, + "learning_rate": 2.442058014084156e-06, + "loss": 0.76923645, + "num_input_tokens_seen": 159095415, + "step": 7421, + "time_per_iteration": 2.6260218620300293 + }, + { + "auxiliary_loss_clip": 0.01007173, + "auxiliary_loss_mlp": 0.01036145, + "balance_loss_clip": 1.02373743, + "balance_loss_mlp": 1.02374709, + "epoch": 0.44623478130166844, + "flos": 17786412178560.0, + "grad_norm": 2.21687891868926, + "language_loss": 0.7625953, + "learning_rate": 2.44167817648821e-06, + "loss": 0.78302848, + "num_input_tokens_seen": 159114615, + "step": 7422, + "time_per_iteration": 2.6738762855529785 + }, + { + "auxiliary_loss_clip": 0.01074292, + "auxiliary_loss_mlp": 0.01036989, + "balance_loss_clip": 1.02787113, + "balance_loss_mlp": 1.02406108, + "epoch": 0.4462949045543364, + "flos": 23003011353600.0, + "grad_norm": 1.5277323083141239, + "language_loss": 0.65422595, + "learning_rate": 2.441298322143784e-06, + "loss": 0.6753388, + "num_input_tokens_seen": 159134370, + "step": 7423, + "time_per_iteration": 2.5507240295410156 + }, + { + "auxiliary_loss_clip": 0.010517, + "auxiliary_loss_mlp": 0.01033469, + "balance_loss_clip": 1.02774763, + "balance_loss_mlp": 1.02242422, + "epoch": 0.44635502780700437, + "flos": 17820096157440.0, + "grad_norm": 1.4130900950873124, + "language_loss": 0.79309893, + "learning_rate": 2.4409184510652807e-06, + "loss": 0.81395054, + "num_input_tokens_seen": 159152540, + "step": 7424, + "time_per_iteration": 2.7062671184539795 + }, + { + "auxiliary_loss_clip": 0.01061385, + "auxiliary_loss_mlp": 0.01030501, + "balance_loss_clip": 1.02775407, + "balance_loss_mlp": 1.01916444, + "epoch": 0.44641515105967233, + "flos": 26688020561280.0, + "grad_norm": 1.4961418113459932, + "language_loss": 0.8041966, + "learning_rate": 2.4405385632671063e-06, + "loss": 0.82511544, + "num_input_tokens_seen": 159173425, + "step": 7425, + "time_per_iteration": 2.6525614261627197 + }, + { + "auxiliary_loss_clip": 0.01062859, + "auxiliary_loss_mlp": 0.01027427, + "balance_loss_clip": 1.02837837, + "balance_loss_mlp": 1.01608419, + "epoch": 0.4464752743123403, + "flos": 18913324544640.0, + "grad_norm": 1.6564246420820417, + "language_loss": 0.77086031, + "learning_rate": 2.4401586587636655e-06, + "loss": 0.79176325, + "num_input_tokens_seen": 159191210, + "step": 7426, + "time_per_iteration": 2.699014902114868 + }, + { + "auxiliary_loss_clip": 0.0105457, + "auxiliary_loss_mlp": 0.00747572, + "balance_loss_clip": 1.02778363, + "balance_loss_mlp": 1.00018728, + "epoch": 0.44653539756500826, + "flos": 29570318582400.0, + "grad_norm": 1.7033194922866848, + "language_loss": 0.64311719, + "learning_rate": 2.4397787375693634e-06, + "loss": 0.66113865, + "num_input_tokens_seen": 159211755, + "step": 7427, + "time_per_iteration": 2.740356206893921 + }, + { + "auxiliary_loss_clip": 0.01060345, + "auxiliary_loss_mlp": 0.01032338, + "balance_loss_clip": 1.02900863, + "balance_loss_mlp": 1.02053058, + "epoch": 0.44659552081767623, + "flos": 21468979261440.0, + "grad_norm": 1.6360239872711049, + "language_loss": 0.75567782, + "learning_rate": 2.439398799698608e-06, + "loss": 0.77660459, + "num_input_tokens_seen": 159230315, + "step": 7428, + "time_per_iteration": 2.6848955154418945 + }, + { + "auxiliary_loss_clip": 0.01043249, + "auxiliary_loss_mlp": 0.01035101, + "balance_loss_clip": 1.02582645, + "balance_loss_mlp": 1.02222061, + "epoch": 0.4466556440703442, + "flos": 17931886260480.0, + "grad_norm": 1.7860557346493928, + "language_loss": 0.77690428, + "learning_rate": 2.439018845165806e-06, + "loss": 0.79768777, + "num_input_tokens_seen": 159249810, + "step": 7429, + "time_per_iteration": 2.7170088291168213 + }, + { + "auxiliary_loss_clip": 0.01065863, + "auxiliary_loss_mlp": 0.0103253, + "balance_loss_clip": 1.02869976, + "balance_loss_mlp": 1.01955986, + "epoch": 0.44671576732301216, + "flos": 21107430915840.0, + "grad_norm": 3.59408368160355, + "language_loss": 0.90931642, + "learning_rate": 2.438638873985366e-06, + "loss": 0.9303003, + "num_input_tokens_seen": 159271715, + "step": 7430, + "time_per_iteration": 2.5933778285980225 + }, + { + "auxiliary_loss_clip": 0.01055962, + "auxiliary_loss_mlp": 0.00747707, + "balance_loss_clip": 1.02778959, + "balance_loss_mlp": 1.00017369, + "epoch": 0.4467758905756801, + "flos": 23508920459520.0, + "grad_norm": 1.6246773267072792, + "language_loss": 0.79998362, + "learning_rate": 2.4382588861716954e-06, + "loss": 0.81802034, + "num_input_tokens_seen": 159290690, + "step": 7431, + "time_per_iteration": 2.6431775093078613 + }, + { + "auxiliary_loss_clip": 0.01059335, + "auxiliary_loss_mlp": 0.01031676, + "balance_loss_clip": 1.03007102, + "balance_loss_mlp": 1.01816988, + "epoch": 0.4468360138283481, + "flos": 18734022829440.0, + "grad_norm": 2.000480400587275, + "language_loss": 0.79597461, + "learning_rate": 2.437878881739204e-06, + "loss": 0.81688476, + "num_input_tokens_seen": 159309400, + "step": 7432, + "time_per_iteration": 2.643895149230957 + }, + { + "auxiliary_loss_clip": 0.01047003, + "auxiliary_loss_mlp": 0.01033737, + "balance_loss_clip": 1.02960944, + "balance_loss_mlp": 1.02177405, + "epoch": 0.4468961370810161, + "flos": 23477139901440.0, + "grad_norm": 2.4624453534901374, + "language_loss": 0.76687372, + "learning_rate": 2.437498860702301e-06, + "loss": 0.7876811, + "num_input_tokens_seen": 159327425, + "step": 7433, + "time_per_iteration": 2.6299588680267334 + }, + { + "auxiliary_loss_clip": 0.01060977, + "auxiliary_loss_mlp": 0.0103534, + "balance_loss_clip": 1.02747738, + "balance_loss_mlp": 1.02479601, + "epoch": 0.4469562603336841, + "flos": 30075042539520.0, + "grad_norm": 1.9353031025912142, + "language_loss": 0.77288425, + "learning_rate": 2.437118823075398e-06, + "loss": 0.79384738, + "num_input_tokens_seen": 159345805, + "step": 7434, + "time_per_iteration": 2.6556499004364014 + }, + { + "auxiliary_loss_clip": 0.01067613, + "auxiliary_loss_mlp": 0.0102755, + "balance_loss_clip": 1.03064966, + "balance_loss_mlp": 1.01565289, + "epoch": 0.44701638358635204, + "flos": 22456415116800.0, + "grad_norm": 1.6404237182946584, + "language_loss": 0.649746, + "learning_rate": 2.436738768872905e-06, + "loss": 0.67069763, + "num_input_tokens_seen": 159364595, + "step": 7435, + "time_per_iteration": 2.686232089996338 + }, + { + "auxiliary_loss_clip": 0.01057281, + "auxiliary_loss_mlp": 0.01030299, + "balance_loss_clip": 1.02916849, + "balance_loss_mlp": 1.01735854, + "epoch": 0.44707650683902, + "flos": 24057851080320.0, + "grad_norm": 1.700720195345428, + "language_loss": 0.83449066, + "learning_rate": 2.4363586981092346e-06, + "loss": 0.85536647, + "num_input_tokens_seen": 159385265, + "step": 7436, + "time_per_iteration": 2.734760046005249 + }, + { + "auxiliary_loss_clip": 0.01018843, + "auxiliary_loss_mlp": 0.01043922, + "balance_loss_clip": 1.02429724, + "balance_loss_mlp": 1.02852035, + "epoch": 0.44713663009168797, + "flos": 23766938830080.0, + "grad_norm": 1.60652787595035, + "language_loss": 0.79727864, + "learning_rate": 2.435978610798798e-06, + "loss": 0.81790626, + "num_input_tokens_seen": 159405080, + "step": 7437, + "time_per_iteration": 2.7634050846099854 + }, + { + "auxiliary_loss_clip": 0.01036174, + "auxiliary_loss_mlp": 0.01033458, + "balance_loss_clip": 1.02887022, + "balance_loss_mlp": 1.02112627, + "epoch": 0.44719675334435594, + "flos": 24499265316480.0, + "grad_norm": 2.0092914667838038, + "language_loss": 0.71846616, + "learning_rate": 2.435598506956009e-06, + "loss": 0.7391625, + "num_input_tokens_seen": 159424595, + "step": 7438, + "time_per_iteration": 2.7533183097839355 + }, + { + "auxiliary_loss_clip": 0.01032653, + "auxiliary_loss_mlp": 0.01036244, + "balance_loss_clip": 1.02779937, + "balance_loss_mlp": 1.0229876, + "epoch": 0.4472568765970239, + "flos": 29781759991680.0, + "grad_norm": 1.758787033944838, + "language_loss": 0.67208213, + "learning_rate": 2.4352183865952808e-06, + "loss": 0.69277114, + "num_input_tokens_seen": 159443865, + "step": 7439, + "time_per_iteration": 2.8387537002563477 + }, + { + "auxiliary_loss_clip": 0.01046809, + "auxiliary_loss_mlp": 0.01039053, + "balance_loss_clip": 1.025635, + "balance_loss_mlp": 1.02423525, + "epoch": 0.44731699984969187, + "flos": 24643123286400.0, + "grad_norm": 1.6557205506399797, + "language_loss": 0.73952699, + "learning_rate": 2.4348382497310285e-06, + "loss": 0.76038563, + "num_input_tokens_seen": 159464525, + "step": 7440, + "time_per_iteration": 2.850071430206299 + }, + { + "auxiliary_loss_clip": 0.01018554, + "auxiliary_loss_mlp": 0.01044296, + "balance_loss_clip": 1.02231121, + "balance_loss_mlp": 1.03052127, + "epoch": 0.44737712310235983, + "flos": 29455691304960.0, + "grad_norm": 2.675154213471553, + "language_loss": 0.73927265, + "learning_rate": 2.4344580963776655e-06, + "loss": 0.75990117, + "num_input_tokens_seen": 159486385, + "step": 7441, + "time_per_iteration": 2.9189083576202393 + }, + { + "auxiliary_loss_clip": 0.01048112, + "auxiliary_loss_mlp": 0.01034479, + "balance_loss_clip": 1.02897131, + "balance_loss_mlp": 1.02095437, + "epoch": 0.4474372463550278, + "flos": 24896832024960.0, + "grad_norm": 1.9954557279130258, + "language_loss": 0.74761283, + "learning_rate": 2.4340779265496082e-06, + "loss": 0.7684387, + "num_input_tokens_seen": 159503880, + "step": 7442, + "time_per_iteration": 2.9435665607452393 + }, + { + "auxiliary_loss_clip": 0.01078599, + "auxiliary_loss_mlp": 0.0103262, + "balance_loss_clip": 1.02878451, + "balance_loss_mlp": 1.01892257, + "epoch": 0.44749736960769576, + "flos": 33181603125120.0, + "grad_norm": 2.745041256578012, + "language_loss": 0.74144918, + "learning_rate": 2.433697740261273e-06, + "loss": 0.76256138, + "num_input_tokens_seen": 159522980, + "step": 7443, + "time_per_iteration": 2.7147932052612305 + }, + { + "auxiliary_loss_clip": 0.01045884, + "auxiliary_loss_mlp": 0.01028161, + "balance_loss_clip": 1.02366662, + "balance_loss_mlp": 1.01462436, + "epoch": 0.4475574928603637, + "flos": 21071807602560.0, + "grad_norm": 1.6508076018128672, + "language_loss": 0.77446949, + "learning_rate": 2.4333175375270748e-06, + "loss": 0.79521, + "num_input_tokens_seen": 159543340, + "step": 7444, + "time_per_iteration": 2.590416431427002 + }, + { + "auxiliary_loss_clip": 0.01063857, + "auxiliary_loss_mlp": 0.01031968, + "balance_loss_clip": 1.0290966, + "balance_loss_mlp": 1.01951087, + "epoch": 0.4476176161130317, + "flos": 21862523646720.0, + "grad_norm": 3.8052909929074668, + "language_loss": 0.84986174, + "learning_rate": 2.4329373183614333e-06, + "loss": 0.87082005, + "num_input_tokens_seen": 159558210, + "step": 7445, + "time_per_iteration": 2.5576915740966797 + }, + { + "auxiliary_loss_clip": 0.01033611, + "auxiliary_loss_mlp": 0.0103435, + "balance_loss_clip": 1.02519941, + "balance_loss_mlp": 1.01953244, + "epoch": 0.4476777393656997, + "flos": 22528667324160.0, + "grad_norm": 1.8075939137414387, + "language_loss": 0.63696992, + "learning_rate": 2.432557082778765e-06, + "loss": 0.65764952, + "num_input_tokens_seen": 159577920, + "step": 7446, + "time_per_iteration": 2.7273776531219482 + }, + { + "auxiliary_loss_clip": 0.01004811, + "auxiliary_loss_mlp": 0.01001401, + "balance_loss_clip": 1.00606489, + "balance_loss_mlp": 0.99978018, + "epoch": 0.4477378626183677, + "flos": 49017133877760.0, + "grad_norm": 0.756497005199332, + "language_loss": 0.50187504, + "learning_rate": 2.4321768307934884e-06, + "loss": 0.52193713, + "num_input_tokens_seen": 159632295, + "step": 7447, + "time_per_iteration": 3.000143051147461 + }, + { + "auxiliary_loss_clip": 0.01013481, + "auxiliary_loss_mlp": 0.01002148, + "balance_loss_clip": 1.00503957, + "balance_loss_mlp": 1.00051486, + "epoch": 0.44779798587103564, + "flos": 56542179392640.0, + "grad_norm": 0.7538302829069639, + "language_loss": 0.59286129, + "learning_rate": 2.4317965624200235e-06, + "loss": 0.61301768, + "num_input_tokens_seen": 159698435, + "step": 7448, + "time_per_iteration": 3.194751024246216 + }, + { + "auxiliary_loss_clip": 0.01044608, + "auxiliary_loss_mlp": 0.01033987, + "balance_loss_clip": 1.02857888, + "balance_loss_mlp": 1.02223945, + "epoch": 0.4478581091237036, + "flos": 46498536040320.0, + "grad_norm": 1.95580951638485, + "language_loss": 0.59234023, + "learning_rate": 2.431416277672789e-06, + "loss": 0.61312622, + "num_input_tokens_seen": 159722150, + "step": 7449, + "time_per_iteration": 4.400540590286255 + }, + { + "auxiliary_loss_clip": 0.01044012, + "auxiliary_loss_mlp": 0.01028361, + "balance_loss_clip": 1.02536213, + "balance_loss_mlp": 1.0162971, + "epoch": 0.4479182323763716, + "flos": 20814363849600.0, + "grad_norm": 2.0572916948456124, + "language_loss": 0.79748434, + "learning_rate": 2.4310359765662065e-06, + "loss": 0.81820804, + "num_input_tokens_seen": 159740550, + "step": 7450, + "time_per_iteration": 4.471433401107788 + }, + { + "auxiliary_loss_clip": 0.01074241, + "auxiliary_loss_mlp": 0.01036603, + "balance_loss_clip": 1.028898, + "balance_loss_mlp": 1.02426505, + "epoch": 0.44797835562903954, + "flos": 14245979212800.0, + "grad_norm": 1.9605991969540766, + "language_loss": 0.792548, + "learning_rate": 2.430655659114697e-06, + "loss": 0.81365645, + "num_input_tokens_seen": 159758245, + "step": 7451, + "time_per_iteration": 2.725872039794922 + }, + { + "auxiliary_loss_clip": 0.00978244, + "auxiliary_loss_mlp": 0.01014963, + "balance_loss_clip": 1.00911641, + "balance_loss_mlp": 1.01324606, + "epoch": 0.4480384788817075, + "flos": 63534560169600.0, + "grad_norm": 0.8306779260671112, + "language_loss": 0.62794125, + "learning_rate": 2.430275325332681e-06, + "loss": 0.64787328, + "num_input_tokens_seen": 159826790, + "step": 7452, + "time_per_iteration": 3.4369144439697266 + }, + { + "auxiliary_loss_clip": 0.01076764, + "auxiliary_loss_mlp": 0.01038917, + "balance_loss_clip": 1.02893376, + "balance_loss_mlp": 1.02545261, + "epoch": 0.44809860213437547, + "flos": 21652626522240.0, + "grad_norm": 2.3490075354553652, + "language_loss": 0.6235888, + "learning_rate": 2.429894975234582e-06, + "loss": 0.64474559, + "num_input_tokens_seen": 159845805, + "step": 7453, + "time_per_iteration": 2.9876832962036133 + }, + { + "auxiliary_loss_clip": 0.00992193, + "auxiliary_loss_mlp": 0.0100565, + "balance_loss_clip": 1.00307441, + "balance_loss_mlp": 1.00392139, + "epoch": 0.44815872538704343, + "flos": 69190634246400.0, + "grad_norm": 0.7659808427915681, + "language_loss": 0.57037091, + "learning_rate": 2.4295146088348224e-06, + "loss": 0.59034932, + "num_input_tokens_seen": 159898860, + "step": 7454, + "time_per_iteration": 3.0590405464172363 + }, + { + "auxiliary_loss_clip": 0.01043618, + "auxiliary_loss_mlp": 0.010345, + "balance_loss_clip": 1.02363801, + "balance_loss_mlp": 1.02144706, + "epoch": 0.4482188486397114, + "flos": 12598289510400.0, + "grad_norm": 1.9897147479998007, + "language_loss": 0.74767542, + "learning_rate": 2.4291342261478255e-06, + "loss": 0.76845658, + "num_input_tokens_seen": 159911555, + "step": 7455, + "time_per_iteration": 2.593590259552002 + }, + { + "auxiliary_loss_clip": 0.01053986, + "auxiliary_loss_mlp": 0.01028493, + "balance_loss_clip": 1.02761722, + "balance_loss_mlp": 1.01676846, + "epoch": 0.44827897189237936, + "flos": 34058182631040.0, + "grad_norm": 1.8568798091400585, + "language_loss": 0.7598685, + "learning_rate": 2.428753827188016e-06, + "loss": 0.78069329, + "num_input_tokens_seen": 159931470, + "step": 7456, + "time_per_iteration": 2.6954972743988037 + }, + { + "auxiliary_loss_clip": 0.0107638, + "auxiliary_loss_mlp": 0.01035636, + "balance_loss_clip": 1.03096473, + "balance_loss_mlp": 1.0235306, + "epoch": 0.44833909514504733, + "flos": 25147416280320.0, + "grad_norm": 5.905359573555219, + "language_loss": 0.75966173, + "learning_rate": 2.428373411969818e-06, + "loss": 0.78078192, + "num_input_tokens_seen": 159946115, + "step": 7457, + "time_per_iteration": 2.5538432598114014 + }, + { + "auxiliary_loss_clip": 0.01056376, + "auxiliary_loss_mlp": 0.01032314, + "balance_loss_clip": 1.02536535, + "balance_loss_mlp": 1.01865304, + "epoch": 0.4483992183977153, + "flos": 16179984224640.0, + "grad_norm": 2.118343841221712, + "language_loss": 0.68273902, + "learning_rate": 2.4279929805076576e-06, + "loss": 0.70362592, + "num_input_tokens_seen": 159963915, + "step": 7458, + "time_per_iteration": 2.521760940551758 + }, + { + "auxiliary_loss_clip": 0.01042535, + "auxiliary_loss_mlp": 0.01034877, + "balance_loss_clip": 1.02801478, + "balance_loss_mlp": 1.0214119, + "epoch": 0.44845934165038326, + "flos": 17746048270080.0, + "grad_norm": 1.755520136554189, + "language_loss": 0.71933436, + "learning_rate": 2.427612532815961e-06, + "loss": 0.74010849, + "num_input_tokens_seen": 159982140, + "step": 7459, + "time_per_iteration": 4.263765811920166 + }, + { + "auxiliary_loss_clip": 0.01052285, + "auxiliary_loss_mlp": 0.01032528, + "balance_loss_clip": 1.02402425, + "balance_loss_mlp": 1.01955163, + "epoch": 0.4485194649030513, + "flos": 21835914647040.0, + "grad_norm": 1.6415066878801117, + "language_loss": 0.69757134, + "learning_rate": 2.427232068909154e-06, + "loss": 0.71841949, + "num_input_tokens_seen": 160002280, + "step": 7460, + "time_per_iteration": 2.587172746658325 + }, + { + "auxiliary_loss_clip": 0.01073223, + "auxiliary_loss_mlp": 0.01034437, + "balance_loss_clip": 1.02723241, + "balance_loss_mlp": 1.0215919, + "epoch": 0.44857958815571924, + "flos": 20084515401600.0, + "grad_norm": 1.9134487696070273, + "language_loss": 0.77278179, + "learning_rate": 2.4268515888016635e-06, + "loss": 0.79385841, + "num_input_tokens_seen": 160020260, + "step": 7461, + "time_per_iteration": 2.545840263366699 + }, + { + "auxiliary_loss_clip": 0.01076208, + "auxiliary_loss_mlp": 0.01034762, + "balance_loss_clip": 1.02831125, + "balance_loss_mlp": 1.02253127, + "epoch": 0.4486397114083872, + "flos": 27053519402880.0, + "grad_norm": 1.8703207693811768, + "language_loss": 0.6760816, + "learning_rate": 2.4264710925079184e-06, + "loss": 0.6971913, + "num_input_tokens_seen": 160040240, + "step": 7462, + "time_per_iteration": 2.61600399017334 + }, + { + "auxiliary_loss_clip": 0.01011312, + "auxiliary_loss_mlp": 0.0100193, + "balance_loss_clip": 1.00262594, + "balance_loss_mlp": 1.00034416, + "epoch": 0.4486998346610552, + "flos": 67321195931520.0, + "grad_norm": 0.7438212171248754, + "language_loss": 0.54420799, + "learning_rate": 2.4260905800423462e-06, + "loss": 0.56434041, + "num_input_tokens_seen": 160093865, + "step": 7463, + "time_per_iteration": 4.84618067741394 + }, + { + "auxiliary_loss_clip": 0.0106423, + "auxiliary_loss_mlp": 0.01032813, + "balance_loss_clip": 1.02820885, + "balance_loss_mlp": 1.02051079, + "epoch": 0.44875995791372314, + "flos": 27636816360960.0, + "grad_norm": 1.7947320640906663, + "language_loss": 0.757402, + "learning_rate": 2.4257100514193775e-06, + "loss": 0.77837247, + "num_input_tokens_seen": 160113590, + "step": 7464, + "time_per_iteration": 2.7168385982513428 + }, + { + "auxiliary_loss_clip": 0.01063028, + "auxiliary_loss_mlp": 0.01033485, + "balance_loss_clip": 1.02841449, + "balance_loss_mlp": 1.02189803, + "epoch": 0.4488200811663911, + "flos": 13005947940480.0, + "grad_norm": 2.0580578868033985, + "language_loss": 0.73879725, + "learning_rate": 2.425329506653441e-06, + "loss": 0.75976241, + "num_input_tokens_seen": 160131795, + "step": 7465, + "time_per_iteration": 2.5423340797424316 + }, + { + "auxiliary_loss_clip": 0.01057733, + "auxiliary_loss_mlp": 0.01040419, + "balance_loss_clip": 1.03034949, + "balance_loss_mlp": 1.02618551, + "epoch": 0.44888020441905907, + "flos": 27489977562240.0, + "grad_norm": 2.245763459992659, + "language_loss": 0.80795443, + "learning_rate": 2.424948945758966e-06, + "loss": 0.82893598, + "num_input_tokens_seen": 160150635, + "step": 7466, + "time_per_iteration": 2.6778388023376465 + }, + { + "auxiliary_loss_clip": 0.01055949, + "auxiliary_loss_mlp": 0.01039061, + "balance_loss_clip": 1.02948451, + "balance_loss_mlp": 1.02683592, + "epoch": 0.44894032767172704, + "flos": 18259678800000.0, + "grad_norm": 7.645583816025042, + "language_loss": 0.80039167, + "learning_rate": 2.4245683687503844e-06, + "loss": 0.82134181, + "num_input_tokens_seen": 160168615, + "step": 7467, + "time_per_iteration": 2.6215600967407227 + }, + { + "auxiliary_loss_clip": 0.0103259, + "auxiliary_loss_mlp": 0.0103105, + "balance_loss_clip": 1.02922785, + "balance_loss_mlp": 1.01930761, + "epoch": 0.449000450924395, + "flos": 21579835610880.0, + "grad_norm": 1.6435067094761855, + "language_loss": 0.74869204, + "learning_rate": 2.424187775642129e-06, + "loss": 0.76932842, + "num_input_tokens_seen": 160187295, + "step": 7468, + "time_per_iteration": 2.686131000518799 + }, + { + "auxiliary_loss_clip": 0.01042731, + "auxiliary_loss_mlp": 0.01028716, + "balance_loss_clip": 1.02513552, + "balance_loss_mlp": 1.0173018, + "epoch": 0.44906057417706297, + "flos": 17967904623360.0, + "grad_norm": 1.8096271118205116, + "language_loss": 0.70688283, + "learning_rate": 2.4238071664486297e-06, + "loss": 0.72759724, + "num_input_tokens_seen": 160205115, + "step": 7469, + "time_per_iteration": 2.603816270828247 + }, + { + "auxiliary_loss_clip": 0.01066391, + "auxiliary_loss_mlp": 0.01032993, + "balance_loss_clip": 1.02954721, + "balance_loss_mlp": 1.02002907, + "epoch": 0.44912069742973093, + "flos": 20047347803520.0, + "grad_norm": 1.7248927325395045, + "language_loss": 0.71819413, + "learning_rate": 2.4234265411843203e-06, + "loss": 0.73918796, + "num_input_tokens_seen": 160222580, + "step": 7470, + "time_per_iteration": 2.6423351764678955 + }, + { + "auxiliary_loss_clip": 0.01035713, + "auxiliary_loss_mlp": 0.01037391, + "balance_loss_clip": 1.02596939, + "balance_loss_mlp": 1.02384233, + "epoch": 0.4491808206823989, + "flos": 21033526682880.0, + "grad_norm": 1.8264904477816033, + "language_loss": 0.77280879, + "learning_rate": 2.423045899863634e-06, + "loss": 0.79353988, + "num_input_tokens_seen": 160241520, + "step": 7471, + "time_per_iteration": 2.6977856159210205 + }, + { + "auxiliary_loss_clip": 0.0107525, + "auxiliary_loss_mlp": 0.01033658, + "balance_loss_clip": 1.028759, + "balance_loss_mlp": 1.02138555, + "epoch": 0.44924094393506686, + "flos": 22967136645120.0, + "grad_norm": 1.5256393165380742, + "language_loss": 0.70048887, + "learning_rate": 2.4226652425010048e-06, + "loss": 0.72157794, + "num_input_tokens_seen": 160261815, + "step": 7472, + "time_per_iteration": 2.642672061920166 + }, + { + "auxiliary_loss_clip": 0.0100346, + "auxiliary_loss_mlp": 0.0100365, + "balance_loss_clip": 1.00400519, + "balance_loss_mlp": 1.00205219, + "epoch": 0.4493010671877349, + "flos": 59233467864960.0, + "grad_norm": 0.7391347241143033, + "language_loss": 0.61732161, + "learning_rate": 2.4222845691108676e-06, + "loss": 0.63739264, + "num_input_tokens_seen": 160317070, + "step": 7473, + "time_per_iteration": 3.174276828765869 + }, + { + "auxiliary_loss_clip": 0.01077226, + "auxiliary_loss_mlp": 0.00747648, + "balance_loss_clip": 1.02977777, + "balance_loss_mlp": 1.00024021, + "epoch": 0.44936119044040285, + "flos": 18004892653440.0, + "grad_norm": 4.948086574785505, + "language_loss": 0.77086163, + "learning_rate": 2.421903879707657e-06, + "loss": 0.78911042, + "num_input_tokens_seen": 160334980, + "step": 7474, + "time_per_iteration": 2.636488914489746 + }, + { + "auxiliary_loss_clip": 0.01026017, + "auxiliary_loss_mlp": 0.0103981, + "balance_loss_clip": 1.02531087, + "balance_loss_mlp": 1.02697086, + "epoch": 0.4494213136930708, + "flos": 21251827589760.0, + "grad_norm": 1.657658913390415, + "language_loss": 0.72127986, + "learning_rate": 2.4215231743058086e-06, + "loss": 0.74193811, + "num_input_tokens_seen": 160354500, + "step": 7475, + "time_per_iteration": 2.8003225326538086 + }, + { + "auxiliary_loss_clip": 0.0102665, + "auxiliary_loss_mlp": 0.01039044, + "balance_loss_clip": 1.02487051, + "balance_loss_mlp": 1.02590144, + "epoch": 0.4494814369457388, + "flos": 27418695022080.0, + "grad_norm": 1.8364505321678122, + "language_loss": 0.77464473, + "learning_rate": 2.4211424529197594e-06, + "loss": 0.79530168, + "num_input_tokens_seen": 160373650, + "step": 7476, + "time_per_iteration": 2.80062198638916 + }, + { + "auxiliary_loss_clip": 0.0106609, + "auxiliary_loss_mlp": 0.00747552, + "balance_loss_clip": 1.02726436, + "balance_loss_mlp": 1.00020599, + "epoch": 0.44954156019840674, + "flos": 22854053652480.0, + "grad_norm": 2.608770383255807, + "language_loss": 0.71855289, + "learning_rate": 2.4207617155639464e-06, + "loss": 0.73668927, + "num_input_tokens_seen": 160393430, + "step": 7477, + "time_per_iteration": 2.7131714820861816 + }, + { + "auxiliary_loss_clip": 0.01048498, + "auxiliary_loss_mlp": 0.01042533, + "balance_loss_clip": 1.02654409, + "balance_loss_mlp": 1.0283587, + "epoch": 0.4496016834510747, + "flos": 17201570935680.0, + "grad_norm": 2.2241236565732287, + "language_loss": 0.67339218, + "learning_rate": 2.4203809622528062e-06, + "loss": 0.69430244, + "num_input_tokens_seen": 160410545, + "step": 7478, + "time_per_iteration": 2.7611329555511475 + }, + { + "auxiliary_loss_clip": 0.01051169, + "auxiliary_loss_mlp": 0.01032708, + "balance_loss_clip": 1.02693951, + "balance_loss_mlp": 1.02130008, + "epoch": 0.4496618067037427, + "flos": 18916628595840.0, + "grad_norm": 2.2832410871443525, + "language_loss": 0.89376092, + "learning_rate": 2.420000193000779e-06, + "loss": 0.91459978, + "num_input_tokens_seen": 160428105, + "step": 7479, + "time_per_iteration": 2.7589526176452637 + }, + { + "auxiliary_loss_clip": 0.01016173, + "auxiliary_loss_mlp": 0.01039237, + "balance_loss_clip": 1.02907538, + "balance_loss_mlp": 1.02577782, + "epoch": 0.44972192995641064, + "flos": 21031659175680.0, + "grad_norm": 1.6684954931794247, + "language_loss": 0.75399625, + "learning_rate": 2.419619407822302e-06, + "loss": 0.77455038, + "num_input_tokens_seen": 160448815, + "step": 7480, + "time_per_iteration": 2.831512689590454 + }, + { + "auxiliary_loss_clip": 0.01037535, + "auxiliary_loss_mlp": 0.01035284, + "balance_loss_clip": 1.02494216, + "balance_loss_mlp": 1.02167666, + "epoch": 0.4497820532090786, + "flos": 20777088510720.0, + "grad_norm": 2.796295555048982, + "language_loss": 0.80110002, + "learning_rate": 2.419238606731815e-06, + "loss": 0.82182813, + "num_input_tokens_seen": 160465940, + "step": 7481, + "time_per_iteration": 2.822204351425171 + }, + { + "auxiliary_loss_clip": 0.01051039, + "auxiliary_loss_mlp": 0.01032529, + "balance_loss_clip": 1.0279094, + "balance_loss_mlp": 1.01986301, + "epoch": 0.44984217646174657, + "flos": 33802606385280.0, + "grad_norm": 1.6202950973373098, + "language_loss": 0.68828511, + "learning_rate": 2.418857789743758e-06, + "loss": 0.70912081, + "num_input_tokens_seen": 160486710, + "step": 7482, + "time_per_iteration": 2.904648780822754 + }, + { + "auxiliary_loss_clip": 0.01066905, + "auxiliary_loss_mlp": 0.01042083, + "balance_loss_clip": 1.03015327, + "balance_loss_mlp": 1.02958369, + "epoch": 0.44990229971441453, + "flos": 15518365660800.0, + "grad_norm": 2.12718274450336, + "language_loss": 0.84567869, + "learning_rate": 2.418476956872571e-06, + "loss": 0.8667686, + "num_input_tokens_seen": 160503405, + "step": 7483, + "time_per_iteration": 2.589237689971924 + }, + { + "auxiliary_loss_clip": 0.01039512, + "auxiliary_loss_mlp": 0.01041075, + "balance_loss_clip": 1.0258882, + "balance_loss_mlp": 1.02711606, + "epoch": 0.4499624229670825, + "flos": 29861913191040.0, + "grad_norm": 1.7418404700766026, + "language_loss": 0.80503488, + "learning_rate": 2.4180961081326967e-06, + "loss": 0.82584071, + "num_input_tokens_seen": 160525080, + "step": 7484, + "time_per_iteration": 2.738912343978882 + }, + { + "auxiliary_loss_clip": 0.01017707, + "auxiliary_loss_mlp": 0.01029702, + "balance_loss_clip": 1.02303398, + "balance_loss_mlp": 1.01566482, + "epoch": 0.45002254621975046, + "flos": 18513674847360.0, + "grad_norm": 2.970917225738591, + "language_loss": 0.7515527, + "learning_rate": 2.4177152435385754e-06, + "loss": 0.77202684, + "num_input_tokens_seen": 160540895, + "step": 7485, + "time_per_iteration": 2.618887186050415 + }, + { + "auxiliary_loss_clip": 0.00996447, + "auxiliary_loss_mlp": 0.0100398, + "balance_loss_clip": 1.00632083, + "balance_loss_mlp": 1.00247812, + "epoch": 0.4500826694724185, + "flos": 70420394229120.0, + "grad_norm": 0.7859777682386647, + "language_loss": 0.58703542, + "learning_rate": 2.4173343631046504e-06, + "loss": 0.60703969, + "num_input_tokens_seen": 160598270, + "step": 7486, + "time_per_iteration": 3.3120198249816895 + }, + { + "auxiliary_loss_clip": 0.01057387, + "auxiliary_loss_mlp": 0.01036815, + "balance_loss_clip": 1.02677751, + "balance_loss_mlp": 1.02249813, + "epoch": 0.45014279272508645, + "flos": 15778897983360.0, + "grad_norm": 2.400850456941051, + "language_loss": 0.83197844, + "learning_rate": 2.4169534668453654e-06, + "loss": 0.85292041, + "num_input_tokens_seen": 160614720, + "step": 7487, + "time_per_iteration": 2.727959632873535 + }, + { + "auxiliary_loss_clip": 0.0107368, + "auxiliary_loss_mlp": 0.01032986, + "balance_loss_clip": 1.0286808, + "balance_loss_mlp": 1.02018857, + "epoch": 0.4502029159777544, + "flos": 21799573061760.0, + "grad_norm": 1.5707247368054296, + "language_loss": 0.77091181, + "learning_rate": 2.4165725547751622e-06, + "loss": 0.79197848, + "num_input_tokens_seen": 160635170, + "step": 7488, + "time_per_iteration": 2.696215867996216 + }, + { + "auxiliary_loss_clip": 0.01072278, + "auxiliary_loss_mlp": 0.01039487, + "balance_loss_clip": 1.03170788, + "balance_loss_mlp": 1.02536702, + "epoch": 0.4502630392304224, + "flos": 28767966531840.0, + "grad_norm": 5.9735263646264505, + "language_loss": 0.72172052, + "learning_rate": 2.4161916269084858e-06, + "loss": 0.74283814, + "num_input_tokens_seen": 160654490, + "step": 7489, + "time_per_iteration": 2.8351516723632812 + }, + { + "auxiliary_loss_clip": 0.01052989, + "auxiliary_loss_mlp": 0.01036735, + "balance_loss_clip": 1.02935123, + "balance_loss_mlp": 1.02190542, + "epoch": 0.45032316248309034, + "flos": 15844182952320.0, + "grad_norm": 2.113509903042195, + "language_loss": 0.69585443, + "learning_rate": 2.4158106832597817e-06, + "loss": 0.71675169, + "num_input_tokens_seen": 160669400, + "step": 7490, + "time_per_iteration": 2.8300795555114746 + }, + { + "auxiliary_loss_clip": 0.01004859, + "auxiliary_loss_mlp": 0.01007581, + "balance_loss_clip": 1.01573634, + "balance_loss_mlp": 1.00606084, + "epoch": 0.4503832857357583, + "flos": 57853600945920.0, + "grad_norm": 0.7374442127761217, + "language_loss": 0.56689525, + "learning_rate": 2.415429723843495e-06, + "loss": 0.58701962, + "num_input_tokens_seen": 160733820, + "step": 7491, + "time_per_iteration": 3.2290639877319336 + }, + { + "auxiliary_loss_clip": 0.0106204, + "auxiliary_loss_mlp": 0.01031717, + "balance_loss_clip": 1.02811074, + "balance_loss_mlp": 1.01982617, + "epoch": 0.4504434089884263, + "flos": 23878082488320.0, + "grad_norm": 1.616121093895331, + "language_loss": 0.79065776, + "learning_rate": 2.4150487486740713e-06, + "loss": 0.81159532, + "num_input_tokens_seen": 160753175, + "step": 7492, + "time_per_iteration": 2.6104047298431396 + }, + { + "auxiliary_loss_clip": 0.01039765, + "auxiliary_loss_mlp": 0.00747556, + "balance_loss_clip": 1.0253371, + "balance_loss_mlp": 1.0002532, + "epoch": 0.45050353224109424, + "flos": 17785083375360.0, + "grad_norm": 4.269256150063326, + "language_loss": 0.92832196, + "learning_rate": 2.4146677577659573e-06, + "loss": 0.94619513, + "num_input_tokens_seen": 160768310, + "step": 7493, + "time_per_iteration": 2.738802909851074 + }, + { + "auxiliary_loss_clip": 0.01003536, + "auxiliary_loss_mlp": 0.01003572, + "balance_loss_clip": 1.00438905, + "balance_loss_mlp": 1.00200999, + "epoch": 0.4505636554937622, + "flos": 65063420703360.0, + "grad_norm": 0.7984494758126455, + "language_loss": 0.62866116, + "learning_rate": 2.4142867511336e-06, + "loss": 0.64873219, + "num_input_tokens_seen": 160827370, + "step": 7494, + "time_per_iteration": 3.282060384750366 + }, + { + "auxiliary_loss_clip": 0.01073334, + "auxiliary_loss_mlp": 0.01029088, + "balance_loss_clip": 1.02892292, + "balance_loss_mlp": 1.01721501, + "epoch": 0.45062377874643017, + "flos": 22200084685440.0, + "grad_norm": 1.4321014988016276, + "language_loss": 0.81890726, + "learning_rate": 2.4139057287914484e-06, + "loss": 0.83993143, + "num_input_tokens_seen": 160849140, + "step": 7495, + "time_per_iteration": 2.6749022006988525 + }, + { + "auxiliary_loss_clip": 0.01058025, + "auxiliary_loss_mlp": 0.01034984, + "balance_loss_clip": 1.0269382, + "balance_loss_mlp": 1.0206846, + "epoch": 0.45068390199909814, + "flos": 37670293186560.0, + "grad_norm": 1.5990778252571807, + "language_loss": 0.85387683, + "learning_rate": 2.41352469075395e-06, + "loss": 0.87480688, + "num_input_tokens_seen": 160871280, + "step": 7496, + "time_per_iteration": 4.451868772506714 + }, + { + "auxiliary_loss_clip": 0.01075782, + "auxiliary_loss_mlp": 0.01030886, + "balance_loss_clip": 1.02925754, + "balance_loss_mlp": 1.01800585, + "epoch": 0.4507440252517661, + "flos": 22302501338880.0, + "grad_norm": 2.1564573035377967, + "language_loss": 0.76235604, + "learning_rate": 2.4131436370355534e-06, + "loss": 0.78342277, + "num_input_tokens_seen": 160888625, + "step": 7497, + "time_per_iteration": 4.200602293014526 + }, + { + "auxiliary_loss_clip": 0.01042048, + "auxiliary_loss_mlp": 0.01032732, + "balance_loss_clip": 1.02492023, + "balance_loss_mlp": 1.02039373, + "epoch": 0.45080414850443407, + "flos": 13188374138880.0, + "grad_norm": 2.048175087872258, + "language_loss": 0.75164568, + "learning_rate": 2.4127625676507088e-06, + "loss": 0.77239347, + "num_input_tokens_seen": 160907040, + "step": 7498, + "time_per_iteration": 2.7997543811798096 + }, + { + "auxiliary_loss_clip": 0.01076048, + "auxiliary_loss_mlp": 0.01040511, + "balance_loss_clip": 1.02864003, + "balance_loss_mlp": 1.02713025, + "epoch": 0.4508642717571021, + "flos": 21944939402880.0, + "grad_norm": 2.608903841930559, + "language_loss": 0.70601737, + "learning_rate": 2.4123814826138663e-06, + "loss": 0.72718298, + "num_input_tokens_seen": 160927115, + "step": 7499, + "time_per_iteration": 2.6556484699249268 + }, + { + "auxiliary_loss_clip": 0.01038006, + "auxiliary_loss_mlp": 0.01037792, + "balance_loss_clip": 1.02950764, + "balance_loss_mlp": 1.02459598, + "epoch": 0.45092439500977005, + "flos": 23367468700800.0, + "grad_norm": 1.8088609992530305, + "language_loss": 0.76966447, + "learning_rate": 2.412000381939477e-06, + "loss": 0.79042244, + "num_input_tokens_seen": 160944405, + "step": 7500, + "time_per_iteration": 2.710827350616455 + }, + { + "auxiliary_loss_clip": 0.01035469, + "auxiliary_loss_mlp": 0.01031952, + "balance_loss_clip": 1.02834892, + "balance_loss_mlp": 1.01975071, + "epoch": 0.450984518262438, + "flos": 20772958446720.0, + "grad_norm": 1.8501420618001707, + "language_loss": 0.62286627, + "learning_rate": 2.411619265641992e-06, + "loss": 0.6435405, + "num_input_tokens_seen": 160961345, + "step": 7501, + "time_per_iteration": 2.719470977783203 + }, + { + "auxiliary_loss_clip": 0.01076656, + "auxiliary_loss_mlp": 0.01034597, + "balance_loss_clip": 1.0286516, + "balance_loss_mlp": 1.02131128, + "epoch": 0.451044641515106, + "flos": 17707372300800.0, + "grad_norm": 4.250923590733455, + "language_loss": 0.84519881, + "learning_rate": 2.411238133735863e-06, + "loss": 0.86631131, + "num_input_tokens_seen": 160977330, + "step": 7502, + "time_per_iteration": 2.48185396194458 + }, + { + "auxiliary_loss_clip": 0.01052217, + "auxiliary_loss_mlp": 0.01032779, + "balance_loss_clip": 1.02902079, + "balance_loss_mlp": 1.02088809, + "epoch": 0.45110476476777395, + "flos": 20594698225920.0, + "grad_norm": 1.410679374908939, + "language_loss": 0.79598689, + "learning_rate": 2.4108569862355418e-06, + "loss": 0.81683689, + "num_input_tokens_seen": 160997280, + "step": 7503, + "time_per_iteration": 2.590667963027954 + }, + { + "auxiliary_loss_clip": 0.01045548, + "auxiliary_loss_mlp": 0.01033779, + "balance_loss_clip": 1.0278461, + "balance_loss_mlp": 1.02148843, + "epoch": 0.4511648880204419, + "flos": 16034043265920.0, + "grad_norm": 2.0157077727741575, + "language_loss": 0.81152153, + "learning_rate": 2.410475823155484e-06, + "loss": 0.83231479, + "num_input_tokens_seen": 161014235, + "step": 7504, + "time_per_iteration": 2.6570475101470947 + }, + { + "auxiliary_loss_clip": 0.01029243, + "auxiliary_loss_mlp": 0.01033046, + "balance_loss_clip": 1.02399111, + "balance_loss_mlp": 1.02160215, + "epoch": 0.4512250112731099, + "flos": 23978811202560.0, + "grad_norm": 1.6180883633130723, + "language_loss": 0.64008522, + "learning_rate": 2.4100946445101405e-06, + "loss": 0.66070813, + "num_input_tokens_seen": 161032360, + "step": 7505, + "time_per_iteration": 2.65608811378479 + }, + { + "auxiliary_loss_clip": 0.00993285, + "auxiliary_loss_mlp": 0.01003416, + "balance_loss_clip": 1.01342309, + "balance_loss_mlp": 1.00162756, + "epoch": 0.45128513452577784, + "flos": 71462308037760.0, + "grad_norm": 0.8343543367124261, + "language_loss": 0.58862746, + "learning_rate": 2.409713450313968e-06, + "loss": 0.60859442, + "num_input_tokens_seen": 161091360, + "step": 7506, + "time_per_iteration": 4.905486822128296 + }, + { + "auxiliary_loss_clip": 0.0102618, + "auxiliary_loss_mlp": 0.01033668, + "balance_loss_clip": 1.02569747, + "balance_loss_mlp": 1.0213778, + "epoch": 0.4513452577784458, + "flos": 22090844448000.0, + "grad_norm": 3.514986126331221, + "language_loss": 0.79277748, + "learning_rate": 2.40933224058142e-06, + "loss": 0.81337595, + "num_input_tokens_seen": 161110825, + "step": 7507, + "time_per_iteration": 2.7218258380889893 + }, + { + "auxiliary_loss_clip": 0.0104329, + "auxiliary_loss_mlp": 0.01033585, + "balance_loss_clip": 1.02837455, + "balance_loss_mlp": 1.02004266, + "epoch": 0.4514053810311138, + "flos": 24276403382400.0, + "grad_norm": 1.5185893480912136, + "language_loss": 0.73825079, + "learning_rate": 2.4089510153269526e-06, + "loss": 0.75901949, + "num_input_tokens_seen": 161130685, + "step": 7508, + "time_per_iteration": 2.7647175788879395 + }, + { + "auxiliary_loss_clip": 0.01064569, + "auxiliary_loss_mlp": 0.01035754, + "balance_loss_clip": 1.03081846, + "balance_loss_mlp": 1.02341568, + "epoch": 0.45146550428378174, + "flos": 17886781756800.0, + "grad_norm": 1.9366133429323387, + "language_loss": 0.7915675, + "learning_rate": 2.4085697745650217e-06, + "loss": 0.81257069, + "num_input_tokens_seen": 161147555, + "step": 7509, + "time_per_iteration": 2.615192174911499 + }, + { + "auxiliary_loss_clip": 0.01076977, + "auxiliary_loss_mlp": 0.01032382, + "balance_loss_clip": 1.03186655, + "balance_loss_mlp": 1.02031815, + "epoch": 0.4515256275364497, + "flos": 24243437675520.0, + "grad_norm": 2.9460593056421285, + "language_loss": 0.73018044, + "learning_rate": 2.4081885183100837e-06, + "loss": 0.75127405, + "num_input_tokens_seen": 161166255, + "step": 7510, + "time_per_iteration": 4.214899778366089 + }, + { + "auxiliary_loss_clip": 0.01074552, + "auxiliary_loss_mlp": 0.01032756, + "balance_loss_clip": 1.02773118, + "balance_loss_mlp": 1.0197866, + "epoch": 0.45158575078911767, + "flos": 20631039811200.0, + "grad_norm": 2.03234353595904, + "language_loss": 0.7701683, + "learning_rate": 2.4078072465765964e-06, + "loss": 0.79124141, + "num_input_tokens_seen": 161184720, + "step": 7511, + "time_per_iteration": 2.7244997024536133 + }, + { + "auxiliary_loss_clip": 0.01066206, + "auxiliary_loss_mlp": 0.01033867, + "balance_loss_clip": 1.02948105, + "balance_loss_mlp": 1.02068865, + "epoch": 0.45164587404178563, + "flos": 23327751237120.0, + "grad_norm": 1.8083596000724955, + "language_loss": 0.78637397, + "learning_rate": 2.4074259593790174e-06, + "loss": 0.80737472, + "num_input_tokens_seen": 161204360, + "step": 7512, + "time_per_iteration": 2.656543731689453 + }, + { + "auxiliary_loss_clip": 0.01042849, + "auxiliary_loss_mlp": 0.01035026, + "balance_loss_clip": 1.0261234, + "balance_loss_mlp": 1.02115655, + "epoch": 0.45170599729445365, + "flos": 23805973935360.0, + "grad_norm": 3.9090399704818766, + "language_loss": 0.87452614, + "learning_rate": 2.4070446567318053e-06, + "loss": 0.89530492, + "num_input_tokens_seen": 161223575, + "step": 7513, + "time_per_iteration": 2.8006157875061035 + }, + { + "auxiliary_loss_clip": 0.01058833, + "auxiliary_loss_mlp": 0.01028345, + "balance_loss_clip": 1.02743101, + "balance_loss_mlp": 1.01708531, + "epoch": 0.4517661205471216, + "flos": 23512942782720.0, + "grad_norm": 1.592053646440933, + "language_loss": 0.67525393, + "learning_rate": 2.406663338649419e-06, + "loss": 0.69612575, + "num_input_tokens_seen": 161243805, + "step": 7514, + "time_per_iteration": 2.664445161819458 + }, + { + "auxiliary_loss_clip": 0.01066428, + "auxiliary_loss_mlp": 0.01033339, + "balance_loss_clip": 1.0298835, + "balance_loss_mlp": 1.01940894, + "epoch": 0.4518262437997896, + "flos": 23513948363520.0, + "grad_norm": 2.7585860974108996, + "language_loss": 0.69415641, + "learning_rate": 2.406282005146318e-06, + "loss": 0.71515405, + "num_input_tokens_seen": 161261450, + "step": 7515, + "time_per_iteration": 2.7679271697998047 + }, + { + "auxiliary_loss_clip": 0.01059485, + "auxiliary_loss_mlp": 0.0103416, + "balance_loss_clip": 1.02717006, + "balance_loss_mlp": 1.02039719, + "epoch": 0.45188636705245755, + "flos": 14568061489920.0, + "grad_norm": 2.3484470817842675, + "language_loss": 0.81541908, + "learning_rate": 2.405900656236963e-06, + "loss": 0.83635557, + "num_input_tokens_seen": 161276965, + "step": 7516, + "time_per_iteration": 2.651179075241089 + }, + { + "auxiliary_loss_clip": 0.0107259, + "auxiliary_loss_mlp": 0.01031849, + "balance_loss_clip": 1.02880263, + "balance_loss_mlp": 1.01930201, + "epoch": 0.4519464903051255, + "flos": 19901550499200.0, + "grad_norm": 1.8607727001104866, + "language_loss": 0.65583789, + "learning_rate": 2.4055192919358137e-06, + "loss": 0.67688233, + "num_input_tokens_seen": 161295375, + "step": 7517, + "time_per_iteration": 2.5823118686676025 + }, + { + "auxiliary_loss_clip": 0.01038124, + "auxiliary_loss_mlp": 0.01027946, + "balance_loss_clip": 1.02551329, + "balance_loss_mlp": 1.01641238, + "epoch": 0.4520066135577935, + "flos": 18844376388480.0, + "grad_norm": 1.8281474852799966, + "language_loss": 0.62752604, + "learning_rate": 2.405137912257333e-06, + "loss": 0.64818674, + "num_input_tokens_seen": 161313010, + "step": 7518, + "time_per_iteration": 2.642380475997925 + }, + { + "auxiliary_loss_clip": 0.01064504, + "auxiliary_loss_mlp": 0.01033523, + "balance_loss_clip": 1.02985525, + "balance_loss_mlp": 1.02201319, + "epoch": 0.45206673681046144, + "flos": 48214419713280.0, + "grad_norm": 1.3943494922316344, + "language_loss": 0.59164232, + "learning_rate": 2.404756517215982e-06, + "loss": 0.6126225, + "num_input_tokens_seen": 161336690, + "step": 7519, + "time_per_iteration": 2.776918888092041 + }, + { + "auxiliary_loss_clip": 0.01066775, + "auxiliary_loss_mlp": 0.01037313, + "balance_loss_clip": 1.0308125, + "balance_loss_mlp": 1.02517748, + "epoch": 0.4521268600631294, + "flos": 23842171866240.0, + "grad_norm": 1.7218294669813419, + "language_loss": 0.72494107, + "learning_rate": 2.404375106826223e-06, + "loss": 0.74598193, + "num_input_tokens_seen": 161357845, + "step": 7520, + "time_per_iteration": 2.595574378967285 + }, + { + "auxiliary_loss_clip": 0.01053079, + "auxiliary_loss_mlp": 0.01034656, + "balance_loss_clip": 1.0278362, + "balance_loss_mlp": 1.02291381, + "epoch": 0.4521869833157974, + "flos": 18843622202880.0, + "grad_norm": 2.099852642920432, + "language_loss": 0.75984424, + "learning_rate": 2.4039936811025194e-06, + "loss": 0.78072155, + "num_input_tokens_seen": 161375160, + "step": 7521, + "time_per_iteration": 2.6455538272857666 + }, + { + "auxiliary_loss_clip": 0.01061106, + "auxiliary_loss_mlp": 0.01034465, + "balance_loss_clip": 1.03229737, + "balance_loss_mlp": 1.02142894, + "epoch": 0.45224710656846534, + "flos": 19788072456960.0, + "grad_norm": 2.0406811278490986, + "language_loss": 0.67306769, + "learning_rate": 2.4036122400593343e-06, + "loss": 0.69402337, + "num_input_tokens_seen": 161393690, + "step": 7522, + "time_per_iteration": 2.5808534622192383 + }, + { + "auxiliary_loss_clip": 0.01061632, + "auxiliary_loss_mlp": 0.01033779, + "balance_loss_clip": 1.02686048, + "balance_loss_mlp": 1.02191794, + "epoch": 0.4523072298211333, + "flos": 28256131681920.0, + "grad_norm": 1.4295399309994348, + "language_loss": 0.60806745, + "learning_rate": 2.403230783711134e-06, + "loss": 0.62902153, + "num_input_tokens_seen": 161415015, + "step": 7523, + "time_per_iteration": 2.667187213897705 + }, + { + "auxiliary_loss_clip": 0.01067626, + "auxiliary_loss_mlp": 0.01040586, + "balance_loss_clip": 1.02838469, + "balance_loss_mlp": 1.02671647, + "epoch": 0.45236735307380127, + "flos": 11181039511680.0, + "grad_norm": 2.001638132439229, + "language_loss": 0.78329039, + "learning_rate": 2.4028493120723813e-06, + "loss": 0.80437249, + "num_input_tokens_seen": 161432940, + "step": 7524, + "time_per_iteration": 2.6645610332489014 + }, + { + "auxiliary_loss_clip": 0.01036571, + "auxiliary_loss_mlp": 0.01035162, + "balance_loss_clip": 1.02979088, + "balance_loss_mlp": 1.02225113, + "epoch": 0.45242747632646924, + "flos": 22601386408320.0, + "grad_norm": 1.6190434602220654, + "language_loss": 0.63901305, + "learning_rate": 2.4024678251575417e-06, + "loss": 0.65973037, + "num_input_tokens_seen": 161452215, + "step": 7525, + "time_per_iteration": 2.705787181854248 + }, + { + "auxiliary_loss_clip": 0.01065483, + "auxiliary_loss_mlp": 0.01033023, + "balance_loss_clip": 1.02978015, + "balance_loss_mlp": 1.02149594, + "epoch": 0.45248759957913726, + "flos": 18256267008000.0, + "grad_norm": 1.5939664457905336, + "language_loss": 0.78698653, + "learning_rate": 2.402086322981083e-06, + "loss": 0.80797154, + "num_input_tokens_seen": 161469520, + "step": 7526, + "time_per_iteration": 2.593705177307129 + }, + { + "auxiliary_loss_clip": 0.01045792, + "auxiliary_loss_mlp": 0.01028381, + "balance_loss_clip": 1.02611232, + "balance_loss_mlp": 1.01635289, + "epoch": 0.4525477228318052, + "flos": 22450094323200.0, + "grad_norm": 1.7431277227011042, + "language_loss": 0.81303251, + "learning_rate": 2.40170480555747e-06, + "loss": 0.83377427, + "num_input_tokens_seen": 161487335, + "step": 7527, + "time_per_iteration": 2.692171812057495 + }, + { + "auxiliary_loss_clip": 0.01039756, + "auxiliary_loss_mlp": 0.01028277, + "balance_loss_clip": 1.02851915, + "balance_loss_mlp": 1.01595092, + "epoch": 0.4526078460844732, + "flos": 29644869260160.0, + "grad_norm": 1.7463454832943508, + "language_loss": 0.65472269, + "learning_rate": 2.4013232729011706e-06, + "loss": 0.675403, + "num_input_tokens_seen": 161510095, + "step": 7528, + "time_per_iteration": 2.759678363800049 + }, + { + "auxiliary_loss_clip": 0.01044844, + "auxiliary_loss_mlp": 0.01028885, + "balance_loss_clip": 1.02378988, + "balance_loss_mlp": 1.01691604, + "epoch": 0.45266796933714115, + "flos": 23039747988480.0, + "grad_norm": 2.163792894941088, + "language_loss": 0.75207913, + "learning_rate": 2.4009417250266525e-06, + "loss": 0.77281642, + "num_input_tokens_seen": 161528725, + "step": 7529, + "time_per_iteration": 2.6490402221679688 + }, + { + "auxiliary_loss_clip": 0.01072143, + "auxiliary_loss_mlp": 0.01033201, + "balance_loss_clip": 1.02766442, + "balance_loss_mlp": 1.02139902, + "epoch": 0.4527280925898091, + "flos": 14428405411200.0, + "grad_norm": 1.9788423044916508, + "language_loss": 0.72708666, + "learning_rate": 2.400560161948384e-06, + "loss": 0.74814004, + "num_input_tokens_seen": 161547195, + "step": 7530, + "time_per_iteration": 2.578174352645874 + }, + { + "auxiliary_loss_clip": 0.01040666, + "auxiliary_loss_mlp": 0.01031275, + "balance_loss_clip": 1.02552688, + "balance_loss_mlp": 1.01975369, + "epoch": 0.4527882158424771, + "flos": 22925515760640.0, + "grad_norm": 1.546812236705516, + "language_loss": 0.76101387, + "learning_rate": 2.400178583680834e-06, + "loss": 0.78173321, + "num_input_tokens_seen": 161565565, + "step": 7531, + "time_per_iteration": 2.677504062652588 + }, + { + "auxiliary_loss_clip": 0.01069038, + "auxiliary_loss_mlp": 0.01034245, + "balance_loss_clip": 1.02608848, + "balance_loss_mlp": 1.02172256, + "epoch": 0.45284833909514505, + "flos": 25555326105600.0, + "grad_norm": 1.5339821519742158, + "language_loss": 0.67025363, + "learning_rate": 2.3997969902384717e-06, + "loss": 0.69128644, + "num_input_tokens_seen": 161586630, + "step": 7532, + "time_per_iteration": 2.577389717102051 + }, + { + "auxiliary_loss_clip": 0.01063959, + "auxiliary_loss_mlp": 0.01038056, + "balance_loss_clip": 1.02977169, + "balance_loss_mlp": 1.02638531, + "epoch": 0.452908462347813, + "flos": 18150007599360.0, + "grad_norm": 2.0451012909128763, + "language_loss": 0.78688204, + "learning_rate": 2.399415381635768e-06, + "loss": 0.80790216, + "num_input_tokens_seen": 161603815, + "step": 7533, + "time_per_iteration": 2.6281955242156982 + }, + { + "auxiliary_loss_clip": 0.01046569, + "auxiliary_loss_mlp": 0.01033967, + "balance_loss_clip": 1.02645099, + "balance_loss_mlp": 1.02050185, + "epoch": 0.452968585600481, + "flos": 19062749122560.0, + "grad_norm": 1.7670090260802371, + "language_loss": 0.83167338, + "learning_rate": 2.3990337578871927e-06, + "loss": 0.8524788, + "num_input_tokens_seen": 161622900, + "step": 7534, + "time_per_iteration": 2.6479544639587402 + }, + { + "auxiliary_loss_clip": 0.01058421, + "auxiliary_loss_mlp": 0.01038053, + "balance_loss_clip": 1.03149474, + "balance_loss_mlp": 1.02389717, + "epoch": 0.45302870885314894, + "flos": 22051737515520.0, + "grad_norm": 2.2125460632666503, + "language_loss": 0.7630713, + "learning_rate": 2.3986521190072176e-06, + "loss": 0.78403604, + "num_input_tokens_seen": 161641700, + "step": 7535, + "time_per_iteration": 2.6897406578063965 + }, + { + "auxiliary_loss_clip": 0.01043503, + "auxiliary_loss_mlp": 0.01034036, + "balance_loss_clip": 1.03107417, + "balance_loss_mlp": 1.02259147, + "epoch": 0.4530888321058169, + "flos": 20376217751040.0, + "grad_norm": 1.5478910151094902, + "language_loss": 0.80297494, + "learning_rate": 2.3982704650103138e-06, + "loss": 0.82375026, + "num_input_tokens_seen": 161661955, + "step": 7536, + "time_per_iteration": 2.707746982574463 + }, + { + "auxiliary_loss_clip": 0.0104937, + "auxiliary_loss_mlp": 0.01033228, + "balance_loss_clip": 1.02752662, + "balance_loss_mlp": 1.02029371, + "epoch": 0.4531489553584849, + "flos": 14830425406080.0, + "grad_norm": 1.8367713303086208, + "language_loss": 0.75862557, + "learning_rate": 2.3978887959109544e-06, + "loss": 0.77945155, + "num_input_tokens_seen": 161679245, + "step": 7537, + "time_per_iteration": 2.7294962406158447 + }, + { + "auxiliary_loss_clip": 0.0106227, + "auxiliary_loss_mlp": 0.01032695, + "balance_loss_clip": 1.02758384, + "balance_loss_mlp": 1.02103615, + "epoch": 0.45320907861115284, + "flos": 21944975316480.0, + "grad_norm": 2.1613430671734344, + "language_loss": 0.75794083, + "learning_rate": 2.3975071117236118e-06, + "loss": 0.77889049, + "num_input_tokens_seen": 161698795, + "step": 7538, + "time_per_iteration": 2.671178102493286 + }, + { + "auxiliary_loss_clip": 0.01006246, + "auxiliary_loss_mlp": 0.01004957, + "balance_loss_clip": 1.00640583, + "balance_loss_mlp": 1.00344908, + "epoch": 0.45326920186382086, + "flos": 66251455038720.0, + "grad_norm": 0.8752972314485048, + "language_loss": 0.62340742, + "learning_rate": 2.3971254124627593e-06, + "loss": 0.64351946, + "num_input_tokens_seen": 161761980, + "step": 7539, + "time_per_iteration": 3.337097406387329 + }, + { + "auxiliary_loss_clip": 0.01072702, + "auxiliary_loss_mlp": 0.0103921, + "balance_loss_clip": 1.02831948, + "balance_loss_mlp": 1.02736068, + "epoch": 0.4533293251164888, + "flos": 14684233052160.0, + "grad_norm": 1.6339642191339632, + "language_loss": 0.65760696, + "learning_rate": 2.396743698142872e-06, + "loss": 0.67872608, + "num_input_tokens_seen": 161779455, + "step": 7540, + "time_per_iteration": 2.8556056022644043 + }, + { + "auxiliary_loss_clip": 0.01052303, + "auxiliary_loss_mlp": 0.01039713, + "balance_loss_clip": 1.02776933, + "balance_loss_mlp": 1.0257771, + "epoch": 0.4533894483691568, + "flos": 22601206840320.0, + "grad_norm": 3.227289548373663, + "language_loss": 0.84981239, + "learning_rate": 2.396361968778424e-06, + "loss": 0.87073255, + "num_input_tokens_seen": 161798980, + "step": 7541, + "time_per_iteration": 2.710522174835205 + }, + { + "auxiliary_loss_clip": 0.01052672, + "auxiliary_loss_mlp": 0.01032839, + "balance_loss_clip": 1.02714097, + "balance_loss_mlp": 1.0210135, + "epoch": 0.45344957162182475, + "flos": 34751617666560.0, + "grad_norm": 1.7630984212548817, + "language_loss": 0.76838183, + "learning_rate": 2.395980224383889e-06, + "loss": 0.7892369, + "num_input_tokens_seen": 161819745, + "step": 7542, + "time_per_iteration": 2.7565629482269287 + }, + { + "auxiliary_loss_clip": 0.01052877, + "auxiliary_loss_mlp": 0.01026185, + "balance_loss_clip": 1.02722096, + "balance_loss_mlp": 1.01383471, + "epoch": 0.4535096948744927, + "flos": 23550218121600.0, + "grad_norm": 1.487958279675454, + "language_loss": 0.80345142, + "learning_rate": 2.395598464973746e-06, + "loss": 0.824242, + "num_input_tokens_seen": 161838575, + "step": 7543, + "time_per_iteration": 4.225313425064087 + }, + { + "auxiliary_loss_clip": 0.01060247, + "auxiliary_loss_mlp": 0.0074756, + "balance_loss_clip": 1.02723324, + "balance_loss_mlp": 1.00026977, + "epoch": 0.4535698181271607, + "flos": 25557552748800.0, + "grad_norm": 1.7393130842574605, + "language_loss": 0.7617622, + "learning_rate": 2.395216690562469e-06, + "loss": 0.77984023, + "num_input_tokens_seen": 161858590, + "step": 7544, + "time_per_iteration": 2.6945629119873047 + }, + { + "auxiliary_loss_clip": 0.01048354, + "auxiliary_loss_mlp": 0.01036338, + "balance_loss_clip": 1.03050351, + "balance_loss_mlp": 1.02427959, + "epoch": 0.45362994137982865, + "flos": 24864117713280.0, + "grad_norm": 1.7283113687601939, + "language_loss": 0.74934292, + "learning_rate": 2.3948349011645355e-06, + "loss": 0.77018988, + "num_input_tokens_seen": 161878390, + "step": 7545, + "time_per_iteration": 4.272920846939087 + }, + { + "auxiliary_loss_clip": 0.01053918, + "auxiliary_loss_mlp": 0.01031701, + "balance_loss_clip": 1.02828312, + "balance_loss_mlp": 1.01929152, + "epoch": 0.4536900646324966, + "flos": 30806794408320.0, + "grad_norm": 1.6349894955929165, + "language_loss": 0.72078943, + "learning_rate": 2.394453096794423e-06, + "loss": 0.74164563, + "num_input_tokens_seen": 161898610, + "step": 7546, + "time_per_iteration": 2.81311297416687 + }, + { + "auxiliary_loss_clip": 0.01054919, + "auxiliary_loss_mlp": 0.01029307, + "balance_loss_clip": 1.02792084, + "balance_loss_mlp": 1.01576471, + "epoch": 0.4537501878851646, + "flos": 23404313076480.0, + "grad_norm": 1.5632734371414916, + "language_loss": 0.75663602, + "learning_rate": 2.394071277466609e-06, + "loss": 0.77747828, + "num_input_tokens_seen": 161918210, + "step": 7547, + "time_per_iteration": 2.6561098098754883 + }, + { + "auxiliary_loss_clip": 0.01064822, + "auxiliary_loss_mlp": 0.01031011, + "balance_loss_clip": 1.02777791, + "balance_loss_mlp": 1.01802909, + "epoch": 0.45381031113783254, + "flos": 18149289327360.0, + "grad_norm": 2.194361427906915, + "language_loss": 0.69612247, + "learning_rate": 2.393689443195573e-06, + "loss": 0.71708077, + "num_input_tokens_seen": 161936950, + "step": 7548, + "time_per_iteration": 2.8680131435394287 + }, + { + "auxiliary_loss_clip": 0.01073279, + "auxiliary_loss_mlp": 0.0103596, + "balance_loss_clip": 1.02721131, + "balance_loss_mlp": 1.02370489, + "epoch": 0.4538704343905005, + "flos": 25336666062720.0, + "grad_norm": 1.9317986913608616, + "language_loss": 0.72298658, + "learning_rate": 2.393307593995794e-06, + "loss": 0.74407899, + "num_input_tokens_seen": 161955550, + "step": 7549, + "time_per_iteration": 2.6516835689544678 + }, + { + "auxiliary_loss_clip": 0.01038864, + "auxiliary_loss_mlp": 0.01027776, + "balance_loss_clip": 1.02475011, + "balance_loss_mlp": 1.01673746, + "epoch": 0.4539305576431685, + "flos": 28731445378560.0, + "grad_norm": 1.4942852145944734, + "language_loss": 0.65198427, + "learning_rate": 2.392925729881751e-06, + "loss": 0.67265069, + "num_input_tokens_seen": 161976760, + "step": 7550, + "time_per_iteration": 2.7866156101226807 + }, + { + "auxiliary_loss_clip": 0.01066055, + "auxiliary_loss_mlp": 0.01036928, + "balance_loss_clip": 1.03206575, + "balance_loss_mlp": 1.02456629, + "epoch": 0.45399068089583644, + "flos": 22492397566080.0, + "grad_norm": 1.6805756778682557, + "language_loss": 0.68582845, + "learning_rate": 2.3925438508679263e-06, + "loss": 0.70685828, + "num_input_tokens_seen": 161996120, + "step": 7551, + "time_per_iteration": 2.6184465885162354 + }, + { + "auxiliary_loss_clip": 0.01057885, + "auxiliary_loss_mlp": 0.01031909, + "balance_loss_clip": 1.02436495, + "balance_loss_mlp": 1.01911819, + "epoch": 0.45405080414850446, + "flos": 12893403651840.0, + "grad_norm": 1.9405543876872873, + "language_loss": 0.79273868, + "learning_rate": 2.392161956968798e-06, + "loss": 0.81363666, + "num_input_tokens_seen": 162011125, + "step": 7552, + "time_per_iteration": 2.6086184978485107 + }, + { + "auxiliary_loss_clip": 0.01006144, + "auxiliary_loss_mlp": 0.01002286, + "balance_loss_clip": 1.00748038, + "balance_loss_mlp": 1.00066495, + "epoch": 0.4541109274011724, + "flos": 59766919724160.0, + "grad_norm": 0.8197770220544318, + "language_loss": 0.57822466, + "learning_rate": 2.39178004819885e-06, + "loss": 0.59830898, + "num_input_tokens_seen": 162068705, + "step": 7553, + "time_per_iteration": 4.711807489395142 + }, + { + "auxiliary_loss_clip": 0.01012915, + "auxiliary_loss_mlp": 0.01032496, + "balance_loss_clip": 1.02541018, + "balance_loss_mlp": 1.02177882, + "epoch": 0.4541710506538404, + "flos": 28511743841280.0, + "grad_norm": 1.3654732019715952, + "language_loss": 0.76625896, + "learning_rate": 2.3913981245725626e-06, + "loss": 0.78671306, + "num_input_tokens_seen": 162089655, + "step": 7554, + "time_per_iteration": 2.7449536323547363 + }, + { + "auxiliary_loss_clip": 0.01056011, + "auxiliary_loss_mlp": 0.0103279, + "balance_loss_clip": 1.02846622, + "balance_loss_mlp": 1.01927781, + "epoch": 0.45423117390650836, + "flos": 17675591742720.0, + "grad_norm": 2.9967074428788876, + "language_loss": 0.76712853, + "learning_rate": 2.3910161861044194e-06, + "loss": 0.78801656, + "num_input_tokens_seen": 162108465, + "step": 7555, + "time_per_iteration": 2.610629081726074 + }, + { + "auxiliary_loss_clip": 0.01013129, + "auxiliary_loss_mlp": 0.01029896, + "balance_loss_clip": 1.02852869, + "balance_loss_mlp": 1.01832032, + "epoch": 0.4542912971591763, + "flos": 28072556248320.0, + "grad_norm": 1.3877620389418772, + "language_loss": 0.72670513, + "learning_rate": 2.390634232808903e-06, + "loss": 0.7471354, + "num_input_tokens_seen": 162129910, + "step": 7556, + "time_per_iteration": 2.816094398498535 + }, + { + "auxiliary_loss_clip": 0.01076617, + "auxiliary_loss_mlp": 0.01032911, + "balance_loss_clip": 1.0284009, + "balance_loss_mlp": 1.02039385, + "epoch": 0.4543514204118443, + "flos": 22671771108480.0, + "grad_norm": 3.625129490947443, + "language_loss": 0.63203281, + "learning_rate": 2.3902522647004982e-06, + "loss": 0.65312809, + "num_input_tokens_seen": 162148840, + "step": 7557, + "time_per_iteration": 4.293301343917847 + }, + { + "auxiliary_loss_clip": 0.00994959, + "auxiliary_loss_mlp": 0.01001619, + "balance_loss_clip": 1.00492394, + "balance_loss_mlp": 1.00002146, + "epoch": 0.45441154366451225, + "flos": 58216549921920.0, + "grad_norm": 0.682333223498442, + "language_loss": 0.57643104, + "learning_rate": 2.3898702817936875e-06, + "loss": 0.5963968, + "num_input_tokens_seen": 162208500, + "step": 7558, + "time_per_iteration": 3.0922203063964844 + }, + { + "auxiliary_loss_clip": 0.01064658, + "auxiliary_loss_mlp": 0.01032887, + "balance_loss_clip": 1.02809477, + "balance_loss_mlp": 1.01964247, + "epoch": 0.4544716669171802, + "flos": 16764286763520.0, + "grad_norm": 3.1286266256195967, + "language_loss": 0.56434369, + "learning_rate": 2.3894882841029573e-06, + "loss": 0.58531916, + "num_input_tokens_seen": 162224650, + "step": 7559, + "time_per_iteration": 2.582467555999756 + }, + { + "auxiliary_loss_clip": 0.01065779, + "auxiliary_loss_mlp": 0.0074745, + "balance_loss_clip": 1.02967477, + "balance_loss_mlp": 1.00027382, + "epoch": 0.4545317901698482, + "flos": 15925233991680.0, + "grad_norm": 1.7424145447639183, + "language_loss": 0.7166456, + "learning_rate": 2.389106271642792e-06, + "loss": 0.73477793, + "num_input_tokens_seen": 162242930, + "step": 7560, + "time_per_iteration": 2.6123132705688477 + }, + { + "auxiliary_loss_clip": 0.00990181, + "auxiliary_loss_mlp": 0.01032428, + "balance_loss_clip": 1.02218032, + "balance_loss_mlp": 1.01926768, + "epoch": 0.45459191342251615, + "flos": 17639752947840.0, + "grad_norm": 2.291876071704707, + "language_loss": 0.68889976, + "learning_rate": 2.3887242444276775e-06, + "loss": 0.70912576, + "num_input_tokens_seen": 162261455, + "step": 7561, + "time_per_iteration": 2.754012107849121 + }, + { + "auxiliary_loss_clip": 0.01050391, + "auxiliary_loss_mlp": 0.01030873, + "balance_loss_clip": 1.02727938, + "balance_loss_mlp": 1.02043021, + "epoch": 0.4546520366751841, + "flos": 16176608346240.0, + "grad_norm": 1.861185026976198, + "language_loss": 0.84921265, + "learning_rate": 2.3883422024721015e-06, + "loss": 0.87002528, + "num_input_tokens_seen": 162279725, + "step": 7562, + "time_per_iteration": 2.602602005004883 + }, + { + "auxiliary_loss_clip": 0.01058131, + "auxiliary_loss_mlp": 0.0103095, + "balance_loss_clip": 1.0257175, + "balance_loss_mlp": 1.0197916, + "epoch": 0.4547121599278521, + "flos": 19751443562880.0, + "grad_norm": 1.8045909956113433, + "language_loss": 0.8925457, + "learning_rate": 2.38796014579055e-06, + "loss": 0.91343647, + "num_input_tokens_seen": 162297865, + "step": 7563, + "time_per_iteration": 2.5998873710632324 + }, + { + "auxiliary_loss_clip": 0.01072576, + "auxiliary_loss_mlp": 0.00747539, + "balance_loss_clip": 1.02749038, + "balance_loss_mlp": 1.00024664, + "epoch": 0.45477228318052004, + "flos": 19937461121280.0, + "grad_norm": 1.8698706860767604, + "language_loss": 0.71606791, + "learning_rate": 2.3875780743975097e-06, + "loss": 0.73426902, + "num_input_tokens_seen": 162316010, + "step": 7564, + "time_per_iteration": 2.5193071365356445 + }, + { + "auxiliary_loss_clip": 0.0106291, + "auxiliary_loss_mlp": 0.01031053, + "balance_loss_clip": 1.02587986, + "balance_loss_mlp": 1.01895952, + "epoch": 0.454832406433188, + "flos": 21288312829440.0, + "grad_norm": 1.932091012500196, + "language_loss": 0.67927372, + "learning_rate": 2.3871959883074713e-06, + "loss": 0.70021331, + "num_input_tokens_seen": 162336115, + "step": 7565, + "time_per_iteration": 2.673737049102783 + }, + { + "auxiliary_loss_clip": 0.01032471, + "auxiliary_loss_mlp": 0.01030545, + "balance_loss_clip": 1.02782369, + "balance_loss_mlp": 1.01889277, + "epoch": 0.45489252968585603, + "flos": 24498726612480.0, + "grad_norm": 1.5802831260009347, + "language_loss": 0.80074036, + "learning_rate": 2.386813887534922e-06, + "loss": 0.82137054, + "num_input_tokens_seen": 162355705, + "step": 7566, + "time_per_iteration": 2.7102527618408203 + }, + { + "auxiliary_loss_clip": 0.010346, + "auxiliary_loss_mlp": 0.01028995, + "balance_loss_clip": 1.02484751, + "balance_loss_mlp": 1.01566148, + "epoch": 0.454952652938524, + "flos": 17092474352640.0, + "grad_norm": 1.680719776070796, + "language_loss": 0.7389816, + "learning_rate": 2.3864317720943508e-06, + "loss": 0.75961757, + "num_input_tokens_seen": 162374055, + "step": 7567, + "time_per_iteration": 2.6811680793762207 + }, + { + "auxiliary_loss_clip": 0.01038321, + "auxiliary_loss_mlp": 0.01035108, + "balance_loss_clip": 1.02616823, + "balance_loss_mlp": 1.02312171, + "epoch": 0.45501277619119196, + "flos": 27630387826560.0, + "grad_norm": 1.479696771781281, + "language_loss": 0.809174, + "learning_rate": 2.386049642000249e-06, + "loss": 0.82990831, + "num_input_tokens_seen": 162393560, + "step": 7568, + "time_per_iteration": 2.7058589458465576 + }, + { + "auxiliary_loss_clip": 0.01069952, + "auxiliary_loss_mlp": 0.01041668, + "balance_loss_clip": 1.02977312, + "balance_loss_mlp": 1.02766681, + "epoch": 0.4550728994438599, + "flos": 19974664632960.0, + "grad_norm": 1.9520014238814187, + "language_loss": 0.79794091, + "learning_rate": 2.3856674972671055e-06, + "loss": 0.81905711, + "num_input_tokens_seen": 162413170, + "step": 7569, + "time_per_iteration": 2.681791305541992 + }, + { + "auxiliary_loss_clip": 0.01065686, + "auxiliary_loss_mlp": 0.01033633, + "balance_loss_clip": 1.0280081, + "balance_loss_mlp": 1.02018619, + "epoch": 0.4551330226965279, + "flos": 26066873646720.0, + "grad_norm": 1.4361619368204104, + "language_loss": 0.75193977, + "learning_rate": 2.385285337909412e-06, + "loss": 0.77293301, + "num_input_tokens_seen": 162434080, + "step": 7570, + "time_per_iteration": 2.6516008377075195 + }, + { + "auxiliary_loss_clip": 0.01053325, + "auxiliary_loss_mlp": 0.01035603, + "balance_loss_clip": 1.02899671, + "balance_loss_mlp": 1.02311015, + "epoch": 0.45519314594919585, + "flos": 32781091501440.0, + "grad_norm": 1.8696194930214285, + "language_loss": 0.74939823, + "learning_rate": 2.3849031639416596e-06, + "loss": 0.77028751, + "num_input_tokens_seen": 162455445, + "step": 7571, + "time_per_iteration": 2.7758281230926514 + }, + { + "auxiliary_loss_clip": 0.01059869, + "auxiliary_loss_mlp": 0.01027928, + "balance_loss_clip": 1.02780294, + "balance_loss_mlp": 1.01661563, + "epoch": 0.4552532692018638, + "flos": 19172671718400.0, + "grad_norm": 1.544330527842239, + "language_loss": 0.81221861, + "learning_rate": 2.3845209753783414e-06, + "loss": 0.83309656, + "num_input_tokens_seen": 162474940, + "step": 7572, + "time_per_iteration": 2.7252790927886963 + }, + { + "auxiliary_loss_clip": 0.01055502, + "auxiliary_loss_mlp": 0.01035942, + "balance_loss_clip": 1.02839088, + "balance_loss_mlp": 1.02232862, + "epoch": 0.4553133924545318, + "flos": 26027156183040.0, + "grad_norm": 1.695232149355425, + "language_loss": 0.72644544, + "learning_rate": 2.3841387722339486e-06, + "loss": 0.74735987, + "num_input_tokens_seen": 162493340, + "step": 7573, + "time_per_iteration": 2.944577693939209 + }, + { + "auxiliary_loss_clip": 0.01066562, + "auxiliary_loss_mlp": 0.01035672, + "balance_loss_clip": 1.02915192, + "balance_loss_mlp": 1.02153397, + "epoch": 0.45537351570719975, + "flos": 30661535808000.0, + "grad_norm": 2.22183961045275, + "language_loss": 0.74478889, + "learning_rate": 2.3837565545229748e-06, + "loss": 0.76581126, + "num_input_tokens_seen": 162514360, + "step": 7574, + "time_per_iteration": 2.82191801071167 + }, + { + "auxiliary_loss_clip": 0.01062224, + "auxiliary_loss_mlp": 0.01029186, + "balance_loss_clip": 1.02635074, + "balance_loss_mlp": 1.01699662, + "epoch": 0.4554336389598677, + "flos": 24353396184960.0, + "grad_norm": 1.4547763216290286, + "language_loss": 0.7152971, + "learning_rate": 2.383374322259915e-06, + "loss": 0.7362113, + "num_input_tokens_seen": 162535240, + "step": 7575, + "time_per_iteration": 2.679238796234131 + }, + { + "auxiliary_loss_clip": 0.01050365, + "auxiliary_loss_mlp": 0.01029616, + "balance_loss_clip": 1.02642179, + "balance_loss_mlp": 1.01752257, + "epoch": 0.4554937622125357, + "flos": 20557925677440.0, + "grad_norm": 1.8401195114387934, + "language_loss": 0.73055422, + "learning_rate": 2.3829920754592617e-06, + "loss": 0.75135398, + "num_input_tokens_seen": 162553880, + "step": 7576, + "time_per_iteration": 2.645846128463745 + }, + { + "auxiliary_loss_clip": 0.01071561, + "auxiliary_loss_mlp": 0.01035114, + "balance_loss_clip": 1.02814436, + "balance_loss_mlp": 1.02266884, + "epoch": 0.45555388546520365, + "flos": 22820764723200.0, + "grad_norm": 1.8159409410794474, + "language_loss": 0.66313505, + "learning_rate": 2.382609814135511e-06, + "loss": 0.68420178, + "num_input_tokens_seen": 162574485, + "step": 7577, + "time_per_iteration": 2.5493969917297363 + }, + { + "auxiliary_loss_clip": 0.01048873, + "auxiliary_loss_mlp": 0.01045546, + "balance_loss_clip": 1.02639318, + "balance_loss_mlp": 1.0310024, + "epoch": 0.4556140087178716, + "flos": 21725992051200.0, + "grad_norm": 1.9786828956789213, + "language_loss": 0.74438894, + "learning_rate": 2.382227538303157e-06, + "loss": 0.76533318, + "num_input_tokens_seen": 162595130, + "step": 7578, + "time_per_iteration": 2.7956383228302 + }, + { + "auxiliary_loss_clip": 0.01022092, + "auxiliary_loss_mlp": 0.00747559, + "balance_loss_clip": 1.02368522, + "balance_loss_mlp": 1.0002284, + "epoch": 0.45567413197053963, + "flos": 25994513698560.0, + "grad_norm": 1.7499501351105435, + "language_loss": 0.70283651, + "learning_rate": 2.381845247976697e-06, + "loss": 0.72053301, + "num_input_tokens_seen": 162615720, + "step": 7579, + "time_per_iteration": 2.760664701461792 + }, + { + "auxiliary_loss_clip": 0.01057316, + "auxiliary_loss_mlp": 0.01032805, + "balance_loss_clip": 1.02490067, + "balance_loss_mlp": 1.02078867, + "epoch": 0.4557342552232076, + "flos": 21537604195200.0, + "grad_norm": 1.651939624266494, + "language_loss": 0.78311002, + "learning_rate": 2.381462943170627e-06, + "loss": 0.80401123, + "num_input_tokens_seen": 162635825, + "step": 7580, + "time_per_iteration": 2.595773696899414 + }, + { + "auxiliary_loss_clip": 0.01070975, + "auxiliary_loss_mlp": 0.01028005, + "balance_loss_clip": 1.0277667, + "balance_loss_mlp": 1.01525605, + "epoch": 0.45579437847587556, + "flos": 40001972647680.0, + "grad_norm": 1.5858908052158778, + "language_loss": 0.68926919, + "learning_rate": 2.381080623899444e-06, + "loss": 0.71025896, + "num_input_tokens_seen": 162659130, + "step": 7581, + "time_per_iteration": 2.6951987743377686 + }, + { + "auxiliary_loss_clip": 0.01052734, + "auxiliary_loss_mlp": 0.01027555, + "balance_loss_clip": 1.023242, + "balance_loss_mlp": 1.01535428, + "epoch": 0.4558545017285435, + "flos": 31138501530240.0, + "grad_norm": 1.5728252017352051, + "language_loss": 0.7337209, + "learning_rate": 2.3806982901776455e-06, + "loss": 0.75452381, + "num_input_tokens_seen": 162681665, + "step": 7582, + "time_per_iteration": 2.7914984226226807 + }, + { + "auxiliary_loss_clip": 0.01075243, + "auxiliary_loss_mlp": 0.01042567, + "balance_loss_clip": 1.02818036, + "balance_loss_mlp": 1.02895379, + "epoch": 0.4559146249812115, + "flos": 21725776569600.0, + "grad_norm": 1.865427109528693, + "language_loss": 0.72589374, + "learning_rate": 2.380315942019729e-06, + "loss": 0.7470718, + "num_input_tokens_seen": 162702040, + "step": 7583, + "time_per_iteration": 2.5573959350585938 + }, + { + "auxiliary_loss_clip": 0.01069719, + "auxiliary_loss_mlp": 0.01039063, + "balance_loss_clip": 1.03105164, + "balance_loss_mlp": 1.02586079, + "epoch": 0.45597474823387946, + "flos": 23805973935360.0, + "grad_norm": 1.7208864940774462, + "language_loss": 0.72921032, + "learning_rate": 2.379933579440195e-06, + "loss": 0.75029814, + "num_input_tokens_seen": 162722375, + "step": 7584, + "time_per_iteration": 2.7085859775543213 + }, + { + "auxiliary_loss_clip": 0.01031429, + "auxiliary_loss_mlp": 0.01031607, + "balance_loss_clip": 1.02459335, + "balance_loss_mlp": 1.018893, + "epoch": 0.4560348714865474, + "flos": 31905661230720.0, + "grad_norm": 3.0634885257941495, + "language_loss": 0.67908061, + "learning_rate": 2.379551202453541e-06, + "loss": 0.69971097, + "num_input_tokens_seen": 162746095, + "step": 7585, + "time_per_iteration": 2.725923538208008 + }, + { + "auxiliary_loss_clip": 0.01071709, + "auxiliary_loss_mlp": 0.01028681, + "balance_loss_clip": 1.02756071, + "balance_loss_mlp": 1.01681352, + "epoch": 0.4560949947392154, + "flos": 22048828513920.0, + "grad_norm": 1.4089715167048793, + "language_loss": 0.7625525, + "learning_rate": 2.379168811074267e-06, + "loss": 0.78355646, + "num_input_tokens_seen": 162766330, + "step": 7586, + "time_per_iteration": 2.7162423133850098 + }, + { + "auxiliary_loss_clip": 0.01050771, + "auxiliary_loss_mlp": 0.01027714, + "balance_loss_clip": 1.02712393, + "balance_loss_mlp": 1.01671088, + "epoch": 0.45615511799188335, + "flos": 24571804832640.0, + "grad_norm": 1.6608484034652693, + "language_loss": 0.78009915, + "learning_rate": 2.3787864053168747e-06, + "loss": 0.80088401, + "num_input_tokens_seen": 162784755, + "step": 7587, + "time_per_iteration": 2.6604645252227783 + }, + { + "auxiliary_loss_clip": 0.01044293, + "auxiliary_loss_mlp": 0.01041612, + "balance_loss_clip": 1.02377832, + "balance_loss_mlp": 1.02848148, + "epoch": 0.4562152412445513, + "flos": 18330709944960.0, + "grad_norm": 1.8152015666715964, + "language_loss": 0.6905756, + "learning_rate": 2.378403985195863e-06, + "loss": 0.71143472, + "num_input_tokens_seen": 162803850, + "step": 7588, + "time_per_iteration": 2.6182281970977783 + }, + { + "auxiliary_loss_clip": 0.01059998, + "auxiliary_loss_mlp": 0.01029384, + "balance_loss_clip": 1.02695656, + "balance_loss_mlp": 1.01773167, + "epoch": 0.4562753644972193, + "flos": 13516525814400.0, + "grad_norm": 1.6995685269711045, + "language_loss": 0.79319841, + "learning_rate": 2.378021550725735e-06, + "loss": 0.81409228, + "num_input_tokens_seen": 162820775, + "step": 7589, + "time_per_iteration": 2.5535151958465576 + }, + { + "auxiliary_loss_clip": 0.01057479, + "auxiliary_loss_mlp": 0.01032305, + "balance_loss_clip": 1.02549934, + "balance_loss_mlp": 1.01962161, + "epoch": 0.45633548774988725, + "flos": 29639697701760.0, + "grad_norm": 2.382408966991858, + "language_loss": 0.6196785, + "learning_rate": 2.377639101920992e-06, + "loss": 0.64057636, + "num_input_tokens_seen": 162839695, + "step": 7590, + "time_per_iteration": 2.6715736389160156 + }, + { + "auxiliary_loss_clip": 0.01041437, + "auxiliary_loss_mlp": 0.01031551, + "balance_loss_clip": 1.02283311, + "balance_loss_mlp": 1.01948082, + "epoch": 0.4563956110025552, + "flos": 22233409528320.0, + "grad_norm": 1.9066843655762602, + "language_loss": 0.73177487, + "learning_rate": 2.377256638796135e-06, + "loss": 0.75250471, + "num_input_tokens_seen": 162856095, + "step": 7591, + "time_per_iteration": 4.247936010360718 + }, + { + "auxiliary_loss_clip": 0.01056302, + "auxiliary_loss_mlp": 0.01037158, + "balance_loss_clip": 1.03037524, + "balance_loss_mlp": 1.02405715, + "epoch": 0.45645573425522323, + "flos": 17092043389440.0, + "grad_norm": 2.588874698145649, + "language_loss": 0.77085155, + "learning_rate": 2.3768741613656695e-06, + "loss": 0.79178613, + "num_input_tokens_seen": 162874070, + "step": 7592, + "time_per_iteration": 4.326889991760254 + }, + { + "auxiliary_loss_clip": 0.01043135, + "auxiliary_loss_mlp": 0.01030237, + "balance_loss_clip": 1.02374935, + "balance_loss_mlp": 1.01708841, + "epoch": 0.4565158575078912, + "flos": 20332334309760.0, + "grad_norm": 2.0109804065334145, + "language_loss": 0.69714212, + "learning_rate": 2.376491669644098e-06, + "loss": 0.7178759, + "num_input_tokens_seen": 162891000, + "step": 7593, + "time_per_iteration": 2.7897839546203613 + }, + { + "auxiliary_loss_clip": 0.01050129, + "auxiliary_loss_mlp": 0.01028061, + "balance_loss_clip": 1.02282333, + "balance_loss_mlp": 1.01749921, + "epoch": 0.45657598076055916, + "flos": 23983013093760.0, + "grad_norm": 1.9849334432139418, + "language_loss": 0.84077036, + "learning_rate": 2.3761091636459248e-06, + "loss": 0.86155224, + "num_input_tokens_seen": 162910120, + "step": 7594, + "time_per_iteration": 2.740231513977051 + }, + { + "auxiliary_loss_clip": 0.0100565, + "auxiliary_loss_mlp": 0.00746476, + "balance_loss_clip": 1.00721526, + "balance_loss_mlp": 0.99997628, + "epoch": 0.45663610401322713, + "flos": 69364297526400.0, + "grad_norm": 0.8469178828114273, + "language_loss": 0.52743983, + "learning_rate": 2.375726643385654e-06, + "loss": 0.54496109, + "num_input_tokens_seen": 162963720, + "step": 7595, + "time_per_iteration": 3.1975579261779785 + }, + { + "auxiliary_loss_clip": 0.01038083, + "auxiliary_loss_mlp": 0.01030933, + "balance_loss_clip": 1.02500105, + "balance_loss_mlp": 1.0180347, + "epoch": 0.4566962272658951, + "flos": 15149095891200.0, + "grad_norm": 2.082599233443853, + "language_loss": 0.87141508, + "learning_rate": 2.3753441088777915e-06, + "loss": 0.89210522, + "num_input_tokens_seen": 162975760, + "step": 7596, + "time_per_iteration": 2.698456048965454 + }, + { + "auxiliary_loss_clip": 0.0106519, + "auxiliary_loss_mlp": 0.01038912, + "balance_loss_clip": 1.02840579, + "balance_loss_mlp": 1.02727127, + "epoch": 0.45675635051856306, + "flos": 18697465762560.0, + "grad_norm": 1.6433207619572512, + "language_loss": 0.77040899, + "learning_rate": 2.374961560136843e-06, + "loss": 0.79145002, + "num_input_tokens_seen": 162994865, + "step": 7597, + "time_per_iteration": 2.6545679569244385 + }, + { + "auxiliary_loss_clip": 0.01061247, + "auxiliary_loss_mlp": 0.01032889, + "balance_loss_clip": 1.02635777, + "balance_loss_mlp": 1.0207063, + "epoch": 0.456816473771231, + "flos": 19098300608640.0, + "grad_norm": 1.9607373247079616, + "language_loss": 0.78124744, + "learning_rate": 2.374578997177314e-06, + "loss": 0.80218875, + "num_input_tokens_seen": 163014730, + "step": 7598, + "time_per_iteration": 2.729649782180786 + }, + { + "auxiliary_loss_clip": 0.01072156, + "auxiliary_loss_mlp": 0.01027109, + "balance_loss_clip": 1.02832294, + "balance_loss_mlp": 1.01602244, + "epoch": 0.456876597023899, + "flos": 28950069507840.0, + "grad_norm": 2.3070647711614387, + "language_loss": 0.71145654, + "learning_rate": 2.374196420013712e-06, + "loss": 0.73244917, + "num_input_tokens_seen": 163033405, + "step": 7599, + "time_per_iteration": 2.6181020736694336 + }, + { + "auxiliary_loss_clip": 0.0103855, + "auxiliary_loss_mlp": 0.01035196, + "balance_loss_clip": 1.02428746, + "balance_loss_mlp": 1.0233767, + "epoch": 0.45693672027656695, + "flos": 23289470317440.0, + "grad_norm": 1.936644794060945, + "language_loss": 0.69904244, + "learning_rate": 2.373813828660544e-06, + "loss": 0.71977991, + "num_input_tokens_seen": 163051400, + "step": 7600, + "time_per_iteration": 2.6512155532836914 + }, + { + "auxiliary_loss_clip": 0.01016913, + "auxiliary_loss_mlp": 0.01033411, + "balance_loss_clip": 1.025213, + "balance_loss_mlp": 1.02134109, + "epoch": 0.4569968435292349, + "flos": 20558212986240.0, + "grad_norm": 1.7681561316337138, + "language_loss": 0.78844064, + "learning_rate": 2.373431223132319e-06, + "loss": 0.80894387, + "num_input_tokens_seen": 163069250, + "step": 7601, + "time_per_iteration": 4.420172691345215 + }, + { + "auxiliary_loss_clip": 0.01041645, + "auxiliary_loss_mlp": 0.01035248, + "balance_loss_clip": 1.02441978, + "balance_loss_mlp": 1.02356565, + "epoch": 0.4570569667819029, + "flos": 41282619223680.0, + "grad_norm": 1.8830138151599698, + "language_loss": 0.71568561, + "learning_rate": 2.3730486034435448e-06, + "loss": 0.73645461, + "num_input_tokens_seen": 163091755, + "step": 7602, + "time_per_iteration": 2.761566400527954 + }, + { + "auxiliary_loss_clip": 0.01054157, + "auxiliary_loss_mlp": 0.01030829, + "balance_loss_clip": 1.02361667, + "balance_loss_mlp": 1.01734018, + "epoch": 0.45711709003457085, + "flos": 26031573555840.0, + "grad_norm": 1.6735360685332241, + "language_loss": 0.72998154, + "learning_rate": 2.372665969608729e-06, + "loss": 0.75083148, + "num_input_tokens_seen": 163111600, + "step": 7603, + "time_per_iteration": 2.6274547576904297 + }, + { + "auxiliary_loss_clip": 0.01061381, + "auxiliary_loss_mlp": 0.01038662, + "balance_loss_clip": 1.02781057, + "balance_loss_mlp": 1.02526891, + "epoch": 0.4571772132872388, + "flos": 22158068751360.0, + "grad_norm": 1.8888600433280434, + "language_loss": 0.83205491, + "learning_rate": 2.372283321642383e-06, + "loss": 0.85305536, + "num_input_tokens_seen": 163127350, + "step": 7604, + "time_per_iteration": 4.190701723098755 + }, + { + "auxiliary_loss_clip": 0.01063137, + "auxiliary_loss_mlp": 0.01038967, + "balance_loss_clip": 1.03476357, + "balance_loss_mlp": 1.02541852, + "epoch": 0.45723733653990684, + "flos": 23878872587520.0, + "grad_norm": 1.9163603799785764, + "language_loss": 0.86537313, + "learning_rate": 2.371900659559016e-06, + "loss": 0.8863942, + "num_input_tokens_seen": 163145855, + "step": 7605, + "time_per_iteration": 2.6331748962402344 + }, + { + "auxiliary_loss_clip": 0.01023971, + "auxiliary_loss_mlp": 0.0103175, + "balance_loss_clip": 1.02347732, + "balance_loss_mlp": 1.01956046, + "epoch": 0.4572974597925748, + "flos": 16871803148160.0, + "grad_norm": 6.127385477777012, + "language_loss": 0.73619628, + "learning_rate": 2.371517983373138e-06, + "loss": 0.7567535, + "num_input_tokens_seen": 163163830, + "step": 7606, + "time_per_iteration": 2.657365560531616 + }, + { + "auxiliary_loss_clip": 0.0104306, + "auxiliary_loss_mlp": 0.01033543, + "balance_loss_clip": 1.0267396, + "balance_loss_mlp": 1.02069235, + "epoch": 0.45735758304524277, + "flos": 13771491528960.0, + "grad_norm": 2.51051918926913, + "language_loss": 0.8046025, + "learning_rate": 2.371135293099262e-06, + "loss": 0.82536846, + "num_input_tokens_seen": 163180700, + "step": 7607, + "time_per_iteration": 2.617248773574829 + }, + { + "auxiliary_loss_clip": 0.01037335, + "auxiliary_loss_mlp": 0.01039514, + "balance_loss_clip": 1.02845991, + "balance_loss_mlp": 1.02728939, + "epoch": 0.45741770629791073, + "flos": 21100750986240.0, + "grad_norm": 2.004548159802393, + "language_loss": 0.80846304, + "learning_rate": 2.3707525887518982e-06, + "loss": 0.82923156, + "num_input_tokens_seen": 163199450, + "step": 7608, + "time_per_iteration": 2.6766252517700195 + }, + { + "auxiliary_loss_clip": 0.01047805, + "auxiliary_loss_mlp": 0.010348, + "balance_loss_clip": 1.02393985, + "balance_loss_mlp": 1.02206874, + "epoch": 0.4574778295505787, + "flos": 23112898035840.0, + "grad_norm": 1.6671796668184438, + "language_loss": 0.6809299, + "learning_rate": 2.370369870345559e-06, + "loss": 0.70175588, + "num_input_tokens_seen": 163217875, + "step": 7609, + "time_per_iteration": 2.8502259254455566 + }, + { + "auxiliary_loss_clip": 0.01052933, + "auxiliary_loss_mlp": 0.01037698, + "balance_loss_clip": 1.02998352, + "balance_loss_mlp": 1.02471614, + "epoch": 0.45753795280324666, + "flos": 24352929308160.0, + "grad_norm": 1.7522739078005183, + "language_loss": 0.8090114, + "learning_rate": 2.369987137894757e-06, + "loss": 0.82991767, + "num_input_tokens_seen": 163237430, + "step": 7610, + "time_per_iteration": 2.7565104961395264 + }, + { + "auxiliary_loss_clip": 0.01062078, + "auxiliary_loss_mlp": 0.01032929, + "balance_loss_clip": 1.02621388, + "balance_loss_mlp": 1.02028096, + "epoch": 0.4575980760559146, + "flos": 16653789550080.0, + "grad_norm": 2.496756100767349, + "language_loss": 0.82502222, + "learning_rate": 2.3696043914140057e-06, + "loss": 0.8459723, + "num_input_tokens_seen": 163253905, + "step": 7611, + "time_per_iteration": 2.6645498275756836 + }, + { + "auxiliary_loss_clip": 0.01066153, + "auxiliary_loss_mlp": 0.01025884, + "balance_loss_clip": 1.02933049, + "balance_loss_mlp": 1.01272368, + "epoch": 0.4576581993085826, + "flos": 35911423912320.0, + "grad_norm": 1.9921818153099728, + "language_loss": 0.73910594, + "learning_rate": 2.369221630917819e-06, + "loss": 0.76002628, + "num_input_tokens_seen": 163274285, + "step": 7612, + "time_per_iteration": 2.7155046463012695 + }, + { + "auxiliary_loss_clip": 0.01042792, + "auxiliary_loss_mlp": 0.01031378, + "balance_loss_clip": 1.02278244, + "balance_loss_mlp": 1.01836681, + "epoch": 0.45771832256125056, + "flos": 20080421251200.0, + "grad_norm": 2.1527127029094557, + "language_loss": 0.84740072, + "learning_rate": 2.368838856420711e-06, + "loss": 0.86814243, + "num_input_tokens_seen": 163293150, + "step": 7613, + "time_per_iteration": 2.633248805999756 + }, + { + "auxiliary_loss_clip": 0.01035643, + "auxiliary_loss_mlp": 0.01031069, + "balance_loss_clip": 1.02481973, + "balance_loss_mlp": 1.01865935, + "epoch": 0.4577784458139185, + "flos": 10744329957120.0, + "grad_norm": 3.4972500855194126, + "language_loss": 0.75775027, + "learning_rate": 2.3684560679371965e-06, + "loss": 0.77841747, + "num_input_tokens_seen": 163310065, + "step": 7614, + "time_per_iteration": 2.748469591140747 + }, + { + "auxiliary_loss_clip": 0.01070226, + "auxiliary_loss_mlp": 0.0102731, + "balance_loss_clip": 1.02696002, + "balance_loss_mlp": 1.01544857, + "epoch": 0.4578385690665865, + "flos": 21907269014400.0, + "grad_norm": 1.5560542105683972, + "language_loss": 0.74700475, + "learning_rate": 2.368073265481791e-06, + "loss": 0.7679801, + "num_input_tokens_seen": 163329415, + "step": 7615, + "time_per_iteration": 2.630614995956421 + }, + { + "auxiliary_loss_clip": 0.00998621, + "auxiliary_loss_mlp": 0.01002985, + "balance_loss_clip": 1.00989532, + "balance_loss_mlp": 1.00156677, + "epoch": 0.45789869231925445, + "flos": 64758286667520.0, + "grad_norm": 0.7758812278437968, + "language_loss": 0.57614756, + "learning_rate": 2.3676904490690105e-06, + "loss": 0.59616363, + "num_input_tokens_seen": 163385875, + "step": 7616, + "time_per_iteration": 3.14347767829895 + }, + { + "auxiliary_loss_clip": 0.01039929, + "auxiliary_loss_mlp": 0.0074742, + "balance_loss_clip": 1.02370334, + "balance_loss_mlp": 1.00034893, + "epoch": 0.4579588155719224, + "flos": 16144001775360.0, + "grad_norm": 2.263745192889907, + "language_loss": 0.71196896, + "learning_rate": 2.3673076187133704e-06, + "loss": 0.72984242, + "num_input_tokens_seen": 163405170, + "step": 7617, + "time_per_iteration": 2.708759069442749 + }, + { + "auxiliary_loss_clip": 0.0107393, + "auxiliary_loss_mlp": 0.01032291, + "balance_loss_clip": 1.02869272, + "balance_loss_mlp": 1.01961875, + "epoch": 0.45801893882459044, + "flos": 21395541905280.0, + "grad_norm": 2.6101984687099216, + "language_loss": 0.76179588, + "learning_rate": 2.36692477442939e-06, + "loss": 0.78285807, + "num_input_tokens_seen": 163423155, + "step": 7618, + "time_per_iteration": 2.68231201171875 + }, + { + "auxiliary_loss_clip": 0.01042856, + "auxiliary_loss_mlp": 0.01039143, + "balance_loss_clip": 1.02786112, + "balance_loss_mlp": 1.02667999, + "epoch": 0.4580790620772584, + "flos": 19536554448000.0, + "grad_norm": 1.6503611007758743, + "language_loss": 0.77275163, + "learning_rate": 2.366541916231585e-06, + "loss": 0.79357159, + "num_input_tokens_seen": 163442450, + "step": 7619, + "time_per_iteration": 2.778463125228882 + }, + { + "auxiliary_loss_clip": 0.01071703, + "auxiliary_loss_mlp": 0.0103109, + "balance_loss_clip": 1.02970171, + "balance_loss_mlp": 1.01971734, + "epoch": 0.45813918532992637, + "flos": 16581070465920.0, + "grad_norm": 1.746993363856166, + "language_loss": 0.71743023, + "learning_rate": 2.366159044134473e-06, + "loss": 0.73845816, + "num_input_tokens_seen": 163459810, + "step": 7620, + "time_per_iteration": 2.5149996280670166 + }, + { + "auxiliary_loss_clip": 0.01051101, + "auxiliary_loss_mlp": 0.01028606, + "balance_loss_clip": 1.02747023, + "balance_loss_mlp": 1.01684022, + "epoch": 0.45819930858259433, + "flos": 42230301701760.0, + "grad_norm": 1.7048540125486522, + "language_loss": 0.78340417, + "learning_rate": 2.3657761581525748e-06, + "loss": 0.8042013, + "num_input_tokens_seen": 163482970, + "step": 7621, + "time_per_iteration": 2.808826446533203 + }, + { + "auxiliary_loss_clip": 0.01002922, + "auxiliary_loss_mlp": 0.01002739, + "balance_loss_clip": 1.00381374, + "balance_loss_mlp": 1.001261, + "epoch": 0.4582594318352623, + "flos": 63714795638400.0, + "grad_norm": 0.7869138280801038, + "language_loss": 0.65050894, + "learning_rate": 2.3653932583004063e-06, + "loss": 0.67056555, + "num_input_tokens_seen": 163545330, + "step": 7622, + "time_per_iteration": 3.1752166748046875 + }, + { + "auxiliary_loss_clip": 0.01064828, + "auxiliary_loss_mlp": 0.01029878, + "balance_loss_clip": 1.02902794, + "balance_loss_mlp": 1.01683664, + "epoch": 0.45831955508793026, + "flos": 26869979882880.0, + "grad_norm": 2.6854568461284116, + "language_loss": 0.7975322, + "learning_rate": 2.3650103445924903e-06, + "loss": 0.8184793, + "num_input_tokens_seen": 163564620, + "step": 7623, + "time_per_iteration": 2.780866861343384 + }, + { + "auxiliary_loss_clip": 0.01020288, + "auxiliary_loss_mlp": 0.01035908, + "balance_loss_clip": 1.02320111, + "balance_loss_mlp": 1.02305138, + "epoch": 0.45837967834059823, + "flos": 18733951002240.0, + "grad_norm": 1.8950497886305349, + "language_loss": 0.70845258, + "learning_rate": 2.3646274170433452e-06, + "loss": 0.72901452, + "num_input_tokens_seen": 163581010, + "step": 7624, + "time_per_iteration": 2.745861053466797 + }, + { + "auxiliary_loss_clip": 0.01041787, + "auxiliary_loss_mlp": 0.01037661, + "balance_loss_clip": 1.02444124, + "balance_loss_mlp": 1.02490604, + "epoch": 0.4584398015932662, + "flos": 21178102924800.0, + "grad_norm": 2.1876379159512944, + "language_loss": 0.73337024, + "learning_rate": 2.364244475667491e-06, + "loss": 0.7541647, + "num_input_tokens_seen": 163599955, + "step": 7625, + "time_per_iteration": 2.605438232421875 + }, + { + "auxiliary_loss_clip": 0.01056708, + "auxiliary_loss_mlp": 0.01033367, + "balance_loss_clip": 1.02839541, + "balance_loss_mlp": 1.02170193, + "epoch": 0.45849992484593416, + "flos": 19790047704960.0, + "grad_norm": 1.87493791942172, + "language_loss": 0.78206289, + "learning_rate": 2.363861520479451e-06, + "loss": 0.80296361, + "num_input_tokens_seen": 163618545, + "step": 7626, + "time_per_iteration": 2.6451525688171387 + }, + { + "auxiliary_loss_clip": 0.01075081, + "auxiliary_loss_mlp": 0.01039285, + "balance_loss_clip": 1.02915227, + "balance_loss_mlp": 1.02695894, + "epoch": 0.4585600480986021, + "flos": 18223265387520.0, + "grad_norm": 1.5524957569075275, + "language_loss": 0.8493613, + "learning_rate": 2.3634785514937445e-06, + "loss": 0.87050498, + "num_input_tokens_seen": 163636055, + "step": 7627, + "time_per_iteration": 2.6979682445526123 + }, + { + "auxiliary_loss_clip": 0.01075274, + "auxiliary_loss_mlp": 0.0103409, + "balance_loss_clip": 1.02784026, + "balance_loss_mlp": 1.02097154, + "epoch": 0.4586201713512701, + "flos": 29022213974400.0, + "grad_norm": 1.5791648329925778, + "language_loss": 0.69547462, + "learning_rate": 2.3630955687248953e-06, + "loss": 0.71656829, + "num_input_tokens_seen": 163657485, + "step": 7628, + "time_per_iteration": 2.7315797805786133 + }, + { + "auxiliary_loss_clip": 0.01057376, + "auxiliary_loss_mlp": 0.01029833, + "balance_loss_clip": 1.02441216, + "balance_loss_mlp": 1.01756012, + "epoch": 0.45868029460393805, + "flos": 23404600385280.0, + "grad_norm": 1.4785718373755756, + "language_loss": 0.78240585, + "learning_rate": 2.3627125721874265e-06, + "loss": 0.80327797, + "num_input_tokens_seen": 163676030, + "step": 7629, + "time_per_iteration": 2.609323501586914 + }, + { + "auxiliary_loss_clip": 0.01051741, + "auxiliary_loss_mlp": 0.01043066, + "balance_loss_clip": 1.02467895, + "balance_loss_mlp": 1.02877295, + "epoch": 0.458740417856606, + "flos": 18221972497920.0, + "grad_norm": 2.0801499541193436, + "language_loss": 0.79461306, + "learning_rate": 2.3623295618958595e-06, + "loss": 0.81556112, + "num_input_tokens_seen": 163694490, + "step": 7630, + "time_per_iteration": 2.6857168674468994 + }, + { + "auxiliary_loss_clip": 0.0105501, + "auxiliary_loss_mlp": 0.01035414, + "balance_loss_clip": 1.02699947, + "balance_loss_mlp": 1.02278399, + "epoch": 0.458800541109274, + "flos": 34568760504960.0, + "grad_norm": 1.9462724331967551, + "language_loss": 0.72134662, + "learning_rate": 2.3619465378647198e-06, + "loss": 0.74225086, + "num_input_tokens_seen": 163717035, + "step": 7631, + "time_per_iteration": 2.807783603668213 + }, + { + "auxiliary_loss_clip": 0.01034457, + "auxiliary_loss_mlp": 0.01039533, + "balance_loss_clip": 1.02485621, + "balance_loss_mlp": 1.02520406, + "epoch": 0.458860664361942, + "flos": 17712112896000.0, + "grad_norm": 2.387506611923125, + "language_loss": 0.71196032, + "learning_rate": 2.361563500108531e-06, + "loss": 0.73270023, + "num_input_tokens_seen": 163734525, + "step": 7632, + "time_per_iteration": 2.7943613529205322 + }, + { + "auxiliary_loss_clip": 0.01028719, + "auxiliary_loss_mlp": 0.00747826, + "balance_loss_clip": 1.02580702, + "balance_loss_mlp": 1.00029755, + "epoch": 0.45892078761460997, + "flos": 18441889516800.0, + "grad_norm": 2.4346129302810593, + "language_loss": 0.68834651, + "learning_rate": 2.3611804486418178e-06, + "loss": 0.70611191, + "num_input_tokens_seen": 163752860, + "step": 7633, + "time_per_iteration": 2.724921941757202 + }, + { + "auxiliary_loss_clip": 0.01064394, + "auxiliary_loss_mlp": 0.01036624, + "balance_loss_clip": 1.0283401, + "balance_loss_mlp": 1.02403593, + "epoch": 0.45898091086727794, + "flos": 22672956257280.0, + "grad_norm": 1.6042289865131862, + "language_loss": 0.80763322, + "learning_rate": 2.3607973834791062e-06, + "loss": 0.82864338, + "num_input_tokens_seen": 163772495, + "step": 7634, + "time_per_iteration": 2.601069688796997 + }, + { + "auxiliary_loss_clip": 0.01066338, + "auxiliary_loss_mlp": 0.00747669, + "balance_loss_clip": 1.02813482, + "balance_loss_mlp": 1.00036538, + "epoch": 0.4590410341199459, + "flos": 21652949744640.0, + "grad_norm": 1.7637690929927627, + "language_loss": 0.81354821, + "learning_rate": 2.3604143046349216e-06, + "loss": 0.83168828, + "num_input_tokens_seen": 163791475, + "step": 7635, + "time_per_iteration": 2.635927677154541 + }, + { + "auxiliary_loss_clip": 0.01050358, + "auxiliary_loss_mlp": 0.01038174, + "balance_loss_clip": 1.02712345, + "balance_loss_mlp": 1.02659249, + "epoch": 0.45910115737261387, + "flos": 36535372087680.0, + "grad_norm": 1.6201664570986376, + "language_loss": 0.6451236, + "learning_rate": 2.3600312121237905e-06, + "loss": 0.66600895, + "num_input_tokens_seen": 163812995, + "step": 7636, + "time_per_iteration": 2.757305145263672 + }, + { + "auxiliary_loss_clip": 0.01062929, + "auxiliary_loss_mlp": 0.01030685, + "balance_loss_clip": 1.02874041, + "balance_loss_mlp": 1.01855564, + "epoch": 0.45916128062528183, + "flos": 24419866302720.0, + "grad_norm": 1.5166343220638367, + "language_loss": 0.80325538, + "learning_rate": 2.3596481059602395e-06, + "loss": 0.82419157, + "num_input_tokens_seen": 163833945, + "step": 7637, + "time_per_iteration": 2.6600959300994873 + }, + { + "auxiliary_loss_clip": 0.01039856, + "auxiliary_loss_mlp": 0.01036629, + "balance_loss_clip": 1.02462602, + "balance_loss_mlp": 1.02165616, + "epoch": 0.4592214038779498, + "flos": 23221958705280.0, + "grad_norm": 1.4637578264432571, + "language_loss": 0.75306332, + "learning_rate": 2.3592649861587965e-06, + "loss": 0.77382815, + "num_input_tokens_seen": 163853885, + "step": 7638, + "time_per_iteration": 4.313544273376465 + }, + { + "auxiliary_loss_clip": 0.0105951, + "auxiliary_loss_mlp": 0.01033211, + "balance_loss_clip": 1.02539682, + "balance_loss_mlp": 1.02030623, + "epoch": 0.45928152713061776, + "flos": 19172133014400.0, + "grad_norm": 1.6474703495580145, + "language_loss": 0.74035066, + "learning_rate": 2.358881852733989e-06, + "loss": 0.76127791, + "num_input_tokens_seen": 163871855, + "step": 7639, + "time_per_iteration": 2.589346170425415 + }, + { + "auxiliary_loss_clip": 0.01073489, + "auxiliary_loss_mlp": 0.01036465, + "balance_loss_clip": 1.02743506, + "balance_loss_mlp": 1.02343524, + "epoch": 0.4593416503832857, + "flos": 22414686491520.0, + "grad_norm": 2.086136767392864, + "language_loss": 0.68344033, + "learning_rate": 2.358498705700346e-06, + "loss": 0.7045399, + "num_input_tokens_seen": 163891450, + "step": 7640, + "time_per_iteration": 4.084036827087402 + }, + { + "auxiliary_loss_clip": 0.01044966, + "auxiliary_loss_mlp": 0.01033372, + "balance_loss_clip": 1.02676964, + "balance_loss_mlp": 1.02012157, + "epoch": 0.4594017736359537, + "flos": 18880215183360.0, + "grad_norm": 1.5563123725710588, + "language_loss": 0.75355601, + "learning_rate": 2.3581155450723958e-06, + "loss": 0.77433938, + "num_input_tokens_seen": 163909345, + "step": 7641, + "time_per_iteration": 2.635969400405884 + }, + { + "auxiliary_loss_clip": 0.01052696, + "auxiliary_loss_mlp": 0.01030698, + "balance_loss_clip": 1.02648854, + "balance_loss_mlp": 1.01723313, + "epoch": 0.45946189688862166, + "flos": 20518567349760.0, + "grad_norm": 2.05961822006861, + "language_loss": 0.75085008, + "learning_rate": 2.357732370864668e-06, + "loss": 0.77168399, + "num_input_tokens_seen": 163926940, + "step": 7642, + "time_per_iteration": 2.563530921936035 + }, + { + "auxiliary_loss_clip": 0.01012167, + "auxiliary_loss_mlp": 0.01013226, + "balance_loss_clip": 1.01350427, + "balance_loss_mlp": 1.01135409, + "epoch": 0.4595220201412896, + "flos": 61405990162560.0, + "grad_norm": 0.8411456414011459, + "language_loss": 0.58221883, + "learning_rate": 2.357349183091694e-06, + "loss": 0.60247278, + "num_input_tokens_seen": 163977785, + "step": 7643, + "time_per_iteration": 2.9727022647857666 + }, + { + "auxiliary_loss_clip": 0.01066712, + "auxiliary_loss_mlp": 0.01040899, + "balance_loss_clip": 1.02753615, + "balance_loss_mlp": 1.02779758, + "epoch": 0.4595821433939576, + "flos": 23330947547520.0, + "grad_norm": 42.36131218244724, + "language_loss": 0.93130326, + "learning_rate": 2.3569659817680016e-06, + "loss": 0.95237935, + "num_input_tokens_seen": 163996630, + "step": 7644, + "time_per_iteration": 2.6344058513641357 + }, + { + "auxiliary_loss_clip": 0.01064123, + "auxiliary_loss_mlp": 0.01035633, + "balance_loss_clip": 1.02701461, + "balance_loss_mlp": 1.02294898, + "epoch": 0.4596422666466256, + "flos": 14282356711680.0, + "grad_norm": 1.9753427901952458, + "language_loss": 0.83105415, + "learning_rate": 2.3565827669081243e-06, + "loss": 0.85205173, + "num_input_tokens_seen": 164013190, + "step": 7645, + "time_per_iteration": 2.5518691539764404 + }, + { + "auxiliary_loss_clip": 0.00972596, + "auxiliary_loss_mlp": 0.01005019, + "balance_loss_clip": 1.00310493, + "balance_loss_mlp": 1.00353467, + "epoch": 0.4597023898992936, + "flos": 65727337737600.0, + "grad_norm": 0.7607046840373897, + "language_loss": 0.59895778, + "learning_rate": 2.356199538526593e-06, + "loss": 0.618734, + "num_input_tokens_seen": 164074030, + "step": 7646, + "time_per_iteration": 3.126674175262451 + }, + { + "auxiliary_loss_clip": 0.01053553, + "auxiliary_loss_mlp": 0.01032782, + "balance_loss_clip": 1.02526891, + "balance_loss_mlp": 1.01953161, + "epoch": 0.45976251315196154, + "flos": 26907075653760.0, + "grad_norm": 1.6568244430422454, + "language_loss": 0.72624397, + "learning_rate": 2.355816296637939e-06, + "loss": 0.74710739, + "num_input_tokens_seen": 164095515, + "step": 7647, + "time_per_iteration": 2.5698001384735107 + }, + { + "auxiliary_loss_clip": 0.01041019, + "auxiliary_loss_mlp": 0.01038778, + "balance_loss_clip": 1.02501941, + "balance_loss_mlp": 1.02601671, + "epoch": 0.4598226364046295, + "flos": 26618066824320.0, + "grad_norm": 2.5350300714038045, + "language_loss": 0.66647089, + "learning_rate": 2.3554330412566957e-06, + "loss": 0.68726885, + "num_input_tokens_seen": 164117270, + "step": 7648, + "time_per_iteration": 4.3452208042144775 + }, + { + "auxiliary_loss_clip": 0.0106163, + "auxiliary_loss_mlp": 0.01033356, + "balance_loss_clip": 1.02699482, + "balance_loss_mlp": 1.02059507, + "epoch": 0.45988275965729747, + "flos": 24387762522240.0, + "grad_norm": 1.5084046446870187, + "language_loss": 0.78925014, + "learning_rate": 2.3550497723973953e-06, + "loss": 0.81019998, + "num_input_tokens_seen": 164137850, + "step": 7649, + "time_per_iteration": 2.643892288208008 + }, + { + "auxiliary_loss_clip": 0.01019996, + "auxiliary_loss_mlp": 0.01035877, + "balance_loss_clip": 1.02685308, + "balance_loss_mlp": 1.02300858, + "epoch": 0.45994288290996543, + "flos": 24535822383360.0, + "grad_norm": 2.3659955574838656, + "language_loss": 0.68899679, + "learning_rate": 2.3546664900745726e-06, + "loss": 0.70955551, + "num_input_tokens_seen": 164157960, + "step": 7650, + "time_per_iteration": 2.9737210273742676 + }, + { + "auxiliary_loss_clip": 0.01068036, + "auxiliary_loss_mlp": 0.01038175, + "balance_loss_clip": 1.02878332, + "balance_loss_mlp": 1.02358365, + "epoch": 0.4600030061626334, + "flos": 14830245838080.0, + "grad_norm": 2.3473999090884115, + "language_loss": 0.8466841, + "learning_rate": 2.354283194302761e-06, + "loss": 0.86774623, + "num_input_tokens_seen": 164174590, + "step": 7651, + "time_per_iteration": 4.226263046264648 + }, + { + "auxiliary_loss_clip": 0.01050733, + "auxiliary_loss_mlp": 0.00747708, + "balance_loss_clip": 1.02887988, + "balance_loss_mlp": 1.00034893, + "epoch": 0.46006312941530136, + "flos": 18113845582080.0, + "grad_norm": 2.9249917570005577, + "language_loss": 0.75199795, + "learning_rate": 2.3538998850964948e-06, + "loss": 0.76998234, + "num_input_tokens_seen": 164192935, + "step": 7652, + "time_per_iteration": 2.7285385131835938 + }, + { + "auxiliary_loss_clip": 0.01026922, + "auxiliary_loss_mlp": 0.0102867, + "balance_loss_clip": 1.02401614, + "balance_loss_mlp": 1.01629031, + "epoch": 0.46012325266796933, + "flos": 21976468565760.0, + "grad_norm": 1.7009403194002153, + "language_loss": 0.75736105, + "learning_rate": 2.3535165624703097e-06, + "loss": 0.77791697, + "num_input_tokens_seen": 164213160, + "step": 7653, + "time_per_iteration": 2.8839292526245117 + }, + { + "auxiliary_loss_clip": 0.01039293, + "auxiliary_loss_mlp": 0.0103769, + "balance_loss_clip": 1.03035426, + "balance_loss_mlp": 1.02305698, + "epoch": 0.4601833759206373, + "flos": 15268068714240.0, + "grad_norm": 5.025152690144994, + "language_loss": 0.65580761, + "learning_rate": 2.353133226438741e-06, + "loss": 0.67657751, + "num_input_tokens_seen": 164229330, + "step": 7654, + "time_per_iteration": 2.8964178562164307 + }, + { + "auxiliary_loss_clip": 0.01045534, + "auxiliary_loss_mlp": 0.01034695, + "balance_loss_clip": 1.02371943, + "balance_loss_mlp": 1.02188015, + "epoch": 0.46024349917330526, + "flos": 27088999061760.0, + "grad_norm": 1.7145425228184912, + "language_loss": 0.78975892, + "learning_rate": 2.3527498770163248e-06, + "loss": 0.8105613, + "num_input_tokens_seen": 164248240, + "step": 7655, + "time_per_iteration": 2.684356212615967 + }, + { + "auxiliary_loss_clip": 0.01037737, + "auxiliary_loss_mlp": 0.01033085, + "balance_loss_clip": 1.02402103, + "balance_loss_mlp": 1.02023399, + "epoch": 0.4603036224259732, + "flos": 24462923731200.0, + "grad_norm": 1.4913701741458245, + "language_loss": 0.67449647, + "learning_rate": 2.3523665142175985e-06, + "loss": 0.69520462, + "num_input_tokens_seen": 164268020, + "step": 7656, + "time_per_iteration": 2.8362410068511963 + }, + { + "auxiliary_loss_clip": 0.01046846, + "auxiliary_loss_mlp": 0.01032988, + "balance_loss_clip": 1.02420282, + "balance_loss_mlp": 1.0205251, + "epoch": 0.4603637456786412, + "flos": 28109292883200.0, + "grad_norm": 1.6725251294738817, + "language_loss": 0.81237495, + "learning_rate": 2.351983138057098e-06, + "loss": 0.83317327, + "num_input_tokens_seen": 164287305, + "step": 7657, + "time_per_iteration": 2.8023831844329834 + }, + { + "auxiliary_loss_clip": 0.01072053, + "auxiliary_loss_mlp": 0.00747658, + "balance_loss_clip": 1.02697408, + "balance_loss_mlp": 1.00036418, + "epoch": 0.4604238689313092, + "flos": 24348942898560.0, + "grad_norm": 2.1518396829198005, + "language_loss": 0.70945072, + "learning_rate": 2.3515997485493623e-06, + "loss": 0.72764778, + "num_input_tokens_seen": 164306835, + "step": 7658, + "time_per_iteration": 2.604163646697998 + }, + { + "auxiliary_loss_clip": 0.01002054, + "auxiliary_loss_mlp": 0.01001558, + "balance_loss_clip": 1.00385594, + "balance_loss_mlp": 0.9999966, + "epoch": 0.4604839921839772, + "flos": 53606229431040.0, + "grad_norm": 0.9510717932961825, + "language_loss": 0.62088978, + "learning_rate": 2.351216345708928e-06, + "loss": 0.64092588, + "num_input_tokens_seen": 164367095, + "step": 7659, + "time_per_iteration": 3.3374245166778564 + }, + { + "auxiliary_loss_clip": 0.01020646, + "auxiliary_loss_mlp": 0.0104021, + "balance_loss_clip": 1.0230515, + "balance_loss_mlp": 1.0252192, + "epoch": 0.46054411543664514, + "flos": 31248424126080.0, + "grad_norm": 1.853947698259582, + "language_loss": 0.68485475, + "learning_rate": 2.350832929550336e-06, + "loss": 0.70546329, + "num_input_tokens_seen": 164388895, + "step": 7660, + "time_per_iteration": 2.8259096145629883 + }, + { + "auxiliary_loss_clip": 0.01055325, + "auxiliary_loss_mlp": 0.01040227, + "balance_loss_clip": 1.02351081, + "balance_loss_mlp": 1.02765012, + "epoch": 0.4606042386893131, + "flos": 24092863862400.0, + "grad_norm": 1.783452630864341, + "language_loss": 0.7707985, + "learning_rate": 2.3504495000881227e-06, + "loss": 0.79175401, + "num_input_tokens_seen": 164409080, + "step": 7661, + "time_per_iteration": 2.6212360858917236 + }, + { + "auxiliary_loss_clip": 0.01054646, + "auxiliary_loss_mlp": 0.01038672, + "balance_loss_clip": 1.02626395, + "balance_loss_mlp": 1.02560711, + "epoch": 0.46066436194198107, + "flos": 26578457101440.0, + "grad_norm": 1.8245873543802986, + "language_loss": 0.74925113, + "learning_rate": 2.3500660573368305e-06, + "loss": 0.77018428, + "num_input_tokens_seen": 164427585, + "step": 7662, + "time_per_iteration": 2.6150474548339844 + }, + { + "auxiliary_loss_clip": 0.01046139, + "auxiliary_loss_mlp": 0.01038715, + "balance_loss_clip": 1.02458072, + "balance_loss_mlp": 1.023772, + "epoch": 0.46072448519464904, + "flos": 17775602184960.0, + "grad_norm": 2.8993513549173415, + "language_loss": 0.79043233, + "learning_rate": 2.349682601310998e-06, + "loss": 0.81128085, + "num_input_tokens_seen": 164438455, + "step": 7663, + "time_per_iteration": 2.5587809085845947 + }, + { + "auxiliary_loss_clip": 0.01060154, + "auxiliary_loss_mlp": 0.01030603, + "balance_loss_clip": 1.02688825, + "balance_loss_mlp": 1.01850343, + "epoch": 0.460784608447317, + "flos": 15086109392640.0, + "grad_norm": 1.7808115774081283, + "language_loss": 0.73285258, + "learning_rate": 2.3492991320251653e-06, + "loss": 0.7537601, + "num_input_tokens_seen": 164456830, + "step": 7664, + "time_per_iteration": 2.5690693855285645 + }, + { + "auxiliary_loss_clip": 0.01041684, + "auxiliary_loss_mlp": 0.01031809, + "balance_loss_clip": 1.02752221, + "balance_loss_mlp": 1.01948237, + "epoch": 0.46084473169998497, + "flos": 18588261438720.0, + "grad_norm": 1.794080795725318, + "language_loss": 0.72635972, + "learning_rate": 2.3489156494938753e-06, + "loss": 0.74709463, + "num_input_tokens_seen": 164475375, + "step": 7665, + "time_per_iteration": 2.6774966716766357 + }, + { + "auxiliary_loss_clip": 0.01043283, + "auxiliary_loss_mlp": 0.0103246, + "balance_loss_clip": 1.02691793, + "balance_loss_mlp": 1.02013969, + "epoch": 0.46090485495265293, + "flos": 19494789909120.0, + "grad_norm": 1.7061821756375337, + "language_loss": 0.77775115, + "learning_rate": 2.348532153731669e-06, + "loss": 0.79850858, + "num_input_tokens_seen": 164492040, + "step": 7666, + "time_per_iteration": 2.632208824157715 + }, + { + "auxiliary_loss_clip": 0.01029884, + "auxiliary_loss_mlp": 0.01033482, + "balance_loss_clip": 1.02316761, + "balance_loss_mlp": 1.01917648, + "epoch": 0.4609649782053209, + "flos": 33364927163520.0, + "grad_norm": 1.5399927070901318, + "language_loss": 0.73771763, + "learning_rate": 2.348148644753088e-06, + "loss": 0.75835133, + "num_input_tokens_seen": 164513665, + "step": 7667, + "time_per_iteration": 2.874783515930176 + }, + { + "auxiliary_loss_clip": 0.01030884, + "auxiliary_loss_mlp": 0.01027952, + "balance_loss_clip": 1.02629709, + "balance_loss_mlp": 1.01609087, + "epoch": 0.46102510145798886, + "flos": 23769165473280.0, + "grad_norm": 1.4121270400317398, + "language_loss": 0.76194298, + "learning_rate": 2.347765122572676e-06, + "loss": 0.78253138, + "num_input_tokens_seen": 164533890, + "step": 7668, + "time_per_iteration": 2.780254602432251 + }, + { + "auxiliary_loss_clip": 0.01021512, + "auxiliary_loss_mlp": 0.01028606, + "balance_loss_clip": 1.02484488, + "balance_loss_mlp": 1.0168047, + "epoch": 0.4610852247106568, + "flos": 23294821443840.0, + "grad_norm": 1.6029866461941806, + "language_loss": 0.77982008, + "learning_rate": 2.347381587204975e-06, + "loss": 0.80032134, + "num_input_tokens_seen": 164553815, + "step": 7669, + "time_per_iteration": 2.7752134799957275 + }, + { + "auxiliary_loss_clip": 0.01054815, + "auxiliary_loss_mlp": 0.01030882, + "balance_loss_clip": 1.02503788, + "balance_loss_mlp": 1.01702404, + "epoch": 0.4611453479633248, + "flos": 25447450584960.0, + "grad_norm": 2.3274735937947777, + "language_loss": 0.82745695, + "learning_rate": 2.34699803866453e-06, + "loss": 0.84831393, + "num_input_tokens_seen": 164573125, + "step": 7670, + "time_per_iteration": 2.733751058578491 + }, + { + "auxiliary_loss_clip": 0.01059175, + "auxiliary_loss_mlp": 0.01030604, + "balance_loss_clip": 1.02501941, + "balance_loss_mlp": 1.01867712, + "epoch": 0.4612054712159928, + "flos": 21139606523520.0, + "grad_norm": 1.7776102678842893, + "language_loss": 0.63441086, + "learning_rate": 2.3466144769658845e-06, + "loss": 0.65530866, + "num_input_tokens_seen": 164592575, + "step": 7671, + "time_per_iteration": 2.6273605823516846 + }, + { + "auxiliary_loss_clip": 0.00990707, + "auxiliary_loss_mlp": 0.01002067, + "balance_loss_clip": 1.00255191, + "balance_loss_mlp": 1.00032699, + "epoch": 0.4612655944686608, + "flos": 69959266404480.0, + "grad_norm": 0.6949366727442092, + "language_loss": 0.5588423, + "learning_rate": 2.346230902123583e-06, + "loss": 0.57877004, + "num_input_tokens_seen": 164659795, + "step": 7672, + "time_per_iteration": 3.3779776096343994 + }, + { + "auxiliary_loss_clip": 0.01064928, + "auxiliary_loss_mlp": 0.01030103, + "balance_loss_clip": 1.02903056, + "balance_loss_mlp": 1.01821184, + "epoch": 0.46132571772132874, + "flos": 16837149502080.0, + "grad_norm": 1.771048293617073, + "language_loss": 0.71200001, + "learning_rate": 2.3458473141521715e-06, + "loss": 0.73295033, + "num_input_tokens_seen": 164678735, + "step": 7673, + "time_per_iteration": 2.612281322479248 + }, + { + "auxiliary_loss_clip": 0.01043409, + "auxiliary_loss_mlp": 0.010343, + "balance_loss_clip": 1.02394664, + "balance_loss_mlp": 1.02218246, + "epoch": 0.4613858409739967, + "flos": 35808935431680.0, + "grad_norm": 1.642516751165515, + "language_loss": 0.71005964, + "learning_rate": 2.345463713066195e-06, + "loss": 0.73083675, + "num_input_tokens_seen": 164700885, + "step": 7674, + "time_per_iteration": 2.790090322494507 + }, + { + "auxiliary_loss_clip": 0.01049434, + "auxiliary_loss_mlp": 0.01033386, + "balance_loss_clip": 1.02468014, + "balance_loss_mlp": 1.02097058, + "epoch": 0.4614459642266647, + "flos": 35266756567680.0, + "grad_norm": 38.795423054987594, + "language_loss": 0.65310001, + "learning_rate": 2.3450800988801996e-06, + "loss": 0.67392826, + "num_input_tokens_seen": 164726960, + "step": 7675, + "time_per_iteration": 2.743595838546753 + }, + { + "auxiliary_loss_clip": 0.01011106, + "auxiliary_loss_mlp": 0.01006387, + "balance_loss_clip": 1.00306082, + "balance_loss_mlp": 1.00482512, + "epoch": 0.46150608747933264, + "flos": 66704610044160.0, + "grad_norm": 0.7139122251893514, + "language_loss": 0.58605814, + "learning_rate": 2.3446964716087327e-06, + "loss": 0.606233, + "num_input_tokens_seen": 164788525, + "step": 7676, + "time_per_iteration": 3.1175873279571533 + }, + { + "auxiliary_loss_clip": 0.00982698, + "auxiliary_loss_mlp": 0.01007238, + "balance_loss_clip": 1.00387251, + "balance_loss_mlp": 1.00581384, + "epoch": 0.4615662107320006, + "flos": 55830177025920.0, + "grad_norm": 0.7919650610115169, + "language_loss": 0.62686396, + "learning_rate": 2.344312831266341e-06, + "loss": 0.64676332, + "num_input_tokens_seen": 164843525, + "step": 7677, + "time_per_iteration": 3.236738443374634 + }, + { + "auxiliary_loss_clip": 0.01050368, + "auxiliary_loss_mlp": 0.01033318, + "balance_loss_clip": 1.02615809, + "balance_loss_mlp": 1.02160585, + "epoch": 0.46162633398466857, + "flos": 15483245137920.0, + "grad_norm": 3.4114637280947395, + "language_loss": 0.76869982, + "learning_rate": 2.3439291778675718e-06, + "loss": 0.78953671, + "num_input_tokens_seen": 164859895, + "step": 7678, + "time_per_iteration": 2.596102237701416 + }, + { + "auxiliary_loss_clip": 0.01074428, + "auxiliary_loss_mlp": 0.01033066, + "balance_loss_clip": 1.0294807, + "balance_loss_mlp": 1.01994729, + "epoch": 0.46168645723733653, + "flos": 20011437181440.0, + "grad_norm": 1.8421773030518753, + "language_loss": 0.66802919, + "learning_rate": 2.343545511426974e-06, + "loss": 0.68910408, + "num_input_tokens_seen": 164878030, + "step": 7679, + "time_per_iteration": 2.546388864517212 + }, + { + "auxiliary_loss_clip": 0.01040106, + "auxiliary_loss_mlp": 0.01034942, + "balance_loss_clip": 1.02576447, + "balance_loss_mlp": 1.02309227, + "epoch": 0.4617465804900045, + "flos": 20298542590080.0, + "grad_norm": 2.9849793630348467, + "language_loss": 0.69916099, + "learning_rate": 2.3431618319590963e-06, + "loss": 0.71991146, + "num_input_tokens_seen": 164895710, + "step": 7680, + "time_per_iteration": 2.704890489578247 + }, + { + "auxiliary_loss_clip": 0.01078676, + "auxiliary_loss_mlp": 0.01041927, + "balance_loss_clip": 1.03115606, + "balance_loss_mlp": 1.02911234, + "epoch": 0.46180670374267246, + "flos": 22346312952960.0, + "grad_norm": 2.631382436113829, + "language_loss": 0.63655543, + "learning_rate": 2.342778139478487e-06, + "loss": 0.65776145, + "num_input_tokens_seen": 164913365, + "step": 7681, + "time_per_iteration": 2.611849546432495 + }, + { + "auxiliary_loss_clip": 0.01059295, + "auxiliary_loss_mlp": 0.01026513, + "balance_loss_clip": 1.0270704, + "balance_loss_mlp": 1.01512265, + "epoch": 0.46186682699534043, + "flos": 19895696582400.0, + "grad_norm": 1.4256093527207443, + "language_loss": 0.67132699, + "learning_rate": 2.342394433999697e-06, + "loss": 0.69218504, + "num_input_tokens_seen": 164931620, + "step": 7682, + "time_per_iteration": 2.607954502105713 + }, + { + "auxiliary_loss_clip": 0.01031235, + "auxiliary_loss_mlp": 0.01036422, + "balance_loss_clip": 1.02401567, + "balance_loss_mlp": 1.0241133, + "epoch": 0.4619269502480084, + "flos": 31503569408640.0, + "grad_norm": 2.0030499512613256, + "language_loss": 0.74224246, + "learning_rate": 2.342010715537275e-06, + "loss": 0.76291907, + "num_input_tokens_seen": 164950905, + "step": 7683, + "time_per_iteration": 2.739687919616699 + }, + { + "auxiliary_loss_clip": 0.0107164, + "auxiliary_loss_mlp": 0.01031105, + "balance_loss_clip": 1.02738285, + "balance_loss_mlp": 1.01859403, + "epoch": 0.46198707350067636, + "flos": 25009484054400.0, + "grad_norm": 1.7416967829918253, + "language_loss": 0.76543945, + "learning_rate": 2.3416269841057726e-06, + "loss": 0.78646696, + "num_input_tokens_seen": 164970950, + "step": 7684, + "time_per_iteration": 2.595358371734619 + }, + { + "auxiliary_loss_clip": 0.01078602, + "auxiliary_loss_mlp": 0.01036956, + "balance_loss_clip": 1.03003144, + "balance_loss_mlp": 1.02404594, + "epoch": 0.4620471967533444, + "flos": 18292357198080.0, + "grad_norm": 1.7699017419838101, + "language_loss": 0.79673409, + "learning_rate": 2.3412432397197412e-06, + "loss": 0.81788969, + "num_input_tokens_seen": 164989855, + "step": 7685, + "time_per_iteration": 4.219254016876221 + }, + { + "auxiliary_loss_clip": 0.01039464, + "auxiliary_loss_mlp": 0.01043116, + "balance_loss_clip": 1.02967, + "balance_loss_mlp": 1.02952671, + "epoch": 0.46210732000601235, + "flos": 33985104410880.0, + "grad_norm": 1.9591313437561295, + "language_loss": 0.67226768, + "learning_rate": 2.340859482393731e-06, + "loss": 0.69309348, + "num_input_tokens_seen": 165012290, + "step": 7686, + "time_per_iteration": 4.287242650985718 + }, + { + "auxiliary_loss_clip": 0.01052485, + "auxiliary_loss_mlp": 0.00747557, + "balance_loss_clip": 1.02647519, + "balance_loss_mlp": 1.00042593, + "epoch": 0.4621674432586803, + "flos": 25009412227200.0, + "grad_norm": 3.2861327923263652, + "language_loss": 0.73469341, + "learning_rate": 2.340475712142296e-06, + "loss": 0.75269377, + "num_input_tokens_seen": 165030810, + "step": 7687, + "time_per_iteration": 2.7002992630004883 + }, + { + "auxiliary_loss_clip": 0.01021529, + "auxiliary_loss_mlp": 0.01030757, + "balance_loss_clip": 1.02980947, + "balance_loss_mlp": 1.01800156, + "epoch": 0.4622275665113483, + "flos": 22014031213440.0, + "grad_norm": 2.2055135048621044, + "language_loss": 0.7422775, + "learning_rate": 2.3400919289799873e-06, + "loss": 0.76280034, + "num_input_tokens_seen": 165050205, + "step": 7688, + "time_per_iteration": 2.835573196411133 + }, + { + "auxiliary_loss_clip": 0.01024406, + "auxiliary_loss_mlp": 0.00747504, + "balance_loss_clip": 1.02315879, + "balance_loss_mlp": 1.00040674, + "epoch": 0.46228768976401624, + "flos": 24058820747520.0, + "grad_norm": 1.7102452929738254, + "language_loss": 0.78333426, + "learning_rate": 2.3397081329213585e-06, + "loss": 0.8010534, + "num_input_tokens_seen": 165069370, + "step": 7689, + "time_per_iteration": 2.73773193359375 + }, + { + "auxiliary_loss_clip": 0.01060376, + "auxiliary_loss_mlp": 0.01033991, + "balance_loss_clip": 1.02598739, + "balance_loss_mlp": 1.02032328, + "epoch": 0.4623478130166842, + "flos": 26651391667200.0, + "grad_norm": 5.771537840837101, + "language_loss": 0.56712568, + "learning_rate": 2.339324323980964e-06, + "loss": 0.58806932, + "num_input_tokens_seen": 165089610, + "step": 7690, + "time_per_iteration": 2.630807399749756 + }, + { + "auxiliary_loss_clip": 0.01060698, + "auxiliary_loss_mlp": 0.01034475, + "balance_loss_clip": 1.02530837, + "balance_loss_mlp": 1.02158856, + "epoch": 0.46240793626935217, + "flos": 20558428467840.0, + "grad_norm": 2.486849434088788, + "language_loss": 0.8290863, + "learning_rate": 2.3389405021733562e-06, + "loss": 0.85003805, + "num_input_tokens_seen": 165109050, + "step": 7691, + "time_per_iteration": 2.585103750228882 + }, + { + "auxiliary_loss_clip": 0.01052318, + "auxiliary_loss_mlp": 0.01027129, + "balance_loss_clip": 1.02846861, + "balance_loss_mlp": 1.01545811, + "epoch": 0.46246805952202014, + "flos": 22456055980800.0, + "grad_norm": 2.1848910660673595, + "language_loss": 0.75235003, + "learning_rate": 2.338556667513091e-06, + "loss": 0.77314448, + "num_input_tokens_seen": 165130130, + "step": 7692, + "time_per_iteration": 2.742607593536377 + }, + { + "auxiliary_loss_clip": 0.01040404, + "auxiliary_loss_mlp": 0.01033602, + "balance_loss_clip": 1.02702487, + "balance_loss_mlp": 1.02072167, + "epoch": 0.4625281827746881, + "flos": 35041308854400.0, + "grad_norm": 1.8570192610074672, + "language_loss": 0.74493682, + "learning_rate": 2.338172820014723e-06, + "loss": 0.76567686, + "num_input_tokens_seen": 165152685, + "step": 7693, + "time_per_iteration": 2.7999513149261475 + }, + { + "auxiliary_loss_clip": 0.01036578, + "auxiliary_loss_mlp": 0.01045169, + "balance_loss_clip": 1.02694499, + "balance_loss_mlp": 1.03127527, + "epoch": 0.46258830602735607, + "flos": 21068647205760.0, + "grad_norm": 1.4659619402382884, + "language_loss": 0.85462463, + "learning_rate": 2.337788959692808e-06, + "loss": 0.87544215, + "num_input_tokens_seen": 165173315, + "step": 7694, + "time_per_iteration": 2.6936519145965576 + }, + { + "auxiliary_loss_clip": 0.01052686, + "auxiliary_loss_mlp": 0.01033949, + "balance_loss_clip": 1.02730846, + "balance_loss_mlp": 1.02187359, + "epoch": 0.46264842928002403, + "flos": 26177227205760.0, + "grad_norm": 1.9998769675164292, + "language_loss": 0.78843951, + "learning_rate": 2.337405086561902e-06, + "loss": 0.80930579, + "num_input_tokens_seen": 165192395, + "step": 7695, + "time_per_iteration": 4.3368306159973145 + }, + { + "auxiliary_loss_clip": 0.01059268, + "auxiliary_loss_mlp": 0.01031978, + "balance_loss_clip": 1.02617836, + "balance_loss_mlp": 1.01997972, + "epoch": 0.462708552532692, + "flos": 16764214936320.0, + "grad_norm": 1.6275581427241188, + "language_loss": 0.7202214, + "learning_rate": 2.3370212006365606e-06, + "loss": 0.74113387, + "num_input_tokens_seen": 165211355, + "step": 7696, + "time_per_iteration": 2.7410740852355957 + }, + { + "auxiliary_loss_clip": 0.01053151, + "auxiliary_loss_mlp": 0.0104127, + "balance_loss_clip": 1.02880955, + "balance_loss_mlp": 1.02834189, + "epoch": 0.46276867578535996, + "flos": 15560453422080.0, + "grad_norm": 1.6400194541121478, + "language_loss": 0.69749099, + "learning_rate": 2.3366373019313423e-06, + "loss": 0.71843517, + "num_input_tokens_seen": 165229380, + "step": 7697, + "time_per_iteration": 2.63627552986145 + }, + { + "auxiliary_loss_clip": 0.01072901, + "auxiliary_loss_mlp": 0.01030181, + "balance_loss_clip": 1.02936006, + "balance_loss_mlp": 1.01812291, + "epoch": 0.462828799038028, + "flos": 22415404763520.0, + "grad_norm": 1.8704815871855804, + "language_loss": 0.84909034, + "learning_rate": 2.3362533904608025e-06, + "loss": 0.87012112, + "num_input_tokens_seen": 165247200, + "step": 7698, + "time_per_iteration": 2.529181480407715 + }, + { + "auxiliary_loss_clip": 0.01071225, + "auxiliary_loss_mlp": 0.01032139, + "balance_loss_clip": 1.0266223, + "balance_loss_mlp": 1.01989651, + "epoch": 0.46288892229069595, + "flos": 21069580959360.0, + "grad_norm": 1.6594146687737492, + "language_loss": 0.71147013, + "learning_rate": 2.335869466239502e-06, + "loss": 0.73250371, + "num_input_tokens_seen": 165265825, + "step": 7699, + "time_per_iteration": 4.093656778335571 + }, + { + "auxiliary_loss_clip": 0.01016214, + "auxiliary_loss_mlp": 0.01034517, + "balance_loss_clip": 1.02334762, + "balance_loss_mlp": 1.02197051, + "epoch": 0.4629490455433639, + "flos": 23185688947200.0, + "grad_norm": 2.3645083126231583, + "language_loss": 0.71453732, + "learning_rate": 2.335485529281996e-06, + "loss": 0.7350446, + "num_input_tokens_seen": 165284380, + "step": 7700, + "time_per_iteration": 2.739964008331299 + }, + { + "auxiliary_loss_clip": 0.01071337, + "auxiliary_loss_mlp": 0.00747577, + "balance_loss_clip": 1.0274713, + "balance_loss_mlp": 1.00048137, + "epoch": 0.4630091687960319, + "flos": 18835541642880.0, + "grad_norm": 11.45970192965546, + "language_loss": 0.72926241, + "learning_rate": 2.3351015796028467e-06, + "loss": 0.74745154, + "num_input_tokens_seen": 165300320, + "step": 7701, + "time_per_iteration": 2.499837875366211 + }, + { + "auxiliary_loss_clip": 0.01030563, + "auxiliary_loss_mlp": 0.01038623, + "balance_loss_clip": 1.02529538, + "balance_loss_mlp": 1.0251404, + "epoch": 0.46306929204869984, + "flos": 38907020407680.0, + "grad_norm": 1.9775614621967659, + "language_loss": 0.64980221, + "learning_rate": 2.3347176172166114e-06, + "loss": 0.67049408, + "num_input_tokens_seen": 165318130, + "step": 7702, + "time_per_iteration": 2.7999889850616455 + }, + { + "auxiliary_loss_clip": 0.01049712, + "auxiliary_loss_mlp": 0.01029823, + "balance_loss_clip": 1.02630281, + "balance_loss_mlp": 1.01832569, + "epoch": 0.4631294153013678, + "flos": 19644178573440.0, + "grad_norm": 1.9942932872493082, + "language_loss": 0.73252541, + "learning_rate": 2.33433364213785e-06, + "loss": 0.75332075, + "num_input_tokens_seen": 165336225, + "step": 7703, + "time_per_iteration": 2.6415443420410156 + }, + { + "auxiliary_loss_clip": 0.01050241, + "auxiliary_loss_mlp": 0.01033721, + "balance_loss_clip": 1.02657199, + "balance_loss_mlp": 1.0204649, + "epoch": 0.4631895385540358, + "flos": 24608254158720.0, + "grad_norm": 1.5965504757365212, + "language_loss": 0.68423092, + "learning_rate": 2.3339496543811243e-06, + "loss": 0.70507061, + "num_input_tokens_seen": 165355005, + "step": 7704, + "time_per_iteration": 2.686335325241089 + }, + { + "auxiliary_loss_clip": 0.0106376, + "auxiliary_loss_mlp": 0.01029539, + "balance_loss_clip": 1.02730358, + "balance_loss_mlp": 1.01692677, + "epoch": 0.46324966180670374, + "flos": 26320115508480.0, + "grad_norm": 2.0678995632437687, + "language_loss": 0.81262016, + "learning_rate": 2.3335656539609934e-06, + "loss": 0.83355314, + "num_input_tokens_seen": 165374910, + "step": 7705, + "time_per_iteration": 2.599062442779541 + }, + { + "auxiliary_loss_clip": 0.01058425, + "auxiliary_loss_mlp": 0.01034199, + "balance_loss_clip": 1.02648401, + "balance_loss_mlp": 1.02221239, + "epoch": 0.4633097850593717, + "flos": 19240506552960.0, + "grad_norm": 1.8590609391850899, + "language_loss": 0.77218461, + "learning_rate": 2.3331816408920196e-06, + "loss": 0.79311079, + "num_input_tokens_seen": 165392590, + "step": 7706, + "time_per_iteration": 2.573599100112915 + }, + { + "auxiliary_loss_clip": 0.01047628, + "auxiliary_loss_mlp": 0.01030503, + "balance_loss_clip": 1.02625346, + "balance_loss_mlp": 1.01854634, + "epoch": 0.46336990831203967, + "flos": 22783166161920.0, + "grad_norm": 2.125625966273811, + "language_loss": 0.70275259, + "learning_rate": 2.3327976151887654e-06, + "loss": 0.72353387, + "num_input_tokens_seen": 165411195, + "step": 7707, + "time_per_iteration": 2.5970566272735596 + }, + { + "auxiliary_loss_clip": 0.01053396, + "auxiliary_loss_mlp": 0.01032646, + "balance_loss_clip": 1.02574682, + "balance_loss_mlp": 1.01902032, + "epoch": 0.46343003156470763, + "flos": 38210604543360.0, + "grad_norm": 2.120467102266717, + "language_loss": 0.61132765, + "learning_rate": 2.332413576865791e-06, + "loss": 0.63218808, + "num_input_tokens_seen": 165430150, + "step": 7708, + "time_per_iteration": 2.7388103008270264 + }, + { + "auxiliary_loss_clip": 0.01041824, + "auxiliary_loss_mlp": 0.01032336, + "balance_loss_clip": 1.02695692, + "balance_loss_mlp": 1.01967037, + "epoch": 0.4634901548173756, + "flos": 31938555110400.0, + "grad_norm": 2.0573905603867355, + "language_loss": 0.775231, + "learning_rate": 2.3320295259376614e-06, + "loss": 0.79597259, + "num_input_tokens_seen": 165450595, + "step": 7709, + "time_per_iteration": 2.8063855171203613 + }, + { + "auxiliary_loss_clip": 0.01076327, + "auxiliary_loss_mlp": 0.0103975, + "balance_loss_clip": 1.02956367, + "balance_loss_mlp": 1.02665508, + "epoch": 0.46355027807004356, + "flos": 20082540153600.0, + "grad_norm": 1.8786141022385803, + "language_loss": 0.76984847, + "learning_rate": 2.3316454624189385e-06, + "loss": 0.79100931, + "num_input_tokens_seen": 165469515, + "step": 7710, + "time_per_iteration": 2.661750555038452 + }, + { + "auxiliary_loss_clip": 0.01060849, + "auxiliary_loss_mlp": 0.0103015, + "balance_loss_clip": 1.0261153, + "balance_loss_mlp": 1.01663136, + "epoch": 0.4636104013227116, + "flos": 24061370613120.0, + "grad_norm": 1.869990284486234, + "language_loss": 0.72999942, + "learning_rate": 2.3312613863241865e-06, + "loss": 0.75090933, + "num_input_tokens_seen": 165488125, + "step": 7711, + "time_per_iteration": 2.6651201248168945 + }, + { + "auxiliary_loss_clip": 0.01046, + "auxiliary_loss_mlp": 0.01037503, + "balance_loss_clip": 1.02782762, + "balance_loss_mlp": 1.02428317, + "epoch": 0.46367052457537955, + "flos": 23914639555200.0, + "grad_norm": 1.366564042374781, + "language_loss": 0.71598542, + "learning_rate": 2.33087729766797e-06, + "loss": 0.73682046, + "num_input_tokens_seen": 165509225, + "step": 7712, + "time_per_iteration": 2.720472812652588 + }, + { + "auxiliary_loss_clip": 0.01056495, + "auxiliary_loss_mlp": 0.01039763, + "balance_loss_clip": 1.02953565, + "balance_loss_mlp": 1.02548122, + "epoch": 0.4637306478280475, + "flos": 26396533693440.0, + "grad_norm": 1.794513665156113, + "language_loss": 0.72916591, + "learning_rate": 2.3304931964648524e-06, + "loss": 0.75012845, + "num_input_tokens_seen": 165529945, + "step": 7713, + "time_per_iteration": 2.906151056289673 + }, + { + "auxiliary_loss_clip": 0.0103814, + "auxiliary_loss_mlp": 0.01037329, + "balance_loss_clip": 1.02628803, + "balance_loss_mlp": 1.02389407, + "epoch": 0.4637907710807155, + "flos": 21980706370560.0, + "grad_norm": 1.82332200830592, + "language_loss": 0.58258325, + "learning_rate": 2.3301090827294e-06, + "loss": 0.60333794, + "num_input_tokens_seen": 165550690, + "step": 7714, + "time_per_iteration": 2.7922987937927246 + }, + { + "auxiliary_loss_clip": 0.01061741, + "auxiliary_loss_mlp": 0.01034964, + "balance_loss_clip": 1.0273881, + "balance_loss_mlp": 1.0228461, + "epoch": 0.46385089433338345, + "flos": 12422291846400.0, + "grad_norm": 3.6771089655988365, + "language_loss": 0.69938076, + "learning_rate": 2.3297249564761784e-06, + "loss": 0.72034782, + "num_input_tokens_seen": 165567775, + "step": 7715, + "time_per_iteration": 2.636091470718384 + }, + { + "auxiliary_loss_clip": 0.01079175, + "auxiliary_loss_mlp": 0.01034478, + "balance_loss_clip": 1.03004265, + "balance_loss_mlp": 1.02157402, + "epoch": 0.4639110175860514, + "flos": 23915752876800.0, + "grad_norm": 1.8829852367857922, + "language_loss": 0.68311131, + "learning_rate": 2.3293408177197527e-06, + "loss": 0.70424783, + "num_input_tokens_seen": 165587010, + "step": 7716, + "time_per_iteration": 2.6307501792907715 + }, + { + "auxiliary_loss_clip": 0.01074677, + "auxiliary_loss_mlp": 0.01030658, + "balance_loss_clip": 1.0280149, + "balance_loss_mlp": 1.0174377, + "epoch": 0.4639711408387194, + "flos": 25300396304640.0, + "grad_norm": 1.5456647811672801, + "language_loss": 0.80905712, + "learning_rate": 2.328956666474691e-06, + "loss": 0.83011043, + "num_input_tokens_seen": 165607850, + "step": 7717, + "time_per_iteration": 2.647932767868042 + }, + { + "auxiliary_loss_clip": 0.01074793, + "auxiliary_loss_mlp": 0.01033293, + "balance_loss_clip": 1.02830899, + "balance_loss_mlp": 1.02099109, + "epoch": 0.46403126409138734, + "flos": 21211822817280.0, + "grad_norm": 1.628803305439209, + "language_loss": 0.72777355, + "learning_rate": 2.3285725027555593e-06, + "loss": 0.7488544, + "num_input_tokens_seen": 165627175, + "step": 7718, + "time_per_iteration": 2.5193400382995605 + }, + { + "auxiliary_loss_clip": 0.01071057, + "auxiliary_loss_mlp": 0.00747555, + "balance_loss_clip": 1.02625668, + "balance_loss_mlp": 1.00038052, + "epoch": 0.4640913873440553, + "flos": 35845564325760.0, + "grad_norm": 1.6108368760099194, + "language_loss": 0.70419604, + "learning_rate": 2.3281883265769254e-06, + "loss": 0.72238219, + "num_input_tokens_seen": 165648340, + "step": 7719, + "time_per_iteration": 2.7060625553131104 + }, + { + "auxiliary_loss_clip": 0.01047761, + "auxiliary_loss_mlp": 0.01035772, + "balance_loss_clip": 1.02933311, + "balance_loss_mlp": 1.02212858, + "epoch": 0.46415151059672327, + "flos": 19166207270400.0, + "grad_norm": 1.9361236528852788, + "language_loss": 0.86601055, + "learning_rate": 2.327804137953357e-06, + "loss": 0.88684595, + "num_input_tokens_seen": 165667195, + "step": 7720, + "time_per_iteration": 2.6412785053253174 + }, + { + "auxiliary_loss_clip": 0.01000473, + "auxiliary_loss_mlp": 0.01008784, + "balance_loss_clip": 1.01144409, + "balance_loss_mlp": 1.00740707, + "epoch": 0.46421163384939124, + "flos": 58912750304640.0, + "grad_norm": 0.7198107329492469, + "language_loss": 0.55080664, + "learning_rate": 2.3274199368994226e-06, + "loss": 0.57089919, + "num_input_tokens_seen": 165726760, + "step": 7721, + "time_per_iteration": 3.235947370529175 + }, + { + "auxiliary_loss_clip": 0.0104939, + "auxiliary_loss_mlp": 0.01036013, + "balance_loss_clip": 1.02623224, + "balance_loss_mlp": 1.02319801, + "epoch": 0.4642717571020592, + "flos": 20157342226560.0, + "grad_norm": 1.853469388541133, + "language_loss": 0.79761982, + "learning_rate": 2.3270357234296918e-06, + "loss": 0.81847388, + "num_input_tokens_seen": 165745005, + "step": 7722, + "time_per_iteration": 2.6179304122924805 + }, + { + "auxiliary_loss_clip": 0.01076704, + "auxiliary_loss_mlp": 0.01031236, + "balance_loss_clip": 1.02903199, + "balance_loss_mlp": 1.01834917, + "epoch": 0.46433188035472717, + "flos": 25046184775680.0, + "grad_norm": 1.6881396957076, + "language_loss": 0.78035772, + "learning_rate": 2.3266514975587332e-06, + "loss": 0.80143714, + "num_input_tokens_seen": 165765750, + "step": 7723, + "time_per_iteration": 2.74566650390625 + }, + { + "auxiliary_loss_clip": 0.00979826, + "auxiliary_loss_mlp": 0.01028787, + "balance_loss_clip": 1.02218747, + "balance_loss_mlp": 1.01687849, + "epoch": 0.4643920036073952, + "flos": 28075644817920.0, + "grad_norm": 1.5477538984259998, + "language_loss": 0.68328673, + "learning_rate": 2.326267259301118e-06, + "loss": 0.7033729, + "num_input_tokens_seen": 165787515, + "step": 7724, + "time_per_iteration": 3.0978522300720215 + }, + { + "auxiliary_loss_clip": 0.01066257, + "auxiliary_loss_mlp": 0.01033468, + "balance_loss_clip": 1.0311172, + "balance_loss_mlp": 1.02055776, + "epoch": 0.46445212686006315, + "flos": 18369350000640.0, + "grad_norm": 1.8013235491878394, + "language_loss": 0.67174661, + "learning_rate": 2.325883008671415e-06, + "loss": 0.69274384, + "num_input_tokens_seen": 165806675, + "step": 7725, + "time_per_iteration": 2.967456817626953 + }, + { + "auxiliary_loss_clip": 0.01058655, + "auxiliary_loss_mlp": 0.0103252, + "balance_loss_clip": 1.02669334, + "balance_loss_mlp": 1.02142191, + "epoch": 0.4645122501127311, + "flos": 31721618920320.0, + "grad_norm": 2.2249810801140475, + "language_loss": 0.64924383, + "learning_rate": 2.3254987456841955e-06, + "loss": 0.67015558, + "num_input_tokens_seen": 165829835, + "step": 7726, + "time_per_iteration": 3.310810089111328 + }, + { + "auxiliary_loss_clip": 0.01044956, + "auxiliary_loss_mlp": 0.00747529, + "balance_loss_clip": 1.02625108, + "balance_loss_mlp": 1.00040054, + "epoch": 0.4645723733653991, + "flos": 23768806337280.0, + "grad_norm": 1.7533331459640453, + "language_loss": 0.75127566, + "learning_rate": 2.3251144703540307e-06, + "loss": 0.7692005, + "num_input_tokens_seen": 165849380, + "step": 7727, + "time_per_iteration": 2.727240800857544 + }, + { + "auxiliary_loss_clip": 0.01052763, + "auxiliary_loss_mlp": 0.01034945, + "balance_loss_clip": 1.02730644, + "balance_loss_mlp": 1.02180791, + "epoch": 0.46463249661806705, + "flos": 33145512935040.0, + "grad_norm": 2.296444748939252, + "language_loss": 0.79144382, + "learning_rate": 2.3247301826954936e-06, + "loss": 0.81232089, + "num_input_tokens_seen": 165868620, + "step": 7728, + "time_per_iteration": 2.8102962970733643 + }, + { + "auxiliary_loss_clip": 0.01034582, + "auxiliary_loss_mlp": 0.0103451, + "balance_loss_clip": 1.02465391, + "balance_loss_mlp": 1.02152801, + "epoch": 0.464692619870735, + "flos": 18296020385280.0, + "grad_norm": 1.9928082527868363, + "language_loss": 0.7611621, + "learning_rate": 2.324345882723155e-06, + "loss": 0.78185302, + "num_input_tokens_seen": 165885915, + "step": 7729, + "time_per_iteration": 2.6881699562072754 + }, + { + "auxiliary_loss_clip": 0.01046915, + "auxiliary_loss_mlp": 0.01042292, + "balance_loss_clip": 1.02682877, + "balance_loss_mlp": 1.02904224, + "epoch": 0.464752743123403, + "flos": 22638051216000.0, + "grad_norm": 1.9609004886129133, + "language_loss": 0.8008526, + "learning_rate": 2.323961570451588e-06, + "loss": 0.82174468, + "num_input_tokens_seen": 165905465, + "step": 7730, + "time_per_iteration": 2.6555769443511963 + }, + { + "auxiliary_loss_clip": 0.01070364, + "auxiliary_loss_mlp": 0.01038259, + "balance_loss_clip": 1.02606487, + "balance_loss_mlp": 1.02596235, + "epoch": 0.46481286637607094, + "flos": 20412128373120.0, + "grad_norm": 1.6882521099845542, + "language_loss": 0.76736867, + "learning_rate": 2.3235772458953655e-06, + "loss": 0.78845489, + "num_input_tokens_seen": 165924640, + "step": 7731, + "time_per_iteration": 2.577000856399536 + }, + { + "auxiliary_loss_clip": 0.01039505, + "auxiliary_loss_mlp": 0.01032528, + "balance_loss_clip": 1.02692652, + "balance_loss_mlp": 1.02049398, + "epoch": 0.4648729896287389, + "flos": 34275406129920.0, + "grad_norm": 1.9487996251878963, + "language_loss": 0.65690792, + "learning_rate": 2.323192909069061e-06, + "loss": 0.67762822, + "num_input_tokens_seen": 165945765, + "step": 7732, + "time_per_iteration": 4.364741563796997 + }, + { + "auxiliary_loss_clip": 0.01048461, + "auxiliary_loss_mlp": 0.01038851, + "balance_loss_clip": 1.02527165, + "balance_loss_mlp": 1.02447426, + "epoch": 0.4649331128814069, + "flos": 21321781326720.0, + "grad_norm": 2.4320175437051854, + "language_loss": 0.72751343, + "learning_rate": 2.32280855998725e-06, + "loss": 0.74838656, + "num_input_tokens_seen": 165964025, + "step": 7733, + "time_per_iteration": 4.259122371673584 + }, + { + "auxiliary_loss_clip": 0.01008875, + "auxiliary_loss_mlp": 0.01002382, + "balance_loss_clip": 1.00113583, + "balance_loss_mlp": 1.00102937, + "epoch": 0.46499323613407484, + "flos": 58308515717760.0, + "grad_norm": 1.2027036681885286, + "language_loss": 0.51896584, + "learning_rate": 2.3224241986645057e-06, + "loss": 0.53907847, + "num_input_tokens_seen": 166021950, + "step": 7734, + "time_per_iteration": 3.1391375064849854 + }, + { + "auxiliary_loss_clip": 0.01054207, + "auxiliary_loss_mlp": 0.01030757, + "balance_loss_clip": 1.02834082, + "balance_loss_mlp": 1.01821578, + "epoch": 0.4650533593867428, + "flos": 10889660384640.0, + "grad_norm": 2.229223072431152, + "language_loss": 0.7567873, + "learning_rate": 2.3220398251154035e-06, + "loss": 0.77763689, + "num_input_tokens_seen": 166039675, + "step": 7735, + "time_per_iteration": 2.7163469791412354 + }, + { + "auxiliary_loss_clip": 0.01034609, + "auxiliary_loss_mlp": 0.01041139, + "balance_loss_clip": 1.02798128, + "balance_loss_mlp": 1.02756715, + "epoch": 0.46511348263941077, + "flos": 19974592805760.0, + "grad_norm": 1.894973652465129, + "language_loss": 0.70163196, + "learning_rate": 2.321655439354519e-06, + "loss": 0.72238946, + "num_input_tokens_seen": 166057745, + "step": 7736, + "time_per_iteration": 2.924403429031372 + }, + { + "auxiliary_loss_clip": 0.01070618, + "auxiliary_loss_mlp": 0.01035579, + "balance_loss_clip": 1.0278275, + "balance_loss_mlp": 1.02430224, + "epoch": 0.46517360589207873, + "flos": 19678401256320.0, + "grad_norm": 1.6393215360388265, + "language_loss": 0.72201461, + "learning_rate": 2.321271041396427e-06, + "loss": 0.74307656, + "num_input_tokens_seen": 166076440, + "step": 7737, + "time_per_iteration": 2.709991455078125 + }, + { + "auxiliary_loss_clip": 0.01050793, + "auxiliary_loss_mlp": 0.01041956, + "balance_loss_clip": 1.03121328, + "balance_loss_mlp": 1.02831233, + "epoch": 0.46523372914474675, + "flos": 16872665074560.0, + "grad_norm": 1.7841101274138713, + "language_loss": 0.83725512, + "learning_rate": 2.3208866312557065e-06, + "loss": 0.85818261, + "num_input_tokens_seen": 166092520, + "step": 7738, + "time_per_iteration": 2.732679605484009 + }, + { + "auxiliary_loss_clip": 0.01001561, + "auxiliary_loss_mlp": 0.01002064, + "balance_loss_clip": 1.00324404, + "balance_loss_mlp": 1.00054991, + "epoch": 0.4652938523974147, + "flos": 53439138339840.0, + "grad_norm": 0.7651896484177114, + "language_loss": 0.57709515, + "learning_rate": 2.320502208946932e-06, + "loss": 0.59713137, + "num_input_tokens_seen": 166156285, + "step": 7739, + "time_per_iteration": 3.2143375873565674 + }, + { + "auxiliary_loss_clip": 0.01053095, + "auxiliary_loss_mlp": 0.01038554, + "balance_loss_clip": 1.02727795, + "balance_loss_mlp": 1.02631116, + "epoch": 0.4653539756500827, + "flos": 15231296165760.0, + "grad_norm": 1.8765371346602169, + "language_loss": 0.84807318, + "learning_rate": 2.3201177744846815e-06, + "loss": 0.86898971, + "num_input_tokens_seen": 166173455, + "step": 7740, + "time_per_iteration": 2.6366894245147705 + }, + { + "auxiliary_loss_clip": 0.01046839, + "auxiliary_loss_mlp": 0.01035484, + "balance_loss_clip": 1.0257597, + "balance_loss_mlp": 1.02156043, + "epoch": 0.46541409890275065, + "flos": 23732249270400.0, + "grad_norm": 2.124211969075575, + "language_loss": 0.7575829, + "learning_rate": 2.3197333278835327e-06, + "loss": 0.77840614, + "num_input_tokens_seen": 166194370, + "step": 7741, + "time_per_iteration": 2.6131794452667236 + }, + { + "auxiliary_loss_clip": 0.01045571, + "auxiliary_loss_mlp": 0.01033871, + "balance_loss_clip": 1.02794385, + "balance_loss_mlp": 1.02146697, + "epoch": 0.4654742221554186, + "flos": 20847329556480.0, + "grad_norm": 1.8815622213309269, + "language_loss": 0.80806196, + "learning_rate": 2.319348869158064e-06, + "loss": 0.82885635, + "num_input_tokens_seen": 166213195, + "step": 7742, + "time_per_iteration": 4.183133840560913 + }, + { + "auxiliary_loss_clip": 0.01053036, + "auxiliary_loss_mlp": 0.01036932, + "balance_loss_clip": 1.02657318, + "balance_loss_mlp": 1.02373588, + "epoch": 0.4655343454080866, + "flos": 20704836303360.0, + "grad_norm": 1.6746859665906257, + "language_loss": 0.72524893, + "learning_rate": 2.3189643983228555e-06, + "loss": 0.74614853, + "num_input_tokens_seen": 166231350, + "step": 7743, + "time_per_iteration": 2.652569532394409 + }, + { + "auxiliary_loss_clip": 0.01044592, + "auxiliary_loss_mlp": 0.0103219, + "balance_loss_clip": 1.02790868, + "balance_loss_mlp": 1.01873708, + "epoch": 0.46559446866075455, + "flos": 18989850470400.0, + "grad_norm": 2.1091027245271765, + "language_loss": 0.71062422, + "learning_rate": 2.318579915392483e-06, + "loss": 0.73139203, + "num_input_tokens_seen": 166250530, + "step": 7744, + "time_per_iteration": 2.600905418395996 + }, + { + "auxiliary_loss_clip": 0.01025623, + "auxiliary_loss_mlp": 0.01028749, + "balance_loss_clip": 1.02672923, + "balance_loss_mlp": 1.01730466, + "epoch": 0.4656545919134225, + "flos": 34496364643200.0, + "grad_norm": 1.5806195803280665, + "language_loss": 0.85148442, + "learning_rate": 2.31819542038153e-06, + "loss": 0.87202811, + "num_input_tokens_seen": 166272545, + "step": 7745, + "time_per_iteration": 2.7655909061431885 + }, + { + "auxiliary_loss_clip": 0.01060825, + "auxiliary_loss_mlp": 0.01036653, + "balance_loss_clip": 1.02786112, + "balance_loss_mlp": 1.02419543, + "epoch": 0.4657147151660905, + "flos": 24310554238080.0, + "grad_norm": 1.4476731368043363, + "language_loss": 0.73104072, + "learning_rate": 2.317810913304574e-06, + "loss": 0.75201547, + "num_input_tokens_seen": 166292135, + "step": 7746, + "time_per_iteration": 4.305521726608276 + }, + { + "auxiliary_loss_clip": 0.01062268, + "auxiliary_loss_mlp": 0.01037609, + "balance_loss_clip": 1.02875376, + "balance_loss_mlp": 1.02587867, + "epoch": 0.46577483841875844, + "flos": 58795139220480.0, + "grad_norm": 1.6486621976339928, + "language_loss": 0.69702697, + "learning_rate": 2.3174263941761963e-06, + "loss": 0.7180258, + "num_input_tokens_seen": 166316710, + "step": 7747, + "time_per_iteration": 3.0570013523101807 + }, + { + "auxiliary_loss_clip": 0.01032585, + "auxiliary_loss_mlp": 0.01038091, + "balance_loss_clip": 1.02368855, + "balance_loss_mlp": 1.02454925, + "epoch": 0.4658349616714264, + "flos": 31321969223040.0, + "grad_norm": 1.5143295118552524, + "language_loss": 0.67644089, + "learning_rate": 2.317041863010978e-06, + "loss": 0.69714773, + "num_input_tokens_seen": 166338535, + "step": 7748, + "time_per_iteration": 2.753586769104004 + }, + { + "auxiliary_loss_clip": 0.01046238, + "auxiliary_loss_mlp": 0.01036701, + "balance_loss_clip": 1.02943039, + "balance_loss_mlp": 1.02250957, + "epoch": 0.46589508492409437, + "flos": 14860338456960.0, + "grad_norm": 3.449158653329985, + "language_loss": 0.63473618, + "learning_rate": 2.3166573198235007e-06, + "loss": 0.65556562, + "num_input_tokens_seen": 166355540, + "step": 7749, + "time_per_iteration": 2.691981077194214 + }, + { + "auxiliary_loss_clip": 0.01064413, + "auxiliary_loss_mlp": 0.01031012, + "balance_loss_clip": 1.02832782, + "balance_loss_mlp": 1.0171603, + "epoch": 0.46595520817676234, + "flos": 12895989431040.0, + "grad_norm": 1.9949394439348027, + "language_loss": 0.74112618, + "learning_rate": 2.3162727646283456e-06, + "loss": 0.76208043, + "num_input_tokens_seen": 166372635, + "step": 7750, + "time_per_iteration": 2.6320018768310547 + }, + { + "auxiliary_loss_clip": 0.01052615, + "auxiliary_loss_mlp": 0.01027482, + "balance_loss_clip": 1.0292697, + "balance_loss_mlp": 1.01390409, + "epoch": 0.46601533142943036, + "flos": 32854169721600.0, + "grad_norm": 2.4483353003091164, + "language_loss": 0.74331903, + "learning_rate": 2.3158881974400963e-06, + "loss": 0.76411998, + "num_input_tokens_seen": 166393175, + "step": 7751, + "time_per_iteration": 2.7381296157836914 + }, + { + "auxiliary_loss_clip": 0.01049453, + "auxiliary_loss_mlp": 0.01036966, + "balance_loss_clip": 1.03054512, + "balance_loss_mlp": 1.02310836, + "epoch": 0.4660754546820983, + "flos": 19967517826560.0, + "grad_norm": 6.130902009520449, + "language_loss": 0.73526704, + "learning_rate": 2.3155036182733345e-06, + "loss": 0.75613117, + "num_input_tokens_seen": 166408630, + "step": 7752, + "time_per_iteration": 2.7113161087036133 + }, + { + "auxiliary_loss_clip": 0.01057305, + "auxiliary_loss_mlp": 0.01040439, + "balance_loss_clip": 1.03076756, + "balance_loss_mlp": 1.02675962, + "epoch": 0.4661355779347663, + "flos": 26688164215680.0, + "grad_norm": 2.670040469599158, + "language_loss": 0.6900636, + "learning_rate": 2.315119027142644e-06, + "loss": 0.71104103, + "num_input_tokens_seen": 166428170, + "step": 7753, + "time_per_iteration": 2.7802162170410156 + }, + { + "auxiliary_loss_clip": 0.01049854, + "auxiliary_loss_mlp": 0.01032903, + "balance_loss_clip": 1.02753496, + "balance_loss_mlp": 1.02025497, + "epoch": 0.46619570118743425, + "flos": 20959442881920.0, + "grad_norm": 2.4886962934532626, + "language_loss": 0.73155522, + "learning_rate": 2.3147344240626076e-06, + "loss": 0.75238281, + "num_input_tokens_seen": 166446705, + "step": 7754, + "time_per_iteration": 2.6474523544311523 + }, + { + "auxiliary_loss_clip": 0.01054871, + "auxiliary_loss_mlp": 0.01029471, + "balance_loss_clip": 1.02719998, + "balance_loss_mlp": 1.01653099, + "epoch": 0.4662558244401022, + "flos": 24426079355520.0, + "grad_norm": 1.7713447505250415, + "language_loss": 0.78817511, + "learning_rate": 2.3143498090478114e-06, + "loss": 0.80901849, + "num_input_tokens_seen": 166466750, + "step": 7755, + "time_per_iteration": 2.722721576690674 + }, + { + "auxiliary_loss_clip": 0.01060244, + "auxiliary_loss_mlp": 0.01029184, + "balance_loss_clip": 1.02697754, + "balance_loss_mlp": 1.01724505, + "epoch": 0.4663159476927702, + "flos": 20595452411520.0, + "grad_norm": 1.7686143117367883, + "language_loss": 0.72333431, + "learning_rate": 2.3139651821128382e-06, + "loss": 0.7442286, + "num_input_tokens_seen": 166485400, + "step": 7756, + "time_per_iteration": 2.602850914001465 + }, + { + "auxiliary_loss_clip": 0.0105971, + "auxiliary_loss_mlp": 0.01028454, + "balance_loss_clip": 1.02536011, + "balance_loss_mlp": 1.01665258, + "epoch": 0.46637607094543815, + "flos": 25661872823040.0, + "grad_norm": 1.8311901997747075, + "language_loss": 0.7818644, + "learning_rate": 2.313580543272274e-06, + "loss": 0.80274606, + "num_input_tokens_seen": 166505730, + "step": 7757, + "time_per_iteration": 2.6140427589416504 + }, + { + "auxiliary_loss_clip": 0.01033588, + "auxiliary_loss_mlp": 0.01029755, + "balance_loss_clip": 1.02488172, + "balance_loss_mlp": 1.01753032, + "epoch": 0.4664361941981061, + "flos": 24273853516800.0, + "grad_norm": 1.7665212849716265, + "language_loss": 0.66501975, + "learning_rate": 2.313195892540705e-06, + "loss": 0.68565315, + "num_input_tokens_seen": 166523770, + "step": 7758, + "time_per_iteration": 2.682888984680176 + }, + { + "auxiliary_loss_clip": 0.01042384, + "auxiliary_loss_mlp": 0.01039013, + "balance_loss_clip": 1.02532923, + "balance_loss_mlp": 1.02616239, + "epoch": 0.4664963174507741, + "flos": 18405871153920.0, + "grad_norm": 1.6627846483108741, + "language_loss": 0.74817747, + "learning_rate": 2.3128112299327147e-06, + "loss": 0.76899147, + "num_input_tokens_seen": 166542935, + "step": 7759, + "time_per_iteration": 2.7190942764282227 + }, + { + "auxiliary_loss_clip": 0.01052858, + "auxiliary_loss_mlp": 0.01034939, + "balance_loss_clip": 1.02717638, + "balance_loss_mlp": 1.0229404, + "epoch": 0.46655644070344204, + "flos": 22455122227200.0, + "grad_norm": 1.451128274221469, + "language_loss": 0.77570558, + "learning_rate": 2.312426555462893e-06, + "loss": 0.79658353, + "num_input_tokens_seen": 166563935, + "step": 7760, + "time_per_iteration": 2.6681971549987793 + }, + { + "auxiliary_loss_clip": 0.01051586, + "auxiliary_loss_mlp": 0.01031758, + "balance_loss_clip": 1.02796173, + "balance_loss_mlp": 1.01896095, + "epoch": 0.46661656395611, + "flos": 13808407731840.0, + "grad_norm": 1.7410504729041987, + "language_loss": 0.74074185, + "learning_rate": 2.3120418691458237e-06, + "loss": 0.76157528, + "num_input_tokens_seen": 166582175, + "step": 7761, + "time_per_iteration": 2.742976665496826 + }, + { + "auxiliary_loss_clip": 0.01065357, + "auxiliary_loss_mlp": 0.01037827, + "balance_loss_clip": 1.02830529, + "balance_loss_mlp": 1.02333093, + "epoch": 0.466676687208778, + "flos": 21652159645440.0, + "grad_norm": 1.5834873382332577, + "language_loss": 0.78660899, + "learning_rate": 2.3116571709960956e-06, + "loss": 0.80764079, + "num_input_tokens_seen": 166601870, + "step": 7762, + "time_per_iteration": 2.626112937927246 + }, + { + "auxiliary_loss_clip": 0.01000369, + "auxiliary_loss_mlp": 0.01005218, + "balance_loss_clip": 1.00242496, + "balance_loss_mlp": 1.00391889, + "epoch": 0.46673681046144594, + "flos": 68534259068160.0, + "grad_norm": 0.7925062371122769, + "language_loss": 0.59827936, + "learning_rate": 2.311272461028297e-06, + "loss": 0.61833519, + "num_input_tokens_seen": 166668960, + "step": 7763, + "time_per_iteration": 3.2250521183013916 + }, + { + "auxiliary_loss_clip": 0.01038125, + "auxiliary_loss_mlp": 0.01041806, + "balance_loss_clip": 1.02500653, + "balance_loss_mlp": 1.02730417, + "epoch": 0.46679693371411396, + "flos": 15814449469440.0, + "grad_norm": 2.18204811055715, + "language_loss": 0.79235935, + "learning_rate": 2.3108877392570146e-06, + "loss": 0.81315863, + "num_input_tokens_seen": 166686110, + "step": 7764, + "time_per_iteration": 2.6888856887817383 + }, + { + "auxiliary_loss_clip": 0.01047358, + "auxiliary_loss_mlp": 0.0103267, + "balance_loss_clip": 1.03202105, + "balance_loss_mlp": 1.02126145, + "epoch": 0.4668570569667819, + "flos": 18514572687360.0, + "grad_norm": 1.757948911334952, + "language_loss": 0.71853465, + "learning_rate": 2.310503005696839e-06, + "loss": 0.73933494, + "num_input_tokens_seen": 166703930, + "step": 7765, + "time_per_iteration": 2.660825490951538 + }, + { + "auxiliary_loss_clip": 0.01045282, + "auxiliary_loss_mlp": 0.01036845, + "balance_loss_clip": 1.02914119, + "balance_loss_mlp": 1.0233326, + "epoch": 0.4669171802194499, + "flos": 19206643006080.0, + "grad_norm": 2.304307330205445, + "language_loss": 0.77544492, + "learning_rate": 2.3101182603623576e-06, + "loss": 0.7962662, + "num_input_tokens_seen": 166719940, + "step": 7766, + "time_per_iteration": 2.611103057861328 + }, + { + "auxiliary_loss_clip": 0.01054456, + "auxiliary_loss_mlp": 0.01032122, + "balance_loss_clip": 1.0247333, + "balance_loss_mlp": 1.0195694, + "epoch": 0.46697730347211786, + "flos": 12276135406080.0, + "grad_norm": 2.04953438627145, + "language_loss": 0.64876431, + "learning_rate": 2.3097335032681607e-06, + "loss": 0.66963005, + "num_input_tokens_seen": 166738285, + "step": 7767, + "time_per_iteration": 2.577256202697754 + }, + { + "auxiliary_loss_clip": 0.01065121, + "auxiliary_loss_mlp": 0.01039418, + "balance_loss_clip": 1.0300281, + "balance_loss_mlp": 1.02705574, + "epoch": 0.4670374267247858, + "flos": 23586739274880.0, + "grad_norm": 2.0470950589, + "language_loss": 0.74431437, + "learning_rate": 2.3093487344288393e-06, + "loss": 0.76535976, + "num_input_tokens_seen": 166758170, + "step": 7768, + "time_per_iteration": 2.762491226196289 + }, + { + "auxiliary_loss_clip": 0.01053933, + "auxiliary_loss_mlp": 0.01031648, + "balance_loss_clip": 1.02917874, + "balance_loss_mlp": 1.01942897, + "epoch": 0.4670975499774538, + "flos": 15991093578240.0, + "grad_norm": 1.6739799753573794, + "language_loss": 0.70876116, + "learning_rate": 2.308963953858982e-06, + "loss": 0.729617, + "num_input_tokens_seen": 166775750, + "step": 7769, + "time_per_iteration": 2.7695517539978027 + }, + { + "auxiliary_loss_clip": 0.01072429, + "auxiliary_loss_mlp": 0.01034612, + "balance_loss_clip": 1.02760983, + "balance_loss_mlp": 1.0224762, + "epoch": 0.46715767323012175, + "flos": 15377596260480.0, + "grad_norm": 12.275614285182732, + "language_loss": 0.81408548, + "learning_rate": 2.3085791615731803e-06, + "loss": 0.83515584, + "num_input_tokens_seen": 166791720, + "step": 7770, + "time_per_iteration": 2.57296085357666 + }, + { + "auxiliary_loss_clip": 0.01009718, + "auxiliary_loss_mlp": 0.0100241, + "balance_loss_clip": 1.00184989, + "balance_loss_mlp": 1.00096154, + "epoch": 0.4672177964827897, + "flos": 60252217401600.0, + "grad_norm": 0.8489617035713688, + "language_loss": 0.55651355, + "learning_rate": 2.3081943575860265e-06, + "loss": 0.57663476, + "num_input_tokens_seen": 166856360, + "step": 7771, + "time_per_iteration": 3.153867721557617 + }, + { + "auxiliary_loss_clip": 0.01056036, + "auxiliary_loss_mlp": 0.00747504, + "balance_loss_clip": 1.02468431, + "balance_loss_mlp": 1.00033569, + "epoch": 0.4672779197354577, + "flos": 27636134002560.0, + "grad_norm": 2.4790087712563404, + "language_loss": 0.66062641, + "learning_rate": 2.3078095419121117e-06, + "loss": 0.6786617, + "num_input_tokens_seen": 166875925, + "step": 7772, + "time_per_iteration": 2.6525330543518066 + }, + { + "auxiliary_loss_clip": 0.01061795, + "auxiliary_loss_mlp": 0.0103361, + "balance_loss_clip": 1.02808738, + "balance_loss_mlp": 1.02150404, + "epoch": 0.46733804298812565, + "flos": 31394257344000.0, + "grad_norm": 1.8947022833858962, + "language_loss": 0.63377237, + "learning_rate": 2.3074247145660283e-06, + "loss": 0.65472639, + "num_input_tokens_seen": 166896520, + "step": 7773, + "time_per_iteration": 2.7362070083618164 + }, + { + "auxiliary_loss_clip": 0.01053063, + "auxiliary_loss_mlp": 0.01035749, + "balance_loss_clip": 1.02648687, + "balance_loss_mlp": 1.0225879, + "epoch": 0.4673981662407936, + "flos": 19500607912320.0, + "grad_norm": 2.2030400361792144, + "language_loss": 0.80174649, + "learning_rate": 2.3070398755623685e-06, + "loss": 0.82263458, + "num_input_tokens_seen": 166915370, + "step": 7774, + "time_per_iteration": 2.607931613922119 + }, + { + "auxiliary_loss_clip": 0.01038065, + "auxiliary_loss_mlp": 0.01027186, + "balance_loss_clip": 1.0271101, + "balance_loss_mlp": 1.01448417, + "epoch": 0.4674582894934616, + "flos": 20521835487360.0, + "grad_norm": 1.580082362715298, + "language_loss": 0.77826959, + "learning_rate": 2.306655024915726e-06, + "loss": 0.79892206, + "num_input_tokens_seen": 166934875, + "step": 7775, + "time_per_iteration": 2.6594960689544678 + }, + { + "auxiliary_loss_clip": 0.01041602, + "auxiliary_loss_mlp": 0.01033535, + "balance_loss_clip": 1.02523017, + "balance_loss_mlp": 1.02107787, + "epoch": 0.46751841274612954, + "flos": 22090952188800.0, + "grad_norm": 1.9452362322351917, + "language_loss": 0.69993567, + "learning_rate": 2.306270162640694e-06, + "loss": 0.72068703, + "num_input_tokens_seen": 166954285, + "step": 7776, + "time_per_iteration": 2.6149110794067383 + }, + { + "auxiliary_loss_clip": 0.01064637, + "auxiliary_loss_mlp": 0.01036783, + "balance_loss_clip": 1.02960217, + "balance_loss_mlp": 1.02544665, + "epoch": 0.46757853599879756, + "flos": 26980082046720.0, + "grad_norm": 1.5428974527004062, + "language_loss": 0.73632532, + "learning_rate": 2.3058852887518678e-06, + "loss": 0.75733954, + "num_input_tokens_seen": 166975975, + "step": 7777, + "time_per_iteration": 2.592891216278076 + }, + { + "auxiliary_loss_clip": 0.01060352, + "auxiliary_loss_mlp": 0.01029282, + "balance_loss_clip": 1.02597141, + "balance_loss_mlp": 1.01709926, + "epoch": 0.4676386592514655, + "flos": 24134053783680.0, + "grad_norm": 2.3422314357283267, + "language_loss": 0.69510925, + "learning_rate": 2.3055004032638394e-06, + "loss": 0.71600562, + "num_input_tokens_seen": 166996140, + "step": 7778, + "time_per_iteration": 2.605797529220581 + }, + { + "auxiliary_loss_clip": 0.01060836, + "auxiliary_loss_mlp": 0.01040167, + "balance_loss_clip": 1.02654004, + "balance_loss_mlp": 1.02688086, + "epoch": 0.4676987825041335, + "flos": 25483720343040.0, + "grad_norm": 1.6985485008785943, + "language_loss": 0.73512006, + "learning_rate": 2.305115506191206e-06, + "loss": 0.75613004, + "num_input_tokens_seen": 167016105, + "step": 7779, + "time_per_iteration": 2.586927652359009 + }, + { + "auxiliary_loss_clip": 0.01026946, + "auxiliary_loss_mlp": 0.01040414, + "balance_loss_clip": 1.02444506, + "balance_loss_mlp": 1.02904105, + "epoch": 0.46775890575680146, + "flos": 21945298538880.0, + "grad_norm": 1.503568204097025, + "language_loss": 0.72494024, + "learning_rate": 2.304730597548562e-06, + "loss": 0.74561387, + "num_input_tokens_seen": 167036185, + "step": 7780, + "time_per_iteration": 4.3163793087005615 + }, + { + "auxiliary_loss_clip": 0.01037676, + "auxiliary_loss_mlp": 0.01044679, + "balance_loss_clip": 1.02216911, + "balance_loss_mlp": 1.03012919, + "epoch": 0.4678190290094694, + "flos": 25228395492480.0, + "grad_norm": 2.0535057363712714, + "language_loss": 0.74453795, + "learning_rate": 2.3043456773505023e-06, + "loss": 0.76536143, + "num_input_tokens_seen": 167054515, + "step": 7781, + "time_per_iteration": 2.6204497814178467 + }, + { + "auxiliary_loss_clip": 0.01063011, + "auxiliary_loss_mlp": 0.01032731, + "balance_loss_clip": 1.02686381, + "balance_loss_mlp": 1.02027392, + "epoch": 0.4678791522621374, + "flos": 32268358811520.0, + "grad_norm": 1.699769102173539, + "language_loss": 0.63513541, + "learning_rate": 2.3039607456116252e-06, + "loss": 0.65609288, + "num_input_tokens_seen": 167077245, + "step": 7782, + "time_per_iteration": 2.6682868003845215 + }, + { + "auxiliary_loss_clip": 0.01053097, + "auxiliary_loss_mlp": 0.0104147, + "balance_loss_clip": 1.02612305, + "balance_loss_mlp": 1.02928078, + "epoch": 0.46793927551480535, + "flos": 27046480337280.0, + "grad_norm": 1.8970469594215784, + "language_loss": 0.63405514, + "learning_rate": 2.3035758023465254e-06, + "loss": 0.65500081, + "num_input_tokens_seen": 167097235, + "step": 7783, + "time_per_iteration": 2.6268906593322754 + }, + { + "auxiliary_loss_clip": 0.01065985, + "auxiliary_loss_mlp": 0.0103898, + "balance_loss_clip": 1.02855873, + "balance_loss_mlp": 1.02536082, + "epoch": 0.4679993987674733, + "flos": 17457398576640.0, + "grad_norm": 2.432503828070492, + "language_loss": 0.67395931, + "learning_rate": 2.303190847569801e-06, + "loss": 0.69500899, + "num_input_tokens_seen": 167113155, + "step": 7784, + "time_per_iteration": 2.5458173751831055 + }, + { + "auxiliary_loss_clip": 0.01051655, + "auxiliary_loss_mlp": 0.01028844, + "balance_loss_clip": 1.0283587, + "balance_loss_mlp": 1.01727438, + "epoch": 0.4680595220201413, + "flos": 17165121609600.0, + "grad_norm": 2.3285079375538635, + "language_loss": 0.846771, + "learning_rate": 2.3028058812960497e-06, + "loss": 0.867576, + "num_input_tokens_seen": 167131765, + "step": 7785, + "time_per_iteration": 2.689122200012207 + }, + { + "auxiliary_loss_clip": 0.01043209, + "auxiliary_loss_mlp": 0.01032146, + "balance_loss_clip": 1.02755511, + "balance_loss_mlp": 1.0195576, + "epoch": 0.46811964527280925, + "flos": 11327591001600.0, + "grad_norm": 2.115666742760749, + "language_loss": 0.76962721, + "learning_rate": 2.3024209035398678e-06, + "loss": 0.79038072, + "num_input_tokens_seen": 167149030, + "step": 7786, + "time_per_iteration": 2.6341497898101807 + }, + { + "auxiliary_loss_clip": 0.01056788, + "auxiliary_loss_mlp": 0.01029137, + "balance_loss_clip": 1.02485824, + "balance_loss_mlp": 1.01806808, + "epoch": 0.4681797685254772, + "flos": 24278809593600.0, + "grad_norm": 3.294439599143166, + "language_loss": 0.73809916, + "learning_rate": 2.302035914315856e-06, + "loss": 0.7589584, + "num_input_tokens_seen": 167167375, + "step": 7787, + "time_per_iteration": 2.6396965980529785 + }, + { + "auxiliary_loss_clip": 0.01041198, + "auxiliary_loss_mlp": 0.01035483, + "balance_loss_clip": 1.02513504, + "balance_loss_mlp": 1.02333593, + "epoch": 0.4682398917781452, + "flos": 31650372293760.0, + "grad_norm": 2.114150386363484, + "language_loss": 0.65630496, + "learning_rate": 2.3016509136386116e-06, + "loss": 0.67707181, + "num_input_tokens_seen": 167188065, + "step": 7788, + "time_per_iteration": 2.7534966468811035 + }, + { + "auxiliary_loss_clip": 0.01058927, + "auxiliary_loss_mlp": 0.01028282, + "balance_loss_clip": 1.02560782, + "balance_loss_mlp": 1.01677799, + "epoch": 0.46830001503081314, + "flos": 28110765340800.0, + "grad_norm": 1.5486618895477777, + "language_loss": 0.63868928, + "learning_rate": 2.3012659015227343e-06, + "loss": 0.6595614, + "num_input_tokens_seen": 167209675, + "step": 7789, + "time_per_iteration": 2.7011945247650146 + }, + { + "auxiliary_loss_clip": 0.01002302, + "auxiliary_loss_mlp": 0.0100042, + "balance_loss_clip": 1.00407159, + "balance_loss_mlp": 0.99903709, + "epoch": 0.4683601382834811, + "flos": 57881718316800.0, + "grad_norm": 0.6946436859783881, + "language_loss": 0.61982286, + "learning_rate": 2.300880877982825e-06, + "loss": 0.63985008, + "num_input_tokens_seen": 167273940, + "step": 7790, + "time_per_iteration": 4.7978315353393555 + }, + { + "auxiliary_loss_clip": 0.01036128, + "auxiliary_loss_mlp": 0.01034085, + "balance_loss_clip": 1.02844644, + "balance_loss_mlp": 1.02129996, + "epoch": 0.46842026153614913, + "flos": 21871933009920.0, + "grad_norm": 1.8666286655851374, + "language_loss": 0.79430419, + "learning_rate": 2.3004958430334808e-06, + "loss": 0.81500632, + "num_input_tokens_seen": 167292730, + "step": 7791, + "time_per_iteration": 2.7315828800201416 + }, + { + "auxiliary_loss_clip": 0.01061398, + "auxiliary_loss_mlp": 0.01032626, + "balance_loss_clip": 1.02731991, + "balance_loss_mlp": 1.02096784, + "epoch": 0.4684803847888171, + "flos": 24900818434560.0, + "grad_norm": 1.579210636924046, + "language_loss": 0.75083566, + "learning_rate": 2.3001107966893052e-06, + "loss": 0.77177596, + "num_input_tokens_seen": 167313460, + "step": 7792, + "time_per_iteration": 2.656393051147461 + }, + { + "auxiliary_loss_clip": 0.01035357, + "auxiliary_loss_mlp": 0.01039029, + "balance_loss_clip": 1.0228101, + "balance_loss_mlp": 1.0263803, + "epoch": 0.46854050804148506, + "flos": 26251670142720.0, + "grad_norm": 1.49383682056823, + "language_loss": 0.68164057, + "learning_rate": 2.299725738964898e-06, + "loss": 0.70238441, + "num_input_tokens_seen": 167335385, + "step": 7793, + "time_per_iteration": 4.379226446151733 + }, + { + "auxiliary_loss_clip": 0.01061117, + "auxiliary_loss_mlp": 0.00747459, + "balance_loss_clip": 1.02855945, + "balance_loss_mlp": 1.00042498, + "epoch": 0.468600631294153, + "flos": 21579799697280.0, + "grad_norm": 1.5221300969957232, + "language_loss": 0.7358045, + "learning_rate": 2.2993406698748607e-06, + "loss": 0.75389028, + "num_input_tokens_seen": 167353625, + "step": 7794, + "time_per_iteration": 2.5976669788360596 + }, + { + "auxiliary_loss_clip": 0.01043432, + "auxiliary_loss_mlp": 0.01032809, + "balance_loss_clip": 1.02897, + "balance_loss_mlp": 1.02008986, + "epoch": 0.468660754546821, + "flos": 25885632597120.0, + "grad_norm": 1.5220885790092924, + "language_loss": 0.63554418, + "learning_rate": 2.2989555894337953e-06, + "loss": 0.65630651, + "num_input_tokens_seen": 167374565, + "step": 7795, + "time_per_iteration": 2.713869571685791 + }, + { + "auxiliary_loss_clip": 0.01028035, + "auxiliary_loss_mlp": 0.01026173, + "balance_loss_clip": 1.02319074, + "balance_loss_mlp": 1.01384699, + "epoch": 0.46872087779948896, + "flos": 35475001666560.0, + "grad_norm": 1.90587985380258, + "language_loss": 0.67933768, + "learning_rate": 2.298570497656304e-06, + "loss": 0.69987977, + "num_input_tokens_seen": 167395010, + "step": 7796, + "time_per_iteration": 2.779754400253296 + }, + { + "auxiliary_loss_clip": 0.0107147, + "auxiliary_loss_mlp": 0.00747521, + "balance_loss_clip": 1.02774358, + "balance_loss_mlp": 1.00041974, + "epoch": 0.4687810010521569, + "flos": 26396425952640.0, + "grad_norm": 1.722264357345542, + "language_loss": 0.70127231, + "learning_rate": 2.2981853945569894e-06, + "loss": 0.71946222, + "num_input_tokens_seen": 167415285, + "step": 7797, + "time_per_iteration": 2.7096190452575684 + }, + { + "auxiliary_loss_clip": 0.01046336, + "auxiliary_loss_mlp": 0.01029077, + "balance_loss_clip": 1.02631068, + "balance_loss_mlp": 1.01586866, + "epoch": 0.4688411243048249, + "flos": 19972761212160.0, + "grad_norm": 2.0384112138561443, + "language_loss": 0.6744076, + "learning_rate": 2.297800280150454e-06, + "loss": 0.6951617, + "num_input_tokens_seen": 167432405, + "step": 7798, + "time_per_iteration": 2.597982406616211 + }, + { + "auxiliary_loss_clip": 0.0100126, + "auxiliary_loss_mlp": 0.01003345, + "balance_loss_clip": 1.00285244, + "balance_loss_mlp": 1.00209355, + "epoch": 0.46890124755749285, + "flos": 63977015900160.0, + "grad_norm": 0.9239904760913773, + "language_loss": 0.6455636, + "learning_rate": 2.2974151544513033e-06, + "loss": 0.6656096, + "num_input_tokens_seen": 167499365, + "step": 7799, + "time_per_iteration": 3.3060812950134277 + }, + { + "auxiliary_loss_clip": 0.01043426, + "auxiliary_loss_mlp": 0.01026607, + "balance_loss_clip": 1.02917099, + "balance_loss_mlp": 1.01511574, + "epoch": 0.4689613708101608, + "flos": 23768985905280.0, + "grad_norm": 1.3752406695661537, + "language_loss": 0.72195363, + "learning_rate": 2.2970300174741395e-06, + "loss": 0.74265397, + "num_input_tokens_seen": 167520390, + "step": 7800, + "time_per_iteration": 2.652834892272949 + }, + { + "auxiliary_loss_clip": 0.01068874, + "auxiliary_loss_mlp": 0.01029181, + "balance_loss_clip": 1.02787399, + "balance_loss_mlp": 1.01875663, + "epoch": 0.4690214940628288, + "flos": 24788705109120.0, + "grad_norm": 1.6837437103279171, + "language_loss": 0.72412229, + "learning_rate": 2.296644869233568e-06, + "loss": 0.74510276, + "num_input_tokens_seen": 167539865, + "step": 7801, + "time_per_iteration": 2.601280689239502 + }, + { + "auxiliary_loss_clip": 0.01034552, + "auxiliary_loss_mlp": 0.010337, + "balance_loss_clip": 1.02350521, + "balance_loss_mlp": 1.02064705, + "epoch": 0.46908161731549675, + "flos": 18077324428800.0, + "grad_norm": 1.9298290061408259, + "language_loss": 0.62801301, + "learning_rate": 2.2962597097441936e-06, + "loss": 0.64869553, + "num_input_tokens_seen": 167558190, + "step": 7802, + "time_per_iteration": 2.7581491470336914 + }, + { + "auxiliary_loss_clip": 0.01071565, + "auxiliary_loss_mlp": 0.01035926, + "balance_loss_clip": 1.02650666, + "balance_loss_mlp": 1.02428484, + "epoch": 0.4691417405681647, + "flos": 25703350053120.0, + "grad_norm": 3.936429674332843, + "language_loss": 0.73966789, + "learning_rate": 2.2958745390206206e-06, + "loss": 0.76074278, + "num_input_tokens_seen": 167577685, + "step": 7803, + "time_per_iteration": 2.619442939758301 + }, + { + "auxiliary_loss_clip": 0.01049912, + "auxiliary_loss_mlp": 0.00747455, + "balance_loss_clip": 1.0280633, + "balance_loss_mlp": 1.00045252, + "epoch": 0.46920186382083273, + "flos": 17457039440640.0, + "grad_norm": 1.9478838945280688, + "language_loss": 0.77667648, + "learning_rate": 2.2954893570774558e-06, + "loss": 0.7946502, + "num_input_tokens_seen": 167596390, + "step": 7804, + "time_per_iteration": 2.6040091514587402 + }, + { + "auxiliary_loss_clip": 0.01051773, + "auxiliary_loss_mlp": 0.01029697, + "balance_loss_clip": 1.02752638, + "balance_loss_mlp": 1.01818156, + "epoch": 0.4692619870735007, + "flos": 20339445202560.0, + "grad_norm": 2.0713717548039585, + "language_loss": 0.77448368, + "learning_rate": 2.295104163929305e-06, + "loss": 0.7952984, + "num_input_tokens_seen": 167614980, + "step": 7805, + "time_per_iteration": 2.6932663917541504 + }, + { + "auxiliary_loss_clip": 0.01078986, + "auxiliary_loss_mlp": 0.01041198, + "balance_loss_clip": 1.03093028, + "balance_loss_mlp": 1.02812099, + "epoch": 0.46932211032616866, + "flos": 29496558003840.0, + "grad_norm": 1.8075267347614496, + "language_loss": 0.82566726, + "learning_rate": 2.2947189595907742e-06, + "loss": 0.84686911, + "num_input_tokens_seen": 167635895, + "step": 7806, + "time_per_iteration": 2.629838228225708 + }, + { + "auxiliary_loss_clip": 0.01051844, + "auxiliary_loss_mlp": 0.01031177, + "balance_loss_clip": 1.02776861, + "balance_loss_mlp": 1.0190537, + "epoch": 0.4693822335788366, + "flos": 36211242735360.0, + "grad_norm": 1.845249531132688, + "language_loss": 0.77184939, + "learning_rate": 2.294333744076472e-06, + "loss": 0.79267955, + "num_input_tokens_seen": 167657440, + "step": 7807, + "time_per_iteration": 2.7399184703826904 + }, + { + "auxiliary_loss_clip": 0.01052265, + "auxiliary_loss_mlp": 0.01033111, + "balance_loss_clip": 1.02815688, + "balance_loss_mlp": 1.02100527, + "epoch": 0.4694423568315046, + "flos": 20338978325760.0, + "grad_norm": 2.2800275428541332, + "language_loss": 0.51712841, + "learning_rate": 2.2939485174010035e-06, + "loss": 0.53798217, + "num_input_tokens_seen": 167675025, + "step": 7808, + "time_per_iteration": 2.6201813220977783 + }, + { + "auxiliary_loss_clip": 0.00971541, + "auxiliary_loss_mlp": 0.0100222, + "balance_loss_clip": 1.00398755, + "balance_loss_mlp": 1.00077724, + "epoch": 0.46950248008417256, + "flos": 64326353621760.0, + "grad_norm": 0.781685014663367, + "language_loss": 0.57795066, + "learning_rate": 2.293563279578978e-06, + "loss": 0.59768832, + "num_input_tokens_seen": 167729635, + "step": 7809, + "time_per_iteration": 3.1519737243652344 + }, + { + "auxiliary_loss_clip": 0.01028278, + "auxiliary_loss_mlp": 0.01037073, + "balance_loss_clip": 1.02803314, + "balance_loss_mlp": 1.0248245, + "epoch": 0.4695626033368405, + "flos": 19200106730880.0, + "grad_norm": 2.028999099179452, + "language_loss": 0.71452302, + "learning_rate": 2.2931780306250045e-06, + "loss": 0.73517656, + "num_input_tokens_seen": 167745135, + "step": 7810, + "time_per_iteration": 2.7350502014160156 + }, + { + "auxiliary_loss_clip": 0.01063018, + "auxiliary_loss_mlp": 0.01034901, + "balance_loss_clip": 1.02822244, + "balance_loss_mlp": 1.0233314, + "epoch": 0.4696227265895085, + "flos": 23002436736000.0, + "grad_norm": 2.1429341900128014, + "language_loss": 0.81032586, + "learning_rate": 2.29279277055369e-06, + "loss": 0.83130509, + "num_input_tokens_seen": 167763875, + "step": 7811, + "time_per_iteration": 2.621001720428467 + }, + { + "auxiliary_loss_clip": 0.01066394, + "auxiliary_loss_mlp": 0.01036527, + "balance_loss_clip": 1.03145361, + "balance_loss_mlp": 1.02417159, + "epoch": 0.46968284984217645, + "flos": 21870855601920.0, + "grad_norm": 1.559671341370385, + "language_loss": 0.80808592, + "learning_rate": 2.292407499379644e-06, + "loss": 0.82911509, + "num_input_tokens_seen": 167784895, + "step": 7812, + "time_per_iteration": 2.7183334827423096 + }, + { + "auxiliary_loss_clip": 0.01013004, + "auxiliary_loss_mlp": 0.01030385, + "balance_loss_clip": 1.02307749, + "balance_loss_mlp": 1.01858306, + "epoch": 0.4697429730948444, + "flos": 19974987855360.0, + "grad_norm": 1.7122638242791246, + "language_loss": 0.74344707, + "learning_rate": 2.292022217117477e-06, + "loss": 0.76388097, + "num_input_tokens_seen": 167803185, + "step": 7813, + "time_per_iteration": 2.769115686416626 + }, + { + "auxiliary_loss_clip": 0.01050209, + "auxiliary_loss_mlp": 0.0102704, + "balance_loss_clip": 1.026649, + "balance_loss_mlp": 1.01549494, + "epoch": 0.4698030963475124, + "flos": 15156206784000.0, + "grad_norm": 2.075607512622169, + "language_loss": 0.84688079, + "learning_rate": 2.291636923781798e-06, + "loss": 0.86765331, + "num_input_tokens_seen": 167816550, + "step": 7814, + "time_per_iteration": 2.635010242462158 + }, + { + "auxiliary_loss_clip": 0.01045776, + "auxiliary_loss_mlp": 0.01031976, + "balance_loss_clip": 1.02458441, + "balance_loss_mlp": 1.02062774, + "epoch": 0.46986321960018035, + "flos": 15151178880000.0, + "grad_norm": 2.072481040492035, + "language_loss": 0.81602019, + "learning_rate": 2.291251619387217e-06, + "loss": 0.83679771, + "num_input_tokens_seen": 167831845, + "step": 7815, + "time_per_iteration": 2.812622308731079 + }, + { + "auxiliary_loss_clip": 0.01032321, + "auxiliary_loss_mlp": 0.01031663, + "balance_loss_clip": 1.02818298, + "balance_loss_mlp": 1.01917601, + "epoch": 0.4699233428528483, + "flos": 23108911626240.0, + "grad_norm": 5.982264544656432, + "language_loss": 0.77590573, + "learning_rate": 2.2908663039483468e-06, + "loss": 0.79654551, + "num_input_tokens_seen": 167850360, + "step": 7816, + "time_per_iteration": 2.805379629135132 + }, + { + "auxiliary_loss_clip": 0.01009796, + "auxiliary_loss_mlp": 0.01002556, + "balance_loss_clip": 1.00224423, + "balance_loss_mlp": 1.00136423, + "epoch": 0.46998346610551633, + "flos": 68105558246400.0, + "grad_norm": 0.8413221132649336, + "language_loss": 0.59095371, + "learning_rate": 2.290480977479796e-06, + "loss": 0.61107719, + "num_input_tokens_seen": 167908660, + "step": 7817, + "time_per_iteration": 3.1392197608947754 + }, + { + "auxiliary_loss_clip": 0.01048145, + "auxiliary_loss_mlp": 0.01031056, + "balance_loss_clip": 1.02709985, + "balance_loss_mlp": 1.01965952, + "epoch": 0.4700435893581843, + "flos": 24129456842880.0, + "grad_norm": 1.7314906195337427, + "language_loss": 0.79360068, + "learning_rate": 2.2900956399961775e-06, + "loss": 0.81439269, + "num_input_tokens_seen": 167927905, + "step": 7818, + "time_per_iteration": 2.7726101875305176 + }, + { + "auxiliary_loss_clip": 0.01070365, + "auxiliary_loss_mlp": 0.01029799, + "balance_loss_clip": 1.02665663, + "balance_loss_mlp": 1.01869488, + "epoch": 0.47010371261085226, + "flos": 20150518642560.0, + "grad_norm": 2.0413555821728653, + "language_loss": 0.84248853, + "learning_rate": 2.289710291512104e-06, + "loss": 0.86349016, + "num_input_tokens_seen": 167945995, + "step": 7819, + "time_per_iteration": 2.5910656452178955 + }, + { + "auxiliary_loss_clip": 0.0103814, + "auxiliary_loss_mlp": 0.0103079, + "balance_loss_clip": 1.02486885, + "balance_loss_mlp": 1.01809371, + "epoch": 0.47016383586352023, + "flos": 15122199582720.0, + "grad_norm": 1.8930512899257144, + "language_loss": 0.76190948, + "learning_rate": 2.289324932042186e-06, + "loss": 0.78259879, + "num_input_tokens_seen": 167963380, + "step": 7820, + "time_per_iteration": 2.661027193069458 + }, + { + "auxiliary_loss_clip": 0.01060945, + "auxiliary_loss_mlp": 0.01034061, + "balance_loss_clip": 1.02929544, + "balance_loss_mlp": 1.02239001, + "epoch": 0.4702239591161882, + "flos": 13552975140480.0, + "grad_norm": 1.7923826320717906, + "language_loss": 0.74060482, + "learning_rate": 2.288939561601039e-06, + "loss": 0.76155484, + "num_input_tokens_seen": 167981740, + "step": 7821, + "time_per_iteration": 2.6618759632110596 + }, + { + "auxiliary_loss_clip": 0.01070685, + "auxiliary_loss_mlp": 0.01036426, + "balance_loss_clip": 1.02794778, + "balance_loss_mlp": 1.0248512, + "epoch": 0.47028408236885616, + "flos": 24276511123200.0, + "grad_norm": 1.7262757258481236, + "language_loss": 0.8924492, + "learning_rate": 2.2885541802032746e-06, + "loss": 0.91352028, + "num_input_tokens_seen": 167999380, + "step": 7822, + "time_per_iteration": 2.702690601348877 + }, + { + "auxiliary_loss_clip": 0.0105939, + "auxiliary_loss_mlp": 0.01031456, + "balance_loss_clip": 1.02734995, + "balance_loss_mlp": 1.02032197, + "epoch": 0.4703442056215241, + "flos": 22856926740480.0, + "grad_norm": 1.4936460561553007, + "language_loss": 0.7965591, + "learning_rate": 2.2881687878635055e-06, + "loss": 0.81746751, + "num_input_tokens_seen": 168018395, + "step": 7823, + "time_per_iteration": 2.6711699962615967 + }, + { + "auxiliary_loss_clip": 0.00993118, + "auxiliary_loss_mlp": 0.01005218, + "balance_loss_clip": 1.00494778, + "balance_loss_mlp": 1.00375223, + "epoch": 0.4704043288741921, + "flos": 69240227950080.0, + "grad_norm": 0.7270824680773919, + "language_loss": 0.56739432, + "learning_rate": 2.2877833845963487e-06, + "loss": 0.58737767, + "num_input_tokens_seen": 168084080, + "step": 7824, + "time_per_iteration": 3.3008294105529785 + }, + { + "auxiliary_loss_clip": 0.01046847, + "auxiliary_loss_mlp": 0.01035536, + "balance_loss_clip": 1.02503824, + "balance_loss_mlp": 1.02259541, + "epoch": 0.47046445212686006, + "flos": 18041090584320.0, + "grad_norm": 2.50502026419091, + "language_loss": 0.81047815, + "learning_rate": 2.2873979704164157e-06, + "loss": 0.83130193, + "num_input_tokens_seen": 168101555, + "step": 7825, + "time_per_iteration": 2.772277355194092 + }, + { + "auxiliary_loss_clip": 0.01051813, + "auxiliary_loss_mlp": 0.01029231, + "balance_loss_clip": 1.0275526, + "balance_loss_mlp": 1.01691699, + "epoch": 0.470524575379528, + "flos": 23951448017280.0, + "grad_norm": 2.0874135343096807, + "language_loss": 0.66757494, + "learning_rate": 2.287012545338324e-06, + "loss": 0.68838537, + "num_input_tokens_seen": 168121530, + "step": 7826, + "time_per_iteration": 2.78829026222229 + }, + { + "auxiliary_loss_clip": 0.01044915, + "auxiliary_loss_mlp": 0.01036306, + "balance_loss_clip": 1.02424717, + "balance_loss_mlp": 1.02378321, + "epoch": 0.470584698632196, + "flos": 18113558273280.0, + "grad_norm": 3.742305448974124, + "language_loss": 0.83801663, + "learning_rate": 2.2866271093766877e-06, + "loss": 0.8588289, + "num_input_tokens_seen": 168140335, + "step": 7827, + "time_per_iteration": 5.788501977920532 + }, + { + "auxiliary_loss_clip": 0.00993197, + "auxiliary_loss_mlp": 0.01001765, + "balance_loss_clip": 1.00430536, + "balance_loss_mlp": 1.00050092, + "epoch": 0.47064482188486395, + "flos": 57251916224640.0, + "grad_norm": 0.8148526663975766, + "language_loss": 0.55639368, + "learning_rate": 2.286241662546122e-06, + "loss": 0.57634336, + "num_input_tokens_seen": 168200535, + "step": 7828, + "time_per_iteration": 3.2431282997131348 + }, + { + "auxiliary_loss_clip": 0.01070172, + "auxiliary_loss_mlp": 0.01031665, + "balance_loss_clip": 1.02771902, + "balance_loss_mlp": 1.02001226, + "epoch": 0.4707049451375319, + "flos": 17895077798400.0, + "grad_norm": 2.063206051760298, + "language_loss": 0.80626404, + "learning_rate": 2.285856204861245e-06, + "loss": 0.82728243, + "num_input_tokens_seen": 168219610, + "step": 7829, + "time_per_iteration": 2.661632537841797 + }, + { + "auxiliary_loss_clip": 0.01070307, + "auxiliary_loss_mlp": 0.0102984, + "balance_loss_clip": 1.02802515, + "balance_loss_mlp": 1.01878953, + "epoch": 0.47076506839019994, + "flos": 25232669210880.0, + "grad_norm": 1.381754473701628, + "language_loss": 0.75645494, + "learning_rate": 2.2854707363366703e-06, + "loss": 0.7774564, + "num_input_tokens_seen": 168242505, + "step": 7830, + "time_per_iteration": 2.7604215145111084 + }, + { + "auxiliary_loss_clip": 0.01038102, + "auxiliary_loss_mlp": 0.01030786, + "balance_loss_clip": 1.02840447, + "balance_loss_mlp": 1.01844203, + "epoch": 0.4708251916428679, + "flos": 13479681438720.0, + "grad_norm": 2.2055116109370907, + "language_loss": 0.78660434, + "learning_rate": 2.2850852569870177e-06, + "loss": 0.80729318, + "num_input_tokens_seen": 168260220, + "step": 7831, + "time_per_iteration": 2.843167543411255 + }, + { + "auxiliary_loss_clip": 0.01020809, + "auxiliary_loss_mlp": 0.01042231, + "balance_loss_clip": 1.02141285, + "balance_loss_mlp": 1.02785456, + "epoch": 0.47088531489553587, + "flos": 30147833450880.0, + "grad_norm": 1.7122520556003316, + "language_loss": 0.75731218, + "learning_rate": 2.2846997668269033e-06, + "loss": 0.77794254, + "num_input_tokens_seen": 168277360, + "step": 7832, + "time_per_iteration": 2.8696155548095703 + }, + { + "auxiliary_loss_clip": 0.01050694, + "auxiliary_loss_mlp": 0.0102554, + "balance_loss_clip": 1.02819157, + "balance_loss_mlp": 1.01479959, + "epoch": 0.47094543814820383, + "flos": 21798280172160.0, + "grad_norm": 1.2859259927991595, + "language_loss": 0.74567193, + "learning_rate": 2.2843142658709454e-06, + "loss": 0.76643431, + "num_input_tokens_seen": 168296605, + "step": 7833, + "time_per_iteration": 2.886434316635132 + }, + { + "auxiliary_loss_clip": 0.01059669, + "auxiliary_loss_mlp": 0.01031031, + "balance_loss_clip": 1.02751923, + "balance_loss_mlp": 1.01919341, + "epoch": 0.4710055614008718, + "flos": 23003011353600.0, + "grad_norm": 1.735000375020159, + "language_loss": 0.75502014, + "learning_rate": 2.283928754133762e-06, + "loss": 0.77592719, + "num_input_tokens_seen": 168316205, + "step": 7834, + "time_per_iteration": 2.922271966934204 + }, + { + "auxiliary_loss_clip": 0.01029185, + "auxiliary_loss_mlp": 0.01033908, + "balance_loss_clip": 1.02721548, + "balance_loss_mlp": 1.02269673, + "epoch": 0.47106568465353976, + "flos": 42741346452480.0, + "grad_norm": 1.816089280218211, + "language_loss": 0.66234851, + "learning_rate": 2.283543231629972e-06, + "loss": 0.68297946, + "num_input_tokens_seen": 168338935, + "step": 7835, + "time_per_iteration": 3.0387532711029053 + }, + { + "auxiliary_loss_clip": 0.00999846, + "auxiliary_loss_mlp": 0.00746418, + "balance_loss_clip": 1.00208378, + "balance_loss_mlp": 1.00022888, + "epoch": 0.4711258079062077, + "flos": 68554008570240.0, + "grad_norm": 0.8673937621231835, + "language_loss": 0.62128937, + "learning_rate": 2.283157698374194e-06, + "loss": 0.63875198, + "num_input_tokens_seen": 168392800, + "step": 7836, + "time_per_iteration": 3.162062406539917 + }, + { + "auxiliary_loss_clip": 0.01037085, + "auxiliary_loss_mlp": 0.00747514, + "balance_loss_clip": 1.02570295, + "balance_loss_mlp": 1.00035167, + "epoch": 0.4711859311588757, + "flos": 25446588658560.0, + "grad_norm": 3.491591079945618, + "language_loss": 0.69800502, + "learning_rate": 2.2827721543810475e-06, + "loss": 0.71585095, + "num_input_tokens_seen": 168412940, + "step": 7837, + "time_per_iteration": 4.297774791717529 + }, + { + "auxiliary_loss_clip": 0.01055595, + "auxiliary_loss_mlp": 0.01032053, + "balance_loss_clip": 1.02699184, + "balance_loss_mlp": 1.01929724, + "epoch": 0.47124605441154366, + "flos": 21981891519360.0, + "grad_norm": 2.003882938749135, + "language_loss": 0.6611529, + "learning_rate": 2.282386599665153e-06, + "loss": 0.68202937, + "num_input_tokens_seen": 168431995, + "step": 7838, + "time_per_iteration": 2.5450656414031982 + }, + { + "auxiliary_loss_clip": 0.01044095, + "auxiliary_loss_mlp": 0.01032277, + "balance_loss_clip": 1.02406907, + "balance_loss_mlp": 1.01883626, + "epoch": 0.4713061776642116, + "flos": 25412689198080.0, + "grad_norm": 1.9039446209735913, + "language_loss": 0.77182859, + "learning_rate": 2.2820010342411304e-06, + "loss": 0.79259229, + "num_input_tokens_seen": 168454585, + "step": 7839, + "time_per_iteration": 4.249591112136841 + }, + { + "auxiliary_loss_clip": 0.01037046, + "auxiliary_loss_mlp": 0.01028158, + "balance_loss_clip": 1.02679825, + "balance_loss_mlp": 1.01718521, + "epoch": 0.4713663009168796, + "flos": 26542259170560.0, + "grad_norm": 1.963388028219675, + "language_loss": 0.72491133, + "learning_rate": 2.2816154581235993e-06, + "loss": 0.74556333, + "num_input_tokens_seen": 168471265, + "step": 7840, + "time_per_iteration": 2.7222371101379395 + }, + { + "auxiliary_loss_clip": 0.01048015, + "auxiliary_loss_mlp": 0.01027714, + "balance_loss_clip": 1.02549219, + "balance_loss_mlp": 1.01590669, + "epoch": 0.47142642416954755, + "flos": 23623583650560.0, + "grad_norm": 1.5915912837440758, + "language_loss": 0.74995047, + "learning_rate": 2.2812298713271833e-06, + "loss": 0.77070773, + "num_input_tokens_seen": 168491360, + "step": 7841, + "time_per_iteration": 2.6244523525238037 + }, + { + "auxiliary_loss_clip": 0.01043851, + "auxiliary_loss_mlp": 0.01034821, + "balance_loss_clip": 1.02635765, + "balance_loss_mlp": 1.02378798, + "epoch": 0.4714865474222155, + "flos": 22310150935680.0, + "grad_norm": 1.9760205022536739, + "language_loss": 0.70478952, + "learning_rate": 2.280844273866501e-06, + "loss": 0.72557622, + "num_input_tokens_seen": 168511335, + "step": 7842, + "time_per_iteration": 2.6332647800445557 + }, + { + "auxiliary_loss_clip": 0.0106424, + "auxiliary_loss_mlp": 0.01028246, + "balance_loss_clip": 1.03050041, + "balance_loss_mlp": 1.01617599, + "epoch": 0.4715466706748835, + "flos": 17822430541440.0, + "grad_norm": 2.3246973107172133, + "language_loss": 0.78406948, + "learning_rate": 2.280458665756177e-06, + "loss": 0.80499434, + "num_input_tokens_seen": 168529920, + "step": 7843, + "time_per_iteration": 2.566882610321045 + }, + { + "auxiliary_loss_clip": 0.01054744, + "auxiliary_loss_mlp": 0.01028641, + "balance_loss_clip": 1.02664518, + "balance_loss_mlp": 1.0178169, + "epoch": 0.4716067939275515, + "flos": 23659530186240.0, + "grad_norm": 1.6041593111161137, + "language_loss": 0.74497247, + "learning_rate": 2.280073047010832e-06, + "loss": 0.76580632, + "num_input_tokens_seen": 168550595, + "step": 7844, + "time_per_iteration": 2.680403709411621 + }, + { + "auxiliary_loss_clip": 0.01051513, + "auxiliary_loss_mlp": 0.01040279, + "balance_loss_clip": 1.02869773, + "balance_loss_mlp": 1.02839434, + "epoch": 0.47166691718021947, + "flos": 17930162407680.0, + "grad_norm": 3.3698482950267987, + "language_loss": 0.78600252, + "learning_rate": 2.279687417645088e-06, + "loss": 0.80692041, + "num_input_tokens_seen": 168569765, + "step": 7845, + "time_per_iteration": 2.791802406311035 + }, + { + "auxiliary_loss_clip": 0.01057556, + "auxiliary_loss_mlp": 0.01032942, + "balance_loss_clip": 1.02593446, + "balance_loss_mlp": 1.02174854, + "epoch": 0.47172704043288743, + "flos": 26614583205120.0, + "grad_norm": 22.20846793020291, + "language_loss": 0.73138827, + "learning_rate": 2.2793017776735703e-06, + "loss": 0.75229323, + "num_input_tokens_seen": 168591525, + "step": 7846, + "time_per_iteration": 2.752958059310913 + }, + { + "auxiliary_loss_clip": 0.0105709, + "auxiliary_loss_mlp": 0.01028, + "balance_loss_clip": 1.02646959, + "balance_loss_mlp": 1.0164243, + "epoch": 0.4717871636855554, + "flos": 27922700707200.0, + "grad_norm": 1.3398150280329724, + "language_loss": 0.74203801, + "learning_rate": 2.2789161271109e-06, + "loss": 0.76288891, + "num_input_tokens_seen": 168611235, + "step": 7847, + "time_per_iteration": 2.690706253051758 + }, + { + "auxiliary_loss_clip": 0.01021138, + "auxiliary_loss_mlp": 0.01029559, + "balance_loss_clip": 1.02539039, + "balance_loss_mlp": 1.0183239, + "epoch": 0.47184728693822336, + "flos": 14502237816960.0, + "grad_norm": 1.6997061004554042, + "language_loss": 0.80899405, + "learning_rate": 2.278530465971703e-06, + "loss": 0.82950103, + "num_input_tokens_seen": 168628710, + "step": 7848, + "time_per_iteration": 2.7164788246154785 + }, + { + "auxiliary_loss_clip": 0.01056699, + "auxiliary_loss_mlp": 0.01031172, + "balance_loss_clip": 1.02716625, + "balance_loss_mlp": 1.0191679, + "epoch": 0.47190741019089133, + "flos": 17856545483520.0, + "grad_norm": 1.910775942822178, + "language_loss": 0.70190454, + "learning_rate": 2.2781447942706032e-06, + "loss": 0.72278321, + "num_input_tokens_seen": 168645645, + "step": 7849, + "time_per_iteration": 2.6775810718536377 + }, + { + "auxiliary_loss_clip": 0.01040941, + "auxiliary_loss_mlp": 0.01036111, + "balance_loss_clip": 1.02602458, + "balance_loss_mlp": 1.02334917, + "epoch": 0.4719675334435593, + "flos": 17895472848000.0, + "grad_norm": 2.224739056115066, + "language_loss": 0.69762188, + "learning_rate": 2.277759112022224e-06, + "loss": 0.71839237, + "num_input_tokens_seen": 168664165, + "step": 7850, + "time_per_iteration": 2.8350586891174316 + }, + { + "auxiliary_loss_clip": 0.01018199, + "auxiliary_loss_mlp": 0.01028383, + "balance_loss_clip": 1.02533793, + "balance_loss_mlp": 1.01643801, + "epoch": 0.47202765669622726, + "flos": 20704369426560.0, + "grad_norm": 1.7990410989412973, + "language_loss": 0.75009727, + "learning_rate": 2.2773734192411916e-06, + "loss": 0.77056307, + "num_input_tokens_seen": 168681940, + "step": 7851, + "time_per_iteration": 2.933048963546753 + }, + { + "auxiliary_loss_clip": 0.01001465, + "auxiliary_loss_mlp": 0.01045974, + "balance_loss_clip": 1.02071238, + "balance_loss_mlp": 1.03166282, + "epoch": 0.4720877799488952, + "flos": 16360255607040.0, + "grad_norm": 2.021805678086048, + "language_loss": 0.75860631, + "learning_rate": 2.276987715942132e-06, + "loss": 0.77908075, + "num_input_tokens_seen": 168698830, + "step": 7852, + "time_per_iteration": 2.8563687801361084 + }, + { + "auxiliary_loss_clip": 0.01040209, + "auxiliary_loss_mlp": 0.01027909, + "balance_loss_clip": 1.02714896, + "balance_loss_mlp": 1.01540959, + "epoch": 0.4721479032015632, + "flos": 20668171495680.0, + "grad_norm": 1.5251268366971553, + "language_loss": 0.69137084, + "learning_rate": 2.2766020021396696e-06, + "loss": 0.71205199, + "num_input_tokens_seen": 168718305, + "step": 7853, + "time_per_iteration": 2.856299638748169 + }, + { + "auxiliary_loss_clip": 0.00983836, + "auxiliary_loss_mlp": 0.01004637, + "balance_loss_clip": 1.01419258, + "balance_loss_mlp": 1.00317705, + "epoch": 0.47220802645423116, + "flos": 67750438435200.0, + "grad_norm": 0.7114918593819182, + "language_loss": 0.50197685, + "learning_rate": 2.276216277848432e-06, + "loss": 0.52186155, + "num_input_tokens_seen": 168782365, + "step": 7854, + "time_per_iteration": 3.4863765239715576 + }, + { + "auxiliary_loss_clip": 0.01064591, + "auxiliary_loss_mlp": 0.01033105, + "balance_loss_clip": 1.02882493, + "balance_loss_mlp": 1.0203855, + "epoch": 0.4722681497068991, + "flos": 20921449271040.0, + "grad_norm": 1.8874962742509396, + "language_loss": 0.63763255, + "learning_rate": 2.2758305430830455e-06, + "loss": 0.65860951, + "num_input_tokens_seen": 168800485, + "step": 7855, + "time_per_iteration": 2.641730785369873 + }, + { + "auxiliary_loss_clip": 0.01060936, + "auxiliary_loss_mlp": 0.01032083, + "balance_loss_clip": 1.02746487, + "balance_loss_mlp": 1.01970291, + "epoch": 0.4723282729595671, + "flos": 28293083798400.0, + "grad_norm": 2.0973248825972686, + "language_loss": 0.75862813, + "learning_rate": 2.2754447978581376e-06, + "loss": 0.77955836, + "num_input_tokens_seen": 168818965, + "step": 7856, + "time_per_iteration": 2.699320077896118 + }, + { + "auxiliary_loss_clip": 0.01049476, + "auxiliary_loss_mlp": 0.01029442, + "balance_loss_clip": 1.02721953, + "balance_loss_mlp": 1.01847446, + "epoch": 0.4723883962122351, + "flos": 27125053338240.0, + "grad_norm": 1.7332541391271783, + "language_loss": 0.74484503, + "learning_rate": 2.2750590421883347e-06, + "loss": 0.76563424, + "num_input_tokens_seen": 168840355, + "step": 7857, + "time_per_iteration": 2.83089280128479 + }, + { + "auxiliary_loss_clip": 0.01049897, + "auxiliary_loss_mlp": 0.01033013, + "balance_loss_clip": 1.02730227, + "balance_loss_mlp": 1.02224886, + "epoch": 0.47244851946490307, + "flos": 31537253387520.0, + "grad_norm": 1.441176282137828, + "language_loss": 0.64610583, + "learning_rate": 2.2746732760882655e-06, + "loss": 0.66693497, + "num_input_tokens_seen": 168861765, + "step": 7858, + "time_per_iteration": 2.780306100845337 + }, + { + "auxiliary_loss_clip": 0.01057594, + "auxiliary_loss_mlp": 0.00747469, + "balance_loss_clip": 1.025316, + "balance_loss_mlp": 1.00039148, + "epoch": 0.47250864271757104, + "flos": 20886544229760.0, + "grad_norm": 1.5914068246725959, + "language_loss": 0.70164061, + "learning_rate": 2.2742874995725575e-06, + "loss": 0.71969116, + "num_input_tokens_seen": 168881310, + "step": 7859, + "time_per_iteration": 2.8322253227233887 + }, + { + "auxiliary_loss_clip": 0.01076324, + "auxiliary_loss_mlp": 0.01033567, + "balance_loss_clip": 1.03025627, + "balance_loss_mlp": 1.02170587, + "epoch": 0.472568765970239, + "flos": 20522086882560.0, + "grad_norm": 1.9447302045583623, + "language_loss": 0.62616879, + "learning_rate": 2.2739017126558413e-06, + "loss": 0.6472677, + "num_input_tokens_seen": 168899470, + "step": 7860, + "time_per_iteration": 2.641653299331665 + }, + { + "auxiliary_loss_clip": 0.01052467, + "auxiliary_loss_mlp": 0.01038804, + "balance_loss_clip": 1.02853918, + "balance_loss_mlp": 1.02670383, + "epoch": 0.47262888922290697, + "flos": 35805200417280.0, + "grad_norm": 2.435226309340082, + "language_loss": 0.722323, + "learning_rate": 2.2735159153527445e-06, + "loss": 0.74323565, + "num_input_tokens_seen": 168921495, + "step": 7861, + "time_per_iteration": 2.7650153636932373 + }, + { + "auxiliary_loss_clip": 0.01045647, + "auxiliary_loss_mlp": 0.01032427, + "balance_loss_clip": 1.02738953, + "balance_loss_mlp": 1.02024388, + "epoch": 0.47268901247557493, + "flos": 20667740532480.0, + "grad_norm": 3.534368713115836, + "language_loss": 0.85071123, + "learning_rate": 2.273130107677896e-06, + "loss": 0.87149197, + "num_input_tokens_seen": 168940515, + "step": 7862, + "time_per_iteration": 2.6583189964294434 + }, + { + "auxiliary_loss_clip": 0.0107094, + "auxiliary_loss_mlp": 0.01031421, + "balance_loss_clip": 1.02621007, + "balance_loss_mlp": 1.01939893, + "epoch": 0.4727491357282429, + "flos": 19573291082880.0, + "grad_norm": 1.9712389957195393, + "language_loss": 0.84536219, + "learning_rate": 2.272744289645927e-06, + "loss": 0.86638576, + "num_input_tokens_seen": 168958340, + "step": 7863, + "time_per_iteration": 2.59419584274292 + }, + { + "auxiliary_loss_clip": 0.01049548, + "auxiliary_loss_mlp": 0.01037095, + "balance_loss_clip": 1.0273689, + "balance_loss_mlp": 1.02530479, + "epoch": 0.47280925898091086, + "flos": 18217231902720.0, + "grad_norm": 1.7627823458050689, + "language_loss": 0.65469027, + "learning_rate": 2.272358461271467e-06, + "loss": 0.67555666, + "num_input_tokens_seen": 168974850, + "step": 7864, + "time_per_iteration": 2.6620213985443115 + }, + { + "auxiliary_loss_clip": 0.0106999, + "auxiliary_loss_mlp": 0.01031525, + "balance_loss_clip": 1.02662206, + "balance_loss_mlp": 1.01938951, + "epoch": 0.4728693822335788, + "flos": 17821820010240.0, + "grad_norm": 3.846263302573244, + "language_loss": 0.65100127, + "learning_rate": 2.271972622569147e-06, + "loss": 0.6720165, + "num_input_tokens_seen": 168992860, + "step": 7865, + "time_per_iteration": 2.5197372436523438 + }, + { + "auxiliary_loss_clip": 0.01042207, + "auxiliary_loss_mlp": 0.00747543, + "balance_loss_clip": 1.02530348, + "balance_loss_mlp": 1.00042701, + "epoch": 0.4729295054862468, + "flos": 20595057361920.0, + "grad_norm": 1.7779265296094298, + "language_loss": 0.7412799, + "learning_rate": 2.2715867735535976e-06, + "loss": 0.75917739, + "num_input_tokens_seen": 169010325, + "step": 7866, + "time_per_iteration": 2.758028507232666 + }, + { + "auxiliary_loss_clip": 0.01072007, + "auxiliary_loss_mlp": 0.01032716, + "balance_loss_clip": 1.02690625, + "balance_loss_mlp": 1.020926, + "epoch": 0.47298962873891476, + "flos": 23368079232000.0, + "grad_norm": 1.8451189946707836, + "language_loss": 0.82945532, + "learning_rate": 2.271200914239451e-06, + "loss": 0.85050255, + "num_input_tokens_seen": 169029840, + "step": 7867, + "time_per_iteration": 2.5977067947387695 + }, + { + "auxiliary_loss_clip": 0.01058062, + "auxiliary_loss_mlp": 0.01029289, + "balance_loss_clip": 1.02634072, + "balance_loss_mlp": 1.01795816, + "epoch": 0.4730497519915827, + "flos": 22052240305920.0, + "grad_norm": 1.6120909043263616, + "language_loss": 0.79687834, + "learning_rate": 2.2708150446413385e-06, + "loss": 0.81775188, + "num_input_tokens_seen": 169049975, + "step": 7868, + "time_per_iteration": 2.689218759536743 + }, + { + "auxiliary_loss_clip": 0.01001746, + "auxiliary_loss_mlp": 0.01032769, + "balance_loss_clip": 1.02461267, + "balance_loss_mlp": 1.01901269, + "epoch": 0.4731098752442507, + "flos": 21069724613760.0, + "grad_norm": 1.708694124677937, + "language_loss": 0.74658334, + "learning_rate": 2.2704291647738915e-06, + "loss": 0.76692843, + "num_input_tokens_seen": 169069540, + "step": 7869, + "time_per_iteration": 2.993619203567505 + }, + { + "auxiliary_loss_clip": 0.01052566, + "auxiliary_loss_mlp": 0.01040079, + "balance_loss_clip": 1.02844524, + "balance_loss_mlp": 1.02676904, + "epoch": 0.4731699984969187, + "flos": 22528775064960.0, + "grad_norm": 1.6131798153603327, + "language_loss": 0.73727977, + "learning_rate": 2.2700432746517443e-06, + "loss": 0.75820619, + "num_input_tokens_seen": 169089940, + "step": 7870, + "time_per_iteration": 2.926443576812744 + }, + { + "auxiliary_loss_clip": 0.01074698, + "auxiliary_loss_mlp": 0.01033806, + "balance_loss_clip": 1.02833402, + "balance_loss_mlp": 1.02089596, + "epoch": 0.4732301217495867, + "flos": 24898124914560.0, + "grad_norm": 1.9756213729287213, + "language_loss": 0.81347924, + "learning_rate": 2.2696573742895292e-06, + "loss": 0.83456427, + "num_input_tokens_seen": 169109650, + "step": 7871, + "time_per_iteration": 2.679384708404541 + }, + { + "auxiliary_loss_clip": 0.0105562, + "auxiliary_loss_mlp": 0.01032878, + "balance_loss_clip": 1.02593207, + "balance_loss_mlp": 1.02077222, + "epoch": 0.47329024500225464, + "flos": 22784423137920.0, + "grad_norm": 1.6160116145528636, + "language_loss": 0.75797153, + "learning_rate": 2.269271463701879e-06, + "loss": 0.77885652, + "num_input_tokens_seen": 169128990, + "step": 7872, + "time_per_iteration": 2.6839067935943604 + }, + { + "auxiliary_loss_clip": 0.01035983, + "auxiliary_loss_mlp": 0.01032402, + "balance_loss_clip": 1.02371097, + "balance_loss_mlp": 1.02039719, + "epoch": 0.4733503682549226, + "flos": 38695902220800.0, + "grad_norm": 2.1322721487678384, + "language_loss": 0.67883128, + "learning_rate": 2.268885542903428e-06, + "loss": 0.6995151, + "num_input_tokens_seen": 169154645, + "step": 7873, + "time_per_iteration": 2.9092838764190674 + }, + { + "auxiliary_loss_clip": 0.01060423, + "auxiliary_loss_mlp": 0.01029962, + "balance_loss_clip": 1.02719378, + "balance_loss_mlp": 1.01850581, + "epoch": 0.47341049150759057, + "flos": 22966849336320.0, + "grad_norm": 1.5441780460006032, + "language_loss": 0.72476017, + "learning_rate": 2.26849961190881e-06, + "loss": 0.745664, + "num_input_tokens_seen": 169174995, + "step": 7874, + "time_per_iteration": 4.217544078826904 + }, + { + "auxiliary_loss_clip": 0.01050886, + "auxiliary_loss_mlp": 0.01033477, + "balance_loss_clip": 1.02640903, + "balance_loss_mlp": 1.02221179, + "epoch": 0.47347061476025853, + "flos": 14538471661440.0, + "grad_norm": 2.233317658747711, + "language_loss": 0.65234637, + "learning_rate": 2.26811367073266e-06, + "loss": 0.67318994, + "num_input_tokens_seen": 169191815, + "step": 7875, + "time_per_iteration": 2.7013134956359863 + }, + { + "auxiliary_loss_clip": 0.01026375, + "auxiliary_loss_mlp": 0.01033737, + "balance_loss_clip": 1.02684498, + "balance_loss_mlp": 1.02082694, + "epoch": 0.4735307380129265, + "flos": 30263250827520.0, + "grad_norm": 2.6782139386440673, + "language_loss": 0.8065812, + "learning_rate": 2.2677277193896125e-06, + "loss": 0.82718229, + "num_input_tokens_seen": 169210430, + "step": 7876, + "time_per_iteration": 2.8389945030212402 + }, + { + "auxiliary_loss_clip": 0.0103594, + "auxiliary_loss_mlp": 0.01033077, + "balance_loss_clip": 1.0222621, + "balance_loss_mlp": 1.02038169, + "epoch": 0.47359086126559446, + "flos": 19391044452480.0, + "grad_norm": 1.7946775390703782, + "language_loss": 0.79317456, + "learning_rate": 2.267341757894304e-06, + "loss": 0.81386471, + "num_input_tokens_seen": 169229295, + "step": 7877, + "time_per_iteration": 2.7202460765838623 + }, + { + "auxiliary_loss_clip": 0.01060615, + "auxiliary_loss_mlp": 0.00747579, + "balance_loss_clip": 1.02695632, + "balance_loss_mlp": 1.00037134, + "epoch": 0.47365098451826243, + "flos": 21939408708480.0, + "grad_norm": 2.0223679532305785, + "language_loss": 0.70540392, + "learning_rate": 2.2669557862613685e-06, + "loss": 0.72348589, + "num_input_tokens_seen": 169247855, + "step": 7878, + "time_per_iteration": 2.614196538925171 + }, + { + "auxiliary_loss_clip": 0.01039083, + "auxiliary_loss_mlp": 0.01033117, + "balance_loss_clip": 1.02862597, + "balance_loss_mlp": 1.02162516, + "epoch": 0.4737111077709304, + "flos": 25845053207040.0, + "grad_norm": 1.6084361404984444, + "language_loss": 0.7519874, + "learning_rate": 2.2665698045054425e-06, + "loss": 0.77270937, + "num_input_tokens_seen": 169268860, + "step": 7879, + "time_per_iteration": 2.7997167110443115 + }, + { + "auxiliary_loss_clip": 0.0099312, + "auxiliary_loss_mlp": 0.0100917, + "balance_loss_clip": 1.00503457, + "balance_loss_mlp": 1.00781727, + "epoch": 0.47377123102359836, + "flos": 67760886314880.0, + "grad_norm": 0.7424467139828076, + "language_loss": 0.61255348, + "learning_rate": 2.266183812641164e-06, + "loss": 0.63257635, + "num_input_tokens_seen": 169331855, + "step": 7880, + "time_per_iteration": 3.2432334423065186 + }, + { + "auxiliary_loss_clip": 0.01046342, + "auxiliary_loss_mlp": 0.01033287, + "balance_loss_clip": 1.02399468, + "balance_loss_mlp": 1.02076435, + "epoch": 0.4738313542762663, + "flos": 24315977191680.0, + "grad_norm": 4.668311408378832, + "language_loss": 0.67777562, + "learning_rate": 2.2657978106831675e-06, + "loss": 0.69857192, + "num_input_tokens_seen": 169352175, + "step": 7881, + "time_per_iteration": 2.6534037590026855 + }, + { + "auxiliary_loss_clip": 0.01016444, + "auxiliary_loss_mlp": 0.01028768, + "balance_loss_clip": 1.03139555, + "balance_loss_mlp": 1.01753259, + "epoch": 0.4738914775289343, + "flos": 20705339093760.0, + "grad_norm": 1.7172129061490133, + "language_loss": 0.77353513, + "learning_rate": 2.265411798646092e-06, + "loss": 0.79398727, + "num_input_tokens_seen": 169371215, + "step": 7882, + "time_per_iteration": 2.969099998474121 + }, + { + "auxiliary_loss_clip": 0.0106121, + "auxiliary_loss_mlp": 0.01029181, + "balance_loss_clip": 1.02789545, + "balance_loss_mlp": 1.01710558, + "epoch": 0.4739516007816023, + "flos": 25446337263360.0, + "grad_norm": 1.5079101589718806, + "language_loss": 0.76058912, + "learning_rate": 2.2650257765445747e-06, + "loss": 0.78149307, + "num_input_tokens_seen": 169391745, + "step": 7883, + "time_per_iteration": 3.093170642852783 + }, + { + "auxiliary_loss_clip": 0.01051657, + "auxiliary_loss_mlp": 0.01031391, + "balance_loss_clip": 1.02690625, + "balance_loss_mlp": 1.02026296, + "epoch": 0.4740117240342703, + "flos": 19974341410560.0, + "grad_norm": 1.801862184687902, + "language_loss": 0.72184527, + "learning_rate": 2.2646397443932525e-06, + "loss": 0.74267578, + "num_input_tokens_seen": 169409845, + "step": 7884, + "time_per_iteration": 2.7230992317199707 + }, + { + "auxiliary_loss_clip": 0.01063567, + "auxiliary_loss_mlp": 0.01034862, + "balance_loss_clip": 1.02722812, + "balance_loss_mlp": 1.02207685, + "epoch": 0.47407184728693824, + "flos": 15661146222720.0, + "grad_norm": 1.858972909715809, + "language_loss": 0.82046753, + "learning_rate": 2.2642537022067655e-06, + "loss": 0.84145182, + "num_input_tokens_seen": 169426085, + "step": 7885, + "time_per_iteration": 4.260695457458496 + }, + { + "auxiliary_loss_clip": 0.01035314, + "auxiliary_loss_mlp": 0.0104563, + "balance_loss_clip": 1.02422762, + "balance_loss_mlp": 1.03182578, + "epoch": 0.4741319705396062, + "flos": 18588800142720.0, + "grad_norm": 1.9750552421523488, + "language_loss": 0.73469406, + "learning_rate": 2.263867649999751e-06, + "loss": 0.75550354, + "num_input_tokens_seen": 169444705, + "step": 7886, + "time_per_iteration": 4.224836587905884 + }, + { + "auxiliary_loss_clip": 0.01054572, + "auxiliary_loss_mlp": 0.01036606, + "balance_loss_clip": 1.02687252, + "balance_loss_mlp": 1.02296829, + "epoch": 0.47419209379227417, + "flos": 13261093223040.0, + "grad_norm": 2.0245234653108186, + "language_loss": 0.74033064, + "learning_rate": 2.263481587786849e-06, + "loss": 0.76124239, + "num_input_tokens_seen": 169460850, + "step": 7887, + "time_per_iteration": 2.709360361099243 + }, + { + "auxiliary_loss_clip": 0.01058431, + "auxiliary_loss_mlp": 0.01027251, + "balance_loss_clip": 1.02646041, + "balance_loss_mlp": 1.01601529, + "epoch": 0.47425221704494214, + "flos": 20044043752320.0, + "grad_norm": 1.6808938735303784, + "language_loss": 0.76846838, + "learning_rate": 2.2630955155826993e-06, + "loss": 0.78932518, + "num_input_tokens_seen": 169478890, + "step": 7888, + "time_per_iteration": 2.687560558319092 + }, + { + "auxiliary_loss_clip": 0.01062047, + "auxiliary_loss_mlp": 0.01033322, + "balance_loss_clip": 1.02716088, + "balance_loss_mlp": 1.02113867, + "epoch": 0.4743123402976101, + "flos": 27271892136960.0, + "grad_norm": 1.6552263787957495, + "language_loss": 0.72205138, + "learning_rate": 2.2627094334019406e-06, + "loss": 0.7430051, + "num_input_tokens_seen": 169499690, + "step": 7889, + "time_per_iteration": 2.7196078300476074 + }, + { + "auxiliary_loss_clip": 0.01011141, + "auxiliary_loss_mlp": 0.01000836, + "balance_loss_clip": 1.00357723, + "balance_loss_mlp": 0.99945307, + "epoch": 0.47437246355027807, + "flos": 55393970261760.0, + "grad_norm": 0.7184229262367102, + "language_loss": 0.56071007, + "learning_rate": 2.262323341259214e-06, + "loss": 0.58082974, + "num_input_tokens_seen": 169560475, + "step": 7890, + "time_per_iteration": 3.1964077949523926 + }, + { + "auxiliary_loss_clip": 0.01063377, + "auxiliary_loss_mlp": 0.01036741, + "balance_loss_clip": 1.02841818, + "balance_loss_mlp": 1.02364624, + "epoch": 0.47443258680294603, + "flos": 23878477537920.0, + "grad_norm": 2.2205145872384366, + "language_loss": 0.65921891, + "learning_rate": 2.2619372391691605e-06, + "loss": 0.68022013, + "num_input_tokens_seen": 169580110, + "step": 7891, + "time_per_iteration": 2.7525808811187744 + }, + { + "auxiliary_loss_clip": 0.01076219, + "auxiliary_loss_mlp": 0.01037439, + "balance_loss_clip": 1.02921891, + "balance_loss_mlp": 1.02399814, + "epoch": 0.474492710055614, + "flos": 21977761455360.0, + "grad_norm": 3.9322118617262967, + "language_loss": 0.70548344, + "learning_rate": 2.26155112714642e-06, + "loss": 0.72661996, + "num_input_tokens_seen": 169597510, + "step": 7892, + "time_per_iteration": 2.537452220916748 + }, + { + "auxiliary_loss_clip": 0.00992347, + "auxiliary_loss_mlp": 0.01005966, + "balance_loss_clip": 1.00443697, + "balance_loss_mlp": 1.00442815, + "epoch": 0.47455283330828196, + "flos": 62557180122240.0, + "grad_norm": 0.8102933276934842, + "language_loss": 0.58620977, + "learning_rate": 2.2611650052056355e-06, + "loss": 0.60619295, + "num_input_tokens_seen": 169660010, + "step": 7893, + "time_per_iteration": 3.3171045780181885 + }, + { + "auxiliary_loss_clip": 0.01061855, + "auxiliary_loss_mlp": 0.01032468, + "balance_loss_clip": 1.02833831, + "balance_loss_mlp": 1.02069068, + "epoch": 0.47461295656094993, + "flos": 12093637380480.0, + "grad_norm": 1.6690355693962458, + "language_loss": 0.7781058, + "learning_rate": 2.2607788733614463e-06, + "loss": 0.79904902, + "num_input_tokens_seen": 169678485, + "step": 7894, + "time_per_iteration": 2.551058292388916 + }, + { + "auxiliary_loss_clip": 0.01062367, + "auxiliary_loss_mlp": 0.01033961, + "balance_loss_clip": 1.02822793, + "balance_loss_mlp": 1.02214789, + "epoch": 0.4746730798136179, + "flos": 20884568981760.0, + "grad_norm": 2.1085577767503354, + "language_loss": 0.7466554, + "learning_rate": 2.260392731628497e-06, + "loss": 0.76761872, + "num_input_tokens_seen": 169697335, + "step": 7895, + "time_per_iteration": 2.6894352436065674 + }, + { + "auxiliary_loss_clip": 0.01056384, + "auxiliary_loss_mlp": 0.01029837, + "balance_loss_clip": 1.02528119, + "balance_loss_mlp": 1.01758814, + "epoch": 0.4747332030662859, + "flos": 19974808287360.0, + "grad_norm": 2.323427867108486, + "language_loss": 0.82154524, + "learning_rate": 2.260006580021429e-06, + "loss": 0.84240746, + "num_input_tokens_seen": 169715395, + "step": 7896, + "time_per_iteration": 2.586887836456299 + }, + { + "auxiliary_loss_clip": 0.01061443, + "auxiliary_loss_mlp": 0.01026851, + "balance_loss_clip": 1.02788854, + "balance_loss_mlp": 1.01445353, + "epoch": 0.4747933263189539, + "flos": 16034186920320.0, + "grad_norm": 1.8637292584092289, + "language_loss": 0.75845706, + "learning_rate": 2.259620418554886e-06, + "loss": 0.77933997, + "num_input_tokens_seen": 169733755, + "step": 7897, + "time_per_iteration": 2.5686089992523193 + }, + { + "auxiliary_loss_clip": 0.01055216, + "auxiliary_loss_mlp": 0.01032722, + "balance_loss_clip": 1.02848721, + "balance_loss_mlp": 1.02042532, + "epoch": 0.47485344957162184, + "flos": 13955102876160.0, + "grad_norm": 2.364639702349918, + "language_loss": 0.6312902, + "learning_rate": 2.25923424724351e-06, + "loss": 0.65216953, + "num_input_tokens_seen": 169751390, + "step": 7898, + "time_per_iteration": 2.6345784664154053 + }, + { + "auxiliary_loss_clip": 0.01032787, + "auxiliary_loss_mlp": 0.010401, + "balance_loss_clip": 1.02402091, + "balance_loss_mlp": 1.0261879, + "epoch": 0.4749135728242898, + "flos": 20449080489600.0, + "grad_norm": 3.0529385168147196, + "language_loss": 0.70173377, + "learning_rate": 2.258848066101946e-06, + "loss": 0.72246253, + "num_input_tokens_seen": 169769500, + "step": 7899, + "time_per_iteration": 2.6723406314849854 + }, + { + "auxiliary_loss_clip": 0.01062009, + "auxiliary_loss_mlp": 0.01032252, + "balance_loss_clip": 1.02723694, + "balance_loss_mlp": 1.01998591, + "epoch": 0.4749736960769578, + "flos": 28949961767040.0, + "grad_norm": 1.9963776790749834, + "language_loss": 0.68568254, + "learning_rate": 2.258461875144837e-06, + "loss": 0.70662516, + "num_input_tokens_seen": 169789215, + "step": 7900, + "time_per_iteration": 2.803828239440918 + }, + { + "auxiliary_loss_clip": 0.01039723, + "auxiliary_loss_mlp": 0.01037852, + "balance_loss_clip": 1.02630544, + "balance_loss_mlp": 1.02553225, + "epoch": 0.47503381932962574, + "flos": 31938770592000.0, + "grad_norm": 2.3299112780670317, + "language_loss": 0.70379436, + "learning_rate": 2.2580756743868273e-06, + "loss": 0.72457016, + "num_input_tokens_seen": 169808825, + "step": 7901, + "time_per_iteration": 2.7630741596221924 + }, + { + "auxiliary_loss_clip": 0.0104618, + "auxiliary_loss_mlp": 0.01049886, + "balance_loss_clip": 1.0273186, + "balance_loss_mlp": 1.03635573, + "epoch": 0.4750939425822937, + "flos": 22127257860480.0, + "grad_norm": 1.7923769492060881, + "language_loss": 0.73901463, + "learning_rate": 2.2576894638425636e-06, + "loss": 0.75997525, + "num_input_tokens_seen": 169827590, + "step": 7902, + "time_per_iteration": 2.6300487518310547 + }, + { + "auxiliary_loss_clip": 0.01036096, + "auxiliary_loss_mlp": 0.01033589, + "balance_loss_clip": 1.02615595, + "balance_loss_mlp": 1.02244294, + "epoch": 0.47515406583496167, + "flos": 20850094903680.0, + "grad_norm": 2.5981878435593297, + "language_loss": 0.68925434, + "learning_rate": 2.257303243526688e-06, + "loss": 0.70995122, + "num_input_tokens_seen": 169844925, + "step": 7903, + "time_per_iteration": 2.68738055229187 + }, + { + "auxiliary_loss_clip": 0.0104405, + "auxiliary_loss_mlp": 0.0103051, + "balance_loss_clip": 1.02470803, + "balance_loss_mlp": 1.0196681, + "epoch": 0.47521418908762963, + "flos": 17524802448000.0, + "grad_norm": 1.4998746495456878, + "language_loss": 0.71959424, + "learning_rate": 2.256917013453848e-06, + "loss": 0.74033988, + "num_input_tokens_seen": 169862705, + "step": 7904, + "time_per_iteration": 2.645501136779785 + }, + { + "auxiliary_loss_clip": 0.01001167, + "auxiliary_loss_mlp": 0.0103472, + "balance_loss_clip": 1.01929927, + "balance_loss_mlp": 1.02213192, + "epoch": 0.4752743123402976, + "flos": 20559434048640.0, + "grad_norm": 1.5642633975541975, + "language_loss": 0.86108899, + "learning_rate": 2.25653077363869e-06, + "loss": 0.88144779, + "num_input_tokens_seen": 169880155, + "step": 7905, + "time_per_iteration": 2.9030601978302 + }, + { + "auxiliary_loss_clip": 0.01050312, + "auxiliary_loss_mlp": 0.01032109, + "balance_loss_clip": 1.02464592, + "balance_loss_mlp": 1.02120113, + "epoch": 0.47533443559296557, + "flos": 26360623071360.0, + "grad_norm": 1.652959467149268, + "language_loss": 0.82303512, + "learning_rate": 2.2561445240958583e-06, + "loss": 0.84385931, + "num_input_tokens_seen": 169901525, + "step": 7906, + "time_per_iteration": 2.729560136795044 + }, + { + "auxiliary_loss_clip": 0.00991232, + "auxiliary_loss_mlp": 0.01002487, + "balance_loss_clip": 1.01261878, + "balance_loss_mlp": 1.00089002, + "epoch": 0.47539455884563353, + "flos": 65949660967680.0, + "grad_norm": 0.6662223901028703, + "language_loss": 0.58975029, + "learning_rate": 2.255758264840002e-06, + "loss": 0.60968745, + "num_input_tokens_seen": 169970345, + "step": 7907, + "time_per_iteration": 3.3958427906036377 + }, + { + "auxiliary_loss_clip": 0.01055603, + "auxiliary_loss_mlp": 0.01032926, + "balance_loss_clip": 1.0264951, + "balance_loss_mlp": 1.02124333, + "epoch": 0.4754546820983015, + "flos": 17238128002560.0, + "grad_norm": 1.8015746706972433, + "language_loss": 0.80816495, + "learning_rate": 2.255371995885765e-06, + "loss": 0.8290503, + "num_input_tokens_seen": 169986440, + "step": 7908, + "time_per_iteration": 2.6814639568328857 + }, + { + "auxiliary_loss_clip": 0.01060069, + "auxiliary_loss_mlp": 0.01035742, + "balance_loss_clip": 1.02745187, + "balance_loss_mlp": 1.02366662, + "epoch": 0.47551480535096946, + "flos": 19825886499840.0, + "grad_norm": 1.6229780139595325, + "language_loss": 0.74086034, + "learning_rate": 2.254985717247797e-06, + "loss": 0.76181841, + "num_input_tokens_seen": 170005705, + "step": 7909, + "time_per_iteration": 2.7515621185302734 + }, + { + "auxiliary_loss_clip": 0.0104371, + "auxiliary_loss_mlp": 0.01032297, + "balance_loss_clip": 1.02664959, + "balance_loss_mlp": 1.02007222, + "epoch": 0.4755749286036375, + "flos": 22163958581760.0, + "grad_norm": 1.6160036670908968, + "language_loss": 0.75066209, + "learning_rate": 2.2545994289407457e-06, + "loss": 0.77142215, + "num_input_tokens_seen": 170023415, + "step": 7910, + "time_per_iteration": 2.7370619773864746 + }, + { + "auxiliary_loss_clip": 0.01058654, + "auxiliary_loss_mlp": 0.01025901, + "balance_loss_clip": 1.02709532, + "balance_loss_mlp": 1.01526189, + "epoch": 0.47563505185630545, + "flos": 21648280976640.0, + "grad_norm": 1.9856578317453102, + "language_loss": 0.78860891, + "learning_rate": 2.2542131309792577e-06, + "loss": 0.80945456, + "num_input_tokens_seen": 170042395, + "step": 7911, + "time_per_iteration": 2.655153274536133 + }, + { + "auxiliary_loss_clip": 0.01042061, + "auxiliary_loss_mlp": 0.00747326, + "balance_loss_clip": 1.02491629, + "balance_loss_mlp": 1.00030005, + "epoch": 0.4756951751089734, + "flos": 20628777254400.0, + "grad_norm": 1.8298970608643934, + "language_loss": 0.75666714, + "learning_rate": 2.253826823377983e-06, + "loss": 0.77456105, + "num_input_tokens_seen": 170061610, + "step": 7912, + "time_per_iteration": 2.719151735305786 + }, + { + "auxiliary_loss_clip": 0.01068046, + "auxiliary_loss_mlp": 0.01039838, + "balance_loss_clip": 1.02586913, + "balance_loss_mlp": 1.02854896, + "epoch": 0.4757552983616414, + "flos": 25848788221440.0, + "grad_norm": 1.7730765334916676, + "language_loss": 0.74068773, + "learning_rate": 2.253440506151569e-06, + "loss": 0.76176655, + "num_input_tokens_seen": 170083505, + "step": 7913, + "time_per_iteration": 2.5962836742401123 + }, + { + "auxiliary_loss_clip": 0.0105324, + "auxiliary_loss_mlp": 0.01028259, + "balance_loss_clip": 1.02918553, + "balance_loss_mlp": 1.01582551, + "epoch": 0.47581542161430934, + "flos": 18223013992320.0, + "grad_norm": 2.8730750984529014, + "language_loss": 0.72100925, + "learning_rate": 2.253054179314666e-06, + "loss": 0.74182427, + "num_input_tokens_seen": 170100690, + "step": 7914, + "time_per_iteration": 2.6324734687805176 + }, + { + "auxiliary_loss_clip": 0.01054515, + "auxiliary_loss_mlp": 0.01035705, + "balance_loss_clip": 1.0307821, + "balance_loss_mlp": 1.02405238, + "epoch": 0.4758755448669773, + "flos": 21579763783680.0, + "grad_norm": 2.0413726032722024, + "language_loss": 0.6471681, + "learning_rate": 2.2526678428819227e-06, + "loss": 0.66807032, + "num_input_tokens_seen": 170119240, + "step": 7915, + "time_per_iteration": 2.685490608215332 + }, + { + "auxiliary_loss_clip": 0.01070217, + "auxiliary_loss_mlp": 0.0102997, + "balance_loss_clip": 1.02968597, + "balance_loss_mlp": 1.01828766, + "epoch": 0.47593566811964527, + "flos": 15231152511360.0, + "grad_norm": 1.7427185740564015, + "language_loss": 0.76994425, + "learning_rate": 2.2522814968679896e-06, + "loss": 0.79094613, + "num_input_tokens_seen": 170136450, + "step": 7916, + "time_per_iteration": 2.648512601852417 + }, + { + "auxiliary_loss_clip": 0.01071059, + "auxiliary_loss_mlp": 0.0103189, + "balance_loss_clip": 1.02816415, + "balance_loss_mlp": 1.02066636, + "epoch": 0.47599579137231324, + "flos": 21543242630400.0, + "grad_norm": 1.8099343463324, + "language_loss": 0.64388603, + "learning_rate": 2.2518951412875173e-06, + "loss": 0.66491556, + "num_input_tokens_seen": 170155295, + "step": 7917, + "time_per_iteration": 2.543337345123291 + }, + { + "auxiliary_loss_clip": 0.0098327, + "auxiliary_loss_mlp": 0.0100126, + "balance_loss_clip": 1.00544798, + "balance_loss_mlp": 0.99978811, + "epoch": 0.4760559146249812, + "flos": 64554602595840.0, + "grad_norm": 0.8355986630926011, + "language_loss": 0.65732765, + "learning_rate": 2.2515087761551557e-06, + "loss": 0.6771729, + "num_input_tokens_seen": 170222325, + "step": 7918, + "time_per_iteration": 3.239694833755493 + }, + { + "auxiliary_loss_clip": 0.01056752, + "auxiliary_loss_mlp": 0.0074748, + "balance_loss_clip": 1.02553451, + "balance_loss_mlp": 1.00025618, + "epoch": 0.47611603787764917, + "flos": 22233876405120.0, + "grad_norm": 2.7478108279207722, + "language_loss": 0.68688452, + "learning_rate": 2.2511224014855563e-06, + "loss": 0.70492685, + "num_input_tokens_seen": 170241625, + "step": 7919, + "time_per_iteration": 2.56957745552063 + }, + { + "auxiliary_loss_clip": 0.01049192, + "auxiliary_loss_mlp": 0.01029906, + "balance_loss_clip": 1.02609181, + "balance_loss_mlp": 1.01861715, + "epoch": 0.47617616113031713, + "flos": 22780005765120.0, + "grad_norm": 3.205307580788392, + "language_loss": 0.74990827, + "learning_rate": 2.2507360172933694e-06, + "loss": 0.77069926, + "num_input_tokens_seen": 170262470, + "step": 7920, + "time_per_iteration": 2.746232032775879 + }, + { + "auxiliary_loss_clip": 0.01053612, + "auxiliary_loss_mlp": 0.01032177, + "balance_loss_clip": 1.02800584, + "balance_loss_mlp": 1.01959443, + "epoch": 0.4762362843829851, + "flos": 24133802388480.0, + "grad_norm": 1.4654928971053593, + "language_loss": 0.77630353, + "learning_rate": 2.2503496235932487e-06, + "loss": 0.79716146, + "num_input_tokens_seen": 170283460, + "step": 7921, + "time_per_iteration": 4.288693189620972 + }, + { + "auxiliary_loss_clip": 0.01051624, + "auxiliary_loss_mlp": 0.01035964, + "balance_loss_clip": 1.02703369, + "balance_loss_mlp": 1.02266073, + "epoch": 0.47629640763565306, + "flos": 22452069571200.0, + "grad_norm": 1.546256130777509, + "language_loss": 0.78216743, + "learning_rate": 2.249963220399845e-06, + "loss": 0.80304331, + "num_input_tokens_seen": 170304225, + "step": 7922, + "time_per_iteration": 4.330570936203003 + }, + { + "auxiliary_loss_clip": 0.01042111, + "auxiliary_loss_mlp": 0.01032502, + "balance_loss_clip": 1.02760446, + "balance_loss_mlp": 1.01960385, + "epoch": 0.4763565308883211, + "flos": 11181398647680.0, + "grad_norm": 1.775382540724167, + "language_loss": 0.72561884, + "learning_rate": 2.2495768077278104e-06, + "loss": 0.74636495, + "num_input_tokens_seen": 170322110, + "step": 7923, + "time_per_iteration": 2.661564350128174 + }, + { + "auxiliary_loss_clip": 0.01041283, + "auxiliary_loss_mlp": 0.01033346, + "balance_loss_clip": 1.02657652, + "balance_loss_mlp": 1.02213478, + "epoch": 0.47641665414098905, + "flos": 22382151747840.0, + "grad_norm": 1.9130726301865757, + "language_loss": 0.82231164, + "learning_rate": 2.2491903855917992e-06, + "loss": 0.84305793, + "num_input_tokens_seen": 170340700, + "step": 7924, + "time_per_iteration": 2.6554200649261475 + }, + { + "auxiliary_loss_clip": 0.01067697, + "auxiliary_loss_mlp": 0.01031782, + "balance_loss_clip": 1.02935135, + "balance_loss_mlp": 1.01872218, + "epoch": 0.476476777393657, + "flos": 25046148862080.0, + "grad_norm": 1.7638602354805517, + "language_loss": 0.80305177, + "learning_rate": 2.2488039540064626e-06, + "loss": 0.82404649, + "num_input_tokens_seen": 170359780, + "step": 7925, + "time_per_iteration": 2.63997483253479 + }, + { + "auxiliary_loss_clip": 0.01047664, + "auxiliary_loss_mlp": 0.01035688, + "balance_loss_clip": 1.02472591, + "balance_loss_mlp": 1.02399325, + "epoch": 0.476536900646325, + "flos": 27269916888960.0, + "grad_norm": 1.636861369058598, + "language_loss": 0.72341233, + "learning_rate": 2.2484175129864558e-06, + "loss": 0.74424589, + "num_input_tokens_seen": 170381260, + "step": 7926, + "time_per_iteration": 2.7119617462158203 + }, + { + "auxiliary_loss_clip": 0.01063972, + "auxiliary_loss_mlp": 0.01031035, + "balance_loss_clip": 1.02849603, + "balance_loss_mlp": 1.01798201, + "epoch": 0.47659702389899294, + "flos": 25301401885440.0, + "grad_norm": 10.003269567761098, + "language_loss": 0.68595958, + "learning_rate": 2.248031062546432e-06, + "loss": 0.7069096, + "num_input_tokens_seen": 170400595, + "step": 7927, + "time_per_iteration": 2.638345241546631 + }, + { + "auxiliary_loss_clip": 0.01035219, + "auxiliary_loss_mlp": 0.01027284, + "balance_loss_clip": 1.02493262, + "balance_loss_mlp": 1.01602435, + "epoch": 0.4766571471516609, + "flos": 25992861672960.0, + "grad_norm": 1.51957411125972, + "language_loss": 0.68131268, + "learning_rate": 2.247644602701045e-06, + "loss": 0.70193768, + "num_input_tokens_seen": 170421110, + "step": 7928, + "time_per_iteration": 2.7358181476593018 + }, + { + "auxiliary_loss_clip": 0.01068746, + "auxiliary_loss_mlp": 0.0103011, + "balance_loss_clip": 1.0258497, + "balance_loss_mlp": 1.01824307, + "epoch": 0.4767172704043289, + "flos": 16032211672320.0, + "grad_norm": 2.068184829592374, + "language_loss": 0.78476852, + "learning_rate": 2.2472581334649496e-06, + "loss": 0.80575716, + "num_input_tokens_seen": 170436700, + "step": 7929, + "time_per_iteration": 2.5995564460754395 + }, + { + "auxiliary_loss_clip": 0.01043594, + "auxiliary_loss_mlp": 0.0103598, + "balance_loss_clip": 1.02541447, + "balance_loss_mlp": 1.0246911, + "epoch": 0.47677739365699684, + "flos": 39235351651200.0, + "grad_norm": 1.9670263713832703, + "language_loss": 0.66774631, + "learning_rate": 2.2468716548528016e-06, + "loss": 0.68854213, + "num_input_tokens_seen": 170459555, + "step": 7930, + "time_per_iteration": 2.7935757637023926 + }, + { + "auxiliary_loss_clip": 0.01051893, + "auxiliary_loss_mlp": 0.01026628, + "balance_loss_clip": 1.02485001, + "balance_loss_mlp": 1.01544619, + "epoch": 0.4768375169096648, + "flos": 24717781704960.0, + "grad_norm": 1.6991145390829288, + "language_loss": 0.80036867, + "learning_rate": 2.2464851668792555e-06, + "loss": 0.82115382, + "num_input_tokens_seen": 170479175, + "step": 7931, + "time_per_iteration": 2.7277634143829346 + }, + { + "auxiliary_loss_clip": 0.0103796, + "auxiliary_loss_mlp": 0.01033003, + "balance_loss_clip": 1.0228914, + "balance_loss_mlp": 1.02095675, + "epoch": 0.47689764016233277, + "flos": 22528667324160.0, + "grad_norm": 2.135140525302863, + "language_loss": 0.76246172, + "learning_rate": 2.2460986695589678e-06, + "loss": 0.78317142, + "num_input_tokens_seen": 170498450, + "step": 7932, + "time_per_iteration": 4.320749282836914 + }, + { + "auxiliary_loss_clip": 0.01050306, + "auxiliary_loss_mlp": 0.00747204, + "balance_loss_clip": 1.02815509, + "balance_loss_mlp": 1.00038362, + "epoch": 0.47695776341500074, + "flos": 15120619384320.0, + "grad_norm": 1.7218243204025288, + "language_loss": 0.79679197, + "learning_rate": 2.245712162906593e-06, + "loss": 0.81476706, + "num_input_tokens_seen": 170516255, + "step": 7933, + "time_per_iteration": 4.4041736125946045 + }, + { + "auxiliary_loss_clip": 0.0106328, + "auxiliary_loss_mlp": 0.01036167, + "balance_loss_clip": 1.02641094, + "balance_loss_mlp": 1.02226138, + "epoch": 0.4770178866676687, + "flos": 14678917839360.0, + "grad_norm": 1.7738169765812053, + "language_loss": 0.74052244, + "learning_rate": 2.2453256469367888e-06, + "loss": 0.76151693, + "num_input_tokens_seen": 170532705, + "step": 7934, + "time_per_iteration": 2.727187156677246 + }, + { + "auxiliary_loss_clip": 0.01061265, + "auxiliary_loss_mlp": 0.01031349, + "balance_loss_clip": 1.02673674, + "balance_loss_mlp": 1.0195471, + "epoch": 0.47707800992033667, + "flos": 22565583527040.0, + "grad_norm": 2.069757168918777, + "language_loss": 0.7988863, + "learning_rate": 2.244939121664211e-06, + "loss": 0.81981236, + "num_input_tokens_seen": 170551925, + "step": 7935, + "time_per_iteration": 2.59796404838562 + }, + { + "auxiliary_loss_clip": 0.01045968, + "auxiliary_loss_mlp": 0.0103554, + "balance_loss_clip": 1.02846956, + "balance_loss_mlp": 1.02345228, + "epoch": 0.4771381331730047, + "flos": 30918225375360.0, + "grad_norm": 1.612019124338063, + "language_loss": 0.71081185, + "learning_rate": 2.2445525871035177e-06, + "loss": 0.73162699, + "num_input_tokens_seen": 170572320, + "step": 7936, + "time_per_iteration": 2.73091721534729 + }, + { + "auxiliary_loss_clip": 0.01071846, + "auxiliary_loss_mlp": 0.01031609, + "balance_loss_clip": 1.02686417, + "balance_loss_mlp": 1.01910424, + "epoch": 0.47719825642567265, + "flos": 25738901539200.0, + "grad_norm": 1.968072739493563, + "language_loss": 0.67742705, + "learning_rate": 2.2441660432693656e-06, + "loss": 0.69846159, + "num_input_tokens_seen": 170589470, + "step": 7937, + "time_per_iteration": 2.593515396118164 + }, + { + "auxiliary_loss_clip": 0.01000405, + "auxiliary_loss_mlp": 0.01001567, + "balance_loss_clip": 1.00326276, + "balance_loss_mlp": 1.00027931, + "epoch": 0.4772583796783406, + "flos": 66355128668160.0, + "grad_norm": 0.7059090032722103, + "language_loss": 0.56335205, + "learning_rate": 2.2437794901764128e-06, + "loss": 0.58337176, + "num_input_tokens_seen": 170662265, + "step": 7938, + "time_per_iteration": 3.3905744552612305 + }, + { + "auxiliary_loss_clip": 0.01045023, + "auxiliary_loss_mlp": 0.01041369, + "balance_loss_clip": 1.02625275, + "balance_loss_mlp": 1.0277195, + "epoch": 0.4773185029310086, + "flos": 22051091070720.0, + "grad_norm": 2.5067809608551217, + "language_loss": 0.88944769, + "learning_rate": 2.243392927839317e-06, + "loss": 0.91031158, + "num_input_tokens_seen": 170679680, + "step": 7939, + "time_per_iteration": 2.610504150390625 + }, + { + "auxiliary_loss_clip": 0.01059053, + "auxiliary_loss_mlp": 0.01034423, + "balance_loss_clip": 1.02524257, + "balance_loss_mlp": 1.02297282, + "epoch": 0.47737862618367655, + "flos": 16727801523840.0, + "grad_norm": 1.8109517612353025, + "language_loss": 0.77147174, + "learning_rate": 2.2430063562727367e-06, + "loss": 0.7924065, + "num_input_tokens_seen": 170697340, + "step": 7940, + "time_per_iteration": 2.6267149448394775 + }, + { + "auxiliary_loss_clip": 0.01047837, + "auxiliary_loss_mlp": 0.01032464, + "balance_loss_clip": 1.0264864, + "balance_loss_mlp": 1.02176452, + "epoch": 0.4774387494363445, + "flos": 19609453100160.0, + "grad_norm": 1.5587140935891297, + "language_loss": 0.85009396, + "learning_rate": 2.2426197754913322e-06, + "loss": 0.870897, + "num_input_tokens_seen": 170714905, + "step": 7941, + "time_per_iteration": 2.640043020248413 + }, + { + "auxiliary_loss_clip": 0.01049104, + "auxiliary_loss_mlp": 0.0103258, + "balance_loss_clip": 1.02534914, + "balance_loss_mlp": 1.02035499, + "epoch": 0.4774988726890125, + "flos": 16653969118080.0, + "grad_norm": 1.7109921355185045, + "language_loss": 0.75637901, + "learning_rate": 2.24223318550976e-06, + "loss": 0.77719587, + "num_input_tokens_seen": 170731810, + "step": 7942, + "time_per_iteration": 2.581336259841919 + }, + { + "auxiliary_loss_clip": 0.01062815, + "auxiliary_loss_mlp": 0.01034443, + "balance_loss_clip": 1.02830184, + "balance_loss_mlp": 1.02298737, + "epoch": 0.47755899594168044, + "flos": 20485565729280.0, + "grad_norm": 2.3496382718769073, + "language_loss": 0.6480037, + "learning_rate": 2.241846586342682e-06, + "loss": 0.66897631, + "num_input_tokens_seen": 170750270, + "step": 7943, + "time_per_iteration": 2.570545196533203 + }, + { + "auxiliary_loss_clip": 0.01029262, + "auxiliary_loss_mlp": 0.0103145, + "balance_loss_clip": 1.02458465, + "balance_loss_mlp": 1.01876593, + "epoch": 0.4776191191943484, + "flos": 21652806090240.0, + "grad_norm": 2.182326789399433, + "language_loss": 0.73331738, + "learning_rate": 2.2414599780047577e-06, + "loss": 0.75392455, + "num_input_tokens_seen": 170769015, + "step": 7944, + "time_per_iteration": 2.7860124111175537 + }, + { + "auxiliary_loss_clip": 0.0106454, + "auxiliary_loss_mlp": 0.01032689, + "balance_loss_clip": 1.03035772, + "balance_loss_mlp": 1.01955855, + "epoch": 0.4776792424470164, + "flos": 18770220760320.0, + "grad_norm": 2.197142778386524, + "language_loss": 0.68129379, + "learning_rate": 2.2410733605106456e-06, + "loss": 0.70226604, + "num_input_tokens_seen": 170785725, + "step": 7945, + "time_per_iteration": 2.7289891242980957 + }, + { + "auxiliary_loss_clip": 0.01030211, + "auxiliary_loss_mlp": 0.00747616, + "balance_loss_clip": 1.02171588, + "balance_loss_mlp": 1.00045395, + "epoch": 0.47773936569968434, + "flos": 29715828577920.0, + "grad_norm": 1.797470556916516, + "language_loss": 0.75643271, + "learning_rate": 2.240686733875009e-06, + "loss": 0.77421099, + "num_input_tokens_seen": 170804600, + "step": 7946, + "time_per_iteration": 2.9282658100128174 + }, + { + "auxiliary_loss_clip": 0.01041759, + "auxiliary_loss_mlp": 0.01039483, + "balance_loss_clip": 1.02508891, + "balance_loss_mlp": 1.02659082, + "epoch": 0.4777994889523523, + "flos": 24791542283520.0, + "grad_norm": 1.7408600811054538, + "language_loss": 0.79222971, + "learning_rate": 2.240300098112506e-06, + "loss": 0.81304216, + "num_input_tokens_seen": 170824230, + "step": 7947, + "time_per_iteration": 2.658572196960449 + }, + { + "auxiliary_loss_clip": 0.0104904, + "auxiliary_loss_mlp": 0.01036032, + "balance_loss_clip": 1.02695107, + "balance_loss_mlp": 1.02424872, + "epoch": 0.47785961220502027, + "flos": 17858161595520.0, + "grad_norm": 1.9080043544053795, + "language_loss": 0.73754495, + "learning_rate": 2.2399134532377998e-06, + "loss": 0.75839573, + "num_input_tokens_seen": 170843365, + "step": 7948, + "time_per_iteration": 2.7097628116607666 + }, + { + "auxiliary_loss_clip": 0.01046727, + "auxiliary_loss_mlp": 0.01030082, + "balance_loss_clip": 1.02516747, + "balance_loss_mlp": 1.01754713, + "epoch": 0.4779197354576883, + "flos": 20266546550400.0, + "grad_norm": 1.417923360826774, + "language_loss": 0.78026962, + "learning_rate": 2.2395267992655514e-06, + "loss": 0.80103773, + "num_input_tokens_seen": 170863515, + "step": 7949, + "time_per_iteration": 2.6890759468078613 + }, + { + "auxiliary_loss_clip": 0.01040158, + "auxiliary_loss_mlp": 0.01029626, + "balance_loss_clip": 1.02291417, + "balance_loss_mlp": 1.01803899, + "epoch": 0.47797985871035625, + "flos": 17056599644160.0, + "grad_norm": 4.812554306662443, + "language_loss": 0.73850846, + "learning_rate": 2.2391401362104227e-06, + "loss": 0.7592063, + "num_input_tokens_seen": 170881245, + "step": 7950, + "time_per_iteration": 2.6254758834838867 + }, + { + "auxiliary_loss_clip": 0.01038684, + "auxiliary_loss_mlp": 0.01040224, + "balance_loss_clip": 1.0234797, + "balance_loss_mlp": 1.02703357, + "epoch": 0.4780399819630242, + "flos": 31358418549120.0, + "grad_norm": 1.5559745118100128, + "language_loss": 0.74136275, + "learning_rate": 2.2387534640870756e-06, + "loss": 0.76215184, + "num_input_tokens_seen": 170901285, + "step": 7951, + "time_per_iteration": 2.783017873764038 + }, + { + "auxiliary_loss_clip": 0.01045293, + "auxiliary_loss_mlp": 0.01034782, + "balance_loss_clip": 1.02859116, + "balance_loss_mlp": 1.02184796, + "epoch": 0.4781001052156922, + "flos": 24899597372160.0, + "grad_norm": 2.065739277667601, + "language_loss": 0.80082381, + "learning_rate": 2.238366782910174e-06, + "loss": 0.82162458, + "num_input_tokens_seen": 170919740, + "step": 7952, + "time_per_iteration": 2.724635362625122 + }, + { + "auxiliary_loss_clip": 0.01047096, + "auxiliary_loss_mlp": 0.01035662, + "balance_loss_clip": 1.02343667, + "balance_loss_mlp": 1.02299643, + "epoch": 0.47816022846836015, + "flos": 18697717157760.0, + "grad_norm": 1.6365034198575812, + "language_loss": 0.7863363, + "learning_rate": 2.23798009269438e-06, + "loss": 0.80716383, + "num_input_tokens_seen": 170938510, + "step": 7953, + "time_per_iteration": 2.641378879547119 + }, + { + "auxiliary_loss_clip": 0.01062762, + "auxiliary_loss_mlp": 0.01031622, + "balance_loss_clip": 1.02675962, + "balance_loss_mlp": 1.01942682, + "epoch": 0.4782203517210281, + "flos": 11977573559040.0, + "grad_norm": 6.990386012598686, + "language_loss": 0.84286129, + "learning_rate": 2.2375933934543566e-06, + "loss": 0.86380506, + "num_input_tokens_seen": 170951170, + "step": 7954, + "time_per_iteration": 2.649731159210205 + }, + { + "auxiliary_loss_clip": 0.01040316, + "auxiliary_loss_mlp": 0.01031214, + "balance_loss_clip": 1.02455938, + "balance_loss_mlp": 1.01894689, + "epoch": 0.4782804749736961, + "flos": 20813501923200.0, + "grad_norm": 1.4123330049076563, + "language_loss": 0.70406908, + "learning_rate": 2.237206685204768e-06, + "loss": 0.72478437, + "num_input_tokens_seen": 170970990, + "step": 7955, + "time_per_iteration": 2.6639175415039062 + }, + { + "auxiliary_loss_clip": 0.01043563, + "auxiliary_loss_mlp": 0.01032608, + "balance_loss_clip": 1.02572393, + "balance_loss_mlp": 1.02104485, + "epoch": 0.47834059822636404, + "flos": 23840304359040.0, + "grad_norm": 1.9562735137098677, + "language_loss": 0.81974208, + "learning_rate": 2.2368199679602787e-06, + "loss": 0.84050381, + "num_input_tokens_seen": 170991215, + "step": 7956, + "time_per_iteration": 2.6025240421295166 + }, + { + "auxiliary_loss_clip": 0.01050299, + "auxiliary_loss_mlp": 0.01033841, + "balance_loss_clip": 1.02644372, + "balance_loss_mlp": 1.02141976, + "epoch": 0.478400721479032, + "flos": 22633777497600.0, + "grad_norm": 2.1211952661705276, + "language_loss": 0.84424388, + "learning_rate": 2.2364332417355516e-06, + "loss": 0.8650853, + "num_input_tokens_seen": 171007325, + "step": 7957, + "time_per_iteration": 2.822134494781494 + }, + { + "auxiliary_loss_clip": 0.01056606, + "auxiliary_loss_mlp": 0.01033307, + "balance_loss_clip": 1.02426267, + "balance_loss_mlp": 1.02205932, + "epoch": 0.4784608447317, + "flos": 19354954262400.0, + "grad_norm": 1.7183379862941386, + "language_loss": 0.79652011, + "learning_rate": 2.2360465065452527e-06, + "loss": 0.81741917, + "num_input_tokens_seen": 171025650, + "step": 7958, + "time_per_iteration": 2.552549362182617 + }, + { + "auxiliary_loss_clip": 0.01023819, + "auxiliary_loss_mlp": 0.00747428, + "balance_loss_clip": 1.02001691, + "balance_loss_mlp": 1.00033808, + "epoch": 0.47852096798436794, + "flos": 24021114445440.0, + "grad_norm": 2.24531828408201, + "language_loss": 0.83561349, + "learning_rate": 2.235659762404047e-06, + "loss": 0.85332584, + "num_input_tokens_seen": 171045045, + "step": 7959, + "time_per_iteration": 2.6952335834503174 + }, + { + "auxiliary_loss_clip": 0.01037227, + "auxiliary_loss_mlp": 0.01026861, + "balance_loss_clip": 1.02599561, + "balance_loss_mlp": 1.01646638, + "epoch": 0.4785810912370359, + "flos": 25666433850240.0, + "grad_norm": 2.813155559265851, + "language_loss": 0.73023802, + "learning_rate": 2.235273009326599e-06, + "loss": 0.75087893, + "num_input_tokens_seen": 171062910, + "step": 7960, + "time_per_iteration": 2.745574951171875 + }, + { + "auxiliary_loss_clip": 0.01037246, + "auxiliary_loss_mlp": 0.01032217, + "balance_loss_clip": 1.02649999, + "balance_loss_mlp": 1.02134526, + "epoch": 0.47864121448970387, + "flos": 21432134885760.0, + "grad_norm": 1.7563816053005439, + "language_loss": 0.76712894, + "learning_rate": 2.2348862473275745e-06, + "loss": 0.78782356, + "num_input_tokens_seen": 171080875, + "step": 7961, + "time_per_iteration": 2.6456875801086426 + }, + { + "auxiliary_loss_clip": 0.01038879, + "auxiliary_loss_mlp": 0.0102923, + "balance_loss_clip": 1.02772617, + "balance_loss_mlp": 1.01734471, + "epoch": 0.47870133774237184, + "flos": 16143894034560.0, + "grad_norm": 1.6947312269873829, + "language_loss": 0.77342904, + "learning_rate": 2.2344994764216405e-06, + "loss": 0.79411012, + "num_input_tokens_seen": 171099190, + "step": 7962, + "time_per_iteration": 2.654038190841675 + }, + { + "auxiliary_loss_clip": 0.01046308, + "auxiliary_loss_mlp": 0.01031553, + "balance_loss_clip": 1.02569425, + "balance_loss_mlp": 1.01978135, + "epoch": 0.47876146099503986, + "flos": 26906788344960.0, + "grad_norm": 1.854496971789501, + "language_loss": 0.6493175, + "learning_rate": 2.2341126966234635e-06, + "loss": 0.67009616, + "num_input_tokens_seen": 171119060, + "step": 7963, + "time_per_iteration": 2.6942179203033447 + }, + { + "auxiliary_loss_clip": 0.01056591, + "auxiliary_loss_mlp": 0.01033103, + "balance_loss_clip": 1.02477646, + "balance_loss_mlp": 1.02118194, + "epoch": 0.4788215842477078, + "flos": 45332085778560.0, + "grad_norm": 2.3796760395778853, + "language_loss": 0.77895898, + "learning_rate": 2.2337259079477083e-06, + "loss": 0.79985589, + "num_input_tokens_seen": 171141900, + "step": 7964, + "time_per_iteration": 2.77925968170166 + }, + { + "auxiliary_loss_clip": 0.01060718, + "auxiliary_loss_mlp": 0.01031907, + "balance_loss_clip": 1.02511978, + "balance_loss_mlp": 1.0188359, + "epoch": 0.4788817075003758, + "flos": 22237180456320.0, + "grad_norm": 3.6395077228925854, + "language_loss": 0.7647168, + "learning_rate": 2.233339110409044e-06, + "loss": 0.78564298, + "num_input_tokens_seen": 171161045, + "step": 7965, + "time_per_iteration": 2.599555492401123 + }, + { + "auxiliary_loss_clip": 0.01005949, + "auxiliary_loss_mlp": 0.01036764, + "balance_loss_clip": 1.02086115, + "balance_loss_mlp": 1.02348995, + "epoch": 0.47894183075304375, + "flos": 16471183783680.0, + "grad_norm": 1.769949992154439, + "language_loss": 0.74626434, + "learning_rate": 2.232952304022137e-06, + "loss": 0.76669151, + "num_input_tokens_seen": 171179675, + "step": 7966, + "time_per_iteration": 2.645048141479492 + }, + { + "auxiliary_loss_clip": 0.01048637, + "auxiliary_loss_mlp": 0.01031136, + "balance_loss_clip": 1.02677286, + "balance_loss_mlp": 1.018381, + "epoch": 0.4790019540057117, + "flos": 24282688262400.0, + "grad_norm": 1.5814151010283293, + "language_loss": 0.73044527, + "learning_rate": 2.232565488801655e-06, + "loss": 0.751243, + "num_input_tokens_seen": 171201175, + "step": 7967, + "time_per_iteration": 2.691723585128784 + }, + { + "auxiliary_loss_clip": 0.01042973, + "auxiliary_loss_mlp": 0.01025195, + "balance_loss_clip": 1.02312016, + "balance_loss_mlp": 1.01352477, + "epoch": 0.4790620772583797, + "flos": 25666469763840.0, + "grad_norm": 1.8153997602733472, + "language_loss": 0.79433459, + "learning_rate": 2.232178664762267e-06, + "loss": 0.81501633, + "num_input_tokens_seen": 171221750, + "step": 7968, + "time_per_iteration": 4.401210308074951 + }, + { + "auxiliary_loss_clip": 0.00984352, + "auxiliary_loss_mlp": 0.01003171, + "balance_loss_clip": 1.00598729, + "balance_loss_mlp": 1.00188959, + "epoch": 0.47912220051104765, + "flos": 69428077102080.0, + "grad_norm": 0.7618578332805253, + "language_loss": 0.62207896, + "learning_rate": 2.2317918319186408e-06, + "loss": 0.64195418, + "num_input_tokens_seen": 171292235, + "step": 7969, + "time_per_iteration": 5.024095773696899 + }, + { + "auxiliary_loss_clip": 0.01035824, + "auxiliary_loss_mlp": 0.01029011, + "balance_loss_clip": 1.02622008, + "balance_loss_mlp": 1.01812148, + "epoch": 0.4791823237637156, + "flos": 24168922911360.0, + "grad_norm": 1.3881419957105723, + "language_loss": 0.77383882, + "learning_rate": 2.2314049902854446e-06, + "loss": 0.79448724, + "num_input_tokens_seen": 171312215, + "step": 7970, + "time_per_iteration": 2.701831579208374 + }, + { + "auxiliary_loss_clip": 0.01059174, + "auxiliary_loss_mlp": 0.01030567, + "balance_loss_clip": 1.02529621, + "balance_loss_mlp": 1.01837826, + "epoch": 0.4792424470163836, + "flos": 24751465683840.0, + "grad_norm": 2.3538771611670684, + "language_loss": 0.69929534, + "learning_rate": 2.231018139877349e-06, + "loss": 0.72019279, + "num_input_tokens_seen": 171332975, + "step": 7971, + "time_per_iteration": 2.6544225215911865 + }, + { + "auxiliary_loss_clip": 0.01016256, + "auxiliary_loss_mlp": 0.01030929, + "balance_loss_clip": 1.02179921, + "balance_loss_mlp": 1.01841772, + "epoch": 0.47930257026905154, + "flos": 23257905240960.0, + "grad_norm": 2.0700275423127463, + "language_loss": 0.79888475, + "learning_rate": 2.230631280709021e-06, + "loss": 0.81935662, + "num_input_tokens_seen": 171353880, + "step": 7972, + "time_per_iteration": 2.870713233947754 + }, + { + "auxiliary_loss_clip": 0.01061184, + "auxiliary_loss_mlp": 0.01024928, + "balance_loss_clip": 1.02627015, + "balance_loss_mlp": 1.01288188, + "epoch": 0.4793626935217195, + "flos": 14064091718400.0, + "grad_norm": 2.1614292080822426, + "language_loss": 0.69618219, + "learning_rate": 2.2302444127951327e-06, + "loss": 0.7170434, + "num_input_tokens_seen": 171370930, + "step": 7973, + "time_per_iteration": 2.644472122192383 + }, + { + "auxiliary_loss_clip": 0.01061816, + "auxiliary_loss_mlp": 0.01033128, + "balance_loss_clip": 1.02925467, + "balance_loss_mlp": 1.022071, + "epoch": 0.4794228167743875, + "flos": 21798854789760.0, + "grad_norm": 1.9530855853340192, + "language_loss": 0.78799701, + "learning_rate": 2.2298575361503523e-06, + "loss": 0.80894637, + "num_input_tokens_seen": 171387575, + "step": 7974, + "time_per_iteration": 2.6586320400238037 + }, + { + "auxiliary_loss_clip": 0.00991649, + "auxiliary_loss_mlp": 0.01005357, + "balance_loss_clip": 1.00447667, + "balance_loss_mlp": 1.0037837, + "epoch": 0.47948294002705544, + "flos": 66968805553920.0, + "grad_norm": 0.7533398820782898, + "language_loss": 0.53979874, + "learning_rate": 2.2294706507893517e-06, + "loss": 0.5597688, + "num_input_tokens_seen": 171449980, + "step": 7975, + "time_per_iteration": 3.2542574405670166 + }, + { + "auxiliary_loss_clip": 0.01054916, + "auxiliary_loss_mlp": 0.01036354, + "balance_loss_clip": 1.02703714, + "balance_loss_mlp": 1.02265739, + "epoch": 0.47954306327972346, + "flos": 12422471414400.0, + "grad_norm": 2.260890533285808, + "language_loss": 0.89527869, + "learning_rate": 2.2290837567268008e-06, + "loss": 0.91619134, + "num_input_tokens_seen": 171465290, + "step": 7976, + "time_per_iteration": 2.568844795227051 + }, + { + "auxiliary_loss_clip": 0.01075747, + "auxiliary_loss_mlp": 0.0103748, + "balance_loss_clip": 1.02866769, + "balance_loss_mlp": 1.02403283, + "epoch": 0.4796031865323914, + "flos": 18361951799040.0, + "grad_norm": 2.751477870744106, + "language_loss": 0.73193419, + "learning_rate": 2.2286968539773713e-06, + "loss": 0.75306648, + "num_input_tokens_seen": 171481130, + "step": 7977, + "time_per_iteration": 2.5164425373077393 + }, + { + "auxiliary_loss_clip": 0.01050862, + "auxiliary_loss_mlp": 0.0074753, + "balance_loss_clip": 1.02403224, + "balance_loss_mlp": 1.00032735, + "epoch": 0.4796633097850594, + "flos": 21835088634240.0, + "grad_norm": 3.2218881832613366, + "language_loss": 0.78632569, + "learning_rate": 2.228309942555734e-06, + "loss": 0.80430955, + "num_input_tokens_seen": 171501140, + "step": 7978, + "time_per_iteration": 2.547905206680298 + }, + { + "auxiliary_loss_clip": 0.01049961, + "auxiliary_loss_mlp": 0.01033076, + "balance_loss_clip": 1.02483332, + "balance_loss_mlp": 1.02124405, + "epoch": 0.47972343303772735, + "flos": 23437350610560.0, + "grad_norm": 1.6494418657612295, + "language_loss": 0.89232624, + "learning_rate": 2.22792302247656e-06, + "loss": 0.91315663, + "num_input_tokens_seen": 171519835, + "step": 7979, + "time_per_iteration": 4.236259460449219 + }, + { + "auxiliary_loss_clip": 0.01063384, + "auxiliary_loss_mlp": 0.01034396, + "balance_loss_clip": 1.02811718, + "balance_loss_mlp": 1.02122319, + "epoch": 0.4797835562903953, + "flos": 24899776940160.0, + "grad_norm": 1.418117296669068, + "language_loss": 0.77051938, + "learning_rate": 2.227536093754523e-06, + "loss": 0.79149711, + "num_input_tokens_seen": 171540980, + "step": 7980, + "time_per_iteration": 4.2851457595825195 + }, + { + "auxiliary_loss_clip": 0.01046155, + "auxiliary_loss_mlp": 0.0103198, + "balance_loss_clip": 1.02732539, + "balance_loss_mlp": 1.01872432, + "epoch": 0.4798436795430633, + "flos": 35042996793600.0, + "grad_norm": 2.14926188722645, + "language_loss": 0.71621454, + "learning_rate": 2.227149156404295e-06, + "loss": 0.73699588, + "num_input_tokens_seen": 171563600, + "step": 7981, + "time_per_iteration": 2.7602837085723877 + }, + { + "auxiliary_loss_clip": 0.010692, + "auxiliary_loss_mlp": 0.01028076, + "balance_loss_clip": 1.02776051, + "balance_loss_mlp": 1.01644731, + "epoch": 0.47990380279573125, + "flos": 20590209025920.0, + "grad_norm": 1.8843939185936773, + "language_loss": 0.70376742, + "learning_rate": 2.2267622104405473e-06, + "loss": 0.72474021, + "num_input_tokens_seen": 171580700, + "step": 7982, + "time_per_iteration": 2.5100157260894775 + }, + { + "auxiliary_loss_clip": 0.01045318, + "auxiliary_loss_mlp": 0.0102762, + "balance_loss_clip": 1.0253762, + "balance_loss_mlp": 1.0173918, + "epoch": 0.4799639260483992, + "flos": 26359402008960.0, + "grad_norm": 1.6936843565655795, + "language_loss": 0.71236461, + "learning_rate": 2.2263752558779544e-06, + "loss": 0.73309404, + "num_input_tokens_seen": 171602035, + "step": 7983, + "time_per_iteration": 2.664018392562866 + }, + { + "auxiliary_loss_clip": 0.01001765, + "auxiliary_loss_mlp": 0.00746455, + "balance_loss_clip": 1.00342131, + "balance_loss_mlp": 1.0002234, + "epoch": 0.4800240493010672, + "flos": 70979021521920.0, + "grad_norm": 0.8388092925791771, + "language_loss": 0.5939467, + "learning_rate": 2.2259882927311883e-06, + "loss": 0.61142886, + "num_input_tokens_seen": 171659215, + "step": 7984, + "time_per_iteration": 3.254427909851074 + }, + { + "auxiliary_loss_clip": 0.01009468, + "auxiliary_loss_mlp": 0.01043892, + "balance_loss_clip": 1.02033532, + "balance_loss_mlp": 1.02998018, + "epoch": 0.48008417255373514, + "flos": 17086656349440.0, + "grad_norm": 1.592664233226537, + "language_loss": 0.66594756, + "learning_rate": 2.2256013210149247e-06, + "loss": 0.68648124, + "num_input_tokens_seen": 171675710, + "step": 7985, + "time_per_iteration": 2.9149816036224365 + }, + { + "auxiliary_loss_clip": 0.01046238, + "auxiliary_loss_mlp": 0.01038072, + "balance_loss_clip": 1.02335262, + "balance_loss_mlp": 1.02535808, + "epoch": 0.4801442958064031, + "flos": 15413435055360.0, + "grad_norm": 2.0241635553745994, + "language_loss": 0.7018503, + "learning_rate": 2.225214340743835e-06, + "loss": 0.72269344, + "num_input_tokens_seen": 171692510, + "step": 7986, + "time_per_iteration": 2.7047789096832275 + }, + { + "auxiliary_loss_clip": 0.01045378, + "auxiliary_loss_mlp": 0.0103958, + "balance_loss_clip": 1.02834177, + "balance_loss_mlp": 1.02674699, + "epoch": 0.4802044190590711, + "flos": 11473747441920.0, + "grad_norm": 1.965823922923259, + "language_loss": 0.79000545, + "learning_rate": 2.2248273519325956e-06, + "loss": 0.81085503, + "num_input_tokens_seen": 171710235, + "step": 7987, + "time_per_iteration": 2.860672950744629 + }, + { + "auxiliary_loss_clip": 0.01018906, + "auxiliary_loss_mlp": 0.01035796, + "balance_loss_clip": 1.0229454, + "balance_loss_mlp": 1.02352333, + "epoch": 0.48026454231173904, + "flos": 20951003185920.0, + "grad_norm": 1.7832220215198615, + "language_loss": 0.75588179, + "learning_rate": 2.2244403545958812e-06, + "loss": 0.77642882, + "num_input_tokens_seen": 171726715, + "step": 7988, + "time_per_iteration": 3.0231213569641113 + }, + { + "auxiliary_loss_clip": 0.01041534, + "auxiliary_loss_mlp": 0.01030947, + "balance_loss_clip": 1.02796412, + "balance_loss_mlp": 1.01913357, + "epoch": 0.48032466556440706, + "flos": 20448110822400.0, + "grad_norm": 2.23307892035037, + "language_loss": 0.79081804, + "learning_rate": 2.224053348748365e-06, + "loss": 0.81154281, + "num_input_tokens_seen": 171743605, + "step": 7989, + "time_per_iteration": 2.887188196182251 + }, + { + "auxiliary_loss_clip": 0.0105298, + "auxiliary_loss_mlp": 0.010398, + "balance_loss_clip": 1.02653086, + "balance_loss_mlp": 1.02690768, + "epoch": 0.480384788817075, + "flos": 37120823861760.0, + "grad_norm": 1.7195600423303001, + "language_loss": 0.73501062, + "learning_rate": 2.223666334404724e-06, + "loss": 0.75593835, + "num_input_tokens_seen": 171765445, + "step": 7990, + "time_per_iteration": 2.9610683917999268 + }, + { + "auxiliary_loss_clip": 0.01000883, + "auxiliary_loss_mlp": 0.00746559, + "balance_loss_clip": 1.0021348, + "balance_loss_mlp": 1.00016201, + "epoch": 0.480444912069743, + "flos": 69552577641600.0, + "grad_norm": 0.7683861941401813, + "language_loss": 0.59041214, + "learning_rate": 2.223279311579633e-06, + "loss": 0.60788655, + "num_input_tokens_seen": 171830115, + "step": 7991, + "time_per_iteration": 3.3749847412109375 + }, + { + "auxiliary_loss_clip": 0.01061243, + "auxiliary_loss_mlp": 0.00747565, + "balance_loss_clip": 1.02692378, + "balance_loss_mlp": 1.00029075, + "epoch": 0.48050503532241096, + "flos": 29822231640960.0, + "grad_norm": 1.6911388954038222, + "language_loss": 0.66597581, + "learning_rate": 2.222892280287768e-06, + "loss": 0.68406391, + "num_input_tokens_seen": 171849135, + "step": 7992, + "time_per_iteration": 2.7676498889923096 + }, + { + "auxiliary_loss_clip": 0.01046897, + "auxiliary_loss_mlp": 0.01031976, + "balance_loss_clip": 1.02520823, + "balance_loss_mlp": 1.01985228, + "epoch": 0.4805651585750789, + "flos": 23948539015680.0, + "grad_norm": 1.6442355622690923, + "language_loss": 0.76320797, + "learning_rate": 2.2225052405438056e-06, + "loss": 0.7839967, + "num_input_tokens_seen": 171868880, + "step": 7993, + "time_per_iteration": 2.6332743167877197 + }, + { + "auxiliary_loss_clip": 0.01021836, + "auxiliary_loss_mlp": 0.01037023, + "balance_loss_clip": 1.02351832, + "balance_loss_mlp": 1.0248096, + "epoch": 0.4806252818277469, + "flos": 25665428269440.0, + "grad_norm": 1.5387190285817711, + "language_loss": 0.78371054, + "learning_rate": 2.222118192362422e-06, + "loss": 0.80429912, + "num_input_tokens_seen": 171889455, + "step": 7994, + "time_per_iteration": 2.7986795902252197 + }, + { + "auxiliary_loss_clip": 0.01051557, + "auxiliary_loss_mlp": 0.01027824, + "balance_loss_clip": 1.02708292, + "balance_loss_mlp": 1.0160048, + "epoch": 0.48068540508041485, + "flos": 13151996640000.0, + "grad_norm": 2.0940305284927785, + "language_loss": 0.79461181, + "learning_rate": 2.2217311357582946e-06, + "loss": 0.81540561, + "num_input_tokens_seen": 171906070, + "step": 7995, + "time_per_iteration": 2.6525626182556152 + }, + { + "auxiliary_loss_clip": 0.01022912, + "auxiliary_loss_mlp": 0.010313, + "balance_loss_clip": 1.02517736, + "balance_loss_mlp": 1.01874161, + "epoch": 0.4807455283330828, + "flos": 21176738208000.0, + "grad_norm": 1.434226558207948, + "language_loss": 0.82559478, + "learning_rate": 2.2213440707461e-06, + "loss": 0.84613693, + "num_input_tokens_seen": 171926515, + "step": 7996, + "time_per_iteration": 2.9510138034820557 + }, + { + "auxiliary_loss_clip": 0.01005979, + "auxiliary_loss_mlp": 0.01034035, + "balance_loss_clip": 1.02211595, + "balance_loss_mlp": 1.02178621, + "epoch": 0.4808056515857508, + "flos": 12275991751680.0, + "grad_norm": 1.7186843921876078, + "language_loss": 0.80637771, + "learning_rate": 2.220956997340516e-06, + "loss": 0.82677782, + "num_input_tokens_seen": 171943845, + "step": 7997, + "time_per_iteration": 2.770240306854248 + }, + { + "auxiliary_loss_clip": 0.01025561, + "auxiliary_loss_mlp": 0.01031766, + "balance_loss_clip": 1.02410269, + "balance_loss_mlp": 1.0189985, + "epoch": 0.48086577483841875, + "flos": 24826052275200.0, + "grad_norm": 1.6424289894199253, + "language_loss": 0.72446638, + "learning_rate": 2.220569915556221e-06, + "loss": 0.7450397, + "num_input_tokens_seen": 171964970, + "step": 7998, + "time_per_iteration": 2.841346502304077 + }, + { + "auxiliary_loss_clip": 0.01071764, + "auxiliary_loss_mlp": 0.01028892, + "balance_loss_clip": 1.02755523, + "balance_loss_mlp": 1.01677489, + "epoch": 0.4809258980910867, + "flos": 24465365856000.0, + "grad_norm": 1.7155385886458245, + "language_loss": 0.70598066, + "learning_rate": 2.220182825407892e-06, + "loss": 0.72698724, + "num_input_tokens_seen": 171986340, + "step": 7999, + "time_per_iteration": 2.6434953212738037 + }, + { + "auxiliary_loss_clip": 0.01061751, + "auxiliary_loss_mlp": 0.01043456, + "balance_loss_clip": 1.02692914, + "balance_loss_mlp": 1.03130841, + "epoch": 0.4809860213437547, + "flos": 21215952881280.0, + "grad_norm": 1.4611001691239185, + "language_loss": 0.71303982, + "learning_rate": 2.2197957269102083e-06, + "loss": 0.73409188, + "num_input_tokens_seen": 172007300, + "step": 8000, + "time_per_iteration": 2.6090002059936523 + }, + { + "auxiliary_loss_clip": 0.01064046, + "auxiliary_loss_mlp": 0.01032855, + "balance_loss_clip": 1.02917767, + "balance_loss_mlp": 1.01981306, + "epoch": 0.48104614459642264, + "flos": 37632084094080.0, + "grad_norm": 1.3706160705283232, + "language_loss": 0.74814391, + "learning_rate": 2.2194086200778485e-06, + "loss": 0.76911288, + "num_input_tokens_seen": 172029585, + "step": 8001, + "time_per_iteration": 2.7966065406799316 + }, + { + "auxiliary_loss_clip": 0.01063096, + "auxiliary_loss_mlp": 0.0103993, + "balance_loss_clip": 1.02737546, + "balance_loss_mlp": 1.02689457, + "epoch": 0.48110626784909066, + "flos": 18406122549120.0, + "grad_norm": 1.7301169737454936, + "language_loss": 0.81596661, + "learning_rate": 2.219021504925493e-06, + "loss": 0.83699685, + "num_input_tokens_seen": 172047495, + "step": 8002, + "time_per_iteration": 2.601492166519165 + }, + { + "auxiliary_loss_clip": 0.01067012, + "auxiliary_loss_mlp": 0.01037509, + "balance_loss_clip": 1.02990758, + "balance_loss_mlp": 1.02420545, + "epoch": 0.48116639110175863, + "flos": 28439814856320.0, + "grad_norm": 5.397074582825813, + "language_loss": 0.71201015, + "learning_rate": 2.218634381467819e-06, + "loss": 0.73305535, + "num_input_tokens_seen": 172067625, + "step": 8003, + "time_per_iteration": 2.8248450756073 + }, + { + "auxiliary_loss_clip": 0.01054867, + "auxiliary_loss_mlp": 0.01038094, + "balance_loss_clip": 1.02729464, + "balance_loss_mlp": 1.02589273, + "epoch": 0.4812265143544266, + "flos": 21725237865600.0, + "grad_norm": 1.592935283217966, + "language_loss": 0.82156861, + "learning_rate": 2.218247249719507e-06, + "loss": 0.84249824, + "num_input_tokens_seen": 172087885, + "step": 8004, + "time_per_iteration": 2.6414496898651123 + }, + { + "auxiliary_loss_clip": 0.01051066, + "auxiliary_loss_mlp": 0.01037264, + "balance_loss_clip": 1.02705443, + "balance_loss_mlp": 1.02190351, + "epoch": 0.48128663760709456, + "flos": 13224679810560.0, + "grad_norm": 2.9737440013353926, + "language_loss": 0.77738619, + "learning_rate": 2.217860109695239e-06, + "loss": 0.79826951, + "num_input_tokens_seen": 172105815, + "step": 8005, + "time_per_iteration": 2.572087287902832 + }, + { + "auxiliary_loss_clip": 0.0106484, + "auxiliary_loss_mlp": 0.0103971, + "balance_loss_clip": 1.02999425, + "balance_loss_mlp": 1.02653766, + "epoch": 0.4813467608597625, + "flos": 24243437675520.0, + "grad_norm": 2.3864085228210343, + "language_loss": 0.71180278, + "learning_rate": 2.217472961409692e-06, + "loss": 0.73284829, + "num_input_tokens_seen": 172126125, + "step": 8006, + "time_per_iteration": 2.6634507179260254 + }, + { + "auxiliary_loss_clip": 0.01041027, + "auxiliary_loss_mlp": 0.01035653, + "balance_loss_clip": 1.02467692, + "balance_loss_mlp": 1.02222395, + "epoch": 0.4814068841124305, + "flos": 27480424544640.0, + "grad_norm": 1.7740158955821739, + "language_loss": 0.70931613, + "learning_rate": 2.2170858048775495e-06, + "loss": 0.73008287, + "num_input_tokens_seen": 172141945, + "step": 8007, + "time_per_iteration": 2.680579423904419 + }, + { + "auxiliary_loss_clip": 0.01074599, + "auxiliary_loss_mlp": 0.01030968, + "balance_loss_clip": 1.0289278, + "balance_loss_mlp": 1.01867795, + "epoch": 0.48146700736509845, + "flos": 19572896033280.0, + "grad_norm": 2.301475841316452, + "language_loss": 0.71599233, + "learning_rate": 2.2166986401134914e-06, + "loss": 0.73704803, + "num_input_tokens_seen": 172161095, + "step": 8008, + "time_per_iteration": 2.868283748626709 + }, + { + "auxiliary_loss_clip": 0.0103836, + "auxiliary_loss_mlp": 0.0104121, + "balance_loss_clip": 1.02642274, + "balance_loss_mlp": 1.02739966, + "epoch": 0.4815271306177664, + "flos": 20627771673600.0, + "grad_norm": 1.8657502717944325, + "language_loss": 0.60871458, + "learning_rate": 2.216311467132199e-06, + "loss": 0.62951028, + "num_input_tokens_seen": 172178750, + "step": 8009, + "time_per_iteration": 2.963731050491333 + }, + { + "auxiliary_loss_clip": 0.00982678, + "auxiliary_loss_mlp": 0.01000732, + "balance_loss_clip": 1.00455403, + "balance_loss_mlp": 0.99952757, + "epoch": 0.4815872538704344, + "flos": 67691076232320.0, + "grad_norm": 0.8810499411849042, + "language_loss": 0.6136893, + "learning_rate": 2.2159242859483547e-06, + "loss": 0.63352334, + "num_input_tokens_seen": 172240235, + "step": 8010, + "time_per_iteration": 3.273008108139038 + }, + { + "auxiliary_loss_clip": 0.01062737, + "auxiliary_loss_mlp": 0.01041138, + "balance_loss_clip": 1.0274688, + "balance_loss_mlp": 1.02731538, + "epoch": 0.48164737712310235, + "flos": 22820764723200.0, + "grad_norm": 1.7574034690319231, + "language_loss": 0.73231375, + "learning_rate": 2.215537096576639e-06, + "loss": 0.7533524, + "num_input_tokens_seen": 172259875, + "step": 8011, + "time_per_iteration": 2.7233264446258545 + }, + { + "auxiliary_loss_clip": 0.0104415, + "auxiliary_loss_mlp": 0.01031735, + "balance_loss_clip": 1.02370143, + "balance_loss_mlp": 1.01960528, + "epoch": 0.4817075003757703, + "flos": 23733865382400.0, + "grad_norm": 1.8499665723071055, + "language_loss": 0.79384971, + "learning_rate": 2.2151498990317354e-06, + "loss": 0.81460857, + "num_input_tokens_seen": 172280150, + "step": 8012, + "time_per_iteration": 2.8078556060791016 + }, + { + "auxiliary_loss_clip": 0.01043397, + "auxiliary_loss_mlp": 0.01041709, + "balance_loss_clip": 1.02799261, + "balance_loss_mlp": 1.02870274, + "epoch": 0.4817676236284383, + "flos": 28182909807360.0, + "grad_norm": 1.6625863373138916, + "language_loss": 0.73813641, + "learning_rate": 2.214762693328326e-06, + "loss": 0.75898749, + "num_input_tokens_seen": 172300810, + "step": 8013, + "time_per_iteration": 2.855207920074463 + }, + { + "auxiliary_loss_clip": 0.01055226, + "auxiliary_loss_mlp": 0.01033298, + "balance_loss_clip": 1.03128922, + "balance_loss_mlp": 1.02109075, + "epoch": 0.48182774688110624, + "flos": 17091756080640.0, + "grad_norm": 2.04797204304045, + "language_loss": 0.90548623, + "learning_rate": 2.214375479481094e-06, + "loss": 0.92637146, + "num_input_tokens_seen": 172317930, + "step": 8014, + "time_per_iteration": 2.743849754333496 + }, + { + "auxiliary_loss_clip": 0.01074518, + "auxiliary_loss_mlp": 0.01037752, + "balance_loss_clip": 1.0274893, + "balance_loss_mlp": 1.02398372, + "epoch": 0.4818878701337742, + "flos": 12567873669120.0, + "grad_norm": 5.276712275140706, + "language_loss": 0.74575472, + "learning_rate": 2.213988257504722e-06, + "loss": 0.76687747, + "num_input_tokens_seen": 172336340, + "step": 8015, + "time_per_iteration": 2.626924514770508 + }, + { + "auxiliary_loss_clip": 0.01056478, + "auxiliary_loss_mlp": 0.01037053, + "balance_loss_clip": 1.02770543, + "balance_loss_mlp": 1.02334428, + "epoch": 0.48194799338644223, + "flos": 24608505553920.0, + "grad_norm": 2.1581145411409306, + "language_loss": 0.80227804, + "learning_rate": 2.213601027413894e-06, + "loss": 0.8232134, + "num_input_tokens_seen": 172354315, + "step": 8016, + "time_per_iteration": 5.8007283210754395 + }, + { + "auxiliary_loss_clip": 0.01061841, + "auxiliary_loss_mlp": 0.01030252, + "balance_loss_clip": 1.02959871, + "balance_loss_mlp": 1.0182656, + "epoch": 0.4820081166391102, + "flos": 21105204272640.0, + "grad_norm": 1.811238456665179, + "language_loss": 0.77130365, + "learning_rate": 2.2132137892232933e-06, + "loss": 0.79222465, + "num_input_tokens_seen": 172372695, + "step": 8017, + "time_per_iteration": 2.574270248413086 + }, + { + "auxiliary_loss_clip": 0.01060732, + "auxiliary_loss_mlp": 0.01030648, + "balance_loss_clip": 1.0289284, + "balance_loss_mlp": 1.01776767, + "epoch": 0.48206823989177816, + "flos": 25264593423360.0, + "grad_norm": 2.03171620584223, + "language_loss": 0.80145884, + "learning_rate": 2.2128265429476043e-06, + "loss": 0.82237256, + "num_input_tokens_seen": 172390905, + "step": 8018, + "time_per_iteration": 2.6332080364227295 + }, + { + "auxiliary_loss_clip": 0.01054794, + "auxiliary_loss_mlp": 0.01030421, + "balance_loss_clip": 1.03616846, + "balance_loss_mlp": 1.01855946, + "epoch": 0.4821283631444461, + "flos": 24645062620800.0, + "grad_norm": 1.609128289548335, + "language_loss": 0.76121509, + "learning_rate": 2.2124392886015124e-06, + "loss": 0.78206724, + "num_input_tokens_seen": 172412295, + "step": 8019, + "time_per_iteration": 2.7132456302642822 + }, + { + "auxiliary_loss_clip": 0.01043372, + "auxiliary_loss_mlp": 0.01032923, + "balance_loss_clip": 1.02667427, + "balance_loss_mlp": 1.02040648, + "epoch": 0.4821884863971141, + "flos": 23952094462080.0, + "grad_norm": 1.8640690385453105, + "language_loss": 0.79137629, + "learning_rate": 2.212052026199701e-06, + "loss": 0.81213921, + "num_input_tokens_seen": 172432625, + "step": 8020, + "time_per_iteration": 2.750577688217163 + }, + { + "auxiliary_loss_clip": 0.01069418, + "auxiliary_loss_mlp": 0.01029842, + "balance_loss_clip": 1.02579093, + "balance_loss_mlp": 1.01758146, + "epoch": 0.48224860964978206, + "flos": 17160668323200.0, + "grad_norm": 1.9092236025876133, + "language_loss": 0.69779551, + "learning_rate": 2.211664755756855e-06, + "loss": 0.71878809, + "num_input_tokens_seen": 172450010, + "step": 8021, + "time_per_iteration": 2.6091079711914062 + }, + { + "auxiliary_loss_clip": 0.01045663, + "auxiliary_loss_mlp": 0.01032171, + "balance_loss_clip": 1.02595294, + "balance_loss_mlp": 1.01874185, + "epoch": 0.48230873290245, + "flos": 23075838178560.0, + "grad_norm": 1.8159389759390703, + "language_loss": 0.62575579, + "learning_rate": 2.2112774772876603e-06, + "loss": 0.64653414, + "num_input_tokens_seen": 172469080, + "step": 8022, + "time_per_iteration": 2.6285479068756104 + }, + { + "auxiliary_loss_clip": 0.01050199, + "auxiliary_loss_mlp": 0.00747498, + "balance_loss_clip": 1.02696157, + "balance_loss_mlp": 1.00021195, + "epoch": 0.482368856155118, + "flos": 19353517718400.0, + "grad_norm": 2.4986079335324907, + "language_loss": 0.66140175, + "learning_rate": 2.2108901908068028e-06, + "loss": 0.67937875, + "num_input_tokens_seen": 172484850, + "step": 8023, + "time_per_iteration": 2.6889681816101074 + }, + { + "auxiliary_loss_clip": 0.00995861, + "auxiliary_loss_mlp": 0.01039665, + "balance_loss_clip": 1.02144217, + "balance_loss_mlp": 1.02581286, + "epoch": 0.48242897940778595, + "flos": 20078984707200.0, + "grad_norm": 1.8929704078399485, + "language_loss": 0.76847643, + "learning_rate": 2.2105028963289683e-06, + "loss": 0.78883171, + "num_input_tokens_seen": 172503525, + "step": 8024, + "time_per_iteration": 2.8864920139312744 + }, + { + "auxiliary_loss_clip": 0.01052272, + "auxiliary_loss_mlp": 0.01036829, + "balance_loss_clip": 1.02709222, + "balance_loss_mlp": 1.02343619, + "epoch": 0.4824891026604539, + "flos": 23403989854080.0, + "grad_norm": 1.6050068621933269, + "language_loss": 0.75069338, + "learning_rate": 2.2101155938688423e-06, + "loss": 0.77158439, + "num_input_tokens_seen": 172524360, + "step": 8025, + "time_per_iteration": 2.892894983291626 + }, + { + "auxiliary_loss_clip": 0.0107113, + "auxiliary_loss_mlp": 0.01033112, + "balance_loss_clip": 1.02758002, + "balance_loss_mlp": 1.02069056, + "epoch": 0.4825492259131219, + "flos": 20368675895040.0, + "grad_norm": 1.7471131204507842, + "language_loss": 0.70811415, + "learning_rate": 2.209728283441112e-06, + "loss": 0.72915655, + "num_input_tokens_seen": 172541480, + "step": 8026, + "time_per_iteration": 2.562685966491699 + }, + { + "auxiliary_loss_clip": 0.01055959, + "auxiliary_loss_mlp": 0.01042618, + "balance_loss_clip": 1.02464712, + "balance_loss_mlp": 1.02841449, + "epoch": 0.48260934916578985, + "flos": 14319021519360.0, + "grad_norm": 2.767319075993921, + "language_loss": 0.74637383, + "learning_rate": 2.209340965060465e-06, + "loss": 0.76735961, + "num_input_tokens_seen": 172559005, + "step": 8027, + "time_per_iteration": 5.788258075714111 + }, + { + "auxiliary_loss_clip": 0.01055044, + "auxiliary_loss_mlp": 0.01036671, + "balance_loss_clip": 1.02764249, + "balance_loss_mlp": 1.02429652, + "epoch": 0.4826694724184578, + "flos": 22121152548480.0, + "grad_norm": 1.7837092436750548, + "language_loss": 0.67218, + "learning_rate": 2.2089536387415868e-06, + "loss": 0.69309711, + "num_input_tokens_seen": 172578435, + "step": 8028, + "time_per_iteration": 2.652294874191284 + }, + { + "auxiliary_loss_clip": 0.01052883, + "auxiliary_loss_mlp": 0.01036474, + "balance_loss_clip": 1.028, + "balance_loss_mlp": 1.02380788, + "epoch": 0.48272959567112583, + "flos": 16181169373440.0, + "grad_norm": 1.5478677582132023, + "language_loss": 0.72666878, + "learning_rate": 2.2085663044991655e-06, + "loss": 0.74756229, + "num_input_tokens_seen": 172596095, + "step": 8029, + "time_per_iteration": 2.58908748626709 + }, + { + "auxiliary_loss_clip": 0.0105529, + "auxiliary_loss_mlp": 0.01031354, + "balance_loss_clip": 1.02946746, + "balance_loss_mlp": 1.01741862, + "epoch": 0.4827897189237938, + "flos": 23180445561600.0, + "grad_norm": 2.048174788868718, + "language_loss": 0.84595275, + "learning_rate": 2.2081789623478896e-06, + "loss": 0.86681926, + "num_input_tokens_seen": 172615255, + "step": 8030, + "time_per_iteration": 2.669740676879883 + }, + { + "auxiliary_loss_clip": 0.01040276, + "auxiliary_loss_mlp": 0.01033064, + "balance_loss_clip": 1.0239737, + "balance_loss_mlp": 1.02065396, + "epoch": 0.48284984217646176, + "flos": 21652626522240.0, + "grad_norm": 1.9836002515961946, + "language_loss": 0.73750979, + "learning_rate": 2.2077916123024466e-06, + "loss": 0.75824314, + "num_input_tokens_seen": 172633185, + "step": 8031, + "time_per_iteration": 2.6693198680877686 + }, + { + "auxiliary_loss_clip": 0.01044217, + "auxiliary_loss_mlp": 0.01047295, + "balance_loss_clip": 1.02512825, + "balance_loss_mlp": 1.03305531, + "epoch": 0.48290996542912973, + "flos": 31467443304960.0, + "grad_norm": 1.845457435696078, + "language_loss": 0.71848452, + "learning_rate": 2.2074042543775245e-06, + "loss": 0.73939967, + "num_input_tokens_seen": 172654280, + "step": 8032, + "time_per_iteration": 2.7153656482696533 + }, + { + "auxiliary_loss_clip": 0.01052377, + "auxiliary_loss_mlp": 0.01037561, + "balance_loss_clip": 1.0233264, + "balance_loss_mlp": 1.02469802, + "epoch": 0.4829700886817977, + "flos": 24461954064000.0, + "grad_norm": 2.178108489579967, + "language_loss": 0.74290884, + "learning_rate": 2.2070168885878126e-06, + "loss": 0.76380819, + "num_input_tokens_seen": 172675545, + "step": 8033, + "time_per_iteration": 2.6980295181274414 + }, + { + "auxiliary_loss_clip": 0.010278, + "auxiliary_loss_mlp": 0.01030903, + "balance_loss_clip": 1.0299027, + "balance_loss_mlp": 1.01821947, + "epoch": 0.48303021193446566, + "flos": 25702164904320.0, + "grad_norm": 1.6981880332342245, + "language_loss": 0.83449233, + "learning_rate": 2.2066295149479996e-06, + "loss": 0.85507935, + "num_input_tokens_seen": 172696455, + "step": 8034, + "time_per_iteration": 2.8673617839813232 + }, + { + "auxiliary_loss_clip": 0.01039509, + "auxiliary_loss_mlp": 0.01027484, + "balance_loss_clip": 1.02700031, + "balance_loss_mlp": 1.01555729, + "epoch": 0.4830903351871336, + "flos": 20085233673600.0, + "grad_norm": 1.7144275847665467, + "language_loss": 0.79765379, + "learning_rate": 2.2062421334727744e-06, + "loss": 0.81832373, + "num_input_tokens_seen": 172716720, + "step": 8035, + "time_per_iteration": 2.712265968322754 + }, + { + "auxiliary_loss_clip": 0.0105183, + "auxiliary_loss_mlp": 0.00747583, + "balance_loss_clip": 1.02687097, + "balance_loss_mlp": 1.00011539, + "epoch": 0.4831504584398016, + "flos": 39452216014080.0, + "grad_norm": 2.1985910797862003, + "language_loss": 0.69536507, + "learning_rate": 2.2058547441768267e-06, + "loss": 0.71335918, + "num_input_tokens_seen": 172737435, + "step": 8036, + "time_per_iteration": 2.751789093017578 + }, + { + "auxiliary_loss_clip": 0.01061007, + "auxiliary_loss_mlp": 0.01029623, + "balance_loss_clip": 1.02634144, + "balance_loss_mlp": 1.01711786, + "epoch": 0.48321058169246955, + "flos": 20006588845440.0, + "grad_norm": 1.8664662309913662, + "language_loss": 0.72496665, + "learning_rate": 2.205467347074847e-06, + "loss": 0.74587297, + "num_input_tokens_seen": 172755700, + "step": 8037, + "time_per_iteration": 2.552391529083252 + }, + { + "auxiliary_loss_clip": 0.01029333, + "auxiliary_loss_mlp": 0.01041158, + "balance_loss_clip": 1.02682734, + "balance_loss_mlp": 1.02535069, + "epoch": 0.4832707049451375, + "flos": 20741465197440.0, + "grad_norm": 2.028208399297294, + "language_loss": 0.69149488, + "learning_rate": 2.205079942181525e-06, + "loss": 0.71219981, + "num_input_tokens_seen": 172775185, + "step": 8038, + "time_per_iteration": 2.7758138179779053 + }, + { + "auxiliary_loss_clip": 0.01039841, + "auxiliary_loss_mlp": 0.01035881, + "balance_loss_clip": 1.02617383, + "balance_loss_mlp": 1.02344751, + "epoch": 0.4833308281978055, + "flos": 33145584762240.0, + "grad_norm": 1.5714752338352176, + "language_loss": 0.79126298, + "learning_rate": 2.20469252951155e-06, + "loss": 0.81202024, + "num_input_tokens_seen": 172796990, + "step": 8039, + "time_per_iteration": 3.1076724529266357 + }, + { + "auxiliary_loss_clip": 0.01061318, + "auxiliary_loss_mlp": 0.01030597, + "balance_loss_clip": 1.02691448, + "balance_loss_mlp": 1.01849127, + "epoch": 0.48339095145047345, + "flos": 19099234362240.0, + "grad_norm": 2.3943887880393513, + "language_loss": 0.77395272, + "learning_rate": 2.2043051090796143e-06, + "loss": 0.79487181, + "num_input_tokens_seen": 172814915, + "step": 8040, + "time_per_iteration": 2.703011989593506 + }, + { + "auxiliary_loss_clip": 0.01061868, + "auxiliary_loss_mlp": 0.01032835, + "balance_loss_clip": 1.02636385, + "balance_loss_mlp": 1.01981115, + "epoch": 0.4834510747031414, + "flos": 34459448440320.0, + "grad_norm": 1.579808590117352, + "language_loss": 0.75573599, + "learning_rate": 2.203917680900409e-06, + "loss": 0.77668309, + "num_input_tokens_seen": 172837060, + "step": 8041, + "time_per_iteration": 2.7630093097686768 + }, + { + "auxiliary_loss_clip": 0.01035897, + "auxiliary_loss_mlp": 0.01032961, + "balance_loss_clip": 1.02677488, + "balance_loss_mlp": 1.02047396, + "epoch": 0.48351119795580944, + "flos": 27380845065600.0, + "grad_norm": 1.6635251195229923, + "language_loss": 0.66625255, + "learning_rate": 2.203530244988624e-06, + "loss": 0.68694115, + "num_input_tokens_seen": 172856545, + "step": 8042, + "time_per_iteration": 2.777313470840454 + }, + { + "auxiliary_loss_clip": 0.00990818, + "auxiliary_loss_mlp": 0.0100193, + "balance_loss_clip": 1.00306451, + "balance_loss_mlp": 1.00050569, + "epoch": 0.4835713212084774, + "flos": 67143941291520.0, + "grad_norm": 0.6869663406964269, + "language_loss": 0.58542788, + "learning_rate": 2.2031428013589517e-06, + "loss": 0.60535538, + "num_input_tokens_seen": 172923055, + "step": 8043, + "time_per_iteration": 3.271740198135376 + }, + { + "auxiliary_loss_clip": 0.0104555, + "auxiliary_loss_mlp": 0.01035994, + "balance_loss_clip": 1.0244205, + "balance_loss_mlp": 1.02155137, + "epoch": 0.48363144446114537, + "flos": 17967473660160.0, + "grad_norm": 2.116851508454728, + "language_loss": 0.71735954, + "learning_rate": 2.2027553500260847e-06, + "loss": 0.73817492, + "num_input_tokens_seen": 172940700, + "step": 8044, + "time_per_iteration": 2.5967719554901123 + }, + { + "auxiliary_loss_clip": 0.01015482, + "auxiliary_loss_mlp": 0.01032345, + "balance_loss_clip": 1.02272987, + "balance_loss_mlp": 1.01900578, + "epoch": 0.48369156771381333, + "flos": 20593513077120.0, + "grad_norm": 1.6020935939299954, + "language_loss": 0.7584002, + "learning_rate": 2.202367891004714e-06, + "loss": 0.77887851, + "num_input_tokens_seen": 172961125, + "step": 8045, + "time_per_iteration": 2.8199803829193115 + }, + { + "auxiliary_loss_clip": 0.01032504, + "auxiliary_loss_mlp": 0.01038986, + "balance_loss_clip": 1.02692962, + "balance_loss_mlp": 1.02610517, + "epoch": 0.4837516909664813, + "flos": 22675075159680.0, + "grad_norm": 2.555026586845175, + "language_loss": 0.69307536, + "learning_rate": 2.201980424309533e-06, + "loss": 0.71379024, + "num_input_tokens_seen": 172980405, + "step": 8046, + "time_per_iteration": 2.760502576828003 + }, + { + "auxiliary_loss_clip": 0.01072701, + "auxiliary_loss_mlp": 0.0103156, + "balance_loss_clip": 1.02837539, + "balance_loss_mlp": 1.01839364, + "epoch": 0.48381181421914926, + "flos": 25518625384320.0, + "grad_norm": 1.8172711784384328, + "language_loss": 0.82301712, + "learning_rate": 2.2015929499552337e-06, + "loss": 0.84405977, + "num_input_tokens_seen": 172999105, + "step": 8047, + "time_per_iteration": 2.681135892868042 + }, + { + "auxiliary_loss_clip": 0.01040247, + "auxiliary_loss_mlp": 0.01035637, + "balance_loss_clip": 1.02408159, + "balance_loss_mlp": 1.02297723, + "epoch": 0.4838719374718172, + "flos": 24207491139840.0, + "grad_norm": 2.793854030326232, + "language_loss": 0.80611968, + "learning_rate": 2.2012054679565092e-06, + "loss": 0.82687849, + "num_input_tokens_seen": 173019935, + "step": 8048, + "time_per_iteration": 2.691044569015503 + }, + { + "auxiliary_loss_clip": 0.01063316, + "auxiliary_loss_mlp": 0.01034875, + "balance_loss_clip": 1.02685356, + "balance_loss_mlp": 1.02260184, + "epoch": 0.4839320607244852, + "flos": 26724577628160.0, + "grad_norm": 1.664619309544351, + "language_loss": 0.81145656, + "learning_rate": 2.200817978328054e-06, + "loss": 0.83243847, + "num_input_tokens_seen": 173039700, + "step": 8049, + "time_per_iteration": 2.6426284313201904 + }, + { + "auxiliary_loss_clip": 0.01043321, + "auxiliary_loss_mlp": 0.0102972, + "balance_loss_clip": 1.02655268, + "balance_loss_mlp": 1.01866341, + "epoch": 0.48399218397715316, + "flos": 20448900921600.0, + "grad_norm": 1.5911093020767741, + "language_loss": 0.72884279, + "learning_rate": 2.2004304810845602e-06, + "loss": 0.74957323, + "num_input_tokens_seen": 173059170, + "step": 8050, + "time_per_iteration": 2.8467791080474854 + }, + { + "auxiliary_loss_clip": 0.00999561, + "auxiliary_loss_mlp": 0.00746302, + "balance_loss_clip": 1.00199974, + "balance_loss_mlp": 0.99990445, + "epoch": 0.4840523072298211, + "flos": 67180570185600.0, + "grad_norm": 0.7043950196089812, + "language_loss": 0.56347811, + "learning_rate": 2.200042976240723e-06, + "loss": 0.58093679, + "num_input_tokens_seen": 173119000, + "step": 8051, + "time_per_iteration": 3.369068145751953 + }, + { + "auxiliary_loss_clip": 0.01035747, + "auxiliary_loss_mlp": 0.01031783, + "balance_loss_clip": 1.02590847, + "balance_loss_mlp": 1.01897383, + "epoch": 0.4841124304824891, + "flos": 22411490181120.0, + "grad_norm": 2.060304334163247, + "language_loss": 0.75168467, + "learning_rate": 2.199655463811236e-06, + "loss": 0.77235997, + "num_input_tokens_seen": 173137570, + "step": 8052, + "time_per_iteration": 2.8833281993865967 + }, + { + "auxiliary_loss_clip": 0.01063009, + "auxiliary_loss_mlp": 0.0102862, + "balance_loss_clip": 1.02841449, + "balance_loss_mlp": 1.01658034, + "epoch": 0.48417255373515705, + "flos": 13843959217920.0, + "grad_norm": 2.091296668903432, + "language_loss": 0.66413295, + "learning_rate": 2.1992679438107936e-06, + "loss": 0.68504918, + "num_input_tokens_seen": 173154355, + "step": 8053, + "time_per_iteration": 2.827162504196167 + }, + { + "auxiliary_loss_clip": 0.01058945, + "auxiliary_loss_mlp": 0.01032809, + "balance_loss_clip": 1.02564454, + "balance_loss_mlp": 1.02113867, + "epoch": 0.484232676987825, + "flos": 31649689935360.0, + "grad_norm": 1.9879478468098188, + "language_loss": 0.69044363, + "learning_rate": 2.198880416254091e-06, + "loss": 0.71136123, + "num_input_tokens_seen": 173174845, + "step": 8054, + "time_per_iteration": 2.7770135402679443 + }, + { + "auxiliary_loss_clip": 0.01000058, + "auxiliary_loss_mlp": 0.01033004, + "balance_loss_clip": 1.02137017, + "balance_loss_mlp": 1.02102375, + "epoch": 0.48429280024049304, + "flos": 24095377814400.0, + "grad_norm": 1.9717100504291558, + "language_loss": 0.69200218, + "learning_rate": 2.1984928811558233e-06, + "loss": 0.71233284, + "num_input_tokens_seen": 173195025, + "step": 8055, + "time_per_iteration": 2.86702561378479 + }, + { + "auxiliary_loss_clip": 0.01065125, + "auxiliary_loss_mlp": 0.01032182, + "balance_loss_clip": 1.02975416, + "balance_loss_mlp": 1.0198915, + "epoch": 0.484352923493161, + "flos": 17530081747200.0, + "grad_norm": 2.246541912354705, + "language_loss": 0.62772524, + "learning_rate": 2.198105338530685e-06, + "loss": 0.64869833, + "num_input_tokens_seen": 173213065, + "step": 8056, + "time_per_iteration": 2.647420883178711 + }, + { + "auxiliary_loss_clip": 0.01059361, + "auxiliary_loss_mlp": 0.01028457, + "balance_loss_clip": 1.02565169, + "balance_loss_mlp": 1.01591587, + "epoch": 0.48441304674582897, + "flos": 29166862043520.0, + "grad_norm": 1.704779220908398, + "language_loss": 0.67354786, + "learning_rate": 2.1977177883933726e-06, + "loss": 0.69442606, + "num_input_tokens_seen": 173234545, + "step": 8057, + "time_per_iteration": 2.7308807373046875 + }, + { + "auxiliary_loss_clip": 0.01028944, + "auxiliary_loss_mlp": 0.0104009, + "balance_loss_clip": 1.02247047, + "balance_loss_mlp": 1.0264883, + "epoch": 0.48447316999849693, + "flos": 15886701676800.0, + "grad_norm": 1.5491986372767121, + "language_loss": 0.82013917, + "learning_rate": 2.1973302307585827e-06, + "loss": 0.84082955, + "num_input_tokens_seen": 173252175, + "step": 8058, + "time_per_iteration": 2.7739315032958984 + }, + { + "auxiliary_loss_clip": 0.01055976, + "auxiliary_loss_mlp": 0.0103276, + "balance_loss_clip": 1.02520657, + "balance_loss_mlp": 1.01941442, + "epoch": 0.4845332932511649, + "flos": 24381405815040.0, + "grad_norm": 1.6879637211044003, + "language_loss": 0.79894638, + "learning_rate": 2.1969426656410097e-06, + "loss": 0.81983376, + "num_input_tokens_seen": 173268790, + "step": 8059, + "time_per_iteration": 2.635197639465332 + }, + { + "auxiliary_loss_clip": 0.01074745, + "auxiliary_loss_mlp": 0.01038253, + "balance_loss_clip": 1.02833509, + "balance_loss_mlp": 1.02514553, + "epoch": 0.48459341650383286, + "flos": 37116478316160.0, + "grad_norm": 1.9932101211022717, + "language_loss": 0.66298681, + "learning_rate": 2.196555093055352e-06, + "loss": 0.68411684, + "num_input_tokens_seen": 173288030, + "step": 8060, + "time_per_iteration": 2.6383512020111084 + }, + { + "auxiliary_loss_clip": 0.01065246, + "auxiliary_loss_mlp": 0.01034595, + "balance_loss_clip": 1.02926946, + "balance_loss_mlp": 1.02206612, + "epoch": 0.48465353975650083, + "flos": 22966777509120.0, + "grad_norm": 1.8121942957014638, + "language_loss": 0.6722163, + "learning_rate": 2.1961675130163046e-06, + "loss": 0.69321465, + "num_input_tokens_seen": 173305965, + "step": 8061, + "time_per_iteration": 2.5812153816223145 + }, + { + "auxiliary_loss_clip": 0.01053377, + "auxiliary_loss_mlp": 0.01041456, + "balance_loss_clip": 1.02691936, + "balance_loss_mlp": 1.02818203, + "epoch": 0.4847136630091688, + "flos": 17707695523200.0, + "grad_norm": 1.8345373105981462, + "language_loss": 0.82365298, + "learning_rate": 2.1957799255385653e-06, + "loss": 0.84460127, + "num_input_tokens_seen": 173321985, + "step": 8062, + "time_per_iteration": 2.6137757301330566 + }, + { + "auxiliary_loss_clip": 0.01012908, + "auxiliary_loss_mlp": 0.01033282, + "balance_loss_clip": 1.02651191, + "balance_loss_mlp": 1.0204078, + "epoch": 0.48477378626183676, + "flos": 22018269018240.0, + "grad_norm": 1.5750661220046551, + "language_loss": 0.7457642, + "learning_rate": 2.1953923306368325e-06, + "loss": 0.76622605, + "num_input_tokens_seen": 173341315, + "step": 8063, + "time_per_iteration": 4.393545150756836 + }, + { + "auxiliary_loss_clip": 0.01048483, + "auxiliary_loss_mlp": 0.01028388, + "balance_loss_clip": 1.02614355, + "balance_loss_mlp": 1.01543617, + "epoch": 0.4848339095145047, + "flos": 27962956874880.0, + "grad_norm": 2.155503302948518, + "language_loss": 0.79107201, + "learning_rate": 2.1950047283258023e-06, + "loss": 0.81184077, + "num_input_tokens_seen": 173361055, + "step": 8064, + "time_per_iteration": 2.656224250793457 + }, + { + "auxiliary_loss_clip": 0.01070121, + "auxiliary_loss_mlp": 0.00747304, + "balance_loss_clip": 1.02857447, + "balance_loss_mlp": 1.00014329, + "epoch": 0.4848940327671727, + "flos": 21688752625920.0, + "grad_norm": 1.849968442738675, + "language_loss": 0.79227191, + "learning_rate": 2.194617118620173e-06, + "loss": 0.81044614, + "num_input_tokens_seen": 173379255, + "step": 8065, + "time_per_iteration": 2.6133923530578613 + }, + { + "auxiliary_loss_clip": 0.01051172, + "auxiliary_loss_mlp": 0.00747405, + "balance_loss_clip": 1.02309203, + "balance_loss_mlp": 1.00021076, + "epoch": 0.48495415601984065, + "flos": 20631578515200.0, + "grad_norm": 1.7075668037626608, + "language_loss": 0.76371777, + "learning_rate": 2.194229501534644e-06, + "loss": 0.78170359, + "num_input_tokens_seen": 173398370, + "step": 8066, + "time_per_iteration": 2.6942341327667236 + }, + { + "auxiliary_loss_clip": 0.01071759, + "auxiliary_loss_mlp": 0.01029087, + "balance_loss_clip": 1.02863121, + "balance_loss_mlp": 1.01789892, + "epoch": 0.4850142792725086, + "flos": 25628152930560.0, + "grad_norm": 1.3463339344001735, + "language_loss": 0.72487736, + "learning_rate": 2.193841877083912e-06, + "loss": 0.74588585, + "num_input_tokens_seen": 173419595, + "step": 8067, + "time_per_iteration": 2.6953346729278564 + }, + { + "auxiliary_loss_clip": 0.01025451, + "auxiliary_loss_mlp": 0.01035716, + "balance_loss_clip": 1.0310837, + "balance_loss_mlp": 1.02275825, + "epoch": 0.4850744025251766, + "flos": 13771958405760.0, + "grad_norm": 2.4714225948076787, + "language_loss": 0.7875824, + "learning_rate": 2.1934542452826767e-06, + "loss": 0.80819404, + "num_input_tokens_seen": 173435390, + "step": 8068, + "time_per_iteration": 2.7418155670166016 + }, + { + "auxiliary_loss_clip": 0.01042642, + "auxiliary_loss_mlp": 0.01031908, + "balance_loss_clip": 1.02394199, + "balance_loss_mlp": 1.02069664, + "epoch": 0.4851345257778446, + "flos": 20261339078400.0, + "grad_norm": 1.5142125626975274, + "language_loss": 0.84478229, + "learning_rate": 2.193066606145638e-06, + "loss": 0.86552781, + "num_input_tokens_seen": 173454095, + "step": 8069, + "time_per_iteration": 2.67486310005188 + }, + { + "auxiliary_loss_clip": 0.01039843, + "auxiliary_loss_mlp": 0.01031758, + "balance_loss_clip": 1.02759361, + "balance_loss_mlp": 1.02059984, + "epoch": 0.48519464903051257, + "flos": 27089681420160.0, + "grad_norm": 1.5979942158774862, + "language_loss": 0.77906823, + "learning_rate": 2.192678959687493e-06, + "loss": 0.79978424, + "num_input_tokens_seen": 173475300, + "step": 8070, + "time_per_iteration": 2.822603464126587 + }, + { + "auxiliary_loss_clip": 0.01012789, + "auxiliary_loss_mlp": 0.0103235, + "balance_loss_clip": 1.02530861, + "balance_loss_mlp": 1.02046478, + "epoch": 0.48525477228318054, + "flos": 17127235739520.0, + "grad_norm": 1.8592595242508514, + "language_loss": 0.77900982, + "learning_rate": 2.192291305922943e-06, + "loss": 0.79946125, + "num_input_tokens_seen": 173492005, + "step": 8071, + "time_per_iteration": 2.713345766067505 + }, + { + "auxiliary_loss_clip": 0.01017638, + "auxiliary_loss_mlp": 0.01030876, + "balance_loss_clip": 1.0249486, + "balance_loss_mlp": 1.01849604, + "epoch": 0.4853148955358485, + "flos": 28180324028160.0, + "grad_norm": 1.9801420520854538, + "language_loss": 0.71967649, + "learning_rate": 2.1919036448666873e-06, + "loss": 0.7401616, + "num_input_tokens_seen": 173511995, + "step": 8072, + "time_per_iteration": 2.8152036666870117 + }, + { + "auxiliary_loss_clip": 0.01022911, + "auxiliary_loss_mlp": 0.01037233, + "balance_loss_clip": 1.02573192, + "balance_loss_mlp": 1.02428651, + "epoch": 0.48537501878851647, + "flos": 17493309198720.0, + "grad_norm": 1.853303639795776, + "language_loss": 0.87489992, + "learning_rate": 2.1915159765334262e-06, + "loss": 0.89550138, + "num_input_tokens_seen": 173530215, + "step": 8073, + "time_per_iteration": 2.6957144737243652 + }, + { + "auxiliary_loss_clip": 0.01025176, + "auxiliary_loss_mlp": 0.01033308, + "balance_loss_clip": 1.02169037, + "balance_loss_mlp": 1.02033806, + "epoch": 0.48543514204118443, + "flos": 28584857975040.0, + "grad_norm": 1.6806640457886826, + "language_loss": 0.60514814, + "learning_rate": 2.19112830093786e-06, + "loss": 0.62573302, + "num_input_tokens_seen": 173550920, + "step": 8074, + "time_per_iteration": 6.001493692398071 + }, + { + "auxiliary_loss_clip": 0.01036915, + "auxiliary_loss_mlp": 0.00747587, + "balance_loss_clip": 1.02503705, + "balance_loss_mlp": 1.00027049, + "epoch": 0.4854952652938524, + "flos": 20959981585920.0, + "grad_norm": 1.8339841646566226, + "language_loss": 0.72921348, + "learning_rate": 2.19074061809469e-06, + "loss": 0.74705851, + "num_input_tokens_seen": 173569065, + "step": 8075, + "time_per_iteration": 2.6615328788757324 + }, + { + "auxiliary_loss_clip": 0.0106814, + "auxiliary_loss_mlp": 0.01037034, + "balance_loss_clip": 1.02726054, + "balance_loss_mlp": 1.02587056, + "epoch": 0.48555538854652036, + "flos": 66529543155840.0, + "grad_norm": 1.6352863685572707, + "language_loss": 0.81759214, + "learning_rate": 2.1903529280186163e-06, + "loss": 0.83864385, + "num_input_tokens_seen": 173596085, + "step": 8076, + "time_per_iteration": 2.9229230880737305 + }, + { + "auxiliary_loss_clip": 0.01050686, + "auxiliary_loss_mlp": 0.01031337, + "balance_loss_clip": 1.02782476, + "balance_loss_mlp": 1.01838541, + "epoch": 0.4856155117991883, + "flos": 15924982596480.0, + "grad_norm": 2.2846415323726443, + "language_loss": 0.86178136, + "learning_rate": 2.1899652307243407e-06, + "loss": 0.88260162, + "num_input_tokens_seen": 173613900, + "step": 8077, + "time_per_iteration": 2.6554927825927734 + }, + { + "auxiliary_loss_clip": 0.00972083, + "auxiliary_loss_mlp": 0.01001924, + "balance_loss_clip": 1.00487113, + "balance_loss_mlp": 1.00061834, + "epoch": 0.4856756350518563, + "flos": 71047395060480.0, + "grad_norm": 0.9073617994470539, + "language_loss": 0.58445203, + "learning_rate": 2.189577526226564e-06, + "loss": 0.60419214, + "num_input_tokens_seen": 173671305, + "step": 8078, + "time_per_iteration": 3.2456345558166504 + }, + { + "auxiliary_loss_clip": 0.01072817, + "auxiliary_loss_mlp": 0.01030558, + "balance_loss_clip": 1.02826142, + "balance_loss_mlp": 1.01854777, + "epoch": 0.48573575830452426, + "flos": 29825679346560.0, + "grad_norm": 9.527628112235442, + "language_loss": 0.71988469, + "learning_rate": 2.1891898145399884e-06, + "loss": 0.74091852, + "num_input_tokens_seen": 173692070, + "step": 8079, + "time_per_iteration": 2.6809568405151367 + }, + { + "auxiliary_loss_clip": 0.0104165, + "auxiliary_loss_mlp": 0.01030174, + "balance_loss_clip": 1.02805197, + "balance_loss_mlp": 1.01814032, + "epoch": 0.4857958815571922, + "flos": 17639501552640.0, + "grad_norm": 3.3201814161859025, + "language_loss": 0.79774415, + "learning_rate": 2.1888020956793172e-06, + "loss": 0.81846237, + "num_input_tokens_seen": 173709785, + "step": 8080, + "time_per_iteration": 2.669074296951294 + }, + { + "auxiliary_loss_clip": 0.01043663, + "auxiliary_loss_mlp": 0.01029354, + "balance_loss_clip": 1.02475405, + "balance_loss_mlp": 1.01681936, + "epoch": 0.4858560048098602, + "flos": 21105491581440.0, + "grad_norm": 2.0313749604528786, + "language_loss": 0.8346138, + "learning_rate": 2.188414369659251e-06, + "loss": 0.85534394, + "num_input_tokens_seen": 173728770, + "step": 8081, + "time_per_iteration": 2.6393563747406006 + }, + { + "auxiliary_loss_clip": 0.01053182, + "auxiliary_loss_mlp": 0.01029, + "balance_loss_clip": 1.02380013, + "balance_loss_mlp": 1.016114, + "epoch": 0.4859161280625282, + "flos": 22090844448000.0, + "grad_norm": 1.6216541884551523, + "language_loss": 0.83151782, + "learning_rate": 2.1880266364944924e-06, + "loss": 0.85233963, + "num_input_tokens_seen": 173747355, + "step": 8082, + "time_per_iteration": 2.629396677017212 + }, + { + "auxiliary_loss_clip": 0.01046914, + "auxiliary_loss_mlp": 0.01026556, + "balance_loss_clip": 1.02682304, + "balance_loss_mlp": 1.01508856, + "epoch": 0.4859762513151962, + "flos": 17493452853120.0, + "grad_norm": 1.9504299896443609, + "language_loss": 0.86762029, + "learning_rate": 2.187638896199746e-06, + "loss": 0.88835508, + "num_input_tokens_seen": 173764825, + "step": 8083, + "time_per_iteration": 2.6045663356781006 + }, + { + "auxiliary_loss_clip": 0.01029055, + "auxiliary_loss_mlp": 0.01035892, + "balance_loss_clip": 1.02743626, + "balance_loss_mlp": 1.02508545, + "epoch": 0.48603637456786414, + "flos": 18004246208640.0, + "grad_norm": 1.6198683507544467, + "language_loss": 0.81052649, + "learning_rate": 2.1872511487897126e-06, + "loss": 0.83117598, + "num_input_tokens_seen": 173783215, + "step": 8084, + "time_per_iteration": 2.763559341430664 + }, + { + "auxiliary_loss_clip": 0.01059456, + "auxiliary_loss_mlp": 0.01029675, + "balance_loss_clip": 1.02623105, + "balance_loss_mlp": 1.01754498, + "epoch": 0.4860964978205321, + "flos": 22492038430080.0, + "grad_norm": 3.3073074927143287, + "language_loss": 0.68233323, + "learning_rate": 2.186863394279098e-06, + "loss": 0.70322454, + "num_input_tokens_seen": 173801905, + "step": 8085, + "time_per_iteration": 2.6125688552856445 + }, + { + "auxiliary_loss_clip": 0.01060435, + "auxiliary_loss_mlp": 0.01034819, + "balance_loss_clip": 1.02752948, + "balance_loss_mlp": 1.02308273, + "epoch": 0.48615662107320007, + "flos": 23372532518400.0, + "grad_norm": 2.609685915569696, + "language_loss": 0.77257276, + "learning_rate": 2.1864756326826046e-06, + "loss": 0.79352534, + "num_input_tokens_seen": 173824690, + "step": 8086, + "time_per_iteration": 2.7311418056488037 + }, + { + "auxiliary_loss_clip": 0.01069233, + "auxiliary_loss_mlp": 0.01028092, + "balance_loss_clip": 1.02627301, + "balance_loss_mlp": 1.01608729, + "epoch": 0.48621674432586803, + "flos": 34418833136640.0, + "grad_norm": 2.3745216888233753, + "language_loss": 0.69571847, + "learning_rate": 2.1860878640149355e-06, + "loss": 0.71669173, + "num_input_tokens_seen": 173844450, + "step": 8087, + "time_per_iteration": 2.907140016555786 + }, + { + "auxiliary_loss_clip": 0.0106467, + "auxiliary_loss_mlp": 0.01036616, + "balance_loss_clip": 1.02706957, + "balance_loss_mlp": 1.02340138, + "epoch": 0.486276867578536, + "flos": 33107555237760.0, + "grad_norm": 1.82219267041508, + "language_loss": 0.72567511, + "learning_rate": 2.1857000882907974e-06, + "loss": 0.74668801, + "num_input_tokens_seen": 173864975, + "step": 8088, + "time_per_iteration": 2.726778030395508 + }, + { + "auxiliary_loss_clip": 0.01046778, + "auxiliary_loss_mlp": 0.01035839, + "balance_loss_clip": 1.02473426, + "balance_loss_mlp": 1.02373362, + "epoch": 0.48633699083120396, + "flos": 21470703114240.0, + "grad_norm": 1.5878633976842282, + "language_loss": 0.75306642, + "learning_rate": 2.185312305524892e-06, + "loss": 0.77389258, + "num_input_tokens_seen": 173883805, + "step": 8089, + "time_per_iteration": 2.7643985748291016 + }, + { + "auxiliary_loss_clip": 0.01042017, + "auxiliary_loss_mlp": 0.01028703, + "balance_loss_clip": 1.02768004, + "balance_loss_mlp": 1.01660955, + "epoch": 0.48639711408387193, + "flos": 20084335833600.0, + "grad_norm": 2.9983544035343783, + "language_loss": 0.84300232, + "learning_rate": 2.184924515731926e-06, + "loss": 0.86370957, + "num_input_tokens_seen": 173903520, + "step": 8090, + "time_per_iteration": 2.7249090671539307 + }, + { + "auxiliary_loss_clip": 0.01067433, + "auxiliary_loss_mlp": 0.01030685, + "balance_loss_clip": 1.02626657, + "balance_loss_mlp": 1.01913333, + "epoch": 0.4864572373365399, + "flos": 20778884190720.0, + "grad_norm": 1.7085834562031392, + "language_loss": 0.76023763, + "learning_rate": 2.1845367189266045e-06, + "loss": 0.78121883, + "num_input_tokens_seen": 173924255, + "step": 8091, + "time_per_iteration": 2.6265830993652344 + }, + { + "auxiliary_loss_clip": 0.01061154, + "auxiliary_loss_mlp": 0.01026054, + "balance_loss_clip": 1.02686787, + "balance_loss_mlp": 1.01422298, + "epoch": 0.48651736058920786, + "flos": 26025360503040.0, + "grad_norm": 1.4559044283028673, + "language_loss": 0.80435842, + "learning_rate": 2.184148915123631e-06, + "loss": 0.82523054, + "num_input_tokens_seen": 173943285, + "step": 8092, + "time_per_iteration": 2.689073085784912 + }, + { + "auxiliary_loss_clip": 0.01043914, + "auxiliary_loss_mlp": 0.00747505, + "balance_loss_clip": 1.02532268, + "balance_loss_mlp": 1.00023508, + "epoch": 0.4865774838418758, + "flos": 20485601642880.0, + "grad_norm": 1.4854300442454835, + "language_loss": 0.7171008, + "learning_rate": 2.1837611043377126e-06, + "loss": 0.73501492, + "num_input_tokens_seen": 173962205, + "step": 8093, + "time_per_iteration": 2.6585261821746826 + }, + { + "auxiliary_loss_clip": 0.01069575, + "auxiliary_loss_mlp": 0.01032554, + "balance_loss_clip": 1.02720046, + "balance_loss_mlp": 1.02094853, + "epoch": 0.4866376070945438, + "flos": 23547704169600.0, + "grad_norm": 1.647383729261312, + "language_loss": 0.67780888, + "learning_rate": 2.1833732865835545e-06, + "loss": 0.69883025, + "num_input_tokens_seen": 173980945, + "step": 8094, + "time_per_iteration": 2.6331913471221924 + }, + { + "auxiliary_loss_clip": 0.0105437, + "auxiliary_loss_mlp": 0.01036762, + "balance_loss_clip": 1.0283227, + "balance_loss_mlp": 1.0247097, + "epoch": 0.4866977303472118, + "flos": 16690598012160.0, + "grad_norm": 1.935048543124708, + "language_loss": 0.66571879, + "learning_rate": 2.1829854618758636e-06, + "loss": 0.68663007, + "num_input_tokens_seen": 173998860, + "step": 8095, + "time_per_iteration": 2.6509792804718018 + }, + { + "auxiliary_loss_clip": 0.01051585, + "auxiliary_loss_mlp": 0.01029541, + "balance_loss_clip": 1.02481484, + "balance_loss_mlp": 1.01628518, + "epoch": 0.4867578535998798, + "flos": 17896011552000.0, + "grad_norm": 2.127429866214339, + "language_loss": 0.78250533, + "learning_rate": 2.182597630229345e-06, + "loss": 0.80331659, + "num_input_tokens_seen": 174016665, + "step": 8096, + "time_per_iteration": 2.626495599746704 + }, + { + "auxiliary_loss_clip": 0.01035893, + "auxiliary_loss_mlp": 0.01029698, + "balance_loss_clip": 1.02171671, + "balance_loss_mlp": 1.01775336, + "epoch": 0.48681797685254774, + "flos": 22637799820800.0, + "grad_norm": 1.7846915964126875, + "language_loss": 0.67792463, + "learning_rate": 2.1822097916587067e-06, + "loss": 0.6985805, + "num_input_tokens_seen": 174034800, + "step": 8097, + "time_per_iteration": 2.681980848312378 + }, + { + "auxiliary_loss_clip": 0.01039032, + "auxiliary_loss_mlp": 0.01034868, + "balance_loss_clip": 1.02343118, + "balance_loss_mlp": 1.02213669, + "epoch": 0.4868781001052157, + "flos": 20886077352960.0, + "grad_norm": 2.226943447718388, + "language_loss": 0.71668154, + "learning_rate": 2.1818219461786543e-06, + "loss": 0.73742056, + "num_input_tokens_seen": 174054445, + "step": 8098, + "time_per_iteration": 2.600275754928589 + }, + { + "auxiliary_loss_clip": 0.01063841, + "auxiliary_loss_mlp": 0.01031789, + "balance_loss_clip": 1.02698529, + "balance_loss_mlp": 1.01918817, + "epoch": 0.48693822335788367, + "flos": 41974940937600.0, + "grad_norm": 1.5164112660666782, + "language_loss": 0.66304076, + "learning_rate": 2.1814340938038956e-06, + "loss": 0.68399704, + "num_input_tokens_seen": 174077890, + "step": 8099, + "time_per_iteration": 2.701443910598755 + }, + { + "auxiliary_loss_clip": 0.01017315, + "auxiliary_loss_mlp": 0.01037281, + "balance_loss_clip": 1.02304471, + "balance_loss_mlp": 1.02537751, + "epoch": 0.48699834661055164, + "flos": 24243294021120.0, + "grad_norm": 1.910188993818567, + "language_loss": 0.67342019, + "learning_rate": 2.181046234549138e-06, + "loss": 0.69396615, + "num_input_tokens_seen": 174097460, + "step": 8100, + "time_per_iteration": 2.703677177429199 + }, + { + "auxiliary_loss_clip": 0.01037026, + "auxiliary_loss_mlp": 0.01029451, + "balance_loss_clip": 1.02563453, + "balance_loss_mlp": 1.018067, + "epoch": 0.4870584698632196, + "flos": 25923877603200.0, + "grad_norm": 1.4149812985975603, + "language_loss": 0.76385671, + "learning_rate": 2.180658368429088e-06, + "loss": 0.78452146, + "num_input_tokens_seen": 174120775, + "step": 8101, + "time_per_iteration": 2.754086494445801 + }, + { + "auxiliary_loss_clip": 0.01008848, + "auxiliary_loss_mlp": 0.0100712, + "balance_loss_clip": 1.00111818, + "balance_loss_mlp": 1.00582695, + "epoch": 0.48711859311588757, + "flos": 70211933648640.0, + "grad_norm": 0.6777389098361416, + "language_loss": 0.52321911, + "learning_rate": 2.1802704954584565e-06, + "loss": 0.54337883, + "num_input_tokens_seen": 174189135, + "step": 8102, + "time_per_iteration": 3.2853426933288574 + }, + { + "auxiliary_loss_clip": 0.01054034, + "auxiliary_loss_mlp": 0.01031175, + "balance_loss_clip": 1.03114676, + "balance_loss_mlp": 1.01940298, + "epoch": 0.48717871636855553, + "flos": 12342964659840.0, + "grad_norm": 1.9663705623283239, + "language_loss": 0.73991346, + "learning_rate": 2.1798826156519484e-06, + "loss": 0.76076555, + "num_input_tokens_seen": 174203250, + "step": 8103, + "time_per_iteration": 2.5736844539642334 + }, + { + "auxiliary_loss_clip": 0.01062931, + "auxiliary_loss_mlp": 0.0103828, + "balance_loss_clip": 1.02830803, + "balance_loss_mlp": 1.02618015, + "epoch": 0.4872388396212235, + "flos": 23477139901440.0, + "grad_norm": 1.9743135876896192, + "language_loss": 0.63054705, + "learning_rate": 2.1794947290242737e-06, + "loss": 0.65155923, + "num_input_tokens_seen": 174224145, + "step": 8104, + "time_per_iteration": 2.6124935150146484 + }, + { + "auxiliary_loss_clip": 0.0106966, + "auxiliary_loss_mlp": 0.01028364, + "balance_loss_clip": 1.02722347, + "balance_loss_mlp": 1.01627648, + "epoch": 0.48729896287389146, + "flos": 31427582186880.0, + "grad_norm": 1.5638417501113366, + "language_loss": 0.69110811, + "learning_rate": 2.1791068355901413e-06, + "loss": 0.71208835, + "num_input_tokens_seen": 174244435, + "step": 8105, + "time_per_iteration": 2.600950002670288 + }, + { + "auxiliary_loss_clip": 0.01034495, + "auxiliary_loss_mlp": 0.01026266, + "balance_loss_clip": 1.02679873, + "balance_loss_mlp": 1.01523328, + "epoch": 0.4873590861265594, + "flos": 19057936700160.0, + "grad_norm": 1.6436218383918397, + "language_loss": 0.73417652, + "learning_rate": 2.178718935364259e-06, + "loss": 0.75478417, + "num_input_tokens_seen": 174262710, + "step": 8106, + "time_per_iteration": 2.6019725799560547 + }, + { + "auxiliary_loss_clip": 0.01053382, + "auxiliary_loss_mlp": 0.00747469, + "balance_loss_clip": 1.02808523, + "balance_loss_mlp": 1.00020182, + "epoch": 0.4874192093792274, + "flos": 24348296453760.0, + "grad_norm": 2.06775991992461, + "language_loss": 0.76443887, + "learning_rate": 2.1783310283613373e-06, + "loss": 0.7824474, + "num_input_tokens_seen": 174281545, + "step": 8107, + "time_per_iteration": 2.727217197418213 + }, + { + "auxiliary_loss_clip": 0.01028559, + "auxiliary_loss_mlp": 0.01028993, + "balance_loss_clip": 1.02872467, + "balance_loss_mlp": 1.01730418, + "epoch": 0.4874793326318954, + "flos": 23112610727040.0, + "grad_norm": 1.5274835512607265, + "language_loss": 0.75067472, + "learning_rate": 2.1779431145960853e-06, + "loss": 0.77125025, + "num_input_tokens_seen": 174300290, + "step": 8108, + "time_per_iteration": 2.806511878967285 + }, + { + "auxiliary_loss_clip": 0.01056374, + "auxiliary_loss_mlp": 0.01025733, + "balance_loss_clip": 1.02623487, + "balance_loss_mlp": 1.01573753, + "epoch": 0.4875394558845634, + "flos": 19026156142080.0, + "grad_norm": 1.6552298736126145, + "language_loss": 0.73574877, + "learning_rate": 2.177555194083212e-06, + "loss": 0.7565698, + "num_input_tokens_seen": 174318490, + "step": 8109, + "time_per_iteration": 2.6033687591552734 + }, + { + "auxiliary_loss_clip": 0.01057889, + "auxiliary_loss_mlp": 0.01028272, + "balance_loss_clip": 1.02618122, + "balance_loss_mlp": 1.01695895, + "epoch": 0.48759957913723134, + "flos": 21433607343360.0, + "grad_norm": 8.490276520697616, + "language_loss": 0.78592098, + "learning_rate": 2.177167266837428e-06, + "loss": 0.80678266, + "num_input_tokens_seen": 174335505, + "step": 8110, + "time_per_iteration": 4.23996901512146 + }, + { + "auxiliary_loss_clip": 0.01059435, + "auxiliary_loss_mlp": 0.01033822, + "balance_loss_clip": 1.02695966, + "balance_loss_mlp": 1.022259, + "epoch": 0.4876597023898993, + "flos": 17748669962880.0, + "grad_norm": 1.7687159269300192, + "language_loss": 0.72038931, + "learning_rate": 2.176779332873444e-06, + "loss": 0.74132192, + "num_input_tokens_seen": 174353990, + "step": 8111, + "time_per_iteration": 4.453368425369263 + }, + { + "auxiliary_loss_clip": 0.01059813, + "auxiliary_loss_mlp": 0.01030685, + "balance_loss_clip": 1.02847445, + "balance_loss_mlp": 1.01944983, + "epoch": 0.4877198256425673, + "flos": 17019647527680.0, + "grad_norm": 1.5714964224851682, + "language_loss": 0.76155108, + "learning_rate": 2.17639139220597e-06, + "loss": 0.7824561, + "num_input_tokens_seen": 174373425, + "step": 8112, + "time_per_iteration": 2.6299197673797607 + }, + { + "auxiliary_loss_clip": 0.01065239, + "auxiliary_loss_mlp": 0.01036919, + "balance_loss_clip": 1.02896726, + "balance_loss_mlp": 1.02427745, + "epoch": 0.48777994889523524, + "flos": 22384091082240.0, + "grad_norm": 2.4859444448359733, + "language_loss": 0.75452793, + "learning_rate": 2.1760034448497166e-06, + "loss": 0.77554953, + "num_input_tokens_seen": 174393070, + "step": 8113, + "time_per_iteration": 2.614734411239624 + }, + { + "auxiliary_loss_clip": 0.00990406, + "auxiliary_loss_mlp": 0.00746559, + "balance_loss_clip": 1.00208044, + "balance_loss_mlp": 1.00012577, + "epoch": 0.4878400721479032, + "flos": 61241772159360.0, + "grad_norm": 0.7862004870732399, + "language_loss": 0.48914626, + "learning_rate": 2.1756154908193943e-06, + "loss": 0.50651586, + "num_input_tokens_seen": 174446880, + "step": 8114, + "time_per_iteration": 3.1508114337921143 + }, + { + "auxiliary_loss_clip": 0.01040178, + "auxiliary_loss_mlp": 0.01039584, + "balance_loss_clip": 1.0274272, + "balance_loss_mlp": 1.02718043, + "epoch": 0.48790019540057117, + "flos": 24536612482560.0, + "grad_norm": 1.4399279483800205, + "language_loss": 0.76820922, + "learning_rate": 2.1752275301297155e-06, + "loss": 0.78900683, + "num_input_tokens_seen": 174468485, + "step": 8115, + "time_per_iteration": 2.7514312267303467 + }, + { + "auxiliary_loss_clip": 0.01052289, + "auxiliary_loss_mlp": 0.01032135, + "balance_loss_clip": 1.02789164, + "balance_loss_mlp": 1.01951694, + "epoch": 0.48796031865323913, + "flos": 21833939399040.0, + "grad_norm": 2.0140659133508767, + "language_loss": 0.71732187, + "learning_rate": 2.1748395627953915e-06, + "loss": 0.73816609, + "num_input_tokens_seen": 174486360, + "step": 8116, + "time_per_iteration": 2.77358078956604 + }, + { + "auxiliary_loss_clip": 0.01035422, + "auxiliary_loss_mlp": 0.01028999, + "balance_loss_clip": 1.02522111, + "balance_loss_mlp": 1.01725101, + "epoch": 0.4880204419059071, + "flos": 18588907883520.0, + "grad_norm": 1.7959806989021125, + "language_loss": 0.62878668, + "learning_rate": 2.1744515888311335e-06, + "loss": 0.64943087, + "num_input_tokens_seen": 174505075, + "step": 8117, + "time_per_iteration": 2.773483991622925 + }, + { + "auxiliary_loss_clip": 0.0104071, + "auxiliary_loss_mlp": 0.01035042, + "balance_loss_clip": 1.02342904, + "balance_loss_mlp": 1.02255464, + "epoch": 0.48808056515857506, + "flos": 19172168928000.0, + "grad_norm": 2.1924353892324575, + "language_loss": 0.7973038, + "learning_rate": 2.1740636082516533e-06, + "loss": 0.81806135, + "num_input_tokens_seen": 174523385, + "step": 8118, + "time_per_iteration": 2.766389846801758 + }, + { + "auxiliary_loss_clip": 0.01050688, + "auxiliary_loss_mlp": 0.01031002, + "balance_loss_clip": 1.02750611, + "balance_loss_mlp": 1.0191052, + "epoch": 0.48814068841124303, + "flos": 20120497850880.0, + "grad_norm": 1.7625677791511927, + "language_loss": 0.63495827, + "learning_rate": 2.1736756210716645e-06, + "loss": 0.65577519, + "num_input_tokens_seen": 174542200, + "step": 8119, + "time_per_iteration": 2.8115851879119873 + }, + { + "auxiliary_loss_clip": 0.01016088, + "auxiliary_loss_mlp": 0.00747393, + "balance_loss_clip": 1.02679849, + "balance_loss_mlp": 1.00017154, + "epoch": 0.488200811663911, + "flos": 22965592360320.0, + "grad_norm": 1.830084701935421, + "language_loss": 0.72200203, + "learning_rate": 2.173287627305878e-06, + "loss": 0.7396369, + "num_input_tokens_seen": 174563620, + "step": 8120, + "time_per_iteration": 2.911733627319336 + }, + { + "auxiliary_loss_clip": 0.01060217, + "auxiliary_loss_mlp": 0.01028668, + "balance_loss_clip": 1.02598405, + "balance_loss_mlp": 1.01590121, + "epoch": 0.48826093491657896, + "flos": 33910697387520.0, + "grad_norm": 1.5984610773138033, + "language_loss": 0.63267517, + "learning_rate": 2.1728996269690075e-06, + "loss": 0.65356398, + "num_input_tokens_seen": 174586465, + "step": 8121, + "time_per_iteration": 4.2935731410980225 + }, + { + "auxiliary_loss_clip": 0.01057394, + "auxiliary_loss_mlp": 0.01036687, + "balance_loss_clip": 1.02615309, + "balance_loss_mlp": 1.02351451, + "epoch": 0.488321058169247, + "flos": 23070307484160.0, + "grad_norm": 1.9127441618175969, + "language_loss": 0.82707465, + "learning_rate": 2.1725116200757664e-06, + "loss": 0.84801543, + "num_input_tokens_seen": 174604035, + "step": 8122, + "time_per_iteration": 4.248549938201904 + }, + { + "auxiliary_loss_clip": 0.01055883, + "auxiliary_loss_mlp": 0.01038296, + "balance_loss_clip": 1.0254755, + "balance_loss_mlp": 1.02424693, + "epoch": 0.48838118142191494, + "flos": 19317714837120.0, + "grad_norm": 1.907678976002138, + "language_loss": 0.85400701, + "learning_rate": 2.172123606640866e-06, + "loss": 0.8749488, + "num_input_tokens_seen": 174621715, + "step": 8123, + "time_per_iteration": 2.585683584213257 + }, + { + "auxiliary_loss_clip": 0.01044055, + "auxiliary_loss_mlp": 0.01030405, + "balance_loss_clip": 1.02847815, + "balance_loss_mlp": 1.01847816, + "epoch": 0.4884413046745829, + "flos": 25410678036480.0, + "grad_norm": 1.3508223267404533, + "language_loss": 0.85528624, + "learning_rate": 2.1717355866790227e-06, + "loss": 0.87603086, + "num_input_tokens_seen": 174643835, + "step": 8124, + "time_per_iteration": 2.703516960144043 + }, + { + "auxiliary_loss_clip": 0.01051841, + "auxiliary_loss_mlp": 0.01030454, + "balance_loss_clip": 1.02717805, + "balance_loss_mlp": 1.01875377, + "epoch": 0.4885014279272509, + "flos": 20991546662400.0, + "grad_norm": 2.240882490178098, + "language_loss": 0.79567778, + "learning_rate": 2.171347560204948e-06, + "loss": 0.81650078, + "num_input_tokens_seen": 174660955, + "step": 8125, + "time_per_iteration": 2.616384744644165 + }, + { + "auxiliary_loss_clip": 0.01024218, + "auxiliary_loss_mlp": 0.01036307, + "balance_loss_clip": 1.02493858, + "balance_loss_mlp": 1.02352798, + "epoch": 0.48856155117991884, + "flos": 13771599269760.0, + "grad_norm": 2.1350223207480985, + "language_loss": 0.72473168, + "learning_rate": 2.170959527233356e-06, + "loss": 0.74533689, + "num_input_tokens_seen": 174678270, + "step": 8126, + "time_per_iteration": 2.7367188930511475 + }, + { + "auxiliary_loss_clip": 0.01055749, + "auxiliary_loss_mlp": 0.0103449, + "balance_loss_clip": 1.02478266, + "balance_loss_mlp": 1.02189541, + "epoch": 0.4886216744325868, + "flos": 32087764206720.0, + "grad_norm": 1.865755699380666, + "language_loss": 0.68885922, + "learning_rate": 2.1705714877789633e-06, + "loss": 0.70976162, + "num_input_tokens_seen": 174698360, + "step": 8127, + "time_per_iteration": 2.7370123863220215 + }, + { + "auxiliary_loss_clip": 0.01071618, + "auxiliary_loss_mlp": 0.01031309, + "balance_loss_clip": 1.02538383, + "balance_loss_mlp": 1.01930439, + "epoch": 0.48868179768525477, + "flos": 19610063631360.0, + "grad_norm": 1.7870895584456556, + "language_loss": 0.76284456, + "learning_rate": 2.170183441856481e-06, + "loss": 0.7838738, + "num_input_tokens_seen": 174716755, + "step": 8128, + "time_per_iteration": 2.6072239875793457 + }, + { + "auxiliary_loss_clip": 0.01072313, + "auxiliary_loss_mlp": 0.01031031, + "balance_loss_clip": 1.02846277, + "balance_loss_mlp": 1.01954496, + "epoch": 0.48874192093792274, + "flos": 21286912199040.0, + "grad_norm": 1.6377349654353026, + "language_loss": 0.75884569, + "learning_rate": 2.1697953894806265e-06, + "loss": 0.77987909, + "num_input_tokens_seen": 174735560, + "step": 8129, + "time_per_iteration": 2.562997579574585 + }, + { + "auxiliary_loss_clip": 0.01059514, + "auxiliary_loss_mlp": 0.01026712, + "balance_loss_clip": 1.02638757, + "balance_loss_mlp": 1.01451731, + "epoch": 0.4888020441905907, + "flos": 14173439696640.0, + "grad_norm": 3.0739827084144227, + "language_loss": 0.64609843, + "learning_rate": 2.169407330666114e-06, + "loss": 0.66696072, + "num_input_tokens_seen": 174752730, + "step": 8130, + "time_per_iteration": 2.6144025325775146 + }, + { + "auxiliary_loss_clip": 0.01030281, + "auxiliary_loss_mlp": 0.01029936, + "balance_loss_clip": 1.02124476, + "balance_loss_mlp": 1.01852202, + "epoch": 0.48886216744325867, + "flos": 24097891766400.0, + "grad_norm": 1.6389658436116203, + "language_loss": 0.71865416, + "learning_rate": 2.169019265427658e-06, + "loss": 0.73925632, + "num_input_tokens_seen": 174772520, + "step": 8131, + "time_per_iteration": 2.768249034881592 + }, + { + "auxiliary_loss_clip": 0.0106272, + "auxiliary_loss_mlp": 0.0103345, + "balance_loss_clip": 1.02827561, + "balance_loss_mlp": 1.02134407, + "epoch": 0.48892229069592663, + "flos": 38431419402240.0, + "grad_norm": 1.4055793447483491, + "language_loss": 0.69380069, + "learning_rate": 2.1686311937799745e-06, + "loss": 0.71476239, + "num_input_tokens_seen": 174796540, + "step": 8132, + "time_per_iteration": 2.8112030029296875 + }, + { + "auxiliary_loss_clip": 0.01052257, + "auxiliary_loss_mlp": 0.01028884, + "balance_loss_clip": 1.02615595, + "balance_loss_mlp": 1.01651621, + "epoch": 0.4889824139485946, + "flos": 23843321101440.0, + "grad_norm": 1.6233091358149698, + "language_loss": 0.69979763, + "learning_rate": 2.1682431157377797e-06, + "loss": 0.72060901, + "num_input_tokens_seen": 174817840, + "step": 8133, + "time_per_iteration": 2.8016819953918457 + }, + { + "auxiliary_loss_clip": 0.01011009, + "auxiliary_loss_mlp": 0.0103386, + "balance_loss_clip": 1.02174246, + "balance_loss_mlp": 1.02148652, + "epoch": 0.48904253720126256, + "flos": 24425827960320.0, + "grad_norm": 1.9830526891353437, + "language_loss": 0.70619726, + "learning_rate": 2.1678550313157883e-06, + "loss": 0.72664595, + "num_input_tokens_seen": 174837885, + "step": 8134, + "time_per_iteration": 2.812852621078491 + }, + { + "auxiliary_loss_clip": 0.01033758, + "auxiliary_loss_mlp": 0.01034734, + "balance_loss_clip": 1.02760577, + "balance_loss_mlp": 1.02219307, + "epoch": 0.4891026604539306, + "flos": 24170682677760.0, + "grad_norm": 1.813657472886984, + "language_loss": 0.80249155, + "learning_rate": 2.167466940528718e-06, + "loss": 0.82317638, + "num_input_tokens_seen": 174855240, + "step": 8135, + "time_per_iteration": 2.908078670501709 + }, + { + "auxiliary_loss_clip": 0.0106901, + "auxiliary_loss_mlp": 0.01030012, + "balance_loss_clip": 1.02649522, + "balance_loss_mlp": 1.01909804, + "epoch": 0.48916278370659855, + "flos": 21470954509440.0, + "grad_norm": 1.7045840560719565, + "language_loss": 0.74394149, + "learning_rate": 2.1670788433912843e-06, + "loss": 0.76493168, + "num_input_tokens_seen": 174875145, + "step": 8136, + "time_per_iteration": 2.7704946994781494 + }, + { + "auxiliary_loss_clip": 0.01042361, + "auxiliary_loss_mlp": 0.0103542, + "balance_loss_clip": 1.02494657, + "balance_loss_mlp": 1.02351725, + "epoch": 0.4892229069592665, + "flos": 22309755886080.0, + "grad_norm": 1.8323223224226461, + "language_loss": 0.73175704, + "learning_rate": 2.166690739918204e-06, + "loss": 0.75253481, + "num_input_tokens_seen": 174894770, + "step": 8137, + "time_per_iteration": 2.6574113368988037 + }, + { + "auxiliary_loss_clip": 0.01004525, + "auxiliary_loss_mlp": 0.01031211, + "balance_loss_clip": 1.02333391, + "balance_loss_mlp": 1.01918888, + "epoch": 0.4892830302119345, + "flos": 12786856934400.0, + "grad_norm": 2.0577015332472195, + "language_loss": 0.75492364, + "learning_rate": 2.1663026301241944e-06, + "loss": 0.77528095, + "num_input_tokens_seen": 174912780, + "step": 8138, + "time_per_iteration": 2.94288969039917 + }, + { + "auxiliary_loss_clip": 0.01041405, + "auxiliary_loss_mlp": 0.01031804, + "balance_loss_clip": 1.02832031, + "balance_loss_mlp": 1.02062249, + "epoch": 0.48934315346460244, + "flos": 20813896972800.0, + "grad_norm": 1.654177823111477, + "language_loss": 0.74478674, + "learning_rate": 2.165914514023972e-06, + "loss": 0.76551878, + "num_input_tokens_seen": 174931250, + "step": 8139, + "time_per_iteration": 2.885223150253296 + }, + { + "auxiliary_loss_clip": 0.01057535, + "auxiliary_loss_mlp": 0.01035798, + "balance_loss_clip": 1.02510214, + "balance_loss_mlp": 1.02425909, + "epoch": 0.4894032767172704, + "flos": 19755537713280.0, + "grad_norm": 1.7157727363092545, + "language_loss": 0.61859143, + "learning_rate": 2.165526391632255e-06, + "loss": 0.63952476, + "num_input_tokens_seen": 174951105, + "step": 8140, + "time_per_iteration": 2.671588182449341 + }, + { + "auxiliary_loss_clip": 0.01042084, + "auxiliary_loss_mlp": 0.01038961, + "balance_loss_clip": 1.02720213, + "balance_loss_mlp": 1.02631879, + "epoch": 0.4894633999699384, + "flos": 17818982835840.0, + "grad_norm": 1.7637967653592461, + "language_loss": 0.82255799, + "learning_rate": 2.1651382629637608e-06, + "loss": 0.84336853, + "num_input_tokens_seen": 174969120, + "step": 8141, + "time_per_iteration": 2.6109797954559326 + }, + { + "auxiliary_loss_clip": 0.01049319, + "auxiliary_loss_mlp": 0.0103502, + "balance_loss_clip": 1.03217244, + "balance_loss_mlp": 1.02211547, + "epoch": 0.48952352322260634, + "flos": 25523222325120.0, + "grad_norm": 1.693447423089055, + "language_loss": 0.72245824, + "learning_rate": 2.1647501280332066e-06, + "loss": 0.74330163, + "num_input_tokens_seen": 174991295, + "step": 8142, + "time_per_iteration": 2.8099470138549805 + }, + { + "auxiliary_loss_clip": 0.01067969, + "auxiliary_loss_mlp": 0.0103238, + "balance_loss_clip": 1.0259161, + "balance_loss_mlp": 1.0209713, + "epoch": 0.4895836464752743, + "flos": 29055502903680.0, + "grad_norm": 3.1913201100388413, + "language_loss": 0.66808498, + "learning_rate": 2.1643619868553105e-06, + "loss": 0.68908846, + "num_input_tokens_seen": 175012830, + "step": 8143, + "time_per_iteration": 2.668998956680298 + }, + { + "auxiliary_loss_clip": 0.01059651, + "auxiliary_loss_mlp": 0.00747405, + "balance_loss_clip": 1.02684689, + "balance_loss_mlp": 1.0001899, + "epoch": 0.48964376972794227, + "flos": 33546958312320.0, + "grad_norm": 1.8359335238650238, + "language_loss": 0.75082636, + "learning_rate": 2.163973839444793e-06, + "loss": 0.76889694, + "num_input_tokens_seen": 175035695, + "step": 8144, + "time_per_iteration": 2.7945263385772705 + }, + { + "auxiliary_loss_clip": 0.01041051, + "auxiliary_loss_mlp": 0.01028943, + "balance_loss_clip": 1.02373898, + "balance_loss_mlp": 1.01698077, + "epoch": 0.48970389298061023, + "flos": 22054035985920.0, + "grad_norm": 1.784725209765692, + "language_loss": 0.76035959, + "learning_rate": 2.1635856858163695e-06, + "loss": 0.7810595, + "num_input_tokens_seen": 175056425, + "step": 8145, + "time_per_iteration": 2.6098005771636963 + }, + { + "auxiliary_loss_clip": 0.01052943, + "auxiliary_loss_mlp": 0.00747652, + "balance_loss_clip": 1.02686024, + "balance_loss_mlp": 1.00019336, + "epoch": 0.4897640162332782, + "flos": 20084299920000.0, + "grad_norm": 1.7644237971862071, + "language_loss": 0.80400467, + "learning_rate": 2.163197525984761e-06, + "loss": 0.82201064, + "num_input_tokens_seen": 175074800, + "step": 8146, + "time_per_iteration": 2.6377272605895996 + }, + { + "auxiliary_loss_clip": 0.01056814, + "auxiliary_loss_mlp": 0.01032176, + "balance_loss_clip": 1.02437901, + "balance_loss_mlp": 1.02045178, + "epoch": 0.48982413948594616, + "flos": 23806225330560.0, + "grad_norm": 2.593175997961711, + "language_loss": 0.74255723, + "learning_rate": 2.162809359964687e-06, + "loss": 0.76344717, + "num_input_tokens_seen": 175094500, + "step": 8147, + "time_per_iteration": 2.6485278606414795 + }, + { + "auxiliary_loss_clip": 0.01034504, + "auxiliary_loss_mlp": 0.01030268, + "balance_loss_clip": 1.02473783, + "balance_loss_mlp": 1.01832342, + "epoch": 0.4898842627386142, + "flos": 17639645207040.0, + "grad_norm": 1.983639746762061, + "language_loss": 0.82904673, + "learning_rate": 2.162421187770864e-06, + "loss": 0.84969449, + "num_input_tokens_seen": 175112920, + "step": 8148, + "time_per_iteration": 2.6171443462371826 + }, + { + "auxiliary_loss_clip": 0.01033731, + "auxiliary_loss_mlp": 0.01032703, + "balance_loss_clip": 1.02473354, + "balance_loss_mlp": 1.02200997, + "epoch": 0.48994438599128215, + "flos": 16617914841600.0, + "grad_norm": 2.979477204983872, + "language_loss": 0.73881066, + "learning_rate": 2.162033009418015e-06, + "loss": 0.75947511, + "num_input_tokens_seen": 175129910, + "step": 8149, + "time_per_iteration": 2.661531925201416 + }, + { + "auxiliary_loss_clip": 0.01075652, + "auxiliary_loss_mlp": 0.01029118, + "balance_loss_clip": 1.02937794, + "balance_loss_mlp": 1.01597476, + "epoch": 0.4900045092439501, + "flos": 26614834600320.0, + "grad_norm": 2.105032542159916, + "language_loss": 0.75949448, + "learning_rate": 2.1616448249208567e-06, + "loss": 0.78054214, + "num_input_tokens_seen": 175148705, + "step": 8150, + "time_per_iteration": 2.6046299934387207 + }, + { + "auxiliary_loss_clip": 0.01053083, + "auxiliary_loss_mlp": 0.0103202, + "balance_loss_clip": 1.02800536, + "balance_loss_mlp": 1.01934814, + "epoch": 0.4900646324966181, + "flos": 19902125116800.0, + "grad_norm": 1.8059292149664592, + "language_loss": 0.7273562, + "learning_rate": 2.1612566342941106e-06, + "loss": 0.74820721, + "num_input_tokens_seen": 175167425, + "step": 8151, + "time_per_iteration": 2.5918736457824707 + }, + { + "auxiliary_loss_clip": 0.00971878, + "auxiliary_loss_mlp": 0.01006202, + "balance_loss_clip": 1.00448859, + "balance_loss_mlp": 1.00499177, + "epoch": 0.49012475574928605, + "flos": 59189620337280.0, + "grad_norm": 0.8376034089519829, + "language_loss": 0.54351437, + "learning_rate": 2.1608684375524977e-06, + "loss": 0.56329513, + "num_input_tokens_seen": 175227985, + "step": 8152, + "time_per_iteration": 3.2239997386932373 + }, + { + "auxiliary_loss_clip": 0.01016544, + "auxiliary_loss_mlp": 0.01034321, + "balance_loss_clip": 1.02483702, + "balance_loss_mlp": 1.02157116, + "epoch": 0.490184879001954, + "flos": 45259797657600.0, + "grad_norm": 1.686044057350974, + "language_loss": 0.60848236, + "learning_rate": 2.1604802347107364e-06, + "loss": 0.62899101, + "num_input_tokens_seen": 175251895, + "step": 8153, + "time_per_iteration": 3.0268046855926514 + }, + { + "auxiliary_loss_clip": 0.01039509, + "auxiliary_loss_mlp": 0.0103878, + "balance_loss_clip": 1.02584159, + "balance_loss_mlp": 1.02684689, + "epoch": 0.490245002254622, + "flos": 28002135634560.0, + "grad_norm": 1.4648355255229852, + "language_loss": 0.76626623, + "learning_rate": 2.160092025783549e-06, + "loss": 0.78704917, + "num_input_tokens_seen": 175272770, + "step": 8154, + "time_per_iteration": 2.850311756134033 + }, + { + "auxiliary_loss_clip": 0.00989659, + "auxiliary_loss_mlp": 0.01003861, + "balance_loss_clip": 1.00202656, + "balance_loss_mlp": 1.00250208, + "epoch": 0.49030512550728994, + "flos": 58951318533120.0, + "grad_norm": 0.9642738970799505, + "language_loss": 0.67046905, + "learning_rate": 2.1597038107856564e-06, + "loss": 0.69040424, + "num_input_tokens_seen": 175336320, + "step": 8155, + "time_per_iteration": 3.3031790256500244 + }, + { + "auxiliary_loss_clip": 0.01071815, + "auxiliary_loss_mlp": 0.01031106, + "balance_loss_clip": 1.02834523, + "balance_loss_mlp": 1.01933432, + "epoch": 0.4903652487599579, + "flos": 19791843384960.0, + "grad_norm": 1.9978409234307204, + "language_loss": 0.76669312, + "learning_rate": 2.1593155897317784e-06, + "loss": 0.78772235, + "num_input_tokens_seen": 175353540, + "step": 8156, + "time_per_iteration": 2.533846378326416 + }, + { + "auxiliary_loss_clip": 0.01062544, + "auxiliary_loss_mlp": 0.01031346, + "balance_loss_clip": 1.0276196, + "balance_loss_mlp": 1.01922798, + "epoch": 0.49042537201262587, + "flos": 21762082241280.0, + "grad_norm": 2.050952518846936, + "language_loss": 0.83510292, + "learning_rate": 2.1589273626366377e-06, + "loss": 0.85604179, + "num_input_tokens_seen": 175370445, + "step": 8157, + "time_per_iteration": 4.350445508956909 + }, + { + "auxiliary_loss_clip": 0.01060822, + "auxiliary_loss_mlp": 0.01027861, + "balance_loss_clip": 1.02624917, + "balance_loss_mlp": 1.01576686, + "epoch": 0.49048549526529384, + "flos": 18953042008320.0, + "grad_norm": 1.708434152854344, + "language_loss": 0.79700726, + "learning_rate": 2.158539129514956e-06, + "loss": 0.8178941, + "num_input_tokens_seen": 175389020, + "step": 8158, + "time_per_iteration": 4.1436991691589355 + }, + { + "auxiliary_loss_clip": 0.01075352, + "auxiliary_loss_mlp": 0.01030461, + "balance_loss_clip": 1.02988839, + "balance_loss_mlp": 1.01781344, + "epoch": 0.4905456185179618, + "flos": 26906393295360.0, + "grad_norm": 1.5455851172448773, + "language_loss": 0.69028473, + "learning_rate": 2.158150890381454e-06, + "loss": 0.71134281, + "num_input_tokens_seen": 175409545, + "step": 8159, + "time_per_iteration": 2.580686092376709 + }, + { + "auxiliary_loss_clip": 0.01052349, + "auxiliary_loss_mlp": 0.01032709, + "balance_loss_clip": 1.02431929, + "balance_loss_mlp": 1.01995325, + "epoch": 0.49060574177062977, + "flos": 20412343854720.0, + "grad_norm": 1.9639746720921176, + "language_loss": 0.73433375, + "learning_rate": 2.157762645250854e-06, + "loss": 0.75518435, + "num_input_tokens_seen": 175429335, + "step": 8160, + "time_per_iteration": 2.5519027709960938 + }, + { + "auxiliary_loss_clip": 0.0106308, + "auxiliary_loss_mlp": 0.01038104, + "balance_loss_clip": 1.02729034, + "balance_loss_mlp": 1.02431202, + "epoch": 0.4906658650232978, + "flos": 17493704248320.0, + "grad_norm": 1.7391076418759408, + "language_loss": 0.7156859, + "learning_rate": 2.1573743941378796e-06, + "loss": 0.73669779, + "num_input_tokens_seen": 175446955, + "step": 8161, + "time_per_iteration": 2.5815858840942383 + }, + { + "auxiliary_loss_clip": 0.0102538, + "auxiliary_loss_mlp": 0.0103478, + "balance_loss_clip": 1.02569997, + "balance_loss_mlp": 1.02287698, + "epoch": 0.49072598827596575, + "flos": 26614439550720.0, + "grad_norm": 22.940386692164783, + "language_loss": 0.68534255, + "learning_rate": 2.1569861370572517e-06, + "loss": 0.70594418, + "num_input_tokens_seen": 175468195, + "step": 8162, + "time_per_iteration": 2.738865375518799 + }, + { + "auxiliary_loss_clip": 0.01068454, + "auxiliary_loss_mlp": 0.01035025, + "balance_loss_clip": 1.03053284, + "balance_loss_mlp": 1.0211376, + "epoch": 0.4907861115286337, + "flos": 20412595249920.0, + "grad_norm": 1.5917650519544109, + "language_loss": 0.6342687, + "learning_rate": 2.1565978740236944e-06, + "loss": 0.65530348, + "num_input_tokens_seen": 175487455, + "step": 8163, + "time_per_iteration": 2.5507004261016846 + }, + { + "auxiliary_loss_clip": 0.0103887, + "auxiliary_loss_mlp": 0.01032816, + "balance_loss_clip": 1.0261296, + "balance_loss_mlp": 1.02071071, + "epoch": 0.4908462347813017, + "flos": 14064271286400.0, + "grad_norm": 2.362962946107199, + "language_loss": 0.77143776, + "learning_rate": 2.1562096050519293e-06, + "loss": 0.79215467, + "num_input_tokens_seen": 175504450, + "step": 8164, + "time_per_iteration": 2.6407523155212402 + }, + { + "auxiliary_loss_clip": 0.01053516, + "auxiliary_loss_mlp": 0.01031099, + "balance_loss_clip": 1.02417779, + "balance_loss_mlp": 1.01742589, + "epoch": 0.49090635803396965, + "flos": 18735100237440.0, + "grad_norm": 1.6505928011645565, + "language_loss": 0.76794052, + "learning_rate": 2.1558213301566806e-06, + "loss": 0.78878665, + "num_input_tokens_seen": 175523600, + "step": 8165, + "time_per_iteration": 2.560451030731201 + }, + { + "auxiliary_loss_clip": 0.01053866, + "auxiliary_loss_mlp": 0.01032724, + "balance_loss_clip": 1.03021741, + "balance_loss_mlp": 1.02038622, + "epoch": 0.4909664812866376, + "flos": 20558500295040.0, + "grad_norm": 1.5037562249801313, + "language_loss": 0.77285755, + "learning_rate": 2.1554330493526716e-06, + "loss": 0.79372346, + "num_input_tokens_seen": 175542720, + "step": 8166, + "time_per_iteration": 2.7387497425079346 + }, + { + "auxiliary_loss_clip": 0.01000779, + "auxiliary_loss_mlp": 0.01018615, + "balance_loss_clip": 1.00270629, + "balance_loss_mlp": 1.0172267, + "epoch": 0.4910266045393056, + "flos": 54684017948160.0, + "grad_norm": 0.8167183257244681, + "language_loss": 0.54207146, + "learning_rate": 2.1550447626546253e-06, + "loss": 0.5622654, + "num_input_tokens_seen": 175598640, + "step": 8167, + "time_per_iteration": 3.1723620891571045 + }, + { + "auxiliary_loss_clip": 0.01030198, + "auxiliary_loss_mlp": 0.01031105, + "balance_loss_clip": 1.02697587, + "balance_loss_mlp": 1.0195179, + "epoch": 0.49108672779197354, + "flos": 16246454342400.0, + "grad_norm": 1.8779689629187917, + "language_loss": 0.85459489, + "learning_rate": 2.1546564700772665e-06, + "loss": 0.8752079, + "num_input_tokens_seen": 175615675, + "step": 8168, + "time_per_iteration": 2.786133050918579 + }, + { + "auxiliary_loss_clip": 0.01050944, + "auxiliary_loss_mlp": 0.01035508, + "balance_loss_clip": 1.02439487, + "balance_loss_mlp": 1.02210307, + "epoch": 0.4911468510446415, + "flos": 19825419623040.0, + "grad_norm": 1.6913151482967108, + "language_loss": 0.7345624, + "learning_rate": 2.1542681716353193e-06, + "loss": 0.75542694, + "num_input_tokens_seen": 175632255, + "step": 8169, + "time_per_iteration": 5.842335224151611 + }, + { + "auxiliary_loss_clip": 0.01059369, + "auxiliary_loss_mlp": 0.01028023, + "balance_loss_clip": 1.02549791, + "balance_loss_mlp": 1.01673377, + "epoch": 0.4912069742973095, + "flos": 21212684743680.0, + "grad_norm": 1.4910705851697623, + "language_loss": 0.78151786, + "learning_rate": 2.1538798673435068e-06, + "loss": 0.80239183, + "num_input_tokens_seen": 175651625, + "step": 8170, + "time_per_iteration": 2.61149263381958 + }, + { + "auxiliary_loss_clip": 0.01039253, + "auxiliary_loss_mlp": 0.01033484, + "balance_loss_clip": 1.02310514, + "balance_loss_mlp": 1.02196252, + "epoch": 0.49126709754997744, + "flos": 19537129065600.0, + "grad_norm": 2.107158103753171, + "language_loss": 0.76257324, + "learning_rate": 2.1534915572165545e-06, + "loss": 0.78330058, + "num_input_tokens_seen": 175669265, + "step": 8171, + "time_per_iteration": 2.6629207134246826 + }, + { + "auxiliary_loss_clip": 0.0104863, + "auxiliary_loss_mlp": 0.01036186, + "balance_loss_clip": 1.02526748, + "balance_loss_mlp": 1.02467072, + "epoch": 0.4913272208026454, + "flos": 12239686080000.0, + "grad_norm": 1.958427475770498, + "language_loss": 0.81913209, + "learning_rate": 2.1531032412691875e-06, + "loss": 0.83998024, + "num_input_tokens_seen": 175686065, + "step": 8172, + "time_per_iteration": 2.7856011390686035 + }, + { + "auxiliary_loss_clip": 0.01003478, + "auxiliary_loss_mlp": 0.01007008, + "balance_loss_clip": 1.00599611, + "balance_loss_mlp": 1.00565493, + "epoch": 0.49138734405531337, + "flos": 65465871661440.0, + "grad_norm": 0.6914202508679641, + "language_loss": 0.53288853, + "learning_rate": 2.1527149195161295e-06, + "loss": 0.55299342, + "num_input_tokens_seen": 175748595, + "step": 8173, + "time_per_iteration": 3.1709110736846924 + }, + { + "auxiliary_loss_clip": 0.01062591, + "auxiliary_loss_mlp": 0.00747504, + "balance_loss_clip": 1.02687502, + "balance_loss_mlp": 1.00018454, + "epoch": 0.4914474673079814, + "flos": 18439052342400.0, + "grad_norm": 2.185069081664477, + "language_loss": 0.63064671, + "learning_rate": 2.152326591972107e-06, + "loss": 0.64874768, + "num_input_tokens_seen": 175766770, + "step": 8174, + "time_per_iteration": 2.6266841888427734 + }, + { + "auxiliary_loss_clip": 0.01034013, + "auxiliary_loss_mlp": 0.01037671, + "balance_loss_clip": 1.0261004, + "balance_loss_mlp": 1.02421188, + "epoch": 0.49150759056064935, + "flos": 21685053525120.0, + "grad_norm": 1.6875335541853669, + "language_loss": 0.69085062, + "learning_rate": 2.1519382586518445e-06, + "loss": 0.71156746, + "num_input_tokens_seen": 175783605, + "step": 8175, + "time_per_iteration": 2.639803171157837 + }, + { + "auxiliary_loss_clip": 0.01062569, + "auxiliary_loss_mlp": 0.01026621, + "balance_loss_clip": 1.02821696, + "balance_loss_mlp": 1.01522458, + "epoch": 0.4915677138133173, + "flos": 22382439056640.0, + "grad_norm": 2.0871083684631864, + "language_loss": 0.74534029, + "learning_rate": 2.151549919570068e-06, + "loss": 0.76623213, + "num_input_tokens_seen": 175801390, + "step": 8176, + "time_per_iteration": 2.6446428298950195 + }, + { + "auxiliary_loss_clip": 0.01064141, + "auxiliary_loss_mlp": 0.01038407, + "balance_loss_clip": 1.0287323, + "balance_loss_mlp": 1.02610445, + "epoch": 0.4916278370659853, + "flos": 18402890325120.0, + "grad_norm": 1.711957636064281, + "language_loss": 0.70038182, + "learning_rate": 2.1511615747415036e-06, + "loss": 0.72140729, + "num_input_tokens_seen": 175819830, + "step": 8177, + "time_per_iteration": 2.6203274726867676 + }, + { + "auxiliary_loss_clip": 0.00993536, + "auxiliary_loss_mlp": 0.007465, + "balance_loss_clip": 1.00566602, + "balance_loss_mlp": 1.00005114, + "epoch": 0.49168796031865325, + "flos": 66609124715520.0, + "grad_norm": 0.6816439670734067, + "language_loss": 0.46267194, + "learning_rate": 2.150773224180877e-06, + "loss": 0.48007226, + "num_input_tokens_seen": 175881765, + "step": 8178, + "time_per_iteration": 3.2016193866729736 + }, + { + "auxiliary_loss_clip": 0.01075301, + "auxiliary_loss_mlp": 0.01033871, + "balance_loss_clip": 1.02949631, + "balance_loss_mlp": 1.02090144, + "epoch": 0.4917480835713212, + "flos": 20959335141120.0, + "grad_norm": 1.7737934149815044, + "language_loss": 0.6551435, + "learning_rate": 2.1503848679029147e-06, + "loss": 0.6762352, + "num_input_tokens_seen": 175901795, + "step": 8179, + "time_per_iteration": 2.7546539306640625 + }, + { + "auxiliary_loss_clip": 0.00991795, + "auxiliary_loss_mlp": 0.01050912, + "balance_loss_clip": 1.02482986, + "balance_loss_mlp": 1.03684497, + "epoch": 0.4918082068239892, + "flos": 15772900412160.0, + "grad_norm": 2.0248196074638716, + "language_loss": 0.70134121, + "learning_rate": 2.149996505922343e-06, + "loss": 0.72176832, + "num_input_tokens_seen": 175917770, + "step": 8180, + "time_per_iteration": 2.993381977081299 + }, + { + "auxiliary_loss_clip": 0.01047778, + "auxiliary_loss_mlp": 0.01032036, + "balance_loss_clip": 1.02555108, + "balance_loss_mlp": 1.02041316, + "epoch": 0.49186833007665715, + "flos": 24604806453120.0, + "grad_norm": 1.7983900271897018, + "language_loss": 0.84137559, + "learning_rate": 2.1496081382538895e-06, + "loss": 0.86217368, + "num_input_tokens_seen": 175937000, + "step": 8181, + "time_per_iteration": 2.9563074111938477 + }, + { + "auxiliary_loss_clip": 0.01069665, + "auxiliary_loss_mlp": 0.01030416, + "balance_loss_clip": 1.02778101, + "balance_loss_mlp": 1.01921034, + "epoch": 0.4919284533293251, + "flos": 22090557139200.0, + "grad_norm": 2.0385780708354764, + "language_loss": 0.72917247, + "learning_rate": 2.1492197649122793e-06, + "loss": 0.75017327, + "num_input_tokens_seen": 175955170, + "step": 8182, + "time_per_iteration": 2.5152761936187744 + }, + { + "auxiliary_loss_clip": 0.01042083, + "auxiliary_loss_mlp": 0.01031145, + "balance_loss_clip": 1.02867115, + "balance_loss_mlp": 1.01990926, + "epoch": 0.4919885765819931, + "flos": 23368043318400.0, + "grad_norm": 1.9393230502188186, + "language_loss": 0.72766453, + "learning_rate": 2.1488313859122412e-06, + "loss": 0.74839681, + "num_input_tokens_seen": 175973725, + "step": 8183, + "time_per_iteration": 2.6837947368621826 + }, + { + "auxiliary_loss_clip": 0.01011873, + "auxiliary_loss_mlp": 0.01033996, + "balance_loss_clip": 1.02349794, + "balance_loss_mlp": 1.020895, + "epoch": 0.49204869983466104, + "flos": 21360493209600.0, + "grad_norm": 3.454399875645761, + "language_loss": 0.77794415, + "learning_rate": 2.1484430012685015e-06, + "loss": 0.79840291, + "num_input_tokens_seen": 175993885, + "step": 8184, + "time_per_iteration": 2.766352653503418 + }, + { + "auxiliary_loss_clip": 0.01044709, + "auxiliary_loss_mlp": 0.01034428, + "balance_loss_clip": 1.02710235, + "balance_loss_mlp": 1.02256095, + "epoch": 0.492108823087329, + "flos": 21142695093120.0, + "grad_norm": 1.9480477216227765, + "language_loss": 0.70702589, + "learning_rate": 2.148054610995789e-06, + "loss": 0.7278173, + "num_input_tokens_seen": 176014210, + "step": 8185, + "time_per_iteration": 2.6957345008850098 + }, + { + "auxiliary_loss_clip": 0.01052056, + "auxiliary_loss_mlp": 0.01033379, + "balance_loss_clip": 1.02730572, + "balance_loss_mlp": 1.02118945, + "epoch": 0.49216894633999697, + "flos": 25116605389440.0, + "grad_norm": 1.672594331683498, + "language_loss": 0.75558603, + "learning_rate": 2.147666215108831e-06, + "loss": 0.77644032, + "num_input_tokens_seen": 176033890, + "step": 8186, + "time_per_iteration": 2.7065398693084717 + }, + { + "auxiliary_loss_clip": 0.01062295, + "auxiliary_loss_mlp": 0.01031592, + "balance_loss_clip": 1.02743852, + "balance_loss_mlp": 1.01951599, + "epoch": 0.49222906959266494, + "flos": 22637943475200.0, + "grad_norm": 1.9491053801526632, + "language_loss": 0.67583156, + "learning_rate": 2.1472778136223545e-06, + "loss": 0.69677043, + "num_input_tokens_seen": 176052720, + "step": 8187, + "time_per_iteration": 3.0824358463287354 + }, + { + "auxiliary_loss_clip": 0.0102565, + "auxiliary_loss_mlp": 0.0103065, + "balance_loss_clip": 1.02495003, + "balance_loss_mlp": 1.01886046, + "epoch": 0.49228919284533296, + "flos": 20410548174720.0, + "grad_norm": 1.3716108716083737, + "language_loss": 0.67074221, + "learning_rate": 2.1468894065510894e-06, + "loss": 0.69130522, + "num_input_tokens_seen": 176072545, + "step": 8188, + "time_per_iteration": 2.7912747859954834 + }, + { + "auxiliary_loss_clip": 0.01064771, + "auxiliary_loss_mlp": 0.01025463, + "balance_loss_clip": 1.03042614, + "balance_loss_mlp": 1.01464438, + "epoch": 0.4923493160980009, + "flos": 27122359818240.0, + "grad_norm": 1.6049415333674326, + "language_loss": 0.74790347, + "learning_rate": 2.1465009939097623e-06, + "loss": 0.76880574, + "num_input_tokens_seen": 176091490, + "step": 8189, + "time_per_iteration": 2.6476404666900635 + }, + { + "auxiliary_loss_clip": 0.01048862, + "auxiliary_loss_mlp": 0.01026314, + "balance_loss_clip": 1.02599823, + "balance_loss_mlp": 1.01490557, + "epoch": 0.4924094393506689, + "flos": 35736683224320.0, + "grad_norm": 1.6081621733858664, + "language_loss": 0.6419251, + "learning_rate": 2.146112575713104e-06, + "loss": 0.66267681, + "num_input_tokens_seen": 176113200, + "step": 8190, + "time_per_iteration": 2.8049533367156982 + }, + { + "auxiliary_loss_clip": 0.010716, + "auxiliary_loss_mlp": 0.01024191, + "balance_loss_clip": 1.02965271, + "balance_loss_mlp": 1.01331282, + "epoch": 0.49246956260333685, + "flos": 20412487509120.0, + "grad_norm": 2.169793170882606, + "language_loss": 0.71486795, + "learning_rate": 2.1457241519758413e-06, + "loss": 0.73582578, + "num_input_tokens_seen": 176132485, + "step": 8191, + "time_per_iteration": 2.620713472366333 + }, + { + "auxiliary_loss_clip": 0.01070242, + "auxiliary_loss_mlp": 0.00747478, + "balance_loss_clip": 1.02701712, + "balance_loss_mlp": 1.00021005, + "epoch": 0.4925296858560048, + "flos": 38976938231040.0, + "grad_norm": 2.418181101568982, + "language_loss": 0.72084033, + "learning_rate": 2.1453357227127043e-06, + "loss": 0.73901749, + "num_input_tokens_seen": 176155755, + "step": 8192, + "time_per_iteration": 2.8331775665283203 + }, + { + "auxiliary_loss_clip": 0.0099004, + "auxiliary_loss_mlp": 0.01002135, + "balance_loss_clip": 1.00199604, + "balance_loss_mlp": 1.00069833, + "epoch": 0.4925898091086728, + "flos": 64278917712000.0, + "grad_norm": 0.7327429141201095, + "language_loss": 0.52165651, + "learning_rate": 2.1449472879384224e-06, + "loss": 0.54157823, + "num_input_tokens_seen": 176216295, + "step": 8193, + "time_per_iteration": 3.314979314804077 + }, + { + "auxiliary_loss_clip": 0.01071294, + "auxiliary_loss_mlp": 0.01037626, + "balance_loss_clip": 1.02925396, + "balance_loss_mlp": 1.02582383, + "epoch": 0.49264993236134075, + "flos": 23036372110080.0, + "grad_norm": 1.5948105222900222, + "language_loss": 0.7720046, + "learning_rate": 2.1445588476677246e-06, + "loss": 0.7930938, + "num_input_tokens_seen": 176235925, + "step": 8194, + "time_per_iteration": 2.6632189750671387 + }, + { + "auxiliary_loss_clip": 0.01040136, + "auxiliary_loss_mlp": 0.01026231, + "balance_loss_clip": 1.02361512, + "balance_loss_mlp": 1.01506686, + "epoch": 0.4927100556140087, + "flos": 24718212668160.0, + "grad_norm": 1.8750601873816712, + "language_loss": 0.70386481, + "learning_rate": 2.144170401915341e-06, + "loss": 0.72452855, + "num_input_tokens_seen": 176253865, + "step": 8195, + "time_per_iteration": 2.697751522064209 + }, + { + "auxiliary_loss_clip": 0.01033243, + "auxiliary_loss_mlp": 0.0102422, + "balance_loss_clip": 1.02836633, + "balance_loss_mlp": 1.01346755, + "epoch": 0.4927701788666767, + "flos": 23505544581120.0, + "grad_norm": 2.201111453011063, + "language_loss": 0.80731791, + "learning_rate": 2.143781950696001e-06, + "loss": 0.82789254, + "num_input_tokens_seen": 176271525, + "step": 8196, + "time_per_iteration": 2.8357150554656982 + }, + { + "auxiliary_loss_clip": 0.01031632, + "auxiliary_loss_mlp": 0.01034302, + "balance_loss_clip": 1.02287412, + "balance_loss_mlp": 1.02157068, + "epoch": 0.49283030211934464, + "flos": 22928891639040.0, + "grad_norm": 1.6850682551453633, + "language_loss": 0.70375144, + "learning_rate": 2.1433934940244356e-06, + "loss": 0.72441071, + "num_input_tokens_seen": 176290810, + "step": 8197, + "time_per_iteration": 2.7748637199401855 + }, + { + "auxiliary_loss_clip": 0.01059361, + "auxiliary_loss_mlp": 0.01029662, + "balance_loss_clip": 1.02765298, + "balance_loss_mlp": 1.0185101, + "epoch": 0.4928904253720126, + "flos": 16873024210560.0, + "grad_norm": 2.107829452633497, + "language_loss": 0.84082538, + "learning_rate": 2.143005031915374e-06, + "loss": 0.86171561, + "num_input_tokens_seen": 176309165, + "step": 8198, + "time_per_iteration": 2.654897928237915 + }, + { + "auxiliary_loss_clip": 0.01064011, + "auxiliary_loss_mlp": 0.01033368, + "balance_loss_clip": 1.0288403, + "balance_loss_mlp": 1.021173, + "epoch": 0.4929505486246806, + "flos": 14866551509760.0, + "grad_norm": 1.8752124820037772, + "language_loss": 0.76477301, + "learning_rate": 2.1426165643835467e-06, + "loss": 0.78574681, + "num_input_tokens_seen": 176324960, + "step": 8199, + "time_per_iteration": 2.626150131225586 + }, + { + "auxiliary_loss_clip": 0.01044422, + "auxiliary_loss_mlp": 0.01039672, + "balance_loss_clip": 1.02520943, + "balance_loss_mlp": 1.02643442, + "epoch": 0.49301067187734854, + "flos": 23842351434240.0, + "grad_norm": 1.4169142176679381, + "language_loss": 0.59969723, + "learning_rate": 2.1422280914436864e-06, + "loss": 0.62053812, + "num_input_tokens_seen": 176346195, + "step": 8200, + "time_per_iteration": 2.7198822498321533 + }, + { + "auxiliary_loss_clip": 0.01057698, + "auxiliary_loss_mlp": 0.01036229, + "balance_loss_clip": 1.02645016, + "balance_loss_mlp": 1.02473664, + "epoch": 0.49307079513001656, + "flos": 22491284244480.0, + "grad_norm": 1.530794285008555, + "language_loss": 0.7917819, + "learning_rate": 2.1418396131105213e-06, + "loss": 0.81272113, + "num_input_tokens_seen": 176366735, + "step": 8201, + "time_per_iteration": 2.734701156616211 + }, + { + "auxiliary_loss_clip": 0.01061416, + "auxiliary_loss_mlp": 0.01034138, + "balance_loss_clip": 1.02588224, + "balance_loss_mlp": 1.02079237, + "epoch": 0.4931309183826845, + "flos": 15924587546880.0, + "grad_norm": 2.518646508190283, + "language_loss": 0.67924988, + "learning_rate": 2.141451129398785e-06, + "loss": 0.70020545, + "num_input_tokens_seen": 176384475, + "step": 8202, + "time_per_iteration": 2.7250609397888184 + }, + { + "auxiliary_loss_clip": 0.0104711, + "auxiliary_loss_mlp": 0.01028752, + "balance_loss_clip": 1.0253675, + "balance_loss_mlp": 1.01723647, + "epoch": 0.4931910416353525, + "flos": 27309059735040.0, + "grad_norm": 2.940771591115864, + "language_loss": 0.75498319, + "learning_rate": 2.1410626403232076e-06, + "loss": 0.77574182, + "num_input_tokens_seen": 176402645, + "step": 8203, + "time_per_iteration": 2.7910170555114746 + }, + { + "auxiliary_loss_clip": 0.01030568, + "auxiliary_loss_mlp": 0.01035247, + "balance_loss_clip": 1.02781296, + "balance_loss_mlp": 1.02395189, + "epoch": 0.49325116488802045, + "flos": 20806139635200.0, + "grad_norm": 1.9838364492206293, + "language_loss": 0.8047359, + "learning_rate": 2.1406741458985197e-06, + "loss": 0.82539409, + "num_input_tokens_seen": 176416715, + "step": 8204, + "time_per_iteration": 4.461494445800781 + }, + { + "auxiliary_loss_clip": 0.01059257, + "auxiliary_loss_mlp": 0.01034156, + "balance_loss_clip": 1.02781868, + "balance_loss_mlp": 1.02332592, + "epoch": 0.4933112881406884, + "flos": 19865963099520.0, + "grad_norm": 1.869285982375042, + "language_loss": 0.65907758, + "learning_rate": 2.140285646139455e-06, + "loss": 0.68001163, + "num_input_tokens_seen": 176435755, + "step": 8205, + "time_per_iteration": 4.4054484367370605 + }, + { + "auxiliary_loss_clip": 0.01076228, + "auxiliary_loss_mlp": 0.01038716, + "balance_loss_clip": 1.03007317, + "balance_loss_mlp": 1.02584136, + "epoch": 0.4933714113933564, + "flos": 21827977741440.0, + "grad_norm": 2.038347144758469, + "language_loss": 0.66429722, + "learning_rate": 2.139897141060744e-06, + "loss": 0.68544668, + "num_input_tokens_seen": 176453915, + "step": 8206, + "time_per_iteration": 2.720041275024414 + }, + { + "auxiliary_loss_clip": 0.01027391, + "auxiliary_loss_mlp": 0.0102964, + "balance_loss_clip": 1.02536595, + "balance_loss_mlp": 1.01703906, + "epoch": 0.49343153464602435, + "flos": 27890130049920.0, + "grad_norm": 1.622286268288698, + "language_loss": 0.76517379, + "learning_rate": 2.1395086306771196e-06, + "loss": 0.78574413, + "num_input_tokens_seen": 176475175, + "step": 8207, + "time_per_iteration": 2.860440731048584 + }, + { + "auxiliary_loss_clip": 0.01048429, + "auxiliary_loss_mlp": 0.01037151, + "balance_loss_clip": 1.02772355, + "balance_loss_mlp": 1.02470565, + "epoch": 0.4934916578986923, + "flos": 24681080983680.0, + "grad_norm": 2.862689017446616, + "language_loss": 0.596955, + "learning_rate": 2.1391201150033147e-06, + "loss": 0.61781079, + "num_input_tokens_seen": 176494250, + "step": 8208, + "time_per_iteration": 2.759442090988159 + }, + { + "auxiliary_loss_clip": 0.01053506, + "auxiliary_loss_mlp": 0.01032382, + "balance_loss_clip": 1.02757859, + "balance_loss_mlp": 1.01994276, + "epoch": 0.4935517811513603, + "flos": 23405139089280.0, + "grad_norm": 1.8728972076557624, + "language_loss": 0.77957702, + "learning_rate": 2.1387315940540598e-06, + "loss": 0.8004359, + "num_input_tokens_seen": 176513325, + "step": 8209, + "time_per_iteration": 2.7674992084503174 + }, + { + "auxiliary_loss_clip": 0.01036562, + "auxiliary_loss_mlp": 0.00747505, + "balance_loss_clip": 1.02194643, + "balance_loss_mlp": 1.00017667, + "epoch": 0.49361190440402825, + "flos": 21944508439680.0, + "grad_norm": 2.780135015814985, + "language_loss": 0.79042745, + "learning_rate": 2.138343067844089e-06, + "loss": 0.80826819, + "num_input_tokens_seen": 176532915, + "step": 8210, + "time_per_iteration": 2.718085527420044 + }, + { + "auxiliary_loss_clip": 0.01064347, + "auxiliary_loss_mlp": 0.0103308, + "balance_loss_clip": 1.02877402, + "balance_loss_mlp": 1.02015185, + "epoch": 0.4936720276566962, + "flos": 25115671635840.0, + "grad_norm": 1.9293147693716497, + "language_loss": 0.80914128, + "learning_rate": 2.1379545363881363e-06, + "loss": 0.83011556, + "num_input_tokens_seen": 176552775, + "step": 8211, + "time_per_iteration": 2.6720669269561768 + }, + { + "auxiliary_loss_clip": 0.01024636, + "auxiliary_loss_mlp": 0.01044016, + "balance_loss_clip": 1.02331364, + "balance_loss_mlp": 1.03043234, + "epoch": 0.4937321509093642, + "flos": 26358935132160.0, + "grad_norm": 2.334980830294812, + "language_loss": 0.91690934, + "learning_rate": 2.137565999700933e-06, + "loss": 0.93759584, + "num_input_tokens_seen": 176572185, + "step": 8212, + "time_per_iteration": 2.752703905105591 + }, + { + "auxiliary_loss_clip": 0.01026308, + "auxiliary_loss_mlp": 0.01033962, + "balance_loss_clip": 1.02334094, + "balance_loss_mlp": 1.02102149, + "epoch": 0.49379227416203214, + "flos": 22961390469120.0, + "grad_norm": 1.8076173650183494, + "language_loss": 0.64909244, + "learning_rate": 2.1371774577972138e-06, + "loss": 0.66969514, + "num_input_tokens_seen": 176591490, + "step": 8213, + "time_per_iteration": 2.7620484828948975 + }, + { + "auxiliary_loss_clip": 0.01020764, + "auxiliary_loss_mlp": 0.00747438, + "balance_loss_clip": 1.02379584, + "balance_loss_mlp": 1.00021124, + "epoch": 0.49385239741470016, + "flos": 32489101843200.0, + "grad_norm": 2.2238694848494465, + "language_loss": 0.75531662, + "learning_rate": 2.136788910691711e-06, + "loss": 0.77299869, + "num_input_tokens_seen": 176612715, + "step": 8214, + "time_per_iteration": 2.8534226417541504 + }, + { + "auxiliary_loss_clip": 0.01072028, + "auxiliary_loss_mlp": 0.01032856, + "balance_loss_clip": 1.02810907, + "balance_loss_mlp": 1.02107263, + "epoch": 0.4939125206673681, + "flos": 22492864442880.0, + "grad_norm": 2.049697336479733, + "language_loss": 0.84090209, + "learning_rate": 2.1364003583991594e-06, + "loss": 0.86195099, + "num_input_tokens_seen": 176631950, + "step": 8215, + "time_per_iteration": 2.6115448474884033 + }, + { + "auxiliary_loss_clip": 0.01055813, + "auxiliary_loss_mlp": 0.01027627, + "balance_loss_clip": 1.02493048, + "balance_loss_mlp": 1.01775098, + "epoch": 0.4939726439200361, + "flos": 31176351486720.0, + "grad_norm": 1.784914753754071, + "language_loss": 0.83788896, + "learning_rate": 2.136011800934292e-06, + "loss": 0.85872334, + "num_input_tokens_seen": 176653060, + "step": 8216, + "time_per_iteration": 4.247039556503296 + }, + { + "auxiliary_loss_clip": 0.01047498, + "auxiliary_loss_mlp": 0.01031039, + "balance_loss_clip": 1.02573967, + "balance_loss_mlp": 1.01980317, + "epoch": 0.49403276717270406, + "flos": 22674213233280.0, + "grad_norm": 1.4006443484879558, + "language_loss": 0.74551284, + "learning_rate": 2.1356232383118442e-06, + "loss": 0.76629823, + "num_input_tokens_seen": 176673895, + "step": 8217, + "time_per_iteration": 4.266698598861694 + }, + { + "auxiliary_loss_clip": 0.01067694, + "auxiliary_loss_mlp": 0.00747079, + "balance_loss_clip": 1.02733397, + "balance_loss_mlp": 1.00019395, + "epoch": 0.494092890425372, + "flos": 20741070147840.0, + "grad_norm": 1.6038702843578703, + "language_loss": 0.7864902, + "learning_rate": 2.1352346705465494e-06, + "loss": 0.80463791, + "num_input_tokens_seen": 176692550, + "step": 8218, + "time_per_iteration": 2.5570600032806396 + }, + { + "auxiliary_loss_clip": 0.01023881, + "auxiliary_loss_mlp": 0.00747251, + "balance_loss_clip": 1.02428436, + "balance_loss_mlp": 1.00015998, + "epoch": 0.49415301367804, + "flos": 18369026778240.0, + "grad_norm": 2.1449943288997826, + "language_loss": 0.76310372, + "learning_rate": 2.134846097653142e-06, + "loss": 0.78081501, + "num_input_tokens_seen": 176709335, + "step": 8219, + "time_per_iteration": 2.7613601684570312 + }, + { + "auxiliary_loss_clip": 0.01049423, + "auxiliary_loss_mlp": 0.01029622, + "balance_loss_clip": 1.02699041, + "balance_loss_mlp": 1.01817226, + "epoch": 0.49421313693070795, + "flos": 17530620451200.0, + "grad_norm": 1.6845924985535674, + "language_loss": 0.62283409, + "learning_rate": 2.134457519646357e-06, + "loss": 0.64362454, + "num_input_tokens_seen": 176727715, + "step": 8220, + "time_per_iteration": 2.6438515186309814 + }, + { + "auxiliary_loss_clip": 0.0106747, + "auxiliary_loss_mlp": 0.01030427, + "balance_loss_clip": 1.0251559, + "balance_loss_mlp": 1.01932287, + "epoch": 0.4942732601833759, + "flos": 20812173120000.0, + "grad_norm": 1.7542395112179705, + "language_loss": 0.72068, + "learning_rate": 2.1340689365409296e-06, + "loss": 0.74165893, + "num_input_tokens_seen": 176747530, + "step": 8221, + "time_per_iteration": 2.532156467437744 + }, + { + "auxiliary_loss_clip": 0.01035586, + "auxiliary_loss_mlp": 0.01031407, + "balance_loss_clip": 1.02699232, + "balance_loss_mlp": 1.02098823, + "epoch": 0.4943333834360439, + "flos": 15048941794560.0, + "grad_norm": 1.8060008315447111, + "language_loss": 0.79256499, + "learning_rate": 2.133680348351595e-06, + "loss": 0.81323493, + "num_input_tokens_seen": 176765260, + "step": 8222, + "time_per_iteration": 2.7083489894866943 + }, + { + "auxiliary_loss_clip": 0.01058667, + "auxiliary_loss_mlp": 0.01030276, + "balance_loss_clip": 1.02681255, + "balance_loss_mlp": 1.01892114, + "epoch": 0.49439350668871185, + "flos": 16070420764800.0, + "grad_norm": 2.5607050826648314, + "language_loss": 0.72425997, + "learning_rate": 2.133291755093088e-06, + "loss": 0.74514937, + "num_input_tokens_seen": 176781770, + "step": 8223, + "time_per_iteration": 2.75307297706604 + }, + { + "auxiliary_loss_clip": 0.01059428, + "auxiliary_loss_mlp": 0.01036766, + "balance_loss_clip": 1.02610993, + "balance_loss_mlp": 1.02516675, + "epoch": 0.4944536299413798, + "flos": 20880079781760.0, + "grad_norm": 1.7476457560326766, + "language_loss": 0.74989784, + "learning_rate": 2.132903156780144e-06, + "loss": 0.77085984, + "num_input_tokens_seen": 176800655, + "step": 8224, + "time_per_iteration": 3.0667548179626465 + }, + { + "auxiliary_loss_clip": 0.01044397, + "auxiliary_loss_mlp": 0.01028768, + "balance_loss_clip": 1.02698612, + "balance_loss_mlp": 1.01775336, + "epoch": 0.4945137531940478, + "flos": 26608908856320.0, + "grad_norm": 2.066422012377059, + "language_loss": 0.6326685, + "learning_rate": 2.1325145534274997e-06, + "loss": 0.65340018, + "num_input_tokens_seen": 176820610, + "step": 8225, + "time_per_iteration": 2.859450578689575 + }, + { + "auxiliary_loss_clip": 0.01048772, + "auxiliary_loss_mlp": 0.01032101, + "balance_loss_clip": 1.02660728, + "balance_loss_mlp": 1.02118778, + "epoch": 0.49457387644671574, + "flos": 23988148738560.0, + "grad_norm": 2.0779689450845336, + "language_loss": 0.76404339, + "learning_rate": 2.1321259450498893e-06, + "loss": 0.78485215, + "num_input_tokens_seen": 176840520, + "step": 8226, + "time_per_iteration": 2.8242623805999756 + }, + { + "auxiliary_loss_clip": 0.01069879, + "auxiliary_loss_mlp": 0.01032649, + "balance_loss_clip": 1.02565217, + "balance_loss_mlp": 1.02039444, + "epoch": 0.49463399969938376, + "flos": 26976598427520.0, + "grad_norm": 1.6305344273958935, + "language_loss": 0.70741981, + "learning_rate": 2.131737331662051e-06, + "loss": 0.72844511, + "num_input_tokens_seen": 176860265, + "step": 8227, + "time_per_iteration": 2.835954189300537 + }, + { + "auxiliary_loss_clip": 0.01050419, + "auxiliary_loss_mlp": 0.01033048, + "balance_loss_clip": 1.02606344, + "balance_loss_mlp": 1.02158618, + "epoch": 0.49469412295205173, + "flos": 29681534067840.0, + "grad_norm": 1.5480287596921278, + "language_loss": 0.71460325, + "learning_rate": 2.131348713278718e-06, + "loss": 0.73543793, + "num_input_tokens_seen": 176882910, + "step": 8228, + "time_per_iteration": 2.7210519313812256 + }, + { + "auxiliary_loss_clip": 0.01066923, + "auxiliary_loss_mlp": 0.01025075, + "balance_loss_clip": 1.02543938, + "balance_loss_mlp": 1.01401258, + "epoch": 0.4947542462047197, + "flos": 24131791226880.0, + "grad_norm": 1.490630552940712, + "language_loss": 0.83793986, + "learning_rate": 2.1309600899146304e-06, + "loss": 0.85885984, + "num_input_tokens_seen": 176903030, + "step": 8229, + "time_per_iteration": 2.65669846534729 + }, + { + "auxiliary_loss_clip": 0.01058562, + "auxiliary_loss_mlp": 0.01031968, + "balance_loss_clip": 1.0251255, + "balance_loss_mlp": 1.01940894, + "epoch": 0.49481436945738766, + "flos": 20045049333120.0, + "grad_norm": 2.0661632444871465, + "language_loss": 0.74772608, + "learning_rate": 2.1305714615845227e-06, + "loss": 0.76863146, + "num_input_tokens_seen": 176919025, + "step": 8230, + "time_per_iteration": 2.7053043842315674 + }, + { + "auxiliary_loss_clip": 0.01059972, + "auxiliary_loss_mlp": 0.01026935, + "balance_loss_clip": 1.02782452, + "balance_loss_mlp": 1.01542521, + "epoch": 0.4948744927100556, + "flos": 15669550005120.0, + "grad_norm": 2.0097668336734387, + "language_loss": 0.79794991, + "learning_rate": 2.1301828283031314e-06, + "loss": 0.81881893, + "num_input_tokens_seen": 176937945, + "step": 8231, + "time_per_iteration": 2.6928420066833496 + }, + { + "auxiliary_loss_clip": 0.00999544, + "auxiliary_loss_mlp": 0.01001628, + "balance_loss_clip": 1.00205564, + "balance_loss_mlp": 1.00042427, + "epoch": 0.4949346159627236, + "flos": 68872071502080.0, + "grad_norm": 0.7539632771758306, + "language_loss": 0.60195988, + "learning_rate": 2.1297941900851944e-06, + "loss": 0.62197167, + "num_input_tokens_seen": 177004575, + "step": 8232, + "time_per_iteration": 3.356567144393921 + }, + { + "auxiliary_loss_clip": 0.01043719, + "auxiliary_loss_mlp": 0.01032326, + "balance_loss_clip": 1.02449596, + "balance_loss_mlp": 1.01987481, + "epoch": 0.49499473921539155, + "flos": 24790285307520.0, + "grad_norm": 1.5908609687083795, + "language_loss": 0.68564689, + "learning_rate": 2.1294055469454496e-06, + "loss": 0.70640743, + "num_input_tokens_seen": 177024155, + "step": 8233, + "time_per_iteration": 2.6973633766174316 + }, + { + "auxiliary_loss_clip": 0.01016612, + "auxiliary_loss_mlp": 0.01032408, + "balance_loss_clip": 1.02199936, + "balance_loss_mlp": 1.01951003, + "epoch": 0.4950548624680595, + "flos": 32707905540480.0, + "grad_norm": 2.213121656067354, + "language_loss": 0.66247904, + "learning_rate": 2.129016898898633e-06, + "loss": 0.68296921, + "num_input_tokens_seen": 177046185, + "step": 8234, + "time_per_iteration": 2.8519248962402344 + }, + { + "auxiliary_loss_clip": 0.00990051, + "auxiliary_loss_mlp": 0.01004227, + "balance_loss_clip": 1.00272036, + "balance_loss_mlp": 1.00280809, + "epoch": 0.4951149857207275, + "flos": 50082173066880.0, + "grad_norm": 0.8399924298634804, + "language_loss": 0.57967341, + "learning_rate": 2.128628245959482e-06, + "loss": 0.59961617, + "num_input_tokens_seen": 177099025, + "step": 8235, + "time_per_iteration": 3.1222739219665527 + }, + { + "auxiliary_loss_clip": 0.01034572, + "auxiliary_loss_mlp": 0.01034218, + "balance_loss_clip": 1.02352858, + "balance_loss_mlp": 1.02192712, + "epoch": 0.49517510897339545, + "flos": 22236785406720.0, + "grad_norm": 1.5691699481963053, + "language_loss": 0.76927727, + "learning_rate": 2.1282395881427355e-06, + "loss": 0.78996515, + "num_input_tokens_seen": 177118365, + "step": 8236, + "time_per_iteration": 2.700012445449829 + }, + { + "auxiliary_loss_clip": 0.01033121, + "auxiliary_loss_mlp": 0.01031699, + "balance_loss_clip": 1.02616906, + "balance_loss_mlp": 1.02032673, + "epoch": 0.4952352322260634, + "flos": 25374120969600.0, + "grad_norm": 1.7133789376378652, + "language_loss": 0.72356915, + "learning_rate": 2.1278509254631315e-06, + "loss": 0.74421734, + "num_input_tokens_seen": 177136415, + "step": 8237, + "time_per_iteration": 2.6935832500457764 + }, + { + "auxiliary_loss_clip": 0.01067591, + "auxiliary_loss_mlp": 0.01029207, + "balance_loss_clip": 1.0259819, + "balance_loss_mlp": 1.01847243, + "epoch": 0.4952953554787314, + "flos": 24608721035520.0, + "grad_norm": 1.7183087867289255, + "language_loss": 0.75850797, + "learning_rate": 2.127462257935406e-06, + "loss": 0.77947593, + "num_input_tokens_seen": 177155690, + "step": 8238, + "time_per_iteration": 2.5502121448516846 + }, + { + "auxiliary_loss_clip": 0.01034754, + "auxiliary_loss_mlp": 0.01036264, + "balance_loss_clip": 1.02530456, + "balance_loss_mlp": 1.02203596, + "epoch": 0.49535547873139935, + "flos": 17311278049920.0, + "grad_norm": 2.360756861645136, + "language_loss": 0.74324131, + "learning_rate": 2.1270735855743008e-06, + "loss": 0.76395148, + "num_input_tokens_seen": 177173350, + "step": 8239, + "time_per_iteration": 2.6341633796691895 + }, + { + "auxiliary_loss_clip": 0.00997085, + "auxiliary_loss_mlp": 0.01032777, + "balance_loss_clip": 1.0265882, + "balance_loss_mlp": 1.01924706, + "epoch": 0.4954156019840673, + "flos": 20740315962240.0, + "grad_norm": 2.1796649043179768, + "language_loss": 0.78593469, + "learning_rate": 2.126684908394552e-06, + "loss": 0.80623329, + "num_input_tokens_seen": 177191115, + "step": 8240, + "time_per_iteration": 3.009173631668091 + }, + { + "auxiliary_loss_clip": 0.01056743, + "auxiliary_loss_mlp": 0.01033948, + "balance_loss_clip": 1.02610302, + "balance_loss_mlp": 1.0228672, + "epoch": 0.49547572523673533, + "flos": 12820684567680.0, + "grad_norm": 2.400571180852278, + "language_loss": 0.85782826, + "learning_rate": 2.126296226410898e-06, + "loss": 0.87873518, + "num_input_tokens_seen": 177206155, + "step": 8241, + "time_per_iteration": 2.8722636699676514 + }, + { + "auxiliary_loss_clip": 0.01021483, + "auxiliary_loss_mlp": 0.01034457, + "balance_loss_clip": 1.02668869, + "balance_loss_mlp": 1.02327549, + "epoch": 0.4955358484894033, + "flos": 15597046402560.0, + "grad_norm": 1.8359822835566808, + "language_loss": 0.77383077, + "learning_rate": 2.1259075396380794e-06, + "loss": 0.7943902, + "num_input_tokens_seen": 177224815, + "step": 8242, + "time_per_iteration": 2.7989108562469482 + }, + { + "auxiliary_loss_clip": 0.01046197, + "auxiliary_loss_mlp": 0.00747073, + "balance_loss_clip": 1.02439857, + "balance_loss_mlp": 1.00015831, + "epoch": 0.49559597174207126, + "flos": 26464368528000.0, + "grad_norm": 2.047330502472216, + "language_loss": 0.67159092, + "learning_rate": 2.125518848090833e-06, + "loss": 0.68952358, + "num_input_tokens_seen": 177244490, + "step": 8243, + "time_per_iteration": 2.782926321029663 + }, + { + "auxiliary_loss_clip": 0.01059625, + "auxiliary_loss_mlp": 0.01025366, + "balance_loss_clip": 1.02779925, + "balance_loss_mlp": 1.0144521, + "epoch": 0.4956560949947392, + "flos": 23148234040320.0, + "grad_norm": 1.5414582956418819, + "language_loss": 0.68104291, + "learning_rate": 2.125130151783901e-06, + "loss": 0.70189285, + "num_input_tokens_seen": 177264340, + "step": 8244, + "time_per_iteration": 2.635406494140625 + }, + { + "auxiliary_loss_clip": 0.01035888, + "auxiliary_loss_mlp": 0.01032627, + "balance_loss_clip": 1.02399004, + "balance_loss_mlp": 1.02056873, + "epoch": 0.4957162182474072, + "flos": 20773461237120.0, + "grad_norm": 1.8086881860386674, + "language_loss": 0.75418466, + "learning_rate": 2.12474145073202e-06, + "loss": 0.7748698, + "num_input_tokens_seen": 177283055, + "step": 8245, + "time_per_iteration": 2.6650569438934326 + }, + { + "auxiliary_loss_clip": 0.01058496, + "auxiliary_loss_mlp": 0.01027696, + "balance_loss_clip": 1.02657509, + "balance_loss_mlp": 1.01610875, + "epoch": 0.49577634150007516, + "flos": 18734202397440.0, + "grad_norm": 2.060780336782546, + "language_loss": 0.81258577, + "learning_rate": 2.1243527449499306e-06, + "loss": 0.83344769, + "num_input_tokens_seen": 177301140, + "step": 8246, + "time_per_iteration": 2.639963388442993 + }, + { + "auxiliary_loss_clip": 0.01033567, + "auxiliary_loss_mlp": 0.01034248, + "balance_loss_clip": 1.02617478, + "balance_loss_mlp": 1.02154589, + "epoch": 0.4958364647527431, + "flos": 25554176870400.0, + "grad_norm": 1.6419133018777545, + "language_loss": 0.83531868, + "learning_rate": 2.1239640344523733e-06, + "loss": 0.85599685, + "num_input_tokens_seen": 177323095, + "step": 8247, + "time_per_iteration": 2.7257421016693115 + }, + { + "auxiliary_loss_clip": 0.01043773, + "auxiliary_loss_mlp": 0.01029542, + "balance_loss_clip": 1.02913654, + "balance_loss_mlp": 1.01809216, + "epoch": 0.4958965880054111, + "flos": 24425325169920.0, + "grad_norm": 1.7750890076785888, + "language_loss": 0.8352499, + "learning_rate": 2.123575319254087e-06, + "loss": 0.85598302, + "num_input_tokens_seen": 177339845, + "step": 8248, + "time_per_iteration": 2.79535174369812 + }, + { + "auxiliary_loss_clip": 0.01061498, + "auxiliary_loss_mlp": 0.01029268, + "balance_loss_clip": 1.02803636, + "balance_loss_mlp": 1.0176574, + "epoch": 0.49595671125807905, + "flos": 25083460114560.0, + "grad_norm": 1.791836931250138, + "language_loss": 0.73141927, + "learning_rate": 2.123186599369812e-06, + "loss": 0.75232691, + "num_input_tokens_seen": 177359980, + "step": 8249, + "time_per_iteration": 2.62614369392395 + }, + { + "auxiliary_loss_clip": 0.01052884, + "auxiliary_loss_mlp": 0.01035393, + "balance_loss_clip": 1.02860785, + "balance_loss_mlp": 1.02323329, + "epoch": 0.496016834510747, + "flos": 16435883692800.0, + "grad_norm": 1.7597300904091415, + "language_loss": 0.76097226, + "learning_rate": 2.122797874814289e-06, + "loss": 0.78185499, + "num_input_tokens_seen": 177378580, + "step": 8250, + "time_per_iteration": 2.6489439010620117 + }, + { + "auxiliary_loss_clip": 0.01071665, + "auxiliary_loss_mlp": 0.01037005, + "balance_loss_clip": 1.02751994, + "balance_loss_mlp": 1.02557898, + "epoch": 0.496076957763415, + "flos": 23437925228160.0, + "grad_norm": 1.8798569853263372, + "language_loss": 0.70132971, + "learning_rate": 2.1224091456022585e-06, + "loss": 0.7224164, + "num_input_tokens_seen": 177398790, + "step": 8251, + "time_per_iteration": 4.629547595977783 + }, + { + "auxiliary_loss_clip": 0.010397, + "auxiliary_loss_mlp": 0.00747265, + "balance_loss_clip": 1.03472352, + "balance_loss_mlp": 1.00015688, + "epoch": 0.49613708101608295, + "flos": 16909509450240.0, + "grad_norm": 1.9753348097522567, + "language_loss": 0.79807401, + "learning_rate": 2.122020411748461e-06, + "loss": 0.81594372, + "num_input_tokens_seen": 177416515, + "step": 8252, + "time_per_iteration": 4.784898042678833 + }, + { + "auxiliary_loss_clip": 0.01069798, + "auxiliary_loss_mlp": 0.01027547, + "balance_loss_clip": 1.02688956, + "balance_loss_mlp": 1.01445735, + "epoch": 0.4961972042687509, + "flos": 16618094409600.0, + "grad_norm": 1.7118912130532822, + "language_loss": 0.81045419, + "learning_rate": 2.1216316732676363e-06, + "loss": 0.83142763, + "num_input_tokens_seen": 177434425, + "step": 8253, + "time_per_iteration": 2.6446471214294434 + }, + { + "auxiliary_loss_clip": 0.01034821, + "auxiliary_loss_mlp": 0.01026678, + "balance_loss_clip": 1.02532053, + "balance_loss_mlp": 1.01548386, + "epoch": 0.49625732752141893, + "flos": 28956749437440.0, + "grad_norm": 1.701744230825782, + "language_loss": 0.67444956, + "learning_rate": 2.1212429301745275e-06, + "loss": 0.69506454, + "num_input_tokens_seen": 177459675, + "step": 8254, + "time_per_iteration": 2.897428274154663 + }, + { + "auxiliary_loss_clip": 0.01032401, + "auxiliary_loss_mlp": 0.01037232, + "balance_loss_clip": 1.0237143, + "balance_loss_mlp": 1.024405, + "epoch": 0.4963174507740869, + "flos": 23112359331840.0, + "grad_norm": 1.5868642197864278, + "language_loss": 0.74030781, + "learning_rate": 2.1208541824838743e-06, + "loss": 0.76100415, + "num_input_tokens_seen": 177478895, + "step": 8255, + "time_per_iteration": 2.733785390853882 + }, + { + "auxiliary_loss_clip": 0.01043526, + "auxiliary_loss_mlp": 0.01031371, + "balance_loss_clip": 1.0248853, + "balance_loss_mlp": 1.01955199, + "epoch": 0.49637757402675486, + "flos": 13917863450880.0, + "grad_norm": 3.7669323475004046, + "language_loss": 0.81289423, + "learning_rate": 2.1204654302104183e-06, + "loss": 0.8336432, + "num_input_tokens_seen": 177494920, + "step": 8256, + "time_per_iteration": 2.82114577293396 + }, + { + "auxiliary_loss_clip": 0.01047574, + "auxiliary_loss_mlp": 0.01025251, + "balance_loss_clip": 1.02609074, + "balance_loss_mlp": 1.01419449, + "epoch": 0.49643769727942283, + "flos": 22309001700480.0, + "grad_norm": 1.4274555269621167, + "language_loss": 0.8077662, + "learning_rate": 2.120076673368901e-06, + "loss": 0.82849443, + "num_input_tokens_seen": 177515455, + "step": 8257, + "time_per_iteration": 2.7616822719573975 + }, + { + "auxiliary_loss_clip": 0.01073772, + "auxiliary_loss_mlp": 0.01030719, + "balance_loss_clip": 1.02823329, + "balance_loss_mlp": 1.01847017, + "epoch": 0.4964978205320908, + "flos": 19500248776320.0, + "grad_norm": 1.9306591601308178, + "language_loss": 0.66487092, + "learning_rate": 2.1196879119740647e-06, + "loss": 0.68591583, + "num_input_tokens_seen": 177534040, + "step": 8258, + "time_per_iteration": 2.655294179916382 + }, + { + "auxiliary_loss_clip": 0.01056574, + "auxiliary_loss_mlp": 0.01024943, + "balance_loss_clip": 1.02606332, + "balance_loss_mlp": 1.01443434, + "epoch": 0.49655794378475876, + "flos": 23436524597760.0, + "grad_norm": 1.6392817781096782, + "language_loss": 0.77532947, + "learning_rate": 2.1192991460406502e-06, + "loss": 0.79614466, + "num_input_tokens_seen": 177554510, + "step": 8259, + "time_per_iteration": 2.7647178173065186 + }, + { + "auxiliary_loss_clip": 0.0104111, + "auxiliary_loss_mlp": 0.01028309, + "balance_loss_clip": 1.02451813, + "balance_loss_mlp": 1.01667416, + "epoch": 0.4966180670374267, + "flos": 26831124345600.0, + "grad_norm": 1.4806854122492699, + "language_loss": 0.78162885, + "learning_rate": 2.1189103755834e-06, + "loss": 0.80232298, + "num_input_tokens_seen": 177575780, + "step": 8260, + "time_per_iteration": 2.8262057304382324 + }, + { + "auxiliary_loss_clip": 0.0104342, + "auxiliary_loss_mlp": 0.01029716, + "balance_loss_clip": 1.02478111, + "balance_loss_mlp": 1.01787281, + "epoch": 0.4966781902900947, + "flos": 22009326531840.0, + "grad_norm": 2.7022691822879032, + "language_loss": 0.76184523, + "learning_rate": 2.1185216006170573e-06, + "loss": 0.78257656, + "num_input_tokens_seen": 177588965, + "step": 8261, + "time_per_iteration": 2.767374277114868 + }, + { + "auxiliary_loss_clip": 0.01022683, + "auxiliary_loss_mlp": 0.0102749, + "balance_loss_clip": 1.02491808, + "balance_loss_mlp": 1.01611185, + "epoch": 0.49673831354276266, + "flos": 26213353309440.0, + "grad_norm": 2.6818282986103728, + "language_loss": 0.8961516, + "learning_rate": 2.1181328211563627e-06, + "loss": 0.91665334, + "num_input_tokens_seen": 177608425, + "step": 8262, + "time_per_iteration": 2.796778440475464 + }, + { + "auxiliary_loss_clip": 0.01015863, + "auxiliary_loss_mlp": 0.01026916, + "balance_loss_clip": 1.02585411, + "balance_loss_mlp": 1.01621068, + "epoch": 0.4967984367954306, + "flos": 23182277155200.0, + "grad_norm": 1.4454086830585045, + "language_loss": 0.73971987, + "learning_rate": 2.11774403721606e-06, + "loss": 0.76014769, + "num_input_tokens_seen": 177628240, + "step": 8263, + "time_per_iteration": 4.515148639678955 + }, + { + "auxiliary_loss_clip": 0.0103653, + "auxiliary_loss_mlp": 0.01034386, + "balance_loss_clip": 1.03168654, + "balance_loss_mlp": 1.02133238, + "epoch": 0.4968585600480986, + "flos": 19281445079040.0, + "grad_norm": 1.992555438845069, + "language_loss": 0.70133698, + "learning_rate": 2.1173552488108923e-06, + "loss": 0.72204614, + "num_input_tokens_seen": 177645920, + "step": 8264, + "time_per_iteration": 4.439220428466797 + }, + { + "auxiliary_loss_clip": 0.01050947, + "auxiliary_loss_mlp": 0.0103166, + "balance_loss_clip": 1.02816403, + "balance_loss_mlp": 1.0198642, + "epoch": 0.49691868330076655, + "flos": 22528703237760.0, + "grad_norm": 1.421985150110164, + "language_loss": 0.64801812, + "learning_rate": 2.1169664559556007e-06, + "loss": 0.66884422, + "num_input_tokens_seen": 177667185, + "step": 8265, + "time_per_iteration": 2.73786997795105 + }, + { + "auxiliary_loss_clip": 0.00990405, + "auxiliary_loss_mlp": 0.01011115, + "balance_loss_clip": 1.00291729, + "balance_loss_mlp": 1.00951719, + "epoch": 0.4969788065534345, + "flos": 66577128675840.0, + "grad_norm": 0.9144827807429836, + "language_loss": 0.53460556, + "learning_rate": 2.1165776586649304e-06, + "loss": 0.55462074, + "num_input_tokens_seen": 177733020, + "step": 8266, + "time_per_iteration": 3.2820146083831787 + }, + { + "auxiliary_loss_clip": 0.01057948, + "auxiliary_loss_mlp": 0.01024666, + "balance_loss_clip": 1.02640939, + "balance_loss_mlp": 1.01372266, + "epoch": 0.49703892980610254, + "flos": 24059503105920.0, + "grad_norm": 1.553038462074873, + "language_loss": 0.79677057, + "learning_rate": 2.1161888569536223e-06, + "loss": 0.81759673, + "num_input_tokens_seen": 177753370, + "step": 8267, + "time_per_iteration": 2.6806952953338623 + }, + { + "auxiliary_loss_clip": 0.01053305, + "auxiliary_loss_mlp": 0.01027836, + "balance_loss_clip": 1.02899969, + "balance_loss_mlp": 1.0154326, + "epoch": 0.4970990530587705, + "flos": 29126174912640.0, + "grad_norm": 2.18318189327721, + "language_loss": 0.7516762, + "learning_rate": 2.1158000508364223e-06, + "loss": 0.77248764, + "num_input_tokens_seen": 177771530, + "step": 8268, + "time_per_iteration": 2.8767476081848145 + }, + { + "auxiliary_loss_clip": 0.01058393, + "auxiliary_loss_mlp": 0.0074741, + "balance_loss_clip": 1.02510273, + "balance_loss_mlp": 1.00013113, + "epoch": 0.49715917631143847, + "flos": 46026167258880.0, + "grad_norm": 1.4385171059337998, + "language_loss": 0.67715752, + "learning_rate": 2.115411240328073e-06, + "loss": 0.69521552, + "num_input_tokens_seen": 177796355, + "step": 8269, + "time_per_iteration": 2.8412203788757324 + }, + { + "auxiliary_loss_clip": 0.01046903, + "auxiliary_loss_mlp": 0.01032669, + "balance_loss_clip": 1.02609682, + "balance_loss_mlp": 1.02133179, + "epoch": 0.49721929956410643, + "flos": 20191277600640.0, + "grad_norm": 1.9538900995871995, + "language_loss": 0.85384202, + "learning_rate": 2.1150224254433167e-06, + "loss": 0.87463772, + "num_input_tokens_seen": 177814300, + "step": 8270, + "time_per_iteration": 2.823207139968872 + }, + { + "auxiliary_loss_clip": 0.01027601, + "auxiliary_loss_mlp": 0.00747207, + "balance_loss_clip": 1.02561843, + "balance_loss_mlp": 1.00011873, + "epoch": 0.4972794228167744, + "flos": 21653560275840.0, + "grad_norm": 1.8111305923322292, + "language_loss": 0.71115595, + "learning_rate": 2.114633606196899e-06, + "loss": 0.72890401, + "num_input_tokens_seen": 177833615, + "step": 8271, + "time_per_iteration": 2.8336167335510254 + }, + { + "auxiliary_loss_clip": 0.01051254, + "auxiliary_loss_mlp": 0.01029376, + "balance_loss_clip": 1.02547383, + "balance_loss_mlp": 1.01741326, + "epoch": 0.49733954606944236, + "flos": 24279743347200.0, + "grad_norm": 1.3974040921847242, + "language_loss": 0.78196502, + "learning_rate": 2.1142447826035635e-06, + "loss": 0.80277133, + "num_input_tokens_seen": 177855315, + "step": 8272, + "time_per_iteration": 2.647054672241211 + }, + { + "auxiliary_loss_clip": 0.01035497, + "auxiliary_loss_mlp": 0.01034384, + "balance_loss_clip": 1.02761626, + "balance_loss_mlp": 1.02289844, + "epoch": 0.4973996693221103, + "flos": 37852575730560.0, + "grad_norm": 1.976105667438249, + "language_loss": 0.66719615, + "learning_rate": 2.1138559546780544e-06, + "loss": 0.687895, + "num_input_tokens_seen": 177875590, + "step": 8273, + "time_per_iteration": 2.8147342205047607 + }, + { + "auxiliary_loss_clip": 0.01030648, + "auxiliary_loss_mlp": 0.0103372, + "balance_loss_clip": 1.02541566, + "balance_loss_mlp": 1.02303314, + "epoch": 0.4974597925747783, + "flos": 21361426963200.0, + "grad_norm": 1.916190290232849, + "language_loss": 0.77933645, + "learning_rate": 2.1134671224351163e-06, + "loss": 0.79998016, + "num_input_tokens_seen": 177894175, + "step": 8274, + "time_per_iteration": 2.7762844562530518 + }, + { + "auxiliary_loss_clip": 0.01036517, + "auxiliary_loss_mlp": 0.01032627, + "balance_loss_clip": 1.02475357, + "balance_loss_mlp": 1.02063429, + "epoch": 0.49751991582744626, + "flos": 30738133560960.0, + "grad_norm": 1.7710291239218243, + "language_loss": 0.75606817, + "learning_rate": 2.113078285889493e-06, + "loss": 0.77675956, + "num_input_tokens_seen": 177913920, + "step": 8275, + "time_per_iteration": 2.729400157928467 + }, + { + "auxiliary_loss_clip": 0.01063204, + "auxiliary_loss_mlp": 0.01035778, + "balance_loss_clip": 1.0275321, + "balance_loss_mlp": 1.02292109, + "epoch": 0.4975800390801142, + "flos": 14100541044480.0, + "grad_norm": 1.9694466536509674, + "language_loss": 0.83501124, + "learning_rate": 2.1126894450559303e-06, + "loss": 0.85600114, + "num_input_tokens_seen": 177930425, + "step": 8276, + "time_per_iteration": 2.5421791076660156 + }, + { + "auxiliary_loss_clip": 0.01065236, + "auxiliary_loss_mlp": 0.00747282, + "balance_loss_clip": 1.02568614, + "balance_loss_mlp": 1.00010753, + "epoch": 0.4976401623327822, + "flos": 24207275658240.0, + "grad_norm": 1.4341723019602537, + "language_loss": 0.7004177, + "learning_rate": 2.112300599949172e-06, + "loss": 0.71854293, + "num_input_tokens_seen": 177949885, + "step": 8277, + "time_per_iteration": 2.5960845947265625 + }, + { + "auxiliary_loss_clip": 0.01057178, + "auxiliary_loss_mlp": 0.01032109, + "balance_loss_clip": 1.02600598, + "balance_loss_mlp": 1.02095103, + "epoch": 0.49770028558545015, + "flos": 21136769349120.0, + "grad_norm": 1.7534744880712205, + "language_loss": 0.82413781, + "learning_rate": 2.111911750583964e-06, + "loss": 0.84503067, + "num_input_tokens_seen": 177965720, + "step": 8278, + "time_per_iteration": 2.819490671157837 + }, + { + "auxiliary_loss_clip": 0.01060945, + "auxiliary_loss_mlp": 0.01030135, + "balance_loss_clip": 1.02632451, + "balance_loss_mlp": 1.01891112, + "epoch": 0.4977604088381181, + "flos": 16763927627520.0, + "grad_norm": 2.0517185469946644, + "language_loss": 0.67609763, + "learning_rate": 2.111522896975052e-06, + "loss": 0.69700843, + "num_input_tokens_seen": 177983190, + "step": 8279, + "time_per_iteration": 2.601571798324585 + }, + { + "auxiliary_loss_clip": 0.01057668, + "auxiliary_loss_mlp": 0.01032722, + "balance_loss_clip": 1.02390468, + "balance_loss_mlp": 1.02049708, + "epoch": 0.49782053209078614, + "flos": 15703521292800.0, + "grad_norm": 2.623196868724821, + "language_loss": 0.70439273, + "learning_rate": 2.1111340391371794e-06, + "loss": 0.72529662, + "num_input_tokens_seen": 178000155, + "step": 8280, + "time_per_iteration": 2.5641114711761475 + }, + { + "auxiliary_loss_clip": 0.01036481, + "auxiliary_loss_mlp": 0.01030505, + "balance_loss_clip": 1.02312207, + "balance_loss_mlp": 1.01916265, + "epoch": 0.4978806553434541, + "flos": 24753692327040.0, + "grad_norm": 1.612099017983372, + "language_loss": 0.6506021, + "learning_rate": 2.1107451770850936e-06, + "loss": 0.67127192, + "num_input_tokens_seen": 178021060, + "step": 8281, + "time_per_iteration": 2.730646848678589 + }, + { + "auxiliary_loss_clip": 0.01059351, + "auxiliary_loss_mlp": 0.01032445, + "balance_loss_clip": 1.02543867, + "balance_loss_mlp": 1.02038145, + "epoch": 0.49794077859612207, + "flos": 13115726881920.0, + "grad_norm": 2.1205389999481103, + "language_loss": 0.72846496, + "learning_rate": 2.1103563108335387e-06, + "loss": 0.74938297, + "num_input_tokens_seen": 178038180, + "step": 8282, + "time_per_iteration": 2.5346217155456543 + }, + { + "auxiliary_loss_clip": 0.01044823, + "auxiliary_loss_mlp": 0.01027823, + "balance_loss_clip": 1.02541995, + "balance_loss_mlp": 1.01765466, + "epoch": 0.49800090184879003, + "flos": 27525133998720.0, + "grad_norm": 1.447654868297226, + "language_loss": 0.72987115, + "learning_rate": 2.109967440397263e-06, + "loss": 0.75059754, + "num_input_tokens_seen": 178057565, + "step": 8283, + "time_per_iteration": 2.7243926525115967 + }, + { + "auxiliary_loss_clip": 0.01008755, + "auxiliary_loss_mlp": 0.01043868, + "balance_loss_clip": 1.02000332, + "balance_loss_mlp": 1.03021264, + "epoch": 0.498061025101458, + "flos": 19792489829760.0, + "grad_norm": 1.5921219591750568, + "language_loss": 0.78890932, + "learning_rate": 2.1095785657910095e-06, + "loss": 0.80943549, + "num_input_tokens_seen": 178076965, + "step": 8284, + "time_per_iteration": 2.7746431827545166 + }, + { + "auxiliary_loss_clip": 0.01053029, + "auxiliary_loss_mlp": 0.01035641, + "balance_loss_clip": 1.02756381, + "balance_loss_mlp": 1.02303469, + "epoch": 0.49812114835412596, + "flos": 29893909230720.0, + "grad_norm": 1.714495147317186, + "language_loss": 0.73407376, + "learning_rate": 2.109189687029526e-06, + "loss": 0.75496042, + "num_input_tokens_seen": 178095105, + "step": 8285, + "time_per_iteration": 2.694783926010132 + }, + { + "auxiliary_loss_clip": 0.01054115, + "auxiliary_loss_mlp": 0.01027448, + "balance_loss_clip": 1.02756286, + "balance_loss_mlp": 1.01546705, + "epoch": 0.49818127160679393, + "flos": 23147048891520.0, + "grad_norm": 1.9254960770481035, + "language_loss": 0.73905289, + "learning_rate": 2.1088008041275598e-06, + "loss": 0.7598685, + "num_input_tokens_seen": 178114505, + "step": 8286, + "time_per_iteration": 2.694274663925171 + }, + { + "auxiliary_loss_clip": 0.01050375, + "auxiliary_loss_mlp": 0.01041067, + "balance_loss_clip": 1.02718103, + "balance_loss_mlp": 1.02942634, + "epoch": 0.4982413948594619, + "flos": 21652806090240.0, + "grad_norm": 1.8236702719636617, + "language_loss": 0.85261941, + "learning_rate": 2.1084119170998545e-06, + "loss": 0.87353384, + "num_input_tokens_seen": 178131595, + "step": 8287, + "time_per_iteration": 2.6240689754486084 + }, + { + "auxiliary_loss_clip": 0.01017586, + "auxiliary_loss_mlp": 0.01027831, + "balance_loss_clip": 1.02326977, + "balance_loss_mlp": 1.01617217, + "epoch": 0.49830151811212986, + "flos": 32486982940800.0, + "grad_norm": 1.5544843438526046, + "language_loss": 0.72140419, + "learning_rate": 2.108023025961159e-06, + "loss": 0.74185824, + "num_input_tokens_seen": 178152055, + "step": 8288, + "time_per_iteration": 2.8016927242279053 + }, + { + "auxiliary_loss_clip": 0.01049569, + "auxiliary_loss_mlp": 0.01033575, + "balance_loss_clip": 1.02552223, + "balance_loss_mlp": 1.02068281, + "epoch": 0.4983616413647978, + "flos": 18142358002560.0, + "grad_norm": 3.600414201923748, + "language_loss": 0.80764008, + "learning_rate": 2.10763413072622e-06, + "loss": 0.82847154, + "num_input_tokens_seen": 178168150, + "step": 8289, + "time_per_iteration": 2.728888750076294 + }, + { + "auxiliary_loss_clip": 0.010572, + "auxiliary_loss_mlp": 0.01031774, + "balance_loss_clip": 1.02429473, + "balance_loss_mlp": 1.0201273, + "epoch": 0.4984217646174658, + "flos": 19718836992000.0, + "grad_norm": 2.2225822157254025, + "language_loss": 0.72811091, + "learning_rate": 2.107245231409784e-06, + "loss": 0.74900067, + "num_input_tokens_seen": 178186150, + "step": 8290, + "time_per_iteration": 2.604452133178711 + }, + { + "auxiliary_loss_clip": 0.01062789, + "auxiliary_loss_mlp": 0.0103833, + "balance_loss_clip": 1.02870011, + "balance_loss_mlp": 1.02482355, + "epoch": 0.49848188787013376, + "flos": 24936549488640.0, + "grad_norm": 1.5454585935858556, + "language_loss": 0.84287578, + "learning_rate": 2.106856328026598e-06, + "loss": 0.86388695, + "num_input_tokens_seen": 178207665, + "step": 8291, + "time_per_iteration": 2.625037431716919 + }, + { + "auxiliary_loss_clip": 0.01043037, + "auxiliary_loss_mlp": 0.01040185, + "balance_loss_clip": 1.02508414, + "balance_loss_mlp": 1.02583814, + "epoch": 0.4985420111228017, + "flos": 22382439056640.0, + "grad_norm": 1.7452408572304718, + "language_loss": 0.67060101, + "learning_rate": 2.106467420591409e-06, + "loss": 0.69143319, + "num_input_tokens_seen": 178226325, + "step": 8292, + "time_per_iteration": 2.6852943897247314 + }, + { + "auxiliary_loss_clip": 0.01070327, + "auxiliary_loss_mlp": 0.01030504, + "balance_loss_clip": 1.02770162, + "balance_loss_mlp": 1.01984692, + "epoch": 0.4986021343754697, + "flos": 16216469464320.0, + "grad_norm": 1.6910018078250837, + "language_loss": 0.6711114, + "learning_rate": 2.106078509118965e-06, + "loss": 0.69211972, + "num_input_tokens_seen": 178244960, + "step": 8293, + "time_per_iteration": 2.5779550075531006 + }, + { + "auxiliary_loss_clip": 0.01053191, + "auxiliary_loss_mlp": 0.01028101, + "balance_loss_clip": 1.02572405, + "balance_loss_mlp": 1.0162096, + "epoch": 0.4986622576281377, + "flos": 23403594804480.0, + "grad_norm": 2.062914978125289, + "language_loss": 0.82233763, + "learning_rate": 2.1056895936240133e-06, + "loss": 0.84315056, + "num_input_tokens_seen": 178265400, + "step": 8294, + "time_per_iteration": 2.645420551300049 + }, + { + "auxiliary_loss_clip": 0.01058406, + "auxiliary_loss_mlp": 0.01025236, + "balance_loss_clip": 1.02449489, + "balance_loss_mlp": 1.01268888, + "epoch": 0.49872238088080567, + "flos": 19974556892160.0, + "grad_norm": 1.97553297102717, + "language_loss": 0.7273401, + "learning_rate": 2.1053006741213016e-06, + "loss": 0.74817652, + "num_input_tokens_seen": 178284535, + "step": 8295, + "time_per_iteration": 2.7111220359802246 + }, + { + "auxiliary_loss_clip": 0.01012602, + "auxiliary_loss_mlp": 0.01029645, + "balance_loss_clip": 1.02567077, + "balance_loss_mlp": 1.01865339, + "epoch": 0.49878250413347364, + "flos": 22893016930560.0, + "grad_norm": 1.906646226914591, + "language_loss": 0.67593175, + "learning_rate": 2.1049117506255775e-06, + "loss": 0.69635415, + "num_input_tokens_seen": 178302425, + "step": 8296, + "time_per_iteration": 2.7648088932037354 + }, + { + "auxiliary_loss_clip": 0.01045777, + "auxiliary_loss_mlp": 0.01032577, + "balance_loss_clip": 1.02656806, + "balance_loss_mlp": 1.02034652, + "epoch": 0.4988426273861416, + "flos": 32598449821440.0, + "grad_norm": 2.3415015908838486, + "language_loss": 0.64928645, + "learning_rate": 2.1045228231515895e-06, + "loss": 0.67006993, + "num_input_tokens_seen": 178323065, + "step": 8297, + "time_per_iteration": 2.715425968170166 + }, + { + "auxiliary_loss_clip": 0.01019729, + "auxiliary_loss_mlp": 0.01028512, + "balance_loss_clip": 1.0244838, + "balance_loss_mlp": 1.01846874, + "epoch": 0.49890275063880957, + "flos": 20923604087040.0, + "grad_norm": 1.9073292771730619, + "language_loss": 0.6948815, + "learning_rate": 2.1041338917140857e-06, + "loss": 0.71536392, + "num_input_tokens_seen": 178343985, + "step": 8298, + "time_per_iteration": 4.433349609375 + }, + { + "auxiliary_loss_clip": 0.01065921, + "auxiliary_loss_mlp": 0.010315, + "balance_loss_clip": 1.02513456, + "balance_loss_mlp": 1.02046704, + "epoch": 0.49896287389147753, + "flos": 18624459369600.0, + "grad_norm": 2.0729740921316044, + "language_loss": 0.84583044, + "learning_rate": 2.103744956327814e-06, + "loss": 0.86680466, + "num_input_tokens_seen": 178362345, + "step": 8299, + "time_per_iteration": 2.7001378536224365 + }, + { + "auxiliary_loss_clip": 0.01039356, + "auxiliary_loss_mlp": 0.01036881, + "balance_loss_clip": 1.02581871, + "balance_loss_mlp": 1.02407157, + "epoch": 0.4990229971441455, + "flos": 24826555065600.0, + "grad_norm": 2.577626543558763, + "language_loss": 0.69130123, + "learning_rate": 2.1033560170075234e-06, + "loss": 0.71206355, + "num_input_tokens_seen": 178383190, + "step": 8300, + "time_per_iteration": 4.439751148223877 + }, + { + "auxiliary_loss_clip": 0.01002905, + "auxiliary_loss_mlp": 0.01007452, + "balance_loss_clip": 1.01538336, + "balance_loss_mlp": 1.00595605, + "epoch": 0.49908312039681346, + "flos": 71384525136000.0, + "grad_norm": 0.764757640911397, + "language_loss": 0.5121361, + "learning_rate": 2.1029670737679623e-06, + "loss": 0.53223968, + "num_input_tokens_seen": 178444250, + "step": 8301, + "time_per_iteration": 3.2863926887512207 + }, + { + "auxiliary_loss_clip": 0.01046153, + "auxiliary_loss_mlp": 0.01034354, + "balance_loss_clip": 1.02540827, + "balance_loss_mlp": 1.02285028, + "epoch": 0.4991432436494814, + "flos": 19828651847040.0, + "grad_norm": 1.8969769298226338, + "language_loss": 0.84079719, + "learning_rate": 2.102578126623879e-06, + "loss": 0.86160231, + "num_input_tokens_seen": 178463250, + "step": 8302, + "time_per_iteration": 2.668926477432251 + }, + { + "auxiliary_loss_clip": 0.01058863, + "auxiliary_loss_mlp": 0.01028655, + "balance_loss_clip": 1.02814174, + "balance_loss_mlp": 1.01851082, + "epoch": 0.4992033669021494, + "flos": 15121912273920.0, + "grad_norm": 1.6151051938399688, + "language_loss": 0.69118643, + "learning_rate": 2.102189175590024e-06, + "loss": 0.71206164, + "num_input_tokens_seen": 178481340, + "step": 8303, + "time_per_iteration": 2.6005094051361084 + }, + { + "auxiliary_loss_clip": 0.01070649, + "auxiliary_loss_mlp": 0.01027867, + "balance_loss_clip": 1.02777016, + "balance_loss_mlp": 1.01645279, + "epoch": 0.49926349015481736, + "flos": 31207952476800.0, + "grad_norm": 1.8319767733496213, + "language_loss": 0.72688884, + "learning_rate": 2.101800220681144e-06, + "loss": 0.74787402, + "num_input_tokens_seen": 178501545, + "step": 8304, + "time_per_iteration": 2.667341947555542 + }, + { + "auxiliary_loss_clip": 0.01059255, + "auxiliary_loss_mlp": 0.01034593, + "balance_loss_clip": 1.02693856, + "balance_loss_mlp": 1.0239594, + "epoch": 0.4993236134074853, + "flos": 24900207903360.0, + "grad_norm": 2.0094141437213016, + "language_loss": 0.80445433, + "learning_rate": 2.10141126191199e-06, + "loss": 0.82539284, + "num_input_tokens_seen": 178519700, + "step": 8305, + "time_per_iteration": 2.6238391399383545 + }, + { + "auxiliary_loss_clip": 0.00984613, + "auxiliary_loss_mlp": 0.01003646, + "balance_loss_clip": 1.00618148, + "balance_loss_mlp": 1.00221527, + "epoch": 0.4993837366601533, + "flos": 70420573797120.0, + "grad_norm": 0.8624189356522187, + "language_loss": 0.56895638, + "learning_rate": 2.1010222992973107e-06, + "loss": 0.58883899, + "num_input_tokens_seen": 178576740, + "step": 8306, + "time_per_iteration": 3.3615102767944336 + }, + { + "auxiliary_loss_clip": 0.01071215, + "auxiliary_loss_mlp": 0.01033989, + "balance_loss_clip": 1.02921093, + "balance_loss_mlp": 1.02194238, + "epoch": 0.4994438599128213, + "flos": 15961216440960.0, + "grad_norm": 1.7473028248762623, + "language_loss": 0.82576835, + "learning_rate": 2.1006333328518556e-06, + "loss": 0.84682041, + "num_input_tokens_seen": 178594745, + "step": 8307, + "time_per_iteration": 2.506294012069702 + }, + { + "auxiliary_loss_clip": 0.01068501, + "auxiliary_loss_mlp": 0.01030526, + "balance_loss_clip": 1.02672863, + "balance_loss_mlp": 1.01885498, + "epoch": 0.4995039831654893, + "flos": 27928303228800.0, + "grad_norm": 1.6874712157362481, + "language_loss": 0.61034244, + "learning_rate": 2.1002443625903748e-06, + "loss": 0.6313327, + "num_input_tokens_seen": 178614110, + "step": 8308, + "time_per_iteration": 2.663886308670044 + }, + { + "auxiliary_loss_clip": 0.01066215, + "auxiliary_loss_mlp": 0.01029188, + "balance_loss_clip": 1.02577913, + "balance_loss_mlp": 1.01852465, + "epoch": 0.49956410641815724, + "flos": 24204797619840.0, + "grad_norm": 1.5520662259096898, + "language_loss": 0.74472386, + "learning_rate": 2.0998553885276168e-06, + "loss": 0.76567787, + "num_input_tokens_seen": 178634170, + "step": 8309, + "time_per_iteration": 2.589956521987915 + }, + { + "auxiliary_loss_clip": 0.0104724, + "auxiliary_loss_mlp": 0.01030992, + "balance_loss_clip": 1.02587664, + "balance_loss_mlp": 1.01997662, + "epoch": 0.4996242296708252, + "flos": 16180127879040.0, + "grad_norm": 2.1821325122271964, + "language_loss": 0.79574525, + "learning_rate": 2.0994664106783335e-06, + "loss": 0.81652755, + "num_input_tokens_seen": 178651775, + "step": 8310, + "time_per_iteration": 4.260061502456665 + }, + { + "auxiliary_loss_clip": 0.01055512, + "auxiliary_loss_mlp": 0.0103405, + "balance_loss_clip": 1.02818489, + "balance_loss_mlp": 1.02302325, + "epoch": 0.49968435292349317, + "flos": 16873527000960.0, + "grad_norm": 1.7430101818270305, + "language_loss": 0.70800996, + "learning_rate": 2.0990774290572735e-06, + "loss": 0.72890562, + "num_input_tokens_seen": 178669720, + "step": 8311, + "time_per_iteration": 2.7270619869232178 + }, + { + "auxiliary_loss_clip": 0.01049802, + "auxiliary_loss_mlp": 0.0103459, + "balance_loss_clip": 1.02919483, + "balance_loss_mlp": 1.02392042, + "epoch": 0.49974447617616113, + "flos": 14939521989120.0, + "grad_norm": 1.8513957681989412, + "language_loss": 0.77340806, + "learning_rate": 2.098688443679187e-06, + "loss": 0.79425192, + "num_input_tokens_seen": 178686765, + "step": 8312, + "time_per_iteration": 4.292522668838501 + }, + { + "auxiliary_loss_clip": 0.01040383, + "auxiliary_loss_mlp": 0.01031775, + "balance_loss_clip": 1.02913284, + "balance_loss_mlp": 1.02008677, + "epoch": 0.4998045994288291, + "flos": 26651535321600.0, + "grad_norm": 1.7358607839404385, + "language_loss": 0.84435308, + "learning_rate": 2.0982994545588256e-06, + "loss": 0.86507463, + "num_input_tokens_seen": 178705845, + "step": 8313, + "time_per_iteration": 2.732081651687622 + }, + { + "auxiliary_loss_clip": 0.01049009, + "auxiliary_loss_mlp": 0.01030383, + "balance_loss_clip": 1.02642822, + "balance_loss_mlp": 1.01849174, + "epoch": 0.49986472268149706, + "flos": 20953768533120.0, + "grad_norm": 1.837602717994567, + "language_loss": 0.80831856, + "learning_rate": 2.097910461710939e-06, + "loss": 0.82911247, + "num_input_tokens_seen": 178723410, + "step": 8314, + "time_per_iteration": 2.8825464248657227 + }, + { + "auxiliary_loss_clip": 0.01041213, + "auxiliary_loss_mlp": 0.00747346, + "balance_loss_clip": 1.02771091, + "balance_loss_mlp": 1.00005889, + "epoch": 0.49992484593416503, + "flos": 22783884433920.0, + "grad_norm": 2.081709960647792, + "language_loss": 0.79446805, + "learning_rate": 2.0975214651502773e-06, + "loss": 0.81235361, + "num_input_tokens_seen": 178743560, + "step": 8315, + "time_per_iteration": 2.691037654876709 + }, + { + "auxiliary_loss_clip": 0.01071183, + "auxiliary_loss_mlp": 0.01031796, + "balance_loss_clip": 1.03023529, + "balance_loss_mlp": 1.02091777, + "epoch": 0.499984969186833, + "flos": 46786970252160.0, + "grad_norm": 1.6368681423458467, + "language_loss": 0.74668986, + "learning_rate": 2.0971324648915926e-06, + "loss": 0.76771969, + "num_input_tokens_seen": 178767225, + "step": 8316, + "time_per_iteration": 2.845257043838501 + }, + { + "auxiliary_loss_clip": 0.01057761, + "auxiliary_loss_mlp": 0.01030662, + "balance_loss_clip": 1.02838373, + "balance_loss_mlp": 1.0198797, + "epoch": 0.500045092439501, + "flos": 25556978131200.0, + "grad_norm": 1.4984754885080247, + "language_loss": 0.81254089, + "learning_rate": 2.0967434609496343e-06, + "loss": 0.83342516, + "num_input_tokens_seen": 178786810, + "step": 8317, + "time_per_iteration": 2.614863157272339 + }, + { + "auxiliary_loss_clip": 0.01046391, + "auxiliary_loss_mlp": 0.01036783, + "balance_loss_clip": 1.02520204, + "balance_loss_mlp": 1.02455807, + "epoch": 0.5001052156921689, + "flos": 20704764476160.0, + "grad_norm": 1.61771911439231, + "language_loss": 0.83340925, + "learning_rate": 2.0963544533391548e-06, + "loss": 0.85424101, + "num_input_tokens_seen": 178805660, + "step": 8318, + "time_per_iteration": 2.6165037155151367 + }, + { + "auxiliary_loss_clip": 0.01059587, + "auxiliary_loss_mlp": 0.01028261, + "balance_loss_clip": 1.02720618, + "balance_loss_mlp": 1.0178659, + "epoch": 0.500165338944837, + "flos": 21251109317760.0, + "grad_norm": 1.810409877653417, + "language_loss": 0.81601512, + "learning_rate": 2.0959654420749045e-06, + "loss": 0.83689356, + "num_input_tokens_seen": 178824780, + "step": 8319, + "time_per_iteration": 2.7091264724731445 + }, + { + "auxiliary_loss_clip": 0.01024031, + "auxiliary_loss_mlp": 0.01030625, + "balance_loss_clip": 1.02227354, + "balance_loss_mlp": 1.01946735, + "epoch": 0.5002254621975049, + "flos": 27854398995840.0, + "grad_norm": 1.5572411634642458, + "language_loss": 0.71448219, + "learning_rate": 2.095576427171635e-06, + "loss": 0.7350288, + "num_input_tokens_seen": 178845640, + "step": 8320, + "time_per_iteration": 2.663489818572998 + }, + { + "auxiliary_loss_clip": 0.0103764, + "auxiliary_loss_mlp": 0.01042035, + "balance_loss_clip": 1.02634788, + "balance_loss_mlp": 1.02898192, + "epoch": 0.5002855854501729, + "flos": 15551941898880.0, + "grad_norm": 2.7345712095190287, + "language_loss": 0.76623833, + "learning_rate": 2.0951874086440978e-06, + "loss": 0.78703511, + "num_input_tokens_seen": 178862290, + "step": 8321, + "time_per_iteration": 2.609381914138794 + }, + { + "auxiliary_loss_clip": 0.01060149, + "auxiliary_loss_mlp": 0.00747178, + "balance_loss_clip": 1.02680314, + "balance_loss_mlp": 1.00003052, + "epoch": 0.5003457087028408, + "flos": 16107408794880.0, + "grad_norm": 1.7055033552017709, + "language_loss": 0.83199704, + "learning_rate": 2.0947983865070455e-06, + "loss": 0.85007036, + "num_input_tokens_seen": 178879805, + "step": 8322, + "time_per_iteration": 2.642484188079834 + }, + { + "auxiliary_loss_clip": 0.01060256, + "auxiliary_loss_mlp": 0.01032236, + "balance_loss_clip": 1.02618802, + "balance_loss_mlp": 1.02074993, + "epoch": 0.5004058319555088, + "flos": 22710518904960.0, + "grad_norm": 2.357158298558461, + "language_loss": 0.74073291, + "learning_rate": 2.094409360775228e-06, + "loss": 0.76165783, + "num_input_tokens_seen": 178896985, + "step": 8323, + "time_per_iteration": 2.645988702774048 + }, + { + "auxiliary_loss_clip": 0.01038685, + "auxiliary_loss_mlp": 0.01032704, + "balance_loss_clip": 1.0270555, + "balance_loss_mlp": 1.02069986, + "epoch": 0.5004659552081767, + "flos": 30117956313600.0, + "grad_norm": 1.467691080954881, + "language_loss": 0.68874443, + "learning_rate": 2.0940203314633977e-06, + "loss": 0.70945829, + "num_input_tokens_seen": 178920605, + "step": 8324, + "time_per_iteration": 2.67826509475708 + }, + { + "auxiliary_loss_clip": 0.01057614, + "auxiliary_loss_mlp": 0.00747316, + "balance_loss_clip": 1.02482855, + "balance_loss_mlp": 1.00005531, + "epoch": 0.5005260784608447, + "flos": 18624710764800.0, + "grad_norm": 2.7345785499740747, + "language_loss": 0.72524667, + "learning_rate": 2.0936312985863077e-06, + "loss": 0.74329603, + "num_input_tokens_seen": 178937760, + "step": 8325, + "time_per_iteration": 2.5687477588653564 + }, + { + "auxiliary_loss_clip": 0.01032519, + "auxiliary_loss_mlp": 0.01035108, + "balance_loss_clip": 1.02348328, + "balance_loss_mlp": 1.02229297, + "epoch": 0.5005862017135126, + "flos": 24859987649280.0, + "grad_norm": 1.5530610925675494, + "language_loss": 0.73458022, + "learning_rate": 2.093242262158709e-06, + "loss": 0.75525647, + "num_input_tokens_seen": 178957985, + "step": 8326, + "time_per_iteration": 2.6690454483032227 + }, + { + "auxiliary_loss_clip": 0.0104535, + "auxiliary_loss_mlp": 0.01028798, + "balance_loss_clip": 1.02547204, + "balance_loss_mlp": 1.0179379, + "epoch": 0.5006463249661807, + "flos": 18734381965440.0, + "grad_norm": 1.479492234983845, + "language_loss": 0.77853322, + "learning_rate": 2.0928532221953544e-06, + "loss": 0.79927468, + "num_input_tokens_seen": 178977070, + "step": 8327, + "time_per_iteration": 2.6883397102355957 + }, + { + "auxiliary_loss_clip": 0.01071803, + "auxiliary_loss_mlp": 0.01034834, + "balance_loss_clip": 1.028247, + "balance_loss_mlp": 1.02300262, + "epoch": 0.5007064482188487, + "flos": 13042145871360.0, + "grad_norm": 3.5480673738709574, + "language_loss": 0.87202907, + "learning_rate": 2.092464178710997e-06, + "loss": 0.89309543, + "num_input_tokens_seen": 178994175, + "step": 8328, + "time_per_iteration": 2.5759618282318115 + }, + { + "auxiliary_loss_clip": 0.01042607, + "auxiliary_loss_mlp": 0.01031263, + "balance_loss_clip": 1.02802968, + "balance_loss_mlp": 1.01927066, + "epoch": 0.5007665714715166, + "flos": 21288671965440.0, + "grad_norm": 2.2305140833805988, + "language_loss": 0.74582291, + "learning_rate": 2.092075131720388e-06, + "loss": 0.76656163, + "num_input_tokens_seen": 179013710, + "step": 8329, + "time_per_iteration": 2.7726569175720215 + }, + { + "auxiliary_loss_clip": 0.01067236, + "auxiliary_loss_mlp": 0.01027397, + "balance_loss_clip": 1.02660477, + "balance_loss_mlp": 1.01636362, + "epoch": 0.5008266947241846, + "flos": 29754576374400.0, + "grad_norm": 2.06095436889788, + "language_loss": 0.795964, + "learning_rate": 2.091686081238281e-06, + "loss": 0.81691027, + "num_input_tokens_seen": 179035255, + "step": 8330, + "time_per_iteration": 2.74623441696167 + }, + { + "auxiliary_loss_clip": 0.0099812, + "auxiliary_loss_mlp": 0.00746207, + "balance_loss_clip": 1.01109767, + "balance_loss_mlp": 0.9999243, + "epoch": 0.5008868179768525, + "flos": 63557829204480.0, + "grad_norm": 0.735888306424808, + "language_loss": 0.5609802, + "learning_rate": 2.0912970272794282e-06, + "loss": 0.5784235, + "num_input_tokens_seen": 179090915, + "step": 8331, + "time_per_iteration": 3.194253444671631 + }, + { + "auxiliary_loss_clip": 0.0105774, + "auxiliary_loss_mlp": 0.01029153, + "balance_loss_clip": 1.0264883, + "balance_loss_mlp": 1.01847219, + "epoch": 0.5009469412295205, + "flos": 27375637593600.0, + "grad_norm": 1.8777322307794624, + "language_loss": 0.65014911, + "learning_rate": 2.0909079698585833e-06, + "loss": 0.67101806, + "num_input_tokens_seen": 179109160, + "step": 8332, + "time_per_iteration": 2.8620123863220215 + }, + { + "auxiliary_loss_clip": 0.01065756, + "auxiliary_loss_mlp": 0.01031423, + "balance_loss_clip": 1.02539885, + "balance_loss_mlp": 1.02084923, + "epoch": 0.5010070644821885, + "flos": 27378833904000.0, + "grad_norm": 1.6332617279773725, + "language_loss": 0.74967498, + "learning_rate": 2.0905189089904993e-06, + "loss": 0.77064681, + "num_input_tokens_seen": 179130610, + "step": 8333, + "time_per_iteration": 2.732405185699463 + }, + { + "auxiliary_loss_clip": 0.01069355, + "auxiliary_loss_mlp": 0.01027491, + "balance_loss_clip": 1.0261693, + "balance_loss_mlp": 1.01641023, + "epoch": 0.5010671877348565, + "flos": 20662748542080.0, + "grad_norm": 2.055789356387094, + "language_loss": 0.80514801, + "learning_rate": 2.090129844689929e-06, + "loss": 0.82611644, + "num_input_tokens_seen": 179147860, + "step": 8334, + "time_per_iteration": 2.7976911067962646 + }, + { + "auxiliary_loss_clip": 0.0100048, + "auxiliary_loss_mlp": 0.01001502, + "balance_loss_clip": 1.00247025, + "balance_loss_mlp": 1.00013757, + "epoch": 0.5011273109875244, + "flos": 59128645000320.0, + "grad_norm": 0.8945560459458856, + "language_loss": 0.62615693, + "learning_rate": 2.089740776971626e-06, + "loss": 0.64617676, + "num_input_tokens_seen": 179210490, + "step": 8335, + "time_per_iteration": 3.1708080768585205 + }, + { + "auxiliary_loss_clip": 0.01056428, + "auxiliary_loss_mlp": 0.01027035, + "balance_loss_clip": 1.0261817, + "balance_loss_mlp": 1.01615667, + "epoch": 0.5011874342401924, + "flos": 25336342840320.0, + "grad_norm": 1.6067250719770865, + "language_loss": 0.79764771, + "learning_rate": 2.0893517058503435e-06, + "loss": 0.81848228, + "num_input_tokens_seen": 179231360, + "step": 8336, + "time_per_iteration": 2.71622896194458 + }, + { + "auxiliary_loss_clip": 0.01027084, + "auxiliary_loss_mlp": 0.01029236, + "balance_loss_clip": 1.0229336, + "balance_loss_mlp": 1.01744032, + "epoch": 0.5012475574928603, + "flos": 20229953569920.0, + "grad_norm": 1.6706285346767735, + "language_loss": 0.79997808, + "learning_rate": 2.088962631340836e-06, + "loss": 0.82054126, + "num_input_tokens_seen": 179250625, + "step": 8337, + "time_per_iteration": 2.707206964492798 + }, + { + "auxiliary_loss_clip": 0.0107103, + "auxiliary_loss_mlp": 0.0102812, + "balance_loss_clip": 1.0255543, + "balance_loss_mlp": 1.01647341, + "epoch": 0.5013076807455283, + "flos": 22710123855360.0, + "grad_norm": 1.852144774092863, + "language_loss": 0.79229194, + "learning_rate": 2.0885735534578555e-06, + "loss": 0.81328344, + "num_input_tokens_seen": 179267360, + "step": 8338, + "time_per_iteration": 2.5469229221343994 + }, + { + "auxiliary_loss_clip": 0.01046758, + "auxiliary_loss_mlp": 0.01023834, + "balance_loss_clip": 1.02532077, + "balance_loss_mlp": 1.01244926, + "epoch": 0.5013678039981962, + "flos": 24245161528320.0, + "grad_norm": 1.6881514408836376, + "language_loss": 0.85053355, + "learning_rate": 2.0881844722161583e-06, + "loss": 0.87123942, + "num_input_tokens_seen": 179289810, + "step": 8339, + "time_per_iteration": 2.742319107055664 + }, + { + "auxiliary_loss_clip": 0.01056413, + "auxiliary_loss_mlp": 0.01030647, + "balance_loss_clip": 1.02453613, + "balance_loss_mlp": 1.01985252, + "epoch": 0.5014279272508643, + "flos": 26176688501760.0, + "grad_norm": 1.4423201452801329, + "language_loss": 0.70785654, + "learning_rate": 2.0877953876304962e-06, + "loss": 0.72872722, + "num_input_tokens_seen": 179310620, + "step": 8340, + "time_per_iteration": 2.597064256668091 + }, + { + "auxiliary_loss_clip": 0.01032829, + "auxiliary_loss_mlp": 0.01035149, + "balance_loss_clip": 1.02453947, + "balance_loss_mlp": 1.02174997, + "epoch": 0.5014880505035323, + "flos": 21430446946560.0, + "grad_norm": 1.788783342725108, + "language_loss": 0.77724946, + "learning_rate": 2.0874062997156245e-06, + "loss": 0.79792929, + "num_input_tokens_seen": 179329005, + "step": 8341, + "time_per_iteration": 2.778426170349121 + }, + { + "auxiliary_loss_clip": 0.01044093, + "auxiliary_loss_mlp": 0.01033814, + "balance_loss_clip": 1.02761769, + "balance_loss_mlp": 1.02116585, + "epoch": 0.5015481737562002, + "flos": 15770745596160.0, + "grad_norm": 3.4017708170657524, + "language_loss": 0.88882005, + "learning_rate": 2.0870172084862975e-06, + "loss": 0.90959918, + "num_input_tokens_seen": 179343785, + "step": 8342, + "time_per_iteration": 2.7219693660736084 + }, + { + "auxiliary_loss_clip": 0.0104962, + "auxiliary_loss_mlp": 0.01031817, + "balance_loss_clip": 1.02734017, + "balance_loss_mlp": 1.0206238, + "epoch": 0.5016082970088682, + "flos": 26830801123200.0, + "grad_norm": 1.6168512829382167, + "language_loss": 0.76595151, + "learning_rate": 2.0866281139572682e-06, + "loss": 0.78676581, + "num_input_tokens_seen": 179364070, + "step": 8343, + "time_per_iteration": 2.645859718322754 + }, + { + "auxiliary_loss_clip": 0.01056492, + "auxiliary_loss_mlp": 0.01025346, + "balance_loss_clip": 1.02654421, + "balance_loss_mlp": 1.01499844, + "epoch": 0.5016684202615361, + "flos": 21470595373440.0, + "grad_norm": 6.437652714953565, + "language_loss": 0.67448956, + "learning_rate": 2.086239016143293e-06, + "loss": 0.69530785, + "num_input_tokens_seen": 179384225, + "step": 8344, + "time_per_iteration": 2.552464246749878 + }, + { + "auxiliary_loss_clip": 0.01049003, + "auxiliary_loss_mlp": 0.01030223, + "balance_loss_clip": 1.02592874, + "balance_loss_mlp": 1.01942849, + "epoch": 0.5017285435142042, + "flos": 26246821806720.0, + "grad_norm": 2.290467665910354, + "language_loss": 0.75296092, + "learning_rate": 2.0858499150591258e-06, + "loss": 0.77375317, + "num_input_tokens_seen": 179402595, + "step": 8345, + "time_per_iteration": 4.278750658035278 + }, + { + "auxiliary_loss_clip": 0.01062325, + "auxiliary_loss_mlp": 0.01030851, + "balance_loss_clip": 1.03026342, + "balance_loss_mlp": 1.01841784, + "epoch": 0.5017886667668721, + "flos": 20777555387520.0, + "grad_norm": 2.2488986490840652, + "language_loss": 0.78809738, + "learning_rate": 2.0854608107195203e-06, + "loss": 0.8090291, + "num_input_tokens_seen": 179419635, + "step": 8346, + "time_per_iteration": 2.5461924076080322 + }, + { + "auxiliary_loss_clip": 0.01047808, + "auxiliary_loss_mlp": 0.00747316, + "balance_loss_clip": 1.0251565, + "balance_loss_mlp": 1.00002182, + "epoch": 0.5018487900195401, + "flos": 20156408472960.0, + "grad_norm": 1.5869838231958344, + "language_loss": 0.69254369, + "learning_rate": 2.0850717031392333e-06, + "loss": 0.71049488, + "num_input_tokens_seen": 179438770, + "step": 8347, + "time_per_iteration": 4.207390069961548 + }, + { + "auxiliary_loss_clip": 0.01028418, + "auxiliary_loss_mlp": 0.01029411, + "balance_loss_clip": 1.02300024, + "balance_loss_mlp": 1.01796675, + "epoch": 0.501908913272208, + "flos": 18150689957760.0, + "grad_norm": 1.9062844709393703, + "language_loss": 0.7124536, + "learning_rate": 2.0846825923330174e-06, + "loss": 0.73303187, + "num_input_tokens_seen": 179457475, + "step": 8348, + "time_per_iteration": 2.6963858604431152 + }, + { + "auxiliary_loss_clip": 0.01056072, + "auxiliary_loss_mlp": 0.01027658, + "balance_loss_clip": 1.02633929, + "balance_loss_mlp": 1.01778173, + "epoch": 0.501969036524876, + "flos": 23112287504640.0, + "grad_norm": 1.5070210582475831, + "language_loss": 0.74117053, + "learning_rate": 2.0842934783156303e-06, + "loss": 0.76200777, + "num_input_tokens_seen": 179478140, + "step": 8349, + "time_per_iteration": 2.6384449005126953 + }, + { + "auxiliary_loss_clip": 0.01058601, + "auxiliary_loss_mlp": 0.01026318, + "balance_loss_clip": 1.02569127, + "balance_loss_mlp": 1.01405704, + "epoch": 0.5020291597775439, + "flos": 11363214314880.0, + "grad_norm": 2.2913621291652424, + "language_loss": 0.63382155, + "learning_rate": 2.0839043611018266e-06, + "loss": 0.65467072, + "num_input_tokens_seen": 179494325, + "step": 8350, + "time_per_iteration": 2.5673060417175293 + }, + { + "auxiliary_loss_clip": 0.00985316, + "auxiliary_loss_mlp": 0.01027667, + "balance_loss_clip": 1.0079248, + "balance_loss_mlp": 1.02617729, + "epoch": 0.5020892830302119, + "flos": 64011094928640.0, + "grad_norm": 0.8261449428112609, + "language_loss": 0.59803581, + "learning_rate": 2.0835152407063597e-06, + "loss": 0.61816567, + "num_input_tokens_seen": 179553545, + "step": 8351, + "time_per_iteration": 3.3918888568878174 + }, + { + "auxiliary_loss_clip": 0.01042345, + "auxiliary_loss_mlp": 0.01034235, + "balance_loss_clip": 1.02467501, + "balance_loss_mlp": 1.02208138, + "epoch": 0.5021494062828799, + "flos": 23732859801600.0, + "grad_norm": 1.7929013105826257, + "language_loss": 0.74900007, + "learning_rate": 2.0831261171439873e-06, + "loss": 0.76976585, + "num_input_tokens_seen": 179573645, + "step": 8352, + "time_per_iteration": 2.638497829437256 + }, + { + "auxiliary_loss_clip": 0.0103662, + "auxiliary_loss_mlp": 0.01032889, + "balance_loss_clip": 1.02442598, + "balance_loss_mlp": 1.02113509, + "epoch": 0.5022095295355479, + "flos": 21576747041280.0, + "grad_norm": 1.7458457274075847, + "language_loss": 0.72209817, + "learning_rate": 2.082736990429464e-06, + "loss": 0.74279332, + "num_input_tokens_seen": 179591435, + "step": 8353, + "time_per_iteration": 2.713639259338379 + }, + { + "auxiliary_loss_clip": 0.01064506, + "auxiliary_loss_mlp": 0.01032619, + "balance_loss_clip": 1.03067565, + "balance_loss_mlp": 1.01997089, + "epoch": 0.5022696527882159, + "flos": 21397229844480.0, + "grad_norm": 1.8663519879784187, + "language_loss": 0.74202228, + "learning_rate": 2.0823478605775455e-06, + "loss": 0.76299351, + "num_input_tokens_seen": 179609955, + "step": 8354, + "time_per_iteration": 2.74031138420105 + }, + { + "auxiliary_loss_clip": 0.0104505, + "auxiliary_loss_mlp": 0.01036497, + "balance_loss_clip": 1.02569938, + "balance_loss_mlp": 1.0245465, + "epoch": 0.5023297760408838, + "flos": 27160712565120.0, + "grad_norm": 1.4495961889666087, + "language_loss": 0.72540706, + "learning_rate": 2.0819587276029884e-06, + "loss": 0.74622256, + "num_input_tokens_seen": 179630875, + "step": 8355, + "time_per_iteration": 2.724022388458252 + }, + { + "auxiliary_loss_clip": 0.01059053, + "auxiliary_loss_mlp": 0.01032888, + "balance_loss_clip": 1.0260514, + "balance_loss_mlp": 1.02059197, + "epoch": 0.5023898992935518, + "flos": 26213820186240.0, + "grad_norm": 1.594778364183531, + "language_loss": 0.81052411, + "learning_rate": 2.081569591520548e-06, + "loss": 0.83144355, + "num_input_tokens_seen": 179649835, + "step": 8356, + "time_per_iteration": 2.724559783935547 + }, + { + "auxiliary_loss_clip": 0.01061466, + "auxiliary_loss_mlp": 0.01035917, + "balance_loss_clip": 1.02595425, + "balance_loss_mlp": 1.02249408, + "epoch": 0.5024500225462197, + "flos": 13440323111040.0, + "grad_norm": 1.9182981556825531, + "language_loss": 0.76666296, + "learning_rate": 2.0811804523449803e-06, + "loss": 0.78763676, + "num_input_tokens_seen": 179667605, + "step": 8357, + "time_per_iteration": 2.5381290912628174 + }, + { + "auxiliary_loss_clip": 0.01060009, + "auxiliary_loss_mlp": 0.01032386, + "balance_loss_clip": 1.02690506, + "balance_loss_mlp": 1.02008939, + "epoch": 0.5025101457988878, + "flos": 21579584215680.0, + "grad_norm": 1.6599594566022253, + "language_loss": 0.76127774, + "learning_rate": 2.0807913100910417e-06, + "loss": 0.78220165, + "num_input_tokens_seen": 179686910, + "step": 8358, + "time_per_iteration": 4.238914966583252 + }, + { + "auxiliary_loss_clip": 0.01048905, + "auxiliary_loss_mlp": 0.01030834, + "balance_loss_clip": 1.02690744, + "balance_loss_mlp": 1.01875842, + "epoch": 0.5025702690515557, + "flos": 24645134448000.0, + "grad_norm": 2.639396385212879, + "language_loss": 0.72012752, + "learning_rate": 2.0804021647734887e-06, + "loss": 0.74092495, + "num_input_tokens_seen": 179706395, + "step": 8359, + "time_per_iteration": 4.275082349777222 + }, + { + "auxiliary_loss_clip": 0.01039344, + "auxiliary_loss_mlp": 0.01039499, + "balance_loss_clip": 1.02464843, + "balance_loss_mlp": 1.02770901, + "epoch": 0.5026303923042237, + "flos": 22090162089600.0, + "grad_norm": 1.6797386914800405, + "language_loss": 0.77164042, + "learning_rate": 2.080013016407077e-06, + "loss": 0.79242885, + "num_input_tokens_seen": 179725735, + "step": 8360, + "time_per_iteration": 2.634007692337036 + }, + { + "auxiliary_loss_clip": 0.01033343, + "auxiliary_loss_mlp": 0.01032394, + "balance_loss_clip": 1.02750385, + "balance_loss_mlp": 1.02078271, + "epoch": 0.5026905155568916, + "flos": 23697200574720.0, + "grad_norm": 1.6911361938754486, + "language_loss": 0.7664097, + "learning_rate": 2.0796238650065645e-06, + "loss": 0.78706706, + "num_input_tokens_seen": 179746150, + "step": 8361, + "time_per_iteration": 2.6067705154418945 + }, + { + "auxiliary_loss_clip": 0.01037495, + "auxiliary_loss_mlp": 0.01035018, + "balance_loss_clip": 1.02427959, + "balance_loss_mlp": 1.0223403, + "epoch": 0.5027506388095596, + "flos": 25812410722560.0, + "grad_norm": 1.6037910959764672, + "language_loss": 0.84944332, + "learning_rate": 2.0792347105867065e-06, + "loss": 0.87016845, + "num_input_tokens_seen": 179767550, + "step": 8362, + "time_per_iteration": 2.651390790939331 + }, + { + "auxiliary_loss_clip": 0.01048568, + "auxiliary_loss_mlp": 0.01028397, + "balance_loss_clip": 1.02547598, + "balance_loss_mlp": 1.01704192, + "epoch": 0.5028107620622275, + "flos": 27526606456320.0, + "grad_norm": 2.452409258308733, + "language_loss": 0.78178024, + "learning_rate": 2.0788455531622605e-06, + "loss": 0.8025499, + "num_input_tokens_seen": 179790075, + "step": 8363, + "time_per_iteration": 2.634894371032715 + }, + { + "auxiliary_loss_clip": 0.01056123, + "auxiliary_loss_mlp": 0.01029837, + "balance_loss_clip": 1.02617705, + "balance_loss_mlp": 1.01813662, + "epoch": 0.5028708853148955, + "flos": 24534278098560.0, + "grad_norm": 2.0528214093736064, + "language_loss": 0.75378567, + "learning_rate": 2.0784563927479838e-06, + "loss": 0.77464533, + "num_input_tokens_seen": 179806515, + "step": 8364, + "time_per_iteration": 2.5226757526397705 + }, + { + "auxiliary_loss_clip": 0.01066521, + "auxiliary_loss_mlp": 0.01027511, + "balance_loss_clip": 1.02583122, + "balance_loss_mlp": 1.01648438, + "epoch": 0.5029310085675635, + "flos": 20813609664000.0, + "grad_norm": 1.6099272947706564, + "language_loss": 0.69578689, + "learning_rate": 2.0780672293586317e-06, + "loss": 0.7167272, + "num_input_tokens_seen": 179826450, + "step": 8365, + "time_per_iteration": 2.566732883453369 + }, + { + "auxiliary_loss_clip": 0.01051496, + "auxiliary_loss_mlp": 0.01034114, + "balance_loss_clip": 1.02649248, + "balance_loss_mlp": 1.02166891, + "epoch": 0.5029911318202315, + "flos": 22342470197760.0, + "grad_norm": 1.7646133494053562, + "language_loss": 0.73042488, + "learning_rate": 2.0776780630089635e-06, + "loss": 0.75128102, + "num_input_tokens_seen": 179846770, + "step": 8366, + "time_per_iteration": 2.6478450298309326 + }, + { + "auxiliary_loss_clip": 0.01059444, + "auxiliary_loss_mlp": 0.01029908, + "balance_loss_clip": 1.02853036, + "balance_loss_mlp": 1.01894104, + "epoch": 0.5030512550728995, + "flos": 24352713826560.0, + "grad_norm": 1.5976249852061575, + "language_loss": 0.78497279, + "learning_rate": 2.077288893713735e-06, + "loss": 0.8058663, + "num_input_tokens_seen": 179866585, + "step": 8367, + "time_per_iteration": 2.728379011154175 + }, + { + "auxiliary_loss_clip": 0.01054678, + "auxiliary_loss_mlp": 0.01026281, + "balance_loss_clip": 1.02388167, + "balance_loss_mlp": 1.01542091, + "epoch": 0.5031113783255674, + "flos": 18259930195200.0, + "grad_norm": 1.6652569030242217, + "language_loss": 0.70100689, + "learning_rate": 2.0768997214877035e-06, + "loss": 0.72181642, + "num_input_tokens_seen": 179885575, + "step": 8368, + "time_per_iteration": 2.6027235984802246 + }, + { + "auxiliary_loss_clip": 0.00999228, + "auxiliary_loss_mlp": 0.01001862, + "balance_loss_clip": 1.00173092, + "balance_loss_mlp": 1.00051498, + "epoch": 0.5031715015782354, + "flos": 57253173200640.0, + "grad_norm": 0.8622108712170898, + "language_loss": 0.63249421, + "learning_rate": 2.0765105463456274e-06, + "loss": 0.65250516, + "num_input_tokens_seen": 179939650, + "step": 8369, + "time_per_iteration": 3.136425018310547 + }, + { + "auxiliary_loss_clip": 0.01058586, + "auxiliary_loss_mlp": 0.01028914, + "balance_loss_clip": 1.02704906, + "balance_loss_mlp": 1.017977, + "epoch": 0.5032316248309033, + "flos": 27527360641920.0, + "grad_norm": 1.994530150938498, + "language_loss": 0.60240144, + "learning_rate": 2.076121368302263e-06, + "loss": 0.62327641, + "num_input_tokens_seen": 179961765, + "step": 8370, + "time_per_iteration": 2.6692659854888916 + }, + { + "auxiliary_loss_clip": 0.01018935, + "auxiliary_loss_mlp": 0.01034645, + "balance_loss_clip": 1.02250361, + "balance_loss_mlp": 1.02228928, + "epoch": 0.5032917480835714, + "flos": 34495825939200.0, + "grad_norm": 1.6806317348266246, + "language_loss": 0.68740743, + "learning_rate": 2.0757321873723695e-06, + "loss": 0.70794326, + "num_input_tokens_seen": 179983015, + "step": 8371, + "time_per_iteration": 2.89857816696167 + }, + { + "auxiliary_loss_clip": 0.01041489, + "auxiliary_loss_mlp": 0.01029032, + "balance_loss_clip": 1.02364826, + "balance_loss_mlp": 1.01592457, + "epoch": 0.5033518713362393, + "flos": 33656773167360.0, + "grad_norm": 1.9216404589683018, + "language_loss": 0.68051267, + "learning_rate": 2.0753430035707042e-06, + "loss": 0.70121789, + "num_input_tokens_seen": 180003210, + "step": 8372, + "time_per_iteration": 2.7004761695861816 + }, + { + "auxiliary_loss_clip": 0.01030973, + "auxiliary_loss_mlp": 0.01032776, + "balance_loss_clip": 1.02408814, + "balance_loss_mlp": 1.01979995, + "epoch": 0.5034119945889073, + "flos": 28185495586560.0, + "grad_norm": 1.4226722014803024, + "language_loss": 0.66641593, + "learning_rate": 2.0749538169120235e-06, + "loss": 0.68705344, + "num_input_tokens_seen": 180025530, + "step": 8373, + "time_per_iteration": 2.7850165367126465 + }, + { + "auxiliary_loss_clip": 0.01045461, + "auxiliary_loss_mlp": 0.0102517, + "balance_loss_clip": 1.02484465, + "balance_loss_mlp": 1.01380324, + "epoch": 0.5034721178415752, + "flos": 21358697529600.0, + "grad_norm": 1.5470987258960793, + "language_loss": 0.74475801, + "learning_rate": 2.0745646274110872e-06, + "loss": 0.76546431, + "num_input_tokens_seen": 180043180, + "step": 8374, + "time_per_iteration": 2.7412726879119873 + }, + { + "auxiliary_loss_clip": 0.01048964, + "auxiliary_loss_mlp": 0.01034911, + "balance_loss_clip": 1.02623367, + "balance_loss_mlp": 1.02218509, + "epoch": 0.5035322410942432, + "flos": 22674823764480.0, + "grad_norm": 1.5789973861537154, + "language_loss": 0.68328202, + "learning_rate": 2.0741754350826525e-06, + "loss": 0.70412076, + "num_input_tokens_seen": 180062905, + "step": 8375, + "time_per_iteration": 2.690202236175537 + }, + { + "auxiliary_loss_clip": 0.01024237, + "auxiliary_loss_mlp": 0.0103102, + "balance_loss_clip": 1.02681088, + "balance_loss_mlp": 1.01805556, + "epoch": 0.5035923643469111, + "flos": 19828723674240.0, + "grad_norm": 1.7611234345360303, + "language_loss": 0.78419828, + "learning_rate": 2.0737862399414777e-06, + "loss": 0.8047508, + "num_input_tokens_seen": 180082000, + "step": 8376, + "time_per_iteration": 2.8140182495117188 + }, + { + "auxiliary_loss_clip": 0.01061068, + "auxiliary_loss_mlp": 0.00747475, + "balance_loss_clip": 1.02578402, + "balance_loss_mlp": 1.00008559, + "epoch": 0.5036524875995791, + "flos": 30514625182080.0, + "grad_norm": 1.861882857821137, + "language_loss": 0.5953294, + "learning_rate": 2.0733970420023213e-06, + "loss": 0.61341476, + "num_input_tokens_seen": 180101340, + "step": 8377, + "time_per_iteration": 2.673006534576416 + }, + { + "auxiliary_loss_clip": 0.01049108, + "auxiliary_loss_mlp": 0.01035477, + "balance_loss_clip": 1.02603793, + "balance_loss_mlp": 1.02333534, + "epoch": 0.5037126108522471, + "flos": 14720574637440.0, + "grad_norm": 1.9683457416340775, + "language_loss": 0.75966769, + "learning_rate": 2.0730078412799425e-06, + "loss": 0.78051353, + "num_input_tokens_seen": 180119160, + "step": 8378, + "time_per_iteration": 2.698291301727295 + }, + { + "auxiliary_loss_clip": 0.01029781, + "auxiliary_loss_mlp": 0.0103305, + "balance_loss_clip": 1.02414012, + "balance_loss_mlp": 1.02167761, + "epoch": 0.5037727341049151, + "flos": 25297702784640.0, + "grad_norm": 1.8372771394918388, + "language_loss": 0.74563086, + "learning_rate": 2.0726186377890985e-06, + "loss": 0.76625925, + "num_input_tokens_seen": 180138730, + "step": 8379, + "time_per_iteration": 2.659693479537964 + }, + { + "auxiliary_loss_clip": 0.01056253, + "auxiliary_loss_mlp": 0.01028484, + "balance_loss_clip": 1.02639508, + "balance_loss_mlp": 1.01751041, + "epoch": 0.5038328573575831, + "flos": 28541764632960.0, + "grad_norm": 2.945548262559321, + "language_loss": 0.66757131, + "learning_rate": 2.072229431544548e-06, + "loss": 0.68841863, + "num_input_tokens_seen": 180158810, + "step": 8380, + "time_per_iteration": 2.6425704956054688 + }, + { + "auxiliary_loss_clip": 0.01019717, + "auxiliary_loss_mlp": 0.01029005, + "balance_loss_clip": 1.02536762, + "balance_loss_mlp": 1.01779914, + "epoch": 0.503892980610251, + "flos": 31649869503360.0, + "grad_norm": 1.8487175830637834, + "language_loss": 0.62754077, + "learning_rate": 2.071840222561051e-06, + "loss": 0.64802802, + "num_input_tokens_seen": 180179700, + "step": 8381, + "time_per_iteration": 2.8379313945770264 + }, + { + "auxiliary_loss_clip": 0.01040334, + "auxiliary_loss_mlp": 0.0103287, + "balance_loss_clip": 1.02223516, + "balance_loss_mlp": 1.02202833, + "epoch": 0.503953103862919, + "flos": 27089358197760.0, + "grad_norm": 1.4556050779062153, + "language_loss": 0.67284262, + "learning_rate": 2.071451010853365e-06, + "loss": 0.69357467, + "num_input_tokens_seen": 180199890, + "step": 8382, + "time_per_iteration": 2.678957939147949 + }, + { + "auxiliary_loss_clip": 0.01055895, + "auxiliary_loss_mlp": 0.01035178, + "balance_loss_clip": 1.02838206, + "balance_loss_mlp": 1.02282166, + "epoch": 0.5040132271155869, + "flos": 15632957024640.0, + "grad_norm": 3.0135002138595826, + "language_loss": 0.62213081, + "learning_rate": 2.0710617964362506e-06, + "loss": 0.64304149, + "num_input_tokens_seen": 180217840, + "step": 8383, + "time_per_iteration": 2.607130765914917 + }, + { + "auxiliary_loss_clip": 0.0102748, + "auxiliary_loss_mlp": 0.01029441, + "balance_loss_clip": 1.02417564, + "balance_loss_mlp": 1.01861072, + "epoch": 0.504073350368255, + "flos": 13590106824960.0, + "grad_norm": 1.709002092344553, + "language_loss": 0.66928738, + "learning_rate": 2.070672579324465e-06, + "loss": 0.68985665, + "num_input_tokens_seen": 180236465, + "step": 8384, + "time_per_iteration": 2.766529083251953 + }, + { + "auxiliary_loss_clip": 0.01061376, + "auxiliary_loss_mlp": 0.0103735, + "balance_loss_clip": 1.02965188, + "balance_loss_mlp": 1.02616811, + "epoch": 0.5041334736209229, + "flos": 29058160510080.0, + "grad_norm": 1.6318782246682308, + "language_loss": 0.70736551, + "learning_rate": 2.0702833595327674e-06, + "loss": 0.72835279, + "num_input_tokens_seen": 180258025, + "step": 8385, + "time_per_iteration": 2.7269275188446045 + }, + { + "auxiliary_loss_clip": 0.01053916, + "auxiliary_loss_mlp": 0.01025137, + "balance_loss_clip": 1.02481318, + "balance_loss_mlp": 1.01439071, + "epoch": 0.5041935968735909, + "flos": 24608361899520.0, + "grad_norm": 1.7887375208745082, + "language_loss": 0.82888591, + "learning_rate": 2.069894137075919e-06, + "loss": 0.84967649, + "num_input_tokens_seen": 180277825, + "step": 8386, + "time_per_iteration": 2.699829578399658 + }, + { + "auxiliary_loss_clip": 0.01059582, + "auxiliary_loss_mlp": 0.01030749, + "balance_loss_clip": 1.02727413, + "balance_loss_mlp": 1.01883399, + "epoch": 0.5042537201262588, + "flos": 26286934320000.0, + "grad_norm": 1.5233648247495917, + "language_loss": 0.66485035, + "learning_rate": 2.0695049119686766e-06, + "loss": 0.68575358, + "num_input_tokens_seen": 180300465, + "step": 8387, + "time_per_iteration": 2.6185314655303955 + }, + { + "auxiliary_loss_clip": 0.01010931, + "auxiliary_loss_mlp": 0.01031709, + "balance_loss_clip": 1.02246261, + "balance_loss_mlp": 1.0207541, + "epoch": 0.5043138433789268, + "flos": 22017371178240.0, + "grad_norm": 1.4722803627920071, + "language_loss": 0.80463445, + "learning_rate": 2.0691156842258016e-06, + "loss": 0.8250609, + "num_input_tokens_seen": 180321050, + "step": 8388, + "time_per_iteration": 2.776538610458374 + }, + { + "auxiliary_loss_clip": 0.01056524, + "auxiliary_loss_mlp": 0.01029606, + "balance_loss_clip": 1.02515686, + "balance_loss_mlp": 1.01859069, + "epoch": 0.5043739666315947, + "flos": 28767104605440.0, + "grad_norm": 2.2551091507528325, + "language_loss": 0.70203573, + "learning_rate": 2.0687264538620537e-06, + "loss": 0.72289705, + "num_input_tokens_seen": 180338870, + "step": 8389, + "time_per_iteration": 2.7472622394561768 + }, + { + "auxiliary_loss_clip": 0.01039583, + "auxiliary_loss_mlp": 0.01037143, + "balance_loss_clip": 1.02429247, + "balance_loss_mlp": 1.02563977, + "epoch": 0.5044340898842627, + "flos": 27599253713280.0, + "grad_norm": 1.6988100051159791, + "language_loss": 0.69300067, + "learning_rate": 2.068337220892191e-06, + "loss": 0.71376801, + "num_input_tokens_seen": 180361285, + "step": 8390, + "time_per_iteration": 2.8714282512664795 + }, + { + "auxiliary_loss_clip": 0.00991624, + "auxiliary_loss_mlp": 0.01014112, + "balance_loss_clip": 1.004282, + "balance_loss_mlp": 1.01275277, + "epoch": 0.5044942131369307, + "flos": 67458050749440.0, + "grad_norm": 0.877043810103007, + "language_loss": 0.52966952, + "learning_rate": 2.067947985330974e-06, + "loss": 0.54972696, + "num_input_tokens_seen": 180415170, + "step": 8391, + "time_per_iteration": 3.1042301654815674 + }, + { + "auxiliary_loss_clip": 0.00983544, + "auxiliary_loss_mlp": 0.01008248, + "balance_loss_clip": 1.00559568, + "balance_loss_mlp": 1.00673962, + "epoch": 0.5045543363895987, + "flos": 58630849390080.0, + "grad_norm": 1.0023080674655214, + "language_loss": 0.60711336, + "learning_rate": 2.0675587471931628e-06, + "loss": 0.62703127, + "num_input_tokens_seen": 180468060, + "step": 8392, + "time_per_iteration": 3.1189208030700684 + }, + { + "auxiliary_loss_clip": 0.01030312, + "auxiliary_loss_mlp": 0.01028955, + "balance_loss_clip": 1.02308118, + "balance_loss_mlp": 1.0178864, + "epoch": 0.5046144596422667, + "flos": 22526620248960.0, + "grad_norm": 1.7542222083846497, + "language_loss": 0.84316301, + "learning_rate": 2.067169506493517e-06, + "loss": 0.86375564, + "num_input_tokens_seen": 180486610, + "step": 8393, + "time_per_iteration": 4.361830711364746 + }, + { + "auxiliary_loss_clip": 0.01036153, + "auxiliary_loss_mlp": 0.0102923, + "balance_loss_clip": 1.02583098, + "balance_loss_mlp": 1.01799464, + "epoch": 0.5046745828949346, + "flos": 27454246508160.0, + "grad_norm": 2.617656261268525, + "language_loss": 0.50938475, + "learning_rate": 2.0667802632467974e-06, + "loss": 0.5300386, + "num_input_tokens_seen": 180508135, + "step": 8394, + "time_per_iteration": 2.833465576171875 + }, + { + "auxiliary_loss_clip": 0.01067954, + "auxiliary_loss_mlp": 0.01032455, + "balance_loss_clip": 1.02627826, + "balance_loss_mlp": 1.02024221, + "epoch": 0.5047347061476026, + "flos": 17274541415040.0, + "grad_norm": 1.5418032890166407, + "language_loss": 0.75110674, + "learning_rate": 2.0663910174677627e-06, + "loss": 0.77211082, + "num_input_tokens_seen": 180527000, + "step": 8395, + "time_per_iteration": 4.389748573303223 + }, + { + "auxiliary_loss_clip": 0.01047265, + "auxiliary_loss_mlp": 0.01034121, + "balance_loss_clip": 1.02354932, + "balance_loss_mlp": 1.02225947, + "epoch": 0.5047948294002705, + "flos": 16649515831680.0, + "grad_norm": 2.4093153805810195, + "language_loss": 0.67642856, + "learning_rate": 2.0660017691711737e-06, + "loss": 0.69724244, + "num_input_tokens_seen": 180544715, + "step": 8396, + "time_per_iteration": 2.7168281078338623 + }, + { + "auxiliary_loss_clip": 0.01059626, + "auxiliary_loss_mlp": 0.01034142, + "balance_loss_clip": 1.02855825, + "balance_loss_mlp": 1.02312064, + "epoch": 0.5048549526529386, + "flos": 26865706164480.0, + "grad_norm": 1.6009219406613664, + "language_loss": 0.78721493, + "learning_rate": 2.065612518371792e-06, + "loss": 0.80815256, + "num_input_tokens_seen": 180565365, + "step": 8397, + "time_per_iteration": 2.6653640270233154 + }, + { + "auxiliary_loss_clip": 0.01026101, + "auxiliary_loss_mlp": 0.01028151, + "balance_loss_clip": 1.02793312, + "balance_loss_mlp": 1.0171355, + "epoch": 0.5049150759056065, + "flos": 21833939399040.0, + "grad_norm": 1.6732175686984458, + "language_loss": 0.66187519, + "learning_rate": 2.065223265084376e-06, + "loss": 0.68241769, + "num_input_tokens_seen": 180586670, + "step": 8398, + "time_per_iteration": 2.717724084854126 + }, + { + "auxiliary_loss_clip": 0.01057915, + "auxiliary_loss_mlp": 0.00747358, + "balance_loss_clip": 1.0268302, + "balance_loss_mlp": 1.00002432, + "epoch": 0.5049751991582745, + "flos": 21685807710720.0, + "grad_norm": 2.042293162101556, + "language_loss": 0.71728551, + "learning_rate": 2.064834009323688e-06, + "loss": 0.73533833, + "num_input_tokens_seen": 180605085, + "step": 8399, + "time_per_iteration": 2.619454860687256 + }, + { + "auxiliary_loss_clip": 0.01043489, + "auxiliary_loss_mlp": 0.01044067, + "balance_loss_clip": 1.02631831, + "balance_loss_mlp": 1.0303818, + "epoch": 0.5050353224109424, + "flos": 21359379888000.0, + "grad_norm": 1.8766833755267405, + "language_loss": 0.81239247, + "learning_rate": 2.0644447511044878e-06, + "loss": 0.83326793, + "num_input_tokens_seen": 180624370, + "step": 8400, + "time_per_iteration": 2.6812736988067627 + }, + { + "auxiliary_loss_clip": 0.01033131, + "auxiliary_loss_mlp": 0.01037452, + "balance_loss_clip": 1.02539062, + "balance_loss_mlp": 1.02432704, + "epoch": 0.5050954456636104, + "flos": 22820082364800.0, + "grad_norm": 1.9996829187320686, + "language_loss": 0.78771687, + "learning_rate": 2.0640554904415362e-06, + "loss": 0.80842268, + "num_input_tokens_seen": 180642450, + "step": 8401, + "time_per_iteration": 2.66288161277771 + }, + { + "auxiliary_loss_clip": 0.01067671, + "auxiliary_loss_mlp": 0.00747324, + "balance_loss_clip": 1.02510095, + "balance_loss_mlp": 1.000072, + "epoch": 0.5051555689162783, + "flos": 30448226891520.0, + "grad_norm": 1.547189629509756, + "language_loss": 0.6990869, + "learning_rate": 2.063666227349593e-06, + "loss": 0.71723688, + "num_input_tokens_seen": 180665250, + "step": 8402, + "time_per_iteration": 2.6669986248016357 + }, + { + "auxiliary_loss_clip": 0.01054884, + "auxiliary_loss_mlp": 0.00747374, + "balance_loss_clip": 1.0231061, + "balance_loss_mlp": 1.00005543, + "epoch": 0.5052156921689464, + "flos": 21287953693440.0, + "grad_norm": 2.079643563929885, + "language_loss": 0.6893363, + "learning_rate": 2.063276961843422e-06, + "loss": 0.7073589, + "num_input_tokens_seen": 180687425, + "step": 8403, + "time_per_iteration": 2.686035633087158 + }, + { + "auxiliary_loss_clip": 0.01052444, + "auxiliary_loss_mlp": 0.0103473, + "balance_loss_clip": 1.0238961, + "balance_loss_mlp": 1.02391803, + "epoch": 0.5052758154216143, + "flos": 25081305298560.0, + "grad_norm": 1.483007518873027, + "language_loss": 0.86035311, + "learning_rate": 2.062887693937781e-06, + "loss": 0.88122487, + "num_input_tokens_seen": 180708725, + "step": 8404, + "time_per_iteration": 2.6002962589263916 + }, + { + "auxiliary_loss_clip": 0.01036076, + "auxiliary_loss_mlp": 0.00747228, + "balance_loss_clip": 1.02509975, + "balance_loss_mlp": 1.00003433, + "epoch": 0.5053359386742823, + "flos": 20885502735360.0, + "grad_norm": 1.5719466159703765, + "language_loss": 0.75507063, + "learning_rate": 2.0624984236474322e-06, + "loss": 0.77290368, + "num_input_tokens_seen": 180727990, + "step": 8405, + "time_per_iteration": 2.664275884628296 + }, + { + "auxiliary_loss_clip": 0.01068411, + "auxiliary_loss_mlp": 0.01028046, + "balance_loss_clip": 1.02599931, + "balance_loss_mlp": 1.01566601, + "epoch": 0.5053960619269503, + "flos": 37743335493120.0, + "grad_norm": 1.6994283609753889, + "language_loss": 0.7299552, + "learning_rate": 2.0621091509871378e-06, + "loss": 0.75091976, + "num_input_tokens_seen": 180749765, + "step": 8406, + "time_per_iteration": 4.492851495742798 + }, + { + "auxiliary_loss_clip": 0.0103576, + "auxiliary_loss_mlp": 0.01027961, + "balance_loss_clip": 1.02526248, + "balance_loss_mlp": 1.01700532, + "epoch": 0.5054561851796182, + "flos": 23513840622720.0, + "grad_norm": 1.583399757912677, + "language_loss": 0.7655288, + "learning_rate": 2.0617198759716568e-06, + "loss": 0.78616595, + "num_input_tokens_seen": 180769580, + "step": 8407, + "time_per_iteration": 4.447924375534058 + }, + { + "auxiliary_loss_clip": 0.01024458, + "auxiliary_loss_mlp": 0.01028123, + "balance_loss_clip": 1.02089739, + "balance_loss_mlp": 1.01741862, + "epoch": 0.5055163084322862, + "flos": 30410233280640.0, + "grad_norm": 1.621747619133468, + "language_loss": 0.62447619, + "learning_rate": 2.0613305986157535e-06, + "loss": 0.64500201, + "num_input_tokens_seen": 180790295, + "step": 8408, + "time_per_iteration": 2.8581888675689697 + }, + { + "auxiliary_loss_clip": 0.01031968, + "auxiliary_loss_mlp": 0.01034607, + "balance_loss_clip": 1.02208591, + "balance_loss_mlp": 1.02095759, + "epoch": 0.5055764316849541, + "flos": 20259651139200.0, + "grad_norm": 1.8265479110448626, + "language_loss": 0.63650703, + "learning_rate": 2.0609413189341865e-06, + "loss": 0.6571728, + "num_input_tokens_seen": 180807875, + "step": 8409, + "time_per_iteration": 2.83050799369812 + }, + { + "auxiliary_loss_clip": 0.0104597, + "auxiliary_loss_mlp": 0.01027408, + "balance_loss_clip": 1.02598941, + "balance_loss_mlp": 1.01750183, + "epoch": 0.5056365549376222, + "flos": 26070895969920.0, + "grad_norm": 1.331511369285181, + "language_loss": 0.70563799, + "learning_rate": 2.0605520369417193e-06, + "loss": 0.72637177, + "num_input_tokens_seen": 180831300, + "step": 8410, + "time_per_iteration": 2.859144926071167 + }, + { + "auxiliary_loss_clip": 0.01041236, + "auxiliary_loss_mlp": 0.01038407, + "balance_loss_clip": 1.02472281, + "balance_loss_mlp": 1.02621806, + "epoch": 0.5056966781902901, + "flos": 19279074781440.0, + "grad_norm": 1.5815622077366167, + "language_loss": 0.79456377, + "learning_rate": 2.060162752653113e-06, + "loss": 0.81536019, + "num_input_tokens_seen": 180849055, + "step": 8411, + "time_per_iteration": 2.659881353378296 + }, + { + "auxiliary_loss_clip": 0.01069187, + "auxiliary_loss_mlp": 0.01036292, + "balance_loss_clip": 1.02708673, + "balance_loss_mlp": 1.02358449, + "epoch": 0.5057568014429581, + "flos": 21323325611520.0, + "grad_norm": 1.8254162711152502, + "language_loss": 0.81855536, + "learning_rate": 2.0597734660831285e-06, + "loss": 0.8396101, + "num_input_tokens_seen": 180867395, + "step": 8412, + "time_per_iteration": 2.6992721557617188 + }, + { + "auxiliary_loss_clip": 0.01046869, + "auxiliary_loss_mlp": 0.01034547, + "balance_loss_clip": 1.02719545, + "balance_loss_mlp": 1.02335286, + "epoch": 0.505816924695626, + "flos": 17493596507520.0, + "grad_norm": 1.756598022224202, + "language_loss": 0.80338252, + "learning_rate": 2.0593841772465283e-06, + "loss": 0.82419658, + "num_input_tokens_seen": 180886670, + "step": 8413, + "time_per_iteration": 2.7300171852111816 + }, + { + "auxiliary_loss_clip": 0.01038005, + "auxiliary_loss_mlp": 0.00747313, + "balance_loss_clip": 1.02537692, + "balance_loss_mlp": 1.00006127, + "epoch": 0.505877047948294, + "flos": 21142084561920.0, + "grad_norm": 1.8735746312836674, + "language_loss": 0.80719531, + "learning_rate": 2.0589948861580737e-06, + "loss": 0.82504851, + "num_input_tokens_seen": 180904645, + "step": 8414, + "time_per_iteration": 2.8053672313690186 + }, + { + "auxiliary_loss_clip": 0.0105249, + "auxiliary_loss_mlp": 0.01025567, + "balance_loss_clip": 1.02172709, + "balance_loss_mlp": 1.01381934, + "epoch": 0.5059371712009619, + "flos": 36350036887680.0, + "grad_norm": 2.1166577565796403, + "language_loss": 0.62206382, + "learning_rate": 2.058605592832528e-06, + "loss": 0.64284444, + "num_input_tokens_seen": 180922340, + "step": 8415, + "time_per_iteration": 2.8820807933807373 + }, + { + "auxiliary_loss_clip": 0.0103297, + "auxiliary_loss_mlp": 0.01026823, + "balance_loss_clip": 1.02295518, + "balance_loss_mlp": 1.0159626, + "epoch": 0.50599729445363, + "flos": 22673387220480.0, + "grad_norm": 1.9816376650242504, + "language_loss": 0.81864655, + "learning_rate": 2.0582162972846515e-06, + "loss": 0.83924443, + "num_input_tokens_seen": 180941350, + "step": 8416, + "time_per_iteration": 2.6505861282348633 + }, + { + "auxiliary_loss_clip": 0.01038334, + "auxiliary_loss_mlp": 0.01028361, + "balance_loss_clip": 1.0276792, + "balance_loss_mlp": 1.0180614, + "epoch": 0.5060574177062979, + "flos": 22747866071040.0, + "grad_norm": 1.6718187781287377, + "language_loss": 0.79308796, + "learning_rate": 2.0578269995292078e-06, + "loss": 0.81375492, + "num_input_tokens_seen": 180960720, + "step": 8417, + "time_per_iteration": 2.690913200378418 + }, + { + "auxiliary_loss_clip": 0.01017331, + "auxiliary_loss_mlp": 0.01031914, + "balance_loss_clip": 1.02281117, + "balance_loss_mlp": 1.02049375, + "epoch": 0.5061175409589659, + "flos": 21653201139840.0, + "grad_norm": 1.8332380747779553, + "language_loss": 0.63172388, + "learning_rate": 2.0574376995809588e-06, + "loss": 0.65221632, + "num_input_tokens_seen": 180979725, + "step": 8418, + "time_per_iteration": 2.773590564727783 + }, + { + "auxiliary_loss_clip": 0.01037496, + "auxiliary_loss_mlp": 0.01030282, + "balance_loss_clip": 1.02449381, + "balance_loss_mlp": 1.01921308, + "epoch": 0.5061776642116339, + "flos": 21616249023360.0, + "grad_norm": 1.7333602698613417, + "language_loss": 0.77628386, + "learning_rate": 2.0570483974546653e-06, + "loss": 0.79696161, + "num_input_tokens_seen": 180998980, + "step": 8419, + "time_per_iteration": 2.673013687133789 + }, + { + "auxiliary_loss_clip": 0.01009908, + "auxiliary_loss_mlp": 0.01028592, + "balance_loss_clip": 1.02291512, + "balance_loss_mlp": 1.01692724, + "epoch": 0.5062377874643018, + "flos": 24426294837120.0, + "grad_norm": 1.9570832502550553, + "language_loss": 0.77176899, + "learning_rate": 2.0566590931650917e-06, + "loss": 0.79215395, + "num_input_tokens_seen": 181019165, + "step": 8420, + "time_per_iteration": 2.7806332111358643 + }, + { + "auxiliary_loss_clip": 0.01068355, + "auxiliary_loss_mlp": 0.01032145, + "balance_loss_clip": 1.02661037, + "balance_loss_mlp": 1.02055216, + "epoch": 0.5062979107169698, + "flos": 22524429519360.0, + "grad_norm": 1.6264164127713594, + "language_loss": 0.76984119, + "learning_rate": 2.056269786726999e-06, + "loss": 0.79084611, + "num_input_tokens_seen": 181037110, + "step": 8421, + "time_per_iteration": 2.5930445194244385 + }, + { + "auxiliary_loss_clip": 0.01048416, + "auxiliary_loss_mlp": 0.01026897, + "balance_loss_clip": 1.022403, + "balance_loss_mlp": 1.01483321, + "epoch": 0.5063580339696377, + "flos": 24571984400640.0, + "grad_norm": 1.4473611288224462, + "language_loss": 0.66879803, + "learning_rate": 2.0558804781551512e-06, + "loss": 0.68955123, + "num_input_tokens_seen": 181057775, + "step": 8422, + "time_per_iteration": 2.6303677558898926 + }, + { + "auxiliary_loss_clip": 0.01067584, + "auxiliary_loss_mlp": 0.01028928, + "balance_loss_clip": 1.02641129, + "balance_loss_mlp": 1.01738286, + "epoch": 0.5064181572223058, + "flos": 22596143022720.0, + "grad_norm": 1.6042264562297863, + "language_loss": 0.81619501, + "learning_rate": 2.05549116746431e-06, + "loss": 0.83716011, + "num_input_tokens_seen": 181078260, + "step": 8423, + "time_per_iteration": 2.5860679149627686 + }, + { + "auxiliary_loss_clip": 0.01069404, + "auxiliary_loss_mlp": 0.00747527, + "balance_loss_clip": 1.0263871, + "balance_loss_mlp": 1.00006151, + "epoch": 0.5064782804749737, + "flos": 25994944661760.0, + "grad_norm": 2.047925997232747, + "language_loss": 0.74862677, + "learning_rate": 2.055101854669237e-06, + "loss": 0.76679605, + "num_input_tokens_seen": 181098755, + "step": 8424, + "time_per_iteration": 2.6548526287078857 + }, + { + "auxiliary_loss_clip": 0.01065367, + "auxiliary_loss_mlp": 0.01032071, + "balance_loss_clip": 1.02495384, + "balance_loss_mlp": 1.02048421, + "epoch": 0.5065384037276417, + "flos": 28553041503360.0, + "grad_norm": 2.0971525826245894, + "language_loss": 0.71371907, + "learning_rate": 2.0547125397846975e-06, + "loss": 0.73469347, + "num_input_tokens_seen": 181121570, + "step": 8425, + "time_per_iteration": 2.6152491569519043 + }, + { + "auxiliary_loss_clip": 0.01029756, + "auxiliary_loss_mlp": 0.0103933, + "balance_loss_clip": 1.02328002, + "balance_loss_mlp": 1.02642512, + "epoch": 0.5065985269803096, + "flos": 22966023323520.0, + "grad_norm": 2.257345845057677, + "language_loss": 0.78492463, + "learning_rate": 2.0543232228254524e-06, + "loss": 0.80561543, + "num_input_tokens_seen": 181140240, + "step": 8426, + "time_per_iteration": 2.6773085594177246 + }, + { + "auxiliary_loss_clip": 0.01060879, + "auxiliary_loss_mlp": 0.01032859, + "balance_loss_clip": 1.02843618, + "balance_loss_mlp": 1.02160585, + "epoch": 0.5066586502329776, + "flos": 21608563512960.0, + "grad_norm": 2.2150007600791404, + "language_loss": 0.77585304, + "learning_rate": 2.053933903806265e-06, + "loss": 0.79679048, + "num_input_tokens_seen": 181158630, + "step": 8427, + "time_per_iteration": 2.627142906188965 + }, + { + "auxiliary_loss_clip": 0.01064393, + "auxiliary_loss_mlp": 0.01022517, + "balance_loss_clip": 1.02495503, + "balance_loss_mlp": 1.0114069, + "epoch": 0.5067187734856455, + "flos": 20339912079360.0, + "grad_norm": 2.185543275246687, + "language_loss": 0.71671128, + "learning_rate": 2.0535445827418997e-06, + "loss": 0.73758042, + "num_input_tokens_seen": 181176405, + "step": 8428, + "time_per_iteration": 2.656208038330078 + }, + { + "auxiliary_loss_clip": 0.01049332, + "auxiliary_loss_mlp": 0.00747351, + "balance_loss_clip": 1.02408051, + "balance_loss_mlp": 1.000103, + "epoch": 0.5067788967383136, + "flos": 28841080665600.0, + "grad_norm": 1.8196920059163966, + "language_loss": 0.82962978, + "learning_rate": 2.0531552596471168e-06, + "loss": 0.84759659, + "num_input_tokens_seen": 181197595, + "step": 8429, + "time_per_iteration": 2.681204319000244 + }, + { + "auxiliary_loss_clip": 0.01041372, + "auxiliary_loss_mlp": 0.01035827, + "balance_loss_clip": 1.02746129, + "balance_loss_mlp": 1.02286315, + "epoch": 0.5068390199909815, + "flos": 32450174478720.0, + "grad_norm": 1.950543304070814, + "language_loss": 0.73381621, + "learning_rate": 2.052765934536682e-06, + "loss": 0.75458825, + "num_input_tokens_seen": 181218560, + "step": 8430, + "time_per_iteration": 2.8029377460479736 + }, + { + "auxiliary_loss_clip": 0.00999653, + "auxiliary_loss_mlp": 0.01041171, + "balance_loss_clip": 1.01908135, + "balance_loss_mlp": 1.0287075, + "epoch": 0.5068991432436495, + "flos": 23146582014720.0, + "grad_norm": 1.6523912170651043, + "language_loss": 0.7685324, + "learning_rate": 2.0523766074253575e-06, + "loss": 0.78894067, + "num_input_tokens_seen": 181237095, + "step": 8431, + "time_per_iteration": 2.7873432636260986 + }, + { + "auxiliary_loss_clip": 0.01048715, + "auxiliary_loss_mlp": 0.01028706, + "balance_loss_clip": 1.02307701, + "balance_loss_mlp": 1.01740503, + "epoch": 0.5069592664963174, + "flos": 19936096404480.0, + "grad_norm": 1.7587345312504639, + "language_loss": 0.71949089, + "learning_rate": 2.0519872783279074e-06, + "loss": 0.74026513, + "num_input_tokens_seen": 181255940, + "step": 8432, + "time_per_iteration": 2.6203315258026123 + }, + { + "auxiliary_loss_clip": 0.00975171, + "auxiliary_loss_mlp": 0.01019534, + "balance_loss_clip": 1.00731635, + "balance_loss_mlp": 1.01801956, + "epoch": 0.5070193897489854, + "flos": 65793771941760.0, + "grad_norm": 0.7679725066015006, + "language_loss": 0.63666308, + "learning_rate": 2.0515979472590945e-06, + "loss": 0.65661013, + "num_input_tokens_seen": 181316945, + "step": 8433, + "time_per_iteration": 3.3393192291259766 + }, + { + "auxiliary_loss_clip": 0.01035823, + "auxiliary_loss_mlp": 0.01036378, + "balance_loss_clip": 1.02463198, + "balance_loss_mlp": 1.02502894, + "epoch": 0.5070795130016534, + "flos": 17275331514240.0, + "grad_norm": 1.8255141891068665, + "language_loss": 0.77635384, + "learning_rate": 2.051208614233681e-06, + "loss": 0.79707587, + "num_input_tokens_seen": 181335555, + "step": 8434, + "time_per_iteration": 2.682410478591919 + }, + { + "auxiliary_loss_clip": 0.01046571, + "auxiliary_loss_mlp": 0.01029028, + "balance_loss_clip": 1.02494586, + "balance_loss_mlp": 1.01736975, + "epoch": 0.5071396362543213, + "flos": 21069940095360.0, + "grad_norm": 1.5529412515592644, + "language_loss": 0.7093662, + "learning_rate": 2.0508192792664326e-06, + "loss": 0.73012215, + "num_input_tokens_seen": 181354580, + "step": 8435, + "time_per_iteration": 2.648353338241577 + }, + { + "auxiliary_loss_clip": 0.01059838, + "auxiliary_loss_mlp": 0.01036219, + "balance_loss_clip": 1.02680504, + "balance_loss_mlp": 1.02419662, + "epoch": 0.5071997595069894, + "flos": 23144822248320.0, + "grad_norm": 1.8037180332560256, + "language_loss": 0.7204951, + "learning_rate": 2.050429942372112e-06, + "loss": 0.74145567, + "num_input_tokens_seen": 181374320, + "step": 8436, + "time_per_iteration": 2.6978087425231934 + }, + { + "auxiliary_loss_clip": 0.01068689, + "auxiliary_loss_mlp": 0.01025097, + "balance_loss_clip": 1.02680039, + "balance_loss_mlp": 1.01325953, + "epoch": 0.5072598827596573, + "flos": 22747183712640.0, + "grad_norm": 3.6932567515529016, + "language_loss": 0.83676314, + "learning_rate": 2.050040603565483e-06, + "loss": 0.857701, + "num_input_tokens_seen": 181392190, + "step": 8437, + "time_per_iteration": 2.6268954277038574 + }, + { + "auxiliary_loss_clip": 0.01056168, + "auxiliary_loss_mlp": 0.01025603, + "balance_loss_clip": 1.02588344, + "balance_loss_mlp": 1.0146302, + "epoch": 0.5073200060123253, + "flos": 22566301799040.0, + "grad_norm": 1.4871952785267524, + "language_loss": 0.80643761, + "learning_rate": 2.049651262861309e-06, + "loss": 0.82725531, + "num_input_tokens_seen": 181413890, + "step": 8438, + "time_per_iteration": 2.6242856979370117 + }, + { + "auxiliary_loss_clip": 0.01039669, + "auxiliary_loss_mlp": 0.0103352, + "balance_loss_clip": 1.02732003, + "balance_loss_mlp": 1.02035975, + "epoch": 0.5073801292649932, + "flos": 25806341324160.0, + "grad_norm": 1.4686454944725702, + "language_loss": 0.79759228, + "learning_rate": 2.0492619202743543e-06, + "loss": 0.81832415, + "num_input_tokens_seen": 181433240, + "step": 8439, + "time_per_iteration": 2.711876630783081 + }, + { + "auxiliary_loss_clip": 0.01040243, + "auxiliary_loss_mlp": 0.00747546, + "balance_loss_clip": 1.0248208, + "balance_loss_mlp": 1.00010896, + "epoch": 0.5074402525176612, + "flos": 25373941401600.0, + "grad_norm": 1.5260473362866882, + "language_loss": 0.70921469, + "learning_rate": 2.048872575819383e-06, + "loss": 0.72709262, + "num_input_tokens_seen": 181453535, + "step": 8440, + "time_per_iteration": 4.292896270751953 + }, + { + "auxiliary_loss_clip": 0.01041671, + "auxiliary_loss_mlp": 0.01032267, + "balance_loss_clip": 1.02545345, + "balance_loss_mlp": 1.02095389, + "epoch": 0.5075003757703291, + "flos": 26064431521920.0, + "grad_norm": 1.6173486267089006, + "language_loss": 0.71034729, + "learning_rate": 2.048483229511158e-06, + "loss": 0.73108667, + "num_input_tokens_seen": 181474195, + "step": 8441, + "time_per_iteration": 2.7354490756988525 + }, + { + "auxiliary_loss_clip": 0.01059635, + "auxiliary_loss_mlp": 0.0074747, + "balance_loss_clip": 1.02551138, + "balance_loss_mlp": 1.0000608, + "epoch": 0.5075604990229972, + "flos": 21835447770240.0, + "grad_norm": 1.5611616771303491, + "language_loss": 0.63944662, + "learning_rate": 2.0480938813644445e-06, + "loss": 0.65751767, + "num_input_tokens_seen": 181494000, + "step": 8442, + "time_per_iteration": 4.363938331604004 + }, + { + "auxiliary_loss_clip": 0.01026598, + "auxiliary_loss_mlp": 0.01025467, + "balance_loss_clip": 1.02700639, + "balance_loss_mlp": 1.01525688, + "epoch": 0.5076206222756651, + "flos": 31978703537280.0, + "grad_norm": 1.4078621917548773, + "language_loss": 0.71312058, + "learning_rate": 2.047704531394006e-06, + "loss": 0.73364127, + "num_input_tokens_seen": 181515955, + "step": 8443, + "time_per_iteration": 2.8415229320526123 + }, + { + "auxiliary_loss_clip": 0.00982384, + "auxiliary_loss_mlp": 0.01037511, + "balance_loss_clip": 1.01814795, + "balance_loss_mlp": 1.02434993, + "epoch": 0.5076807455283331, + "flos": 36904031326080.0, + "grad_norm": 1.4381503977176167, + "language_loss": 0.61941051, + "learning_rate": 2.047315179614607e-06, + "loss": 0.63960946, + "num_input_tokens_seen": 181540225, + "step": 8444, + "time_per_iteration": 3.042630672454834 + }, + { + "auxiliary_loss_clip": 0.01035486, + "auxiliary_loss_mlp": 0.01025065, + "balance_loss_clip": 1.0256052, + "balance_loss_mlp": 1.01424098, + "epoch": 0.507740868781001, + "flos": 29862415981440.0, + "grad_norm": 1.6605693019700833, + "language_loss": 0.63867688, + "learning_rate": 2.046925826041012e-06, + "loss": 0.65928233, + "num_input_tokens_seen": 181560125, + "step": 8445, + "time_per_iteration": 2.985295057296753 + }, + { + "auxiliary_loss_clip": 0.00983276, + "auxiliary_loss_mlp": 0.01007462, + "balance_loss_clip": 1.01365066, + "balance_loss_mlp": 1.00618684, + "epoch": 0.507800992033669, + "flos": 61918974247680.0, + "grad_norm": 0.8298845862347289, + "language_loss": 0.61836112, + "learning_rate": 2.0465364706879845e-06, + "loss": 0.63826847, + "num_input_tokens_seen": 181618830, + "step": 8446, + "time_per_iteration": 3.2527644634246826 + }, + { + "auxiliary_loss_clip": 0.01034729, + "auxiliary_loss_mlp": 0.01024338, + "balance_loss_clip": 1.02420318, + "balance_loss_mlp": 1.01390767, + "epoch": 0.507861115286337, + "flos": 20700490757760.0, + "grad_norm": 1.9033709703215496, + "language_loss": 0.81017548, + "learning_rate": 2.04614711357029e-06, + "loss": 0.83076608, + "num_input_tokens_seen": 181637120, + "step": 8447, + "time_per_iteration": 2.7603800296783447 + }, + { + "auxiliary_loss_clip": 0.01056441, + "auxiliary_loss_mlp": 0.01027736, + "balance_loss_clip": 1.02589917, + "balance_loss_mlp": 1.01698899, + "epoch": 0.507921238539005, + "flos": 30847050576000.0, + "grad_norm": 1.465179454076438, + "language_loss": 0.70484352, + "learning_rate": 2.0457577547026916e-06, + "loss": 0.7256853, + "num_input_tokens_seen": 181659965, + "step": 8448, + "time_per_iteration": 2.6954736709594727 + }, + { + "auxiliary_loss_clip": 0.01065932, + "auxiliary_loss_mlp": 0.00747269, + "balance_loss_clip": 1.02577686, + "balance_loss_mlp": 1.00006938, + "epoch": 0.507981361791673, + "flos": 35700197984640.0, + "grad_norm": 2.711432003504362, + "language_loss": 0.71961403, + "learning_rate": 2.045368394099955e-06, + "loss": 0.73774612, + "num_input_tokens_seen": 181685290, + "step": 8449, + "time_per_iteration": 2.7706048488616943 + }, + { + "auxiliary_loss_clip": 0.01042485, + "auxiliary_loss_mlp": 0.01028977, + "balance_loss_clip": 1.0229826, + "balance_loss_mlp": 1.01809287, + "epoch": 0.5080414850443409, + "flos": 27161466750720.0, + "grad_norm": 1.631141752300396, + "language_loss": 0.72578275, + "learning_rate": 2.044979031776844e-06, + "loss": 0.74649739, + "num_input_tokens_seen": 181706080, + "step": 8450, + "time_per_iteration": 2.7741897106170654 + }, + { + "auxiliary_loss_clip": 0.01067569, + "auxiliary_loss_mlp": 0.01028027, + "balance_loss_clip": 1.02584076, + "balance_loss_mlp": 1.01689303, + "epoch": 0.5081016082970089, + "flos": 27085192220160.0, + "grad_norm": 1.6494809777420287, + "language_loss": 0.77000481, + "learning_rate": 2.0445896677481234e-06, + "loss": 0.79096079, + "num_input_tokens_seen": 181724805, + "step": 8451, + "time_per_iteration": 2.8299436569213867 + }, + { + "auxiliary_loss_clip": 0.01066749, + "auxiliary_loss_mlp": 0.010323, + "balance_loss_clip": 1.02537143, + "balance_loss_mlp": 1.02171445, + "epoch": 0.5081617315496768, + "flos": 22856531690880.0, + "grad_norm": 1.7635647905736511, + "language_loss": 0.85213655, + "learning_rate": 2.044200302028559e-06, + "loss": 0.8731271, + "num_input_tokens_seen": 181743725, + "step": 8452, + "time_per_iteration": 4.291552543640137 + }, + { + "auxiliary_loss_clip": 0.0107281, + "auxiliary_loss_mlp": 0.01029597, + "balance_loss_clip": 1.02826834, + "balance_loss_mlp": 1.01770627, + "epoch": 0.5082218548023448, + "flos": 16281898087680.0, + "grad_norm": 2.505895526730994, + "language_loss": 0.77564085, + "learning_rate": 2.0438109346329143e-06, + "loss": 0.79666495, + "num_input_tokens_seen": 181757720, + "step": 8453, + "time_per_iteration": 2.5310537815093994 + }, + { + "auxiliary_loss_clip": 0.01034536, + "auxiliary_loss_mlp": 0.01030418, + "balance_loss_clip": 1.02513409, + "balance_loss_mlp": 1.02002311, + "epoch": 0.5082819780550127, + "flos": 24460768915200.0, + "grad_norm": 2.0621594234736933, + "language_loss": 0.7636503, + "learning_rate": 2.0434215655759544e-06, + "loss": 0.78429985, + "num_input_tokens_seen": 181778545, + "step": 8454, + "time_per_iteration": 4.346530437469482 + }, + { + "auxiliary_loss_clip": 0.01043969, + "auxiliary_loss_mlp": 0.01031094, + "balance_loss_clip": 1.02358365, + "balance_loss_mlp": 1.0198884, + "epoch": 0.5083421013076808, + "flos": 23403271582080.0, + "grad_norm": 1.5315733521412693, + "language_loss": 0.89222014, + "learning_rate": 2.0430321948724446e-06, + "loss": 0.91297078, + "num_input_tokens_seen": 181799495, + "step": 8455, + "time_per_iteration": 2.655975341796875 + }, + { + "auxiliary_loss_clip": 0.01052552, + "auxiliary_loss_mlp": 0.00747457, + "balance_loss_clip": 1.02675533, + "balance_loss_mlp": 1.00000083, + "epoch": 0.5084022245603487, + "flos": 23872695448320.0, + "grad_norm": 2.177579822631588, + "language_loss": 0.62383902, + "learning_rate": 2.042642822537149e-06, + "loss": 0.64183909, + "num_input_tokens_seen": 181818400, + "step": 8456, + "time_per_iteration": 2.691002130508423 + }, + { + "auxiliary_loss_clip": 0.01001086, + "auxiliary_loss_mlp": 0.01003991, + "balance_loss_clip": 1.00368893, + "balance_loss_mlp": 1.00273347, + "epoch": 0.5084623478130167, + "flos": 62873336655360.0, + "grad_norm": 0.8415778440046086, + "language_loss": 0.62428874, + "learning_rate": 2.0422534485848343e-06, + "loss": 0.6443395, + "num_input_tokens_seen": 181875975, + "step": 8457, + "time_per_iteration": 3.096306800842285 + }, + { + "auxiliary_loss_clip": 0.01059899, + "auxiliary_loss_mlp": 0.01030941, + "balance_loss_clip": 1.02734768, + "balance_loss_mlp": 1.01907957, + "epoch": 0.5085224710656846, + "flos": 22346133384960.0, + "grad_norm": 1.7931548813984215, + "language_loss": 0.67547047, + "learning_rate": 2.0418640730302644e-06, + "loss": 0.69637889, + "num_input_tokens_seen": 181896450, + "step": 8458, + "time_per_iteration": 2.5957205295562744 + }, + { + "auxiliary_loss_clip": 0.01058498, + "auxiliary_loss_mlp": 0.01031303, + "balance_loss_clip": 1.0256846, + "balance_loss_mlp": 1.01977587, + "epoch": 0.5085825943183526, + "flos": 26066263115520.0, + "grad_norm": 2.4978942652563116, + "language_loss": 0.78075224, + "learning_rate": 2.0414746958882043e-06, + "loss": 0.80165029, + "num_input_tokens_seen": 181916770, + "step": 8459, + "time_per_iteration": 2.7018563747406006 + }, + { + "auxiliary_loss_clip": 0.01075311, + "auxiliary_loss_mlp": 0.01029788, + "balance_loss_clip": 1.03022361, + "balance_loss_mlp": 1.01787281, + "epoch": 0.5086427175710206, + "flos": 17420733768960.0, + "grad_norm": 2.104652829873958, + "language_loss": 0.80705863, + "learning_rate": 2.0410853171734196e-06, + "loss": 0.82810962, + "num_input_tokens_seen": 181932710, + "step": 8460, + "time_per_iteration": 2.501845359802246 + }, + { + "auxiliary_loss_clip": 0.01045585, + "auxiliary_loss_mlp": 0.01033756, + "balance_loss_clip": 1.02564764, + "balance_loss_mlp": 1.02231765, + "epoch": 0.5087028408236886, + "flos": 20631758083200.0, + "grad_norm": 1.478594223360882, + "language_loss": 0.69006652, + "learning_rate": 2.0406959369006754e-06, + "loss": 0.71085989, + "num_input_tokens_seen": 181950665, + "step": 8461, + "time_per_iteration": 2.6107590198516846 + }, + { + "auxiliary_loss_clip": 0.0106706, + "auxiliary_loss_mlp": 0.01027424, + "balance_loss_clip": 1.02662563, + "balance_loss_mlp": 1.01624203, + "epoch": 0.5087629640763566, + "flos": 25593822506880.0, + "grad_norm": 1.6318035626740117, + "language_loss": 0.76036632, + "learning_rate": 2.0403065550847375e-06, + "loss": 0.78131109, + "num_input_tokens_seen": 181971270, + "step": 8462, + "time_per_iteration": 2.596052408218384 + }, + { + "auxiliary_loss_clip": 0.01029469, + "auxiliary_loss_mlp": 0.01036539, + "balance_loss_clip": 1.02352953, + "balance_loss_mlp": 1.02453446, + "epoch": 0.5088230873290245, + "flos": 13261631927040.0, + "grad_norm": 1.933421026897097, + "language_loss": 0.8117606, + "learning_rate": 2.0399171717403706e-06, + "loss": 0.83242071, + "num_input_tokens_seen": 181988410, + "step": 8463, + "time_per_iteration": 2.767578363418579 + }, + { + "auxiliary_loss_clip": 0.01051, + "auxiliary_loss_mlp": 0.01033599, + "balance_loss_clip": 1.02504063, + "balance_loss_mlp": 1.02256668, + "epoch": 0.5088832105816925, + "flos": 20043469134720.0, + "grad_norm": 3.5601406935076323, + "language_loss": 0.76373935, + "learning_rate": 2.039527786882341e-06, + "loss": 0.78458536, + "num_input_tokens_seen": 182006530, + "step": 8464, + "time_per_iteration": 2.6611921787261963 + }, + { + "auxiliary_loss_clip": 0.01003201, + "auxiliary_loss_mlp": 0.01002678, + "balance_loss_clip": 1.00563538, + "balance_loss_mlp": 1.00138462, + "epoch": 0.5089433338343604, + "flos": 67422179018880.0, + "grad_norm": 0.6826879346058184, + "language_loss": 0.59385967, + "learning_rate": 2.0391384005254133e-06, + "loss": 0.61391842, + "num_input_tokens_seen": 182074240, + "step": 8465, + "time_per_iteration": 3.2784762382507324 + }, + { + "auxiliary_loss_clip": 0.01067798, + "auxiliary_loss_mlp": 0.0103672, + "balance_loss_clip": 1.0265348, + "balance_loss_mlp": 1.02574646, + "epoch": 0.5090034570870284, + "flos": 22710339336960.0, + "grad_norm": 1.8635887202672918, + "language_loss": 0.79620266, + "learning_rate": 2.038749012684354e-06, + "loss": 0.81724781, + "num_input_tokens_seen": 182093360, + "step": 8466, + "time_per_iteration": 2.591674566268921 + }, + { + "auxiliary_loss_clip": 0.0105361, + "auxiliary_loss_mlp": 0.01027299, + "balance_loss_clip": 1.02291942, + "balance_loss_mlp": 1.01583076, + "epoch": 0.5090635803396963, + "flos": 20445812352000.0, + "grad_norm": 1.7770844610802525, + "language_loss": 0.78523624, + "learning_rate": 2.0383596233739286e-06, + "loss": 0.80604529, + "num_input_tokens_seen": 182110170, + "step": 8467, + "time_per_iteration": 2.619884967803955 + }, + { + "auxiliary_loss_clip": 0.01066944, + "auxiliary_loss_mlp": 0.01030599, + "balance_loss_clip": 1.02800989, + "balance_loss_mlp": 1.02003729, + "epoch": 0.5091237035923644, + "flos": 23768878164480.0, + "grad_norm": 1.6542711137560273, + "language_loss": 0.74394131, + "learning_rate": 2.0379702326089013e-06, + "loss": 0.76491672, + "num_input_tokens_seen": 182129570, + "step": 8468, + "time_per_iteration": 2.6853904724121094 + }, + { + "auxiliary_loss_clip": 0.01066649, + "auxiliary_loss_mlp": 0.01030748, + "balance_loss_clip": 1.02550793, + "balance_loss_mlp": 1.01988161, + "epoch": 0.5091838268450323, + "flos": 18327908684160.0, + "grad_norm": 1.6960029725746273, + "language_loss": 0.77515709, + "learning_rate": 2.03758084040404e-06, + "loss": 0.79613107, + "num_input_tokens_seen": 182147565, + "step": 8469, + "time_per_iteration": 2.5331578254699707 + }, + { + "auxiliary_loss_clip": 0.01051612, + "auxiliary_loss_mlp": 0.01033388, + "balance_loss_clip": 1.02686131, + "balance_loss_mlp": 1.02118731, + "epoch": 0.5092439500977003, + "flos": 29057621806080.0, + "grad_norm": 1.6163429773403724, + "language_loss": 0.69875467, + "learning_rate": 2.037191446774109e-06, + "loss": 0.71960473, + "num_input_tokens_seen": 182169695, + "step": 8470, + "time_per_iteration": 2.6900784969329834 + }, + { + "auxiliary_loss_clip": 0.01042281, + "auxiliary_loss_mlp": 0.01038413, + "balance_loss_clip": 1.02376413, + "balance_loss_mlp": 1.02539563, + "epoch": 0.5093040733503682, + "flos": 13553908894080.0, + "grad_norm": 2.006699117656226, + "language_loss": 0.73466522, + "learning_rate": 2.0368020517338745e-06, + "loss": 0.75547218, + "num_input_tokens_seen": 182186385, + "step": 8471, + "time_per_iteration": 2.5963776111602783 + }, + { + "auxiliary_loss_clip": 0.0100841, + "auxiliary_loss_mlp": 0.01001241, + "balance_loss_clip": 1.00088382, + "balance_loss_mlp": 1.00000703, + "epoch": 0.5093641966030362, + "flos": 68906617407360.0, + "grad_norm": 0.7534462590259677, + "language_loss": 0.58088565, + "learning_rate": 2.036412655298103e-06, + "loss": 0.60098219, + "num_input_tokens_seen": 182247095, + "step": 8472, + "time_per_iteration": 3.118809938430786 + }, + { + "auxiliary_loss_clip": 0.01014622, + "auxiliary_loss_mlp": 0.01037233, + "balance_loss_clip": 1.02213407, + "balance_loss_mlp": 1.0267005, + "epoch": 0.5094243198557042, + "flos": 21580948932480.0, + "grad_norm": 1.888947114819429, + "language_loss": 0.69161934, + "learning_rate": 2.03602325748156e-06, + "loss": 0.71213794, + "num_input_tokens_seen": 182266380, + "step": 8473, + "time_per_iteration": 2.6938493251800537 + }, + { + "auxiliary_loss_clip": 0.0104554, + "auxiliary_loss_mlp": 0.01033954, + "balance_loss_clip": 1.02514315, + "balance_loss_mlp": 1.0228796, + "epoch": 0.5094844431083722, + "flos": 28840721529600.0, + "grad_norm": 2.5612891462906417, + "language_loss": 0.85071653, + "learning_rate": 2.0356338582990105e-06, + "loss": 0.87151146, + "num_input_tokens_seen": 182284685, + "step": 8474, + "time_per_iteration": 2.727018117904663 + }, + { + "auxiliary_loss_clip": 0.01043217, + "auxiliary_loss_mlp": 0.01031503, + "balance_loss_clip": 1.02507758, + "balance_loss_mlp": 1.02046394, + "epoch": 0.5095445663610402, + "flos": 14976114969600.0, + "grad_norm": 2.050215707180038, + "language_loss": 0.64909101, + "learning_rate": 2.035244457765222e-06, + "loss": 0.66983819, + "num_input_tokens_seen": 182301810, + "step": 8475, + "time_per_iteration": 2.594437599182129 + }, + { + "auxiliary_loss_clip": 0.01048787, + "auxiliary_loss_mlp": 0.01038192, + "balance_loss_clip": 1.02520227, + "balance_loss_mlp": 1.02608597, + "epoch": 0.5096046896137081, + "flos": 20777088510720.0, + "grad_norm": 2.15330110877057, + "language_loss": 0.8215884, + "learning_rate": 2.0348550558949605e-06, + "loss": 0.84245819, + "num_input_tokens_seen": 182320285, + "step": 8476, + "time_per_iteration": 2.6618969440460205 + }, + { + "auxiliary_loss_clip": 0.0101954, + "auxiliary_loss_mlp": 0.01036623, + "balance_loss_clip": 1.02426887, + "balance_loss_mlp": 1.02126944, + "epoch": 0.5096648128663761, + "flos": 23185078416000.0, + "grad_norm": 2.0898739003250983, + "language_loss": 0.80252433, + "learning_rate": 2.0344656527029917e-06, + "loss": 0.82308602, + "num_input_tokens_seen": 182339465, + "step": 8477, + "time_per_iteration": 2.828819513320923 + }, + { + "auxiliary_loss_clip": 0.01043152, + "auxiliary_loss_mlp": 0.01026995, + "balance_loss_clip": 1.02487326, + "balance_loss_mlp": 1.01417387, + "epoch": 0.509724936119044, + "flos": 22309432663680.0, + "grad_norm": 1.7130922966844517, + "language_loss": 0.61610579, + "learning_rate": 2.034076248204082e-06, + "loss": 0.6368072, + "num_input_tokens_seen": 182358375, + "step": 8478, + "time_per_iteration": 2.8086419105529785 + }, + { + "auxiliary_loss_clip": 0.01058575, + "auxiliary_loss_mlp": 0.01037865, + "balance_loss_clip": 1.02654719, + "balance_loss_mlp": 1.0267725, + "epoch": 0.509785059371712, + "flos": 26287077974400.0, + "grad_norm": 1.5564916309789845, + "language_loss": 0.65624076, + "learning_rate": 2.0336868424129968e-06, + "loss": 0.67720515, + "num_input_tokens_seen": 182377935, + "step": 8479, + "time_per_iteration": 2.698413610458374 + }, + { + "auxiliary_loss_clip": 0.01056233, + "auxiliary_loss_mlp": 0.0103334, + "balance_loss_clip": 1.02559543, + "balance_loss_mlp": 1.022331, + "epoch": 0.50984518262438, + "flos": 22964586779520.0, + "grad_norm": 1.563930204859889, + "language_loss": 0.69488513, + "learning_rate": 2.0332974353445037e-06, + "loss": 0.71578085, + "num_input_tokens_seen": 182396440, + "step": 8480, + "time_per_iteration": 2.7458136081695557 + }, + { + "auxiliary_loss_clip": 0.01069736, + "auxiliary_loss_mlp": 0.01032543, + "balance_loss_clip": 1.02551293, + "balance_loss_mlp": 1.02078938, + "epoch": 0.509905305877048, + "flos": 26213389223040.0, + "grad_norm": 1.7207453316366808, + "language_loss": 0.79082489, + "learning_rate": 2.0329080270133688e-06, + "loss": 0.81184775, + "num_input_tokens_seen": 182415890, + "step": 8481, + "time_per_iteration": 2.634899616241455 + }, + { + "auxiliary_loss_clip": 0.01053404, + "auxiliary_loss_mlp": 0.01032251, + "balance_loss_clip": 1.02414155, + "balance_loss_mlp": 1.02112269, + "epoch": 0.5099654291297159, + "flos": 20340055733760.0, + "grad_norm": 1.7242543983057148, + "language_loss": 0.83569139, + "learning_rate": 2.0325186174343578e-06, + "loss": 0.85654795, + "num_input_tokens_seen": 182434235, + "step": 8482, + "time_per_iteration": 2.7183525562286377 + }, + { + "auxiliary_loss_clip": 0.01061699, + "auxiliary_loss_mlp": 0.00747493, + "balance_loss_clip": 1.02715397, + "balance_loss_mlp": 1.00001729, + "epoch": 0.5100255523823839, + "flos": 29054820545280.0, + "grad_norm": 1.685476552457665, + "language_loss": 0.85333312, + "learning_rate": 2.032129206622238e-06, + "loss": 0.87142503, + "num_input_tokens_seen": 182454360, + "step": 8483, + "time_per_iteration": 2.7405576705932617 + }, + { + "auxiliary_loss_clip": 0.0105662, + "auxiliary_loss_mlp": 0.01031096, + "balance_loss_clip": 1.02470422, + "balance_loss_mlp": 1.01980054, + "epoch": 0.5100856756350518, + "flos": 22455912326400.0, + "grad_norm": 1.8714222372819156, + "language_loss": 0.82706964, + "learning_rate": 2.031739794591775e-06, + "loss": 0.84794676, + "num_input_tokens_seen": 182471940, + "step": 8484, + "time_per_iteration": 2.5907649993896484 + }, + { + "auxiliary_loss_clip": 0.01041843, + "auxiliary_loss_mlp": 0.01030073, + "balance_loss_clip": 1.02405167, + "balance_loss_mlp": 1.01774704, + "epoch": 0.5101457988877198, + "flos": 19171055606400.0, + "grad_norm": 1.8741920243567363, + "language_loss": 0.81125188, + "learning_rate": 2.031350381357736e-06, + "loss": 0.83197105, + "num_input_tokens_seen": 182490685, + "step": 8485, + "time_per_iteration": 2.753547191619873 + }, + { + "auxiliary_loss_clip": 0.01038994, + "auxiliary_loss_mlp": 0.01028634, + "balance_loss_clip": 1.02259707, + "balance_loss_mlp": 1.01742244, + "epoch": 0.5102059221403878, + "flos": 14866371941760.0, + "grad_norm": 2.0370192966892886, + "language_loss": 0.73698139, + "learning_rate": 2.0309609669348874e-06, + "loss": 0.75765765, + "num_input_tokens_seen": 182508325, + "step": 8486, + "time_per_iteration": 2.8736045360565186 + }, + { + "auxiliary_loss_clip": 0.01028014, + "auxiliary_loss_mlp": 0.01033849, + "balance_loss_clip": 1.02415931, + "balance_loss_mlp": 1.02119446, + "epoch": 0.5102660453930558, + "flos": 22961103160320.0, + "grad_norm": 1.4788079695560472, + "language_loss": 0.69830394, + "learning_rate": 2.0305715513379953e-06, + "loss": 0.71892262, + "num_input_tokens_seen": 182527020, + "step": 8487, + "time_per_iteration": 4.328176021575928 + }, + { + "auxiliary_loss_clip": 0.01046274, + "auxiliary_loss_mlp": 0.01028229, + "balance_loss_clip": 1.0259831, + "balance_loss_mlp": 1.01611722, + "epoch": 0.5103261686457238, + "flos": 23149311448320.0, + "grad_norm": 2.9730682359546474, + "language_loss": 0.72658646, + "learning_rate": 2.030182134581827e-06, + "loss": 0.7473315, + "num_input_tokens_seen": 182543505, + "step": 8488, + "time_per_iteration": 2.6749534606933594 + }, + { + "auxiliary_loss_clip": 0.01035314, + "auxiliary_loss_mlp": 0.00747592, + "balance_loss_clip": 1.02690291, + "balance_loss_mlp": 1.00003874, + "epoch": 0.5103862918983917, + "flos": 14319237000960.0, + "grad_norm": 2.0881627790218524, + "language_loss": 0.69139194, + "learning_rate": 2.0297927166811503e-06, + "loss": 0.70922095, + "num_input_tokens_seen": 182562250, + "step": 8489, + "time_per_iteration": 2.739006996154785 + }, + { + "auxiliary_loss_clip": 0.01043117, + "auxiliary_loss_mlp": 0.01028876, + "balance_loss_clip": 1.02494049, + "balance_loss_mlp": 1.0174799, + "epoch": 0.5104464151510597, + "flos": 25848536826240.0, + "grad_norm": 2.1397592007018713, + "language_loss": 0.72137403, + "learning_rate": 2.0294032976507297e-06, + "loss": 0.74209398, + "num_input_tokens_seen": 182581910, + "step": 8490, + "time_per_iteration": 4.346932649612427 + }, + { + "auxiliary_loss_clip": 0.0104377, + "auxiliary_loss_mlp": 0.01025843, + "balance_loss_clip": 1.02456486, + "balance_loss_mlp": 1.01501298, + "epoch": 0.5105065384037276, + "flos": 21652913831040.0, + "grad_norm": 1.5800499821034193, + "language_loss": 0.80682492, + "learning_rate": 2.0290138775053337e-06, + "loss": 0.82752109, + "num_input_tokens_seen": 182601350, + "step": 8491, + "time_per_iteration": 2.7256100177764893 + }, + { + "auxiliary_loss_clip": 0.01053645, + "auxiliary_loss_mlp": 0.01026899, + "balance_loss_clip": 1.02395535, + "balance_loss_mlp": 1.0159018, + "epoch": 0.5105666616563956, + "flos": 22491571553280.0, + "grad_norm": 2.0543525852967117, + "language_loss": 0.78849435, + "learning_rate": 2.028624456259728e-06, + "loss": 0.80929983, + "num_input_tokens_seen": 182619660, + "step": 8492, + "time_per_iteration": 2.687044620513916 + }, + { + "auxiliary_loss_clip": 0.01036223, + "auxiliary_loss_mlp": 0.01036783, + "balance_loss_clip": 1.02494848, + "balance_loss_mlp": 1.02468967, + "epoch": 0.5106267849090635, + "flos": 22455768672000.0, + "grad_norm": 1.8921736350862843, + "language_loss": 0.7785604, + "learning_rate": 2.0282350339286804e-06, + "loss": 0.79929042, + "num_input_tokens_seen": 182639815, + "step": 8493, + "time_per_iteration": 2.7079219818115234 + }, + { + "auxiliary_loss_clip": 0.01031335, + "auxiliary_loss_mlp": 0.01029586, + "balance_loss_clip": 1.02531445, + "balance_loss_mlp": 1.01686001, + "epoch": 0.5106869081617316, + "flos": 23547093638400.0, + "grad_norm": 1.8651726463194347, + "language_loss": 0.83371747, + "learning_rate": 2.0278456105269574e-06, + "loss": 0.85432673, + "num_input_tokens_seen": 182659655, + "step": 8494, + "time_per_iteration": 2.738384246826172 + }, + { + "auxiliary_loss_clip": 0.01070754, + "auxiliary_loss_mlp": 0.01033773, + "balance_loss_clip": 1.02788115, + "balance_loss_mlp": 1.02271652, + "epoch": 0.5107470314143995, + "flos": 26792987080320.0, + "grad_norm": 2.2494253909541855, + "language_loss": 0.7949515, + "learning_rate": 2.027456186069326e-06, + "loss": 0.81599671, + "num_input_tokens_seen": 182677075, + "step": 8495, + "time_per_iteration": 2.696242570877075 + }, + { + "auxiliary_loss_clip": 0.01036299, + "auxiliary_loss_mlp": 0.01028827, + "balance_loss_clip": 1.02592897, + "balance_loss_mlp": 1.01679862, + "epoch": 0.5108071546670675, + "flos": 25739691638400.0, + "grad_norm": 4.915473605355669, + "language_loss": 0.78108978, + "learning_rate": 2.0270667605705535e-06, + "loss": 0.801741, + "num_input_tokens_seen": 182699625, + "step": 8496, + "time_per_iteration": 2.7902088165283203 + }, + { + "auxiliary_loss_clip": 0.01056922, + "auxiliary_loss_mlp": 0.01029187, + "balance_loss_clip": 1.02615047, + "balance_loss_mlp": 1.01798761, + "epoch": 0.5108672779197354, + "flos": 18697537589760.0, + "grad_norm": 2.3596547098828515, + "language_loss": 0.78435832, + "learning_rate": 2.0266773340454066e-06, + "loss": 0.80521941, + "num_input_tokens_seen": 182717020, + "step": 8497, + "time_per_iteration": 2.658341407775879 + }, + { + "auxiliary_loss_clip": 0.01066292, + "auxiliary_loss_mlp": 0.01032451, + "balance_loss_clip": 1.02557397, + "balance_loss_mlp": 1.02153754, + "epoch": 0.5109274011724034, + "flos": 26688164215680.0, + "grad_norm": 1.678297927727899, + "language_loss": 0.81946743, + "learning_rate": 2.0262879065086525e-06, + "loss": 0.84045482, + "num_input_tokens_seen": 182736955, + "step": 8498, + "time_per_iteration": 2.572594404220581 + }, + { + "auxiliary_loss_clip": 0.01030728, + "auxiliary_loss_mlp": 0.00747346, + "balance_loss_clip": 1.02253187, + "balance_loss_mlp": 1.00000143, + "epoch": 0.5109875244250714, + "flos": 22784028088320.0, + "grad_norm": 1.7430938819575779, + "language_loss": 0.70622081, + "learning_rate": 2.0258984779750584e-06, + "loss": 0.72400153, + "num_input_tokens_seen": 182757620, + "step": 8499, + "time_per_iteration": 4.273430109024048 + }, + { + "auxiliary_loss_clip": 0.01021025, + "auxiliary_loss_mlp": 0.01033263, + "balance_loss_clip": 1.02484965, + "balance_loss_mlp": 1.02043056, + "epoch": 0.5110476476777394, + "flos": 35588515622400.0, + "grad_norm": 1.4774550175590295, + "language_loss": 0.72278172, + "learning_rate": 2.0255090484593914e-06, + "loss": 0.74332464, + "num_input_tokens_seen": 182780195, + "step": 8500, + "time_per_iteration": 2.742093086242676 + }, + { + "auxiliary_loss_clip": 0.01063072, + "auxiliary_loss_mlp": 0.01033536, + "balance_loss_clip": 1.02697432, + "balance_loss_mlp": 1.02110219, + "epoch": 0.5111077709304074, + "flos": 19280798634240.0, + "grad_norm": 2.8752858719286087, + "language_loss": 0.62645924, + "learning_rate": 2.0251196179764183e-06, + "loss": 0.64742529, + "num_input_tokens_seen": 182795765, + "step": 8501, + "time_per_iteration": 4.196256637573242 + }, + { + "auxiliary_loss_clip": 0.0106838, + "auxiliary_loss_mlp": 0.01033046, + "balance_loss_clip": 1.02545309, + "balance_loss_mlp": 1.02106524, + "epoch": 0.5111678941830753, + "flos": 20668207409280.0, + "grad_norm": 1.7331745005264305, + "language_loss": 0.87735105, + "learning_rate": 2.024730186540907e-06, + "loss": 0.89836538, + "num_input_tokens_seen": 182813120, + "step": 8502, + "time_per_iteration": 2.6042871475219727 + }, + { + "auxiliary_loss_clip": 0.01055995, + "auxiliary_loss_mlp": 0.01032181, + "balance_loss_clip": 1.02491391, + "balance_loss_mlp": 1.02118373, + "epoch": 0.5112280174357433, + "flos": 26287903987200.0, + "grad_norm": 2.0026224247886706, + "language_loss": 0.82468677, + "learning_rate": 2.0243407541676253e-06, + "loss": 0.84556854, + "num_input_tokens_seen": 182835745, + "step": 8503, + "time_per_iteration": 2.747218132019043 + }, + { + "auxiliary_loss_clip": 0.00989745, + "auxiliary_loss_mlp": 0.01003885, + "balance_loss_clip": 1.00251508, + "balance_loss_mlp": 1.00284243, + "epoch": 0.5112881406884112, + "flos": 59474247707520.0, + "grad_norm": 0.8577019180786115, + "language_loss": 0.63908434, + "learning_rate": 2.023951320871339e-06, + "loss": 0.65902066, + "num_input_tokens_seen": 182892540, + "step": 8504, + "time_per_iteration": 3.4352269172668457 + }, + { + "auxiliary_loss_clip": 0.01032222, + "auxiliary_loss_mlp": 0.00747351, + "balance_loss_clip": 1.02363563, + "balance_loss_mlp": 0.99999386, + "epoch": 0.5113482639410792, + "flos": 26468857728000.0, + "grad_norm": 2.4157405350416643, + "language_loss": 0.84658182, + "learning_rate": 2.023561886666816e-06, + "loss": 0.8643775, + "num_input_tokens_seen": 182911515, + "step": 8505, + "time_per_iteration": 2.788938045501709 + }, + { + "auxiliary_loss_clip": 0.01057272, + "auxiliary_loss_mlp": 0.0102691, + "balance_loss_clip": 1.02698398, + "balance_loss_mlp": 1.01610327, + "epoch": 0.5114083871937471, + "flos": 29895848565120.0, + "grad_norm": 2.0077139991552486, + "language_loss": 0.75471854, + "learning_rate": 2.0231724515688246e-06, + "loss": 0.77556032, + "num_input_tokens_seen": 182930860, + "step": 8506, + "time_per_iteration": 2.6714673042297363 + }, + { + "auxiliary_loss_clip": 0.01069143, + "auxiliary_loss_mlp": 0.01030582, + "balance_loss_clip": 1.02664328, + "balance_loss_mlp": 1.01802933, + "epoch": 0.5114685104464152, + "flos": 24314576561280.0, + "grad_norm": 1.6453521813373644, + "language_loss": 0.57793009, + "learning_rate": 2.022783015592131e-06, + "loss": 0.59892738, + "num_input_tokens_seen": 182949960, + "step": 8507, + "time_per_iteration": 2.5795798301696777 + }, + { + "auxiliary_loss_clip": 0.01059809, + "auxiliary_loss_mlp": 0.01035929, + "balance_loss_clip": 1.02756977, + "balance_loss_mlp": 1.02334094, + "epoch": 0.5115286336990831, + "flos": 17019288391680.0, + "grad_norm": 1.7915302266247932, + "language_loss": 0.85637295, + "learning_rate": 2.022393578751503e-06, + "loss": 0.8773303, + "num_input_tokens_seen": 182968085, + "step": 8508, + "time_per_iteration": 2.5747828483581543 + }, + { + "auxiliary_loss_clip": 0.01039477, + "auxiliary_loss_mlp": 0.00747477, + "balance_loss_clip": 1.02584958, + "balance_loss_mlp": 1.00003302, + "epoch": 0.5115887569517511, + "flos": 23659386531840.0, + "grad_norm": 1.6929345618786558, + "language_loss": 0.72127879, + "learning_rate": 2.022004141061709e-06, + "loss": 0.73914826, + "num_input_tokens_seen": 182987275, + "step": 8509, + "time_per_iteration": 2.7228517532348633 + }, + { + "auxiliary_loss_clip": 0.01068046, + "auxiliary_loss_mlp": 0.00747498, + "balance_loss_clip": 1.02716208, + "balance_loss_mlp": 1.00003934, + "epoch": 0.511648880204419, + "flos": 16107193313280.0, + "grad_norm": 1.8720941166678209, + "language_loss": 0.7660566, + "learning_rate": 2.0216147025375153e-06, + "loss": 0.78421205, + "num_input_tokens_seen": 183004700, + "step": 8510, + "time_per_iteration": 2.5630669593811035 + }, + { + "auxiliary_loss_clip": 0.01069085, + "auxiliary_loss_mlp": 0.01035542, + "balance_loss_clip": 1.0284369, + "balance_loss_mlp": 1.02424121, + "epoch": 0.511709003457087, + "flos": 32634970974720.0, + "grad_norm": 1.6986546480535205, + "language_loss": 0.70847422, + "learning_rate": 2.0212252631936907e-06, + "loss": 0.7295205, + "num_input_tokens_seen": 183025830, + "step": 8511, + "time_per_iteration": 2.6388943195343018 + }, + { + "auxiliary_loss_clip": 0.01040872, + "auxiliary_loss_mlp": 0.01026771, + "balance_loss_clip": 1.02461874, + "balance_loss_mlp": 1.01565456, + "epoch": 0.511769126709755, + "flos": 21762082241280.0, + "grad_norm": 2.3372862931549547, + "language_loss": 0.66949713, + "learning_rate": 2.020835823045001e-06, + "loss": 0.69017357, + "num_input_tokens_seen": 183045140, + "step": 8512, + "time_per_iteration": 2.6450695991516113 + }, + { + "auxiliary_loss_clip": 0.01003514, + "auxiliary_loss_mlp": 0.01035631, + "balance_loss_clip": 1.02181053, + "balance_loss_mlp": 1.02222586, + "epoch": 0.511829249962423, + "flos": 23915357827200.0, + "grad_norm": 2.1209686083262556, + "language_loss": 0.66800642, + "learning_rate": 2.0204463821062146e-06, + "loss": 0.68839782, + "num_input_tokens_seen": 183063935, + "step": 8513, + "time_per_iteration": 2.7143747806549072 + }, + { + "auxiliary_loss_clip": 0.01040373, + "auxiliary_loss_mlp": 0.01031045, + "balance_loss_clip": 1.03017497, + "balance_loss_mlp": 1.01881409, + "epoch": 0.511889373215091, + "flos": 23727005884800.0, + "grad_norm": 1.997775735451068, + "language_loss": 0.69139892, + "learning_rate": 2.0200569403921e-06, + "loss": 0.71211314, + "num_input_tokens_seen": 183084135, + "step": 8514, + "time_per_iteration": 2.729886531829834 + }, + { + "auxiliary_loss_clip": 0.01065075, + "auxiliary_loss_mlp": 0.01024476, + "balance_loss_clip": 1.02583909, + "balance_loss_mlp": 1.01387262, + "epoch": 0.5119494964677589, + "flos": 28111519526400.0, + "grad_norm": 1.5155735033861042, + "language_loss": 0.65752983, + "learning_rate": 2.019667497917424e-06, + "loss": 0.67842531, + "num_input_tokens_seen": 183104570, + "step": 8515, + "time_per_iteration": 2.646793842315674 + }, + { + "auxiliary_loss_clip": 0.01056161, + "auxiliary_loss_mlp": 0.01028832, + "balance_loss_clip": 1.02527988, + "balance_loss_mlp": 1.01775718, + "epoch": 0.5120096197204269, + "flos": 24973214296320.0, + "grad_norm": 1.8581215448351183, + "language_loss": 0.75176656, + "learning_rate": 2.019278054696955e-06, + "loss": 0.77261651, + "num_input_tokens_seen": 183123850, + "step": 8516, + "time_per_iteration": 2.723261833190918 + }, + { + "auxiliary_loss_clip": 0.01043136, + "auxiliary_loss_mlp": 0.01032564, + "balance_loss_clip": 1.02573895, + "balance_loss_mlp": 1.02074409, + "epoch": 0.5120697429730948, + "flos": 17968012364160.0, + "grad_norm": 1.8485360725554425, + "language_loss": 0.78045297, + "learning_rate": 2.0188886107454595e-06, + "loss": 0.80121005, + "num_input_tokens_seen": 183141725, + "step": 8517, + "time_per_iteration": 2.7375619411468506 + }, + { + "auxiliary_loss_clip": 0.0105928, + "auxiliary_loss_mlp": 0.01029673, + "balance_loss_clip": 1.02631855, + "balance_loss_mlp": 1.01791918, + "epoch": 0.5121298662257628, + "flos": 23292343405440.0, + "grad_norm": 1.7072714114959675, + "language_loss": 0.73919362, + "learning_rate": 2.0184991660777063e-06, + "loss": 0.76008314, + "num_input_tokens_seen": 183161300, + "step": 8518, + "time_per_iteration": 2.5953142642974854 + }, + { + "auxiliary_loss_clip": 0.01057037, + "auxiliary_loss_mlp": 0.01032141, + "balance_loss_clip": 1.02624524, + "balance_loss_mlp": 1.02092969, + "epoch": 0.5121899894784308, + "flos": 17311062568320.0, + "grad_norm": 1.5714527385869468, + "language_loss": 0.78028214, + "learning_rate": 2.0181097207084625e-06, + "loss": 0.80117393, + "num_input_tokens_seen": 183180495, + "step": 8519, + "time_per_iteration": 2.6582813262939453 + }, + { + "auxiliary_loss_clip": 0.01068901, + "auxiliary_loss_mlp": 0.01032229, + "balance_loss_clip": 1.0269258, + "balance_loss_mlp": 1.02036738, + "epoch": 0.5122501127310988, + "flos": 24930085040640.0, + "grad_norm": 1.4833221792814355, + "language_loss": 0.79638994, + "learning_rate": 2.017720274652497e-06, + "loss": 0.81740129, + "num_input_tokens_seen": 183200330, + "step": 8520, + "time_per_iteration": 2.580300807952881 + }, + { + "auxiliary_loss_clip": 0.01051324, + "auxiliary_loss_mlp": 0.01036429, + "balance_loss_clip": 1.02654338, + "balance_loss_mlp": 1.02382886, + "epoch": 0.5123102359837667, + "flos": 18442859184000.0, + "grad_norm": 1.6243001126223273, + "language_loss": 0.81250966, + "learning_rate": 2.0173308279245765e-06, + "loss": 0.8333872, + "num_input_tokens_seen": 183218230, + "step": 8521, + "time_per_iteration": 2.6655311584472656 + }, + { + "auxiliary_loss_clip": 0.01054081, + "auxiliary_loss_mlp": 0.01024304, + "balance_loss_clip": 1.02208757, + "balance_loss_mlp": 1.01200724, + "epoch": 0.5123703592364347, + "flos": 26684860164480.0, + "grad_norm": 2.318163480324359, + "language_loss": 0.68627077, + "learning_rate": 2.0169413805394692e-06, + "loss": 0.70705462, + "num_input_tokens_seen": 183236735, + "step": 8522, + "time_per_iteration": 2.629812240600586 + }, + { + "auxiliary_loss_clip": 0.01042974, + "auxiliary_loss_mlp": 0.01034682, + "balance_loss_clip": 1.02761495, + "balance_loss_mlp": 1.02011466, + "epoch": 0.5124304824891026, + "flos": 28803948981120.0, + "grad_norm": 1.9259325295705698, + "language_loss": 0.61411369, + "learning_rate": 2.0165519325119433e-06, + "loss": 0.63489032, + "num_input_tokens_seen": 183257550, + "step": 8523, + "time_per_iteration": 2.677903413772583 + }, + { + "auxiliary_loss_clip": 0.01029791, + "auxiliary_loss_mlp": 0.01031364, + "balance_loss_clip": 1.02399898, + "balance_loss_mlp": 1.01984787, + "epoch": 0.5124906057417706, + "flos": 21761830846080.0, + "grad_norm": 2.2168975474092942, + "language_loss": 0.7761113, + "learning_rate": 2.0161624838567656e-06, + "loss": 0.79672283, + "num_input_tokens_seen": 183275515, + "step": 8524, + "time_per_iteration": 2.6887497901916504 + }, + { + "auxiliary_loss_clip": 0.01043253, + "auxiliary_loss_mlp": 0.01030344, + "balance_loss_clip": 1.02509642, + "balance_loss_mlp": 1.01924527, + "epoch": 0.5125507289944387, + "flos": 18880538405760.0, + "grad_norm": 1.8064107862877585, + "language_loss": 0.74593508, + "learning_rate": 2.015773034588706e-06, + "loss": 0.76667106, + "num_input_tokens_seen": 183293880, + "step": 8525, + "time_per_iteration": 2.612470865249634 + }, + { + "auxiliary_loss_clip": 0.01042608, + "auxiliary_loss_mlp": 0.01036377, + "balance_loss_clip": 1.0243814, + "balance_loss_mlp": 1.0226078, + "epoch": 0.5126108522471066, + "flos": 35627838036480.0, + "grad_norm": 1.5464608972879383, + "language_loss": 0.74229527, + "learning_rate": 2.015383584722531e-06, + "loss": 0.76308507, + "num_input_tokens_seen": 183315860, + "step": 8526, + "time_per_iteration": 2.736565589904785 + }, + { + "auxiliary_loss_clip": 0.01057543, + "auxiliary_loss_mlp": 0.01031951, + "balance_loss_clip": 1.02569139, + "balance_loss_mlp": 1.02034593, + "epoch": 0.5126709754997746, + "flos": 20190918464640.0, + "grad_norm": 2.4325642509932783, + "language_loss": 0.64966905, + "learning_rate": 2.0149941342730088e-06, + "loss": 0.67056394, + "num_input_tokens_seen": 183335480, + "step": 8527, + "time_per_iteration": 2.752798080444336 + }, + { + "auxiliary_loss_clip": 0.01046032, + "auxiliary_loss_mlp": 0.01036535, + "balance_loss_clip": 1.02708125, + "balance_loss_mlp": 1.02612257, + "epoch": 0.5127310987524425, + "flos": 18588548747520.0, + "grad_norm": 1.4370163585259264, + "language_loss": 0.74388558, + "learning_rate": 2.014604683254908e-06, + "loss": 0.76471126, + "num_input_tokens_seen": 183354395, + "step": 8528, + "time_per_iteration": 2.864306926727295 + }, + { + "auxiliary_loss_clip": 0.01055314, + "auxiliary_loss_mlp": 0.01031618, + "balance_loss_clip": 1.02506888, + "balance_loss_mlp": 1.02001286, + "epoch": 0.5127912220051105, + "flos": 22454691264000.0, + "grad_norm": 1.8694879209246569, + "language_loss": 0.82886684, + "learning_rate": 2.014215231682995e-06, + "loss": 0.84973615, + "num_input_tokens_seen": 183372980, + "step": 8529, + "time_per_iteration": 2.649977684020996 + }, + { + "auxiliary_loss_clip": 0.01017851, + "auxiliary_loss_mlp": 0.01028789, + "balance_loss_clip": 1.02318501, + "balance_loss_mlp": 1.01746988, + "epoch": 0.5128513452577784, + "flos": 19093703667840.0, + "grad_norm": 1.6893963881313798, + "language_loss": 0.73912752, + "learning_rate": 2.01382577957204e-06, + "loss": 0.75959384, + "num_input_tokens_seen": 183390160, + "step": 8530, + "time_per_iteration": 2.732952356338501 + }, + { + "auxiliary_loss_clip": 0.00979819, + "auxiliary_loss_mlp": 0.01008216, + "balance_loss_clip": 1.00273657, + "balance_loss_mlp": 1.00694597, + "epoch": 0.5129114685104464, + "flos": 67892285243520.0, + "grad_norm": 0.745430346625383, + "language_loss": 0.60833889, + "learning_rate": 2.0134363269368095e-06, + "loss": 0.62821925, + "num_input_tokens_seen": 183455280, + "step": 8531, + "time_per_iteration": 3.3717682361602783 + }, + { + "auxiliary_loss_clip": 0.01041097, + "auxiliary_loss_mlp": 0.0103105, + "balance_loss_clip": 1.02483463, + "balance_loss_mlp": 1.01827621, + "epoch": 0.5129715917631144, + "flos": 20449152316800.0, + "grad_norm": 1.6345215851689727, + "language_loss": 0.76822972, + "learning_rate": 2.0130468737920725e-06, + "loss": 0.78895122, + "num_input_tokens_seen": 183473955, + "step": 8532, + "time_per_iteration": 2.7903435230255127 + }, + { + "auxiliary_loss_clip": 0.01044194, + "auxiliary_loss_mlp": 0.01032389, + "balance_loss_clip": 1.0238328, + "balance_loss_mlp": 1.02092719, + "epoch": 0.5130317150157824, + "flos": 35116146840960.0, + "grad_norm": 2.6284417279648724, + "language_loss": 0.67483175, + "learning_rate": 2.012657420152597e-06, + "loss": 0.69559759, + "num_input_tokens_seen": 183497195, + "step": 8533, + "time_per_iteration": 2.8619892597198486 + }, + { + "auxiliary_loss_clip": 0.01038985, + "auxiliary_loss_mlp": 0.01031929, + "balance_loss_clip": 1.02648997, + "balance_loss_mlp": 1.02017474, + "epoch": 0.5130918382684503, + "flos": 19791627903360.0, + "grad_norm": 1.8775172585256341, + "language_loss": 0.82293421, + "learning_rate": 2.01226796603315e-06, + "loss": 0.84364337, + "num_input_tokens_seen": 183513675, + "step": 8534, + "time_per_iteration": 4.459514617919922 + }, + { + "auxiliary_loss_clip": 0.01053511, + "auxiliary_loss_mlp": 0.01037955, + "balance_loss_clip": 1.02390909, + "balance_loss_mlp": 1.02537858, + "epoch": 0.5131519615211183, + "flos": 26323096337280.0, + "grad_norm": 2.01013696406672, + "language_loss": 0.63775283, + "learning_rate": 2.0118785114485017e-06, + "loss": 0.65866745, + "num_input_tokens_seen": 183535165, + "step": 8535, + "time_per_iteration": 2.7873218059539795 + }, + { + "auxiliary_loss_clip": 0.01059686, + "auxiliary_loss_mlp": 0.01027139, + "balance_loss_clip": 1.02756095, + "balance_loss_mlp": 1.01559901, + "epoch": 0.5132120847737862, + "flos": 19171917532800.0, + "grad_norm": 1.556790813307222, + "language_loss": 0.69583112, + "learning_rate": 2.011489056413418e-06, + "loss": 0.71669936, + "num_input_tokens_seen": 183553780, + "step": 8536, + "time_per_iteration": 2.6206610202789307 + }, + { + "auxiliary_loss_clip": 0.01058035, + "auxiliary_loss_mlp": 0.01028924, + "balance_loss_clip": 1.02535152, + "balance_loss_mlp": 1.01657343, + "epoch": 0.5132722080264542, + "flos": 20230420446720.0, + "grad_norm": 2.5459306046791403, + "language_loss": 0.71519661, + "learning_rate": 2.011099600942669e-06, + "loss": 0.73606622, + "num_input_tokens_seen": 183572285, + "step": 8537, + "time_per_iteration": 4.187178373336792 + }, + { + "auxiliary_loss_clip": 0.0102358, + "auxiliary_loss_mlp": 0.01031328, + "balance_loss_clip": 1.02411294, + "balance_loss_mlp": 1.01913857, + "epoch": 0.5133323312791223, + "flos": 16469459930880.0, + "grad_norm": 2.0576080617620782, + "language_loss": 0.79808855, + "learning_rate": 2.0107101450510214e-06, + "loss": 0.81863755, + "num_input_tokens_seen": 183589330, + "step": 8538, + "time_per_iteration": 2.6458451747894287 + }, + { + "auxiliary_loss_clip": 0.01055814, + "auxiliary_loss_mlp": 0.01028413, + "balance_loss_clip": 1.02436876, + "balance_loss_mlp": 1.01689112, + "epoch": 0.5133924545317902, + "flos": 26068094709120.0, + "grad_norm": 3.1306816743521932, + "language_loss": 0.78273177, + "learning_rate": 2.0103206887532437e-06, + "loss": 0.80357403, + "num_input_tokens_seen": 183609205, + "step": 8539, + "time_per_iteration": 2.588362455368042 + }, + { + "auxiliary_loss_clip": 0.01041904, + "auxiliary_loss_mlp": 0.01031386, + "balance_loss_clip": 1.02354252, + "balance_loss_mlp": 1.0198524, + "epoch": 0.5134525777844582, + "flos": 29131023248640.0, + "grad_norm": 1.7915608416421565, + "language_loss": 0.75927395, + "learning_rate": 2.009931232064105e-06, + "loss": 0.78000689, + "num_input_tokens_seen": 183629985, + "step": 8540, + "time_per_iteration": 2.681896209716797 + }, + { + "auxiliary_loss_clip": 0.01029868, + "auxiliary_loss_mlp": 0.01031345, + "balance_loss_clip": 1.0264138, + "balance_loss_mlp": 1.01894736, + "epoch": 0.5135127010371261, + "flos": 17454776883840.0, + "grad_norm": 2.1465961346706837, + "language_loss": 0.7493211, + "learning_rate": 2.0095417749983724e-06, + "loss": 0.76993334, + "num_input_tokens_seen": 183648220, + "step": 8541, + "time_per_iteration": 2.7803640365600586 + }, + { + "auxiliary_loss_clip": 0.0101105, + "auxiliary_loss_mlp": 0.01033602, + "balance_loss_clip": 1.02578449, + "balance_loss_mlp": 1.02221131, + "epoch": 0.5135728242897941, + "flos": 21944975316480.0, + "grad_norm": 1.9325539309406163, + "language_loss": 0.70150471, + "learning_rate": 2.0091523175708162e-06, + "loss": 0.72195125, + "num_input_tokens_seen": 183668230, + "step": 8542, + "time_per_iteration": 2.7096920013427734 + }, + { + "auxiliary_loss_clip": 0.01047019, + "auxiliary_loss_mlp": 0.01029164, + "balance_loss_clip": 1.02589822, + "balance_loss_mlp": 1.01806593, + "epoch": 0.513632947542462, + "flos": 22674859678080.0, + "grad_norm": 1.8241849309908904, + "language_loss": 0.79353046, + "learning_rate": 2.0087628597962023e-06, + "loss": 0.81429225, + "num_input_tokens_seen": 183687800, + "step": 8543, + "time_per_iteration": 2.6121468544006348 + }, + { + "auxiliary_loss_clip": 0.01049993, + "auxiliary_loss_mlp": 0.01034567, + "balance_loss_clip": 1.02821875, + "balance_loss_mlp": 1.02226496, + "epoch": 0.51369307079513, + "flos": 29457163762560.0, + "grad_norm": 1.727248965778977, + "language_loss": 0.67669821, + "learning_rate": 2.008373401689299e-06, + "loss": 0.69754386, + "num_input_tokens_seen": 183709025, + "step": 8544, + "time_per_iteration": 2.6861093044281006 + }, + { + "auxiliary_loss_clip": 0.01028044, + "auxiliary_loss_mlp": 0.01040688, + "balance_loss_clip": 1.0216856, + "balance_loss_mlp": 1.02831984, + "epoch": 0.513753194047798, + "flos": 18989347680000.0, + "grad_norm": 2.3057449088129767, + "language_loss": 0.71840978, + "learning_rate": 2.0079839432648765e-06, + "loss": 0.73909712, + "num_input_tokens_seen": 183725740, + "step": 8545, + "time_per_iteration": 2.6577415466308594 + }, + { + "auxiliary_loss_clip": 0.01060036, + "auxiliary_loss_mlp": 0.01036928, + "balance_loss_clip": 1.02663839, + "balance_loss_mlp": 1.02429748, + "epoch": 0.513813317300466, + "flos": 17821855923840.0, + "grad_norm": 2.0670337327653154, + "language_loss": 0.81826103, + "learning_rate": 2.0075944845377016e-06, + "loss": 0.83923066, + "num_input_tokens_seen": 183743995, + "step": 8546, + "time_per_iteration": 4.13478946685791 + }, + { + "auxiliary_loss_clip": 0.0105372, + "auxiliary_loss_mlp": 0.01032359, + "balance_loss_clip": 1.02477264, + "balance_loss_mlp": 1.01984799, + "epoch": 0.5138734405531339, + "flos": 24061191045120.0, + "grad_norm": 1.774116153786287, + "language_loss": 0.73249251, + "learning_rate": 2.007205025522544e-06, + "loss": 0.75335324, + "num_input_tokens_seen": 183764150, + "step": 8547, + "time_per_iteration": 2.5781214237213135 + }, + { + "auxiliary_loss_clip": 0.01054494, + "auxiliary_loss_mlp": 0.01040405, + "balance_loss_clip": 1.02323794, + "balance_loss_mlp": 1.02878785, + "epoch": 0.5139335638058019, + "flos": 26097253574400.0, + "grad_norm": 1.625802100908703, + "language_loss": 0.73196918, + "learning_rate": 2.0068155662341702e-06, + "loss": 0.75291812, + "num_input_tokens_seen": 183783280, + "step": 8548, + "time_per_iteration": 2.601386547088623 + }, + { + "auxiliary_loss_clip": 0.01028567, + "auxiliary_loss_mlp": 0.010316, + "balance_loss_clip": 1.02291036, + "balance_loss_mlp": 1.01941681, + "epoch": 0.5139936870584698, + "flos": 18917095472640.0, + "grad_norm": 1.6895222835889396, + "language_loss": 0.82425612, + "learning_rate": 2.0064261066873495e-06, + "loss": 0.84485781, + "num_input_tokens_seen": 183800725, + "step": 8549, + "time_per_iteration": 4.252683877944946 + }, + { + "auxiliary_loss_clip": 0.01057815, + "auxiliary_loss_mlp": 0.0102613, + "balance_loss_clip": 1.02698267, + "balance_loss_mlp": 1.01543069, + "epoch": 0.5140538103111378, + "flos": 16144001775360.0, + "grad_norm": 1.8073905221294833, + "language_loss": 0.71900439, + "learning_rate": 2.0060366468968504e-06, + "loss": 0.73984385, + "num_input_tokens_seen": 183818735, + "step": 8550, + "time_per_iteration": 2.597665786743164 + }, + { + "auxiliary_loss_clip": 0.01061753, + "auxiliary_loss_mlp": 0.0103309, + "balance_loss_clip": 1.02788877, + "balance_loss_mlp": 1.02117515, + "epoch": 0.5141139335638057, + "flos": 22420145358720.0, + "grad_norm": 1.5524610696818997, + "language_loss": 0.75146639, + "learning_rate": 2.0056471868774408e-06, + "loss": 0.77241486, + "num_input_tokens_seen": 183840015, + "step": 8551, + "time_per_iteration": 2.687228202819824 + }, + { + "auxiliary_loss_clip": 0.01040191, + "auxiliary_loss_mlp": 0.0102651, + "balance_loss_clip": 1.02626634, + "balance_loss_mlp": 1.01504803, + "epoch": 0.5141740568164738, + "flos": 27089645506560.0, + "grad_norm": 1.6140225323138024, + "language_loss": 0.68916506, + "learning_rate": 2.0052577266438897e-06, + "loss": 0.70983207, + "num_input_tokens_seen": 183860145, + "step": 8552, + "time_per_iteration": 2.782072067260742 + }, + { + "auxiliary_loss_clip": 0.0105827, + "auxiliary_loss_mlp": 0.01031394, + "balance_loss_clip": 1.02568913, + "balance_loss_mlp": 1.01972961, + "epoch": 0.5142341800691418, + "flos": 24973250209920.0, + "grad_norm": 1.8584619269372757, + "language_loss": 0.7434116, + "learning_rate": 2.004868266210965e-06, + "loss": 0.76430821, + "num_input_tokens_seen": 183880540, + "step": 8553, + "time_per_iteration": 2.6704394817352295 + }, + { + "auxiliary_loss_clip": 0.010679, + "auxiliary_loss_mlp": 0.01033032, + "balance_loss_clip": 1.02677035, + "balance_loss_mlp": 1.02204669, + "epoch": 0.5142943033218097, + "flos": 20704513080960.0, + "grad_norm": 1.6758881338786795, + "language_loss": 0.67986429, + "learning_rate": 2.004478805593435e-06, + "loss": 0.70087361, + "num_input_tokens_seen": 183900895, + "step": 8554, + "time_per_iteration": 2.536834239959717 + }, + { + "auxiliary_loss_clip": 0.01057329, + "auxiliary_loss_mlp": 0.01034162, + "balance_loss_clip": 1.02381492, + "balance_loss_mlp": 1.02050674, + "epoch": 0.5143544265744777, + "flos": 22925479847040.0, + "grad_norm": 1.8897934320854202, + "language_loss": 0.73183417, + "learning_rate": 2.004089344806068e-06, + "loss": 0.75274909, + "num_input_tokens_seen": 183920335, + "step": 8555, + "time_per_iteration": 2.6756675243377686 + }, + { + "auxiliary_loss_clip": 0.010425, + "auxiliary_loss_mlp": 0.01032646, + "balance_loss_clip": 1.03058434, + "balance_loss_mlp": 1.02119637, + "epoch": 0.5144145498271456, + "flos": 15921391236480.0, + "grad_norm": 2.363424159011617, + "language_loss": 0.74657577, + "learning_rate": 2.003699883863633e-06, + "loss": 0.76732725, + "num_input_tokens_seen": 183936220, + "step": 8556, + "time_per_iteration": 2.6705482006073 + }, + { + "auxiliary_loss_clip": 0.01040315, + "auxiliary_loss_mlp": 0.01029163, + "balance_loss_clip": 1.02854657, + "balance_loss_mlp": 1.01837492, + "epoch": 0.5144746730798136, + "flos": 19681238430720.0, + "grad_norm": 2.0054271266851798, + "language_loss": 0.86483562, + "learning_rate": 2.003310422780898e-06, + "loss": 0.88553035, + "num_input_tokens_seen": 183953250, + "step": 8557, + "time_per_iteration": 2.673468589782715 + }, + { + "auxiliary_loss_clip": 0.01047307, + "auxiliary_loss_mlp": 0.0103419, + "balance_loss_clip": 1.02236855, + "balance_loss_mlp": 1.02243638, + "epoch": 0.5145347963324816, + "flos": 23914711382400.0, + "grad_norm": 1.6415793293951624, + "language_loss": 0.88698357, + "learning_rate": 2.0029209615726307e-06, + "loss": 0.90779853, + "num_input_tokens_seen": 183973865, + "step": 8558, + "time_per_iteration": 2.7037594318389893 + }, + { + "auxiliary_loss_clip": 0.01065283, + "auxiliary_loss_mlp": 0.00747306, + "balance_loss_clip": 1.02537417, + "balance_loss_mlp": 0.9999581, + "epoch": 0.5145949195851496, + "flos": 18260002022400.0, + "grad_norm": 1.9588483615672942, + "language_loss": 0.649508, + "learning_rate": 2.002531500253602e-06, + "loss": 0.66763389, + "num_input_tokens_seen": 183992555, + "step": 8559, + "time_per_iteration": 2.602691888809204 + }, + { + "auxiliary_loss_clip": 0.01048076, + "auxiliary_loss_mlp": 0.00747345, + "balance_loss_clip": 1.02439356, + "balance_loss_mlp": 0.9999963, + "epoch": 0.5146550428378175, + "flos": 26213425136640.0, + "grad_norm": 1.596706600998947, + "language_loss": 0.63176024, + "learning_rate": 2.002142038838577e-06, + "loss": 0.64971441, + "num_input_tokens_seen": 184010825, + "step": 8560, + "time_per_iteration": 2.5809226036071777 + }, + { + "auxiliary_loss_clip": 0.01065694, + "auxiliary_loss_mlp": 0.01024187, + "balance_loss_clip": 1.02551532, + "balance_loss_mlp": 1.01322007, + "epoch": 0.5147151660904855, + "flos": 22674177319680.0, + "grad_norm": 1.7514625839355882, + "language_loss": 0.70228142, + "learning_rate": 2.0017525773423265e-06, + "loss": 0.72318029, + "num_input_tokens_seen": 184030155, + "step": 8561, + "time_per_iteration": 2.5295674800872803 + }, + { + "auxiliary_loss_clip": 0.01041119, + "auxiliary_loss_mlp": 0.01027126, + "balance_loss_clip": 1.02373278, + "balance_loss_mlp": 1.01639128, + "epoch": 0.5147752893431534, + "flos": 24972388283520.0, + "grad_norm": 1.5602984172115266, + "language_loss": 0.66804171, + "learning_rate": 2.0013631157796177e-06, + "loss": 0.68872416, + "num_input_tokens_seen": 184051440, + "step": 8562, + "time_per_iteration": 2.603649616241455 + }, + { + "auxiliary_loss_clip": 0.01060141, + "auxiliary_loss_mlp": 0.01029717, + "balance_loss_clip": 1.02646279, + "balance_loss_mlp": 1.01848769, + "epoch": 0.5148354125958214, + "flos": 22744669760640.0, + "grad_norm": 1.593682917521259, + "language_loss": 0.77667898, + "learning_rate": 2.0009736541652188e-06, + "loss": 0.79757756, + "num_input_tokens_seen": 184070205, + "step": 8563, + "time_per_iteration": 2.6239078044891357 + }, + { + "auxiliary_loss_clip": 0.01061017, + "auxiliary_loss_mlp": 0.01028065, + "balance_loss_clip": 1.02704811, + "balance_loss_mlp": 1.01491022, + "epoch": 0.5148955358484893, + "flos": 23068763199360.0, + "grad_norm": 1.8902504184836133, + "language_loss": 0.82603252, + "learning_rate": 2.0005841925139e-06, + "loss": 0.84692329, + "num_input_tokens_seen": 184087345, + "step": 8564, + "time_per_iteration": 2.6140835285186768 + }, + { + "auxiliary_loss_clip": 0.01051812, + "auxiliary_loss_mlp": 0.01030601, + "balance_loss_clip": 1.02691472, + "balance_loss_mlp": 1.01850152, + "epoch": 0.5149556591011574, + "flos": 20340127560960.0, + "grad_norm": 1.8312630006968458, + "language_loss": 0.73112774, + "learning_rate": 2.0001947308404283e-06, + "loss": 0.75195181, + "num_input_tokens_seen": 184107110, + "step": 8565, + "time_per_iteration": 2.6679134368896484 + }, + { + "auxiliary_loss_clip": 0.01055143, + "auxiliary_loss_mlp": 0.01029893, + "balance_loss_clip": 1.02482104, + "balance_loss_mlp": 1.01702404, + "epoch": 0.5150157823538254, + "flos": 22638230784000.0, + "grad_norm": 2.351760201152157, + "language_loss": 0.68406504, + "learning_rate": 1.9998052691595715e-06, + "loss": 0.7049154, + "num_input_tokens_seen": 184127105, + "step": 8566, + "time_per_iteration": 2.786147117614746 + }, + { + "auxiliary_loss_clip": 0.01068471, + "auxiliary_loss_mlp": 0.00747378, + "balance_loss_clip": 1.02469277, + "balance_loss_mlp": 0.99999392, + "epoch": 0.5150759056064933, + "flos": 26067627832320.0, + "grad_norm": 1.625032878589863, + "language_loss": 0.78106117, + "learning_rate": 1.9994158074861005e-06, + "loss": 0.79921973, + "num_input_tokens_seen": 184148060, + "step": 8567, + "time_per_iteration": 2.646305561065674 + }, + { + "auxiliary_loss_clip": 0.01060351, + "auxiliary_loss_mlp": 0.01028371, + "balance_loss_clip": 1.0269506, + "balance_loss_mlp": 1.01628363, + "epoch": 0.5151360288591613, + "flos": 25952641418880.0, + "grad_norm": 2.048186779189923, + "language_loss": 0.78974688, + "learning_rate": 1.9990263458347806e-06, + "loss": 0.81063408, + "num_input_tokens_seen": 184166175, + "step": 8568, + "time_per_iteration": 2.6176085472106934 + }, + { + "auxiliary_loss_clip": 0.01046981, + "auxiliary_loss_mlp": 0.01028646, + "balance_loss_clip": 1.02494502, + "balance_loss_mlp": 1.01756573, + "epoch": 0.5151961521118292, + "flos": 18507246312960.0, + "grad_norm": 2.8562446210536576, + "language_loss": 0.91118753, + "learning_rate": 1.9986368842203825e-06, + "loss": 0.93194377, + "num_input_tokens_seen": 184182600, + "step": 8569, + "time_per_iteration": 2.710836172103882 + }, + { + "auxiliary_loss_clip": 0.01069069, + "auxiliary_loss_mlp": 0.0102755, + "balance_loss_clip": 1.02589667, + "balance_loss_mlp": 1.01604033, + "epoch": 0.5152562753644973, + "flos": 22233696837120.0, + "grad_norm": 1.7016927728352191, + "language_loss": 0.76559484, + "learning_rate": 1.998247422657674e-06, + "loss": 0.78656101, + "num_input_tokens_seen": 184202020, + "step": 8570, + "time_per_iteration": 2.6213924884796143 + }, + { + "auxiliary_loss_clip": 0.01057873, + "auxiliary_loss_mlp": 0.01034634, + "balance_loss_clip": 1.02528143, + "balance_loss_mlp": 1.0216043, + "epoch": 0.5153163986171652, + "flos": 38436555047040.0, + "grad_norm": 1.6608584979049732, + "language_loss": 0.73739636, + "learning_rate": 1.9978579611614227e-06, + "loss": 0.7583214, + "num_input_tokens_seen": 184224850, + "step": 8571, + "time_per_iteration": 2.705585479736328 + }, + { + "auxiliary_loss_clip": 0.00991182, + "auxiliary_loss_mlp": 0.01001204, + "balance_loss_clip": 1.00366092, + "balance_loss_mlp": 1.00007105, + "epoch": 0.5153765218698332, + "flos": 66384503015040.0, + "grad_norm": 0.7772232595428722, + "language_loss": 0.52885115, + "learning_rate": 1.9974684997463984e-06, + "loss": 0.54877508, + "num_input_tokens_seen": 184288520, + "step": 8572, + "time_per_iteration": 3.285902738571167 + }, + { + "auxiliary_loss_clip": 0.01056494, + "auxiliary_loss_mlp": 0.0103562, + "balance_loss_clip": 1.02644825, + "balance_loss_mlp": 1.02402139, + "epoch": 0.5154366451225011, + "flos": 24024669891840.0, + "grad_norm": 1.614077224913516, + "language_loss": 0.76537645, + "learning_rate": 1.9970790384273687e-06, + "loss": 0.78629756, + "num_input_tokens_seen": 184308565, + "step": 8573, + "time_per_iteration": 2.585554838180542 + }, + { + "auxiliary_loss_clip": 0.01055557, + "auxiliary_loss_mlp": 0.01025371, + "balance_loss_clip": 1.02443171, + "balance_loss_mlp": 1.01374221, + "epoch": 0.5154967683751691, + "flos": 23468843859840.0, + "grad_norm": 1.843287873932819, + "language_loss": 0.76804334, + "learning_rate": 1.996689577219102e-06, + "loss": 0.78885263, + "num_input_tokens_seen": 184326795, + "step": 8574, + "time_per_iteration": 2.6002554893493652 + }, + { + "auxiliary_loss_clip": 0.01042009, + "auxiliary_loss_mlp": 0.0102797, + "balance_loss_clip": 1.02600884, + "balance_loss_mlp": 1.01705647, + "epoch": 0.515556891627837, + "flos": 23805650712960.0, + "grad_norm": 1.6984946973632604, + "language_loss": 0.85204679, + "learning_rate": 1.996300116136367e-06, + "loss": 0.87274659, + "num_input_tokens_seen": 184345990, + "step": 8575, + "time_per_iteration": 2.6565515995025635 + }, + { + "auxiliary_loss_clip": 0.01059163, + "auxiliary_loss_mlp": 0.01034642, + "balance_loss_clip": 1.02553439, + "balance_loss_mlp": 1.02318645, + "epoch": 0.515617014880505, + "flos": 19828544106240.0, + "grad_norm": 1.523237182320569, + "language_loss": 0.76959878, + "learning_rate": 1.995910655193932e-06, + "loss": 0.79053676, + "num_input_tokens_seen": 184366300, + "step": 8576, + "time_per_iteration": 2.626765012741089 + }, + { + "auxiliary_loss_clip": 0.01025349, + "auxiliary_loss_mlp": 0.00747441, + "balance_loss_clip": 1.02462125, + "balance_loss_mlp": 0.99999022, + "epoch": 0.515677138133173, + "flos": 14245907385600.0, + "grad_norm": 2.2818763696252926, + "language_loss": 0.76014781, + "learning_rate": 1.9955211944065654e-06, + "loss": 0.77787566, + "num_input_tokens_seen": 184383030, + "step": 8577, + "time_per_iteration": 2.6628944873809814 + }, + { + "auxiliary_loss_clip": 0.01042244, + "auxiliary_loss_mlp": 0.0103834, + "balance_loss_clip": 1.02412629, + "balance_loss_mlp": 1.02447605, + "epoch": 0.515737261385841, + "flos": 28289707920000.0, + "grad_norm": 2.042442205587685, + "language_loss": 0.81330705, + "learning_rate": 1.9951317337890353e-06, + "loss": 0.83411288, + "num_input_tokens_seen": 184403410, + "step": 8578, + "time_per_iteration": 2.7074198722839355 + }, + { + "auxiliary_loss_clip": 0.0106506, + "auxiliary_loss_mlp": 0.01034233, + "balance_loss_clip": 1.02438617, + "balance_loss_mlp": 1.02292609, + "epoch": 0.515797384638509, + "flos": 27891925729920.0, + "grad_norm": 1.754327902715632, + "language_loss": 0.75834495, + "learning_rate": 1.9947422733561105e-06, + "loss": 0.77933794, + "num_input_tokens_seen": 184423830, + "step": 8579, + "time_per_iteration": 2.627025604248047 + }, + { + "auxiliary_loss_clip": 0.01039644, + "auxiliary_loss_mlp": 0.01030362, + "balance_loss_clip": 1.02668905, + "balance_loss_mlp": 1.01869106, + "epoch": 0.5158575078911769, + "flos": 23040071210880.0, + "grad_norm": 1.6016943260979701, + "language_loss": 0.78851736, + "learning_rate": 1.994352813122559e-06, + "loss": 0.80921745, + "num_input_tokens_seen": 184445050, + "step": 8580, + "time_per_iteration": 2.7687127590179443 + }, + { + "auxiliary_loss_clip": 0.01033838, + "auxiliary_loss_mlp": 0.01045667, + "balance_loss_clip": 1.02504253, + "balance_loss_mlp": 1.03143358, + "epoch": 0.5159176311438449, + "flos": 12641346938880.0, + "grad_norm": 1.980009025933341, + "language_loss": 0.72427183, + "learning_rate": 1.99396335310315e-06, + "loss": 0.74506694, + "num_input_tokens_seen": 184460775, + "step": 8581, + "time_per_iteration": 2.747030735015869 + }, + { + "auxiliary_loss_clip": 0.01057961, + "auxiliary_loss_mlp": 0.01029633, + "balance_loss_clip": 1.0268209, + "balance_loss_mlp": 1.0188086, + "epoch": 0.5159777543965128, + "flos": 15558154951680.0, + "grad_norm": 2.091550879880235, + "language_loss": 0.74294251, + "learning_rate": 1.9935738933126508e-06, + "loss": 0.7638185, + "num_input_tokens_seen": 184477365, + "step": 8582, + "time_per_iteration": 4.123198509216309 + }, + { + "auxiliary_loss_clip": 0.01038419, + "auxiliary_loss_mlp": 0.01032531, + "balance_loss_clip": 1.02752709, + "balance_loss_mlp": 1.02124763, + "epoch": 0.5160378776491809, + "flos": 23221671396480.0, + "grad_norm": 2.6436920179760763, + "language_loss": 0.65651447, + "learning_rate": 1.99318443376583e-06, + "loss": 0.67722392, + "num_input_tokens_seen": 184497045, + "step": 8583, + "time_per_iteration": 2.7731258869171143 + }, + { + "auxiliary_loss_clip": 0.0105392, + "auxiliary_loss_mlp": 0.01030867, + "balance_loss_clip": 1.0235858, + "balance_loss_mlp": 1.01894665, + "epoch": 0.5160980009018488, + "flos": 21944616180480.0, + "grad_norm": 1.50046780982664, + "language_loss": 0.76078522, + "learning_rate": 1.9927949744774568e-06, + "loss": 0.78163314, + "num_input_tokens_seen": 184517675, + "step": 8584, + "time_per_iteration": 2.637962818145752 + }, + { + "auxiliary_loss_clip": 0.01039725, + "auxiliary_loss_mlp": 0.01040869, + "balance_loss_clip": 1.02535725, + "balance_loss_mlp": 1.02894163, + "epoch": 0.5161581241545168, + "flos": 22784064001920.0, + "grad_norm": 1.888203274849327, + "language_loss": 0.78912592, + "learning_rate": 1.9924055154622983e-06, + "loss": 0.80993176, + "num_input_tokens_seen": 184537745, + "step": 8585, + "time_per_iteration": 4.328392505645752 + }, + { + "auxiliary_loss_clip": 0.01047963, + "auxiliary_loss_mlp": 0.01029018, + "balance_loss_clip": 1.02418137, + "balance_loss_mlp": 1.01835489, + "epoch": 0.5162182474071847, + "flos": 19675384513920.0, + "grad_norm": 2.172336642878776, + "language_loss": 0.80760133, + "learning_rate": 1.9920160567351238e-06, + "loss": 0.82837117, + "num_input_tokens_seen": 184553630, + "step": 8586, + "time_per_iteration": 2.6760501861572266 + }, + { + "auxiliary_loss_clip": 0.01049866, + "auxiliary_loss_mlp": 0.01030381, + "balance_loss_clip": 1.02660871, + "balance_loss_mlp": 1.01901996, + "epoch": 0.5162783706598527, + "flos": 20046198568320.0, + "grad_norm": 1.9872787249450008, + "language_loss": 0.71562719, + "learning_rate": 1.991626598310701e-06, + "loss": 0.73642969, + "num_input_tokens_seen": 184573530, + "step": 8587, + "time_per_iteration": 2.8537440299987793 + }, + { + "auxiliary_loss_clip": 0.01000279, + "auxiliary_loss_mlp": 0.01001524, + "balance_loss_clip": 1.00244284, + "balance_loss_mlp": 1.00022435, + "epoch": 0.5163384939125206, + "flos": 69959553713280.0, + "grad_norm": 0.7356760826918036, + "language_loss": 0.57850909, + "learning_rate": 1.9912371402037984e-06, + "loss": 0.59852707, + "num_input_tokens_seen": 184637875, + "step": 8588, + "time_per_iteration": 3.264117956161499 + }, + { + "auxiliary_loss_clip": 0.01043491, + "auxiliary_loss_mlp": 0.01037552, + "balance_loss_clip": 1.02565217, + "balance_loss_mlp": 1.02485657, + "epoch": 0.5163986171651886, + "flos": 17417034668160.0, + "grad_norm": 1.972444351820903, + "language_loss": 0.7577486, + "learning_rate": 1.990847682429185e-06, + "loss": 0.77855909, + "num_input_tokens_seen": 184656125, + "step": 8589, + "time_per_iteration": 2.5663678646087646 + }, + { + "auxiliary_loss_clip": 0.01057775, + "auxiliary_loss_mlp": 0.01031239, + "balance_loss_clip": 1.02496958, + "balance_loss_mlp": 1.01997983, + "epoch": 0.5164587404178566, + "flos": 21322679166720.0, + "grad_norm": 2.199720550780994, + "language_loss": 0.68068171, + "learning_rate": 1.990458225001627e-06, + "loss": 0.70157182, + "num_input_tokens_seen": 184675920, + "step": 8590, + "time_per_iteration": 2.634711742401123 + }, + { + "auxiliary_loss_clip": 0.01000485, + "auxiliary_loss_mlp": 0.01000781, + "balance_loss_clip": 1.00316262, + "balance_loss_mlp": 0.99949992, + "epoch": 0.5165188636705246, + "flos": 68057149691520.0, + "grad_norm": 0.7864548322373102, + "language_loss": 0.55883461, + "learning_rate": 1.990068767935895e-06, + "loss": 0.57884729, + "num_input_tokens_seen": 184730520, + "step": 8591, + "time_per_iteration": 3.1057302951812744 + }, + { + "auxiliary_loss_clip": 0.0104473, + "auxiliary_loss_mlp": 0.01025932, + "balance_loss_clip": 1.02570033, + "balance_loss_mlp": 1.01536381, + "epoch": 0.5165789869231926, + "flos": 19385657412480.0, + "grad_norm": 1.8985142719477894, + "language_loss": 0.8150571, + "learning_rate": 1.9896793112467566e-06, + "loss": 0.83576369, + "num_input_tokens_seen": 184748340, + "step": 8592, + "time_per_iteration": 2.687260627746582 + }, + { + "auxiliary_loss_clip": 0.01052033, + "auxiliary_loss_mlp": 0.01028401, + "balance_loss_clip": 1.02517271, + "balance_loss_mlp": 1.01683784, + "epoch": 0.5166391101758605, + "flos": 20960197067520.0, + "grad_norm": 1.7249560095816683, + "language_loss": 0.83255023, + "learning_rate": 1.989289854948979e-06, + "loss": 0.85335457, + "num_input_tokens_seen": 184766615, + "step": 8593, + "time_per_iteration": 4.24436616897583 + }, + { + "auxiliary_loss_clip": 0.01043419, + "auxiliary_loss_mlp": 0.01036501, + "balance_loss_clip": 1.02600074, + "balance_loss_mlp": 1.02400815, + "epoch": 0.5166992334285285, + "flos": 29462407148160.0, + "grad_norm": 2.5983269585513664, + "language_loss": 0.69601035, + "learning_rate": 1.9889003990573314e-06, + "loss": 0.71680951, + "num_input_tokens_seen": 184788075, + "step": 8594, + "time_per_iteration": 2.6772119998931885 + }, + { + "auxiliary_loss_clip": 0.01027088, + "auxiliary_loss_mlp": 0.01027593, + "balance_loss_clip": 1.02357638, + "balance_loss_mlp": 1.01586294, + "epoch": 0.5167593566811964, + "flos": 20304360593280.0, + "grad_norm": 1.4285191675219728, + "language_loss": 0.77310371, + "learning_rate": 1.988510943586582e-06, + "loss": 0.79365051, + "num_input_tokens_seen": 184808710, + "step": 8595, + "time_per_iteration": 2.6788785457611084 + }, + { + "auxiliary_loss_clip": 0.01069818, + "auxiliary_loss_mlp": 0.01034622, + "balance_loss_clip": 1.02860165, + "balance_loss_mlp": 1.02309453, + "epoch": 0.5168194799338645, + "flos": 14611370313600.0, + "grad_norm": 1.4184790078889173, + "language_loss": 0.64970648, + "learning_rate": 1.9881214885514986e-06, + "loss": 0.67075086, + "num_input_tokens_seen": 184826475, + "step": 8596, + "time_per_iteration": 4.160227537155151 + }, + { + "auxiliary_loss_clip": 0.01030066, + "auxiliary_loss_mlp": 0.01035451, + "balance_loss_clip": 1.02570784, + "balance_loss_mlp": 1.02223659, + "epoch": 0.5168796031865324, + "flos": 25007257411200.0, + "grad_norm": 1.5130649814655943, + "language_loss": 0.75507486, + "learning_rate": 1.9877320339668492e-06, + "loss": 0.77573001, + "num_input_tokens_seen": 184845245, + "step": 8597, + "time_per_iteration": 2.7172610759735107 + }, + { + "auxiliary_loss_clip": 0.01067548, + "auxiliary_loss_mlp": 0.01022987, + "balance_loss_clip": 1.02546525, + "balance_loss_mlp": 1.01216817, + "epoch": 0.5169397264392004, + "flos": 26939969533440.0, + "grad_norm": 1.5707300008356024, + "language_loss": 0.80960637, + "learning_rate": 1.987342579847403e-06, + "loss": 0.83051169, + "num_input_tokens_seen": 184866605, + "step": 8598, + "time_per_iteration": 2.6684932708740234 + }, + { + "auxiliary_loss_clip": 0.01014709, + "auxiliary_loss_mlp": 0.0103716, + "balance_loss_clip": 1.02048707, + "balance_loss_mlp": 1.02522755, + "epoch": 0.5169998496918683, + "flos": 25407804948480.0, + "grad_norm": 1.4993907258509451, + "language_loss": 0.7518152, + "learning_rate": 1.9869531262079273e-06, + "loss": 0.77233392, + "num_input_tokens_seen": 184886945, + "step": 8599, + "time_per_iteration": 2.84360933303833 + }, + { + "auxiliary_loss_clip": 0.01048862, + "auxiliary_loss_mlp": 0.01032373, + "balance_loss_clip": 1.02653432, + "balance_loss_mlp": 1.02137589, + "epoch": 0.5170599729445363, + "flos": 24680793674880.0, + "grad_norm": 2.7714483491716435, + "language_loss": 0.72150207, + "learning_rate": 1.9865636730631904e-06, + "loss": 0.74231434, + "num_input_tokens_seen": 184905590, + "step": 8600, + "time_per_iteration": 2.732264518737793 + }, + { + "auxiliary_loss_clip": 0.01030901, + "auxiliary_loss_mlp": 0.0103011, + "balance_loss_clip": 1.02367854, + "balance_loss_mlp": 1.01809943, + "epoch": 0.5171200961972042, + "flos": 20994455664000.0, + "grad_norm": 1.4811226202740024, + "language_loss": 0.74606597, + "learning_rate": 1.9861742204279602e-06, + "loss": 0.76667613, + "num_input_tokens_seen": 184925555, + "step": 8601, + "time_per_iteration": 2.694113254547119 + }, + { + "auxiliary_loss_clip": 0.01057266, + "auxiliary_loss_mlp": 0.01035993, + "balance_loss_clip": 1.02538967, + "balance_loss_mlp": 1.02423298, + "epoch": 0.5171802194498722, + "flos": 22745639427840.0, + "grad_norm": 1.960518947949651, + "language_loss": 0.83971506, + "learning_rate": 1.9857847683170045e-06, + "loss": 0.86064768, + "num_input_tokens_seen": 184944490, + "step": 8602, + "time_per_iteration": 2.5756635665893555 + }, + { + "auxiliary_loss_clip": 0.0106886, + "auxiliary_loss_mlp": 0.01027917, + "balance_loss_clip": 1.02657104, + "balance_loss_mlp": 1.01598978, + "epoch": 0.5172403427025402, + "flos": 28176732668160.0, + "grad_norm": 1.7919187800367922, + "language_loss": 0.74455762, + "learning_rate": 1.9853953167450926e-06, + "loss": 0.76552534, + "num_input_tokens_seen": 184963190, + "step": 8603, + "time_per_iteration": 2.5969090461730957 + }, + { + "auxiliary_loss_clip": 0.01050723, + "auxiliary_loss_mlp": 0.01031525, + "balance_loss_clip": 1.02808154, + "balance_loss_mlp": 1.02018797, + "epoch": 0.5173004659552082, + "flos": 20337829090560.0, + "grad_norm": 2.548756816678901, + "language_loss": 0.72538364, + "learning_rate": 1.9850058657269915e-06, + "loss": 0.7462061, + "num_input_tokens_seen": 184981220, + "step": 8604, + "time_per_iteration": 2.657846450805664 + }, + { + "auxiliary_loss_clip": 0.01053216, + "auxiliary_loss_mlp": 0.01034518, + "balance_loss_clip": 1.02690518, + "balance_loss_mlp": 1.02198339, + "epoch": 0.5173605892078762, + "flos": 19063323740160.0, + "grad_norm": 1.7979776698603922, + "language_loss": 0.85272062, + "learning_rate": 1.984616415277469e-06, + "loss": 0.87359798, + "num_input_tokens_seen": 184998810, + "step": 8605, + "time_per_iteration": 2.6478984355926514 + }, + { + "auxiliary_loss_clip": 0.0105699, + "auxiliary_loss_mlp": 0.01027247, + "balance_loss_clip": 1.02561021, + "balance_loss_mlp": 1.01608872, + "epoch": 0.5174207124605441, + "flos": 27995168396160.0, + "grad_norm": 1.6545928596912456, + "language_loss": 0.64785945, + "learning_rate": 1.984226965411294e-06, + "loss": 0.66870189, + "num_input_tokens_seen": 185021185, + "step": 8606, + "time_per_iteration": 2.6410670280456543 + }, + { + "auxiliary_loss_clip": 0.01047743, + "auxiliary_loss_mlp": 0.01028506, + "balance_loss_clip": 1.02606869, + "balance_loss_mlp": 1.01688266, + "epoch": 0.5174808357132121, + "flos": 19496657416320.0, + "grad_norm": 2.164644863690599, + "language_loss": 0.7803036, + "learning_rate": 1.983837516143234e-06, + "loss": 0.80106604, + "num_input_tokens_seen": 185038465, + "step": 8607, + "time_per_iteration": 2.6288936138153076 + }, + { + "auxiliary_loss_clip": 0.01060163, + "auxiliary_loss_mlp": 0.01032124, + "balance_loss_clip": 1.02725351, + "balance_loss_mlp": 1.0197562, + "epoch": 0.51754095896588, + "flos": 22784171742720.0, + "grad_norm": 2.596463341964627, + "language_loss": 0.71943396, + "learning_rate": 1.983448067488057e-06, + "loss": 0.74035686, + "num_input_tokens_seen": 185057340, + "step": 8608, + "time_per_iteration": 2.589345693588257 + }, + { + "auxiliary_loss_clip": 0.01064438, + "auxiliary_loss_mlp": 0.01031204, + "balance_loss_clip": 1.02722049, + "balance_loss_mlp": 1.01883006, + "epoch": 0.5176010822185481, + "flos": 22669257156480.0, + "grad_norm": 1.8945571768924545, + "language_loss": 0.86543745, + "learning_rate": 1.983058619460531e-06, + "loss": 0.88639379, + "num_input_tokens_seen": 185074935, + "step": 8609, + "time_per_iteration": 2.579798460006714 + }, + { + "auxiliary_loss_clip": 0.01052458, + "auxiliary_loss_mlp": 0.01029407, + "balance_loss_clip": 1.02294946, + "balance_loss_mlp": 1.01866674, + "epoch": 0.517661205471216, + "flos": 23951196622080.0, + "grad_norm": 2.2256429895026515, + "language_loss": 0.73557961, + "learning_rate": 1.9826691720754237e-06, + "loss": 0.75639832, + "num_input_tokens_seen": 185095050, + "step": 8610, + "time_per_iteration": 2.5942413806915283 + }, + { + "auxiliary_loss_clip": 0.01071401, + "auxiliary_loss_mlp": 0.01032584, + "balance_loss_clip": 1.02620792, + "balance_loss_mlp": 1.01935732, + "epoch": 0.517721328723884, + "flos": 15596076735360.0, + "grad_norm": 2.174751525149693, + "language_loss": 0.67272329, + "learning_rate": 1.9822797253475034e-06, + "loss": 0.69376314, + "num_input_tokens_seen": 185112275, + "step": 8611, + "time_per_iteration": 2.5956408977508545 + }, + { + "auxiliary_loss_clip": 0.01067472, + "auxiliary_loss_mlp": 0.01033134, + "balance_loss_clip": 1.02559328, + "balance_loss_mlp": 1.02137446, + "epoch": 0.5177814519765519, + "flos": 20960197067520.0, + "grad_norm": 1.9269598925045495, + "language_loss": 0.76752645, + "learning_rate": 1.9818902792915373e-06, + "loss": 0.7885325, + "num_input_tokens_seen": 185132165, + "step": 8612, + "time_per_iteration": 2.7332940101623535 + }, + { + "auxiliary_loss_clip": 0.01057712, + "auxiliary_loss_mlp": 0.01033093, + "balance_loss_clip": 1.02541232, + "balance_loss_mlp": 1.02181566, + "epoch": 0.5178415752292199, + "flos": 17967832796160.0, + "grad_norm": 3.544359177089442, + "language_loss": 0.82171571, + "learning_rate": 1.981500833922294e-06, + "loss": 0.84262371, + "num_input_tokens_seen": 185151025, + "step": 8613, + "time_per_iteration": 2.5857226848602295 + }, + { + "auxiliary_loss_clip": 0.01070659, + "auxiliary_loss_mlp": 0.01033772, + "balance_loss_clip": 1.0283246, + "balance_loss_mlp": 1.02136278, + "epoch": 0.5179016984818878, + "flos": 17821496787840.0, + "grad_norm": 2.2160415258545894, + "language_loss": 0.66036952, + "learning_rate": 1.981111389254541e-06, + "loss": 0.68141377, + "num_input_tokens_seen": 185168455, + "step": 8614, + "time_per_iteration": 2.5627756118774414 + }, + { + "auxiliary_loss_clip": 0.01043484, + "auxiliary_loss_mlp": 0.01031249, + "balance_loss_clip": 1.02499676, + "balance_loss_mlp": 1.01891136, + "epoch": 0.5179618217345558, + "flos": 17820455293440.0, + "grad_norm": 1.882596694139549, + "language_loss": 0.8617776, + "learning_rate": 1.9807219453030453e-06, + "loss": 0.88252497, + "num_input_tokens_seen": 185184415, + "step": 8615, + "time_per_iteration": 2.6400015354156494 + }, + { + "auxiliary_loss_clip": 0.0105724, + "auxiliary_loss_mlp": 0.0103423, + "balance_loss_clip": 1.02613747, + "balance_loss_mlp": 1.02314925, + "epoch": 0.5180219449872238, + "flos": 22522131048960.0, + "grad_norm": 1.6468871286846334, + "language_loss": 0.8058458, + "learning_rate": 1.9803325020825763e-06, + "loss": 0.82676047, + "num_input_tokens_seen": 185202910, + "step": 8616, + "time_per_iteration": 2.6061224937438965 + }, + { + "auxiliary_loss_clip": 0.01065727, + "auxiliary_loss_mlp": 0.0074742, + "balance_loss_clip": 1.03113127, + "balance_loss_mlp": 1.00009108, + "epoch": 0.5180820682398918, + "flos": 23915465568000.0, + "grad_norm": 1.7602329368336336, + "language_loss": 0.75607234, + "learning_rate": 1.9799430596079e-06, + "loss": 0.77420384, + "num_input_tokens_seen": 185223085, + "step": 8617, + "time_per_iteration": 2.629103422164917 + }, + { + "auxiliary_loss_clip": 0.01068115, + "auxiliary_loss_mlp": 0.01032696, + "balance_loss_clip": 1.02535892, + "balance_loss_mlp": 1.02001214, + "epoch": 0.5181421914925598, + "flos": 16979930064000.0, + "grad_norm": 1.6182125856281147, + "language_loss": 0.70335674, + "learning_rate": 1.979553617893785e-06, + "loss": 0.72436476, + "num_input_tokens_seen": 185241295, + "step": 8618, + "time_per_iteration": 2.7810285091400146 + }, + { + "auxiliary_loss_clip": 0.00999977, + "auxiliary_loss_mlp": 0.0100383, + "balance_loss_clip": 1.00253153, + "balance_loss_mlp": 1.00265551, + "epoch": 0.5182023147452277, + "flos": 66059870872320.0, + "grad_norm": 0.9417561703987684, + "language_loss": 0.67242801, + "learning_rate": 1.979164176954999e-06, + "loss": 0.69246602, + "num_input_tokens_seen": 185298295, + "step": 8619, + "time_per_iteration": 3.323010206222534 + }, + { + "auxiliary_loss_clip": 0.01024732, + "auxiliary_loss_mlp": 0.01028707, + "balance_loss_clip": 1.02520895, + "balance_loss_mlp": 1.01721525, + "epoch": 0.5182624379978957, + "flos": 18187749815040.0, + "grad_norm": 1.910860809944583, + "language_loss": 0.79309332, + "learning_rate": 1.97877473680631e-06, + "loss": 0.81362772, + "num_input_tokens_seen": 185317000, + "step": 8620, + "time_per_iteration": 2.701960802078247 + }, + { + "auxiliary_loss_clip": 0.01007301, + "auxiliary_loss_mlp": 0.00747379, + "balance_loss_clip": 1.02448285, + "balance_loss_mlp": 1.00006104, + "epoch": 0.5183225612505636, + "flos": 14026708638720.0, + "grad_norm": 2.050541842407905, + "language_loss": 0.81581652, + "learning_rate": 1.9783852974624846e-06, + "loss": 0.83336329, + "num_input_tokens_seen": 185331185, + "step": 8621, + "time_per_iteration": 2.76277232170105 + }, + { + "auxiliary_loss_clip": 0.01036879, + "auxiliary_loss_mlp": 0.0103307, + "balance_loss_clip": 1.02205491, + "balance_loss_mlp": 1.02191162, + "epoch": 0.5183826845032317, + "flos": 23659781581440.0, + "grad_norm": 1.931655676067744, + "language_loss": 0.65960211, + "learning_rate": 1.9779958589382905e-06, + "loss": 0.68030155, + "num_input_tokens_seen": 185348955, + "step": 8622, + "time_per_iteration": 2.795243263244629 + }, + { + "auxiliary_loss_clip": 0.01047859, + "auxiliary_loss_mlp": 0.01034148, + "balance_loss_clip": 1.02440596, + "balance_loss_mlp": 1.02185738, + "epoch": 0.5184428077558996, + "flos": 15888605097600.0, + "grad_norm": 2.377024086725483, + "language_loss": 0.60487491, + "learning_rate": 1.977606421248497e-06, + "loss": 0.62569499, + "num_input_tokens_seen": 185367330, + "step": 8623, + "time_per_iteration": 2.676347017288208 + }, + { + "auxiliary_loss_clip": 0.0106811, + "auxiliary_loss_mlp": 0.01029873, + "balance_loss_clip": 1.02581906, + "balance_loss_mlp": 1.01921594, + "epoch": 0.5185029310085676, + "flos": 21030833162880.0, + "grad_norm": 1.7581879219704628, + "language_loss": 0.76159525, + "learning_rate": 1.9772169844078685e-06, + "loss": 0.78257513, + "num_input_tokens_seen": 185385060, + "step": 8624, + "time_per_iteration": 2.557875633239746 + }, + { + "auxiliary_loss_clip": 0.0102557, + "auxiliary_loss_mlp": 0.01036496, + "balance_loss_clip": 1.02262843, + "balance_loss_mlp": 1.02390742, + "epoch": 0.5185630542612355, + "flos": 26542690133760.0, + "grad_norm": 1.8186544035286074, + "language_loss": 0.71269906, + "learning_rate": 1.9768275484311756e-06, + "loss": 0.73331964, + "num_input_tokens_seen": 185403745, + "step": 8625, + "time_per_iteration": 2.8400723934173584 + }, + { + "auxiliary_loss_clip": 0.01045897, + "auxiliary_loss_mlp": 0.01028607, + "balance_loss_clip": 1.02491283, + "balance_loss_mlp": 1.01763952, + "epoch": 0.5186231775139035, + "flos": 20668422890880.0, + "grad_norm": 1.9277711293749535, + "language_loss": 0.67645097, + "learning_rate": 1.976438113333184e-06, + "loss": 0.69719601, + "num_input_tokens_seen": 185422620, + "step": 8626, + "time_per_iteration": 2.778122901916504 + }, + { + "auxiliary_loss_clip": 0.01054302, + "auxiliary_loss_mlp": 0.01031546, + "balance_loss_clip": 1.02440429, + "balance_loss_mlp": 1.02023864, + "epoch": 0.5186833007665714, + "flos": 20885502735360.0, + "grad_norm": 1.9704501635889984, + "language_loss": 0.70586067, + "learning_rate": 1.9760486791286612e-06, + "loss": 0.72671914, + "num_input_tokens_seen": 185439380, + "step": 8627, + "time_per_iteration": 2.766429901123047 + }, + { + "auxiliary_loss_clip": 0.01071785, + "auxiliary_loss_mlp": 0.00747654, + "balance_loss_clip": 1.02750087, + "balance_loss_mlp": 1.00015235, + "epoch": 0.5187434240192395, + "flos": 20886903365760.0, + "grad_norm": 1.7807963998905072, + "language_loss": 0.73430693, + "learning_rate": 1.9756592458323753e-06, + "loss": 0.75250125, + "num_input_tokens_seen": 185458830, + "step": 8628, + "time_per_iteration": 2.709233045578003 + }, + { + "auxiliary_loss_clip": 0.01047202, + "auxiliary_loss_mlp": 0.01029848, + "balance_loss_clip": 1.02630496, + "balance_loss_mlp": 1.01898849, + "epoch": 0.5188035472719074, + "flos": 19859929614720.0, + "grad_norm": 4.434743644424817, + "language_loss": 0.77225381, + "learning_rate": 1.9752698134590927e-06, + "loss": 0.7930243, + "num_input_tokens_seen": 185477270, + "step": 8629, + "time_per_iteration": 4.25313138961792 + }, + { + "auxiliary_loss_clip": 0.01060322, + "auxiliary_loss_mlp": 0.01029981, + "balance_loss_clip": 1.02755356, + "balance_loss_mlp": 1.01754189, + "epoch": 0.5188636705245754, + "flos": 21138313633920.0, + "grad_norm": 1.8820540587295227, + "language_loss": 0.74969101, + "learning_rate": 1.9748803820235815e-06, + "loss": 0.770594, + "num_input_tokens_seen": 185495795, + "step": 8630, + "time_per_iteration": 2.6274256706237793 + }, + { + "auxiliary_loss_clip": 0.01058693, + "auxiliary_loss_mlp": 0.0103287, + "balance_loss_clip": 1.02602506, + "balance_loss_mlp": 1.02042484, + "epoch": 0.5189237937772434, + "flos": 22419786222720.0, + "grad_norm": 1.8209502671445248, + "language_loss": 0.80343056, + "learning_rate": 1.9744909515406093e-06, + "loss": 0.82434618, + "num_input_tokens_seen": 185514885, + "step": 8631, + "time_per_iteration": 2.591609001159668 + }, + { + "auxiliary_loss_clip": 0.01061525, + "auxiliary_loss_mlp": 0.01029298, + "balance_loss_clip": 1.02752519, + "balance_loss_mlp": 1.01713908, + "epoch": 0.5189839170299113, + "flos": 25446696399360.0, + "grad_norm": 1.6142494663436608, + "language_loss": 0.7456845, + "learning_rate": 1.974101522024942e-06, + "loss": 0.76659274, + "num_input_tokens_seen": 185537155, + "step": 8632, + "time_per_iteration": 4.153696537017822 + }, + { + "auxiliary_loss_clip": 0.01038942, + "auxiliary_loss_mlp": 0.01026366, + "balance_loss_clip": 1.02736878, + "balance_loss_mlp": 1.01504719, + "epoch": 0.5190440402825793, + "flos": 18587722734720.0, + "grad_norm": 1.792386218013556, + "language_loss": 0.78355837, + "learning_rate": 1.9737120934913477e-06, + "loss": 0.8042115, + "num_input_tokens_seen": 185555520, + "step": 8633, + "time_per_iteration": 2.6842377185821533 + }, + { + "auxiliary_loss_clip": 0.01057226, + "auxiliary_loss_mlp": 0.01035004, + "balance_loss_clip": 1.02514434, + "balance_loss_mlp": 1.02338743, + "epoch": 0.5191041635352472, + "flos": 21908633731200.0, + "grad_norm": 1.8655677886405428, + "language_loss": 0.80253744, + "learning_rate": 1.9733226659545936e-06, + "loss": 0.82345974, + "num_input_tokens_seen": 185573855, + "step": 8634, + "time_per_iteration": 2.661569595336914 + }, + { + "auxiliary_loss_clip": 0.01068113, + "auxiliary_loss_mlp": 0.01033839, + "balance_loss_clip": 1.0280838, + "balance_loss_mlp": 1.02246082, + "epoch": 0.5191642867879153, + "flos": 27527971173120.0, + "grad_norm": 7.34243347491829, + "language_loss": 0.68705142, + "learning_rate": 1.9729332394294467e-06, + "loss": 0.70807087, + "num_input_tokens_seen": 185595145, + "step": 8635, + "time_per_iteration": 2.633685350418091 + }, + { + "auxiliary_loss_clip": 0.01044538, + "auxiliary_loss_mlp": 0.01033457, + "balance_loss_clip": 1.02481461, + "balance_loss_mlp": 1.02182841, + "epoch": 0.5192244100405832, + "flos": 15705999331200.0, + "grad_norm": 1.9063677377068582, + "language_loss": 0.77568448, + "learning_rate": 1.9725438139306742e-06, + "loss": 0.79646444, + "num_input_tokens_seen": 185613320, + "step": 8636, + "time_per_iteration": 2.7304537296295166 + }, + { + "auxiliary_loss_clip": 0.01070462, + "auxiliary_loss_mlp": 0.01029492, + "balance_loss_clip": 1.02721322, + "balance_loss_mlp": 1.0176549, + "epoch": 0.5192845332932512, + "flos": 12057080313600.0, + "grad_norm": 2.026585728606575, + "language_loss": 0.71631145, + "learning_rate": 1.9721543894730425e-06, + "loss": 0.73731095, + "num_input_tokens_seen": 185630730, + "step": 8637, + "time_per_iteration": 2.5314414501190186 + }, + { + "auxiliary_loss_clip": 0.01035979, + "auxiliary_loss_mlp": 0.01031277, + "balance_loss_clip": 1.02580822, + "balance_loss_mlp": 1.01961851, + "epoch": 0.5193446565459191, + "flos": 18953185662720.0, + "grad_norm": 3.2492659724709485, + "language_loss": 0.7612797, + "learning_rate": 1.9717649660713194e-06, + "loss": 0.78195232, + "num_input_tokens_seen": 185648515, + "step": 8638, + "time_per_iteration": 2.7077345848083496 + }, + { + "auxiliary_loss_clip": 0.010367, + "auxiliary_loss_mlp": 0.01027018, + "balance_loss_clip": 1.02500725, + "balance_loss_mlp": 1.01544321, + "epoch": 0.5194047797985871, + "flos": 20374960775040.0, + "grad_norm": 2.0763770311226777, + "language_loss": 0.74856377, + "learning_rate": 1.971375543740272e-06, + "loss": 0.76920092, + "num_input_tokens_seen": 185665220, + "step": 8639, + "time_per_iteration": 2.7224104404449463 + }, + { + "auxiliary_loss_clip": 0.01067492, + "auxiliary_loss_mlp": 0.01025509, + "balance_loss_clip": 1.02663159, + "balance_loss_mlp": 1.01359427, + "epoch": 0.519464903051255, + "flos": 24353001135360.0, + "grad_norm": 1.6378153933704467, + "language_loss": 0.7751298, + "learning_rate": 1.9709861224946665e-06, + "loss": 0.79605973, + "num_input_tokens_seen": 185683750, + "step": 8640, + "time_per_iteration": 2.5735068321228027 + }, + { + "auxiliary_loss_clip": 0.01032733, + "auxiliary_loss_mlp": 0.01031673, + "balance_loss_clip": 1.02544498, + "balance_loss_mlp": 1.01993108, + "epoch": 0.519525026303923, + "flos": 14061829161600.0, + "grad_norm": 1.6108476424506246, + "language_loss": 0.66123998, + "learning_rate": 1.97059670234927e-06, + "loss": 0.68188405, + "num_input_tokens_seen": 185700625, + "step": 8641, + "time_per_iteration": 4.34346079826355 + }, + { + "auxiliary_loss_clip": 0.01067093, + "auxiliary_loss_mlp": 0.01028557, + "balance_loss_clip": 1.02637577, + "balance_loss_mlp": 1.01759624, + "epoch": 0.519585149556591, + "flos": 28835873193600.0, + "grad_norm": 1.7078479204120571, + "language_loss": 0.76221919, + "learning_rate": 1.97020728331885e-06, + "loss": 0.78317571, + "num_input_tokens_seen": 185721155, + "step": 8642, + "time_per_iteration": 2.8045127391815186 + }, + { + "auxiliary_loss_clip": 0.01065933, + "auxiliary_loss_mlp": 0.01026312, + "balance_loss_clip": 1.02519357, + "balance_loss_mlp": 1.01497555, + "epoch": 0.519645272809259, + "flos": 25373007648000.0, + "grad_norm": 1.5351798861398567, + "language_loss": 0.83055776, + "learning_rate": 1.9698178654181726e-06, + "loss": 0.85148025, + "num_input_tokens_seen": 185740990, + "step": 8643, + "time_per_iteration": 4.338962554931641 + }, + { + "auxiliary_loss_clip": 0.01068549, + "auxiliary_loss_mlp": 0.01036185, + "balance_loss_clip": 1.02565551, + "balance_loss_mlp": 1.02432942, + "epoch": 0.519705396061927, + "flos": 25372863993600.0, + "grad_norm": 1.6818057301219662, + "language_loss": 0.70229554, + "learning_rate": 1.969428448662004e-06, + "loss": 0.7233429, + "num_input_tokens_seen": 185762235, + "step": 8644, + "time_per_iteration": 2.5774834156036377 + }, + { + "auxiliary_loss_clip": 0.01057767, + "auxiliary_loss_mlp": 0.00747549, + "balance_loss_clip": 1.02576625, + "balance_loss_mlp": 1.00020075, + "epoch": 0.5197655193145949, + "flos": 28476228268800.0, + "grad_norm": 1.811077727785819, + "language_loss": 0.806485, + "learning_rate": 1.9690390330651133e-06, + "loss": 0.82453817, + "num_input_tokens_seen": 185783415, + "step": 8645, + "time_per_iteration": 2.631837844848633 + }, + { + "auxiliary_loss_clip": 0.0106753, + "auxiliary_loss_mlp": 0.01029951, + "balance_loss_clip": 1.02512562, + "balance_loss_mlp": 1.01807225, + "epoch": 0.5198256425672629, + "flos": 20009138711040.0, + "grad_norm": 2.253707471284402, + "language_loss": 0.78113854, + "learning_rate": 1.968649618642264e-06, + "loss": 0.80211335, + "num_input_tokens_seen": 185801345, + "step": 8646, + "time_per_iteration": 2.59546160697937 + }, + { + "auxiliary_loss_clip": 0.01057821, + "auxiliary_loss_mlp": 0.01031352, + "balance_loss_clip": 1.02594411, + "balance_loss_mlp": 1.0202775, + "epoch": 0.5198857658199308, + "flos": 19828867328640.0, + "grad_norm": 1.7953176217092017, + "language_loss": 0.66075432, + "learning_rate": 1.9682602054082252e-06, + "loss": 0.68164599, + "num_input_tokens_seen": 185820815, + "step": 8647, + "time_per_iteration": 2.5505683422088623 + }, + { + "auxiliary_loss_clip": 0.01071853, + "auxiliary_loss_mlp": 0.01032724, + "balance_loss_clip": 1.02726722, + "balance_loss_mlp": 1.01950359, + "epoch": 0.5199458890725989, + "flos": 24461918150400.0, + "grad_norm": 2.0287787861057476, + "language_loss": 0.70997453, + "learning_rate": 1.967870793377763e-06, + "loss": 0.73102033, + "num_input_tokens_seen": 185841450, + "step": 8648, + "time_per_iteration": 2.595383405685425 + }, + { + "auxiliary_loss_clip": 0.01052425, + "auxiliary_loss_mlp": 0.01031425, + "balance_loss_clip": 1.02829719, + "balance_loss_mlp": 1.01885414, + "epoch": 0.5200060123252668, + "flos": 23404779953280.0, + "grad_norm": 3.5390942638376517, + "language_loss": 0.64529842, + "learning_rate": 1.967481382565642e-06, + "loss": 0.66613692, + "num_input_tokens_seen": 185859935, + "step": 8649, + "time_per_iteration": 2.597980260848999 + }, + { + "auxiliary_loss_clip": 0.01051758, + "auxiliary_loss_mlp": 0.01039829, + "balance_loss_clip": 1.02711439, + "balance_loss_mlp": 1.02627468, + "epoch": 0.5200661355779348, + "flos": 17201355454080.0, + "grad_norm": 2.1970283960317647, + "language_loss": 0.70396125, + "learning_rate": 1.9670919729866315e-06, + "loss": 0.72487712, + "num_input_tokens_seen": 185876795, + "step": 8650, + "time_per_iteration": 2.5591063499450684 + }, + { + "auxiliary_loss_clip": 0.01067435, + "auxiliary_loss_mlp": 0.01026838, + "balance_loss_clip": 1.02598107, + "balance_loss_mlp": 1.01539993, + "epoch": 0.5201262588306027, + "flos": 18515075477760.0, + "grad_norm": 1.7547165644370761, + "language_loss": 0.77594143, + "learning_rate": 1.966702564655496e-06, + "loss": 0.79688418, + "num_input_tokens_seen": 185895570, + "step": 8651, + "time_per_iteration": 2.5144894123077393 + }, + { + "auxiliary_loss_clip": 0.01018747, + "auxiliary_loss_mlp": 0.01035749, + "balance_loss_clip": 1.0263021, + "balance_loss_mlp": 1.02316654, + "epoch": 0.5201863820832707, + "flos": 18619395552000.0, + "grad_norm": 1.7479855763504992, + "language_loss": 0.78541851, + "learning_rate": 1.966313157587003e-06, + "loss": 0.80596352, + "num_input_tokens_seen": 185913700, + "step": 8652, + "time_per_iteration": 2.7965285778045654 + }, + { + "auxiliary_loss_clip": 0.01037183, + "auxiliary_loss_mlp": 0.01032151, + "balance_loss_clip": 1.02625, + "balance_loss_mlp": 1.01894236, + "epoch": 0.5202465053359386, + "flos": 22857142222080.0, + "grad_norm": 1.9709195960375159, + "language_loss": 0.70186114, + "learning_rate": 1.9659237517959187e-06, + "loss": 0.7225545, + "num_input_tokens_seen": 185932460, + "step": 8653, + "time_per_iteration": 2.7202517986297607 + }, + { + "auxiliary_loss_clip": 0.01041603, + "auxiliary_loss_mlp": 0.01042374, + "balance_loss_clip": 1.02789199, + "balance_loss_mlp": 1.02960062, + "epoch": 0.5203066285886067, + "flos": 21981532383360.0, + "grad_norm": 1.5421839633256116, + "language_loss": 0.78894734, + "learning_rate": 1.965534347297008e-06, + "loss": 0.80978715, + "num_input_tokens_seen": 185952030, + "step": 8654, + "time_per_iteration": 2.683936834335327 + }, + { + "auxiliary_loss_clip": 0.01055434, + "auxiliary_loss_mlp": 0.01035535, + "balance_loss_clip": 1.0251801, + "balance_loss_mlp": 1.02246952, + "epoch": 0.5203667518412746, + "flos": 20233329448320.0, + "grad_norm": 1.9419847075179693, + "language_loss": 0.84070063, + "learning_rate": 1.9651449441050393e-06, + "loss": 0.86161029, + "num_input_tokens_seen": 185973130, + "step": 8655, + "time_per_iteration": 2.769362688064575 + }, + { + "auxiliary_loss_clip": 0.01059799, + "auxiliary_loss_mlp": 0.01030358, + "balance_loss_clip": 1.0284164, + "balance_loss_mlp": 1.01931286, + "epoch": 0.5204268750939426, + "flos": 15705460627200.0, + "grad_norm": 8.485986209002089, + "language_loss": 0.65743876, + "learning_rate": 1.9647555422347777e-06, + "loss": 0.67834032, + "num_input_tokens_seen": 185990200, + "step": 8656, + "time_per_iteration": 2.6568429470062256 + }, + { + "auxiliary_loss_clip": 0.0103469, + "auxiliary_loss_mlp": 0.01035683, + "balance_loss_clip": 1.02844286, + "balance_loss_mlp": 1.02335632, + "epoch": 0.5204869983466105, + "flos": 27449469999360.0, + "grad_norm": 1.7714280230505564, + "language_loss": 0.73130131, + "learning_rate": 1.9643661417009893e-06, + "loss": 0.7520051, + "num_input_tokens_seen": 186009880, + "step": 8657, + "time_per_iteration": 2.797729253768921 + }, + { + "auxiliary_loss_clip": 0.01037927, + "auxiliary_loss_mlp": 0.01038378, + "balance_loss_clip": 1.02586162, + "balance_loss_mlp": 1.02543807, + "epoch": 0.5205471215992785, + "flos": 20595452411520.0, + "grad_norm": 1.9734674965340422, + "language_loss": 0.7177887, + "learning_rate": 1.9639767425184408e-06, + "loss": 0.73855174, + "num_input_tokens_seen": 186026680, + "step": 8658, + "time_per_iteration": 2.7719454765319824 + }, + { + "auxiliary_loss_clip": 0.01068227, + "auxiliary_loss_mlp": 0.01032464, + "balance_loss_clip": 1.02579904, + "balance_loss_mlp": 1.02002454, + "epoch": 0.5206072448519465, + "flos": 22127904305280.0, + "grad_norm": 4.707483399902172, + "language_loss": 0.8331964, + "learning_rate": 1.963587344701897e-06, + "loss": 0.85420334, + "num_input_tokens_seen": 186046920, + "step": 8659, + "time_per_iteration": 2.568358898162842 + }, + { + "auxiliary_loss_clip": 0.01043453, + "auxiliary_loss_mlp": 0.01037936, + "balance_loss_clip": 1.02453232, + "balance_loss_mlp": 1.02395892, + "epoch": 0.5206673681046144, + "flos": 18330422636160.0, + "grad_norm": 3.264132061997196, + "language_loss": 0.75385135, + "learning_rate": 1.9631979482661253e-06, + "loss": 0.77466524, + "num_input_tokens_seen": 186062090, + "step": 8660, + "time_per_iteration": 2.599564552307129 + }, + { + "auxiliary_loss_clip": 0.0106758, + "auxiliary_loss_mlp": 0.01029819, + "balance_loss_clip": 1.02656269, + "balance_loss_mlp": 1.01852441, + "epoch": 0.5207274913572825, + "flos": 20230240878720.0, + "grad_norm": 1.8250351550800537, + "language_loss": 0.77631748, + "learning_rate": 1.9628085532258906e-06, + "loss": 0.79729152, + "num_input_tokens_seen": 186081135, + "step": 8661, + "time_per_iteration": 2.582319498062134 + }, + { + "auxiliary_loss_clip": 0.01048188, + "auxiliary_loss_mlp": 0.01029546, + "balance_loss_clip": 1.02500415, + "balance_loss_mlp": 1.01786971, + "epoch": 0.5207876146099504, + "flos": 22127042378880.0, + "grad_norm": 1.8250541140486425, + "language_loss": 0.70483971, + "learning_rate": 1.9624191595959603e-06, + "loss": 0.72561711, + "num_input_tokens_seen": 186099700, + "step": 8662, + "time_per_iteration": 2.707517147064209 + }, + { + "auxiliary_loss_clip": 0.01050924, + "auxiliary_loss_mlp": 0.01031737, + "balance_loss_clip": 1.02435791, + "balance_loss_mlp": 1.01900601, + "epoch": 0.5208477378626184, + "flos": 23878908501120.0, + "grad_norm": 1.6331940112653038, + "language_loss": 0.69927603, + "learning_rate": 1.962029767391098e-06, + "loss": 0.72010267, + "num_input_tokens_seen": 186119740, + "step": 8663, + "time_per_iteration": 2.580226421356201 + }, + { + "auxiliary_loss_clip": 0.01038175, + "auxiliary_loss_mlp": 0.00747625, + "balance_loss_clip": 1.02328253, + "balance_loss_mlp": 1.00024903, + "epoch": 0.5209078611152863, + "flos": 20961525870720.0, + "grad_norm": 1.5910068247045122, + "language_loss": 0.77069533, + "learning_rate": 1.961640376626072e-06, + "loss": 0.7885533, + "num_input_tokens_seen": 186140645, + "step": 8664, + "time_per_iteration": 2.644709348678589 + }, + { + "auxiliary_loss_clip": 0.01048004, + "auxiliary_loss_mlp": 0.01034505, + "balance_loss_clip": 1.02596498, + "balance_loss_mlp": 1.02279902, + "epoch": 0.5209679843679543, + "flos": 20667740532480.0, + "grad_norm": 2.0350662623766738, + "language_loss": 0.76202476, + "learning_rate": 1.961250987315646e-06, + "loss": 0.78284979, + "num_input_tokens_seen": 186160130, + "step": 8665, + "time_per_iteration": 2.5813965797424316 + }, + { + "auxiliary_loss_clip": 0.01062673, + "auxiliary_loss_mlp": 0.01030545, + "balance_loss_clip": 1.02981007, + "balance_loss_mlp": 1.0193454, + "epoch": 0.5210281076206222, + "flos": 20227295963520.0, + "grad_norm": 1.7848008568764684, + "language_loss": 0.71931028, + "learning_rate": 1.960861599474586e-06, + "loss": 0.74024248, + "num_input_tokens_seen": 186179485, + "step": 8666, + "time_per_iteration": 2.535949945449829 + }, + { + "auxiliary_loss_clip": 0.01054894, + "auxiliary_loss_mlp": 0.01037799, + "balance_loss_clip": 1.02581513, + "balance_loss_mlp": 1.02339888, + "epoch": 0.5210882308732903, + "flos": 16069989801600.0, + "grad_norm": 5.044967004323372, + "language_loss": 0.68975192, + "learning_rate": 1.9604722131176592e-06, + "loss": 0.71067882, + "num_input_tokens_seen": 186197140, + "step": 8667, + "time_per_iteration": 2.5776398181915283 + }, + { + "auxiliary_loss_clip": 0.01034979, + "auxiliary_loss_mlp": 0.01030088, + "balance_loss_clip": 1.03121734, + "balance_loss_mlp": 1.01870966, + "epoch": 0.5211483541259582, + "flos": 24825298089600.0, + "grad_norm": 1.6530903843307405, + "language_loss": 0.80925524, + "learning_rate": 1.960082828259629e-06, + "loss": 0.82990599, + "num_input_tokens_seen": 186216800, + "step": 8668, + "time_per_iteration": 2.772550106048584 + }, + { + "auxiliary_loss_clip": 0.01046589, + "auxiliary_loss_mlp": 0.01028697, + "balance_loss_clip": 1.02482653, + "balance_loss_mlp": 1.01656771, + "epoch": 0.5212084773786262, + "flos": 20370651143040.0, + "grad_norm": 1.8157600399736733, + "language_loss": 0.63352478, + "learning_rate": 1.9596934449152623e-06, + "loss": 0.65427756, + "num_input_tokens_seen": 186235320, + "step": 8669, + "time_per_iteration": 2.654627799987793 + }, + { + "auxiliary_loss_clip": 0.01049482, + "auxiliary_loss_mlp": 0.00747588, + "balance_loss_clip": 1.0280056, + "balance_loss_mlp": 1.00027609, + "epoch": 0.5212686006312941, + "flos": 23145468693120.0, + "grad_norm": 1.596883479644651, + "language_loss": 0.66783428, + "learning_rate": 1.959304063099325e-06, + "loss": 0.68580496, + "num_input_tokens_seen": 186254460, + "step": 8670, + "time_per_iteration": 2.627387762069702 + }, + { + "auxiliary_loss_clip": 0.01033405, + "auxiliary_loss_mlp": 0.01033468, + "balance_loss_clip": 1.02417612, + "balance_loss_mlp": 1.02197635, + "epoch": 0.5213287238839621, + "flos": 27774030314880.0, + "grad_norm": 2.4019326701964383, + "language_loss": 0.75989258, + "learning_rate": 1.9589146828265806e-06, + "loss": 0.78056139, + "num_input_tokens_seen": 186269465, + "step": 8671, + "time_per_iteration": 2.669640302658081 + }, + { + "auxiliary_loss_clip": 0.0104715, + "auxiliary_loss_mlp": 0.01035671, + "balance_loss_clip": 1.03167152, + "balance_loss_mlp": 1.0229032, + "epoch": 0.5213888471366301, + "flos": 19937676602880.0, + "grad_norm": 3.193394285673231, + "language_loss": 0.78529322, + "learning_rate": 1.958525304111796e-06, + "loss": 0.80612147, + "num_input_tokens_seen": 186288660, + "step": 8672, + "time_per_iteration": 2.638209581375122 + }, + { + "auxiliary_loss_clip": 0.01025728, + "auxiliary_loss_mlp": 0.01029904, + "balance_loss_clip": 1.02012718, + "balance_loss_mlp": 1.01862717, + "epoch": 0.521448970389298, + "flos": 16982731324800.0, + "grad_norm": 1.8627809915101456, + "language_loss": 0.72258478, + "learning_rate": 1.958135926969736e-06, + "loss": 0.74314111, + "num_input_tokens_seen": 186305760, + "step": 8673, + "time_per_iteration": 2.6798856258392334 + }, + { + "auxiliary_loss_clip": 0.01052268, + "auxiliary_loss_mlp": 0.01028495, + "balance_loss_clip": 1.0251925, + "balance_loss_mlp": 1.01622224, + "epoch": 0.5215090936419661, + "flos": 18989706816000.0, + "grad_norm": 1.4740842594272539, + "language_loss": 0.74512291, + "learning_rate": 1.957746551415166e-06, + "loss": 0.76593053, + "num_input_tokens_seen": 186324135, + "step": 8674, + "time_per_iteration": 2.6439218521118164 + }, + { + "auxiliary_loss_clip": 0.01049238, + "auxiliary_loss_mlp": 0.01034393, + "balance_loss_clip": 1.0256865, + "balance_loss_mlp": 1.02074337, + "epoch": 0.521569216894634, + "flos": 16143427157760.0, + "grad_norm": 2.1887364558014646, + "language_loss": 0.85614419, + "learning_rate": 1.9573571774628506e-06, + "loss": 0.87698048, + "num_input_tokens_seen": 186340205, + "step": 8675, + "time_per_iteration": 2.6462860107421875 + }, + { + "auxiliary_loss_clip": 0.00995923, + "auxiliary_loss_mlp": 0.01001462, + "balance_loss_clip": 1.00871301, + "balance_loss_mlp": 1.00003171, + "epoch": 0.521629340147302, + "flos": 57579493282560.0, + "grad_norm": 0.886466833199088, + "language_loss": 0.63140184, + "learning_rate": 1.9569678051275556e-06, + "loss": 0.65137571, + "num_input_tokens_seen": 186396940, + "step": 8676, + "time_per_iteration": 3.1940531730651855 + }, + { + "auxiliary_loss_clip": 0.0105673, + "auxiliary_loss_mlp": 0.01028647, + "balance_loss_clip": 1.02640092, + "balance_loss_mlp": 1.01717949, + "epoch": 0.5216894633999699, + "flos": 26796901662720.0, + "grad_norm": 1.5124665885569313, + "language_loss": 0.68892765, + "learning_rate": 1.956578434424046e-06, + "loss": 0.70978141, + "num_input_tokens_seen": 186418680, + "step": 8677, + "time_per_iteration": 4.276286840438843 + }, + { + "auxiliary_loss_clip": 0.01053093, + "auxiliary_loss_mlp": 0.0103081, + "balance_loss_clip": 1.02327919, + "balance_loss_mlp": 1.0187881, + "epoch": 0.5217495866526379, + "flos": 26358719650560.0, + "grad_norm": 1.5733399873235856, + "language_loss": 0.64941156, + "learning_rate": 1.956189065367086e-06, + "loss": 0.67025054, + "num_input_tokens_seen": 186438265, + "step": 8678, + "time_per_iteration": 2.6639292240142822 + }, + { + "auxiliary_loss_clip": 0.01044278, + "auxiliary_loss_mlp": 0.01033257, + "balance_loss_clip": 1.02351618, + "balance_loss_mlp": 1.02047145, + "epoch": 0.5218097099053058, + "flos": 23584009841280.0, + "grad_norm": 3.1189653514863176, + "language_loss": 0.68149722, + "learning_rate": 1.9557996979714414e-06, + "loss": 0.70227259, + "num_input_tokens_seen": 186456870, + "step": 8679, + "time_per_iteration": 4.353177785873413 + }, + { + "auxiliary_loss_clip": 0.01073278, + "auxiliary_loss_mlp": 0.01033698, + "balance_loss_clip": 1.029863, + "balance_loss_mlp": 1.02154422, + "epoch": 0.5218698331579739, + "flos": 18077396256000.0, + "grad_norm": 1.6374756232684038, + "language_loss": 0.66648346, + "learning_rate": 1.9554103322518764e-06, + "loss": 0.68755317, + "num_input_tokens_seen": 186476425, + "step": 8680, + "time_per_iteration": 2.607208728790283 + }, + { + "auxiliary_loss_clip": 0.01069636, + "auxiliary_loss_mlp": 0.01031514, + "balance_loss_clip": 1.02668262, + "balance_loss_mlp": 1.01882398, + "epoch": 0.5219299564106418, + "flos": 19281121856640.0, + "grad_norm": 1.8187009731536725, + "language_loss": 0.83147764, + "learning_rate": 1.955020968223156e-06, + "loss": 0.85248917, + "num_input_tokens_seen": 186492555, + "step": 8681, + "time_per_iteration": 2.8507132530212402 + }, + { + "auxiliary_loss_clip": 0.01047288, + "auxiliary_loss_mlp": 0.01028703, + "balance_loss_clip": 1.0251658, + "balance_loss_mlp": 1.01696658, + "epoch": 0.5219900796633098, + "flos": 26651355753600.0, + "grad_norm": 2.008063847009258, + "language_loss": 0.77654016, + "learning_rate": 1.9546316059000454e-06, + "loss": 0.7973001, + "num_input_tokens_seen": 186513190, + "step": 8682, + "time_per_iteration": 2.732506275177002 + }, + { + "auxiliary_loss_clip": 0.01029157, + "auxiliary_loss_mlp": 0.01037338, + "balance_loss_clip": 1.02317226, + "balance_loss_mlp": 1.02582252, + "epoch": 0.5220502029159777, + "flos": 34312717382400.0, + "grad_norm": 1.436492888438757, + "language_loss": 0.69071364, + "learning_rate": 1.9542422452973082e-06, + "loss": 0.71137857, + "num_input_tokens_seen": 186534830, + "step": 8683, + "time_per_iteration": 2.7906792163848877 + }, + { + "auxiliary_loss_clip": 0.01039918, + "auxiliary_loss_mlp": 0.01036862, + "balance_loss_clip": 1.02655399, + "balance_loss_mlp": 1.02398086, + "epoch": 0.5221103261686457, + "flos": 22156488552960.0, + "grad_norm": 1.6314417573607536, + "language_loss": 0.76025057, + "learning_rate": 1.9538528864297104e-06, + "loss": 0.78101838, + "num_input_tokens_seen": 186554390, + "step": 8684, + "time_per_iteration": 2.6706206798553467 + }, + { + "auxiliary_loss_clip": 0.01047249, + "auxiliary_loss_mlp": 0.00747509, + "balance_loss_clip": 1.02269459, + "balance_loss_mlp": 1.00020039, + "epoch": 0.5221704494213137, + "flos": 19208402772480.0, + "grad_norm": 1.8899444287039735, + "language_loss": 0.75942147, + "learning_rate": 1.9534635293120153e-06, + "loss": 0.77736902, + "num_input_tokens_seen": 186572360, + "step": 8685, + "time_per_iteration": 2.595653772354126 + }, + { + "auxiliary_loss_clip": 0.01051196, + "auxiliary_loss_mlp": 0.01033733, + "balance_loss_clip": 1.02822793, + "balance_loss_mlp": 1.02210999, + "epoch": 0.5222305726739817, + "flos": 19354056422400.0, + "grad_norm": 1.7542620751877005, + "language_loss": 0.80823106, + "learning_rate": 1.9530741739589876e-06, + "loss": 0.82908034, + "num_input_tokens_seen": 186590655, + "step": 8686, + "time_per_iteration": 2.6572680473327637 + }, + { + "auxiliary_loss_clip": 0.01048384, + "auxiliary_loss_mlp": 0.01033392, + "balance_loss_clip": 1.02637148, + "balance_loss_mlp": 1.02243721, + "epoch": 0.5222906959266497, + "flos": 27814789272960.0, + "grad_norm": 1.801215144714294, + "language_loss": 0.69897199, + "learning_rate": 1.9526848203853927e-06, + "loss": 0.71978968, + "num_input_tokens_seen": 186610345, + "step": 8687, + "time_per_iteration": 2.6832146644592285 + }, + { + "auxiliary_loss_clip": 0.01066984, + "auxiliary_loss_mlp": 0.01028484, + "balance_loss_clip": 1.0263629, + "balance_loss_mlp": 1.01715922, + "epoch": 0.5223508191793176, + "flos": 12712988615040.0, + "grad_norm": 2.145641525805892, + "language_loss": 0.82993281, + "learning_rate": 1.9522954686059936e-06, + "loss": 0.85088748, + "num_input_tokens_seen": 186624360, + "step": 8688, + "time_per_iteration": 4.0998005867004395 + }, + { + "auxiliary_loss_clip": 0.0105875, + "auxiliary_loss_mlp": 0.00747596, + "balance_loss_clip": 1.02759695, + "balance_loss_mlp": 1.00021267, + "epoch": 0.5224109424319856, + "flos": 15632238752640.0, + "grad_norm": 2.3419585221760215, + "language_loss": 0.73940563, + "learning_rate": 1.9519061186355558e-06, + "loss": 0.75746906, + "num_input_tokens_seen": 186638680, + "step": 8689, + "time_per_iteration": 2.599494695663452 + }, + { + "auxiliary_loss_clip": 0.01041188, + "auxiliary_loss_mlp": 0.01028738, + "balance_loss_clip": 1.02313197, + "balance_loss_mlp": 1.01715088, + "epoch": 0.5224710656846535, + "flos": 15742233175680.0, + "grad_norm": 1.8429982203276942, + "language_loss": 0.82736832, + "learning_rate": 1.9515167704888417e-06, + "loss": 0.84806758, + "num_input_tokens_seen": 186655840, + "step": 8690, + "time_per_iteration": 4.208852767944336 + }, + { + "auxiliary_loss_clip": 0.01034218, + "auxiliary_loss_mlp": 0.01034156, + "balance_loss_clip": 1.02620745, + "balance_loss_mlp": 1.02142453, + "epoch": 0.5225311889373215, + "flos": 26030998938240.0, + "grad_norm": 1.9173268674089112, + "language_loss": 0.78975403, + "learning_rate": 1.9511274241806173e-06, + "loss": 0.81043774, + "num_input_tokens_seen": 186674150, + "step": 8691, + "time_per_iteration": 2.666158437728882 + }, + { + "auxiliary_loss_clip": 0.0106446, + "auxiliary_loss_mlp": 0.01036006, + "balance_loss_clip": 1.02866912, + "balance_loss_mlp": 1.02243376, + "epoch": 0.5225913121899894, + "flos": 18369278173440.0, + "grad_norm": 2.9067341642795306, + "language_loss": 0.7639358, + "learning_rate": 1.950738079725646e-06, + "loss": 0.78494048, + "num_input_tokens_seen": 186690675, + "step": 8692, + "time_per_iteration": 2.5992069244384766 + }, + { + "auxiliary_loss_clip": 0.01056762, + "auxiliary_loss_mlp": 0.01030225, + "balance_loss_clip": 1.02691817, + "balance_loss_mlp": 1.01943636, + "epoch": 0.5226514354426575, + "flos": 29273516501760.0, + "grad_norm": 1.8147013981811144, + "language_loss": 0.72501242, + "learning_rate": 1.950348737138691e-06, + "loss": 0.74588233, + "num_input_tokens_seen": 186710380, + "step": 8693, + "time_per_iteration": 2.668745279312134 + }, + { + "auxiliary_loss_clip": 0.0107249, + "auxiliary_loss_mlp": 0.01035719, + "balance_loss_clip": 1.02750444, + "balance_loss_mlp": 1.02213514, + "epoch": 0.5227115586953254, + "flos": 22853299466880.0, + "grad_norm": 2.911967903436244, + "language_loss": 0.8158325, + "learning_rate": 1.949959396434517e-06, + "loss": 0.8369146, + "num_input_tokens_seen": 186729135, + "step": 8694, + "time_per_iteration": 2.6301801204681396 + }, + { + "auxiliary_loss_clip": 0.00982413, + "auxiliary_loss_mlp": 0.0100236, + "balance_loss_clip": 1.00430262, + "balance_loss_mlp": 1.00100124, + "epoch": 0.5227716819479934, + "flos": 57474419022720.0, + "grad_norm": 0.7811903723371633, + "language_loss": 0.55651277, + "learning_rate": 1.949570057627888e-06, + "loss": 0.57636046, + "num_input_tokens_seen": 186791115, + "step": 8695, + "time_per_iteration": 3.2333736419677734 + }, + { + "auxiliary_loss_clip": 0.01012948, + "auxiliary_loss_mlp": 0.01036803, + "balance_loss_clip": 1.02772641, + "balance_loss_mlp": 1.02490568, + "epoch": 0.5228318052006613, + "flos": 13808264077440.0, + "grad_norm": 1.8450066765426154, + "language_loss": 0.73178542, + "learning_rate": 1.9491807207335672e-06, + "loss": 0.75228292, + "num_input_tokens_seen": 186808660, + "step": 8696, + "time_per_iteration": 2.715985059738159 + }, + { + "auxiliary_loss_clip": 0.01045072, + "auxiliary_loss_mlp": 0.01033639, + "balance_loss_clip": 1.02519345, + "balance_loss_mlp": 1.02156341, + "epoch": 0.5228919284533293, + "flos": 15596184476160.0, + "grad_norm": 1.685571635350792, + "language_loss": 0.71188194, + "learning_rate": 1.948791385766319e-06, + "loss": 0.73266906, + "num_input_tokens_seen": 186825900, + "step": 8697, + "time_per_iteration": 2.6433863639831543 + }, + { + "auxiliary_loss_clip": 0.01046731, + "auxiliary_loss_mlp": 0.01031596, + "balance_loss_clip": 1.02687144, + "balance_loss_mlp": 1.02061701, + "epoch": 0.5229520517059973, + "flos": 22491499726080.0, + "grad_norm": 1.6952697064880764, + "language_loss": 0.80864406, + "learning_rate": 1.948402052740906e-06, + "loss": 0.82942736, + "num_input_tokens_seen": 186843735, + "step": 8698, + "time_per_iteration": 2.608415126800537 + }, + { + "auxiliary_loss_clip": 0.01053746, + "auxiliary_loss_mlp": 0.0103663, + "balance_loss_clip": 1.0245626, + "balance_loss_mlp": 1.02437508, + "epoch": 0.5230121749586653, + "flos": 22090880361600.0, + "grad_norm": 3.5969219538257655, + "language_loss": 0.74493051, + "learning_rate": 1.948012721672093e-06, + "loss": 0.76583421, + "num_input_tokens_seen": 186862440, + "step": 8699, + "time_per_iteration": 2.5847887992858887 + }, + { + "auxiliary_loss_clip": 0.01055851, + "auxiliary_loss_mlp": 0.00747717, + "balance_loss_clip": 1.0244199, + "balance_loss_mlp": 1.00023627, + "epoch": 0.5230722982113333, + "flos": 22127150119680.0, + "grad_norm": 1.5629880885561247, + "language_loss": 0.73266721, + "learning_rate": 1.947623392574642e-06, + "loss": 0.75070286, + "num_input_tokens_seen": 186880940, + "step": 8700, + "time_per_iteration": 2.6063637733459473 + }, + { + "auxiliary_loss_clip": 0.01042742, + "auxiliary_loss_mlp": 0.01035623, + "balance_loss_clip": 1.02605343, + "balance_loss_mlp": 1.02290344, + "epoch": 0.5231324214640012, + "flos": 25009268572800.0, + "grad_norm": 1.9877305909331373, + "language_loss": 0.67078841, + "learning_rate": 1.947234065463318e-06, + "loss": 0.69157207, + "num_input_tokens_seen": 186900785, + "step": 8701, + "time_per_iteration": 2.70579195022583 + }, + { + "auxiliary_loss_clip": 0.01055665, + "auxiliary_loss_mlp": 0.0074774, + "balance_loss_clip": 1.03073359, + "balance_loss_mlp": 1.00028205, + "epoch": 0.5231925447166692, + "flos": 25740517651200.0, + "grad_norm": 2.0751327276482847, + "language_loss": 0.67074841, + "learning_rate": 1.9468447403528826e-06, + "loss": 0.68878245, + "num_input_tokens_seen": 186920895, + "step": 8702, + "time_per_iteration": 2.766432523727417 + }, + { + "auxiliary_loss_clip": 0.01046938, + "auxiliary_loss_mlp": 0.0103341, + "balance_loss_clip": 1.02626014, + "balance_loss_mlp": 1.02107728, + "epoch": 0.5232526679693371, + "flos": 21433930565760.0, + "grad_norm": 1.9580117528810213, + "language_loss": 0.76745337, + "learning_rate": 1.946455417258101e-06, + "loss": 0.78825688, + "num_input_tokens_seen": 186940605, + "step": 8703, + "time_per_iteration": 2.610985517501831 + }, + { + "auxiliary_loss_clip": 0.01064775, + "auxiliary_loss_mlp": 0.0103682, + "balance_loss_clip": 1.02893353, + "balance_loss_mlp": 1.02290869, + "epoch": 0.5233127912220051, + "flos": 35298393471360.0, + "grad_norm": 2.3426029196335865, + "language_loss": 0.76452291, + "learning_rate": 1.9460660961937348e-06, + "loss": 0.78553885, + "num_input_tokens_seen": 186960820, + "step": 8704, + "time_per_iteration": 2.7392873764038086 + }, + { + "auxiliary_loss_clip": 0.01050651, + "auxiliary_loss_mlp": 0.01038076, + "balance_loss_clip": 1.02912951, + "balance_loss_mlp": 1.02612519, + "epoch": 0.523372914474673, + "flos": 17051320344960.0, + "grad_norm": 1.751493237359707, + "language_loss": 0.78304094, + "learning_rate": 1.9456767771745474e-06, + "loss": 0.80392826, + "num_input_tokens_seen": 186976240, + "step": 8705, + "time_per_iteration": 2.6871941089630127 + }, + { + "auxiliary_loss_clip": 0.01051675, + "auxiliary_loss_mlp": 0.01032282, + "balance_loss_clip": 1.02742624, + "balance_loss_mlp": 1.01952052, + "epoch": 0.5234330377273411, + "flos": 18406302117120.0, + "grad_norm": 1.88793205025927, + "language_loss": 0.69615734, + "learning_rate": 1.9452874602153027e-06, + "loss": 0.71699685, + "num_input_tokens_seen": 186992855, + "step": 8706, + "time_per_iteration": 2.6582376956939697 + }, + { + "auxiliary_loss_clip": 0.01003251, + "auxiliary_loss_mlp": 0.01003587, + "balance_loss_clip": 1.00557256, + "balance_loss_mlp": 1.00229955, + "epoch": 0.523493160980009, + "flos": 65850296970240.0, + "grad_norm": 0.6914170394997017, + "language_loss": 0.52463406, + "learning_rate": 1.9448981453307623e-06, + "loss": 0.54470241, + "num_input_tokens_seen": 187051205, + "step": 8707, + "time_per_iteration": 3.2265775203704834 + }, + { + "auxiliary_loss_clip": 0.01043711, + "auxiliary_loss_mlp": 0.01036885, + "balance_loss_clip": 1.02413702, + "balance_loss_mlp": 1.02483261, + "epoch": 0.523553284232677, + "flos": 21872076664320.0, + "grad_norm": 1.6164423953177012, + "language_loss": 0.7487458, + "learning_rate": 1.9445088325356904e-06, + "loss": 0.76955175, + "num_input_tokens_seen": 187070540, + "step": 8708, + "time_per_iteration": 2.7019476890563965 + }, + { + "auxiliary_loss_clip": 0.01050866, + "auxiliary_loss_mlp": 0.01025824, + "balance_loss_clip": 1.02963495, + "balance_loss_mlp": 1.01383734, + "epoch": 0.5236134074853449, + "flos": 20848191482880.0, + "grad_norm": 1.8879825391750582, + "language_loss": 0.77196538, + "learning_rate": 1.944119521844849e-06, + "loss": 0.79273224, + "num_input_tokens_seen": 187089975, + "step": 8709, + "time_per_iteration": 2.6338369846343994 + }, + { + "auxiliary_loss_clip": 0.01020464, + "auxiliary_loss_mlp": 0.01040028, + "balance_loss_clip": 1.02317071, + "balance_loss_mlp": 1.02479291, + "epoch": 0.5236735307380129, + "flos": 25520421064320.0, + "grad_norm": 2.819386119505251, + "language_loss": 0.83807993, + "learning_rate": 1.9437302132730003e-06, + "loss": 0.85868484, + "num_input_tokens_seen": 187108775, + "step": 8710, + "time_per_iteration": 2.849189043045044 + }, + { + "auxiliary_loss_clip": 0.01048153, + "auxiliary_loss_mlp": 0.01031677, + "balance_loss_clip": 1.02730739, + "balance_loss_mlp": 1.01950026, + "epoch": 0.523733653990681, + "flos": 23583112001280.0, + "grad_norm": 1.8776560140497014, + "language_loss": 0.6925385, + "learning_rate": 1.943340906834908e-06, + "loss": 0.71333671, + "num_input_tokens_seen": 187128830, + "step": 8711, + "time_per_iteration": 2.7772912979125977 + }, + { + "auxiliary_loss_clip": 0.01060594, + "auxiliary_loss_mlp": 0.01039289, + "balance_loss_clip": 1.02667546, + "balance_loss_mlp": 1.02660537, + "epoch": 0.5237937772433489, + "flos": 21106245767040.0, + "grad_norm": 1.9151776441658346, + "language_loss": 0.8316375, + "learning_rate": 1.9429516025453345e-06, + "loss": 0.85263634, + "num_input_tokens_seen": 187149570, + "step": 8712, + "time_per_iteration": 2.7119743824005127 + }, + { + "auxiliary_loss_clip": 0.01070043, + "auxiliary_loss_mlp": 0.01041297, + "balance_loss_clip": 1.02629495, + "balance_loss_mlp": 1.02813053, + "epoch": 0.5238539004960169, + "flos": 19172887200000.0, + "grad_norm": 1.749758319033612, + "language_loss": 0.69574904, + "learning_rate": 1.9425623004190415e-06, + "loss": 0.71686244, + "num_input_tokens_seen": 187170575, + "step": 8713, + "time_per_iteration": 2.6459224224090576 + }, + { + "auxiliary_loss_clip": 0.01033512, + "auxiliary_loss_mlp": 0.01040574, + "balance_loss_clip": 1.02325869, + "balance_loss_mlp": 1.02542257, + "epoch": 0.5239140237486848, + "flos": 17888218300800.0, + "grad_norm": 2.5584190137184106, + "language_loss": 0.76952899, + "learning_rate": 1.9421730004707925e-06, + "loss": 0.79026985, + "num_input_tokens_seen": 187187190, + "step": 8714, + "time_per_iteration": 2.7245070934295654 + }, + { + "auxiliary_loss_clip": 0.01034247, + "auxiliary_loss_mlp": 0.01038238, + "balance_loss_clip": 1.02554488, + "balance_loss_mlp": 1.02383137, + "epoch": 0.5239741470013528, + "flos": 17930413802880.0, + "grad_norm": 2.2923081344969334, + "language_loss": 0.75909865, + "learning_rate": 1.9417837027153483e-06, + "loss": 0.77982354, + "num_input_tokens_seen": 187204350, + "step": 8715, + "time_per_iteration": 2.7006800174713135 + }, + { + "auxiliary_loss_clip": 0.01044554, + "auxiliary_loss_mlp": 0.01033353, + "balance_loss_clip": 1.02461672, + "balance_loss_mlp": 1.02078819, + "epoch": 0.5240342702540207, + "flos": 30993386584320.0, + "grad_norm": 1.46310538601436, + "language_loss": 0.7130518, + "learning_rate": 1.9413944071674723e-06, + "loss": 0.73383087, + "num_input_tokens_seen": 187225605, + "step": 8716, + "time_per_iteration": 2.717029333114624 + }, + { + "auxiliary_loss_clip": 0.01069246, + "auxiliary_loss_mlp": 0.01035939, + "balance_loss_clip": 1.02675581, + "balance_loss_mlp": 1.02462554, + "epoch": 0.5240943935066887, + "flos": 25005066681600.0, + "grad_norm": 2.0402213471273845, + "language_loss": 0.86751735, + "learning_rate": 1.941005113841926e-06, + "loss": 0.88856918, + "num_input_tokens_seen": 187241335, + "step": 8717, + "time_per_iteration": 2.5550038814544678 + }, + { + "auxiliary_loss_clip": 0.01061678, + "auxiliary_loss_mlp": 0.01030292, + "balance_loss_clip": 1.02931142, + "balance_loss_mlp": 1.01807857, + "epoch": 0.5241545167593566, + "flos": 23659099223040.0, + "grad_norm": 2.0445031188005904, + "language_loss": 0.61247307, + "learning_rate": 1.9406158227534723e-06, + "loss": 0.63339281, + "num_input_tokens_seen": 187259925, + "step": 8718, + "time_per_iteration": 2.672743320465088 + }, + { + "auxiliary_loss_clip": 0.01042562, + "auxiliary_loss_mlp": 0.01036418, + "balance_loss_clip": 1.02520537, + "balance_loss_mlp": 1.02317405, + "epoch": 0.5242146400120247, + "flos": 23400398494080.0, + "grad_norm": 6.980043730874661, + "language_loss": 0.72012091, + "learning_rate": 1.940226533916872e-06, + "loss": 0.74091077, + "num_input_tokens_seen": 187279035, + "step": 8719, + "time_per_iteration": 2.7540509700775146 + }, + { + "auxiliary_loss_clip": 0.01056161, + "auxiliary_loss_mlp": 0.0102783, + "balance_loss_clip": 1.02606714, + "balance_loss_mlp": 1.01679742, + "epoch": 0.5242747632646926, + "flos": 17749065012480.0, + "grad_norm": 1.7904967325784515, + "language_loss": 0.72806221, + "learning_rate": 1.9398372473468877e-06, + "loss": 0.74890214, + "num_input_tokens_seen": 187297555, + "step": 8720, + "time_per_iteration": 2.7714669704437256 + }, + { + "auxiliary_loss_clip": 0.01054064, + "auxiliary_loss_mlp": 0.01035332, + "balance_loss_clip": 1.0244559, + "balance_loss_mlp": 1.02291632, + "epoch": 0.5243348865173606, + "flos": 32597731549440.0, + "grad_norm": 1.6871091676960048, + "language_loss": 0.70188749, + "learning_rate": 1.939447963058281e-06, + "loss": 0.72278142, + "num_input_tokens_seen": 187320265, + "step": 8721, + "time_per_iteration": 2.6773152351379395 + }, + { + "auxiliary_loss_clip": 0.01012367, + "auxiliary_loss_mlp": 0.01033, + "balance_loss_clip": 1.02183151, + "balance_loss_mlp": 1.02035213, + "epoch": 0.5243950097700285, + "flos": 25484115392640.0, + "grad_norm": 1.7326297655122154, + "language_loss": 0.86660272, + "learning_rate": 1.939058681065813e-06, + "loss": 0.88705641, + "num_input_tokens_seen": 187338045, + "step": 8722, + "time_per_iteration": 2.7289915084838867 + }, + { + "auxiliary_loss_clip": 0.01069326, + "auxiliary_loss_mlp": 0.01031631, + "balance_loss_clip": 1.02784824, + "balance_loss_mlp": 1.01907837, + "epoch": 0.5244551330226965, + "flos": 15268391936640.0, + "grad_norm": 1.790362960183247, + "language_loss": 0.8000896, + "learning_rate": 1.938669401384247e-06, + "loss": 0.82109916, + "num_input_tokens_seen": 187356040, + "step": 8723, + "time_per_iteration": 2.611811637878418 + }, + { + "auxiliary_loss_clip": 0.01065097, + "auxiliary_loss_mlp": 0.0104449, + "balance_loss_clip": 1.03079462, + "balance_loss_mlp": 1.03146052, + "epoch": 0.5245152562753645, + "flos": 22237108629120.0, + "grad_norm": 1.8151126349006772, + "language_loss": 0.74774879, + "learning_rate": 1.9382801240283426e-06, + "loss": 0.76884466, + "num_input_tokens_seen": 187374185, + "step": 8724, + "time_per_iteration": 4.2598631381988525 + }, + { + "auxiliary_loss_clip": 0.01072669, + "auxiliary_loss_mlp": 0.01031971, + "balance_loss_clip": 1.02637243, + "balance_loss_mlp": 1.01830947, + "epoch": 0.5245753795280325, + "flos": 29426460612480.0, + "grad_norm": 2.2148181208555036, + "language_loss": 0.70337987, + "learning_rate": 1.9378908490128625e-06, + "loss": 0.72442627, + "num_input_tokens_seen": 187396640, + "step": 8725, + "time_per_iteration": 2.667572021484375 + }, + { + "auxiliary_loss_clip": 0.00982969, + "auxiliary_loss_mlp": 0.01002895, + "balance_loss_clip": 1.00622666, + "balance_loss_mlp": 1.00139856, + "epoch": 0.5246355027807005, + "flos": 58834392785280.0, + "grad_norm": 0.7566345049097452, + "language_loss": 0.55650318, + "learning_rate": 1.937501576352568e-06, + "loss": 0.57636178, + "num_input_tokens_seen": 187455945, + "step": 8726, + "time_per_iteration": 4.736093044281006 + }, + { + "auxiliary_loss_clip": 0.01001947, + "auxiliary_loss_mlp": 0.01009493, + "balance_loss_clip": 1.01277804, + "balance_loss_mlp": 1.00809848, + "epoch": 0.5246956260333684, + "flos": 64526592965760.0, + "grad_norm": 0.798464700903352, + "language_loss": 0.58335745, + "learning_rate": 1.937112306062219e-06, + "loss": 0.60347188, + "num_input_tokens_seen": 187519975, + "step": 8727, + "time_per_iteration": 3.176658868789673 + }, + { + "auxiliary_loss_clip": 0.01060061, + "auxiliary_loss_mlp": 0.01033181, + "balance_loss_clip": 1.02657342, + "balance_loss_mlp": 1.01995444, + "epoch": 0.5247557492860364, + "flos": 24533631653760.0, + "grad_norm": 1.547904218693808, + "language_loss": 0.7080732, + "learning_rate": 1.9367230381565786e-06, + "loss": 0.72900558, + "num_input_tokens_seen": 187541775, + "step": 8728, + "time_per_iteration": 2.6449081897735596 + }, + { + "auxiliary_loss_clip": 0.01059112, + "auxiliary_loss_mlp": 0.01026883, + "balance_loss_clip": 1.02587545, + "balance_loss_mlp": 1.01485443, + "epoch": 0.5248158725387043, + "flos": 18806131382400.0, + "grad_norm": 1.4770620901100318, + "language_loss": 0.69543076, + "learning_rate": 1.9363337726504062e-06, + "loss": 0.71629071, + "num_input_tokens_seen": 187560425, + "step": 8729, + "time_per_iteration": 2.556760787963867 + }, + { + "auxiliary_loss_clip": 0.01041003, + "auxiliary_loss_mlp": 0.01033059, + "balance_loss_clip": 1.02848947, + "balance_loss_mlp": 1.0198741, + "epoch": 0.5248759957913723, + "flos": 20955851521920.0, + "grad_norm": 1.613854909809421, + "language_loss": 0.83556014, + "learning_rate": 1.935944509558464e-06, + "loss": 0.85630083, + "num_input_tokens_seen": 187579930, + "step": 8730, + "time_per_iteration": 2.7110769748687744 + }, + { + "auxiliary_loss_clip": 0.01033651, + "auxiliary_loss_mlp": 0.01032118, + "balance_loss_clip": 1.02556181, + "balance_loss_mlp": 1.01925588, + "epoch": 0.5249361190440403, + "flos": 18660980522880.0, + "grad_norm": 1.9672825757221593, + "language_loss": 0.79313523, + "learning_rate": 1.9355552488955125e-06, + "loss": 0.81379294, + "num_input_tokens_seen": 187595365, + "step": 8731, + "time_per_iteration": 2.753628969192505 + }, + { + "auxiliary_loss_clip": 0.01054943, + "auxiliary_loss_mlp": 0.0103163, + "balance_loss_clip": 1.02503896, + "balance_loss_mlp": 1.01927376, + "epoch": 0.5249962422967083, + "flos": 24863327614080.0, + "grad_norm": 1.9763856179588342, + "language_loss": 0.83224773, + "learning_rate": 1.935165990676312e-06, + "loss": 0.85311347, + "num_input_tokens_seen": 187614715, + "step": 8732, + "time_per_iteration": 2.6535415649414062 + }, + { + "auxiliary_loss_clip": 0.01058354, + "auxiliary_loss_mlp": 0.01030567, + "balance_loss_clip": 1.0266397, + "balance_loss_mlp": 1.01910496, + "epoch": 0.5250563655493762, + "flos": 15262681674240.0, + "grad_norm": 1.5966720549035645, + "language_loss": 0.77746254, + "learning_rate": 1.9347767349156237e-06, + "loss": 0.79835176, + "num_input_tokens_seen": 187630745, + "step": 8733, + "time_per_iteration": 2.525723457336426 + }, + { + "auxiliary_loss_clip": 0.01071293, + "auxiliary_loss_mlp": 0.01033772, + "balance_loss_clip": 1.02707553, + "balance_loss_mlp": 1.02061737, + "epoch": 0.5251164888020442, + "flos": 18625177641600.0, + "grad_norm": 2.134073585642658, + "language_loss": 0.81130916, + "learning_rate": 1.934387481628208e-06, + "loss": 0.83235979, + "num_input_tokens_seen": 187648200, + "step": 8734, + "time_per_iteration": 2.5461254119873047 + }, + { + "auxiliary_loss_clip": 0.01046865, + "auxiliary_loss_mlp": 0.01026183, + "balance_loss_clip": 1.02649975, + "balance_loss_mlp": 1.01419687, + "epoch": 0.5251766120547121, + "flos": 29710764760320.0, + "grad_norm": 1.4990678584055157, + "language_loss": 0.76593012, + "learning_rate": 1.933998230828826e-06, + "loss": 0.78666061, + "num_input_tokens_seen": 187669205, + "step": 8735, + "time_per_iteration": 4.3201940059661865 + }, + { + "auxiliary_loss_clip": 0.01059952, + "auxiliary_loss_mlp": 0.01030984, + "balance_loss_clip": 1.02766049, + "balance_loss_mlp": 1.01987994, + "epoch": 0.5252367353073801, + "flos": 23440295525760.0, + "grad_norm": 1.5146510468400414, + "language_loss": 0.80434293, + "learning_rate": 1.9336089825322376e-06, + "loss": 0.82525229, + "num_input_tokens_seen": 187690890, + "step": 8736, + "time_per_iteration": 2.643676996231079 + }, + { + "auxiliary_loss_clip": 0.01069878, + "auxiliary_loss_mlp": 0.0103088, + "balance_loss_clip": 1.02662086, + "balance_loss_mlp": 1.01811266, + "epoch": 0.5252968585600482, + "flos": 30810708990720.0, + "grad_norm": 2.0842949085510347, + "language_loss": 0.7008431, + "learning_rate": 1.9332197367532033e-06, + "loss": 0.72185063, + "num_input_tokens_seen": 187713045, + "step": 8737, + "time_per_iteration": 4.252533197402954 + }, + { + "auxiliary_loss_clip": 0.01049914, + "auxiliary_loss_mlp": 0.01031722, + "balance_loss_clip": 1.02660894, + "balance_loss_mlp": 1.01936579, + "epoch": 0.5253569818127161, + "flos": 20628274464000.0, + "grad_norm": 1.6083344279980174, + "language_loss": 0.77365339, + "learning_rate": 1.9328304935064833e-06, + "loss": 0.79446977, + "num_input_tokens_seen": 187733640, + "step": 8738, + "time_per_iteration": 2.667509078979492 + }, + { + "auxiliary_loss_clip": 0.00980305, + "auxiliary_loss_mlp": 0.00746932, + "balance_loss_clip": 1.00266719, + "balance_loss_mlp": 1.0006851, + "epoch": 0.5254171050653841, + "flos": 63428695810560.0, + "grad_norm": 0.8533935146799951, + "language_loss": 0.54522699, + "learning_rate": 1.932441252806837e-06, + "loss": 0.56249934, + "num_input_tokens_seen": 187792930, + "step": 8739, + "time_per_iteration": 3.2162363529205322 + }, + { + "auxiliary_loss_clip": 0.01042619, + "auxiliary_loss_mlp": 0.01034804, + "balance_loss_clip": 1.02719951, + "balance_loss_mlp": 1.02327681, + "epoch": 0.525477228318052, + "flos": 34670782108800.0, + "grad_norm": 1.8410298307706143, + "language_loss": 0.8481344, + "learning_rate": 1.9320520146690263e-06, + "loss": 0.86890858, + "num_input_tokens_seen": 187812495, + "step": 8740, + "time_per_iteration": 2.817394495010376 + }, + { + "auxiliary_loss_clip": 0.01052263, + "auxiliary_loss_mlp": 0.00747734, + "balance_loss_clip": 1.02401602, + "balance_loss_mlp": 1.00026298, + "epoch": 0.52553735157072, + "flos": 17930844766080.0, + "grad_norm": 2.276373773342116, + "language_loss": 0.69731951, + "learning_rate": 1.9316627791078093e-06, + "loss": 0.71531951, + "num_input_tokens_seen": 187829685, + "step": 8741, + "time_per_iteration": 2.62117862701416 + }, + { + "auxiliary_loss_clip": 0.01048486, + "auxiliary_loss_mlp": 0.01033272, + "balance_loss_clip": 1.02629912, + "balance_loss_mlp": 1.02095807, + "epoch": 0.5255974748233879, + "flos": 9940864584960.0, + "grad_norm": 1.8655128201455902, + "language_loss": 0.65919816, + "learning_rate": 1.931273546137947e-06, + "loss": 0.6800158, + "num_input_tokens_seen": 187846495, + "step": 8742, + "time_per_iteration": 2.740596055984497 + }, + { + "auxiliary_loss_clip": 0.01034696, + "auxiliary_loss_mlp": 0.01041918, + "balance_loss_clip": 1.0238986, + "balance_loss_mlp": 1.02807832, + "epoch": 0.5256575980760559, + "flos": 16868427269760.0, + "grad_norm": 1.9661684255675935, + "language_loss": 0.62830818, + "learning_rate": 1.9308843157741983e-06, + "loss": 0.64907432, + "num_input_tokens_seen": 187862010, + "step": 8743, + "time_per_iteration": 2.661468029022217 + }, + { + "auxiliary_loss_clip": 0.00998146, + "auxiliary_loss_mlp": 0.01007516, + "balance_loss_clip": 1.00117338, + "balance_loss_mlp": 1.00638986, + "epoch": 0.5257177213287239, + "flos": 62386210362240.0, + "grad_norm": 0.7760496559742526, + "language_loss": 0.5416162, + "learning_rate": 1.930495088031323e-06, + "loss": 0.56167281, + "num_input_tokens_seen": 187922730, + "step": 8744, + "time_per_iteration": 3.2646267414093018 + }, + { + "auxiliary_loss_clip": 0.01057989, + "auxiliary_loss_mlp": 0.01032248, + "balance_loss_clip": 1.0306536, + "balance_loss_mlp": 1.01845562, + "epoch": 0.5257778445813919, + "flos": 20776908942720.0, + "grad_norm": 2.174709776015415, + "language_loss": 0.75336182, + "learning_rate": 1.9301058629240814e-06, + "loss": 0.77426416, + "num_input_tokens_seen": 187940160, + "step": 8745, + "time_per_iteration": 2.5946285724639893 + }, + { + "auxiliary_loss_clip": 0.01055309, + "auxiliary_loss_mlp": 0.01033333, + "balance_loss_clip": 1.02536035, + "balance_loss_mlp": 1.0214119, + "epoch": 0.5258379678340598, + "flos": 17018606033280.0, + "grad_norm": 2.001462325344909, + "language_loss": 0.81078553, + "learning_rate": 1.9297166404672324e-06, + "loss": 0.83167201, + "num_input_tokens_seen": 187958625, + "step": 8746, + "time_per_iteration": 2.5639359951019287 + }, + { + "auxiliary_loss_clip": 0.01058123, + "auxiliary_loss_mlp": 0.01033547, + "balance_loss_clip": 1.02630854, + "balance_loss_mlp": 1.02107763, + "epoch": 0.5258980910867278, + "flos": 21068754946560.0, + "grad_norm": 2.334581558736624, + "language_loss": 0.75041693, + "learning_rate": 1.9293274206755353e-06, + "loss": 0.77133363, + "num_input_tokens_seen": 187977575, + "step": 8747, + "time_per_iteration": 2.7914562225341797 + }, + { + "auxiliary_loss_clip": 0.01004725, + "auxiliary_loss_mlp": 0.01031444, + "balance_loss_clip": 1.02264857, + "balance_loss_mlp": 1.01952934, + "epoch": 0.5259582143393957, + "flos": 18004461690240.0, + "grad_norm": 1.7909114661193122, + "language_loss": 0.82443249, + "learning_rate": 1.9289382035637505e-06, + "loss": 0.84479421, + "num_input_tokens_seen": 187996650, + "step": 8748, + "time_per_iteration": 2.7829484939575195 + }, + { + "auxiliary_loss_clip": 0.01045675, + "auxiliary_loss_mlp": 0.01035806, + "balance_loss_clip": 1.02361047, + "balance_loss_mlp": 1.02297878, + "epoch": 0.5260183375920637, + "flos": 22783848520320.0, + "grad_norm": 1.9412575268242183, + "language_loss": 0.8025437, + "learning_rate": 1.9285489891466345e-06, + "loss": 0.82335854, + "num_input_tokens_seen": 188013510, + "step": 8749, + "time_per_iteration": 2.607029914855957 + }, + { + "auxiliary_loss_clip": 0.01061658, + "auxiliary_loss_mlp": 0.01038707, + "balance_loss_clip": 1.02939117, + "balance_loss_mlp": 1.02644002, + "epoch": 0.5260784608447318, + "flos": 27052406081280.0, + "grad_norm": 5.191941122073076, + "language_loss": 0.72150564, + "learning_rate": 1.9281597774389487e-06, + "loss": 0.74250925, + "num_input_tokens_seen": 188032085, + "step": 8750, + "time_per_iteration": 2.7762598991394043 + }, + { + "auxiliary_loss_clip": 0.01044125, + "auxiliary_loss_mlp": 0.0102972, + "balance_loss_clip": 1.02356446, + "balance_loss_mlp": 1.01770926, + "epoch": 0.5261385840973997, + "flos": 20662820369280.0, + "grad_norm": 1.3964140261942666, + "language_loss": 0.76433909, + "learning_rate": 1.9277705684554517e-06, + "loss": 0.78507757, + "num_input_tokens_seen": 188050590, + "step": 8751, + "time_per_iteration": 2.7012228965759277 + }, + { + "auxiliary_loss_clip": 0.01067545, + "auxiliary_loss_mlp": 0.01035605, + "balance_loss_clip": 1.02763367, + "balance_loss_mlp": 1.02450037, + "epoch": 0.5261987073500677, + "flos": 23622649896960.0, + "grad_norm": 1.4893329263551804, + "language_loss": 0.75760561, + "learning_rate": 1.927381362210902e-06, + "loss": 0.77863705, + "num_input_tokens_seen": 188071620, + "step": 8752, + "time_per_iteration": 2.725846290588379 + }, + { + "auxiliary_loss_clip": 0.01063432, + "auxiliary_loss_mlp": 0.01039625, + "balance_loss_clip": 1.02973831, + "balance_loss_mlp": 1.02709651, + "epoch": 0.5262588306027356, + "flos": 27636241743360.0, + "grad_norm": 1.574022092860652, + "language_loss": 0.6811682, + "learning_rate": 1.926992158720058e-06, + "loss": 0.70219874, + "num_input_tokens_seen": 188091740, + "step": 8753, + "time_per_iteration": 2.737194299697876 + }, + { + "auxiliary_loss_clip": 0.01060909, + "auxiliary_loss_mlp": 0.01035982, + "balance_loss_clip": 1.02841377, + "balance_loss_mlp": 1.02414489, + "epoch": 0.5263189538554036, + "flos": 21759711943680.0, + "grad_norm": 2.0747777172154187, + "language_loss": 0.83741665, + "learning_rate": 1.9266029579976785e-06, + "loss": 0.85838556, + "num_input_tokens_seen": 188111165, + "step": 8754, + "time_per_iteration": 2.623600959777832 + }, + { + "auxiliary_loss_clip": 0.01061226, + "auxiliary_loss_mlp": 0.01035217, + "balance_loss_clip": 1.02833414, + "balance_loss_mlp": 1.02302158, + "epoch": 0.5263790771080715, + "flos": 14276359140480.0, + "grad_norm": 2.0868051350657093, + "language_loss": 0.87658781, + "learning_rate": 1.926213760058522e-06, + "loss": 0.89755213, + "num_input_tokens_seen": 188127825, + "step": 8755, + "time_per_iteration": 2.6492018699645996 + }, + { + "auxiliary_loss_clip": 0.00984257, + "auxiliary_loss_mlp": 0.01009605, + "balance_loss_clip": 1.00603926, + "balance_loss_mlp": 1.00853825, + "epoch": 0.5264392003607395, + "flos": 65806413528960.0, + "grad_norm": 0.7257572480979214, + "language_loss": 0.58858359, + "learning_rate": 1.9258245649173477e-06, + "loss": 0.6085223, + "num_input_tokens_seen": 188194050, + "step": 8756, + "time_per_iteration": 3.325061559677124 + }, + { + "auxiliary_loss_clip": 0.0103558, + "auxiliary_loss_mlp": 0.01033802, + "balance_loss_clip": 1.0231843, + "balance_loss_mlp": 1.02141595, + "epoch": 0.5264993236134075, + "flos": 21032413361280.0, + "grad_norm": 1.6612876783076904, + "language_loss": 0.70803553, + "learning_rate": 1.925435372588913e-06, + "loss": 0.72872931, + "num_input_tokens_seen": 188212565, + "step": 8757, + "time_per_iteration": 2.7267649173736572 + }, + { + "auxiliary_loss_clip": 0.01059673, + "auxiliary_loss_mlp": 0.01034856, + "balance_loss_clip": 1.02622485, + "balance_loss_mlp": 1.02267325, + "epoch": 0.5265594468660755, + "flos": 16618202150400.0, + "grad_norm": 1.549609319340007, + "language_loss": 0.88023293, + "learning_rate": 1.9250461830879768e-06, + "loss": 0.90117824, + "num_input_tokens_seen": 188229505, + "step": 8758, + "time_per_iteration": 2.678095579147339 + }, + { + "auxiliary_loss_clip": 0.01009488, + "auxiliary_loss_mlp": 0.01034453, + "balance_loss_clip": 1.02340698, + "balance_loss_mlp": 1.02123845, + "epoch": 0.5266195701187434, + "flos": 24134125610880.0, + "grad_norm": 1.3521418260251186, + "language_loss": 0.76023316, + "learning_rate": 1.9246569964292965e-06, + "loss": 0.78067255, + "num_input_tokens_seen": 188250395, + "step": 8759, + "time_per_iteration": 2.757070302963257 + }, + { + "auxiliary_loss_clip": 0.01047503, + "auxiliary_loss_mlp": 0.01029227, + "balance_loss_clip": 1.02646422, + "balance_loss_mlp": 1.0176456, + "epoch": 0.5266796933714114, + "flos": 15844111125120.0, + "grad_norm": 3.9194901373998916, + "language_loss": 0.71590954, + "learning_rate": 1.9242678126276307e-06, + "loss": 0.73667687, + "num_input_tokens_seen": 188266785, + "step": 8760, + "time_per_iteration": 2.6501152515411377 + }, + { + "auxiliary_loss_clip": 0.01052737, + "auxiliary_loss_mlp": 0.01036607, + "balance_loss_clip": 1.02718985, + "balance_loss_mlp": 1.02356577, + "epoch": 0.5267398166240793, + "flos": 20951434149120.0, + "grad_norm": 2.026605130570549, + "language_loss": 0.75963122, + "learning_rate": 1.923878631697736e-06, + "loss": 0.78052467, + "num_input_tokens_seen": 188282525, + "step": 8761, + "time_per_iteration": 2.6669235229492188 + }, + { + "auxiliary_loss_clip": 0.01052765, + "auxiliary_loss_mlp": 0.00747541, + "balance_loss_clip": 1.0258075, + "balance_loss_mlp": 1.00022399, + "epoch": 0.5267999398767473, + "flos": 20996394998400.0, + "grad_norm": 2.124014400765762, + "language_loss": 0.70698905, + "learning_rate": 1.923489453654373e-06, + "loss": 0.7249921, + "num_input_tokens_seen": 188301395, + "step": 8762, + "time_per_iteration": 2.6928415298461914 + }, + { + "auxiliary_loss_clip": 0.00993218, + "auxiliary_loss_mlp": 0.01003558, + "balance_loss_clip": 1.00579619, + "balance_loss_mlp": 1.00221717, + "epoch": 0.5268600631294152, + "flos": 66849401767680.0, + "grad_norm": 0.9257000996299485, + "language_loss": 0.65401983, + "learning_rate": 1.9231002785122963e-06, + "loss": 0.67398763, + "num_input_tokens_seen": 188357665, + "step": 8763, + "time_per_iteration": 3.131619930267334 + }, + { + "auxiliary_loss_clip": 0.0105989, + "auxiliary_loss_mlp": 0.01034692, + "balance_loss_clip": 1.02748585, + "balance_loss_mlp": 1.02234173, + "epoch": 0.5269201863820833, + "flos": 17165552572800.0, + "grad_norm": 1.640983442852713, + "language_loss": 0.7102918, + "learning_rate": 1.922711106286265e-06, + "loss": 0.73123765, + "num_input_tokens_seen": 188376935, + "step": 8764, + "time_per_iteration": 2.7000715732574463 + }, + { + "auxiliary_loss_clip": 0.01022965, + "auxiliary_loss_mlp": 0.01032147, + "balance_loss_clip": 1.02286458, + "balance_loss_mlp": 1.01849735, + "epoch": 0.5269803096347513, + "flos": 20522589672960.0, + "grad_norm": 1.5644858496241942, + "language_loss": 0.74220544, + "learning_rate": 1.9223219369910368e-06, + "loss": 0.76275659, + "num_input_tokens_seen": 188394995, + "step": 8765, + "time_per_iteration": 2.6385936737060547 + }, + { + "auxiliary_loss_clip": 0.01046197, + "auxiliary_loss_mlp": 0.01033868, + "balance_loss_clip": 1.02301335, + "balance_loss_mlp": 1.02092147, + "epoch": 0.5270404328874192, + "flos": 27230989524480.0, + "grad_norm": 1.5623957267450206, + "language_loss": 0.85791028, + "learning_rate": 1.9219327706413677e-06, + "loss": 0.87871087, + "num_input_tokens_seen": 188415475, + "step": 8766, + "time_per_iteration": 2.7497076988220215 + }, + { + "auxiliary_loss_clip": 0.01075129, + "auxiliary_loss_mlp": 0.01037003, + "balance_loss_clip": 1.03122985, + "balance_loss_mlp": 1.02376533, + "epoch": 0.5271005561400872, + "flos": 23110491824640.0, + "grad_norm": 1.7842701091329007, + "language_loss": 0.78866094, + "learning_rate": 1.921543607252017e-06, + "loss": 0.80978227, + "num_input_tokens_seen": 188435665, + "step": 8767, + "time_per_iteration": 2.682549476623535 + }, + { + "auxiliary_loss_clip": 0.01064666, + "auxiliary_loss_mlp": 0.01035328, + "balance_loss_clip": 1.03039134, + "balance_loss_mlp": 1.02197051, + "epoch": 0.5271606793927551, + "flos": 22564793427840.0, + "grad_norm": 2.0895402258594657, + "language_loss": 0.73856533, + "learning_rate": 1.9211544468377394e-06, + "loss": 0.75956523, + "num_input_tokens_seen": 188455405, + "step": 8768, + "time_per_iteration": 2.659118175506592 + }, + { + "auxiliary_loss_clip": 0.01038533, + "auxiliary_loss_mlp": 0.01039515, + "balance_loss_clip": 1.02314913, + "balance_loss_mlp": 1.02752852, + "epoch": 0.5272208026454231, + "flos": 18764259102720.0, + "grad_norm": 3.7204982773085504, + "language_loss": 0.73686284, + "learning_rate": 1.9207652894132933e-06, + "loss": 0.75764334, + "num_input_tokens_seen": 188472940, + "step": 8769, + "time_per_iteration": 2.698474645614624 + }, + { + "auxiliary_loss_clip": 0.01031214, + "auxiliary_loss_mlp": 0.01031709, + "balance_loss_clip": 1.02313352, + "balance_loss_mlp": 1.0199784, + "epoch": 0.5272809258980911, + "flos": 20412164286720.0, + "grad_norm": 1.594744624881188, + "language_loss": 0.73184836, + "learning_rate": 1.920376134993436e-06, + "loss": 0.75247753, + "num_input_tokens_seen": 188493035, + "step": 8770, + "time_per_iteration": 2.660670042037964 + }, + { + "auxiliary_loss_clip": 0.01071782, + "auxiliary_loss_mlp": 0.01030565, + "balance_loss_clip": 1.02845716, + "balance_loss_mlp": 1.01865625, + "epoch": 0.5273410491507591, + "flos": 28256742213120.0, + "grad_norm": 1.929171190965881, + "language_loss": 0.67831248, + "learning_rate": 1.9199869835929224e-06, + "loss": 0.69933599, + "num_input_tokens_seen": 188513860, + "step": 8771, + "time_per_iteration": 2.567336320877075 + }, + { + "auxiliary_loss_clip": 0.01057627, + "auxiliary_loss_mlp": 0.01034438, + "balance_loss_clip": 1.02617955, + "balance_loss_mlp": 1.02202868, + "epoch": 0.527401172403427, + "flos": 22455158140800.0, + "grad_norm": 1.7863555507984017, + "language_loss": 0.76595235, + "learning_rate": 1.9195978352265115e-06, + "loss": 0.78687298, + "num_input_tokens_seen": 188533345, + "step": 8772, + "time_per_iteration": 4.1634438037872314 + }, + { + "auxiliary_loss_clip": 0.01051002, + "auxiliary_loss_mlp": 0.01042731, + "balance_loss_clip": 1.0249871, + "balance_loss_mlp": 1.02977872, + "epoch": 0.527461295656095, + "flos": 21031084558080.0, + "grad_norm": 1.8735670960961013, + "language_loss": 0.65942007, + "learning_rate": 1.9192086899089585e-06, + "loss": 0.68035734, + "num_input_tokens_seen": 188551550, + "step": 8773, + "time_per_iteration": 2.548842191696167 + }, + { + "auxiliary_loss_clip": 0.01042883, + "auxiliary_loss_mlp": 0.01041065, + "balance_loss_clip": 1.02792239, + "balance_loss_mlp": 1.0295434, + "epoch": 0.5275214189087629, + "flos": 26322018929280.0, + "grad_norm": 1.544909393756512, + "language_loss": 0.85906267, + "learning_rate": 1.91881954765502e-06, + "loss": 0.87990212, + "num_input_tokens_seen": 188571615, + "step": 8774, + "time_per_iteration": 4.3944761753082275 + }, + { + "auxiliary_loss_clip": 0.01042712, + "auxiliary_loss_mlp": 0.01028396, + "balance_loss_clip": 1.02595317, + "balance_loss_mlp": 1.01689827, + "epoch": 0.5275815421614309, + "flos": 20047024581120.0, + "grad_norm": 1.575645164018757, + "language_loss": 0.79851711, + "learning_rate": 1.9184304084794523e-06, + "loss": 0.81922817, + "num_input_tokens_seen": 188591965, + "step": 8775, + "time_per_iteration": 2.6537528038024902 + }, + { + "auxiliary_loss_clip": 0.01042758, + "auxiliary_loss_mlp": 0.01034623, + "balance_loss_clip": 1.02391398, + "balance_loss_mlp": 1.02248144, + "epoch": 0.5276416654140988, + "flos": 21432206712960.0, + "grad_norm": 1.7141230383888986, + "language_loss": 0.83637953, + "learning_rate": 1.918041272397012e-06, + "loss": 0.8571533, + "num_input_tokens_seen": 188610675, + "step": 8776, + "time_per_iteration": 2.6076343059539795 + }, + { + "auxiliary_loss_clip": 0.01050007, + "auxiliary_loss_mlp": 0.01033263, + "balance_loss_clip": 1.02645755, + "balance_loss_mlp": 1.02078772, + "epoch": 0.5277017886667669, + "flos": 17165085696000.0, + "grad_norm": 1.6740241385090895, + "language_loss": 0.67699766, + "learning_rate": 1.9176521394224547e-06, + "loss": 0.69783044, + "num_input_tokens_seen": 188628235, + "step": 8777, + "time_per_iteration": 2.612950325012207 + }, + { + "auxiliary_loss_clip": 0.01042675, + "auxiliary_loss_mlp": 0.01036398, + "balance_loss_clip": 1.02667189, + "balance_loss_mlp": 1.0245012, + "epoch": 0.5277619119194349, + "flos": 20448146736000.0, + "grad_norm": 1.49079505036861, + "language_loss": 0.81990409, + "learning_rate": 1.9172630095705358e-06, + "loss": 0.84069479, + "num_input_tokens_seen": 188648925, + "step": 8778, + "time_per_iteration": 2.626453161239624 + }, + { + "auxiliary_loss_clip": 0.01060664, + "auxiliary_loss_mlp": 0.01031922, + "balance_loss_clip": 1.02753282, + "balance_loss_mlp": 1.01938093, + "epoch": 0.5278220351721028, + "flos": 24061083304320.0, + "grad_norm": 3.445756851025083, + "language_loss": 0.79397476, + "learning_rate": 1.916873882856013e-06, + "loss": 0.81490058, + "num_input_tokens_seen": 188668125, + "step": 8779, + "time_per_iteration": 2.687446355819702 + }, + { + "auxiliary_loss_clip": 0.01053732, + "auxiliary_loss_mlp": 0.01030649, + "balance_loss_clip": 1.0241096, + "balance_loss_mlp": 1.0197413, + "epoch": 0.5278821584247708, + "flos": 24642907804800.0, + "grad_norm": 1.981971179359841, + "language_loss": 0.76741421, + "learning_rate": 1.9164847592936406e-06, + "loss": 0.78825796, + "num_input_tokens_seen": 188684410, + "step": 8780, + "time_per_iteration": 2.644479274749756 + }, + { + "auxiliary_loss_clip": 0.01042794, + "auxiliary_loss_mlp": 0.01027792, + "balance_loss_clip": 1.02823758, + "balance_loss_mlp": 1.01501858, + "epoch": 0.5279422816774387, + "flos": 35408244240000.0, + "grad_norm": 2.2001046810657714, + "language_loss": 0.69336104, + "learning_rate": 1.916095638898174e-06, + "loss": 0.71406692, + "num_input_tokens_seen": 188706130, + "step": 8781, + "time_per_iteration": 2.8597912788391113 + }, + { + "auxiliary_loss_clip": 0.01056624, + "auxiliary_loss_mlp": 0.01033869, + "balance_loss_clip": 1.02573228, + "balance_loss_mlp": 1.02327144, + "epoch": 0.5280024049301068, + "flos": 22967028904320.0, + "grad_norm": 1.7000045465332223, + "language_loss": 0.72326946, + "learning_rate": 1.9157065216843696e-06, + "loss": 0.74417436, + "num_input_tokens_seen": 188725030, + "step": 8782, + "time_per_iteration": 4.209277629852295 + }, + { + "auxiliary_loss_clip": 0.01045929, + "auxiliary_loss_mlp": 0.01029597, + "balance_loss_clip": 1.02537966, + "balance_loss_mlp": 1.01827836, + "epoch": 0.5280625281827747, + "flos": 21507619317120.0, + "grad_norm": 1.852454881367771, + "language_loss": 0.68756485, + "learning_rate": 1.915317407666982e-06, + "loss": 0.70832014, + "num_input_tokens_seen": 188744325, + "step": 8783, + "time_per_iteration": 2.7523365020751953 + }, + { + "auxiliary_loss_clip": 0.01062671, + "auxiliary_loss_mlp": 0.01039192, + "balance_loss_clip": 1.02717435, + "balance_loss_mlp": 1.02487516, + "epoch": 0.5281226514354427, + "flos": 31208167958400.0, + "grad_norm": 1.8785847759005734, + "language_loss": 0.6931569, + "learning_rate": 1.9149282968607674e-06, + "loss": 0.71417552, + "num_input_tokens_seen": 188765100, + "step": 8784, + "time_per_iteration": 2.931262493133545 + }, + { + "auxiliary_loss_clip": 0.01071161, + "auxiliary_loss_mlp": 0.0103281, + "balance_loss_clip": 1.02588201, + "balance_loss_mlp": 1.01941073, + "epoch": 0.5281827746881106, + "flos": 25077821679360.0, + "grad_norm": 2.7973875638448296, + "language_loss": 0.75148833, + "learning_rate": 1.91453918928048e-06, + "loss": 0.77252805, + "num_input_tokens_seen": 188783995, + "step": 8785, + "time_per_iteration": 4.149796009063721 + }, + { + "auxiliary_loss_clip": 0.01060664, + "auxiliary_loss_mlp": 0.01030027, + "balance_loss_clip": 1.02752388, + "balance_loss_mlp": 1.01723027, + "epoch": 0.5282428979407786, + "flos": 20631255292800.0, + "grad_norm": 1.641740719925195, + "language_loss": 0.83322948, + "learning_rate": 1.9141500849408745e-06, + "loss": 0.85413641, + "num_input_tokens_seen": 188803120, + "step": 8786, + "time_per_iteration": 2.566042184829712 + }, + { + "auxiliary_loss_clip": 0.01034129, + "auxiliary_loss_mlp": 0.01024926, + "balance_loss_clip": 1.02426946, + "balance_loss_mlp": 1.01437628, + "epoch": 0.5283030211934465, + "flos": 22419391173120.0, + "grad_norm": 2.340368841687031, + "language_loss": 0.82745326, + "learning_rate": 1.9137609838567076e-06, + "loss": 0.8480438, + "num_input_tokens_seen": 188820960, + "step": 8787, + "time_per_iteration": 2.6629865169525146 + }, + { + "auxiliary_loss_clip": 0.01023214, + "auxiliary_loss_mlp": 0.01028141, + "balance_loss_clip": 1.02552152, + "balance_loss_mlp": 1.01710248, + "epoch": 0.5283631444461145, + "flos": 23615467176960.0, + "grad_norm": 1.8813390367468026, + "language_loss": 0.83105755, + "learning_rate": 1.9133718860427316e-06, + "loss": 0.85157108, + "num_input_tokens_seen": 188837165, + "step": 8788, + "time_per_iteration": 2.7223379611968994 + }, + { + "auxiliary_loss_clip": 0.01044874, + "auxiliary_loss_mlp": 0.01038112, + "balance_loss_clip": 1.03235841, + "balance_loss_mlp": 1.02473688, + "epoch": 0.5284232676987825, + "flos": 32671994918400.0, + "grad_norm": 2.0958923127554585, + "language_loss": 0.74574387, + "learning_rate": 1.9129827915137027e-06, + "loss": 0.76657379, + "num_input_tokens_seen": 188858555, + "step": 8789, + "time_per_iteration": 2.842512369155884 + }, + { + "auxiliary_loss_clip": 0.01060344, + "auxiliary_loss_mlp": 0.01033916, + "balance_loss_clip": 1.0269922, + "balance_loss_mlp": 1.02192378, + "epoch": 0.5284833909514505, + "flos": 26760919213440.0, + "grad_norm": 2.217813542157546, + "language_loss": 0.69642389, + "learning_rate": 1.9125937002843754e-06, + "loss": 0.71736652, + "num_input_tokens_seen": 188879050, + "step": 8790, + "time_per_iteration": 2.622851610183716 + }, + { + "auxiliary_loss_clip": 0.01065494, + "auxiliary_loss_mlp": 0.0102534, + "balance_loss_clip": 1.02549863, + "balance_loss_mlp": 1.01474237, + "epoch": 0.5285435142041185, + "flos": 22090700793600.0, + "grad_norm": 2.0247114681644103, + "language_loss": 0.79061407, + "learning_rate": 1.9122046123695036e-06, + "loss": 0.81152242, + "num_input_tokens_seen": 188898885, + "step": 8791, + "time_per_iteration": 2.583956480026245 + }, + { + "auxiliary_loss_clip": 0.01016542, + "auxiliary_loss_mlp": 0.0102868, + "balance_loss_clip": 1.02558804, + "balance_loss_mlp": 1.01668751, + "epoch": 0.5286036374567864, + "flos": 20375463565440.0, + "grad_norm": 2.1452233631514894, + "language_loss": 0.66245008, + "learning_rate": 1.9118155277838423e-06, + "loss": 0.68290228, + "num_input_tokens_seen": 188917225, + "step": 8792, + "time_per_iteration": 2.820220708847046 + }, + { + "auxiliary_loss_clip": 0.01037524, + "auxiliary_loss_mlp": 0.01035019, + "balance_loss_clip": 1.02204227, + "balance_loss_mlp": 1.02290118, + "epoch": 0.5286637607094544, + "flos": 24352175122560.0, + "grad_norm": 2.2217362490263506, + "language_loss": 0.79827988, + "learning_rate": 1.9114264465421443e-06, + "loss": 0.81900525, + "num_input_tokens_seen": 188936120, + "step": 8793, + "time_per_iteration": 2.7224833965301514 + }, + { + "auxiliary_loss_clip": 0.01067275, + "auxiliary_loss_mlp": 0.01039723, + "balance_loss_clip": 1.02566814, + "balance_loss_mlp": 1.02756929, + "epoch": 0.5287238839621223, + "flos": 17271165536640.0, + "grad_norm": 2.0698621616976465, + "language_loss": 0.84748489, + "learning_rate": 1.9110373686591645e-06, + "loss": 0.86855495, + "num_input_tokens_seen": 188953405, + "step": 8794, + "time_per_iteration": 2.653231143951416 + }, + { + "auxiliary_loss_clip": 0.01046737, + "auxiliary_loss_mlp": 0.01036126, + "balance_loss_clip": 1.02454948, + "balance_loss_mlp": 1.02291727, + "epoch": 0.5287840072147904, + "flos": 17566890209280.0, + "grad_norm": 2.059663601150219, + "language_loss": 0.68416667, + "learning_rate": 1.9106482941496564e-06, + "loss": 0.70499533, + "num_input_tokens_seen": 188971150, + "step": 8795, + "time_per_iteration": 2.6835005283355713 + }, + { + "auxiliary_loss_clip": 0.01050772, + "auxiliary_loss_mlp": 0.01031806, + "balance_loss_clip": 1.02700257, + "balance_loss_mlp": 1.01983106, + "epoch": 0.5288441304674583, + "flos": 18552099421440.0, + "grad_norm": 2.0292058942417652, + "language_loss": 0.80674362, + "learning_rate": 1.910259223028374e-06, + "loss": 0.82756943, + "num_input_tokens_seen": 188989550, + "step": 8796, + "time_per_iteration": 2.812020778656006 + }, + { + "auxiliary_loss_clip": 0.01022628, + "auxiliary_loss_mlp": 0.01047834, + "balance_loss_clip": 1.02245414, + "balance_loss_mlp": 1.03368402, + "epoch": 0.5289042537201263, + "flos": 20814507504000.0, + "grad_norm": 1.790653564588355, + "language_loss": 0.69217622, + "learning_rate": 1.909870155310071e-06, + "loss": 0.71288085, + "num_input_tokens_seen": 189008795, + "step": 8797, + "time_per_iteration": 2.716073513031006 + }, + { + "auxiliary_loss_clip": 0.01056288, + "auxiliary_loss_mlp": 0.01034265, + "balance_loss_clip": 1.02689493, + "balance_loss_mlp": 1.02280855, + "epoch": 0.5289643769727942, + "flos": 15735265937280.0, + "grad_norm": 1.538340124401326, + "language_loss": 0.82264978, + "learning_rate": 1.9094810910095005e-06, + "loss": 0.84355527, + "num_input_tokens_seen": 189025540, + "step": 8798, + "time_per_iteration": 2.569973945617676 + }, + { + "auxiliary_loss_clip": 0.01042679, + "auxiliary_loss_mlp": 0.00747494, + "balance_loss_clip": 1.02372384, + "balance_loss_mlp": 1.0002048, + "epoch": 0.5290245002254622, + "flos": 19537308633600.0, + "grad_norm": 2.078181402480863, + "language_loss": 0.70962858, + "learning_rate": 1.9090920301414166e-06, + "loss": 0.72753024, + "num_input_tokens_seen": 189044885, + "step": 8799, + "time_per_iteration": 2.7173097133636475 + }, + { + "auxiliary_loss_clip": 0.01058334, + "auxiliary_loss_mlp": 0.01030101, + "balance_loss_clip": 1.02908897, + "balance_loss_mlp": 1.01894903, + "epoch": 0.5290846234781301, + "flos": 15815131827840.0, + "grad_norm": 2.3351211692633123, + "language_loss": 0.69563901, + "learning_rate": 1.9087029727205716e-06, + "loss": 0.71652341, + "num_input_tokens_seen": 189061280, + "step": 8800, + "time_per_iteration": 2.5821890830993652 + }, + { + "auxiliary_loss_clip": 0.00986338, + "auxiliary_loss_mlp": 0.01010037, + "balance_loss_clip": 1.008214, + "balance_loss_mlp": 1.00870764, + "epoch": 0.5291447467307981, + "flos": 70057624821120.0, + "grad_norm": 0.9874982211760411, + "language_loss": 0.56950772, + "learning_rate": 1.9083139187617193e-06, + "loss": 0.58947146, + "num_input_tokens_seen": 189114775, + "step": 8801, + "time_per_iteration": 3.1264612674713135 + }, + { + "auxiliary_loss_clip": 0.01057086, + "auxiliary_loss_mlp": 0.01033957, + "balance_loss_clip": 1.03126872, + "balance_loss_mlp": 1.02220345, + "epoch": 0.529204869983466, + "flos": 28364186770560.0, + "grad_norm": 1.5227252977404964, + "language_loss": 0.6396842, + "learning_rate": 1.9079248682796123e-06, + "loss": 0.66059458, + "num_input_tokens_seen": 189134700, + "step": 8802, + "time_per_iteration": 2.707366704940796 + }, + { + "auxiliary_loss_clip": 0.01047433, + "auxiliary_loss_mlp": 0.01025001, + "balance_loss_clip": 1.02603745, + "balance_loss_mlp": 1.01361609, + "epoch": 0.5292649932361341, + "flos": 33758830684800.0, + "grad_norm": 1.5994624119402394, + "language_loss": 0.688281, + "learning_rate": 1.907535821289003e-06, + "loss": 0.70900524, + "num_input_tokens_seen": 189155365, + "step": 8803, + "time_per_iteration": 2.7787516117095947 + }, + { + "auxiliary_loss_clip": 0.01053672, + "auxiliary_loss_mlp": 0.0074749, + "balance_loss_clip": 1.02476382, + "balance_loss_mlp": 1.00020003, + "epoch": 0.5293251164888021, + "flos": 20447679859200.0, + "grad_norm": 1.959220888521819, + "language_loss": 0.76207966, + "learning_rate": 1.9071467778046458e-06, + "loss": 0.78009129, + "num_input_tokens_seen": 189173885, + "step": 8804, + "time_per_iteration": 2.7897417545318604 + }, + { + "auxiliary_loss_clip": 0.01002463, + "auxiliary_loss_mlp": 0.01002005, + "balance_loss_clip": 1.00479984, + "balance_loss_mlp": 1.00077677, + "epoch": 0.52938523974147, + "flos": 66545312204160.0, + "grad_norm": 0.755968421005109, + "language_loss": 0.52958041, + "learning_rate": 1.906757737841291e-06, + "loss": 0.54962504, + "num_input_tokens_seen": 189236515, + "step": 8805, + "time_per_iteration": 3.2726399898529053 + }, + { + "auxiliary_loss_clip": 0.0100114, + "auxiliary_loss_mlp": 0.01004593, + "balance_loss_clip": 1.00420308, + "balance_loss_mlp": 1.00335348, + "epoch": 0.529445362994138, + "flos": 67151734542720.0, + "grad_norm": 0.7387842905038577, + "language_loss": 0.63826716, + "learning_rate": 1.906368701413693e-06, + "loss": 0.65832448, + "num_input_tokens_seen": 189300500, + "step": 8806, + "time_per_iteration": 3.1878738403320312 + }, + { + "auxiliary_loss_clip": 0.01060099, + "auxiliary_loss_mlp": 0.01033518, + "balance_loss_clip": 1.02654719, + "balance_loss_mlp": 1.02221656, + "epoch": 0.5295054862468059, + "flos": 17749316407680.0, + "grad_norm": 1.6950425494677581, + "language_loss": 0.72040021, + "learning_rate": 1.9059796685366026e-06, + "loss": 0.74133635, + "num_input_tokens_seen": 189319745, + "step": 8807, + "time_per_iteration": 2.6337387561798096 + }, + { + "auxiliary_loss_clip": 0.01030906, + "auxiliary_loss_mlp": 0.01029504, + "balance_loss_clip": 1.02686119, + "balance_loss_mlp": 1.01906109, + "epoch": 0.529565609499474, + "flos": 11397401084160.0, + "grad_norm": 2.2485437813215032, + "language_loss": 0.68982816, + "learning_rate": 1.9055906392247723e-06, + "loss": 0.71043229, + "num_input_tokens_seen": 189334550, + "step": 8808, + "time_per_iteration": 2.601511240005493 + }, + { + "auxiliary_loss_clip": 0.01055476, + "auxiliary_loss_mlp": 0.01032488, + "balance_loss_clip": 1.02490282, + "balance_loss_mlp": 1.02219391, + "epoch": 0.5296257327521419, + "flos": 17196363463680.0, + "grad_norm": 1.8654925284574462, + "language_loss": 0.86854291, + "learning_rate": 1.9052016134929554e-06, + "loss": 0.88942254, + "num_input_tokens_seen": 189351735, + "step": 8809, + "time_per_iteration": 2.5190842151641846 + }, + { + "auxiliary_loss_clip": 0.01061152, + "auxiliary_loss_mlp": 0.01037515, + "balance_loss_clip": 1.02741289, + "balance_loss_mlp": 1.02455664, + "epoch": 0.5296858560048099, + "flos": 39964086777600.0, + "grad_norm": 1.8298308091422582, + "language_loss": 0.64175189, + "learning_rate": 1.9048125913559016e-06, + "loss": 0.66273856, + "num_input_tokens_seen": 189373105, + "step": 8810, + "time_per_iteration": 2.730475425720215 + }, + { + "auxiliary_loss_clip": 0.0106718, + "auxiliary_loss_mlp": 0.01037208, + "balance_loss_clip": 1.02633023, + "balance_loss_mlp": 1.02612185, + "epoch": 0.5297459792574778, + "flos": 20961418129920.0, + "grad_norm": 1.564893603752091, + "language_loss": 0.67814255, + "learning_rate": 1.9044235728283646e-06, + "loss": 0.69918644, + "num_input_tokens_seen": 189394615, + "step": 8811, + "time_per_iteration": 2.5582549571990967 + }, + { + "auxiliary_loss_clip": 0.00971057, + "auxiliary_loss_mlp": 0.01005467, + "balance_loss_clip": 1.00374174, + "balance_loss_mlp": 1.00420904, + "epoch": 0.5298061025101458, + "flos": 66523620389760.0, + "grad_norm": 0.6949826698518089, + "language_loss": 0.53398144, + "learning_rate": 1.9040345579250953e-06, + "loss": 0.55374664, + "num_input_tokens_seen": 189459750, + "step": 8812, + "time_per_iteration": 3.3280553817749023 + }, + { + "auxiliary_loss_clip": 0.00993731, + "auxiliary_loss_mlp": 0.01003286, + "balance_loss_clip": 1.00639176, + "balance_loss_mlp": 1.0019505, + "epoch": 0.5298662257628137, + "flos": 67662994775040.0, + "grad_norm": 0.7368189113087767, + "language_loss": 0.56302035, + "learning_rate": 1.9036455466608453e-06, + "loss": 0.58299053, + "num_input_tokens_seen": 189527540, + "step": 8813, + "time_per_iteration": 3.2188165187835693 + }, + { + "auxiliary_loss_clip": 0.0100824, + "auxiliary_loss_mlp": 0.01033208, + "balance_loss_clip": 1.0230689, + "balance_loss_mlp": 1.02210951, + "epoch": 0.5299263490154817, + "flos": 19646405216640.0, + "grad_norm": 2.055463864843196, + "language_loss": 0.81823361, + "learning_rate": 1.9032565390503657e-06, + "loss": 0.83864808, + "num_input_tokens_seen": 189546900, + "step": 8814, + "time_per_iteration": 2.7251462936401367 + }, + { + "auxiliary_loss_clip": 0.01072762, + "auxiliary_loss_mlp": 0.01030077, + "balance_loss_clip": 1.03004885, + "balance_loss_mlp": 1.01873398, + "epoch": 0.5299864722681497, + "flos": 22055005653120.0, + "grad_norm": 1.628094015012205, + "language_loss": 0.84862864, + "learning_rate": 1.9028675351084076e-06, + "loss": 0.86965704, + "num_input_tokens_seen": 189566490, + "step": 8815, + "time_per_iteration": 2.6412906646728516 + }, + { + "auxiliary_loss_clip": 0.01065132, + "auxiliary_loss_mlp": 0.01034038, + "balance_loss_clip": 1.02678573, + "balance_loss_mlp": 1.02327299, + "epoch": 0.5300465955208177, + "flos": 21763698353280.0, + "grad_norm": 2.139171426371312, + "language_loss": 0.66691303, + "learning_rate": 1.9024785348497225e-06, + "loss": 0.68790472, + "num_input_tokens_seen": 189585580, + "step": 8816, + "time_per_iteration": 2.5228705406188965 + }, + { + "auxiliary_loss_clip": 0.0104404, + "auxiliary_loss_mlp": 0.01032867, + "balance_loss_clip": 1.02534127, + "balance_loss_mlp": 1.02107704, + "epoch": 0.5301067187734857, + "flos": 42996491735040.0, + "grad_norm": 1.5329561140633838, + "language_loss": 0.72295237, + "learning_rate": 1.9020895382890611e-06, + "loss": 0.74372143, + "num_input_tokens_seen": 189608485, + "step": 8817, + "time_per_iteration": 2.818504810333252 + }, + { + "auxiliary_loss_clip": 0.01041943, + "auxiliary_loss_mlp": 0.01029931, + "balance_loss_clip": 1.02470565, + "balance_loss_mlp": 1.01703835, + "epoch": 0.5301668420261536, + "flos": 20554298403840.0, + "grad_norm": 2.738757265209003, + "language_loss": 0.65117788, + "learning_rate": 1.9017005454411743e-06, + "loss": 0.67189664, + "num_input_tokens_seen": 189627815, + "step": 8818, + "time_per_iteration": 2.724152088165283 + }, + { + "auxiliary_loss_clip": 0.01023266, + "auxiliary_loss_mlp": 0.01025209, + "balance_loss_clip": 1.02443695, + "balance_loss_mlp": 1.01287723, + "epoch": 0.5302269652788216, + "flos": 17486665182720.0, + "grad_norm": 2.517074445690977, + "language_loss": 0.74885452, + "learning_rate": 1.9013115563208126e-06, + "loss": 0.76933926, + "num_input_tokens_seen": 189644850, + "step": 8819, + "time_per_iteration": 4.5819385051727295 + }, + { + "auxiliary_loss_clip": 0.01031398, + "auxiliary_loss_mlp": 0.01036572, + "balance_loss_clip": 1.02288556, + "balance_loss_mlp": 1.02358365, + "epoch": 0.5302870885314895, + "flos": 14574202715520.0, + "grad_norm": 2.1693807839635935, + "language_loss": 0.82260466, + "learning_rate": 1.9009225709427267e-06, + "loss": 0.84328437, + "num_input_tokens_seen": 189660945, + "step": 8820, + "time_per_iteration": 4.356089115142822 + }, + { + "auxiliary_loss_clip": 0.01046465, + "auxiliary_loss_mlp": 0.01025424, + "balance_loss_clip": 1.02478504, + "balance_loss_mlp": 1.01493347, + "epoch": 0.5303472117841576, + "flos": 23438032968960.0, + "grad_norm": 1.4201887969631182, + "language_loss": 0.72508603, + "learning_rate": 1.9005335893216667e-06, + "loss": 0.74580497, + "num_input_tokens_seen": 189680425, + "step": 8821, + "time_per_iteration": 2.672508478164673 + }, + { + "auxiliary_loss_clip": 0.0103616, + "auxiliary_loss_mlp": 0.01026317, + "balance_loss_clip": 1.02343321, + "balance_loss_mlp": 1.01607132, + "epoch": 0.5304073350368255, + "flos": 22709010533760.0, + "grad_norm": 1.7398969296508653, + "language_loss": 0.74201584, + "learning_rate": 1.9001446114723824e-06, + "loss": 0.7626406, + "num_input_tokens_seen": 189700375, + "step": 8822, + "time_per_iteration": 2.670665740966797 + }, + { + "auxiliary_loss_clip": 0.01032767, + "auxiliary_loss_mlp": 0.0103016, + "balance_loss_clip": 1.02385724, + "balance_loss_mlp": 1.01819694, + "epoch": 0.5304674582894935, + "flos": 27928554624000.0, + "grad_norm": 1.6789978424891525, + "language_loss": 0.67850959, + "learning_rate": 1.8997556374096257e-06, + "loss": 0.69913888, + "num_input_tokens_seen": 189721225, + "step": 8823, + "time_per_iteration": 2.829442262649536 + }, + { + "auxiliary_loss_clip": 0.01072803, + "auxiliary_loss_mlp": 0.01034411, + "balance_loss_clip": 1.02915335, + "balance_loss_mlp": 1.02201307, + "epoch": 0.5305275815421614, + "flos": 21250642440960.0, + "grad_norm": 1.7968671911649148, + "language_loss": 0.69389999, + "learning_rate": 1.8993666671481444e-06, + "loss": 0.71497214, + "num_input_tokens_seen": 189740170, + "step": 8824, + "time_per_iteration": 2.657276153564453 + }, + { + "auxiliary_loss_clip": 0.01045105, + "auxiliary_loss_mlp": 0.00747356, + "balance_loss_clip": 1.02584553, + "balance_loss_mlp": 1.00017309, + "epoch": 0.5305877047948294, + "flos": 17603088140160.0, + "grad_norm": 1.8758482441866, + "language_loss": 0.75600916, + "learning_rate": 1.898977700702689e-06, + "loss": 0.77393377, + "num_input_tokens_seen": 189757890, + "step": 8825, + "time_per_iteration": 2.7291414737701416 + }, + { + "auxiliary_loss_clip": 0.00999523, + "auxiliary_loss_mlp": 0.0103365, + "balance_loss_clip": 1.02427924, + "balance_loss_mlp": 1.02196169, + "epoch": 0.5306478280474973, + "flos": 15195493284480.0, + "grad_norm": 1.8775608212725596, + "language_loss": 0.85359037, + "learning_rate": 1.8985887380880103e-06, + "loss": 0.87392211, + "num_input_tokens_seen": 189775390, + "step": 8826, + "time_per_iteration": 2.838003635406494 + }, + { + "auxiliary_loss_clip": 0.01065976, + "auxiliary_loss_mlp": 0.0103038, + "balance_loss_clip": 1.02640629, + "balance_loss_mlp": 1.01932299, + "epoch": 0.5307079513001653, + "flos": 15341218761600.0, + "grad_norm": 1.3514767435164277, + "language_loss": 0.64300078, + "learning_rate": 1.8981997793188558e-06, + "loss": 0.66396433, + "num_input_tokens_seen": 189793975, + "step": 8827, + "time_per_iteration": 2.5268807411193848 + }, + { + "auxiliary_loss_clip": 0.01045171, + "auxiliary_loss_mlp": 0.01036022, + "balance_loss_clip": 1.02591419, + "balance_loss_mlp": 1.02411914, + "epoch": 0.5307680745528333, + "flos": 43544452688640.0, + "grad_norm": 1.628921008530673, + "language_loss": 0.5994488, + "learning_rate": 1.8978108244099762e-06, + "loss": 0.62026072, + "num_input_tokens_seen": 189817870, + "step": 8828, + "time_per_iteration": 2.7694454193115234 + }, + { + "auxiliary_loss_clip": 0.01056972, + "auxiliary_loss_mlp": 0.01030885, + "balance_loss_clip": 1.02516723, + "balance_loss_mlp": 1.01831436, + "epoch": 0.5308281978055013, + "flos": 20048928001920.0, + "grad_norm": 1.560114515850169, + "language_loss": 0.81143844, + "learning_rate": 1.8974218733761208e-06, + "loss": 0.83231699, + "num_input_tokens_seen": 189837905, + "step": 8829, + "time_per_iteration": 2.6982059478759766 + }, + { + "auxiliary_loss_clip": 0.01059759, + "auxiliary_loss_mlp": 0.01028913, + "balance_loss_clip": 1.02940822, + "balance_loss_mlp": 1.01745749, + "epoch": 0.5308883210581693, + "flos": 20703938463360.0, + "grad_norm": 1.7609422893666098, + "language_loss": 0.78180349, + "learning_rate": 1.8970329262320375e-06, + "loss": 0.80269027, + "num_input_tokens_seen": 189856970, + "step": 8830, + "time_per_iteration": 4.186580657958984 + }, + { + "auxiliary_loss_clip": 0.01057213, + "auxiliary_loss_mlp": 0.01030613, + "balance_loss_clip": 1.02680457, + "balance_loss_mlp": 1.0197947, + "epoch": 0.5309484443108372, + "flos": 14355506759040.0, + "grad_norm": 2.124256862830565, + "language_loss": 0.80681616, + "learning_rate": 1.8966439829924768e-06, + "loss": 0.82769442, + "num_input_tokens_seen": 189872830, + "step": 8831, + "time_per_iteration": 2.5178427696228027 + }, + { + "auxiliary_loss_clip": 0.0105582, + "auxiliary_loss_mlp": 0.01026301, + "balance_loss_clip": 1.02562451, + "balance_loss_mlp": 1.01557231, + "epoch": 0.5310085675635052, + "flos": 20010503427840.0, + "grad_norm": 1.9529814297612484, + "language_loss": 0.72696179, + "learning_rate": 1.896255043672186e-06, + "loss": 0.74778306, + "num_input_tokens_seen": 189891635, + "step": 8832, + "time_per_iteration": 4.157714128494263 + }, + { + "auxiliary_loss_clip": 0.0103323, + "auxiliary_loss_mlp": 0.01031222, + "balance_loss_clip": 1.02546072, + "balance_loss_mlp": 1.01880002, + "epoch": 0.5310686908161731, + "flos": 22127293774080.0, + "grad_norm": 1.8847701736688505, + "language_loss": 0.75546181, + "learning_rate": 1.8958661082859143e-06, + "loss": 0.77610636, + "num_input_tokens_seen": 189909050, + "step": 8833, + "time_per_iteration": 2.6585397720336914 + }, + { + "auxiliary_loss_clip": 0.01035536, + "auxiliary_loss_mlp": 0.01027818, + "balance_loss_clip": 1.02411985, + "balance_loss_mlp": 1.01587367, + "epoch": 0.5311288140688412, + "flos": 24717889445760.0, + "grad_norm": 1.9628096009071667, + "language_loss": 0.735659, + "learning_rate": 1.8954771768484103e-06, + "loss": 0.75629258, + "num_input_tokens_seen": 189927405, + "step": 8834, + "time_per_iteration": 2.6986913681030273 + }, + { + "auxiliary_loss_clip": 0.01071635, + "auxiliary_loss_mlp": 0.01039047, + "balance_loss_clip": 1.02710271, + "balance_loss_mlp": 1.02606511, + "epoch": 0.5311889373215091, + "flos": 24097712198400.0, + "grad_norm": 1.8176358781170308, + "language_loss": 0.77795959, + "learning_rate": 1.8950882493744226e-06, + "loss": 0.79906642, + "num_input_tokens_seen": 189947740, + "step": 8835, + "time_per_iteration": 2.5954878330230713 + }, + { + "auxiliary_loss_clip": 0.01048215, + "auxiliary_loss_mlp": 0.0104057, + "balance_loss_clip": 1.02509367, + "balance_loss_mlp": 1.02756393, + "epoch": 0.5312490605741771, + "flos": 22017012042240.0, + "grad_norm": 1.6000859032489618, + "language_loss": 0.72432858, + "learning_rate": 1.8946993258786985e-06, + "loss": 0.74521649, + "num_input_tokens_seen": 189966495, + "step": 8836, + "time_per_iteration": 2.6346752643585205 + }, + { + "auxiliary_loss_clip": 0.01051426, + "auxiliary_loss_mlp": 0.01036251, + "balance_loss_clip": 1.02717233, + "balance_loss_mlp": 1.02383542, + "epoch": 0.531309183826845, + "flos": 19390541662080.0, + "grad_norm": 1.906071569370584, + "language_loss": 0.80777979, + "learning_rate": 1.894310406375987e-06, + "loss": 0.82865661, + "num_input_tokens_seen": 189985325, + "step": 8837, + "time_per_iteration": 2.643160581588745 + }, + { + "auxiliary_loss_clip": 0.0105945, + "auxiliary_loss_mlp": 0.01029619, + "balance_loss_clip": 1.02883911, + "balance_loss_mlp": 1.01765037, + "epoch": 0.531369307079513, + "flos": 20190056538240.0, + "grad_norm": 1.9907849286615724, + "language_loss": 0.85888207, + "learning_rate": 1.893921490881035e-06, + "loss": 0.87977272, + "num_input_tokens_seen": 190003290, + "step": 8838, + "time_per_iteration": 2.6304662227630615 + }, + { + "auxiliary_loss_clip": 0.01045244, + "auxiliary_loss_mlp": 0.01027384, + "balance_loss_clip": 1.02499807, + "balance_loss_mlp": 1.01673234, + "epoch": 0.5314294303321809, + "flos": 18880143356160.0, + "grad_norm": 1.6519397947906536, + "language_loss": 0.72805262, + "learning_rate": 1.8935325794085906e-06, + "loss": 0.74877894, + "num_input_tokens_seen": 190023260, + "step": 8839, + "time_per_iteration": 2.6745431423187256 + }, + { + "auxiliary_loss_clip": 0.01045457, + "auxiliary_loss_mlp": 0.01033711, + "balance_loss_clip": 1.02310324, + "balance_loss_mlp": 1.02238643, + "epoch": 0.531489553584849, + "flos": 23040035297280.0, + "grad_norm": 7.009142461148354, + "language_loss": 0.76312631, + "learning_rate": 1.8931436719734023e-06, + "loss": 0.78391802, + "num_input_tokens_seen": 190042035, + "step": 8840, + "time_per_iteration": 2.7435927391052246 + }, + { + "auxiliary_loss_clip": 0.01040587, + "auxiliary_loss_mlp": 0.01034952, + "balance_loss_clip": 1.0262394, + "balance_loss_mlp": 1.02227974, + "epoch": 0.5315496768375169, + "flos": 19790478668160.0, + "grad_norm": 2.119344926953351, + "language_loss": 0.77390909, + "learning_rate": 1.892754768590216e-06, + "loss": 0.7946645, + "num_input_tokens_seen": 190057545, + "step": 8841, + "time_per_iteration": 2.6797688007354736 + }, + { + "auxiliary_loss_clip": 0.00992496, + "auxiliary_loss_mlp": 0.01003452, + "balance_loss_clip": 1.00483894, + "balance_loss_mlp": 1.00220621, + "epoch": 0.5316098000901849, + "flos": 71023228185600.0, + "grad_norm": 0.6959059877380778, + "language_loss": 0.56831133, + "learning_rate": 1.8923658692737793e-06, + "loss": 0.58827078, + "num_input_tokens_seen": 190123800, + "step": 8842, + "time_per_iteration": 3.3554437160491943 + }, + { + "auxiliary_loss_clip": 0.01049368, + "auxiliary_loss_mlp": 0.01036471, + "balance_loss_clip": 1.02586794, + "balance_loss_mlp": 1.02345347, + "epoch": 0.5316699233428529, + "flos": 16435560470400.0, + "grad_norm": 1.9367405407382174, + "language_loss": 0.73483014, + "learning_rate": 1.8919769740388407e-06, + "loss": 0.75568855, + "num_input_tokens_seen": 190141625, + "step": 8843, + "time_per_iteration": 2.62648868560791 + }, + { + "auxiliary_loss_clip": 0.0098928, + "auxiliary_loss_mlp": 0.01004074, + "balance_loss_clip": 1.00282514, + "balance_loss_mlp": 1.00294161, + "epoch": 0.5317300465955208, + "flos": 67420814302080.0, + "grad_norm": 0.8799442915104109, + "language_loss": 0.60994673, + "learning_rate": 1.891588082900145e-06, + "loss": 0.62988031, + "num_input_tokens_seen": 190198110, + "step": 8844, + "time_per_iteration": 3.3364055156707764 + }, + { + "auxiliary_loss_clip": 0.00998394, + "auxiliary_loss_mlp": 0.01001529, + "balance_loss_clip": 1.00143445, + "balance_loss_mlp": 1.00025332, + "epoch": 0.5317901698481888, + "flos": 59508075340800.0, + "grad_norm": 0.838993939411571, + "language_loss": 0.62176156, + "learning_rate": 1.8911991958724411e-06, + "loss": 0.64176083, + "num_input_tokens_seen": 190259950, + "step": 8845, + "time_per_iteration": 3.179004192352295 + }, + { + "auxiliary_loss_clip": 0.01035917, + "auxiliary_loss_mlp": 0.01033821, + "balance_loss_clip": 1.02396059, + "balance_loss_mlp": 1.02082729, + "epoch": 0.5318502931008567, + "flos": 19129219240320.0, + "grad_norm": 1.7181883184632014, + "language_loss": 0.74999785, + "learning_rate": 1.890810312970474e-06, + "loss": 0.77069527, + "num_input_tokens_seen": 190278265, + "step": 8846, + "time_per_iteration": 2.767357587814331 + }, + { + "auxiliary_loss_clip": 0.01058135, + "auxiliary_loss_mlp": 0.01036075, + "balance_loss_clip": 1.02645707, + "balance_loss_mlp": 1.02538192, + "epoch": 0.5319104163535248, + "flos": 24681045070080.0, + "grad_norm": 1.5870757391842016, + "language_loss": 0.75428629, + "learning_rate": 1.8904214342089903e-06, + "loss": 0.77522844, + "num_input_tokens_seen": 190298400, + "step": 8847, + "time_per_iteration": 2.5788466930389404 + }, + { + "auxiliary_loss_clip": 0.0104838, + "auxiliary_loss_mlp": 0.01031564, + "balance_loss_clip": 1.02602184, + "balance_loss_mlp": 1.02051973, + "epoch": 0.5319705396061927, + "flos": 19385513758080.0, + "grad_norm": 1.5109069893645475, + "language_loss": 0.87855363, + "learning_rate": 1.8900325596027378e-06, + "loss": 0.89935315, + "num_input_tokens_seen": 190316235, + "step": 8848, + "time_per_iteration": 2.62752366065979 + }, + { + "auxiliary_loss_clip": 0.01029841, + "auxiliary_loss_mlp": 0.01039295, + "balance_loss_clip": 1.02427387, + "balance_loss_mlp": 1.02592516, + "epoch": 0.5320306628588607, + "flos": 18259319664000.0, + "grad_norm": 2.027249960669907, + "language_loss": 0.7419163, + "learning_rate": 1.8896436891664609e-06, + "loss": 0.76260769, + "num_input_tokens_seen": 190335060, + "step": 8849, + "time_per_iteration": 2.6240234375 + }, + { + "auxiliary_loss_clip": 0.01052519, + "auxiliary_loss_mlp": 0.01029179, + "balance_loss_clip": 1.02470422, + "balance_loss_mlp": 1.01662064, + "epoch": 0.5320907861115286, + "flos": 23732321097600.0, + "grad_norm": 1.7391894771641707, + "language_loss": 0.79272652, + "learning_rate": 1.8892548229149066e-06, + "loss": 0.81354344, + "num_input_tokens_seen": 190353265, + "step": 8850, + "time_per_iteration": 2.62195086479187 + }, + { + "auxiliary_loss_clip": 0.0106744, + "auxiliary_loss_mlp": 0.01029881, + "balance_loss_clip": 1.02569354, + "balance_loss_mlp": 1.01847231, + "epoch": 0.5321509093641966, + "flos": 34495251321600.0, + "grad_norm": 1.4496427890659196, + "language_loss": 0.54921222, + "learning_rate": 1.888865960862821e-06, + "loss": 0.57018548, + "num_input_tokens_seen": 190376575, + "step": 8851, + "time_per_iteration": 2.6966967582702637 + }, + { + "auxiliary_loss_clip": 0.01053635, + "auxiliary_loss_mlp": 0.01033086, + "balance_loss_clip": 1.02540088, + "balance_loss_mlp": 1.02123618, + "epoch": 0.5322110326168645, + "flos": 20010934391040.0, + "grad_norm": 2.515173318272107, + "language_loss": 0.6850419, + "learning_rate": 1.8884771030249484e-06, + "loss": 0.70590913, + "num_input_tokens_seen": 190395185, + "step": 8852, + "time_per_iteration": 2.5514676570892334 + }, + { + "auxiliary_loss_clip": 0.00993141, + "auxiliary_loss_mlp": 0.00746854, + "balance_loss_clip": 1.00596762, + "balance_loss_mlp": 1.0009234, + "epoch": 0.5322711558695326, + "flos": 64631164435200.0, + "grad_norm": 0.8086325104018761, + "language_loss": 0.62794161, + "learning_rate": 1.8880882494160357e-06, + "loss": 0.64534152, + "num_input_tokens_seen": 190452595, + "step": 8853, + "time_per_iteration": 3.1481573581695557 + }, + { + "auxiliary_loss_clip": 0.01058307, + "auxiliary_loss_mlp": 0.01029998, + "balance_loss_clip": 1.02508783, + "balance_loss_mlp": 1.01816607, + "epoch": 0.5323312791222005, + "flos": 14939342421120.0, + "grad_norm": 2.400981997765823, + "language_loss": 0.79101479, + "learning_rate": 1.8876994000508278e-06, + "loss": 0.81189775, + "num_input_tokens_seen": 190469140, + "step": 8854, + "time_per_iteration": 2.6166296005249023 + }, + { + "auxiliary_loss_clip": 0.01049312, + "auxiliary_loss_mlp": 0.01029539, + "balance_loss_clip": 1.02948999, + "balance_loss_mlp": 1.01894152, + "epoch": 0.5323914023748685, + "flos": 23440834229760.0, + "grad_norm": 1.617712466544943, + "language_loss": 0.73147762, + "learning_rate": 1.8873105549440698e-06, + "loss": 0.75226617, + "num_input_tokens_seen": 190489015, + "step": 8855, + "time_per_iteration": 2.682515859603882 + }, + { + "auxiliary_loss_clip": 0.01041869, + "auxiliary_loss_mlp": 0.00747417, + "balance_loss_clip": 1.02376628, + "balance_loss_mlp": 1.00019073, + "epoch": 0.5324515256275365, + "flos": 26286180134400.0, + "grad_norm": 1.831595301903293, + "language_loss": 0.6463654, + "learning_rate": 1.886921714110507e-06, + "loss": 0.66425824, + "num_input_tokens_seen": 190508065, + "step": 8856, + "time_per_iteration": 2.6760425567626953 + }, + { + "auxiliary_loss_clip": 0.01052092, + "auxiliary_loss_mlp": 0.01033163, + "balance_loss_clip": 1.02826118, + "balance_loss_mlp": 1.02049732, + "epoch": 0.5325116488802044, + "flos": 26870913636480.0, + "grad_norm": 1.7542869680096371, + "language_loss": 0.7768867, + "learning_rate": 1.8865328775648842e-06, + "loss": 0.79773927, + "num_input_tokens_seen": 190527045, + "step": 8857, + "time_per_iteration": 2.7979111671447754 + }, + { + "auxiliary_loss_clip": 0.01034206, + "auxiliary_loss_mlp": 0.01034956, + "balance_loss_clip": 1.02470255, + "balance_loss_mlp": 1.02285624, + "epoch": 0.5325717721328724, + "flos": 25884734757120.0, + "grad_norm": 2.508276654910927, + "language_loss": 0.70722842, + "learning_rate": 1.8861440453219456e-06, + "loss": 0.72792006, + "num_input_tokens_seen": 190544075, + "step": 8858, + "time_per_iteration": 2.8404228687286377 + }, + { + "auxiliary_loss_clip": 0.01053214, + "auxiliary_loss_mlp": 0.01036878, + "balance_loss_clip": 1.02521276, + "balance_loss_mlp": 1.02358603, + "epoch": 0.5326318953855403, + "flos": 21799321666560.0, + "grad_norm": 1.9872037789015204, + "language_loss": 0.69421941, + "learning_rate": 1.8857552173964367e-06, + "loss": 0.71512032, + "num_input_tokens_seen": 190566030, + "step": 8859, + "time_per_iteration": 2.7358341217041016 + }, + { + "auxiliary_loss_clip": 0.01055444, + "auxiliary_loss_mlp": 0.01030769, + "balance_loss_clip": 1.02684712, + "balance_loss_mlp": 1.02037406, + "epoch": 0.5326920186382084, + "flos": 20922921728640.0, + "grad_norm": 1.4769115910804422, + "language_loss": 0.69569111, + "learning_rate": 1.8853663938031013e-06, + "loss": 0.71655321, + "num_input_tokens_seen": 190585605, + "step": 8860, + "time_per_iteration": 2.704787015914917 + }, + { + "auxiliary_loss_clip": 0.01049326, + "auxiliary_loss_mlp": 0.01031068, + "balance_loss_clip": 1.02751899, + "balance_loss_mlp": 1.02027321, + "epoch": 0.5327521418908763, + "flos": 21433427775360.0, + "grad_norm": 1.7874793590987097, + "language_loss": 0.78172839, + "learning_rate": 1.884977574556683e-06, + "loss": 0.80253232, + "num_input_tokens_seen": 190604625, + "step": 8861, + "time_per_iteration": 2.784079074859619 + }, + { + "auxiliary_loss_clip": 0.01028923, + "auxiliary_loss_mlp": 0.01034922, + "balance_loss_clip": 1.02715993, + "balance_loss_mlp": 1.02239871, + "epoch": 0.5328122651435443, + "flos": 21760250647680.0, + "grad_norm": 1.759211767019732, + "language_loss": 0.85575747, + "learning_rate": 1.8845887596719279e-06, + "loss": 0.87639594, + "num_input_tokens_seen": 190625060, + "step": 8862, + "time_per_iteration": 2.857973575592041 + }, + { + "auxiliary_loss_clip": 0.01044528, + "auxiliary_loss_mlp": 0.010333, + "balance_loss_clip": 1.02296162, + "balance_loss_mlp": 1.02058625, + "epoch": 0.5328723883962122, + "flos": 18296487262080.0, + "grad_norm": 1.9505277082219925, + "language_loss": 0.61629152, + "learning_rate": 1.8841999491635778e-06, + "loss": 0.63706982, + "num_input_tokens_seen": 190643150, + "step": 8863, + "time_per_iteration": 2.711742639541626 + }, + { + "auxiliary_loss_clip": 0.01042834, + "auxiliary_loss_mlp": 0.01034372, + "balance_loss_clip": 1.02693987, + "balance_loss_mlp": 1.02258825, + "epoch": 0.5329325116488802, + "flos": 25374911068800.0, + "grad_norm": 2.369656791068879, + "language_loss": 0.73260248, + "learning_rate": 1.883811143046377e-06, + "loss": 0.75337452, + "num_input_tokens_seen": 190662725, + "step": 8864, + "time_per_iteration": 2.73958158493042 + }, + { + "auxiliary_loss_clip": 0.01068069, + "auxiliary_loss_mlp": 0.01034976, + "balance_loss_clip": 1.02694726, + "balance_loss_mlp": 1.02419972, + "epoch": 0.5329926349015481, + "flos": 25592098654080.0, + "grad_norm": 1.6671323607345294, + "language_loss": 0.64339018, + "learning_rate": 1.8834223413350702e-06, + "loss": 0.66442066, + "num_input_tokens_seen": 190683680, + "step": 8865, + "time_per_iteration": 2.686652660369873 + }, + { + "auxiliary_loss_clip": 0.01055607, + "auxiliary_loss_mlp": 0.01029108, + "balance_loss_clip": 1.02356637, + "balance_loss_mlp": 1.01729476, + "epoch": 0.5330527581542162, + "flos": 22889605138560.0, + "grad_norm": 1.8032159582054692, + "language_loss": 0.78175426, + "learning_rate": 1.8830335440443989e-06, + "loss": 0.8026014, + "num_input_tokens_seen": 190703350, + "step": 8866, + "time_per_iteration": 4.182575702667236 + }, + { + "auxiliary_loss_clip": 0.01056019, + "auxiliary_loss_mlp": 0.01030081, + "balance_loss_clip": 1.02613449, + "balance_loss_mlp": 1.01906562, + "epoch": 0.5331128814068841, + "flos": 16026752805120.0, + "grad_norm": 1.8568275927503801, + "language_loss": 0.73424405, + "learning_rate": 1.882644751189108e-06, + "loss": 0.75510502, + "num_input_tokens_seen": 190721170, + "step": 8867, + "time_per_iteration": 2.7182295322418213 + }, + { + "auxiliary_loss_clip": 0.0104395, + "auxiliary_loss_mlp": 0.01034427, + "balance_loss_clip": 1.02457476, + "balance_loss_mlp": 1.02212501, + "epoch": 0.5331730046595521, + "flos": 39344699629440.0, + "grad_norm": 1.6743229289909187, + "language_loss": 0.72402513, + "learning_rate": 1.88225596278394e-06, + "loss": 0.74480891, + "num_input_tokens_seen": 190743795, + "step": 8868, + "time_per_iteration": 4.418957471847534 + }, + { + "auxiliary_loss_clip": 0.01035299, + "auxiliary_loss_mlp": 0.01028849, + "balance_loss_clip": 1.02417493, + "balance_loss_mlp": 1.01789963, + "epoch": 0.5332331279122201, + "flos": 24024382583040.0, + "grad_norm": 2.0374486575750033, + "language_loss": 0.78650349, + "learning_rate": 1.881867178843637e-06, + "loss": 0.807145, + "num_input_tokens_seen": 190761560, + "step": 8869, + "time_per_iteration": 2.713191270828247 + }, + { + "auxiliary_loss_clip": 0.01061041, + "auxiliary_loss_mlp": 0.01033586, + "balance_loss_clip": 1.02799726, + "balance_loss_mlp": 1.02223766, + "epoch": 0.533293251164888, + "flos": 17129318728320.0, + "grad_norm": 1.6661092845237464, + "language_loss": 0.75692272, + "learning_rate": 1.8814783993829434e-06, + "loss": 0.77786899, + "num_input_tokens_seen": 190778875, + "step": 8870, + "time_per_iteration": 2.577031135559082 + }, + { + "auxiliary_loss_clip": 0.0105235, + "auxiliary_loss_mlp": 0.01036244, + "balance_loss_clip": 1.02888346, + "balance_loss_mlp": 1.02396536, + "epoch": 0.533353374417556, + "flos": 22126360020480.0, + "grad_norm": 1.8284615466039231, + "language_loss": 0.75326836, + "learning_rate": 1.8810896244165997e-06, + "loss": 0.77415425, + "num_input_tokens_seen": 190799830, + "step": 8871, + "time_per_iteration": 2.7051684856414795 + }, + { + "auxiliary_loss_clip": 0.01046941, + "auxiliary_loss_mlp": 0.01026239, + "balance_loss_clip": 1.02586019, + "balance_loss_mlp": 1.01535511, + "epoch": 0.533413497670224, + "flos": 15011091838080.0, + "grad_norm": 3.6198648157586755, + "language_loss": 0.7220099, + "learning_rate": 1.8807008539593498e-06, + "loss": 0.7427417, + "num_input_tokens_seen": 190817155, + "step": 8872, + "time_per_iteration": 2.6252543926239014 + }, + { + "auxiliary_loss_clip": 0.01043086, + "auxiliary_loss_mlp": 0.01038402, + "balance_loss_clip": 1.02804434, + "balance_loss_mlp": 1.026618, + "epoch": 0.533473620922892, + "flos": 19609955890560.0, + "grad_norm": 1.8094274329953035, + "language_loss": 0.65117526, + "learning_rate": 1.880312088025936e-06, + "loss": 0.67199016, + "num_input_tokens_seen": 190835240, + "step": 8873, + "time_per_iteration": 2.6235437393188477 + }, + { + "auxiliary_loss_clip": 0.0104847, + "auxiliary_loss_mlp": 0.01041941, + "balance_loss_clip": 1.02691865, + "balance_loss_mlp": 1.03046107, + "epoch": 0.5335337441755599, + "flos": 14282644020480.0, + "grad_norm": 3.558625434898605, + "language_loss": 0.79858708, + "learning_rate": 1.879923326631099e-06, + "loss": 0.81949115, + "num_input_tokens_seen": 190851620, + "step": 8874, + "time_per_iteration": 2.5683324337005615 + }, + { + "auxiliary_loss_clip": 0.01056678, + "auxiliary_loss_mlp": 0.0103003, + "balance_loss_clip": 1.0253799, + "balance_loss_mlp": 1.01886559, + "epoch": 0.5335938674282279, + "flos": 20814830726400.0, + "grad_norm": 1.7561594831766634, + "language_loss": 0.70333183, + "learning_rate": 1.879534569789582e-06, + "loss": 0.72419888, + "num_input_tokens_seen": 190870545, + "step": 8875, + "time_per_iteration": 2.677948236465454 + }, + { + "auxiliary_loss_clip": 0.01010371, + "auxiliary_loss_mlp": 0.01002121, + "balance_loss_clip": 1.00331569, + "balance_loss_mlp": 1.00076795, + "epoch": 0.5336539906808958, + "flos": 71396448451200.0, + "grad_norm": 0.7210960174322688, + "language_loss": 0.59666258, + "learning_rate": 1.879145817516126e-06, + "loss": 0.61678749, + "num_input_tokens_seen": 190931995, + "step": 8876, + "time_per_iteration": 3.223839282989502 + }, + { + "auxiliary_loss_clip": 0.01057648, + "auxiliary_loss_mlp": 0.01032922, + "balance_loss_clip": 1.02577734, + "balance_loss_mlp": 1.02159715, + "epoch": 0.5337141139335638, + "flos": 20152996680960.0, + "grad_norm": 2.183378899461941, + "language_loss": 0.74675333, + "learning_rate": 1.8787570698254727e-06, + "loss": 0.76765907, + "num_input_tokens_seen": 190949890, + "step": 8877, + "time_per_iteration": 4.1305272579193115 + }, + { + "auxiliary_loss_clip": 0.01002564, + "auxiliary_loss_mlp": 0.01007762, + "balance_loss_clip": 1.00542951, + "balance_loss_mlp": 1.00648022, + "epoch": 0.5337742371862317, + "flos": 67728387484800.0, + "grad_norm": 0.7624791203242544, + "language_loss": 0.5719583, + "learning_rate": 1.8783683267323629e-06, + "loss": 0.59206158, + "num_input_tokens_seen": 191008480, + "step": 8878, + "time_per_iteration": 3.062499523162842 + }, + { + "auxiliary_loss_clip": 0.01071655, + "auxiliary_loss_mlp": 0.01032146, + "balance_loss_clip": 1.02719653, + "balance_loss_mlp": 1.01974273, + "epoch": 0.5338343604388998, + "flos": 25008909436800.0, + "grad_norm": 4.815906523530701, + "language_loss": 0.72458071, + "learning_rate": 1.8779795882515395e-06, + "loss": 0.7456187, + "num_input_tokens_seen": 191028995, + "step": 8879, + "time_per_iteration": 2.612294912338257 + }, + { + "auxiliary_loss_clip": 0.01069224, + "auxiliary_loss_mlp": 0.01026617, + "balance_loss_clip": 1.02646399, + "balance_loss_mlp": 1.01494026, + "epoch": 0.5338944836915677, + "flos": 17601256546560.0, + "grad_norm": 2.449363308797458, + "language_loss": 0.83528352, + "learning_rate": 1.8775908543977416e-06, + "loss": 0.85624188, + "num_input_tokens_seen": 191045285, + "step": 8880, + "time_per_iteration": 4.142926931381226 + }, + { + "auxiliary_loss_clip": 0.01009713, + "auxiliary_loss_mlp": 0.01031531, + "balance_loss_clip": 1.024297, + "balance_loss_mlp": 1.02021217, + "epoch": 0.5339546069442357, + "flos": 21724124544000.0, + "grad_norm": 1.495369181551116, + "language_loss": 0.79446685, + "learning_rate": 1.8772021251857107e-06, + "loss": 0.8148793, + "num_input_tokens_seen": 191066105, + "step": 8881, + "time_per_iteration": 2.732975721359253 + }, + { + "auxiliary_loss_clip": 0.00990945, + "auxiliary_loss_mlp": 0.01008523, + "balance_loss_clip": 1.00408363, + "balance_loss_mlp": 1.00739026, + "epoch": 0.5340147301969036, + "flos": 69723583315200.0, + "grad_norm": 0.7932686944545919, + "language_loss": 0.59219599, + "learning_rate": 1.8768134006301882e-06, + "loss": 0.61219072, + "num_input_tokens_seen": 191126315, + "step": 8882, + "time_per_iteration": 3.111276865005493 + }, + { + "auxiliary_loss_clip": 0.00991174, + "auxiliary_loss_mlp": 0.01001958, + "balance_loss_clip": 1.00407064, + "balance_loss_mlp": 1.0007118, + "epoch": 0.5340748534495716, + "flos": 63880701580800.0, + "grad_norm": 0.8678103367551874, + "language_loss": 0.6367448, + "learning_rate": 1.876424680745913e-06, + "loss": 0.65667611, + "num_input_tokens_seen": 191174240, + "step": 8883, + "time_per_iteration": 3.005952835083008 + }, + { + "auxiliary_loss_clip": 0.01022553, + "auxiliary_loss_mlp": 0.01036941, + "balance_loss_clip": 1.02230644, + "balance_loss_mlp": 1.02403045, + "epoch": 0.5341349767022396, + "flos": 28694313694080.0, + "grad_norm": 2.9497610694362892, + "language_loss": 0.82601881, + "learning_rate": 1.8760359655476272e-06, + "loss": 0.84661371, + "num_input_tokens_seen": 191193335, + "step": 8884, + "time_per_iteration": 2.8239970207214355 + }, + { + "auxiliary_loss_clip": 0.01028175, + "auxiliary_loss_mlp": 0.01035669, + "balance_loss_clip": 1.02120459, + "balance_loss_mlp": 1.02355719, + "epoch": 0.5341950999549075, + "flos": 16289691338880.0, + "grad_norm": 1.656186687217737, + "language_loss": 0.72672963, + "learning_rate": 1.8756472550500695e-06, + "loss": 0.7473681, + "num_input_tokens_seen": 191210900, + "step": 8885, + "time_per_iteration": 2.6303727626800537 + }, + { + "auxiliary_loss_clip": 0.01047516, + "auxiliary_loss_mlp": 0.01028158, + "balance_loss_clip": 1.02351248, + "balance_loss_mlp": 1.01588547, + "epoch": 0.5342552232075756, + "flos": 14355650413440.0, + "grad_norm": 2.1264325389293, + "language_loss": 0.78463888, + "learning_rate": 1.87525854926798e-06, + "loss": 0.8053956, + "num_input_tokens_seen": 191226730, + "step": 8886, + "time_per_iteration": 2.5761830806732178 + }, + { + "auxiliary_loss_clip": 0.01037661, + "auxiliary_loss_mlp": 0.00747556, + "balance_loss_clip": 1.02602959, + "balance_loss_mlp": 1.00028336, + "epoch": 0.5343153464602435, + "flos": 30297976300800.0, + "grad_norm": 1.7218457925276065, + "language_loss": 0.7486015, + "learning_rate": 1.8748698482160996e-06, + "loss": 0.76645374, + "num_input_tokens_seen": 191250435, + "step": 8887, + "time_per_iteration": 2.7194929122924805 + }, + { + "auxiliary_loss_clip": 0.01039929, + "auxiliary_loss_mlp": 0.01029794, + "balance_loss_clip": 1.02192438, + "balance_loss_mlp": 1.01807618, + "epoch": 0.5343754697129115, + "flos": 15596292216960.0, + "grad_norm": 2.216269541720739, + "language_loss": 0.69368941, + "learning_rate": 1.8744811519091663e-06, + "loss": 0.71438664, + "num_input_tokens_seen": 191268315, + "step": 8888, + "time_per_iteration": 2.62672758102417 + }, + { + "auxiliary_loss_clip": 0.01058362, + "auxiliary_loss_mlp": 0.01033406, + "balance_loss_clip": 1.02487445, + "balance_loss_mlp": 1.02147329, + "epoch": 0.5344355929655794, + "flos": 16909617191040.0, + "grad_norm": 2.037752938318696, + "language_loss": 0.77304918, + "learning_rate": 1.8740924603619208e-06, + "loss": 0.79396689, + "num_input_tokens_seen": 191287000, + "step": 8889, + "time_per_iteration": 2.6860859394073486 + }, + { + "auxiliary_loss_clip": 0.01069328, + "auxiliary_loss_mlp": 0.01039289, + "balance_loss_clip": 1.02713823, + "balance_loss_mlp": 1.02692103, + "epoch": 0.5344957162182474, + "flos": 16798186224000.0, + "grad_norm": 2.639766413740384, + "language_loss": 0.69270992, + "learning_rate": 1.873703773589102e-06, + "loss": 0.71379614, + "num_input_tokens_seen": 191304565, + "step": 8890, + "time_per_iteration": 2.5196614265441895 + }, + { + "auxiliary_loss_clip": 0.01071475, + "auxiliary_loss_mlp": 0.01036225, + "balance_loss_clip": 1.02650237, + "balance_loss_mlp": 1.02293372, + "epoch": 0.5345558394709153, + "flos": 12705590413440.0, + "grad_norm": 2.4463942225653117, + "language_loss": 0.76745951, + "learning_rate": 1.8733150916054483e-06, + "loss": 0.78853655, + "num_input_tokens_seen": 191318300, + "step": 8891, + "time_per_iteration": 2.529083013534546 + }, + { + "auxiliary_loss_clip": 0.01047895, + "auxiliary_loss_mlp": 0.01036022, + "balance_loss_clip": 1.02321017, + "balance_loss_mlp": 1.02394056, + "epoch": 0.5346159627235834, + "flos": 22455050400000.0, + "grad_norm": 1.513045945831992, + "language_loss": 0.74197006, + "learning_rate": 1.872926414425699e-06, + "loss": 0.76280922, + "num_input_tokens_seen": 191337925, + "step": 8892, + "time_per_iteration": 2.6611130237579346 + }, + { + "auxiliary_loss_clip": 0.0105407, + "auxiliary_loss_mlp": 0.0103336, + "balance_loss_clip": 1.03066754, + "balance_loss_mlp": 1.02177858, + "epoch": 0.5346760859762513, + "flos": 22415763899520.0, + "grad_norm": 1.7939821150131285, + "language_loss": 0.8788054, + "learning_rate": 1.8725377420645932e-06, + "loss": 0.89967966, + "num_input_tokens_seen": 191357120, + "step": 8893, + "time_per_iteration": 2.6038174629211426 + }, + { + "auxiliary_loss_clip": 0.01063821, + "auxiliary_loss_mlp": 0.01027923, + "balance_loss_clip": 1.02425551, + "balance_loss_mlp": 1.01736093, + "epoch": 0.5347362092289193, + "flos": 22816131868800.0, + "grad_norm": 1.7485828557271221, + "language_loss": 0.72988021, + "learning_rate": 1.872149074536869e-06, + "loss": 0.75079763, + "num_input_tokens_seen": 191375395, + "step": 8894, + "time_per_iteration": 2.666126251220703 + }, + { + "auxiliary_loss_clip": 0.01057014, + "auxiliary_loss_mlp": 0.01028983, + "balance_loss_clip": 1.02570581, + "balance_loss_mlp": 1.01734877, + "epoch": 0.5347963324815872, + "flos": 23219480666880.0, + "grad_norm": 1.5370076206132448, + "language_loss": 0.74851286, + "learning_rate": 1.8717604118572648e-06, + "loss": 0.76937282, + "num_input_tokens_seen": 191395595, + "step": 8895, + "time_per_iteration": 2.6155035495758057 + }, + { + "auxiliary_loss_clip": 0.01038085, + "auxiliary_loss_mlp": 0.01031792, + "balance_loss_clip": 1.02344656, + "balance_loss_mlp": 1.01965094, + "epoch": 0.5348564557342552, + "flos": 22601350494720.0, + "grad_norm": 1.6273330325310325, + "language_loss": 0.7698884, + "learning_rate": 1.8713717540405178e-06, + "loss": 0.79058719, + "num_input_tokens_seen": 191413730, + "step": 8896, + "time_per_iteration": 2.598919153213501 + }, + { + "auxiliary_loss_clip": 0.01046877, + "auxiliary_loss_mlp": 0.0102595, + "balance_loss_clip": 1.02714968, + "balance_loss_mlp": 1.01432729, + "epoch": 0.5349165789869232, + "flos": 18002378701440.0, + "grad_norm": 1.6628877328705718, + "language_loss": 0.78329206, + "learning_rate": 1.8709831011013676e-06, + "loss": 0.8040204, + "num_input_tokens_seen": 191432400, + "step": 8897, + "time_per_iteration": 2.5937068462371826 + }, + { + "auxiliary_loss_clip": 0.01059603, + "auxiliary_loss_mlp": 0.01031848, + "balance_loss_clip": 1.02758884, + "balance_loss_mlp": 1.02026725, + "epoch": 0.5349767022395912, + "flos": 17159770483200.0, + "grad_norm": 1.8255649615615124, + "language_loss": 0.76196128, + "learning_rate": 1.8705944530545509e-06, + "loss": 0.78287584, + "num_input_tokens_seen": 191448855, + "step": 8898, + "time_per_iteration": 2.5912363529205322 + }, + { + "auxiliary_loss_clip": 0.00998648, + "auxiliary_loss_mlp": 0.0100612, + "balance_loss_clip": 1.00192308, + "balance_loss_mlp": 1.00482619, + "epoch": 0.5350368254922592, + "flos": 70992058158720.0, + "grad_norm": 0.8451471659191258, + "language_loss": 0.58000159, + "learning_rate": 1.8702058099148052e-06, + "loss": 0.60004926, + "num_input_tokens_seen": 191519690, + "step": 8899, + "time_per_iteration": 3.357499599456787 + }, + { + "auxiliary_loss_clip": 0.01047236, + "auxiliary_loss_mlp": 0.01027712, + "balance_loss_clip": 1.02654672, + "balance_loss_mlp": 1.0168879, + "epoch": 0.5350969487449271, + "flos": 27417833095680.0, + "grad_norm": 1.6363333094293162, + "language_loss": 0.69987381, + "learning_rate": 1.869817171696868e-06, + "loss": 0.72062325, + "num_input_tokens_seen": 191539380, + "step": 8900, + "time_per_iteration": 2.727609872817993 + }, + { + "auxiliary_loss_clip": 0.01044117, + "auxiliary_loss_mlp": 0.01032119, + "balance_loss_clip": 1.02319324, + "balance_loss_mlp": 1.020437, + "epoch": 0.5351570719975951, + "flos": 19316134638720.0, + "grad_norm": 1.8420596197185721, + "language_loss": 0.71446663, + "learning_rate": 1.8694285384154777e-06, + "loss": 0.73522902, + "num_input_tokens_seen": 191557400, + "step": 8901, + "time_per_iteration": 2.606051206588745 + }, + { + "auxiliary_loss_clip": 0.01028538, + "auxiliary_loss_mlp": 0.01029489, + "balance_loss_clip": 1.0213424, + "balance_loss_mlp": 1.0171808, + "epoch": 0.535217195250263, + "flos": 19828580019840.0, + "grad_norm": 1.8829427087461355, + "language_loss": 0.77395046, + "learning_rate": 1.8690399100853699e-06, + "loss": 0.79453075, + "num_input_tokens_seen": 191575860, + "step": 8902, + "time_per_iteration": 2.674525260925293 + }, + { + "auxiliary_loss_clip": 0.01037283, + "auxiliary_loss_mlp": 0.01037778, + "balance_loss_clip": 1.02336717, + "balance_loss_mlp": 1.02610159, + "epoch": 0.535277318502931, + "flos": 22127868391680.0, + "grad_norm": 1.460645573781238, + "language_loss": 0.69988579, + "learning_rate": 1.868651286721281e-06, + "loss": 0.72063637, + "num_input_tokens_seen": 191595775, + "step": 8903, + "time_per_iteration": 2.671315908432007 + }, + { + "auxiliary_loss_clip": 0.01059206, + "auxiliary_loss_mlp": 0.00747512, + "balance_loss_clip": 1.02550662, + "balance_loss_mlp": 1.00017369, + "epoch": 0.5353374417555989, + "flos": 25045897466880.0, + "grad_norm": 1.8028219867106585, + "language_loss": 0.72420871, + "learning_rate": 1.86826266833795e-06, + "loss": 0.74227595, + "num_input_tokens_seen": 191617785, + "step": 8904, + "time_per_iteration": 2.603482246398926 + }, + { + "auxiliary_loss_clip": 0.01048561, + "auxiliary_loss_mlp": 0.01034741, + "balance_loss_clip": 1.02661705, + "balance_loss_mlp": 1.02224791, + "epoch": 0.535397565008267, + "flos": 19388710068480.0, + "grad_norm": 1.898336064885352, + "language_loss": 0.73490256, + "learning_rate": 1.8678740549501103e-06, + "loss": 0.75573552, + "num_input_tokens_seen": 191636900, + "step": 8905, + "time_per_iteration": 2.6955037117004395 + }, + { + "auxiliary_loss_clip": 0.01053895, + "auxiliary_loss_mlp": 0.01034272, + "balance_loss_clip": 1.02592957, + "balance_loss_mlp": 1.02435398, + "epoch": 0.5354576882609349, + "flos": 21471205904640.0, + "grad_norm": 1.4857787972735426, + "language_loss": 0.83276081, + "learning_rate": 1.8674854465725005e-06, + "loss": 0.85364246, + "num_input_tokens_seen": 191656720, + "step": 8906, + "time_per_iteration": 2.602078437805176 + }, + { + "auxiliary_loss_clip": 0.01059316, + "auxiliary_loss_mlp": 0.00747508, + "balance_loss_clip": 1.02619863, + "balance_loss_mlp": 1.00015163, + "epoch": 0.5355178115136029, + "flos": 20777519473920.0, + "grad_norm": 1.893781270757607, + "language_loss": 0.73826563, + "learning_rate": 1.8670968432198563e-06, + "loss": 0.75633395, + "num_input_tokens_seen": 191674445, + "step": 8907, + "time_per_iteration": 2.616783618927002 + }, + { + "auxiliary_loss_clip": 0.0105105, + "auxiliary_loss_mlp": 0.0103296, + "balance_loss_clip": 1.02375209, + "balance_loss_mlp": 1.02053857, + "epoch": 0.5355779347662708, + "flos": 23514020190720.0, + "grad_norm": 1.8863729897726864, + "language_loss": 0.76384497, + "learning_rate": 1.866708244906912e-06, + "loss": 0.78468502, + "num_input_tokens_seen": 191695000, + "step": 8908, + "time_per_iteration": 2.635105609893799 + }, + { + "auxiliary_loss_clip": 0.01040288, + "auxiliary_loss_mlp": 0.00747424, + "balance_loss_clip": 1.02439177, + "balance_loss_mlp": 1.00013089, + "epoch": 0.5356380580189388, + "flos": 20303211358080.0, + "grad_norm": 2.2230995198056775, + "language_loss": 0.74324518, + "learning_rate": 1.8663196516484055e-06, + "loss": 0.76112223, + "num_input_tokens_seen": 191713295, + "step": 8909, + "time_per_iteration": 2.6455793380737305 + }, + { + "auxiliary_loss_clip": 0.01025018, + "auxiliary_loss_mlp": 0.01034328, + "balance_loss_clip": 1.02590156, + "balance_loss_mlp": 1.0230149, + "epoch": 0.5356981812716068, + "flos": 21361642444800.0, + "grad_norm": 2.081750479540636, + "language_loss": 0.84371984, + "learning_rate": 1.8659310634590702e-06, + "loss": 0.86431336, + "num_input_tokens_seen": 191732725, + "step": 8910, + "time_per_iteration": 2.7120940685272217 + }, + { + "auxiliary_loss_clip": 0.01047415, + "auxiliary_loss_mlp": 0.01028973, + "balance_loss_clip": 1.02447391, + "balance_loss_mlp": 1.01760602, + "epoch": 0.5357583045242748, + "flos": 23111246010240.0, + "grad_norm": 1.8077662288146932, + "language_loss": 0.81950951, + "learning_rate": 1.8655424803536427e-06, + "loss": 0.84027338, + "num_input_tokens_seen": 191753765, + "step": 8911, + "time_per_iteration": 2.625991106033325 + }, + { + "auxiliary_loss_clip": 0.01028942, + "auxiliary_loss_mlp": 0.01033641, + "balance_loss_clip": 1.02349436, + "balance_loss_mlp": 1.02192307, + "epoch": 0.5358184277769428, + "flos": 21141761339520.0, + "grad_norm": 1.6304439831151618, + "language_loss": 0.68933505, + "learning_rate": 1.8651539023468585e-06, + "loss": 0.70996088, + "num_input_tokens_seen": 191773560, + "step": 8912, + "time_per_iteration": 2.6118009090423584 + }, + { + "auxiliary_loss_clip": 0.01046715, + "auxiliary_loss_mlp": 0.0103457, + "balance_loss_clip": 1.026088, + "balance_loss_mlp": 1.02306044, + "epoch": 0.5358785510296107, + "flos": 16282400878080.0, + "grad_norm": 1.9093740549558131, + "language_loss": 0.71464479, + "learning_rate": 1.8647653294534509e-06, + "loss": 0.73545766, + "num_input_tokens_seen": 191791255, + "step": 8913, + "time_per_iteration": 4.22602653503418 + }, + { + "auxiliary_loss_clip": 0.01040018, + "auxiliary_loss_mlp": 0.01036291, + "balance_loss_clip": 1.02592719, + "balance_loss_mlp": 1.02422714, + "epoch": 0.5359386742822787, + "flos": 16976877408000.0, + "grad_norm": 1.9038004747632105, + "language_loss": 0.71716082, + "learning_rate": 1.864376761688156e-06, + "loss": 0.73792392, + "num_input_tokens_seen": 191809325, + "step": 8914, + "time_per_iteration": 2.6450512409210205 + }, + { + "auxiliary_loss_clip": 0.01048032, + "auxiliary_loss_mlp": 0.01040798, + "balance_loss_clip": 1.0284636, + "balance_loss_mlp": 1.02808416, + "epoch": 0.5359987975349466, + "flos": 20812927305600.0, + "grad_norm": 1.8659295195085426, + "language_loss": 0.70661813, + "learning_rate": 1.8639881990657079e-06, + "loss": 0.72750646, + "num_input_tokens_seen": 191829795, + "step": 8915, + "time_per_iteration": 4.223742246627808 + }, + { + "auxiliary_loss_clip": 0.01038202, + "auxiliary_loss_mlp": 0.0103734, + "balance_loss_clip": 1.02344441, + "balance_loss_mlp": 1.02509093, + "epoch": 0.5360589207876146, + "flos": 22199941031040.0, + "grad_norm": 1.5970742230986972, + "language_loss": 0.7505604, + "learning_rate": 1.8635996416008408e-06, + "loss": 0.77131581, + "num_input_tokens_seen": 191850840, + "step": 8916, + "time_per_iteration": 2.735837697982788 + }, + { + "auxiliary_loss_clip": 0.01028468, + "auxiliary_loss_mlp": 0.00747483, + "balance_loss_clip": 1.02472115, + "balance_loss_mlp": 1.00018549, + "epoch": 0.5361190440402825, + "flos": 31394365084800.0, + "grad_norm": 2.064813099031579, + "language_loss": 0.72599328, + "learning_rate": 1.863211089308289e-06, + "loss": 0.74375284, + "num_input_tokens_seen": 191869520, + "step": 8917, + "time_per_iteration": 3.0278260707855225 + }, + { + "auxiliary_loss_clip": 0.01047708, + "auxiliary_loss_mlp": 0.01032084, + "balance_loss_clip": 1.02618647, + "balance_loss_mlp": 1.02054453, + "epoch": 0.5361791672929506, + "flos": 16069882060800.0, + "grad_norm": 2.0334576632974666, + "language_loss": 0.71316826, + "learning_rate": 1.8628225422027865e-06, + "loss": 0.73396623, + "num_input_tokens_seen": 191887240, + "step": 8918, + "time_per_iteration": 2.6898720264434814 + }, + { + "auxiliary_loss_clip": 0.01051375, + "auxiliary_loss_mlp": 0.01037454, + "balance_loss_clip": 1.02820039, + "balance_loss_mlp": 1.02577138, + "epoch": 0.5362392905456185, + "flos": 20740926493440.0, + "grad_norm": 1.8999360224139807, + "language_loss": 0.75061131, + "learning_rate": 1.862434000299067e-06, + "loss": 0.77149963, + "num_input_tokens_seen": 191905690, + "step": 8919, + "time_per_iteration": 2.6782350540161133 + }, + { + "auxiliary_loss_clip": 0.01046897, + "auxiliary_loss_mlp": 0.01031095, + "balance_loss_clip": 1.02465987, + "balance_loss_mlp": 1.01988292, + "epoch": 0.5362994137982865, + "flos": 17340077779200.0, + "grad_norm": 2.1371472749343163, + "language_loss": 0.71739322, + "learning_rate": 1.862045463611864e-06, + "loss": 0.73817313, + "num_input_tokens_seen": 191920725, + "step": 8920, + "time_per_iteration": 2.5991344451904297 + }, + { + "auxiliary_loss_clip": 0.01047407, + "auxiliary_loss_mlp": 0.01029171, + "balance_loss_clip": 1.02174807, + "balance_loss_mlp": 1.01668382, + "epoch": 0.5363595370509544, + "flos": 42813957795840.0, + "grad_norm": 1.573944883464422, + "language_loss": 0.68602651, + "learning_rate": 1.8616569321559105e-06, + "loss": 0.70679229, + "num_input_tokens_seen": 191944645, + "step": 8921, + "time_per_iteration": 2.8057615756988525 + }, + { + "auxiliary_loss_clip": 0.01058695, + "auxiliary_loss_mlp": 0.01028846, + "balance_loss_clip": 1.02749228, + "balance_loss_mlp": 1.01749158, + "epoch": 0.5364196603036224, + "flos": 19171953446400.0, + "grad_norm": 1.8812980892067026, + "language_loss": 0.81982052, + "learning_rate": 1.86126840594594e-06, + "loss": 0.84069586, + "num_input_tokens_seen": 191962265, + "step": 8922, + "time_per_iteration": 2.611820936203003 + }, + { + "auxiliary_loss_clip": 0.01060224, + "auxiliary_loss_mlp": 0.01028658, + "balance_loss_clip": 1.02690196, + "balance_loss_mlp": 1.0174408, + "epoch": 0.5364797835562904, + "flos": 17931060247680.0, + "grad_norm": 1.9104254648355792, + "language_loss": 0.76504123, + "learning_rate": 1.860879884996686e-06, + "loss": 0.78593004, + "num_input_tokens_seen": 191978850, + "step": 8923, + "time_per_iteration": 2.617886543273926 + }, + { + "auxiliary_loss_clip": 0.01047478, + "auxiliary_loss_mlp": 0.01032404, + "balance_loss_clip": 1.02679443, + "balance_loss_mlp": 1.01988745, + "epoch": 0.5365399068089584, + "flos": 30228058477440.0, + "grad_norm": 1.3832145538561924, + "language_loss": 0.70261121, + "learning_rate": 1.8604913693228804e-06, + "loss": 0.72341007, + "num_input_tokens_seen": 192002000, + "step": 8924, + "time_per_iteration": 4.337195634841919 + }, + { + "auxiliary_loss_clip": 0.01038225, + "auxiliary_loss_mlp": 0.01033485, + "balance_loss_clip": 1.02512693, + "balance_loss_mlp": 1.02087247, + "epoch": 0.5366000300616264, + "flos": 24891696380160.0, + "grad_norm": 1.9428582743922953, + "language_loss": 0.87171471, + "learning_rate": 1.8601028589392558e-06, + "loss": 0.89243186, + "num_input_tokens_seen": 192019100, + "step": 8925, + "time_per_iteration": 2.7306134700775146 + }, + { + "auxiliary_loss_clip": 0.0106676, + "auxiliary_loss_mlp": 0.01029024, + "balance_loss_clip": 1.02400017, + "balance_loss_mlp": 1.01727629, + "epoch": 0.5366601533142943, + "flos": 29826649013760.0, + "grad_norm": 1.9509859631837962, + "language_loss": 0.78112054, + "learning_rate": 1.8597143538605455e-06, + "loss": 0.80207837, + "num_input_tokens_seen": 192041660, + "step": 8926, + "time_per_iteration": 2.6339948177337646 + }, + { + "auxiliary_loss_clip": 0.01036555, + "auxiliary_loss_mlp": 0.0102637, + "balance_loss_clip": 1.02584958, + "balance_loss_mlp": 1.01603484, + "epoch": 0.5367202765669623, + "flos": 27199352620800.0, + "grad_norm": 2.148796835675792, + "language_loss": 0.67196423, + "learning_rate": 1.85932585410148e-06, + "loss": 0.69259346, + "num_input_tokens_seen": 192063540, + "step": 8927, + "time_per_iteration": 4.397145986557007 + }, + { + "auxiliary_loss_clip": 0.01056427, + "auxiliary_loss_mlp": 0.01027785, + "balance_loss_clip": 1.02457118, + "balance_loss_mlp": 1.01690698, + "epoch": 0.5367803998196302, + "flos": 20229953569920.0, + "grad_norm": 1.9347535890119505, + "language_loss": 0.73281968, + "learning_rate": 1.8589373596767929e-06, + "loss": 0.75366175, + "num_input_tokens_seen": 192081760, + "step": 8928, + "time_per_iteration": 2.6144447326660156 + }, + { + "auxiliary_loss_clip": 0.01040178, + "auxiliary_loss_mlp": 0.0102979, + "balance_loss_clip": 1.02263415, + "balance_loss_mlp": 1.01884031, + "epoch": 0.5368405230722982, + "flos": 32154629374080.0, + "grad_norm": 2.698714999526633, + "language_loss": 0.62577415, + "learning_rate": 1.8585488706012154e-06, + "loss": 0.64647388, + "num_input_tokens_seen": 192101620, + "step": 8929, + "time_per_iteration": 2.742241859436035 + }, + { + "auxiliary_loss_clip": 0.01057738, + "auxiliary_loss_mlp": 0.01029776, + "balance_loss_clip": 1.02613509, + "balance_loss_mlp": 1.01832628, + "epoch": 0.5369006463249661, + "flos": 26247935128320.0, + "grad_norm": 2.4412323543112526, + "language_loss": 0.66105676, + "learning_rate": 1.8581603868894781e-06, + "loss": 0.68193191, + "num_input_tokens_seen": 192121805, + "step": 8930, + "time_per_iteration": 2.6386606693267822 + }, + { + "auxiliary_loss_clip": 0.0102249, + "auxiliary_loss_mlp": 0.01026499, + "balance_loss_clip": 1.02336037, + "balance_loss_mlp": 1.01478076, + "epoch": 0.5369607695776342, + "flos": 26211306234240.0, + "grad_norm": 1.4688839961148852, + "language_loss": 0.66897154, + "learning_rate": 1.8577719085563136e-06, + "loss": 0.68946147, + "num_input_tokens_seen": 192141765, + "step": 8931, + "time_per_iteration": 2.751089096069336 + }, + { + "auxiliary_loss_clip": 0.01014071, + "auxiliary_loss_mlp": 0.01034072, + "balance_loss_clip": 1.02350593, + "balance_loss_mlp": 1.02152562, + "epoch": 0.5370208928303021, + "flos": 25009017177600.0, + "grad_norm": 1.5391999462061992, + "language_loss": 0.75718564, + "learning_rate": 1.8573834356164525e-06, + "loss": 0.77766705, + "num_input_tokens_seen": 192161560, + "step": 8932, + "time_per_iteration": 2.701698064804077 + }, + { + "auxiliary_loss_clip": 0.01028029, + "auxiliary_loss_mlp": 0.01034205, + "balance_loss_clip": 1.0244323, + "balance_loss_mlp": 1.02282643, + "epoch": 0.5370810160829701, + "flos": 31792147274880.0, + "grad_norm": 1.6873533063130246, + "language_loss": 0.66411728, + "learning_rate": 1.8569949680846261e-06, + "loss": 0.68473959, + "num_input_tokens_seen": 192180190, + "step": 8933, + "time_per_iteration": 2.7149648666381836 + }, + { + "auxiliary_loss_clip": 0.01055326, + "auxiliary_loss_mlp": 0.00747495, + "balance_loss_clip": 1.02595246, + "balance_loss_mlp": 1.00022292, + "epoch": 0.537141139335638, + "flos": 23842602829440.0, + "grad_norm": 1.9785480988720654, + "language_loss": 0.82628322, + "learning_rate": 1.856606505975565e-06, + "loss": 0.84431136, + "num_input_tokens_seen": 192198855, + "step": 8934, + "time_per_iteration": 2.5857155323028564 + }, + { + "auxiliary_loss_clip": 0.01031858, + "auxiliary_loss_mlp": 0.01036518, + "balance_loss_clip": 1.02308524, + "balance_loss_mlp": 1.02479935, + "epoch": 0.537201262588306, + "flos": 18508826511360.0, + "grad_norm": 2.041219440704127, + "language_loss": 0.80111998, + "learning_rate": 1.856218049303999e-06, + "loss": 0.82180375, + "num_input_tokens_seen": 192216555, + "step": 8935, + "time_per_iteration": 2.682002305984497 + }, + { + "auxiliary_loss_clip": 0.01056112, + "auxiliary_loss_mlp": 0.01034089, + "balance_loss_clip": 1.02495468, + "balance_loss_mlp": 1.02292562, + "epoch": 0.537261385840974, + "flos": 25662950231040.0, + "grad_norm": 1.9264715864554045, + "language_loss": 0.83732271, + "learning_rate": 1.855829598084659e-06, + "loss": 0.85822475, + "num_input_tokens_seen": 192236910, + "step": 8936, + "time_per_iteration": 2.6725943088531494 + }, + { + "auxiliary_loss_clip": 0.01035665, + "auxiliary_loss_mlp": 0.01027298, + "balance_loss_clip": 1.02548301, + "balance_loss_mlp": 1.01675403, + "epoch": 0.537321509093642, + "flos": 40735017406080.0, + "grad_norm": 1.2737992583069853, + "language_loss": 0.72775888, + "learning_rate": 1.8554411523322754e-06, + "loss": 0.74838853, + "num_input_tokens_seen": 192260790, + "step": 8937, + "time_per_iteration": 2.865077018737793 + }, + { + "auxiliary_loss_clip": 0.01040049, + "auxiliary_loss_mlp": 0.01032945, + "balance_loss_clip": 1.02185309, + "balance_loss_mlp": 1.02001715, + "epoch": 0.53738163234631, + "flos": 17238487138560.0, + "grad_norm": 2.052980731353456, + "language_loss": 0.81603527, + "learning_rate": 1.8550527120615778e-06, + "loss": 0.83676517, + "num_input_tokens_seen": 192277230, + "step": 8938, + "time_per_iteration": 2.6331734657287598 + }, + { + "auxiliary_loss_clip": 0.01071352, + "auxiliary_loss_mlp": 0.01034395, + "balance_loss_clip": 1.02767801, + "balance_loss_mlp": 1.02311778, + "epoch": 0.5374417555989779, + "flos": 12821977457280.0, + "grad_norm": 2.6531246631147276, + "language_loss": 0.8119846, + "learning_rate": 1.8546642772872957e-06, + "loss": 0.83304203, + "num_input_tokens_seen": 192292840, + "step": 8939, + "time_per_iteration": 2.5304479598999023 + }, + { + "auxiliary_loss_clip": 0.00982487, + "auxiliary_loss_mlp": 0.01012225, + "balance_loss_clip": 1.00540698, + "balance_loss_mlp": 1.01096773, + "epoch": 0.5375018788516459, + "flos": 67256018703360.0, + "grad_norm": 0.7158103219276113, + "language_loss": 0.52441537, + "learning_rate": 1.8542758480241589e-06, + "loss": 0.54436249, + "num_input_tokens_seen": 192358240, + "step": 8940, + "time_per_iteration": 3.2543258666992188 + }, + { + "auxiliary_loss_clip": 0.01032508, + "auxiliary_loss_mlp": 0.01028818, + "balance_loss_clip": 1.02504969, + "balance_loss_mlp": 1.01757109, + "epoch": 0.5375620021043138, + "flos": 18114168804480.0, + "grad_norm": 2.548760665375602, + "language_loss": 0.71833241, + "learning_rate": 1.8538874242868965e-06, + "loss": 0.73894572, + "num_input_tokens_seen": 192377370, + "step": 8941, + "time_per_iteration": 2.6878533363342285 + }, + { + "auxiliary_loss_clip": 0.01039113, + "auxiliary_loss_mlp": 0.01024597, + "balance_loss_clip": 1.02286124, + "balance_loss_mlp": 1.01380241, + "epoch": 0.5376221253569818, + "flos": 23149383275520.0, + "grad_norm": 1.807063733409756, + "language_loss": 0.79681695, + "learning_rate": 1.853499006090237e-06, + "loss": 0.81745404, + "num_input_tokens_seen": 192396450, + "step": 8942, + "time_per_iteration": 2.639244318008423 + }, + { + "auxiliary_loss_clip": 0.01069624, + "auxiliary_loss_mlp": 0.01031645, + "balance_loss_clip": 1.02659404, + "balance_loss_mlp": 1.01970005, + "epoch": 0.5376822486096497, + "flos": 29972302663680.0, + "grad_norm": 2.1415113701586455, + "language_loss": 0.70214695, + "learning_rate": 1.853110593448911e-06, + "loss": 0.72315961, + "num_input_tokens_seen": 192417390, + "step": 8943, + "time_per_iteration": 2.6869404315948486 + }, + { + "auxiliary_loss_clip": 0.01000316, + "auxiliary_loss_mlp": 0.0100229, + "balance_loss_clip": 1.00373101, + "balance_loss_mlp": 1.00115156, + "epoch": 0.5377423718623178, + "flos": 54168950874240.0, + "grad_norm": 0.8118844005856098, + "language_loss": 0.59667516, + "learning_rate": 1.852722186377645e-06, + "loss": 0.61670119, + "num_input_tokens_seen": 192478060, + "step": 8944, + "time_per_iteration": 3.3285231590270996 + }, + { + "auxiliary_loss_clip": 0.01030833, + "auxiliary_loss_mlp": 0.01028121, + "balance_loss_clip": 1.02671766, + "balance_loss_mlp": 1.01535308, + "epoch": 0.5378024951149857, + "flos": 23257079228160.0, + "grad_norm": 2.137890357589997, + "language_loss": 0.77721012, + "learning_rate": 1.852333784891169e-06, + "loss": 0.79779965, + "num_input_tokens_seen": 192495985, + "step": 8945, + "time_per_iteration": 2.9772472381591797 + }, + { + "auxiliary_loss_clip": 0.01058773, + "auxiliary_loss_mlp": 0.01030278, + "balance_loss_clip": 1.02609611, + "balance_loss_mlp": 1.01910782, + "epoch": 0.5378626183676537, + "flos": 24024095274240.0, + "grad_norm": 2.361935761201788, + "language_loss": 0.68709016, + "learning_rate": 1.8519453890042112e-06, + "loss": 0.70798063, + "num_input_tokens_seen": 192515445, + "step": 8946, + "time_per_iteration": 2.739962577819824 + }, + { + "auxiliary_loss_clip": 0.0102683, + "auxiliary_loss_mlp": 0.01033861, + "balance_loss_clip": 1.02598739, + "balance_loss_mlp": 1.02266669, + "epoch": 0.5379227416203216, + "flos": 27161789973120.0, + "grad_norm": 1.5462908658410397, + "language_loss": 0.77112687, + "learning_rate": 1.851556998731498e-06, + "loss": 0.79173374, + "num_input_tokens_seen": 192536530, + "step": 8947, + "time_per_iteration": 2.9440746307373047 + }, + { + "auxiliary_loss_clip": 0.01056648, + "auxiliary_loss_mlp": 0.01027463, + "balance_loss_clip": 1.02595425, + "balance_loss_mlp": 1.01645994, + "epoch": 0.5379828648729896, + "flos": 24681619687680.0, + "grad_norm": 1.4342810663471206, + "language_loss": 0.6038208, + "learning_rate": 1.8511686140877592e-06, + "loss": 0.62466192, + "num_input_tokens_seen": 192556075, + "step": 8948, + "time_per_iteration": 2.9512064456939697 + }, + { + "auxiliary_loss_clip": 0.01021372, + "auxiliary_loss_mlp": 0.01031966, + "balance_loss_clip": 1.02118587, + "balance_loss_mlp": 1.02123737, + "epoch": 0.5380429881256577, + "flos": 22523280284160.0, + "grad_norm": 2.3336929738380583, + "language_loss": 0.79594272, + "learning_rate": 1.8507802350877205e-06, + "loss": 0.81647611, + "num_input_tokens_seen": 192575535, + "step": 8949, + "time_per_iteration": 2.736034393310547 + }, + { + "auxiliary_loss_clip": 0.01021874, + "auxiliary_loss_mlp": 0.0103355, + "balance_loss_clip": 1.02116179, + "balance_loss_mlp": 1.02172458, + "epoch": 0.5381031113783256, + "flos": 26979543342720.0, + "grad_norm": 1.684860312256751, + "language_loss": 0.77766746, + "learning_rate": 1.850391861746111e-06, + "loss": 0.79822171, + "num_input_tokens_seen": 192594490, + "step": 8950, + "time_per_iteration": 2.7593395709991455 + }, + { + "auxiliary_loss_clip": 0.01054456, + "auxiliary_loss_mlp": 0.01025673, + "balance_loss_clip": 1.03208876, + "balance_loss_mlp": 1.01487255, + "epoch": 0.5381632346309936, + "flos": 24754087376640.0, + "grad_norm": 4.106154504455205, + "language_loss": 0.72730911, + "learning_rate": 1.8500034940776573e-06, + "loss": 0.74811041, + "num_input_tokens_seen": 192615650, + "step": 8951, + "time_per_iteration": 2.7119269371032715 + }, + { + "auxiliary_loss_clip": 0.01064201, + "auxiliary_loss_mlp": 0.00747614, + "balance_loss_clip": 1.02375019, + "balance_loss_mlp": 1.00032043, + "epoch": 0.5382233578836615, + "flos": 15560058372480.0, + "grad_norm": 1.7556920970849441, + "language_loss": 0.75440073, + "learning_rate": 1.849615132097085e-06, + "loss": 0.77251881, + "num_input_tokens_seen": 192633840, + "step": 8952, + "time_per_iteration": 2.5113418102264404 + }, + { + "auxiliary_loss_clip": 0.01046923, + "auxiliary_loss_mlp": 0.01026793, + "balance_loss_clip": 1.02552819, + "balance_loss_mlp": 1.01525402, + "epoch": 0.5382834811363295, + "flos": 25084501608960.0, + "grad_norm": 1.4440593408899431, + "language_loss": 0.7976495, + "learning_rate": 1.8492267758191228e-06, + "loss": 0.81838667, + "num_input_tokens_seen": 192655890, + "step": 8953, + "time_per_iteration": 2.636573076248169 + }, + { + "auxiliary_loss_clip": 0.01030971, + "auxiliary_loss_mlp": 0.01030893, + "balance_loss_clip": 1.02513933, + "balance_loss_mlp": 1.01864386, + "epoch": 0.5383436043889974, + "flos": 13297901685120.0, + "grad_norm": 1.756718235434403, + "language_loss": 0.80819434, + "learning_rate": 1.8488384252584964e-06, + "loss": 0.82881302, + "num_input_tokens_seen": 192673025, + "step": 8954, + "time_per_iteration": 2.6800804138183594 + }, + { + "auxiliary_loss_clip": 0.01067563, + "auxiliary_loss_mlp": 0.01027467, + "balance_loss_clip": 1.02575266, + "balance_loss_mlp": 1.01648211, + "epoch": 0.5384037276416654, + "flos": 23039388852480.0, + "grad_norm": 2.0912753345585555, + "language_loss": 0.76407123, + "learning_rate": 1.8484500804299318e-06, + "loss": 0.78502154, + "num_input_tokens_seen": 192692190, + "step": 8955, + "time_per_iteration": 2.792959690093994 + }, + { + "auxiliary_loss_clip": 0.01045963, + "auxiliary_loss_mlp": 0.01030672, + "balance_loss_clip": 1.02611113, + "balance_loss_mlp": 1.01987195, + "epoch": 0.5384638508943334, + "flos": 20631147552000.0, + "grad_norm": 1.5197731140622162, + "language_loss": 0.78228462, + "learning_rate": 1.8480617413481557e-06, + "loss": 0.80305099, + "num_input_tokens_seen": 192710380, + "step": 8956, + "time_per_iteration": 2.7558915615081787 + }, + { + "auxiliary_loss_clip": 0.00980439, + "auxiliary_loss_mlp": 0.01001783, + "balance_loss_clip": 1.00385344, + "balance_loss_mlp": 1.00042999, + "epoch": 0.5385239741470014, + "flos": 66737683491840.0, + "grad_norm": 0.8611375260128081, + "language_loss": 0.63419741, + "learning_rate": 1.8476734080278932e-06, + "loss": 0.65401959, + "num_input_tokens_seen": 192768995, + "step": 8957, + "time_per_iteration": 3.086923599243164 + }, + { + "auxiliary_loss_clip": 0.00984609, + "auxiliary_loss_mlp": 0.01005669, + "balance_loss_clip": 1.01643682, + "balance_loss_mlp": 1.0044409, + "epoch": 0.5385840973996693, + "flos": 64716058229760.0, + "grad_norm": 0.7066854652774381, + "language_loss": 0.51593137, + "learning_rate": 1.8472850804838705e-06, + "loss": 0.53583407, + "num_input_tokens_seen": 192825585, + "step": 8958, + "time_per_iteration": 3.292027235031128 + }, + { + "auxiliary_loss_clip": 0.01060659, + "auxiliary_loss_mlp": 0.0103012, + "balance_loss_clip": 1.02845502, + "balance_loss_mlp": 1.01828277, + "epoch": 0.5386442206523373, + "flos": 26141783460480.0, + "grad_norm": 1.7538081242211077, + "language_loss": 0.77177489, + "learning_rate": 1.8468967587308128e-06, + "loss": 0.79268277, + "num_input_tokens_seen": 192847335, + "step": 8959, + "time_per_iteration": 2.7759246826171875 + }, + { + "auxiliary_loss_clip": 0.01026935, + "auxiliary_loss_mlp": 0.01026742, + "balance_loss_clip": 1.02466297, + "balance_loss_mlp": 1.0154289, + "epoch": 0.5387043439050052, + "flos": 18251849635200.0, + "grad_norm": 2.0899216664674864, + "language_loss": 0.8382858, + "learning_rate": 1.8465084427834455e-06, + "loss": 0.85882258, + "num_input_tokens_seen": 192862205, + "step": 8960, + "time_per_iteration": 4.284580230712891 + }, + { + "auxiliary_loss_clip": 0.01058485, + "auxiliary_loss_mlp": 0.01026977, + "balance_loss_clip": 1.0283401, + "balance_loss_mlp": 1.01582468, + "epoch": 0.5387644671576732, + "flos": 29788296266880.0, + "grad_norm": 1.503409608264264, + "language_loss": 0.78242171, + "learning_rate": 1.8461201326564933e-06, + "loss": 0.8032763, + "num_input_tokens_seen": 192883695, + "step": 8961, + "time_per_iteration": 2.73042368888855 + }, + { + "auxiliary_loss_clip": 0.01034488, + "auxiliary_loss_mlp": 0.0103059, + "balance_loss_clip": 1.02417517, + "balance_loss_mlp": 1.01930058, + "epoch": 0.5388245904103413, + "flos": 22374466237440.0, + "grad_norm": 1.661785181173583, + "language_loss": 0.84375298, + "learning_rate": 1.845731828364681e-06, + "loss": 0.86440372, + "num_input_tokens_seen": 192900190, + "step": 8962, + "time_per_iteration": 4.3128132820129395 + }, + { + "auxiliary_loss_clip": 0.00993714, + "auxiliary_loss_mlp": 0.01002818, + "balance_loss_clip": 1.00664234, + "balance_loss_mlp": 1.00168538, + "epoch": 0.5388847136630092, + "flos": 69807794751360.0, + "grad_norm": 0.7357370006469955, + "language_loss": 0.54174685, + "learning_rate": 1.8453435299227333e-06, + "loss": 0.56171215, + "num_input_tokens_seen": 192958675, + "step": 8963, + "time_per_iteration": 3.097907066345215 + }, + { + "auxiliary_loss_clip": 0.00992604, + "auxiliary_loss_mlp": 0.01002925, + "balance_loss_clip": 1.00561023, + "balance_loss_mlp": 1.00171471, + "epoch": 0.5389448369156772, + "flos": 69822303845760.0, + "grad_norm": 0.8293015757018731, + "language_loss": 0.6337871, + "learning_rate": 1.8449552373453744e-06, + "loss": 0.65374237, + "num_input_tokens_seen": 193033135, + "step": 8964, + "time_per_iteration": 3.27709698677063 + }, + { + "auxiliary_loss_clip": 0.01010401, + "auxiliary_loss_mlp": 0.01028167, + "balance_loss_clip": 1.02253366, + "balance_loss_mlp": 1.01657414, + "epoch": 0.5390049601683451, + "flos": 31722444933120.0, + "grad_norm": 1.4953405380366716, + "language_loss": 0.70050699, + "learning_rate": 1.8445669506473287e-06, + "loss": 0.72089267, + "num_input_tokens_seen": 193055570, + "step": 8965, + "time_per_iteration": 2.895399332046509 + }, + { + "auxiliary_loss_clip": 0.01040607, + "auxiliary_loss_mlp": 0.00747408, + "balance_loss_clip": 1.02533138, + "balance_loss_mlp": 1.0002476, + "epoch": 0.5390650834210131, + "flos": 18113486446080.0, + "grad_norm": 2.0179030260488497, + "language_loss": 0.82403809, + "learning_rate": 1.8441786698433192e-06, + "loss": 0.84191823, + "num_input_tokens_seen": 193073120, + "step": 8966, + "time_per_iteration": 2.632143020629883 + }, + { + "auxiliary_loss_clip": 0.01067982, + "auxiliary_loss_mlp": 0.01030703, + "balance_loss_clip": 1.02759981, + "balance_loss_mlp": 1.01963401, + "epoch": 0.539125206673681, + "flos": 17416711445760.0, + "grad_norm": 2.0679415455246146, + "language_loss": 0.72024179, + "learning_rate": 1.8437903949480706e-06, + "loss": 0.74122858, + "num_input_tokens_seen": 193090105, + "step": 8967, + "time_per_iteration": 2.5777482986450195 + }, + { + "auxiliary_loss_clip": 0.01044537, + "auxiliary_loss_mlp": 0.01024932, + "balance_loss_clip": 1.02419806, + "balance_loss_mlp": 1.01457286, + "epoch": 0.539185329926349, + "flos": 22198935450240.0, + "grad_norm": 1.9903694245385428, + "language_loss": 0.81636083, + "learning_rate": 1.8434021259763065e-06, + "loss": 0.83705544, + "num_input_tokens_seen": 193109325, + "step": 8968, + "time_per_iteration": 2.7033350467681885 + }, + { + "auxiliary_loss_clip": 0.01039184, + "auxiliary_loss_mlp": 0.01031259, + "balance_loss_clip": 1.02631521, + "balance_loss_mlp": 1.01989245, + "epoch": 0.539245453179017, + "flos": 21434397442560.0, + "grad_norm": 1.515077405247529, + "language_loss": 0.74057281, + "learning_rate": 1.8430138629427484e-06, + "loss": 0.76127726, + "num_input_tokens_seen": 193130595, + "step": 8969, + "time_per_iteration": 2.761367082595825 + }, + { + "auxiliary_loss_clip": 0.0103075, + "auxiliary_loss_mlp": 0.00747612, + "balance_loss_clip": 1.02233851, + "balance_loss_mlp": 1.00024056, + "epoch": 0.539305576431685, + "flos": 20735000749440.0, + "grad_norm": 2.157383554090014, + "language_loss": 0.82036471, + "learning_rate": 1.8426256058621205e-06, + "loss": 0.83814836, + "num_input_tokens_seen": 193148930, + "step": 8970, + "time_per_iteration": 2.638489007949829 + }, + { + "auxiliary_loss_clip": 0.01046741, + "auxiliary_loss_mlp": 0.01027894, + "balance_loss_clip": 1.02725649, + "balance_loss_mlp": 1.01718891, + "epoch": 0.5393656996843529, + "flos": 30920452018560.0, + "grad_norm": 1.3361912188921865, + "language_loss": 0.75159752, + "learning_rate": 1.842237354749146e-06, + "loss": 0.77234393, + "num_input_tokens_seen": 193170140, + "step": 8971, + "time_per_iteration": 2.788661241531372 + }, + { + "auxiliary_loss_clip": 0.00999601, + "auxiliary_loss_mlp": 0.01006976, + "balance_loss_clip": 1.00308812, + "balance_loss_mlp": 1.00581408, + "epoch": 0.5394258229370209, + "flos": 50317781351040.0, + "grad_norm": 0.8852194569483788, + "language_loss": 0.60374779, + "learning_rate": 1.8418491096185465e-06, + "loss": 0.62381351, + "num_input_tokens_seen": 193227235, + "step": 8972, + "time_per_iteration": 4.706791400909424 + }, + { + "auxiliary_loss_clip": 0.01055636, + "auxiliary_loss_mlp": 0.01037485, + "balance_loss_clip": 1.02489567, + "balance_loss_mlp": 1.02623773, + "epoch": 0.5394859461896888, + "flos": 25411935012480.0, + "grad_norm": 1.643202583902409, + "language_loss": 0.78245783, + "learning_rate": 1.841460870485045e-06, + "loss": 0.80338907, + "num_input_tokens_seen": 193248435, + "step": 8973, + "time_per_iteration": 2.6465752124786377 + }, + { + "auxiliary_loss_clip": 0.01063053, + "auxiliary_loss_mlp": 0.0103273, + "balance_loss_clip": 1.02709317, + "balance_loss_mlp": 1.01972449, + "epoch": 0.5395460694423568, + "flos": 25478476957440.0, + "grad_norm": 1.9407017369375597, + "language_loss": 0.73808599, + "learning_rate": 1.8410726373633623e-06, + "loss": 0.75904375, + "num_input_tokens_seen": 193267490, + "step": 8974, + "time_per_iteration": 4.245934963226318 + }, + { + "auxiliary_loss_clip": 0.01008661, + "auxiliary_loss_mlp": 0.01002622, + "balance_loss_clip": 1.00227523, + "balance_loss_mlp": 1.00141191, + "epoch": 0.5396061926950249, + "flos": 53249493507840.0, + "grad_norm": 0.77745991191262, + "language_loss": 0.51034546, + "learning_rate": 1.8406844102682215e-06, + "loss": 0.53045833, + "num_input_tokens_seen": 193326050, + "step": 8975, + "time_per_iteration": 3.1247544288635254 + }, + { + "auxiliary_loss_clip": 0.01051767, + "auxiliary_loss_mlp": 0.01038103, + "balance_loss_clip": 1.02419782, + "balance_loss_mlp": 1.02681351, + "epoch": 0.5396663159476928, + "flos": 26725080418560.0, + "grad_norm": 1.431608225736641, + "language_loss": 0.72033083, + "learning_rate": 1.840296189214344e-06, + "loss": 0.74122953, + "num_input_tokens_seen": 193348785, + "step": 8976, + "time_per_iteration": 2.7236547470092773 + }, + { + "auxiliary_loss_clip": 0.01052184, + "auxiliary_loss_mlp": 0.00747524, + "balance_loss_clip": 1.02392793, + "balance_loss_mlp": 1.00033319, + "epoch": 0.5397264392003608, + "flos": 23253380127360.0, + "grad_norm": 1.7307247688868492, + "language_loss": 0.70099783, + "learning_rate": 1.8399079742164509e-06, + "loss": 0.71899492, + "num_input_tokens_seen": 193367080, + "step": 8977, + "time_per_iteration": 2.799685478210449 + }, + { + "auxiliary_loss_clip": 0.01008281, + "auxiliary_loss_mlp": 0.01033767, + "balance_loss_clip": 1.0243144, + "balance_loss_mlp": 1.022192, + "epoch": 0.5397865624530287, + "flos": 18294188791680.0, + "grad_norm": 2.097066084261315, + "language_loss": 0.72518337, + "learning_rate": 1.8395197652892636e-06, + "loss": 0.7456038, + "num_input_tokens_seen": 193383715, + "step": 8978, + "time_per_iteration": 2.848459482192993 + }, + { + "auxiliary_loss_clip": 0.01039082, + "auxiliary_loss_mlp": 0.01034698, + "balance_loss_clip": 1.02751207, + "balance_loss_mlp": 1.02181113, + "epoch": 0.5398466857056967, + "flos": 15297514888320.0, + "grad_norm": 1.9063726978195532, + "language_loss": 0.74050295, + "learning_rate": 1.8391315624475028e-06, + "loss": 0.76124078, + "num_input_tokens_seen": 193400560, + "step": 8979, + "time_per_iteration": 2.754819869995117 + }, + { + "auxiliary_loss_clip": 0.01013715, + "auxiliary_loss_mlp": 0.01047322, + "balance_loss_clip": 1.02544117, + "balance_loss_mlp": 1.0343107, + "epoch": 0.5399068089583646, + "flos": 17821748183040.0, + "grad_norm": 1.975565595365814, + "language_loss": 0.77507764, + "learning_rate": 1.8387433657058892e-06, + "loss": 0.79568797, + "num_input_tokens_seen": 193418680, + "step": 8980, + "time_per_iteration": 2.9765617847442627 + }, + { + "auxiliary_loss_clip": 0.01067075, + "auxiliary_loss_mlp": 0.01031273, + "balance_loss_clip": 1.02550387, + "balance_loss_mlp": 1.02063322, + "epoch": 0.5399669322110326, + "flos": 27381635164800.0, + "grad_norm": 1.9034475884879212, + "language_loss": 0.82120907, + "learning_rate": 1.8383551750791431e-06, + "loss": 0.84219253, + "num_input_tokens_seen": 193439310, + "step": 8981, + "time_per_iteration": 2.634385108947754 + }, + { + "auxiliary_loss_clip": 0.0105785, + "auxiliary_loss_mlp": 0.01032017, + "balance_loss_clip": 1.02512193, + "balance_loss_mlp": 1.0200845, + "epoch": 0.5400270554637006, + "flos": 20449116403200.0, + "grad_norm": 1.8049554070712281, + "language_loss": 0.66545486, + "learning_rate": 1.8379669905819857e-06, + "loss": 0.68635356, + "num_input_tokens_seen": 193458115, + "step": 8982, + "time_per_iteration": 2.705092668533325 + }, + { + "auxiliary_loss_clip": 0.01041094, + "auxiliary_loss_mlp": 0.00747455, + "balance_loss_clip": 1.0286386, + "balance_loss_mlp": 1.00028193, + "epoch": 0.5400871787163686, + "flos": 21689578638720.0, + "grad_norm": 1.5544618217897321, + "language_loss": 0.8264544, + "learning_rate": 1.8375788122291358e-06, + "loss": 0.84433985, + "num_input_tokens_seen": 193477365, + "step": 8983, + "time_per_iteration": 2.772183895111084 + }, + { + "auxiliary_loss_clip": 0.0102106, + "auxiliary_loss_mlp": 0.01033605, + "balance_loss_clip": 1.02348256, + "balance_loss_mlp": 1.02217293, + "epoch": 0.5401473019690365, + "flos": 19204739585280.0, + "grad_norm": 2.2687683282836217, + "language_loss": 0.70924032, + "learning_rate": 1.8371906400353138e-06, + "loss": 0.72978687, + "num_input_tokens_seen": 193495595, + "step": 8984, + "time_per_iteration": 2.752845287322998 + }, + { + "auxiliary_loss_clip": 0.01070423, + "auxiliary_loss_mlp": 0.01030148, + "balance_loss_clip": 1.02692091, + "balance_loss_mlp": 1.01779783, + "epoch": 0.5402074252217045, + "flos": 20627376624000.0, + "grad_norm": 2.219906172149874, + "language_loss": 0.79833841, + "learning_rate": 1.8368024740152386e-06, + "loss": 0.8193441, + "num_input_tokens_seen": 193514035, + "step": 8985, + "time_per_iteration": 2.598538637161255 + }, + { + "auxiliary_loss_clip": 0.01026778, + "auxiliary_loss_mlp": 0.01027234, + "balance_loss_clip": 1.02234352, + "balance_loss_mlp": 1.01606452, + "epoch": 0.5402675484743724, + "flos": 24973465691520.0, + "grad_norm": 1.7405124902397096, + "language_loss": 0.78643519, + "learning_rate": 1.83641431418363e-06, + "loss": 0.80697525, + "num_input_tokens_seen": 193535445, + "step": 8986, + "time_per_iteration": 2.771574020385742 + }, + { + "auxiliary_loss_clip": 0.01050184, + "auxiliary_loss_mlp": 0.01030617, + "balance_loss_clip": 1.02355838, + "balance_loss_mlp": 1.0187856, + "epoch": 0.5403276717270404, + "flos": 19459022941440.0, + "grad_norm": 1.584151256559483, + "language_loss": 0.76833731, + "learning_rate": 1.8360261605552075e-06, + "loss": 0.78914535, + "num_input_tokens_seen": 193554780, + "step": 8987, + "time_per_iteration": 2.553006649017334 + }, + { + "auxiliary_loss_clip": 0.01046063, + "auxiliary_loss_mlp": 0.01031304, + "balance_loss_clip": 1.02517152, + "balance_loss_mlp": 1.0200212, + "epoch": 0.5403877949797083, + "flos": 18442140912000.0, + "grad_norm": 1.6652080929839308, + "language_loss": 0.7109139, + "learning_rate": 1.8356380131446887e-06, + "loss": 0.73168755, + "num_input_tokens_seen": 193573580, + "step": 8988, + "time_per_iteration": 2.595477819442749 + }, + { + "auxiliary_loss_clip": 0.01023102, + "auxiliary_loss_mlp": 0.01033159, + "balance_loss_clip": 1.02328873, + "balance_loss_mlp": 1.02149451, + "epoch": 0.5404479182323764, + "flos": 28292868316800.0, + "grad_norm": 2.4631169314443753, + "language_loss": 0.67198265, + "learning_rate": 1.8352498719667934e-06, + "loss": 0.69254524, + "num_input_tokens_seen": 193590490, + "step": 8989, + "time_per_iteration": 2.7348971366882324 + }, + { + "auxiliary_loss_clip": 0.01057477, + "auxiliary_loss_mlp": 0.01033634, + "balance_loss_clip": 1.02536011, + "balance_loss_mlp": 1.02171862, + "epoch": 0.5405080414850444, + "flos": 23367325046400.0, + "grad_norm": 1.5483033667745882, + "language_loss": 0.77599519, + "learning_rate": 1.8348617370362399e-06, + "loss": 0.79690635, + "num_input_tokens_seen": 193609900, + "step": 8990, + "time_per_iteration": 2.7050933837890625 + }, + { + "auxiliary_loss_clip": 0.01053682, + "auxiliary_loss_mlp": 0.01025801, + "balance_loss_clip": 1.02459514, + "balance_loss_mlp": 1.01569819, + "epoch": 0.5405681647377123, + "flos": 21106425335040.0, + "grad_norm": 1.8058495129415664, + "language_loss": 0.69240898, + "learning_rate": 1.834473608367745e-06, + "loss": 0.71320379, + "num_input_tokens_seen": 193629775, + "step": 8991, + "time_per_iteration": 2.583767890930176 + }, + { + "auxiliary_loss_clip": 0.0100326, + "auxiliary_loss_mlp": 0.0102838, + "balance_loss_clip": 1.01971257, + "balance_loss_mlp": 1.0164237, + "epoch": 0.5406282879903803, + "flos": 20449188230400.0, + "grad_norm": 1.8215548505238206, + "language_loss": 0.76158768, + "learning_rate": 1.8340854859760277e-06, + "loss": 0.7819041, + "num_input_tokens_seen": 193648070, + "step": 8992, + "time_per_iteration": 2.8120691776275635 + }, + { + "auxiliary_loss_clip": 0.01039786, + "auxiliary_loss_mlp": 0.01033683, + "balance_loss_clip": 1.02245522, + "balance_loss_mlp": 1.02094007, + "epoch": 0.5406884112430482, + "flos": 14209493973120.0, + "grad_norm": 2.5556628678671554, + "language_loss": 0.75864279, + "learning_rate": 1.8336973698758056e-06, + "loss": 0.77937746, + "num_input_tokens_seen": 193665060, + "step": 8993, + "time_per_iteration": 2.5909361839294434 + }, + { + "auxiliary_loss_clip": 0.01051241, + "auxiliary_loss_mlp": 0.01026069, + "balance_loss_clip": 1.0234282, + "balance_loss_mlp": 1.0157094, + "epoch": 0.5407485344957162, + "flos": 23875568536320.0, + "grad_norm": 1.6026428042347367, + "language_loss": 0.70674539, + "learning_rate": 1.8333092600817959e-06, + "loss": 0.7275185, + "num_input_tokens_seen": 193683620, + "step": 8994, + "time_per_iteration": 2.6114020347595215 + }, + { + "auxiliary_loss_clip": 0.0105063, + "auxiliary_loss_mlp": 0.01030317, + "balance_loss_clip": 1.02343917, + "balance_loss_mlp": 1.01804399, + "epoch": 0.5408086577483842, + "flos": 23148485435520.0, + "grad_norm": 1.7800287076780237, + "language_loss": 0.75189304, + "learning_rate": 1.8329211566087157e-06, + "loss": 0.77270252, + "num_input_tokens_seen": 193702990, + "step": 8995, + "time_per_iteration": 2.555206060409546 + }, + { + "auxiliary_loss_clip": 0.01054325, + "auxiliary_loss_mlp": 0.01028052, + "balance_loss_clip": 1.02486658, + "balance_loss_mlp": 1.01790142, + "epoch": 0.5408687810010522, + "flos": 18771046773120.0, + "grad_norm": 1.7221110736190135, + "language_loss": 0.73220539, + "learning_rate": 1.832533059471282e-06, + "loss": 0.75302917, + "num_input_tokens_seen": 193721785, + "step": 8996, + "time_per_iteration": 2.5376198291778564 + }, + { + "auxiliary_loss_clip": 0.0102062, + "auxiliary_loss_mlp": 0.01031999, + "balance_loss_clip": 1.02242494, + "balance_loss_mlp": 1.0216459, + "epoch": 0.5409289042537201, + "flos": 13881557779200.0, + "grad_norm": 1.6870313279818823, + "language_loss": 0.73588836, + "learning_rate": 1.8321449686842115e-06, + "loss": 0.75641459, + "num_input_tokens_seen": 193740315, + "step": 8997, + "time_per_iteration": 2.693056106567383 + }, + { + "auxiliary_loss_clip": 0.01065451, + "auxiliary_loss_mlp": 0.0102879, + "balance_loss_clip": 1.02542567, + "balance_loss_mlp": 1.01759005, + "epoch": 0.5409890275063881, + "flos": 14465357527680.0, + "grad_norm": 2.2821805868443934, + "language_loss": 0.72174305, + "learning_rate": 1.8317568842622207e-06, + "loss": 0.74268544, + "num_input_tokens_seen": 193757580, + "step": 8998, + "time_per_iteration": 2.482619047164917 + }, + { + "auxiliary_loss_clip": 0.01033592, + "auxiliary_loss_mlp": 0.01032125, + "balance_loss_clip": 1.02416754, + "balance_loss_mlp": 1.0210917, + "epoch": 0.541049150759056, + "flos": 48977449349760.0, + "grad_norm": 1.4175448711898526, + "language_loss": 0.70521963, + "learning_rate": 1.8313688062200256e-06, + "loss": 0.72587681, + "num_input_tokens_seen": 193780965, + "step": 8999, + "time_per_iteration": 2.8743131160736084 + }, + { + "auxiliary_loss_clip": 0.01043299, + "auxiliary_loss_mlp": 0.01027131, + "balance_loss_clip": 1.02414918, + "balance_loss_mlp": 1.0158422, + "epoch": 0.541109274011724, + "flos": 18147601388160.0, + "grad_norm": 2.1951723867435313, + "language_loss": 0.80833244, + "learning_rate": 1.8309807345723422e-06, + "loss": 0.82903665, + "num_input_tokens_seen": 193797855, + "step": 9000, + "time_per_iteration": 2.729060411453247 + }, + { + "auxiliary_loss_clip": 0.01013737, + "auxiliary_loss_mlp": 0.01025819, + "balance_loss_clip": 1.02140224, + "balance_loss_mlp": 1.01432121, + "epoch": 0.541169397264392, + "flos": 20522553759360.0, + "grad_norm": 1.538099946735505, + "language_loss": 0.73101038, + "learning_rate": 1.8305926693338863e-06, + "loss": 0.75140595, + "num_input_tokens_seen": 193817375, + "step": 9001, + "time_per_iteration": 2.739750862121582 + }, + { + "auxiliary_loss_clip": 0.01030921, + "auxiliary_loss_mlp": 0.0103062, + "balance_loss_clip": 1.02212834, + "balance_loss_mlp": 1.01841879, + "epoch": 0.54122952051706, + "flos": 20044043752320.0, + "grad_norm": 2.5005293129253996, + "language_loss": 0.84649968, + "learning_rate": 1.8302046105193734e-06, + "loss": 0.86711514, + "num_input_tokens_seen": 193832205, + "step": 9002, + "time_per_iteration": 2.660789728164673 + }, + { + "auxiliary_loss_clip": 0.01017143, + "auxiliary_loss_mlp": 0.01029517, + "balance_loss_clip": 1.02321112, + "balance_loss_mlp": 1.01961029, + "epoch": 0.541289643769728, + "flos": 19062246332160.0, + "grad_norm": 2.1014527435176276, + "language_loss": 0.77510178, + "learning_rate": 1.8298165581435183e-06, + "loss": 0.79556841, + "num_input_tokens_seen": 193849830, + "step": 9003, + "time_per_iteration": 2.734071969985962 + }, + { + "auxiliary_loss_clip": 0.01053542, + "auxiliary_loss_mlp": 0.01027077, + "balance_loss_clip": 1.02438331, + "balance_loss_mlp": 1.01546049, + "epoch": 0.5413497670223959, + "flos": 22382295402240.0, + "grad_norm": 1.7872834720606345, + "language_loss": 0.68675447, + "learning_rate": 1.8294285122210372e-06, + "loss": 0.70756066, + "num_input_tokens_seen": 193869945, + "step": 9004, + "time_per_iteration": 2.6346778869628906 + }, + { + "auxiliary_loss_clip": 0.00998472, + "auxiliary_loss_mlp": 0.01000958, + "balance_loss_clip": 1.00204551, + "balance_loss_mlp": 0.99973601, + "epoch": 0.5414098902750639, + "flos": 70031734093440.0, + "grad_norm": 0.9583091541396841, + "language_loss": 0.59078211, + "learning_rate": 1.8290404727666434e-06, + "loss": 0.61077642, + "num_input_tokens_seen": 193930860, + "step": 9005, + "time_per_iteration": 3.2946338653564453 + }, + { + "auxiliary_loss_clip": 0.0106665, + "auxiliary_loss_mlp": 0.00747639, + "balance_loss_clip": 1.02586055, + "balance_loss_mlp": 1.0003562, + "epoch": 0.5414700135277318, + "flos": 21798962530560.0, + "grad_norm": 2.022383401483721, + "language_loss": 0.78284883, + "learning_rate": 1.8286524397950517e-06, + "loss": 0.80099171, + "num_input_tokens_seen": 193949075, + "step": 9006, + "time_per_iteration": 2.664017915725708 + }, + { + "auxiliary_loss_clip": 0.0104541, + "auxiliary_loss_mlp": 0.01030947, + "balance_loss_clip": 1.02595091, + "balance_loss_mlp": 1.02152312, + "epoch": 0.5415301367803999, + "flos": 16907929251840.0, + "grad_norm": 1.6957068621140121, + "language_loss": 0.82952911, + "learning_rate": 1.8282644133209777e-06, + "loss": 0.85029262, + "num_input_tokens_seen": 193967630, + "step": 9007, + "time_per_iteration": 2.643031597137451 + }, + { + "auxiliary_loss_clip": 0.0105704, + "auxiliary_loss_mlp": 0.01031112, + "balance_loss_clip": 1.02694535, + "balance_loss_mlp": 1.019871, + "epoch": 0.5415902600330678, + "flos": 25704176065920.0, + "grad_norm": 1.8963530593980276, + "language_loss": 0.67096388, + "learning_rate": 1.8278763933591334e-06, + "loss": 0.69184542, + "num_input_tokens_seen": 193988730, + "step": 9008, + "time_per_iteration": 4.304261922836304 + }, + { + "auxiliary_loss_clip": 0.01069924, + "auxiliary_loss_mlp": 0.01032621, + "balance_loss_clip": 1.02688968, + "balance_loss_mlp": 1.02065229, + "epoch": 0.5416503832857358, + "flos": 19208151377280.0, + "grad_norm": 2.125289148657509, + "language_loss": 0.73803186, + "learning_rate": 1.827488379924234e-06, + "loss": 0.75905734, + "num_input_tokens_seen": 194005160, + "step": 9009, + "time_per_iteration": 4.067074298858643 + }, + { + "auxiliary_loss_clip": 0.01029987, + "auxiliary_loss_mlp": 0.01036682, + "balance_loss_clip": 1.02886081, + "balance_loss_mlp": 1.0247556, + "epoch": 0.5417105065384037, + "flos": 12713706887040.0, + "grad_norm": 5.565855185272774, + "language_loss": 0.87597388, + "learning_rate": 1.8271003730309923e-06, + "loss": 0.89664054, + "num_input_tokens_seen": 194021700, + "step": 9010, + "time_per_iteration": 2.6186728477478027 + }, + { + "auxiliary_loss_clip": 0.01063236, + "auxiliary_loss_mlp": 0.01031567, + "balance_loss_clip": 1.02379799, + "balance_loss_mlp": 1.02074313, + "epoch": 0.5417706297910717, + "flos": 30335933998080.0, + "grad_norm": 1.8732667102409024, + "language_loss": 0.65246809, + "learning_rate": 1.826712372694122e-06, + "loss": 0.67341614, + "num_input_tokens_seen": 194042620, + "step": 9011, + "time_per_iteration": 2.6006176471710205 + }, + { + "auxiliary_loss_clip": 0.01055485, + "auxiliary_loss_mlp": 0.0103539, + "balance_loss_clip": 1.02582347, + "balance_loss_mlp": 1.02503085, + "epoch": 0.5418307530437396, + "flos": 29020992912000.0, + "grad_norm": 2.2859150853037935, + "language_loss": 0.78915119, + "learning_rate": 1.8263243789283362e-06, + "loss": 0.81005991, + "num_input_tokens_seen": 194061800, + "step": 9012, + "time_per_iteration": 2.592916250228882 + }, + { + "auxiliary_loss_clip": 0.01064413, + "auxiliary_loss_mlp": 0.01028346, + "balance_loss_clip": 1.02430987, + "balance_loss_mlp": 1.01750445, + "epoch": 0.5418908762964076, + "flos": 16873455173760.0, + "grad_norm": 1.9811251652103294, + "language_loss": 0.74447131, + "learning_rate": 1.8259363917483466e-06, + "loss": 0.76539886, + "num_input_tokens_seen": 194079890, + "step": 9013, + "time_per_iteration": 2.664306163787842 + }, + { + "auxiliary_loss_clip": 0.01031893, + "auxiliary_loss_mlp": 0.01029709, + "balance_loss_clip": 1.02544904, + "balance_loss_mlp": 1.01816964, + "epoch": 0.5419509995490756, + "flos": 18949702043520.0, + "grad_norm": 4.884163710503976, + "language_loss": 0.71916127, + "learning_rate": 1.8255484111688667e-06, + "loss": 0.73977727, + "num_input_tokens_seen": 194097625, + "step": 9014, + "time_per_iteration": 2.642871856689453 + }, + { + "auxiliary_loss_clip": 0.01047097, + "auxiliary_loss_mlp": 0.01031755, + "balance_loss_clip": 1.02632213, + "balance_loss_mlp": 1.02035284, + "epoch": 0.5420111228017436, + "flos": 18077719478400.0, + "grad_norm": 1.780543756854263, + "language_loss": 0.8070364, + "learning_rate": 1.8251604372046085e-06, + "loss": 0.82782495, + "num_input_tokens_seen": 194116055, + "step": 9015, + "time_per_iteration": 2.557530403137207 + }, + { + "auxiliary_loss_clip": 0.01062071, + "auxiliary_loss_mlp": 0.01032704, + "balance_loss_clip": 1.02792585, + "balance_loss_mlp": 1.0215286, + "epoch": 0.5420712460544116, + "flos": 19061779455360.0, + "grad_norm": 2.022559100304258, + "language_loss": 0.81184518, + "learning_rate": 1.8247724698702843e-06, + "loss": 0.83279294, + "num_input_tokens_seen": 194130365, + "step": 9016, + "time_per_iteration": 2.5276856422424316 + }, + { + "auxiliary_loss_clip": 0.01067022, + "auxiliary_loss_mlp": 0.01028403, + "balance_loss_clip": 1.02691531, + "balance_loss_mlp": 1.01782274, + "epoch": 0.5421313693070795, + "flos": 18187103370240.0, + "grad_norm": 1.85017685291843, + "language_loss": 0.8139627, + "learning_rate": 1.8243845091806053e-06, + "loss": 0.83491695, + "num_input_tokens_seen": 194148975, + "step": 9017, + "time_per_iteration": 2.615323305130005 + }, + { + "auxiliary_loss_clip": 0.01063905, + "auxiliary_loss_mlp": 0.01028762, + "balance_loss_clip": 1.02444685, + "balance_loss_mlp": 1.01765215, + "epoch": 0.5421914925597475, + "flos": 13005947940480.0, + "grad_norm": 1.72833897678947, + "language_loss": 0.77451342, + "learning_rate": 1.8239965551502837e-06, + "loss": 0.79544008, + "num_input_tokens_seen": 194167185, + "step": 9018, + "time_per_iteration": 2.5977556705474854 + }, + { + "auxiliary_loss_clip": 0.01065766, + "auxiliary_loss_mlp": 0.01033011, + "balance_loss_clip": 1.02334571, + "balance_loss_mlp": 1.02150154, + "epoch": 0.5422516158124154, + "flos": 46758457831680.0, + "grad_norm": 1.4225115353536706, + "language_loss": 0.66347259, + "learning_rate": 1.8236086077940303e-06, + "loss": 0.68446034, + "num_input_tokens_seen": 194192840, + "step": 9019, + "time_per_iteration": 4.386204719543457 + }, + { + "auxiliary_loss_clip": 0.01046588, + "auxiliary_loss_mlp": 0.01026935, + "balance_loss_clip": 1.02301538, + "balance_loss_mlp": 1.01633132, + "epoch": 0.5423117390650835, + "flos": 31758642864000.0, + "grad_norm": 2.0888097064557747, + "language_loss": 0.70095378, + "learning_rate": 1.8232206671265555e-06, + "loss": 0.72168899, + "num_input_tokens_seen": 194213150, + "step": 9020, + "time_per_iteration": 2.7543206214904785 + }, + { + "auxiliary_loss_clip": 0.01029882, + "auxiliary_loss_mlp": 0.01034535, + "balance_loss_clip": 1.0230577, + "balance_loss_mlp": 1.02425361, + "epoch": 0.5423718623177514, + "flos": 27201974313600.0, + "grad_norm": 1.5275924256757571, + "language_loss": 0.80515289, + "learning_rate": 1.8228327331625717e-06, + "loss": 0.82579708, + "num_input_tokens_seen": 194234665, + "step": 9021, + "time_per_iteration": 2.908769130706787 + }, + { + "auxiliary_loss_clip": 0.01022598, + "auxiliary_loss_mlp": 0.01034603, + "balance_loss_clip": 1.02506065, + "balance_loss_mlp": 1.0235045, + "epoch": 0.5424319855704194, + "flos": 23546447193600.0, + "grad_norm": 1.5353139682866295, + "language_loss": 0.78781158, + "learning_rate": 1.822444805916788e-06, + "loss": 0.80838358, + "num_input_tokens_seen": 194253790, + "step": 9022, + "time_per_iteration": 4.479204177856445 + }, + { + "auxiliary_loss_clip": 0.01039871, + "auxiliary_loss_mlp": 0.00747603, + "balance_loss_clip": 1.02453208, + "balance_loss_mlp": 1.00025058, + "epoch": 0.5424921088230873, + "flos": 26615624699520.0, + "grad_norm": 2.259588750389444, + "language_loss": 0.82191819, + "learning_rate": 1.822056885403915e-06, + "loss": 0.83979291, + "num_input_tokens_seen": 194274950, + "step": 9023, + "time_per_iteration": 2.7761523723602295 + }, + { + "auxiliary_loss_clip": 0.01054761, + "auxiliary_loss_mlp": 0.01026463, + "balance_loss_clip": 1.02493668, + "balance_loss_mlp": 1.01596701, + "epoch": 0.5425522320757553, + "flos": 23586811102080.0, + "grad_norm": 1.708690403533341, + "language_loss": 0.71105456, + "learning_rate": 1.8216689716386627e-06, + "loss": 0.73186678, + "num_input_tokens_seen": 194296155, + "step": 9024, + "time_per_iteration": 2.784604072570801 + }, + { + "auxiliary_loss_clip": 0.01056055, + "auxiliary_loss_mlp": 0.01029563, + "balance_loss_clip": 1.0245378, + "balance_loss_mlp": 1.01904273, + "epoch": 0.5426123553284232, + "flos": 30592264429440.0, + "grad_norm": 1.8970723882816072, + "language_loss": 0.65526187, + "learning_rate": 1.8212810646357405e-06, + "loss": 0.67611808, + "num_input_tokens_seen": 194318025, + "step": 9025, + "time_per_iteration": 2.810575485229492 + }, + { + "auxiliary_loss_clip": 0.01045347, + "auxiliary_loss_mlp": 0.00747563, + "balance_loss_clip": 1.03187823, + "balance_loss_mlp": 1.00031304, + "epoch": 0.5426724785810912, + "flos": 12495118671360.0, + "grad_norm": 1.8224543613142166, + "language_loss": 0.73921371, + "learning_rate": 1.8208931644098591e-06, + "loss": 0.75714278, + "num_input_tokens_seen": 194336150, + "step": 9026, + "time_per_iteration": 2.925565719604492 + }, + { + "auxiliary_loss_clip": 0.01041939, + "auxiliary_loss_mlp": 0.01033935, + "balance_loss_clip": 1.02200556, + "balance_loss_mlp": 1.02101862, + "epoch": 0.5427326018337592, + "flos": 26064611089920.0, + "grad_norm": 1.8687889080485873, + "language_loss": 0.78829932, + "learning_rate": 1.8205052709757265e-06, + "loss": 0.80905807, + "num_input_tokens_seen": 194355980, + "step": 9027, + "time_per_iteration": 2.7762694358825684 + }, + { + "auxiliary_loss_clip": 0.00982649, + "auxiliary_loss_mlp": 0.01004175, + "balance_loss_clip": 1.00498402, + "balance_loss_mlp": 1.00281608, + "epoch": 0.5427927250864272, + "flos": 65984745576960.0, + "grad_norm": 0.7464621384951108, + "language_loss": 0.56562531, + "learning_rate": 1.8201173843480515e-06, + "loss": 0.58549356, + "num_input_tokens_seen": 194422660, + "step": 9028, + "time_per_iteration": 3.2507894039154053 + }, + { + "auxiliary_loss_clip": 0.01030762, + "auxiliary_loss_mlp": 0.01028388, + "balance_loss_clip": 1.02739608, + "balance_loss_mlp": 1.01678872, + "epoch": 0.5428528483390952, + "flos": 19975382904960.0, + "grad_norm": 2.047867004714994, + "language_loss": 0.77742249, + "learning_rate": 1.8197295045415442e-06, + "loss": 0.79801393, + "num_input_tokens_seen": 194438545, + "step": 9029, + "time_per_iteration": 2.7503936290740967 + }, + { + "auxiliary_loss_clip": 0.01017485, + "auxiliary_loss_mlp": 0.01026918, + "balance_loss_clip": 1.02252591, + "balance_loss_mlp": 1.01502061, + "epoch": 0.5429129715917631, + "flos": 21832323287040.0, + "grad_norm": 1.487398615914586, + "language_loss": 0.83280563, + "learning_rate": 1.8193416315709112e-06, + "loss": 0.85324967, + "num_input_tokens_seen": 194458060, + "step": 9030, + "time_per_iteration": 2.7839927673339844 + }, + { + "auxiliary_loss_clip": 0.01064658, + "auxiliary_loss_mlp": 0.01031976, + "balance_loss_clip": 1.02500761, + "balance_loss_mlp": 1.02097297, + "epoch": 0.5429730948444311, + "flos": 27782685492480.0, + "grad_norm": 1.799249541140334, + "language_loss": 0.75236374, + "learning_rate": 1.8189537654508623e-06, + "loss": 0.77333009, + "num_input_tokens_seen": 194477405, + "step": 9031, + "time_per_iteration": 2.636230707168579 + }, + { + "auxiliary_loss_clip": 0.01046799, + "auxiliary_loss_mlp": 0.01029905, + "balance_loss_clip": 1.02336013, + "balance_loss_mlp": 1.01889658, + "epoch": 0.543033218097099, + "flos": 26760452336640.0, + "grad_norm": 1.7479769911771834, + "language_loss": 0.85442698, + "learning_rate": 1.8185659061961045e-06, + "loss": 0.87519407, + "num_input_tokens_seen": 194497085, + "step": 9032, + "time_per_iteration": 2.654528856277466 + }, + { + "auxiliary_loss_clip": 0.01048889, + "auxiliary_loss_mlp": 0.01031771, + "balance_loss_clip": 1.02606106, + "balance_loss_mlp": 1.0203867, + "epoch": 0.5430933413497671, + "flos": 22675254727680.0, + "grad_norm": 2.10647503570917, + "language_loss": 0.74081039, + "learning_rate": 1.8181780538213457e-06, + "loss": 0.76161706, + "num_input_tokens_seen": 194516785, + "step": 9033, + "time_per_iteration": 2.680485725402832 + }, + { + "auxiliary_loss_clip": 0.0103651, + "auxiliary_loss_mlp": 0.01032528, + "balance_loss_clip": 1.02559876, + "balance_loss_mlp": 1.02074432, + "epoch": 0.543153464602435, + "flos": 24607499973120.0, + "grad_norm": 1.59526834550975, + "language_loss": 0.75374532, + "learning_rate": 1.8177902083412935e-06, + "loss": 0.7744357, + "num_input_tokens_seen": 194536475, + "step": 9034, + "time_per_iteration": 2.6576366424560547 + }, + { + "auxiliary_loss_clip": 0.0103601, + "auxiliary_loss_mlp": 0.01024992, + "balance_loss_clip": 1.0274595, + "balance_loss_mlp": 1.01428664, + "epoch": 0.543213587855103, + "flos": 19025725178880.0, + "grad_norm": 1.8969870052949214, + "language_loss": 0.84520233, + "learning_rate": 1.817402369770655e-06, + "loss": 0.8658123, + "num_input_tokens_seen": 194554495, + "step": 9035, + "time_per_iteration": 2.686889171600342 + }, + { + "auxiliary_loss_clip": 0.00980172, + "auxiliary_loss_mlp": 0.01004316, + "balance_loss_clip": 1.00427532, + "balance_loss_mlp": 1.00321949, + "epoch": 0.5432737111077709, + "flos": 65686435125120.0, + "grad_norm": 0.7241324182318468, + "language_loss": 0.55932766, + "learning_rate": 1.8170145381241364e-06, + "loss": 0.57917255, + "num_input_tokens_seen": 194617620, + "step": 9036, + "time_per_iteration": 3.221813678741455 + }, + { + "auxiliary_loss_clip": 0.01018428, + "auxiliary_loss_mlp": 0.01027514, + "balance_loss_clip": 1.02609921, + "balance_loss_mlp": 1.0161953, + "epoch": 0.5433338343604389, + "flos": 22091670460800.0, + "grad_norm": 1.753622198177211, + "language_loss": 0.75194883, + "learning_rate": 1.8166267134164451e-06, + "loss": 0.77240825, + "num_input_tokens_seen": 194637690, + "step": 9037, + "time_per_iteration": 2.780001401901245 + }, + { + "auxiliary_loss_clip": 0.01041801, + "auxiliary_loss_mlp": 0.01030903, + "balance_loss_clip": 1.02552903, + "balance_loss_mlp": 1.01959658, + "epoch": 0.5433939576131068, + "flos": 34672649616000.0, + "grad_norm": 2.122453827349323, + "language_loss": 0.66905814, + "learning_rate": 1.8162388956622875e-06, + "loss": 0.68978518, + "num_input_tokens_seen": 194659520, + "step": 9038, + "time_per_iteration": 2.7779688835144043 + }, + { + "auxiliary_loss_clip": 0.01050439, + "auxiliary_loss_mlp": 0.01029033, + "balance_loss_clip": 1.022012, + "balance_loss_mlp": 1.01846528, + "epoch": 0.5434540808657748, + "flos": 20303355012480.0, + "grad_norm": 1.9623397019886173, + "language_loss": 0.77974647, + "learning_rate": 1.8158510848763692e-06, + "loss": 0.80054116, + "num_input_tokens_seen": 194677645, + "step": 9039, + "time_per_iteration": 2.616425037384033 + }, + { + "auxiliary_loss_clip": 0.01021148, + "auxiliary_loss_mlp": 0.01033001, + "balance_loss_clip": 1.02340961, + "balance_loss_mlp": 1.02227235, + "epoch": 0.5435142041184428, + "flos": 23112790295040.0, + "grad_norm": 1.7845139351890902, + "language_loss": 0.76458859, + "learning_rate": 1.8154632810733962e-06, + "loss": 0.78513014, + "num_input_tokens_seen": 194697400, + "step": 9040, + "time_per_iteration": 2.6951534748077393 + }, + { + "auxiliary_loss_clip": 0.00989633, + "auxiliary_loss_mlp": 0.01000635, + "balance_loss_clip": 1.00304818, + "balance_loss_mlp": 0.9995262, + "epoch": 0.5435743273711108, + "flos": 64012746954240.0, + "grad_norm": 0.662719244237788, + "language_loss": 0.52404481, + "learning_rate": 1.815075484268074e-06, + "loss": 0.54394746, + "num_input_tokens_seen": 194761205, + "step": 9041, + "time_per_iteration": 3.1976001262664795 + }, + { + "auxiliary_loss_clip": 0.01043884, + "auxiliary_loss_mlp": 0.01031217, + "balance_loss_clip": 1.02498817, + "balance_loss_mlp": 1.01999354, + "epoch": 0.5436344506237788, + "flos": 25118903859840.0, + "grad_norm": 1.7845374758721986, + "language_loss": 0.75865638, + "learning_rate": 1.8146876944751078e-06, + "loss": 0.77940738, + "num_input_tokens_seen": 194782445, + "step": 9042, + "time_per_iteration": 2.713959217071533 + }, + { + "auxiliary_loss_clip": 0.01032674, + "auxiliary_loss_mlp": 0.01028589, + "balance_loss_clip": 1.02395606, + "balance_loss_mlp": 1.01811635, + "epoch": 0.5436945738764467, + "flos": 19572967860480.0, + "grad_norm": 1.5326363364771833, + "language_loss": 0.66957569, + "learning_rate": 1.8142999117092033e-06, + "loss": 0.69018829, + "num_input_tokens_seen": 194800325, + "step": 9043, + "time_per_iteration": 2.6891119480133057 + }, + { + "auxiliary_loss_clip": 0.01026831, + "auxiliary_loss_mlp": 0.01028023, + "balance_loss_clip": 1.02182746, + "balance_loss_mlp": 1.01737714, + "epoch": 0.5437546971291147, + "flos": 21142515525120.0, + "grad_norm": 1.5749968889516015, + "language_loss": 0.84243864, + "learning_rate": 1.8139121359850644e-06, + "loss": 0.86298716, + "num_input_tokens_seen": 194818675, + "step": 9044, + "time_per_iteration": 2.6694462299346924 + }, + { + "auxiliary_loss_clip": 0.01068094, + "auxiliary_loss_mlp": 0.01028978, + "balance_loss_clip": 1.02582395, + "balance_loss_mlp": 1.01738501, + "epoch": 0.5438148203817826, + "flos": 25118688378240.0, + "grad_norm": 1.507788278602945, + "language_loss": 0.61526519, + "learning_rate": 1.8135243673173956e-06, + "loss": 0.63623595, + "num_input_tokens_seen": 194836595, + "step": 9045, + "time_per_iteration": 2.6273744106292725 + }, + { + "auxiliary_loss_clip": 0.01067338, + "auxiliary_loss_mlp": 0.01025838, + "balance_loss_clip": 1.02618194, + "balance_loss_mlp": 1.01506758, + "epoch": 0.5438749436344507, + "flos": 23002939526400.0, + "grad_norm": 1.6714754698640724, + "language_loss": 0.69926381, + "learning_rate": 1.8131366057209023e-06, + "loss": 0.72019553, + "num_input_tokens_seen": 194857520, + "step": 9046, + "time_per_iteration": 2.7259321212768555 + }, + { + "auxiliary_loss_clip": 0.01064848, + "auxiliary_loss_mlp": 0.01029808, + "balance_loss_clip": 1.02553129, + "balance_loss_mlp": 1.01884675, + "epoch": 0.5439350668871186, + "flos": 15487016065920.0, + "grad_norm": 1.4686947639803418, + "language_loss": 0.77377522, + "learning_rate": 1.8127488512102868e-06, + "loss": 0.79472172, + "num_input_tokens_seen": 194876020, + "step": 9047, + "time_per_iteration": 2.5938591957092285 + }, + { + "auxiliary_loss_clip": 0.01039931, + "auxiliary_loss_mlp": 0.01034824, + "balance_loss_clip": 1.02420235, + "balance_loss_mlp": 1.02256298, + "epoch": 0.5439951901397866, + "flos": 17238415311360.0, + "grad_norm": 1.6504852033659225, + "language_loss": 0.72930783, + "learning_rate": 1.8123611038002547e-06, + "loss": 0.75005543, + "num_input_tokens_seen": 194894650, + "step": 9048, + "time_per_iteration": 2.610374927520752 + }, + { + "auxiliary_loss_clip": 0.01012349, + "auxiliary_loss_mlp": 0.01037969, + "balance_loss_clip": 1.02077365, + "balance_loss_mlp": 1.02495766, + "epoch": 0.5440553133924545, + "flos": 18661016436480.0, + "grad_norm": 2.249190080056461, + "language_loss": 0.93302226, + "learning_rate": 1.8119733635055076e-06, + "loss": 0.95352548, + "num_input_tokens_seen": 194911935, + "step": 9049, + "time_per_iteration": 2.6592135429382324 + }, + { + "auxiliary_loss_clip": 0.01053921, + "auxiliary_loss_mlp": 0.0102758, + "balance_loss_clip": 1.02468669, + "balance_loss_mlp": 1.01738715, + "epoch": 0.5441154366451225, + "flos": 27122934435840.0, + "grad_norm": 1.740991883315544, + "language_loss": 0.73763156, + "learning_rate": 1.8115856303407492e-06, + "loss": 0.75844657, + "num_input_tokens_seen": 194931620, + "step": 9050, + "time_per_iteration": 2.706857204437256 + }, + { + "auxiliary_loss_clip": 0.01057321, + "auxiliary_loss_mlp": 0.01025976, + "balance_loss_clip": 1.02628541, + "balance_loss_mlp": 1.01485395, + "epoch": 0.5441755598977904, + "flos": 25993867253760.0, + "grad_norm": 2.3004746439093458, + "language_loss": 0.66867501, + "learning_rate": 1.8111979043206832e-06, + "loss": 0.68950796, + "num_input_tokens_seen": 194952560, + "step": 9051, + "time_per_iteration": 2.6641995906829834 + }, + { + "auxiliary_loss_clip": 0.01029419, + "auxiliary_loss_mlp": 0.0102994, + "balance_loss_clip": 1.02277017, + "balance_loss_mlp": 1.01922321, + "epoch": 0.5442356831504584, + "flos": 32380041173760.0, + "grad_norm": 1.5928720610348102, + "language_loss": 0.67567229, + "learning_rate": 1.810810185460011e-06, + "loss": 0.69626594, + "num_input_tokens_seen": 194973915, + "step": 9052, + "time_per_iteration": 2.7018308639526367 + }, + { + "auxiliary_loss_clip": 0.0106737, + "auxiliary_loss_mlp": 0.01031525, + "balance_loss_clip": 1.02577198, + "balance_loss_mlp": 1.0205934, + "epoch": 0.5442958064031264, + "flos": 24164290056960.0, + "grad_norm": 1.8009384105128412, + "language_loss": 0.92785418, + "learning_rate": 1.810422473773436e-06, + "loss": 0.94884318, + "num_input_tokens_seen": 194990170, + "step": 9053, + "time_per_iteration": 2.4928717613220215 + }, + { + "auxiliary_loss_clip": 0.01039759, + "auxiliary_loss_mlp": 0.01035173, + "balance_loss_clip": 1.02481771, + "balance_loss_mlp": 1.02405095, + "epoch": 0.5443559296557944, + "flos": 18764690065920.0, + "grad_norm": 2.470513459752567, + "language_loss": 0.83507097, + "learning_rate": 1.8100347692756595e-06, + "loss": 0.85582036, + "num_input_tokens_seen": 195006395, + "step": 9054, + "time_per_iteration": 2.6490225791931152 + }, + { + "auxiliary_loss_clip": 0.01034069, + "auxiliary_loss_mlp": 0.01028969, + "balance_loss_clip": 1.02397525, + "balance_loss_mlp": 1.01732266, + "epoch": 0.5444160529084624, + "flos": 22632556435200.0, + "grad_norm": 1.9717389535266232, + "language_loss": 0.68161917, + "learning_rate": 1.8096470719813836e-06, + "loss": 0.70224953, + "num_input_tokens_seen": 195025080, + "step": 9055, + "time_per_iteration": 4.311201333999634 + }, + { + "auxiliary_loss_clip": 0.00968734, + "auxiliary_loss_mlp": 0.01001912, + "balance_loss_clip": 1.00208235, + "balance_loss_mlp": 1.00073147, + "epoch": 0.5444761761611303, + "flos": 69671909600640.0, + "grad_norm": 0.7334593145175236, + "language_loss": 0.57732671, + "learning_rate": 1.80925938190531e-06, + "loss": 0.5970332, + "num_input_tokens_seen": 195085725, + "step": 9056, + "time_per_iteration": 3.2173237800598145 + }, + { + "auxiliary_loss_clip": 0.0103881, + "auxiliary_loss_mlp": 0.01030213, + "balance_loss_clip": 1.02634144, + "balance_loss_mlp": 1.01864409, + "epoch": 0.5445362994137983, + "flos": 14278442129280.0, + "grad_norm": 2.056308326153913, + "language_loss": 0.69509709, + "learning_rate": 1.8088716990621395e-06, + "loss": 0.71578729, + "num_input_tokens_seen": 195102585, + "step": 9057, + "time_per_iteration": 4.241285085678101 + }, + { + "auxiliary_loss_clip": 0.01055232, + "auxiliary_loss_mlp": 0.01033311, + "balance_loss_clip": 1.02493525, + "balance_loss_mlp": 1.02198005, + "epoch": 0.5445964226664662, + "flos": 28986195611520.0, + "grad_norm": 2.0232374749051845, + "language_loss": 0.75233167, + "learning_rate": 1.8084840234665738e-06, + "loss": 0.77321708, + "num_input_tokens_seen": 195120055, + "step": 9058, + "time_per_iteration": 2.753404378890991 + }, + { + "auxiliary_loss_clip": 0.0098112, + "auxiliary_loss_mlp": 0.01005044, + "balance_loss_clip": 1.00336611, + "balance_loss_mlp": 1.00365496, + "epoch": 0.5446565459191343, + "flos": 68620230270720.0, + "grad_norm": 0.7992986047038452, + "language_loss": 0.62652117, + "learning_rate": 1.808096355133312e-06, + "loss": 0.64638281, + "num_input_tokens_seen": 195181045, + "step": 9059, + "time_per_iteration": 3.4419686794281006 + }, + { + "auxiliary_loss_clip": 0.01054388, + "auxiliary_loss_mlp": 0.01032321, + "balance_loss_clip": 1.02465725, + "balance_loss_mlp": 1.02160406, + "epoch": 0.5447166691718022, + "flos": 16216469464320.0, + "grad_norm": 1.6323031904638456, + "language_loss": 0.79282904, + "learning_rate": 1.8077086940770572e-06, + "loss": 0.81369615, + "num_input_tokens_seen": 195198840, + "step": 9060, + "time_per_iteration": 2.6785783767700195 + }, + { + "auxiliary_loss_clip": 0.01058, + "auxiliary_loss_mlp": 0.01031202, + "balance_loss_clip": 1.02576137, + "balance_loss_mlp": 1.02004421, + "epoch": 0.5447767924244702, + "flos": 25849039616640.0, + "grad_norm": 1.6324017664831625, + "language_loss": 0.79564381, + "learning_rate": 1.8073210403125072e-06, + "loss": 0.81653577, + "num_input_tokens_seen": 195218720, + "step": 9061, + "time_per_iteration": 2.799065589904785 + }, + { + "auxiliary_loss_clip": 0.01054826, + "auxiliary_loss_mlp": 0.01026911, + "balance_loss_clip": 1.02508998, + "balance_loss_mlp": 1.01624787, + "epoch": 0.5448369156771381, + "flos": 19677718897920.0, + "grad_norm": 1.6914732796695475, + "language_loss": 0.86866963, + "learning_rate": 1.8069333938543627e-06, + "loss": 0.88948703, + "num_input_tokens_seen": 195235770, + "step": 9062, + "time_per_iteration": 2.6749870777130127 + }, + { + "auxiliary_loss_clip": 0.01036212, + "auxiliary_loss_mlp": 0.01035349, + "balance_loss_clip": 1.02259398, + "balance_loss_mlp": 1.02197337, + "epoch": 0.5448970389298061, + "flos": 19281804215040.0, + "grad_norm": 3.8180878924945625, + "language_loss": 0.82189727, + "learning_rate": 1.8065457547173233e-06, + "loss": 0.84261286, + "num_input_tokens_seen": 195254870, + "step": 9063, + "time_per_iteration": 2.7463645935058594 + }, + { + "auxiliary_loss_clip": 0.01065138, + "auxiliary_loss_mlp": 0.01028691, + "balance_loss_clip": 1.02456546, + "balance_loss_mlp": 1.01707387, + "epoch": 0.544957162182474, + "flos": 20991690316800.0, + "grad_norm": 2.429927681864382, + "language_loss": 0.63670635, + "learning_rate": 1.8061581229160878e-06, + "loss": 0.65764463, + "num_input_tokens_seen": 195273390, + "step": 9064, + "time_per_iteration": 2.5927634239196777 + }, + { + "auxiliary_loss_clip": 0.01067412, + "auxiliary_loss_mlp": 0.01031889, + "balance_loss_clip": 1.02587926, + "balance_loss_mlp": 1.02027178, + "epoch": 0.545017285435142, + "flos": 25374587846400.0, + "grad_norm": 1.5438141662849574, + "language_loss": 0.79897404, + "learning_rate": 1.8057704984653566e-06, + "loss": 0.81996703, + "num_input_tokens_seen": 195295635, + "step": 9065, + "time_per_iteration": 4.219212532043457 + }, + { + "auxiliary_loss_clip": 0.01033131, + "auxiliary_loss_mlp": 0.0103172, + "balance_loss_clip": 1.02508581, + "balance_loss_mlp": 1.02156353, + "epoch": 0.54507740868781, + "flos": 19134749934720.0, + "grad_norm": 2.057602271433344, + "language_loss": 0.78113097, + "learning_rate": 1.805382881379827e-06, + "loss": 0.80177945, + "num_input_tokens_seen": 195312545, + "step": 9066, + "time_per_iteration": 2.7284328937530518 + }, + { + "auxiliary_loss_clip": 0.01051028, + "auxiliary_loss_mlp": 0.01028848, + "balance_loss_clip": 1.02348435, + "balance_loss_mlp": 1.01724911, + "epoch": 0.545137531940478, + "flos": 26249802635520.0, + "grad_norm": 2.590237669066119, + "language_loss": 0.75851858, + "learning_rate": 1.8049952716741975e-06, + "loss": 0.77931738, + "num_input_tokens_seen": 195332955, + "step": 9067, + "time_per_iteration": 2.7204689979553223 + }, + { + "auxiliary_loss_clip": 0.01045686, + "auxiliary_loss_mlp": 0.01036408, + "balance_loss_clip": 1.03126431, + "balance_loss_mlp": 1.0226748, + "epoch": 0.545197655193146, + "flos": 37555629995520.0, + "grad_norm": 1.8069731971426564, + "language_loss": 0.63551772, + "learning_rate": 1.8046076693631682e-06, + "loss": 0.65633863, + "num_input_tokens_seen": 195355930, + "step": 9068, + "time_per_iteration": 2.8628766536712646 + }, + { + "auxiliary_loss_clip": 0.01035268, + "auxiliary_loss_mlp": 0.01034622, + "balance_loss_clip": 1.03014123, + "balance_loss_mlp": 1.02341604, + "epoch": 0.5452577784458139, + "flos": 26031250333440.0, + "grad_norm": 1.502294880145507, + "language_loss": 0.72243237, + "learning_rate": 1.8042200744614343e-06, + "loss": 0.74313128, + "num_input_tokens_seen": 195376445, + "step": 9069, + "time_per_iteration": 4.334020376205444 + }, + { + "auxiliary_loss_clip": 0.0106494, + "auxiliary_loss_mlp": 0.01026267, + "balance_loss_clip": 1.02688122, + "balance_loss_mlp": 1.01646829, + "epoch": 0.5453179016984819, + "flos": 17639034675840.0, + "grad_norm": 2.453029231626763, + "language_loss": 0.74024153, + "learning_rate": 1.8038324869836957e-06, + "loss": 0.76115358, + "num_input_tokens_seen": 195393725, + "step": 9070, + "time_per_iteration": 2.517259359359741 + }, + { + "auxiliary_loss_clip": 0.01055532, + "auxiliary_loss_mlp": 0.0103148, + "balance_loss_clip": 1.02551496, + "balance_loss_mlp": 1.02033997, + "epoch": 0.5453780249511498, + "flos": 23216679406080.0, + "grad_norm": 1.9988371594516645, + "language_loss": 0.60144556, + "learning_rate": 1.8034449069446489e-06, + "loss": 0.62231565, + "num_input_tokens_seen": 195411380, + "step": 9071, + "time_per_iteration": 2.6530075073242188 + }, + { + "auxiliary_loss_clip": 0.01008303, + "auxiliary_loss_mlp": 0.01004328, + "balance_loss_clip": 1.00211573, + "balance_loss_mlp": 1.00307655, + "epoch": 0.5454381482038179, + "flos": 68696504801280.0, + "grad_norm": 0.7203344920011572, + "language_loss": 0.57199639, + "learning_rate": 1.80305733435899e-06, + "loss": 0.59212267, + "num_input_tokens_seen": 195482015, + "step": 9072, + "time_per_iteration": 3.2727465629577637 + }, + { + "auxiliary_loss_clip": 0.01035606, + "auxiliary_loss_mlp": 0.01033251, + "balance_loss_clip": 1.02219176, + "balance_loss_mlp": 1.02145565, + "epoch": 0.5454982714564858, + "flos": 13260626346240.0, + "grad_norm": 2.593528092263596, + "language_loss": 0.70059276, + "learning_rate": 1.8026697692414174e-06, + "loss": 0.72128129, + "num_input_tokens_seen": 195500440, + "step": 9073, + "time_per_iteration": 2.5894620418548584 + }, + { + "auxiliary_loss_clip": 0.01042582, + "auxiliary_loss_mlp": 0.01034178, + "balance_loss_clip": 1.02367449, + "balance_loss_mlp": 1.02378869, + "epoch": 0.5455583947091538, + "flos": 21835878733440.0, + "grad_norm": 1.6876973581307992, + "language_loss": 0.70963639, + "learning_rate": 1.802282211606627e-06, + "loss": 0.73040402, + "num_input_tokens_seen": 195520860, + "step": 9074, + "time_per_iteration": 2.8393454551696777 + }, + { + "auxiliary_loss_clip": 0.01054218, + "auxiliary_loss_mlp": 0.01038366, + "balance_loss_clip": 1.02370644, + "balance_loss_mlp": 1.02767873, + "epoch": 0.5456185179618217, + "flos": 17817438551040.0, + "grad_norm": 2.3151696479990416, + "language_loss": 0.68483758, + "learning_rate": 1.8018946614693148e-06, + "loss": 0.70576346, + "num_input_tokens_seen": 195538615, + "step": 9075, + "time_per_iteration": 2.541287660598755 + }, + { + "auxiliary_loss_clip": 0.01055606, + "auxiliary_loss_mlp": 0.01029389, + "balance_loss_clip": 1.02679837, + "balance_loss_mlp": 1.01950705, + "epoch": 0.5456786412144897, + "flos": 21069401391360.0, + "grad_norm": 1.6677313358608405, + "language_loss": 0.81052446, + "learning_rate": 1.8015071188441768e-06, + "loss": 0.83137441, + "num_input_tokens_seen": 195557460, + "step": 9076, + "time_per_iteration": 2.685037136077881 + }, + { + "auxiliary_loss_clip": 0.01053345, + "auxiliary_loss_mlp": 0.01032122, + "balance_loss_clip": 1.02303696, + "balance_loss_mlp": 1.02145243, + "epoch": 0.5457387644671576, + "flos": 23294965098240.0, + "grad_norm": 1.5668029325479127, + "language_loss": 0.80255783, + "learning_rate": 1.8011195837459089e-06, + "loss": 0.82341254, + "num_input_tokens_seen": 195577985, + "step": 9077, + "time_per_iteration": 2.797541379928589 + }, + { + "auxiliary_loss_clip": 0.01048362, + "auxiliary_loss_mlp": 0.01028115, + "balance_loss_clip": 1.02336621, + "balance_loss_mlp": 1.01772606, + "epoch": 0.5457988877198257, + "flos": 21617039122560.0, + "grad_norm": 2.7089880556037063, + "language_loss": 0.68196321, + "learning_rate": 1.8007320561892064e-06, + "loss": 0.70272803, + "num_input_tokens_seen": 195597620, + "step": 9078, + "time_per_iteration": 2.744990825653076 + }, + { + "auxiliary_loss_clip": 0.01058063, + "auxiliary_loss_mlp": 0.01032169, + "balance_loss_clip": 1.02515078, + "balance_loss_mlp": 1.02062941, + "epoch": 0.5458590109724936, + "flos": 23762485543680.0, + "grad_norm": 1.6830427427088162, + "language_loss": 0.8071841, + "learning_rate": 1.800344536188764e-06, + "loss": 0.8280865, + "num_input_tokens_seen": 195615910, + "step": 9079, + "time_per_iteration": 2.657158613204956 + }, + { + "auxiliary_loss_clip": 0.01070346, + "auxiliary_loss_mlp": 0.01032747, + "balance_loss_clip": 1.02671242, + "balance_loss_mlp": 1.02053428, + "epoch": 0.5459191342251616, + "flos": 24424283675520.0, + "grad_norm": 1.5381356280222835, + "language_loss": 0.75702035, + "learning_rate": 1.799957023759277e-06, + "loss": 0.77805132, + "num_input_tokens_seen": 195635620, + "step": 9080, + "time_per_iteration": 2.615584135055542 + }, + { + "auxiliary_loss_clip": 0.01031748, + "auxiliary_loss_mlp": 0.01033267, + "balance_loss_clip": 1.02336204, + "balance_loss_mlp": 1.02114379, + "epoch": 0.5459792574778296, + "flos": 23623009032960.0, + "grad_norm": 1.980522008061127, + "language_loss": 0.83197254, + "learning_rate": 1.7995695189154392e-06, + "loss": 0.85262263, + "num_input_tokens_seen": 195652495, + "step": 9081, + "time_per_iteration": 2.7429254055023193 + }, + { + "auxiliary_loss_clip": 0.01069224, + "auxiliary_loss_mlp": 0.01027487, + "balance_loss_clip": 1.02652788, + "balance_loss_mlp": 1.01556027, + "epoch": 0.5460393807304975, + "flos": 19135540033920.0, + "grad_norm": 3.5459105325589615, + "language_loss": 0.69687486, + "learning_rate": 1.7991820216719461e-06, + "loss": 0.71784192, + "num_input_tokens_seen": 195671965, + "step": 9082, + "time_per_iteration": 2.5623419284820557 + }, + { + "auxiliary_loss_clip": 0.01062326, + "auxiliary_loss_mlp": 0.01023011, + "balance_loss_clip": 1.02331412, + "balance_loss_mlp": 1.01198375, + "epoch": 0.5460995039831655, + "flos": 35918534805120.0, + "grad_norm": 1.5013316704227213, + "language_loss": 0.66182232, + "learning_rate": 1.7987945320434906e-06, + "loss": 0.68267566, + "num_input_tokens_seen": 195694725, + "step": 9083, + "time_per_iteration": 2.8697807788848877 + }, + { + "auxiliary_loss_clip": 0.01044954, + "auxiliary_loss_mlp": 0.01027295, + "balance_loss_clip": 1.02492118, + "balance_loss_mlp": 1.01660156, + "epoch": 0.5461596272358334, + "flos": 26759231274240.0, + "grad_norm": 1.5853432323690195, + "language_loss": 0.78838277, + "learning_rate": 1.798407050044766e-06, + "loss": 0.80910528, + "num_input_tokens_seen": 195714090, + "step": 9084, + "time_per_iteration": 2.739305257797241 + }, + { + "auxiliary_loss_clip": 0.0105833, + "auxiliary_loss_mlp": 0.01032893, + "balance_loss_clip": 1.026618, + "balance_loss_mlp": 1.0216341, + "epoch": 0.5462197504885015, + "flos": 20886580143360.0, + "grad_norm": 1.9839656577456917, + "language_loss": 0.75223815, + "learning_rate": 1.7980195756904675e-06, + "loss": 0.77315038, + "num_input_tokens_seen": 195733585, + "step": 9085, + "time_per_iteration": 2.603977680206299 + }, + { + "auxiliary_loss_clip": 0.01042196, + "auxiliary_loss_mlp": 0.01029867, + "balance_loss_clip": 1.02295673, + "balance_loss_mlp": 1.01805902, + "epoch": 0.5462798737411694, + "flos": 25804976607360.0, + "grad_norm": 2.183780050413555, + "language_loss": 0.74936885, + "learning_rate": 1.7976321089952857e-06, + "loss": 0.77008951, + "num_input_tokens_seen": 195752820, + "step": 9086, + "time_per_iteration": 2.874995708465576 + }, + { + "auxiliary_loss_clip": 0.01055449, + "auxiliary_loss_mlp": 0.01030522, + "balance_loss_clip": 1.02454829, + "balance_loss_mlp": 1.01930404, + "epoch": 0.5463399969938374, + "flos": 25775027642880.0, + "grad_norm": 1.6754487602097021, + "language_loss": 0.76954776, + "learning_rate": 1.7972446499739155e-06, + "loss": 0.79040748, + "num_input_tokens_seen": 195773740, + "step": 9087, + "time_per_iteration": 2.7284014225006104 + }, + { + "auxiliary_loss_clip": 0.0106162, + "auxiliary_loss_mlp": 0.01039684, + "balance_loss_clip": 1.02809095, + "balance_loss_mlp": 1.02702379, + "epoch": 0.5464001202465053, + "flos": 18843298980480.0, + "grad_norm": 1.8555406207275023, + "language_loss": 0.77775013, + "learning_rate": 1.7968571986410484e-06, + "loss": 0.79876316, + "num_input_tokens_seen": 195792125, + "step": 9088, + "time_per_iteration": 2.7492525577545166 + }, + { + "auxiliary_loss_clip": 0.00963287, + "auxiliary_loss_mlp": 0.01009001, + "balance_loss_clip": 1.01471853, + "balance_loss_mlp": 1.00788617, + "epoch": 0.5464602434991733, + "flos": 69049541623680.0, + "grad_norm": 0.7561011081567552, + "language_loss": 0.57750267, + "learning_rate": 1.7964697550113758e-06, + "loss": 0.59722555, + "num_input_tokens_seen": 195854935, + "step": 9089, + "time_per_iteration": 3.4187748432159424 + }, + { + "auxiliary_loss_clip": 0.01030007, + "auxiliary_loss_mlp": 0.01035283, + "balance_loss_clip": 1.02296114, + "balance_loss_mlp": 1.02312982, + "epoch": 0.5465203667518412, + "flos": 27560039040000.0, + "grad_norm": 1.7214940755090544, + "language_loss": 0.76590514, + "learning_rate": 1.7960823190995918e-06, + "loss": 0.78655803, + "num_input_tokens_seen": 195874715, + "step": 9090, + "time_per_iteration": 2.969984292984009 + }, + { + "auxiliary_loss_clip": 0.01049545, + "auxiliary_loss_mlp": 0.01031861, + "balance_loss_clip": 1.02354884, + "balance_loss_mlp": 1.01829529, + "epoch": 0.5465804900045093, + "flos": 21210206705280.0, + "grad_norm": 1.842037804530753, + "language_loss": 0.73914444, + "learning_rate": 1.7956948909203855e-06, + "loss": 0.75995851, + "num_input_tokens_seen": 195892610, + "step": 9091, + "time_per_iteration": 2.6865010261535645 + }, + { + "auxiliary_loss_clip": 0.01049505, + "auxiliary_loss_mlp": 0.01033964, + "balance_loss_clip": 1.02696359, + "balance_loss_mlp": 1.02217984, + "epoch": 0.5466406132571772, + "flos": 22488949860480.0, + "grad_norm": 1.6703481187769855, + "language_loss": 0.78207576, + "learning_rate": 1.7953074704884498e-06, + "loss": 0.80291045, + "num_input_tokens_seen": 195911085, + "step": 9092, + "time_per_iteration": 2.858696460723877 + }, + { + "auxiliary_loss_clip": 0.01070102, + "auxiliary_loss_mlp": 0.01028778, + "balance_loss_clip": 1.02740669, + "balance_loss_mlp": 1.01637447, + "epoch": 0.5467007365098452, + "flos": 17675843137920.0, + "grad_norm": 2.1502297753642385, + "language_loss": 0.75005305, + "learning_rate": 1.794920057818476e-06, + "loss": 0.77104187, + "num_input_tokens_seen": 195929845, + "step": 9093, + "time_per_iteration": 2.708040952682495 + }, + { + "auxiliary_loss_clip": 0.01056285, + "auxiliary_loss_mlp": 0.01031206, + "balance_loss_clip": 1.02357602, + "balance_loss_mlp": 1.01836705, + "epoch": 0.5467608597625132, + "flos": 15698852524800.0, + "grad_norm": 2.129859393591302, + "language_loss": 0.68538618, + "learning_rate": 1.7945326529251533e-06, + "loss": 0.70626104, + "num_input_tokens_seen": 195946350, + "step": 9094, + "time_per_iteration": 2.729922294616699 + }, + { + "auxiliary_loss_clip": 0.01050261, + "auxiliary_loss_mlp": 0.01034177, + "balance_loss_clip": 1.02925408, + "balance_loss_mlp": 1.0232271, + "epoch": 0.5468209830151811, + "flos": 24312816794880.0, + "grad_norm": 2.9299307486240953, + "language_loss": 0.67392349, + "learning_rate": 1.7941452558231731e-06, + "loss": 0.69476789, + "num_input_tokens_seen": 195959840, + "step": 9095, + "time_per_iteration": 2.6814544200897217 + }, + { + "auxiliary_loss_clip": 0.01037072, + "auxiliary_loss_mlp": 0.01031523, + "balance_loss_clip": 1.02708793, + "balance_loss_mlp": 1.02086008, + "epoch": 0.5468811062678491, + "flos": 29166323339520.0, + "grad_norm": 1.7235819532944336, + "language_loss": 0.66488093, + "learning_rate": 1.7937578665272256e-06, + "loss": 0.6855669, + "num_input_tokens_seen": 195981125, + "step": 9096, + "time_per_iteration": 2.8433902263641357 + }, + { + "auxiliary_loss_clip": 0.00982209, + "auxiliary_loss_mlp": 0.01002108, + "balance_loss_clip": 1.00503349, + "balance_loss_mlp": 1.00095737, + "epoch": 0.546941229520517, + "flos": 67867037982720.0, + "grad_norm": 0.7418771980064098, + "language_loss": 0.57538319, + "learning_rate": 1.7933704850520007e-06, + "loss": 0.59522629, + "num_input_tokens_seen": 196038880, + "step": 9097, + "time_per_iteration": 3.4291107654571533 + }, + { + "auxiliary_loss_clip": 0.01004573, + "auxiliary_loss_mlp": 0.01004634, + "balance_loss_clip": 1.00721526, + "balance_loss_mlp": 1.00331068, + "epoch": 0.5470013527731851, + "flos": 58270306625280.0, + "grad_norm": 1.9361546425943128, + "language_loss": 0.64743084, + "learning_rate": 1.7929831114121868e-06, + "loss": 0.66752291, + "num_input_tokens_seen": 196099215, + "step": 9098, + "time_per_iteration": 3.2528269290924072 + }, + { + "auxiliary_loss_clip": 0.01060366, + "auxiliary_loss_mlp": 0.01034912, + "balance_loss_clip": 1.0268538, + "balance_loss_mlp": 1.02280045, + "epoch": 0.547061476025853, + "flos": 22965915582720.0, + "grad_norm": 1.4607737325352719, + "language_loss": 0.73115993, + "learning_rate": 1.7925957456224753e-06, + "loss": 0.75211275, + "num_input_tokens_seen": 196120370, + "step": 9099, + "time_per_iteration": 2.719038486480713 + }, + { + "auxiliary_loss_clip": 0.01046757, + "auxiliary_loss_mlp": 0.01028841, + "balance_loss_clip": 1.02680528, + "balance_loss_mlp": 1.0186193, + "epoch": 0.547121599278521, + "flos": 29968244426880.0, + "grad_norm": 1.9939782865521118, + "language_loss": 0.72684681, + "learning_rate": 1.7922083876975537e-06, + "loss": 0.74760282, + "num_input_tokens_seen": 196139075, + "step": 9100, + "time_per_iteration": 2.921520709991455 + }, + { + "auxiliary_loss_clip": 0.01056485, + "auxiliary_loss_mlp": 0.00747599, + "balance_loss_clip": 1.02647471, + "balance_loss_mlp": 1.00033879, + "epoch": 0.5471817225311889, + "flos": 36535443914880.0, + "grad_norm": 2.63230136137111, + "language_loss": 0.67587209, + "learning_rate": 1.7918210376521102e-06, + "loss": 0.69391298, + "num_input_tokens_seen": 196159990, + "step": 9101, + "time_per_iteration": 2.725057363510132 + }, + { + "auxiliary_loss_clip": 0.01066977, + "auxiliary_loss_mlp": 0.01025192, + "balance_loss_clip": 1.02631652, + "balance_loss_mlp": 1.01386142, + "epoch": 0.5472418457838569, + "flos": 25775243124480.0, + "grad_norm": 1.787487942621482, + "language_loss": 0.78138745, + "learning_rate": 1.7914336955008343e-06, + "loss": 0.80230916, + "num_input_tokens_seen": 196180570, + "step": 9102, + "time_per_iteration": 4.320237636566162 + }, + { + "auxiliary_loss_clip": 0.01031718, + "auxiliary_loss_mlp": 0.0103349, + "balance_loss_clip": 1.02401423, + "balance_loss_mlp": 1.0214144, + "epoch": 0.5473019690365248, + "flos": 27887687925120.0, + "grad_norm": 1.6358458321069225, + "language_loss": 0.72427487, + "learning_rate": 1.791046361258413e-06, + "loss": 0.74492705, + "num_input_tokens_seen": 196200300, + "step": 9103, + "time_per_iteration": 2.8816165924072266 + }, + { + "auxiliary_loss_clip": 0.01041426, + "auxiliary_loss_mlp": 0.01028184, + "balance_loss_clip": 1.02761996, + "balance_loss_mlp": 1.01693702, + "epoch": 0.5473620922891929, + "flos": 57631490219520.0, + "grad_norm": 1.3562834593707485, + "language_loss": 0.6544627, + "learning_rate": 1.7906590349395356e-06, + "loss": 0.6751588, + "num_input_tokens_seen": 196228525, + "step": 9104, + "time_per_iteration": 3.113600730895996 + }, + { + "auxiliary_loss_clip": 0.01065892, + "auxiliary_loss_mlp": 0.01032537, + "balance_loss_clip": 1.03237748, + "balance_loss_mlp": 1.02052689, + "epoch": 0.5474222155418608, + "flos": 19354056422400.0, + "grad_norm": 1.8131841774470336, + "language_loss": 0.8165074, + "learning_rate": 1.790271716558888e-06, + "loss": 0.83749175, + "num_input_tokens_seen": 196247690, + "step": 9105, + "time_per_iteration": 4.303205966949463 + }, + { + "auxiliary_loss_clip": 0.01065396, + "auxiliary_loss_mlp": 0.01027684, + "balance_loss_clip": 1.02539408, + "balance_loss_mlp": 1.01705062, + "epoch": 0.5474823387945288, + "flos": 25120448144640.0, + "grad_norm": 1.4183290452176984, + "language_loss": 0.80375588, + "learning_rate": 1.7898844061311575e-06, + "loss": 0.82468677, + "num_input_tokens_seen": 196268555, + "step": 9106, + "time_per_iteration": 2.6143300533294678 + }, + { + "auxiliary_loss_clip": 0.01059972, + "auxiliary_loss_mlp": 0.01030385, + "balance_loss_clip": 1.02853119, + "balance_loss_mlp": 1.01956701, + "epoch": 0.5475424620471967, + "flos": 18004174381440.0, + "grad_norm": 1.7519786734090126, + "language_loss": 0.69239008, + "learning_rate": 1.7894971036710322e-06, + "loss": 0.71329361, + "num_input_tokens_seen": 196285585, + "step": 9107, + "time_per_iteration": 2.63804292678833 + }, + { + "auxiliary_loss_clip": 0.01057869, + "auxiliary_loss_mlp": 0.01027346, + "balance_loss_clip": 1.02479029, + "balance_loss_mlp": 1.01609254, + "epoch": 0.5476025852998647, + "flos": 22309324922880.0, + "grad_norm": 1.9406381352279016, + "language_loss": 0.63266057, + "learning_rate": 1.789109809193197e-06, + "loss": 0.65351272, + "num_input_tokens_seen": 196305085, + "step": 9108, + "time_per_iteration": 2.6551473140716553 + }, + { + "auxiliary_loss_clip": 0.01065177, + "auxiliary_loss_mlp": 0.01026939, + "balance_loss_clip": 1.02546203, + "balance_loss_mlp": 1.01661587, + "epoch": 0.5476627085525327, + "flos": 20120497850880.0, + "grad_norm": 1.8151516850024174, + "language_loss": 0.74776673, + "learning_rate": 1.7887225227123396e-06, + "loss": 0.76868784, + "num_input_tokens_seen": 196323945, + "step": 9109, + "time_per_iteration": 2.606191396713257 + }, + { + "auxiliary_loss_clip": 0.01037554, + "auxiliary_loss_mlp": 0.01028834, + "balance_loss_clip": 1.02552938, + "balance_loss_mlp": 1.0173955, + "epoch": 0.5477228318052006, + "flos": 17712579772800.0, + "grad_norm": 1.8174382754798262, + "language_loss": 0.77163917, + "learning_rate": 1.7883352442431457e-06, + "loss": 0.79230297, + "num_input_tokens_seen": 196342200, + "step": 9110, + "time_per_iteration": 2.6712968349456787 + }, + { + "auxiliary_loss_clip": 0.01055315, + "auxiliary_loss_mlp": 0.01027995, + "balance_loss_clip": 1.02578759, + "balance_loss_mlp": 1.0173434, + "epoch": 0.5477829550578687, + "flos": 25848895962240.0, + "grad_norm": 1.465769470099458, + "language_loss": 0.71448845, + "learning_rate": 1.7879479738002993e-06, + "loss": 0.73532152, + "num_input_tokens_seen": 196362940, + "step": 9111, + "time_per_iteration": 2.624565839767456 + }, + { + "auxiliary_loss_clip": 0.01055313, + "auxiliary_loss_mlp": 0.01033337, + "balance_loss_clip": 1.02457237, + "balance_loss_mlp": 1.02206039, + "epoch": 0.5478430783105366, + "flos": 23039676161280.0, + "grad_norm": 5.80293495306049, + "language_loss": 0.71111351, + "learning_rate": 1.7875607113984876e-06, + "loss": 0.73199999, + "num_input_tokens_seen": 196383070, + "step": 9112, + "time_per_iteration": 4.226572275161743 + }, + { + "auxiliary_loss_clip": 0.01020112, + "auxiliary_loss_mlp": 0.01028784, + "balance_loss_clip": 1.02532256, + "balance_loss_mlp": 1.01765573, + "epoch": 0.5479032015632046, + "flos": 16071210864000.0, + "grad_norm": 2.1122789479505424, + "language_loss": 0.87753046, + "learning_rate": 1.7871734570523953e-06, + "loss": 0.89801943, + "num_input_tokens_seen": 196398485, + "step": 9113, + "time_per_iteration": 2.6132736206054688 + }, + { + "auxiliary_loss_clip": 0.01022677, + "auxiliary_loss_mlp": 0.01027987, + "balance_loss_clip": 1.02865887, + "balance_loss_mlp": 1.01632237, + "epoch": 0.5479633248158725, + "flos": 24278701852800.0, + "grad_norm": 1.7599900334916443, + "language_loss": 0.73227036, + "learning_rate": 1.7867862107767067e-06, + "loss": 0.7527771, + "num_input_tokens_seen": 196417725, + "step": 9114, + "time_per_iteration": 2.7389943599700928 + }, + { + "auxiliary_loss_clip": 0.01038524, + "auxiliary_loss_mlp": 0.00747534, + "balance_loss_clip": 1.02134156, + "balance_loss_mlp": 1.00031042, + "epoch": 0.5480234480685405, + "flos": 26358216860160.0, + "grad_norm": 2.6442148162491415, + "language_loss": 0.72127116, + "learning_rate": 1.7863989725861066e-06, + "loss": 0.73913169, + "num_input_tokens_seen": 196437840, + "step": 9115, + "time_per_iteration": 2.788961887359619 + }, + { + "auxiliary_loss_clip": 0.01029087, + "auxiliary_loss_mlp": 0.00747678, + "balance_loss_clip": 1.02211237, + "balance_loss_mlp": 1.00034475, + "epoch": 0.5480835713212084, + "flos": 22055077480320.0, + "grad_norm": 1.5699466100026833, + "language_loss": 0.71988034, + "learning_rate": 1.7860117424952781e-06, + "loss": 0.73764801, + "num_input_tokens_seen": 196457300, + "step": 9116, + "time_per_iteration": 4.326659679412842 + }, + { + "auxiliary_loss_clip": 0.01040726, + "auxiliary_loss_mlp": 0.01036611, + "balance_loss_clip": 1.02496219, + "balance_loss_mlp": 1.02513707, + "epoch": 0.5481436945738765, + "flos": 25301042749440.0, + "grad_norm": 1.9779814114805332, + "language_loss": 0.76449907, + "learning_rate": 1.7856245205189063e-06, + "loss": 0.78527248, + "num_input_tokens_seen": 196476720, + "step": 9117, + "time_per_iteration": 2.6524107456207275 + }, + { + "auxiliary_loss_clip": 0.01021436, + "auxiliary_loss_mlp": 0.01032091, + "balance_loss_clip": 1.02108693, + "balance_loss_mlp": 1.02117145, + "epoch": 0.5482038178265444, + "flos": 33580857772800.0, + "grad_norm": 1.5324375264232777, + "language_loss": 0.62597907, + "learning_rate": 1.785237306671674e-06, + "loss": 0.6465143, + "num_input_tokens_seen": 196496765, + "step": 9118, + "time_per_iteration": 2.763014316558838 + }, + { + "auxiliary_loss_clip": 0.01068253, + "auxiliary_loss_mlp": 0.01030723, + "balance_loss_clip": 1.02659202, + "balance_loss_mlp": 1.0187366, + "epoch": 0.5482639410792124, + "flos": 19026192055680.0, + "grad_norm": 1.7085863449709624, + "language_loss": 0.78612351, + "learning_rate": 1.7848501009682646e-06, + "loss": 0.80711329, + "num_input_tokens_seen": 196516220, + "step": 9119, + "time_per_iteration": 2.558255910873413 + }, + { + "auxiliary_loss_clip": 0.01046934, + "auxiliary_loss_mlp": 0.00747543, + "balance_loss_clip": 1.02742124, + "balance_loss_mlp": 1.00033879, + "epoch": 0.5483240643318803, + "flos": 25410318900480.0, + "grad_norm": 1.616438449508276, + "language_loss": 0.82063305, + "learning_rate": 1.7844629034233604e-06, + "loss": 0.83857781, + "num_input_tokens_seen": 196533860, + "step": 9120, + "time_per_iteration": 2.7478227615356445 + }, + { + "auxiliary_loss_clip": 0.01038669, + "auxiliary_loss_mlp": 0.01033169, + "balance_loss_clip": 1.02683949, + "balance_loss_mlp": 1.02107489, + "epoch": 0.5483841875845483, + "flos": 21466896272640.0, + "grad_norm": 2.115293422149327, + "language_loss": 0.8018719, + "learning_rate": 1.7840757140516455e-06, + "loss": 0.82259023, + "num_input_tokens_seen": 196551305, + "step": 9121, + "time_per_iteration": 2.817003011703491 + }, + { + "auxiliary_loss_clip": 0.01024158, + "auxiliary_loss_mlp": 0.01036865, + "balance_loss_clip": 1.02466917, + "balance_loss_mlp": 1.02372181, + "epoch": 0.5484443108372163, + "flos": 24747263792640.0, + "grad_norm": 1.7041776613280597, + "language_loss": 0.61333174, + "learning_rate": 1.7836885328678008e-06, + "loss": 0.63394201, + "num_input_tokens_seen": 196569420, + "step": 9122, + "time_per_iteration": 2.8327012062072754 + }, + { + "auxiliary_loss_clip": 0.01053549, + "auxiliary_loss_mlp": 0.01033663, + "balance_loss_clip": 1.03155386, + "balance_loss_mlp": 1.02307689, + "epoch": 0.5485044340898843, + "flos": 25375377945600.0, + "grad_norm": 1.5882735715695566, + "language_loss": 0.71467471, + "learning_rate": 1.7833013598865084e-06, + "loss": 0.73554683, + "num_input_tokens_seen": 196590610, + "step": 9123, + "time_per_iteration": 3.0000178813934326 + }, + { + "auxiliary_loss_clip": 0.01066727, + "auxiliary_loss_mlp": 0.01028332, + "balance_loss_clip": 1.02561712, + "balance_loss_mlp": 1.01779962, + "epoch": 0.5485645573425523, + "flos": 12641167370880.0, + "grad_norm": 3.0998547051082666, + "language_loss": 0.83563083, + "learning_rate": 1.7829141951224505e-06, + "loss": 0.85658145, + "num_input_tokens_seen": 196606495, + "step": 9124, + "time_per_iteration": 2.719900131225586 + }, + { + "auxiliary_loss_clip": 0.0104802, + "auxiliary_loss_mlp": 0.01031435, + "balance_loss_clip": 1.02750123, + "balance_loss_mlp": 1.02050352, + "epoch": 0.5486246805952202, + "flos": 28329425383680.0, + "grad_norm": 1.5678042419278602, + "language_loss": 0.80177611, + "learning_rate": 1.7825270385903075e-06, + "loss": 0.82257062, + "num_input_tokens_seen": 196626365, + "step": 9125, + "time_per_iteration": 2.9649863243103027 + }, + { + "auxiliary_loss_clip": 0.0105856, + "auxiliary_loss_mlp": 0.01028939, + "balance_loss_clip": 1.02666068, + "balance_loss_mlp": 1.01735735, + "epoch": 0.5486848038478882, + "flos": 16800017817600.0, + "grad_norm": 1.9036004590730418, + "language_loss": 0.74519718, + "learning_rate": 1.7821398903047617e-06, + "loss": 0.76607215, + "num_input_tokens_seen": 196644465, + "step": 9126, + "time_per_iteration": 2.6052472591400146 + }, + { + "auxiliary_loss_clip": 0.01051835, + "auxiliary_loss_mlp": 0.01031002, + "balance_loss_clip": 1.02388752, + "balance_loss_mlp": 1.0182941, + "epoch": 0.5487449271005561, + "flos": 17236224581760.0, + "grad_norm": 2.5483963249878423, + "language_loss": 0.67541301, + "learning_rate": 1.7817527502804928e-06, + "loss": 0.69624138, + "num_input_tokens_seen": 196659160, + "step": 9127, + "time_per_iteration": 2.6051011085510254 + }, + { + "auxiliary_loss_clip": 0.01023275, + "auxiliary_loss_mlp": 0.01033788, + "balance_loss_clip": 1.02343667, + "balance_loss_mlp": 1.02158678, + "epoch": 0.5488050503532241, + "flos": 17340867878400.0, + "grad_norm": 1.6793107450729214, + "language_loss": 0.8344593, + "learning_rate": 1.781365618532181e-06, + "loss": 0.85502994, + "num_input_tokens_seen": 196677410, + "step": 9128, + "time_per_iteration": 2.642130136489868 + }, + { + "auxiliary_loss_clip": 0.01023106, + "auxiliary_loss_mlp": 0.01031826, + "balance_loss_clip": 1.0225333, + "balance_loss_mlp": 1.01926136, + "epoch": 0.548865173605892, + "flos": 17239169496960.0, + "grad_norm": 1.9189270339005862, + "language_loss": 0.7405237, + "learning_rate": 1.7809784950745078e-06, + "loss": 0.76107299, + "num_input_tokens_seen": 196696765, + "step": 9129, + "time_per_iteration": 2.7314727306365967 + }, + { + "auxiliary_loss_clip": 0.01024017, + "auxiliary_loss_mlp": 0.01028323, + "balance_loss_clip": 1.02182817, + "balance_loss_mlp": 1.0160563, + "epoch": 0.5489252968585601, + "flos": 17456716218240.0, + "grad_norm": 3.2672012123200456, + "language_loss": 0.63277411, + "learning_rate": 1.7805913799221511e-06, + "loss": 0.65329748, + "num_input_tokens_seen": 196714895, + "step": 9130, + "time_per_iteration": 2.8544111251831055 + }, + { + "auxiliary_loss_clip": 0.01068973, + "auxiliary_loss_mlp": 0.00747573, + "balance_loss_clip": 1.02622795, + "balance_loss_mlp": 1.00024915, + "epoch": 0.548985420111228, + "flos": 26323383646080.0, + "grad_norm": 2.046408401322353, + "language_loss": 0.62772119, + "learning_rate": 1.7802042730897915e-06, + "loss": 0.64588672, + "num_input_tokens_seen": 196735510, + "step": 9131, + "time_per_iteration": 2.6360905170440674 + }, + { + "auxiliary_loss_clip": 0.01054106, + "auxiliary_loss_mlp": 0.01032389, + "balance_loss_clip": 1.024055, + "balance_loss_mlp": 1.01985979, + "epoch": 0.549045543363896, + "flos": 18693730748160.0, + "grad_norm": 1.9863008366991541, + "language_loss": 0.74750495, + "learning_rate": 1.7798171745921084e-06, + "loss": 0.76836991, + "num_input_tokens_seen": 196752855, + "step": 9132, + "time_per_iteration": 2.7427589893341064 + }, + { + "auxiliary_loss_clip": 0.01051961, + "auxiliary_loss_mlp": 0.01025122, + "balance_loss_clip": 1.02169132, + "balance_loss_mlp": 1.01435173, + "epoch": 0.5491056666165639, + "flos": 24717386655360.0, + "grad_norm": 1.5767308081510725, + "language_loss": 0.81137967, + "learning_rate": 1.7794300844437795e-06, + "loss": 0.83215046, + "num_input_tokens_seen": 196772230, + "step": 9133, + "time_per_iteration": 2.670633554458618 + }, + { + "auxiliary_loss_clip": 0.010462, + "auxiliary_loss_mlp": 0.00747536, + "balance_loss_clip": 1.02668655, + "balance_loss_mlp": 1.00026917, + "epoch": 0.5491657898692319, + "flos": 21576926609280.0, + "grad_norm": 1.8996852935962085, + "language_loss": 0.69969523, + "learning_rate": 1.7790430026594841e-06, + "loss": 0.71763253, + "num_input_tokens_seen": 196790405, + "step": 9134, + "time_per_iteration": 2.8204188346862793 + }, + { + "auxiliary_loss_clip": 0.01031747, + "auxiliary_loss_mlp": 0.01028844, + "balance_loss_clip": 1.0240047, + "balance_loss_mlp": 1.01752472, + "epoch": 0.5492259131219, + "flos": 50476432746240.0, + "grad_norm": 2.42613914233534, + "language_loss": 0.61803681, + "learning_rate": 1.7786559292539004e-06, + "loss": 0.63864267, + "num_input_tokens_seen": 196813785, + "step": 9135, + "time_per_iteration": 2.937013864517212 + }, + { + "auxiliary_loss_clip": 0.0105857, + "auxiliary_loss_mlp": 0.01030459, + "balance_loss_clip": 1.02561402, + "balance_loss_mlp": 1.01809669, + "epoch": 0.5492860363745679, + "flos": 25119262995840.0, + "grad_norm": 1.4662323946913693, + "language_loss": 0.72410178, + "learning_rate": 1.7782688642417058e-06, + "loss": 0.74499202, + "num_input_tokens_seen": 196834390, + "step": 9136, + "time_per_iteration": 2.82568097114563 + }, + { + "auxiliary_loss_clip": 0.0101721, + "auxiliary_loss_mlp": 0.01033547, + "balance_loss_clip": 1.02596509, + "balance_loss_mlp": 1.02068996, + "epoch": 0.5493461596272359, + "flos": 22633777497600.0, + "grad_norm": 2.0902540372461464, + "language_loss": 0.68412775, + "learning_rate": 1.7778818076375781e-06, + "loss": 0.70463538, + "num_input_tokens_seen": 196853290, + "step": 9137, + "time_per_iteration": 2.829266309738159 + }, + { + "auxiliary_loss_clip": 0.01000128, + "auxiliary_loss_mlp": 0.01005313, + "balance_loss_clip": 1.00349784, + "balance_loss_mlp": 1.00414467, + "epoch": 0.5494062828799038, + "flos": 66151800754560.0, + "grad_norm": 0.7487358096313647, + "language_loss": 0.65374231, + "learning_rate": 1.7774947594561947e-06, + "loss": 0.67379677, + "num_input_tokens_seen": 196913120, + "step": 9138, + "time_per_iteration": 3.370607376098633 + }, + { + "auxiliary_loss_clip": 0.01057742, + "auxiliary_loss_mlp": 0.01031131, + "balance_loss_clip": 1.0259366, + "balance_loss_mlp": 1.01933527, + "epoch": 0.5494664061325718, + "flos": 21105958458240.0, + "grad_norm": 1.8121297077350673, + "language_loss": 0.753263, + "learning_rate": 1.7771077197122321e-06, + "loss": 0.77415174, + "num_input_tokens_seen": 196931530, + "step": 9139, + "time_per_iteration": 2.70949649810791 + }, + { + "auxiliary_loss_clip": 0.01056401, + "auxiliary_loss_mlp": 0.01024258, + "balance_loss_clip": 1.02512527, + "balance_loss_mlp": 1.01331472, + "epoch": 0.5495265293852397, + "flos": 14392566616320.0, + "grad_norm": 1.650090712961343, + "language_loss": 0.70835906, + "learning_rate": 1.7767206884203672e-06, + "loss": 0.72916567, + "num_input_tokens_seen": 196949430, + "step": 9140, + "time_per_iteration": 2.6562275886535645 + }, + { + "auxiliary_loss_clip": 0.01041936, + "auxiliary_loss_mlp": 0.01031381, + "balance_loss_clip": 1.0227257, + "balance_loss_mlp": 1.01917994, + "epoch": 0.5495866526379077, + "flos": 25549148966400.0, + "grad_norm": 3.0449222824065725, + "language_loss": 0.76589608, + "learning_rate": 1.7763336655952762e-06, + "loss": 0.78662932, + "num_input_tokens_seen": 196968265, + "step": 9141, + "time_per_iteration": 2.7396047115325928 + }, + { + "auxiliary_loss_clip": 0.01033103, + "auxiliary_loss_mlp": 0.01028342, + "balance_loss_clip": 1.02468777, + "balance_loss_mlp": 1.01776814, + "epoch": 0.5496467758905756, + "flos": 21317256213120.0, + "grad_norm": 1.8180413683595744, + "language_loss": 0.74966282, + "learning_rate": 1.7759466512516346e-06, + "loss": 0.77027726, + "num_input_tokens_seen": 196984930, + "step": 9142, + "time_per_iteration": 2.676757335662842 + }, + { + "auxiliary_loss_clip": 0.01042172, + "auxiliary_loss_mlp": 0.01034959, + "balance_loss_clip": 1.0267899, + "balance_loss_mlp": 1.02282333, + "epoch": 0.5497068991432437, + "flos": 22233086305920.0, + "grad_norm": 2.4283552407118543, + "language_loss": 0.76609755, + "learning_rate": 1.7755596454041192e-06, + "loss": 0.78686887, + "num_input_tokens_seen": 197002320, + "step": 9143, + "time_per_iteration": 2.6208348274230957 + }, + { + "auxiliary_loss_clip": 0.01038334, + "auxiliary_loss_mlp": 0.01031344, + "balance_loss_clip": 1.02314663, + "balance_loss_mlp": 1.01953602, + "epoch": 0.5497670223959116, + "flos": 18479093028480.0, + "grad_norm": 2.4540773730164642, + "language_loss": 0.79530501, + "learning_rate": 1.7751726480674044e-06, + "loss": 0.81600177, + "num_input_tokens_seen": 197020825, + "step": 9144, + "time_per_iteration": 2.6203885078430176 + }, + { + "auxiliary_loss_clip": 0.01058278, + "auxiliary_loss_mlp": 0.01028954, + "balance_loss_clip": 1.02661467, + "balance_loss_mlp": 1.01792133, + "epoch": 0.5498271456485796, + "flos": 29205107049600.0, + "grad_norm": 1.7132224216726883, + "language_loss": 0.71008021, + "learning_rate": 1.7747856592561645e-06, + "loss": 0.7309525, + "num_input_tokens_seen": 197040450, + "step": 9145, + "time_per_iteration": 2.672041893005371 + }, + { + "auxiliary_loss_clip": 0.01052945, + "auxiliary_loss_mlp": 0.01024136, + "balance_loss_clip": 1.02414799, + "balance_loss_mlp": 1.01368761, + "epoch": 0.5498872689012475, + "flos": 34824372664320.0, + "grad_norm": 1.4995953081438445, + "language_loss": 0.70352888, + "learning_rate": 1.774398678985076e-06, + "loss": 0.72429967, + "num_input_tokens_seen": 197063930, + "step": 9146, + "time_per_iteration": 2.709213972091675 + }, + { + "auxiliary_loss_clip": 0.01037353, + "auxiliary_loss_mlp": 0.01026156, + "balance_loss_clip": 1.02258468, + "balance_loss_mlp": 1.01600552, + "epoch": 0.5499473921539155, + "flos": 25921938268800.0, + "grad_norm": 1.7779902741657032, + "language_loss": 0.63927966, + "learning_rate": 1.7740117072688113e-06, + "loss": 0.65991473, + "num_input_tokens_seen": 197082660, + "step": 9147, + "time_per_iteration": 2.808748960494995 + }, + { + "auxiliary_loss_clip": 0.01067704, + "auxiliary_loss_mlp": 0.010336, + "balance_loss_clip": 1.02709961, + "balance_loss_mlp": 1.02278161, + "epoch": 0.5500075154065835, + "flos": 22273701609600.0, + "grad_norm": 2.0431941861808367, + "language_loss": 0.80871278, + "learning_rate": 1.7736247441220458e-06, + "loss": 0.82972586, + "num_input_tokens_seen": 197100675, + "step": 9148, + "time_per_iteration": 2.595278024673462 + }, + { + "auxiliary_loss_clip": 0.01047312, + "auxiliary_loss_mlp": 0.01036694, + "balance_loss_clip": 1.02568471, + "balance_loss_mlp": 1.02583969, + "epoch": 0.5500676386592515, + "flos": 28037507552640.0, + "grad_norm": 1.6438188494603234, + "language_loss": 0.79348904, + "learning_rate": 1.773237789559453e-06, + "loss": 0.81432903, + "num_input_tokens_seen": 197121320, + "step": 9149, + "time_per_iteration": 4.215680122375488 + }, + { + "auxiliary_loss_clip": 0.0103569, + "auxiliary_loss_mlp": 0.0102939, + "balance_loss_clip": 1.02478623, + "balance_loss_mlp": 1.01851177, + "epoch": 0.5501277619119195, + "flos": 23914819123200.0, + "grad_norm": 1.8694417456807355, + "language_loss": 0.71787721, + "learning_rate": 1.7728508435957052e-06, + "loss": 0.73852801, + "num_input_tokens_seen": 197138965, + "step": 9150, + "time_per_iteration": 2.654170274734497 + }, + { + "auxiliary_loss_clip": 0.01044042, + "auxiliary_loss_mlp": 0.01026881, + "balance_loss_clip": 1.02280259, + "balance_loss_mlp": 1.01479959, + "epoch": 0.5501878851645874, + "flos": 20923783655040.0, + "grad_norm": 1.9526252631604322, + "language_loss": 0.75254524, + "learning_rate": 1.772463906245477e-06, + "loss": 0.77325451, + "num_input_tokens_seen": 197156460, + "step": 9151, + "time_per_iteration": 2.6887197494506836 + }, + { + "auxiliary_loss_clip": 0.01041307, + "auxiliary_loss_mlp": 0.01025512, + "balance_loss_clip": 1.02345943, + "balance_loss_mlp": 1.01506329, + "epoch": 0.5502480084172554, + "flos": 20665298407680.0, + "grad_norm": 2.7631406471561544, + "language_loss": 0.76114142, + "learning_rate": 1.7720769775234394e-06, + "loss": 0.78180957, + "num_input_tokens_seen": 197175140, + "step": 9152, + "time_per_iteration": 4.155466318130493 + }, + { + "auxiliary_loss_clip": 0.01043754, + "auxiliary_loss_mlp": 0.01034376, + "balance_loss_clip": 1.02440143, + "balance_loss_mlp": 1.02405882, + "epoch": 0.5503081316699233, + "flos": 26432552056320.0, + "grad_norm": 1.7877860265577636, + "language_loss": 0.82305861, + "learning_rate": 1.7716900574442662e-06, + "loss": 0.84383988, + "num_input_tokens_seen": 197194345, + "step": 9153, + "time_per_iteration": 2.7091825008392334 + }, + { + "auxiliary_loss_clip": 0.01057163, + "auxiliary_loss_mlp": 0.01032483, + "balance_loss_clip": 1.02639115, + "balance_loss_mlp": 1.02166533, + "epoch": 0.5503682549225913, + "flos": 30629144718720.0, + "grad_norm": 1.7990867858281028, + "language_loss": 0.74160147, + "learning_rate": 1.7713031460226294e-06, + "loss": 0.7624979, + "num_input_tokens_seen": 197215535, + "step": 9154, + "time_per_iteration": 2.7453670501708984 + }, + { + "auxiliary_loss_clip": 0.01045865, + "auxiliary_loss_mlp": 0.01029782, + "balance_loss_clip": 1.02427733, + "balance_loss_mlp": 1.01761031, + "epoch": 0.5504283781752592, + "flos": 22565439872640.0, + "grad_norm": 1.7007042784151927, + "language_loss": 0.72294199, + "learning_rate": 1.770916243273199e-06, + "loss": 0.74369848, + "num_input_tokens_seen": 197234945, + "step": 9155, + "time_per_iteration": 2.6572043895721436 + }, + { + "auxiliary_loss_clip": 0.00990137, + "auxiliary_loss_mlp": 0.01004746, + "balance_loss_clip": 1.00287592, + "balance_loss_mlp": 1.00368488, + "epoch": 0.5504885014279273, + "flos": 67901009270400.0, + "grad_norm": 0.7658494039738822, + "language_loss": 0.55378962, + "learning_rate": 1.7705293492106483e-06, + "loss": 0.57373846, + "num_input_tokens_seen": 197302285, + "step": 9156, + "time_per_iteration": 3.325174570083618 + }, + { + "auxiliary_loss_clip": 0.01051666, + "auxiliary_loss_mlp": 0.01031459, + "balance_loss_clip": 1.0228467, + "balance_loss_mlp": 1.02111185, + "epoch": 0.5505486246805952, + "flos": 22450058409600.0, + "grad_norm": 1.8935114950727305, + "language_loss": 0.82552981, + "learning_rate": 1.7701424638496475e-06, + "loss": 0.84636104, + "num_input_tokens_seen": 197321575, + "step": 9157, + "time_per_iteration": 2.617295503616333 + }, + { + "auxiliary_loss_clip": 0.01069748, + "auxiliary_loss_mlp": 0.01030652, + "balance_loss_clip": 1.02665877, + "balance_loss_mlp": 1.01827788, + "epoch": 0.5506087479332632, + "flos": 26906896085760.0, + "grad_norm": 2.1293708072175828, + "language_loss": 0.75540686, + "learning_rate": 1.7697555872048677e-06, + "loss": 0.77641082, + "num_input_tokens_seen": 197340255, + "step": 9158, + "time_per_iteration": 2.67301869392395 + }, + { + "auxiliary_loss_clip": 0.01027614, + "auxiliary_loss_mlp": 0.01031987, + "balance_loss_clip": 1.02799106, + "balance_loss_mlp": 1.02174735, + "epoch": 0.5506688711859311, + "flos": 22930256355840.0, + "grad_norm": 1.9863989070414385, + "language_loss": 0.69637668, + "learning_rate": 1.769368719290979e-06, + "loss": 0.71697271, + "num_input_tokens_seen": 197360360, + "step": 9159, + "time_per_iteration": 4.44550347328186 + }, + { + "auxiliary_loss_clip": 0.0101921, + "auxiliary_loss_mlp": 0.00747303, + "balance_loss_clip": 1.02437615, + "balance_loss_mlp": 1.00021076, + "epoch": 0.5507289944385991, + "flos": 29606408772480.0, + "grad_norm": 1.4291859825005349, + "language_loss": 0.67955279, + "learning_rate": 1.7689818601226516e-06, + "loss": 0.69721794, + "num_input_tokens_seen": 197381905, + "step": 9160, + "time_per_iteration": 2.8851959705352783 + }, + { + "auxiliary_loss_clip": 0.01063401, + "auxiliary_loss_mlp": 0.01025915, + "balance_loss_clip": 1.02509356, + "balance_loss_mlp": 1.015311, + "epoch": 0.5507891176912671, + "flos": 15334431091200.0, + "grad_norm": 2.0725030806160083, + "language_loss": 0.71396446, + "learning_rate": 1.7685950097145552e-06, + "loss": 0.73485768, + "num_input_tokens_seen": 197398555, + "step": 9161, + "time_per_iteration": 2.7762091159820557 + }, + { + "auxiliary_loss_clip": 0.01055442, + "auxiliary_loss_mlp": 0.01031367, + "balance_loss_clip": 1.02479005, + "balance_loss_mlp": 1.02059627, + "epoch": 0.5508492409439351, + "flos": 26578313447040.0, + "grad_norm": 1.585980502320658, + "language_loss": 0.69358152, + "learning_rate": 1.768208168081359e-06, + "loss": 0.71444958, + "num_input_tokens_seen": 197419630, + "step": 9162, + "time_per_iteration": 2.6583175659179688 + }, + { + "auxiliary_loss_clip": 0.01066396, + "auxiliary_loss_mlp": 0.01034449, + "balance_loss_clip": 1.02623188, + "balance_loss_mlp": 1.02366614, + "epoch": 0.5509093641966031, + "flos": 25443428261760.0, + "grad_norm": 1.7171553519534775, + "language_loss": 0.86001766, + "learning_rate": 1.767821335237733e-06, + "loss": 0.88102609, + "num_input_tokens_seen": 197438480, + "step": 9163, + "time_per_iteration": 2.7130002975463867 + }, + { + "auxiliary_loss_clip": 0.01032339, + "auxiliary_loss_mlp": 0.01027052, + "balance_loss_clip": 1.02469838, + "balance_loss_mlp": 1.01697254, + "epoch": 0.550969487449271, + "flos": 18698543170560.0, + "grad_norm": 1.594025026764634, + "language_loss": 0.8022418, + "learning_rate": 1.7674345111983441e-06, + "loss": 0.82283568, + "num_input_tokens_seen": 197456755, + "step": 9164, + "time_per_iteration": 4.350232839584351 + }, + { + "auxiliary_loss_clip": 0.01051673, + "auxiliary_loss_mlp": 0.01025259, + "balance_loss_clip": 1.02953494, + "balance_loss_mlp": 1.01408863, + "epoch": 0.551029610701939, + "flos": 22708723224960.0, + "grad_norm": 1.6844412655116292, + "language_loss": 0.73705077, + "learning_rate": 1.767047695977863e-06, + "loss": 0.75782007, + "num_input_tokens_seen": 197475530, + "step": 9165, + "time_per_iteration": 2.7591922283172607 + }, + { + "auxiliary_loss_clip": 0.0104617, + "auxiliary_loss_mlp": 0.01028232, + "balance_loss_clip": 1.02172697, + "balance_loss_mlp": 1.01697922, + "epoch": 0.5510897339546069, + "flos": 12420496166400.0, + "grad_norm": 2.195246233487678, + "language_loss": 0.79119051, + "learning_rate": 1.7666608895909563e-06, + "loss": 0.81193459, + "num_input_tokens_seen": 197490835, + "step": 9166, + "time_per_iteration": 2.626556396484375 + }, + { + "auxiliary_loss_clip": 0.01037371, + "auxiliary_loss_mlp": 0.01027701, + "balance_loss_clip": 1.02542448, + "balance_loss_mlp": 1.01657295, + "epoch": 0.5511498572072749, + "flos": 18770579896320.0, + "grad_norm": 2.312131961095953, + "language_loss": 0.76257229, + "learning_rate": 1.7662740920522913e-06, + "loss": 0.78322297, + "num_input_tokens_seen": 197508770, + "step": 9167, + "time_per_iteration": 2.68459153175354 + }, + { + "auxiliary_loss_clip": 0.01055181, + "auxiliary_loss_mlp": 0.01027761, + "balance_loss_clip": 1.02487051, + "balance_loss_mlp": 1.01665115, + "epoch": 0.5512099804599428, + "flos": 19573326996480.0, + "grad_norm": 2.0574429716556586, + "language_loss": 0.7990678, + "learning_rate": 1.7658873033765374e-06, + "loss": 0.81989717, + "num_input_tokens_seen": 197527340, + "step": 9168, + "time_per_iteration": 2.527193784713745 + }, + { + "auxiliary_loss_clip": 0.01057639, + "auxiliary_loss_mlp": 0.01031258, + "balance_loss_clip": 1.02624965, + "balance_loss_mlp": 1.02044594, + "epoch": 0.5512701037126109, + "flos": 26245600744320.0, + "grad_norm": 1.5809334561620778, + "language_loss": 0.68638945, + "learning_rate": 1.7655005235783591e-06, + "loss": 0.70727843, + "num_input_tokens_seen": 197547280, + "step": 9169, + "time_per_iteration": 2.6825995445251465 + }, + { + "auxiliary_loss_clip": 0.01051375, + "auxiliary_loss_mlp": 0.01023045, + "balance_loss_clip": 1.02290869, + "balance_loss_mlp": 1.01300204, + "epoch": 0.5513302269652788, + "flos": 21945406279680.0, + "grad_norm": 2.040550988927399, + "language_loss": 0.85561472, + "learning_rate": 1.7651137526724251e-06, + "loss": 0.87635893, + "num_input_tokens_seen": 197565045, + "step": 9170, + "time_per_iteration": 2.5487477779388428 + }, + { + "auxiliary_loss_clip": 0.00998409, + "auxiliary_loss_mlp": 0.0100321, + "balance_loss_clip": 1.01160705, + "balance_loss_mlp": 1.00164878, + "epoch": 0.5513903502179468, + "flos": 68235948616320.0, + "grad_norm": 0.7825040569522591, + "language_loss": 0.59918445, + "learning_rate": 1.7647269906734017e-06, + "loss": 0.61920065, + "num_input_tokens_seen": 197625005, + "step": 9171, + "time_per_iteration": 3.201671838760376 + }, + { + "auxiliary_loss_clip": 0.01027976, + "auxiliary_loss_mlp": 0.0103336, + "balance_loss_clip": 1.02214408, + "balance_loss_mlp": 1.02214217, + "epoch": 0.5514504734706147, + "flos": 18734238311040.0, + "grad_norm": 1.3985814734137414, + "language_loss": 0.70370591, + "learning_rate": 1.7643402375959533e-06, + "loss": 0.72431934, + "num_input_tokens_seen": 197645050, + "step": 9172, + "time_per_iteration": 2.6360068321228027 + }, + { + "auxiliary_loss_clip": 0.0106395, + "auxiliary_loss_mlp": 0.01027492, + "balance_loss_clip": 1.02500665, + "balance_loss_mlp": 1.01697719, + "epoch": 0.5515105967232827, + "flos": 22270972176000.0, + "grad_norm": 1.7281770705364858, + "language_loss": 0.76176006, + "learning_rate": 1.7639534934547474e-06, + "loss": 0.78267443, + "num_input_tokens_seen": 197663910, + "step": 9173, + "time_per_iteration": 2.538525342941284 + }, + { + "auxiliary_loss_clip": 0.01033042, + "auxiliary_loss_mlp": 0.01026159, + "balance_loss_clip": 1.0255754, + "balance_loss_mlp": 1.01566291, + "epoch": 0.5515707199759508, + "flos": 22557682535040.0, + "grad_norm": 1.6770030109477176, + "language_loss": 0.75092489, + "learning_rate": 1.7635667582644484e-06, + "loss": 0.77151692, + "num_input_tokens_seen": 197681580, + "step": 9174, + "time_per_iteration": 2.6568763256073 + }, + { + "auxiliary_loss_clip": 0.01047479, + "auxiliary_loss_mlp": 0.01031101, + "balance_loss_clip": 1.02648067, + "balance_loss_mlp": 1.01993132, + "epoch": 0.5516308432286187, + "flos": 28291072636800.0, + "grad_norm": 1.9287031957359937, + "language_loss": 0.72475076, + "learning_rate": 1.7631800320397217e-06, + "loss": 0.74553651, + "num_input_tokens_seen": 197702095, + "step": 9175, + "time_per_iteration": 2.6762561798095703 + }, + { + "auxiliary_loss_clip": 0.01056023, + "auxiliary_loss_mlp": 0.01031595, + "balance_loss_clip": 1.02605319, + "balance_loss_mlp": 1.02108026, + "epoch": 0.5516909664812867, + "flos": 18764474584320.0, + "grad_norm": 2.358311420845507, + "language_loss": 0.69308841, + "learning_rate": 1.7627933147952318e-06, + "loss": 0.71396458, + "num_input_tokens_seen": 197720720, + "step": 9176, + "time_per_iteration": 2.569153308868408 + }, + { + "auxiliary_loss_clip": 0.01057334, + "auxiliary_loss_mlp": 0.01026624, + "balance_loss_clip": 1.02717793, + "balance_loss_mlp": 1.0163182, + "epoch": 0.5517510897339546, + "flos": 27740346336000.0, + "grad_norm": 6.161341797451331, + "language_loss": 0.70732147, + "learning_rate": 1.7624066065456435e-06, + "loss": 0.72816104, + "num_input_tokens_seen": 197741820, + "step": 9177, + "time_per_iteration": 2.6613149642944336 + }, + { + "auxiliary_loss_clip": 0.01056889, + "auxiliary_loss_mlp": 0.01025603, + "balance_loss_clip": 1.02714503, + "balance_loss_mlp": 1.01542807, + "epoch": 0.5518112129866226, + "flos": 18404470523520.0, + "grad_norm": 2.169999690523127, + "language_loss": 0.80142736, + "learning_rate": 1.7620199073056204e-06, + "loss": 0.82225227, + "num_input_tokens_seen": 197759160, + "step": 9178, + "time_per_iteration": 2.6938436031341553 + }, + { + "auxiliary_loss_clip": 0.01006606, + "auxiliary_loss_mlp": 0.01042473, + "balance_loss_clip": 1.02562082, + "balance_loss_mlp": 1.03076649, + "epoch": 0.5518713362392905, + "flos": 25082670015360.0, + "grad_norm": 1.6856521811255611, + "language_loss": 0.74676311, + "learning_rate": 1.761633217089826e-06, + "loss": 0.76725382, + "num_input_tokens_seen": 197779760, + "step": 9179, + "time_per_iteration": 2.80948543548584 + }, + { + "auxiliary_loss_clip": 0.01057967, + "auxiliary_loss_mlp": 0.01033203, + "balance_loss_clip": 1.02718377, + "balance_loss_mlp": 1.0226891, + "epoch": 0.5519314594919585, + "flos": 36538999361280.0, + "grad_norm": 1.9646345921315034, + "language_loss": 0.69939172, + "learning_rate": 1.761246535912924e-06, + "loss": 0.72030348, + "num_input_tokens_seen": 197801545, + "step": 9180, + "time_per_iteration": 2.8262791633605957 + }, + { + "auxiliary_loss_clip": 0.01050359, + "auxiliary_loss_mlp": 0.01032186, + "balance_loss_clip": 1.02476847, + "balance_loss_mlp": 1.02081394, + "epoch": 0.5519915827446265, + "flos": 20448613612800.0, + "grad_norm": 4.8739577540947705, + "language_loss": 0.6743995, + "learning_rate": 1.7608598637895776e-06, + "loss": 0.695225, + "num_input_tokens_seen": 197820760, + "step": 9181, + "time_per_iteration": 2.6839699745178223 + }, + { + "auxiliary_loss_clip": 0.01069552, + "auxiliary_loss_mlp": 0.01027653, + "balance_loss_clip": 1.02688038, + "balance_loss_mlp": 1.01637602, + "epoch": 0.5520517059972945, + "flos": 23768052151680.0, + "grad_norm": 1.9114654270032283, + "language_loss": 0.79025674, + "learning_rate": 1.7604732007344486e-06, + "loss": 0.81122875, + "num_input_tokens_seen": 197840195, + "step": 9182, + "time_per_iteration": 2.696626663208008 + }, + { + "auxiliary_loss_clip": 0.01036129, + "auxiliary_loss_mlp": 0.01026144, + "balance_loss_clip": 1.02518034, + "balance_loss_mlp": 1.01537371, + "epoch": 0.5521118292499624, + "flos": 22196457411840.0, + "grad_norm": 1.9573684922740238, + "language_loss": 0.83276653, + "learning_rate": 1.7600865467622003e-06, + "loss": 0.85338932, + "num_input_tokens_seen": 197859475, + "step": 9183, + "time_per_iteration": 2.733520269393921 + }, + { + "auxiliary_loss_clip": 0.01043982, + "auxiliary_loss_mlp": 0.01024043, + "balance_loss_clip": 1.02430105, + "balance_loss_mlp": 1.01370716, + "epoch": 0.5521719525026304, + "flos": 23583291569280.0, + "grad_norm": 1.351465532857679, + "language_loss": 0.67127979, + "learning_rate": 1.7596999018874936e-06, + "loss": 0.6919601, + "num_input_tokens_seen": 197879395, + "step": 9184, + "time_per_iteration": 2.7569260597229004 + }, + { + "auxiliary_loss_clip": 0.01058589, + "auxiliary_loss_mlp": 0.01021757, + "balance_loss_clip": 1.02845478, + "balance_loss_mlp": 1.0109508, + "epoch": 0.5522320757552983, + "flos": 26137617482880.0, + "grad_norm": 1.5813354456226272, + "language_loss": 0.76485765, + "learning_rate": 1.7593132661249917e-06, + "loss": 0.7856611, + "num_input_tokens_seen": 197900815, + "step": 9185, + "time_per_iteration": 2.6574153900146484 + }, + { + "auxiliary_loss_clip": 0.01027337, + "auxiliary_loss_mlp": 0.01034618, + "balance_loss_clip": 1.02422166, + "balance_loss_mlp": 1.02288771, + "epoch": 0.5522921990079663, + "flos": 24676160820480.0, + "grad_norm": 1.6881190645331787, + "language_loss": 0.74102896, + "learning_rate": 1.7589266394893536e-06, + "loss": 0.76164854, + "num_input_tokens_seen": 197918985, + "step": 9186, + "time_per_iteration": 2.6867265701293945 + }, + { + "auxiliary_loss_clip": 0.01042309, + "auxiliary_loss_mlp": 0.01033695, + "balance_loss_clip": 1.02966285, + "balance_loss_mlp": 1.02332425, + "epoch": 0.5523523222606344, + "flos": 22748153379840.0, + "grad_norm": 2.175339347266036, + "language_loss": 0.65974784, + "learning_rate": 1.7585400219952421e-06, + "loss": 0.68050784, + "num_input_tokens_seen": 197937725, + "step": 9187, + "time_per_iteration": 2.767085313796997 + }, + { + "auxiliary_loss_clip": 0.0104773, + "auxiliary_loss_mlp": 0.01025015, + "balance_loss_clip": 1.02699184, + "balance_loss_mlp": 1.01425052, + "epoch": 0.5524124455133023, + "flos": 19755825022080.0, + "grad_norm": 1.5476798367169255, + "language_loss": 0.77892065, + "learning_rate": 1.758153413657318e-06, + "loss": 0.79964817, + "num_input_tokens_seen": 197955635, + "step": 9188, + "time_per_iteration": 2.627685070037842 + }, + { + "auxiliary_loss_clip": 0.01039843, + "auxiliary_loss_mlp": 0.01027597, + "balance_loss_clip": 1.02292812, + "balance_loss_mlp": 1.01685631, + "epoch": 0.5524725687659703, + "flos": 23294821443840.0, + "grad_norm": 2.8214742198366696, + "language_loss": 0.8119266, + "learning_rate": 1.7577668144902394e-06, + "loss": 0.83260095, + "num_input_tokens_seen": 197974490, + "step": 9189, + "time_per_iteration": 2.619549512863159 + }, + { + "auxiliary_loss_clip": 0.01056851, + "auxiliary_loss_mlp": 0.00747266, + "balance_loss_clip": 1.02699125, + "balance_loss_mlp": 1.0003283, + "epoch": 0.5525326920186382, + "flos": 24862178378880.0, + "grad_norm": 1.380068447584999, + "language_loss": 0.76794541, + "learning_rate": 1.7573802245086684e-06, + "loss": 0.78598654, + "num_input_tokens_seen": 197995735, + "step": 9190, + "time_per_iteration": 2.659538507461548 + }, + { + "auxiliary_loss_clip": 0.01068725, + "auxiliary_loss_mlp": 0.01031761, + "balance_loss_clip": 1.02617931, + "balance_loss_mlp": 1.01924372, + "epoch": 0.5525928152713062, + "flos": 13735580906880.0, + "grad_norm": 2.488401048263527, + "language_loss": 0.7924819, + "learning_rate": 1.7569936437272627e-06, + "loss": 0.8134867, + "num_input_tokens_seen": 198009685, + "step": 9191, + "time_per_iteration": 2.4803719520568848 + }, + { + "auxiliary_loss_clip": 0.00998319, + "auxiliary_loss_mlp": 0.01032534, + "balance_loss_clip": 1.02092016, + "balance_loss_mlp": 1.02113211, + "epoch": 0.5526529385239741, + "flos": 13071592045440.0, + "grad_norm": 1.8734099253654315, + "language_loss": 0.68723524, + "learning_rate": 1.7566070721606829e-06, + "loss": 0.70754379, + "num_input_tokens_seen": 198026845, + "step": 9192, + "time_per_iteration": 2.7317006587982178 + }, + { + "auxiliary_loss_clip": 0.01056624, + "auxiliary_loss_mlp": 0.01029026, + "balance_loss_clip": 1.02782989, + "balance_loss_mlp": 1.01953685, + "epoch": 0.5527130617766421, + "flos": 23148377694720.0, + "grad_norm": 2.1083619673149743, + "language_loss": 0.77403522, + "learning_rate": 1.756220509823588e-06, + "loss": 0.79489172, + "num_input_tokens_seen": 198045275, + "step": 9193, + "time_per_iteration": 2.6741089820861816 + }, + { + "auxiliary_loss_clip": 0.01026251, + "auxiliary_loss_mlp": 0.01032851, + "balance_loss_clip": 1.02071714, + "balance_loss_mlp": 1.02174067, + "epoch": 0.55277318502931, + "flos": 21285547482240.0, + "grad_norm": 1.5261741678519416, + "language_loss": 0.78273535, + "learning_rate": 1.7558339567306344e-06, + "loss": 0.80332637, + "num_input_tokens_seen": 198065760, + "step": 9194, + "time_per_iteration": 2.746198892593384 + }, + { + "auxiliary_loss_clip": 0.01033576, + "auxiliary_loss_mlp": 0.01030882, + "balance_loss_clip": 1.02516961, + "balance_loss_mlp": 1.01960444, + "epoch": 0.5528333082819781, + "flos": 38324549462400.0, + "grad_norm": 1.770010682384178, + "language_loss": 0.69598317, + "learning_rate": 1.7554474128964825e-06, + "loss": 0.71662772, + "num_input_tokens_seen": 198087595, + "step": 9195, + "time_per_iteration": 2.7547547817230225 + }, + { + "auxiliary_loss_clip": 0.01052012, + "auxiliary_loss_mlp": 0.01032794, + "balance_loss_clip": 1.02719498, + "balance_loss_mlp": 1.02071178, + "epoch": 0.552893431534646, + "flos": 13553621585280.0, + "grad_norm": 2.063522691695304, + "language_loss": 0.7432884, + "learning_rate": 1.7550608783357887e-06, + "loss": 0.76413643, + "num_input_tokens_seen": 198104620, + "step": 9196, + "time_per_iteration": 4.218538999557495 + }, + { + "auxiliary_loss_clip": 0.01047536, + "auxiliary_loss_mlp": 0.01033512, + "balance_loss_clip": 1.02400088, + "balance_loss_mlp": 1.02267575, + "epoch": 0.552953554787314, + "flos": 21939408708480.0, + "grad_norm": 1.6305911863925335, + "language_loss": 0.77100229, + "learning_rate": 1.7546743530632115e-06, + "loss": 0.79181278, + "num_input_tokens_seen": 198123565, + "step": 9197, + "time_per_iteration": 2.5695669651031494 + }, + { + "auxiliary_loss_clip": 0.01044273, + "auxiliary_loss_mlp": 0.01024767, + "balance_loss_clip": 1.02421677, + "balance_loss_mlp": 1.01484275, + "epoch": 0.5530136780399819, + "flos": 43658002558080.0, + "grad_norm": 1.5109893376929626, + "language_loss": 0.76518548, + "learning_rate": 1.754287837093407e-06, + "loss": 0.78587586, + "num_input_tokens_seen": 198148270, + "step": 9198, + "time_per_iteration": 2.8466522693634033 + }, + { + "auxiliary_loss_clip": 0.0106354, + "auxiliary_loss_mlp": 0.01023513, + "balance_loss_clip": 1.02487588, + "balance_loss_mlp": 1.01354098, + "epoch": 0.5530738012926499, + "flos": 25045502417280.0, + "grad_norm": 1.5145885354723851, + "language_loss": 0.791435, + "learning_rate": 1.7539013304410327e-06, + "loss": 0.81230551, + "num_input_tokens_seen": 198168810, + "step": 9199, + "time_per_iteration": 4.294855833053589 + }, + { + "auxiliary_loss_clip": 0.01029611, + "auxiliary_loss_mlp": 0.01034602, + "balance_loss_clip": 1.0237596, + "balance_loss_mlp": 1.0232296, + "epoch": 0.553133924545318, + "flos": 16472081623680.0, + "grad_norm": 1.7263821271553383, + "language_loss": 0.63948035, + "learning_rate": 1.7535148331207443e-06, + "loss": 0.66012251, + "num_input_tokens_seen": 198186200, + "step": 9200, + "time_per_iteration": 2.7133171558380127 + }, + { + "auxiliary_loss_clip": 0.01048063, + "auxiliary_loss_mlp": 0.01026307, + "balance_loss_clip": 1.02590561, + "balance_loss_mlp": 1.01501751, + "epoch": 0.5531940477979859, + "flos": 24606207083520.0, + "grad_norm": 1.4643234473608724, + "language_loss": 0.66195881, + "learning_rate": 1.7531283451471978e-06, + "loss": 0.68270248, + "num_input_tokens_seen": 198207050, + "step": 9201, + "time_per_iteration": 2.7139089107513428 + }, + { + "auxiliary_loss_clip": 0.01050563, + "auxiliary_loss_mlp": 0.01032198, + "balance_loss_clip": 1.02418113, + "balance_loss_mlp": 1.02048004, + "epoch": 0.5532541710506539, + "flos": 22159577122560.0, + "grad_norm": 2.638913418629286, + "language_loss": 0.60994059, + "learning_rate": 1.7527418665350502e-06, + "loss": 0.63076818, + "num_input_tokens_seen": 198224565, + "step": 9202, + "time_per_iteration": 2.6317555904388428 + }, + { + "auxiliary_loss_clip": 0.01054633, + "auxiliary_loss_mlp": 0.00747372, + "balance_loss_clip": 1.02502084, + "balance_loss_mlp": 1.00031257, + "epoch": 0.5533142943033218, + "flos": 21397265758080.0, + "grad_norm": 1.6995409688743641, + "language_loss": 0.64243257, + "learning_rate": 1.7523553972989548e-06, + "loss": 0.6604526, + "num_input_tokens_seen": 198244790, + "step": 9203, + "time_per_iteration": 2.671661853790283 + }, + { + "auxiliary_loss_clip": 0.01053648, + "auxiliary_loss_mlp": 0.01027783, + "balance_loss_clip": 1.02385807, + "balance_loss_mlp": 1.01719117, + "epoch": 0.5533744175559898, + "flos": 23550541344000.0, + "grad_norm": 8.771738439262021, + "language_loss": 0.63821125, + "learning_rate": 1.7519689374535683e-06, + "loss": 0.65902555, + "num_input_tokens_seen": 198264375, + "step": 9204, + "time_per_iteration": 2.699878215789795 + }, + { + "auxiliary_loss_clip": 0.01052229, + "auxiliary_loss_mlp": 0.01026452, + "balance_loss_clip": 1.02319074, + "balance_loss_mlp": 1.01664686, + "epoch": 0.5534345408086577, + "flos": 24061514267520.0, + "grad_norm": 1.5716169286134494, + "language_loss": 0.77157819, + "learning_rate": 1.7515824870135445e-06, + "loss": 0.79236501, + "num_input_tokens_seen": 198283895, + "step": 9205, + "time_per_iteration": 2.658777952194214 + }, + { + "auxiliary_loss_clip": 0.01003687, + "auxiliary_loss_mlp": 0.01036395, + "balance_loss_clip": 1.02053714, + "balance_loss_mlp": 1.02521324, + "epoch": 0.5534946640613257, + "flos": 33771831408000.0, + "grad_norm": 1.486000378096214, + "language_loss": 0.72597241, + "learning_rate": 1.751196045993537e-06, + "loss": 0.74637324, + "num_input_tokens_seen": 198310035, + "step": 9206, + "time_per_iteration": 4.503590106964111 + }, + { + "auxiliary_loss_clip": 0.01014063, + "auxiliary_loss_mlp": 0.01030121, + "balance_loss_clip": 1.02141738, + "balance_loss_mlp": 1.01908779, + "epoch": 0.5535547873139937, + "flos": 15159223526400.0, + "grad_norm": 2.0323775666569124, + "language_loss": 0.75386834, + "learning_rate": 1.7508096144082012e-06, + "loss": 0.77431017, + "num_input_tokens_seen": 198327810, + "step": 9207, + "time_per_iteration": 2.6753406524658203 + }, + { + "auxiliary_loss_clip": 0.01038029, + "auxiliary_loss_mlp": 0.01030938, + "balance_loss_clip": 1.02614474, + "balance_loss_mlp": 1.01936853, + "epoch": 0.5536149105666617, + "flos": 16980863817600.0, + "grad_norm": 7.835994578140807, + "language_loss": 0.61343753, + "learning_rate": 1.750423192272189e-06, + "loss": 0.6341272, + "num_input_tokens_seen": 198343150, + "step": 9208, + "time_per_iteration": 2.660141944885254 + }, + { + "auxiliary_loss_clip": 0.01064284, + "auxiliary_loss_mlp": 0.0103304, + "balance_loss_clip": 1.02401435, + "balance_loss_mlp": 1.0223949, + "epoch": 0.5536750338193296, + "flos": 18149935772160.0, + "grad_norm": 2.1852636736192514, + "language_loss": 0.64064974, + "learning_rate": 1.7500367796001547e-06, + "loss": 0.661623, + "num_input_tokens_seen": 198360925, + "step": 9209, + "time_per_iteration": 2.58150577545166 + }, + { + "auxiliary_loss_clip": 0.01031938, + "auxiliary_loss_mlp": 0.01035049, + "balance_loss_clip": 1.02462912, + "balance_loss_mlp": 1.02316344, + "epoch": 0.5537351570719976, + "flos": 22747794243840.0, + "grad_norm": 2.303965784534861, + "language_loss": 0.82686031, + "learning_rate": 1.7496503764067513e-06, + "loss": 0.84753013, + "num_input_tokens_seen": 198379265, + "step": 9210, + "time_per_iteration": 2.671987771987915 + }, + { + "auxiliary_loss_clip": 0.010456, + "auxiliary_loss_mlp": 0.01024451, + "balance_loss_clip": 1.02626038, + "balance_loss_mlp": 1.01433015, + "epoch": 0.5537952803246655, + "flos": 26356026130560.0, + "grad_norm": 2.6835787159931646, + "language_loss": 0.72963184, + "learning_rate": 1.74926398270663e-06, + "loss": 0.75033236, + "num_input_tokens_seen": 198399490, + "step": 9211, + "time_per_iteration": 2.7780518531799316 + }, + { + "auxiliary_loss_clip": 0.01038197, + "auxiliary_loss_mlp": 0.01033508, + "balance_loss_clip": 1.02594018, + "balance_loss_mlp": 1.02128339, + "epoch": 0.5538554035773335, + "flos": 18037427397120.0, + "grad_norm": 2.4650665211237475, + "language_loss": 0.66841352, + "learning_rate": 1.7488775985144437e-06, + "loss": 0.68913054, + "num_input_tokens_seen": 198419110, + "step": 9212, + "time_per_iteration": 4.356136083602905 + }, + { + "auxiliary_loss_clip": 0.01038041, + "auxiliary_loss_mlp": 0.01027426, + "balance_loss_clip": 1.02505744, + "balance_loss_mlp": 1.01536798, + "epoch": 0.5539155268300014, + "flos": 31686247002240.0, + "grad_norm": 1.472143890145205, + "language_loss": 0.51371235, + "learning_rate": 1.7484912238448443e-06, + "loss": 0.53436702, + "num_input_tokens_seen": 198441360, + "step": 9213, + "time_per_iteration": 2.7303268909454346 + }, + { + "auxiliary_loss_clip": 0.0103797, + "auxiliary_loss_mlp": 0.01031263, + "balance_loss_clip": 1.02608752, + "balance_loss_mlp": 1.02024817, + "epoch": 0.5539756500826695, + "flos": 15193769431680.0, + "grad_norm": 2.1542163614298286, + "language_loss": 0.85547209, + "learning_rate": 1.7481048587124827e-06, + "loss": 0.87616444, + "num_input_tokens_seen": 198459835, + "step": 9214, + "time_per_iteration": 2.6687376499176025 + }, + { + "auxiliary_loss_clip": 0.01055524, + "auxiliary_loss_mlp": 0.01027889, + "balance_loss_clip": 1.02624011, + "balance_loss_mlp": 1.01789355, + "epoch": 0.5540357733353375, + "flos": 26353117128960.0, + "grad_norm": 1.5742118303996067, + "language_loss": 0.70189685, + "learning_rate": 1.7477185031320108e-06, + "loss": 0.72273099, + "num_input_tokens_seen": 198478955, + "step": 9215, + "time_per_iteration": 2.7235724925994873 + }, + { + "auxiliary_loss_clip": 0.01039525, + "auxiliary_loss_mlp": 0.01029785, + "balance_loss_clip": 1.02414608, + "balance_loss_mlp": 1.01801896, + "epoch": 0.5540958965880054, + "flos": 21323684747520.0, + "grad_norm": 1.6577282915635139, + "language_loss": 0.73399264, + "learning_rate": 1.7473321571180773e-06, + "loss": 0.7546857, + "num_input_tokens_seen": 198499030, + "step": 9216, + "time_per_iteration": 2.7700343132019043 + }, + { + "auxiliary_loss_clip": 0.0104276, + "auxiliary_loss_mlp": 0.0102881, + "balance_loss_clip": 1.0245533, + "balance_loss_mlp": 1.01810491, + "epoch": 0.5541560198406734, + "flos": 25666828899840.0, + "grad_norm": 1.9942510415653774, + "language_loss": 0.71570182, + "learning_rate": 1.7469458206853345e-06, + "loss": 0.73641747, + "num_input_tokens_seen": 198520265, + "step": 9217, + "time_per_iteration": 2.7691268920898438 + }, + { + "auxiliary_loss_clip": 0.0105291, + "auxiliary_loss_mlp": 0.01028129, + "balance_loss_clip": 1.0248872, + "balance_loss_mlp": 1.01769197, + "epoch": 0.5542161430933413, + "flos": 21939624190080.0, + "grad_norm": 1.7155334287864752, + "language_loss": 0.78221178, + "learning_rate": 1.7465594938484315e-06, + "loss": 0.80302215, + "num_input_tokens_seen": 198539645, + "step": 9218, + "time_per_iteration": 2.7252933979034424 + }, + { + "auxiliary_loss_clip": 0.01026538, + "auxiliary_loss_mlp": 0.01031144, + "balance_loss_clip": 1.021806, + "balance_loss_mlp": 1.01842415, + "epoch": 0.5542762663460093, + "flos": 19571459489280.0, + "grad_norm": 1.6134336015337423, + "language_loss": 0.72339499, + "learning_rate": 1.7461731766220176e-06, + "loss": 0.74397177, + "num_input_tokens_seen": 198558710, + "step": 9219, + "time_per_iteration": 3.0004265308380127 + }, + { + "auxiliary_loss_clip": 0.01061879, + "auxiliary_loss_mlp": 0.01036584, + "balance_loss_clip": 1.03015518, + "balance_loss_mlp": 1.02487171, + "epoch": 0.5543363895986773, + "flos": 19499063627520.0, + "grad_norm": 1.5466780432322405, + "language_loss": 0.71675175, + "learning_rate": 1.7457868690207426e-06, + "loss": 0.7377364, + "num_input_tokens_seen": 198577050, + "step": 9220, + "time_per_iteration": 2.8324007987976074 + }, + { + "auxiliary_loss_clip": 0.01063974, + "auxiliary_loss_mlp": 0.01022679, + "balance_loss_clip": 1.02471542, + "balance_loss_mlp": 1.01282668, + "epoch": 0.5543965128513453, + "flos": 22635609091200.0, + "grad_norm": 1.6117196701138314, + "language_loss": 0.79725337, + "learning_rate": 1.7454005710592547e-06, + "loss": 0.81811988, + "num_input_tokens_seen": 198595290, + "step": 9221, + "time_per_iteration": 2.684049129486084 + }, + { + "auxiliary_loss_clip": 0.01032357, + "auxiliary_loss_mlp": 0.01027513, + "balance_loss_clip": 1.02658033, + "balance_loss_mlp": 1.01634336, + "epoch": 0.5544566361040132, + "flos": 25989952671360.0, + "grad_norm": 1.9366327969532031, + "language_loss": 0.83417213, + "learning_rate": 1.7450142827522027e-06, + "loss": 0.85477084, + "num_input_tokens_seen": 198614110, + "step": 9222, + "time_per_iteration": 2.7491061687469482 + }, + { + "auxiliary_loss_clip": 0.01046964, + "auxiliary_loss_mlp": 0.00747568, + "balance_loss_clip": 1.03145576, + "balance_loss_mlp": 1.00028014, + "epoch": 0.5545167593566812, + "flos": 28257568225920.0, + "grad_norm": 1.6753517888588518, + "language_loss": 0.75186992, + "learning_rate": 1.7446280041142344e-06, + "loss": 0.76981527, + "num_input_tokens_seen": 198633880, + "step": 9223, + "time_per_iteration": 2.7610023021698 + }, + { + "auxiliary_loss_clip": 0.01036206, + "auxiliary_loss_mlp": 0.01027245, + "balance_loss_clip": 1.0235095, + "balance_loss_mlp": 1.01566327, + "epoch": 0.5545768826093491, + "flos": 28476551491200.0, + "grad_norm": 1.6586279531840369, + "language_loss": 0.82132941, + "learning_rate": 1.7442417351599986e-06, + "loss": 0.84196395, + "num_input_tokens_seen": 198653505, + "step": 9224, + "time_per_iteration": 2.7002456188201904 + }, + { + "auxiliary_loss_clip": 0.01061342, + "auxiliary_loss_mlp": 0.01039267, + "balance_loss_clip": 1.02885914, + "balance_loss_mlp": 1.02758455, + "epoch": 0.5546370058620171, + "flos": 18478051534080.0, + "grad_norm": 2.05868026902823, + "language_loss": 0.57024312, + "learning_rate": 1.743855475904141e-06, + "loss": 0.59124923, + "num_input_tokens_seen": 198671890, + "step": 9225, + "time_per_iteration": 2.687187910079956 + }, + { + "auxiliary_loss_clip": 0.01058641, + "auxiliary_loss_mlp": 0.0103615, + "balance_loss_clip": 1.02645445, + "balance_loss_mlp": 1.02499771, + "epoch": 0.554697129114685, + "flos": 22930507751040.0, + "grad_norm": 1.619480582081857, + "language_loss": 0.67477763, + "learning_rate": 1.7434692263613098e-06, + "loss": 0.69572544, + "num_input_tokens_seen": 198691995, + "step": 9226, + "time_per_iteration": 2.718822956085205 + }, + { + "auxiliary_loss_clip": 0.01030959, + "auxiliary_loss_mlp": 0.01034789, + "balance_loss_clip": 1.02245069, + "balance_loss_mlp": 1.02346444, + "epoch": 0.5547572523673531, + "flos": 21797166850560.0, + "grad_norm": 1.410793029105667, + "language_loss": 0.74526727, + "learning_rate": 1.7430829865461518e-06, + "loss": 0.76592481, + "num_input_tokens_seen": 198712440, + "step": 9227, + "time_per_iteration": 2.843997001647949 + }, + { + "auxiliary_loss_clip": 0.01030823, + "auxiliary_loss_mlp": 0.01030766, + "balance_loss_clip": 1.02552772, + "balance_loss_mlp": 1.01936388, + "epoch": 0.5548173756200211, + "flos": 22342829333760.0, + "grad_norm": 1.7092446052470405, + "language_loss": 0.73337764, + "learning_rate": 1.7426967564733118e-06, + "loss": 0.75399351, + "num_input_tokens_seen": 198731515, + "step": 9228, + "time_per_iteration": 2.8691611289978027 + }, + { + "auxiliary_loss_clip": 0.01067647, + "auxiliary_loss_mlp": 0.01028921, + "balance_loss_clip": 1.02606821, + "balance_loss_mlp": 1.01815033, + "epoch": 0.554877498872689, + "flos": 17858736213120.0, + "grad_norm": 2.5552154740084103, + "language_loss": 0.75710195, + "learning_rate": 1.7423105361574373e-06, + "loss": 0.77806759, + "num_input_tokens_seen": 198749750, + "step": 9229, + "time_per_iteration": 2.6159443855285645 + }, + { + "auxiliary_loss_clip": 0.01056044, + "auxiliary_loss_mlp": 0.00747497, + "balance_loss_clip": 1.02612376, + "balance_loss_mlp": 1.00034952, + "epoch": 0.554937622125357, + "flos": 17238343484160.0, + "grad_norm": 1.5189123318541713, + "language_loss": 0.68490088, + "learning_rate": 1.741924325613172e-06, + "loss": 0.70293629, + "num_input_tokens_seen": 198768320, + "step": 9230, + "time_per_iteration": 2.6917710304260254 + }, + { + "auxiliary_loss_clip": 0.01028325, + "auxiliary_loss_mlp": 0.01033513, + "balance_loss_clip": 1.0266701, + "balance_loss_mlp": 1.02090085, + "epoch": 0.5549977453780249, + "flos": 25368087484800.0, + "grad_norm": 2.0339152987095175, + "language_loss": 0.67773831, + "learning_rate": 1.741538124855163e-06, + "loss": 0.69835675, + "num_input_tokens_seen": 198787230, + "step": 9231, + "time_per_iteration": 2.788472890853882 + }, + { + "auxiliary_loss_clip": 0.01069358, + "auxiliary_loss_mlp": 0.01033289, + "balance_loss_clip": 1.0269165, + "balance_loss_mlp": 1.02114761, + "epoch": 0.555057868630693, + "flos": 25079114568960.0, + "grad_norm": 1.6017115458187816, + "language_loss": 0.78111613, + "learning_rate": 1.7411519338980548e-06, + "loss": 0.80214262, + "num_input_tokens_seen": 198806720, + "step": 9232, + "time_per_iteration": 2.6495754718780518 + }, + { + "auxiliary_loss_clip": 0.01020172, + "auxiliary_loss_mlp": 0.01033379, + "balance_loss_clip": 1.0213244, + "balance_loss_mlp": 1.02296007, + "epoch": 0.5551179918833609, + "flos": 26104220812800.0, + "grad_norm": 1.7416837509887324, + "language_loss": 0.83034813, + "learning_rate": 1.7407657527564898e-06, + "loss": 0.85088366, + "num_input_tokens_seen": 198826235, + "step": 9233, + "time_per_iteration": 2.6492316722869873 + }, + { + "auxiliary_loss_clip": 0.01050551, + "auxiliary_loss_mlp": 0.01031789, + "balance_loss_clip": 1.02377796, + "balance_loss_mlp": 1.02091718, + "epoch": 0.5551781151360289, + "flos": 19384759572480.0, + "grad_norm": 2.0036907442965335, + "language_loss": 0.7495783, + "learning_rate": 1.7403795814451142e-06, + "loss": 0.77040172, + "num_input_tokens_seen": 198842655, + "step": 9234, + "time_per_iteration": 2.5755393505096436 + }, + { + "auxiliary_loss_clip": 0.01040965, + "auxiliary_loss_mlp": 0.01025439, + "balance_loss_clip": 1.02276683, + "balance_loss_mlp": 1.0150919, + "epoch": 0.5552382383886968, + "flos": 21725956137600.0, + "grad_norm": 1.9186738731038973, + "language_loss": 0.64120412, + "learning_rate": 1.7399934199785706e-06, + "loss": 0.66186816, + "num_input_tokens_seen": 198861210, + "step": 9235, + "time_per_iteration": 2.723982810974121 + }, + { + "auxiliary_loss_clip": 0.01006231, + "auxiliary_loss_mlp": 0.01031488, + "balance_loss_clip": 1.0182457, + "balance_loss_mlp": 1.02005601, + "epoch": 0.5552983616413648, + "flos": 14356189117440.0, + "grad_norm": 2.3540046905427525, + "language_loss": 0.68367314, + "learning_rate": 1.7396072683715029e-06, + "loss": 0.7040503, + "num_input_tokens_seen": 198880045, + "step": 9236, + "time_per_iteration": 2.8854076862335205 + }, + { + "auxiliary_loss_clip": 0.01060737, + "auxiliary_loss_mlp": 0.01022242, + "balance_loss_clip": 1.023206, + "balance_loss_mlp": 1.01166201, + "epoch": 0.5553584848940327, + "flos": 25478548784640.0, + "grad_norm": 1.8222931088630705, + "language_loss": 0.8630814, + "learning_rate": 1.7392211266385536e-06, + "loss": 0.88391119, + "num_input_tokens_seen": 198900210, + "step": 9237, + "time_per_iteration": 2.594862937927246 + }, + { + "auxiliary_loss_clip": 0.01053026, + "auxiliary_loss_mlp": 0.01028202, + "balance_loss_clip": 1.02404308, + "balance_loss_mlp": 1.01760459, + "epoch": 0.5554186081467007, + "flos": 22163850840960.0, + "grad_norm": 1.6274769237874898, + "language_loss": 0.73689425, + "learning_rate": 1.7388349947943652e-06, + "loss": 0.75770652, + "num_input_tokens_seen": 198919055, + "step": 9238, + "time_per_iteration": 2.7504045963287354 + }, + { + "auxiliary_loss_clip": 0.01048397, + "auxiliary_loss_mlp": 0.01027384, + "balance_loss_clip": 1.02238083, + "balance_loss_mlp": 1.01626778, + "epoch": 0.5554787313993687, + "flos": 49746656125440.0, + "grad_norm": 1.7099354762648291, + "language_loss": 0.78280413, + "learning_rate": 1.73844887285358e-06, + "loss": 0.80356193, + "num_input_tokens_seen": 198943505, + "step": 9239, + "time_per_iteration": 2.9286787509918213 + }, + { + "auxiliary_loss_clip": 0.01047089, + "auxiliary_loss_mlp": 0.0102976, + "balance_loss_clip": 1.02655232, + "balance_loss_mlp": 1.01922798, + "epoch": 0.5555388546520367, + "flos": 22127365601280.0, + "grad_norm": 1.5080054329818735, + "language_loss": 0.80077761, + "learning_rate": 1.7380627608308393e-06, + "loss": 0.82154608, + "num_input_tokens_seen": 198963590, + "step": 9240, + "time_per_iteration": 2.6288020610809326 + }, + { + "auxiliary_loss_clip": 0.01043777, + "auxiliary_loss_mlp": 0.01024683, + "balance_loss_clip": 1.02464604, + "balance_loss_mlp": 1.01424074, + "epoch": 0.5555989779047047, + "flos": 24682122478080.0, + "grad_norm": 2.664631356885276, + "language_loss": 0.652188, + "learning_rate": 1.737676658740786e-06, + "loss": 0.67287254, + "num_input_tokens_seen": 198982680, + "step": 9241, + "time_per_iteration": 2.817603588104248 + }, + { + "auxiliary_loss_clip": 0.01057136, + "auxiliary_loss_mlp": 0.00747441, + "balance_loss_clip": 1.02650523, + "balance_loss_mlp": 1.00036955, + "epoch": 0.5556591011573726, + "flos": 16106510954880.0, + "grad_norm": 1.9034954472340377, + "language_loss": 0.72586942, + "learning_rate": 1.7372905665980594e-06, + "loss": 0.7439152, + "num_input_tokens_seen": 199000185, + "step": 9242, + "time_per_iteration": 2.6024646759033203 + }, + { + "auxiliary_loss_clip": 0.01051114, + "auxiliary_loss_mlp": 0.0103345, + "balance_loss_clip": 1.02811134, + "balance_loss_mlp": 1.02138007, + "epoch": 0.5557192244100406, + "flos": 12933695733120.0, + "grad_norm": 1.850640921995123, + "language_loss": 0.63395131, + "learning_rate": 1.7369044844173012e-06, + "loss": 0.65479696, + "num_input_tokens_seen": 199018380, + "step": 9243, + "time_per_iteration": 2.8778719902038574 + }, + { + "auxiliary_loss_clip": 0.01048054, + "auxiliary_loss_mlp": 0.00747569, + "balance_loss_clip": 1.02867782, + "balance_loss_mlp": 1.00039649, + "epoch": 0.5557793476627085, + "flos": 23111712887040.0, + "grad_norm": 2.6871768406522447, + "language_loss": 0.75380278, + "learning_rate": 1.7365184122131509e-06, + "loss": 0.77175897, + "num_input_tokens_seen": 199037115, + "step": 9244, + "time_per_iteration": 4.312524318695068 + }, + { + "auxiliary_loss_clip": 0.01035376, + "auxiliary_loss_mlp": 0.01024411, + "balance_loss_clip": 1.02072632, + "balance_loss_mlp": 1.01482034, + "epoch": 0.5558394709153766, + "flos": 21428040735360.0, + "grad_norm": 2.110420754469528, + "language_loss": 0.74133879, + "learning_rate": 1.7361323500002486e-06, + "loss": 0.76193666, + "num_input_tokens_seen": 199053375, + "step": 9245, + "time_per_iteration": 2.747215747833252 + }, + { + "auxiliary_loss_clip": 0.01040041, + "auxiliary_loss_mlp": 0.01031683, + "balance_loss_clip": 1.0235796, + "balance_loss_mlp": 1.0202086, + "epoch": 0.5558995941680445, + "flos": 25078324469760.0, + "grad_norm": 2.054274050372646, + "language_loss": 0.79769778, + "learning_rate": 1.7357462977932348e-06, + "loss": 0.81841505, + "num_input_tokens_seen": 199070930, + "step": 9246, + "time_per_iteration": 4.298212289810181 + }, + { + "auxiliary_loss_clip": 0.0106476, + "auxiliary_loss_mlp": 0.01028569, + "balance_loss_clip": 1.02532673, + "balance_loss_mlp": 1.01798916, + "epoch": 0.5559597174207125, + "flos": 20011149872640.0, + "grad_norm": 2.041033313984552, + "language_loss": 0.74489141, + "learning_rate": 1.7353602556067471e-06, + "loss": 0.76582468, + "num_input_tokens_seen": 199088675, + "step": 9247, + "time_per_iteration": 2.608238458633423 + }, + { + "auxiliary_loss_clip": 0.01046247, + "auxiliary_loss_mlp": 0.01030137, + "balance_loss_clip": 1.02513289, + "balance_loss_mlp": 1.01862717, + "epoch": 0.5560198406733804, + "flos": 16835677044480.0, + "grad_norm": 2.9782742700784945, + "language_loss": 0.76372933, + "learning_rate": 1.7349742234554254e-06, + "loss": 0.78449321, + "num_input_tokens_seen": 199103075, + "step": 9248, + "time_per_iteration": 2.657625913619995 + }, + { + "auxiliary_loss_clip": 0.00969859, + "auxiliary_loss_mlp": 0.01013545, + "balance_loss_clip": 1.00310802, + "balance_loss_mlp": 1.01236439, + "epoch": 0.5560799639260484, + "flos": 70697051758080.0, + "grad_norm": 0.8870915773729219, + "language_loss": 0.59398377, + "learning_rate": 1.7345882013539081e-06, + "loss": 0.61381781, + "num_input_tokens_seen": 199160325, + "step": 9249, + "time_per_iteration": 3.3779733180999756 + }, + { + "auxiliary_loss_clip": 0.01062545, + "auxiliary_loss_mlp": 0.01025587, + "balance_loss_clip": 1.02332795, + "balance_loss_mlp": 1.01489949, + "epoch": 0.5561400871787163, + "flos": 23148593176320.0, + "grad_norm": 1.8814058906844802, + "language_loss": 0.79789805, + "learning_rate": 1.734202189316832e-06, + "loss": 0.81877935, + "num_input_tokens_seen": 199179760, + "step": 9250, + "time_per_iteration": 2.6567978858947754 + }, + { + "auxiliary_loss_clip": 0.0104414, + "auxiliary_loss_mlp": 0.01029264, + "balance_loss_clip": 1.02364826, + "balance_loss_mlp": 1.01803458, + "epoch": 0.5562002104313843, + "flos": 17566423332480.0, + "grad_norm": 9.055104499663681, + "language_loss": 0.68763471, + "learning_rate": 1.733816187358836e-06, + "loss": 0.70836872, + "num_input_tokens_seen": 199196695, + "step": 9251, + "time_per_iteration": 2.689732551574707 + }, + { + "auxiliary_loss_clip": 0.01053033, + "auxiliary_loss_mlp": 0.01028717, + "balance_loss_clip": 1.02345335, + "balance_loss_mlp": 1.01813114, + "epoch": 0.5562603336840523, + "flos": 25045430590080.0, + "grad_norm": 1.594906195632086, + "language_loss": 0.7584905, + "learning_rate": 1.7334301954945569e-06, + "loss": 0.77930796, + "num_input_tokens_seen": 199217845, + "step": 9252, + "time_per_iteration": 2.62184476852417 + }, + { + "auxiliary_loss_clip": 0.01050137, + "auxiliary_loss_mlp": 0.01037017, + "balance_loss_clip": 1.02355373, + "balance_loss_mlp": 1.02485704, + "epoch": 0.5563204569367203, + "flos": 29059022436480.0, + "grad_norm": 1.622646128135071, + "language_loss": 0.72508758, + "learning_rate": 1.7330442137386313e-06, + "loss": 0.74595916, + "num_input_tokens_seen": 199239250, + "step": 9253, + "time_per_iteration": 4.332143306732178 + }, + { + "auxiliary_loss_clip": 0.01038352, + "auxiliary_loss_mlp": 0.01030453, + "balance_loss_clip": 1.02744424, + "balance_loss_mlp": 1.02049911, + "epoch": 0.5563805801893883, + "flos": 22090449398400.0, + "grad_norm": 1.7689159705859228, + "language_loss": 0.83034408, + "learning_rate": 1.7326582421056965e-06, + "loss": 0.85103214, + "num_input_tokens_seen": 199258320, + "step": 9254, + "time_per_iteration": 2.672454595565796 + }, + { + "auxiliary_loss_clip": 0.00981026, + "auxiliary_loss_mlp": 0.0100332, + "balance_loss_clip": 1.00334597, + "balance_loss_mlp": 1.00224078, + "epoch": 0.5564407034420562, + "flos": 58636128689280.0, + "grad_norm": 0.8691313146730691, + "language_loss": 0.64902592, + "learning_rate": 1.732272280610387e-06, + "loss": 0.66886938, + "num_input_tokens_seen": 199314840, + "step": 9255, + "time_per_iteration": 3.061232328414917 + }, + { + "auxiliary_loss_clip": 0.01055576, + "auxiliary_loss_mlp": 0.01030659, + "balance_loss_clip": 1.02676666, + "balance_loss_mlp": 1.02003145, + "epoch": 0.5565008266947242, + "flos": 23112323418240.0, + "grad_norm": 1.7166388558151253, + "language_loss": 0.69533968, + "learning_rate": 1.7318863292673399e-06, + "loss": 0.71620202, + "num_input_tokens_seen": 199335405, + "step": 9256, + "time_per_iteration": 2.6451666355133057 + }, + { + "auxiliary_loss_clip": 0.01030444, + "auxiliary_loss_mlp": 0.01028328, + "balance_loss_clip": 1.02282643, + "balance_loss_mlp": 1.0186305, + "epoch": 0.5565609499473921, + "flos": 21578399066880.0, + "grad_norm": 1.569189045371725, + "language_loss": 0.75735581, + "learning_rate": 1.73150038809119e-06, + "loss": 0.77794349, + "num_input_tokens_seen": 199354345, + "step": 9257, + "time_per_iteration": 2.69083309173584 + }, + { + "auxiliary_loss_clip": 0.01025317, + "auxiliary_loss_mlp": 0.01032285, + "balance_loss_clip": 1.02458537, + "balance_loss_mlp": 1.02248037, + "epoch": 0.5566210732000602, + "flos": 18369637309440.0, + "grad_norm": 1.9369526479078625, + "language_loss": 0.60952532, + "learning_rate": 1.7311144570965724e-06, + "loss": 0.63010132, + "num_input_tokens_seen": 199372250, + "step": 9258, + "time_per_iteration": 2.670217514038086 + }, + { + "auxiliary_loss_clip": 0.01029263, + "auxiliary_loss_mlp": 0.01030889, + "balance_loss_clip": 1.02206004, + "balance_loss_mlp": 1.01933765, + "epoch": 0.5566811964527281, + "flos": 25703350053120.0, + "grad_norm": 1.6528778747595345, + "language_loss": 0.78810424, + "learning_rate": 1.7307285362981215e-06, + "loss": 0.80870569, + "num_input_tokens_seen": 199392815, + "step": 9259, + "time_per_iteration": 4.3082661628723145 + }, + { + "auxiliary_loss_clip": 0.01050226, + "auxiliary_loss_mlp": 0.01031702, + "balance_loss_clip": 1.02937841, + "balance_loss_mlp": 1.02046609, + "epoch": 0.5567413197053961, + "flos": 26943991856640.0, + "grad_norm": 1.8564706731001386, + "language_loss": 0.81123149, + "learning_rate": 1.7303426257104712e-06, + "loss": 0.83205074, + "num_input_tokens_seen": 199412375, + "step": 9260, + "time_per_iteration": 2.668795108795166 + }, + { + "auxiliary_loss_clip": 0.01064445, + "auxiliary_loss_mlp": 0.01035003, + "balance_loss_clip": 1.02524757, + "balance_loss_mlp": 1.0241673, + "epoch": 0.556801442958064, + "flos": 20850597694080.0, + "grad_norm": 1.7643052030031396, + "language_loss": 0.69021893, + "learning_rate": 1.729956725348256e-06, + "loss": 0.71121347, + "num_input_tokens_seen": 199431490, + "step": 9261, + "time_per_iteration": 2.5421853065490723 + }, + { + "auxiliary_loss_clip": 0.00984993, + "auxiliary_loss_mlp": 0.01004469, + "balance_loss_clip": 1.0083077, + "balance_loss_mlp": 1.00330687, + "epoch": 0.556861566210732, + "flos": 70498213044480.0, + "grad_norm": 0.7296339834938904, + "language_loss": 0.61081254, + "learning_rate": 1.729570835226108e-06, + "loss": 0.63070714, + "num_input_tokens_seen": 199495855, + "step": 9262, + "time_per_iteration": 3.27864408493042 + }, + { + "auxiliary_loss_clip": 0.01054531, + "auxiliary_loss_mlp": 0.01028899, + "balance_loss_clip": 1.02423108, + "balance_loss_mlp": 1.01782393, + "epoch": 0.5569216894633999, + "flos": 25337276593920.0, + "grad_norm": 1.7193976907027089, + "language_loss": 0.64589453, + "learning_rate": 1.7291849553586622e-06, + "loss": 0.66672885, + "num_input_tokens_seen": 199515870, + "step": 9263, + "time_per_iteration": 2.6236109733581543 + }, + { + "auxiliary_loss_clip": 0.01039963, + "auxiliary_loss_mlp": 0.010326, + "balance_loss_clip": 1.02297604, + "balance_loss_mlp": 1.02216911, + "epoch": 0.556981812716068, + "flos": 22638733574400.0, + "grad_norm": 2.0567464098286434, + "language_loss": 0.73257154, + "learning_rate": 1.7287990857605497e-06, + "loss": 0.75329715, + "num_input_tokens_seen": 199535745, + "step": 9264, + "time_per_iteration": 2.625115394592285 + }, + { + "auxiliary_loss_clip": 0.01041065, + "auxiliary_loss_mlp": 0.01024741, + "balance_loss_clip": 1.02875483, + "balance_loss_mlp": 1.01398826, + "epoch": 0.5570419359687359, + "flos": 11035852738560.0, + "grad_norm": 1.9341174603232802, + "language_loss": 0.76227373, + "learning_rate": 1.7284132264464022e-06, + "loss": 0.7829318, + "num_input_tokens_seen": 199554035, + "step": 9265, + "time_per_iteration": 2.6663389205932617 + }, + { + "auxiliary_loss_clip": 0.01042634, + "auxiliary_loss_mlp": 0.01028178, + "balance_loss_clip": 1.02525735, + "balance_loss_mlp": 1.01903474, + "epoch": 0.5571020592214039, + "flos": 22823135020800.0, + "grad_norm": 1.4343443920246177, + "language_loss": 0.71096981, + "learning_rate": 1.7280273774308536e-06, + "loss": 0.73167789, + "num_input_tokens_seen": 199576120, + "step": 9266, + "time_per_iteration": 2.675821542739868 + }, + { + "auxiliary_loss_clip": 0.01039976, + "auxiliary_loss_mlp": 0.0102982, + "balance_loss_clip": 1.02285039, + "balance_loss_mlp": 1.01901388, + "epoch": 0.5571621824740719, + "flos": 22927778317440.0, + "grad_norm": 1.7819469040406444, + "language_loss": 0.68313879, + "learning_rate": 1.727641538728533e-06, + "loss": 0.7038368, + "num_input_tokens_seen": 199593780, + "step": 9267, + "time_per_iteration": 2.705132484436035 + }, + { + "auxiliary_loss_clip": 0.01051753, + "auxiliary_loss_mlp": 0.01035782, + "balance_loss_clip": 1.02496481, + "balance_loss_mlp": 1.0257926, + "epoch": 0.5572223057267398, + "flos": 22966705681920.0, + "grad_norm": 1.7463828286950935, + "language_loss": 0.74429685, + "learning_rate": 1.7272557103540736e-06, + "loss": 0.76517212, + "num_input_tokens_seen": 199613220, + "step": 9268, + "time_per_iteration": 2.6559977531433105 + }, + { + "auxiliary_loss_clip": 0.01056024, + "auxiliary_loss_mlp": 0.00747417, + "balance_loss_clip": 1.02733517, + "balance_loss_mlp": 1.00035036, + "epoch": 0.5572824289794078, + "flos": 20960053413120.0, + "grad_norm": 2.7421634953279304, + "language_loss": 0.75217557, + "learning_rate": 1.726869892322104e-06, + "loss": 0.77020997, + "num_input_tokens_seen": 199632085, + "step": 9269, + "time_per_iteration": 2.624964475631714 + }, + { + "auxiliary_loss_clip": 0.01026945, + "auxiliary_loss_mlp": 0.0103462, + "balance_loss_clip": 1.02265882, + "balance_loss_mlp": 1.02385533, + "epoch": 0.5573425522320757, + "flos": 25042413847680.0, + "grad_norm": 1.6695570017972161, + "language_loss": 0.82392263, + "learning_rate": 1.726484084647256e-06, + "loss": 0.84453827, + "num_input_tokens_seen": 199649295, + "step": 9270, + "time_per_iteration": 2.6710011959075928 + }, + { + "auxiliary_loss_clip": 0.01019822, + "auxiliary_loss_mlp": 0.01030951, + "balance_loss_clip": 1.02349377, + "balance_loss_mlp": 1.01962662, + "epoch": 0.5574026754847438, + "flos": 23659637927040.0, + "grad_norm": 2.049246943560366, + "language_loss": 0.80052423, + "learning_rate": 1.7260982873441591e-06, + "loss": 0.82103193, + "num_input_tokens_seen": 199668870, + "step": 9271, + "time_per_iteration": 2.705726146697998 + }, + { + "auxiliary_loss_clip": 0.01044107, + "auxiliary_loss_mlp": 0.01027846, + "balance_loss_clip": 1.02452826, + "balance_loss_mlp": 1.01715326, + "epoch": 0.5574627987374117, + "flos": 24782240661120.0, + "grad_norm": 1.7101264620553733, + "language_loss": 0.90241605, + "learning_rate": 1.725712500427442e-06, + "loss": 0.92313558, + "num_input_tokens_seen": 199684870, + "step": 9272, + "time_per_iteration": 2.596149206161499 + }, + { + "auxiliary_loss_clip": 0.01034351, + "auxiliary_loss_mlp": 0.0102817, + "balance_loss_clip": 1.02563775, + "balance_loss_mlp": 1.01768529, + "epoch": 0.5575229219900797, + "flos": 21834944979840.0, + "grad_norm": 3.562054529133169, + "language_loss": 0.84175158, + "learning_rate": 1.7253267239117347e-06, + "loss": 0.86237681, + "num_input_tokens_seen": 199701975, + "step": 9273, + "time_per_iteration": 3.0033342838287354 + }, + { + "auxiliary_loss_clip": 0.01056731, + "auxiliary_loss_mlp": 0.01036101, + "balance_loss_clip": 1.02579713, + "balance_loss_mlp": 1.02455556, + "epoch": 0.5575830452427476, + "flos": 27815148408960.0, + "grad_norm": 1.8154560102949124, + "language_loss": 0.74035621, + "learning_rate": 1.7249409578116655e-06, + "loss": 0.76128453, + "num_input_tokens_seen": 199721865, + "step": 9274, + "time_per_iteration": 2.58626651763916 + }, + { + "auxiliary_loss_clip": 0.01053476, + "auxiliary_loss_mlp": 0.01031302, + "balance_loss_clip": 1.03026152, + "balance_loss_mlp": 1.01910102, + "epoch": 0.5576431684954156, + "flos": 17812805696640.0, + "grad_norm": 3.168065241559152, + "language_loss": 0.77056158, + "learning_rate": 1.7245552021418629e-06, + "loss": 0.79140937, + "num_input_tokens_seen": 199736455, + "step": 9275, + "time_per_iteration": 2.5605106353759766 + }, + { + "auxiliary_loss_clip": 0.0104451, + "auxiliary_loss_mlp": 0.01023736, + "balance_loss_clip": 1.0258193, + "balance_loss_mlp": 1.01347184, + "epoch": 0.5577032917480835, + "flos": 15486872411520.0, + "grad_norm": 1.8192576496938473, + "language_loss": 0.74999499, + "learning_rate": 1.7241694569169546e-06, + "loss": 0.77067745, + "num_input_tokens_seen": 199753125, + "step": 9276, + "time_per_iteration": 2.630861282348633 + }, + { + "auxiliary_loss_clip": 0.01041785, + "auxiliary_loss_mlp": 0.01028192, + "balance_loss_clip": 1.0230515, + "balance_loss_mlp": 1.01804733, + "epoch": 0.5577634150007516, + "flos": 21579763783680.0, + "grad_norm": 3.5752741249251576, + "language_loss": 0.75426131, + "learning_rate": 1.7237837221515678e-06, + "loss": 0.77496111, + "num_input_tokens_seen": 199771365, + "step": 9277, + "time_per_iteration": 2.597773551940918 + }, + { + "auxiliary_loss_clip": 0.01062313, + "auxiliary_loss_mlp": 0.01030259, + "balance_loss_clip": 1.02464449, + "balance_loss_mlp": 1.02050173, + "epoch": 0.5578235382534195, + "flos": 21139750177920.0, + "grad_norm": 1.4484128107123129, + "language_loss": 0.71745765, + "learning_rate": 1.7233979978603304e-06, + "loss": 0.73838341, + "num_input_tokens_seen": 199790035, + "step": 9278, + "time_per_iteration": 2.562279224395752 + }, + { + "auxiliary_loss_clip": 0.01039618, + "auxiliary_loss_mlp": 0.01032758, + "balance_loss_clip": 1.02847433, + "balance_loss_mlp": 1.02075362, + "epoch": 0.5578836615060875, + "flos": 26505199313280.0, + "grad_norm": 1.55832850861462, + "language_loss": 0.75865424, + "learning_rate": 1.723012284057868e-06, + "loss": 0.779378, + "num_input_tokens_seen": 199811125, + "step": 9279, + "time_per_iteration": 2.9719438552856445 + }, + { + "auxiliary_loss_clip": 0.0103919, + "auxiliary_loss_mlp": 0.01026966, + "balance_loss_clip": 1.02187264, + "balance_loss_mlp": 1.01649928, + "epoch": 0.5579437847587555, + "flos": 20153786780160.0, + "grad_norm": 1.9688168521272553, + "language_loss": 0.67468095, + "learning_rate": 1.7226265807588082e-06, + "loss": 0.69534254, + "num_input_tokens_seen": 199829915, + "step": 9280, + "time_per_iteration": 2.7309787273406982 + }, + { + "auxiliary_loss_clip": 0.01049056, + "auxiliary_loss_mlp": 0.01033623, + "balance_loss_clip": 1.02299452, + "balance_loss_mlp": 1.02245903, + "epoch": 0.5580039080114234, + "flos": 26102281478400.0, + "grad_norm": 1.7606235990799795, + "language_loss": 0.73597729, + "learning_rate": 1.7222408879777763e-06, + "loss": 0.75680411, + "num_input_tokens_seen": 199850670, + "step": 9281, + "time_per_iteration": 2.743551254272461 + }, + { + "auxiliary_loss_clip": 0.01029537, + "auxiliary_loss_mlp": 0.00747454, + "balance_loss_clip": 1.02176571, + "balance_loss_mlp": 1.00037944, + "epoch": 0.5580640312640914, + "flos": 13771671096960.0, + "grad_norm": 3.340906839297444, + "language_loss": 0.75145066, + "learning_rate": 1.7218552057293974e-06, + "loss": 0.76922053, + "num_input_tokens_seen": 199867645, + "step": 9282, + "time_per_iteration": 2.724921703338623 + }, + { + "auxiliary_loss_clip": 0.01005767, + "auxiliary_loss_mlp": 0.0102418, + "balance_loss_clip": 1.02104068, + "balance_loss_mlp": 1.01368904, + "epoch": 0.5581241545167593, + "flos": 17675986792320.0, + "grad_norm": 1.566953115364926, + "language_loss": 0.66045624, + "learning_rate": 1.721469534028297e-06, + "loss": 0.68075573, + "num_input_tokens_seen": 199886320, + "step": 9283, + "time_per_iteration": 2.8754096031188965 + }, + { + "auxiliary_loss_clip": 0.01033342, + "auxiliary_loss_mlp": 0.01024608, + "balance_loss_clip": 1.02470708, + "balance_loss_mlp": 1.01503539, + "epoch": 0.5581842777694274, + "flos": 19569161018880.0, + "grad_norm": 1.8068317879987217, + "language_loss": 0.83006221, + "learning_rate": 1.7210838728890994e-06, + "loss": 0.85064173, + "num_input_tokens_seen": 199904895, + "step": 9284, + "time_per_iteration": 2.8098204135894775 + }, + { + "auxiliary_loss_clip": 0.01043616, + "auxiliary_loss_mlp": 0.01028036, + "balance_loss_clip": 1.0238117, + "balance_loss_mlp": 1.01743865, + "epoch": 0.5582444010220953, + "flos": 20595165102720.0, + "grad_norm": 2.6125434194439427, + "language_loss": 0.8494302, + "learning_rate": 1.7206982223264304e-06, + "loss": 0.87014675, + "num_input_tokens_seen": 199921090, + "step": 9285, + "time_per_iteration": 2.7148947715759277 + }, + { + "auxiliary_loss_clip": 0.01045954, + "auxiliary_loss_mlp": 0.01029221, + "balance_loss_clip": 1.02518499, + "balance_loss_mlp": 1.01883149, + "epoch": 0.5583045242747633, + "flos": 19135504120320.0, + "grad_norm": 2.2763677450695012, + "language_loss": 0.74063671, + "learning_rate": 1.720312582354912e-06, + "loss": 0.76138842, + "num_input_tokens_seen": 199939925, + "step": 9286, + "time_per_iteration": 2.6602654457092285 + }, + { + "auxiliary_loss_clip": 0.01065279, + "auxiliary_loss_mlp": 0.01027388, + "balance_loss_clip": 1.02456355, + "balance_loss_mlp": 1.01685619, + "epoch": 0.5583646475274312, + "flos": 27454569730560.0, + "grad_norm": 2.2329112978947756, + "language_loss": 0.73958206, + "learning_rate": 1.7199269529891684e-06, + "loss": 0.76050872, + "num_input_tokens_seen": 199960015, + "step": 9287, + "time_per_iteration": 2.6205272674560547 + }, + { + "auxiliary_loss_clip": 0.01029659, + "auxiliary_loss_mlp": 0.01030162, + "balance_loss_clip": 1.02421629, + "balance_loss_mlp": 1.01769865, + "epoch": 0.5584247707800992, + "flos": 23653784010240.0, + "grad_norm": 1.6454323534456414, + "language_loss": 0.75321054, + "learning_rate": 1.7195413342438233e-06, + "loss": 0.77380878, + "num_input_tokens_seen": 199980505, + "step": 9288, + "time_per_iteration": 2.687577962875366 + }, + { + "auxiliary_loss_clip": 0.01043823, + "auxiliary_loss_mlp": 0.01036301, + "balance_loss_clip": 1.0251894, + "balance_loss_mlp": 1.02405226, + "epoch": 0.5584848940327671, + "flos": 13698880185600.0, + "grad_norm": 2.1995298294464742, + "language_loss": 0.7746321, + "learning_rate": 1.7191557261334984e-06, + "loss": 0.79543334, + "num_input_tokens_seen": 199999020, + "step": 9289, + "time_per_iteration": 2.6472742557525635 + }, + { + "auxiliary_loss_clip": 0.01040357, + "auxiliary_loss_mlp": 0.01031449, + "balance_loss_clip": 1.02581871, + "balance_loss_mlp": 1.02012432, + "epoch": 0.5585450172854352, + "flos": 27016208150400.0, + "grad_norm": 1.6293466443485407, + "language_loss": 0.6099261, + "learning_rate": 1.718770128672817e-06, + "loss": 0.63064414, + "num_input_tokens_seen": 200019020, + "step": 9290, + "time_per_iteration": 2.675354480743408 + }, + { + "auxiliary_loss_clip": 0.01024787, + "auxiliary_loss_mlp": 0.01029639, + "balance_loss_clip": 1.02510166, + "balance_loss_mlp": 1.01778913, + "epoch": 0.5586051405381031, + "flos": 23185653033600.0, + "grad_norm": 2.1555555575226273, + "language_loss": 0.6787957, + "learning_rate": 1.7183845418764e-06, + "loss": 0.69933999, + "num_input_tokens_seen": 200038110, + "step": 9291, + "time_per_iteration": 4.441439390182495 + }, + { + "auxiliary_loss_clip": 0.01035911, + "auxiliary_loss_mlp": 0.01034615, + "balance_loss_clip": 1.02365851, + "balance_loss_mlp": 1.02287245, + "epoch": 0.5586652637907711, + "flos": 20775544225920.0, + "grad_norm": 3.855994841067396, + "language_loss": 0.83680129, + "learning_rate": 1.7179989657588698e-06, + "loss": 0.85750663, + "num_input_tokens_seen": 200056210, + "step": 9292, + "time_per_iteration": 2.8198049068450928 + }, + { + "auxiliary_loss_clip": 0.01036946, + "auxiliary_loss_mlp": 0.01034676, + "balance_loss_clip": 1.02381194, + "balance_loss_mlp": 1.02347016, + "epoch": 0.5587253870434391, + "flos": 28219897837440.0, + "grad_norm": 2.6058883410110028, + "language_loss": 0.73670483, + "learning_rate": 1.7176134003348476e-06, + "loss": 0.75742102, + "num_input_tokens_seen": 200075620, + "step": 9293, + "time_per_iteration": 4.304965972900391 + }, + { + "auxiliary_loss_clip": 0.01039354, + "auxiliary_loss_mlp": 0.01034007, + "balance_loss_clip": 1.02646208, + "balance_loss_mlp": 1.02357578, + "epoch": 0.558785510296107, + "flos": 26615732440320.0, + "grad_norm": 1.7750667069861237, + "language_loss": 0.72448587, + "learning_rate": 1.7172278456189523e-06, + "loss": 0.74521953, + "num_input_tokens_seen": 200095945, + "step": 9294, + "time_per_iteration": 2.6376755237579346 + }, + { + "auxiliary_loss_clip": 0.01045704, + "auxiliary_loss_mlp": 0.00747492, + "balance_loss_clip": 1.02589583, + "balance_loss_mlp": 1.00036252, + "epoch": 0.558845633548775, + "flos": 20156767608960.0, + "grad_norm": 3.1886893271938757, + "language_loss": 0.68147469, + "learning_rate": 1.716842301625806e-06, + "loss": 0.69940662, + "num_input_tokens_seen": 200114185, + "step": 9295, + "time_per_iteration": 2.588958263397217 + }, + { + "auxiliary_loss_clip": 0.01068255, + "auxiliary_loss_mlp": 0.0102909, + "balance_loss_clip": 1.02767682, + "balance_loss_mlp": 1.01783681, + "epoch": 0.5589057568014429, + "flos": 24350774492160.0, + "grad_norm": 1.4870410421091163, + "language_loss": 0.80704892, + "learning_rate": 1.7164567683700281e-06, + "loss": 0.82802236, + "num_input_tokens_seen": 200135030, + "step": 9296, + "time_per_iteration": 2.628350257873535 + }, + { + "auxiliary_loss_clip": 0.01056504, + "auxiliary_loss_mlp": 0.01028635, + "balance_loss_clip": 1.02628171, + "balance_loss_mlp": 1.01785851, + "epoch": 0.558965880054111, + "flos": 21105168359040.0, + "grad_norm": 1.5600386201792777, + "language_loss": 0.65528458, + "learning_rate": 1.7160712458662379e-06, + "loss": 0.67613602, + "num_input_tokens_seen": 200154290, + "step": 9297, + "time_per_iteration": 2.569636583328247 + }, + { + "auxiliary_loss_clip": 0.01038755, + "auxiliary_loss_mlp": 0.01034424, + "balance_loss_clip": 1.02639854, + "balance_loss_mlp": 1.02260447, + "epoch": 0.5590260033067789, + "flos": 18436071513600.0, + "grad_norm": 2.269289154344512, + "language_loss": 0.75092947, + "learning_rate": 1.7156857341290544e-06, + "loss": 0.77166122, + "num_input_tokens_seen": 200171555, + "step": 9298, + "time_per_iteration": 2.6023974418640137 + }, + { + "auxiliary_loss_clip": 0.00988326, + "auxiliary_loss_mlp": 0.01024082, + "balance_loss_clip": 1.00180125, + "balance_loss_mlp": 1.02283669, + "epoch": 0.5590861265594469, + "flos": 70577432490240.0, + "grad_norm": 0.7110964140948205, + "language_loss": 0.52404219, + "learning_rate": 1.7153002331730967e-06, + "loss": 0.54416633, + "num_input_tokens_seen": 200237010, + "step": 9299, + "time_per_iteration": 3.2329535484313965 + }, + { + "auxiliary_loss_clip": 0.01046962, + "auxiliary_loss_mlp": 0.01028732, + "balance_loss_clip": 1.02293515, + "balance_loss_mlp": 1.01841486, + "epoch": 0.5591462498121148, + "flos": 30664408896000.0, + "grad_norm": 1.7915933900766396, + "language_loss": 0.68969917, + "learning_rate": 1.7149147430129824e-06, + "loss": 0.71045613, + "num_input_tokens_seen": 200260820, + "step": 9300, + "time_per_iteration": 2.7122011184692383 + }, + { + "auxiliary_loss_clip": 0.01007047, + "auxiliary_loss_mlp": 0.01045523, + "balance_loss_clip": 1.01892614, + "balance_loss_mlp": 1.03206992, + "epoch": 0.5592063730647828, + "flos": 18150438562560.0, + "grad_norm": 2.1570254704064764, + "language_loss": 0.81975543, + "learning_rate": 1.7145292636633293e-06, + "loss": 0.84028113, + "num_input_tokens_seen": 200278035, + "step": 9301, + "time_per_iteration": 4.322679281234741 + }, + { + "auxiliary_loss_clip": 0.0106444, + "auxiliary_loss_mlp": 0.01027758, + "balance_loss_clip": 1.02457404, + "balance_loss_mlp": 1.01678479, + "epoch": 0.5592664963174507, + "flos": 24060400945920.0, + "grad_norm": 4.103386095132098, + "language_loss": 0.67904526, + "learning_rate": 1.714143795138756e-06, + "loss": 0.69996721, + "num_input_tokens_seen": 200297255, + "step": 9302, + "time_per_iteration": 2.599994421005249 + }, + { + "auxiliary_loss_clip": 0.01030656, + "auxiliary_loss_mlp": 0.01025729, + "balance_loss_clip": 1.02475309, + "balance_loss_mlp": 1.01371908, + "epoch": 0.5593266195701188, + "flos": 19827897661440.0, + "grad_norm": 1.7261734970724751, + "language_loss": 0.70872676, + "learning_rate": 1.713758337453878e-06, + "loss": 0.7292906, + "num_input_tokens_seen": 200317505, + "step": 9303, + "time_per_iteration": 2.7636709213256836 + }, + { + "auxiliary_loss_clip": 0.00990905, + "auxiliary_loss_mlp": 0.01032749, + "balance_loss_clip": 1.01992905, + "balance_loss_mlp": 1.02125764, + "epoch": 0.5593867428227867, + "flos": 25300755440640.0, + "grad_norm": 1.6607218328847528, + "language_loss": 0.72910941, + "learning_rate": 1.7133728906233124e-06, + "loss": 0.74934602, + "num_input_tokens_seen": 200338350, + "step": 9304, + "time_per_iteration": 2.8487143516540527 + }, + { + "auxiliary_loss_clip": 0.01053794, + "auxiliary_loss_mlp": 0.01028529, + "balance_loss_clip": 1.02352643, + "balance_loss_mlp": 1.0178715, + "epoch": 0.5594468660754547, + "flos": 12933013374720.0, + "grad_norm": 1.9388612616490624, + "language_loss": 0.77706218, + "learning_rate": 1.7129874546616763e-06, + "loss": 0.79788542, + "num_input_tokens_seen": 200353965, + "step": 9305, + "time_per_iteration": 2.559542655944824 + }, + { + "auxiliary_loss_clip": 0.01024884, + "auxiliary_loss_mlp": 0.01024715, + "balance_loss_clip": 1.02606916, + "balance_loss_mlp": 1.01457047, + "epoch": 0.5595069893281227, + "flos": 19062713208960.0, + "grad_norm": 2.003936750648322, + "language_loss": 0.68933749, + "learning_rate": 1.7126020295835836e-06, + "loss": 0.7098335, + "num_input_tokens_seen": 200373595, + "step": 9306, + "time_per_iteration": 4.392803907394409 + }, + { + "auxiliary_loss_clip": 0.00995043, + "auxiliary_loss_mlp": 0.01004127, + "balance_loss_clip": 1.00724995, + "balance_loss_mlp": 1.00289905, + "epoch": 0.5595671125807906, + "flos": 70273375862400.0, + "grad_norm": 0.9157986202772609, + "language_loss": 0.60314798, + "learning_rate": 1.7122166154036518e-06, + "loss": 0.62313974, + "num_input_tokens_seen": 200429155, + "step": 9307, + "time_per_iteration": 3.336430549621582 + }, + { + "auxiliary_loss_clip": 0.01048015, + "auxiliary_loss_mlp": 0.01032636, + "balance_loss_clip": 1.02471757, + "balance_loss_mlp": 1.02187085, + "epoch": 0.5596272358334586, + "flos": 20665513889280.0, + "grad_norm": 1.4876204529891377, + "language_loss": 0.73692429, + "learning_rate": 1.7118312121364943e-06, + "loss": 0.75773084, + "num_input_tokens_seen": 200448290, + "step": 9308, + "time_per_iteration": 2.577190399169922 + }, + { + "auxiliary_loss_clip": 0.00993775, + "auxiliary_loss_mlp": 0.01035514, + "balance_loss_clip": 1.01857471, + "balance_loss_mlp": 1.02288949, + "epoch": 0.5596873590861265, + "flos": 25041013217280.0, + "grad_norm": 3.457699518392467, + "language_loss": 0.69714773, + "learning_rate": 1.7114458197967257e-06, + "loss": 0.71744061, + "num_input_tokens_seen": 200466555, + "step": 9309, + "time_per_iteration": 2.8431665897369385 + }, + { + "auxiliary_loss_clip": 0.01047924, + "auxiliary_loss_mlp": 0.01029869, + "balance_loss_clip": 1.02646184, + "balance_loss_mlp": 1.0169822, + "epoch": 0.5597474823387946, + "flos": 25958387594880.0, + "grad_norm": 2.087406420275459, + "language_loss": 0.7538656, + "learning_rate": 1.7110604383989613e-06, + "loss": 0.7746436, + "num_input_tokens_seen": 200485980, + "step": 9310, + "time_per_iteration": 2.7130656242370605 + }, + { + "auxiliary_loss_clip": 0.01057925, + "auxiliary_loss_mlp": 0.01031282, + "balance_loss_clip": 1.02596009, + "balance_loss_mlp": 1.01924753, + "epoch": 0.5598076055914625, + "flos": 26177442687360.0, + "grad_norm": 2.5289518475188237, + "language_loss": 0.69460332, + "learning_rate": 1.7106750679578133e-06, + "loss": 0.71549535, + "num_input_tokens_seen": 200504555, + "step": 9311, + "time_per_iteration": 2.628981828689575 + }, + { + "auxiliary_loss_clip": 0.0105501, + "auxiliary_loss_mlp": 0.01026854, + "balance_loss_clip": 1.02502608, + "balance_loss_mlp": 1.0156126, + "epoch": 0.5598677288441305, + "flos": 11655778590720.0, + "grad_norm": 1.8072169191599803, + "language_loss": 0.7240175, + "learning_rate": 1.7102897084878962e-06, + "loss": 0.74483609, + "num_input_tokens_seen": 200522700, + "step": 9312, + "time_per_iteration": 2.715104103088379 + }, + { + "auxiliary_loss_clip": 0.01035722, + "auxiliary_loss_mlp": 0.0102939, + "balance_loss_clip": 1.02651, + "balance_loss_mlp": 1.01794028, + "epoch": 0.5599278520967984, + "flos": 22966597941120.0, + "grad_norm": 3.280711727888146, + "language_loss": 0.88585359, + "learning_rate": 1.709904360003822e-06, + "loss": 0.90650469, + "num_input_tokens_seen": 200541910, + "step": 9313, + "time_per_iteration": 2.6877641677856445 + }, + { + "auxiliary_loss_clip": 0.01026839, + "auxiliary_loss_mlp": 0.01034397, + "balance_loss_clip": 1.02430451, + "balance_loss_mlp": 1.02289891, + "epoch": 0.5599879753494664, + "flos": 21215557831680.0, + "grad_norm": 1.4736092523529436, + "language_loss": 0.77905715, + "learning_rate": 1.709519022520204e-06, + "loss": 0.7996695, + "num_input_tokens_seen": 200562600, + "step": 9314, + "time_per_iteration": 2.8099145889282227 + }, + { + "auxiliary_loss_clip": 0.01037704, + "auxiliary_loss_mlp": 0.01026306, + "balance_loss_clip": 1.0266819, + "balance_loss_mlp": 1.01532638, + "epoch": 0.5600480986021343, + "flos": 31903219105920.0, + "grad_norm": 1.5693009789111472, + "language_loss": 0.7021271, + "learning_rate": 1.7091336960516537e-06, + "loss": 0.72276723, + "num_input_tokens_seen": 200584795, + "step": 9315, + "time_per_iteration": 2.931877374649048 + }, + { + "auxiliary_loss_clip": 0.01045036, + "auxiliary_loss_mlp": 0.01037421, + "balance_loss_clip": 1.02349603, + "balance_loss_mlp": 1.02525008, + "epoch": 0.5601082218548024, + "flos": 28476048700800.0, + "grad_norm": 1.9094767829150068, + "language_loss": 0.66665477, + "learning_rate": 1.7087483806127824e-06, + "loss": 0.68747932, + "num_input_tokens_seen": 200606945, + "step": 9316, + "time_per_iteration": 2.776137590408325 + }, + { + "auxiliary_loss_clip": 0.01029608, + "auxiliary_loss_mlp": 0.01035775, + "balance_loss_clip": 1.02348793, + "balance_loss_mlp": 1.02275157, + "epoch": 0.5601683451074703, + "flos": 24097173494400.0, + "grad_norm": 5.160731415084056, + "language_loss": 0.86554027, + "learning_rate": 1.7083630762182022e-06, + "loss": 0.88619411, + "num_input_tokens_seen": 200626340, + "step": 9317, + "time_per_iteration": 2.7421505451202393 + }, + { + "auxiliary_loss_clip": 0.01058938, + "auxiliary_loss_mlp": 0.01035887, + "balance_loss_clip": 1.02559841, + "balance_loss_mlp": 1.02307153, + "epoch": 0.5602284683601383, + "flos": 26356205698560.0, + "grad_norm": 2.2055831847628835, + "language_loss": 0.77369952, + "learning_rate": 1.7079777828825233e-06, + "loss": 0.79464781, + "num_input_tokens_seen": 200644520, + "step": 9318, + "time_per_iteration": 2.794217348098755 + }, + { + "auxiliary_loss_clip": 0.01053498, + "auxiliary_loss_mlp": 0.01031647, + "balance_loss_clip": 1.023525, + "balance_loss_mlp": 1.02102578, + "epoch": 0.5602885916128063, + "flos": 24496392228480.0, + "grad_norm": 1.8028640717227338, + "language_loss": 0.7647258, + "learning_rate": 1.7075925006203558e-06, + "loss": 0.78557724, + "num_input_tokens_seen": 200664845, + "step": 9319, + "time_per_iteration": 2.8823482990264893 + }, + { + "auxiliary_loss_clip": 0.01053811, + "auxiliary_loss_mlp": 0.01032306, + "balance_loss_clip": 1.0244329, + "balance_loss_mlp": 1.02129698, + "epoch": 0.5603487148654742, + "flos": 27345006270720.0, + "grad_norm": 1.3985296612639437, + "language_loss": 0.85277355, + "learning_rate": 1.7072072294463101e-06, + "loss": 0.8736347, + "num_input_tokens_seen": 200686535, + "step": 9320, + "time_per_iteration": 2.7483747005462646 + }, + { + "auxiliary_loss_clip": 0.00999469, + "auxiliary_loss_mlp": 0.01002741, + "balance_loss_clip": 1.00294495, + "balance_loss_mlp": 1.00161481, + "epoch": 0.5604088381181422, + "flos": 54087756180480.0, + "grad_norm": 0.7558861943308995, + "language_loss": 0.52589279, + "learning_rate": 1.706821969374996e-06, + "loss": 0.54591489, + "num_input_tokens_seen": 200736965, + "step": 9321, + "time_per_iteration": 3.0733134746551514 + }, + { + "auxiliary_loss_clip": 0.01043359, + "auxiliary_loss_mlp": 0.01030107, + "balance_loss_clip": 1.02500451, + "balance_loss_mlp": 1.01941407, + "epoch": 0.5604689613708101, + "flos": 22236390357120.0, + "grad_norm": 1.3786746452989969, + "language_loss": 0.74481791, + "learning_rate": 1.7064367204210216e-06, + "loss": 0.76555252, + "num_input_tokens_seen": 200757420, + "step": 9322, + "time_per_iteration": 2.775073766708374 + }, + { + "auxiliary_loss_clip": 0.01066, + "auxiliary_loss_mlp": 0.0103008, + "balance_loss_clip": 1.02582073, + "balance_loss_mlp": 1.01796246, + "epoch": 0.5605290846234782, + "flos": 35297782940160.0, + "grad_norm": 1.5407357791103928, + "language_loss": 0.73780739, + "learning_rate": 1.7060514825989963e-06, + "loss": 0.7587682, + "num_input_tokens_seen": 200779520, + "step": 9323, + "time_per_iteration": 2.8643102645874023 + }, + { + "auxiliary_loss_clip": 0.01049692, + "auxiliary_loss_mlp": 0.01028237, + "balance_loss_clip": 1.02787733, + "balance_loss_mlp": 1.01686478, + "epoch": 0.5605892078761461, + "flos": 20263314326400.0, + "grad_norm": 1.497986703361996, + "language_loss": 0.6142211, + "learning_rate": 1.7056662559235286e-06, + "loss": 0.63500035, + "num_input_tokens_seen": 200799485, + "step": 9324, + "time_per_iteration": 2.8145482540130615 + }, + { + "auxiliary_loss_clip": 0.01009982, + "auxiliary_loss_mlp": 0.01029629, + "balance_loss_clip": 1.02005816, + "balance_loss_mlp": 1.01759505, + "epoch": 0.5606493311288141, + "flos": 17308333134720.0, + "grad_norm": 1.7465261391200324, + "language_loss": 0.87402487, + "learning_rate": 1.705281040409226e-06, + "loss": 0.89442098, + "num_input_tokens_seen": 200817540, + "step": 9325, + "time_per_iteration": 2.8355584144592285 + }, + { + "auxiliary_loss_clip": 0.01042238, + "auxiliary_loss_mlp": 0.01034376, + "balance_loss_clip": 1.02368069, + "balance_loss_mlp": 1.02171612, + "epoch": 0.560709454381482, + "flos": 21652985658240.0, + "grad_norm": 1.472878163937435, + "language_loss": 0.73827732, + "learning_rate": 1.7048958360706952e-06, + "loss": 0.75904351, + "num_input_tokens_seen": 200838380, + "step": 9326, + "time_per_iteration": 2.7571394443511963 + }, + { + "auxiliary_loss_clip": 0.0104157, + "auxiliary_loss_mlp": 0.0103199, + "balance_loss_clip": 1.02400458, + "balance_loss_mlp": 1.01892483, + "epoch": 0.56076957763415, + "flos": 20303355012480.0, + "grad_norm": 2.056164744379275, + "language_loss": 0.77580386, + "learning_rate": 1.7045106429225447e-06, + "loss": 0.79653949, + "num_input_tokens_seen": 200855640, + "step": 9327, + "time_per_iteration": 2.7652645111083984 + }, + { + "auxiliary_loss_clip": 0.01061974, + "auxiliary_loss_mlp": 0.01032112, + "balance_loss_clip": 1.03058195, + "balance_loss_mlp": 1.01976204, + "epoch": 0.5608297008868179, + "flos": 25045897466880.0, + "grad_norm": 1.493494159418311, + "language_loss": 0.78521544, + "learning_rate": 1.7041254609793795e-06, + "loss": 0.80615634, + "num_input_tokens_seen": 200876585, + "step": 9328, + "time_per_iteration": 2.8812758922576904 + }, + { + "auxiliary_loss_clip": 0.01065201, + "auxiliary_loss_mlp": 0.01027506, + "balance_loss_clip": 1.02450132, + "balance_loss_mlp": 1.01620555, + "epoch": 0.560889824139486, + "flos": 19866825025920.0, + "grad_norm": 1.5506888462479518, + "language_loss": 0.73504484, + "learning_rate": 1.7037402902558066e-06, + "loss": 0.75597191, + "num_input_tokens_seen": 200898175, + "step": 9329, + "time_per_iteration": 2.654146194458008 + }, + { + "auxiliary_loss_clip": 0.0104688, + "auxiliary_loss_mlp": 0.00747726, + "balance_loss_clip": 1.0245291, + "balance_loss_mlp": 1.00041318, + "epoch": 0.5609499473921539, + "flos": 22929394429440.0, + "grad_norm": 1.515782505659881, + "language_loss": 0.83322334, + "learning_rate": 1.7033551307664324e-06, + "loss": 0.85116947, + "num_input_tokens_seen": 200917515, + "step": 9330, + "time_per_iteration": 2.6273460388183594 + }, + { + "auxiliary_loss_clip": 0.01007793, + "auxiliary_loss_mlp": 0.01002011, + "balance_loss_clip": 1.00147057, + "balance_loss_mlp": 1.00090814, + "epoch": 0.5610100706448219, + "flos": 53035825455360.0, + "grad_norm": 0.7661507095522289, + "language_loss": 0.57854068, + "learning_rate": 1.7029699825258603e-06, + "loss": 0.59863877, + "num_input_tokens_seen": 200978615, + "step": 9331, + "time_per_iteration": 3.2389230728149414 + }, + { + "auxiliary_loss_clip": 0.01028527, + "auxiliary_loss_mlp": 0.01028376, + "balance_loss_clip": 1.02720094, + "balance_loss_mlp": 1.01693809, + "epoch": 0.5610701938974898, + "flos": 21834944979840.0, + "grad_norm": 1.9042312950663616, + "language_loss": 0.81729215, + "learning_rate": 1.7025848455486971e-06, + "loss": 0.83786118, + "num_input_tokens_seen": 200997745, + "step": 9332, + "time_per_iteration": 2.8871212005615234 + }, + { + "auxiliary_loss_clip": 0.01050812, + "auxiliary_loss_mlp": 0.01038649, + "balance_loss_clip": 1.02457452, + "balance_loss_mlp": 1.02512455, + "epoch": 0.5611303171501578, + "flos": 17457183095040.0, + "grad_norm": 1.7478657189624065, + "language_loss": 0.8187229, + "learning_rate": 1.7021997198495454e-06, + "loss": 0.83961749, + "num_input_tokens_seen": 201016370, + "step": 9333, + "time_per_iteration": 2.741276741027832 + }, + { + "auxiliary_loss_clip": 0.01067146, + "auxiliary_loss_mlp": 0.01028367, + "balance_loss_clip": 1.0261085, + "balance_loss_mlp": 1.01708961, + "epoch": 0.5611904404028258, + "flos": 22637799820800.0, + "grad_norm": 5.513138664427853, + "language_loss": 0.72955275, + "learning_rate": 1.7018146054430108e-06, + "loss": 0.75050789, + "num_input_tokens_seen": 201034310, + "step": 9334, + "time_per_iteration": 2.556569814682007 + }, + { + "auxiliary_loss_clip": 0.01042469, + "auxiliary_loss_mlp": 0.01035569, + "balance_loss_clip": 1.02787387, + "balance_loss_mlp": 1.02405953, + "epoch": 0.5612505636554938, + "flos": 14316327999360.0, + "grad_norm": 1.7715904526463488, + "language_loss": 0.71099389, + "learning_rate": 1.7014295023436961e-06, + "loss": 0.73177433, + "num_input_tokens_seen": 201052030, + "step": 9335, + "time_per_iteration": 2.723590612411499 + }, + { + "auxiliary_loss_clip": 0.01043623, + "auxiliary_loss_mlp": 0.01030019, + "balance_loss_clip": 1.02423418, + "balance_loss_mlp": 1.01824713, + "epoch": 0.5613106869081618, + "flos": 16508279554560.0, + "grad_norm": 1.7886508344201286, + "language_loss": 0.76728928, + "learning_rate": 1.701044410566205e-06, + "loss": 0.78802574, + "num_input_tokens_seen": 201068445, + "step": 9336, + "time_per_iteration": 2.6854307651519775 + }, + { + "auxiliary_loss_clip": 0.01055761, + "auxiliary_loss_mlp": 0.01031215, + "balance_loss_clip": 1.02536583, + "balance_loss_mlp": 1.02002728, + "epoch": 0.5613708101608297, + "flos": 24058569352320.0, + "grad_norm": 2.467423471551919, + "language_loss": 0.64017248, + "learning_rate": 1.7006593301251393e-06, + "loss": 0.66104227, + "num_input_tokens_seen": 201082140, + "step": 9337, + "time_per_iteration": 2.779625415802002 + }, + { + "auxiliary_loss_clip": 0.00989961, + "auxiliary_loss_mlp": 0.01004491, + "balance_loss_clip": 1.00336695, + "balance_loss_mlp": 1.00330472, + "epoch": 0.5614309334134977, + "flos": 64905735997440.0, + "grad_norm": 0.8900451424211233, + "language_loss": 0.62582564, + "learning_rate": 1.700274261035102e-06, + "loss": 0.64577019, + "num_input_tokens_seen": 201137245, + "step": 9338, + "time_per_iteration": 4.757747411727905 + }, + { + "auxiliary_loss_clip": 0.01036198, + "auxiliary_loss_mlp": 0.01028871, + "balance_loss_clip": 1.02468479, + "balance_loss_mlp": 1.01727223, + "epoch": 0.5614910566661656, + "flos": 32919849740160.0, + "grad_norm": 1.7100705964156262, + "language_loss": 0.65604484, + "learning_rate": 1.6998892033106946e-06, + "loss": 0.67669559, + "num_input_tokens_seen": 201157270, + "step": 9339, + "time_per_iteration": 2.781635284423828 + }, + { + "auxiliary_loss_clip": 0.01049249, + "auxiliary_loss_mlp": 0.01033108, + "balance_loss_clip": 1.0238924, + "balance_loss_mlp": 1.02044833, + "epoch": 0.5615511799188336, + "flos": 18588871969920.0, + "grad_norm": 1.8422774388298928, + "language_loss": 0.70013094, + "learning_rate": 1.6995041569665184e-06, + "loss": 0.72095454, + "num_input_tokens_seen": 201174530, + "step": 9340, + "time_per_iteration": 2.611896276473999 + }, + { + "auxiliary_loss_clip": 0.01034877, + "auxiliary_loss_mlp": 0.01030349, + "balance_loss_clip": 1.02757287, + "balance_loss_mlp": 1.01848137, + "epoch": 0.5616113031715015, + "flos": 22820010537600.0, + "grad_norm": 2.7034597491719863, + "language_loss": 0.77473104, + "learning_rate": 1.6991191220171756e-06, + "loss": 0.79538333, + "num_input_tokens_seen": 201194905, + "step": 9341, + "time_per_iteration": 4.3834333419799805 + }, + { + "auxiliary_loss_clip": 0.01018936, + "auxiliary_loss_mlp": 0.01031534, + "balance_loss_clip": 1.02381051, + "balance_loss_mlp": 1.01925504, + "epoch": 0.5616714264241696, + "flos": 22345702421760.0, + "grad_norm": 2.022613609356884, + "language_loss": 0.79538846, + "learning_rate": 1.6987340984772653e-06, + "loss": 0.81589311, + "num_input_tokens_seen": 201213715, + "step": 9342, + "time_per_iteration": 2.7994654178619385 + }, + { + "auxiliary_loss_clip": 0.01039225, + "auxiliary_loss_mlp": 0.01029291, + "balance_loss_clip": 1.02637815, + "balance_loss_mlp": 1.01720285, + "epoch": 0.5617315496768375, + "flos": 18807783408000.0, + "grad_norm": 2.0451560977356626, + "language_loss": 0.76449239, + "learning_rate": 1.6983490863613882e-06, + "loss": 0.78517759, + "num_input_tokens_seen": 201231415, + "step": 9343, + "time_per_iteration": 2.690088987350464 + }, + { + "auxiliary_loss_clip": 0.01028983, + "auxiliary_loss_mlp": 0.0103603, + "balance_loss_clip": 1.02736938, + "balance_loss_mlp": 1.02451444, + "epoch": 0.5617916729295055, + "flos": 18369314087040.0, + "grad_norm": 1.6685067987948803, + "language_loss": 0.68893629, + "learning_rate": 1.6979640856841442e-06, + "loss": 0.70958638, + "num_input_tokens_seen": 201249625, + "step": 9344, + "time_per_iteration": 2.816507339477539 + }, + { + "auxiliary_loss_clip": 0.01066538, + "auxiliary_loss_mlp": 0.01035115, + "balance_loss_clip": 1.025635, + "balance_loss_mlp": 1.02303863, + "epoch": 0.5618517961821734, + "flos": 28179964892160.0, + "grad_norm": 2.073122930135853, + "language_loss": 0.66676772, + "learning_rate": 1.6975790964601318e-06, + "loss": 0.68778419, + "num_input_tokens_seen": 201271205, + "step": 9345, + "time_per_iteration": 2.7211103439331055 + }, + { + "auxiliary_loss_clip": 0.01047419, + "auxiliary_loss_mlp": 0.01028414, + "balance_loss_clip": 1.02673113, + "balance_loss_mlp": 1.01767349, + "epoch": 0.5619119194348414, + "flos": 15486872411520.0, + "grad_norm": 2.4302804901821182, + "language_loss": 0.8764025, + "learning_rate": 1.6971941187039512e-06, + "loss": 0.89716083, + "num_input_tokens_seen": 201287700, + "step": 9346, + "time_per_iteration": 2.654902935028076 + }, + { + "auxiliary_loss_clip": 0.01039903, + "auxiliary_loss_mlp": 0.01030851, + "balance_loss_clip": 1.02461267, + "balance_loss_mlp": 1.01882327, + "epoch": 0.5619720426875094, + "flos": 29128652951040.0, + "grad_norm": 2.3395219760746135, + "language_loss": 0.59334254, + "learning_rate": 1.6968091524301993e-06, + "loss": 0.61405015, + "num_input_tokens_seen": 201307530, + "step": 9347, + "time_per_iteration": 2.7076804637908936 + }, + { + "auxiliary_loss_clip": 0.0105923, + "auxiliary_loss_mlp": 0.01032133, + "balance_loss_clip": 1.02683175, + "balance_loss_mlp": 1.01911557, + "epoch": 0.5620321659401774, + "flos": 18003743418240.0, + "grad_norm": 2.224110329901913, + "language_loss": 0.68883747, + "learning_rate": 1.6964241976534745e-06, + "loss": 0.70975113, + "num_input_tokens_seen": 201326210, + "step": 9348, + "time_per_iteration": 4.1927478313446045 + }, + { + "auxiliary_loss_clip": 0.01024245, + "auxiliary_loss_mlp": 0.01024866, + "balance_loss_clip": 1.0227139, + "balance_loss_mlp": 1.01180112, + "epoch": 0.5620922891928454, + "flos": 20594518657920.0, + "grad_norm": 1.9614933857121695, + "language_loss": 0.79473466, + "learning_rate": 1.6960392543883754e-06, + "loss": 0.81522584, + "num_input_tokens_seen": 201346120, + "step": 9349, + "time_per_iteration": 2.735153913497925 + }, + { + "auxiliary_loss_clip": 0.01014366, + "auxiliary_loss_mlp": 0.0103244, + "balance_loss_clip": 1.02438688, + "balance_loss_mlp": 1.01997638, + "epoch": 0.5621524124455133, + "flos": 26287006147200.0, + "grad_norm": 1.8743955430088957, + "language_loss": 0.67149693, + "learning_rate": 1.6956543226494975e-06, + "loss": 0.69196498, + "num_input_tokens_seen": 201365700, + "step": 9350, + "time_per_iteration": 2.75602388381958 + }, + { + "auxiliary_loss_clip": 0.01016222, + "auxiliary_loss_mlp": 0.01036003, + "balance_loss_clip": 1.02464592, + "balance_loss_mlp": 1.02339053, + "epoch": 0.5622125356981813, + "flos": 12750299867520.0, + "grad_norm": 1.842218508916537, + "language_loss": 0.78638196, + "learning_rate": 1.6952694024514381e-06, + "loss": 0.8069042, + "num_input_tokens_seen": 201382795, + "step": 9351, + "time_per_iteration": 2.663404703140259 + }, + { + "auxiliary_loss_clip": 0.01043009, + "auxiliary_loss_mlp": 0.00747703, + "balance_loss_clip": 1.02391899, + "balance_loss_mlp": 1.00047004, + "epoch": 0.5622726589508492, + "flos": 23805327490560.0, + "grad_norm": 1.6850259470367186, + "language_loss": 0.58983743, + "learning_rate": 1.6948844938087945e-06, + "loss": 0.60774451, + "num_input_tokens_seen": 201402780, + "step": 9352, + "time_per_iteration": 2.695796489715576 + }, + { + "auxiliary_loss_clip": 0.01052399, + "auxiliary_loss_mlp": 0.01029665, + "balance_loss_clip": 1.02480233, + "balance_loss_mlp": 1.0189724, + "epoch": 0.5623327822035172, + "flos": 24718212668160.0, + "grad_norm": 1.2914343525314176, + "language_loss": 0.71858883, + "learning_rate": 1.6944995967361604e-06, + "loss": 0.73940945, + "num_input_tokens_seen": 201424140, + "step": 9353, + "time_per_iteration": 2.6186792850494385 + }, + { + "auxiliary_loss_clip": 0.01048412, + "auxiliary_loss_mlp": 0.01026681, + "balance_loss_clip": 1.02643025, + "balance_loss_mlp": 1.01484358, + "epoch": 0.5623929054561851, + "flos": 14019274523520.0, + "grad_norm": 2.469029784981588, + "language_loss": 0.75839448, + "learning_rate": 1.6941147112481327e-06, + "loss": 0.77914536, + "num_input_tokens_seen": 201439645, + "step": 9354, + "time_per_iteration": 4.275767087936401 + }, + { + "auxiliary_loss_clip": 0.01039188, + "auxiliary_loss_mlp": 0.0103196, + "balance_loss_clip": 1.0260005, + "balance_loss_mlp": 1.02020597, + "epoch": 0.5624530287088532, + "flos": 20704405340160.0, + "grad_norm": 1.955562208474424, + "language_loss": 0.72950196, + "learning_rate": 1.6937298373593056e-06, + "loss": 0.7502135, + "num_input_tokens_seen": 201459970, + "step": 9355, + "time_per_iteration": 2.7360613346099854 + }, + { + "auxiliary_loss_clip": 0.01057299, + "auxiliary_loss_mlp": 0.01028264, + "balance_loss_clip": 1.02640748, + "balance_loss_mlp": 1.0168736, + "epoch": 0.5625131519615211, + "flos": 21470918595840.0, + "grad_norm": 1.4499351699953391, + "language_loss": 0.73730993, + "learning_rate": 1.693344975084274e-06, + "loss": 0.7581656, + "num_input_tokens_seen": 201480055, + "step": 9356, + "time_per_iteration": 2.5687923431396484 + }, + { + "auxiliary_loss_clip": 0.01067394, + "auxiliary_loss_mlp": 0.01032884, + "balance_loss_clip": 1.02760482, + "balance_loss_mlp": 1.02151787, + "epoch": 0.5625732752141891, + "flos": 18698004466560.0, + "grad_norm": 1.9666226290361197, + "language_loss": 0.83507794, + "learning_rate": 1.6929601244376318e-06, + "loss": 0.85608071, + "num_input_tokens_seen": 201497645, + "step": 9357, + "time_per_iteration": 2.496483087539673 + }, + { + "auxiliary_loss_clip": 0.01055511, + "auxiliary_loss_mlp": 0.01029749, + "balance_loss_clip": 1.02440786, + "balance_loss_mlp": 1.01851368, + "epoch": 0.562633398466857, + "flos": 16216900427520.0, + "grad_norm": 2.040119288554703, + "language_loss": 0.72062361, + "learning_rate": 1.6925752854339722e-06, + "loss": 0.74147624, + "num_input_tokens_seen": 201515455, + "step": 9358, + "time_per_iteration": 2.5268194675445557 + }, + { + "auxiliary_loss_clip": 0.01065675, + "auxiliary_loss_mlp": 0.01038164, + "balance_loss_clip": 1.02582693, + "balance_loss_mlp": 1.0266012, + "epoch": 0.562693521719525, + "flos": 22491930689280.0, + "grad_norm": 3.1908392638661924, + "language_loss": 0.77928984, + "learning_rate": 1.6921904580878885e-06, + "loss": 0.8003282, + "num_input_tokens_seen": 201534500, + "step": 9359, + "time_per_iteration": 2.5938222408294678 + }, + { + "auxiliary_loss_clip": 0.01045023, + "auxiliary_loss_mlp": 0.01029554, + "balance_loss_clip": 1.024225, + "balance_loss_mlp": 1.01862204, + "epoch": 0.562753644972193, + "flos": 25331171281920.0, + "grad_norm": 1.8005579433725456, + "language_loss": 0.70584756, + "learning_rate": 1.6918056424139736e-06, + "loss": 0.72659332, + "num_input_tokens_seen": 201553280, + "step": 9360, + "time_per_iteration": 2.6782796382904053 + }, + { + "auxiliary_loss_clip": 0.00962185, + "auxiliary_loss_mlp": 0.01005608, + "balance_loss_clip": 1.00467634, + "balance_loss_mlp": 1.0042311, + "epoch": 0.562813768224861, + "flos": 67392622126080.0, + "grad_norm": 0.7837305910092562, + "language_loss": 0.55602574, + "learning_rate": 1.6914208384268197e-06, + "loss": 0.57570362, + "num_input_tokens_seen": 201610030, + "step": 9361, + "time_per_iteration": 3.1984059810638428 + }, + { + "auxiliary_loss_clip": 0.01045038, + "auxiliary_loss_mlp": 0.01034014, + "balance_loss_clip": 1.0262444, + "balance_loss_mlp": 1.02353513, + "epoch": 0.562873891477529, + "flos": 23331163029120.0, + "grad_norm": 1.4829249230666486, + "language_loss": 0.81860614, + "learning_rate": 1.691036046141018e-06, + "loss": 0.83939672, + "num_input_tokens_seen": 201628370, + "step": 9362, + "time_per_iteration": 2.6050400733947754 + }, + { + "auxiliary_loss_clip": 0.01034902, + "auxiliary_loss_mlp": 0.00747597, + "balance_loss_clip": 1.02469981, + "balance_loss_mlp": 1.00041533, + "epoch": 0.5629340147301969, + "flos": 38472824805120.0, + "grad_norm": 1.5741731533442245, + "language_loss": 0.74633265, + "learning_rate": 1.6906512655711614e-06, + "loss": 0.76415759, + "num_input_tokens_seen": 201649790, + "step": 9363, + "time_per_iteration": 2.8649234771728516 + }, + { + "auxiliary_loss_clip": 0.01057221, + "auxiliary_loss_mlp": 0.0103264, + "balance_loss_clip": 1.02478325, + "balance_loss_mlp": 1.0206778, + "epoch": 0.5629941379828649, + "flos": 29242023252480.0, + "grad_norm": 1.758116378956645, + "language_loss": 0.82779658, + "learning_rate": 1.690266496731839e-06, + "loss": 0.84869516, + "num_input_tokens_seen": 201669175, + "step": 9364, + "time_per_iteration": 2.883665084838867 + }, + { + "auxiliary_loss_clip": 0.01025293, + "auxiliary_loss_mlp": 0.01032124, + "balance_loss_clip": 1.02323294, + "balance_loss_mlp": 1.02172327, + "epoch": 0.5630542612355328, + "flos": 19420885676160.0, + "grad_norm": 2.040914814190168, + "language_loss": 0.64951658, + "learning_rate": 1.689881739637642e-06, + "loss": 0.67009079, + "num_input_tokens_seen": 201687000, + "step": 9365, + "time_per_iteration": 3.03125 + }, + { + "auxiliary_loss_clip": 0.0105381, + "auxiliary_loss_mlp": 0.01035535, + "balance_loss_clip": 1.0284301, + "balance_loss_mlp": 1.02287483, + "epoch": 0.5631143844882008, + "flos": 22266303408000.0, + "grad_norm": 4.074276516332284, + "language_loss": 0.81836617, + "learning_rate": 1.6894969943031611e-06, + "loss": 0.83925962, + "num_input_tokens_seen": 201703335, + "step": 9366, + "time_per_iteration": 2.8808722496032715 + }, + { + "auxiliary_loss_clip": 0.0106723, + "auxiliary_loss_mlp": 0.01028371, + "balance_loss_clip": 1.02763748, + "balance_loss_mlp": 1.01794004, + "epoch": 0.5631745077408687, + "flos": 22965305051520.0, + "grad_norm": 1.7094231619338114, + "language_loss": 0.73044145, + "learning_rate": 1.6891122607429845e-06, + "loss": 0.75139749, + "num_input_tokens_seen": 201723495, + "step": 9367, + "time_per_iteration": 2.663409471511841 + }, + { + "auxiliary_loss_clip": 0.00990197, + "auxiliary_loss_mlp": 0.01001769, + "balance_loss_clip": 1.00377679, + "balance_loss_mlp": 1.00070858, + "epoch": 0.5632346309935368, + "flos": 65080515576960.0, + "grad_norm": 0.6165782212361233, + "language_loss": 0.534899, + "learning_rate": 1.6887275389717028e-06, + "loss": 0.55481869, + "num_input_tokens_seen": 201792615, + "step": 9368, + "time_per_iteration": 3.2661445140838623 + }, + { + "auxiliary_loss_clip": 0.01067417, + "auxiliary_loss_mlp": 0.01033029, + "balance_loss_clip": 1.02723813, + "balance_loss_mlp": 1.02188337, + "epoch": 0.5632947542462047, + "flos": 23002903612800.0, + "grad_norm": 7.902803662307048, + "language_loss": 0.68727791, + "learning_rate": 1.6883428290039046e-06, + "loss": 0.70828241, + "num_input_tokens_seen": 201812520, + "step": 9369, + "time_per_iteration": 2.4942288398742676 + }, + { + "auxiliary_loss_clip": 0.0102811, + "auxiliary_loss_mlp": 0.01033508, + "balance_loss_clip": 1.02154684, + "balance_loss_mlp": 1.02173603, + "epoch": 0.5633548774988727, + "flos": 30482593228800.0, + "grad_norm": 1.6565248431210586, + "language_loss": 0.75637227, + "learning_rate": 1.6879581308541763e-06, + "loss": 0.77698851, + "num_input_tokens_seen": 201834185, + "step": 9370, + "time_per_iteration": 2.6831748485565186 + }, + { + "auxiliary_loss_clip": 0.01043167, + "auxiliary_loss_mlp": 0.01031906, + "balance_loss_clip": 1.02559185, + "balance_loss_mlp": 1.01986575, + "epoch": 0.5634150007515406, + "flos": 18515039564160.0, + "grad_norm": 2.902939857793512, + "language_loss": 0.75405598, + "learning_rate": 1.687573444537108e-06, + "loss": 0.77480674, + "num_input_tokens_seen": 201851305, + "step": 9371, + "time_per_iteration": 2.546858549118042 + }, + { + "auxiliary_loss_clip": 0.01053696, + "auxiliary_loss_mlp": 0.01034265, + "balance_loss_clip": 1.02450609, + "balance_loss_mlp": 1.02364314, + "epoch": 0.5634751240042086, + "flos": 19244672530560.0, + "grad_norm": 1.6978960351108967, + "language_loss": 0.7560252, + "learning_rate": 1.687188770067285e-06, + "loss": 0.77690476, + "num_input_tokens_seen": 201870350, + "step": 9372, + "time_per_iteration": 2.6111016273498535 + }, + { + "auxiliary_loss_clip": 0.01040059, + "auxiliary_loss_mlp": 0.01031406, + "balance_loss_clip": 1.0243125, + "balance_loss_mlp": 1.02021861, + "epoch": 0.5635352472568766, + "flos": 12020630987520.0, + "grad_norm": 1.831800289430379, + "language_loss": 0.71584153, + "learning_rate": 1.6868041074592956e-06, + "loss": 0.73655617, + "num_input_tokens_seen": 201886800, + "step": 9373, + "time_per_iteration": 2.5407423973083496 + }, + { + "auxiliary_loss_clip": 0.01036294, + "auxiliary_loss_mlp": 0.01032666, + "balance_loss_clip": 1.02586651, + "balance_loss_mlp": 1.02063227, + "epoch": 0.5635953705095446, + "flos": 21871645701120.0, + "grad_norm": 2.3201171867443375, + "language_loss": 0.82835108, + "learning_rate": 1.6864194567277264e-06, + "loss": 0.84904075, + "num_input_tokens_seen": 201904730, + "step": 9374, + "time_per_iteration": 2.640533447265625 + }, + { + "auxiliary_loss_clip": 0.01051408, + "auxiliary_loss_mlp": 0.0102551, + "balance_loss_clip": 1.02264392, + "balance_loss_mlp": 1.01483464, + "epoch": 0.5636554937622126, + "flos": 27126166659840.0, + "grad_norm": 1.8760653525925144, + "language_loss": 0.66845489, + "learning_rate": 1.6860348178871618e-06, + "loss": 0.68922406, + "num_input_tokens_seen": 201924850, + "step": 9375, + "time_per_iteration": 2.6192314624786377 + }, + { + "auxiliary_loss_clip": 0.01041776, + "auxiliary_loss_mlp": 0.00747618, + "balance_loss_clip": 1.0289191, + "balance_loss_mlp": 1.00045276, + "epoch": 0.5637156170148805, + "flos": 12926405272320.0, + "grad_norm": 2.1284191182477104, + "language_loss": 0.81134313, + "learning_rate": 1.6856501909521889e-06, + "loss": 0.82923704, + "num_input_tokens_seen": 201939500, + "step": 9376, + "time_per_iteration": 2.6504902839660645 + }, + { + "auxiliary_loss_clip": 0.01047492, + "auxiliary_loss_mlp": 0.01032003, + "balance_loss_clip": 1.02426064, + "balance_loss_mlp": 1.01978397, + "epoch": 0.5637757402675485, + "flos": 45551033130240.0, + "grad_norm": 1.383080231987003, + "language_loss": 0.69232869, + "learning_rate": 1.6852655759373925e-06, + "loss": 0.71312368, + "num_input_tokens_seen": 201963000, + "step": 9377, + "time_per_iteration": 2.8242716789245605 + }, + { + "auxiliary_loss_clip": 0.01019077, + "auxiliary_loss_mlp": 0.01027435, + "balance_loss_clip": 1.02249599, + "balance_loss_mlp": 1.01650381, + "epoch": 0.5638358635202164, + "flos": 20886041439360.0, + "grad_norm": 1.393108619487279, + "language_loss": 0.7468031, + "learning_rate": 1.6848809728573565e-06, + "loss": 0.76726824, + "num_input_tokens_seen": 201983145, + "step": 9378, + "time_per_iteration": 2.8202896118164062 + }, + { + "auxiliary_loss_clip": 0.01069821, + "auxiliary_loss_mlp": 0.0103506, + "balance_loss_clip": 1.02554154, + "balance_loss_mlp": 1.02264476, + "epoch": 0.5638959867728844, + "flos": 18806562345600.0, + "grad_norm": 4.553094530479648, + "language_loss": 0.81880033, + "learning_rate": 1.6844963817266656e-06, + "loss": 0.83984911, + "num_input_tokens_seen": 202000335, + "step": 9379, + "time_per_iteration": 2.690945625305176 + }, + { + "auxiliary_loss_clip": 0.01040834, + "auxiliary_loss_mlp": 0.01030223, + "balance_loss_clip": 1.02325141, + "balance_loss_mlp": 1.01889205, + "epoch": 0.5639561100255523, + "flos": 27490336698240.0, + "grad_norm": 3.5487790696046284, + "language_loss": 0.71382809, + "learning_rate": 1.6841118025599042e-06, + "loss": 0.73453867, + "num_input_tokens_seen": 202018275, + "step": 9380, + "time_per_iteration": 2.7364988327026367 + }, + { + "auxiliary_loss_clip": 0.01034152, + "auxiliary_loss_mlp": 0.01033798, + "balance_loss_clip": 1.0307889, + "balance_loss_mlp": 1.02212787, + "epoch": 0.5640162332782204, + "flos": 18076570243200.0, + "grad_norm": 1.9886787787768754, + "language_loss": 0.74205673, + "learning_rate": 1.6837272353716542e-06, + "loss": 0.7627362, + "num_input_tokens_seen": 202034330, + "step": 9381, + "time_per_iteration": 2.664071798324585 + }, + { + "auxiliary_loss_clip": 0.01009703, + "auxiliary_loss_mlp": 0.01033236, + "balance_loss_clip": 1.01973343, + "balance_loss_mlp": 1.02151752, + "epoch": 0.5640763565308883, + "flos": 20884856290560.0, + "grad_norm": 2.238258110882435, + "language_loss": 0.72015345, + "learning_rate": 1.683342680176499e-06, + "loss": 0.74058282, + "num_input_tokens_seen": 202053100, + "step": 9382, + "time_per_iteration": 2.6801962852478027 + }, + { + "auxiliary_loss_clip": 0.01007717, + "auxiliary_loss_mlp": 0.01001361, + "balance_loss_clip": 1.00154781, + "balance_loss_mlp": 1.00031221, + "epoch": 0.5641364797835563, + "flos": 64447912224000.0, + "grad_norm": 0.7105392863289135, + "language_loss": 0.54424143, + "learning_rate": 1.682958136989022e-06, + "loss": 0.56433225, + "num_input_tokens_seen": 202120125, + "step": 9383, + "time_per_iteration": 3.298630952835083 + }, + { + "auxiliary_loss_clip": 0.01050927, + "auxiliary_loss_mlp": 0.01030237, + "balance_loss_clip": 1.02420235, + "balance_loss_mlp": 1.01800609, + "epoch": 0.5641966030362242, + "flos": 18660944609280.0, + "grad_norm": 1.6694906120355095, + "language_loss": 0.70461667, + "learning_rate": 1.6825736058238033e-06, + "loss": 0.72542828, + "num_input_tokens_seen": 202138030, + "step": 9384, + "time_per_iteration": 2.565993070602417 + }, + { + "auxiliary_loss_clip": 0.01046856, + "auxiliary_loss_mlp": 0.01030811, + "balance_loss_clip": 1.02610588, + "balance_loss_mlp": 1.01933074, + "epoch": 0.5642567262888922, + "flos": 22492325738880.0, + "grad_norm": 1.9408297185074916, + "language_loss": 0.75967038, + "learning_rate": 1.6821890866954263e-06, + "loss": 0.78044701, + "num_input_tokens_seen": 202155580, + "step": 9385, + "time_per_iteration": 4.300196647644043 + }, + { + "auxiliary_loss_clip": 0.01048826, + "auxiliary_loss_mlp": 0.01030963, + "balance_loss_clip": 1.02191305, + "balance_loss_mlp": 1.01969743, + "epoch": 0.5643168495415603, + "flos": 13003972692480.0, + "grad_norm": 2.6063687097648596, + "language_loss": 0.81951904, + "learning_rate": 1.6818045796184703e-06, + "loss": 0.84031689, + "num_input_tokens_seen": 202170365, + "step": 9386, + "time_per_iteration": 2.5667240619659424 + }, + { + "auxiliary_loss_clip": 0.01053083, + "auxiliary_loss_mlp": 0.0103334, + "balance_loss_clip": 1.02684045, + "balance_loss_mlp": 1.02125812, + "epoch": 0.5643769727942282, + "flos": 18588297352320.0, + "grad_norm": 1.9278275739015556, + "language_loss": 0.69749427, + "learning_rate": 1.681420084607516e-06, + "loss": 0.71835852, + "num_input_tokens_seen": 202189095, + "step": 9387, + "time_per_iteration": 2.6643707752227783 + }, + { + "auxiliary_loss_clip": 0.01059126, + "auxiliary_loss_mlp": 0.01033952, + "balance_loss_clip": 1.02703047, + "balance_loss_mlp": 1.02294278, + "epoch": 0.5644370960468962, + "flos": 33806269572480.0, + "grad_norm": 3.0585313725209295, + "language_loss": 0.74771416, + "learning_rate": 1.6810356016771452e-06, + "loss": 0.76864493, + "num_input_tokens_seen": 202213500, + "step": 9388, + "time_per_iteration": 4.314350843429565 + }, + { + "auxiliary_loss_clip": 0.01052747, + "auxiliary_loss_mlp": 0.01029786, + "balance_loss_clip": 1.02479219, + "balance_loss_mlp": 1.02014828, + "epoch": 0.5644972192995641, + "flos": 21214911386880.0, + "grad_norm": 1.568573595455805, + "language_loss": 0.82271063, + "learning_rate": 1.6806511308419353e-06, + "loss": 0.84353602, + "num_input_tokens_seen": 202231920, + "step": 9389, + "time_per_iteration": 2.6489176750183105 + }, + { + "auxiliary_loss_clip": 0.0102936, + "auxiliary_loss_mlp": 0.01032608, + "balance_loss_clip": 1.02179027, + "balance_loss_mlp": 1.02014494, + "epoch": 0.5645573425522321, + "flos": 18587722734720.0, + "grad_norm": 2.3688856646065206, + "language_loss": 0.63849854, + "learning_rate": 1.680266672116467e-06, + "loss": 0.65911829, + "num_input_tokens_seen": 202247600, + "step": 9390, + "time_per_iteration": 2.621819496154785 + }, + { + "auxiliary_loss_clip": 0.01047157, + "auxiliary_loss_mlp": 0.01029293, + "balance_loss_clip": 1.02767038, + "balance_loss_mlp": 1.01918411, + "epoch": 0.5646174658049, + "flos": 18113809668480.0, + "grad_norm": 9.34581340898836, + "language_loss": 0.92002928, + "learning_rate": 1.6798822255153192e-06, + "loss": 0.94079381, + "num_input_tokens_seen": 202265350, + "step": 9391, + "time_per_iteration": 2.58583664894104 + }, + { + "auxiliary_loss_clip": 0.01062465, + "auxiliary_loss_mlp": 0.0103448, + "balance_loss_clip": 1.02726865, + "balance_loss_mlp": 1.02182603, + "epoch": 0.564677589057568, + "flos": 28329964087680.0, + "grad_norm": 1.8346758539838337, + "language_loss": 0.59692949, + "learning_rate": 1.6794977910530684e-06, + "loss": 0.61789894, + "num_input_tokens_seen": 202284285, + "step": 9392, + "time_per_iteration": 2.6027400493621826 + }, + { + "auxiliary_loss_clip": 0.01018565, + "auxiliary_loss_mlp": 0.01025221, + "balance_loss_clip": 1.02248979, + "balance_loss_mlp": 1.0135386, + "epoch": 0.564737712310236, + "flos": 22163743100160.0, + "grad_norm": 2.318742523987303, + "language_loss": 0.8114351, + "learning_rate": 1.6791133687442937e-06, + "loss": 0.83187306, + "num_input_tokens_seen": 202303450, + "step": 9393, + "time_per_iteration": 2.7278707027435303 + }, + { + "auxiliary_loss_clip": 0.01045953, + "auxiliary_loss_mlp": 0.01030725, + "balance_loss_clip": 1.02643943, + "balance_loss_mlp": 1.02002025, + "epoch": 0.564797835562904, + "flos": 20959011918720.0, + "grad_norm": 1.6310412090538933, + "language_loss": 0.86986363, + "learning_rate": 1.6787289586035725e-06, + "loss": 0.89063042, + "num_input_tokens_seen": 202322315, + "step": 9394, + "time_per_iteration": 2.7215845584869385 + }, + { + "auxiliary_loss_clip": 0.01053186, + "auxiliary_loss_mlp": 0.01029383, + "balance_loss_clip": 1.02646971, + "balance_loss_mlp": 1.01873815, + "epoch": 0.5648579588155719, + "flos": 17420302805760.0, + "grad_norm": 2.8554273174900864, + "language_loss": 0.84576881, + "learning_rate": 1.6783445606454814e-06, + "loss": 0.86659449, + "num_input_tokens_seen": 202339905, + "step": 9395, + "time_per_iteration": 4.204956769943237 + }, + { + "auxiliary_loss_clip": 0.00997342, + "auxiliary_loss_mlp": 0.01001936, + "balance_loss_clip": 1.00118613, + "balance_loss_mlp": 1.00087535, + "epoch": 0.5649180820682399, + "flos": 69929568835200.0, + "grad_norm": 1.4794038175977822, + "language_loss": 0.58310735, + "learning_rate": 1.677960174884597e-06, + "loss": 0.60310012, + "num_input_tokens_seen": 202397320, + "step": 9396, + "time_per_iteration": 3.2101330757141113 + }, + { + "auxiliary_loss_clip": 0.0104876, + "auxiliary_loss_mlp": 0.01030504, + "balance_loss_clip": 1.02595258, + "balance_loss_mlp": 1.01947165, + "epoch": 0.5649782053209078, + "flos": 24973070641920.0, + "grad_norm": 2.591299276055574, + "language_loss": 0.69928533, + "learning_rate": 1.6775758013354943e-06, + "loss": 0.72007799, + "num_input_tokens_seen": 202416865, + "step": 9397, + "time_per_iteration": 2.699143886566162 + }, + { + "auxiliary_loss_clip": 0.01041102, + "auxiliary_loss_mlp": 0.01032566, + "balance_loss_clip": 1.02790928, + "balance_loss_mlp": 1.02104485, + "epoch": 0.5650383285735758, + "flos": 21726602582400.0, + "grad_norm": 1.8307563449364328, + "language_loss": 0.67381197, + "learning_rate": 1.67719144001275e-06, + "loss": 0.69454861, + "num_input_tokens_seen": 202436210, + "step": 9398, + "time_per_iteration": 2.6634602546691895 + }, + { + "auxiliary_loss_clip": 0.00988954, + "auxiliary_loss_mlp": 0.01001508, + "balance_loss_clip": 1.00203991, + "balance_loss_mlp": 1.0003159, + "epoch": 0.5650984518262439, + "flos": 65904484636800.0, + "grad_norm": 0.7661246490377432, + "language_loss": 0.58107889, + "learning_rate": 1.6768070909309386e-06, + "loss": 0.6009835, + "num_input_tokens_seen": 202492925, + "step": 9399, + "time_per_iteration": 3.1954355239868164 + }, + { + "auxiliary_loss_clip": 0.01020017, + "auxiliary_loss_mlp": 0.01033809, + "balance_loss_clip": 1.021909, + "balance_loss_mlp": 1.0209583, + "epoch": 0.5651585750789118, + "flos": 21032592929280.0, + "grad_norm": 1.823525922080291, + "language_loss": 0.73650181, + "learning_rate": 1.6764227541046347e-06, + "loss": 0.75704002, + "num_input_tokens_seen": 202511905, + "step": 9400, + "time_per_iteration": 2.795546054840088 + }, + { + "auxiliary_loss_clip": 0.01041438, + "auxiliary_loss_mlp": 0.01032808, + "balance_loss_clip": 1.02702451, + "balance_loss_mlp": 1.02026749, + "epoch": 0.5652186983315798, + "flos": 18551919853440.0, + "grad_norm": 1.7585553604675128, + "language_loss": 0.60652971, + "learning_rate": 1.676038429548412e-06, + "loss": 0.62727225, + "num_input_tokens_seen": 202529815, + "step": 9401, + "time_per_iteration": 4.383429527282715 + }, + { + "auxiliary_loss_clip": 0.01023719, + "auxiliary_loss_mlp": 0.01026712, + "balance_loss_clip": 1.0217675, + "balance_loss_mlp": 1.01607835, + "epoch": 0.5652788215842477, + "flos": 18478662065280.0, + "grad_norm": 1.8528177730526116, + "language_loss": 0.81195927, + "learning_rate": 1.6756541172768453e-06, + "loss": 0.83246356, + "num_input_tokens_seen": 202547710, + "step": 9402, + "time_per_iteration": 2.835390567779541 + }, + { + "auxiliary_loss_clip": 0.01007795, + "auxiliary_loss_mlp": 0.01035935, + "balance_loss_clip": 1.01885116, + "balance_loss_mlp": 1.02371562, + "epoch": 0.5653389448369157, + "flos": 30044052080640.0, + "grad_norm": 1.4423172362176333, + "language_loss": 0.77566552, + "learning_rate": 1.6752698173045068e-06, + "loss": 0.79610282, + "num_input_tokens_seen": 202568835, + "step": 9403, + "time_per_iteration": 2.7917447090148926 + }, + { + "auxiliary_loss_clip": 0.01011374, + "auxiliary_loss_mlp": 0.01035024, + "balance_loss_clip": 1.02175164, + "balance_loss_mlp": 1.02241755, + "epoch": 0.5653990680895836, + "flos": 16727550128640.0, + "grad_norm": 1.5425217125219461, + "language_loss": 0.69022, + "learning_rate": 1.6748855296459685e-06, + "loss": 0.710684, + "num_input_tokens_seen": 202587385, + "step": 9404, + "time_per_iteration": 2.679079532623291 + }, + { + "auxiliary_loss_clip": 0.01038652, + "auxiliary_loss_mlp": 0.01027229, + "balance_loss_clip": 1.02714467, + "balance_loss_mlp": 1.01663172, + "epoch": 0.5654591913422516, + "flos": 14538256179840.0, + "grad_norm": 1.9286023917791644, + "language_loss": 0.67201626, + "learning_rate": 1.6745012543158045e-06, + "loss": 0.69267499, + "num_input_tokens_seen": 202604815, + "step": 9405, + "time_per_iteration": 2.5880117416381836 + }, + { + "auxiliary_loss_clip": 0.01034013, + "auxiliary_loss_mlp": 0.0103113, + "balance_loss_clip": 1.02320123, + "balance_loss_mlp": 1.02060938, + "epoch": 0.5655193145949196, + "flos": 26209905603840.0, + "grad_norm": 5.088020573059957, + "language_loss": 0.74321413, + "learning_rate": 1.6741169913285852e-06, + "loss": 0.76386553, + "num_input_tokens_seen": 202623775, + "step": 9406, + "time_per_iteration": 2.6851541996002197 + }, + { + "auxiliary_loss_clip": 0.0101543, + "auxiliary_loss_mlp": 0.01036173, + "balance_loss_clip": 1.0253377, + "balance_loss_mlp": 1.02260113, + "epoch": 0.5655794378475876, + "flos": 25046579825280.0, + "grad_norm": 2.1481001345814352, + "language_loss": 0.79558086, + "learning_rate": 1.673732740698882e-06, + "loss": 0.8160969, + "num_input_tokens_seen": 202643375, + "step": 9407, + "time_per_iteration": 2.7381975650787354 + }, + { + "auxiliary_loss_clip": 0.01027469, + "auxiliary_loss_mlp": 0.01033598, + "balance_loss_clip": 1.02316773, + "balance_loss_mlp": 1.02187395, + "epoch": 0.5656395611002555, + "flos": 31032852652800.0, + "grad_norm": 1.6106303635783006, + "language_loss": 0.70998937, + "learning_rate": 1.6733485024412666e-06, + "loss": 0.73060006, + "num_input_tokens_seen": 202668400, + "step": 9408, + "time_per_iteration": 2.7307403087615967 + }, + { + "auxiliary_loss_clip": 0.01018499, + "auxiliary_loss_mlp": 0.01031956, + "balance_loss_clip": 1.02692008, + "balance_loss_mlp": 1.02001715, + "epoch": 0.5656996843529235, + "flos": 20229522606720.0, + "grad_norm": 1.877757524200675, + "language_loss": 0.81342518, + "learning_rate": 1.672964276570308e-06, + "loss": 0.83392978, + "num_input_tokens_seen": 202685125, + "step": 9409, + "time_per_iteration": 2.820570230484009 + }, + { + "auxiliary_loss_clip": 0.01021388, + "auxiliary_loss_mlp": 0.01027729, + "balance_loss_clip": 1.02286792, + "balance_loss_mlp": 1.0161891, + "epoch": 0.5657598076055914, + "flos": 20996251344000.0, + "grad_norm": 1.5866582749066869, + "language_loss": 0.78189123, + "learning_rate": 1.6725800631005776e-06, + "loss": 0.80238235, + "num_input_tokens_seen": 202703830, + "step": 9410, + "time_per_iteration": 2.731595993041992 + }, + { + "auxiliary_loss_clip": 0.01068435, + "auxiliary_loss_mlp": 0.01033626, + "balance_loss_clip": 1.02702832, + "balance_loss_mlp": 1.02245593, + "epoch": 0.5658199308582594, + "flos": 11545999649280.0, + "grad_norm": 2.101825729882577, + "language_loss": 0.83247197, + "learning_rate": 1.6721958620466432e-06, + "loss": 0.85349262, + "num_input_tokens_seen": 202719835, + "step": 9411, + "time_per_iteration": 2.6028778553009033 + }, + { + "auxiliary_loss_clip": 0.01059418, + "auxiliary_loss_mlp": 0.0102871, + "balance_loss_clip": 1.02563787, + "balance_loss_mlp": 1.01656878, + "epoch": 0.5658800541109275, + "flos": 14172146807040.0, + "grad_norm": 2.2426174311833433, + "language_loss": 0.67058599, + "learning_rate": 1.6718116734230749e-06, + "loss": 0.69146723, + "num_input_tokens_seen": 202736795, + "step": 9412, + "time_per_iteration": 2.654797077178955 + }, + { + "auxiliary_loss_clip": 0.01053015, + "auxiliary_loss_mlp": 0.01025006, + "balance_loss_clip": 1.02490163, + "balance_loss_mlp": 1.01585102, + "epoch": 0.5659401773635954, + "flos": 27305073325440.0, + "grad_norm": 2.2752839520647123, + "language_loss": 0.58024842, + "learning_rate": 1.6714274972444413e-06, + "loss": 0.60102868, + "num_input_tokens_seen": 202756900, + "step": 9413, + "time_per_iteration": 2.6445152759552 + }, + { + "auxiliary_loss_clip": 0.00989146, + "auxiliary_loss_mlp": 0.01038644, + "balance_loss_clip": 1.02074409, + "balance_loss_mlp": 1.02643108, + "epoch": 0.5660003006162634, + "flos": 16728196573440.0, + "grad_norm": 1.5023973940192508, + "language_loss": 0.69263703, + "learning_rate": 1.6710433335253092e-06, + "loss": 0.71291494, + "num_input_tokens_seen": 202775145, + "step": 9414, + "time_per_iteration": 2.7940099239349365 + }, + { + "auxiliary_loss_clip": 0.01010576, + "auxiliary_loss_mlp": 0.01025212, + "balance_loss_clip": 1.03133237, + "balance_loss_mlp": 1.0150615, + "epoch": 0.5660604238689313, + "flos": 21653452535040.0, + "grad_norm": 1.6359707232577598, + "language_loss": 0.7825368, + "learning_rate": 1.670659182280247e-06, + "loss": 0.80289471, + "num_input_tokens_seen": 202794505, + "step": 9415, + "time_per_iteration": 2.804058790206909 + }, + { + "auxiliary_loss_clip": 0.00989766, + "auxiliary_loss_mlp": 0.01003428, + "balance_loss_clip": 1.00345898, + "balance_loss_mlp": 1.00224137, + "epoch": 0.5661205471215993, + "flos": 68824022083200.0, + "grad_norm": 0.6846034322192438, + "language_loss": 0.49165747, + "learning_rate": 1.670275043523822e-06, + "loss": 0.51158941, + "num_input_tokens_seen": 202858580, + "step": 9416, + "time_per_iteration": 3.479126214981079 + }, + { + "auxiliary_loss_clip": 0.01055139, + "auxiliary_loss_mlp": 0.00747613, + "balance_loss_clip": 1.02535546, + "balance_loss_mlp": 1.00053501, + "epoch": 0.5661806703742672, + "flos": 28621774177920.0, + "grad_norm": 2.4534691837789477, + "language_loss": 0.63059205, + "learning_rate": 1.6698909172706e-06, + "loss": 0.64861959, + "num_input_tokens_seen": 202878565, + "step": 9417, + "time_per_iteration": 2.61301326751709 + }, + { + "auxiliary_loss_clip": 0.01046568, + "auxiliary_loss_mlp": 0.01027217, + "balance_loss_clip": 1.0247165, + "balance_loss_mlp": 1.01615453, + "epoch": 0.5662407936269352, + "flos": 21397948116480.0, + "grad_norm": 1.8225445797888382, + "language_loss": 0.68851864, + "learning_rate": 1.6695068035351479e-06, + "loss": 0.70925653, + "num_input_tokens_seen": 202897350, + "step": 9418, + "time_per_iteration": 2.7105531692504883 + }, + { + "auxiliary_loss_clip": 0.01051206, + "auxiliary_loss_mlp": 0.01030384, + "balance_loss_clip": 1.02261806, + "balance_loss_mlp": 1.01788521, + "epoch": 0.5663009168796032, + "flos": 25660005315840.0, + "grad_norm": 1.9374248277045583, + "language_loss": 0.64278603, + "learning_rate": 1.6691227023320304e-06, + "loss": 0.66360193, + "num_input_tokens_seen": 202916745, + "step": 9419, + "time_per_iteration": 2.70723032951355 + }, + { + "auxiliary_loss_clip": 0.00963938, + "auxiliary_loss_mlp": 0.01004946, + "balance_loss_clip": 1.01578975, + "balance_loss_mlp": 1.00370026, + "epoch": 0.5663610401322712, + "flos": 67930458422400.0, + "grad_norm": 0.7658613827564781, + "language_loss": 0.5972048, + "learning_rate": 1.6687386136758135e-06, + "loss": 0.61689365, + "num_input_tokens_seen": 202982375, + "step": 9420, + "time_per_iteration": 3.485581874847412 + }, + { + "auxiliary_loss_clip": 0.01043591, + "auxiliary_loss_mlp": 0.00747359, + "balance_loss_clip": 1.0241847, + "balance_loss_mlp": 1.00042355, + "epoch": 0.5664211633849391, + "flos": 24609367480320.0, + "grad_norm": 1.8588133159115066, + "language_loss": 0.74254084, + "learning_rate": 1.6683545375810618e-06, + "loss": 0.76045036, + "num_input_tokens_seen": 203002430, + "step": 9421, + "time_per_iteration": 3.0739872455596924 + }, + { + "auxiliary_loss_clip": 0.01035, + "auxiliary_loss_mlp": 0.01031452, + "balance_loss_clip": 1.02439928, + "balance_loss_mlp": 1.02018106, + "epoch": 0.5664812866376071, + "flos": 11648811352320.0, + "grad_norm": 1.9068104936688883, + "language_loss": 0.72808599, + "learning_rate": 1.6679704740623389e-06, + "loss": 0.74875057, + "num_input_tokens_seen": 203019425, + "step": 9422, + "time_per_iteration": 2.7887964248657227 + }, + { + "auxiliary_loss_clip": 0.01053722, + "auxiliary_loss_mlp": 0.01031835, + "balance_loss_clip": 1.02667749, + "balance_loss_mlp": 1.02232838, + "epoch": 0.566541409890275, + "flos": 24643985212800.0, + "grad_norm": 1.7388537786492302, + "language_loss": 0.81614298, + "learning_rate": 1.6675864231342085e-06, + "loss": 0.83699858, + "num_input_tokens_seen": 203039035, + "step": 9423, + "time_per_iteration": 2.5996556282043457 + }, + { + "auxiliary_loss_clip": 0.01037705, + "auxiliary_loss_mlp": 0.0102947, + "balance_loss_clip": 1.02469611, + "balance_loss_mlp": 1.01783562, + "epoch": 0.566601533142943, + "flos": 22270577126400.0, + "grad_norm": 2.174483436488918, + "language_loss": 0.80994391, + "learning_rate": 1.6672023848112353e-06, + "loss": 0.83061564, + "num_input_tokens_seen": 203059320, + "step": 9424, + "time_per_iteration": 2.608132839202881 + }, + { + "auxiliary_loss_clip": 0.01069986, + "auxiliary_loss_mlp": 0.00747591, + "balance_loss_clip": 1.02734268, + "balance_loss_mlp": 1.00045455, + "epoch": 0.5666616563956111, + "flos": 29971656218880.0, + "grad_norm": 1.8305604227726795, + "language_loss": 0.78836834, + "learning_rate": 1.6668183591079805e-06, + "loss": 0.80654418, + "num_input_tokens_seen": 203078490, + "step": 9425, + "time_per_iteration": 2.645615816116333 + }, + { + "auxiliary_loss_clip": 0.0104976, + "auxiliary_loss_mlp": 0.010332, + "balance_loss_clip": 1.02945471, + "balance_loss_mlp": 1.0218035, + "epoch": 0.566721779648279, + "flos": 17781456101760.0, + "grad_norm": 1.8840378121612535, + "language_loss": 0.59039944, + "learning_rate": 1.6664343460390064e-06, + "loss": 0.61122906, + "num_input_tokens_seen": 203096065, + "step": 9426, + "time_per_iteration": 2.660953998565674 + }, + { + "auxiliary_loss_clip": 0.01058215, + "auxiliary_loss_mlp": 0.01026591, + "balance_loss_clip": 1.02574241, + "balance_loss_mlp": 1.01596296, + "epoch": 0.566781902900947, + "flos": 21033490769280.0, + "grad_norm": 1.7349511503625443, + "language_loss": 0.81834519, + "learning_rate": 1.6660503456188764e-06, + "loss": 0.83919322, + "num_input_tokens_seen": 203115270, + "step": 9427, + "time_per_iteration": 2.643024206161499 + }, + { + "auxiliary_loss_clip": 0.01064682, + "auxiliary_loss_mlp": 0.01029791, + "balance_loss_clip": 1.02659464, + "balance_loss_mlp": 1.01899624, + "epoch": 0.5668420261536149, + "flos": 23148593176320.0, + "grad_norm": 2.0380894753052448, + "language_loss": 0.86147803, + "learning_rate": 1.6656663578621498e-06, + "loss": 0.88242269, + "num_input_tokens_seen": 203134290, + "step": 9428, + "time_per_iteration": 2.5299508571624756 + }, + { + "auxiliary_loss_clip": 0.01043842, + "auxiliary_loss_mlp": 0.01032835, + "balance_loss_clip": 1.0264051, + "balance_loss_mlp": 1.02086031, + "epoch": 0.5669021494062829, + "flos": 22601601889920.0, + "grad_norm": 3.780731400756615, + "language_loss": 0.73440778, + "learning_rate": 1.6652823827833886e-06, + "loss": 0.75517452, + "num_input_tokens_seen": 203152935, + "step": 9429, + "time_per_iteration": 2.600595474243164 + }, + { + "auxiliary_loss_clip": 0.01049453, + "auxiliary_loss_mlp": 0.00747552, + "balance_loss_clip": 1.0263741, + "balance_loss_mlp": 1.00043571, + "epoch": 0.5669622726589508, + "flos": 17381231786880.0, + "grad_norm": 2.468364104773643, + "language_loss": 0.75305295, + "learning_rate": 1.6648984203971538e-06, + "loss": 0.77102304, + "num_input_tokens_seen": 203170110, + "step": 9430, + "time_per_iteration": 2.613452196121216 + }, + { + "auxiliary_loss_clip": 0.01067283, + "auxiliary_loss_mlp": 0.01030142, + "balance_loss_clip": 1.02614069, + "balance_loss_mlp": 1.01921642, + "epoch": 0.5670223959116188, + "flos": 18763253521920.0, + "grad_norm": 1.8034347717301293, + "language_loss": 0.724168, + "learning_rate": 1.6645144707180032e-06, + "loss": 0.74514228, + "num_input_tokens_seen": 203188825, + "step": 9431, + "time_per_iteration": 2.5306813716888428 + }, + { + "auxiliary_loss_clip": 0.01017718, + "auxiliary_loss_mlp": 0.01028835, + "balance_loss_clip": 1.02426887, + "balance_loss_mlp": 1.0186193, + "epoch": 0.5670825191642868, + "flos": 13553334276480.0, + "grad_norm": 1.821224062184977, + "language_loss": 0.73447829, + "learning_rate": 1.6641305337604984e-06, + "loss": 0.75494385, + "num_input_tokens_seen": 203206860, + "step": 9432, + "time_per_iteration": 4.525464773178101 + }, + { + "auxiliary_loss_clip": 0.01009184, + "auxiliary_loss_mlp": 0.01030265, + "balance_loss_clip": 1.02116406, + "balance_loss_mlp": 1.01988196, + "epoch": 0.5671426424169548, + "flos": 22054035985920.0, + "grad_norm": 2.6416955208795123, + "language_loss": 0.77617967, + "learning_rate": 1.663746609539197e-06, + "loss": 0.79657418, + "num_input_tokens_seen": 203225625, + "step": 9433, + "time_per_iteration": 2.8820173740386963 + }, + { + "auxiliary_loss_clip": 0.01069849, + "auxiliary_loss_mlp": 0.01031838, + "balance_loss_clip": 1.02651739, + "balance_loss_mlp": 1.01866508, + "epoch": 0.5672027656696227, + "flos": 21323972056320.0, + "grad_norm": 2.0057451111287925, + "language_loss": 0.63819826, + "learning_rate": 1.6633626980686582e-06, + "loss": 0.65921515, + "num_input_tokens_seen": 203242920, + "step": 9434, + "time_per_iteration": 2.7626194953918457 + }, + { + "auxiliary_loss_clip": 0.01054016, + "auxiliary_loss_mlp": 0.01023902, + "balance_loss_clip": 1.02431345, + "balance_loss_mlp": 1.01388884, + "epoch": 0.5672628889222907, + "flos": 23514056104320.0, + "grad_norm": 1.6499779028935009, + "language_loss": 0.66492248, + "learning_rate": 1.6629787993634399e-06, + "loss": 0.68570167, + "num_input_tokens_seen": 203261995, + "step": 9435, + "time_per_iteration": 4.430245876312256 + }, + { + "auxiliary_loss_clip": 0.01039828, + "auxiliary_loss_mlp": 0.00747514, + "balance_loss_clip": 1.02242661, + "balance_loss_mlp": 1.00039506, + "epoch": 0.5673230121749586, + "flos": 27121928855040.0, + "grad_norm": 1.510805574424587, + "language_loss": 0.71546853, + "learning_rate": 1.6625949134380984e-06, + "loss": 0.73334193, + "num_input_tokens_seen": 203280670, + "step": 9436, + "time_per_iteration": 2.744457483291626 + }, + { + "auxiliary_loss_clip": 0.01067279, + "auxiliary_loss_mlp": 0.01028838, + "balance_loss_clip": 1.02661693, + "balance_loss_mlp": 1.01798403, + "epoch": 0.5673831354276266, + "flos": 31141985149440.0, + "grad_norm": 2.4594468587794087, + "language_loss": 0.74119228, + "learning_rate": 1.6622110403071921e-06, + "loss": 0.76215345, + "num_input_tokens_seen": 203304800, + "step": 9437, + "time_per_iteration": 2.7264769077301025 + }, + { + "auxiliary_loss_clip": 0.01062391, + "auxiliary_loss_mlp": 0.01034768, + "balance_loss_clip": 1.02977633, + "balance_loss_mlp": 1.02306819, + "epoch": 0.5674432586802945, + "flos": 27673193859840.0, + "grad_norm": 2.6232299287843897, + "language_loss": 0.6098386, + "learning_rate": 1.661827179985277e-06, + "loss": 0.63081014, + "num_input_tokens_seen": 203324060, + "step": 9438, + "time_per_iteration": 2.9211924076080322 + }, + { + "auxiliary_loss_clip": 0.01044096, + "auxiliary_loss_mlp": 0.01028555, + "balance_loss_clip": 1.02381778, + "balance_loss_mlp": 1.01810658, + "epoch": 0.5675033819329626, + "flos": 26615157822720.0, + "grad_norm": 1.7543788364418318, + "language_loss": 0.74989653, + "learning_rate": 1.661443332486909e-06, + "loss": 0.77062309, + "num_input_tokens_seen": 203344360, + "step": 9439, + "time_per_iteration": 2.671294927597046 + }, + { + "auxiliary_loss_clip": 0.01047723, + "auxiliary_loss_mlp": 0.01033072, + "balance_loss_clip": 1.02727938, + "balance_loss_mlp": 1.02060258, + "epoch": 0.5675635051856306, + "flos": 19098372435840.0, + "grad_norm": 2.384998803452523, + "language_loss": 0.83627295, + "learning_rate": 1.6610594978266438e-06, + "loss": 0.85708088, + "num_input_tokens_seen": 203362115, + "step": 9440, + "time_per_iteration": 2.6907923221588135 + }, + { + "auxiliary_loss_clip": 0.01036448, + "auxiliary_loss_mlp": 0.01036687, + "balance_loss_clip": 1.02352202, + "balance_loss_mlp": 1.0246408, + "epoch": 0.5676236284382985, + "flos": 17566315591680.0, + "grad_norm": 1.8856159308930338, + "language_loss": 0.75242484, + "learning_rate": 1.6606756760190365e-06, + "loss": 0.77315617, + "num_input_tokens_seen": 203380550, + "step": 9441, + "time_per_iteration": 2.7030582427978516 + }, + { + "auxiliary_loss_clip": 0.01013939, + "auxiliary_loss_mlp": 0.01031206, + "balance_loss_clip": 1.02187622, + "balance_loss_mlp": 1.02000654, + "epoch": 0.5676837516909665, + "flos": 15954069634560.0, + "grad_norm": 1.8045128117013047, + "language_loss": 0.831671, + "learning_rate": 1.6602918670786413e-06, + "loss": 0.85212249, + "num_input_tokens_seen": 203396590, + "step": 9442, + "time_per_iteration": 2.7267768383026123 + }, + { + "auxiliary_loss_clip": 0.01032844, + "auxiliary_loss_mlp": 0.01027834, + "balance_loss_clip": 1.02635336, + "balance_loss_mlp": 1.01783252, + "epoch": 0.5677438749436344, + "flos": 18295912644480.0, + "grad_norm": 1.8069676995083934, + "language_loss": 0.745148, + "learning_rate": 1.6599080710200126e-06, + "loss": 0.76575476, + "num_input_tokens_seen": 203414280, + "step": 9443, + "time_per_iteration": 4.329670667648315 + }, + { + "auxiliary_loss_clip": 0.01040644, + "auxiliary_loss_mlp": 0.01031435, + "balance_loss_clip": 1.02676582, + "balance_loss_mlp": 1.02055693, + "epoch": 0.5678039981963025, + "flos": 17931311642880.0, + "grad_norm": 2.1512212591781963, + "language_loss": 0.77460676, + "learning_rate": 1.6595242878577046e-06, + "loss": 0.79532754, + "num_input_tokens_seen": 203433280, + "step": 9444, + "time_per_iteration": 2.6160597801208496 + }, + { + "auxiliary_loss_clip": 0.01038646, + "auxiliary_loss_mlp": 0.01039019, + "balance_loss_clip": 1.02785397, + "balance_loss_mlp": 1.02733684, + "epoch": 0.5678641214489704, + "flos": 19316350120320.0, + "grad_norm": 1.6861351463543854, + "language_loss": 0.80426073, + "learning_rate": 1.6591405176062687e-06, + "loss": 0.82503736, + "num_input_tokens_seen": 203449935, + "step": 9445, + "time_per_iteration": 2.7565128803253174 + }, + { + "auxiliary_loss_clip": 0.01062531, + "auxiliary_loss_mlp": 0.01025876, + "balance_loss_clip": 1.02274632, + "balance_loss_mlp": 1.01486135, + "epoch": 0.5679242447016384, + "flos": 27751084502400.0, + "grad_norm": 1.2439172796907505, + "language_loss": 0.70862049, + "learning_rate": 1.658756760280259e-06, + "loss": 0.72950459, + "num_input_tokens_seen": 203473025, + "step": 9446, + "time_per_iteration": 2.680203437805176 + }, + { + "auxiliary_loss_clip": 0.01029641, + "auxiliary_loss_mlp": 0.01028874, + "balance_loss_clip": 1.02272654, + "balance_loss_mlp": 1.01735222, + "epoch": 0.5679843679543063, + "flos": 23769093646080.0, + "grad_norm": 2.2162847617399377, + "language_loss": 0.73426354, + "learning_rate": 1.6583730158942276e-06, + "loss": 0.75484866, + "num_input_tokens_seen": 203492895, + "step": 9447, + "time_per_iteration": 2.701829195022583 + }, + { + "auxiliary_loss_clip": 0.01042929, + "auxiliary_loss_mlp": 0.01031827, + "balance_loss_clip": 1.0237999, + "balance_loss_mlp": 1.02090144, + "epoch": 0.5680444912069743, + "flos": 25591883172480.0, + "grad_norm": 1.744020824454026, + "language_loss": 0.75087464, + "learning_rate": 1.657989284462725e-06, + "loss": 0.77162218, + "num_input_tokens_seen": 203513710, + "step": 9448, + "time_per_iteration": 4.2676074504852295 + }, + { + "auxiliary_loss_clip": 0.01024668, + "auxiliary_loss_mlp": 0.01043079, + "balance_loss_clip": 1.0263685, + "balance_loss_mlp": 1.03080666, + "epoch": 0.5681046144596422, + "flos": 23695799944320.0, + "grad_norm": 2.064497505706944, + "language_loss": 0.75976837, + "learning_rate": 1.6576055660003038e-06, + "loss": 0.78044587, + "num_input_tokens_seen": 203531630, + "step": 9449, + "time_per_iteration": 2.7682759761810303 + }, + { + "auxiliary_loss_clip": 0.01037783, + "auxiliary_loss_mlp": 0.0103773, + "balance_loss_clip": 1.0224812, + "balance_loss_mlp": 1.02580953, + "epoch": 0.5681647377123102, + "flos": 28000770917760.0, + "grad_norm": 1.7143687552726408, + "language_loss": 0.74763346, + "learning_rate": 1.6572218605215128e-06, + "loss": 0.76838863, + "num_input_tokens_seen": 203551885, + "step": 9450, + "time_per_iteration": 2.778215169906616 + }, + { + "auxiliary_loss_clip": 0.01047758, + "auxiliary_loss_mlp": 0.01035844, + "balance_loss_clip": 1.02708101, + "balance_loss_mlp": 1.02498436, + "epoch": 0.5682248609649782, + "flos": 22747758330240.0, + "grad_norm": 2.0480640235013046, + "language_loss": 0.66324061, + "learning_rate": 1.6568381680409038e-06, + "loss": 0.68407667, + "num_input_tokens_seen": 203572250, + "step": 9451, + "time_per_iteration": 2.6321167945861816 + }, + { + "auxiliary_loss_clip": 0.01044392, + "auxiliary_loss_mlp": 0.01029772, + "balance_loss_clip": 1.02483106, + "balance_loss_mlp": 1.01698709, + "epoch": 0.5682849842176462, + "flos": 21288600138240.0, + "grad_norm": 3.0097727270990955, + "language_loss": 0.71514964, + "learning_rate": 1.656454488573026e-06, + "loss": 0.73589128, + "num_input_tokens_seen": 203590605, + "step": 9452, + "time_per_iteration": 2.627443552017212 + }, + { + "auxiliary_loss_clip": 0.01030763, + "auxiliary_loss_mlp": 0.01027792, + "balance_loss_clip": 1.02558088, + "balance_loss_mlp": 1.01696754, + "epoch": 0.5683451074703142, + "flos": 21141689512320.0, + "grad_norm": 1.4318561413586495, + "language_loss": 0.70228547, + "learning_rate": 1.656070822132428e-06, + "loss": 0.72287107, + "num_input_tokens_seen": 203610080, + "step": 9453, + "time_per_iteration": 2.644207239151001 + }, + { + "auxiliary_loss_clip": 0.01025255, + "auxiliary_loss_mlp": 0.00747346, + "balance_loss_clip": 1.0252136, + "balance_loss_mlp": 1.00040674, + "epoch": 0.5684052307229821, + "flos": 22344481359360.0, + "grad_norm": 1.6143896380681835, + "language_loss": 0.69431925, + "learning_rate": 1.6556871687336592e-06, + "loss": 0.71204531, + "num_input_tokens_seen": 203630060, + "step": 9454, + "time_per_iteration": 2.861128807067871 + }, + { + "auxiliary_loss_clip": 0.01043215, + "auxiliary_loss_mlp": 0.01028177, + "balance_loss_clip": 1.02375388, + "balance_loss_mlp": 1.01818705, + "epoch": 0.5684653539756501, + "flos": 21798639308160.0, + "grad_norm": 1.8876783701745548, + "language_loss": 0.6046617, + "learning_rate": 1.6553035283912671e-06, + "loss": 0.62537563, + "num_input_tokens_seen": 203649065, + "step": 9455, + "time_per_iteration": 2.6332101821899414 + }, + { + "auxiliary_loss_clip": 0.01023989, + "auxiliary_loss_mlp": 0.01029879, + "balance_loss_clip": 1.02472746, + "balance_loss_mlp": 1.01852393, + "epoch": 0.568525477228318, + "flos": 22999635475200.0, + "grad_norm": 1.969981634320086, + "language_loss": 0.73227483, + "learning_rate": 1.6549199011198e-06, + "loss": 0.75281352, + "num_input_tokens_seen": 203667545, + "step": 9456, + "time_per_iteration": 2.717027187347412 + }, + { + "auxiliary_loss_clip": 0.01044548, + "auxiliary_loss_mlp": 0.01030598, + "balance_loss_clip": 1.02442229, + "balance_loss_mlp": 1.02048337, + "epoch": 0.568585600480986, + "flos": 21392489249280.0, + "grad_norm": 1.5913170123091087, + "language_loss": 0.76799494, + "learning_rate": 1.6545362869338048e-06, + "loss": 0.78874636, + "num_input_tokens_seen": 203686025, + "step": 9457, + "time_per_iteration": 2.645113945007324 + }, + { + "auxiliary_loss_clip": 0.01056655, + "auxiliary_loss_mlp": 0.01031134, + "balance_loss_clip": 1.0259707, + "balance_loss_mlp": 1.02009523, + "epoch": 0.568645723733654, + "flos": 30007351359360.0, + "grad_norm": 1.9209539519881178, + "language_loss": 0.66352797, + "learning_rate": 1.6541526858478285e-06, + "loss": 0.6844058, + "num_input_tokens_seen": 203705540, + "step": 9458, + "time_per_iteration": 2.833784341812134 + }, + { + "auxiliary_loss_clip": 0.01057518, + "auxiliary_loss_mlp": 0.01029939, + "balance_loss_clip": 1.02596593, + "balance_loss_mlp": 1.01817274, + "epoch": 0.568705846986322, + "flos": 20412667077120.0, + "grad_norm": 2.1214001270366403, + "language_loss": 0.68164229, + "learning_rate": 1.6537690978764167e-06, + "loss": 0.70251685, + "num_input_tokens_seen": 203723670, + "step": 9459, + "time_per_iteration": 2.811706781387329 + }, + { + "auxiliary_loss_clip": 0.01038459, + "auxiliary_loss_mlp": 0.01030532, + "balance_loss_clip": 1.02682209, + "balance_loss_mlp": 1.01903439, + "epoch": 0.5687659702389899, + "flos": 17456752131840.0, + "grad_norm": 2.1076881597323225, + "language_loss": 0.76619267, + "learning_rate": 1.6533855230341155e-06, + "loss": 0.78688264, + "num_input_tokens_seen": 203739705, + "step": 9460, + "time_per_iteration": 2.823293447494507 + }, + { + "auxiliary_loss_clip": 0.01016149, + "auxiliary_loss_mlp": 0.01033265, + "balance_loss_clip": 1.02536726, + "balance_loss_mlp": 1.02198744, + "epoch": 0.5688260934916579, + "flos": 25406081095680.0, + "grad_norm": 1.7512768259237534, + "language_loss": 0.71986902, + "learning_rate": 1.65300196133547e-06, + "loss": 0.74036318, + "num_input_tokens_seen": 203759000, + "step": 9461, + "time_per_iteration": 2.918262243270874 + }, + { + "auxiliary_loss_clip": 0.01058003, + "auxiliary_loss_mlp": 0.01026677, + "balance_loss_clip": 1.02688849, + "balance_loss_mlp": 1.01519752, + "epoch": 0.5688862167443258, + "flos": 21608024808960.0, + "grad_norm": 2.538000860606123, + "language_loss": 0.72950709, + "learning_rate": 1.6526184127950249e-06, + "loss": 0.75035393, + "num_input_tokens_seen": 203774295, + "step": 9462, + "time_per_iteration": 2.669435501098633 + }, + { + "auxiliary_loss_clip": 0.01051312, + "auxiliary_loss_mlp": 0.01025332, + "balance_loss_clip": 1.02298367, + "balance_loss_mlp": 1.01569414, + "epoch": 0.5689463399969938, + "flos": 22418996123520.0, + "grad_norm": 1.8858079295298087, + "language_loss": 0.72595173, + "learning_rate": 1.6522348774273246e-06, + "loss": 0.74671817, + "num_input_tokens_seen": 203792710, + "step": 9463, + "time_per_iteration": 2.621483325958252 + }, + { + "auxiliary_loss_clip": 0.01055576, + "auxiliary_loss_mlp": 0.01029107, + "balance_loss_clip": 1.02601862, + "balance_loss_mlp": 1.01841354, + "epoch": 0.5690064632496618, + "flos": 18296810484480.0, + "grad_norm": 1.8444852988912739, + "language_loss": 0.73842382, + "learning_rate": 1.6518513552469123e-06, + "loss": 0.75927067, + "num_input_tokens_seen": 203811645, + "step": 9464, + "time_per_iteration": 2.6284983158111572 + }, + { + "auxiliary_loss_clip": 0.01057499, + "auxiliary_loss_mlp": 0.00747497, + "balance_loss_clip": 1.02574551, + "balance_loss_mlp": 1.00039685, + "epoch": 0.5690665865023298, + "flos": 21579260993280.0, + "grad_norm": 1.7129551119328261, + "language_loss": 0.84131056, + "learning_rate": 1.6514678462683312e-06, + "loss": 0.85936052, + "num_input_tokens_seen": 203830040, + "step": 9465, + "time_per_iteration": 2.631939172744751 + }, + { + "auxiliary_loss_clip": 0.01038587, + "auxiliary_loss_mlp": 0.01028853, + "balance_loss_clip": 1.02159607, + "balance_loss_mlp": 1.01805842, + "epoch": 0.5691267097549978, + "flos": 24421446501120.0, + "grad_norm": 1.7564965588219477, + "language_loss": 0.72448301, + "learning_rate": 1.651084350506125e-06, + "loss": 0.74515742, + "num_input_tokens_seen": 203851245, + "step": 9466, + "time_per_iteration": 2.7325291633605957 + }, + { + "auxiliary_loss_clip": 0.00980683, + "auxiliary_loss_mlp": 0.0100063, + "balance_loss_clip": 1.0037719, + "balance_loss_mlp": 0.99953347, + "epoch": 0.5691868330076657, + "flos": 61657906199040.0, + "grad_norm": 0.711767027440698, + "language_loss": 0.55382931, + "learning_rate": 1.6507008679748343e-06, + "loss": 0.57364237, + "num_input_tokens_seen": 203916400, + "step": 9467, + "time_per_iteration": 3.358313798904419 + }, + { + "auxiliary_loss_clip": 0.01049771, + "auxiliary_loss_mlp": 0.01031631, + "balance_loss_clip": 1.02442217, + "balance_loss_mlp": 1.01911402, + "epoch": 0.5692469562603337, + "flos": 21325193118720.0, + "grad_norm": 2.991979882314083, + "language_loss": 0.63694978, + "learning_rate": 1.6503173986890023e-06, + "loss": 0.65776384, + "num_input_tokens_seen": 203935870, + "step": 9468, + "time_per_iteration": 2.578517198562622 + }, + { + "auxiliary_loss_clip": 0.01018174, + "auxiliary_loss_mlp": 0.01033261, + "balance_loss_clip": 1.02199817, + "balance_loss_mlp": 1.02104211, + "epoch": 0.5693070795130016, + "flos": 23367899664000.0, + "grad_norm": 2.3766699200493386, + "language_loss": 0.79059315, + "learning_rate": 1.64993394266317e-06, + "loss": 0.81110746, + "num_input_tokens_seen": 203954950, + "step": 9469, + "time_per_iteration": 2.7012746334075928 + }, + { + "auxiliary_loss_clip": 0.01031543, + "auxiliary_loss_mlp": 0.01042264, + "balance_loss_clip": 1.02403212, + "balance_loss_mlp": 1.02932978, + "epoch": 0.5693672027656697, + "flos": 18697250280960.0, + "grad_norm": 2.4674392912641068, + "language_loss": 0.69189048, + "learning_rate": 1.6495504999118769e-06, + "loss": 0.7126286, + "num_input_tokens_seen": 203972715, + "step": 9470, + "time_per_iteration": 2.627674102783203 + }, + { + "auxiliary_loss_clip": 0.01049097, + "auxiliary_loss_mlp": 0.01034031, + "balance_loss_clip": 1.02754414, + "balance_loss_mlp": 1.02224171, + "epoch": 0.5694273260183376, + "flos": 20449188230400.0, + "grad_norm": 1.684224045950155, + "language_loss": 0.74797595, + "learning_rate": 1.6491670704496644e-06, + "loss": 0.76880723, + "num_input_tokens_seen": 203990775, + "step": 9471, + "time_per_iteration": 2.643317937850952 + }, + { + "auxiliary_loss_clip": 0.01031458, + "auxiliary_loss_mlp": 0.01037216, + "balance_loss_clip": 1.02604759, + "balance_loss_mlp": 1.02499688, + "epoch": 0.5694874492710056, + "flos": 17603195880960.0, + "grad_norm": 1.9058182402374377, + "language_loss": 0.57569754, + "learning_rate": 1.6487836542910716e-06, + "loss": 0.59638429, + "num_input_tokens_seen": 204008845, + "step": 9472, + "time_per_iteration": 2.71659517288208 + }, + { + "auxiliary_loss_clip": 0.0102754, + "auxiliary_loss_mlp": 0.01028266, + "balance_loss_clip": 1.02326632, + "balance_loss_mlp": 1.01718521, + "epoch": 0.5695475725236735, + "flos": 13370836250880.0, + "grad_norm": 2.3966047042855476, + "language_loss": 0.73888379, + "learning_rate": 1.648400251450638e-06, + "loss": 0.75944185, + "num_input_tokens_seen": 204023755, + "step": 9473, + "time_per_iteration": 2.65311598777771 + }, + { + "auxiliary_loss_clip": 0.00979876, + "auxiliary_loss_mlp": 0.01005006, + "balance_loss_clip": 1.0027647, + "balance_loss_mlp": 1.00401103, + "epoch": 0.5696076957763415, + "flos": 68174398661760.0, + "grad_norm": 0.657760481755984, + "language_loss": 0.5756672, + "learning_rate": 1.6480168619429023e-06, + "loss": 0.59551603, + "num_input_tokens_seen": 204091255, + "step": 9474, + "time_per_iteration": 3.279768705368042 + }, + { + "auxiliary_loss_clip": 0.01053744, + "auxiliary_loss_mlp": 0.01030976, + "balance_loss_clip": 1.02642035, + "balance_loss_mlp": 1.01929343, + "epoch": 0.5696678190290094, + "flos": 33838301525760.0, + "grad_norm": 1.997009590843055, + "language_loss": 0.5459187, + "learning_rate": 1.6476334857824017e-06, + "loss": 0.5667659, + "num_input_tokens_seen": 204113285, + "step": 9475, + "time_per_iteration": 2.717877149581909 + }, + { + "auxiliary_loss_clip": 0.01067819, + "auxiliary_loss_mlp": 0.01035284, + "balance_loss_clip": 1.0266428, + "balance_loss_mlp": 1.02414417, + "epoch": 0.5697279422816774, + "flos": 26356600748160.0, + "grad_norm": 1.6790183149670752, + "language_loss": 0.79694998, + "learning_rate": 1.647250122983675e-06, + "loss": 0.817981, + "num_input_tokens_seen": 204133045, + "step": 9476, + "time_per_iteration": 2.6245715618133545 + }, + { + "auxiliary_loss_clip": 0.01049751, + "auxiliary_loss_mlp": 0.01034101, + "balance_loss_clip": 1.02799153, + "balance_loss_mlp": 1.02292526, + "epoch": 0.5697880655343454, + "flos": 22930507751040.0, + "grad_norm": 1.7393307022230462, + "language_loss": 0.67096746, + "learning_rate": 1.6468667735612592e-06, + "loss": 0.69180602, + "num_input_tokens_seen": 204152590, + "step": 9477, + "time_per_iteration": 2.733783721923828 + }, + { + "auxiliary_loss_clip": 0.01035733, + "auxiliary_loss_mlp": 0.01029975, + "balance_loss_clip": 1.02481818, + "balance_loss_mlp": 1.01823282, + "epoch": 0.5698481887870134, + "flos": 26761314263040.0, + "grad_norm": 1.6501609793916496, + "language_loss": 0.70825124, + "learning_rate": 1.6464834375296906e-06, + "loss": 0.7289083, + "num_input_tokens_seen": 204171815, + "step": 9478, + "time_per_iteration": 2.706594228744507 + }, + { + "auxiliary_loss_clip": 0.010341, + "auxiliary_loss_mlp": 0.01027585, + "balance_loss_clip": 1.02505004, + "balance_loss_mlp": 1.01756001, + "epoch": 0.5699083120396814, + "flos": 15742269089280.0, + "grad_norm": 1.7710521528225247, + "language_loss": 0.69233996, + "learning_rate": 1.6461001149035055e-06, + "loss": 0.71295679, + "num_input_tokens_seen": 204188535, + "step": 9479, + "time_per_iteration": 4.266912460327148 + }, + { + "auxiliary_loss_clip": 0.01033507, + "auxiliary_loss_mlp": 0.01031813, + "balance_loss_clip": 1.02568984, + "balance_loss_mlp": 1.02126944, + "epoch": 0.5699684352923493, + "flos": 19537272720000.0, + "grad_norm": 1.4759639279322274, + "language_loss": 0.71534365, + "learning_rate": 1.6457168056972392e-06, + "loss": 0.73599684, + "num_input_tokens_seen": 204208365, + "step": 9480, + "time_per_iteration": 2.6996591091156006 + }, + { + "auxiliary_loss_clip": 0.01036296, + "auxiliary_loss_mlp": 0.00747438, + "balance_loss_clip": 1.02520013, + "balance_loss_mlp": 1.00033998, + "epoch": 0.5700285585450173, + "flos": 16253349753600.0, + "grad_norm": 2.3453217125632704, + "language_loss": 0.72297895, + "learning_rate": 1.6453335099254276e-06, + "loss": 0.7408163, + "num_input_tokens_seen": 204226560, + "step": 9481, + "time_per_iteration": 2.664276123046875 + }, + { + "auxiliary_loss_clip": 0.01056, + "auxiliary_loss_mlp": 0.01030064, + "balance_loss_clip": 1.025666, + "balance_loss_mlp": 1.01879871, + "epoch": 0.5700886817976852, + "flos": 19864993432320.0, + "grad_norm": 1.6573899484437085, + "language_loss": 0.78303039, + "learning_rate": 1.6449502276026041e-06, + "loss": 0.80389106, + "num_input_tokens_seen": 204245410, + "step": 9482, + "time_per_iteration": 4.215393781661987 + }, + { + "auxiliary_loss_clip": 0.0104371, + "auxiliary_loss_mlp": 0.01024949, + "balance_loss_clip": 1.02436805, + "balance_loss_mlp": 1.01486981, + "epoch": 0.5701488050503533, + "flos": 23841704989440.0, + "grad_norm": 2.9403054750577615, + "language_loss": 0.77934349, + "learning_rate": 1.6445669587433043e-06, + "loss": 0.80003011, + "num_input_tokens_seen": 204264840, + "step": 9483, + "time_per_iteration": 2.6784780025482178 + }, + { + "auxiliary_loss_clip": 0.01045774, + "auxiliary_loss_mlp": 0.01030538, + "balance_loss_clip": 1.02606201, + "balance_loss_mlp": 1.0200119, + "epoch": 0.5702089283030212, + "flos": 23659673840640.0, + "grad_norm": 2.3195307054801364, + "language_loss": 0.81305969, + "learning_rate": 1.6441837033620612e-06, + "loss": 0.83382285, + "num_input_tokens_seen": 204284335, + "step": 9484, + "time_per_iteration": 2.6790308952331543 + }, + { + "auxiliary_loss_clip": 0.01066129, + "auxiliary_loss_mlp": 0.00747591, + "balance_loss_clip": 1.02542782, + "balance_loss_mlp": 1.00039744, + "epoch": 0.5702690515556892, + "flos": 27891171544320.0, + "grad_norm": 2.639687538498362, + "language_loss": 0.6105361, + "learning_rate": 1.6438004614734073e-06, + "loss": 0.62867332, + "num_input_tokens_seen": 204302590, + "step": 9485, + "time_per_iteration": 2.6015825271606445 + }, + { + "auxiliary_loss_clip": 0.01054799, + "auxiliary_loss_mlp": 0.01030156, + "balance_loss_clip": 1.02526331, + "balance_loss_mlp": 1.0196116, + "epoch": 0.5703291748083571, + "flos": 24023951619840.0, + "grad_norm": 1.7595621089141242, + "language_loss": 0.65160578, + "learning_rate": 1.6434172330918757e-06, + "loss": 0.67245537, + "num_input_tokens_seen": 204323055, + "step": 9486, + "time_per_iteration": 2.657330274581909 + }, + { + "auxiliary_loss_clip": 0.00991037, + "auxiliary_loss_mlp": 0.01002097, + "balance_loss_clip": 1.00392318, + "balance_loss_mlp": 1.00114965, + "epoch": 0.5703892980610251, + "flos": 57023382919680.0, + "grad_norm": 0.6685926560557615, + "language_loss": 0.47983956, + "learning_rate": 1.6430340182319978e-06, + "loss": 0.49977088, + "num_input_tokens_seen": 204386160, + "step": 9487, + "time_per_iteration": 3.2872912883758545 + }, + { + "auxiliary_loss_clip": 0.01033009, + "auxiliary_loss_mlp": 0.00747507, + "balance_loss_clip": 1.02374768, + "balance_loss_mlp": 1.00033927, + "epoch": 0.570449421313693, + "flos": 24351025887360.0, + "grad_norm": 1.4488040433539324, + "language_loss": 0.8590489, + "learning_rate": 1.6426508169083067e-06, + "loss": 0.87685406, + "num_input_tokens_seen": 204406315, + "step": 9488, + "time_per_iteration": 2.884901285171509 + }, + { + "auxiliary_loss_clip": 0.01031544, + "auxiliary_loss_mlp": 0.01029753, + "balance_loss_clip": 1.02442288, + "balance_loss_mlp": 1.01794553, + "epoch": 0.570509544566361, + "flos": 24828566227200.0, + "grad_norm": 1.592482157095599, + "language_loss": 0.78950542, + "learning_rate": 1.6422676291353314e-06, + "loss": 0.81011844, + "num_input_tokens_seen": 204427645, + "step": 9489, + "time_per_iteration": 2.683387279510498 + }, + { + "auxiliary_loss_clip": 0.01047271, + "auxiliary_loss_mlp": 0.01025311, + "balance_loss_clip": 1.02678955, + "balance_loss_mlp": 1.01530969, + "epoch": 0.570569667819029, + "flos": 21397301671680.0, + "grad_norm": 1.6912899229500695, + "language_loss": 0.7005536, + "learning_rate": 1.641884454927604e-06, + "loss": 0.72127938, + "num_input_tokens_seen": 204445910, + "step": 9490, + "time_per_iteration": 4.308244705200195 + }, + { + "auxiliary_loss_clip": 0.01032821, + "auxiliary_loss_mlp": 0.01024204, + "balance_loss_clip": 1.02423811, + "balance_loss_mlp": 1.01407671, + "epoch": 0.570629791071697, + "flos": 23216751233280.0, + "grad_norm": 1.7438030957665909, + "language_loss": 0.76237547, + "learning_rate": 1.6415012942996548e-06, + "loss": 0.78294569, + "num_input_tokens_seen": 204464680, + "step": 9491, + "time_per_iteration": 2.664097547531128 + }, + { + "auxiliary_loss_clip": 0.00981013, + "auxiliary_loss_mlp": 0.00746827, + "balance_loss_clip": 1.00496376, + "balance_loss_mlp": 1.00110495, + "epoch": 0.570689914324365, + "flos": 65284666525440.0, + "grad_norm": 0.7934148632885125, + "language_loss": 0.57361102, + "learning_rate": 1.641118147266011e-06, + "loss": 0.59088945, + "num_input_tokens_seen": 204525580, + "step": 9492, + "time_per_iteration": 3.228100299835205 + }, + { + "auxiliary_loss_clip": 0.01045794, + "auxiliary_loss_mlp": 0.00747531, + "balance_loss_clip": 1.02628398, + "balance_loss_mlp": 1.00038171, + "epoch": 0.5707500375770329, + "flos": 21141904993920.0, + "grad_norm": 1.9172591418399865, + "language_loss": 0.71925318, + "learning_rate": 1.6407350138412035e-06, + "loss": 0.73718643, + "num_input_tokens_seen": 204541320, + "step": 9493, + "time_per_iteration": 2.6105408668518066 + }, + { + "auxiliary_loss_clip": 0.01067558, + "auxiliary_loss_mlp": 0.01026617, + "balance_loss_clip": 1.02636456, + "balance_loss_mlp": 1.01550674, + "epoch": 0.5708101608297009, + "flos": 20812747737600.0, + "grad_norm": 1.5328707759018374, + "language_loss": 0.77959692, + "learning_rate": 1.6403518940397606e-06, + "loss": 0.80053872, + "num_input_tokens_seen": 204560275, + "step": 9494, + "time_per_iteration": 2.6066126823425293 + }, + { + "auxiliary_loss_clip": 0.01069142, + "auxiliary_loss_mlp": 0.0103368, + "balance_loss_clip": 1.02625299, + "balance_loss_mlp": 1.02207494, + "epoch": 0.5708702840823688, + "flos": 25812338895360.0, + "grad_norm": 2.2699083539940825, + "language_loss": 0.8005448, + "learning_rate": 1.6399687878762096e-06, + "loss": 0.82157302, + "num_input_tokens_seen": 204579430, + "step": 9495, + "time_per_iteration": 2.6649374961853027 + }, + { + "auxiliary_loss_clip": 0.01018841, + "auxiliary_loss_mlp": 0.01036744, + "balance_loss_clip": 1.02334774, + "balance_loss_mlp": 1.0233922, + "epoch": 0.5709304073350369, + "flos": 23651916503040.0, + "grad_norm": 3.1301151234959965, + "language_loss": 0.66428041, + "learning_rate": 1.6395856953650784e-06, + "loss": 0.68483627, + "num_input_tokens_seen": 204597710, + "step": 9496, + "time_per_iteration": 4.358773708343506 + }, + { + "auxiliary_loss_clip": 0.0106987, + "auxiliary_loss_mlp": 0.01035038, + "balance_loss_clip": 1.02739596, + "balance_loss_mlp": 1.02279568, + "epoch": 0.5709905305877048, + "flos": 16107552449280.0, + "grad_norm": 2.145728155206067, + "language_loss": 0.69422334, + "learning_rate": 1.6392026165208938e-06, + "loss": 0.71527243, + "num_input_tokens_seen": 204616140, + "step": 9497, + "time_per_iteration": 2.5265002250671387 + }, + { + "auxiliary_loss_clip": 0.0105957, + "auxiliary_loss_mlp": 0.00747676, + "balance_loss_clip": 1.02814436, + "balance_loss_mlp": 1.00045252, + "epoch": 0.5710506538403728, + "flos": 24750819239040.0, + "grad_norm": 2.9455822008645094, + "language_loss": 0.81176823, + "learning_rate": 1.638819551358182e-06, + "loss": 0.82984066, + "num_input_tokens_seen": 204636470, + "step": 9498, + "time_per_iteration": 2.8320608139038086 + }, + { + "auxiliary_loss_clip": 0.0106792, + "auxiliary_loss_mlp": 0.01032227, + "balance_loss_clip": 1.02637005, + "balance_loss_mlp": 1.01985919, + "epoch": 0.5711107770930407, + "flos": 21982250655360.0, + "grad_norm": 1.719700803375848, + "language_loss": 0.65916979, + "learning_rate": 1.638436499891469e-06, + "loss": 0.68017131, + "num_input_tokens_seen": 204656640, + "step": 9499, + "time_per_iteration": 2.7643513679504395 + }, + { + "auxiliary_loss_clip": 0.01038065, + "auxiliary_loss_mlp": 0.01033798, + "balance_loss_clip": 1.02508664, + "balance_loss_mlp": 1.02242517, + "epoch": 0.5711709003457087, + "flos": 19574009354880.0, + "grad_norm": 1.5734131494620018, + "language_loss": 0.72029948, + "learning_rate": 1.6380534621352805e-06, + "loss": 0.74101812, + "num_input_tokens_seen": 204675475, + "step": 9500, + "time_per_iteration": 3.0632870197296143 + }, + { + "auxiliary_loss_clip": 0.01039308, + "auxiliary_loss_mlp": 0.01031919, + "balance_loss_clip": 1.02538764, + "balance_loss_mlp": 1.0206356, + "epoch": 0.5712310235983766, + "flos": 24242683489920.0, + "grad_norm": 1.8400327474040452, + "language_loss": 0.76368117, + "learning_rate": 1.6376704381041407e-06, + "loss": 0.78439337, + "num_input_tokens_seen": 204695385, + "step": 9501, + "time_per_iteration": 2.795302391052246 + }, + { + "auxiliary_loss_clip": 0.01047341, + "auxiliary_loss_mlp": 0.01028685, + "balance_loss_clip": 1.02531075, + "balance_loss_mlp": 1.01796246, + "epoch": 0.5712911468510447, + "flos": 20996143603200.0, + "grad_norm": 1.7036927311456431, + "language_loss": 0.74738324, + "learning_rate": 1.6372874278125742e-06, + "loss": 0.76814348, + "num_input_tokens_seen": 204714730, + "step": 9502, + "time_per_iteration": 2.9384377002716064 + }, + { + "auxiliary_loss_clip": 0.01040962, + "auxiliary_loss_mlp": 0.01022856, + "balance_loss_clip": 1.02845097, + "balance_loss_mlp": 1.01190639, + "epoch": 0.5713512701037126, + "flos": 18916987731840.0, + "grad_norm": 2.2139536454314284, + "language_loss": 0.82224417, + "learning_rate": 1.636904431275105e-06, + "loss": 0.84288228, + "num_input_tokens_seen": 204735025, + "step": 9503, + "time_per_iteration": 2.836937189102173 + }, + { + "auxiliary_loss_clip": 0.01036043, + "auxiliary_loss_mlp": 0.01029637, + "balance_loss_clip": 1.02646244, + "balance_loss_mlp": 1.01906323, + "epoch": 0.5714113933563806, + "flos": 17413443308160.0, + "grad_norm": 2.1321860920465507, + "language_loss": 0.86295563, + "learning_rate": 1.6365214485062553e-06, + "loss": 0.88361239, + "num_input_tokens_seen": 204751365, + "step": 9504, + "time_per_iteration": 2.750734806060791 + }, + { + "auxiliary_loss_clip": 0.01026062, + "auxiliary_loss_mlp": 0.01023786, + "balance_loss_clip": 1.0239085, + "balance_loss_mlp": 1.01297939, + "epoch": 0.5714715166090486, + "flos": 20193360589440.0, + "grad_norm": 1.7790164330763596, + "language_loss": 0.75565839, + "learning_rate": 1.6361384795205496e-06, + "loss": 0.77615684, + "num_input_tokens_seen": 204768980, + "step": 9505, + "time_per_iteration": 2.696143627166748 + }, + { + "auxiliary_loss_clip": 0.01063764, + "auxiliary_loss_mlp": 0.01028165, + "balance_loss_clip": 1.02384603, + "balance_loss_mlp": 1.01816392, + "epoch": 0.5715316398617165, + "flos": 18551668458240.0, + "grad_norm": 3.0057975703897117, + "language_loss": 0.81596291, + "learning_rate": 1.635755524332509e-06, + "loss": 0.83688223, + "num_input_tokens_seen": 204788110, + "step": 9506, + "time_per_iteration": 2.630430221557617 + }, + { + "auxiliary_loss_clip": 0.01025314, + "auxiliary_loss_mlp": 0.00747499, + "balance_loss_clip": 1.02135992, + "balance_loss_mlp": 1.00032902, + "epoch": 0.5715917631143845, + "flos": 18478195188480.0, + "grad_norm": 1.6543532724062449, + "language_loss": 0.77056432, + "learning_rate": 1.6353725829566552e-06, + "loss": 0.78829241, + "num_input_tokens_seen": 204807240, + "step": 9507, + "time_per_iteration": 2.611466407775879 + }, + { + "auxiliary_loss_clip": 0.01046877, + "auxiliary_loss_mlp": 0.01035202, + "balance_loss_clip": 1.02566099, + "balance_loss_mlp": 1.02354372, + "epoch": 0.5716518863670524, + "flos": 24020037037440.0, + "grad_norm": 1.6045918868603406, + "language_loss": 0.68451852, + "learning_rate": 1.63498965540751e-06, + "loss": 0.70533931, + "num_input_tokens_seen": 204826415, + "step": 9508, + "time_per_iteration": 2.651944637298584 + }, + { + "auxiliary_loss_clip": 0.0106668, + "auxiliary_loss_mlp": 0.01026321, + "balance_loss_clip": 1.02505839, + "balance_loss_mlp": 1.01512098, + "epoch": 0.5717120096197205, + "flos": 17819485626240.0, + "grad_norm": 2.7910656023250717, + "language_loss": 0.79843128, + "learning_rate": 1.634606741699593e-06, + "loss": 0.81936133, + "num_input_tokens_seen": 204844305, + "step": 9509, + "time_per_iteration": 2.557008743286133 + }, + { + "auxiliary_loss_clip": 0.01053614, + "auxiliary_loss_mlp": 0.01027851, + "balance_loss_clip": 1.02505541, + "balance_loss_mlp": 1.01718223, + "epoch": 0.5717721328723884, + "flos": 21866043179520.0, + "grad_norm": 1.8384642308424701, + "language_loss": 0.72208869, + "learning_rate": 1.6342238418474255e-06, + "loss": 0.74290335, + "num_input_tokens_seen": 204861765, + "step": 9510, + "time_per_iteration": 2.5808281898498535 + }, + { + "auxiliary_loss_clip": 0.01045447, + "auxiliary_loss_mlp": 0.01025519, + "balance_loss_clip": 1.02511382, + "balance_loss_mlp": 1.0148797, + "epoch": 0.5718322561250564, + "flos": 28437624126720.0, + "grad_norm": 1.416754790454731, + "language_loss": 0.69469571, + "learning_rate": 1.6338409558655264e-06, + "loss": 0.71540529, + "num_input_tokens_seen": 204882505, + "step": 9511, + "time_per_iteration": 2.656733274459839 + }, + { + "auxiliary_loss_clip": 0.01046564, + "auxiliary_loss_mlp": 0.01030274, + "balance_loss_clip": 1.02632153, + "balance_loss_mlp": 1.01979589, + "epoch": 0.5718923793777243, + "flos": 13551825905280.0, + "grad_norm": 1.7408556764316585, + "language_loss": 0.61668873, + "learning_rate": 1.6334580837684152e-06, + "loss": 0.63745713, + "num_input_tokens_seen": 204899830, + "step": 9512, + "time_per_iteration": 2.6309621334075928 + }, + { + "auxiliary_loss_clip": 0.01044753, + "auxiliary_loss_mlp": 0.01023754, + "balance_loss_clip": 1.0254705, + "balance_loss_mlp": 1.01362681, + "epoch": 0.5719525026303923, + "flos": 17822035491840.0, + "grad_norm": 2.2937783783556465, + "language_loss": 0.76199424, + "learning_rate": 1.6330752255706104e-06, + "loss": 0.78267932, + "num_input_tokens_seen": 204918100, + "step": 9513, + "time_per_iteration": 2.621398687362671 + }, + { + "auxiliary_loss_clip": 0.00998459, + "auxiliary_loss_mlp": 0.01003327, + "balance_loss_clip": 1.00149846, + "balance_loss_mlp": 1.00232553, + "epoch": 0.5720126258830602, + "flos": 61298042814720.0, + "grad_norm": 0.8933226309587939, + "language_loss": 0.66819465, + "learning_rate": 1.6326923812866288e-06, + "loss": 0.68821251, + "num_input_tokens_seen": 204972925, + "step": 9514, + "time_per_iteration": 3.1348676681518555 + }, + { + "auxiliary_loss_clip": 0.01059286, + "auxiliary_loss_mlp": 0.01034635, + "balance_loss_clip": 1.02647972, + "balance_loss_mlp": 1.02353096, + "epoch": 0.5720727491357283, + "flos": 23988040997760.0, + "grad_norm": 1.9118118104967774, + "language_loss": 0.8174749, + "learning_rate": 1.63230955093099e-06, + "loss": 0.83841407, + "num_input_tokens_seen": 204990910, + "step": 9515, + "time_per_iteration": 2.587456226348877 + }, + { + "auxiliary_loss_clip": 0.01043267, + "auxiliary_loss_mlp": 0.01025673, + "balance_loss_clip": 1.02147102, + "balance_loss_mlp": 1.01494455, + "epoch": 0.5721328723883962, + "flos": 23405426398080.0, + "grad_norm": 1.518717594505182, + "language_loss": 0.85874611, + "learning_rate": 1.6319267345182092e-06, + "loss": 0.87943554, + "num_input_tokens_seen": 205010500, + "step": 9516, + "time_per_iteration": 2.5701756477355957 + }, + { + "auxiliary_loss_clip": 0.01045377, + "auxiliary_loss_mlp": 0.01029974, + "balance_loss_clip": 1.02570927, + "balance_loss_mlp": 1.01830983, + "epoch": 0.5721929956410642, + "flos": 18804910320000.0, + "grad_norm": 2.036195332945231, + "language_loss": 0.87556177, + "learning_rate": 1.6315439320628038e-06, + "loss": 0.89631528, + "num_input_tokens_seen": 205028560, + "step": 9517, + "time_per_iteration": 2.6007750034332275 + }, + { + "auxiliary_loss_clip": 0.01021353, + "auxiliary_loss_mlp": 0.01026848, + "balance_loss_clip": 1.02317762, + "balance_loss_mlp": 1.01576185, + "epoch": 0.5722531188937322, + "flos": 27196659100800.0, + "grad_norm": 1.7450253556225035, + "language_loss": 0.85347867, + "learning_rate": 1.6311611435792893e-06, + "loss": 0.87396073, + "num_input_tokens_seen": 205048650, + "step": 9518, + "time_per_iteration": 2.750678777694702 + }, + { + "auxiliary_loss_clip": 0.01051896, + "auxiliary_loss_mlp": 0.01028756, + "balance_loss_clip": 1.02403879, + "balance_loss_mlp": 1.01841497, + "epoch": 0.5723132421464001, + "flos": 15195672852480.0, + "grad_norm": 1.8075365599357431, + "language_loss": 0.7894448, + "learning_rate": 1.6307783690821812e-06, + "loss": 0.81025136, + "num_input_tokens_seen": 205066480, + "step": 9519, + "time_per_iteration": 2.5561389923095703 + }, + { + "auxiliary_loss_clip": 0.01063557, + "auxiliary_loss_mlp": 0.01028333, + "balance_loss_clip": 1.02499342, + "balance_loss_mlp": 1.01828384, + "epoch": 0.5723733653990681, + "flos": 27599433281280.0, + "grad_norm": 1.4948929064872014, + "language_loss": 0.82836783, + "learning_rate": 1.6303956085859944e-06, + "loss": 0.84928668, + "num_input_tokens_seen": 205087475, + "step": 9520, + "time_per_iteration": 2.740520715713501 + }, + { + "auxiliary_loss_clip": 0.01047392, + "auxiliary_loss_mlp": 0.01037158, + "balance_loss_clip": 1.02639318, + "balance_loss_mlp": 1.02585101, + "epoch": 0.572433488651736, + "flos": 18222870337920.0, + "grad_norm": 2.145376838682132, + "language_loss": 0.72528422, + "learning_rate": 1.630012862105243e-06, + "loss": 0.74612975, + "num_input_tokens_seen": 205106495, + "step": 9521, + "time_per_iteration": 2.6740143299102783 + }, + { + "auxiliary_loss_clip": 0.01063596, + "auxiliary_loss_mlp": 0.00747441, + "balance_loss_clip": 1.0241034, + "balance_loss_mlp": 1.00039947, + "epoch": 0.5724936119044041, + "flos": 31249106484480.0, + "grad_norm": 1.8788622238717205, + "language_loss": 0.78210926, + "learning_rate": 1.6296301296544415e-06, + "loss": 0.80021966, + "num_input_tokens_seen": 205128285, + "step": 9522, + "time_per_iteration": 2.6468703746795654 + }, + { + "auxiliary_loss_clip": 0.01042472, + "auxiliary_loss_mlp": 0.01028222, + "balance_loss_clip": 1.02478242, + "balance_loss_mlp": 1.01879215, + "epoch": 0.572553735157072, + "flos": 19202189719680.0, + "grad_norm": 1.6691963690463678, + "language_loss": 0.71724784, + "learning_rate": 1.629247411248102e-06, + "loss": 0.73795474, + "num_input_tokens_seen": 205146595, + "step": 9523, + "time_per_iteration": 2.64386248588562 + }, + { + "auxiliary_loss_clip": 0.01043641, + "auxiliary_loss_mlp": 0.01023643, + "balance_loss_clip": 1.02508044, + "balance_loss_mlp": 1.01377296, + "epoch": 0.57261385840974, + "flos": 21214911386880.0, + "grad_norm": 1.834284890393683, + "language_loss": 0.70220125, + "learning_rate": 1.628864706900738e-06, + "loss": 0.72287405, + "num_input_tokens_seen": 205164295, + "step": 9524, + "time_per_iteration": 2.7258710861206055 + }, + { + "auxiliary_loss_clip": 0.01054253, + "auxiliary_loss_mlp": 0.01026452, + "balance_loss_clip": 1.02526999, + "balance_loss_mlp": 1.01649833, + "epoch": 0.5726739816624079, + "flos": 33984529793280.0, + "grad_norm": 1.872261106994597, + "language_loss": 0.65376753, + "learning_rate": 1.6284820166268615e-06, + "loss": 0.67457461, + "num_input_tokens_seen": 205185380, + "step": 9525, + "time_per_iteration": 2.7614352703094482 + }, + { + "auxiliary_loss_clip": 0.01040514, + "auxiliary_loss_mlp": 0.01028492, + "balance_loss_clip": 1.02224612, + "balance_loss_mlp": 1.01877069, + "epoch": 0.5727341049150759, + "flos": 24275972419200.0, + "grad_norm": 2.1867878637041027, + "language_loss": 0.72129643, + "learning_rate": 1.628099340440984e-06, + "loss": 0.74198651, + "num_input_tokens_seen": 205204895, + "step": 9526, + "time_per_iteration": 4.366199970245361 + }, + { + "auxiliary_loss_clip": 0.01050911, + "auxiliary_loss_mlp": 0.0103458, + "balance_loss_clip": 1.02361059, + "balance_loss_mlp": 1.02436984, + "epoch": 0.5727942281677438, + "flos": 28400564269440.0, + "grad_norm": 1.5971816154668115, + "language_loss": 0.80005962, + "learning_rate": 1.6277166783576176e-06, + "loss": 0.82091451, + "num_input_tokens_seen": 205223440, + "step": 9527, + "time_per_iteration": 2.670328378677368 + }, + { + "auxiliary_loss_clip": 0.01051298, + "auxiliary_loss_mlp": 0.01031448, + "balance_loss_clip": 1.02343702, + "balance_loss_mlp": 1.02089214, + "epoch": 0.5728543514204119, + "flos": 19536769929600.0, + "grad_norm": 1.941027060843055, + "language_loss": 0.72640622, + "learning_rate": 1.6273340303912713e-06, + "loss": 0.74723375, + "num_input_tokens_seen": 205242800, + "step": 9528, + "time_per_iteration": 2.622471332550049 + }, + { + "auxiliary_loss_clip": 0.01064444, + "auxiliary_loss_mlp": 0.01031055, + "balance_loss_clip": 1.02540135, + "balance_loss_mlp": 1.0205828, + "epoch": 0.5729144746730798, + "flos": 21506757390720.0, + "grad_norm": 2.5382971718998406, + "language_loss": 0.86010635, + "learning_rate": 1.6269513965564557e-06, + "loss": 0.88106132, + "num_input_tokens_seen": 205259465, + "step": 9529, + "time_per_iteration": 4.059460878372192 + }, + { + "auxiliary_loss_clip": 0.00992238, + "auxiliary_loss_mlp": 0.0100135, + "balance_loss_clip": 1.00528932, + "balance_loss_mlp": 1.00033689, + "epoch": 0.5729745979257478, + "flos": 58681628242560.0, + "grad_norm": 0.7638836103347504, + "language_loss": 0.56148398, + "learning_rate": 1.6265687768676813e-06, + "loss": 0.58141994, + "num_input_tokens_seen": 205314100, + "step": 9530, + "time_per_iteration": 3.044032335281372 + }, + { + "auxiliary_loss_clip": 0.01046377, + "auxiliary_loss_mlp": 0.01026678, + "balance_loss_clip": 1.02628231, + "balance_loss_mlp": 1.01632476, + "epoch": 0.5730347211784158, + "flos": 18552099421440.0, + "grad_norm": 2.5956452235217906, + "language_loss": 0.66497719, + "learning_rate": 1.6261861713394553e-06, + "loss": 0.68570781, + "num_input_tokens_seen": 205333420, + "step": 9531, + "time_per_iteration": 2.592371940612793 + }, + { + "auxiliary_loss_clip": 0.01048843, + "auxiliary_loss_mlp": 0.01033336, + "balance_loss_clip": 1.0236398, + "balance_loss_mlp": 1.02106333, + "epoch": 0.5730948444310837, + "flos": 38031482396160.0, + "grad_norm": 2.104519006255278, + "language_loss": 0.75467241, + "learning_rate": 1.6258035799862876e-06, + "loss": 0.77549422, + "num_input_tokens_seen": 205350995, + "step": 9532, + "time_per_iteration": 2.7389700412750244 + }, + { + "auxiliary_loss_clip": 0.01063246, + "auxiliary_loss_mlp": 0.01025457, + "balance_loss_clip": 1.02417004, + "balance_loss_mlp": 1.01479411, + "epoch": 0.5731549676837517, + "flos": 25227066689280.0, + "grad_norm": 1.3000058739056453, + "language_loss": 0.78812343, + "learning_rate": 1.625421002822686e-06, + "loss": 0.80901051, + "num_input_tokens_seen": 205372675, + "step": 9533, + "time_per_iteration": 2.600492238998413 + }, + { + "auxiliary_loss_clip": 0.01055249, + "auxiliary_loss_mlp": 0.01026749, + "balance_loss_clip": 1.02706277, + "balance_loss_mlp": 1.01691461, + "epoch": 0.5732150909364196, + "flos": 23368222886400.0, + "grad_norm": 1.6273959481647176, + "language_loss": 0.85893166, + "learning_rate": 1.6250384398631574e-06, + "loss": 0.87975168, + "num_input_tokens_seen": 205392590, + "step": 9534, + "time_per_iteration": 2.603935718536377 + }, + { + "auxiliary_loss_clip": 0.01044314, + "auxiliary_loss_mlp": 0.0102912, + "balance_loss_clip": 1.02473068, + "balance_loss_mlp": 1.01778889, + "epoch": 0.5732752141890877, + "flos": 23079357711360.0, + "grad_norm": 8.206053397847825, + "language_loss": 0.75601369, + "learning_rate": 1.6246558911222085e-06, + "loss": 0.77674806, + "num_input_tokens_seen": 205414885, + "step": 9535, + "time_per_iteration": 2.664461851119995 + }, + { + "auxiliary_loss_clip": 0.01045387, + "auxiliary_loss_mlp": 0.01030398, + "balance_loss_clip": 1.0251379, + "balance_loss_mlp": 1.01947236, + "epoch": 0.5733353374417556, + "flos": 24352282863360.0, + "grad_norm": 1.6227553106989736, + "language_loss": 0.70940745, + "learning_rate": 1.624273356614346e-06, + "loss": 0.73016524, + "num_input_tokens_seen": 205434440, + "step": 9536, + "time_per_iteration": 2.6484031677246094 + }, + { + "auxiliary_loss_clip": 0.01020561, + "auxiliary_loss_mlp": 0.01028621, + "balance_loss_clip": 1.02155209, + "balance_loss_mlp": 1.01725423, + "epoch": 0.5733954606944236, + "flos": 27198849830400.0, + "grad_norm": 1.9108500722286013, + "language_loss": 0.69816673, + "learning_rate": 1.6238908363540755e-06, + "loss": 0.71865851, + "num_input_tokens_seen": 205454225, + "step": 9537, + "time_per_iteration": 4.266822576522827 + }, + { + "auxiliary_loss_clip": 0.01066451, + "auxiliary_loss_mlp": 0.01028472, + "balance_loss_clip": 1.02702904, + "balance_loss_mlp": 1.01764202, + "epoch": 0.5734555839470915, + "flos": 28765129357440.0, + "grad_norm": 1.8370023838879197, + "language_loss": 0.62862229, + "learning_rate": 1.623508330355902e-06, + "loss": 0.6495716, + "num_input_tokens_seen": 205474750, + "step": 9538, + "time_per_iteration": 2.6492486000061035 + }, + { + "auxiliary_loss_clip": 0.01053421, + "auxiliary_loss_mlp": 0.01032493, + "balance_loss_clip": 1.02545309, + "balance_loss_mlp": 1.0212574, + "epoch": 0.5735157071997595, + "flos": 22966813422720.0, + "grad_norm": 1.6632709228091491, + "language_loss": 0.83157992, + "learning_rate": 1.6231258386343306e-06, + "loss": 0.85243911, + "num_input_tokens_seen": 205495495, + "step": 9539, + "time_per_iteration": 2.607116460800171 + }, + { + "auxiliary_loss_clip": 0.01035826, + "auxiliary_loss_mlp": 0.01027636, + "balance_loss_clip": 1.03152943, + "balance_loss_mlp": 1.01651955, + "epoch": 0.5735758304524274, + "flos": 18989455420800.0, + "grad_norm": 2.5788852181073243, + "language_loss": 0.72887337, + "learning_rate": 1.6227433612038647e-06, + "loss": 0.74950796, + "num_input_tokens_seen": 205510070, + "step": 9540, + "time_per_iteration": 2.7625951766967773 + }, + { + "auxiliary_loss_clip": 0.01051018, + "auxiliary_loss_mlp": 0.00747477, + "balance_loss_clip": 1.02347469, + "balance_loss_mlp": 1.0003525, + "epoch": 0.5736359537050955, + "flos": 28397942576640.0, + "grad_norm": 1.7967903888759857, + "language_loss": 0.80396098, + "learning_rate": 1.6223608980790089e-06, + "loss": 0.82194591, + "num_input_tokens_seen": 205530190, + "step": 9541, + "time_per_iteration": 2.6922237873077393 + }, + { + "auxiliary_loss_clip": 0.01046246, + "auxiliary_loss_mlp": 0.01030175, + "balance_loss_clip": 1.0254674, + "balance_loss_mlp": 1.01922584, + "epoch": 0.5736960769577634, + "flos": 15627210848640.0, + "grad_norm": 2.029542197879016, + "language_loss": 0.64142537, + "learning_rate": 1.6219784492742654e-06, + "loss": 0.6621896, + "num_input_tokens_seen": 205547380, + "step": 9542, + "time_per_iteration": 2.659926176071167 + }, + { + "auxiliary_loss_clip": 0.01044345, + "auxiliary_loss_mlp": 0.01025549, + "balance_loss_clip": 1.02395713, + "balance_loss_mlp": 1.01542234, + "epoch": 0.5737562002104314, + "flos": 18003994813440.0, + "grad_norm": 2.3905283228399212, + "language_loss": 0.82718909, + "learning_rate": 1.6215960148041365e-06, + "loss": 0.84788799, + "num_input_tokens_seen": 205566540, + "step": 9543, + "time_per_iteration": 4.260450601577759 + }, + { + "auxiliary_loss_clip": 0.01028101, + "auxiliary_loss_mlp": 0.01028704, + "balance_loss_clip": 1.02346206, + "balance_loss_mlp": 1.01629388, + "epoch": 0.5738163234630994, + "flos": 20698192287360.0, + "grad_norm": 3.031881190667688, + "language_loss": 0.7375899, + "learning_rate": 1.6212135946831257e-06, + "loss": 0.75815791, + "num_input_tokens_seen": 205584200, + "step": 9544, + "time_per_iteration": 2.7128682136535645 + }, + { + "auxiliary_loss_clip": 0.01014144, + "auxiliary_loss_mlp": 0.01027191, + "balance_loss_clip": 1.02100921, + "balance_loss_mlp": 1.01600361, + "epoch": 0.5738764467157673, + "flos": 23149311448320.0, + "grad_norm": 3.143884473775309, + "language_loss": 0.76304352, + "learning_rate": 1.620831188925733e-06, + "loss": 0.78345692, + "num_input_tokens_seen": 205604675, + "step": 9545, + "time_per_iteration": 2.7132503986358643 + }, + { + "auxiliary_loss_clip": 0.01045409, + "auxiliary_loss_mlp": 0.01033024, + "balance_loss_clip": 1.0256021, + "balance_loss_mlp": 1.02193725, + "epoch": 0.5739365699684353, + "flos": 29492930730240.0, + "grad_norm": 1.7713727247141948, + "language_loss": 0.56728661, + "learning_rate": 1.620448797546459e-06, + "loss": 0.58807099, + "num_input_tokens_seen": 205624680, + "step": 9546, + "time_per_iteration": 2.8292441368103027 + }, + { + "auxiliary_loss_clip": 0.01036127, + "auxiliary_loss_mlp": 0.01033645, + "balance_loss_clip": 1.02234292, + "balance_loss_mlp": 1.0219686, + "epoch": 0.5739966932211032, + "flos": 14027247342720.0, + "grad_norm": 2.418506902884738, + "language_loss": 0.76085997, + "learning_rate": 1.6200664205598055e-06, + "loss": 0.78155768, + "num_input_tokens_seen": 205641950, + "step": 9547, + "time_per_iteration": 2.5993149280548096 + }, + { + "auxiliary_loss_clip": 0.01049536, + "auxiliary_loss_mlp": 0.01032358, + "balance_loss_clip": 1.02257872, + "balance_loss_mlp": 1.02066374, + "epoch": 0.5740568164737713, + "flos": 19062030850560.0, + "grad_norm": 2.0237098281903485, + "language_loss": 0.74439394, + "learning_rate": 1.6196840579802704e-06, + "loss": 0.76521289, + "num_input_tokens_seen": 205660130, + "step": 9548, + "time_per_iteration": 2.6446750164031982 + }, + { + "auxiliary_loss_clip": 0.0102815, + "auxiliary_loss_mlp": 0.01031789, + "balance_loss_clip": 1.02028596, + "balance_loss_mlp": 1.02103639, + "epoch": 0.5741169397264392, + "flos": 22127832478080.0, + "grad_norm": 1.998645438616022, + "language_loss": 0.69582844, + "learning_rate": 1.619301709822355e-06, + "loss": 0.7164278, + "num_input_tokens_seen": 205678895, + "step": 9549, + "time_per_iteration": 2.7444965839385986 + }, + { + "auxiliary_loss_clip": 0.01019031, + "auxiliary_loss_mlp": 0.01026178, + "balance_loss_clip": 1.02643824, + "balance_loss_mlp": 1.01599157, + "epoch": 0.5741770629791072, + "flos": 24936836797440.0, + "grad_norm": 1.714588560561044, + "language_loss": 0.79779112, + "learning_rate": 1.6189193761005564e-06, + "loss": 0.81824321, + "num_input_tokens_seen": 205698450, + "step": 9550, + "time_per_iteration": 2.82737398147583 + }, + { + "auxiliary_loss_clip": 0.01040746, + "auxiliary_loss_mlp": 0.01038175, + "balance_loss_clip": 1.02504206, + "balance_loss_mlp": 1.02560484, + "epoch": 0.5742371862317751, + "flos": 18801462614400.0, + "grad_norm": 4.139441490097532, + "language_loss": 0.6775552, + "learning_rate": 1.6185370568293727e-06, + "loss": 0.69834441, + "num_input_tokens_seen": 205714870, + "step": 9551, + "time_per_iteration": 2.6389732360839844 + }, + { + "auxiliary_loss_clip": 0.0103616, + "auxiliary_loss_mlp": 0.0103102, + "balance_loss_clip": 1.02554512, + "balance_loss_mlp": 1.02007675, + "epoch": 0.5742973094844431, + "flos": 24460661174400.0, + "grad_norm": 3.2065853198470604, + "language_loss": 0.7191565, + "learning_rate": 1.6181547520233031e-06, + "loss": 0.73982835, + "num_input_tokens_seen": 205736045, + "step": 9552, + "time_per_iteration": 2.679379940032959 + }, + { + "auxiliary_loss_clip": 0.01059451, + "auxiliary_loss_mlp": 0.01033216, + "balance_loss_clip": 1.02970481, + "balance_loss_mlp": 1.02221251, + "epoch": 0.574357432737111, + "flos": 21652770176640.0, + "grad_norm": 2.066339495795643, + "language_loss": 0.80020386, + "learning_rate": 1.617772461696843e-06, + "loss": 0.82113051, + "num_input_tokens_seen": 205754445, + "step": 9553, + "time_per_iteration": 2.583955764770508 + }, + { + "auxiliary_loss_clip": 0.01057861, + "auxiliary_loss_mlp": 0.01026895, + "balance_loss_clip": 1.02564526, + "balance_loss_mlp": 1.01624918, + "epoch": 0.5744175559897791, + "flos": 16544728880640.0, + "grad_norm": 2.1300080587916086, + "language_loss": 0.8338359, + "learning_rate": 1.6173901858644895e-06, + "loss": 0.85468346, + "num_input_tokens_seen": 205770595, + "step": 9554, + "time_per_iteration": 2.581756830215454 + }, + { + "auxiliary_loss_clip": 0.01058207, + "auxiliary_loss_mlp": 0.00747569, + "balance_loss_clip": 1.02574229, + "balance_loss_mlp": 1.00047994, + "epoch": 0.574477679242447, + "flos": 24207598880640.0, + "grad_norm": 1.3817379937822798, + "language_loss": 0.7115742, + "learning_rate": 1.6170079245407385e-06, + "loss": 0.7296319, + "num_input_tokens_seen": 205791935, + "step": 9555, + "time_per_iteration": 2.6479780673980713 + }, + { + "auxiliary_loss_clip": 0.0104107, + "auxiliary_loss_mlp": 0.01024402, + "balance_loss_clip": 1.02218843, + "balance_loss_mlp": 1.01350617, + "epoch": 0.574537802495115, + "flos": 14903000835840.0, + "grad_norm": 2.3523086229887564, + "language_loss": 0.72909063, + "learning_rate": 1.6166256777400853e-06, + "loss": 0.74974537, + "num_input_tokens_seen": 205807260, + "step": 9556, + "time_per_iteration": 2.6345765590667725 + }, + { + "auxiliary_loss_clip": 0.01056422, + "auxiliary_loss_mlp": 0.01028497, + "balance_loss_clip": 1.02618408, + "balance_loss_mlp": 1.01767921, + "epoch": 0.5745979257477829, + "flos": 24934969290240.0, + "grad_norm": 1.5331344729119794, + "language_loss": 0.74164462, + "learning_rate": 1.6162434454770248e-06, + "loss": 0.76249385, + "num_input_tokens_seen": 205826885, + "step": 9557, + "time_per_iteration": 2.594050407409668 + }, + { + "auxiliary_loss_clip": 0.01052239, + "auxiliary_loss_mlp": 0.01034652, + "balance_loss_clip": 1.02428937, + "balance_loss_mlp": 1.02389348, + "epoch": 0.5746580490004509, + "flos": 17235757704960.0, + "grad_norm": 1.6191189323238868, + "language_loss": 0.68001378, + "learning_rate": 1.6158612277660514e-06, + "loss": 0.70088267, + "num_input_tokens_seen": 205844630, + "step": 9558, + "time_per_iteration": 2.6022396087646484 + }, + { + "auxiliary_loss_clip": 0.01043108, + "auxiliary_loss_mlp": 0.01042052, + "balance_loss_clip": 1.0247283, + "balance_loss_mlp": 1.02684116, + "epoch": 0.5747181722531189, + "flos": 13187871348480.0, + "grad_norm": 1.9330049816566566, + "language_loss": 0.7060613, + "learning_rate": 1.615479024621659e-06, + "loss": 0.72691286, + "num_input_tokens_seen": 205860960, + "step": 9559, + "time_per_iteration": 2.7104289531707764 + }, + { + "auxiliary_loss_clip": 0.01041674, + "auxiliary_loss_mlp": 0.00747315, + "balance_loss_clip": 1.02389753, + "balance_loss_mlp": 1.00047994, + "epoch": 0.5747782955057869, + "flos": 22963006581120.0, + "grad_norm": 1.5969491041549522, + "language_loss": 0.79125011, + "learning_rate": 1.6150968360583398e-06, + "loss": 0.80913997, + "num_input_tokens_seen": 205880675, + "step": 9560, + "time_per_iteration": 2.6323771476745605 + }, + { + "auxiliary_loss_clip": 0.01007027, + "auxiliary_loss_mlp": 0.01029256, + "balance_loss_clip": 1.02408659, + "balance_loss_mlp": 1.01795459, + "epoch": 0.5748384187584549, + "flos": 23403235668480.0, + "grad_norm": 2.04773973465742, + "language_loss": 0.63875401, + "learning_rate": 1.614714662090588e-06, + "loss": 0.6591168, + "num_input_tokens_seen": 205900050, + "step": 9561, + "time_per_iteration": 3.049988031387329 + }, + { + "auxiliary_loss_clip": 0.01058453, + "auxiliary_loss_mlp": 0.01032202, + "balance_loss_clip": 1.02620053, + "balance_loss_mlp": 1.02037072, + "epoch": 0.5748985420111228, + "flos": 17785514338560.0, + "grad_norm": 1.654720179623099, + "language_loss": 0.71393293, + "learning_rate": 1.6143325027328945e-06, + "loss": 0.73483944, + "num_input_tokens_seen": 205918855, + "step": 9562, + "time_per_iteration": 2.6112723350524902 + }, + { + "auxiliary_loss_clip": 0.01017079, + "auxiliary_loss_mlp": 0.01032578, + "balance_loss_clip": 1.02316332, + "balance_loss_mlp": 1.02242136, + "epoch": 0.5749586652637908, + "flos": 19866250408320.0, + "grad_norm": 1.46529433735871, + "language_loss": 0.84162676, + "learning_rate": 1.613950357999751e-06, + "loss": 0.86212325, + "num_input_tokens_seen": 205936970, + "step": 9563, + "time_per_iteration": 2.7313456535339355 + }, + { + "auxiliary_loss_clip": 0.01016519, + "auxiliary_loss_mlp": 0.01036266, + "balance_loss_clip": 1.02450907, + "balance_loss_mlp": 1.02462554, + "epoch": 0.5750187885164587, + "flos": 21287235421440.0, + "grad_norm": 1.9949337735993433, + "language_loss": 0.5733729, + "learning_rate": 1.6135682279056488e-06, + "loss": 0.59390074, + "num_input_tokens_seen": 205954630, + "step": 9564, + "time_per_iteration": 2.828784227371216 + }, + { + "auxiliary_loss_clip": 0.01037344, + "auxiliary_loss_mlp": 0.01028953, + "balance_loss_clip": 1.02289701, + "balance_loss_mlp": 1.0178014, + "epoch": 0.5750789117691267, + "flos": 18804658924800.0, + "grad_norm": 1.9769235546743433, + "language_loss": 0.75618327, + "learning_rate": 1.613186112465078e-06, + "loss": 0.77684617, + "num_input_tokens_seen": 205971510, + "step": 9565, + "time_per_iteration": 2.6793689727783203 + }, + { + "auxiliary_loss_clip": 0.00974289, + "auxiliary_loss_mlp": 0.01004196, + "balance_loss_clip": 1.00745749, + "balance_loss_mlp": 1.00298595, + "epoch": 0.5751390350217946, + "flos": 70663224124800.0, + "grad_norm": 0.7470819471144244, + "language_loss": 0.60741341, + "learning_rate": 1.6128040116925287e-06, + "loss": 0.62719834, + "num_input_tokens_seen": 206035125, + "step": 9566, + "time_per_iteration": 3.374385118484497 + }, + { + "auxiliary_loss_clip": 0.01043998, + "auxiliary_loss_mlp": 0.01031508, + "balance_loss_clip": 1.02576661, + "balance_loss_mlp": 1.02064192, + "epoch": 0.5751991582744627, + "flos": 14246338348800.0, + "grad_norm": 2.0565577218187623, + "language_loss": 0.7544744, + "learning_rate": 1.6124219256024901e-06, + "loss": 0.77522945, + "num_input_tokens_seen": 206052075, + "step": 9567, + "time_per_iteration": 2.653728723526001 + }, + { + "auxiliary_loss_clip": 0.01054485, + "auxiliary_loss_mlp": 0.01026819, + "balance_loss_clip": 1.02579427, + "balance_loss_mlp": 1.01685905, + "epoch": 0.5752592815271306, + "flos": 18328160079360.0, + "grad_norm": 1.7652687830115648, + "language_loss": 0.74812549, + "learning_rate": 1.6120398542094504e-06, + "loss": 0.76893854, + "num_input_tokens_seen": 206069970, + "step": 9568, + "time_per_iteration": 2.602973699569702 + }, + { + "auxiliary_loss_clip": 0.01065586, + "auxiliary_loss_mlp": 0.01029044, + "balance_loss_clip": 1.02509081, + "balance_loss_mlp": 1.01804662, + "epoch": 0.5753194047797986, + "flos": 20922742160640.0, + "grad_norm": 1.6851580109281172, + "language_loss": 0.71038401, + "learning_rate": 1.6116577975278994e-06, + "loss": 0.73133028, + "num_input_tokens_seen": 206088950, + "step": 9569, + "time_per_iteration": 2.579655408859253 + }, + { + "auxiliary_loss_clip": 0.01057095, + "auxiliary_loss_mlp": 0.01031945, + "balance_loss_clip": 1.02649188, + "balance_loss_mlp": 1.02077532, + "epoch": 0.5753795280324665, + "flos": 19281804215040.0, + "grad_norm": 2.431805620571243, + "language_loss": 0.55572784, + "learning_rate": 1.6112757555723223e-06, + "loss": 0.57661825, + "num_input_tokens_seen": 206107780, + "step": 9570, + "time_per_iteration": 2.5350818634033203 + }, + { + "auxiliary_loss_clip": 0.01062854, + "auxiliary_loss_mlp": 0.0102901, + "balance_loss_clip": 1.0236764, + "balance_loss_mlp": 1.019032, + "epoch": 0.5754396512851345, + "flos": 21652877917440.0, + "grad_norm": 2.14465329776966, + "language_loss": 0.64032626, + "learning_rate": 1.6108937283572082e-06, + "loss": 0.66124493, + "num_input_tokens_seen": 206127445, + "step": 9571, + "time_per_iteration": 2.525664806365967 + }, + { + "auxiliary_loss_clip": 0.01056234, + "auxiliary_loss_mlp": 0.01029958, + "balance_loss_clip": 1.02544403, + "balance_loss_mlp": 1.0190506, + "epoch": 0.5754997745378025, + "flos": 51021700179840.0, + "grad_norm": 1.5335580204256658, + "language_loss": 0.66913974, + "learning_rate": 1.6105117158970434e-06, + "loss": 0.69000167, + "num_input_tokens_seen": 206152005, + "step": 9572, + "time_per_iteration": 2.886845350265503 + }, + { + "auxiliary_loss_clip": 0.01046428, + "auxiliary_loss_mlp": 0.01029124, + "balance_loss_clip": 1.02642882, + "balance_loss_mlp": 1.01771557, + "epoch": 0.5755598977904705, + "flos": 22856890826880.0, + "grad_norm": 1.8202312541645607, + "language_loss": 0.72409892, + "learning_rate": 1.6101297182063123e-06, + "loss": 0.74485439, + "num_input_tokens_seen": 206169875, + "step": 9573, + "time_per_iteration": 4.293658494949341 + }, + { + "auxiliary_loss_clip": 0.01062668, + "auxiliary_loss_mlp": 0.01028394, + "balance_loss_clip": 1.02701521, + "balance_loss_mlp": 1.0190835, + "epoch": 0.5756200210431385, + "flos": 38472824805120.0, + "grad_norm": 1.8089287593132732, + "language_loss": 0.76577598, + "learning_rate": 1.6097477352995022e-06, + "loss": 0.78668666, + "num_input_tokens_seen": 206192635, + "step": 9574, + "time_per_iteration": 2.655473470687866 + }, + { + "auxiliary_loss_clip": 0.01010244, + "auxiliary_loss_mlp": 0.01035921, + "balance_loss_clip": 1.02059293, + "balance_loss_mlp": 1.02342224, + "epoch": 0.5756801442958064, + "flos": 23910006700800.0, + "grad_norm": 3.2270555643155974, + "language_loss": 0.66566962, + "learning_rate": 1.6093657671910968e-06, + "loss": 0.6861313, + "num_input_tokens_seen": 206211485, + "step": 9575, + "time_per_iteration": 4.328810930252075 + }, + { + "auxiliary_loss_clip": 0.01045389, + "auxiliary_loss_mlp": 0.0102857, + "balance_loss_clip": 1.02691007, + "balance_loss_mlp": 1.0180912, + "epoch": 0.5757402675484744, + "flos": 21105276099840.0, + "grad_norm": 1.5114005050949553, + "language_loss": 0.79791224, + "learning_rate": 1.6089838138955804e-06, + "loss": 0.8186518, + "num_input_tokens_seen": 206231740, + "step": 9576, + "time_per_iteration": 2.640730381011963 + }, + { + "auxiliary_loss_clip": 0.0104204, + "auxiliary_loss_mlp": 0.01028215, + "balance_loss_clip": 1.02356482, + "balance_loss_mlp": 1.01836252, + "epoch": 0.5758003908011423, + "flos": 20559110826240.0, + "grad_norm": 1.7989347877449915, + "language_loss": 0.69556636, + "learning_rate": 1.6086018754274372e-06, + "loss": 0.7162689, + "num_input_tokens_seen": 206250975, + "step": 9577, + "time_per_iteration": 2.6637351512908936 + }, + { + "auxiliary_loss_clip": 0.01056667, + "auxiliary_loss_mlp": 0.01029052, + "balance_loss_clip": 1.02512419, + "balance_loss_mlp": 1.01880622, + "epoch": 0.5758605140538103, + "flos": 16473015377280.0, + "grad_norm": 1.6417924117994134, + "language_loss": 0.6663937, + "learning_rate": 1.6082199518011504e-06, + "loss": 0.68725085, + "num_input_tokens_seen": 206268800, + "step": 9578, + "time_per_iteration": 2.5677249431610107 + }, + { + "auxiliary_loss_clip": 0.01035573, + "auxiliary_loss_mlp": 0.01025053, + "balance_loss_clip": 1.02550006, + "balance_loss_mlp": 1.01515865, + "epoch": 0.5759206373064782, + "flos": 21287558643840.0, + "grad_norm": 1.5946232354484686, + "language_loss": 0.72708082, + "learning_rate": 1.6078380430312016e-06, + "loss": 0.74768716, + "num_input_tokens_seen": 206287190, + "step": 9579, + "time_per_iteration": 2.5787341594696045 + }, + { + "auxiliary_loss_clip": 0.01048235, + "auxiliary_loss_mlp": 0.01028241, + "balance_loss_clip": 1.02567792, + "balance_loss_mlp": 1.0164988, + "epoch": 0.5759807605591463, + "flos": 26067879227520.0, + "grad_norm": 2.5119663316142806, + "language_loss": 0.64595896, + "learning_rate": 1.6074561491320742e-06, + "loss": 0.66672367, + "num_input_tokens_seen": 206307020, + "step": 9580, + "time_per_iteration": 2.6091363430023193 + }, + { + "auxiliary_loss_clip": 0.01036935, + "auxiliary_loss_mlp": 0.0103419, + "balance_loss_clip": 1.02300608, + "balance_loss_mlp": 1.02280521, + "epoch": 0.5760408838118142, + "flos": 18873068376960.0, + "grad_norm": 1.7470676515745067, + "language_loss": 0.85578877, + "learning_rate": 1.6070742701182486e-06, + "loss": 0.87650001, + "num_input_tokens_seen": 206324095, + "step": 9581, + "time_per_iteration": 2.62127423286438 + }, + { + "auxiliary_loss_clip": 0.01073186, + "auxiliary_loss_mlp": 0.01035274, + "balance_loss_clip": 1.03045952, + "balance_loss_mlp": 1.02396727, + "epoch": 0.5761010070644822, + "flos": 15378134964480.0, + "grad_norm": 2.1267859960614572, + "language_loss": 0.66798109, + "learning_rate": 1.6066924060042057e-06, + "loss": 0.68906569, + "num_input_tokens_seen": 206343210, + "step": 9582, + "time_per_iteration": 2.556192636489868 + }, + { + "auxiliary_loss_clip": 0.00992487, + "auxiliary_loss_mlp": 0.01006577, + "balance_loss_clip": 1.00493956, + "balance_loss_mlp": 1.0052712, + "epoch": 0.5761611303171501, + "flos": 71471932882560.0, + "grad_norm": 0.6984542270510752, + "language_loss": 0.57171559, + "learning_rate": 1.6063105568044271e-06, + "loss": 0.59170628, + "num_input_tokens_seen": 206415935, + "step": 9583, + "time_per_iteration": 3.372565269470215 + }, + { + "auxiliary_loss_clip": 0.0104333, + "auxiliary_loss_mlp": 0.01031228, + "balance_loss_clip": 1.02640951, + "balance_loss_mlp": 1.02048683, + "epoch": 0.5762212535698181, + "flos": 16246167033600.0, + "grad_norm": 1.8498099853427816, + "language_loss": 0.82531261, + "learning_rate": 1.6059287225333912e-06, + "loss": 0.84605819, + "num_input_tokens_seen": 206431900, + "step": 9584, + "time_per_iteration": 4.257268190383911 + }, + { + "auxiliary_loss_clip": 0.01008528, + "auxiliary_loss_mlp": 0.01002389, + "balance_loss_clip": 1.0022496, + "balance_loss_mlp": 1.00129843, + "epoch": 0.5762813768224861, + "flos": 70185504216960.0, + "grad_norm": 0.6241594623188649, + "language_loss": 0.49500197, + "learning_rate": 1.6055469032055773e-06, + "loss": 0.51511115, + "num_input_tokens_seen": 206501200, + "step": 9585, + "time_per_iteration": 3.1811981201171875 + }, + { + "auxiliary_loss_clip": 0.01041817, + "auxiliary_loss_mlp": 0.01023521, + "balance_loss_clip": 1.02365899, + "balance_loss_mlp": 1.01329851, + "epoch": 0.5763415000751541, + "flos": 20518028645760.0, + "grad_norm": 1.4732085273775497, + "language_loss": 0.84896934, + "learning_rate": 1.605165098835465e-06, + "loss": 0.86962271, + "num_input_tokens_seen": 206520575, + "step": 9586, + "time_per_iteration": 2.5982325077056885 + }, + { + "auxiliary_loss_clip": 0.01055296, + "auxiliary_loss_mlp": 0.01031537, + "balance_loss_clip": 1.02562737, + "balance_loss_mlp": 1.0195682, + "epoch": 0.5764016233278221, + "flos": 15815526877440.0, + "grad_norm": 7.057311968852774, + "language_loss": 0.79908633, + "learning_rate": 1.6047833094375308e-06, + "loss": 0.81995463, + "num_input_tokens_seen": 206538060, + "step": 9587, + "time_per_iteration": 2.7594122886657715 + }, + { + "auxiliary_loss_clip": 0.01034487, + "auxiliary_loss_mlp": 0.01034549, + "balance_loss_clip": 1.02228522, + "balance_loss_mlp": 1.02296793, + "epoch": 0.57646174658049, + "flos": 20772312001920.0, + "grad_norm": 1.7733455101817555, + "language_loss": 0.66160595, + "learning_rate": 1.6044015350262542e-06, + "loss": 0.68229634, + "num_input_tokens_seen": 206557320, + "step": 9588, + "time_per_iteration": 2.693173408508301 + }, + { + "auxiliary_loss_clip": 0.0103799, + "auxiliary_loss_mlp": 0.01034909, + "balance_loss_clip": 1.02384806, + "balance_loss_mlp": 1.02254069, + "epoch": 0.576521869833158, + "flos": 23549930812800.0, + "grad_norm": 1.7763709951191906, + "language_loss": 0.78505313, + "learning_rate": 1.6040197756161104e-06, + "loss": 0.80578208, + "num_input_tokens_seen": 206575780, + "step": 9589, + "time_per_iteration": 2.6358675956726074 + }, + { + "auxiliary_loss_clip": 0.01061589, + "auxiliary_loss_mlp": 0.01020305, + "balance_loss_clip": 1.02332175, + "balance_loss_mlp": 1.01055408, + "epoch": 0.5765819930858259, + "flos": 20266582464000.0, + "grad_norm": 1.9926428472281774, + "language_loss": 0.7935729, + "learning_rate": 1.6036380312215762e-06, + "loss": 0.81439179, + "num_input_tokens_seen": 206594100, + "step": 9590, + "time_per_iteration": 2.5230770111083984 + }, + { + "auxiliary_loss_clip": 0.01009197, + "auxiliary_loss_mlp": 0.00747542, + "balance_loss_clip": 1.02288604, + "balance_loss_mlp": 1.00042009, + "epoch": 0.5766421163384939, + "flos": 23148772744320.0, + "grad_norm": 1.6852513950879413, + "language_loss": 0.63053119, + "learning_rate": 1.6032563018571283e-06, + "loss": 0.64809859, + "num_input_tokens_seen": 206613325, + "step": 9591, + "time_per_iteration": 4.33865213394165 + }, + { + "auxiliary_loss_clip": 0.01066124, + "auxiliary_loss_mlp": 0.00747587, + "balance_loss_clip": 1.02681565, + "balance_loss_mlp": 1.00052702, + "epoch": 0.5767022395911618, + "flos": 25848895962240.0, + "grad_norm": 1.7726845556533326, + "language_loss": 0.78102481, + "learning_rate": 1.6028745875372406e-06, + "loss": 0.79916191, + "num_input_tokens_seen": 206634265, + "step": 9592, + "time_per_iteration": 2.836641311645508 + }, + { + "auxiliary_loss_clip": 0.00963582, + "auxiliary_loss_mlp": 0.01001428, + "balance_loss_clip": 1.00590777, + "balance_loss_mlp": 1.00023007, + "epoch": 0.5767623628438299, + "flos": 68293299657600.0, + "grad_norm": 0.7283090856437092, + "language_loss": 0.59645271, + "learning_rate": 1.6024928882763885e-06, + "loss": 0.61610281, + "num_input_tokens_seen": 206696990, + "step": 9593, + "time_per_iteration": 3.539196491241455 + }, + { + "auxiliary_loss_clip": 0.01055286, + "auxiliary_loss_mlp": 0.01033602, + "balance_loss_clip": 1.0250119, + "balance_loss_mlp": 1.02197874, + "epoch": 0.5768224860964978, + "flos": 30188448754560.0, + "grad_norm": 1.8742599333659204, + "language_loss": 0.71021318, + "learning_rate": 1.6021112040890463e-06, + "loss": 0.73110199, + "num_input_tokens_seen": 206717815, + "step": 9594, + "time_per_iteration": 3.1171462535858154 + }, + { + "auxiliary_loss_clip": 0.01033039, + "auxiliary_loss_mlp": 0.01026753, + "balance_loss_clip": 1.02348471, + "balance_loss_mlp": 1.01711464, + "epoch": 0.5768826093491658, + "flos": 17895041884800.0, + "grad_norm": 1.6822223962755685, + "language_loss": 0.71147668, + "learning_rate": 1.6017295349896863e-06, + "loss": 0.73207462, + "num_input_tokens_seen": 206735985, + "step": 9595, + "time_per_iteration": 2.720193862915039 + }, + { + "auxiliary_loss_clip": 0.01064679, + "auxiliary_loss_mlp": 0.01024793, + "balance_loss_clip": 1.02521586, + "balance_loss_mlp": 1.0139513, + "epoch": 0.5769427326018337, + "flos": 17457183095040.0, + "grad_norm": 2.1451010146770586, + "language_loss": 0.69818723, + "learning_rate": 1.6013478809927828e-06, + "loss": 0.71908194, + "num_input_tokens_seen": 206753370, + "step": 9596, + "time_per_iteration": 2.775252342224121 + }, + { + "auxiliary_loss_clip": 0.01052229, + "auxiliary_loss_mlp": 0.01034425, + "balance_loss_clip": 1.03000534, + "balance_loss_mlp": 1.02237868, + "epoch": 0.5770028558545017, + "flos": 39421728345600.0, + "grad_norm": 2.384205168663852, + "language_loss": 0.67545414, + "learning_rate": 1.6009662421128074e-06, + "loss": 0.69632065, + "num_input_tokens_seen": 206777645, + "step": 9597, + "time_per_iteration": 2.887108087539673 + }, + { + "auxiliary_loss_clip": 0.01045082, + "auxiliary_loss_mlp": 0.01032088, + "balance_loss_clip": 1.02595139, + "balance_loss_mlp": 1.02135301, + "epoch": 0.5770629791071697, + "flos": 21536383132800.0, + "grad_norm": 1.7701275902564364, + "language_loss": 0.81724542, + "learning_rate": 1.6005846183642323e-06, + "loss": 0.83801711, + "num_input_tokens_seen": 206794865, + "step": 9598, + "time_per_iteration": 2.719355821609497 + }, + { + "auxiliary_loss_clip": 0.01016863, + "auxiliary_loss_mlp": 0.01041757, + "balance_loss_clip": 1.01992285, + "balance_loss_mlp": 1.02838755, + "epoch": 0.5771231023598377, + "flos": 20886795624960.0, + "grad_norm": 1.7273162371474846, + "language_loss": 0.72551483, + "learning_rate": 1.6002030097615277e-06, + "loss": 0.74610102, + "num_input_tokens_seen": 206814095, + "step": 9599, + "time_per_iteration": 2.8153457641601562 + }, + { + "auxiliary_loss_clip": 0.01061648, + "auxiliary_loss_mlp": 0.0102732, + "balance_loss_clip": 1.02433205, + "balance_loss_mlp": 1.01701403, + "epoch": 0.5771832256125057, + "flos": 18077216688000.0, + "grad_norm": 2.29942071915832, + "language_loss": 0.77967596, + "learning_rate": 1.5998214163191663e-06, + "loss": 0.8005656, + "num_input_tokens_seen": 206832245, + "step": 9600, + "time_per_iteration": 2.6832077503204346 + }, + { + "auxiliary_loss_clip": 0.01056662, + "auxiliary_loss_mlp": 0.00747647, + "balance_loss_clip": 1.02630973, + "balance_loss_mlp": 1.00047874, + "epoch": 0.5772433488651736, + "flos": 26359078786560.0, + "grad_norm": 1.6337310243101963, + "language_loss": 0.72376096, + "learning_rate": 1.5994398380516163e-06, + "loss": 0.741804, + "num_input_tokens_seen": 206851535, + "step": 9601, + "time_per_iteration": 2.6127002239227295 + }, + { + "auxiliary_loss_clip": 0.01019944, + "auxiliary_loss_mlp": 0.01033254, + "balance_loss_clip": 1.03068876, + "balance_loss_mlp": 1.02232277, + "epoch": 0.5773034721178416, + "flos": 19680987035520.0, + "grad_norm": 1.7265080095280516, + "language_loss": 0.68500519, + "learning_rate": 1.599058274973348e-06, + "loss": 0.7055372, + "num_input_tokens_seen": 206870595, + "step": 9602, + "time_per_iteration": 2.7885501384735107 + }, + { + "auxiliary_loss_clip": 0.01034775, + "auxiliary_loss_mlp": 0.01034967, + "balance_loss_clip": 1.02220178, + "balance_loss_mlp": 1.02364802, + "epoch": 0.5773635953705095, + "flos": 25082885496960.0, + "grad_norm": 1.4459133584524204, + "language_loss": 0.73073, + "learning_rate": 1.5986767270988297e-06, + "loss": 0.75142741, + "num_input_tokens_seen": 206892320, + "step": 9603, + "time_per_iteration": 2.7208895683288574 + }, + { + "auxiliary_loss_clip": 0.01054626, + "auxiliary_loss_mlp": 0.01026715, + "balance_loss_clip": 1.02644503, + "balance_loss_mlp": 1.01614654, + "epoch": 0.5774237186231775, + "flos": 21032987978880.0, + "grad_norm": 1.8185565313068077, + "language_loss": 0.76591438, + "learning_rate": 1.5982951944425298e-06, + "loss": 0.78672779, + "num_input_tokens_seen": 206912485, + "step": 9604, + "time_per_iteration": 2.6335670948028564 + }, + { + "auxiliary_loss_clip": 0.01036228, + "auxiliary_loss_mlp": 0.01033134, + "balance_loss_clip": 1.02616334, + "balance_loss_mlp": 1.02167225, + "epoch": 0.5774838418758454, + "flos": 15231727128960.0, + "grad_norm": 1.781570609401977, + "language_loss": 0.83302736, + "learning_rate": 1.5979136770189174e-06, + "loss": 0.85372102, + "num_input_tokens_seen": 206929100, + "step": 9605, + "time_per_iteration": 2.720242738723755 + }, + { + "auxiliary_loss_clip": 0.01040114, + "auxiliary_loss_mlp": 0.01029867, + "balance_loss_clip": 1.02612042, + "balance_loss_mlp": 1.01658702, + "epoch": 0.5775439651285135, + "flos": 23582609210880.0, + "grad_norm": 1.6227280249736815, + "language_loss": 0.78034651, + "learning_rate": 1.5975321748424581e-06, + "loss": 0.80104625, + "num_input_tokens_seen": 206947020, + "step": 9606, + "time_per_iteration": 2.858137845993042 + }, + { + "auxiliary_loss_clip": 0.010636, + "auxiliary_loss_mlp": 0.01031073, + "balance_loss_clip": 1.02582264, + "balance_loss_mlp": 1.02102375, + "epoch": 0.5776040883811814, + "flos": 18040515966720.0, + "grad_norm": 1.746027667927609, + "language_loss": 0.73742914, + "learning_rate": 1.597150687927619e-06, + "loss": 0.75837588, + "num_input_tokens_seen": 206964065, + "step": 9607, + "time_per_iteration": 2.6023991107940674 + }, + { + "auxiliary_loss_clip": 0.01033975, + "auxiliary_loss_mlp": 0.01033081, + "balance_loss_clip": 1.02659917, + "balance_loss_mlp": 1.02169025, + "epoch": 0.5776642116338494, + "flos": 18624638937600.0, + "grad_norm": 1.7275924844665729, + "language_loss": 0.69423342, + "learning_rate": 1.5967692162888664e-06, + "loss": 0.71490401, + "num_input_tokens_seen": 206981940, + "step": 9608, + "time_per_iteration": 2.6724114418029785 + }, + { + "auxiliary_loss_clip": 0.01032173, + "auxiliary_loss_mlp": 0.01029905, + "balance_loss_clip": 1.02323043, + "balance_loss_mlp": 1.01901484, + "epoch": 0.5777243348865173, + "flos": 28402539517440.0, + "grad_norm": 2.090118172916055, + "language_loss": 0.76180363, + "learning_rate": 1.596387759940665e-06, + "loss": 0.78242445, + "num_input_tokens_seen": 207002365, + "step": 9609, + "time_per_iteration": 2.8284246921539307 + }, + { + "auxiliary_loss_clip": 0.01037755, + "auxiliary_loss_mlp": 0.01030309, + "balance_loss_clip": 1.02579069, + "balance_loss_mlp": 1.01912749, + "epoch": 0.5777844581391853, + "flos": 24024705805440.0, + "grad_norm": 1.6786251875570768, + "language_loss": 0.77253103, + "learning_rate": 1.5960063188974808e-06, + "loss": 0.79321164, + "num_input_tokens_seen": 207021195, + "step": 9610, + "time_per_iteration": 2.6970632076263428 + }, + { + "auxiliary_loss_clip": 0.01028917, + "auxiliary_loss_mlp": 0.0102739, + "balance_loss_clip": 1.02251172, + "balance_loss_mlp": 1.01591063, + "epoch": 0.5778445813918534, + "flos": 17777361951360.0, + "grad_norm": 2.414770297559562, + "language_loss": 0.68461484, + "learning_rate": 1.5956248931737777e-06, + "loss": 0.70517796, + "num_input_tokens_seen": 207037465, + "step": 9611, + "time_per_iteration": 2.634280204772949 + }, + { + "auxiliary_loss_clip": 0.01048499, + "auxiliary_loss_mlp": 0.01024199, + "balance_loss_clip": 1.0219512, + "balance_loss_mlp": 1.01358902, + "epoch": 0.5779047046445213, + "flos": 22233194046720.0, + "grad_norm": 1.9887859087992166, + "language_loss": 0.83140641, + "learning_rate": 1.5952434827840185e-06, + "loss": 0.85213339, + "num_input_tokens_seen": 207054230, + "step": 9612, + "time_per_iteration": 2.5655264854431152 + }, + { + "auxiliary_loss_clip": 0.01066139, + "auxiliary_loss_mlp": 0.01029764, + "balance_loss_clip": 1.02641058, + "balance_loss_mlp": 1.01870751, + "epoch": 0.5779648278971893, + "flos": 21434361528960.0, + "grad_norm": 1.9019736305112231, + "language_loss": 0.79603744, + "learning_rate": 1.594862087742667e-06, + "loss": 0.81699646, + "num_input_tokens_seen": 207073150, + "step": 9613, + "time_per_iteration": 2.5901474952697754 + }, + { + "auxiliary_loss_clip": 0.01054592, + "auxiliary_loss_mlp": 0.0102808, + "balance_loss_clip": 1.02524364, + "balance_loss_mlp": 1.01855528, + "epoch": 0.5780249511498572, + "flos": 19026120228480.0, + "grad_norm": 1.9270414612644589, + "language_loss": 0.7756201, + "learning_rate": 1.5944807080641863e-06, + "loss": 0.7964468, + "num_input_tokens_seen": 207090375, + "step": 9614, + "time_per_iteration": 2.538684368133545 + }, + { + "auxiliary_loss_clip": 0.010363, + "auxiliary_loss_mlp": 0.01031621, + "balance_loss_clip": 1.02514613, + "balance_loss_mlp": 1.02060628, + "epoch": 0.5780850744025252, + "flos": 12124663752960.0, + "grad_norm": 2.0637098453907776, + "language_loss": 0.81329376, + "learning_rate": 1.5940993437630375e-06, + "loss": 0.83397293, + "num_input_tokens_seen": 207106030, + "step": 9615, + "time_per_iteration": 2.595228672027588 + }, + { + "auxiliary_loss_clip": 0.01049946, + "auxiliary_loss_mlp": 0.01029238, + "balance_loss_clip": 1.02306247, + "balance_loss_mlp": 1.01803827, + "epoch": 0.5781451976551931, + "flos": 25044425009280.0, + "grad_norm": 1.4524474469671256, + "language_loss": 0.67131346, + "learning_rate": 1.5937179948536825e-06, + "loss": 0.69210535, + "num_input_tokens_seen": 207125435, + "step": 9616, + "time_per_iteration": 2.5966460704803467 + }, + { + "auxiliary_loss_clip": 0.01054842, + "auxiliary_loss_mlp": 0.01024538, + "balance_loss_clip": 1.02587521, + "balance_loss_mlp": 1.01424384, + "epoch": 0.5782053209078611, + "flos": 19245606284160.0, + "grad_norm": 1.6681482880843415, + "language_loss": 0.7759937, + "learning_rate": 1.5933366613505812e-06, + "loss": 0.7967875, + "num_input_tokens_seen": 207145095, + "step": 9617, + "time_per_iteration": 2.675462007522583 + }, + { + "auxiliary_loss_clip": 0.01044, + "auxiliary_loss_mlp": 0.01027125, + "balance_loss_clip": 1.02499938, + "balance_loss_mlp": 1.01564491, + "epoch": 0.578265444160529, + "flos": 25993831340160.0, + "grad_norm": 3.031804281207547, + "language_loss": 0.75329244, + "learning_rate": 1.5929553432681947e-06, + "loss": 0.77400374, + "num_input_tokens_seen": 207166045, + "step": 9618, + "time_per_iteration": 2.725411891937256 + }, + { + "auxiliary_loss_clip": 0.01062499, + "auxiliary_loss_mlp": 0.01029455, + "balance_loss_clip": 1.02483225, + "balance_loss_mlp": 1.01866639, + "epoch": 0.5783255674131971, + "flos": 21798603394560.0, + "grad_norm": 1.5796222305202612, + "language_loss": 0.8162021, + "learning_rate": 1.5925740406209826e-06, + "loss": 0.83712161, + "num_input_tokens_seen": 207185290, + "step": 9619, + "time_per_iteration": 2.5046401023864746 + }, + { + "auxiliary_loss_clip": 0.01044664, + "auxiliary_loss_mlp": 0.01028094, + "balance_loss_clip": 1.02398682, + "balance_loss_mlp": 1.0176158, + "epoch": 0.578385690665865, + "flos": 24789746603520.0, + "grad_norm": 1.6408773776261198, + "language_loss": 0.72318292, + "learning_rate": 1.5921927534234039e-06, + "loss": 0.74391049, + "num_input_tokens_seen": 207205505, + "step": 9620, + "time_per_iteration": 2.6316254138946533 + }, + { + "auxiliary_loss_clip": 0.01043691, + "auxiliary_loss_mlp": 0.01026153, + "balance_loss_clip": 1.02400291, + "balance_loss_mlp": 1.01542401, + "epoch": 0.578445813918533, + "flos": 21212864311680.0, + "grad_norm": 1.4561122047337165, + "language_loss": 0.76823843, + "learning_rate": 1.591811481689916e-06, + "loss": 0.78893691, + "num_input_tokens_seen": 207225315, + "step": 9621, + "time_per_iteration": 4.212768793106079 + }, + { + "auxiliary_loss_clip": 0.01000918, + "auxiliary_loss_mlp": 0.01038544, + "balance_loss_clip": 1.02046776, + "balance_loss_mlp": 1.02604485, + "epoch": 0.5785059371712009, + "flos": 25046795306880.0, + "grad_norm": 1.5670246382243107, + "language_loss": 0.70212126, + "learning_rate": 1.5914302254349787e-06, + "loss": 0.72251594, + "num_input_tokens_seen": 207247690, + "step": 9622, + "time_per_iteration": 4.433140993118286 + }, + { + "auxiliary_loss_clip": 0.00991479, + "auxiliary_loss_mlp": 0.01004207, + "balance_loss_clip": 1.00491595, + "balance_loss_mlp": 1.00308657, + "epoch": 0.5785660604238689, + "flos": 70843172284800.0, + "grad_norm": 0.7746255647415861, + "language_loss": 0.55936509, + "learning_rate": 1.5910489846730476e-06, + "loss": 0.57932198, + "num_input_tokens_seen": 207301735, + "step": 9623, + "time_per_iteration": 3.222620725631714 + }, + { + "auxiliary_loss_clip": 0.01037424, + "auxiliary_loss_mlp": 0.01035311, + "balance_loss_clip": 1.02570999, + "balance_loss_mlp": 1.02340174, + "epoch": 0.578626183676537, + "flos": 31649977244160.0, + "grad_norm": 1.9390937856487214, + "language_loss": 0.71045339, + "learning_rate": 1.5906677594185799e-06, + "loss": 0.73118067, + "num_input_tokens_seen": 207321240, + "step": 9624, + "time_per_iteration": 2.7505857944488525 + }, + { + "auxiliary_loss_clip": 0.01030003, + "auxiliary_loss_mlp": 0.0103584, + "balance_loss_clip": 1.02338862, + "balance_loss_mlp": 1.02430689, + "epoch": 0.5786863069292049, + "flos": 21865181253120.0, + "grad_norm": 1.9119144366556178, + "language_loss": 0.82807893, + "learning_rate": 1.5902865496860322e-06, + "loss": 0.84873736, + "num_input_tokens_seen": 207339540, + "step": 9625, + "time_per_iteration": 2.726334810256958 + }, + { + "auxiliary_loss_clip": 0.0106229, + "auxiliary_loss_mlp": 0.01030995, + "balance_loss_clip": 1.0238204, + "balance_loss_mlp": 1.0193187, + "epoch": 0.5787464301818729, + "flos": 23364954748800.0, + "grad_norm": 1.4266724870786265, + "language_loss": 0.69933927, + "learning_rate": 1.5899053554898591e-06, + "loss": 0.72027212, + "num_input_tokens_seen": 207360470, + "step": 9626, + "time_per_iteration": 2.577538013458252 + }, + { + "auxiliary_loss_clip": 0.0104268, + "auxiliary_loss_mlp": 0.01031338, + "balance_loss_clip": 1.02393866, + "balance_loss_mlp": 1.02050221, + "epoch": 0.5788065534345408, + "flos": 30004011394560.0, + "grad_norm": 1.5793156962186208, + "language_loss": 0.71757138, + "learning_rate": 1.5895241768445166e-06, + "loss": 0.73831153, + "num_input_tokens_seen": 207383080, + "step": 9627, + "time_per_iteration": 2.74210262298584 + }, + { + "auxiliary_loss_clip": 0.01052462, + "auxiliary_loss_mlp": 0.01028109, + "balance_loss_clip": 1.02350807, + "balance_loss_mlp": 1.0176065, + "epoch": 0.5788666766872088, + "flos": 24527849564160.0, + "grad_norm": 1.8638470698424094, + "language_loss": 0.83807313, + "learning_rate": 1.589143013764458e-06, + "loss": 0.85887879, + "num_input_tokens_seen": 207401000, + "step": 9628, + "time_per_iteration": 2.7719662189483643 + }, + { + "auxiliary_loss_clip": 0.01043038, + "auxiliary_loss_mlp": 0.01027713, + "balance_loss_clip": 1.02315164, + "balance_loss_mlp": 1.01654267, + "epoch": 0.5789267999398767, + "flos": 23732823888000.0, + "grad_norm": 1.5377097790696594, + "language_loss": 0.72085631, + "learning_rate": 1.5887618662641376e-06, + "loss": 0.7415638, + "num_input_tokens_seen": 207419230, + "step": 9629, + "time_per_iteration": 2.888087511062622 + }, + { + "auxiliary_loss_clip": 0.01043832, + "auxiliary_loss_mlp": 0.01030229, + "balance_loss_clip": 1.02500391, + "balance_loss_mlp": 1.01891017, + "epoch": 0.5789869231925447, + "flos": 21135045496320.0, + "grad_norm": 2.3831299767318948, + "language_loss": 0.74538553, + "learning_rate": 1.5883807343580087e-06, + "loss": 0.76612616, + "num_input_tokens_seen": 207437615, + "step": 9630, + "time_per_iteration": 2.7953908443450928 + }, + { + "auxiliary_loss_clip": 0.01034061, + "auxiliary_loss_mlp": 0.00747574, + "balance_loss_clip": 1.0261023, + "balance_loss_mlp": 1.00053358, + "epoch": 0.5790470464452127, + "flos": 21209632087680.0, + "grad_norm": 1.5723227853900177, + "language_loss": 0.79227197, + "learning_rate": 1.587999618060523e-06, + "loss": 0.81008828, + "num_input_tokens_seen": 207457270, + "step": 9631, + "time_per_iteration": 4.51269006729126 + }, + { + "auxiliary_loss_clip": 0.0106348, + "auxiliary_loss_mlp": 0.01024954, + "balance_loss_clip": 1.02426744, + "balance_loss_mlp": 1.01404071, + "epoch": 0.5791071696978807, + "flos": 23404384903680.0, + "grad_norm": 1.4456529454285039, + "language_loss": 0.75101316, + "learning_rate": 1.5876185173861333e-06, + "loss": 0.77189749, + "num_input_tokens_seen": 207477890, + "step": 9632, + "time_per_iteration": 2.549290895462036 + }, + { + "auxiliary_loss_clip": 0.01035871, + "auxiliary_loss_mlp": 0.01025394, + "balance_loss_clip": 1.02398932, + "balance_loss_mlp": 1.01392031, + "epoch": 0.5791672929505486, + "flos": 24206521472640.0, + "grad_norm": 1.8958692235558603, + "language_loss": 0.79100895, + "learning_rate": 1.5872374323492915e-06, + "loss": 0.81162161, + "num_input_tokens_seen": 207497670, + "step": 9633, + "time_per_iteration": 2.6169683933258057 + }, + { + "auxiliary_loss_clip": 0.01037388, + "auxiliary_loss_mlp": 0.01033449, + "balance_loss_clip": 1.02599096, + "balance_loss_mlp": 1.020998, + "epoch": 0.5792274162032166, + "flos": 24348871071360.0, + "grad_norm": 2.378600159305112, + "language_loss": 0.77804863, + "learning_rate": 1.5868563629644464e-06, + "loss": 0.79875696, + "num_input_tokens_seen": 207516105, + "step": 9634, + "time_per_iteration": 2.7380802631378174 + }, + { + "auxiliary_loss_clip": 0.0104642, + "auxiliary_loss_mlp": 0.01033411, + "balance_loss_clip": 1.02480841, + "balance_loss_mlp": 1.0220685, + "epoch": 0.5792875394558845, + "flos": 20449403712000.0, + "grad_norm": 2.8474946244840993, + "language_loss": 0.6399551, + "learning_rate": 1.5864753092460502e-06, + "loss": 0.66075337, + "num_input_tokens_seen": 207533685, + "step": 9635, + "time_per_iteration": 2.701312780380249 + }, + { + "auxiliary_loss_clip": 0.01035888, + "auxiliary_loss_mlp": 0.010342, + "balance_loss_clip": 1.02402496, + "balance_loss_mlp": 1.02336359, + "epoch": 0.5793476627085525, + "flos": 24060329118720.0, + "grad_norm": 1.404116477112268, + "language_loss": 0.77360582, + "learning_rate": 1.5860942712085516e-06, + "loss": 0.7943067, + "num_input_tokens_seen": 207552840, + "step": 9636, + "time_per_iteration": 2.6419098377227783 + }, + { + "auxiliary_loss_clip": 0.0103568, + "auxiliary_loss_mlp": 0.01028188, + "balance_loss_clip": 1.02222443, + "balance_loss_mlp": 1.01810908, + "epoch": 0.5794077859612206, + "flos": 22054287381120.0, + "grad_norm": 2.070533979595539, + "language_loss": 0.68417895, + "learning_rate": 1.5857132488663998e-06, + "loss": 0.70481765, + "num_input_tokens_seen": 207572095, + "step": 9637, + "time_per_iteration": 2.6244382858276367 + }, + { + "auxiliary_loss_clip": 0.01028214, + "auxiliary_loss_mlp": 0.01027931, + "balance_loss_clip": 1.02589941, + "balance_loss_mlp": 1.01673114, + "epoch": 0.5794679092138885, + "flos": 11434855991040.0, + "grad_norm": 3.277384826473053, + "language_loss": 0.72239733, + "learning_rate": 1.585332242234043e-06, + "loss": 0.74295878, + "num_input_tokens_seen": 207587495, + "step": 9638, + "time_per_iteration": 4.422152996063232 + }, + { + "auxiliary_loss_clip": 0.01056672, + "auxiliary_loss_mlp": 0.01031043, + "balance_loss_clip": 1.02785683, + "balance_loss_mlp": 1.02001631, + "epoch": 0.5795280324665565, + "flos": 18880215183360.0, + "grad_norm": 1.677743325186833, + "language_loss": 0.72347558, + "learning_rate": 1.5849512513259291e-06, + "loss": 0.7443527, + "num_input_tokens_seen": 207606795, + "step": 9639, + "time_per_iteration": 2.5889432430267334 + }, + { + "auxiliary_loss_clip": 0.01041808, + "auxiliary_loss_mlp": 0.01031857, + "balance_loss_clip": 1.02467012, + "balance_loss_mlp": 1.02081776, + "epoch": 0.5795881557192244, + "flos": 13005947940480.0, + "grad_norm": 2.7064602788014946, + "language_loss": 0.69388545, + "learning_rate": 1.5845702761565054e-06, + "loss": 0.71462214, + "num_input_tokens_seen": 207623620, + "step": 9640, + "time_per_iteration": 2.6455211639404297 + }, + { + "auxiliary_loss_clip": 0.01040968, + "auxiliary_loss_mlp": 0.01035146, + "balance_loss_clip": 1.02547085, + "balance_loss_mlp": 1.02269506, + "epoch": 0.5796482789718924, + "flos": 19932397303680.0, + "grad_norm": 2.4433098510477262, + "language_loss": 0.78038102, + "learning_rate": 1.5841893167402183e-06, + "loss": 0.80114216, + "num_input_tokens_seen": 207639380, + "step": 9641, + "time_per_iteration": 2.633167266845703 + }, + { + "auxiliary_loss_clip": 0.0106618, + "auxiliary_loss_mlp": 0.01029759, + "balance_loss_clip": 1.02714348, + "balance_loss_mlp": 1.01931667, + "epoch": 0.5797084022245603, + "flos": 21650794928640.0, + "grad_norm": 1.783668917004321, + "language_loss": 0.73560536, + "learning_rate": 1.5838083730915143e-06, + "loss": 0.75656474, + "num_input_tokens_seen": 207657915, + "step": 9642, + "time_per_iteration": 2.7137303352355957 + }, + { + "auxiliary_loss_clip": 0.01049993, + "auxiliary_loss_mlp": 0.01030066, + "balance_loss_clip": 1.02868533, + "balance_loss_mlp": 1.01897335, + "epoch": 0.5797685254772283, + "flos": 26031573555840.0, + "grad_norm": 2.9972718344768077, + "language_loss": 0.73369706, + "learning_rate": 1.5834274452248378e-06, + "loss": 0.75449765, + "num_input_tokens_seen": 207678620, + "step": 9643, + "time_per_iteration": 2.789996385574341 + }, + { + "auxiliary_loss_clip": 0.01066947, + "auxiliary_loss_mlp": 0.01029783, + "balance_loss_clip": 1.02613258, + "balance_loss_mlp": 1.01858926, + "epoch": 0.5798286487298963, + "flos": 22705167778560.0, + "grad_norm": 1.8714990737505857, + "language_loss": 0.67126751, + "learning_rate": 1.5830465331546352e-06, + "loss": 0.69223481, + "num_input_tokens_seen": 207696980, + "step": 9644, + "time_per_iteration": 2.6040775775909424 + }, + { + "auxiliary_loss_clip": 0.01059607, + "auxiliary_loss_mlp": 0.01028779, + "balance_loss_clip": 1.02785802, + "balance_loss_mlp": 1.01723981, + "epoch": 0.5798887719825643, + "flos": 23148988225920.0, + "grad_norm": 2.343031431950508, + "language_loss": 0.85713297, + "learning_rate": 1.5826656368953496e-06, + "loss": 0.87801683, + "num_input_tokens_seen": 207714065, + "step": 9645, + "time_per_iteration": 2.7441558837890625 + }, + { + "auxiliary_loss_clip": 0.0106622, + "auxiliary_loss_mlp": 0.01031179, + "balance_loss_clip": 1.02605093, + "balance_loss_mlp": 1.02059889, + "epoch": 0.5799488952352322, + "flos": 24426043441920.0, + "grad_norm": 1.985635968044092, + "language_loss": 0.75521719, + "learning_rate": 1.5822847564614244e-06, + "loss": 0.77619123, + "num_input_tokens_seen": 207734720, + "step": 9646, + "time_per_iteration": 2.6443653106689453 + }, + { + "auxiliary_loss_clip": 0.01047813, + "auxiliary_loss_mlp": 0.01031416, + "balance_loss_clip": 1.02592003, + "balance_loss_mlp": 1.01950145, + "epoch": 0.5800090184879002, + "flos": 38395903829760.0, + "grad_norm": 2.6499689492800402, + "language_loss": 0.58988583, + "learning_rate": 1.5819038918673038e-06, + "loss": 0.6106782, + "num_input_tokens_seen": 207755435, + "step": 9647, + "time_per_iteration": 2.8043506145477295 + }, + { + "auxiliary_loss_clip": 0.01025207, + "auxiliary_loss_mlp": 0.01046527, + "balance_loss_clip": 1.02730596, + "balance_loss_mlp": 1.03286552, + "epoch": 0.5800691417405681, + "flos": 19784840232960.0, + "grad_norm": 1.619498210538193, + "language_loss": 0.84280777, + "learning_rate": 1.5815230431274288e-06, + "loss": 0.86352509, + "num_input_tokens_seen": 207773570, + "step": 9648, + "time_per_iteration": 2.686981201171875 + }, + { + "auxiliary_loss_clip": 0.00998991, + "auxiliary_loss_mlp": 0.01005789, + "balance_loss_clip": 1.00229883, + "balance_loss_mlp": 1.00473964, + "epoch": 0.5801292649932361, + "flos": 70314565783680.0, + "grad_norm": 0.8332745845209723, + "language_loss": 0.62965524, + "learning_rate": 1.581142210256242e-06, + "loss": 0.64970303, + "num_input_tokens_seen": 207830095, + "step": 9649, + "time_per_iteration": 3.2855286598205566 + }, + { + "auxiliary_loss_clip": 0.01022883, + "auxiliary_loss_mlp": 0.01032254, + "balance_loss_clip": 1.02075779, + "balance_loss_mlp": 1.02049387, + "epoch": 0.5801893882459042, + "flos": 18734812928640.0, + "grad_norm": 1.7610511256487495, + "language_loss": 0.81792074, + "learning_rate": 1.5807613932681857e-06, + "loss": 0.83847213, + "num_input_tokens_seen": 207848555, + "step": 9650, + "time_per_iteration": 2.6624391078948975 + }, + { + "auxiliary_loss_clip": 0.01035792, + "auxiliary_loss_mlp": 0.01033335, + "balance_loss_clip": 1.02493501, + "balance_loss_mlp": 1.02159882, + "epoch": 0.5802495114985721, + "flos": 15596507698560.0, + "grad_norm": 2.2188356480145837, + "language_loss": 0.7754637, + "learning_rate": 1.580380592177698e-06, + "loss": 0.79615498, + "num_input_tokens_seen": 207867060, + "step": 9651, + "time_per_iteration": 2.679141044616699 + }, + { + "auxiliary_loss_clip": 0.0104901, + "auxiliary_loss_mlp": 0.01034649, + "balance_loss_clip": 1.0267328, + "balance_loss_mlp": 1.02262676, + "epoch": 0.5803096347512401, + "flos": 18255405081600.0, + "grad_norm": 1.801734255387372, + "language_loss": 0.74256808, + "learning_rate": 1.5799998069992213e-06, + "loss": 0.76340461, + "num_input_tokens_seen": 207884520, + "step": 9652, + "time_per_iteration": 2.6124532222747803 + }, + { + "auxiliary_loss_clip": 0.01038612, + "auxiliary_loss_mlp": 0.01026438, + "balance_loss_clip": 1.0246346, + "balance_loss_mlp": 1.01488674, + "epoch": 0.580369758003908, + "flos": 22893160584960.0, + "grad_norm": 2.0158855150466874, + "language_loss": 0.76529634, + "learning_rate": 1.579619037747193e-06, + "loss": 0.78594685, + "num_input_tokens_seen": 207905370, + "step": 9653, + "time_per_iteration": 2.6909961700439453 + }, + { + "auxiliary_loss_clip": 0.01066191, + "auxiliary_loss_mlp": 0.0102904, + "balance_loss_clip": 1.02592421, + "balance_loss_mlp": 1.01692832, + "epoch": 0.580429881256576, + "flos": 18697681244160.0, + "grad_norm": 2.123752091615996, + "language_loss": 0.74444127, + "learning_rate": 1.5792382844360534e-06, + "loss": 0.76539361, + "num_input_tokens_seen": 207923790, + "step": 9654, + "time_per_iteration": 2.5782690048217773 + }, + { + "auxiliary_loss_clip": 0.01017382, + "auxiliary_loss_mlp": 0.01034342, + "balance_loss_clip": 1.02865219, + "balance_loss_mlp": 1.02376246, + "epoch": 0.5804900045092439, + "flos": 24681978823680.0, + "grad_norm": 1.7627874224258053, + "language_loss": 0.70483512, + "learning_rate": 1.5788575470802408e-06, + "loss": 0.72535235, + "num_input_tokens_seen": 207942335, + "step": 9655, + "time_per_iteration": 2.9346206188201904 + }, + { + "auxiliary_loss_clip": 0.01069211, + "auxiliary_loss_mlp": 0.01030442, + "balance_loss_clip": 1.02581167, + "balance_loss_mlp": 1.01852131, + "epoch": 0.580550127761912, + "flos": 23112790295040.0, + "grad_norm": 7.730604394266173, + "language_loss": 0.69397795, + "learning_rate": 1.5784768256941915e-06, + "loss": 0.71497452, + "num_input_tokens_seen": 207961975, + "step": 9656, + "time_per_iteration": 2.686037302017212 + }, + { + "auxiliary_loss_clip": 0.0105388, + "auxiliary_loss_mlp": 0.01031516, + "balance_loss_clip": 1.02651346, + "balance_loss_mlp": 1.02128816, + "epoch": 0.5806102510145799, + "flos": 18475681236480.0, + "grad_norm": 1.5047356254629982, + "language_loss": 0.71782833, + "learning_rate": 1.5780961202923433e-06, + "loss": 0.73868233, + "num_input_tokens_seen": 207979520, + "step": 9657, + "time_per_iteration": 2.6388161182403564 + }, + { + "auxiliary_loss_clip": 0.01058903, + "auxiliary_loss_mlp": 0.01035447, + "balance_loss_clip": 1.02632582, + "balance_loss_mlp": 1.02332377, + "epoch": 0.5806703742672479, + "flos": 23915645136000.0, + "grad_norm": 1.8618784202945053, + "language_loss": 0.70785385, + "learning_rate": 1.5777154308891328e-06, + "loss": 0.72879732, + "num_input_tokens_seen": 207998375, + "step": 9658, + "time_per_iteration": 2.568805694580078 + }, + { + "auxiliary_loss_clip": 0.01000691, + "auxiliary_loss_mlp": 0.01000811, + "balance_loss_clip": 1.00379491, + "balance_loss_mlp": 0.99973196, + "epoch": 0.5807304975199158, + "flos": 66311999412480.0, + "grad_norm": 0.6465136640623742, + "language_loss": 0.53596652, + "learning_rate": 1.5773347574989953e-06, + "loss": 0.55598152, + "num_input_tokens_seen": 208060605, + "step": 9659, + "time_per_iteration": 3.1674437522888184 + }, + { + "auxiliary_loss_clip": 0.01057276, + "auxiliary_loss_mlp": 0.01039135, + "balance_loss_clip": 1.02623582, + "balance_loss_mlp": 1.02804279, + "epoch": 0.5807906207725838, + "flos": 31722444933120.0, + "grad_norm": 2.002850297837036, + "language_loss": 0.6265955, + "learning_rate": 1.576954100136366e-06, + "loss": 0.64755964, + "num_input_tokens_seen": 208080320, + "step": 9660, + "time_per_iteration": 2.6743710041046143 + }, + { + "auxiliary_loss_clip": 0.01051349, + "auxiliary_loss_mlp": 0.01030742, + "balance_loss_clip": 1.02243412, + "balance_loss_mlp": 1.01899362, + "epoch": 0.5808507440252517, + "flos": 23801161512960.0, + "grad_norm": 1.5030288081430738, + "language_loss": 0.65167487, + "learning_rate": 1.5765734588156797e-06, + "loss": 0.67249572, + "num_input_tokens_seen": 208099305, + "step": 9661, + "time_per_iteration": 2.6473817825317383 + }, + { + "auxiliary_loss_clip": 0.01015236, + "auxiliary_loss_mlp": 0.01026338, + "balance_loss_clip": 1.02198219, + "balance_loss_mlp": 1.01631236, + "epoch": 0.5809108672779197, + "flos": 13698449222400.0, + "grad_norm": 1.553418706652338, + "language_loss": 0.74594581, + "learning_rate": 1.5761928335513704e-06, + "loss": 0.76636147, + "num_input_tokens_seen": 208116960, + "step": 9662, + "time_per_iteration": 2.6433870792388916 + }, + { + "auxiliary_loss_clip": 0.01007806, + "auxiliary_loss_mlp": 0.01001418, + "balance_loss_clip": 1.00133693, + "balance_loss_mlp": 1.00037503, + "epoch": 0.5809709905305876, + "flos": 69134866381440.0, + "grad_norm": 0.8867906314440009, + "language_loss": 0.58356929, + "learning_rate": 1.5758122243578709e-06, + "loss": 0.60366154, + "num_input_tokens_seen": 208182190, + "step": 9663, + "time_per_iteration": 3.269293785095215 + }, + { + "auxiliary_loss_clip": 0.01048091, + "auxiliary_loss_mlp": 0.01028919, + "balance_loss_clip": 1.02739501, + "balance_loss_mlp": 1.01813626, + "epoch": 0.5810311137832557, + "flos": 19827538525440.0, + "grad_norm": 2.180929859480389, + "language_loss": 0.81753528, + "learning_rate": 1.5754316312496152e-06, + "loss": 0.83830547, + "num_input_tokens_seen": 208197015, + "step": 9664, + "time_per_iteration": 2.7548623085021973 + }, + { + "auxiliary_loss_clip": 0.0103214, + "auxiliary_loss_mlp": 0.00747619, + "balance_loss_clip": 1.02150512, + "balance_loss_mlp": 1.00040698, + "epoch": 0.5810912370359237, + "flos": 29238503719680.0, + "grad_norm": 1.7967668232355818, + "language_loss": 0.81748211, + "learning_rate": 1.5750510542410337e-06, + "loss": 0.8352797, + "num_input_tokens_seen": 208215795, + "step": 9665, + "time_per_iteration": 2.7021124362945557 + }, + { + "auxiliary_loss_clip": 0.01051486, + "auxiliary_loss_mlp": 0.01031404, + "balance_loss_clip": 1.02852702, + "balance_loss_mlp": 1.01852357, + "epoch": 0.5811513602885916, + "flos": 22785572373120.0, + "grad_norm": 1.888794799915935, + "language_loss": 0.8119117, + "learning_rate": 1.5746704933465599e-06, + "loss": 0.83274066, + "num_input_tokens_seen": 208234655, + "step": 9666, + "time_per_iteration": 2.7002604007720947 + }, + { + "auxiliary_loss_clip": 0.01054706, + "auxiliary_loss_mlp": 0.01033202, + "balance_loss_clip": 1.02582991, + "balance_loss_mlp": 1.02305746, + "epoch": 0.5812114835412596, + "flos": 18734346051840.0, + "grad_norm": 1.7622477176399607, + "language_loss": 0.79818422, + "learning_rate": 1.5742899485806227e-06, + "loss": 0.81906331, + "num_input_tokens_seen": 208251300, + "step": 9667, + "time_per_iteration": 2.5909552574157715 + }, + { + "auxiliary_loss_clip": 0.01060204, + "auxiliary_loss_mlp": 0.01034411, + "balance_loss_clip": 1.02597141, + "balance_loss_mlp": 1.02172744, + "epoch": 0.5812716067939275, + "flos": 26431295080320.0, + "grad_norm": 1.6276915497856004, + "language_loss": 0.78638756, + "learning_rate": 1.573909419957653e-06, + "loss": 0.80733365, + "num_input_tokens_seen": 208272685, + "step": 9668, + "time_per_iteration": 5.807290554046631 + }, + { + "auxiliary_loss_clip": 0.0104648, + "auxiliary_loss_mlp": 0.01036826, + "balance_loss_clip": 1.02565336, + "balance_loss_mlp": 1.02637148, + "epoch": 0.5813317300465956, + "flos": 43397865285120.0, + "grad_norm": 1.8036649700767116, + "language_loss": 0.64774454, + "learning_rate": 1.5735289074920819e-06, + "loss": 0.66857755, + "num_input_tokens_seen": 208294315, + "step": 9669, + "time_per_iteration": 2.899789333343506 + }, + { + "auxiliary_loss_clip": 0.01020441, + "auxiliary_loss_mlp": 0.01039874, + "balance_loss_clip": 1.02383268, + "balance_loss_mlp": 1.02860856, + "epoch": 0.5813918532992635, + "flos": 24785472885120.0, + "grad_norm": 1.4756293383085821, + "language_loss": 0.73057288, + "learning_rate": 1.5731484111983363e-06, + "loss": 0.751176, + "num_input_tokens_seen": 208315610, + "step": 9670, + "time_per_iteration": 2.779670238494873 + }, + { + "auxiliary_loss_clip": 0.01039344, + "auxiliary_loss_mlp": 0.01037324, + "balance_loss_clip": 1.02700925, + "balance_loss_mlp": 1.02650607, + "epoch": 0.5814519765519315, + "flos": 22857357703680.0, + "grad_norm": 1.9495127540624084, + "language_loss": 0.78670967, + "learning_rate": 1.5727679310908464e-06, + "loss": 0.80747634, + "num_input_tokens_seen": 208334725, + "step": 9671, + "time_per_iteration": 2.687406063079834 + }, + { + "auxiliary_loss_clip": 0.01025245, + "auxiliary_loss_mlp": 0.01036071, + "balance_loss_clip": 1.02624011, + "balance_loss_mlp": 1.02354205, + "epoch": 0.5815120998045994, + "flos": 24060831909120.0, + "grad_norm": 2.1366452688914115, + "language_loss": 0.60568482, + "learning_rate": 1.5723874671840399e-06, + "loss": 0.62629795, + "num_input_tokens_seen": 208353825, + "step": 9672, + "time_per_iteration": 2.7189109325408936 + }, + { + "auxiliary_loss_clip": 0.01009247, + "auxiliary_loss_mlp": 0.01035453, + "balance_loss_clip": 1.02303147, + "balance_loss_mlp": 1.02371657, + "epoch": 0.5815722230572674, + "flos": 24279491952000.0, + "grad_norm": 1.7148153917748317, + "language_loss": 0.81585622, + "learning_rate": 1.572007019492342e-06, + "loss": 0.83630323, + "num_input_tokens_seen": 208374160, + "step": 9673, + "time_per_iteration": 2.680488348007202 + }, + { + "auxiliary_loss_clip": 0.01042777, + "auxiliary_loss_mlp": 0.01034879, + "balance_loss_clip": 1.02948618, + "balance_loss_mlp": 1.02307701, + "epoch": 0.5816323463099353, + "flos": 22200371994240.0, + "grad_norm": 1.8484561440309006, + "language_loss": 0.87888229, + "learning_rate": 1.5716265880301817e-06, + "loss": 0.8996588, + "num_input_tokens_seen": 208392105, + "step": 9674, + "time_per_iteration": 2.661465883255005 + }, + { + "auxiliary_loss_clip": 0.01064838, + "auxiliary_loss_mlp": 0.00747313, + "balance_loss_clip": 1.02487469, + "balance_loss_mlp": 1.00030124, + "epoch": 0.5816924695626033, + "flos": 24134448833280.0, + "grad_norm": 1.4564167986604557, + "language_loss": 0.79236138, + "learning_rate": 1.571246172811984e-06, + "loss": 0.81048286, + "num_input_tokens_seen": 208411755, + "step": 9675, + "time_per_iteration": 2.659148693084717 + }, + { + "auxiliary_loss_clip": 0.01051052, + "auxiliary_loss_mlp": 0.01031389, + "balance_loss_clip": 1.02510047, + "balance_loss_mlp": 1.0198493, + "epoch": 0.5817525928152713, + "flos": 21324223451520.0, + "grad_norm": 1.8206237433784342, + "language_loss": 0.70503587, + "learning_rate": 1.5708657738521748e-06, + "loss": 0.72586024, + "num_input_tokens_seen": 208429995, + "step": 9676, + "time_per_iteration": 2.606128692626953 + }, + { + "auxiliary_loss_clip": 0.0101046, + "auxiliary_loss_mlp": 0.01029557, + "balance_loss_clip": 1.02760768, + "balance_loss_mlp": 1.01829159, + "epoch": 0.5818127160679393, + "flos": 26934510666240.0, + "grad_norm": 2.335915706631195, + "language_loss": 0.63475263, + "learning_rate": 1.5704853911651779e-06, + "loss": 0.6551528, + "num_input_tokens_seen": 208443655, + "step": 9677, + "time_per_iteration": 2.6984753608703613 + }, + { + "auxiliary_loss_clip": 0.00993769, + "auxiliary_loss_mlp": 0.01003111, + "balance_loss_clip": 1.00628686, + "balance_loss_mlp": 1.0017581, + "epoch": 0.5818728393206073, + "flos": 63918626342400.0, + "grad_norm": 0.8366946484818438, + "language_loss": 0.54216492, + "learning_rate": 1.5701050247654182e-06, + "loss": 0.56213379, + "num_input_tokens_seen": 208498405, + "step": 9678, + "time_per_iteration": 3.2400062084198 + }, + { + "auxiliary_loss_clip": 0.00987943, + "auxiliary_loss_mlp": 0.01009388, + "balance_loss_clip": 1.00139129, + "balance_loss_mlp": 1.00822532, + "epoch": 0.5819329625732752, + "flos": 64954108638720.0, + "grad_norm": 0.7537795822921589, + "language_loss": 0.56181037, + "learning_rate": 1.569724674667319e-06, + "loss": 0.58178365, + "num_input_tokens_seen": 208559075, + "step": 9679, + "time_per_iteration": 4.862929582595825 + }, + { + "auxiliary_loss_clip": 0.01064353, + "auxiliary_loss_mlp": 0.01027956, + "balance_loss_clip": 1.02499545, + "balance_loss_mlp": 1.01780546, + "epoch": 0.5819930858259432, + "flos": 21215270522880.0, + "grad_norm": 2.071062998344465, + "language_loss": 0.65553153, + "learning_rate": 1.5693443408853032e-06, + "loss": 0.67645466, + "num_input_tokens_seen": 208577770, + "step": 9680, + "time_per_iteration": 2.5470871925354004 + }, + { + "auxiliary_loss_clip": 0.01043578, + "auxiliary_loss_mlp": 0.01025974, + "balance_loss_clip": 1.02407742, + "balance_loss_mlp": 1.0159725, + "epoch": 0.5820532090786111, + "flos": 19458520151040.0, + "grad_norm": 1.956392975680473, + "language_loss": 0.83161771, + "learning_rate": 1.5689640234337933e-06, + "loss": 0.85231322, + "num_input_tokens_seen": 208595110, + "step": 9681, + "time_per_iteration": 2.6532187461853027 + }, + { + "auxiliary_loss_clip": 0.01066474, + "auxiliary_loss_mlp": 0.01029505, + "balance_loss_clip": 1.0265007, + "balance_loss_mlp": 1.01879442, + "epoch": 0.5821133323312792, + "flos": 17712615686400.0, + "grad_norm": 2.1085766882618, + "language_loss": 0.7557314, + "learning_rate": 1.5685837223272109e-06, + "loss": 0.77669114, + "num_input_tokens_seen": 208612080, + "step": 9682, + "time_per_iteration": 2.579448699951172 + }, + { + "auxiliary_loss_clip": 0.00995806, + "auxiliary_loss_mlp": 0.01030719, + "balance_loss_clip": 1.02045667, + "balance_loss_mlp": 1.01856565, + "epoch": 0.5821734555839471, + "flos": 24571804832640.0, + "grad_norm": 2.015619619585567, + "language_loss": 0.74742436, + "learning_rate": 1.568203437579977e-06, + "loss": 0.76768959, + "num_input_tokens_seen": 208630235, + "step": 9683, + "time_per_iteration": 2.833737850189209 + }, + { + "auxiliary_loss_clip": 0.01052031, + "auxiliary_loss_mlp": 0.01032315, + "balance_loss_clip": 1.02903795, + "balance_loss_mlp": 1.02042985, + "epoch": 0.5822335788366151, + "flos": 22382259488640.0, + "grad_norm": 1.7864444486477356, + "language_loss": 0.73940611, + "learning_rate": 1.5678231692065116e-06, + "loss": 0.76024961, + "num_input_tokens_seen": 208647925, + "step": 9684, + "time_per_iteration": 2.7391107082366943 + }, + { + "auxiliary_loss_clip": 0.01039458, + "auxiliary_loss_mlp": 0.01031218, + "balance_loss_clip": 1.02515149, + "balance_loss_mlp": 1.02031624, + "epoch": 0.582293702089283, + "flos": 26722494639360.0, + "grad_norm": 2.1040272877897297, + "language_loss": 0.78053284, + "learning_rate": 1.5674429172212348e-06, + "loss": 0.80123949, + "num_input_tokens_seen": 208666180, + "step": 9685, + "time_per_iteration": 2.6937413215637207 + }, + { + "auxiliary_loss_clip": 0.01065809, + "auxiliary_loss_mlp": 0.01031565, + "balance_loss_clip": 1.02647531, + "balance_loss_mlp": 1.02062154, + "epoch": 0.582353825341951, + "flos": 17348661129600.0, + "grad_norm": 1.809882624670172, + "language_loss": 0.75391877, + "learning_rate": 1.5670626816385667e-06, + "loss": 0.77489251, + "num_input_tokens_seen": 208684240, + "step": 9686, + "time_per_iteration": 4.280918121337891 + }, + { + "auxiliary_loss_clip": 0.01001048, + "auxiliary_loss_mlp": 0.01003573, + "balance_loss_clip": 1.00440025, + "balance_loss_mlp": 1.0025773, + "epoch": 0.5824139485946189, + "flos": 55473261534720.0, + "grad_norm": 0.8172281315122794, + "language_loss": 0.57396972, + "learning_rate": 1.5666824624729244e-06, + "loss": 0.59401596, + "num_input_tokens_seen": 208736090, + "step": 9687, + "time_per_iteration": 3.066739797592163 + }, + { + "auxiliary_loss_clip": 0.01030912, + "auxiliary_loss_mlp": 0.01032509, + "balance_loss_clip": 1.02464199, + "balance_loss_mlp": 1.01952744, + "epoch": 0.582474071847287, + "flos": 20303031790080.0, + "grad_norm": 1.7759269317467785, + "language_loss": 0.69936764, + "learning_rate": 1.566302259738727e-06, + "loss": 0.72000182, + "num_input_tokens_seen": 208754600, + "step": 9688, + "time_per_iteration": 2.830897569656372 + }, + { + "auxiliary_loss_clip": 0.01058492, + "auxiliary_loss_mlp": 0.01029981, + "balance_loss_clip": 1.02729344, + "balance_loss_mlp": 1.0194366, + "epoch": 0.5825341950999549, + "flos": 23878010661120.0, + "grad_norm": 2.7816320812061868, + "language_loss": 0.65009582, + "learning_rate": 1.5659220734503918e-06, + "loss": 0.67098057, + "num_input_tokens_seen": 208773140, + "step": 9689, + "time_per_iteration": 2.695186138153076 + }, + { + "auxiliary_loss_clip": 0.01045282, + "auxiliary_loss_mlp": 0.00747341, + "balance_loss_clip": 1.02662039, + "balance_loss_mlp": 1.00036836, + "epoch": 0.5825943183526229, + "flos": 23113041690240.0, + "grad_norm": 2.306574944743195, + "language_loss": 0.73328495, + "learning_rate": 1.5655419036223341e-06, + "loss": 0.75121117, + "num_input_tokens_seen": 208793410, + "step": 9690, + "time_per_iteration": 2.729743242263794 + }, + { + "auxiliary_loss_clip": 0.01042019, + "auxiliary_loss_mlp": 0.01030709, + "balance_loss_clip": 1.02397442, + "balance_loss_mlp": 1.01846623, + "epoch": 0.5826544416052909, + "flos": 22857429530880.0, + "grad_norm": 1.745723701634811, + "language_loss": 0.76190722, + "learning_rate": 1.5651617502689717e-06, + "loss": 0.7826345, + "num_input_tokens_seen": 208811920, + "step": 9691, + "time_per_iteration": 2.6471638679504395 + }, + { + "auxiliary_loss_clip": 0.01055313, + "auxiliary_loss_mlp": 0.01025649, + "balance_loss_clip": 1.025141, + "balance_loss_mlp": 1.01484275, + "epoch": 0.5827145648579588, + "flos": 31501845555840.0, + "grad_norm": 1.8588963462272337, + "language_loss": 0.80374193, + "learning_rate": 1.5647816134047184e-06, + "loss": 0.82455158, + "num_input_tokens_seen": 208834720, + "step": 9692, + "time_per_iteration": 2.7066996097564697 + }, + { + "auxiliary_loss_clip": 0.00999676, + "auxiliary_loss_mlp": 0.01001423, + "balance_loss_clip": 1.00312281, + "balance_loss_mlp": 1.00036764, + "epoch": 0.5827746881106268, + "flos": 69811817074560.0, + "grad_norm": 0.7585457545576133, + "language_loss": 0.56985879, + "learning_rate": 1.5644014930439907e-06, + "loss": 0.5898698, + "num_input_tokens_seen": 208898415, + "step": 9693, + "time_per_iteration": 3.2080893516540527 + }, + { + "auxiliary_loss_clip": 0.01054692, + "auxiliary_loss_mlp": 0.0074746, + "balance_loss_clip": 1.02577829, + "balance_loss_mlp": 1.00040603, + "epoch": 0.5828348113632947, + "flos": 23112395245440.0, + "grad_norm": 1.802738568876962, + "language_loss": 0.79131299, + "learning_rate": 1.5640213892012025e-06, + "loss": 0.80933452, + "num_input_tokens_seen": 208919045, + "step": 9694, + "time_per_iteration": 2.7060329914093018 + }, + { + "auxiliary_loss_clip": 0.01036962, + "auxiliary_loss_mlp": 0.01035117, + "balance_loss_clip": 1.0238688, + "balance_loss_mlp": 1.02438176, + "epoch": 0.5828949346159628, + "flos": 21873082245120.0, + "grad_norm": 1.3469816998876172, + "language_loss": 0.7613976, + "learning_rate": 1.5636413018907656e-06, + "loss": 0.78211838, + "num_input_tokens_seen": 208939375, + "step": 9695, + "time_per_iteration": 2.6994142532348633 + }, + { + "auxiliary_loss_clip": 0.00998515, + "auxiliary_loss_mlp": 0.01002991, + "balance_loss_clip": 1.00186849, + "balance_loss_mlp": 1.00182271, + "epoch": 0.5829550578686307, + "flos": 65962553950080.0, + "grad_norm": 0.7733185976984334, + "language_loss": 0.55015081, + "learning_rate": 1.563261231127095e-06, + "loss": 0.57016593, + "num_input_tokens_seen": 209004760, + "step": 9696, + "time_per_iteration": 3.3712596893310547 + }, + { + "auxiliary_loss_clip": 0.01040021, + "auxiliary_loss_mlp": 0.01026516, + "balance_loss_clip": 1.02937984, + "balance_loss_mlp": 1.01593041, + "epoch": 0.5830151811212987, + "flos": 16289799079680.0, + "grad_norm": 1.9437338163364382, + "language_loss": 0.76288891, + "learning_rate": 1.5628811769246021e-06, + "loss": 0.78355432, + "num_input_tokens_seen": 209022930, + "step": 9697, + "time_per_iteration": 2.6829116344451904 + }, + { + "auxiliary_loss_clip": 0.01066079, + "auxiliary_loss_mlp": 0.01030467, + "balance_loss_clip": 1.02507377, + "balance_loss_mlp": 1.01924372, + "epoch": 0.5830753043739666, + "flos": 24168851084160.0, + "grad_norm": 1.9672659808288884, + "language_loss": 0.77846366, + "learning_rate": 1.5625011392976991e-06, + "loss": 0.79942918, + "num_input_tokens_seen": 209043740, + "step": 9698, + "time_per_iteration": 2.644050359725952 + }, + { + "auxiliary_loss_clip": 0.01022242, + "auxiliary_loss_mlp": 0.01035386, + "balance_loss_clip": 1.02449131, + "balance_loss_mlp": 1.02387667, + "epoch": 0.5831354276266346, + "flos": 27059050097280.0, + "grad_norm": 2.2117928241798483, + "language_loss": 0.83529085, + "learning_rate": 1.5621211182607966e-06, + "loss": 0.85586715, + "num_input_tokens_seen": 209068885, + "step": 9699, + "time_per_iteration": 2.913835048675537 + }, + { + "auxiliary_loss_clip": 0.0103803, + "auxiliary_loss_mlp": 0.01030432, + "balance_loss_clip": 1.02308917, + "balance_loss_mlp": 1.01868939, + "epoch": 0.5831955508793025, + "flos": 23623475909760.0, + "grad_norm": 2.1109020321300975, + "language_loss": 0.66358113, + "learning_rate": 1.561741113828305e-06, + "loss": 0.68426573, + "num_input_tokens_seen": 209087340, + "step": 9700, + "time_per_iteration": 2.8177683353424072 + }, + { + "auxiliary_loss_clip": 0.0105313, + "auxiliary_loss_mlp": 0.01029153, + "balance_loss_clip": 1.02320743, + "balance_loss_mlp": 1.01812041, + "epoch": 0.5832556741319705, + "flos": 24973250209920.0, + "grad_norm": 1.6364061499642337, + "language_loss": 0.71554357, + "learning_rate": 1.5613611260146344e-06, + "loss": 0.73636639, + "num_input_tokens_seen": 209108840, + "step": 9701, + "time_per_iteration": 2.6875579357147217 + }, + { + "auxiliary_loss_clip": 0.01044837, + "auxiliary_loss_mlp": 0.01032843, + "balance_loss_clip": 1.02523541, + "balance_loss_mlp": 1.02181029, + "epoch": 0.5833157973846385, + "flos": 23221563655680.0, + "grad_norm": 1.7170019840607047, + "language_loss": 0.85522848, + "learning_rate": 1.5609811548341936e-06, + "loss": 0.87600523, + "num_input_tokens_seen": 209127985, + "step": 9702, + "time_per_iteration": 2.762113332748413 + }, + { + "auxiliary_loss_clip": 0.01051342, + "auxiliary_loss_mlp": 0.01028331, + "balance_loss_clip": 1.02419043, + "balance_loss_mlp": 1.01829982, + "epoch": 0.5833759206373065, + "flos": 21977941023360.0, + "grad_norm": 1.3990985892570793, + "language_loss": 0.77812243, + "learning_rate": 1.560601200301392e-06, + "loss": 0.7989192, + "num_input_tokens_seen": 209146885, + "step": 9703, + "time_per_iteration": 2.6281161308288574 + }, + { + "auxiliary_loss_clip": 0.01068732, + "auxiliary_loss_mlp": 0.01027873, + "balance_loss_clip": 1.02799416, + "balance_loss_mlp": 1.01696527, + "epoch": 0.5834360438899745, + "flos": 21762405463680.0, + "grad_norm": 1.7869225943289435, + "language_loss": 0.71026599, + "learning_rate": 1.5602212624306366e-06, + "loss": 0.73123205, + "num_input_tokens_seen": 209166130, + "step": 9704, + "time_per_iteration": 2.5318291187286377 + }, + { + "auxiliary_loss_clip": 0.0104493, + "auxiliary_loss_mlp": 0.01029928, + "balance_loss_clip": 1.02604008, + "balance_loss_mlp": 1.01993823, + "epoch": 0.5834961671426424, + "flos": 15992566035840.0, + "grad_norm": 1.8368502942951908, + "language_loss": 0.81481397, + "learning_rate": 1.559841341236335e-06, + "loss": 0.83556259, + "num_input_tokens_seen": 209183350, + "step": 9705, + "time_per_iteration": 2.6656548976898193 + }, + { + "auxiliary_loss_clip": 0.01019999, + "auxiliary_loss_mlp": 0.01028211, + "balance_loss_clip": 1.02250636, + "balance_loss_mlp": 1.01734495, + "epoch": 0.5835562903953104, + "flos": 22818322598400.0, + "grad_norm": 1.5272420400263258, + "language_loss": 0.80638027, + "learning_rate": 1.5594614367328937e-06, + "loss": 0.82686245, + "num_input_tokens_seen": 209203945, + "step": 9706, + "time_per_iteration": 2.7015554904937744 + }, + { + "auxiliary_loss_clip": 0.01049331, + "auxiliary_loss_mlp": 0.01029891, + "balance_loss_clip": 1.02283764, + "balance_loss_mlp": 1.018417, + "epoch": 0.5836164136479783, + "flos": 48468056624640.0, + "grad_norm": 2.3122685341749256, + "language_loss": 0.74898392, + "learning_rate": 1.5590815489347187e-06, + "loss": 0.76977611, + "num_input_tokens_seen": 209227080, + "step": 9707, + "time_per_iteration": 2.7844321727752686 + }, + { + "auxiliary_loss_clip": 0.0103225, + "auxiliary_loss_mlp": 0.01025751, + "balance_loss_clip": 1.02393675, + "balance_loss_mlp": 1.01543975, + "epoch": 0.5836765369006464, + "flos": 26905998245760.0, + "grad_norm": 1.8668118726490717, + "language_loss": 0.81854808, + "learning_rate": 1.5587016778562163e-06, + "loss": 0.83912814, + "num_input_tokens_seen": 209248170, + "step": 9708, + "time_per_iteration": 2.839921474456787 + }, + { + "auxiliary_loss_clip": 0.01057809, + "auxiliary_loss_mlp": 0.01029557, + "balance_loss_clip": 1.02926564, + "balance_loss_mlp": 1.01851869, + "epoch": 0.5837366601533143, + "flos": 20084048524800.0, + "grad_norm": 1.6337820618861334, + "language_loss": 0.78509575, + "learning_rate": 1.5583218235117896e-06, + "loss": 0.80596942, + "num_input_tokens_seen": 209267730, + "step": 9709, + "time_per_iteration": 2.6028692722320557 + }, + { + "auxiliary_loss_clip": 0.0098864, + "auxiliary_loss_mlp": 0.01002841, + "balance_loss_clip": 1.00178361, + "balance_loss_mlp": 1.00168467, + "epoch": 0.5837967834059823, + "flos": 65363885971200.0, + "grad_norm": 0.7631408619705846, + "language_loss": 0.56568241, + "learning_rate": 1.557941985915844e-06, + "loss": 0.58559716, + "num_input_tokens_seen": 209332510, + "step": 9710, + "time_per_iteration": 3.184089422225952 + }, + { + "auxiliary_loss_clip": 0.01033388, + "auxiliary_loss_mlp": 0.01029978, + "balance_loss_clip": 1.02527523, + "balance_loss_mlp": 1.02053642, + "epoch": 0.5838569066586502, + "flos": 25338641310720.0, + "grad_norm": 1.425432921496097, + "language_loss": 0.65583366, + "learning_rate": 1.5575621650827833e-06, + "loss": 0.67646736, + "num_input_tokens_seen": 209353355, + "step": 9711, + "time_per_iteration": 2.6828722953796387 + }, + { + "auxiliary_loss_clip": 0.01068903, + "auxiliary_loss_mlp": 0.01032107, + "balance_loss_clip": 1.02592492, + "balance_loss_mlp": 1.02002525, + "epoch": 0.5839170299113182, + "flos": 22229243550720.0, + "grad_norm": 1.6579764241669552, + "language_loss": 0.7873866, + "learning_rate": 1.5571823610270085e-06, + "loss": 0.8083967, + "num_input_tokens_seen": 209370960, + "step": 9712, + "time_per_iteration": 2.59623646736145 + }, + { + "auxiliary_loss_clip": 0.01022517, + "auxiliary_loss_mlp": 0.00747485, + "balance_loss_clip": 1.02124679, + "balance_loss_mlp": 1.00042045, + "epoch": 0.5839771531639861, + "flos": 22200012858240.0, + "grad_norm": 1.9953622700901577, + "language_loss": 0.732768, + "learning_rate": 1.5568025737629234e-06, + "loss": 0.75046802, + "num_input_tokens_seen": 209390955, + "step": 9713, + "time_per_iteration": 2.7201921939849854 + }, + { + "auxiliary_loss_clip": 0.01046756, + "auxiliary_loss_mlp": 0.01027046, + "balance_loss_clip": 1.02421975, + "balance_loss_mlp": 1.01521456, + "epoch": 0.5840372764166541, + "flos": 22419355259520.0, + "grad_norm": 2.0005649701939787, + "language_loss": 0.69451857, + "learning_rate": 1.5564228033049292e-06, + "loss": 0.71525657, + "num_input_tokens_seen": 209410260, + "step": 9714, + "time_per_iteration": 2.6318633556365967 + }, + { + "auxiliary_loss_clip": 0.0106574, + "auxiliary_loss_mlp": 0.01026665, + "balance_loss_clip": 1.0247103, + "balance_loss_mlp": 1.01565588, + "epoch": 0.5840973996693221, + "flos": 19828256797440.0, + "grad_norm": 1.7729151399102108, + "language_loss": 0.80456102, + "learning_rate": 1.5560430496674268e-06, + "loss": 0.82548505, + "num_input_tokens_seen": 209429920, + "step": 9715, + "time_per_iteration": 4.1559062004089355 + }, + { + "auxiliary_loss_clip": 0.01041099, + "auxiliary_loss_mlp": 0.0103048, + "balance_loss_clip": 1.02307081, + "balance_loss_mlp": 1.01928067, + "epoch": 0.5841575229219901, + "flos": 21142982401920.0, + "grad_norm": 2.1868167026336494, + "language_loss": 0.73210829, + "learning_rate": 1.5556633128648167e-06, + "loss": 0.75282413, + "num_input_tokens_seen": 209449470, + "step": 9716, + "time_per_iteration": 4.17188835144043 + }, + { + "auxiliary_loss_clip": 0.01036413, + "auxiliary_loss_mlp": 0.01026318, + "balance_loss_clip": 1.02262282, + "balance_loss_mlp": 1.01594663, + "epoch": 0.5842176461746581, + "flos": 24640322025600.0, + "grad_norm": 1.6726437537410481, + "language_loss": 0.74909824, + "learning_rate": 1.5552835929114976e-06, + "loss": 0.76972556, + "num_input_tokens_seen": 209467695, + "step": 9717, + "time_per_iteration": 2.7018356323242188 + }, + { + "auxiliary_loss_clip": 0.01056054, + "auxiliary_loss_mlp": 0.01032417, + "balance_loss_clip": 1.02639818, + "balance_loss_mlp": 1.02125275, + "epoch": 0.584277769427326, + "flos": 19131158574720.0, + "grad_norm": 3.142305217669083, + "language_loss": 0.80285442, + "learning_rate": 1.5549038898218697e-06, + "loss": 0.82373911, + "num_input_tokens_seen": 209484250, + "step": 9718, + "time_per_iteration": 2.630082368850708 + }, + { + "auxiliary_loss_clip": 0.01041876, + "auxiliary_loss_mlp": 0.01025667, + "balance_loss_clip": 1.0243063, + "balance_loss_mlp": 1.0140028, + "epoch": 0.584337892679994, + "flos": 22675111073280.0, + "grad_norm": 1.868818316345396, + "language_loss": 0.67820108, + "learning_rate": 1.5545242036103306e-06, + "loss": 0.6988765, + "num_input_tokens_seen": 209502830, + "step": 9719, + "time_per_iteration": 2.7323038578033447 + }, + { + "auxiliary_loss_clip": 0.0106695, + "auxiliary_loss_mlp": 0.01029972, + "balance_loss_clip": 1.02596676, + "balance_loss_mlp": 1.01916015, + "epoch": 0.5843980159326619, + "flos": 31284083352960.0, + "grad_norm": 3.7948465257885635, + "language_loss": 0.75235701, + "learning_rate": 1.5541445342912786e-06, + "loss": 0.77332628, + "num_input_tokens_seen": 209525995, + "step": 9720, + "time_per_iteration": 2.674934148788452 + }, + { + "auxiliary_loss_clip": 0.01028309, + "auxiliary_loss_mlp": 0.01030516, + "balance_loss_clip": 1.02488732, + "balance_loss_mlp": 1.02003753, + "epoch": 0.58445813918533, + "flos": 22748117466240.0, + "grad_norm": 1.6926417784896783, + "language_loss": 0.83093953, + "learning_rate": 1.5537648818791105e-06, + "loss": 0.85152769, + "num_input_tokens_seen": 209545895, + "step": 9721, + "time_per_iteration": 2.6507625579833984 + }, + { + "auxiliary_loss_clip": 0.01008, + "auxiliary_loss_mlp": 0.01004575, + "balance_loss_clip": 1.00176954, + "balance_loss_mlp": 1.00354373, + "epoch": 0.5845182624379979, + "flos": 60686556658560.0, + "grad_norm": 0.9305235120214276, + "language_loss": 0.71355069, + "learning_rate": 1.5533852463882226e-06, + "loss": 0.73367643, + "num_input_tokens_seen": 209602315, + "step": 9722, + "time_per_iteration": 3.154590129852295 + }, + { + "auxiliary_loss_clip": 0.01044578, + "auxiliary_loss_mlp": 0.01032768, + "balance_loss_clip": 1.02241826, + "balance_loss_mlp": 1.02198505, + "epoch": 0.5845783856906659, + "flos": 16362446336640.0, + "grad_norm": 2.0022167684792254, + "language_loss": 0.89306104, + "learning_rate": 1.5530056278330113e-06, + "loss": 0.91383445, + "num_input_tokens_seen": 209617615, + "step": 9723, + "time_per_iteration": 2.628710985183716 + }, + { + "auxiliary_loss_clip": 0.01044841, + "auxiliary_loss_mlp": 0.01027461, + "balance_loss_clip": 1.02638471, + "balance_loss_mlp": 1.01705444, + "epoch": 0.5846385089433338, + "flos": 20083402080000.0, + "grad_norm": 1.4030147310597896, + "language_loss": 0.6847204, + "learning_rate": 1.5526260262278709e-06, + "loss": 0.70544338, + "num_input_tokens_seen": 209637005, + "step": 9724, + "time_per_iteration": 2.6978540420532227 + }, + { + "auxiliary_loss_clip": 0.01058141, + "auxiliary_loss_mlp": 0.01030714, + "balance_loss_clip": 1.0269922, + "balance_loss_mlp": 1.01909089, + "epoch": 0.5846986321960018, + "flos": 17311062568320.0, + "grad_norm": 1.6596552911509344, + "language_loss": 0.86070454, + "learning_rate": 1.552246441587197e-06, + "loss": 0.88159311, + "num_input_tokens_seen": 209653170, + "step": 9725, + "time_per_iteration": 4.193404197692871 + }, + { + "auxiliary_loss_clip": 0.01040859, + "auxiliary_loss_mlp": 0.01035194, + "balance_loss_clip": 1.02600121, + "balance_loss_mlp": 1.02400613, + "epoch": 0.5847587554486697, + "flos": 17197907748480.0, + "grad_norm": 1.5655015787490725, + "language_loss": 0.83010733, + "learning_rate": 1.5518668739253821e-06, + "loss": 0.85086787, + "num_input_tokens_seen": 209671275, + "step": 9726, + "time_per_iteration": 2.6029629707336426 + }, + { + "auxiliary_loss_clip": 0.01009786, + "auxiliary_loss_mlp": 0.0074758, + "balance_loss_clip": 1.0270071, + "balance_loss_mlp": 1.00051689, + "epoch": 0.5848188787013378, + "flos": 24529106540160.0, + "grad_norm": 1.8744943116269506, + "language_loss": 0.66603863, + "learning_rate": 1.5514873232568206e-06, + "loss": 0.68361229, + "num_input_tokens_seen": 209690380, + "step": 9727, + "time_per_iteration": 2.793272018432617 + }, + { + "auxiliary_loss_clip": 0.01023493, + "auxiliary_loss_mlp": 0.01041205, + "balance_loss_clip": 1.02150214, + "balance_loss_mlp": 1.02816331, + "epoch": 0.5848790019540057, + "flos": 20628382204800.0, + "grad_norm": 1.5893462644612049, + "language_loss": 0.81805164, + "learning_rate": 1.5511077895959055e-06, + "loss": 0.83869863, + "num_input_tokens_seen": 209708845, + "step": 9728, + "time_per_iteration": 2.6803040504455566 + }, + { + "auxiliary_loss_clip": 0.01050826, + "auxiliary_loss_mlp": 0.01033492, + "balance_loss_clip": 1.0243082, + "balance_loss_mlp": 1.02335966, + "epoch": 0.5849391252066737, + "flos": 22418852469120.0, + "grad_norm": 1.7913748440588402, + "language_loss": 0.77578616, + "learning_rate": 1.550728272957027e-06, + "loss": 0.79662937, + "num_input_tokens_seen": 209729000, + "step": 9729, + "time_per_iteration": 2.768932819366455 + }, + { + "auxiliary_loss_clip": 0.01045501, + "auxiliary_loss_mlp": 0.01030489, + "balance_loss_clip": 1.02239656, + "balance_loss_mlp": 1.01842535, + "epoch": 0.5849992484593417, + "flos": 25410929431680.0, + "grad_norm": 1.7434918008778288, + "language_loss": 0.70488232, + "learning_rate": 1.5503487733545782e-06, + "loss": 0.7256422, + "num_input_tokens_seen": 209747435, + "step": 9730, + "time_per_iteration": 2.6441609859466553 + }, + { + "auxiliary_loss_clip": 0.01070276, + "auxiliary_loss_mlp": 0.01031136, + "balance_loss_clip": 1.02789295, + "balance_loss_mlp": 1.01904798, + "epoch": 0.5850593717120096, + "flos": 21065163586560.0, + "grad_norm": 1.839858121676477, + "language_loss": 0.78438848, + "learning_rate": 1.5499692908029482e-06, + "loss": 0.80540264, + "num_input_tokens_seen": 209764910, + "step": 9731, + "time_per_iteration": 2.538573741912842 + }, + { + "auxiliary_loss_clip": 0.01048977, + "auxiliary_loss_mlp": 0.01037256, + "balance_loss_clip": 1.02383447, + "balance_loss_mlp": 1.02457166, + "epoch": 0.5851194949646776, + "flos": 25301545539840.0, + "grad_norm": 3.361206551052575, + "language_loss": 0.70130211, + "learning_rate": 1.549589825316528e-06, + "loss": 0.72216439, + "num_input_tokens_seen": 209786115, + "step": 9732, + "time_per_iteration": 2.648247480392456 + }, + { + "auxiliary_loss_clip": 0.0100997, + "auxiliary_loss_mlp": 0.01036472, + "balance_loss_clip": 1.02106285, + "balance_loss_mlp": 1.02317381, + "epoch": 0.5851796182173455, + "flos": 23587242065280.0, + "grad_norm": 2.1465328519500093, + "language_loss": 0.52526355, + "learning_rate": 1.5492103769097075e-06, + "loss": 0.54572797, + "num_input_tokens_seen": 209806095, + "step": 9733, + "time_per_iteration": 2.695389986038208 + }, + { + "auxiliary_loss_clip": 0.01047936, + "auxiliary_loss_mlp": 0.01031889, + "balance_loss_clip": 1.02419758, + "balance_loss_mlp": 1.01986122, + "epoch": 0.5852397414700136, + "flos": 24822712310400.0, + "grad_norm": 2.164248081024882, + "language_loss": 0.87853014, + "learning_rate": 1.5488309455968739e-06, + "loss": 0.89932841, + "num_input_tokens_seen": 209823650, + "step": 9734, + "time_per_iteration": 4.1468589305877686 + }, + { + "auxiliary_loss_clip": 0.01035073, + "auxiliary_loss_mlp": 0.01032331, + "balance_loss_clip": 1.0227654, + "balance_loss_mlp": 1.02190042, + "epoch": 0.5852998647226815, + "flos": 19937784343680.0, + "grad_norm": 1.5219521728453487, + "language_loss": 0.72317576, + "learning_rate": 1.5484515313924163e-06, + "loss": 0.74384987, + "num_input_tokens_seen": 209843220, + "step": 9735, + "time_per_iteration": 2.6498095989227295 + }, + { + "auxiliary_loss_clip": 0.01057513, + "auxiliary_loss_mlp": 0.01037239, + "balance_loss_clip": 1.02546847, + "balance_loss_mlp": 1.02518654, + "epoch": 0.5853599879753495, + "flos": 16720367408640.0, + "grad_norm": 2.2937321032877347, + "language_loss": 0.74096352, + "learning_rate": 1.5480721343107217e-06, + "loss": 0.76191109, + "num_input_tokens_seen": 209854880, + "step": 9736, + "time_per_iteration": 2.656798839569092 + }, + { + "auxiliary_loss_clip": 0.01028399, + "auxiliary_loss_mlp": 0.01032157, + "balance_loss_clip": 1.02309251, + "balance_loss_mlp": 1.02072525, + "epoch": 0.5854201112280174, + "flos": 44456583680640.0, + "grad_norm": 1.4901507537765242, + "language_loss": 0.70607328, + "learning_rate": 1.5476927543661772e-06, + "loss": 0.72667885, + "num_input_tokens_seen": 209877870, + "step": 9737, + "time_per_iteration": 2.866558313369751 + }, + { + "auxiliary_loss_clip": 0.01026155, + "auxiliary_loss_mlp": 0.01035738, + "balance_loss_clip": 1.02265334, + "balance_loss_mlp": 1.02464533, + "epoch": 0.5854802344806854, + "flos": 20339193807360.0, + "grad_norm": 1.9270897593643481, + "language_loss": 0.82432222, + "learning_rate": 1.547313391573169e-06, + "loss": 0.84494114, + "num_input_tokens_seen": 209896690, + "step": 9738, + "time_per_iteration": 2.7566330432891846 + }, + { + "auxiliary_loss_clip": 0.01068907, + "auxiliary_loss_mlp": 0.00747629, + "balance_loss_clip": 1.02662253, + "balance_loss_mlp": 1.00044823, + "epoch": 0.5855403577333533, + "flos": 20921054221440.0, + "grad_norm": 1.9498127950605677, + "language_loss": 0.68456101, + "learning_rate": 1.546934045946082e-06, + "loss": 0.70272636, + "num_input_tokens_seen": 209914640, + "step": 9739, + "time_per_iteration": 2.660862445831299 + }, + { + "auxiliary_loss_clip": 0.01065874, + "auxiliary_loss_mlp": 0.01024713, + "balance_loss_clip": 1.02474737, + "balance_loss_mlp": 1.01328111, + "epoch": 0.5856004809860214, + "flos": 20448649526400.0, + "grad_norm": 2.198605803007019, + "language_loss": 0.58540797, + "learning_rate": 1.5465547174993017e-06, + "loss": 0.60631382, + "num_input_tokens_seen": 209933375, + "step": 9740, + "time_per_iteration": 2.6818318367004395 + }, + { + "auxiliary_loss_clip": 0.01042744, + "auxiliary_loss_mlp": 0.01025525, + "balance_loss_clip": 1.02310836, + "balance_loss_mlp": 1.01474798, + "epoch": 0.5856606042386893, + "flos": 19640766781440.0, + "grad_norm": 1.9558385791391162, + "language_loss": 0.75436312, + "learning_rate": 1.5461754062472113e-06, + "loss": 0.77504581, + "num_input_tokens_seen": 209952055, + "step": 9741, + "time_per_iteration": 2.7154605388641357 + }, + { + "auxiliary_loss_clip": 0.01031673, + "auxiliary_loss_mlp": 0.01026125, + "balance_loss_clip": 1.02495277, + "balance_loss_mlp": 1.01532435, + "epoch": 0.5857207274913573, + "flos": 21686166846720.0, + "grad_norm": 3.1034983996445207, + "language_loss": 0.75652838, + "learning_rate": 1.5457961122041959e-06, + "loss": 0.77710629, + "num_input_tokens_seen": 209971190, + "step": 9742, + "time_per_iteration": 2.770662307739258 + }, + { + "auxiliary_loss_clip": 0.01044744, + "auxiliary_loss_mlp": 0.0102587, + "balance_loss_clip": 1.02542377, + "balance_loss_mlp": 1.01527274, + "epoch": 0.5857808507440253, + "flos": 23182708118400.0, + "grad_norm": 2.021969514378698, + "language_loss": 0.75028843, + "learning_rate": 1.5454168353846369e-06, + "loss": 0.77099454, + "num_input_tokens_seen": 209990695, + "step": 9743, + "time_per_iteration": 2.6527018547058105 + }, + { + "auxiliary_loss_clip": 0.01043842, + "auxiliary_loss_mlp": 0.01026924, + "balance_loss_clip": 1.02551842, + "balance_loss_mlp": 1.01729214, + "epoch": 0.5858409739966932, + "flos": 27235299156480.0, + "grad_norm": 1.7085360181154543, + "language_loss": 0.80528212, + "learning_rate": 1.5450375758029172e-06, + "loss": 0.82598984, + "num_input_tokens_seen": 210010210, + "step": 9744, + "time_per_iteration": 2.708550214767456 + }, + { + "auxiliary_loss_clip": 0.01043966, + "auxiliary_loss_mlp": 0.01032683, + "balance_loss_clip": 1.02620959, + "balance_loss_mlp": 1.02130437, + "epoch": 0.5859010972493612, + "flos": 27855512317440.0, + "grad_norm": 1.5806944681455848, + "language_loss": 0.71636522, + "learning_rate": 1.5446583334734183e-06, + "loss": 0.73713166, + "num_input_tokens_seen": 210030030, + "step": 9745, + "time_per_iteration": 2.685232162475586 + }, + { + "auxiliary_loss_clip": 0.00987758, + "auxiliary_loss_mlp": 0.01001653, + "balance_loss_clip": 1.00197411, + "balance_loss_mlp": 1.00064588, + "epoch": 0.5859612205020291, + "flos": 70007064428160.0, + "grad_norm": 0.7246973352674247, + "language_loss": 0.53337574, + "learning_rate": 1.5442791084105204e-06, + "loss": 0.55326986, + "num_input_tokens_seen": 210094840, + "step": 9746, + "time_per_iteration": 3.285555124282837 + }, + { + "auxiliary_loss_clip": 0.01037986, + "auxiliary_loss_mlp": 0.01029533, + "balance_loss_clip": 1.02465391, + "balance_loss_mlp": 1.01779091, + "epoch": 0.5860213437546972, + "flos": 24056019486720.0, + "grad_norm": 5.625026552531023, + "language_loss": 0.72904193, + "learning_rate": 1.5438999006286054e-06, + "loss": 0.74971712, + "num_input_tokens_seen": 210114660, + "step": 9747, + "time_per_iteration": 2.6609580516815186 + }, + { + "auxiliary_loss_clip": 0.01033925, + "auxiliary_loss_mlp": 0.01034063, + "balance_loss_clip": 1.02199566, + "balance_loss_mlp": 1.02159381, + "epoch": 0.5860814670073651, + "flos": 18947583141120.0, + "grad_norm": 1.8948996348298417, + "language_loss": 0.81344545, + "learning_rate": 1.543520710142051e-06, + "loss": 0.83412534, + "num_input_tokens_seen": 210132770, + "step": 9748, + "time_per_iteration": 2.5941097736358643 + }, + { + "auxiliary_loss_clip": 0.01056351, + "auxiliary_loss_mlp": 0.01029996, + "balance_loss_clip": 1.02584791, + "balance_loss_mlp": 1.01910663, + "epoch": 0.5861415902600331, + "flos": 22561848512640.0, + "grad_norm": 1.756574880684623, + "language_loss": 0.72063172, + "learning_rate": 1.5431415369652375e-06, + "loss": 0.74149525, + "num_input_tokens_seen": 210151895, + "step": 9749, + "time_per_iteration": 2.710705518722534 + }, + { + "auxiliary_loss_clip": 0.01047072, + "auxiliary_loss_mlp": 0.01025551, + "balance_loss_clip": 1.02791047, + "balance_loss_mlp": 1.01516247, + "epoch": 0.586201713512701, + "flos": 14392027912320.0, + "grad_norm": 2.5595947803989865, + "language_loss": 0.75100398, + "learning_rate": 1.5427623811125428e-06, + "loss": 0.77173018, + "num_input_tokens_seen": 210168040, + "step": 9750, + "time_per_iteration": 2.6935505867004395 + }, + { + "auxiliary_loss_clip": 0.01036455, + "auxiliary_loss_mlp": 0.0103203, + "balance_loss_clip": 1.02652693, + "balance_loss_mlp": 1.02095592, + "epoch": 0.586261836765369, + "flos": 19498560837120.0, + "grad_norm": 1.5928580573944668, + "language_loss": 0.70845747, + "learning_rate": 1.542383242598344e-06, + "loss": 0.72914231, + "num_input_tokens_seen": 210187720, + "step": 9751, + "time_per_iteration": 2.669773578643799 + }, + { + "auxiliary_loss_clip": 0.01066659, + "auxiliary_loss_mlp": 0.01034092, + "balance_loss_clip": 1.02487111, + "balance_loss_mlp": 1.02186728, + "epoch": 0.5863219600180369, + "flos": 20701819560960.0, + "grad_norm": 1.882070040089236, + "language_loss": 0.74487948, + "learning_rate": 1.5420041214370184e-06, + "loss": 0.76588702, + "num_input_tokens_seen": 210206080, + "step": 9752, + "time_per_iteration": 2.5600500106811523 + }, + { + "auxiliary_loss_clip": 0.0105366, + "auxiliary_loss_mlp": 0.01029067, + "balance_loss_clip": 1.0248456, + "balance_loss_mlp": 1.01858282, + "epoch": 0.586382083270705, + "flos": 19792130693760.0, + "grad_norm": 1.7889886854965609, + "language_loss": 0.77250874, + "learning_rate": 1.541625017642943e-06, + "loss": 0.79333603, + "num_input_tokens_seen": 210225660, + "step": 9753, + "time_per_iteration": 2.574798107147217 + }, + { + "auxiliary_loss_clip": 0.01061751, + "auxiliary_loss_mlp": 0.01025091, + "balance_loss_clip": 1.02447629, + "balance_loss_mlp": 1.01516688, + "epoch": 0.5864422065233729, + "flos": 16500558130560.0, + "grad_norm": 1.7907032307460202, + "language_loss": 0.71013844, + "learning_rate": 1.5412459312304927e-06, + "loss": 0.73100686, + "num_input_tokens_seen": 210242725, + "step": 9754, + "time_per_iteration": 2.6383187770843506 + }, + { + "auxiliary_loss_clip": 0.01037942, + "auxiliary_loss_mlp": 0.0102868, + "balance_loss_clip": 1.02214682, + "balance_loss_mlp": 1.01746249, + "epoch": 0.5865023297760409, + "flos": 20413277608320.0, + "grad_norm": 3.0066241679351084, + "language_loss": 0.72190261, + "learning_rate": 1.540866862214043e-06, + "loss": 0.74256891, + "num_input_tokens_seen": 210263225, + "step": 9755, + "time_per_iteration": 2.5429601669311523 + }, + { + "auxiliary_loss_clip": 0.00981262, + "auxiliary_loss_mlp": 0.01001347, + "balance_loss_clip": 1.00498128, + "balance_loss_mlp": 1.00016069, + "epoch": 0.5865624530287089, + "flos": 63350769254400.0, + "grad_norm": 0.733053306032874, + "language_loss": 0.56942236, + "learning_rate": 1.540487810607967e-06, + "loss": 0.58924842, + "num_input_tokens_seen": 210322310, + "step": 9756, + "time_per_iteration": 3.1953046321868896 + }, + { + "auxiliary_loss_clip": 0.01062269, + "auxiliary_loss_mlp": 0.01029855, + "balance_loss_clip": 1.02454376, + "balance_loss_mlp": 1.01985312, + "epoch": 0.5866225762813768, + "flos": 27016279977600.0, + "grad_norm": 1.668431704502581, + "language_loss": 0.76175857, + "learning_rate": 1.5401087764266396e-06, + "loss": 0.78267986, + "num_input_tokens_seen": 210340845, + "step": 9757, + "time_per_iteration": 2.55202054977417 + }, + { + "auxiliary_loss_clip": 0.00988882, + "auxiliary_loss_mlp": 0.01003495, + "balance_loss_clip": 1.00281024, + "balance_loss_mlp": 1.00223148, + "epoch": 0.5866826995340448, + "flos": 72987038507520.0, + "grad_norm": 0.85376797033598, + "language_loss": 0.6051563, + "learning_rate": 1.5397297596844337e-06, + "loss": 0.62508011, + "num_input_tokens_seen": 210397815, + "step": 9758, + "time_per_iteration": 3.1648852825164795 + }, + { + "auxiliary_loss_clip": 0.01067786, + "auxiliary_loss_mlp": 0.01032339, + "balance_loss_clip": 1.02530313, + "balance_loss_mlp": 1.02062082, + "epoch": 0.5867428227867127, + "flos": 21285727050240.0, + "grad_norm": 2.743670806372251, + "language_loss": 0.72147334, + "learning_rate": 1.5393507603957212e-06, + "loss": 0.74247456, + "num_input_tokens_seen": 210413900, + "step": 9759, + "time_per_iteration": 2.4919486045837402 + }, + { + "auxiliary_loss_clip": 0.01038897, + "auxiliary_loss_mlp": 0.01028878, + "balance_loss_clip": 1.02233338, + "balance_loss_mlp": 1.01858425, + "epoch": 0.5868029460393808, + "flos": 33468852188160.0, + "grad_norm": 1.589450099420741, + "language_loss": 0.73062807, + "learning_rate": 1.5389717785748742e-06, + "loss": 0.75130582, + "num_input_tokens_seen": 210434110, + "step": 9760, + "time_per_iteration": 2.6905784606933594 + }, + { + "auxiliary_loss_clip": 0.01053361, + "auxiliary_loss_mlp": 0.01025172, + "balance_loss_clip": 1.02480769, + "balance_loss_mlp": 1.01420438, + "epoch": 0.5868630692920487, + "flos": 17889475276800.0, + "grad_norm": 1.8010191250998688, + "language_loss": 0.73129284, + "learning_rate": 1.5385928142362637e-06, + "loss": 0.75207818, + "num_input_tokens_seen": 210451685, + "step": 9761, + "time_per_iteration": 2.5262184143066406 + }, + { + "auxiliary_loss_clip": 0.01051507, + "auxiliary_loss_mlp": 0.01027442, + "balance_loss_clip": 1.02853739, + "balance_loss_mlp": 1.0154016, + "epoch": 0.5869231925447167, + "flos": 21035035054080.0, + "grad_norm": 1.8331311485111328, + "language_loss": 0.75100875, + "learning_rate": 1.5382138673942597e-06, + "loss": 0.77179825, + "num_input_tokens_seen": 210470825, + "step": 9762, + "time_per_iteration": 4.240907192230225 + }, + { + "auxiliary_loss_clip": 0.01026808, + "auxiliary_loss_mlp": 0.01029977, + "balance_loss_clip": 1.02234757, + "balance_loss_mlp": 1.01871741, + "epoch": 0.5869833157973846, + "flos": 74738219293440.0, + "grad_norm": 1.2856409628838632, + "language_loss": 0.72131646, + "learning_rate": 1.5378349380632317e-06, + "loss": 0.74188435, + "num_input_tokens_seen": 210500075, + "step": 9763, + "time_per_iteration": 4.70911431312561 + }, + { + "auxiliary_loss_clip": 0.01047287, + "auxiliary_loss_mlp": 0.01027355, + "balance_loss_clip": 1.02223015, + "balance_loss_mlp": 1.0169127, + "epoch": 0.5870434390500526, + "flos": 17638998762240.0, + "grad_norm": 1.4964671129113565, + "language_loss": 0.80046165, + "learning_rate": 1.53745602625755e-06, + "loss": 0.82120812, + "num_input_tokens_seen": 210518150, + "step": 9764, + "time_per_iteration": 2.5906598567962646 + }, + { + "auxiliary_loss_clip": 0.0104768, + "auxiliary_loss_mlp": 0.01033635, + "balance_loss_clip": 1.02818727, + "balance_loss_mlp": 1.02273321, + "epoch": 0.5871035623027205, + "flos": 21506146859520.0, + "grad_norm": 1.5912136419072578, + "language_loss": 0.78918493, + "learning_rate": 1.5370771319915819e-06, + "loss": 0.80999804, + "num_input_tokens_seen": 210537760, + "step": 9765, + "time_per_iteration": 2.6451284885406494 + }, + { + "auxiliary_loss_clip": 0.0103727, + "auxiliary_loss_mlp": 0.01032724, + "balance_loss_clip": 1.02515233, + "balance_loss_mlp": 1.02143526, + "epoch": 0.5871636855553886, + "flos": 13551861818880.0, + "grad_norm": 1.8747561010839469, + "language_loss": 0.83322197, + "learning_rate": 1.5366982552796947e-06, + "loss": 0.85392189, + "num_input_tokens_seen": 210555515, + "step": 9766, + "time_per_iteration": 2.7093966007232666 + }, + { + "auxiliary_loss_clip": 0.0105778, + "auxiliary_loss_mlp": 0.01032134, + "balance_loss_clip": 1.02601767, + "balance_loss_mlp": 1.02128637, + "epoch": 0.5872238088080565, + "flos": 26212922346240.0, + "grad_norm": 1.6446158801032926, + "language_loss": 0.69800591, + "learning_rate": 1.536319396136257e-06, + "loss": 0.71890509, + "num_input_tokens_seen": 210575000, + "step": 9767, + "time_per_iteration": 2.686420440673828 + }, + { + "auxiliary_loss_clip": 0.01047299, + "auxiliary_loss_mlp": 0.00747637, + "balance_loss_clip": 1.02342391, + "balance_loss_mlp": 1.00056148, + "epoch": 0.5872839320607245, + "flos": 30665198995200.0, + "grad_norm": 1.7001546317494651, + "language_loss": 0.63429481, + "learning_rate": 1.5359405545756336e-06, + "loss": 0.65224421, + "num_input_tokens_seen": 210595185, + "step": 9768, + "time_per_iteration": 2.6865365505218506 + }, + { + "auxiliary_loss_clip": 0.01007952, + "auxiliary_loss_mlp": 0.00746967, + "balance_loss_clip": 1.00192833, + "balance_loss_mlp": 1.00141585, + "epoch": 0.5873440553133924, + "flos": 60303570871680.0, + "grad_norm": 0.717964463996929, + "language_loss": 0.53883111, + "learning_rate": 1.5355617306121914e-06, + "loss": 0.55638027, + "num_input_tokens_seen": 210653210, + "step": 9769, + "time_per_iteration": 3.1981680393218994 + }, + { + "auxiliary_loss_clip": 0.01029434, + "auxiliary_loss_mlp": 0.01028387, + "balance_loss_clip": 1.02215767, + "balance_loss_mlp": 1.0179379, + "epoch": 0.5874041785660604, + "flos": 21539292134400.0, + "grad_norm": 1.4167882832412777, + "language_loss": 0.70954239, + "learning_rate": 1.5351829242602945e-06, + "loss": 0.73012054, + "num_input_tokens_seen": 210673750, + "step": 9770, + "time_per_iteration": 2.7787392139434814 + }, + { + "auxiliary_loss_clip": 0.01022767, + "auxiliary_loss_mlp": 0.01029991, + "balance_loss_clip": 1.02451873, + "balance_loss_mlp": 1.01893449, + "epoch": 0.5874643018187284, + "flos": 24388947671040.0, + "grad_norm": 1.759463885923714, + "language_loss": 0.67452461, + "learning_rate": 1.5348041355343077e-06, + "loss": 0.69505215, + "num_input_tokens_seen": 210692960, + "step": 9771, + "time_per_iteration": 2.7164623737335205 + }, + { + "auxiliary_loss_clip": 0.01017363, + "auxiliary_loss_mlp": 0.01036348, + "balance_loss_clip": 1.02075315, + "balance_loss_mlp": 1.02298439, + "epoch": 0.5875244250713964, + "flos": 28147717457280.0, + "grad_norm": 1.5649003397319465, + "language_loss": 0.65983343, + "learning_rate": 1.5344253644485954e-06, + "loss": 0.68037057, + "num_input_tokens_seen": 210714040, + "step": 9772, + "time_per_iteration": 2.7220218181610107 + }, + { + "auxiliary_loss_clip": 0.01067128, + "auxiliary_loss_mlp": 0.01035727, + "balance_loss_clip": 1.02568412, + "balance_loss_mlp": 1.02378869, + "epoch": 0.5875845483240644, + "flos": 25812410722560.0, + "grad_norm": 1.6307405444687062, + "language_loss": 0.74014241, + "learning_rate": 1.534046611017519e-06, + "loss": 0.76117098, + "num_input_tokens_seen": 210733710, + "step": 9773, + "time_per_iteration": 4.352917194366455 + }, + { + "auxiliary_loss_clip": 0.01035301, + "auxiliary_loss_mlp": 0.01031758, + "balance_loss_clip": 1.02575541, + "balance_loss_mlp": 1.02010572, + "epoch": 0.5876446715767323, + "flos": 26906572863360.0, + "grad_norm": 2.623513986864115, + "language_loss": 0.53713071, + "learning_rate": 1.5336678752554421e-06, + "loss": 0.55780137, + "num_input_tokens_seen": 210753580, + "step": 9774, + "time_per_iteration": 2.8160557746887207 + }, + { + "auxiliary_loss_clip": 0.01057477, + "auxiliary_loss_mlp": 0.01031518, + "balance_loss_clip": 1.02622724, + "balance_loss_mlp": 1.01982319, + "epoch": 0.5877047948294003, + "flos": 36684832579200.0, + "grad_norm": 2.306287885966426, + "language_loss": 0.64807534, + "learning_rate": 1.5332891571767264e-06, + "loss": 0.66896534, + "num_input_tokens_seen": 210773495, + "step": 9775, + "time_per_iteration": 2.7319095134735107 + }, + { + "auxiliary_loss_clip": 0.01054346, + "auxiliary_loss_mlp": 0.01029018, + "balance_loss_clip": 1.02472115, + "balance_loss_mlp": 1.01790214, + "epoch": 0.5877649180820682, + "flos": 26724721282560.0, + "grad_norm": 2.356400259884834, + "language_loss": 0.73559457, + "learning_rate": 1.5329104567957326e-06, + "loss": 0.75642824, + "num_input_tokens_seen": 210793645, + "step": 9776, + "time_per_iteration": 2.603069305419922 + }, + { + "auxiliary_loss_clip": 0.01062949, + "auxiliary_loss_mlp": 0.01028402, + "balance_loss_clip": 1.02414739, + "balance_loss_mlp": 1.01801872, + "epoch": 0.5878250413347362, + "flos": 21032197879680.0, + "grad_norm": 1.9985254126960623, + "language_loss": 0.74417824, + "learning_rate": 1.532531774126821e-06, + "loss": 0.76509178, + "num_input_tokens_seen": 210813415, + "step": 9777, + "time_per_iteration": 2.5216217041015625 + }, + { + "auxiliary_loss_clip": 0.01033952, + "auxiliary_loss_mlp": 0.01028656, + "balance_loss_clip": 1.02710938, + "balance_loss_mlp": 1.01849949, + "epoch": 0.5878851645874041, + "flos": 25484259047040.0, + "grad_norm": 1.4997773574443243, + "language_loss": 0.74251056, + "learning_rate": 1.5321531091843512e-06, + "loss": 0.76313663, + "num_input_tokens_seen": 210833850, + "step": 9778, + "time_per_iteration": 2.715691089630127 + }, + { + "auxiliary_loss_clip": 0.01016065, + "auxiliary_loss_mlp": 0.01029333, + "balance_loss_clip": 1.02035999, + "balance_loss_mlp": 1.01745403, + "epoch": 0.5879452878400722, + "flos": 23769129559680.0, + "grad_norm": 1.8504176321486863, + "language_loss": 0.70038998, + "learning_rate": 1.5317744619826824e-06, + "loss": 0.72084391, + "num_input_tokens_seen": 210853115, + "step": 9779, + "time_per_iteration": 2.73936128616333 + }, + { + "auxiliary_loss_clip": 0.01065629, + "auxiliary_loss_mlp": 0.00747699, + "balance_loss_clip": 1.02439451, + "balance_loss_mlp": 1.00059533, + "epoch": 0.5880054110927401, + "flos": 17824513530240.0, + "grad_norm": 1.8535227441828759, + "language_loss": 0.66947848, + "learning_rate": 1.5313958325361727e-06, + "loss": 0.68761176, + "num_input_tokens_seen": 210872090, + "step": 9780, + "time_per_iteration": 2.5851478576660156 + }, + { + "auxiliary_loss_clip": 0.0105082, + "auxiliary_loss_mlp": 0.0103264, + "balance_loss_clip": 1.03106725, + "balance_loss_mlp": 1.02086782, + "epoch": 0.5880655343454081, + "flos": 19463404400640.0, + "grad_norm": 1.7950547306325233, + "language_loss": 0.72729456, + "learning_rate": 1.5310172208591807e-06, + "loss": 0.74812913, + "num_input_tokens_seen": 210888490, + "step": 9781, + "time_per_iteration": 2.675269365310669 + }, + { + "auxiliary_loss_clip": 0.01036557, + "auxiliary_loss_mlp": 0.00747597, + "balance_loss_clip": 1.02469897, + "balance_loss_mlp": 1.00054038, + "epoch": 0.588125657598076, + "flos": 21397588980480.0, + "grad_norm": 1.381291044534111, + "language_loss": 0.70518553, + "learning_rate": 1.5306386269660622e-06, + "loss": 0.72302705, + "num_input_tokens_seen": 210908220, + "step": 9782, + "time_per_iteration": 4.231954574584961 + }, + { + "auxiliary_loss_clip": 0.01049319, + "auxiliary_loss_mlp": 0.01030064, + "balance_loss_clip": 1.02241075, + "balance_loss_mlp": 1.01764202, + "epoch": 0.588185780850744, + "flos": 16034653797120.0, + "grad_norm": 4.438465161192885, + "language_loss": 0.70274138, + "learning_rate": 1.5302600508711741e-06, + "loss": 0.72353524, + "num_input_tokens_seen": 210923945, + "step": 9783, + "time_per_iteration": 2.5483298301696777 + }, + { + "auxiliary_loss_clip": 0.01030235, + "auxiliary_loss_mlp": 0.01032046, + "balance_loss_clip": 1.02630353, + "balance_loss_mlp": 1.01962435, + "epoch": 0.588245904103412, + "flos": 23728226947200.0, + "grad_norm": 2.402493623444614, + "language_loss": 0.69039434, + "learning_rate": 1.5298814925888719e-06, + "loss": 0.71101713, + "num_input_tokens_seen": 210941955, + "step": 9784, + "time_per_iteration": 2.694216251373291 + }, + { + "auxiliary_loss_clip": 0.01026254, + "auxiliary_loss_mlp": 0.01030501, + "balance_loss_clip": 1.0243814, + "balance_loss_mlp": 1.01961112, + "epoch": 0.58830602735608, + "flos": 33802534558080.0, + "grad_norm": 2.3311524564267807, + "language_loss": 0.69608676, + "learning_rate": 1.5295029521335102e-06, + "loss": 0.7166543, + "num_input_tokens_seen": 210963105, + "step": 9785, + "time_per_iteration": 2.8718526363372803 + }, + { + "auxiliary_loss_clip": 0.01053722, + "auxiliary_loss_mlp": 0.01024375, + "balance_loss_clip": 1.02504551, + "balance_loss_mlp": 1.01420665, + "epoch": 0.588366150608748, + "flos": 17090714586240.0, + "grad_norm": 2.5829003350490307, + "language_loss": 0.77400279, + "learning_rate": 1.5291244295194448e-06, + "loss": 0.79478371, + "num_input_tokens_seen": 210978720, + "step": 9786, + "time_per_iteration": 2.578862190246582 + }, + { + "auxiliary_loss_clip": 0.01047951, + "auxiliary_loss_mlp": 0.01028925, + "balance_loss_clip": 1.02602601, + "balance_loss_mlp": 1.01797509, + "epoch": 0.5884262738614159, + "flos": 22127186033280.0, + "grad_norm": 1.756113362942012, + "language_loss": 0.79364908, + "learning_rate": 1.5287459247610276e-06, + "loss": 0.81441784, + "num_input_tokens_seen": 210998750, + "step": 9787, + "time_per_iteration": 2.6369147300720215 + }, + { + "auxiliary_loss_clip": 0.01037434, + "auxiliary_loss_mlp": 0.01034374, + "balance_loss_clip": 1.02658927, + "balance_loss_mlp": 1.02365065, + "epoch": 0.5884863971140839, + "flos": 21031838743680.0, + "grad_norm": 1.4527112900185697, + "language_loss": 0.66296983, + "learning_rate": 1.5283674378726116e-06, + "loss": 0.68368793, + "num_input_tokens_seen": 211017550, + "step": 9788, + "time_per_iteration": 2.6486713886260986 + }, + { + "auxiliary_loss_clip": 0.01036495, + "auxiliary_loss_mlp": 0.01033474, + "balance_loss_clip": 1.02303195, + "balance_loss_mlp": 1.02084398, + "epoch": 0.5885465203667518, + "flos": 23805112008960.0, + "grad_norm": 2.0026268364514808, + "language_loss": 0.80314445, + "learning_rate": 1.5279889688685506e-06, + "loss": 0.82384419, + "num_input_tokens_seen": 211034135, + "step": 9789, + "time_per_iteration": 2.6664488315582275 + }, + { + "auxiliary_loss_clip": 0.01034472, + "auxiliary_loss_mlp": 0.00747496, + "balance_loss_clip": 1.02204084, + "balance_loss_mlp": 1.00050402, + "epoch": 0.5886066436194198, + "flos": 18880574319360.0, + "grad_norm": 1.5491800826086373, + "language_loss": 0.7053901, + "learning_rate": 1.5276105177631944e-06, + "loss": 0.7232098, + "num_input_tokens_seen": 211053850, + "step": 9790, + "time_per_iteration": 2.6815717220306396 + }, + { + "auxiliary_loss_clip": 0.01035035, + "auxiliary_loss_mlp": 0.01029365, + "balance_loss_clip": 1.02588081, + "balance_loss_mlp": 1.01796854, + "epoch": 0.5886667668720877, + "flos": 24790141653120.0, + "grad_norm": 1.630624568467038, + "language_loss": 0.83226013, + "learning_rate": 1.527232084570895e-06, + "loss": 0.85290408, + "num_input_tokens_seen": 211072165, + "step": 9791, + "time_per_iteration": 2.6733386516571045 + }, + { + "auxiliary_loss_clip": 0.01050028, + "auxiliary_loss_mlp": 0.01034198, + "balance_loss_clip": 1.0241015, + "balance_loss_mlp": 1.02202642, + "epoch": 0.5887268901247558, + "flos": 21614381516160.0, + "grad_norm": 1.519561835283685, + "language_loss": 0.7642113, + "learning_rate": 1.5268536693060026e-06, + "loss": 0.78505355, + "num_input_tokens_seen": 211089630, + "step": 9792, + "time_per_iteration": 2.5942249298095703 + }, + { + "auxiliary_loss_clip": 0.0100348, + "auxiliary_loss_mlp": 0.01037856, + "balance_loss_clip": 1.01885045, + "balance_loss_mlp": 1.02495193, + "epoch": 0.5887870133774237, + "flos": 20481722974080.0, + "grad_norm": 2.062545096600662, + "language_loss": 0.69000417, + "learning_rate": 1.5264752719828662e-06, + "loss": 0.71041751, + "num_input_tokens_seen": 211106120, + "step": 9793, + "time_per_iteration": 2.8350489139556885 + }, + { + "auxiliary_loss_clip": 0.01062632, + "auxiliary_loss_mlp": 0.01028516, + "balance_loss_clip": 1.02507412, + "balance_loss_mlp": 1.0170598, + "epoch": 0.5888471366300917, + "flos": 19206283870080.0, + "grad_norm": 1.6191652181195009, + "language_loss": 0.60079467, + "learning_rate": 1.5260968926158353e-06, + "loss": 0.62170619, + "num_input_tokens_seen": 211122450, + "step": 9794, + "time_per_iteration": 2.5012030601501465 + }, + { + "auxiliary_loss_clip": 0.01033798, + "auxiliary_loss_mlp": 0.01037101, + "balance_loss_clip": 1.02438402, + "balance_loss_mlp": 1.02523994, + "epoch": 0.5889072598827596, + "flos": 19972904866560.0, + "grad_norm": 1.4369482865106116, + "language_loss": 0.64965987, + "learning_rate": 1.525718531219257e-06, + "loss": 0.67036885, + "num_input_tokens_seen": 211141765, + "step": 9795, + "time_per_iteration": 2.629924774169922 + }, + { + "auxiliary_loss_clip": 0.01022514, + "auxiliary_loss_mlp": 0.01028344, + "balance_loss_clip": 1.02200437, + "balance_loss_mlp": 1.01777053, + "epoch": 0.5889673831354276, + "flos": 20741249715840.0, + "grad_norm": 1.9672916724715905, + "language_loss": 0.74044871, + "learning_rate": 1.5253401878074801e-06, + "loss": 0.76095736, + "num_input_tokens_seen": 211160475, + "step": 9796, + "time_per_iteration": 2.659012794494629 + }, + { + "auxiliary_loss_clip": 0.01035691, + "auxiliary_loss_mlp": 0.01027763, + "balance_loss_clip": 1.02253056, + "balance_loss_mlp": 1.01673055, + "epoch": 0.5890275063880956, + "flos": 25300935008640.0, + "grad_norm": 1.5590243128188073, + "language_loss": 0.83272648, + "learning_rate": 1.5249618623948507e-06, + "loss": 0.85336101, + "num_input_tokens_seen": 211180480, + "step": 9797, + "time_per_iteration": 2.605616569519043 + }, + { + "auxiliary_loss_clip": 0.01036021, + "auxiliary_loss_mlp": 0.01027539, + "balance_loss_clip": 1.02130222, + "balance_loss_mlp": 1.01624978, + "epoch": 0.5890876296407636, + "flos": 11765377964160.0, + "grad_norm": 1.627144484944594, + "language_loss": 0.79095876, + "learning_rate": 1.5245835549957152e-06, + "loss": 0.81159437, + "num_input_tokens_seen": 211198000, + "step": 9798, + "time_per_iteration": 2.54582142829895 + }, + { + "auxiliary_loss_clip": 0.01062792, + "auxiliary_loss_mlp": 0.01032177, + "balance_loss_clip": 1.02470255, + "balance_loss_mlp": 1.02167487, + "epoch": 0.5891477528934316, + "flos": 13589460380160.0, + "grad_norm": 2.0008962777076142, + "language_loss": 0.74513698, + "learning_rate": 1.5242052656244186e-06, + "loss": 0.7660867, + "num_input_tokens_seen": 211214765, + "step": 9799, + "time_per_iteration": 2.511587381362915 + }, + { + "auxiliary_loss_clip": 0.01027709, + "auxiliary_loss_mlp": 0.01030532, + "balance_loss_clip": 1.02252388, + "balance_loss_mlp": 1.01844394, + "epoch": 0.5892078761460995, + "flos": 15049193189760.0, + "grad_norm": 2.035430224946708, + "language_loss": 0.76450235, + "learning_rate": 1.5238269942953064e-06, + "loss": 0.78508472, + "num_input_tokens_seen": 211232335, + "step": 9800, + "time_per_iteration": 2.5773138999938965 + }, + { + "auxiliary_loss_clip": 0.01014059, + "auxiliary_loss_mlp": 0.01040936, + "balance_loss_clip": 1.02126241, + "balance_loss_mlp": 1.02860367, + "epoch": 0.5892679993987675, + "flos": 15778215624960.0, + "grad_norm": 1.7148931218794385, + "language_loss": 0.78933835, + "learning_rate": 1.523448741022722e-06, + "loss": 0.8098883, + "num_input_tokens_seen": 211249985, + "step": 9801, + "time_per_iteration": 2.6908304691314697 + }, + { + "auxiliary_loss_clip": 0.01040199, + "auxiliary_loss_mlp": 0.0102915, + "balance_loss_clip": 1.0282228, + "balance_loss_mlp": 1.01747918, + "epoch": 0.5893281226514354, + "flos": 25265203954560.0, + "grad_norm": 1.7375321658992215, + "language_loss": 0.66238892, + "learning_rate": 1.5230705058210088e-06, + "loss": 0.68308246, + "num_input_tokens_seen": 211268425, + "step": 9802, + "time_per_iteration": 2.738878011703491 + }, + { + "auxiliary_loss_clip": 0.01053073, + "auxiliary_loss_mlp": 0.01026502, + "balance_loss_clip": 1.02454329, + "balance_loss_mlp": 1.01561856, + "epoch": 0.5893882459041034, + "flos": 19458232842240.0, + "grad_norm": 1.688155491005301, + "language_loss": 0.78196549, + "learning_rate": 1.5226922887045108e-06, + "loss": 0.8027612, + "num_input_tokens_seen": 211286680, + "step": 9803, + "time_per_iteration": 2.756707191467285 + }, + { + "auxiliary_loss_clip": 0.01054741, + "auxiliary_loss_mlp": 0.01034458, + "balance_loss_clip": 1.02470088, + "balance_loss_mlp": 1.02306128, + "epoch": 0.5894483691567713, + "flos": 20634056553600.0, + "grad_norm": 1.8673267798348958, + "language_loss": 0.73125011, + "learning_rate": 1.5223140896875686e-06, + "loss": 0.75214207, + "num_input_tokens_seen": 211307700, + "step": 9804, + "time_per_iteration": 2.7298710346221924 + }, + { + "auxiliary_loss_clip": 0.01044974, + "auxiliary_loss_mlp": 0.01027454, + "balance_loss_clip": 1.02669024, + "balance_loss_mlp": 1.01630807, + "epoch": 0.5895084924094394, + "flos": 17778223877760.0, + "grad_norm": 1.7806236127940145, + "language_loss": 0.74695277, + "learning_rate": 1.5219359087845234e-06, + "loss": 0.76767707, + "num_input_tokens_seen": 211324835, + "step": 9805, + "time_per_iteration": 2.6690163612365723 + }, + { + "auxiliary_loss_clip": 0.01059567, + "auxiliary_loss_mlp": 0.00747768, + "balance_loss_clip": 1.02578306, + "balance_loss_mlp": 1.00047398, + "epoch": 0.5895686156621073, + "flos": 20121072468480.0, + "grad_norm": 1.817988556983267, + "language_loss": 0.7787981, + "learning_rate": 1.5215577460097174e-06, + "loss": 0.79687148, + "num_input_tokens_seen": 211344130, + "step": 9806, + "time_per_iteration": 2.6007001399993896 + }, + { + "auxiliary_loss_clip": 0.0106472, + "auxiliary_loss_mlp": 0.01028801, + "balance_loss_clip": 1.02431023, + "balance_loss_mlp": 1.01701164, + "epoch": 0.5896287389147753, + "flos": 20850058990080.0, + "grad_norm": 1.941148081760114, + "language_loss": 0.76903129, + "learning_rate": 1.5211796013774887e-06, + "loss": 0.78996652, + "num_input_tokens_seen": 211362915, + "step": 9807, + "time_per_iteration": 2.5607359409332275 + }, + { + "auxiliary_loss_clip": 0.01059332, + "auxiliary_loss_mlp": 0.01029675, + "balance_loss_clip": 1.02761769, + "balance_loss_mlp": 1.01792669, + "epoch": 0.5896888621674432, + "flos": 14537897043840.0, + "grad_norm": 1.833035395023982, + "language_loss": 0.74502426, + "learning_rate": 1.5208014749021786e-06, + "loss": 0.76591432, + "num_input_tokens_seen": 211380700, + "step": 9808, + "time_per_iteration": 2.5817041397094727 + }, + { + "auxiliary_loss_clip": 0.01015911, + "auxiliary_loss_mlp": 0.01028335, + "balance_loss_clip": 1.0220294, + "balance_loss_mlp": 1.01575828, + "epoch": 0.5897489854201112, + "flos": 20886759711360.0, + "grad_norm": 2.0177987225855447, + "language_loss": 0.71825022, + "learning_rate": 1.5204233665981236e-06, + "loss": 0.7386927, + "num_input_tokens_seen": 211400095, + "step": 9809, + "time_per_iteration": 2.7458293437957764 + }, + { + "auxiliary_loss_clip": 0.01047997, + "auxiliary_loss_mlp": 0.01030283, + "balance_loss_clip": 1.0261848, + "balance_loss_mlp": 1.01818299, + "epoch": 0.5898091086727792, + "flos": 20011149872640.0, + "grad_norm": 1.9389481222768667, + "language_loss": 0.82292736, + "learning_rate": 1.5200452764796627e-06, + "loss": 0.84371018, + "num_input_tokens_seen": 211417810, + "step": 9810, + "time_per_iteration": 4.318255186080933 + }, + { + "auxiliary_loss_clip": 0.01055819, + "auxiliary_loss_mlp": 0.01029138, + "balance_loss_clip": 1.02571118, + "balance_loss_mlp": 1.01802766, + "epoch": 0.5898692319254472, + "flos": 16253242012800.0, + "grad_norm": 1.8638537862346942, + "language_loss": 0.81290662, + "learning_rate": 1.5196672045611336e-06, + "loss": 0.83375615, + "num_input_tokens_seen": 211436020, + "step": 9811, + "time_per_iteration": 4.303525686264038 + }, + { + "auxiliary_loss_clip": 0.01058917, + "auxiliary_loss_mlp": 0.0102593, + "balance_loss_clip": 1.02632737, + "balance_loss_mlp": 1.01351416, + "epoch": 0.5899293551781152, + "flos": 20448541785600.0, + "grad_norm": 1.6230489920340683, + "language_loss": 0.76737702, + "learning_rate": 1.5192891508568715e-06, + "loss": 0.78822553, + "num_input_tokens_seen": 211454335, + "step": 9812, + "time_per_iteration": 2.6128487586975098 + }, + { + "auxiliary_loss_clip": 0.01032134, + "auxiliary_loss_mlp": 0.01030558, + "balance_loss_clip": 1.02433705, + "balance_loss_mlp": 1.02051497, + "epoch": 0.5899894784307831, + "flos": 13881701433600.0, + "grad_norm": 1.7263152270981708, + "language_loss": 0.70409071, + "learning_rate": 1.5189111153812133e-06, + "loss": 0.72471762, + "num_input_tokens_seen": 211472775, + "step": 9813, + "time_per_iteration": 2.7135372161865234 + }, + { + "auxiliary_loss_clip": 0.01049891, + "auxiliary_loss_mlp": 0.01034747, + "balance_loss_clip": 1.02969933, + "balance_loss_mlp": 1.02301693, + "epoch": 0.5900496016834511, + "flos": 20083797129600.0, + "grad_norm": 1.6613536190954554, + "language_loss": 0.72253084, + "learning_rate": 1.518533098148494e-06, + "loss": 0.74337721, + "num_input_tokens_seen": 211492195, + "step": 9814, + "time_per_iteration": 2.616532564163208 + }, + { + "auxiliary_loss_clip": 0.01039297, + "auxiliary_loss_mlp": 0.01029438, + "balance_loss_clip": 1.0260452, + "balance_loss_mlp": 1.01809549, + "epoch": 0.590109724936119, + "flos": 20259148348800.0, + "grad_norm": 1.7987259011225014, + "language_loss": 0.78750932, + "learning_rate": 1.5181550991730476e-06, + "loss": 0.80819666, + "num_input_tokens_seen": 211510220, + "step": 9815, + "time_per_iteration": 2.618300199508667 + }, + { + "auxiliary_loss_clip": 0.01041922, + "auxiliary_loss_mlp": 0.00747765, + "balance_loss_clip": 1.02704942, + "balance_loss_mlp": 1.00050497, + "epoch": 0.590169848188787, + "flos": 24235069806720.0, + "grad_norm": 1.8593207612484608, + "language_loss": 0.75640309, + "learning_rate": 1.5177771184692083e-06, + "loss": 0.77429998, + "num_input_tokens_seen": 211526260, + "step": 9816, + "time_per_iteration": 2.694983959197998 + }, + { + "auxiliary_loss_clip": 0.01065549, + "auxiliary_loss_mlp": 0.01030967, + "balance_loss_clip": 1.02660191, + "balance_loss_mlp": 1.02007759, + "epoch": 0.590229971441455, + "flos": 17784724239360.0, + "grad_norm": 1.954288331155956, + "language_loss": 0.80861712, + "learning_rate": 1.517399156051309e-06, + "loss": 0.82958221, + "num_input_tokens_seen": 211542890, + "step": 9817, + "time_per_iteration": 2.60981822013855 + }, + { + "auxiliary_loss_clip": 0.01009125, + "auxiliary_loss_mlp": 0.0103645, + "balance_loss_clip": 1.02235222, + "balance_loss_mlp": 1.02439201, + "epoch": 0.590290094694123, + "flos": 22236893147520.0, + "grad_norm": 1.680868049851478, + "language_loss": 0.76475132, + "learning_rate": 1.517021211933682e-06, + "loss": 0.78520703, + "num_input_tokens_seen": 211562685, + "step": 9818, + "time_per_iteration": 2.7249958515167236 + }, + { + "auxiliary_loss_clip": 0.01033333, + "auxiliary_loss_mlp": 0.01027103, + "balance_loss_clip": 1.02483618, + "balance_loss_mlp": 1.0169996, + "epoch": 0.5903502179467909, + "flos": 19098623831040.0, + "grad_norm": 1.8414991012153006, + "language_loss": 0.66836077, + "learning_rate": 1.5166432861306592e-06, + "loss": 0.68896514, + "num_input_tokens_seen": 211579960, + "step": 9819, + "time_per_iteration": 2.741058826446533 + }, + { + "auxiliary_loss_clip": 0.0106606, + "auxiliary_loss_mlp": 0.01029513, + "balance_loss_clip": 1.02622163, + "balance_loss_mlp": 1.01846802, + "epoch": 0.5904103411994589, + "flos": 24235500769920.0, + "grad_norm": 1.9337762071614275, + "language_loss": 0.78340203, + "learning_rate": 1.5162653786565714e-06, + "loss": 0.80435777, + "num_input_tokens_seen": 211599310, + "step": 9820, + "time_per_iteration": 4.185360908508301 + }, + { + "auxiliary_loss_clip": 0.00980963, + "auxiliary_loss_mlp": 0.01004683, + "balance_loss_clip": 1.00460887, + "balance_loss_mlp": 1.00352657, + "epoch": 0.5904704644521268, + "flos": 64876613045760.0, + "grad_norm": 0.9444349928224544, + "language_loss": 0.65160072, + "learning_rate": 1.5158874895257487e-06, + "loss": 0.67145717, + "num_input_tokens_seen": 211658790, + "step": 9821, + "time_per_iteration": 3.3140130043029785 + }, + { + "auxiliary_loss_clip": 0.01025274, + "auxiliary_loss_mlp": 0.01037449, + "balance_loss_clip": 1.02361798, + "balance_loss_mlp": 1.02672005, + "epoch": 0.5905305877047948, + "flos": 19609991804160.0, + "grad_norm": 2.073906364907294, + "language_loss": 0.61282194, + "learning_rate": 1.515509618752521e-06, + "loss": 0.6334492, + "num_input_tokens_seen": 211677240, + "step": 9822, + "time_per_iteration": 2.6728501319885254 + }, + { + "auxiliary_loss_clip": 0.01068299, + "auxiliary_loss_mlp": 0.01036599, + "balance_loss_clip": 1.02711535, + "balance_loss_mlp": 1.02534568, + "epoch": 0.5905907109574628, + "flos": 18989634988800.0, + "grad_norm": 1.869802793165867, + "language_loss": 0.82645226, + "learning_rate": 1.5151317663512173e-06, + "loss": 0.84750128, + "num_input_tokens_seen": 211695485, + "step": 9823, + "time_per_iteration": 2.589853286743164 + }, + { + "auxiliary_loss_clip": 0.01042759, + "auxiliary_loss_mlp": 0.01026199, + "balance_loss_clip": 1.02390766, + "balance_loss_mlp": 1.01525581, + "epoch": 0.5906508342101308, + "flos": 22200407907840.0, + "grad_norm": 2.7306127342937465, + "language_loss": 0.73346198, + "learning_rate": 1.514753932336165e-06, + "loss": 0.75415158, + "num_input_tokens_seen": 211713090, + "step": 9824, + "time_per_iteration": 2.7645933628082275 + }, + { + "auxiliary_loss_clip": 0.01038486, + "auxiliary_loss_mlp": 0.00747793, + "balance_loss_clip": 1.02464914, + "balance_loss_mlp": 1.00049686, + "epoch": 0.5907109574627988, + "flos": 20886687884160.0, + "grad_norm": 2.2550509784106714, + "language_loss": 0.82370961, + "learning_rate": 1.514376116721693e-06, + "loss": 0.8415724, + "num_input_tokens_seen": 211732510, + "step": 9825, + "time_per_iteration": 2.751565456390381 + }, + { + "auxiliary_loss_clip": 0.01052308, + "auxiliary_loss_mlp": 0.01031903, + "balance_loss_clip": 1.0245682, + "balance_loss_mlp": 1.021842, + "epoch": 0.5907710807154667, + "flos": 21506649649920.0, + "grad_norm": 1.7363751259900284, + "language_loss": 0.76853418, + "learning_rate": 1.5139983195221272e-06, + "loss": 0.78937632, + "num_input_tokens_seen": 211748695, + "step": 9826, + "time_per_iteration": 2.650390148162842 + }, + { + "auxiliary_loss_clip": 0.01045003, + "auxiliary_loss_mlp": 0.01024615, + "balance_loss_clip": 1.02602482, + "balance_loss_mlp": 1.01406491, + "epoch": 0.5908312039681347, + "flos": 22018376759040.0, + "grad_norm": 1.5670751411709452, + "language_loss": 0.72224796, + "learning_rate": 1.513620540751793e-06, + "loss": 0.74294412, + "num_input_tokens_seen": 211768545, + "step": 9827, + "time_per_iteration": 2.6908977031707764 + }, + { + "auxiliary_loss_clip": 0.01024725, + "auxiliary_loss_mlp": 0.01030408, + "balance_loss_clip": 1.02424765, + "balance_loss_mlp": 1.0195601, + "epoch": 0.5908913272208026, + "flos": 18479523991680.0, + "grad_norm": 1.669271652886018, + "language_loss": 0.79622167, + "learning_rate": 1.5132427804250178e-06, + "loss": 0.816773, + "num_input_tokens_seen": 211786665, + "step": 9828, + "time_per_iteration": 2.7332189083099365 + }, + { + "auxiliary_loss_clip": 0.01019262, + "auxiliary_loss_mlp": 0.01033802, + "balance_loss_clip": 1.02567482, + "balance_loss_mlp": 1.02203023, + "epoch": 0.5909514504734706, + "flos": 12312189682560.0, + "grad_norm": 2.0047015332719207, + "language_loss": 0.88198572, + "learning_rate": 1.5128650385561241e-06, + "loss": 0.90251637, + "num_input_tokens_seen": 211801215, + "step": 9829, + "time_per_iteration": 4.438213109970093 + }, + { + "auxiliary_loss_clip": 0.00990414, + "auxiliary_loss_mlp": 0.01001328, + "balance_loss_clip": 1.00398397, + "balance_loss_mlp": 1.00024366, + "epoch": 0.5910115737261386, + "flos": 70213262451840.0, + "grad_norm": 0.7563115960162203, + "language_loss": 0.57870883, + "learning_rate": 1.5124873151594376e-06, + "loss": 0.59862626, + "num_input_tokens_seen": 211857005, + "step": 9830, + "time_per_iteration": 3.19509220123291 + }, + { + "auxiliary_loss_clip": 0.01061681, + "auxiliary_loss_mlp": 0.00747595, + "balance_loss_clip": 1.02640605, + "balance_loss_mlp": 1.00041366, + "epoch": 0.5910716969788066, + "flos": 22017766227840.0, + "grad_norm": 2.157909062749728, + "language_loss": 0.76151025, + "learning_rate": 1.5121096102492812e-06, + "loss": 0.77960306, + "num_input_tokens_seen": 211876675, + "step": 9831, + "time_per_iteration": 2.6473851203918457 + }, + { + "auxiliary_loss_clip": 0.01045532, + "auxiliary_loss_mlp": 0.01026981, + "balance_loss_clip": 1.02782249, + "balance_loss_mlp": 1.01573384, + "epoch": 0.5911318202314745, + "flos": 21251648021760.0, + "grad_norm": 1.6449901710396118, + "language_loss": 0.77136862, + "learning_rate": 1.5117319238399767e-06, + "loss": 0.79209375, + "num_input_tokens_seen": 211895725, + "step": 9832, + "time_per_iteration": 2.5901052951812744 + }, + { + "auxiliary_loss_clip": 0.01055369, + "auxiliary_loss_mlp": 0.01024797, + "balance_loss_clip": 1.0260036, + "balance_loss_mlp": 1.01402617, + "epoch": 0.5911919434841425, + "flos": 17821604528640.0, + "grad_norm": 1.7916296343323317, + "language_loss": 0.83889645, + "learning_rate": 1.511354255945847e-06, + "loss": 0.85969806, + "num_input_tokens_seen": 211913860, + "step": 9833, + "time_per_iteration": 2.678218364715576 + }, + { + "auxiliary_loss_clip": 0.01053907, + "auxiliary_loss_mlp": 0.01028918, + "balance_loss_clip": 1.02464056, + "balance_loss_mlp": 1.01780224, + "epoch": 0.5912520667368104, + "flos": 20374781207040.0, + "grad_norm": 1.653338917420814, + "language_loss": 0.7395311, + "learning_rate": 1.5109766065812123e-06, + "loss": 0.76035935, + "num_input_tokens_seen": 211932880, + "step": 9834, + "time_per_iteration": 2.537956714630127 + }, + { + "auxiliary_loss_clip": 0.0106393, + "auxiliary_loss_mlp": 0.01030646, + "balance_loss_clip": 1.02450979, + "balance_loss_mlp": 1.01967263, + "epoch": 0.5913121899894784, + "flos": 17930557457280.0, + "grad_norm": 2.161778867576667, + "language_loss": 0.77590024, + "learning_rate": 1.5105989757603942e-06, + "loss": 0.79684609, + "num_input_tokens_seen": 211948625, + "step": 9835, + "time_per_iteration": 2.5082972049713135 + }, + { + "auxiliary_loss_clip": 0.01040087, + "auxiliary_loss_mlp": 0.01028338, + "balance_loss_clip": 1.02316141, + "balance_loss_mlp": 1.01747811, + "epoch": 0.5913723132421465, + "flos": 22126934638080.0, + "grad_norm": 2.0638307211153673, + "language_loss": 0.74109274, + "learning_rate": 1.5102213634977117e-06, + "loss": 0.76177692, + "num_input_tokens_seen": 211965355, + "step": 9836, + "time_per_iteration": 2.696409225463867 + }, + { + "auxiliary_loss_clip": 0.01028982, + "auxiliary_loss_mlp": 0.0103166, + "balance_loss_clip": 1.02282715, + "balance_loss_mlp": 1.01975679, + "epoch": 0.5914324364948144, + "flos": 15697918771200.0, + "grad_norm": 2.168027077308982, + "language_loss": 0.81990826, + "learning_rate": 1.5098437698074841e-06, + "loss": 0.84051472, + "num_input_tokens_seen": 211982245, + "step": 9837, + "time_per_iteration": 2.716386318206787 + }, + { + "auxiliary_loss_clip": 0.01022228, + "auxiliary_loss_mlp": 0.01028938, + "balance_loss_clip": 1.02127314, + "balance_loss_mlp": 1.01620698, + "epoch": 0.5914925597474824, + "flos": 22747327367040.0, + "grad_norm": 1.7948298780289071, + "language_loss": 0.79759669, + "learning_rate": 1.5094661947040304e-06, + "loss": 0.81810832, + "num_input_tokens_seen": 212000250, + "step": 9838, + "time_per_iteration": 2.7604682445526123 + }, + { + "auxiliary_loss_clip": 0.01026441, + "auxiliary_loss_mlp": 0.01034594, + "balance_loss_clip": 1.02592862, + "balance_loss_mlp": 1.02300048, + "epoch": 0.5915526830001503, + "flos": 18292788161280.0, + "grad_norm": 1.808927906849192, + "language_loss": 0.69546801, + "learning_rate": 1.5090886382016673e-06, + "loss": 0.71607828, + "num_input_tokens_seen": 212017505, + "step": 9839, + "time_per_iteration": 2.975842237472534 + }, + { + "auxiliary_loss_clip": 0.01048777, + "auxiliary_loss_mlp": 0.01039448, + "balance_loss_clip": 1.02710283, + "balance_loss_mlp": 1.02796865, + "epoch": 0.5916128062528183, + "flos": 17019072910080.0, + "grad_norm": 2.0657479203925027, + "language_loss": 0.65220839, + "learning_rate": 1.5087111003147124e-06, + "loss": 0.67309058, + "num_input_tokens_seen": 212034595, + "step": 9840, + "time_per_iteration": 2.6315624713897705 + }, + { + "auxiliary_loss_clip": 0.01040271, + "auxiliary_loss_mlp": 0.01030234, + "balance_loss_clip": 1.02399731, + "balance_loss_mlp": 1.01864052, + "epoch": 0.5916729295054862, + "flos": 24754231031040.0, + "grad_norm": 1.803166779454891, + "language_loss": 0.81696343, + "learning_rate": 1.5083335810574813e-06, + "loss": 0.83766848, + "num_input_tokens_seen": 212055775, + "step": 9841, + "time_per_iteration": 2.7665657997131348 + }, + { + "auxiliary_loss_clip": 0.01045937, + "auxiliary_loss_mlp": 0.01026858, + "balance_loss_clip": 1.0261395, + "balance_loss_mlp": 1.01605725, + "epoch": 0.5917330527581542, + "flos": 15958199698560.0, + "grad_norm": 1.5406163116406753, + "language_loss": 0.69245821, + "learning_rate": 1.507956080444291e-06, + "loss": 0.7131862, + "num_input_tokens_seen": 212074000, + "step": 9842, + "time_per_iteration": 2.676011562347412 + }, + { + "auxiliary_loss_clip": 0.01039679, + "auxiliary_loss_mlp": 0.01030065, + "balance_loss_clip": 1.02298808, + "balance_loss_mlp": 1.01887739, + "epoch": 0.5917931760108222, + "flos": 23800730549760.0, + "grad_norm": 1.7620316449536089, + "language_loss": 0.82664716, + "learning_rate": 1.5075785984894549e-06, + "loss": 0.84734458, + "num_input_tokens_seen": 212091415, + "step": 9843, + "time_per_iteration": 2.6983799934387207 + }, + { + "auxiliary_loss_clip": 0.01032773, + "auxiliary_loss_mlp": 0.01028299, + "balance_loss_clip": 1.02026224, + "balance_loss_mlp": 1.01659894, + "epoch": 0.5918532992634902, + "flos": 23249609199360.0, + "grad_norm": 2.407886649508471, + "language_loss": 0.81825817, + "learning_rate": 1.5072011352072875e-06, + "loss": 0.83886892, + "num_input_tokens_seen": 212105255, + "step": 9844, + "time_per_iteration": 2.772111654281616 + }, + { + "auxiliary_loss_clip": 0.01019313, + "auxiliary_loss_mlp": 0.01025047, + "balance_loss_clip": 1.02556908, + "balance_loss_mlp": 1.01375175, + "epoch": 0.5919134225161581, + "flos": 19499853726720.0, + "grad_norm": 2.2128499004287314, + "language_loss": 0.73751247, + "learning_rate": 1.5068236906121032e-06, + "loss": 0.75795603, + "num_input_tokens_seen": 212122765, + "step": 9845, + "time_per_iteration": 2.7376325130462646 + }, + { + "auxiliary_loss_clip": 0.01028765, + "auxiliary_loss_mlp": 0.01029473, + "balance_loss_clip": 1.0226059, + "balance_loss_mlp": 1.01631224, + "epoch": 0.5919735457688261, + "flos": 38800940567040.0, + "grad_norm": 1.7238669298997975, + "language_loss": 0.64031339, + "learning_rate": 1.506446264718213e-06, + "loss": 0.66089582, + "num_input_tokens_seen": 212143960, + "step": 9846, + "time_per_iteration": 2.85542893409729 + }, + { + "auxiliary_loss_clip": 0.01031028, + "auxiliary_loss_mlp": 0.00747351, + "balance_loss_clip": 1.02474809, + "balance_loss_mlp": 1.00041318, + "epoch": 0.592033669021494, + "flos": 22163994495360.0, + "grad_norm": 1.7313243845511197, + "language_loss": 0.76173013, + "learning_rate": 1.506068857539931e-06, + "loss": 0.7795139, + "num_input_tokens_seen": 212162005, + "step": 9847, + "time_per_iteration": 2.6863603591918945 + }, + { + "auxiliary_loss_clip": 0.01039396, + "auxiliary_loss_mlp": 0.01028983, + "balance_loss_clip": 1.02379096, + "balance_loss_mlp": 1.01719952, + "epoch": 0.592093792274162, + "flos": 22710985781760.0, + "grad_norm": 1.5777463445062605, + "language_loss": 0.61948872, + "learning_rate": 1.5056914690915667e-06, + "loss": 0.6401726, + "num_input_tokens_seen": 212181635, + "step": 9848, + "time_per_iteration": 2.8986523151397705 + }, + { + "auxiliary_loss_clip": 0.01056658, + "auxiliary_loss_mlp": 0.01034602, + "balance_loss_clip": 1.02646208, + "balance_loss_mlp": 1.02376032, + "epoch": 0.59215391552683, + "flos": 22528954632960.0, + "grad_norm": 1.7185694092666084, + "language_loss": 0.75861919, + "learning_rate": 1.5053140993874312e-06, + "loss": 0.77953184, + "num_input_tokens_seen": 212201615, + "step": 9849, + "time_per_iteration": 2.831179141998291 + }, + { + "auxiliary_loss_clip": 0.01041197, + "auxiliary_loss_mlp": 0.01030982, + "balance_loss_clip": 1.02393675, + "balance_loss_mlp": 1.01925743, + "epoch": 0.592214038779498, + "flos": 24499013921280.0, + "grad_norm": 1.8185454710829418, + "language_loss": 0.75596428, + "learning_rate": 1.5049367484418353e-06, + "loss": 0.77668607, + "num_input_tokens_seen": 212219355, + "step": 9850, + "time_per_iteration": 2.719588041305542 + }, + { + "auxiliary_loss_clip": 0.01026494, + "auxiliary_loss_mlp": 0.01030562, + "balance_loss_clip": 1.02339637, + "balance_loss_mlp": 1.01883161, + "epoch": 0.592274162032166, + "flos": 21831353619840.0, + "grad_norm": 1.7268807844060814, + "language_loss": 0.7564823, + "learning_rate": 1.5045594162690868e-06, + "loss": 0.77705288, + "num_input_tokens_seen": 212236710, + "step": 9851, + "time_per_iteration": 2.738215208053589 + }, + { + "auxiliary_loss_clip": 0.01046121, + "auxiliary_loss_mlp": 0.01031214, + "balance_loss_clip": 1.02514851, + "balance_loss_mlp": 1.01998472, + "epoch": 0.5923342852848339, + "flos": 24608146417920.0, + "grad_norm": 1.7035099725371352, + "language_loss": 0.70441467, + "learning_rate": 1.5041821028834954e-06, + "loss": 0.72518802, + "num_input_tokens_seen": 212256195, + "step": 9852, + "time_per_iteration": 2.654876708984375 + }, + { + "auxiliary_loss_clip": 0.01049597, + "auxiliary_loss_mlp": 0.00747808, + "balance_loss_clip": 1.02716804, + "balance_loss_mlp": 1.00051284, + "epoch": 0.5923944085375019, + "flos": 19938143479680.0, + "grad_norm": 1.8255760296372927, + "language_loss": 0.80721426, + "learning_rate": 1.5038048082993685e-06, + "loss": 0.82518828, + "num_input_tokens_seen": 212274085, + "step": 9853, + "time_per_iteration": 2.6793153285980225 + }, + { + "auxiliary_loss_clip": 0.01041186, + "auxiliary_loss_mlp": 0.01025038, + "balance_loss_clip": 1.02385092, + "balance_loss_mlp": 1.01509047, + "epoch": 0.5924545317901698, + "flos": 28658510812800.0, + "grad_norm": 1.5496474790821895, + "language_loss": 0.67491591, + "learning_rate": 1.5034275325310124e-06, + "loss": 0.6955781, + "num_input_tokens_seen": 212295530, + "step": 9854, + "time_per_iteration": 2.6994752883911133 + }, + { + "auxiliary_loss_clip": 0.01025599, + "auxiliary_loss_mlp": 0.01027854, + "balance_loss_clip": 1.0237782, + "balance_loss_mlp": 1.01702988, + "epoch": 0.5925146550428378, + "flos": 19864885691520.0, + "grad_norm": 1.9203537999799816, + "language_loss": 0.89047748, + "learning_rate": 1.5030502755927344e-06, + "loss": 0.91101205, + "num_input_tokens_seen": 212313770, + "step": 9855, + "time_per_iteration": 2.6416854858398438 + }, + { + "auxiliary_loss_clip": 0.01046406, + "auxiliary_loss_mlp": 0.01025877, + "balance_loss_clip": 1.02576375, + "balance_loss_mlp": 1.01560152, + "epoch": 0.5925747782955058, + "flos": 15122989681920.0, + "grad_norm": 1.7262256888721985, + "language_loss": 0.86895287, + "learning_rate": 1.5026730374988397e-06, + "loss": 0.88967574, + "num_input_tokens_seen": 212331525, + "step": 9856, + "time_per_iteration": 2.567739725112915 + }, + { + "auxiliary_loss_clip": 0.01054578, + "auxiliary_loss_mlp": 0.01032543, + "balance_loss_clip": 1.02438354, + "balance_loss_mlp": 1.02211225, + "epoch": 0.5926349015481738, + "flos": 18405440190720.0, + "grad_norm": 2.2525240772207242, + "language_loss": 0.77648115, + "learning_rate": 1.5022958182636332e-06, + "loss": 0.79735243, + "num_input_tokens_seen": 212347295, + "step": 9857, + "time_per_iteration": 4.180936098098755 + }, + { + "auxiliary_loss_clip": 0.01019197, + "auxiliary_loss_mlp": 0.0104242, + "balance_loss_clip": 1.02210474, + "balance_loss_mlp": 1.02912784, + "epoch": 0.5926950248008417, + "flos": 23111138269440.0, + "grad_norm": 2.3126707914243716, + "language_loss": 0.64995468, + "learning_rate": 1.501918617901419e-06, + "loss": 0.67057079, + "num_input_tokens_seen": 212365750, + "step": 9858, + "time_per_iteration": 4.332481145858765 + }, + { + "auxiliary_loss_clip": 0.01052345, + "auxiliary_loss_mlp": 0.01029467, + "balance_loss_clip": 1.02471614, + "balance_loss_mlp": 1.01867819, + "epoch": 0.5927551480535097, + "flos": 28033916192640.0, + "grad_norm": 1.9268540005760824, + "language_loss": 0.76887763, + "learning_rate": 1.501541436426501e-06, + "loss": 0.78969574, + "num_input_tokens_seen": 212385300, + "step": 9859, + "time_per_iteration": 2.6130971908569336 + }, + { + "auxiliary_loss_clip": 0.01027674, + "auxiliary_loss_mlp": 0.00747659, + "balance_loss_clip": 1.02664566, + "balance_loss_mlp": 1.00039411, + "epoch": 0.5928152713061776, + "flos": 21798675221760.0, + "grad_norm": 2.343745103913518, + "language_loss": 0.75414395, + "learning_rate": 1.5011642738531818e-06, + "loss": 0.7718972, + "num_input_tokens_seen": 212402140, + "step": 9860, + "time_per_iteration": 2.6894867420196533 + }, + { + "auxiliary_loss_clip": 0.01035505, + "auxiliary_loss_mlp": 0.01029425, + "balance_loss_clip": 1.02647531, + "balance_loss_mlp": 1.01923287, + "epoch": 0.5928753945588456, + "flos": 24316839118080.0, + "grad_norm": 1.6730154135005268, + "language_loss": 0.7587049, + "learning_rate": 1.500787130195763e-06, + "loss": 0.77935421, + "num_input_tokens_seen": 212421790, + "step": 9861, + "time_per_iteration": 2.65132999420166 + }, + { + "auxiliary_loss_clip": 0.01032357, + "auxiliary_loss_mlp": 0.01024115, + "balance_loss_clip": 1.02370119, + "balance_loss_mlp": 1.01460195, + "epoch": 0.5929355178115137, + "flos": 26464619923200.0, + "grad_norm": 1.6727996417663715, + "language_loss": 0.70924926, + "learning_rate": 1.5004100054685465e-06, + "loss": 0.72981393, + "num_input_tokens_seen": 212442115, + "step": 9862, + "time_per_iteration": 2.7362406253814697 + }, + { + "auxiliary_loss_clip": 0.01010177, + "auxiliary_loss_mlp": 0.0102973, + "balance_loss_clip": 1.02088797, + "balance_loss_mlp": 1.01907265, + "epoch": 0.5929956410641816, + "flos": 24965995662720.0, + "grad_norm": 2.5547582124495527, + "language_loss": 0.77927023, + "learning_rate": 1.500032899685832e-06, + "loss": 0.79966927, + "num_input_tokens_seen": 212459535, + "step": 9863, + "time_per_iteration": 2.8470840454101562 + }, + { + "auxiliary_loss_clip": 0.01046506, + "auxiliary_loss_mlp": 0.01034811, + "balance_loss_clip": 1.02742076, + "balance_loss_mlp": 1.02294946, + "epoch": 0.5930557643168496, + "flos": 26208325405440.0, + "grad_norm": 2.233939281370258, + "language_loss": 0.70548987, + "learning_rate": 1.499655812861921e-06, + "loss": 0.7263031, + "num_input_tokens_seen": 212479385, + "step": 9864, + "time_per_iteration": 2.7142131328582764 + }, + { + "auxiliary_loss_clip": 0.01037629, + "auxiliary_loss_mlp": 0.01034046, + "balance_loss_clip": 1.02419984, + "balance_loss_mlp": 1.02204752, + "epoch": 0.5931158875695175, + "flos": 27854937699840.0, + "grad_norm": 1.4484234726817755, + "language_loss": 0.67673391, + "learning_rate": 1.4992787450111112e-06, + "loss": 0.69745064, + "num_input_tokens_seen": 212500060, + "step": 9865, + "time_per_iteration": 2.689770221710205 + }, + { + "auxiliary_loss_clip": 0.01047143, + "auxiliary_loss_mlp": 0.01031372, + "balance_loss_clip": 1.02444935, + "balance_loss_mlp": 1.01976085, + "epoch": 0.5931760108221855, + "flos": 15413650536960.0, + "grad_norm": 2.1473023313999264, + "language_loss": 0.7817896, + "learning_rate": 1.4989016961477015e-06, + "loss": 0.80257475, + "num_input_tokens_seen": 212518590, + "step": 9866, + "time_per_iteration": 4.2650251388549805 + }, + { + "auxiliary_loss_clip": 0.01043008, + "auxiliary_loss_mlp": 0.01026634, + "balance_loss_clip": 1.0249722, + "balance_loss_mlp": 1.01678777, + "epoch": 0.5932361340748534, + "flos": 30188520581760.0, + "grad_norm": 2.254396205046419, + "language_loss": 0.72055638, + "learning_rate": 1.4985246662859903e-06, + "loss": 0.74125278, + "num_input_tokens_seen": 212538190, + "step": 9867, + "time_per_iteration": 2.8246922492980957 + }, + { + "auxiliary_loss_clip": 0.01044151, + "auxiliary_loss_mlp": 0.01028529, + "balance_loss_clip": 1.02561867, + "balance_loss_mlp": 1.01699543, + "epoch": 0.5932962573275214, + "flos": 20157557708160.0, + "grad_norm": 2.842505680825702, + "language_loss": 0.66849512, + "learning_rate": 1.4981476554402732e-06, + "loss": 0.68922192, + "num_input_tokens_seen": 212557820, + "step": 9868, + "time_per_iteration": 2.6146180629730225 + }, + { + "auxiliary_loss_clip": 0.01009039, + "auxiliary_loss_mlp": 0.00747586, + "balance_loss_clip": 1.0228312, + "balance_loss_mlp": 1.00037825, + "epoch": 0.5933563805801894, + "flos": 25445906300160.0, + "grad_norm": 2.058620501475398, + "language_loss": 0.75188875, + "learning_rate": 1.4977706636248478e-06, + "loss": 0.76945508, + "num_input_tokens_seen": 212577645, + "step": 9869, + "time_per_iteration": 2.820688486099243 + }, + { + "auxiliary_loss_clip": 0.01017631, + "auxiliary_loss_mlp": 0.01036775, + "balance_loss_clip": 1.02524841, + "balance_loss_mlp": 1.02487183, + "epoch": 0.5934165038328574, + "flos": 59995740337920.0, + "grad_norm": 1.840013560305798, + "language_loss": 0.74365562, + "learning_rate": 1.4973936908540091e-06, + "loss": 0.76419967, + "num_input_tokens_seen": 212603430, + "step": 9870, + "time_per_iteration": 3.080925941467285 + }, + { + "auxiliary_loss_clip": 0.01019647, + "auxiliary_loss_mlp": 0.01026904, + "balance_loss_clip": 1.022964, + "balance_loss_mlp": 1.01572227, + "epoch": 0.5934766270855253, + "flos": 24420548661120.0, + "grad_norm": 2.1654463754912823, + "language_loss": 0.71985757, + "learning_rate": 1.4970167371420517e-06, + "loss": 0.74032307, + "num_input_tokens_seen": 212620730, + "step": 9871, + "time_per_iteration": 2.7240066528320312 + }, + { + "auxiliary_loss_clip": 0.01032426, + "auxiliary_loss_mlp": 0.0103124, + "balance_loss_clip": 1.02473938, + "balance_loss_mlp": 1.01971841, + "epoch": 0.5935367503381933, + "flos": 23513158264320.0, + "grad_norm": 2.07340867542387, + "language_loss": 0.74231374, + "learning_rate": 1.496639802503271e-06, + "loss": 0.7629503, + "num_input_tokens_seen": 212639745, + "step": 9872, + "time_per_iteration": 2.6838133335113525 + }, + { + "auxiliary_loss_clip": 0.0105808, + "auxiliary_loss_mlp": 0.01032636, + "balance_loss_clip": 1.02616918, + "balance_loss_mlp": 1.02089405, + "epoch": 0.5935968735908612, + "flos": 18948337326720.0, + "grad_norm": 2.103918122514628, + "language_loss": 0.79034793, + "learning_rate": 1.4962628869519583e-06, + "loss": 0.8112551, + "num_input_tokens_seen": 212655915, + "step": 9873, + "time_per_iteration": 2.5623793601989746 + }, + { + "auxiliary_loss_clip": 0.01053739, + "auxiliary_loss_mlp": 0.01029613, + "balance_loss_clip": 1.02439737, + "balance_loss_mlp": 1.01864004, + "epoch": 0.5936569968435292, + "flos": 25483433034240.0, + "grad_norm": 1.5602430193098074, + "language_loss": 0.84885061, + "learning_rate": 1.4958859905024078e-06, + "loss": 0.86968416, + "num_input_tokens_seen": 212676115, + "step": 9874, + "time_per_iteration": 2.6771559715270996 + }, + { + "auxiliary_loss_clip": 0.0099085, + "auxiliary_loss_mlp": 0.01000828, + "balance_loss_clip": 1.00395143, + "balance_loss_mlp": 0.99963027, + "epoch": 0.5937171200961973, + "flos": 66378361789440.0, + "grad_norm": 0.7094948807273118, + "language_loss": 0.60023403, + "learning_rate": 1.4955091131689115e-06, + "loss": 0.6201508, + "num_input_tokens_seen": 212737560, + "step": 9875, + "time_per_iteration": 3.2863292694091797 + }, + { + "auxiliary_loss_clip": 0.01038811, + "auxiliary_loss_mlp": 0.01027127, + "balance_loss_clip": 1.02260351, + "balance_loss_mlp": 1.01450229, + "epoch": 0.5937772433488652, + "flos": 14903467712640.0, + "grad_norm": 1.9173229963695209, + "language_loss": 0.77567625, + "learning_rate": 1.4951322549657594e-06, + "loss": 0.79633564, + "num_input_tokens_seen": 212755365, + "step": 9876, + "time_per_iteration": 4.196768522262573 + }, + { + "auxiliary_loss_clip": 0.01045927, + "auxiliary_loss_mlp": 0.0102376, + "balance_loss_clip": 1.0210638, + "balance_loss_mlp": 1.01368666, + "epoch": 0.5938373666015332, + "flos": 22561489376640.0, + "grad_norm": 1.5163374563136804, + "language_loss": 0.75808311, + "learning_rate": 1.494755415907243e-06, + "loss": 0.77877998, + "num_input_tokens_seen": 212773875, + "step": 9877, + "time_per_iteration": 2.587681531906128 + }, + { + "auxiliary_loss_clip": 0.0105405, + "auxiliary_loss_mlp": 0.01028311, + "balance_loss_clip": 1.02314663, + "balance_loss_mlp": 1.01727211, + "epoch": 0.5938974898542011, + "flos": 18440883936000.0, + "grad_norm": 2.4766252628245544, + "language_loss": 0.81320274, + "learning_rate": 1.4943785960076522e-06, + "loss": 0.83402634, + "num_input_tokens_seen": 212790590, + "step": 9878, + "time_per_iteration": 2.555703639984131 + }, + { + "auxiliary_loss_clip": 0.01036812, + "auxiliary_loss_mlp": 0.00747633, + "balance_loss_clip": 1.02315342, + "balance_loss_mlp": 1.00044537, + "epoch": 0.5939576131068691, + "flos": 45586728270720.0, + "grad_norm": 1.7308535447069158, + "language_loss": 0.7181291, + "learning_rate": 1.4940017952812754e-06, + "loss": 0.73597354, + "num_input_tokens_seen": 212812265, + "step": 9879, + "time_per_iteration": 2.832826852798462 + }, + { + "auxiliary_loss_clip": 0.01055308, + "auxiliary_loss_mlp": 0.01030349, + "balance_loss_clip": 1.02584648, + "balance_loss_mlp": 1.01916718, + "epoch": 0.594017736359537, + "flos": 23587708942080.0, + "grad_norm": 1.4555285699477438, + "language_loss": 0.57141489, + "learning_rate": 1.493625013742401e-06, + "loss": 0.59227145, + "num_input_tokens_seen": 212831915, + "step": 9880, + "time_per_iteration": 2.6254711151123047 + }, + { + "auxiliary_loss_clip": 0.01054486, + "auxiliary_loss_mlp": 0.01032444, + "balance_loss_clip": 1.02443886, + "balance_loss_mlp": 1.02092218, + "epoch": 0.594077859612205, + "flos": 29457235589760.0, + "grad_norm": 2.016762133832519, + "language_loss": 0.77519488, + "learning_rate": 1.4932482514053177e-06, + "loss": 0.79606414, + "num_input_tokens_seen": 212851350, + "step": 9881, + "time_per_iteration": 2.632267713546753 + }, + { + "auxiliary_loss_clip": 0.01051751, + "auxiliary_loss_mlp": 0.01024714, + "balance_loss_clip": 1.02343464, + "balance_loss_mlp": 1.01374125, + "epoch": 0.594137982864873, + "flos": 16800089644800.0, + "grad_norm": 9.107236445569844, + "language_loss": 0.82806993, + "learning_rate": 1.4928715082843112e-06, + "loss": 0.84883457, + "num_input_tokens_seen": 212867995, + "step": 9882, + "time_per_iteration": 2.5896005630493164 + }, + { + "auxiliary_loss_clip": 0.01055822, + "auxiliary_loss_mlp": 0.01031933, + "balance_loss_clip": 1.02624965, + "balance_loss_mlp": 1.02146697, + "epoch": 0.594198106117541, + "flos": 12750263953920.0, + "grad_norm": 3.908803671767259, + "language_loss": 0.79380113, + "learning_rate": 1.492494784393667e-06, + "loss": 0.81467867, + "num_input_tokens_seen": 212885220, + "step": 9883, + "time_per_iteration": 2.5650391578674316 + }, + { + "auxiliary_loss_clip": 0.0104114, + "auxiliary_loss_mlp": 0.00747847, + "balance_loss_clip": 1.02750909, + "balance_loss_mlp": 1.00048649, + "epoch": 0.5942582293702089, + "flos": 20996538652800.0, + "grad_norm": 2.1749069290134435, + "language_loss": 0.74341917, + "learning_rate": 1.4921180797476725e-06, + "loss": 0.76130903, + "num_input_tokens_seen": 212903195, + "step": 9884, + "time_per_iteration": 2.695417642593384 + }, + { + "auxiliary_loss_clip": 0.01067275, + "auxiliary_loss_mlp": 0.01026803, + "balance_loss_clip": 1.02726293, + "balance_loss_mlp": 1.01597881, + "epoch": 0.5943183526228769, + "flos": 28291431772800.0, + "grad_norm": 2.3512894836910805, + "language_loss": 0.66410345, + "learning_rate": 1.4917413943606106e-06, + "loss": 0.68504423, + "num_input_tokens_seen": 212923340, + "step": 9885, + "time_per_iteration": 2.676790475845337 + }, + { + "auxiliary_loss_clip": 0.01043764, + "auxiliary_loss_mlp": 0.01033614, + "balance_loss_clip": 1.02546811, + "balance_loss_mlp": 1.02225327, + "epoch": 0.5943784758755448, + "flos": 26614619118720.0, + "grad_norm": 2.8115813318947467, + "language_loss": 0.77051282, + "learning_rate": 1.4913647282467667e-06, + "loss": 0.79128659, + "num_input_tokens_seen": 212942755, + "step": 9886, + "time_per_iteration": 2.6952030658721924 + }, + { + "auxiliary_loss_clip": 0.01000574, + "auxiliary_loss_mlp": 0.01005864, + "balance_loss_clip": 1.00429869, + "balance_loss_mlp": 1.00462401, + "epoch": 0.5944385991282128, + "flos": 64190935347840.0, + "grad_norm": 0.8381588252443928, + "language_loss": 0.64531541, + "learning_rate": 1.490988081420423e-06, + "loss": 0.66537976, + "num_input_tokens_seen": 212999355, + "step": 9887, + "time_per_iteration": 3.1299984455108643 + }, + { + "auxiliary_loss_clip": 0.01054239, + "auxiliary_loss_mlp": 0.01026681, + "balance_loss_clip": 1.02528477, + "balance_loss_mlp": 1.01573765, + "epoch": 0.5944987223808808, + "flos": 19571998193280.0, + "grad_norm": 1.650348302985403, + "language_loss": 0.69056547, + "learning_rate": 1.4906114538958615e-06, + "loss": 0.71137464, + "num_input_tokens_seen": 213018570, + "step": 9888, + "time_per_iteration": 2.7181193828582764 + }, + { + "auxiliary_loss_clip": 0.01034056, + "auxiliary_loss_mlp": 0.01027953, + "balance_loss_clip": 1.02166307, + "balance_loss_mlp": 1.01624656, + "epoch": 0.5945588456335488, + "flos": 26177586341760.0, + "grad_norm": 1.5289843385629904, + "language_loss": 0.79845572, + "learning_rate": 1.490234845687366e-06, + "loss": 0.81907582, + "num_input_tokens_seen": 213037735, + "step": 9889, + "time_per_iteration": 2.7176952362060547 + }, + { + "auxiliary_loss_clip": 0.01032594, + "auxiliary_loss_mlp": 0.01025058, + "balance_loss_clip": 1.02360821, + "balance_loss_mlp": 1.01469326, + "epoch": 0.5946189688862168, + "flos": 20446494710400.0, + "grad_norm": 1.558497171434514, + "language_loss": 0.70798731, + "learning_rate": 1.4898582568092154e-06, + "loss": 0.72856379, + "num_input_tokens_seen": 213057160, + "step": 9890, + "time_per_iteration": 2.7327840328216553 + }, + { + "auxiliary_loss_clip": 0.01037262, + "auxiliary_loss_mlp": 0.01028807, + "balance_loss_clip": 1.0259068, + "balance_loss_mlp": 1.01678503, + "epoch": 0.5946790921388847, + "flos": 13437521850240.0, + "grad_norm": 1.895273218003934, + "language_loss": 0.69249797, + "learning_rate": 1.489481687275691e-06, + "loss": 0.71315873, + "num_input_tokens_seen": 213073630, + "step": 9891, + "time_per_iteration": 2.686281681060791 + }, + { + "auxiliary_loss_clip": 0.01052614, + "auxiliary_loss_mlp": 0.01031701, + "balance_loss_clip": 1.02413893, + "balance_loss_mlp": 1.02132344, + "epoch": 0.5947392153915527, + "flos": 20412272027520.0, + "grad_norm": 1.7501407371945725, + "language_loss": 0.53739911, + "learning_rate": 1.4891051371010726e-06, + "loss": 0.55824226, + "num_input_tokens_seen": 213092450, + "step": 9892, + "time_per_iteration": 2.7351279258728027 + }, + { + "auxiliary_loss_clip": 0.00980945, + "auxiliary_loss_mlp": 0.01002346, + "balance_loss_clip": 1.00373387, + "balance_loss_mlp": 1.00138044, + "epoch": 0.5947993386442206, + "flos": 65619138994560.0, + "grad_norm": 0.6663987571750756, + "language_loss": 0.54535437, + "learning_rate": 1.4887286062996375e-06, + "loss": 0.56518728, + "num_input_tokens_seen": 213155465, + "step": 9893, + "time_per_iteration": 3.3358747959136963 + }, + { + "auxiliary_loss_clip": 0.01034942, + "auxiliary_loss_mlp": 0.01028333, + "balance_loss_clip": 1.02636349, + "balance_loss_mlp": 1.01809287, + "epoch": 0.5948594618968887, + "flos": 23183103168000.0, + "grad_norm": 1.5900509764050015, + "language_loss": 0.74829793, + "learning_rate": 1.4883520948856658e-06, + "loss": 0.76893073, + "num_input_tokens_seen": 213174875, + "step": 9894, + "time_per_iteration": 2.756451368331909 + }, + { + "auxiliary_loss_clip": 0.01034011, + "auxiliary_loss_mlp": 0.01027166, + "balance_loss_clip": 1.02476156, + "balance_loss_mlp": 1.01663423, + "epoch": 0.5949195851495566, + "flos": 13626771632640.0, + "grad_norm": 1.6413803389734423, + "language_loss": 0.77925146, + "learning_rate": 1.487975602873434e-06, + "loss": 0.79986328, + "num_input_tokens_seen": 213192695, + "step": 9895, + "time_per_iteration": 2.8961780071258545 + }, + { + "auxiliary_loss_clip": 0.01012456, + "auxiliary_loss_mlp": 0.0103554, + "balance_loss_clip": 1.02237833, + "balance_loss_mlp": 1.02380967, + "epoch": 0.5949797084022246, + "flos": 19751012599680.0, + "grad_norm": 1.6446841252180768, + "language_loss": 0.79217815, + "learning_rate": 1.4875991302772182e-06, + "loss": 0.81265813, + "num_input_tokens_seen": 213211195, + "step": 9896, + "time_per_iteration": 2.888822078704834 + }, + { + "auxiliary_loss_clip": 0.01054577, + "auxiliary_loss_mlp": 0.01029486, + "balance_loss_clip": 1.02487922, + "balance_loss_mlp": 1.01890063, + "epoch": 0.5950398316548925, + "flos": 25773878407680.0, + "grad_norm": 1.49739301487839, + "language_loss": 0.83600116, + "learning_rate": 1.4872226771112954e-06, + "loss": 0.8568418, + "num_input_tokens_seen": 213231975, + "step": 9897, + "time_per_iteration": 2.6600406169891357 + }, + { + "auxiliary_loss_clip": 0.01037852, + "auxiliary_loss_mlp": 0.010314, + "balance_loss_clip": 1.02461314, + "balance_loss_mlp": 1.02073622, + "epoch": 0.5950999549075605, + "flos": 23039029716480.0, + "grad_norm": 1.7236059507346495, + "language_loss": 0.70911312, + "learning_rate": 1.486846243389939e-06, + "loss": 0.72980571, + "num_input_tokens_seen": 213249760, + "step": 9898, + "time_per_iteration": 2.6362502574920654 + }, + { + "auxiliary_loss_clip": 0.01050503, + "auxiliary_loss_mlp": 0.01039721, + "balance_loss_clip": 1.02272749, + "balance_loss_mlp": 1.02586341, + "epoch": 0.5951600781602284, + "flos": 32446367637120.0, + "grad_norm": 2.2213355116413522, + "language_loss": 0.63722765, + "learning_rate": 1.4864698291274251e-06, + "loss": 0.65812987, + "num_input_tokens_seen": 213269890, + "step": 9899, + "time_per_iteration": 2.6362407207489014 + }, + { + "auxiliary_loss_clip": 0.01063922, + "auxiliary_loss_mlp": 0.01026174, + "balance_loss_clip": 1.02552438, + "balance_loss_mlp": 1.0164938, + "epoch": 0.5952202014128964, + "flos": 23800874204160.0, + "grad_norm": 1.6105802210788447, + "language_loss": 0.72120631, + "learning_rate": 1.4860934343380267e-06, + "loss": 0.74210727, + "num_input_tokens_seen": 213289400, + "step": 9900, + "time_per_iteration": 2.62854266166687 + }, + { + "auxiliary_loss_clip": 0.01061882, + "auxiliary_loss_mlp": 0.01028627, + "balance_loss_clip": 1.02549553, + "balance_loss_mlp": 1.01799393, + "epoch": 0.5952803246655644, + "flos": 22492182084480.0, + "grad_norm": 1.779020213395415, + "language_loss": 0.84477508, + "learning_rate": 1.4857170590360169e-06, + "loss": 0.86568022, + "num_input_tokens_seen": 213308040, + "step": 9901, + "time_per_iteration": 2.5798885822296143 + }, + { + "auxiliary_loss_clip": 0.00961314, + "auxiliary_loss_mlp": 0.01003796, + "balance_loss_clip": 1.00564861, + "balance_loss_mlp": 1.00269926, + "epoch": 0.5953404479182324, + "flos": 51234688851840.0, + "grad_norm": 0.8060636470023782, + "language_loss": 0.58213127, + "learning_rate": 1.4853407032356674e-06, + "loss": 0.60178232, + "num_input_tokens_seen": 213358585, + "step": 9902, + "time_per_iteration": 3.1328516006469727 + }, + { + "auxiliary_loss_clip": 0.01011579, + "auxiliary_loss_mlp": 0.01032381, + "balance_loss_clip": 1.02381074, + "balance_loss_mlp": 1.02069879, + "epoch": 0.5954005711709004, + "flos": 23112682554240.0, + "grad_norm": 1.5804594272945331, + "language_loss": 0.76917952, + "learning_rate": 1.4849643669512503e-06, + "loss": 0.78961915, + "num_input_tokens_seen": 213379585, + "step": 9903, + "time_per_iteration": 2.784208059310913 + }, + { + "auxiliary_loss_clip": 0.01034709, + "auxiliary_loss_mlp": 0.0102967, + "balance_loss_clip": 1.02577281, + "balance_loss_mlp": 1.01962042, + "epoch": 0.5954606944235683, + "flos": 35954732736000.0, + "grad_norm": 1.6395691925954112, + "language_loss": 0.77631497, + "learning_rate": 1.4845880501970362e-06, + "loss": 0.79695868, + "num_input_tokens_seen": 213401465, + "step": 9904, + "time_per_iteration": 2.7923471927642822 + }, + { + "auxiliary_loss_clip": 0.0105892, + "auxiliary_loss_mlp": 0.01033957, + "balance_loss_clip": 1.02821112, + "balance_loss_mlp": 1.02291286, + "epoch": 0.5955208176762363, + "flos": 30443665864320.0, + "grad_norm": 1.4673314737402647, + "language_loss": 0.72641259, + "learning_rate": 1.4842117529872942e-06, + "loss": 0.74734139, + "num_input_tokens_seen": 213422720, + "step": 9905, + "time_per_iteration": 5.757880210876465 + }, + { + "auxiliary_loss_clip": 0.01053153, + "auxiliary_loss_mlp": 0.01023991, + "balance_loss_clip": 1.02390122, + "balance_loss_mlp": 1.01299965, + "epoch": 0.5955809409289042, + "flos": 17640112083840.0, + "grad_norm": 2.5828714494446734, + "language_loss": 0.69908333, + "learning_rate": 1.483835475336295e-06, + "loss": 0.71985477, + "num_input_tokens_seen": 213439480, + "step": 9906, + "time_per_iteration": 2.5849156379699707 + }, + { + "auxiliary_loss_clip": 0.01056828, + "auxiliary_loss_mlp": 0.01031064, + "balance_loss_clip": 1.02726388, + "balance_loss_mlp": 1.0201447, + "epoch": 0.5956410641815723, + "flos": 24279887001600.0, + "grad_norm": 1.76832645163328, + "language_loss": 0.75322104, + "learning_rate": 1.4834592172583057e-06, + "loss": 0.77409995, + "num_input_tokens_seen": 213458895, + "step": 9907, + "time_per_iteration": 2.6311309337615967 + }, + { + "auxiliary_loss_clip": 0.01039801, + "auxiliary_loss_mlp": 0.01027909, + "balance_loss_clip": 1.02376485, + "balance_loss_mlp": 1.01744235, + "epoch": 0.5957011874342402, + "flos": 35734277013120.0, + "grad_norm": 1.522231004111879, + "language_loss": 0.67227411, + "learning_rate": 1.483082978767595e-06, + "loss": 0.6929512, + "num_input_tokens_seen": 213481730, + "step": 9908, + "time_per_iteration": 2.7745046615600586 + }, + { + "auxiliary_loss_clip": 0.00989551, + "auxiliary_loss_mlp": 0.01033307, + "balance_loss_clip": 1.02009773, + "balance_loss_mlp": 1.02217925, + "epoch": 0.5957613106869082, + "flos": 21245004005760.0, + "grad_norm": 2.0895154687431816, + "language_loss": 0.76142704, + "learning_rate": 1.4827067598784298e-06, + "loss": 0.78165567, + "num_input_tokens_seen": 213497225, + "step": 9909, + "time_per_iteration": 2.827346086502075 + }, + { + "auxiliary_loss_clip": 0.01008181, + "auxiliary_loss_mlp": 0.01001239, + "balance_loss_clip": 1.00176466, + "balance_loss_mlp": 1.00010109, + "epoch": 0.5958214339395761, + "flos": 65940969876480.0, + "grad_norm": 0.9288261460638714, + "language_loss": 0.73407269, + "learning_rate": 1.4823305606050753e-06, + "loss": 0.75416684, + "num_input_tokens_seen": 213556890, + "step": 9910, + "time_per_iteration": 3.242121458053589 + }, + { + "auxiliary_loss_clip": 0.01045274, + "auxiliary_loss_mlp": 0.01028773, + "balance_loss_clip": 1.02575886, + "balance_loss_mlp": 1.01810408, + "epoch": 0.5958815571922441, + "flos": 23218690567680.0, + "grad_norm": 1.7740089391726566, + "language_loss": 0.69744045, + "learning_rate": 1.481954380961799e-06, + "loss": 0.71818095, + "num_input_tokens_seen": 213575800, + "step": 9911, + "time_per_iteration": 2.6564364433288574 + }, + { + "auxiliary_loss_clip": 0.01057827, + "auxiliary_loss_mlp": 0.01033747, + "balance_loss_clip": 1.02545238, + "balance_loss_mlp": 1.02196908, + "epoch": 0.595941680444912, + "flos": 16538623568640.0, + "grad_norm": 3.2586029871426074, + "language_loss": 0.66146648, + "learning_rate": 1.4815782209628631e-06, + "loss": 0.68238223, + "num_input_tokens_seen": 213592740, + "step": 9912, + "time_per_iteration": 2.6359777450561523 + }, + { + "auxiliary_loss_clip": 0.01035367, + "auxiliary_loss_mlp": 0.01033021, + "balance_loss_clip": 1.02514648, + "balance_loss_mlp": 1.02166653, + "epoch": 0.59600180369758, + "flos": 27818883423360.0, + "grad_norm": 2.2461506451778197, + "language_loss": 0.73572814, + "learning_rate": 1.4812020806225337e-06, + "loss": 0.75641197, + "num_input_tokens_seen": 213611970, + "step": 9913, + "time_per_iteration": 2.7916603088378906 + }, + { + "auxiliary_loss_clip": 0.01030179, + "auxiliary_loss_mlp": 0.00747575, + "balance_loss_clip": 1.02354157, + "balance_loss_mlp": 1.00044262, + "epoch": 0.596061926950248, + "flos": 29491566013440.0, + "grad_norm": 1.8198764002430672, + "language_loss": 0.79719114, + "learning_rate": 1.4808259599550738e-06, + "loss": 0.81496871, + "num_input_tokens_seen": 213632230, + "step": 9914, + "time_per_iteration": 4.31429648399353 + }, + { + "auxiliary_loss_clip": 0.01027326, + "auxiliary_loss_mlp": 0.01027254, + "balance_loss_clip": 1.02236688, + "balance_loss_mlp": 1.01693606, + "epoch": 0.596122050202916, + "flos": 16836790366080.0, + "grad_norm": 2.2162712745506377, + "language_loss": 0.67396986, + "learning_rate": 1.4804498589747448e-06, + "loss": 0.69451565, + "num_input_tokens_seen": 213649645, + "step": 9915, + "time_per_iteration": 2.6639156341552734 + }, + { + "auxiliary_loss_clip": 0.01044525, + "auxiliary_loss_mlp": 0.01027412, + "balance_loss_clip": 1.02572334, + "balance_loss_mlp": 1.01676047, + "epoch": 0.596182173455584, + "flos": 20996646393600.0, + "grad_norm": 1.4806974228818175, + "language_loss": 0.78691053, + "learning_rate": 1.4800737776958095e-06, + "loss": 0.80762994, + "num_input_tokens_seen": 213668850, + "step": 9916, + "time_per_iteration": 2.64717435836792 + }, + { + "auxiliary_loss_clip": 0.01038307, + "auxiliary_loss_mlp": 0.01028269, + "balance_loss_clip": 1.02176428, + "balance_loss_mlp": 1.01748657, + "epoch": 0.5962422967082519, + "flos": 16065680169600.0, + "grad_norm": 1.8565203034985673, + "language_loss": 0.82893586, + "learning_rate": 1.4796977161325286e-06, + "loss": 0.84960163, + "num_input_tokens_seen": 213685695, + "step": 9917, + "time_per_iteration": 2.76975679397583 + }, + { + "auxiliary_loss_clip": 0.01035641, + "auxiliary_loss_mlp": 0.01032106, + "balance_loss_clip": 1.02358639, + "balance_loss_mlp": 1.0218842, + "epoch": 0.5963024199609199, + "flos": 12166966995840.0, + "grad_norm": 2.1372740372890693, + "language_loss": 0.7750268, + "learning_rate": 1.4793216742991625e-06, + "loss": 0.79570425, + "num_input_tokens_seen": 213703515, + "step": 9918, + "time_per_iteration": 2.673725128173828 + }, + { + "auxiliary_loss_clip": 0.01056292, + "auxiliary_loss_mlp": 0.01031186, + "balance_loss_clip": 1.02701569, + "balance_loss_mlp": 1.02068329, + "epoch": 0.5963625432135878, + "flos": 28074280101120.0, + "grad_norm": 1.4581059980128879, + "language_loss": 0.78792864, + "learning_rate": 1.4789456522099707e-06, + "loss": 0.80880344, + "num_input_tokens_seen": 213724170, + "step": 9919, + "time_per_iteration": 2.636338710784912 + }, + { + "auxiliary_loss_clip": 0.01042931, + "auxiliary_loss_mlp": 0.01030003, + "balance_loss_clip": 1.02468944, + "balance_loss_mlp": 1.01902401, + "epoch": 0.5964226664662559, + "flos": 19860324664320.0, + "grad_norm": 1.9012312614075741, + "language_loss": 0.77525657, + "learning_rate": 1.4785696498792122e-06, + "loss": 0.795986, + "num_input_tokens_seen": 213740620, + "step": 9920, + "time_per_iteration": 2.8260655403137207 + }, + { + "auxiliary_loss_clip": 0.01052593, + "auxiliary_loss_mlp": 0.0103453, + "balance_loss_clip": 1.02731502, + "balance_loss_mlp": 1.02257311, + "epoch": 0.5964827897189238, + "flos": 12932618325120.0, + "grad_norm": 2.30201419837006, + "language_loss": 0.82557786, + "learning_rate": 1.4781936673211446e-06, + "loss": 0.84644914, + "num_input_tokens_seen": 213755390, + "step": 9921, + "time_per_iteration": 2.7390146255493164 + }, + { + "auxiliary_loss_clip": 0.01047851, + "auxiliary_loss_mlp": 0.01026997, + "balance_loss_clip": 1.02353597, + "balance_loss_mlp": 1.01571369, + "epoch": 0.5965429129715918, + "flos": 18150797698560.0, + "grad_norm": 3.7976279496487426, + "language_loss": 0.80993468, + "learning_rate": 1.4778177045500252e-06, + "loss": 0.83068317, + "num_input_tokens_seen": 213773225, + "step": 9922, + "time_per_iteration": 2.6784346103668213 + }, + { + "auxiliary_loss_clip": 0.01052268, + "auxiliary_loss_mlp": 0.00747513, + "balance_loss_clip": 1.02312374, + "balance_loss_mlp": 1.00038385, + "epoch": 0.5966030362242597, + "flos": 21763231476480.0, + "grad_norm": 1.8810362669664702, + "language_loss": 0.77063477, + "learning_rate": 1.477441761580111e-06, + "loss": 0.78863263, + "num_input_tokens_seen": 213791860, + "step": 9923, + "time_per_iteration": 4.516089200973511 + }, + { + "auxiliary_loss_clip": 0.0104836, + "auxiliary_loss_mlp": 0.01030098, + "balance_loss_clip": 1.02540255, + "balance_loss_mlp": 1.01773, + "epoch": 0.5966631594769277, + "flos": 18807208790400.0, + "grad_norm": 1.9450992043183444, + "language_loss": 0.75999242, + "learning_rate": 1.4770658384256573e-06, + "loss": 0.78077698, + "num_input_tokens_seen": 213809455, + "step": 9924, + "time_per_iteration": 2.743623971939087 + }, + { + "auxiliary_loss_clip": 0.01045785, + "auxiliary_loss_mlp": 0.01032441, + "balance_loss_clip": 1.02314878, + "balance_loss_mlp": 1.02028728, + "epoch": 0.5967232827295956, + "flos": 14064163545600.0, + "grad_norm": 1.680160529314686, + "language_loss": 0.66391563, + "learning_rate": 1.4766899351009204e-06, + "loss": 0.68469787, + "num_input_tokens_seen": 213826615, + "step": 9925, + "time_per_iteration": 2.7062556743621826 + }, + { + "auxiliary_loss_clip": 0.01038965, + "auxiliary_loss_mlp": 0.01033998, + "balance_loss_clip": 1.02614927, + "balance_loss_mlp": 1.02233386, + "epoch": 0.5967834059822636, + "flos": 17238235743360.0, + "grad_norm": 2.3795874552194154, + "language_loss": 0.71863472, + "learning_rate": 1.4763140516201528e-06, + "loss": 0.73936439, + "num_input_tokens_seen": 213844495, + "step": 9926, + "time_per_iteration": 2.6364657878875732 + }, + { + "auxiliary_loss_clip": 0.01016912, + "auxiliary_loss_mlp": 0.00747586, + "balance_loss_clip": 1.02145767, + "balance_loss_mlp": 1.00040174, + "epoch": 0.5968435292349316, + "flos": 42520244284800.0, + "grad_norm": 1.7100563792480634, + "language_loss": 0.70287341, + "learning_rate": 1.4759381879976088e-06, + "loss": 0.72051835, + "num_input_tokens_seen": 213869125, + "step": 9927, + "time_per_iteration": 2.9494128227233887 + }, + { + "auxiliary_loss_clip": 0.01026419, + "auxiliary_loss_mlp": 0.01026232, + "balance_loss_clip": 1.02484584, + "balance_loss_mlp": 1.01450801, + "epoch": 0.5969036524875996, + "flos": 37630898945280.0, + "grad_norm": 1.564742033308709, + "language_loss": 0.63671541, + "learning_rate": 1.4755623442475415e-06, + "loss": 0.65724194, + "num_input_tokens_seen": 213891115, + "step": 9928, + "time_per_iteration": 2.9746270179748535 + }, + { + "auxiliary_loss_clip": 0.0106065, + "auxiliary_loss_mlp": 0.01030213, + "balance_loss_clip": 1.02327049, + "balance_loss_mlp": 1.02016377, + "epoch": 0.5969637757402676, + "flos": 23148377694720.0, + "grad_norm": 2.0147771381392756, + "language_loss": 0.69620764, + "learning_rate": 1.4751865203842022e-06, + "loss": 0.7171163, + "num_input_tokens_seen": 213911925, + "step": 9929, + "time_per_iteration": 2.653973340988159 + }, + { + "auxiliary_loss_clip": 0.01022328, + "auxiliary_loss_mlp": 0.01029614, + "balance_loss_clip": 1.02515244, + "balance_loss_mlp": 1.01957059, + "epoch": 0.5970238989929355, + "flos": 24020934877440.0, + "grad_norm": 1.9152722258523487, + "language_loss": 0.76529205, + "learning_rate": 1.4748107164218431e-06, + "loss": 0.78581148, + "num_input_tokens_seen": 213930715, + "step": 9930, + "time_per_iteration": 2.7982585430145264 + }, + { + "auxiliary_loss_clip": 0.01050256, + "auxiliary_loss_mlp": 0.01031199, + "balance_loss_clip": 1.02931523, + "balance_loss_mlp": 1.01897407, + "epoch": 0.5970840222456035, + "flos": 19426883247360.0, + "grad_norm": 1.8022305812643544, + "language_loss": 0.68485117, + "learning_rate": 1.4744349323747146e-06, + "loss": 0.70566571, + "num_input_tokens_seen": 213950015, + "step": 9931, + "time_per_iteration": 2.6704471111297607 + }, + { + "auxiliary_loss_clip": 0.01001596, + "auxiliary_loss_mlp": 0.01001766, + "balance_loss_clip": 1.00429404, + "balance_loss_mlp": 1.00058603, + "epoch": 0.5971441454982714, + "flos": 62976615235200.0, + "grad_norm": 3.1536016490684835, + "language_loss": 0.64226156, + "learning_rate": 1.474059168257065e-06, + "loss": 0.66229516, + "num_input_tokens_seen": 214003330, + "step": 9932, + "time_per_iteration": 3.1068167686462402 + }, + { + "auxiliary_loss_clip": 0.01031738, + "auxiliary_loss_mlp": 0.01027446, + "balance_loss_clip": 1.02335858, + "balance_loss_mlp": 1.01666963, + "epoch": 0.5972042687509395, + "flos": 20266223328000.0, + "grad_norm": 1.809072713798956, + "language_loss": 0.743909, + "learning_rate": 1.4736834240831454e-06, + "loss": 0.76450086, + "num_input_tokens_seen": 214021680, + "step": 9933, + "time_per_iteration": 2.6647229194641113 + }, + { + "auxiliary_loss_clip": 0.01003533, + "auxiliary_loss_mlp": 0.01003353, + "balance_loss_clip": 1.00679755, + "balance_loss_mlp": 1.0020119, + "epoch": 0.5972643920036074, + "flos": 71652383832960.0, + "grad_norm": 0.6715386032989483, + "language_loss": 0.52054989, + "learning_rate": 1.473307699867203e-06, + "loss": 0.54061878, + "num_input_tokens_seen": 214090265, + "step": 9934, + "time_per_iteration": 3.2984836101531982 + }, + { + "auxiliary_loss_clip": 0.01008787, + "auxiliary_loss_mlp": 0.01000988, + "balance_loss_clip": 1.00261807, + "balance_loss_mlp": 0.99982017, + "epoch": 0.5973245152562754, + "flos": 56892702263040.0, + "grad_norm": 0.8294060156973063, + "language_loss": 0.54181111, + "learning_rate": 1.4729319956234849e-06, + "loss": 0.56190884, + "num_input_tokens_seen": 214146375, + "step": 9935, + "time_per_iteration": 3.0957956314086914 + }, + { + "auxiliary_loss_clip": 0.01042755, + "auxiliary_loss_mlp": 0.01032171, + "balance_loss_clip": 1.02370858, + "balance_loss_mlp": 1.0208106, + "epoch": 0.5973846385089433, + "flos": 24164361884160.0, + "grad_norm": 1.6602383147011968, + "language_loss": 0.6617974, + "learning_rate": 1.4725563113662394e-06, + "loss": 0.68254662, + "num_input_tokens_seen": 214165340, + "step": 9936, + "time_per_iteration": 2.6773386001586914 + }, + { + "auxiliary_loss_clip": 0.01021922, + "auxiliary_loss_mlp": 0.01029131, + "balance_loss_clip": 1.02564549, + "balance_loss_mlp": 1.01839018, + "epoch": 0.5974447617616113, + "flos": 17670599752320.0, + "grad_norm": 3.4607360609164113, + "language_loss": 0.67559505, + "learning_rate": 1.4721806471097103e-06, + "loss": 0.6961056, + "num_input_tokens_seen": 214181360, + "step": 9937, + "time_per_iteration": 2.7808797359466553 + }, + { + "auxiliary_loss_clip": 0.01056752, + "auxiliary_loss_mlp": 0.01029216, + "balance_loss_clip": 1.02535665, + "balance_loss_mlp": 1.01784301, + "epoch": 0.5975048850142792, + "flos": 22892514140160.0, + "grad_norm": 2.1650571104367544, + "language_loss": 0.77395916, + "learning_rate": 1.4718050028681442e-06, + "loss": 0.79481876, + "num_input_tokens_seen": 214198525, + "step": 9938, + "time_per_iteration": 2.6419029235839844 + }, + { + "auxiliary_loss_clip": 0.01049867, + "auxiliary_loss_mlp": 0.0102872, + "balance_loss_clip": 1.02194655, + "balance_loss_mlp": 1.01717496, + "epoch": 0.5975650082669473, + "flos": 24353108876160.0, + "grad_norm": 1.7224583387689198, + "language_loss": 0.75789809, + "learning_rate": 1.4714293786557855e-06, + "loss": 0.77868396, + "num_input_tokens_seen": 214218710, + "step": 9939, + "time_per_iteration": 2.669373035430908 + }, + { + "auxiliary_loss_clip": 0.01017339, + "auxiliary_loss_mlp": 0.01032734, + "balance_loss_clip": 1.02403295, + "balance_loss_mlp": 1.01981759, + "epoch": 0.5976251315196152, + "flos": 20923352691840.0, + "grad_norm": 2.3100964294563364, + "language_loss": 0.69015384, + "learning_rate": 1.471053774486878e-06, + "loss": 0.7106545, + "num_input_tokens_seen": 214237800, + "step": 9940, + "time_per_iteration": 2.732417345046997 + }, + { + "auxiliary_loss_clip": 0.01042143, + "auxiliary_loss_mlp": 0.01034822, + "balance_loss_clip": 1.02437818, + "balance_loss_mlp": 1.02435541, + "epoch": 0.5976852547722832, + "flos": 35844594658560.0, + "grad_norm": 1.3077621784761482, + "language_loss": 0.70574468, + "learning_rate": 1.470678190375664e-06, + "loss": 0.72651428, + "num_input_tokens_seen": 214260355, + "step": 9941, + "time_per_iteration": 2.7576637268066406 + }, + { + "auxiliary_loss_clip": 0.01041432, + "auxiliary_loss_mlp": 0.01029339, + "balance_loss_clip": 1.02310514, + "balance_loss_mlp": 1.01875317, + "epoch": 0.5977453780249512, + "flos": 12855948744960.0, + "grad_norm": 2.0044610210335656, + "language_loss": 0.77765822, + "learning_rate": 1.470302626336386e-06, + "loss": 0.79836595, + "num_input_tokens_seen": 214277120, + "step": 9942, + "time_per_iteration": 2.620725393295288 + }, + { + "auxiliary_loss_clip": 0.01023804, + "auxiliary_loss_mlp": 0.0103757, + "balance_loss_clip": 1.02331448, + "balance_loss_mlp": 1.02632236, + "epoch": 0.5978055012776191, + "flos": 20959155573120.0, + "grad_norm": 1.6979046998379848, + "language_loss": 0.75527191, + "learning_rate": 1.4699270823832857e-06, + "loss": 0.77588564, + "num_input_tokens_seen": 214295300, + "step": 9943, + "time_per_iteration": 2.735929250717163 + }, + { + "auxiliary_loss_clip": 0.00996508, + "auxiliary_loss_mlp": 0.01027907, + "balance_loss_clip": 1.02141476, + "balance_loss_mlp": 1.01781559, + "epoch": 0.5978656245302871, + "flos": 34058003063040.0, + "grad_norm": 1.7105624528455308, + "language_loss": 0.62155986, + "learning_rate": 1.4695515585306032e-06, + "loss": 0.64180398, + "num_input_tokens_seen": 214317050, + "step": 9944, + "time_per_iteration": 2.8427369594573975 + }, + { + "auxiliary_loss_clip": 0.0104664, + "auxiliary_loss_mlp": 0.01034749, + "balance_loss_clip": 1.02598, + "balance_loss_mlp": 1.02348924, + "epoch": 0.597925747782955, + "flos": 37373275624320.0, + "grad_norm": 11.444373764364784, + "language_loss": 0.72654188, + "learning_rate": 1.4691760547925795e-06, + "loss": 0.74735582, + "num_input_tokens_seen": 214337470, + "step": 9945, + "time_per_iteration": 2.7470691204071045 + }, + { + "auxiliary_loss_clip": 0.01023218, + "auxiliary_loss_mlp": 0.01030946, + "balance_loss_clip": 1.02563584, + "balance_loss_mlp": 1.01960969, + "epoch": 0.5979858710356231, + "flos": 25374803328000.0, + "grad_norm": 1.80830623938138, + "language_loss": 0.67060596, + "learning_rate": 1.4688005711834522e-06, + "loss": 0.69114757, + "num_input_tokens_seen": 214357975, + "step": 9946, + "time_per_iteration": 2.7134265899658203 + }, + { + "auxiliary_loss_clip": 0.01049211, + "auxiliary_loss_mlp": 0.01037948, + "balance_loss_clip": 1.02280521, + "balance_loss_mlp": 1.02532411, + "epoch": 0.598045994288291, + "flos": 13698413308800.0, + "grad_norm": 2.4117423641234867, + "language_loss": 0.88669449, + "learning_rate": 1.468425107717461e-06, + "loss": 0.90756607, + "num_input_tokens_seen": 214374125, + "step": 9947, + "time_per_iteration": 2.582911729812622 + }, + { + "auxiliary_loss_clip": 0.01059704, + "auxiliary_loss_mlp": 0.01030868, + "balance_loss_clip": 1.02307999, + "balance_loss_mlp": 1.02128339, + "epoch": 0.598106117540959, + "flos": 21981352815360.0, + "grad_norm": 1.6170472977083536, + "language_loss": 0.72069615, + "learning_rate": 1.4680496644088432e-06, + "loss": 0.74160182, + "num_input_tokens_seen": 214393395, + "step": 9948, + "time_per_iteration": 2.6694889068603516 + }, + { + "auxiliary_loss_clip": 0.0104526, + "auxiliary_loss_mlp": 0.01027727, + "balance_loss_clip": 1.02578712, + "balance_loss_mlp": 1.01617575, + "epoch": 0.5981662407936269, + "flos": 20559362221440.0, + "grad_norm": 2.0666255138211453, + "language_loss": 0.8933965, + "learning_rate": 1.4676742412718347e-06, + "loss": 0.91412634, + "num_input_tokens_seen": 214411550, + "step": 9949, + "time_per_iteration": 2.691633462905884 + }, + { + "auxiliary_loss_clip": 0.01053497, + "auxiliary_loss_mlp": 0.01027839, + "balance_loss_clip": 1.02608192, + "balance_loss_mlp": 1.01777744, + "epoch": 0.5982263640462949, + "flos": 14063840323200.0, + "grad_norm": 2.137768246796189, + "language_loss": 0.70655191, + "learning_rate": 1.467298838320673e-06, + "loss": 0.72736526, + "num_input_tokens_seen": 214429780, + "step": 9950, + "time_per_iteration": 2.646167516708374 + }, + { + "auxiliary_loss_clip": 0.01053936, + "auxiliary_loss_mlp": 0.01028555, + "balance_loss_clip": 1.02485335, + "balance_loss_mlp": 1.01772499, + "epoch": 0.5982864872989628, + "flos": 17707228646400.0, + "grad_norm": 1.5790435340869065, + "language_loss": 0.78249502, + "learning_rate": 1.4669234555695921e-06, + "loss": 0.80331993, + "num_input_tokens_seen": 214447775, + "step": 9951, + "time_per_iteration": 4.334156513214111 + }, + { + "auxiliary_loss_clip": 0.0104373, + "auxiliary_loss_mlp": 0.01036206, + "balance_loss_clip": 1.02459311, + "balance_loss_mlp": 1.0241245, + "epoch": 0.5983466105516309, + "flos": 16764789553920.0, + "grad_norm": 1.4226663343984711, + "language_loss": 0.74033821, + "learning_rate": 1.4665480930328275e-06, + "loss": 0.76113755, + "num_input_tokens_seen": 214467245, + "step": 9952, + "time_per_iteration": 4.235555410385132 + }, + { + "auxiliary_loss_clip": 0.0104785, + "auxiliary_loss_mlp": 0.00747724, + "balance_loss_clip": 1.02561355, + "balance_loss_mlp": 1.00042725, + "epoch": 0.5984067338042988, + "flos": 20042714949120.0, + "grad_norm": 1.9934290898318754, + "language_loss": 0.78702796, + "learning_rate": 1.466172750724613e-06, + "loss": 0.80498374, + "num_input_tokens_seen": 214484385, + "step": 9953, + "time_per_iteration": 2.6213433742523193 + }, + { + "auxiliary_loss_clip": 0.01037717, + "auxiliary_loss_mlp": 0.01029573, + "balance_loss_clip": 1.02801919, + "balance_loss_mlp": 1.01955366, + "epoch": 0.5984668570569668, + "flos": 26319900026880.0, + "grad_norm": 1.354206010211653, + "language_loss": 0.69905007, + "learning_rate": 1.4657974286591807e-06, + "loss": 0.71972299, + "num_input_tokens_seen": 214503465, + "step": 9954, + "time_per_iteration": 2.7242889404296875 + }, + { + "auxiliary_loss_clip": 0.01042464, + "auxiliary_loss_mlp": 0.01030684, + "balance_loss_clip": 1.02437043, + "balance_loss_mlp": 1.02005637, + "epoch": 0.5985269803096348, + "flos": 20593728558720.0, + "grad_norm": 2.059629799553322, + "language_loss": 0.73104954, + "learning_rate": 1.4654221268507637e-06, + "loss": 0.75178099, + "num_input_tokens_seen": 214520725, + "step": 9955, + "time_per_iteration": 2.6307356357574463 + }, + { + "auxiliary_loss_clip": 0.01063563, + "auxiliary_loss_mlp": 0.01024993, + "balance_loss_clip": 1.0240494, + "balance_loss_mlp": 1.01480699, + "epoch": 0.5985871035623027, + "flos": 26865382942080.0, + "grad_norm": 1.5975788676907727, + "language_loss": 0.68406892, + "learning_rate": 1.4650468453135934e-06, + "loss": 0.70495445, + "num_input_tokens_seen": 214540675, + "step": 9956, + "time_per_iteration": 2.5786056518554688 + }, + { + "auxiliary_loss_clip": 0.01066521, + "auxiliary_loss_mlp": 0.01028127, + "balance_loss_clip": 1.02637053, + "balance_loss_mlp": 1.01704073, + "epoch": 0.5986472268149707, + "flos": 19609704495360.0, + "grad_norm": 2.1967539124522943, + "language_loss": 0.73056257, + "learning_rate": 1.4646715840618999e-06, + "loss": 0.75150907, + "num_input_tokens_seen": 214559910, + "step": 9957, + "time_per_iteration": 2.5182878971099854 + }, + { + "auxiliary_loss_clip": 0.01029981, + "auxiliary_loss_mlp": 0.01026342, + "balance_loss_clip": 1.02352321, + "balance_loss_mlp": 1.01625657, + "epoch": 0.5987073500676386, + "flos": 21794616984960.0, + "grad_norm": 1.723424797184079, + "language_loss": 0.84307647, + "learning_rate": 1.4642963431099138e-06, + "loss": 0.86363971, + "num_input_tokens_seen": 214575960, + "step": 9958, + "time_per_iteration": 2.706634759902954 + }, + { + "auxiliary_loss_clip": 0.01036364, + "auxiliary_loss_mlp": 0.0074772, + "balance_loss_clip": 1.02641404, + "balance_loss_mlp": 1.00043249, + "epoch": 0.5987674733203067, + "flos": 24314361079680.0, + "grad_norm": 1.9087863334885644, + "language_loss": 0.66014135, + "learning_rate": 1.463921122471864e-06, + "loss": 0.67798221, + "num_input_tokens_seen": 214594230, + "step": 9959, + "time_per_iteration": 2.7955996990203857 + }, + { + "auxiliary_loss_clip": 0.01053645, + "auxiliary_loss_mlp": 0.01026765, + "balance_loss_clip": 1.02485847, + "balance_loss_mlp": 1.01580918, + "epoch": 0.5988275965729746, + "flos": 21320201128320.0, + "grad_norm": 1.6905464872251017, + "language_loss": 0.83652377, + "learning_rate": 1.4635459221619796e-06, + "loss": 0.85732782, + "num_input_tokens_seen": 214613130, + "step": 9960, + "time_per_iteration": 2.5842208862304688 + }, + { + "auxiliary_loss_clip": 0.01046029, + "auxiliary_loss_mlp": 0.01025239, + "balance_loss_clip": 1.0258162, + "balance_loss_mlp": 1.0149219, + "epoch": 0.5988877198256426, + "flos": 25118041933440.0, + "grad_norm": 1.4582292746417316, + "language_loss": 0.79506612, + "learning_rate": 1.4631707421944868e-06, + "loss": 0.81577885, + "num_input_tokens_seen": 214634470, + "step": 9961, + "time_per_iteration": 4.432247638702393 + }, + { + "auxiliary_loss_clip": 0.01062975, + "auxiliary_loss_mlp": 0.01025634, + "balance_loss_clip": 1.02447784, + "balance_loss_mlp": 1.01518512, + "epoch": 0.5989478430783105, + "flos": 26429104350720.0, + "grad_norm": 1.7680708134705663, + "language_loss": 0.66926765, + "learning_rate": 1.4627955825836136e-06, + "loss": 0.69015372, + "num_input_tokens_seen": 214654030, + "step": 9962, + "time_per_iteration": 2.6220920085906982 + }, + { + "auxiliary_loss_clip": 0.01046984, + "auxiliary_loss_mlp": 0.01035415, + "balance_loss_clip": 1.02280593, + "balance_loss_mlp": 1.02333272, + "epoch": 0.5990079663309785, + "flos": 25778439434880.0, + "grad_norm": 1.3585578656666486, + "language_loss": 0.74170327, + "learning_rate": 1.4624204433435857e-06, + "loss": 0.76252723, + "num_input_tokens_seen": 214676985, + "step": 9963, + "time_per_iteration": 2.6501567363739014 + }, + { + "auxiliary_loss_clip": 0.01052968, + "auxiliary_loss_mlp": 0.01030434, + "balance_loss_clip": 1.02452636, + "balance_loss_mlp": 1.02024722, + "epoch": 0.5990680895836464, + "flos": 36831779118720.0, + "grad_norm": 2.957533207807453, + "language_loss": 0.67525101, + "learning_rate": 1.4620453244886281e-06, + "loss": 0.69608504, + "num_input_tokens_seen": 214700105, + "step": 9964, + "time_per_iteration": 2.717434883117676 + }, + { + "auxiliary_loss_clip": 0.01032994, + "auxiliary_loss_mlp": 0.01028388, + "balance_loss_clip": 1.02503157, + "balance_loss_mlp": 1.01700914, + "epoch": 0.5991282128363145, + "flos": 24133550993280.0, + "grad_norm": 2.40148043523082, + "language_loss": 0.76725191, + "learning_rate": 1.4616702260329662e-06, + "loss": 0.78786576, + "num_input_tokens_seen": 214717885, + "step": 9965, + "time_per_iteration": 2.739980936050415 + }, + { + "auxiliary_loss_clip": 0.01056353, + "auxiliary_loss_mlp": 0.01027852, + "balance_loss_clip": 1.02758193, + "balance_loss_mlp": 1.01704526, + "epoch": 0.5991883360889824, + "flos": 10304064956160.0, + "grad_norm": 1.77866633709281, + "language_loss": 0.77771127, + "learning_rate": 1.4612951479908229e-06, + "loss": 0.79855329, + "num_input_tokens_seen": 214733680, + "step": 9966, + "time_per_iteration": 2.5918734073638916 + }, + { + "auxiliary_loss_clip": 0.01033909, + "auxiliary_loss_mlp": 0.010214, + "balance_loss_clip": 1.0260644, + "balance_loss_mlp": 1.01160669, + "epoch": 0.5992484593416504, + "flos": 23951196622080.0, + "grad_norm": 1.3931708488489136, + "language_loss": 0.73356092, + "learning_rate": 1.460920090376422e-06, + "loss": 0.75411397, + "num_input_tokens_seen": 214753285, + "step": 9967, + "time_per_iteration": 2.7247655391693115 + }, + { + "auxiliary_loss_clip": 0.01058437, + "auxiliary_loss_mlp": 0.01030771, + "balance_loss_clip": 1.02634454, + "balance_loss_mlp": 1.0190531, + "epoch": 0.5993085825943184, + "flos": 11944105061760.0, + "grad_norm": 2.0772593931505106, + "language_loss": 0.68532068, + "learning_rate": 1.4605450532039847e-06, + "loss": 0.70621276, + "num_input_tokens_seen": 214767810, + "step": 9968, + "time_per_iteration": 2.599889039993286 + }, + { + "auxiliary_loss_clip": 0.01048846, + "auxiliary_loss_mlp": 0.0103238, + "balance_loss_clip": 1.02178895, + "balance_loss_mlp": 1.02068591, + "epoch": 0.5993687058469863, + "flos": 19026838500480.0, + "grad_norm": 1.4975131166272067, + "language_loss": 0.79327822, + "learning_rate": 1.4601700364877334e-06, + "loss": 0.81409049, + "num_input_tokens_seen": 214786040, + "step": 9969, + "time_per_iteration": 2.556183338165283 + }, + { + "auxiliary_loss_clip": 0.01045663, + "auxiliary_loss_mlp": 0.01025801, + "balance_loss_clip": 1.02312493, + "balance_loss_mlp": 1.01480412, + "epoch": 0.5994288290996543, + "flos": 14282967242880.0, + "grad_norm": 1.8537792385762943, + "language_loss": 0.81233811, + "learning_rate": 1.4597950402418889e-06, + "loss": 0.83305275, + "num_input_tokens_seen": 214803110, + "step": 9970, + "time_per_iteration": 4.214498996734619 + }, + { + "auxiliary_loss_clip": 0.01017365, + "auxiliary_loss_mlp": 0.0103438, + "balance_loss_clip": 1.0239836, + "balance_loss_mlp": 1.02120137, + "epoch": 0.5994889523523222, + "flos": 19206643006080.0, + "grad_norm": 2.0203334518224314, + "language_loss": 0.61970252, + "learning_rate": 1.4594200644806697e-06, + "loss": 0.64021993, + "num_input_tokens_seen": 214819945, + "step": 9971, + "time_per_iteration": 2.7853453159332275 + }, + { + "auxiliary_loss_clip": 0.01060987, + "auxiliary_loss_mlp": 0.01027972, + "balance_loss_clip": 1.02492785, + "balance_loss_mlp": 1.01772034, + "epoch": 0.5995490756049903, + "flos": 28037040675840.0, + "grad_norm": 1.8960508490379184, + "language_loss": 0.79026628, + "learning_rate": 1.4590451092182962e-06, + "loss": 0.81115592, + "num_input_tokens_seen": 214838810, + "step": 9972, + "time_per_iteration": 2.6527833938598633 + }, + { + "auxiliary_loss_clip": 0.01027908, + "auxiliary_loss_mlp": 0.01034848, + "balance_loss_clip": 1.02533472, + "balance_loss_mlp": 1.02317119, + "epoch": 0.5996091988576582, + "flos": 29052953038080.0, + "grad_norm": 2.4980546572661053, + "language_loss": 0.76696455, + "learning_rate": 1.4586701744689864e-06, + "loss": 0.78759211, + "num_input_tokens_seen": 214857040, + "step": 9973, + "time_per_iteration": 2.8070669174194336 + }, + { + "auxiliary_loss_clip": 0.01033136, + "auxiliary_loss_mlp": 0.01029737, + "balance_loss_clip": 1.02311695, + "balance_loss_mlp": 1.01862073, + "epoch": 0.5996693221103262, + "flos": 20813968800000.0, + "grad_norm": 1.9663455762755808, + "language_loss": 0.65118432, + "learning_rate": 1.4582952602469578e-06, + "loss": 0.67181307, + "num_input_tokens_seen": 214873375, + "step": 9974, + "time_per_iteration": 2.6540160179138184 + }, + { + "auxiliary_loss_clip": 0.01051087, + "auxiliary_loss_mlp": 0.01025778, + "balance_loss_clip": 1.02252936, + "balance_loss_mlp": 1.01503134, + "epoch": 0.5997294453629941, + "flos": 23768914078080.0, + "grad_norm": 1.4505662097850838, + "language_loss": 0.74591541, + "learning_rate": 1.457920366566428e-06, + "loss": 0.76668406, + "num_input_tokens_seen": 214893900, + "step": 9975, + "time_per_iteration": 2.621797800064087 + }, + { + "auxiliary_loss_clip": 0.01064666, + "auxiliary_loss_mlp": 0.01027454, + "balance_loss_clip": 1.02549171, + "balance_loss_mlp": 1.01580775, + "epoch": 0.5997895686156621, + "flos": 20960017499520.0, + "grad_norm": 1.9438134329996055, + "language_loss": 0.77331489, + "learning_rate": 1.457545493441611e-06, + "loss": 0.79423606, + "num_input_tokens_seen": 214912110, + "step": 9976, + "time_per_iteration": 2.682647705078125 + }, + { + "auxiliary_loss_clip": 0.01044494, + "auxiliary_loss_mlp": 0.01033264, + "balance_loss_clip": 1.02451587, + "balance_loss_mlp": 1.02152824, + "epoch": 0.59984969186833, + "flos": 28365443746560.0, + "grad_norm": 2.948131928814106, + "language_loss": 0.75571293, + "learning_rate": 1.4571706408867237e-06, + "loss": 0.77649051, + "num_input_tokens_seen": 214930140, + "step": 9977, + "time_per_iteration": 2.6561951637268066 + }, + { + "auxiliary_loss_clip": 0.01028766, + "auxiliary_loss_mlp": 0.0102934, + "balance_loss_clip": 1.02229083, + "balance_loss_mlp": 1.01824737, + "epoch": 0.5999098151209981, + "flos": 22565906749440.0, + "grad_norm": 2.160416015006326, + "language_loss": 0.68775308, + "learning_rate": 1.4567958089159802e-06, + "loss": 0.70833409, + "num_input_tokens_seen": 214949200, + "step": 9978, + "time_per_iteration": 2.6549746990203857 + }, + { + "auxiliary_loss_clip": 0.01070432, + "auxiliary_loss_mlp": 0.01031615, + "balance_loss_clip": 1.02856421, + "balance_loss_mlp": 1.02002811, + "epoch": 0.599969938373666, + "flos": 18768712389120.0, + "grad_norm": 2.993665196368669, + "language_loss": 0.8131901, + "learning_rate": 1.456420997543594e-06, + "loss": 0.83421057, + "num_input_tokens_seen": 214965775, + "step": 9979, + "time_per_iteration": 2.512347936630249 + }, + { + "auxiliary_loss_clip": 0.01061244, + "auxiliary_loss_mlp": 0.01025816, + "balance_loss_clip": 1.024225, + "balance_loss_mlp": 1.01527762, + "epoch": 0.600030061626334, + "flos": 11327231865600.0, + "grad_norm": 1.970766485079663, + "language_loss": 0.69771147, + "learning_rate": 1.4560462067837782e-06, + "loss": 0.71858209, + "num_input_tokens_seen": 214982480, + "step": 9980, + "time_per_iteration": 2.5467894077301025 + }, + { + "auxiliary_loss_clip": 0.01049712, + "auxiliary_loss_mlp": 0.0102877, + "balance_loss_clip": 1.02337003, + "balance_loss_mlp": 1.01680112, + "epoch": 0.600090184879002, + "flos": 16578664254720.0, + "grad_norm": 2.5155402685595383, + "language_loss": 0.68333066, + "learning_rate": 1.4556714366507445e-06, + "loss": 0.70411545, + "num_input_tokens_seen": 214998110, + "step": 9981, + "time_per_iteration": 2.528095006942749 + }, + { + "auxiliary_loss_clip": 0.01052904, + "auxiliary_loss_mlp": 0.01033842, + "balance_loss_clip": 1.02557957, + "balance_loss_mlp": 1.02370954, + "epoch": 0.6001503081316699, + "flos": 23618627573760.0, + "grad_norm": 1.8559257916959455, + "language_loss": 0.78752458, + "learning_rate": 1.4552966871587048e-06, + "loss": 0.80839205, + "num_input_tokens_seen": 215017995, + "step": 9982, + "time_per_iteration": 2.5737454891204834 + }, + { + "auxiliary_loss_clip": 0.01011035, + "auxiliary_loss_mlp": 0.01037373, + "balance_loss_clip": 1.02156949, + "balance_loss_mlp": 1.02464104, + "epoch": 0.6002104313843379, + "flos": 20667668705280.0, + "grad_norm": 1.793363078176868, + "language_loss": 0.72770834, + "learning_rate": 1.4549219583218686e-06, + "loss": 0.74819243, + "num_input_tokens_seen": 215038285, + "step": 9983, + "time_per_iteration": 2.6924538612365723 + }, + { + "auxiliary_loss_clip": 0.01028611, + "auxiliary_loss_mlp": 0.01031537, + "balance_loss_clip": 1.02234793, + "balance_loss_mlp": 1.0195744, + "epoch": 0.6002705546370058, + "flos": 22455229968000.0, + "grad_norm": 1.9383253429100047, + "language_loss": 0.77619147, + "learning_rate": 1.454547250154447e-06, + "loss": 0.79679298, + "num_input_tokens_seen": 215057825, + "step": 9984, + "time_per_iteration": 2.7605931758880615 + }, + { + "auxiliary_loss_clip": 0.01054496, + "auxiliary_loss_mlp": 0.01028582, + "balance_loss_clip": 1.02512026, + "balance_loss_mlp": 1.01799643, + "epoch": 0.6003306778896739, + "flos": 25191982080000.0, + "grad_norm": 1.5352917499074328, + "language_loss": 0.83102286, + "learning_rate": 1.4541725626706485e-06, + "loss": 0.85185361, + "num_input_tokens_seen": 215077790, + "step": 9985, + "time_per_iteration": 2.9172818660736084 + }, + { + "auxiliary_loss_clip": 0.01053774, + "auxiliary_loss_mlp": 0.01032568, + "balance_loss_clip": 1.02564728, + "balance_loss_mlp": 1.02235126, + "epoch": 0.6003908011423418, + "flos": 26687733252480.0, + "grad_norm": 2.0492783907363052, + "language_loss": 0.71526879, + "learning_rate": 1.4537978958846809e-06, + "loss": 0.7361322, + "num_input_tokens_seen": 215097650, + "step": 9986, + "time_per_iteration": 2.8117825984954834 + }, + { + "auxiliary_loss_clip": 0.01068128, + "auxiliary_loss_mlp": 0.00747721, + "balance_loss_clip": 1.02750611, + "balance_loss_mlp": 1.00039744, + "epoch": 0.6004509243950098, + "flos": 22565080736640.0, + "grad_norm": 2.030125803717083, + "language_loss": 0.71677864, + "learning_rate": 1.4534232498107514e-06, + "loss": 0.73493719, + "num_input_tokens_seen": 215118235, + "step": 9987, + "time_per_iteration": 2.805114507675171 + }, + { + "auxiliary_loss_clip": 0.01044396, + "auxiliary_loss_mlp": 0.01030455, + "balance_loss_clip": 1.02511048, + "balance_loss_mlp": 1.02014971, + "epoch": 0.6005110476476777, + "flos": 19719303868800.0, + "grad_norm": 1.8974041995424096, + "language_loss": 0.84909582, + "learning_rate": 1.4530486244630673e-06, + "loss": 0.86984444, + "num_input_tokens_seen": 215136755, + "step": 9988, + "time_per_iteration": 2.91168475151062 + }, + { + "auxiliary_loss_clip": 0.01053529, + "auxiliary_loss_mlp": 0.0103072, + "balance_loss_clip": 1.02450013, + "balance_loss_mlp": 1.01987767, + "epoch": 0.6005711709003457, + "flos": 17712543859200.0, + "grad_norm": 1.6618884982842772, + "language_loss": 0.65677828, + "learning_rate": 1.4526740198558346e-06, + "loss": 0.67762077, + "num_input_tokens_seen": 215155225, + "step": 9989, + "time_per_iteration": 2.8296093940734863 + }, + { + "auxiliary_loss_clip": 0.01052883, + "auxiliary_loss_mlp": 0.01029094, + "balance_loss_clip": 1.02530324, + "balance_loss_mlp": 1.01905632, + "epoch": 0.6006312941530136, + "flos": 18514464946560.0, + "grad_norm": 1.5602106451140683, + "language_loss": 0.80467951, + "learning_rate": 1.452299436003257e-06, + "loss": 0.8254993, + "num_input_tokens_seen": 215174815, + "step": 9990, + "time_per_iteration": 2.8821427822113037 + }, + { + "auxiliary_loss_clip": 0.01037642, + "auxiliary_loss_mlp": 0.01031373, + "balance_loss_clip": 1.02714407, + "balance_loss_mlp": 1.02045965, + "epoch": 0.6006914174056817, + "flos": 21390837223680.0, + "grad_norm": 1.688933002489091, + "language_loss": 0.82760048, + "learning_rate": 1.4519248729195403e-06, + "loss": 0.84829068, + "num_input_tokens_seen": 215192045, + "step": 9991, + "time_per_iteration": 2.822610378265381 + }, + { + "auxiliary_loss_clip": 0.01015692, + "auxiliary_loss_mlp": 0.0103396, + "balance_loss_clip": 1.02164865, + "balance_loss_mlp": 1.02171111, + "epoch": 0.6007515406583496, + "flos": 12750515349120.0, + "grad_norm": 1.7442879584596722, + "language_loss": 0.82907391, + "learning_rate": 1.4515503306188878e-06, + "loss": 0.84957045, + "num_input_tokens_seen": 215209885, + "step": 9992, + "time_per_iteration": 2.7557966709136963 + }, + { + "auxiliary_loss_clip": 0.01042902, + "auxiliary_loss_mlp": 0.00747766, + "balance_loss_clip": 1.0249095, + "balance_loss_mlp": 1.00053036, + "epoch": 0.6008116639110176, + "flos": 19206894401280.0, + "grad_norm": 2.1565258152839384, + "language_loss": 0.66604948, + "learning_rate": 1.4511758091155008e-06, + "loss": 0.68395621, + "num_input_tokens_seen": 215228150, + "step": 9993, + "time_per_iteration": 2.718832015991211 + }, + { + "auxiliary_loss_clip": 0.01029107, + "auxiliary_loss_mlp": 0.0102925, + "balance_loss_clip": 1.02255905, + "balance_loss_mlp": 1.01786017, + "epoch": 0.6008717871636855, + "flos": 17055342668160.0, + "grad_norm": 3.676484193165876, + "language_loss": 0.8154034, + "learning_rate": 1.4508013084235826e-06, + "loss": 0.83598697, + "num_input_tokens_seen": 215243755, + "step": 9994, + "time_per_iteration": 2.6976356506347656 + }, + { + "auxiliary_loss_clip": 0.01017078, + "auxiliary_loss_mlp": 0.01026183, + "balance_loss_clip": 1.02079701, + "balance_loss_mlp": 1.01579332, + "epoch": 0.6009319104163535, + "flos": 20298686244480.0, + "grad_norm": 1.7833255790133957, + "language_loss": 0.71886158, + "learning_rate": 1.4504268285573337e-06, + "loss": 0.73929417, + "num_input_tokens_seen": 215262130, + "step": 9995, + "time_per_iteration": 2.8021087646484375 + }, + { + "auxiliary_loss_clip": 0.01042207, + "auxiliary_loss_mlp": 0.01029739, + "balance_loss_clip": 1.02282763, + "balance_loss_mlp": 1.01847374, + "epoch": 0.6009920336690215, + "flos": 21836776573440.0, + "grad_norm": 3.522873986892138, + "language_loss": 0.80973029, + "learning_rate": 1.4500523695309546e-06, + "loss": 0.8304497, + "num_input_tokens_seen": 215281785, + "step": 9996, + "time_per_iteration": 2.767122745513916 + }, + { + "auxiliary_loss_clip": 0.01001565, + "auxiliary_loss_mlp": 0.01034993, + "balance_loss_clip": 1.0207628, + "balance_loss_mlp": 1.02218974, + "epoch": 0.6010521569216895, + "flos": 22596107109120.0, + "grad_norm": 1.739979913534958, + "language_loss": 0.78482473, + "learning_rate": 1.4496779313586447e-06, + "loss": 0.80519032, + "num_input_tokens_seen": 215297550, + "step": 9997, + "time_per_iteration": 4.477954864501953 + }, + { + "auxiliary_loss_clip": 0.01056869, + "auxiliary_loss_mlp": 0.01029152, + "balance_loss_clip": 1.02592301, + "balance_loss_mlp": 1.01767206, + "epoch": 0.6011122801743575, + "flos": 19171702051200.0, + "grad_norm": 1.606003310477025, + "language_loss": 0.73178172, + "learning_rate": 1.4493035140546028e-06, + "loss": 0.75264192, + "num_input_tokens_seen": 215316360, + "step": 9998, + "time_per_iteration": 4.30703330039978 + }, + { + "auxiliary_loss_clip": 0.01034019, + "auxiliary_loss_mlp": 0.01028978, + "balance_loss_clip": 1.02241385, + "balance_loss_mlp": 1.01826143, + "epoch": 0.6011724034270254, + "flos": 25010022758400.0, + "grad_norm": 1.4851368129546392, + "language_loss": 0.72372931, + "learning_rate": 1.448929117633027e-06, + "loss": 0.74435925, + "num_input_tokens_seen": 215336405, + "step": 9999, + "time_per_iteration": 2.7103254795074463 + }, + { + "auxiliary_loss_clip": 0.01025824, + "auxiliary_loss_mlp": 0.01029432, + "balance_loss_clip": 1.02550125, + "balance_loss_mlp": 1.01798809, + "epoch": 0.6012325266796934, + "flos": 21797669640960.0, + "grad_norm": 1.4119978605101546, + "language_loss": 0.78540087, + "learning_rate": 1.4485547421081142e-06, + "loss": 0.80595338, + "num_input_tokens_seen": 215356590, + "step": 10000, + "time_per_iteration": 2.8019285202026367 + }, + { + "auxiliary_loss_clip": 0.01067645, + "auxiliary_loss_mlp": 0.01031792, + "balance_loss_clip": 1.0262984, + "balance_loss_mlp": 1.01976991, + "epoch": 0.6012926499323613, + "flos": 19573003774080.0, + "grad_norm": 1.9560870005517978, + "language_loss": 0.77660227, + "learning_rate": 1.4481803874940608e-06, + "loss": 0.79759657, + "num_input_tokens_seen": 215374295, + "step": 10001, + "time_per_iteration": 2.5724618434906006 + }, + { + "auxiliary_loss_clip": 0.0105596, + "auxiliary_loss_mlp": 0.01027718, + "balance_loss_clip": 1.02511549, + "balance_loss_mlp": 1.01646483, + "epoch": 0.6013527731850293, + "flos": 34860786076800.0, + "grad_norm": 1.724891434349585, + "language_loss": 0.58689749, + "learning_rate": 1.4478060538050624e-06, + "loss": 0.60773426, + "num_input_tokens_seen": 215394535, + "step": 10002, + "time_per_iteration": 2.7090578079223633 + }, + { + "auxiliary_loss_clip": 0.01047437, + "auxiliary_loss_mlp": 0.01034201, + "balance_loss_clip": 1.027426, + "balance_loss_mlp": 1.02194607, + "epoch": 0.6014128964376972, + "flos": 23291948355840.0, + "grad_norm": 1.6534556755330707, + "language_loss": 0.77809417, + "learning_rate": 1.447431741055314e-06, + "loss": 0.79891056, + "num_input_tokens_seen": 215414355, + "step": 10003, + "time_per_iteration": 2.738734245300293 + }, + { + "auxiliary_loss_clip": 0.01067303, + "auxiliary_loss_mlp": 0.01030217, + "balance_loss_clip": 1.02707219, + "balance_loss_mlp": 1.01879644, + "epoch": 0.6014730196903653, + "flos": 24820916630400.0, + "grad_norm": 8.037217229158276, + "language_loss": 0.77851224, + "learning_rate": 1.4470574492590091e-06, + "loss": 0.79948735, + "num_input_tokens_seen": 215428280, + "step": 10004, + "time_per_iteration": 2.6917288303375244 + }, + { + "auxiliary_loss_clip": 0.01052596, + "auxiliary_loss_mlp": 0.01025332, + "balance_loss_clip": 1.02461648, + "balance_loss_mlp": 1.01426363, + "epoch": 0.6015331429430332, + "flos": 23112359331840.0, + "grad_norm": 1.5252266778407388, + "language_loss": 0.7222141, + "learning_rate": 1.4466831784303408e-06, + "loss": 0.74299335, + "num_input_tokens_seen": 215448970, + "step": 10005, + "time_per_iteration": 2.5665066242218018 + }, + { + "auxiliary_loss_clip": 0.01062767, + "auxiliary_loss_mlp": 0.0102607, + "balance_loss_clip": 1.025594, + "balance_loss_mlp": 1.01528776, + "epoch": 0.6015932661957012, + "flos": 19201363706880.0, + "grad_norm": 2.2193814636697624, + "language_loss": 0.75023818, + "learning_rate": 1.4463089285835026e-06, + "loss": 0.77112663, + "num_input_tokens_seen": 215465260, + "step": 10006, + "time_per_iteration": 2.53751802444458 + }, + { + "auxiliary_loss_clip": 0.01041959, + "auxiliary_loss_mlp": 0.01028925, + "balance_loss_clip": 1.02253056, + "balance_loss_mlp": 1.01747465, + "epoch": 0.6016533894483691, + "flos": 18113630100480.0, + "grad_norm": 1.789580411569114, + "language_loss": 0.73712885, + "learning_rate": 1.445934699732685e-06, + "loss": 0.75783765, + "num_input_tokens_seen": 215482725, + "step": 10007, + "time_per_iteration": 2.6606743335723877 + }, + { + "auxiliary_loss_clip": 0.01042753, + "auxiliary_loss_mlp": 0.01024457, + "balance_loss_clip": 1.02486897, + "balance_loss_mlp": 1.01443744, + "epoch": 0.6017135127010371, + "flos": 16216900427520.0, + "grad_norm": 1.7879724255683438, + "language_loss": 0.70168173, + "learning_rate": 1.4455604918920785e-06, + "loss": 0.72235382, + "num_input_tokens_seen": 215500420, + "step": 10008, + "time_per_iteration": 4.5503058433532715 + }, + { + "auxiliary_loss_clip": 0.01050451, + "auxiliary_loss_mlp": 0.01025598, + "balance_loss_clip": 1.0234201, + "balance_loss_mlp": 1.01497602, + "epoch": 0.6017736359537051, + "flos": 23444246021760.0, + "grad_norm": 1.427243295617218, + "language_loss": 0.76721764, + "learning_rate": 1.4451863050758748e-06, + "loss": 0.78797811, + "num_input_tokens_seen": 215522260, + "step": 10009, + "time_per_iteration": 2.6281490325927734 + }, + { + "auxiliary_loss_clip": 0.01037383, + "auxiliary_loss_mlp": 0.0074757, + "balance_loss_clip": 1.02329993, + "balance_loss_mlp": 1.00052619, + "epoch": 0.601833759206373, + "flos": 23514056104320.0, + "grad_norm": 2.28161006906113, + "language_loss": 0.7420944, + "learning_rate": 1.4448121392982608e-06, + "loss": 0.7599439, + "num_input_tokens_seen": 215541715, + "step": 10010, + "time_per_iteration": 2.7917206287384033 + }, + { + "auxiliary_loss_clip": 0.01000082, + "auxiliary_loss_mlp": 0.01002843, + "balance_loss_clip": 1.00299501, + "balance_loss_mlp": 1.0015434, + "epoch": 0.6018938824590411, + "flos": 63991668648960.0, + "grad_norm": 0.8096557005815204, + "language_loss": 0.55059481, + "learning_rate": 1.4444379945734268e-06, + "loss": 0.57062411, + "num_input_tokens_seen": 215603020, + "step": 10011, + "time_per_iteration": 3.3437399864196777 + }, + { + "auxiliary_loss_clip": 0.01053092, + "auxiliary_loss_mlp": 0.0103204, + "balance_loss_clip": 1.02439499, + "balance_loss_mlp": 1.02191949, + "epoch": 0.601954005711709, + "flos": 34640007131520.0, + "grad_norm": 1.326266829235854, + "language_loss": 0.62180877, + "learning_rate": 1.44406387091556e-06, + "loss": 0.64266014, + "num_input_tokens_seen": 215625115, + "step": 10012, + "time_per_iteration": 2.756152868270874 + }, + { + "auxiliary_loss_clip": 0.01028397, + "auxiliary_loss_mlp": 0.01023107, + "balance_loss_clip": 1.02413082, + "balance_loss_mlp": 1.01292682, + "epoch": 0.602014128964377, + "flos": 19427062815360.0, + "grad_norm": 5.38995282492485, + "language_loss": 0.75123549, + "learning_rate": 1.4436897683388462e-06, + "loss": 0.77175051, + "num_input_tokens_seen": 215643730, + "step": 10013, + "time_per_iteration": 2.7359209060668945 + }, + { + "auxiliary_loss_clip": 0.01059684, + "auxiliary_loss_mlp": 0.01023561, + "balance_loss_clip": 1.02410817, + "balance_loss_mlp": 1.01429856, + "epoch": 0.6020742522170449, + "flos": 28329389470080.0, + "grad_norm": 1.749763195826676, + "language_loss": 0.81513953, + "learning_rate": 1.4433156868574732e-06, + "loss": 0.83597201, + "num_input_tokens_seen": 215664425, + "step": 10014, + "time_per_iteration": 2.6975483894348145 + }, + { + "auxiliary_loss_clip": 0.01037703, + "auxiliary_loss_mlp": 0.01023351, + "balance_loss_clip": 1.02245843, + "balance_loss_mlp": 1.01335573, + "epoch": 0.6021343754697129, + "flos": 22747040058240.0, + "grad_norm": 1.385229359926548, + "language_loss": 0.72550017, + "learning_rate": 1.442941626485624e-06, + "loss": 0.74611068, + "num_input_tokens_seen": 215684280, + "step": 10015, + "time_per_iteration": 2.6803951263427734 + }, + { + "auxiliary_loss_clip": 0.00990287, + "auxiliary_loss_mlp": 0.01005516, + "balance_loss_clip": 1.00403428, + "balance_loss_mlp": 1.00439537, + "epoch": 0.6021944987223808, + "flos": 65752007402880.0, + "grad_norm": 0.8221563993063947, + "language_loss": 0.54872262, + "learning_rate": 1.4425675872374848e-06, + "loss": 0.56868064, + "num_input_tokens_seen": 215739780, + "step": 10016, + "time_per_iteration": 3.166214942932129 + }, + { + "auxiliary_loss_clip": 0.01043582, + "auxiliary_loss_mlp": 0.01025489, + "balance_loss_clip": 1.02584839, + "balance_loss_mlp": 1.01477218, + "epoch": 0.6022546219750489, + "flos": 16105182151680.0, + "grad_norm": 1.5007048881293632, + "language_loss": 0.83138335, + "learning_rate": 1.4421935691272381e-06, + "loss": 0.85207409, + "num_input_tokens_seen": 215757885, + "step": 10017, + "time_per_iteration": 4.30490779876709 + }, + { + "auxiliary_loss_clip": 0.01042386, + "auxiliary_loss_mlp": 0.01026319, + "balance_loss_clip": 1.02472281, + "balance_loss_mlp": 1.01538754, + "epoch": 0.6023147452277168, + "flos": 25512555985920.0, + "grad_norm": 2.702489421210579, + "language_loss": 0.84189117, + "learning_rate": 1.4418195721690677e-06, + "loss": 0.86257827, + "num_input_tokens_seen": 215776415, + "step": 10018, + "time_per_iteration": 2.8147876262664795 + }, + { + "auxiliary_loss_clip": 0.01038405, + "auxiliary_loss_mlp": 0.01032356, + "balance_loss_clip": 1.02275324, + "balance_loss_mlp": 1.02089977, + "epoch": 0.6023748684803848, + "flos": 22636075968000.0, + "grad_norm": 2.3021655408650274, + "language_loss": 0.78345215, + "learning_rate": 1.4414455963771549e-06, + "loss": 0.80415976, + "num_input_tokens_seen": 215794865, + "step": 10019, + "time_per_iteration": 2.6407830715179443 + }, + { + "auxiliary_loss_clip": 0.01033168, + "auxiliary_loss_mlp": 0.00747615, + "balance_loss_clip": 1.02535045, + "balance_loss_mlp": 1.00047517, + "epoch": 0.6024349917330527, + "flos": 26210444307840.0, + "grad_norm": 1.475516257436622, + "language_loss": 0.73842037, + "learning_rate": 1.441071641765681e-06, + "loss": 0.75622821, + "num_input_tokens_seen": 215816840, + "step": 10020, + "time_per_iteration": 2.816009521484375 + }, + { + "auxiliary_loss_clip": 0.01041996, + "auxiliary_loss_mlp": 0.01026053, + "balance_loss_clip": 1.02301073, + "balance_loss_mlp": 1.01586056, + "epoch": 0.6024951149857207, + "flos": 21251755762560.0, + "grad_norm": 1.6151377369910072, + "language_loss": 0.63870859, + "learning_rate": 1.4406977083488264e-06, + "loss": 0.65938908, + "num_input_tokens_seen": 215836100, + "step": 10021, + "time_per_iteration": 2.748692274093628 + }, + { + "auxiliary_loss_clip": 0.01053217, + "auxiliary_loss_mlp": 0.01029126, + "balance_loss_clip": 1.02502668, + "balance_loss_mlp": 1.01836729, + "epoch": 0.6025552382383887, + "flos": 26943453152640.0, + "grad_norm": 1.527991711229753, + "language_loss": 0.8060292, + "learning_rate": 1.4403237961407704e-06, + "loss": 0.82685268, + "num_input_tokens_seen": 215858480, + "step": 10022, + "time_per_iteration": 2.7179510593414307 + }, + { + "auxiliary_loss_clip": 0.01049421, + "auxiliary_loss_mlp": 0.0102862, + "balance_loss_clip": 1.02586412, + "balance_loss_mlp": 1.01779616, + "epoch": 0.6026153614910567, + "flos": 31684379495040.0, + "grad_norm": 1.5795588311074797, + "language_loss": 0.66797984, + "learning_rate": 1.439949905155693e-06, + "loss": 0.68876028, + "num_input_tokens_seen": 215879950, + "step": 10023, + "time_per_iteration": 2.6870322227478027 + }, + { + "auxiliary_loss_clip": 0.01051863, + "auxiliary_loss_mlp": 0.01028107, + "balance_loss_clip": 1.02278757, + "balance_loss_mlp": 1.01714563, + "epoch": 0.6026754847437247, + "flos": 29312731175040.0, + "grad_norm": 1.7819301506881486, + "language_loss": 0.74473953, + "learning_rate": 1.4395760354077707e-06, + "loss": 0.76553923, + "num_input_tokens_seen": 215899830, + "step": 10024, + "time_per_iteration": 2.6621222496032715 + }, + { + "auxiliary_loss_clip": 0.01053562, + "auxiliary_loss_mlp": 0.01029699, + "balance_loss_clip": 1.02520657, + "balance_loss_mlp": 1.01854706, + "epoch": 0.6027356079963926, + "flos": 23586775188480.0, + "grad_norm": 1.806763680199878, + "language_loss": 0.733877, + "learning_rate": 1.4392021869111815e-06, + "loss": 0.7547096, + "num_input_tokens_seen": 215920440, + "step": 10025, + "time_per_iteration": 2.6344244480133057 + }, + { + "auxiliary_loss_clip": 0.01067123, + "auxiliary_loss_mlp": 0.010296, + "balance_loss_clip": 1.02525592, + "balance_loss_mlp": 1.01781654, + "epoch": 0.6027957312490606, + "flos": 20813753318400.0, + "grad_norm": 2.8573755733468675, + "language_loss": 0.67552644, + "learning_rate": 1.4388283596801016e-06, + "loss": 0.69649363, + "num_input_tokens_seen": 215940535, + "step": 10026, + "time_per_iteration": 2.570119857788086 + }, + { + "auxiliary_loss_clip": 0.01058903, + "auxiliary_loss_mlp": 0.01028762, + "balance_loss_clip": 1.0229702, + "balance_loss_mlp": 1.01887369, + "epoch": 0.6028558545017285, + "flos": 19935773182080.0, + "grad_norm": 1.7610612266369774, + "language_loss": 0.80272597, + "learning_rate": 1.4384545537287061e-06, + "loss": 0.82360268, + "num_input_tokens_seen": 215958045, + "step": 10027, + "time_per_iteration": 2.733363389968872 + }, + { + "auxiliary_loss_clip": 0.01029378, + "auxiliary_loss_mlp": 0.01031418, + "balance_loss_clip": 1.02190065, + "balance_loss_mlp": 1.02026558, + "epoch": 0.6029159777543965, + "flos": 22820836550400.0, + "grad_norm": 1.7598544207371247, + "language_loss": 0.70742524, + "learning_rate": 1.438080769071171e-06, + "loss": 0.72803319, + "num_input_tokens_seen": 215977330, + "step": 10028, + "time_per_iteration": 2.715569496154785 + }, + { + "auxiliary_loss_clip": 0.01030066, + "auxiliary_loss_mlp": 0.01028273, + "balance_loss_clip": 1.0272851, + "balance_loss_mlp": 1.01767516, + "epoch": 0.6029761010070644, + "flos": 23587242065280.0, + "grad_norm": 2.2195412440772877, + "language_loss": 0.84398699, + "learning_rate": 1.437707005721669e-06, + "loss": 0.86457038, + "num_input_tokens_seen": 215997865, + "step": 10029, + "time_per_iteration": 2.7570204734802246 + }, + { + "auxiliary_loss_clip": 0.01043395, + "auxiliary_loss_mlp": 0.01029033, + "balance_loss_clip": 1.02463663, + "balance_loss_mlp": 1.01897788, + "epoch": 0.6030362242597325, + "flos": 13662430859520.0, + "grad_norm": 1.7246920982978386, + "language_loss": 0.79739439, + "learning_rate": 1.437333263694373e-06, + "loss": 0.81811869, + "num_input_tokens_seen": 216016230, + "step": 10030, + "time_per_iteration": 2.6259889602661133 + }, + { + "auxiliary_loss_clip": 0.00993436, + "auxiliary_loss_mlp": 0.01031229, + "balance_loss_clip": 1.02089667, + "balance_loss_mlp": 1.01939166, + "epoch": 0.6030963475124004, + "flos": 24422883045120.0, + "grad_norm": 1.512660358516962, + "language_loss": 0.71185136, + "learning_rate": 1.4369595430034572e-06, + "loss": 0.73209798, + "num_input_tokens_seen": 216035785, + "step": 10031, + "time_per_iteration": 2.847593069076538 + }, + { + "auxiliary_loss_clip": 0.01020659, + "auxiliary_loss_mlp": 0.01030875, + "balance_loss_clip": 1.02210546, + "balance_loss_mlp": 1.01874566, + "epoch": 0.6031564707650684, + "flos": 29644043247360.0, + "grad_norm": 1.7757397689751222, + "language_loss": 0.7322681, + "learning_rate": 1.4365858436630912e-06, + "loss": 0.75278342, + "num_input_tokens_seen": 216059555, + "step": 10032, + "time_per_iteration": 2.965968132019043 + }, + { + "auxiliary_loss_clip": 0.01044801, + "auxiliary_loss_mlp": 0.0103057, + "balance_loss_clip": 1.02569473, + "balance_loss_mlp": 1.01905394, + "epoch": 0.6032165940177363, + "flos": 16618776768000.0, + "grad_norm": 2.228259561684905, + "language_loss": 0.68405437, + "learning_rate": 1.4362121656874465e-06, + "loss": 0.704808, + "num_input_tokens_seen": 216077235, + "step": 10033, + "time_per_iteration": 2.955284595489502 + }, + { + "auxiliary_loss_clip": 0.01036579, + "auxiliary_loss_mlp": 0.01029689, + "balance_loss_clip": 1.02414393, + "balance_loss_mlp": 1.0182271, + "epoch": 0.6032767172704043, + "flos": 17488173553920.0, + "grad_norm": 1.7431093744842596, + "language_loss": 0.7552498, + "learning_rate": 1.4358385090906934e-06, + "loss": 0.77591252, + "num_input_tokens_seen": 216094985, + "step": 10034, + "time_per_iteration": 3.1334121227264404 + }, + { + "auxiliary_loss_clip": 0.01045884, + "auxiliary_loss_mlp": 0.01028185, + "balance_loss_clip": 1.02581692, + "balance_loss_mlp": 1.01637733, + "epoch": 0.6033368405230723, + "flos": 26832955939200.0, + "grad_norm": 1.9887457374360538, + "language_loss": 0.74281758, + "learning_rate": 1.4354648738870004e-06, + "loss": 0.76355827, + "num_input_tokens_seen": 216115905, + "step": 10035, + "time_per_iteration": 3.166633129119873 + }, + { + "auxiliary_loss_clip": 0.01036263, + "auxiliary_loss_mlp": 0.01023358, + "balance_loss_clip": 1.022614, + "balance_loss_mlp": 1.01299858, + "epoch": 0.6033969637757403, + "flos": 16909904499840.0, + "grad_norm": 1.577768800747712, + "language_loss": 0.86709362, + "learning_rate": 1.435091260090536e-06, + "loss": 0.88768983, + "num_input_tokens_seen": 216132420, + "step": 10036, + "time_per_iteration": 3.0160045623779297 + }, + { + "auxiliary_loss_clip": 0.01035679, + "auxiliary_loss_mlp": 0.01029543, + "balance_loss_clip": 1.02619898, + "balance_loss_mlp": 1.01837277, + "epoch": 0.6034570870284083, + "flos": 22930076787840.0, + "grad_norm": 1.9036494503480559, + "language_loss": 0.70385396, + "learning_rate": 1.4347176677154676e-06, + "loss": 0.72450614, + "num_input_tokens_seen": 216149800, + "step": 10037, + "time_per_iteration": 2.982374906539917 + }, + { + "auxiliary_loss_clip": 0.01046708, + "auxiliary_loss_mlp": 0.01031891, + "balance_loss_clip": 1.0236094, + "balance_loss_mlp": 1.02017856, + "epoch": 0.6035172102810762, + "flos": 23366319465600.0, + "grad_norm": 2.6845040577225525, + "language_loss": 0.85111272, + "learning_rate": 1.4343440967759616e-06, + "loss": 0.87189877, + "num_input_tokens_seen": 216168200, + "step": 10038, + "time_per_iteration": 2.785508871078491 + }, + { + "auxiliary_loss_clip": 0.01044787, + "auxiliary_loss_mlp": 0.01028818, + "balance_loss_clip": 1.02468264, + "balance_loss_mlp": 1.01797652, + "epoch": 0.6035773335337442, + "flos": 20887082933760.0, + "grad_norm": 2.176007533614456, + "language_loss": 0.75934517, + "learning_rate": 1.4339705472861846e-06, + "loss": 0.78008121, + "num_input_tokens_seen": 216187105, + "step": 10039, + "time_per_iteration": 2.9239673614501953 + }, + { + "auxiliary_loss_clip": 0.01052663, + "auxiliary_loss_mlp": 0.01027042, + "balance_loss_clip": 1.02419329, + "balance_loss_mlp": 1.01652765, + "epoch": 0.6036374567864121, + "flos": 24936298093440.0, + "grad_norm": 1.7483155432052764, + "language_loss": 0.71045041, + "learning_rate": 1.433597019260301e-06, + "loss": 0.73124748, + "num_input_tokens_seen": 216205440, + "step": 10040, + "time_per_iteration": 3.0417017936706543 + }, + { + "auxiliary_loss_clip": 0.01057505, + "auxiliary_loss_mlp": 0.01028026, + "balance_loss_clip": 1.02562332, + "balance_loss_mlp": 1.0154494, + "epoch": 0.6036975800390801, + "flos": 23148269953920.0, + "grad_norm": 2.4504601667714407, + "language_loss": 0.78517473, + "learning_rate": 1.433223512712475e-06, + "loss": 0.80603004, + "num_input_tokens_seen": 216223130, + "step": 10041, + "time_per_iteration": 2.820152759552002 + }, + { + "auxiliary_loss_clip": 0.01046015, + "auxiliary_loss_mlp": 0.01027962, + "balance_loss_clip": 1.02598071, + "balance_loss_mlp": 1.01718557, + "epoch": 0.603757703291748, + "flos": 18660729127680.0, + "grad_norm": 1.7233075140071057, + "language_loss": 0.75371623, + "learning_rate": 1.4328500276568704e-06, + "loss": 0.77445602, + "num_input_tokens_seen": 216240260, + "step": 10042, + "time_per_iteration": 2.813060760498047 + }, + { + "auxiliary_loss_clip": 0.01015715, + "auxiliary_loss_mlp": 0.01022426, + "balance_loss_clip": 1.02065325, + "balance_loss_mlp": 1.01203656, + "epoch": 0.6038178265444161, + "flos": 19682603147520.0, + "grad_norm": 1.809398537129184, + "language_loss": 0.84680569, + "learning_rate": 1.4324765641076498e-06, + "loss": 0.86718708, + "num_input_tokens_seen": 216258510, + "step": 10043, + "time_per_iteration": 2.9110050201416016 + }, + { + "auxiliary_loss_clip": 0.01028328, + "auxiliary_loss_mlp": 0.01032226, + "balance_loss_clip": 1.02125704, + "balance_loss_mlp": 1.01967931, + "epoch": 0.603877949797084, + "flos": 22638230784000.0, + "grad_norm": 1.803760485718034, + "language_loss": 0.69401443, + "learning_rate": 1.432103122078974e-06, + "loss": 0.71461999, + "num_input_tokens_seen": 216277550, + "step": 10044, + "time_per_iteration": 4.415853261947632 + }, + { + "auxiliary_loss_clip": 0.01058024, + "auxiliary_loss_mlp": 0.01028902, + "balance_loss_clip": 1.0270623, + "balance_loss_mlp": 1.01732111, + "epoch": 0.603938073049752, + "flos": 25447881548160.0, + "grad_norm": 1.6757627413192695, + "language_loss": 0.78087735, + "learning_rate": 1.4317297015850057e-06, + "loss": 0.80174661, + "num_input_tokens_seen": 216296690, + "step": 10045, + "time_per_iteration": 2.7210328578948975 + }, + { + "auxiliary_loss_clip": 0.01022209, + "auxiliary_loss_mlp": 0.01029476, + "balance_loss_clip": 1.03012693, + "balance_loss_mlp": 1.01837146, + "epoch": 0.6039981963024199, + "flos": 22340135813760.0, + "grad_norm": 1.6067957152919388, + "language_loss": 0.77180052, + "learning_rate": 1.4313563026399036e-06, + "loss": 0.79231739, + "num_input_tokens_seen": 216316110, + "step": 10046, + "time_per_iteration": 2.953242540359497 + }, + { + "auxiliary_loss_clip": 0.01006384, + "auxiliary_loss_mlp": 0.01034014, + "balance_loss_clip": 1.01936734, + "balance_loss_mlp": 1.02267718, + "epoch": 0.6040583195550879, + "flos": 20703148364160.0, + "grad_norm": 5.240626860090762, + "language_loss": 0.87072057, + "learning_rate": 1.430982925257827e-06, + "loss": 0.89112455, + "num_input_tokens_seen": 216333855, + "step": 10047, + "time_per_iteration": 2.8649981021881104 + }, + { + "auxiliary_loss_clip": 0.01054825, + "auxiliary_loss_mlp": 0.01025637, + "balance_loss_clip": 1.0268389, + "balance_loss_mlp": 1.01527786, + "epoch": 0.604118442807756, + "flos": 27163118776320.0, + "grad_norm": 1.4335292241632627, + "language_loss": 0.75601768, + "learning_rate": 1.4306095694529358e-06, + "loss": 0.77682227, + "num_input_tokens_seen": 216354890, + "step": 10048, + "time_per_iteration": 2.898664712905884 + }, + { + "auxiliary_loss_clip": 0.01052791, + "auxiliary_loss_mlp": 0.01035946, + "balance_loss_clip": 1.02568805, + "balance_loss_mlp": 1.02214789, + "epoch": 0.6041785660604239, + "flos": 30881524654080.0, + "grad_norm": 2.929578363370876, + "language_loss": 0.66248775, + "learning_rate": 1.430236235239386e-06, + "loss": 0.68337512, + "num_input_tokens_seen": 216376055, + "step": 10049, + "time_per_iteration": 2.8329567909240723 + }, + { + "auxiliary_loss_clip": 0.01035061, + "auxiliary_loss_mlp": 0.01031931, + "balance_loss_clip": 1.02149653, + "balance_loss_mlp": 1.01989067, + "epoch": 0.6042386893130919, + "flos": 19938215306880.0, + "grad_norm": 1.4372444570442742, + "language_loss": 0.6640681, + "learning_rate": 1.429862922631336e-06, + "loss": 0.68473804, + "num_input_tokens_seen": 216396295, + "step": 10050, + "time_per_iteration": 2.860780715942383 + }, + { + "auxiliary_loss_clip": 0.01032525, + "auxiliary_loss_mlp": 0.01030468, + "balance_loss_clip": 1.02421141, + "balance_loss_mlp": 1.02013206, + "epoch": 0.6042988125657598, + "flos": 32415915882240.0, + "grad_norm": 1.9876913278597, + "language_loss": 0.69733804, + "learning_rate": 1.4294896316429408e-06, + "loss": 0.71796799, + "num_input_tokens_seen": 216416605, + "step": 10051, + "time_per_iteration": 2.9498367309570312 + }, + { + "auxiliary_loss_clip": 0.01051014, + "auxiliary_loss_mlp": 0.01026678, + "balance_loss_clip": 1.02263927, + "balance_loss_mlp": 1.01587188, + "epoch": 0.6043589358184278, + "flos": 17420805596160.0, + "grad_norm": 1.9020444045430611, + "language_loss": 0.64836848, + "learning_rate": 1.4291163622883553e-06, + "loss": 0.66914535, + "num_input_tokens_seen": 216435130, + "step": 10052, + "time_per_iteration": 2.7642765045166016 + }, + { + "auxiliary_loss_clip": 0.01043477, + "auxiliary_loss_mlp": 0.0102919, + "balance_loss_clip": 1.02468705, + "balance_loss_mlp": 1.01768041, + "epoch": 0.6044190590710957, + "flos": 27672834723840.0, + "grad_norm": 1.593086838055544, + "language_loss": 0.68852931, + "learning_rate": 1.4287431145817358e-06, + "loss": 0.70925593, + "num_input_tokens_seen": 216455640, + "step": 10053, + "time_per_iteration": 2.7213058471679688 + }, + { + "auxiliary_loss_clip": 0.00990427, + "auxiliary_loss_mlp": 0.01003291, + "balance_loss_clip": 1.00352359, + "balance_loss_mlp": 1.00202179, + "epoch": 0.6044791823237637, + "flos": 65316267515520.0, + "grad_norm": 0.7351975560928369, + "language_loss": 0.60436988, + "learning_rate": 1.4283698885372336e-06, + "loss": 0.62430704, + "num_input_tokens_seen": 216518130, + "step": 10054, + "time_per_iteration": 3.3567299842834473 + }, + { + "auxiliary_loss_clip": 0.01012179, + "auxiliary_loss_mlp": 0.0102612, + "balance_loss_clip": 1.02381754, + "balance_loss_mlp": 1.01506889, + "epoch": 0.6045393055764317, + "flos": 24492369905280.0, + "grad_norm": 5.044041690657327, + "language_loss": 0.85593987, + "learning_rate": 1.4279966841690027e-06, + "loss": 0.87632287, + "num_input_tokens_seen": 216536845, + "step": 10055, + "time_per_iteration": 2.8755805492401123 + }, + { + "auxiliary_loss_clip": 0.01043729, + "auxiliary_loss_mlp": 0.01039222, + "balance_loss_clip": 1.02710247, + "balance_loss_mlp": 1.02646637, + "epoch": 0.6045994288290997, + "flos": 19054345340160.0, + "grad_norm": 2.9981107623798304, + "language_loss": 0.73428947, + "learning_rate": 1.4276235014911952e-06, + "loss": 0.75511897, + "num_input_tokens_seen": 216551860, + "step": 10056, + "time_per_iteration": 4.523420572280884 + }, + { + "auxiliary_loss_clip": 0.01027871, + "auxiliary_loss_mlp": 0.01030739, + "balance_loss_clip": 1.02430272, + "balance_loss_mlp": 1.01926482, + "epoch": 0.6046595520817676, + "flos": 26576697335040.0, + "grad_norm": 1.5258346765112472, + "language_loss": 0.80444956, + "learning_rate": 1.4272503405179616e-06, + "loss": 0.82503569, + "num_input_tokens_seen": 216574775, + "step": 10057, + "time_per_iteration": 2.9927291870117188 + }, + { + "auxiliary_loss_clip": 0.01062132, + "auxiliary_loss_mlp": 0.00747529, + "balance_loss_clip": 1.02436519, + "balance_loss_mlp": 1.00054407, + "epoch": 0.6047196753344356, + "flos": 13582277660160.0, + "grad_norm": 2.370943002206788, + "language_loss": 0.75224555, + "learning_rate": 1.4268772012634527e-06, + "loss": 0.77034211, + "num_input_tokens_seen": 216590100, + "step": 10058, + "time_per_iteration": 2.67240309715271 + }, + { + "auxiliary_loss_clip": 0.01051912, + "auxiliary_loss_mlp": 0.01027255, + "balance_loss_clip": 1.02403474, + "balance_loss_mlp": 1.01642525, + "epoch": 0.6047797985871035, + "flos": 25520456977920.0, + "grad_norm": 2.306325144278245, + "language_loss": 0.71505076, + "learning_rate": 1.4265040837418176e-06, + "loss": 0.73584247, + "num_input_tokens_seen": 216610145, + "step": 10059, + "time_per_iteration": 2.868549108505249 + }, + { + "auxiliary_loss_clip": 0.01041897, + "auxiliary_loss_mlp": 0.01024528, + "balance_loss_clip": 1.02351654, + "balance_loss_mlp": 1.01406777, + "epoch": 0.6048399218397715, + "flos": 20520147548160.0, + "grad_norm": 1.518000648994571, + "language_loss": 0.76172292, + "learning_rate": 1.4261309879672054e-06, + "loss": 0.78238714, + "num_input_tokens_seen": 216630625, + "step": 10060, + "time_per_iteration": 2.7029619216918945 + }, + { + "auxiliary_loss_clip": 0.01052036, + "auxiliary_loss_mlp": 0.01030235, + "balance_loss_clip": 1.02408814, + "balance_loss_mlp": 1.01940441, + "epoch": 0.6049000450924396, + "flos": 20408788408320.0, + "grad_norm": 2.0925930857698303, + "language_loss": 0.73536938, + "learning_rate": 1.4257579139537628e-06, + "loss": 0.75619215, + "num_input_tokens_seen": 216649255, + "step": 10061, + "time_per_iteration": 2.61384916305542 + }, + { + "auxiliary_loss_clip": 0.01021772, + "auxiliary_loss_mlp": 0.00747751, + "balance_loss_clip": 1.02355266, + "balance_loss_mlp": 1.00062561, + "epoch": 0.6049601683451075, + "flos": 20741357456640.0, + "grad_norm": 1.9101555812618898, + "language_loss": 0.67629075, + "learning_rate": 1.425384861715639e-06, + "loss": 0.69398594, + "num_input_tokens_seen": 216668100, + "step": 10062, + "time_per_iteration": 2.7310030460357666 + }, + { + "auxiliary_loss_clip": 0.01044837, + "auxiliary_loss_mlp": 0.01032269, + "balance_loss_clip": 1.02280045, + "balance_loss_mlp": 1.02103949, + "epoch": 0.6050202915977755, + "flos": 20083114771200.0, + "grad_norm": 2.1046069722224185, + "language_loss": 0.71661818, + "learning_rate": 1.425011831266978e-06, + "loss": 0.73738921, + "num_input_tokens_seen": 216686125, + "step": 10063, + "time_per_iteration": 2.5947399139404297 + }, + { + "auxiliary_loss_clip": 0.01060297, + "auxiliary_loss_mlp": 0.01030552, + "balance_loss_clip": 1.02316546, + "balance_loss_mlp": 1.02008498, + "epoch": 0.6050804148504434, + "flos": 15960821391360.0, + "grad_norm": 1.6200490208610143, + "language_loss": 0.84917319, + "learning_rate": 1.424638822621926e-06, + "loss": 0.87008166, + "num_input_tokens_seen": 216704265, + "step": 10064, + "time_per_iteration": 4.135560989379883 + }, + { + "auxiliary_loss_clip": 0.0105463, + "auxiliary_loss_mlp": 0.01027534, + "balance_loss_clip": 1.02553535, + "balance_loss_mlp": 1.01659107, + "epoch": 0.6051405381031114, + "flos": 17456644391040.0, + "grad_norm": 2.73932231650446, + "language_loss": 0.79511058, + "learning_rate": 1.4242658357946278e-06, + "loss": 0.81593221, + "num_input_tokens_seen": 216721765, + "step": 10065, + "time_per_iteration": 2.635103702545166 + }, + { + "auxiliary_loss_clip": 0.01026592, + "auxiliary_loss_mlp": 0.01031706, + "balance_loss_clip": 1.0264343, + "balance_loss_mlp": 1.01897478, + "epoch": 0.6052006613557793, + "flos": 11400130517760.0, + "grad_norm": 2.562399205075833, + "language_loss": 0.78354204, + "learning_rate": 1.423892870799226e-06, + "loss": 0.80412507, + "num_input_tokens_seen": 216738295, + "step": 10066, + "time_per_iteration": 2.6548948287963867 + }, + { + "auxiliary_loss_clip": 0.01005632, + "auxiliary_loss_mlp": 0.01026034, + "balance_loss_clip": 1.02524829, + "balance_loss_mlp": 1.01491165, + "epoch": 0.6052607846084473, + "flos": 24750998807040.0, + "grad_norm": 1.820122452493257, + "language_loss": 0.73153031, + "learning_rate": 1.4235199276498655e-06, + "loss": 0.75184697, + "num_input_tokens_seen": 216759875, + "step": 10067, + "time_per_iteration": 2.784938097000122 + }, + { + "auxiliary_loss_clip": 0.01043563, + "auxiliary_loss_mlp": 0.0074747, + "balance_loss_clip": 1.02493668, + "balance_loss_mlp": 1.00053966, + "epoch": 0.6053209078611153, + "flos": 20741141975040.0, + "grad_norm": 1.3684162315577948, + "language_loss": 0.68938088, + "learning_rate": 1.4231470063606863e-06, + "loss": 0.70729125, + "num_input_tokens_seen": 216780705, + "step": 10068, + "time_per_iteration": 2.757995367050171 + }, + { + "auxiliary_loss_clip": 0.01055301, + "auxiliary_loss_mlp": 0.01028729, + "balance_loss_clip": 1.02546763, + "balance_loss_mlp": 1.01742244, + "epoch": 0.6053810311137833, + "flos": 18953149749120.0, + "grad_norm": 1.7764775827660868, + "language_loss": 0.86907673, + "learning_rate": 1.4227741069458303e-06, + "loss": 0.88991702, + "num_input_tokens_seen": 216797625, + "step": 10069, + "time_per_iteration": 2.754791736602783 + }, + { + "auxiliary_loss_clip": 0.01031503, + "auxiliary_loss_mlp": 0.01025563, + "balance_loss_clip": 1.02222776, + "balance_loss_mlp": 1.01522195, + "epoch": 0.6054411543664512, + "flos": 23951124794880.0, + "grad_norm": 2.4689831473493813, + "language_loss": 0.83092076, + "learning_rate": 1.4224012294194387e-06, + "loss": 0.85149145, + "num_input_tokens_seen": 216817610, + "step": 10070, + "time_per_iteration": 2.821223020553589 + }, + { + "auxiliary_loss_clip": 0.01048217, + "auxiliary_loss_mlp": 0.01029882, + "balance_loss_clip": 1.02675951, + "balance_loss_mlp": 1.01885509, + "epoch": 0.6055012776191192, + "flos": 20593979953920.0, + "grad_norm": 1.608325924644788, + "language_loss": 0.85976839, + "learning_rate": 1.4220283737956496e-06, + "loss": 0.88054937, + "num_input_tokens_seen": 216836835, + "step": 10071, + "time_per_iteration": 2.9044318199157715 + }, + { + "auxiliary_loss_clip": 0.01059028, + "auxiliary_loss_mlp": 0.01032338, + "balance_loss_clip": 1.02726102, + "balance_loss_mlp": 1.02000618, + "epoch": 0.6055614008717871, + "flos": 30298191782400.0, + "grad_norm": 1.8158971133820285, + "language_loss": 0.76972663, + "learning_rate": 1.421655540088603e-06, + "loss": 0.79064029, + "num_input_tokens_seen": 216856760, + "step": 10072, + "time_per_iteration": 2.7906696796417236 + }, + { + "auxiliary_loss_clip": 0.01041953, + "auxiliary_loss_mlp": 0.0102739, + "balance_loss_clip": 1.02299058, + "balance_loss_mlp": 1.01537406, + "epoch": 0.6056215241244551, + "flos": 27125017424640.0, + "grad_norm": 1.7186001611264212, + "language_loss": 0.74469692, + "learning_rate": 1.4212827283124367e-06, + "loss": 0.7653904, + "num_input_tokens_seen": 216878795, + "step": 10073, + "time_per_iteration": 2.7304365634918213 + }, + { + "auxiliary_loss_clip": 0.00970607, + "auxiliary_loss_mlp": 0.01001713, + "balance_loss_clip": 1.00381982, + "balance_loss_mlp": 1.00040174, + "epoch": 0.6056816473771232, + "flos": 56007323925120.0, + "grad_norm": 0.7561182914260192, + "language_loss": 0.55212939, + "learning_rate": 1.4209099384812863e-06, + "loss": 0.57185268, + "num_input_tokens_seen": 216937800, + "step": 10074, + "time_per_iteration": 3.3588802814483643 + }, + { + "auxiliary_loss_clip": 0.01028515, + "auxiliary_loss_mlp": 0.01035397, + "balance_loss_clip": 1.02450204, + "balance_loss_mlp": 1.02307665, + "epoch": 0.6057417706297911, + "flos": 23549499849600.0, + "grad_norm": 2.1389375944849514, + "language_loss": 0.81738997, + "learning_rate": 1.4205371706092894e-06, + "loss": 0.83802909, + "num_input_tokens_seen": 216955280, + "step": 10075, + "time_per_iteration": 2.830542802810669 + }, + { + "auxiliary_loss_clip": 0.01054123, + "auxiliary_loss_mlp": 0.01022766, + "balance_loss_clip": 1.02417302, + "balance_loss_mlp": 1.01139951, + "epoch": 0.6058018938824591, + "flos": 27744296832000.0, + "grad_norm": 1.80056056548091, + "language_loss": 0.78042841, + "learning_rate": 1.4201644247105813e-06, + "loss": 0.80119729, + "num_input_tokens_seen": 216976950, + "step": 10076, + "time_per_iteration": 2.768998861312866 + }, + { + "auxiliary_loss_clip": 0.01050011, + "auxiliary_loss_mlp": 0.01030357, + "balance_loss_clip": 1.02148342, + "balance_loss_mlp": 1.01833439, + "epoch": 0.605862017135127, + "flos": 22783381643520.0, + "grad_norm": 1.5998491895083675, + "language_loss": 0.71798867, + "learning_rate": 1.4197917007992964e-06, + "loss": 0.73879236, + "num_input_tokens_seen": 216996945, + "step": 10077, + "time_per_iteration": 2.6910300254821777 + }, + { + "auxiliary_loss_clip": 0.01065292, + "auxiliary_loss_mlp": 0.01032722, + "balance_loss_clip": 1.02492344, + "balance_loss_mlp": 1.021016, + "epoch": 0.605922140387795, + "flos": 21215019127680.0, + "grad_norm": 1.497681547604468, + "language_loss": 0.54832911, + "learning_rate": 1.4194189988895682e-06, + "loss": 0.56930929, + "num_input_tokens_seen": 217016580, + "step": 10078, + "time_per_iteration": 2.6594560146331787 + }, + { + "auxiliary_loss_clip": 0.0102172, + "auxiliary_loss_mlp": 0.01031888, + "balance_loss_clip": 1.02273703, + "balance_loss_mlp": 1.02021718, + "epoch": 0.6059822636404629, + "flos": 27268372604160.0, + "grad_norm": 1.6075044201000872, + "language_loss": 0.7025283, + "learning_rate": 1.4190463189955297e-06, + "loss": 0.72306436, + "num_input_tokens_seen": 217037300, + "step": 10079, + "time_per_iteration": 2.8542537689208984 + }, + { + "auxiliary_loss_clip": 0.01042815, + "auxiliary_loss_mlp": 0.01033715, + "balance_loss_clip": 1.02404165, + "balance_loss_mlp": 1.02264082, + "epoch": 0.606042386893131, + "flos": 20631327120000.0, + "grad_norm": 1.8263485255810477, + "language_loss": 0.62233996, + "learning_rate": 1.4186736611313131e-06, + "loss": 0.64310527, + "num_input_tokens_seen": 217055805, + "step": 10080, + "time_per_iteration": 2.768880844116211 + }, + { + "auxiliary_loss_clip": 0.01041933, + "auxiliary_loss_mlp": 0.01027851, + "balance_loss_clip": 1.02380919, + "balance_loss_mlp": 1.01557231, + "epoch": 0.6061025101457989, + "flos": 23002293081600.0, + "grad_norm": 1.7246327539076212, + "language_loss": 0.71378344, + "learning_rate": 1.4183010253110492e-06, + "loss": 0.73448128, + "num_input_tokens_seen": 217074175, + "step": 10081, + "time_per_iteration": 2.791686534881592 + }, + { + "auxiliary_loss_clip": 0.01045044, + "auxiliary_loss_mlp": 0.01028714, + "balance_loss_clip": 1.02636623, + "balance_loss_mlp": 1.01750207, + "epoch": 0.6061626333984669, + "flos": 29898937134720.0, + "grad_norm": 2.085254032272721, + "language_loss": 0.69374275, + "learning_rate": 1.4179284115488691e-06, + "loss": 0.7144804, + "num_input_tokens_seen": 217095695, + "step": 10082, + "time_per_iteration": 2.828002452850342 + }, + { + "auxiliary_loss_clip": 0.0106378, + "auxiliary_loss_mlp": 0.0103102, + "balance_loss_clip": 1.02617526, + "balance_loss_mlp": 1.01996326, + "epoch": 0.6062227566511348, + "flos": 25009196745600.0, + "grad_norm": 1.3253138115191727, + "language_loss": 0.65740603, + "learning_rate": 1.4175558198589015e-06, + "loss": 0.67835402, + "num_input_tokens_seen": 217116260, + "step": 10083, + "time_per_iteration": 2.6632444858551025 + }, + { + "auxiliary_loss_clip": 0.01052453, + "auxiliary_loss_mlp": 0.01026022, + "balance_loss_clip": 1.02389503, + "balance_loss_mlp": 1.01481009, + "epoch": 0.6062828799038028, + "flos": 19463943104640.0, + "grad_norm": 2.2798666246099097, + "language_loss": 0.73830223, + "learning_rate": 1.4171832502552764e-06, + "loss": 0.75908697, + "num_input_tokens_seen": 217134465, + "step": 10084, + "time_per_iteration": 2.783360242843628 + }, + { + "auxiliary_loss_clip": 0.01042607, + "auxiliary_loss_mlp": 0.01031461, + "balance_loss_clip": 1.02303624, + "balance_loss_mlp": 1.01998127, + "epoch": 0.6063430031564707, + "flos": 13589568120960.0, + "grad_norm": 2.2956526367612886, + "language_loss": 0.72516418, + "learning_rate": 1.4168107027521204e-06, + "loss": 0.7459048, + "num_input_tokens_seen": 217149920, + "step": 10085, + "time_per_iteration": 2.69150710105896 + }, + { + "auxiliary_loss_clip": 0.01063145, + "auxiliary_loss_mlp": 0.01033851, + "balance_loss_clip": 1.02500486, + "balance_loss_mlp": 1.02337229, + "epoch": 0.6064031264091387, + "flos": 23255499029760.0, + "grad_norm": 2.510563095839501, + "language_loss": 0.7616787, + "learning_rate": 1.4164381773635605e-06, + "loss": 0.78264868, + "num_input_tokens_seen": 217168165, + "step": 10086, + "time_per_iteration": 2.5654027462005615 + }, + { + "auxiliary_loss_clip": 0.01033152, + "auxiliary_loss_mlp": 0.01031811, + "balance_loss_clip": 1.02231812, + "balance_loss_mlp": 1.02043247, + "epoch": 0.6064632496618068, + "flos": 22458462192000.0, + "grad_norm": 1.3539489224328518, + "language_loss": 0.72645628, + "learning_rate": 1.4160656741037246e-06, + "loss": 0.74710596, + "num_input_tokens_seen": 217190070, + "step": 10087, + "time_per_iteration": 2.768044948577881 + }, + { + "auxiliary_loss_clip": 0.0105181, + "auxiliary_loss_mlp": 0.01029572, + "balance_loss_clip": 1.02510393, + "balance_loss_mlp": 1.02006459, + "epoch": 0.6065233729144747, + "flos": 25118652464640.0, + "grad_norm": 1.8349772359423284, + "language_loss": 0.83912933, + "learning_rate": 1.4156931929867355e-06, + "loss": 0.85994315, + "num_input_tokens_seen": 217209370, + "step": 10088, + "time_per_iteration": 2.8938968181610107 + }, + { + "auxiliary_loss_clip": 0.01012771, + "auxiliary_loss_mlp": 0.00747641, + "balance_loss_clip": 1.02117419, + "balance_loss_mlp": 1.00051916, + "epoch": 0.6065834961671427, + "flos": 23477355383040.0, + "grad_norm": 1.9928414367652918, + "language_loss": 0.71043062, + "learning_rate": 1.4153207340267201e-06, + "loss": 0.72803473, + "num_input_tokens_seen": 217226990, + "step": 10089, + "time_per_iteration": 2.8560662269592285 + }, + { + "auxiliary_loss_clip": 0.01054328, + "auxiliary_loss_mlp": 0.01032498, + "balance_loss_clip": 1.0257659, + "balance_loss_mlp": 1.02198982, + "epoch": 0.6066436194198106, + "flos": 17019396132480.0, + "grad_norm": 1.8821177485066514, + "language_loss": 0.82599998, + "learning_rate": 1.4149482972378009e-06, + "loss": 0.84686828, + "num_input_tokens_seen": 217244585, + "step": 10090, + "time_per_iteration": 2.7057275772094727 + }, + { + "auxiliary_loss_clip": 0.01041084, + "auxiliary_loss_mlp": 0.01038564, + "balance_loss_clip": 1.02806377, + "balance_loss_mlp": 1.02579081, + "epoch": 0.6067037426724786, + "flos": 18514752255360.0, + "grad_norm": 2.027430243896291, + "language_loss": 0.75899422, + "learning_rate": 1.4145758826341e-06, + "loss": 0.77979076, + "num_input_tokens_seen": 217263435, + "step": 10091, + "time_per_iteration": 5.919898748397827 + }, + { + "auxiliary_loss_clip": 0.01061492, + "auxiliary_loss_mlp": 0.01030398, + "balance_loss_clip": 1.02419734, + "balance_loss_mlp": 1.01956236, + "epoch": 0.6067638659251465, + "flos": 22345989730560.0, + "grad_norm": 1.596595380529648, + "language_loss": 0.79492766, + "learning_rate": 1.4142034902297415e-06, + "loss": 0.81584656, + "num_input_tokens_seen": 217283725, + "step": 10092, + "time_per_iteration": 2.6102511882781982 + }, + { + "auxiliary_loss_clip": 0.01044948, + "auxiliary_loss_mlp": 0.01034379, + "balance_loss_clip": 1.0246582, + "balance_loss_mlp": 1.02335215, + "epoch": 0.6068239891778145, + "flos": 12451019748480.0, + "grad_norm": 1.6475248252211168, + "language_loss": 0.75393617, + "learning_rate": 1.4138311200388444e-06, + "loss": 0.77472949, + "num_input_tokens_seen": 217301120, + "step": 10093, + "time_per_iteration": 2.7956483364105225 + }, + { + "auxiliary_loss_clip": 0.01036848, + "auxiliary_loss_mlp": 0.01034169, + "balance_loss_clip": 1.02395558, + "balance_loss_mlp": 1.02283168, + "epoch": 0.6068841124304825, + "flos": 23185868515200.0, + "grad_norm": 1.7097508968681565, + "language_loss": 0.87737179, + "learning_rate": 1.4134587720755304e-06, + "loss": 0.89808196, + "num_input_tokens_seen": 217319585, + "step": 10094, + "time_per_iteration": 2.717292308807373 + }, + { + "auxiliary_loss_clip": 0.01055139, + "auxiliary_loss_mlp": 0.01025381, + "balance_loss_clip": 1.02584136, + "balance_loss_mlp": 1.01427042, + "epoch": 0.6069442356831505, + "flos": 18587902302720.0, + "grad_norm": 1.5430087873393472, + "language_loss": 0.72295344, + "learning_rate": 1.413086446353919e-06, + "loss": 0.74375856, + "num_input_tokens_seen": 217338880, + "step": 10095, + "time_per_iteration": 2.6664180755615234 + }, + { + "auxiliary_loss_clip": 0.01035996, + "auxiliary_loss_mlp": 0.01025442, + "balance_loss_clip": 1.02182662, + "balance_loss_mlp": 1.01502323, + "epoch": 0.6070043589358184, + "flos": 20960340721920.0, + "grad_norm": 1.6119599267116054, + "language_loss": 0.76392692, + "learning_rate": 1.4127141428881273e-06, + "loss": 0.78454125, + "num_input_tokens_seen": 217357480, + "step": 10096, + "time_per_iteration": 2.71600604057312 + }, + { + "auxiliary_loss_clip": 0.01064406, + "auxiliary_loss_mlp": 0.01031796, + "balance_loss_clip": 1.02502906, + "balance_loss_mlp": 1.02140737, + "epoch": 0.6070644821884864, + "flos": 11692443398400.0, + "grad_norm": 3.2836209021447873, + "language_loss": 0.79333651, + "learning_rate": 1.4123418616922749e-06, + "loss": 0.81429857, + "num_input_tokens_seen": 217374575, + "step": 10097, + "time_per_iteration": 2.6317245960235596 + }, + { + "auxiliary_loss_clip": 0.01046414, + "auxiliary_loss_mlp": 0.01025831, + "balance_loss_clip": 1.02794671, + "balance_loss_mlp": 1.01538849, + "epoch": 0.6071246054411543, + "flos": 19310568030720.0, + "grad_norm": 1.3810553185280656, + "language_loss": 0.66810465, + "learning_rate": 1.411969602780478e-06, + "loss": 0.68882716, + "num_input_tokens_seen": 217392950, + "step": 10098, + "time_per_iteration": 3.0471489429473877 + }, + { + "auxiliary_loss_clip": 0.01062887, + "auxiliary_loss_mlp": 0.01025349, + "balance_loss_clip": 1.02451909, + "balance_loss_mlp": 1.0153656, + "epoch": 0.6071847286938223, + "flos": 17749029098880.0, + "grad_norm": 1.7542291674681618, + "language_loss": 0.80981237, + "learning_rate": 1.4115973661668523e-06, + "loss": 0.83069468, + "num_input_tokens_seen": 217412145, + "step": 10099, + "time_per_iteration": 2.6238479614257812 + }, + { + "auxiliary_loss_clip": 0.01035069, + "auxiliary_loss_mlp": 0.01030204, + "balance_loss_clip": 1.0229094, + "balance_loss_mlp": 1.01909924, + "epoch": 0.6072448519464904, + "flos": 22637512512000.0, + "grad_norm": 2.051506313973979, + "language_loss": 0.70517653, + "learning_rate": 1.4112251518655133e-06, + "loss": 0.72582924, + "num_input_tokens_seen": 217432080, + "step": 10100, + "time_per_iteration": 2.8390450477600098 + }, + { + "auxiliary_loss_clip": 0.01031442, + "auxiliary_loss_mlp": 0.01033174, + "balance_loss_clip": 1.02471066, + "balance_loss_mlp": 1.02114582, + "epoch": 0.6073049751991583, + "flos": 19537308633600.0, + "grad_norm": 1.8288512443452267, + "language_loss": 0.70525396, + "learning_rate": 1.4108529598905764e-06, + "loss": 0.72590011, + "num_input_tokens_seen": 217450945, + "step": 10101, + "time_per_iteration": 2.6524345874786377 + }, + { + "auxiliary_loss_clip": 0.01037007, + "auxiliary_loss_mlp": 0.01023643, + "balance_loss_clip": 1.02185631, + "balance_loss_mlp": 1.0129981, + "epoch": 0.6073650984518263, + "flos": 28294233033600.0, + "grad_norm": 1.6420807141523424, + "language_loss": 0.69633263, + "learning_rate": 1.410480790256154e-06, + "loss": 0.71693915, + "num_input_tokens_seen": 217473105, + "step": 10102, + "time_per_iteration": 2.6943278312683105 + }, + { + "auxiliary_loss_clip": 0.01064374, + "auxiliary_loss_mlp": 0.01029771, + "balance_loss_clip": 1.0256232, + "balance_loss_mlp": 1.01961446, + "epoch": 0.6074252217044942, + "flos": 25664422688640.0, + "grad_norm": 1.9621501877996694, + "language_loss": 0.73618519, + "learning_rate": 1.4101086429763589e-06, + "loss": 0.75712663, + "num_input_tokens_seen": 217491780, + "step": 10103, + "time_per_iteration": 4.304890871047974 + }, + { + "auxiliary_loss_clip": 0.01032393, + "auxiliary_loss_mlp": 0.01031599, + "balance_loss_clip": 1.0283885, + "balance_loss_mlp": 1.01935601, + "epoch": 0.6074853449571622, + "flos": 22857106308480.0, + "grad_norm": 6.800847982597521, + "language_loss": 0.76678324, + "learning_rate": 1.4097365180653032e-06, + "loss": 0.78742319, + "num_input_tokens_seen": 217510605, + "step": 10104, + "time_per_iteration": 2.689655303955078 + }, + { + "auxiliary_loss_clip": 0.00985215, + "auxiliary_loss_mlp": 0.01000995, + "balance_loss_clip": 1.00810552, + "balance_loss_mlp": 0.99983281, + "epoch": 0.6075454682098301, + "flos": 67111406547840.0, + "grad_norm": 0.7149249056909608, + "language_loss": 0.55987859, + "learning_rate": 1.4093644155370977e-06, + "loss": 0.57974064, + "num_input_tokens_seen": 217574815, + "step": 10105, + "time_per_iteration": 3.3321127891540527 + }, + { + "auxiliary_loss_clip": 0.01000161, + "auxiliary_loss_mlp": 0.01001599, + "balance_loss_clip": 1.00379181, + "balance_loss_mlp": 1.00037694, + "epoch": 0.6076055914624982, + "flos": 70712024751360.0, + "grad_norm": 0.7648963390748407, + "language_loss": 0.56878597, + "learning_rate": 1.4089923354058533e-06, + "loss": 0.58880353, + "num_input_tokens_seen": 217632375, + "step": 10106, + "time_per_iteration": 3.17399525642395 + }, + { + "auxiliary_loss_clip": 0.01011743, + "auxiliary_loss_mlp": 0.01036903, + "balance_loss_clip": 1.02104855, + "balance_loss_mlp": 1.02564955, + "epoch": 0.6076657147151661, + "flos": 28364545906560.0, + "grad_norm": 1.5130076252008897, + "language_loss": 0.68893754, + "learning_rate": 1.4086202776856784e-06, + "loss": 0.70942402, + "num_input_tokens_seen": 217653055, + "step": 10107, + "time_per_iteration": 2.7537262439727783 + }, + { + "auxiliary_loss_clip": 0.01050213, + "auxiliary_loss_mlp": 0.01026912, + "balance_loss_clip": 1.0234381, + "balance_loss_mlp": 1.0159204, + "epoch": 0.6077258379678341, + "flos": 15049767807360.0, + "grad_norm": 1.8234361626562035, + "language_loss": 0.80895388, + "learning_rate": 1.4082482423906815e-06, + "loss": 0.82972515, + "num_input_tokens_seen": 217671520, + "step": 10108, + "time_per_iteration": 2.5869839191436768 + }, + { + "auxiliary_loss_clip": 0.01038065, + "auxiliary_loss_mlp": 0.01029333, + "balance_loss_clip": 1.02347326, + "balance_loss_mlp": 1.01746595, + "epoch": 0.607785961220502, + "flos": 36167251553280.0, + "grad_norm": 1.852284237365599, + "language_loss": 0.71361631, + "learning_rate": 1.4078762295349714e-06, + "loss": 0.7342903, + "num_input_tokens_seen": 217691880, + "step": 10109, + "time_per_iteration": 2.78998064994812 + }, + { + "auxiliary_loss_clip": 0.01039814, + "auxiliary_loss_mlp": 0.01028305, + "balance_loss_clip": 1.0234139, + "balance_loss_mlp": 1.01838684, + "epoch": 0.60784608447317, + "flos": 22524249951360.0, + "grad_norm": 1.556887947988966, + "language_loss": 0.80248308, + "learning_rate": 1.407504239132653e-06, + "loss": 0.82316422, + "num_input_tokens_seen": 217710530, + "step": 10110, + "time_per_iteration": 2.666905641555786 + }, + { + "auxiliary_loss_clip": 0.01036212, + "auxiliary_loss_mlp": 0.01028704, + "balance_loss_clip": 1.02272236, + "balance_loss_mlp": 1.0170753, + "epoch": 0.6079062077258379, + "flos": 23841166285440.0, + "grad_norm": 2.207152406589817, + "language_loss": 0.70742214, + "learning_rate": 1.4071322711978338e-06, + "loss": 0.72807127, + "num_input_tokens_seen": 217728650, + "step": 10111, + "time_per_iteration": 4.357933521270752 + }, + { + "auxiliary_loss_clip": 0.01036063, + "auxiliary_loss_mlp": 0.01028618, + "balance_loss_clip": 1.02558613, + "balance_loss_mlp": 1.01751399, + "epoch": 0.6079663309785059, + "flos": 23367037737600.0, + "grad_norm": 2.1654184189911896, + "language_loss": 0.65076685, + "learning_rate": 1.4067603257446186e-06, + "loss": 0.67141366, + "num_input_tokens_seen": 217747135, + "step": 10112, + "time_per_iteration": 2.7503249645233154 + }, + { + "auxiliary_loss_clip": 0.0099944, + "auxiliary_loss_mlp": 0.01003135, + "balance_loss_clip": 1.00282645, + "balance_loss_mlp": 1.00202048, + "epoch": 0.6080264542311739, + "flos": 71382873110400.0, + "grad_norm": 0.9272420776387181, + "language_loss": 0.49562573, + "learning_rate": 1.4063884027871105e-06, + "loss": 0.51565146, + "num_input_tokens_seen": 217811860, + "step": 10113, + "time_per_iteration": 3.383913040161133 + }, + { + "auxiliary_loss_clip": 0.0099886, + "auxiliary_loss_mlp": 0.01001734, + "balance_loss_clip": 1.00216889, + "balance_loss_mlp": 1.00054765, + "epoch": 0.6080865774838419, + "flos": 66529833442560.0, + "grad_norm": 0.8369783158280113, + "language_loss": 0.57010633, + "learning_rate": 1.4060165023394147e-06, + "loss": 0.59011227, + "num_input_tokens_seen": 217866510, + "step": 10114, + "time_per_iteration": 3.1080260276794434 + }, + { + "auxiliary_loss_clip": 0.01066377, + "auxiliary_loss_mlp": 0.01024895, + "balance_loss_clip": 1.02571809, + "balance_loss_mlp": 1.01295006, + "epoch": 0.6081467007365099, + "flos": 19207935895680.0, + "grad_norm": 1.9577090129895725, + "language_loss": 0.69954354, + "learning_rate": 1.4056446244156317e-06, + "loss": 0.72045624, + "num_input_tokens_seen": 217885650, + "step": 10115, + "time_per_iteration": 2.6351571083068848 + }, + { + "auxiliary_loss_clip": 0.01022114, + "auxiliary_loss_mlp": 0.01027139, + "balance_loss_clip": 1.02117872, + "balance_loss_mlp": 1.0160172, + "epoch": 0.6082068239891778, + "flos": 24167737762560.0, + "grad_norm": 2.0053859931574154, + "language_loss": 0.72418416, + "learning_rate": 1.4052727690298642e-06, + "loss": 0.74467665, + "num_input_tokens_seen": 217905300, + "step": 10116, + "time_per_iteration": 2.781681537628174 + }, + { + "auxiliary_loss_clip": 0.01044316, + "auxiliary_loss_mlp": 0.01034521, + "balance_loss_clip": 1.02510357, + "balance_loss_mlp": 1.02195013, + "epoch": 0.6082669472418458, + "flos": 37413316310400.0, + "grad_norm": 1.7233219944531737, + "language_loss": 0.53699529, + "learning_rate": 1.4049009361962138e-06, + "loss": 0.5577836, + "num_input_tokens_seen": 217927845, + "step": 10117, + "time_per_iteration": 2.8478710651397705 + }, + { + "auxiliary_loss_clip": 0.01046835, + "auxiliary_loss_mlp": 0.01024567, + "balance_loss_clip": 1.02605176, + "balance_loss_mlp": 1.01405275, + "epoch": 0.6083270704945137, + "flos": 15085534775040.0, + "grad_norm": 1.8838989184131272, + "language_loss": 0.70054656, + "learning_rate": 1.4045291259287786e-06, + "loss": 0.72126055, + "num_input_tokens_seen": 217946145, + "step": 10118, + "time_per_iteration": 2.827573776245117 + }, + { + "auxiliary_loss_clip": 0.00998438, + "auxiliary_loss_mlp": 0.01028278, + "balance_loss_clip": 1.02155185, + "balance_loss_mlp": 1.01773369, + "epoch": 0.6083871937471818, + "flos": 20668458804480.0, + "grad_norm": 1.5119585144744354, + "language_loss": 0.74783516, + "learning_rate": 1.4041573382416588e-06, + "loss": 0.76810235, + "num_input_tokens_seen": 217965190, + "step": 10119, + "time_per_iteration": 2.825330972671509 + }, + { + "auxiliary_loss_clip": 0.01052739, + "auxiliary_loss_mlp": 0.01026752, + "balance_loss_clip": 1.02481508, + "balance_loss_mlp": 1.01636899, + "epoch": 0.6084473169998497, + "flos": 21506901045120.0, + "grad_norm": 1.7765695444219083, + "language_loss": 0.67390746, + "learning_rate": 1.4037855731489525e-06, + "loss": 0.69470233, + "num_input_tokens_seen": 217983625, + "step": 10120, + "time_per_iteration": 2.736295461654663 + }, + { + "auxiliary_loss_clip": 0.01057984, + "auxiliary_loss_mlp": 0.01030219, + "balance_loss_clip": 1.02714777, + "balance_loss_mlp": 1.01870298, + "epoch": 0.6085074402525177, + "flos": 26870051710080.0, + "grad_norm": 1.7652080269679336, + "language_loss": 0.74265528, + "learning_rate": 1.4034138306647571e-06, + "loss": 0.76353729, + "num_input_tokens_seen": 218006005, + "step": 10121, + "time_per_iteration": 2.707827568054199 + }, + { + "auxiliary_loss_clip": 0.01054712, + "auxiliary_loss_mlp": 0.01025097, + "balance_loss_clip": 1.02541065, + "balance_loss_mlp": 1.01504755, + "epoch": 0.6085675635051856, + "flos": 10889839952640.0, + "grad_norm": 1.6529353381591134, + "language_loss": 0.80605948, + "learning_rate": 1.4030421108031685e-06, + "loss": 0.82685757, + "num_input_tokens_seen": 218024195, + "step": 10122, + "time_per_iteration": 2.6728434562683105 + }, + { + "auxiliary_loss_clip": 0.01046212, + "auxiliary_loss_mlp": 0.01029201, + "balance_loss_clip": 1.02376354, + "balance_loss_mlp": 1.01806676, + "epoch": 0.6086276867578536, + "flos": 34862186707200.0, + "grad_norm": 1.6678923387325173, + "language_loss": 0.55641603, + "learning_rate": 1.402670413578284e-06, + "loss": 0.57717013, + "num_input_tokens_seen": 218047190, + "step": 10123, + "time_per_iteration": 2.747068166732788 + }, + { + "auxiliary_loss_clip": 0.01057743, + "auxiliary_loss_mlp": 0.01032406, + "balance_loss_clip": 1.02823377, + "balance_loss_mlp": 1.02145684, + "epoch": 0.6086878100105215, + "flos": 20047706939520.0, + "grad_norm": 1.769090258428718, + "language_loss": 0.74115998, + "learning_rate": 1.4022987390041965e-06, + "loss": 0.76206148, + "num_input_tokens_seen": 218065945, + "step": 10124, + "time_per_iteration": 2.627836227416992 + }, + { + "auxiliary_loss_clip": 0.01035695, + "auxiliary_loss_mlp": 0.01027907, + "balance_loss_clip": 1.02230823, + "balance_loss_mlp": 1.01677847, + "epoch": 0.6087479332631895, + "flos": 18332469711360.0, + "grad_norm": 2.6605966795280995, + "language_loss": 0.65638971, + "learning_rate": 1.4019270870950006e-06, + "loss": 0.67702568, + "num_input_tokens_seen": 218085285, + "step": 10125, + "time_per_iteration": 2.6490206718444824 + }, + { + "auxiliary_loss_clip": 0.01063765, + "auxiliary_loss_mlp": 0.01027744, + "balance_loss_clip": 1.0264225, + "balance_loss_mlp": 1.01708114, + "epoch": 0.6088080565158575, + "flos": 24493411399680.0, + "grad_norm": 1.5626933393203404, + "language_loss": 0.76145083, + "learning_rate": 1.40155545786479e-06, + "loss": 0.78236592, + "num_input_tokens_seen": 218104735, + "step": 10126, + "time_per_iteration": 2.6416008472442627 + }, + { + "auxiliary_loss_clip": 0.01038043, + "auxiliary_loss_mlp": 0.01026464, + "balance_loss_clip": 1.02693105, + "balance_loss_mlp": 1.01467466, + "epoch": 0.6088681797685255, + "flos": 10269016260480.0, + "grad_norm": 2.5501676332663425, + "language_loss": 0.71653467, + "learning_rate": 1.4011838513276558e-06, + "loss": 0.73717976, + "num_input_tokens_seen": 218121855, + "step": 10127, + "time_per_iteration": 2.673098564147949 + }, + { + "auxiliary_loss_clip": 0.0106711, + "auxiliary_loss_mlp": 0.01029613, + "balance_loss_clip": 1.02554822, + "balance_loss_mlp": 1.0180676, + "epoch": 0.6089283030211935, + "flos": 21973703218560.0, + "grad_norm": 3.5906668941035846, + "language_loss": 0.72716641, + "learning_rate": 1.400812267497691e-06, + "loss": 0.74813366, + "num_input_tokens_seen": 218137325, + "step": 10128, + "time_per_iteration": 2.7094578742980957 + }, + { + "auxiliary_loss_clip": 0.01017586, + "auxiliary_loss_mlp": 0.01025237, + "balance_loss_clip": 1.02617073, + "balance_loss_mlp": 1.01474059, + "epoch": 0.6089884262738614, + "flos": 17785191116160.0, + "grad_norm": 2.6591323598633223, + "language_loss": 0.73220068, + "learning_rate": 1.4004407063889842e-06, + "loss": 0.75262892, + "num_input_tokens_seen": 218155530, + "step": 10129, + "time_per_iteration": 2.8824045658111572 + }, + { + "auxiliary_loss_clip": 0.01062272, + "auxiliary_loss_mlp": 0.01030171, + "balance_loss_clip": 1.02387309, + "balance_loss_mlp": 1.01928186, + "epoch": 0.6090485495265294, + "flos": 36910423946880.0, + "grad_norm": 2.367849226388532, + "language_loss": 0.65607631, + "learning_rate": 1.400069168015626e-06, + "loss": 0.67700076, + "num_input_tokens_seen": 218182535, + "step": 10130, + "time_per_iteration": 2.8181662559509277 + }, + { + "auxiliary_loss_clip": 0.01040815, + "auxiliary_loss_mlp": 0.01024709, + "balance_loss_clip": 1.02390265, + "balance_loss_mlp": 1.01514888, + "epoch": 0.6091086727791973, + "flos": 19899036547200.0, + "grad_norm": 1.7601501155331287, + "language_loss": 0.77240556, + "learning_rate": 1.3996976523917054e-06, + "loss": 0.79306078, + "num_input_tokens_seen": 218201740, + "step": 10131, + "time_per_iteration": 2.8118057250976562 + }, + { + "auxiliary_loss_clip": 0.01034815, + "auxiliary_loss_mlp": 0.01027881, + "balance_loss_clip": 1.0259304, + "balance_loss_mlp": 1.01845121, + "epoch": 0.6091687960318654, + "flos": 22163635359360.0, + "grad_norm": 1.7123839131032386, + "language_loss": 0.77058661, + "learning_rate": 1.3993261595313093e-06, + "loss": 0.79121357, + "num_input_tokens_seen": 218219800, + "step": 10132, + "time_per_iteration": 2.8280839920043945 + }, + { + "auxiliary_loss_clip": 0.01060911, + "auxiliary_loss_mlp": 0.01029954, + "balance_loss_clip": 1.02539754, + "balance_loss_mlp": 1.02032804, + "epoch": 0.6092289192845333, + "flos": 21465280160640.0, + "grad_norm": 1.7374567187557797, + "language_loss": 0.75810248, + "learning_rate": 1.3989546894485261e-06, + "loss": 0.77901113, + "num_input_tokens_seen": 218237585, + "step": 10133, + "time_per_iteration": 2.625810384750366 + }, + { + "auxiliary_loss_clip": 0.01052584, + "auxiliary_loss_mlp": 0.01026701, + "balance_loss_clip": 1.02479053, + "balance_loss_mlp": 1.01528049, + "epoch": 0.6092890425372013, + "flos": 28694924225280.0, + "grad_norm": 1.7185064566016126, + "language_loss": 0.63911432, + "learning_rate": 1.3985832421574414e-06, + "loss": 0.6599071, + "num_input_tokens_seen": 218258700, + "step": 10134, + "time_per_iteration": 2.685149908065796 + }, + { + "auxiliary_loss_clip": 0.01040333, + "auxiliary_loss_mlp": 0.01025212, + "balance_loss_clip": 1.02334261, + "balance_loss_mlp": 1.01491201, + "epoch": 0.6093491657898692, + "flos": 20813178700800.0, + "grad_norm": 1.7736612074256357, + "language_loss": 0.78451395, + "learning_rate": 1.3982118176721397e-06, + "loss": 0.8051694, + "num_input_tokens_seen": 218275655, + "step": 10135, + "time_per_iteration": 2.628350257873535 + }, + { + "auxiliary_loss_clip": 0.01043748, + "auxiliary_loss_mlp": 0.01026519, + "balance_loss_clip": 1.02491236, + "balance_loss_mlp": 1.01636231, + "epoch": 0.6094092890425372, + "flos": 25446983708160.0, + "grad_norm": 1.9317351508284368, + "language_loss": 0.72209489, + "learning_rate": 1.3978404160067069e-06, + "loss": 0.74279755, + "num_input_tokens_seen": 218295720, + "step": 10136, + "time_per_iteration": 2.7248148918151855 + }, + { + "auxiliary_loss_clip": 0.01064961, + "auxiliary_loss_mlp": 0.01028611, + "balance_loss_clip": 1.02583909, + "balance_loss_mlp": 1.01805484, + "epoch": 0.6094694122952051, + "flos": 35621265847680.0, + "grad_norm": 1.8469681915451848, + "language_loss": 0.74302638, + "learning_rate": 1.3974690371752253e-06, + "loss": 0.76396215, + "num_input_tokens_seen": 218316745, + "step": 10137, + "time_per_iteration": 2.7271554470062256 + }, + { + "auxiliary_loss_clip": 0.01047736, + "auxiliary_loss_mlp": 0.01033738, + "balance_loss_clip": 1.02283359, + "balance_loss_mlp": 1.02164984, + "epoch": 0.6095295355478731, + "flos": 24456962073600.0, + "grad_norm": 1.7615606039965732, + "language_loss": 0.80269015, + "learning_rate": 1.3970976811917785e-06, + "loss": 0.82350492, + "num_input_tokens_seen": 218335385, + "step": 10138, + "time_per_iteration": 4.35729193687439 + }, + { + "auxiliary_loss_clip": 0.01038228, + "auxiliary_loss_mlp": 0.01027773, + "balance_loss_clip": 1.02280283, + "balance_loss_mlp": 1.0179143, + "epoch": 0.6095896588005411, + "flos": 15633208419840.0, + "grad_norm": 1.5606810585169575, + "language_loss": 0.81131673, + "learning_rate": 1.3967263480704481e-06, + "loss": 0.83197665, + "num_input_tokens_seen": 218353320, + "step": 10139, + "time_per_iteration": 4.234931707382202 + }, + { + "auxiliary_loss_clip": 0.01022551, + "auxiliary_loss_mlp": 0.01029925, + "balance_loss_clip": 1.02254534, + "balance_loss_mlp": 1.01850438, + "epoch": 0.6096497820532091, + "flos": 15550577182080.0, + "grad_norm": 2.1226541661477287, + "language_loss": 0.83688068, + "learning_rate": 1.396355037825315e-06, + "loss": 0.85740548, + "num_input_tokens_seen": 218365620, + "step": 10140, + "time_per_iteration": 2.6737403869628906 + }, + { + "auxiliary_loss_clip": 0.01053637, + "auxiliary_loss_mlp": 0.01025878, + "balance_loss_clip": 1.02377677, + "balance_loss_mlp": 1.01510179, + "epoch": 0.6097099053058771, + "flos": 24204474397440.0, + "grad_norm": 1.8728009802992545, + "language_loss": 0.75367242, + "learning_rate": 1.3959837504704592e-06, + "loss": 0.77446759, + "num_input_tokens_seen": 218383785, + "step": 10141, + "time_per_iteration": 2.801238536834717 + }, + { + "auxiliary_loss_clip": 0.0103654, + "auxiliary_loss_mlp": 0.01028155, + "balance_loss_clip": 1.02285373, + "balance_loss_mlp": 1.0165205, + "epoch": 0.609770028558545, + "flos": 19570238426880.0, + "grad_norm": 1.7909113595787822, + "language_loss": 0.76538408, + "learning_rate": 1.3956124860199603e-06, + "loss": 0.78603107, + "num_input_tokens_seen": 218399055, + "step": 10142, + "time_per_iteration": 2.7652976512908936 + }, + { + "auxiliary_loss_clip": 0.01062412, + "auxiliary_loss_mlp": 0.01029259, + "balance_loss_clip": 1.02443004, + "balance_loss_mlp": 1.01726031, + "epoch": 0.609830151811213, + "flos": 23949185460480.0, + "grad_norm": 1.7089374549812761, + "language_loss": 0.76472473, + "learning_rate": 1.3952412444878964e-06, + "loss": 0.78564137, + "num_input_tokens_seen": 218419120, + "step": 10143, + "time_per_iteration": 2.643529176712036 + }, + { + "auxiliary_loss_clip": 0.0104656, + "auxiliary_loss_mlp": 0.0103318, + "balance_loss_clip": 1.0220356, + "balance_loss_mlp": 1.02079451, + "epoch": 0.6098902750638809, + "flos": 16179732829440.0, + "grad_norm": 1.6728442945364297, + "language_loss": 0.74912488, + "learning_rate": 1.3948700258883448e-06, + "loss": 0.76992226, + "num_input_tokens_seen": 218435290, + "step": 10144, + "time_per_iteration": 2.5881056785583496 + }, + { + "auxiliary_loss_clip": 0.01038335, + "auxiliary_loss_mlp": 0.01027453, + "balance_loss_clip": 1.02338958, + "balance_loss_mlp": 1.01587772, + "epoch": 0.609950398316549, + "flos": 44526393763200.0, + "grad_norm": 2.533467948167268, + "language_loss": 0.72772384, + "learning_rate": 1.394498830235383e-06, + "loss": 0.74838173, + "num_input_tokens_seen": 218457880, + "step": 10145, + "time_per_iteration": 2.819160223007202 + }, + { + "auxiliary_loss_clip": 0.01036839, + "auxiliary_loss_mlp": 0.01030796, + "balance_loss_clip": 1.02222908, + "balance_loss_mlp": 1.02040148, + "epoch": 0.6100105215692169, + "flos": 23221743223680.0, + "grad_norm": 3.026733940663667, + "language_loss": 0.69331115, + "learning_rate": 1.3941276575430862e-06, + "loss": 0.71398753, + "num_input_tokens_seen": 218475930, + "step": 10146, + "time_per_iteration": 2.6478195190429688 + }, + { + "auxiliary_loss_clip": 0.01021433, + "auxiliary_loss_mlp": 0.00747467, + "balance_loss_clip": 1.0241487, + "balance_loss_mlp": 1.00054216, + "epoch": 0.6100706448218849, + "flos": 15012564295680.0, + "grad_norm": 1.6420727281450151, + "language_loss": 0.77074838, + "learning_rate": 1.3937565078255289e-06, + "loss": 0.78843737, + "num_input_tokens_seen": 218493675, + "step": 10147, + "time_per_iteration": 2.7024996280670166 + }, + { + "auxiliary_loss_clip": 0.01041112, + "auxiliary_loss_mlp": 0.01022375, + "balance_loss_clip": 1.02302575, + "balance_loss_mlp": 1.01212287, + "epoch": 0.6101307680745528, + "flos": 19639976682240.0, + "grad_norm": 1.8057152921554402, + "language_loss": 0.78561264, + "learning_rate": 1.393385381096786e-06, + "loss": 0.80624747, + "num_input_tokens_seen": 218511780, + "step": 10148, + "time_per_iteration": 2.6708829402923584 + }, + { + "auxiliary_loss_clip": 0.01026073, + "auxiliary_loss_mlp": 0.01032929, + "balance_loss_clip": 1.02042556, + "balance_loss_mlp": 1.02045369, + "epoch": 0.6101908913272208, + "flos": 29935566028800.0, + "grad_norm": 2.390618317974661, + "language_loss": 0.53723061, + "learning_rate": 1.39301427737093e-06, + "loss": 0.55782062, + "num_input_tokens_seen": 218531850, + "step": 10149, + "time_per_iteration": 2.7844042778015137 + }, + { + "auxiliary_loss_clip": 0.01042769, + "auxiliary_loss_mlp": 0.01030568, + "balance_loss_clip": 1.02655554, + "balance_loss_mlp": 1.02005351, + "epoch": 0.6102510145798887, + "flos": 21798639308160.0, + "grad_norm": 1.6941093177177151, + "language_loss": 0.80283916, + "learning_rate": 1.3926431966620333e-06, + "loss": 0.82357252, + "num_input_tokens_seen": 218551245, + "step": 10150, + "time_per_iteration": 2.7430577278137207 + }, + { + "auxiliary_loss_clip": 0.01047116, + "auxiliary_loss_mlp": 0.0103372, + "balance_loss_clip": 1.02669525, + "balance_loss_mlp": 1.02202582, + "epoch": 0.6103111378325567, + "flos": 20706129192960.0, + "grad_norm": 2.301964048790761, + "language_loss": 0.68825054, + "learning_rate": 1.3922721389841684e-06, + "loss": 0.70905888, + "num_input_tokens_seen": 218571365, + "step": 10151, + "time_per_iteration": 4.371477127075195 + }, + { + "auxiliary_loss_clip": 0.01061776, + "auxiliary_loss_mlp": 0.01026022, + "balance_loss_clip": 1.02349639, + "balance_loss_mlp": 1.01590681, + "epoch": 0.6103712610852247, + "flos": 29381643417600.0, + "grad_norm": 1.6417734722890465, + "language_loss": 0.71141607, + "learning_rate": 1.3919011043514036e-06, + "loss": 0.73229396, + "num_input_tokens_seen": 218588315, + "step": 10152, + "time_per_iteration": 2.6926422119140625 + }, + { + "auxiliary_loss_clip": 0.0102997, + "auxiliary_loss_mlp": 0.01031529, + "balance_loss_clip": 1.0266819, + "balance_loss_mlp": 1.02061534, + "epoch": 0.6104313843378927, + "flos": 20813035046400.0, + "grad_norm": 1.6733446372978698, + "language_loss": 0.78378749, + "learning_rate": 1.391530092777811e-06, + "loss": 0.80440247, + "num_input_tokens_seen": 218605940, + "step": 10153, + "time_per_iteration": 2.783538818359375 + }, + { + "auxiliary_loss_clip": 0.01037367, + "auxiliary_loss_mlp": 0.01027525, + "balance_loss_clip": 1.02291846, + "balance_loss_mlp": 1.01666474, + "epoch": 0.6104915075905607, + "flos": 26578457101440.0, + "grad_norm": 1.9457802804836404, + "language_loss": 0.79344738, + "learning_rate": 1.3911591042774573e-06, + "loss": 0.81409633, + "num_input_tokens_seen": 218626100, + "step": 10154, + "time_per_iteration": 2.808467149734497 + }, + { + "auxiliary_loss_clip": 0.01052881, + "auxiliary_loss_mlp": 0.01024698, + "balance_loss_clip": 1.02487481, + "balance_loss_mlp": 1.01437426, + "epoch": 0.6105516308432286, + "flos": 23915788790400.0, + "grad_norm": 1.4942594914745084, + "language_loss": 0.70175123, + "learning_rate": 1.3907881388644116e-06, + "loss": 0.72252703, + "num_input_tokens_seen": 218645060, + "step": 10155, + "time_per_iteration": 2.724544048309326 + }, + { + "auxiliary_loss_clip": 0.01053607, + "auxiliary_loss_mlp": 0.0102657, + "balance_loss_clip": 1.02513337, + "balance_loss_mlp": 1.01503682, + "epoch": 0.6106117540958966, + "flos": 31577365900800.0, + "grad_norm": 4.299659037039452, + "language_loss": 0.71170509, + "learning_rate": 1.3904171965527413e-06, + "loss": 0.73250687, + "num_input_tokens_seen": 218667690, + "step": 10156, + "time_per_iteration": 2.775263547897339 + }, + { + "auxiliary_loss_clip": 0.01041071, + "auxiliary_loss_mlp": 0.010285, + "balance_loss_clip": 1.02438235, + "balance_loss_mlp": 1.01724708, + "epoch": 0.6106718773485645, + "flos": 19608160210560.0, + "grad_norm": 2.8383359209144055, + "language_loss": 0.67435825, + "learning_rate": 1.3900462773565114e-06, + "loss": 0.69505399, + "num_input_tokens_seen": 218687505, + "step": 10157, + "time_per_iteration": 2.7228145599365234 + }, + { + "auxiliary_loss_clip": 0.01023793, + "auxiliary_loss_mlp": 0.01025244, + "balance_loss_clip": 1.02147031, + "balance_loss_mlp": 1.01442552, + "epoch": 0.6107320006012326, + "flos": 17123895774720.0, + "grad_norm": 2.4544178856349204, + "language_loss": 0.72615319, + "learning_rate": 1.3896753812897877e-06, + "loss": 0.7466436, + "num_input_tokens_seen": 218705315, + "step": 10158, + "time_per_iteration": 2.7292070388793945 + }, + { + "auxiliary_loss_clip": 0.01058262, + "auxiliary_loss_mlp": 0.01030055, + "balance_loss_clip": 1.02829599, + "balance_loss_mlp": 1.01929617, + "epoch": 0.6107921238539005, + "flos": 30148228500480.0, + "grad_norm": 1.4486624396242849, + "language_loss": 0.6892103, + "learning_rate": 1.389304508366635e-06, + "loss": 0.7100935, + "num_input_tokens_seen": 218725735, + "step": 10159, + "time_per_iteration": 4.566304683685303 + }, + { + "auxiliary_loss_clip": 0.01065456, + "auxiliary_loss_mlp": 0.01027649, + "balance_loss_clip": 1.02563488, + "balance_loss_mlp": 1.01596105, + "epoch": 0.6108522471065685, + "flos": 18440273404800.0, + "grad_norm": 2.091056321211972, + "language_loss": 0.78815681, + "learning_rate": 1.3889336586011167e-06, + "loss": 0.80908787, + "num_input_tokens_seen": 218743215, + "step": 10160, + "time_per_iteration": 2.619668960571289 + }, + { + "auxiliary_loss_clip": 0.00999506, + "auxiliary_loss_mlp": 0.01007155, + "balance_loss_clip": 1.00289726, + "balance_loss_mlp": 1.00594473, + "epoch": 0.6109123703592364, + "flos": 64135454791680.0, + "grad_norm": 0.8263478215526234, + "language_loss": 0.61543906, + "learning_rate": 1.388562832007295e-06, + "loss": 0.63550568, + "num_input_tokens_seen": 218806440, + "step": 10161, + "time_per_iteration": 3.4448468685150146 + }, + { + "auxiliary_loss_clip": 0.01046916, + "auxiliary_loss_mlp": 0.00747751, + "balance_loss_clip": 1.02603936, + "balance_loss_mlp": 1.00053072, + "epoch": 0.6109724936119044, + "flos": 20667848273280.0, + "grad_norm": 1.6864696559626071, + "language_loss": 0.76147318, + "learning_rate": 1.3881920285992324e-06, + "loss": 0.7794199, + "num_input_tokens_seen": 218825720, + "step": 10162, + "time_per_iteration": 2.690876007080078 + }, + { + "auxiliary_loss_clip": 0.01062936, + "auxiliary_loss_mlp": 0.01030936, + "balance_loss_clip": 1.02476358, + "balance_loss_mlp": 1.01962292, + "epoch": 0.6110326168645723, + "flos": 31351882273920.0, + "grad_norm": 5.884783573538306, + "language_loss": 0.71577013, + "learning_rate": 1.3878212483909888e-06, + "loss": 0.73670888, + "num_input_tokens_seen": 218847735, + "step": 10163, + "time_per_iteration": 2.7159831523895264 + }, + { + "auxiliary_loss_clip": 0.01061265, + "auxiliary_loss_mlp": 0.01026742, + "balance_loss_clip": 1.02491772, + "balance_loss_mlp": 1.01713967, + "epoch": 0.6110927401172404, + "flos": 25003378742400.0, + "grad_norm": 13.533474656047884, + "language_loss": 0.59798837, + "learning_rate": 1.387450491396625e-06, + "loss": 0.61886847, + "num_input_tokens_seen": 218866585, + "step": 10164, + "time_per_iteration": 2.6661157608032227 + }, + { + "auxiliary_loss_clip": 0.0105444, + "auxiliary_loss_mlp": 0.01029035, + "balance_loss_clip": 1.02575803, + "balance_loss_mlp": 1.01828861, + "epoch": 0.6111528633699083, + "flos": 26248078782720.0, + "grad_norm": 1.5505863761740881, + "language_loss": 0.75733817, + "learning_rate": 1.3870797576302003e-06, + "loss": 0.77817297, + "num_input_tokens_seen": 218885560, + "step": 10165, + "time_per_iteration": 2.6465511322021484 + }, + { + "auxiliary_loss_clip": 0.01044846, + "auxiliary_loss_mlp": 0.01028203, + "balance_loss_clip": 1.02787971, + "balance_loss_mlp": 1.01737845, + "epoch": 0.6112129866225763, + "flos": 22382474970240.0, + "grad_norm": 1.4850508148835482, + "language_loss": 0.79351723, + "learning_rate": 1.3867090471057719e-06, + "loss": 0.81424773, + "num_input_tokens_seen": 218905055, + "step": 10166, + "time_per_iteration": 2.7587060928344727 + }, + { + "auxiliary_loss_clip": 0.01047241, + "auxiliary_loss_mlp": 0.01028441, + "balance_loss_clip": 1.02651334, + "balance_loss_mlp": 1.01728296, + "epoch": 0.6112731098752443, + "flos": 25227892702080.0, + "grad_norm": 2.124396628585164, + "language_loss": 0.67199802, + "learning_rate": 1.3863383598373987e-06, + "loss": 0.69275481, + "num_input_tokens_seen": 218924030, + "step": 10167, + "time_per_iteration": 2.684004783630371 + }, + { + "auxiliary_loss_clip": 0.01063401, + "auxiliary_loss_mlp": 0.01034124, + "balance_loss_clip": 1.0255363, + "balance_loss_mlp": 1.024194, + "epoch": 0.6113332331279122, + "flos": 22893160584960.0, + "grad_norm": 2.621898519814346, + "language_loss": 0.78939027, + "learning_rate": 1.3859676958391364e-06, + "loss": 0.8103655, + "num_input_tokens_seen": 218943750, + "step": 10168, + "time_per_iteration": 2.612603187561035 + }, + { + "auxiliary_loss_clip": 0.01068529, + "auxiliary_loss_mlp": 0.01038255, + "balance_loss_clip": 1.02515888, + "balance_loss_mlp": 1.02563679, + "epoch": 0.6113933563805802, + "flos": 18620329305600.0, + "grad_norm": 2.693863519609053, + "language_loss": 0.85566777, + "learning_rate": 1.3855970551250398e-06, + "loss": 0.87673563, + "num_input_tokens_seen": 218957585, + "step": 10169, + "time_per_iteration": 2.501713991165161 + }, + { + "auxiliary_loss_clip": 0.01061581, + "auxiliary_loss_mlp": 0.01027128, + "balance_loss_clip": 1.02467966, + "balance_loss_mlp": 1.01760328, + "epoch": 0.6114534796332481, + "flos": 41866275317760.0, + "grad_norm": 1.6512762840479713, + "language_loss": 0.78723025, + "learning_rate": 1.3852264377091652e-06, + "loss": 0.80811727, + "num_input_tokens_seen": 218980025, + "step": 10170, + "time_per_iteration": 2.7390382289886475 + }, + { + "auxiliary_loss_clip": 0.01047021, + "auxiliary_loss_mlp": 0.01037366, + "balance_loss_clip": 1.02490389, + "balance_loss_mlp": 1.02560043, + "epoch": 0.6115136028859162, + "flos": 21908454163200.0, + "grad_norm": 3.2282654362356884, + "language_loss": 0.68734276, + "learning_rate": 1.3848558436055651e-06, + "loss": 0.70818669, + "num_input_tokens_seen": 218998200, + "step": 10171, + "time_per_iteration": 2.63561749458313 + }, + { + "auxiliary_loss_clip": 0.01034519, + "auxiliary_loss_mlp": 0.0103477, + "balance_loss_clip": 1.02400279, + "balance_loss_mlp": 1.02188969, + "epoch": 0.6115737261385841, + "flos": 28804846821120.0, + "grad_norm": 2.1895428485315667, + "language_loss": 0.78872496, + "learning_rate": 1.3844852728282934e-06, + "loss": 0.80941784, + "num_input_tokens_seen": 219017910, + "step": 10172, + "time_per_iteration": 2.734250545501709 + }, + { + "auxiliary_loss_clip": 0.01041213, + "auxiliary_loss_mlp": 0.01032464, + "balance_loss_clip": 1.02778745, + "balance_loss_mlp": 1.02087665, + "epoch": 0.6116338493912521, + "flos": 21251468453760.0, + "grad_norm": 1.8196737486968835, + "language_loss": 0.67056644, + "learning_rate": 1.3841147253914022e-06, + "loss": 0.69130319, + "num_input_tokens_seen": 219037730, + "step": 10173, + "time_per_iteration": 2.7216761112213135 + }, + { + "auxiliary_loss_clip": 0.01039174, + "auxiliary_loss_mlp": 0.01030438, + "balance_loss_clip": 1.02460861, + "balance_loss_mlp": 1.01908922, + "epoch": 0.61169397264392, + "flos": 17530189488000.0, + "grad_norm": 1.6268268232732113, + "language_loss": 0.55855137, + "learning_rate": 1.3837442013089416e-06, + "loss": 0.57924747, + "num_input_tokens_seen": 219056755, + "step": 10174, + "time_per_iteration": 2.6252458095550537 + }, + { + "auxiliary_loss_clip": 0.01046585, + "auxiliary_loss_mlp": 0.01031241, + "balance_loss_clip": 1.02673936, + "balance_loss_mlp": 1.01970744, + "epoch": 0.611754095896588, + "flos": 23951555758080.0, + "grad_norm": 2.0817676132894127, + "language_loss": 0.66179556, + "learning_rate": 1.3833737005949628e-06, + "loss": 0.6825738, + "num_input_tokens_seen": 219076985, + "step": 10175, + "time_per_iteration": 2.827580213546753 + }, + { + "auxiliary_loss_clip": 0.01051665, + "auxiliary_loss_mlp": 0.0074767, + "balance_loss_clip": 1.02285361, + "balance_loss_mlp": 1.00051546, + "epoch": 0.6118142191492559, + "flos": 25994872834560.0, + "grad_norm": 2.0565200132488854, + "language_loss": 0.82754064, + "learning_rate": 1.3830032232635154e-06, + "loss": 0.84553397, + "num_input_tokens_seen": 219096050, + "step": 10176, + "time_per_iteration": 2.684657335281372 + }, + { + "auxiliary_loss_clip": 0.01046414, + "auxiliary_loss_mlp": 0.01033097, + "balance_loss_clip": 1.02541375, + "balance_loss_mlp": 1.02164125, + "epoch": 0.611874342401924, + "flos": 24603190341120.0, + "grad_norm": 1.8358244679671505, + "language_loss": 0.77454937, + "learning_rate": 1.3826327693286474e-06, + "loss": 0.79534453, + "num_input_tokens_seen": 219112665, + "step": 10177, + "time_per_iteration": 2.7100303173065186 + }, + { + "auxiliary_loss_clip": 0.0104665, + "auxiliary_loss_mlp": 0.00747595, + "balance_loss_clip": 1.02255452, + "balance_loss_mlp": 1.0004313, + "epoch": 0.6119344656545919, + "flos": 15887132640000.0, + "grad_norm": 2.0490150977580606, + "language_loss": 0.75317651, + "learning_rate": 1.3822623388044065e-06, + "loss": 0.771119, + "num_input_tokens_seen": 219129120, + "step": 10178, + "time_per_iteration": 2.6362571716308594 + }, + { + "auxiliary_loss_clip": 0.0103753, + "auxiliary_loss_mlp": 0.01036717, + "balance_loss_clip": 1.02322638, + "balance_loss_mlp": 1.02409315, + "epoch": 0.6119945889072599, + "flos": 21652877917440.0, + "grad_norm": 1.570031478369824, + "language_loss": 0.66910887, + "learning_rate": 1.3818919317048402e-06, + "loss": 0.6898514, + "num_input_tokens_seen": 219148950, + "step": 10179, + "time_per_iteration": 2.731710910797119 + }, + { + "auxiliary_loss_clip": 0.01047963, + "auxiliary_loss_mlp": 0.01032865, + "balance_loss_clip": 1.02706409, + "balance_loss_mlp": 1.02211225, + "epoch": 0.6120547121599279, + "flos": 13772533023360.0, + "grad_norm": 1.7645927673959403, + "language_loss": 0.83382106, + "learning_rate": 1.3815215480439933e-06, + "loss": 0.85462934, + "num_input_tokens_seen": 219165585, + "step": 10180, + "time_per_iteration": 2.73952579498291 + }, + { + "auxiliary_loss_clip": 0.01063893, + "auxiliary_loss_mlp": 0.01026234, + "balance_loss_clip": 1.02555728, + "balance_loss_mlp": 1.01467085, + "epoch": 0.6121148354125958, + "flos": 20079164275200.0, + "grad_norm": 1.695797630717618, + "language_loss": 0.77810359, + "learning_rate": 1.3811511878359113e-06, + "loss": 0.79900485, + "num_input_tokens_seen": 219183280, + "step": 10181, + "time_per_iteration": 2.564382553100586 + }, + { + "auxiliary_loss_clip": 0.01064547, + "auxiliary_loss_mlp": 0.01030691, + "balance_loss_clip": 1.02527654, + "balance_loss_mlp": 1.01940763, + "epoch": 0.6121749586652638, + "flos": 13471313569920.0, + "grad_norm": 1.7823024764361026, + "language_loss": 0.80787516, + "learning_rate": 1.3807808510946384e-06, + "loss": 0.8288275, + "num_input_tokens_seen": 219197200, + "step": 10182, + "time_per_iteration": 2.541815996170044 + }, + { + "auxiliary_loss_clip": 0.01026173, + "auxiliary_loss_mlp": 0.01031126, + "balance_loss_clip": 1.02239823, + "balance_loss_mlp": 1.02091563, + "epoch": 0.6122350819179317, + "flos": 20120533764480.0, + "grad_norm": 1.6888164022135772, + "language_loss": 0.82966614, + "learning_rate": 1.3804105378342177e-06, + "loss": 0.85023904, + "num_input_tokens_seen": 219216825, + "step": 10183, + "time_per_iteration": 2.7531120777130127 + }, + { + "auxiliary_loss_clip": 0.01002971, + "auxiliary_loss_mlp": 0.01008436, + "balance_loss_clip": 1.00600815, + "balance_loss_mlp": 1.0073092, + "epoch": 0.6122952051705998, + "flos": 65429242767360.0, + "grad_norm": 0.8252265471003483, + "language_loss": 0.62859571, + "learning_rate": 1.3800402480686914e-06, + "loss": 0.64870977, + "num_input_tokens_seen": 219283795, + "step": 10184, + "time_per_iteration": 3.330664873123169 + }, + { + "auxiliary_loss_clip": 0.01056882, + "auxiliary_loss_mlp": 0.01029392, + "balance_loss_clip": 1.02700841, + "balance_loss_mlp": 1.01899099, + "epoch": 0.6123553284232677, + "flos": 20376253664640.0, + "grad_norm": 2.3809497584004786, + "language_loss": 0.82209933, + "learning_rate": 1.379669981812101e-06, + "loss": 0.84296215, + "num_input_tokens_seen": 219302385, + "step": 10185, + "time_per_iteration": 2.6084249019622803 + }, + { + "auxiliary_loss_clip": 0.01039085, + "auxiliary_loss_mlp": 0.01035754, + "balance_loss_clip": 1.02434504, + "balance_loss_mlp": 1.02388656, + "epoch": 0.6124154516759357, + "flos": 23987645948160.0, + "grad_norm": 1.7624656460276962, + "language_loss": 0.74845189, + "learning_rate": 1.3792997390784868e-06, + "loss": 0.76920033, + "num_input_tokens_seen": 219319765, + "step": 10186, + "time_per_iteration": 4.288726091384888 + }, + { + "auxiliary_loss_clip": 0.01048447, + "auxiliary_loss_mlp": 0.01030543, + "balance_loss_clip": 1.02365196, + "balance_loss_mlp": 1.02035022, + "epoch": 0.6124755749286036, + "flos": 21468799693440.0, + "grad_norm": 1.5320075251654257, + "language_loss": 0.78391457, + "learning_rate": 1.3789295198818895e-06, + "loss": 0.80470443, + "num_input_tokens_seen": 219337440, + "step": 10187, + "time_per_iteration": 4.238589763641357 + }, + { + "auxiliary_loss_clip": 0.01062146, + "auxiliary_loss_mlp": 0.01030053, + "balance_loss_clip": 1.02380252, + "balance_loss_mlp": 1.01912737, + "epoch": 0.6125356981812716, + "flos": 23879195809920.0, + "grad_norm": 1.490092284360824, + "language_loss": 0.83305442, + "learning_rate": 1.3785593242363462e-06, + "loss": 0.85397637, + "num_input_tokens_seen": 219357525, + "step": 10188, + "time_per_iteration": 2.5879437923431396 + }, + { + "auxiliary_loss_clip": 0.01034943, + "auxiliary_loss_mlp": 0.01031696, + "balance_loss_clip": 1.02538443, + "balance_loss_mlp": 1.02067542, + "epoch": 0.6125958214339395, + "flos": 14425604150400.0, + "grad_norm": 1.6824876289548174, + "language_loss": 0.75333285, + "learning_rate": 1.378189152155896e-06, + "loss": 0.77399921, + "num_input_tokens_seen": 219374855, + "step": 10189, + "time_per_iteration": 2.665802240371704 + }, + { + "auxiliary_loss_clip": 0.01046737, + "auxiliary_loss_mlp": 0.01028335, + "balance_loss_clip": 1.02268589, + "balance_loss_mlp": 1.01690257, + "epoch": 0.6126559446866076, + "flos": 23259090389760.0, + "grad_norm": 1.4879032208381398, + "language_loss": 0.74228269, + "learning_rate": 1.3778190036545758e-06, + "loss": 0.76303339, + "num_input_tokens_seen": 219394740, + "step": 10190, + "time_per_iteration": 2.5962653160095215 + }, + { + "auxiliary_loss_clip": 0.01054906, + "auxiliary_loss_mlp": 0.01029949, + "balance_loss_clip": 1.02515364, + "balance_loss_mlp": 1.01836193, + "epoch": 0.6127160679392755, + "flos": 26864808324480.0, + "grad_norm": 1.5770363765148008, + "language_loss": 0.68955344, + "learning_rate": 1.3774488787464207e-06, + "loss": 0.71040201, + "num_input_tokens_seen": 219413755, + "step": 10191, + "time_per_iteration": 2.700612783432007 + }, + { + "auxiliary_loss_clip": 0.01048518, + "auxiliary_loss_mlp": 0.01034136, + "balance_loss_clip": 1.02286434, + "balance_loss_mlp": 1.02296007, + "epoch": 0.6127761911919435, + "flos": 26396425952640.0, + "grad_norm": 2.382529361083788, + "language_loss": 0.73753321, + "learning_rate": 1.377078777445467e-06, + "loss": 0.75835973, + "num_input_tokens_seen": 219433560, + "step": 10192, + "time_per_iteration": 2.6647205352783203 + }, + { + "auxiliary_loss_clip": 0.01024847, + "auxiliary_loss_mlp": 0.01023466, + "balance_loss_clip": 1.02476597, + "balance_loss_mlp": 1.01313651, + "epoch": 0.6128363144446115, + "flos": 22634747164800.0, + "grad_norm": 1.9921585377128326, + "language_loss": 0.8338992, + "learning_rate": 1.3767086997657478e-06, + "loss": 0.85438234, + "num_input_tokens_seen": 219452640, + "step": 10193, + "time_per_iteration": 2.7413175106048584 + }, + { + "auxiliary_loss_clip": 0.01036411, + "auxiliary_loss_mlp": 0.01030848, + "balance_loss_clip": 1.02525353, + "balance_loss_mlp": 1.01930881, + "epoch": 0.6128964376972794, + "flos": 26759051706240.0, + "grad_norm": 2.1763366414879273, + "language_loss": 0.70320308, + "learning_rate": 1.3763386457212979e-06, + "loss": 0.72387564, + "num_input_tokens_seen": 219468585, + "step": 10194, + "time_per_iteration": 2.737457275390625 + }, + { + "auxiliary_loss_clip": 0.00983367, + "auxiliary_loss_mlp": 0.01000506, + "balance_loss_clip": 1.006428, + "balance_loss_mlp": 0.999201, + "epoch": 0.6129565609499474, + "flos": 65567929178880.0, + "grad_norm": 0.8351564377813514, + "language_loss": 0.58699304, + "learning_rate": 1.375968615326149e-06, + "loss": 0.60683179, + "num_input_tokens_seen": 219523015, + "step": 10195, + "time_per_iteration": 3.1129980087280273 + }, + { + "auxiliary_loss_clip": 0.01044807, + "auxiliary_loss_mlp": 0.01030168, + "balance_loss_clip": 1.02487957, + "balance_loss_mlp": 1.01883078, + "epoch": 0.6130166842026153, + "flos": 16362087200640.0, + "grad_norm": 2.053196347921856, + "language_loss": 0.70001036, + "learning_rate": 1.3755986085943324e-06, + "loss": 0.72076011, + "num_input_tokens_seen": 219539980, + "step": 10196, + "time_per_iteration": 2.765872001647949 + }, + { + "auxiliary_loss_clip": 0.01034027, + "auxiliary_loss_mlp": 0.01033884, + "balance_loss_clip": 1.02149904, + "balance_loss_mlp": 1.02244592, + "epoch": 0.6130768074552834, + "flos": 23652455207040.0, + "grad_norm": 1.740389897341162, + "language_loss": 0.71305764, + "learning_rate": 1.3752286255398788e-06, + "loss": 0.73373675, + "num_input_tokens_seen": 219556980, + "step": 10197, + "time_per_iteration": 2.699733018875122 + }, + { + "auxiliary_loss_clip": 0.01050435, + "auxiliary_loss_mlp": 0.01037627, + "balance_loss_clip": 1.0243144, + "balance_loss_mlp": 1.02482414, + "epoch": 0.6131369307079513, + "flos": 20047455544320.0, + "grad_norm": 1.818398645132689, + "language_loss": 0.7889176, + "learning_rate": 1.3748586661768191e-06, + "loss": 0.80979824, + "num_input_tokens_seen": 219576410, + "step": 10198, + "time_per_iteration": 4.301099538803101 + }, + { + "auxiliary_loss_clip": 0.01040078, + "auxiliary_loss_mlp": 0.01030456, + "balance_loss_clip": 1.02785599, + "balance_loss_mlp": 1.01916695, + "epoch": 0.6131970539606193, + "flos": 22672166158080.0, + "grad_norm": 1.427087621940306, + "language_loss": 0.74474418, + "learning_rate": 1.374488730519181e-06, + "loss": 0.76544964, + "num_input_tokens_seen": 219597180, + "step": 10199, + "time_per_iteration": 2.770873546600342 + }, + { + "auxiliary_loss_clip": 0.01046438, + "auxiliary_loss_mlp": 0.01035697, + "balance_loss_clip": 1.02549708, + "balance_loss_mlp": 1.02382421, + "epoch": 0.6132571772132872, + "flos": 26870913636480.0, + "grad_norm": 1.9262392564308626, + "language_loss": 0.60900617, + "learning_rate": 1.374118818580993e-06, + "loss": 0.6298275, + "num_input_tokens_seen": 219617630, + "step": 10200, + "time_per_iteration": 2.7339882850646973 + }, + { + "auxiliary_loss_clip": 0.01038956, + "auxiliary_loss_mlp": 0.01028631, + "balance_loss_clip": 1.02476287, + "balance_loss_mlp": 1.01764584, + "epoch": 0.6133173004659552, + "flos": 22892657794560.0, + "grad_norm": 1.8548952785657262, + "language_loss": 0.68628317, + "learning_rate": 1.3737489303762822e-06, + "loss": 0.70695901, + "num_input_tokens_seen": 219637025, + "step": 10201, + "time_per_iteration": 2.7400524616241455 + }, + { + "auxiliary_loss_clip": 0.01042454, + "auxiliary_loss_mlp": 0.01024284, + "balance_loss_clip": 1.02411258, + "balance_loss_mlp": 1.01325738, + "epoch": 0.6133774237186231, + "flos": 20485098852480.0, + "grad_norm": 1.7159354439440446, + "language_loss": 0.83376586, + "learning_rate": 1.3733790659190746e-06, + "loss": 0.85443318, + "num_input_tokens_seen": 219656625, + "step": 10202, + "time_per_iteration": 2.684373140335083 + }, + { + "auxiliary_loss_clip": 0.01007458, + "auxiliary_loss_mlp": 0.01001553, + "balance_loss_clip": 1.00137019, + "balance_loss_mlp": 1.00032544, + "epoch": 0.6134375469712912, + "flos": 69413065217280.0, + "grad_norm": 0.8932067086488136, + "language_loss": 0.67095184, + "learning_rate": 1.3730092252233953e-06, + "loss": 0.69104195, + "num_input_tokens_seen": 219718090, + "step": 10203, + "time_per_iteration": 3.308497667312622 + }, + { + "auxiliary_loss_clip": 0.0105525, + "auxiliary_loss_mlp": 0.01028114, + "balance_loss_clip": 1.02551627, + "balance_loss_mlp": 1.0170933, + "epoch": 0.6134976702239591, + "flos": 41281541815680.0, + "grad_norm": 1.7142545878817086, + "language_loss": 0.61336011, + "learning_rate": 1.37263940830327e-06, + "loss": 0.63419378, + "num_input_tokens_seen": 219740100, + "step": 10204, + "time_per_iteration": 2.9270901679992676 + }, + { + "auxiliary_loss_clip": 0.0103217, + "auxiliary_loss_mlp": 0.0102829, + "balance_loss_clip": 1.02327931, + "balance_loss_mlp": 1.01726353, + "epoch": 0.6135577934766271, + "flos": 22346600261760.0, + "grad_norm": 1.6936328822751792, + "language_loss": 0.7239942, + "learning_rate": 1.3722696151727204e-06, + "loss": 0.74459881, + "num_input_tokens_seen": 219761225, + "step": 10205, + "time_per_iteration": 2.7859573364257812 + }, + { + "auxiliary_loss_clip": 0.01052295, + "auxiliary_loss_mlp": 0.01022684, + "balance_loss_clip": 1.0243187, + "balance_loss_mlp": 1.01106095, + "epoch": 0.6136179167292951, + "flos": 23728155120000.0, + "grad_norm": 1.704811429180865, + "language_loss": 0.76126748, + "learning_rate": 1.3718998458457701e-06, + "loss": 0.78201723, + "num_input_tokens_seen": 219780085, + "step": 10206, + "time_per_iteration": 2.604191303253174 + }, + { + "auxiliary_loss_clip": 0.01030416, + "auxiliary_loss_mlp": 0.01031594, + "balance_loss_clip": 1.02656186, + "balance_loss_mlp": 1.02026927, + "epoch": 0.613678039981963, + "flos": 26024678144640.0, + "grad_norm": 2.112714596997449, + "language_loss": 0.7584734, + "learning_rate": 1.3715301003364407e-06, + "loss": 0.7790935, + "num_input_tokens_seen": 219797895, + "step": 10207, + "time_per_iteration": 4.500079154968262 + }, + { + "auxiliary_loss_clip": 0.01054122, + "auxiliary_loss_mlp": 0.01031707, + "balance_loss_clip": 1.02538109, + "balance_loss_mlp": 1.02078152, + "epoch": 0.613738163234631, + "flos": 9859957200000.0, + "grad_norm": 2.040084550319385, + "language_loss": 0.82431924, + "learning_rate": 1.3711603786587525e-06, + "loss": 0.84517747, + "num_input_tokens_seen": 219811295, + "step": 10208, + "time_per_iteration": 2.55100679397583 + }, + { + "auxiliary_loss_clip": 0.01048273, + "auxiliary_loss_mlp": 0.01029358, + "balance_loss_clip": 1.02758288, + "balance_loss_mlp": 1.01693606, + "epoch": 0.613798286487299, + "flos": 33182070001920.0, + "grad_norm": 1.7372716197733051, + "language_loss": 0.72337008, + "learning_rate": 1.3707906808267265e-06, + "loss": 0.74414635, + "num_input_tokens_seen": 219832735, + "step": 10209, + "time_per_iteration": 2.750056028366089 + }, + { + "auxiliary_loss_clip": 0.01063892, + "auxiliary_loss_mlp": 0.01032266, + "balance_loss_clip": 1.0259968, + "balance_loss_mlp": 1.02118564, + "epoch": 0.613858409739967, + "flos": 25627901535360.0, + "grad_norm": 1.6493837011346832, + "language_loss": 0.74152553, + "learning_rate": 1.37042100685438e-06, + "loss": 0.76248717, + "num_input_tokens_seen": 219852755, + "step": 10210, + "time_per_iteration": 2.5742697715759277 + }, + { + "auxiliary_loss_clip": 0.00981588, + "auxiliary_loss_mlp": 0.01006172, + "balance_loss_clip": 1.00337899, + "balance_loss_mlp": 1.00480747, + "epoch": 0.6139185329926349, + "flos": 67192313932800.0, + "grad_norm": 0.858497042865271, + "language_loss": 0.64983088, + "learning_rate": 1.3700513567557325e-06, + "loss": 0.66970843, + "num_input_tokens_seen": 219922785, + "step": 10211, + "time_per_iteration": 3.4064958095550537 + }, + { + "auxiliary_loss_clip": 0.01044449, + "auxiliary_loss_mlp": 0.00747655, + "balance_loss_clip": 1.02477825, + "balance_loss_mlp": 1.00042903, + "epoch": 0.6139786562453029, + "flos": 21543637680000.0, + "grad_norm": 1.69586771252502, + "language_loss": 0.75632203, + "learning_rate": 1.369681730544801e-06, + "loss": 0.774243, + "num_input_tokens_seen": 219942215, + "step": 10212, + "time_per_iteration": 2.6409475803375244 + }, + { + "auxiliary_loss_clip": 0.01039059, + "auxiliary_loss_mlp": 0.01036006, + "balance_loss_clip": 1.02394342, + "balance_loss_mlp": 1.02314878, + "epoch": 0.6140387794979708, + "flos": 26068489758720.0, + "grad_norm": 1.421652665109811, + "language_loss": 0.7391206, + "learning_rate": 1.3693121282356009e-06, + "loss": 0.75987124, + "num_input_tokens_seen": 219963830, + "step": 10213, + "time_per_iteration": 2.6739983558654785 + }, + { + "auxiliary_loss_clip": 0.01049617, + "auxiliary_loss_mlp": 0.01032757, + "balance_loss_clip": 1.02690172, + "balance_loss_mlp": 1.02065182, + "epoch": 0.6140989027506388, + "flos": 23694614795520.0, + "grad_norm": 1.621523757545119, + "language_loss": 0.72866344, + "learning_rate": 1.3689425498421483e-06, + "loss": 0.74948716, + "num_input_tokens_seen": 219983815, + "step": 10214, + "time_per_iteration": 2.662292003631592 + }, + { + "auxiliary_loss_clip": 0.01065933, + "auxiliary_loss_mlp": 0.01027229, + "balance_loss_clip": 1.0254035, + "balance_loss_mlp": 1.01543331, + "epoch": 0.6141590260033067, + "flos": 22231721589120.0, + "grad_norm": 1.747726484229967, + "language_loss": 0.74404848, + "learning_rate": 1.3685729953784572e-06, + "loss": 0.76498008, + "num_input_tokens_seen": 220003165, + "step": 10215, + "time_per_iteration": 2.69331693649292 + }, + { + "auxiliary_loss_clip": 0.01055334, + "auxiliary_loss_mlp": 0.0103374, + "balance_loss_clip": 1.02624345, + "balance_loss_mlp": 1.02249873, + "epoch": 0.6142191492559748, + "flos": 23871653953920.0, + "grad_norm": 1.6622928630257305, + "language_loss": 0.78665072, + "learning_rate": 1.368203464858542e-06, + "loss": 0.80754149, + "num_input_tokens_seen": 220021015, + "step": 10216, + "time_per_iteration": 2.6777236461639404 + }, + { + "auxiliary_loss_clip": 0.01065624, + "auxiliary_loss_mlp": 0.01029808, + "balance_loss_clip": 1.02737546, + "balance_loss_mlp": 1.01815486, + "epoch": 0.6142792725086427, + "flos": 15042513260160.0, + "grad_norm": 1.9366768248733597, + "language_loss": 0.7969929, + "learning_rate": 1.3678339582964147e-06, + "loss": 0.81794721, + "num_input_tokens_seen": 220035780, + "step": 10217, + "time_per_iteration": 2.5932555198669434 + }, + { + "auxiliary_loss_clip": 0.01045646, + "auxiliary_loss_mlp": 0.01024931, + "balance_loss_clip": 1.024845, + "balance_loss_mlp": 1.01332617, + "epoch": 0.6143393957613107, + "flos": 23330947547520.0, + "grad_norm": 2.4305845859099424, + "language_loss": 0.78635991, + "learning_rate": 1.3674644757060865e-06, + "loss": 0.80706567, + "num_input_tokens_seen": 220054280, + "step": 10218, + "time_per_iteration": 2.65704083442688 + }, + { + "auxiliary_loss_clip": 0.0105444, + "auxiliary_loss_mlp": 0.01031685, + "balance_loss_clip": 1.02525187, + "balance_loss_mlp": 1.02081847, + "epoch": 0.6143995190139786, + "flos": 20117086058880.0, + "grad_norm": 1.8312264021050373, + "language_loss": 0.81802124, + "learning_rate": 1.367095017101569e-06, + "loss": 0.83888245, + "num_input_tokens_seen": 220074120, + "step": 10219, + "time_per_iteration": 2.722191095352173 + }, + { + "auxiliary_loss_clip": 0.01055796, + "auxiliary_loss_mlp": 0.01030271, + "balance_loss_clip": 1.02445769, + "balance_loss_mlp": 1.01843929, + "epoch": 0.6144596422666466, + "flos": 42303559489920.0, + "grad_norm": 1.8343580439267126, + "language_loss": 0.66917849, + "learning_rate": 1.3667255824968717e-06, + "loss": 0.69003916, + "num_input_tokens_seen": 220096320, + "step": 10220, + "time_per_iteration": 2.763211488723755 + }, + { + "auxiliary_loss_clip": 0.01052327, + "auxiliary_loss_mlp": 0.01023556, + "balance_loss_clip": 1.0234406, + "balance_loss_mlp": 1.0124402, + "epoch": 0.6145197655193146, + "flos": 21573622558080.0, + "grad_norm": 2.2365916695411183, + "language_loss": 0.71793818, + "learning_rate": 1.3663561719060041e-06, + "loss": 0.73869705, + "num_input_tokens_seen": 220114850, + "step": 10221, + "time_per_iteration": 2.738830089569092 + }, + { + "auxiliary_loss_clip": 0.01021516, + "auxiliary_loss_mlp": 0.01027539, + "balance_loss_clip": 1.02338314, + "balance_loss_mlp": 1.01653004, + "epoch": 0.6145798887719826, + "flos": 21471098163840.0, + "grad_norm": 2.1888690195777434, + "language_loss": 0.79259193, + "learning_rate": 1.3659867853429735e-06, + "loss": 0.81308246, + "num_input_tokens_seen": 220133395, + "step": 10222, + "time_per_iteration": 2.717003583908081 + }, + { + "auxiliary_loss_clip": 0.01040927, + "auxiliary_loss_mlp": 0.01032783, + "balance_loss_clip": 1.02537692, + "balance_loss_mlp": 1.02079058, + "epoch": 0.6146400120246506, + "flos": 20777016683520.0, + "grad_norm": 1.8888375792773486, + "language_loss": 0.75583857, + "learning_rate": 1.365617422821788e-06, + "loss": 0.77657562, + "num_input_tokens_seen": 220152790, + "step": 10223, + "time_per_iteration": 2.665916681289673 + }, + { + "auxiliary_loss_clip": 0.01042195, + "auxiliary_loss_mlp": 0.01028372, + "balance_loss_clip": 1.02390492, + "balance_loss_mlp": 1.01702297, + "epoch": 0.6147001352773185, + "flos": 13881306384000.0, + "grad_norm": 1.8624093236152925, + "language_loss": 0.7809009, + "learning_rate": 1.3652480843564535e-06, + "loss": 0.80160654, + "num_input_tokens_seen": 220169535, + "step": 10224, + "time_per_iteration": 2.7555503845214844 + }, + { + "auxiliary_loss_clip": 0.01029426, + "auxiliary_loss_mlp": 0.01027426, + "balance_loss_clip": 1.02199054, + "balance_loss_mlp": 1.0170784, + "epoch": 0.6147602585299865, + "flos": 56641791807360.0, + "grad_norm": 1.2032942078842028, + "language_loss": 0.66585356, + "learning_rate": 1.3648787699609746e-06, + "loss": 0.68642205, + "num_input_tokens_seen": 220195305, + "step": 10225, + "time_per_iteration": 3.091613292694092 + }, + { + "auxiliary_loss_clip": 0.0105959, + "auxiliary_loss_mlp": 0.00747824, + "balance_loss_clip": 1.0279789, + "balance_loss_mlp": 1.00047863, + "epoch": 0.6148203817826544, + "flos": 32817217605120.0, + "grad_norm": 2.2480400460216345, + "language_loss": 0.63415325, + "learning_rate": 1.364509479649357e-06, + "loss": 0.6522274, + "num_input_tokens_seen": 220215040, + "step": 10226, + "time_per_iteration": 2.8218741416931152 + }, + { + "auxiliary_loss_clip": 0.01038251, + "auxiliary_loss_mlp": 0.01032713, + "balance_loss_clip": 1.02352893, + "balance_loss_mlp": 1.02024412, + "epoch": 0.6148805050353224, + "flos": 18332038748160.0, + "grad_norm": 1.7716651665574337, + "language_loss": 0.75840777, + "learning_rate": 1.3641402134356037e-06, + "loss": 0.77911741, + "num_input_tokens_seen": 220234205, + "step": 10227, + "time_per_iteration": 2.733008861541748 + }, + { + "auxiliary_loss_clip": 0.01002793, + "auxiliary_loss_mlp": 0.01035551, + "balance_loss_clip": 1.01968265, + "balance_loss_mlp": 1.02119851, + "epoch": 0.6149406282879903, + "flos": 14063983977600.0, + "grad_norm": 2.1533869516642126, + "language_loss": 0.61778718, + "learning_rate": 1.3637709713337164e-06, + "loss": 0.6381706, + "num_input_tokens_seen": 220252730, + "step": 10228, + "time_per_iteration": 2.8763294219970703 + }, + { + "auxiliary_loss_clip": 0.01038337, + "auxiliary_loss_mlp": 0.0102809, + "balance_loss_clip": 1.0228256, + "balance_loss_mlp": 1.01698601, + "epoch": 0.6150007515406584, + "flos": 25190186400000.0, + "grad_norm": 1.2710256204404453, + "language_loss": 0.74227035, + "learning_rate": 1.3634017533576985e-06, + "loss": 0.76293457, + "num_input_tokens_seen": 220273345, + "step": 10229, + "time_per_iteration": 2.882040023803711 + }, + { + "auxiliary_loss_clip": 0.0106708, + "auxiliary_loss_mlp": 0.01034681, + "balance_loss_clip": 1.02714038, + "balance_loss_mlp": 1.02277184, + "epoch": 0.6150608747933263, + "flos": 21945262625280.0, + "grad_norm": 1.9296633290337253, + "language_loss": 0.78021222, + "learning_rate": 1.3630325595215493e-06, + "loss": 0.80122983, + "num_input_tokens_seen": 220293845, + "step": 10230, + "time_per_iteration": 2.7121763229370117 + }, + { + "auxiliary_loss_clip": 0.01038714, + "auxiliary_loss_mlp": 0.01032643, + "balance_loss_clip": 1.02393687, + "balance_loss_mlp": 1.02096009, + "epoch": 0.6151209980459943, + "flos": 30117453523200.0, + "grad_norm": 1.670929689394107, + "language_loss": 0.72835213, + "learning_rate": 1.36266338983927e-06, + "loss": 0.7490657, + "num_input_tokens_seen": 220316070, + "step": 10231, + "time_per_iteration": 2.6961166858673096 + }, + { + "auxiliary_loss_clip": 0.01045205, + "auxiliary_loss_mlp": 0.0103245, + "balance_loss_clip": 1.02543449, + "balance_loss_mlp": 1.02164984, + "epoch": 0.6151811212986622, + "flos": 30008356940160.0, + "grad_norm": 1.4688572195995937, + "language_loss": 0.70029926, + "learning_rate": 1.362294244324858e-06, + "loss": 0.72107589, + "num_input_tokens_seen": 220335695, + "step": 10232, + "time_per_iteration": 2.734548330307007 + }, + { + "auxiliary_loss_clip": 0.01048553, + "auxiliary_loss_mlp": 0.0074764, + "balance_loss_clip": 1.0235033, + "balance_loss_mlp": 1.00046444, + "epoch": 0.6152412445513302, + "flos": 18872888808960.0, + "grad_norm": 1.9424465601073624, + "language_loss": 0.9197709, + "learning_rate": 1.3619251229923126e-06, + "loss": 0.93773282, + "num_input_tokens_seen": 220353720, + "step": 10233, + "time_per_iteration": 4.241269588470459 + }, + { + "auxiliary_loss_clip": 0.01044932, + "auxiliary_loss_mlp": 0.01033468, + "balance_loss_clip": 1.02682376, + "balance_loss_mlp": 1.02288842, + "epoch": 0.6153013678039982, + "flos": 25703601448320.0, + "grad_norm": 1.6768283674050475, + "language_loss": 0.71822387, + "learning_rate": 1.3615560258556306e-06, + "loss": 0.73900783, + "num_input_tokens_seen": 220372515, + "step": 10234, + "time_per_iteration": 4.299499273300171 + }, + { + "auxiliary_loss_clip": 0.01056114, + "auxiliary_loss_mlp": 0.0074773, + "balance_loss_clip": 1.02496803, + "balance_loss_mlp": 1.0005846, + "epoch": 0.6153614910566662, + "flos": 28510271383680.0, + "grad_norm": 1.8810974154562015, + "language_loss": 0.6670109, + "learning_rate": 1.3611869529288077e-06, + "loss": 0.68504936, + "num_input_tokens_seen": 220393490, + "step": 10235, + "time_per_iteration": 2.685049533843994 + }, + { + "auxiliary_loss_clip": 0.01049923, + "auxiliary_loss_mlp": 0.01027622, + "balance_loss_clip": 1.02555943, + "balance_loss_mlp": 1.01611853, + "epoch": 0.6154216143093342, + "flos": 23549787158400.0, + "grad_norm": 3.269146840419992, + "language_loss": 0.814255, + "learning_rate": 1.3608179042258398e-06, + "loss": 0.83503044, + "num_input_tokens_seen": 220412855, + "step": 10236, + "time_per_iteration": 2.6268134117126465 + }, + { + "auxiliary_loss_clip": 0.01068009, + "auxiliary_loss_mlp": 0.01028572, + "balance_loss_clip": 1.02604389, + "balance_loss_mlp": 1.01716948, + "epoch": 0.6154817375620021, + "flos": 22748081552640.0, + "grad_norm": 1.3712875497149422, + "language_loss": 0.80732799, + "learning_rate": 1.360448879760721e-06, + "loss": 0.8282938, + "num_input_tokens_seen": 220433440, + "step": 10237, + "time_per_iteration": 2.638575553894043 + }, + { + "auxiliary_loss_clip": 0.0105854, + "auxiliary_loss_mlp": 0.01036958, + "balance_loss_clip": 1.02862489, + "balance_loss_mlp": 1.02565134, + "epoch": 0.6155418608146701, + "flos": 27162975121920.0, + "grad_norm": 2.4517014399825006, + "language_loss": 0.75980163, + "learning_rate": 1.3600798795474449e-06, + "loss": 0.78075659, + "num_input_tokens_seen": 220453445, + "step": 10238, + "time_per_iteration": 2.694504499435425 + }, + { + "auxiliary_loss_clip": 0.00966205, + "auxiliary_loss_mlp": 0.010087, + "balance_loss_clip": 1.00899208, + "balance_loss_mlp": 1.00722826, + "epoch": 0.615601984067338, + "flos": 68811165014400.0, + "grad_norm": 0.7702787605842067, + "language_loss": 0.57656443, + "learning_rate": 1.3597109036000036e-06, + "loss": 0.59631348, + "num_input_tokens_seen": 220509730, + "step": 10239, + "time_per_iteration": 3.4806113243103027 + }, + { + "auxiliary_loss_clip": 0.01045331, + "auxiliary_loss_mlp": 0.01033712, + "balance_loss_clip": 1.02493191, + "balance_loss_mlp": 1.02196383, + "epoch": 0.615662107320006, + "flos": 15517144598400.0, + "grad_norm": 1.8565198076588854, + "language_loss": 0.77207041, + "learning_rate": 1.3593419519323892e-06, + "loss": 0.79286087, + "num_input_tokens_seen": 220527295, + "step": 10240, + "time_per_iteration": 2.897615909576416 + }, + { + "auxiliary_loss_clip": 0.01069128, + "auxiliary_loss_mlp": 0.0103416, + "balance_loss_clip": 1.02852547, + "balance_loss_mlp": 1.02223325, + "epoch": 0.615722230572674, + "flos": 21063691128960.0, + "grad_norm": 2.4171131949071674, + "language_loss": 0.72877097, + "learning_rate": 1.3589730245585922e-06, + "loss": 0.74980384, + "num_input_tokens_seen": 220542730, + "step": 10241, + "time_per_iteration": 2.697711229324341 + }, + { + "auxiliary_loss_clip": 0.01065474, + "auxiliary_loss_mlp": 0.01027131, + "balance_loss_clip": 1.02629328, + "balance_loss_mlp": 1.01643825, + "epoch": 0.615782353825342, + "flos": 23256791919360.0, + "grad_norm": 1.6147556814947468, + "language_loss": 0.71952927, + "learning_rate": 1.3586041214926018e-06, + "loss": 0.74045533, + "num_input_tokens_seen": 220562995, + "step": 10242, + "time_per_iteration": 2.6679041385650635 + }, + { + "auxiliary_loss_clip": 0.01056221, + "auxiliary_loss_mlp": 0.0103048, + "balance_loss_clip": 1.02633357, + "balance_loss_mlp": 1.0196259, + "epoch": 0.6158424770780099, + "flos": 21103911383040.0, + "grad_norm": 1.823015264119206, + "language_loss": 0.72296178, + "learning_rate": 1.3582352427484086e-06, + "loss": 0.74382877, + "num_input_tokens_seen": 220581775, + "step": 10243, + "time_per_iteration": 2.721660614013672 + }, + { + "auxiliary_loss_clip": 0.01000031, + "auxiliary_loss_mlp": 0.01005901, + "balance_loss_clip": 1.0036087, + "balance_loss_mlp": 1.00479209, + "epoch": 0.6159026003306779, + "flos": 70333276769280.0, + "grad_norm": 0.9637902737027375, + "language_loss": 0.56892693, + "learning_rate": 1.3578663883399984e-06, + "loss": 0.58898622, + "num_input_tokens_seen": 220646395, + "step": 10244, + "time_per_iteration": 3.229480266571045 + }, + { + "auxiliary_loss_clip": 0.01065122, + "auxiliary_loss_mlp": 0.01029867, + "balance_loss_clip": 1.02481079, + "balance_loss_mlp": 1.01786232, + "epoch": 0.6159627235833458, + "flos": 33874355802240.0, + "grad_norm": 2.1534925334361548, + "language_loss": 0.63392609, + "learning_rate": 1.3574975582813593e-06, + "loss": 0.65487599, + "num_input_tokens_seen": 220668335, + "step": 10245, + "time_per_iteration": 4.371721982955933 + }, + { + "auxiliary_loss_clip": 0.01014133, + "auxiliary_loss_mlp": 0.01029368, + "balance_loss_clip": 1.02126098, + "balance_loss_mlp": 1.01813316, + "epoch": 0.6160228468360138, + "flos": 26575440359040.0, + "grad_norm": 1.768608569305865, + "language_loss": 0.7906146, + "learning_rate": 1.3571287525864771e-06, + "loss": 0.81104958, + "num_input_tokens_seen": 220688915, + "step": 10246, + "time_per_iteration": 2.795492172241211 + }, + { + "auxiliary_loss_clip": 0.01037413, + "auxiliary_loss_mlp": 0.0074791, + "balance_loss_clip": 1.02710462, + "balance_loss_mlp": 1.00048852, + "epoch": 0.6160829700886818, + "flos": 17193274894080.0, + "grad_norm": 4.459106291551726, + "language_loss": 0.87133276, + "learning_rate": 1.3567599712693368e-06, + "loss": 0.88918602, + "num_input_tokens_seen": 220703465, + "step": 10247, + "time_per_iteration": 2.6947338581085205 + }, + { + "auxiliary_loss_clip": 0.01004631, + "auxiliary_loss_mlp": 0.0103803, + "balance_loss_clip": 1.02423275, + "balance_loss_mlp": 1.02486968, + "epoch": 0.6161430933413498, + "flos": 23623547736960.0, + "grad_norm": 1.6318686197834593, + "language_loss": 0.80156177, + "learning_rate": 1.3563912143439235e-06, + "loss": 0.82198834, + "num_input_tokens_seen": 220722090, + "step": 10248, + "time_per_iteration": 2.7807881832122803 + }, + { + "auxiliary_loss_clip": 0.01020969, + "auxiliary_loss_mlp": 0.01033577, + "balance_loss_clip": 1.02488089, + "balance_loss_mlp": 1.0227046, + "epoch": 0.6162032165940178, + "flos": 23002436736000.0, + "grad_norm": 1.8053009209979083, + "language_loss": 0.87005448, + "learning_rate": 1.3560224818242191e-06, + "loss": 0.89059991, + "num_input_tokens_seen": 220741075, + "step": 10249, + "time_per_iteration": 2.8250226974487305 + }, + { + "auxiliary_loss_clip": 0.01065231, + "auxiliary_loss_mlp": 0.01025609, + "balance_loss_clip": 1.02621198, + "balance_loss_mlp": 1.01376605, + "epoch": 0.6162633398466857, + "flos": 39421979740800.0, + "grad_norm": 2.5895011080934136, + "language_loss": 0.6875422, + "learning_rate": 1.3556537737242072e-06, + "loss": 0.70845062, + "num_input_tokens_seen": 220763395, + "step": 10250, + "time_per_iteration": 2.8430163860321045 + }, + { + "auxiliary_loss_clip": 0.01034348, + "auxiliary_loss_mlp": 0.01026478, + "balance_loss_clip": 1.0223124, + "balance_loss_mlp": 1.01577902, + "epoch": 0.6163234630993537, + "flos": 19244672530560.0, + "grad_norm": 3.6009807930726336, + "language_loss": 0.74218237, + "learning_rate": 1.3552850900578692e-06, + "loss": 0.76279068, + "num_input_tokens_seen": 220780640, + "step": 10251, + "time_per_iteration": 2.8130483627319336 + }, + { + "auxiliary_loss_clip": 0.0104724, + "auxiliary_loss_mlp": 0.01031412, + "balance_loss_clip": 1.0230689, + "balance_loss_mlp": 1.01884794, + "epoch": 0.6163835863520216, + "flos": 15961791058560.0, + "grad_norm": 2.3440213868622886, + "language_loss": 0.68638664, + "learning_rate": 1.3549164308391844e-06, + "loss": 0.70717317, + "num_input_tokens_seen": 220797960, + "step": 10252, + "time_per_iteration": 2.5749714374542236 + }, + { + "auxiliary_loss_clip": 0.00953014, + "auxiliary_loss_mlp": 0.01003386, + "balance_loss_clip": 1.00623989, + "balance_loss_mlp": 1.00231874, + "epoch": 0.6164437096046896, + "flos": 68103834393600.0, + "grad_norm": 0.8932389174742461, + "language_loss": 0.57861632, + "learning_rate": 1.3545477960821333e-06, + "loss": 0.59818029, + "num_input_tokens_seen": 220856930, + "step": 10253, + "time_per_iteration": 3.529618740081787 + }, + { + "auxiliary_loss_clip": 0.0104359, + "auxiliary_loss_mlp": 0.0103067, + "balance_loss_clip": 1.02469015, + "balance_loss_mlp": 1.0193274, + "epoch": 0.6165038328573575, + "flos": 21361211481600.0, + "grad_norm": 1.6806178141997914, + "language_loss": 0.79814899, + "learning_rate": 1.3541791858006946e-06, + "loss": 0.81889164, + "num_input_tokens_seen": 220877595, + "step": 10254, + "time_per_iteration": 5.094131231307983 + }, + { + "auxiliary_loss_clip": 0.01048544, + "auxiliary_loss_mlp": 0.01030348, + "balance_loss_clip": 1.02655268, + "balance_loss_mlp": 1.01883841, + "epoch": 0.6165639561100256, + "flos": 21101972048640.0, + "grad_norm": 1.6750752817873205, + "language_loss": 0.8075316, + "learning_rate": 1.353810600008846e-06, + "loss": 0.8283205, + "num_input_tokens_seen": 220896880, + "step": 10255, + "time_per_iteration": 2.761697292327881 + }, + { + "auxiliary_loss_clip": 0.01046647, + "auxiliary_loss_mlp": 0.01032885, + "balance_loss_clip": 1.02539086, + "balance_loss_mlp": 1.02075577, + "epoch": 0.6166240793626935, + "flos": 25338533569920.0, + "grad_norm": 1.880137111804278, + "language_loss": 0.65343595, + "learning_rate": 1.3534420387205646e-06, + "loss": 0.67423123, + "num_input_tokens_seen": 220916425, + "step": 10256, + "time_per_iteration": 2.70172381401062 + }, + { + "auxiliary_loss_clip": 0.01055872, + "auxiliary_loss_mlp": 0.01025001, + "balance_loss_clip": 1.02733278, + "balance_loss_mlp": 1.01447475, + "epoch": 0.6166842026153615, + "flos": 19682639061120.0, + "grad_norm": 1.7835380198077164, + "language_loss": 0.71908724, + "learning_rate": 1.353073501949825e-06, + "loss": 0.73989606, + "num_input_tokens_seen": 220935050, + "step": 10257, + "time_per_iteration": 2.6645147800445557 + }, + { + "auxiliary_loss_clip": 0.01044186, + "auxiliary_loss_mlp": 0.01027523, + "balance_loss_clip": 1.0253402, + "balance_loss_mlp": 1.01560795, + "epoch": 0.6167443258680294, + "flos": 19318361281920.0, + "grad_norm": 1.7654375957326252, + "language_loss": 0.72133929, + "learning_rate": 1.3527049897106034e-06, + "loss": 0.74205637, + "num_input_tokens_seen": 220953085, + "step": 10258, + "time_per_iteration": 2.7054696083068848 + }, + { + "auxiliary_loss_clip": 0.01033857, + "auxiliary_loss_mlp": 0.01036285, + "balance_loss_clip": 1.02119136, + "balance_loss_mlp": 1.02324378, + "epoch": 0.6168044491206974, + "flos": 25265239868160.0, + "grad_norm": 2.241433475480404, + "language_loss": 0.63726437, + "learning_rate": 1.3523365020168735e-06, + "loss": 0.65796578, + "num_input_tokens_seen": 220969050, + "step": 10259, + "time_per_iteration": 2.779177188873291 + }, + { + "auxiliary_loss_clip": 0.01035626, + "auxiliary_loss_mlp": 0.01030804, + "balance_loss_clip": 1.02701235, + "balance_loss_mlp": 1.01882958, + "epoch": 0.6168645723733654, + "flos": 13219903301760.0, + "grad_norm": 1.7634347331860267, + "language_loss": 0.71308941, + "learning_rate": 1.3519680388826084e-06, + "loss": 0.73375368, + "num_input_tokens_seen": 220985825, + "step": 10260, + "time_per_iteration": 2.8990325927734375 + }, + { + "auxiliary_loss_clip": 0.01060924, + "auxiliary_loss_mlp": 0.01033977, + "balance_loss_clip": 1.02895737, + "balance_loss_mlp": 1.02085853, + "epoch": 0.6169246956260334, + "flos": 26652038112000.0, + "grad_norm": 1.921069667990864, + "language_loss": 0.68213141, + "learning_rate": 1.3515996003217803e-06, + "loss": 0.70308042, + "num_input_tokens_seen": 221004465, + "step": 10261, + "time_per_iteration": 2.751000165939331 + }, + { + "auxiliary_loss_clip": 0.01034825, + "auxiliary_loss_mlp": 0.01033493, + "balance_loss_clip": 1.0262444, + "balance_loss_mlp": 1.02338386, + "epoch": 0.6169848188787014, + "flos": 23148413608320.0, + "grad_norm": 1.8195361221718662, + "language_loss": 0.71840876, + "learning_rate": 1.3512311863483602e-06, + "loss": 0.73909199, + "num_input_tokens_seen": 221023260, + "step": 10262, + "time_per_iteration": 2.96528959274292 + }, + { + "auxiliary_loss_clip": 0.01044619, + "auxiliary_loss_mlp": 0.01030411, + "balance_loss_clip": 1.02460933, + "balance_loss_mlp": 1.0190084, + "epoch": 0.6170449421313693, + "flos": 23331917214720.0, + "grad_norm": 1.6071518963693727, + "language_loss": 0.69973183, + "learning_rate": 1.3508627969763188e-06, + "loss": 0.72048217, + "num_input_tokens_seen": 221043090, + "step": 10263, + "time_per_iteration": 2.826355457305908 + }, + { + "auxiliary_loss_clip": 0.01004871, + "auxiliary_loss_mlp": 0.01029678, + "balance_loss_clip": 1.02262366, + "balance_loss_mlp": 1.01816881, + "epoch": 0.6171050653840373, + "flos": 15851617067520.0, + "grad_norm": 2.606250375188036, + "language_loss": 0.76501513, + "learning_rate": 1.3504944322196244e-06, + "loss": 0.78536063, + "num_input_tokens_seen": 221061435, + "step": 10264, + "time_per_iteration": 2.9234402179718018 + }, + { + "auxiliary_loss_clip": 0.0106832, + "auxiliary_loss_mlp": 0.01029663, + "balance_loss_clip": 1.02768517, + "balance_loss_mlp": 1.01750946, + "epoch": 0.6171651886367052, + "flos": 20045516209920.0, + "grad_norm": 2.5012739947597695, + "language_loss": 0.85052645, + "learning_rate": 1.350126092092247e-06, + "loss": 0.87150627, + "num_input_tokens_seen": 221078705, + "step": 10265, + "time_per_iteration": 2.5948526859283447 + }, + { + "auxiliary_loss_clip": 0.01013589, + "auxiliary_loss_mlp": 0.01030555, + "balance_loss_clip": 1.02713013, + "balance_loss_mlp": 1.01883662, + "epoch": 0.6172253118893732, + "flos": 26432695710720.0, + "grad_norm": 1.7453623865925627, + "language_loss": 0.6435883, + "learning_rate": 1.349757776608153e-06, + "loss": 0.66402978, + "num_input_tokens_seen": 221099245, + "step": 10266, + "time_per_iteration": 2.903219699859619 + }, + { + "auxiliary_loss_clip": 0.01026762, + "auxiliary_loss_mlp": 0.01029524, + "balance_loss_clip": 1.02225852, + "balance_loss_mlp": 1.01842546, + "epoch": 0.6172854351420412, + "flos": 22632879657600.0, + "grad_norm": 1.681350859473636, + "language_loss": 0.75501722, + "learning_rate": 1.3493894857813094e-06, + "loss": 0.77558005, + "num_input_tokens_seen": 221116930, + "step": 10267, + "time_per_iteration": 2.679157018661499 + }, + { + "auxiliary_loss_clip": 0.01033081, + "auxiliary_loss_mlp": 0.01027273, + "balance_loss_clip": 1.02338493, + "balance_loss_mlp": 1.0154413, + "epoch": 0.6173455583947092, + "flos": 21212936138880.0, + "grad_norm": 2.027381994118958, + "language_loss": 0.74916494, + "learning_rate": 1.3490212196256818e-06, + "loss": 0.76976848, + "num_input_tokens_seen": 221137660, + "step": 10268, + "time_per_iteration": 2.7678074836730957 + }, + { + "auxiliary_loss_clip": 0.01047035, + "auxiliary_loss_mlp": 0.01029003, + "balance_loss_clip": 1.02584112, + "balance_loss_mlp": 1.01736259, + "epoch": 0.6174056816473771, + "flos": 19500284689920.0, + "grad_norm": 1.860270525002865, + "language_loss": 0.75545609, + "learning_rate": 1.3486529781552342e-06, + "loss": 0.77621651, + "num_input_tokens_seen": 221156225, + "step": 10269, + "time_per_iteration": 2.5854547023773193 + }, + { + "auxiliary_loss_clip": 0.01062078, + "auxiliary_loss_mlp": 0.01026485, + "balance_loss_clip": 1.02323043, + "balance_loss_mlp": 1.01542258, + "epoch": 0.6174658049000451, + "flos": 15997342544640.0, + "grad_norm": 2.1214585674220063, + "language_loss": 0.76858264, + "learning_rate": 1.3482847613839318e-06, + "loss": 0.78946829, + "num_input_tokens_seen": 221173820, + "step": 10270, + "time_per_iteration": 2.5594451427459717 + }, + { + "auxiliary_loss_clip": 0.010445, + "auxiliary_loss_mlp": 0.01025528, + "balance_loss_clip": 1.02454424, + "balance_loss_mlp": 1.01455474, + "epoch": 0.617525928152713, + "flos": 21903893136000.0, + "grad_norm": 1.7712918064750534, + "language_loss": 0.8236559, + "learning_rate": 1.347916569325736e-06, + "loss": 0.84435612, + "num_input_tokens_seen": 221191815, + "step": 10271, + "time_per_iteration": 2.766706943511963 + }, + { + "auxiliary_loss_clip": 0.0106546, + "auxiliary_loss_mlp": 0.00747713, + "balance_loss_clip": 1.02544165, + "balance_loss_mlp": 1.00046647, + "epoch": 0.617586051405381, + "flos": 21105958458240.0, + "grad_norm": 1.6368726169145214, + "language_loss": 0.77271366, + "learning_rate": 1.3475484019946093e-06, + "loss": 0.79084539, + "num_input_tokens_seen": 221211205, + "step": 10272, + "time_per_iteration": 2.6519837379455566 + }, + { + "auxiliary_loss_clip": 0.00980734, + "auxiliary_loss_mlp": 0.01002061, + "balance_loss_clip": 1.00360775, + "balance_loss_mlp": 1.00088108, + "epoch": 0.617646174658049, + "flos": 58610776665600.0, + "grad_norm": 0.8032768412093326, + "language_loss": 0.59145713, + "learning_rate": 1.347180259404513e-06, + "loss": 0.61128509, + "num_input_tokens_seen": 221268430, + "step": 10273, + "time_per_iteration": 3.222564697265625 + }, + { + "auxiliary_loss_clip": 0.01031399, + "auxiliary_loss_mlp": 0.01031849, + "balance_loss_clip": 1.02034807, + "balance_loss_mlp": 1.01887298, + "epoch": 0.617706297910717, + "flos": 13878684691200.0, + "grad_norm": 4.576307147911196, + "language_loss": 0.73221207, + "learning_rate": 1.3468121415694059e-06, + "loss": 0.75284457, + "num_input_tokens_seen": 221281930, + "step": 10274, + "time_per_iteration": 2.6166884899139404 + }, + { + "auxiliary_loss_clip": 0.01052996, + "auxiliary_loss_mlp": 0.00747739, + "balance_loss_clip": 1.02369213, + "balance_loss_mlp": 1.00048459, + "epoch": 0.617766421163385, + "flos": 19208438686080.0, + "grad_norm": 1.9319186161912343, + "language_loss": 0.77310973, + "learning_rate": 1.3464440485032484e-06, + "loss": 0.79111707, + "num_input_tokens_seen": 221301605, + "step": 10275, + "time_per_iteration": 2.826669454574585 + }, + { + "auxiliary_loss_clip": 0.01035055, + "auxiliary_loss_mlp": 0.01027545, + "balance_loss_clip": 1.02550125, + "balance_loss_mlp": 1.01685786, + "epoch": 0.6178265444160529, + "flos": 22565978576640.0, + "grad_norm": 2.6390281609418533, + "language_loss": 0.79296291, + "learning_rate": 1.346075980219998e-06, + "loss": 0.81358892, + "num_input_tokens_seen": 221320105, + "step": 10276, + "time_per_iteration": 2.8053274154663086 + }, + { + "auxiliary_loss_clip": 0.00996646, + "auxiliary_loss_mlp": 0.01037929, + "balance_loss_clip": 1.02022123, + "balance_loss_mlp": 1.02537632, + "epoch": 0.6178866676687209, + "flos": 11984289402240.0, + "grad_norm": 1.8672839932872072, + "language_loss": 0.8079893, + "learning_rate": 1.345707936733612e-06, + "loss": 0.82833505, + "num_input_tokens_seen": 221335915, + "step": 10277, + "time_per_iteration": 2.7926836013793945 + }, + { + "auxiliary_loss_clip": 0.01037855, + "auxiliary_loss_mlp": 0.01025814, + "balance_loss_clip": 1.02620173, + "balance_loss_mlp": 1.0139761, + "epoch": 0.6179467909213888, + "flos": 20991510748800.0, + "grad_norm": 1.763315747898048, + "language_loss": 0.81453347, + "learning_rate": 1.3453399180580466e-06, + "loss": 0.83517021, + "num_input_tokens_seen": 221353965, + "step": 10278, + "time_per_iteration": 2.762446641921997 + }, + { + "auxiliary_loss_clip": 0.01021489, + "auxiliary_loss_mlp": 0.00747505, + "balance_loss_clip": 1.02295721, + "balance_loss_mlp": 1.00039351, + "epoch": 0.6180069141740568, + "flos": 25338102606720.0, + "grad_norm": 1.851087887982321, + "language_loss": 0.74100006, + "learning_rate": 1.3449719242072567e-06, + "loss": 0.75869, + "num_input_tokens_seen": 221374080, + "step": 10279, + "time_per_iteration": 2.8927440643310547 + }, + { + "auxiliary_loss_clip": 0.01044994, + "auxiliary_loss_mlp": 0.01026913, + "balance_loss_clip": 1.02149415, + "balance_loss_mlp": 1.01598179, + "epoch": 0.6180670374267248, + "flos": 19645722858240.0, + "grad_norm": 1.6525444666684501, + "language_loss": 0.70643163, + "learning_rate": 1.3446039551951975e-06, + "loss": 0.72715068, + "num_input_tokens_seen": 221392910, + "step": 10280, + "time_per_iteration": 4.263262033462524 + }, + { + "auxiliary_loss_clip": 0.01064213, + "auxiliary_loss_mlp": 0.01030299, + "balance_loss_clip": 1.02486038, + "balance_loss_mlp": 1.01903915, + "epoch": 0.6181271606793928, + "flos": 19464876858240.0, + "grad_norm": 1.4529405360000005, + "language_loss": 0.72619581, + "learning_rate": 1.3442360110358215e-06, + "loss": 0.747141, + "num_input_tokens_seen": 221410990, + "step": 10281, + "time_per_iteration": 2.5851314067840576 + }, + { + "auxiliary_loss_clip": 0.01042758, + "auxiliary_loss_mlp": 0.01030987, + "balance_loss_clip": 1.02536762, + "balance_loss_mlp": 1.02096188, + "epoch": 0.6181872839320607, + "flos": 25594289383680.0, + "grad_norm": 1.5322601959500723, + "language_loss": 0.76499313, + "learning_rate": 1.3438680917430827e-06, + "loss": 0.7857306, + "num_input_tokens_seen": 221431020, + "step": 10282, + "time_per_iteration": 4.330819845199585 + }, + { + "auxiliary_loss_clip": 0.01037562, + "auxiliary_loss_mlp": 0.01034075, + "balance_loss_clip": 1.02198422, + "balance_loss_mlp": 1.01913846, + "epoch": 0.6182474071847287, + "flos": 25551806572800.0, + "grad_norm": 1.595142428491001, + "language_loss": 0.68612313, + "learning_rate": 1.343500197330931e-06, + "loss": 0.7068395, + "num_input_tokens_seen": 221453235, + "step": 10283, + "time_per_iteration": 2.9122297763824463 + }, + { + "auxiliary_loss_clip": 0.01058167, + "auxiliary_loss_mlp": 0.01028166, + "balance_loss_clip": 1.02513862, + "balance_loss_mlp": 1.01613808, + "epoch": 0.6183075304373966, + "flos": 22123738327680.0, + "grad_norm": 1.512560667525096, + "language_loss": 0.75257981, + "learning_rate": 1.3431323278133176e-06, + "loss": 0.77344316, + "num_input_tokens_seen": 221472560, + "step": 10284, + "time_per_iteration": 2.7202584743499756 + }, + { + "auxiliary_loss_clip": 0.01053664, + "auxiliary_loss_mlp": 0.01036725, + "balance_loss_clip": 1.02613711, + "balance_loss_mlp": 1.02563286, + "epoch": 0.6183676536900646, + "flos": 22455589104000.0, + "grad_norm": 1.4181002641468716, + "language_loss": 0.7572881, + "learning_rate": 1.3427644832041922e-06, + "loss": 0.77819198, + "num_input_tokens_seen": 221492835, + "step": 10285, + "time_per_iteration": 2.8058745861053467 + }, + { + "auxiliary_loss_clip": 0.01028959, + "auxiliary_loss_mlp": 0.01028231, + "balance_loss_clip": 1.02243793, + "balance_loss_mlp": 1.01682854, + "epoch": 0.6184277769427327, + "flos": 23364128736000.0, + "grad_norm": 1.4676715205574444, + "language_loss": 0.72711229, + "learning_rate": 1.342396663517503e-06, + "loss": 0.74768418, + "num_input_tokens_seen": 221511870, + "step": 10286, + "time_per_iteration": 2.659742832183838 + }, + { + "auxiliary_loss_clip": 0.01062971, + "auxiliary_loss_mlp": 0.01026026, + "balance_loss_clip": 1.02487016, + "balance_loss_mlp": 1.01552367, + "epoch": 0.6184879001954006, + "flos": 22711057608960.0, + "grad_norm": 1.5368025030623311, + "language_loss": 0.7596249, + "learning_rate": 1.342028868767199e-06, + "loss": 0.7805149, + "num_input_tokens_seen": 221529915, + "step": 10287, + "time_per_iteration": 2.727713108062744 + }, + { + "auxiliary_loss_clip": 0.01025352, + "auxiliary_loss_mlp": 0.01030116, + "balance_loss_clip": 1.02324629, + "balance_loss_mlp": 1.01888657, + "epoch": 0.6185480234480686, + "flos": 23841920471040.0, + "grad_norm": 1.5767542307479008, + "language_loss": 0.728176, + "learning_rate": 1.3416610989672262e-06, + "loss": 0.74873066, + "num_input_tokens_seen": 221549745, + "step": 10288, + "time_per_iteration": 2.827009916305542 + }, + { + "auxiliary_loss_clip": 0.01048438, + "auxiliary_loss_mlp": 0.01030021, + "balance_loss_clip": 1.02314615, + "balance_loss_mlp": 1.01969147, + "epoch": 0.6186081467007365, + "flos": 45477595774080.0, + "grad_norm": 1.4782840136999873, + "language_loss": 0.72836488, + "learning_rate": 1.3412933541315296e-06, + "loss": 0.7491495, + "num_input_tokens_seen": 221572455, + "step": 10289, + "time_per_iteration": 2.9796547889709473 + }, + { + "auxiliary_loss_clip": 0.01037501, + "auxiliary_loss_mlp": 0.01029327, + "balance_loss_clip": 1.02345681, + "balance_loss_mlp": 1.01769769, + "epoch": 0.6186682699534045, + "flos": 23550864566400.0, + "grad_norm": 1.4478268227668187, + "language_loss": 0.79253244, + "learning_rate": 1.340925634274056e-06, + "loss": 0.81320071, + "num_input_tokens_seen": 221591325, + "step": 10290, + "time_per_iteration": 2.8732199668884277 + }, + { + "auxiliary_loss_clip": 0.01056893, + "auxiliary_loss_mlp": 0.01029552, + "balance_loss_clip": 1.02567649, + "balance_loss_mlp": 1.01831675, + "epoch": 0.6187283932060724, + "flos": 25774201630080.0, + "grad_norm": 2.8499483034730346, + "language_loss": 0.81143153, + "learning_rate": 1.3405579394087475e-06, + "loss": 0.83229601, + "num_input_tokens_seen": 221611640, + "step": 10291, + "time_per_iteration": 2.6563916206359863 + }, + { + "auxiliary_loss_clip": 0.0106461, + "auxiliary_loss_mlp": 0.01033604, + "balance_loss_clip": 1.02554476, + "balance_loss_mlp": 1.02241588, + "epoch": 0.6187885164587404, + "flos": 25265203954560.0, + "grad_norm": 1.6600763484480499, + "language_loss": 0.77523053, + "learning_rate": 1.3401902695495487e-06, + "loss": 0.79621267, + "num_input_tokens_seen": 221631225, + "step": 10292, + "time_per_iteration": 4.256412506103516 + }, + { + "auxiliary_loss_clip": 0.01031695, + "auxiliary_loss_mlp": 0.01035435, + "balance_loss_clip": 1.02249229, + "balance_loss_mlp": 1.0215534, + "epoch": 0.6188486397114084, + "flos": 26250772302720.0, + "grad_norm": 1.9824551109483242, + "language_loss": 0.73373705, + "learning_rate": 1.339822624710401e-06, + "loss": 0.75440836, + "num_input_tokens_seen": 221651035, + "step": 10293, + "time_per_iteration": 2.65617036819458 + }, + { + "auxiliary_loss_clip": 0.01036466, + "auxiliary_loss_mlp": 0.00747613, + "balance_loss_clip": 1.0259198, + "balance_loss_mlp": 1.00046527, + "epoch": 0.6189087629640764, + "flos": 20923388605440.0, + "grad_norm": 1.712389081245906, + "language_loss": 0.83017987, + "learning_rate": 1.3394550049052454e-06, + "loss": 0.84802067, + "num_input_tokens_seen": 221671300, + "step": 10294, + "time_per_iteration": 2.967193365097046 + }, + { + "auxiliary_loss_clip": 0.01045674, + "auxiliary_loss_mlp": 0.010264, + "balance_loss_clip": 1.0253098, + "balance_loss_mlp": 1.01561201, + "epoch": 0.6189688862167443, + "flos": 14829814874880.0, + "grad_norm": 1.919206050938621, + "language_loss": 0.70616239, + "learning_rate": 1.3390874101480225e-06, + "loss": 0.72688317, + "num_input_tokens_seen": 221687320, + "step": 10295, + "time_per_iteration": 2.9475739002227783 + }, + { + "auxiliary_loss_clip": 0.01066018, + "auxiliary_loss_mlp": 0.01033022, + "balance_loss_clip": 1.02683258, + "balance_loss_mlp": 1.02160132, + "epoch": 0.6190290094694123, + "flos": 24285058560000.0, + "grad_norm": 1.7579459018523473, + "language_loss": 0.70345271, + "learning_rate": 1.3387198404526705e-06, + "loss": 0.72444308, + "num_input_tokens_seen": 221710175, + "step": 10296, + "time_per_iteration": 2.6793360710144043 + }, + { + "auxiliary_loss_clip": 0.01030518, + "auxiliary_loss_mlp": 0.01033334, + "balance_loss_clip": 1.02490699, + "balance_loss_mlp": 1.02096021, + "epoch": 0.6190891327220802, + "flos": 22529457423360.0, + "grad_norm": 1.8310199373555645, + "language_loss": 0.71260142, + "learning_rate": 1.3383522958331287e-06, + "loss": 0.73323995, + "num_input_tokens_seen": 221728145, + "step": 10297, + "time_per_iteration": 2.7603230476379395 + }, + { + "auxiliary_loss_clip": 0.01008379, + "auxiliary_loss_mlp": 0.01001974, + "balance_loss_clip": 1.00194907, + "balance_loss_mlp": 1.00091863, + "epoch": 0.6191492559747482, + "flos": 67729357152000.0, + "grad_norm": 0.8822443627766523, + "language_loss": 0.64153624, + "learning_rate": 1.3379847763033345e-06, + "loss": 0.66163981, + "num_input_tokens_seen": 221786100, + "step": 10298, + "time_per_iteration": 3.066457986831665 + }, + { + "auxiliary_loss_clip": 0.01068142, + "auxiliary_loss_mlp": 0.01034438, + "balance_loss_clip": 1.02764368, + "balance_loss_mlp": 1.02366161, + "epoch": 0.6192093792274163, + "flos": 22346672088960.0, + "grad_norm": 1.7665491912998894, + "language_loss": 0.74225938, + "learning_rate": 1.3376172818772236e-06, + "loss": 0.76328522, + "num_input_tokens_seen": 221806450, + "step": 10299, + "time_per_iteration": 2.6545896530151367 + }, + { + "auxiliary_loss_clip": 0.01058395, + "auxiliary_loss_mlp": 0.01031275, + "balance_loss_clip": 1.02636957, + "balance_loss_mlp": 1.01996815, + "epoch": 0.6192695024800842, + "flos": 13553944807680.0, + "grad_norm": 1.7476325045672505, + "language_loss": 0.68438721, + "learning_rate": 1.337249812568732e-06, + "loss": 0.70528388, + "num_input_tokens_seen": 221823330, + "step": 10300, + "time_per_iteration": 2.695746898651123 + }, + { + "auxiliary_loss_clip": 0.01057362, + "auxiliary_loss_mlp": 0.00747602, + "balance_loss_clip": 1.02713072, + "balance_loss_mlp": 1.00046289, + "epoch": 0.6193296257327522, + "flos": 17415310815360.0, + "grad_norm": 1.7559397162539596, + "language_loss": 0.67234039, + "learning_rate": 1.3368823683917939e-06, + "loss": 0.69038999, + "num_input_tokens_seen": 221839360, + "step": 10301, + "time_per_iteration": 4.322078227996826 + }, + { + "auxiliary_loss_clip": 0.01035239, + "auxiliary_loss_mlp": 0.0102908, + "balance_loss_clip": 1.02431238, + "balance_loss_mlp": 1.01805878, + "epoch": 0.6193897489854201, + "flos": 31101118450560.0, + "grad_norm": 1.56515067009342, + "language_loss": 0.72781938, + "learning_rate": 1.3365149493603424e-06, + "loss": 0.74846256, + "num_input_tokens_seen": 221859465, + "step": 10302, + "time_per_iteration": 2.890714645385742 + }, + { + "auxiliary_loss_clip": 0.01047227, + "auxiliary_loss_mlp": 0.01031672, + "balance_loss_clip": 1.02740884, + "balance_loss_mlp": 1.01956058, + "epoch": 0.6194498722380881, + "flos": 19134031662720.0, + "grad_norm": 2.169873010141017, + "language_loss": 0.80817086, + "learning_rate": 1.3361475554883107e-06, + "loss": 0.82895988, + "num_input_tokens_seen": 221878555, + "step": 10303, + "time_per_iteration": 2.719531536102295 + }, + { + "auxiliary_loss_clip": 0.01066381, + "auxiliary_loss_mlp": 0.01030384, + "balance_loss_clip": 1.02479494, + "balance_loss_mlp": 1.01831985, + "epoch": 0.619509995490756, + "flos": 21835088634240.0, + "grad_norm": 1.7245458498931419, + "language_loss": 0.76559901, + "learning_rate": 1.3357801867896307e-06, + "loss": 0.78656662, + "num_input_tokens_seen": 221898790, + "step": 10304, + "time_per_iteration": 2.6384172439575195 + }, + { + "auxiliary_loss_clip": 0.01040782, + "auxiliary_loss_mlp": 0.01035265, + "balance_loss_clip": 1.02712035, + "balance_loss_mlp": 1.02308798, + "epoch": 0.619570118743424, + "flos": 23806548552960.0, + "grad_norm": 1.9437828520479417, + "language_loss": 0.76582229, + "learning_rate": 1.3354128432782324e-06, + "loss": 0.78658277, + "num_input_tokens_seen": 221918875, + "step": 10305, + "time_per_iteration": 2.769650459289551 + }, + { + "auxiliary_loss_clip": 0.01061143, + "auxiliary_loss_mlp": 0.01031245, + "balance_loss_clip": 1.02765894, + "balance_loss_mlp": 1.01814342, + "epoch": 0.619630241996092, + "flos": 21101612912640.0, + "grad_norm": 1.7577442846752567, + "language_loss": 0.79123497, + "learning_rate": 1.335045524968045e-06, + "loss": 0.81215882, + "num_input_tokens_seen": 221937895, + "step": 10306, + "time_per_iteration": 2.7898035049438477 + }, + { + "auxiliary_loss_clip": 0.01015014, + "auxiliary_loss_mlp": 0.0102821, + "balance_loss_clip": 1.02420449, + "balance_loss_mlp": 1.01798224, + "epoch": 0.61969036524876, + "flos": 27308269635840.0, + "grad_norm": 1.6237636147547938, + "language_loss": 0.80212778, + "learning_rate": 1.3346782318729988e-06, + "loss": 0.82256001, + "num_input_tokens_seen": 221955920, + "step": 10307, + "time_per_iteration": 2.926414728164673 + }, + { + "auxiliary_loss_clip": 0.0097159, + "auxiliary_loss_mlp": 0.01000792, + "balance_loss_clip": 1.00563824, + "balance_loss_mlp": 0.99958843, + "epoch": 0.6197504885014279, + "flos": 51648955384320.0, + "grad_norm": 0.8119504622979522, + "language_loss": 0.59415269, + "learning_rate": 1.3343109640070203e-06, + "loss": 0.61387652, + "num_input_tokens_seen": 222011405, + "step": 10308, + "time_per_iteration": 3.284776449203491 + }, + { + "auxiliary_loss_clip": 0.01043367, + "auxiliary_loss_mlp": 0.0102608, + "balance_loss_clip": 1.02608895, + "balance_loss_mlp": 1.01655483, + "epoch": 0.6198106117540959, + "flos": 30557107992960.0, + "grad_norm": 1.756811728488494, + "language_loss": 0.68069053, + "learning_rate": 1.333943721384037e-06, + "loss": 0.70138502, + "num_input_tokens_seen": 222034545, + "step": 10309, + "time_per_iteration": 2.740192413330078 + }, + { + "auxiliary_loss_clip": 0.01036218, + "auxiliary_loss_mlp": 0.01033223, + "balance_loss_clip": 1.02351546, + "balance_loss_mlp": 1.02209473, + "epoch": 0.6198707350067638, + "flos": 18909733184640.0, + "grad_norm": 1.6862038169985898, + "language_loss": 0.71966779, + "learning_rate": 1.3335765040179746e-06, + "loss": 0.74036217, + "num_input_tokens_seen": 222052690, + "step": 10310, + "time_per_iteration": 2.682006359100342 + }, + { + "auxiliary_loss_clip": 0.01051823, + "auxiliary_loss_mlp": 0.01033336, + "balance_loss_clip": 1.02968526, + "balance_loss_mlp": 1.02047384, + "epoch": 0.6199308582594318, + "flos": 21433858738560.0, + "grad_norm": 1.92682923852563, + "language_loss": 0.78691453, + "learning_rate": 1.3332093119227573e-06, + "loss": 0.80776608, + "num_input_tokens_seen": 222069095, + "step": 10311, + "time_per_iteration": 2.6686601638793945 + }, + { + "auxiliary_loss_clip": 0.01037931, + "auxiliary_loss_mlp": 0.01031655, + "balance_loss_clip": 1.02587283, + "balance_loss_mlp": 1.01988912, + "epoch": 0.6199909815120999, + "flos": 18407379525120.0, + "grad_norm": 1.764737562643936, + "language_loss": 0.72618568, + "learning_rate": 1.3328421451123105e-06, + "loss": 0.74688148, + "num_input_tokens_seen": 222087360, + "step": 10312, + "time_per_iteration": 2.6853535175323486 + }, + { + "auxiliary_loss_clip": 0.01029391, + "auxiliary_loss_mlp": 0.01031899, + "balance_loss_clip": 1.02834797, + "balance_loss_mlp": 1.0203836, + "epoch": 0.6200511047647678, + "flos": 21466860359040.0, + "grad_norm": 1.7809114675830906, + "language_loss": 0.71663356, + "learning_rate": 1.3324750036005557e-06, + "loss": 0.73724651, + "num_input_tokens_seen": 222106130, + "step": 10313, + "time_per_iteration": 2.7322494983673096 + }, + { + "auxiliary_loss_clip": 0.01060253, + "auxiliary_loss_mlp": 0.01031869, + "balance_loss_clip": 1.02793813, + "balance_loss_mlp": 1.01978683, + "epoch": 0.6201112280174358, + "flos": 18215903099520.0, + "grad_norm": 1.8855258188326243, + "language_loss": 0.7852537, + "learning_rate": 1.332107887401416e-06, + "loss": 0.80617499, + "num_input_tokens_seen": 222123125, + "step": 10314, + "time_per_iteration": 2.6134121417999268 + }, + { + "auxiliary_loss_clip": 0.01053593, + "auxiliary_loss_mlp": 0.01032124, + "balance_loss_clip": 1.02383375, + "balance_loss_mlp": 1.020823, + "epoch": 0.6201713512701037, + "flos": 20011185786240.0, + "grad_norm": 1.8102747498397118, + "language_loss": 0.77867067, + "learning_rate": 1.331740796528812e-06, + "loss": 0.79952782, + "num_input_tokens_seen": 222140655, + "step": 10315, + "time_per_iteration": 2.5992209911346436 + }, + { + "auxiliary_loss_clip": 0.01038803, + "auxiliary_loss_mlp": 0.01034478, + "balance_loss_clip": 1.02819896, + "balance_loss_mlp": 1.02327287, + "epoch": 0.6202314745227717, + "flos": 22487692884480.0, + "grad_norm": 1.6198970837333768, + "language_loss": 0.76071811, + "learning_rate": 1.3313737309966641e-06, + "loss": 0.78145093, + "num_input_tokens_seen": 222160450, + "step": 10316, + "time_per_iteration": 2.8503305912017822 + }, + { + "auxiliary_loss_clip": 0.01065943, + "auxiliary_loss_mlp": 0.01027761, + "balance_loss_clip": 1.02518451, + "balance_loss_mlp": 1.01657379, + "epoch": 0.6202915977754396, + "flos": 26828682220800.0, + "grad_norm": 2.5747425232990198, + "language_loss": 0.77139306, + "learning_rate": 1.3310066908188915e-06, + "loss": 0.79233009, + "num_input_tokens_seen": 222179170, + "step": 10317, + "time_per_iteration": 2.649782419204712 + }, + { + "auxiliary_loss_clip": 0.00988565, + "auxiliary_loss_mlp": 0.01003933, + "balance_loss_clip": 1.00209832, + "balance_loss_mlp": 1.00287235, + "epoch": 0.6203517210281076, + "flos": 62742694890240.0, + "grad_norm": 0.6875652907726262, + "language_loss": 0.59130549, + "learning_rate": 1.3306396760094122e-06, + "loss": 0.61123049, + "num_input_tokens_seen": 222242660, + "step": 10318, + "time_per_iteration": 3.395638942718506 + }, + { + "auxiliary_loss_clip": 0.01043368, + "auxiliary_loss_mlp": 0.0103383, + "balance_loss_clip": 1.02819562, + "balance_loss_mlp": 1.02155185, + "epoch": 0.6204118442807756, + "flos": 23404277162880.0, + "grad_norm": 1.5861038191027186, + "language_loss": 0.77856123, + "learning_rate": 1.330272686582143e-06, + "loss": 0.79933321, + "num_input_tokens_seen": 222262170, + "step": 10319, + "time_per_iteration": 2.695770263671875 + }, + { + "auxiliary_loss_clip": 0.01047283, + "auxiliary_loss_mlp": 0.01034723, + "balance_loss_clip": 1.02769732, + "balance_loss_mlp": 1.02399468, + "epoch": 0.6204719675334436, + "flos": 20193647898240.0, + "grad_norm": 2.4344050589032595, + "language_loss": 0.66493177, + "learning_rate": 1.3299057225510013e-06, + "loss": 0.6857518, + "num_input_tokens_seen": 222280375, + "step": 10320, + "time_per_iteration": 2.672539234161377 + }, + { + "auxiliary_loss_clip": 0.01023385, + "auxiliary_loss_mlp": 0.01027063, + "balance_loss_clip": 1.02273369, + "balance_loss_mlp": 1.01651883, + "epoch": 0.6205320907861115, + "flos": 13188050916480.0, + "grad_norm": 3.4662240061096328, + "language_loss": 0.76130778, + "learning_rate": 1.3295387839299013e-06, + "loss": 0.78181225, + "num_input_tokens_seen": 222297325, + "step": 10321, + "time_per_iteration": 2.6847076416015625 + }, + { + "auxiliary_loss_clip": 0.01042715, + "auxiliary_loss_mlp": 0.01024899, + "balance_loss_clip": 1.0247457, + "balance_loss_mlp": 1.01398563, + "epoch": 0.6205922140387795, + "flos": 20668386977280.0, + "grad_norm": 1.6976244423057951, + "language_loss": 0.73575252, + "learning_rate": 1.329171870732758e-06, + "loss": 0.75642866, + "num_input_tokens_seen": 222317095, + "step": 10322, + "time_per_iteration": 2.663120746612549 + }, + { + "auxiliary_loss_clip": 0.01022033, + "auxiliary_loss_mlp": 0.0102586, + "balance_loss_clip": 1.02169108, + "balance_loss_mlp": 1.01526248, + "epoch": 0.6206523372914474, + "flos": 23877831093120.0, + "grad_norm": 1.6374825830683053, + "language_loss": 0.72555524, + "learning_rate": 1.3288049829734845e-06, + "loss": 0.7460342, + "num_input_tokens_seen": 222337055, + "step": 10323, + "time_per_iteration": 2.7302322387695312 + }, + { + "auxiliary_loss_clip": 0.01060905, + "auxiliary_loss_mlp": 0.01029491, + "balance_loss_clip": 1.02793431, + "balance_loss_mlp": 1.01740909, + "epoch": 0.6207124605441154, + "flos": 13406603218560.0, + "grad_norm": 2.548298506032104, + "language_loss": 0.58619666, + "learning_rate": 1.3284381206659933e-06, + "loss": 0.60710061, + "num_input_tokens_seen": 222354515, + "step": 10324, + "time_per_iteration": 2.695439338684082 + }, + { + "auxiliary_loss_clip": 0.01030185, + "auxiliary_loss_mlp": 0.01034489, + "balance_loss_clip": 1.02475607, + "balance_loss_mlp": 1.02117372, + "epoch": 0.6207725837967835, + "flos": 18916341287040.0, + "grad_norm": 1.6672553894040438, + "language_loss": 0.76317286, + "learning_rate": 1.3280712838241956e-06, + "loss": 0.78381962, + "num_input_tokens_seen": 222372755, + "step": 10325, + "time_per_iteration": 2.6696465015411377 + }, + { + "auxiliary_loss_clip": 0.01057364, + "auxiliary_loss_mlp": 0.01029265, + "balance_loss_clip": 1.02663946, + "balance_loss_mlp": 1.01725483, + "epoch": 0.6208327070494514, + "flos": 23980211832960.0, + "grad_norm": 1.8260172935441115, + "language_loss": 0.72469485, + "learning_rate": 1.327704472462003e-06, + "loss": 0.74556112, + "num_input_tokens_seen": 222391380, + "step": 10326, + "time_per_iteration": 2.6036579608917236 + }, + { + "auxiliary_loss_clip": 0.01058388, + "auxiliary_loss_mlp": 0.01036253, + "balance_loss_clip": 1.02713466, + "balance_loss_mlp": 1.02470803, + "epoch": 0.6208928303021194, + "flos": 22820405587200.0, + "grad_norm": 2.435364105219742, + "language_loss": 0.73816371, + "learning_rate": 1.3273376865933234e-06, + "loss": 0.75911009, + "num_input_tokens_seen": 222411165, + "step": 10327, + "time_per_iteration": 4.467868089675903 + }, + { + "auxiliary_loss_clip": 0.01049619, + "auxiliary_loss_mlp": 0.01036258, + "balance_loss_clip": 1.02728724, + "balance_loss_mlp": 1.02352619, + "epoch": 0.6209529535547873, + "flos": 17564519911680.0, + "grad_norm": 1.9053166748745076, + "language_loss": 0.79430902, + "learning_rate": 1.326970926232066e-06, + "loss": 0.81516778, + "num_input_tokens_seen": 222428110, + "step": 10328, + "time_per_iteration": 2.8040475845336914 + }, + { + "auxiliary_loss_clip": 0.01034341, + "auxiliary_loss_mlp": 0.01034425, + "balance_loss_clip": 1.02371573, + "balance_loss_mlp": 1.02239132, + "epoch": 0.6210130768074553, + "flos": 22011912311040.0, + "grad_norm": 1.8997942558692893, + "language_loss": 0.78248739, + "learning_rate": 1.3266041913921396e-06, + "loss": 0.80317509, + "num_input_tokens_seen": 222446385, + "step": 10329, + "time_per_iteration": 4.330296516418457 + }, + { + "auxiliary_loss_clip": 0.0099837, + "auxiliary_loss_mlp": 0.01003469, + "balance_loss_clip": 1.0025456, + "balance_loss_mlp": 1.00240827, + "epoch": 0.6210732000601232, + "flos": 63676873854720.0, + "grad_norm": 0.8370103747482033, + "language_loss": 0.62219095, + "learning_rate": 1.3262374820874484e-06, + "loss": 0.64220929, + "num_input_tokens_seen": 222502150, + "step": 10330, + "time_per_iteration": 3.1355385780334473 + }, + { + "auxiliary_loss_clip": 0.01057001, + "auxiliary_loss_mlp": 0.01033278, + "balance_loss_clip": 1.02538443, + "balance_loss_mlp": 1.02108276, + "epoch": 0.6211333233127913, + "flos": 24243365848320.0, + "grad_norm": 3.7428723555324184, + "language_loss": 0.77432787, + "learning_rate": 1.3258707983319002e-06, + "loss": 0.79523069, + "num_input_tokens_seen": 222519880, + "step": 10331, + "time_per_iteration": 2.6194798946380615 + }, + { + "auxiliary_loss_clip": 0.01067854, + "auxiliary_loss_mlp": 0.01030009, + "balance_loss_clip": 1.02687168, + "balance_loss_mlp": 1.01839828, + "epoch": 0.6211934465654592, + "flos": 16943803960320.0, + "grad_norm": 2.3584001666313443, + "language_loss": 0.67503989, + "learning_rate": 1.3255041401393992e-06, + "loss": 0.69601852, + "num_input_tokens_seen": 222538545, + "step": 10332, + "time_per_iteration": 2.552232027053833 + }, + { + "auxiliary_loss_clip": 0.01033052, + "auxiliary_loss_mlp": 0.01029345, + "balance_loss_clip": 1.02397203, + "balance_loss_mlp": 1.01866937, + "epoch": 0.6212535698181272, + "flos": 15267386355840.0, + "grad_norm": 1.4744018507206536, + "language_loss": 0.76539302, + "learning_rate": 1.3251375075238476e-06, + "loss": 0.78601706, + "num_input_tokens_seen": 222556935, + "step": 10333, + "time_per_iteration": 2.6805474758148193 + }, + { + "auxiliary_loss_clip": 0.01045717, + "auxiliary_loss_mlp": 0.01032555, + "balance_loss_clip": 1.02628052, + "balance_loss_mlp": 1.02170074, + "epoch": 0.6213136930707951, + "flos": 13443950384640.0, + "grad_norm": 2.088583748016515, + "language_loss": 0.69479394, + "learning_rate": 1.3247709004991507e-06, + "loss": 0.71557665, + "num_input_tokens_seen": 222574035, + "step": 10334, + "time_per_iteration": 2.7093729972839355 + }, + { + "auxiliary_loss_clip": 0.0104397, + "auxiliary_loss_mlp": 0.00747554, + "balance_loss_clip": 1.02595901, + "balance_loss_mlp": 1.00051486, + "epoch": 0.6213738163234631, + "flos": 18111223889280.0, + "grad_norm": 1.7810255509820363, + "language_loss": 0.70246899, + "learning_rate": 1.3244043190792078e-06, + "loss": 0.72038424, + "num_input_tokens_seen": 222592290, + "step": 10335, + "time_per_iteration": 2.712803363800049 + }, + { + "auxiliary_loss_clip": 0.01012456, + "auxiliary_loss_mlp": 0.0102994, + "balance_loss_clip": 1.02073908, + "balance_loss_mlp": 1.01881814, + "epoch": 0.621433939576131, + "flos": 25337348421120.0, + "grad_norm": 1.7403391976732336, + "language_loss": 0.80067158, + "learning_rate": 1.3240377632779213e-06, + "loss": 0.82109547, + "num_input_tokens_seen": 222612805, + "step": 10336, + "time_per_iteration": 2.7837276458740234 + }, + { + "auxiliary_loss_clip": 0.01064401, + "auxiliary_loss_mlp": 0.01033272, + "balance_loss_clip": 1.02688885, + "balance_loss_mlp": 1.02194715, + "epoch": 0.621494062828799, + "flos": 22565619440640.0, + "grad_norm": 1.6986080328888808, + "language_loss": 0.72981989, + "learning_rate": 1.3236712331091907e-06, + "loss": 0.75079656, + "num_input_tokens_seen": 222632260, + "step": 10337, + "time_per_iteration": 2.6193907260894775 + }, + { + "auxiliary_loss_clip": 0.01068854, + "auxiliary_loss_mlp": 0.01033147, + "balance_loss_clip": 1.02729034, + "balance_loss_mlp": 1.02145255, + "epoch": 0.621554186081467, + "flos": 27417976750080.0, + "grad_norm": 1.8755995576093136, + "language_loss": 0.63123107, + "learning_rate": 1.3233047285869145e-06, + "loss": 0.65225106, + "num_input_tokens_seen": 222653570, + "step": 10338, + "time_per_iteration": 2.656188726425171 + }, + { + "auxiliary_loss_clip": 0.01055511, + "auxiliary_loss_mlp": 0.01028929, + "balance_loss_clip": 1.02633858, + "balance_loss_mlp": 1.0174607, + "epoch": 0.621614309334135, + "flos": 22346815743360.0, + "grad_norm": 1.6810722153779267, + "language_loss": 0.7143268, + "learning_rate": 1.322938249724991e-06, + "loss": 0.7351712, + "num_input_tokens_seen": 222672480, + "step": 10339, + "time_per_iteration": 2.726647138595581 + }, + { + "auxiliary_loss_clip": 0.01015805, + "auxiliary_loss_mlp": 0.01033139, + "balance_loss_clip": 1.0225203, + "balance_loss_mlp": 1.02091455, + "epoch": 0.621674432586803, + "flos": 19281229597440.0, + "grad_norm": 1.5749124440567488, + "language_loss": 0.69445086, + "learning_rate": 1.3225717965373166e-06, + "loss": 0.71494031, + "num_input_tokens_seen": 222691200, + "step": 10340, + "time_per_iteration": 4.506685495376587 + }, + { + "auxiliary_loss_clip": 0.01023671, + "auxiliary_loss_mlp": 0.01029148, + "balance_loss_clip": 1.0222733, + "balance_loss_mlp": 1.0183593, + "epoch": 0.6217345558394709, + "flos": 21609533180160.0, + "grad_norm": 1.8739853775556314, + "language_loss": 0.68875068, + "learning_rate": 1.322205369037788e-06, + "loss": 0.70927882, + "num_input_tokens_seen": 222709975, + "step": 10341, + "time_per_iteration": 2.7192769050598145 + }, + { + "auxiliary_loss_clip": 0.01055796, + "auxiliary_loss_mlp": 0.01030487, + "balance_loss_clip": 1.02562237, + "balance_loss_mlp": 1.01826823, + "epoch": 0.6217946790921389, + "flos": 18004102554240.0, + "grad_norm": 1.6431312676663539, + "language_loss": 0.8074134, + "learning_rate": 1.321838967240299e-06, + "loss": 0.82827628, + "num_input_tokens_seen": 222729005, + "step": 10342, + "time_per_iteration": 2.770596981048584 + }, + { + "auxiliary_loss_clip": 0.00989919, + "auxiliary_loss_mlp": 0.01009918, + "balance_loss_clip": 1.00288057, + "balance_loss_mlp": 1.00880897, + "epoch": 0.6218548023448068, + "flos": 61973631768960.0, + "grad_norm": 0.8066053032321044, + "language_loss": 0.57391095, + "learning_rate": 1.3214725911587452e-06, + "loss": 0.59390932, + "num_input_tokens_seen": 222786090, + "step": 10343, + "time_per_iteration": 3.2286741733551025 + }, + { + "auxiliary_loss_clip": 0.01020097, + "auxiliary_loss_mlp": 0.01027344, + "balance_loss_clip": 1.02009237, + "balance_loss_mlp": 1.01716316, + "epoch": 0.6219149255974749, + "flos": 25739152934400.0, + "grad_norm": 2.7870942585021137, + "language_loss": 0.72829723, + "learning_rate": 1.3211062408070184e-06, + "loss": 0.74877161, + "num_input_tokens_seen": 222806100, + "step": 10344, + "time_per_iteration": 2.928278684616089 + }, + { + "auxiliary_loss_clip": 0.01056594, + "auxiliary_loss_mlp": 0.01038811, + "balance_loss_clip": 1.02653503, + "balance_loss_mlp": 1.02799869, + "epoch": 0.6219750488501428, + "flos": 25411073086080.0, + "grad_norm": 1.7147529049539687, + "language_loss": 0.60616779, + "learning_rate": 1.3207399161990105e-06, + "loss": 0.62712187, + "num_input_tokens_seen": 222826575, + "step": 10345, + "time_per_iteration": 2.6199862957000732 + }, + { + "auxiliary_loss_clip": 0.01002954, + "auxiliary_loss_mlp": 0.01036841, + "balance_loss_clip": 1.0203408, + "balance_loss_mlp": 1.02489066, + "epoch": 0.6220351721028108, + "flos": 20047383717120.0, + "grad_norm": 2.212771526795999, + "language_loss": 0.78135324, + "learning_rate": 1.320373617348614e-06, + "loss": 0.80175126, + "num_input_tokens_seen": 222845285, + "step": 10346, + "time_per_iteration": 2.76035475730896 + }, + { + "auxiliary_loss_clip": 0.01038047, + "auxiliary_loss_mlp": 0.01035472, + "balance_loss_clip": 1.0254451, + "balance_loss_mlp": 1.02349126, + "epoch": 0.6220952953554787, + "flos": 27488397363840.0, + "grad_norm": 1.7017110020568431, + "language_loss": 0.71203971, + "learning_rate": 1.3200073442697171e-06, + "loss": 0.73277491, + "num_input_tokens_seen": 222864575, + "step": 10347, + "time_per_iteration": 2.75380277633667 + }, + { + "auxiliary_loss_clip": 0.0105035, + "auxiliary_loss_mlp": 0.01028277, + "balance_loss_clip": 1.02235222, + "balance_loss_mlp": 1.01713729, + "epoch": 0.6221554186081467, + "flos": 19207612673280.0, + "grad_norm": 2.702280291317141, + "language_loss": 0.71747971, + "learning_rate": 1.3196410969762108e-06, + "loss": 0.73826599, + "num_input_tokens_seen": 222884420, + "step": 10348, + "time_per_iteration": 2.6654319763183594 + }, + { + "auxiliary_loss_clip": 0.00981244, + "auxiliary_loss_mlp": 0.01001877, + "balance_loss_clip": 1.00363445, + "balance_loss_mlp": 1.00061977, + "epoch": 0.6222155418608146, + "flos": 62950939989120.0, + "grad_norm": 0.8134123373260348, + "language_loss": 0.5413537, + "learning_rate": 1.3192748754819815e-06, + "loss": 0.56118488, + "num_input_tokens_seen": 222944690, + "step": 10349, + "time_per_iteration": 4.909745693206787 + }, + { + "auxiliary_loss_clip": 0.0102837, + "auxiliary_loss_mlp": 0.01025567, + "balance_loss_clip": 1.02333987, + "balance_loss_mlp": 1.01438487, + "epoch": 0.6222756651134826, + "flos": 22601099099520.0, + "grad_norm": 2.1955331552066553, + "language_loss": 0.69439888, + "learning_rate": 1.3189086798009173e-06, + "loss": 0.71493822, + "num_input_tokens_seen": 222962990, + "step": 10350, + "time_per_iteration": 2.7881357669830322 + }, + { + "auxiliary_loss_clip": 0.01066298, + "auxiliary_loss_mlp": 0.01029705, + "balance_loss_clip": 1.0262816, + "balance_loss_mlp": 1.01873159, + "epoch": 0.6223357883661506, + "flos": 21142228216320.0, + "grad_norm": 1.891866641023389, + "language_loss": 0.57144809, + "learning_rate": 1.3185425099469046e-06, + "loss": 0.59240812, + "num_input_tokens_seen": 222980715, + "step": 10351, + "time_per_iteration": 2.5972962379455566 + }, + { + "auxiliary_loss_clip": 0.01002658, + "auxiliary_loss_mlp": 0.0100478, + "balance_loss_clip": 1.00617552, + "balance_loss_mlp": 1.0035882, + "epoch": 0.6223959116188186, + "flos": 63765071700480.0, + "grad_norm": 0.8053529830664259, + "language_loss": 0.61121905, + "learning_rate": 1.3181763659338276e-06, + "loss": 0.63129342, + "num_input_tokens_seen": 223040685, + "step": 10352, + "time_per_iteration": 3.1709177494049072 + }, + { + "auxiliary_loss_clip": 0.01061855, + "auxiliary_loss_mlp": 0.01027444, + "balance_loss_clip": 1.02425575, + "balance_loss_mlp": 1.01642287, + "epoch": 0.6224560348714866, + "flos": 22565727181440.0, + "grad_norm": 1.866175398411784, + "language_loss": 0.81663686, + "learning_rate": 1.3178102477755714e-06, + "loss": 0.83752984, + "num_input_tokens_seen": 223059000, + "step": 10353, + "time_per_iteration": 2.6657285690307617 + }, + { + "auxiliary_loss_clip": 0.01050746, + "auxiliary_loss_mlp": 0.01026823, + "balance_loss_clip": 1.0243597, + "balance_loss_mlp": 1.01702404, + "epoch": 0.6225161581241545, + "flos": 24097748112000.0, + "grad_norm": 1.5016586641431584, + "language_loss": 0.75575697, + "learning_rate": 1.3174441554860195e-06, + "loss": 0.77653271, + "num_input_tokens_seen": 223079345, + "step": 10354, + "time_per_iteration": 2.6481664180755615 + }, + { + "auxiliary_loss_clip": 0.01019703, + "auxiliary_loss_mlp": 0.01025814, + "balance_loss_clip": 1.02337933, + "balance_loss_mlp": 1.014781, + "epoch": 0.6225762813768225, + "flos": 20443513881600.0, + "grad_norm": 1.9047049706718442, + "language_loss": 0.78683758, + "learning_rate": 1.3170780890790528e-06, + "loss": 0.80729282, + "num_input_tokens_seen": 223097880, + "step": 10355, + "time_per_iteration": 2.7413063049316406 + }, + { + "auxiliary_loss_clip": 0.01056929, + "auxiliary_loss_mlp": 0.01030464, + "balance_loss_clip": 1.02833164, + "balance_loss_mlp": 1.01990223, + "epoch": 0.6226364046294904, + "flos": 27198131558400.0, + "grad_norm": 1.4585483718672354, + "language_loss": 0.77992797, + "learning_rate": 1.3167120485685538e-06, + "loss": 0.80080187, + "num_input_tokens_seen": 223118185, + "step": 10356, + "time_per_iteration": 2.7525103092193604 + }, + { + "auxiliary_loss_clip": 0.01046628, + "auxiliary_loss_mlp": 0.00747702, + "balance_loss_clip": 1.02572179, + "balance_loss_mlp": 1.00053227, + "epoch": 0.6226965278821585, + "flos": 20445776438400.0, + "grad_norm": 1.7825836351714377, + "language_loss": 0.67564297, + "learning_rate": 1.3163460339684024e-06, + "loss": 0.69358629, + "num_input_tokens_seen": 223137600, + "step": 10357, + "time_per_iteration": 2.7320053577423096 + }, + { + "auxiliary_loss_clip": 0.01045032, + "auxiliary_loss_mlp": 0.01037884, + "balance_loss_clip": 1.02509952, + "balance_loss_mlp": 1.025105, + "epoch": 0.6227566511348264, + "flos": 22162737519360.0, + "grad_norm": 2.3921670128329584, + "language_loss": 0.76141995, + "learning_rate": 1.3159800452924778e-06, + "loss": 0.78224909, + "num_input_tokens_seen": 223154360, + "step": 10358, + "time_per_iteration": 2.6937143802642822 + }, + { + "auxiliary_loss_clip": 0.01042578, + "auxiliary_loss_mlp": 0.01029823, + "balance_loss_clip": 1.02391934, + "balance_loss_mlp": 1.01849842, + "epoch": 0.6228167743874944, + "flos": 18040875102720.0, + "grad_norm": 2.1101016867922016, + "language_loss": 0.82175672, + "learning_rate": 1.3156140825546588e-06, + "loss": 0.84248078, + "num_input_tokens_seen": 223172255, + "step": 10359, + "time_per_iteration": 2.695852518081665 + }, + { + "auxiliary_loss_clip": 0.01035812, + "auxiliary_loss_mlp": 0.0103981, + "balance_loss_clip": 1.0224402, + "balance_loss_mlp": 1.02723932, + "epoch": 0.6228768976401623, + "flos": 17742851959680.0, + "grad_norm": 2.360863420330003, + "language_loss": 0.73332191, + "learning_rate": 1.315248145768822e-06, + "loss": 0.75407815, + "num_input_tokens_seen": 223186965, + "step": 10360, + "time_per_iteration": 2.6382884979248047 + }, + { + "auxiliary_loss_clip": 0.01052964, + "auxiliary_loss_mlp": 0.01028806, + "balance_loss_clip": 1.02422547, + "balance_loss_mlp": 1.01735008, + "epoch": 0.6229370208928303, + "flos": 17894934144000.0, + "grad_norm": 2.869160470902304, + "language_loss": 0.78106844, + "learning_rate": 1.3148822349488442e-06, + "loss": 0.8018862, + "num_input_tokens_seen": 223206045, + "step": 10361, + "time_per_iteration": 2.6868250370025635 + }, + { + "auxiliary_loss_clip": 0.01027809, + "auxiliary_loss_mlp": 0.01028001, + "balance_loss_clip": 1.02391815, + "balance_loss_mlp": 1.01751685, + "epoch": 0.6229971441454982, + "flos": 17347763289600.0, + "grad_norm": 1.5484010710439116, + "language_loss": 0.67510396, + "learning_rate": 1.3145163501086005e-06, + "loss": 0.69566214, + "num_input_tokens_seen": 223224820, + "step": 10362, + "time_per_iteration": 2.822845697402954 + }, + { + "auxiliary_loss_clip": 0.01043185, + "auxiliary_loss_mlp": 0.0102863, + "balance_loss_clip": 1.0239079, + "balance_loss_mlp": 1.01709008, + "epoch": 0.6230572673981662, + "flos": 29241376807680.0, + "grad_norm": 2.1243561564440396, + "language_loss": 0.67576432, + "learning_rate": 1.3141504912619658e-06, + "loss": 0.69648248, + "num_input_tokens_seen": 223243205, + "step": 10363, + "time_per_iteration": 2.7560746669769287 + }, + { + "auxiliary_loss_clip": 0.01016592, + "auxiliary_loss_mlp": 0.01031928, + "balance_loss_clip": 1.02307022, + "balance_loss_mlp": 1.01977456, + "epoch": 0.6231173906508342, + "flos": 16325961096960.0, + "grad_norm": 1.7604097349719436, + "language_loss": 0.86639738, + "learning_rate": 1.3137846584228127e-06, + "loss": 0.88688254, + "num_input_tokens_seen": 223261370, + "step": 10364, + "time_per_iteration": 2.72304105758667 + }, + { + "auxiliary_loss_clip": 0.00990294, + "auxiliary_loss_mlp": 0.01002949, + "balance_loss_clip": 1.00381279, + "balance_loss_mlp": 1.00178039, + "epoch": 0.6231775139035022, + "flos": 68702032517760.0, + "grad_norm": 0.8919907165913685, + "language_loss": 0.60781747, + "learning_rate": 1.313418851605015e-06, + "loss": 0.62774992, + "num_input_tokens_seen": 223315050, + "step": 10365, + "time_per_iteration": 3.3188796043395996 + }, + { + "auxiliary_loss_clip": 0.01040233, + "auxiliary_loss_mlp": 0.00747784, + "balance_loss_clip": 1.02759826, + "balance_loss_mlp": 1.00051832, + "epoch": 0.6232376371561702, + "flos": 19821038163840.0, + "grad_norm": 1.8758621118867662, + "language_loss": 0.75415105, + "learning_rate": 1.3130530708224427e-06, + "loss": 0.77203119, + "num_input_tokens_seen": 223332130, + "step": 10366, + "time_per_iteration": 2.7211248874664307 + }, + { + "auxiliary_loss_clip": 0.01055318, + "auxiliary_loss_mlp": 0.0103291, + "balance_loss_clip": 1.02551866, + "balance_loss_mlp": 1.02153182, + "epoch": 0.6232977604088381, + "flos": 23258264376960.0, + "grad_norm": 2.0222714745408026, + "language_loss": 0.76479387, + "learning_rate": 1.3126873160889665e-06, + "loss": 0.78567612, + "num_input_tokens_seen": 223351605, + "step": 10367, + "time_per_iteration": 2.6636221408843994 + }, + { + "auxiliary_loss_clip": 0.01051579, + "auxiliary_loss_mlp": 0.01031124, + "balance_loss_clip": 1.02538121, + "balance_loss_mlp": 1.02000225, + "epoch": 0.6233578836615061, + "flos": 21106425335040.0, + "grad_norm": 2.7366450675758998, + "language_loss": 0.78446591, + "learning_rate": 1.312321587418457e-06, + "loss": 0.80529296, + "num_input_tokens_seen": 223372090, + "step": 10368, + "time_per_iteration": 2.6614339351654053 + }, + { + "auxiliary_loss_clip": 0.01001083, + "auxiliary_loss_mlp": 0.01030666, + "balance_loss_clip": 1.02295637, + "balance_loss_mlp": 1.01863801, + "epoch": 0.623418006914174, + "flos": 23769416868480.0, + "grad_norm": 2.2595431006871665, + "language_loss": 0.68644154, + "learning_rate": 1.3119558848247811e-06, + "loss": 0.70675904, + "num_input_tokens_seen": 223390110, + "step": 10369, + "time_per_iteration": 2.7759594917297363 + }, + { + "auxiliary_loss_clip": 0.01066526, + "auxiliary_loss_mlp": 0.01031323, + "balance_loss_clip": 1.02635765, + "balance_loss_mlp": 1.01972342, + "epoch": 0.6234781301668421, + "flos": 17890480857600.0, + "grad_norm": 2.3854284461500352, + "language_loss": 0.8807596, + "learning_rate": 1.3115902083218072e-06, + "loss": 0.90173811, + "num_input_tokens_seen": 223404205, + "step": 10370, + "time_per_iteration": 2.5490243434906006 + }, + { + "auxiliary_loss_clip": 0.01062158, + "auxiliary_loss_mlp": 0.01023649, + "balance_loss_clip": 1.02452314, + "balance_loss_mlp": 1.01285434, + "epoch": 0.62353825341951, + "flos": 26175503352960.0, + "grad_norm": 1.9061484906834774, + "language_loss": 0.6585502, + "learning_rate": 1.311224557923402e-06, + "loss": 0.67940831, + "num_input_tokens_seen": 223424855, + "step": 10371, + "time_per_iteration": 2.6146981716156006 + }, + { + "auxiliary_loss_clip": 0.01050065, + "auxiliary_loss_mlp": 0.01026437, + "balance_loss_clip": 1.02342761, + "balance_loss_mlp": 1.01701319, + "epoch": 0.623598376672178, + "flos": 31139902160640.0, + "grad_norm": 1.3042290872305116, + "language_loss": 0.77756071, + "learning_rate": 1.3108589336434298e-06, + "loss": 0.79832566, + "num_input_tokens_seen": 223447225, + "step": 10372, + "time_per_iteration": 2.7214057445526123 + }, + { + "auxiliary_loss_clip": 0.01055263, + "auxiliary_loss_mlp": 0.01030473, + "balance_loss_clip": 1.02503538, + "balance_loss_mlp": 1.0187012, + "epoch": 0.6236584999248459, + "flos": 23730202195200.0, + "grad_norm": 1.5979829118511235, + "language_loss": 0.77383006, + "learning_rate": 1.3104933354957568e-06, + "loss": 0.79468739, + "num_input_tokens_seen": 223467520, + "step": 10373, + "time_per_iteration": 2.768371820449829 + }, + { + "auxiliary_loss_clip": 0.01051433, + "auxiliary_loss_mlp": 0.0102479, + "balance_loss_clip": 1.02422333, + "balance_loss_mlp": 1.01423991, + "epoch": 0.6237186231775139, + "flos": 21762764599680.0, + "grad_norm": 1.564208458670311, + "language_loss": 0.69425654, + "learning_rate": 1.3101277634942448e-06, + "loss": 0.71501875, + "num_input_tokens_seen": 223488130, + "step": 10374, + "time_per_iteration": 2.6794981956481934 + }, + { + "auxiliary_loss_clip": 0.01049042, + "auxiliary_loss_mlp": 0.01026269, + "balance_loss_clip": 1.02756095, + "balance_loss_mlp": 1.0157969, + "epoch": 0.6237787464301818, + "flos": 14939486075520.0, + "grad_norm": 1.7872237550816201, + "language_loss": 0.77200186, + "learning_rate": 1.3097622176527577e-06, + "loss": 0.79275501, + "num_input_tokens_seen": 223505105, + "step": 10375, + "time_per_iteration": 6.091882944107056 + }, + { + "auxiliary_loss_clip": 0.01044343, + "auxiliary_loss_mlp": 0.01023841, + "balance_loss_clip": 1.02545714, + "balance_loss_mlp": 1.01357698, + "epoch": 0.6238388696828499, + "flos": 35590311302400.0, + "grad_norm": 1.3147233826048788, + "language_loss": 0.69935429, + "learning_rate": 1.3093966979851566e-06, + "loss": 0.72003615, + "num_input_tokens_seen": 223528065, + "step": 10376, + "time_per_iteration": 2.85648250579834 + }, + { + "auxiliary_loss_clip": 0.01036198, + "auxiliary_loss_mlp": 0.01034072, + "balance_loss_clip": 1.02544606, + "balance_loss_mlp": 1.02148306, + "epoch": 0.6238989929355178, + "flos": 23623511823360.0, + "grad_norm": 1.5508585897762415, + "language_loss": 0.76804185, + "learning_rate": 1.309031204505301e-06, + "loss": 0.78874451, + "num_input_tokens_seen": 223547305, + "step": 10377, + "time_per_iteration": 2.8518781661987305 + }, + { + "auxiliary_loss_clip": 0.01046498, + "auxiliary_loss_mlp": 0.01025524, + "balance_loss_clip": 1.02616739, + "balance_loss_mlp": 1.01520097, + "epoch": 0.6239591161881858, + "flos": 22087468569600.0, + "grad_norm": 1.6487890217673888, + "language_loss": 0.68156242, + "learning_rate": 1.308665737227052e-06, + "loss": 0.70228267, + "num_input_tokens_seen": 223567205, + "step": 10378, + "time_per_iteration": 3.0747292041778564 + }, + { + "auxiliary_loss_clip": 0.01047113, + "auxiliary_loss_mlp": 0.01031148, + "balance_loss_clip": 1.02651024, + "balance_loss_mlp": 1.01991272, + "epoch": 0.6240192394408538, + "flos": 24535930124160.0, + "grad_norm": 2.3218551336508972, + "language_loss": 0.76233536, + "learning_rate": 1.3083002961642675e-06, + "loss": 0.78311801, + "num_input_tokens_seen": 223586560, + "step": 10379, + "time_per_iteration": 2.8715715408325195 + }, + { + "auxiliary_loss_clip": 0.01043432, + "auxiliary_loss_mlp": 0.01030179, + "balance_loss_clip": 1.02460361, + "balance_loss_mlp": 1.01884806, + "epoch": 0.6240793626935217, + "flos": 27931930502400.0, + "grad_norm": 1.7603874944134372, + "language_loss": 0.79291123, + "learning_rate": 1.3079348813308051e-06, + "loss": 0.81364727, + "num_input_tokens_seen": 223610595, + "step": 10380, + "time_per_iteration": 2.8244433403015137 + }, + { + "auxiliary_loss_clip": 0.01053836, + "auxiliary_loss_mlp": 0.01031128, + "balance_loss_clip": 1.02627409, + "balance_loss_mlp": 1.02108467, + "epoch": 0.6241394859461897, + "flos": 22892514140160.0, + "grad_norm": 1.4375313786156387, + "language_loss": 0.79905689, + "learning_rate": 1.3075694927405207e-06, + "loss": 0.81990647, + "num_input_tokens_seen": 223630230, + "step": 10381, + "time_per_iteration": 2.6798062324523926 + }, + { + "auxiliary_loss_clip": 0.01038016, + "auxiliary_loss_mlp": 0.01034908, + "balance_loss_clip": 1.02294254, + "balance_loss_mlp": 1.02225387, + "epoch": 0.6241996091988576, + "flos": 12750766744320.0, + "grad_norm": 1.9615818167648895, + "language_loss": 0.74418366, + "learning_rate": 1.3072041304072718e-06, + "loss": 0.76491284, + "num_input_tokens_seen": 223648360, + "step": 10382, + "time_per_iteration": 2.7306292057037354 + }, + { + "auxiliary_loss_clip": 0.01052915, + "auxiliary_loss_mlp": 0.010254, + "balance_loss_clip": 1.02471924, + "balance_loss_mlp": 1.01486158, + "epoch": 0.6242597324515257, + "flos": 25851302173440.0, + "grad_norm": 1.4116717812275232, + "language_loss": 0.78580284, + "learning_rate": 1.306838794344911e-06, + "loss": 0.80658597, + "num_input_tokens_seen": 223671255, + "step": 10383, + "time_per_iteration": 2.835554361343384 + }, + { + "auxiliary_loss_clip": 0.0102488, + "auxiliary_loss_mlp": 0.01025482, + "balance_loss_clip": 1.02075899, + "balance_loss_mlp": 1.01464593, + "epoch": 0.6243198557041936, + "flos": 19937712516480.0, + "grad_norm": 3.0020070344096057, + "language_loss": 0.75240207, + "learning_rate": 1.3064734845672925e-06, + "loss": 0.77290571, + "num_input_tokens_seen": 223689860, + "step": 10384, + "time_per_iteration": 2.9213061332702637 + }, + { + "auxiliary_loss_clip": 0.01038803, + "auxiliary_loss_mlp": 0.01038024, + "balance_loss_clip": 1.0227493, + "balance_loss_mlp": 1.02537549, + "epoch": 0.6243799789568616, + "flos": 18406194376320.0, + "grad_norm": 1.702698930944848, + "language_loss": 0.66608816, + "learning_rate": 1.3061082010882694e-06, + "loss": 0.68685639, + "num_input_tokens_seen": 223707835, + "step": 10385, + "time_per_iteration": 2.724565267562866 + }, + { + "auxiliary_loss_clip": 0.00993176, + "auxiliary_loss_mlp": 0.01001939, + "balance_loss_clip": 1.00560796, + "balance_loss_mlp": 1.00077093, + "epoch": 0.6244401022095295, + "flos": 66027587523840.0, + "grad_norm": 0.7592550493056598, + "language_loss": 0.62001723, + "learning_rate": 1.305742943921692e-06, + "loss": 0.6399684, + "num_input_tokens_seen": 223771875, + "step": 10386, + "time_per_iteration": 3.339298725128174 + }, + { + "auxiliary_loss_clip": 0.01055572, + "auxiliary_loss_mlp": 0.01029375, + "balance_loss_clip": 1.02517188, + "balance_loss_mlp": 1.01791942, + "epoch": 0.6245002254621975, + "flos": 24571266128640.0, + "grad_norm": 3.1739031085912015, + "language_loss": 0.72689998, + "learning_rate": 1.3053777130814128e-06, + "loss": 0.74774945, + "num_input_tokens_seen": 223788895, + "step": 10387, + "time_per_iteration": 4.3112218379974365 + }, + { + "auxiliary_loss_clip": 0.01060712, + "auxiliary_loss_mlp": 0.0103345, + "balance_loss_clip": 1.02692652, + "balance_loss_mlp": 1.02077866, + "epoch": 0.6245603487148654, + "flos": 29168837291520.0, + "grad_norm": 3.520372224363255, + "language_loss": 0.65349507, + "learning_rate": 1.3050125085812798e-06, + "loss": 0.67443669, + "num_input_tokens_seen": 223810385, + "step": 10388, + "time_per_iteration": 2.744415760040283 + }, + { + "auxiliary_loss_clip": 0.0103546, + "auxiliary_loss_mlp": 0.01024266, + "balance_loss_clip": 1.02606988, + "balance_loss_mlp": 1.01427662, + "epoch": 0.6246204719675335, + "flos": 14790097411200.0, + "grad_norm": 2.175731178043054, + "language_loss": 0.79004109, + "learning_rate": 1.3046473304351417e-06, + "loss": 0.81063831, + "num_input_tokens_seen": 223826040, + "step": 10389, + "time_per_iteration": 2.7239601612091064 + }, + { + "auxiliary_loss_clip": 0.01036658, + "auxiliary_loss_mlp": 0.01031164, + "balance_loss_clip": 1.02223182, + "balance_loss_mlp": 1.01949322, + "epoch": 0.6246805952202014, + "flos": 12493538472960.0, + "grad_norm": 1.661897191159559, + "language_loss": 0.60703099, + "learning_rate": 1.3042821786568475e-06, + "loss": 0.62770915, + "num_input_tokens_seen": 223842300, + "step": 10390, + "time_per_iteration": 2.662856101989746 + }, + { + "auxiliary_loss_clip": 0.0104475, + "auxiliary_loss_mlp": 0.01028729, + "balance_loss_clip": 1.02494812, + "balance_loss_mlp": 1.01720119, + "epoch": 0.6247407184728694, + "flos": 12786677366400.0, + "grad_norm": 1.8640862004236645, + "language_loss": 0.77475417, + "learning_rate": 1.3039170532602416e-06, + "loss": 0.79548895, + "num_input_tokens_seen": 223858320, + "step": 10391, + "time_per_iteration": 2.642915964126587 + }, + { + "auxiliary_loss_clip": 0.01045898, + "auxiliary_loss_mlp": 0.01027531, + "balance_loss_clip": 1.02580249, + "balance_loss_mlp": 1.01602125, + "epoch": 0.6248008417255374, + "flos": 40629188960640.0, + "grad_norm": 1.4769584585122741, + "language_loss": 0.64830983, + "learning_rate": 1.3035519542591718e-06, + "loss": 0.66904408, + "num_input_tokens_seen": 223883545, + "step": 10392, + "time_per_iteration": 2.823784112930298 + }, + { + "auxiliary_loss_clip": 0.01048758, + "auxiliary_loss_mlp": 0.01030651, + "balance_loss_clip": 1.02774835, + "balance_loss_mlp": 1.01875353, + "epoch": 0.6248609649782053, + "flos": 19902017376000.0, + "grad_norm": 1.6600642096187415, + "language_loss": 0.7672298, + "learning_rate": 1.3031868816674819e-06, + "loss": 0.78802389, + "num_input_tokens_seen": 223901445, + "step": 10393, + "time_per_iteration": 2.718276262283325 + }, + { + "auxiliary_loss_clip": 0.01025262, + "auxiliary_loss_mlp": 0.00747704, + "balance_loss_clip": 1.02376819, + "balance_loss_mlp": 1.00041866, + "epoch": 0.6249210882308733, + "flos": 19682746801920.0, + "grad_norm": 1.6244648852586687, + "language_loss": 0.82409871, + "learning_rate": 1.3028218354990142e-06, + "loss": 0.84182835, + "num_input_tokens_seen": 223920170, + "step": 10394, + "time_per_iteration": 2.7056853771209717 + }, + { + "auxiliary_loss_clip": 0.01047266, + "auxiliary_loss_mlp": 0.01034811, + "balance_loss_clip": 1.02586854, + "balance_loss_mlp": 1.0222106, + "epoch": 0.6249812114835412, + "flos": 13990726189440.0, + "grad_norm": 1.9554165941154074, + "language_loss": 0.75240207, + "learning_rate": 1.3024568157676128e-06, + "loss": 0.7732228, + "num_input_tokens_seen": 223936495, + "step": 10395, + "time_per_iteration": 2.6939055919647217 + }, + { + "auxiliary_loss_clip": 0.01043494, + "auxiliary_loss_mlp": 0.01028679, + "balance_loss_clip": 1.02353787, + "balance_loss_mlp": 1.01663303, + "epoch": 0.6250413347362093, + "flos": 14530031965440.0, + "grad_norm": 2.419477221240473, + "language_loss": 0.72268081, + "learning_rate": 1.302091822487119e-06, + "loss": 0.7434026, + "num_input_tokens_seen": 223950070, + "step": 10396, + "time_per_iteration": 4.441821575164795 + }, + { + "auxiliary_loss_clip": 0.01030359, + "auxiliary_loss_mlp": 0.01035712, + "balance_loss_clip": 1.02716374, + "balance_loss_mlp": 1.0246253, + "epoch": 0.6251014579888772, + "flos": 22963006581120.0, + "grad_norm": 1.6152879132309377, + "language_loss": 0.75668877, + "learning_rate": 1.3017268556713732e-06, + "loss": 0.77734947, + "num_input_tokens_seen": 223970065, + "step": 10397, + "time_per_iteration": 2.814507246017456 + }, + { + "auxiliary_loss_clip": 0.01043899, + "auxiliary_loss_mlp": 0.01032637, + "balance_loss_clip": 1.0248481, + "balance_loss_mlp": 1.02098393, + "epoch": 0.6251615812415452, + "flos": 28111232217600.0, + "grad_norm": 2.0890360870657436, + "language_loss": 0.75036263, + "learning_rate": 1.3013619153342154e-06, + "loss": 0.77112794, + "num_input_tokens_seen": 223990315, + "step": 10398, + "time_per_iteration": 2.779675245285034 + }, + { + "auxiliary_loss_clip": 0.01067363, + "auxiliary_loss_mlp": 0.01032098, + "balance_loss_clip": 1.02577782, + "balance_loss_mlp": 1.0193305, + "epoch": 0.6252217044942131, + "flos": 26724469887360.0, + "grad_norm": 2.2651493212183746, + "language_loss": 0.73935187, + "learning_rate": 1.300997001489483e-06, + "loss": 0.76034641, + "num_input_tokens_seen": 224009960, + "step": 10399, + "time_per_iteration": 2.652484655380249 + }, + { + "auxiliary_loss_clip": 0.01032648, + "auxiliary_loss_mlp": 0.0103493, + "balance_loss_clip": 1.0239898, + "balance_loss_mlp": 1.02348566, + "epoch": 0.6252818277468811, + "flos": 20006768413440.0, + "grad_norm": 1.660790237117246, + "language_loss": 0.74522734, + "learning_rate": 1.3006321141510147e-06, + "loss": 0.76590312, + "num_input_tokens_seen": 224028870, + "step": 10400, + "time_per_iteration": 2.7468132972717285 + }, + { + "auxiliary_loss_clip": 0.00980657, + "auxiliary_loss_mlp": 0.01004775, + "balance_loss_clip": 1.00282633, + "balance_loss_mlp": 1.00342202, + "epoch": 0.625341950999549, + "flos": 59278285059840.0, + "grad_norm": 0.8418733443077353, + "language_loss": 0.56505173, + "learning_rate": 1.3002672533326465e-06, + "loss": 0.58490604, + "num_input_tokens_seen": 224094140, + "step": 10401, + "time_per_iteration": 3.3656232357025146 + }, + { + "auxiliary_loss_clip": 0.01058253, + "auxiliary_loss_mlp": 0.01029777, + "balance_loss_clip": 1.02664483, + "balance_loss_mlp": 1.01746821, + "epoch": 0.625402074252217, + "flos": 20157090831360.0, + "grad_norm": 2.3689541118856714, + "language_loss": 0.82803363, + "learning_rate": 1.2999024190482146e-06, + "loss": 0.84891391, + "num_input_tokens_seen": 224113235, + "step": 10402, + "time_per_iteration": 2.6682090759277344 + }, + { + "auxiliary_loss_clip": 0.00996266, + "auxiliary_loss_mlp": 0.01027555, + "balance_loss_clip": 1.0243237, + "balance_loss_mlp": 1.01666546, + "epoch": 0.625462197504885, + "flos": 29132531619840.0, + "grad_norm": 2.029607878250183, + "language_loss": 0.68963772, + "learning_rate": 1.2995376113115527e-06, + "loss": 0.70987594, + "num_input_tokens_seen": 224134530, + "step": 10403, + "time_per_iteration": 3.045836925506592 + }, + { + "auxiliary_loss_clip": 0.01028163, + "auxiliary_loss_mlp": 0.01028709, + "balance_loss_clip": 1.02246618, + "balance_loss_mlp": 1.01612043, + "epoch": 0.625522320757553, + "flos": 26104436294400.0, + "grad_norm": 1.6177856121871304, + "language_loss": 0.71563303, + "learning_rate": 1.2991728301364954e-06, + "loss": 0.73620176, + "num_input_tokens_seen": 224154170, + "step": 10404, + "time_per_iteration": 2.8369314670562744 + }, + { + "auxiliary_loss_clip": 0.01022575, + "auxiliary_loss_mlp": 0.01035218, + "balance_loss_clip": 1.02387881, + "balance_loss_mlp": 1.02355909, + "epoch": 0.625582444010221, + "flos": 20630967984000.0, + "grad_norm": 1.9588940714534204, + "language_loss": 0.69251311, + "learning_rate": 1.2988080755368742e-06, + "loss": 0.71309102, + "num_input_tokens_seen": 224172730, + "step": 10405, + "time_per_iteration": 2.7417662143707275 + }, + { + "auxiliary_loss_clip": 0.01038499, + "auxiliary_loss_mlp": 0.01032394, + "balance_loss_clip": 1.02504075, + "balance_loss_mlp": 1.01998401, + "epoch": 0.6256425672628889, + "flos": 20521512264960.0, + "grad_norm": 1.5780829936389182, + "language_loss": 0.78867984, + "learning_rate": 1.2984433475265207e-06, + "loss": 0.80938882, + "num_input_tokens_seen": 224192620, + "step": 10406, + "time_per_iteration": 2.685620069503784 + }, + { + "auxiliary_loss_clip": 0.0102665, + "auxiliary_loss_mlp": 0.01031779, + "balance_loss_clip": 1.02503669, + "balance_loss_mlp": 1.02073979, + "epoch": 0.6257026905155569, + "flos": 29529200488320.0, + "grad_norm": 1.6465561426286217, + "language_loss": 0.68804818, + "learning_rate": 1.2980786461192666e-06, + "loss": 0.70863247, + "num_input_tokens_seen": 224214660, + "step": 10407, + "time_per_iteration": 2.877439022064209 + }, + { + "auxiliary_loss_clip": 0.01051245, + "auxiliary_loss_mlp": 0.00747567, + "balance_loss_clip": 1.02408457, + "balance_loss_mlp": 1.00044394, + "epoch": 0.6257628137682248, + "flos": 24024885373440.0, + "grad_norm": 4.1930511077152195, + "language_loss": 0.85499394, + "learning_rate": 1.2977139713289398e-06, + "loss": 0.87298203, + "num_input_tokens_seen": 224234170, + "step": 10408, + "time_per_iteration": 2.7443342208862305 + }, + { + "auxiliary_loss_clip": 0.01040959, + "auxiliary_loss_mlp": 0.00747642, + "balance_loss_clip": 1.02252316, + "balance_loss_mlp": 1.00045824, + "epoch": 0.6258229370208929, + "flos": 20850956830080.0, + "grad_norm": 1.653770946709444, + "language_loss": 0.79735363, + "learning_rate": 1.2973493231693699e-06, + "loss": 0.81523967, + "num_input_tokens_seen": 224253115, + "step": 10409, + "time_per_iteration": 2.6459133625030518 + }, + { + "auxiliary_loss_clip": 0.01041322, + "auxiliary_loss_mlp": 0.01031907, + "balance_loss_clip": 1.02330387, + "balance_loss_mlp": 1.02067125, + "epoch": 0.6258830602735608, + "flos": 22231542021120.0, + "grad_norm": 3.4896227284983152, + "language_loss": 0.69473946, + "learning_rate": 1.2969847016543845e-06, + "loss": 0.71547174, + "num_input_tokens_seen": 224271375, + "step": 10410, + "time_per_iteration": 2.7960731983184814 + }, + { + "auxiliary_loss_clip": 0.01016049, + "auxiliary_loss_mlp": 0.01025047, + "balance_loss_clip": 1.02284205, + "balance_loss_mlp": 1.01421118, + "epoch": 0.6259431835262288, + "flos": 25076887925760.0, + "grad_norm": 1.6390297731117882, + "language_loss": 0.67602336, + "learning_rate": 1.2966201067978086e-06, + "loss": 0.69643438, + "num_input_tokens_seen": 224290315, + "step": 10411, + "time_per_iteration": 2.743574619293213 + }, + { + "auxiliary_loss_clip": 0.01024978, + "auxiliary_loss_mlp": 0.01030753, + "balance_loss_clip": 1.02392185, + "balance_loss_mlp": 1.01955307, + "epoch": 0.6260033067788967, + "flos": 28252288926720.0, + "grad_norm": 1.631546387372873, + "language_loss": 0.6922009, + "learning_rate": 1.2962555386134702e-06, + "loss": 0.71275818, + "num_input_tokens_seen": 224310545, + "step": 10412, + "time_per_iteration": 2.8598453998565674 + }, + { + "auxiliary_loss_clip": 0.01036545, + "auxiliary_loss_mlp": 0.01030795, + "balance_loss_clip": 1.02274156, + "balance_loss_mlp": 1.01853466, + "epoch": 0.6260634300315647, + "flos": 23367432787200.0, + "grad_norm": 1.5519116504785115, + "language_loss": 0.69310391, + "learning_rate": 1.2958909971151908e-06, + "loss": 0.7137773, + "num_input_tokens_seen": 224331115, + "step": 10413, + "time_per_iteration": 2.631757974624634 + }, + { + "auxiliary_loss_clip": 0.01032669, + "auxiliary_loss_mlp": 0.01033191, + "balance_loss_clip": 1.02216959, + "balance_loss_mlp": 1.02025712, + "epoch": 0.6261235532842326, + "flos": 18035308494720.0, + "grad_norm": 5.737583974677493, + "language_loss": 0.80366725, + "learning_rate": 1.295526482316796e-06, + "loss": 0.8243258, + "num_input_tokens_seen": 224347525, + "step": 10414, + "time_per_iteration": 2.6407668590545654 + }, + { + "auxiliary_loss_clip": 0.01058523, + "auxiliary_loss_mlp": 0.01035742, + "balance_loss_clip": 1.02805972, + "balance_loss_mlp": 1.02423871, + "epoch": 0.6261836765369007, + "flos": 22011265866240.0, + "grad_norm": 1.6572569780227717, + "language_loss": 0.74592066, + "learning_rate": 1.2951619942321083e-06, + "loss": 0.76686335, + "num_input_tokens_seen": 224367045, + "step": 10415, + "time_per_iteration": 2.8622148036956787 + }, + { + "auxiliary_loss_clip": 0.01018011, + "auxiliary_loss_mlp": 0.0102832, + "balance_loss_clip": 1.02379203, + "balance_loss_mlp": 1.01696575, + "epoch": 0.6262437997895686, + "flos": 24936010784640.0, + "grad_norm": 1.4328305850623742, + "language_loss": 0.73926389, + "learning_rate": 1.2947975328749472e-06, + "loss": 0.75972724, + "num_input_tokens_seen": 224388860, + "step": 10416, + "time_per_iteration": 2.9338672161102295 + }, + { + "auxiliary_loss_clip": 0.01045008, + "auxiliary_loss_mlp": 0.01029289, + "balance_loss_clip": 1.02714062, + "balance_loss_mlp": 1.01865518, + "epoch": 0.6263039230422366, + "flos": 31608428186880.0, + "grad_norm": 1.5426947916122007, + "language_loss": 0.84370977, + "learning_rate": 1.2944330982591352e-06, + "loss": 0.86445272, + "num_input_tokens_seen": 224409645, + "step": 10417, + "time_per_iteration": 2.930267572402954 + }, + { + "auxiliary_loss_clip": 0.01055687, + "auxiliary_loss_mlp": 0.01026348, + "balance_loss_clip": 1.02512181, + "balance_loss_mlp": 1.01447511, + "epoch": 0.6263640462949046, + "flos": 17639465639040.0, + "grad_norm": 2.455281243675921, + "language_loss": 0.56853628, + "learning_rate": 1.2940686903984904e-06, + "loss": 0.58935666, + "num_input_tokens_seen": 224428530, + "step": 10418, + "time_per_iteration": 2.7911057472229004 + }, + { + "auxiliary_loss_clip": 0.01054918, + "auxiliary_loss_mlp": 0.01034463, + "balance_loss_clip": 1.02349615, + "balance_loss_mlp": 1.02213693, + "epoch": 0.6264241695475725, + "flos": 19974951941760.0, + "grad_norm": 1.9663379953986693, + "language_loss": 0.84718782, + "learning_rate": 1.2937043093068316e-06, + "loss": 0.86808163, + "num_input_tokens_seen": 224447175, + "step": 10419, + "time_per_iteration": 2.760789394378662 + }, + { + "auxiliary_loss_clip": 0.01067463, + "auxiliary_loss_mlp": 0.01032359, + "balance_loss_clip": 1.02730393, + "balance_loss_mlp": 1.02086139, + "epoch": 0.6264842928002405, + "flos": 27344323912320.0, + "grad_norm": 1.4007788138438106, + "language_loss": 0.64957893, + "learning_rate": 1.2933399549979762e-06, + "loss": 0.67057717, + "num_input_tokens_seen": 224469445, + "step": 10420, + "time_per_iteration": 2.7299726009368896 + }, + { + "auxiliary_loss_clip": 0.01032421, + "auxiliary_loss_mlp": 0.01035067, + "balance_loss_clip": 1.02604067, + "balance_loss_mlp": 1.0220139, + "epoch": 0.6265444160529084, + "flos": 22997265177600.0, + "grad_norm": 1.7633050298799686, + "language_loss": 0.85851717, + "learning_rate": 1.292975627485741e-06, + "loss": 0.87919199, + "num_input_tokens_seen": 224486590, + "step": 10421, + "time_per_iteration": 2.716379404067993 + }, + { + "auxiliary_loss_clip": 0.01026865, + "auxiliary_loss_mlp": 0.01030717, + "balance_loss_clip": 1.0241971, + "balance_loss_mlp": 1.01948142, + "epoch": 0.6266045393055765, + "flos": 19938323047680.0, + "grad_norm": 2.2618606921971574, + "language_loss": 0.79647362, + "learning_rate": 1.2926113267839403e-06, + "loss": 0.81704938, + "num_input_tokens_seen": 224502795, + "step": 10422, + "time_per_iteration": 5.970808029174805 + }, + { + "auxiliary_loss_clip": 0.01053114, + "auxiliary_loss_mlp": 0.01022763, + "balance_loss_clip": 1.02391231, + "balance_loss_mlp": 1.01126575, + "epoch": 0.6266646625582444, + "flos": 24389091325440.0, + "grad_norm": 1.6616932638234791, + "language_loss": 0.74379754, + "learning_rate": 1.292247052906389e-06, + "loss": 0.76455635, + "num_input_tokens_seen": 224522300, + "step": 10423, + "time_per_iteration": 2.691471815109253 + }, + { + "auxiliary_loss_clip": 0.0106489, + "auxiliary_loss_mlp": 0.01027298, + "balance_loss_clip": 1.02600718, + "balance_loss_mlp": 1.01669478, + "epoch": 0.6267247858109124, + "flos": 14683802088960.0, + "grad_norm": 1.891221900969101, + "language_loss": 0.77760941, + "learning_rate": 1.2918828058669004e-06, + "loss": 0.79853129, + "num_input_tokens_seen": 224538260, + "step": 10424, + "time_per_iteration": 2.544513702392578 + }, + { + "auxiliary_loss_clip": 0.01065439, + "auxiliary_loss_mlp": 0.01030256, + "balance_loss_clip": 1.02642083, + "balance_loss_mlp": 1.01810861, + "epoch": 0.6267849090635803, + "flos": 24929977299840.0, + "grad_norm": 2.0479973922683925, + "language_loss": 0.69007349, + "learning_rate": 1.2915185856792868e-06, + "loss": 0.71103048, + "num_input_tokens_seen": 224559155, + "step": 10425, + "time_per_iteration": 2.618853807449341 + }, + { + "auxiliary_loss_clip": 0.01040679, + "auxiliary_loss_mlp": 0.01026824, + "balance_loss_clip": 1.02428472, + "balance_loss_mlp": 1.01725721, + "epoch": 0.6268450323162483, + "flos": 25337851211520.0, + "grad_norm": 1.6883043589611697, + "language_loss": 0.74271798, + "learning_rate": 1.2911543923573598e-06, + "loss": 0.76339304, + "num_input_tokens_seen": 224578660, + "step": 10426, + "time_per_iteration": 2.759697914123535 + }, + { + "auxiliary_loss_clip": 0.01055992, + "auxiliary_loss_mlp": 0.00747682, + "balance_loss_clip": 1.02547586, + "balance_loss_mlp": 1.00051522, + "epoch": 0.6269051555689162, + "flos": 26177299032960.0, + "grad_norm": 1.4615117636747255, + "language_loss": 0.80348766, + "learning_rate": 1.290790225914929e-06, + "loss": 0.82152444, + "num_input_tokens_seen": 224599080, + "step": 10427, + "time_per_iteration": 2.6640141010284424 + }, + { + "auxiliary_loss_clip": 0.01032131, + "auxiliary_loss_mlp": 0.01032797, + "balance_loss_clip": 1.02385759, + "balance_loss_mlp": 1.02125716, + "epoch": 0.6269652788215843, + "flos": 18256877539200.0, + "grad_norm": 6.440505021818421, + "language_loss": 0.68696958, + "learning_rate": 1.2904260863658034e-06, + "loss": 0.70761889, + "num_input_tokens_seen": 224614225, + "step": 10428, + "time_per_iteration": 2.6560239791870117 + }, + { + "auxiliary_loss_clip": 0.01019592, + "auxiliary_loss_mlp": 0.01040807, + "balance_loss_clip": 1.02285361, + "balance_loss_mlp": 1.02830219, + "epoch": 0.6270254020742522, + "flos": 11765413877760.0, + "grad_norm": 1.9931671151099217, + "language_loss": 0.71427512, + "learning_rate": 1.2900619737237928e-06, + "loss": 0.73487914, + "num_input_tokens_seen": 224632365, + "step": 10429, + "time_per_iteration": 2.7709805965423584 + }, + { + "auxiliary_loss_clip": 0.01056617, + "auxiliary_loss_mlp": 0.01031776, + "balance_loss_clip": 1.0259186, + "balance_loss_mlp": 1.01977801, + "epoch": 0.6270855253269202, + "flos": 23475631530240.0, + "grad_norm": 1.4039496867660373, + "language_loss": 0.79864502, + "learning_rate": 1.2896978880027023e-06, + "loss": 0.81952894, + "num_input_tokens_seen": 224651125, + "step": 10430, + "time_per_iteration": 2.6294031143188477 + }, + { + "auxiliary_loss_clip": 0.01008824, + "auxiliary_loss_mlp": 0.0100359, + "balance_loss_clip": 1.00239468, + "balance_loss_mlp": 1.00242746, + "epoch": 0.6271456485795882, + "flos": 70064520232320.0, + "grad_norm": 0.7965647727400755, + "language_loss": 0.59164572, + "learning_rate": 1.2893338292163393e-06, + "loss": 0.6117698, + "num_input_tokens_seen": 224716115, + "step": 10431, + "time_per_iteration": 3.266554832458496 + }, + { + "auxiliary_loss_clip": 0.00989184, + "auxiliary_loss_mlp": 0.0100079, + "balance_loss_clip": 1.00255442, + "balance_loss_mlp": 0.99962789, + "epoch": 0.6272057718322561, + "flos": 65156718280320.0, + "grad_norm": 0.9056675973927542, + "language_loss": 0.63727057, + "learning_rate": 1.2889697973785095e-06, + "loss": 0.6571703, + "num_input_tokens_seen": 224782930, + "step": 10432, + "time_per_iteration": 3.305368185043335 + }, + { + "auxiliary_loss_clip": 0.0104313, + "auxiliary_loss_mlp": 0.0103069, + "balance_loss_clip": 1.02496684, + "balance_loss_mlp": 1.02030659, + "epoch": 0.6272658950849241, + "flos": 24389342720640.0, + "grad_norm": 1.8325500332120201, + "language_loss": 0.64895082, + "learning_rate": 1.2886057925030153e-06, + "loss": 0.66968894, + "num_input_tokens_seen": 224802010, + "step": 10433, + "time_per_iteration": 2.7308566570281982 + }, + { + "auxiliary_loss_clip": 0.01058522, + "auxiliary_loss_mlp": 0.01035262, + "balance_loss_clip": 1.02672422, + "balance_loss_mlp": 1.02348399, + "epoch": 0.627326018337592, + "flos": 17966001202560.0, + "grad_norm": 2.194824870499616, + "language_loss": 0.62245488, + "learning_rate": 1.2882418146036612e-06, + "loss": 0.64339274, + "num_input_tokens_seen": 224818875, + "step": 10434, + "time_per_iteration": 2.649958848953247 + }, + { + "auxiliary_loss_clip": 0.01033915, + "auxiliary_loss_mlp": 0.01027109, + "balance_loss_clip": 1.02416933, + "balance_loss_mlp": 1.01611197, + "epoch": 0.6273861415902601, + "flos": 20230097224320.0, + "grad_norm": 1.6885510387139078, + "language_loss": 0.84488451, + "learning_rate": 1.2878778636942484e-06, + "loss": 0.86549479, + "num_input_tokens_seen": 224837790, + "step": 10435, + "time_per_iteration": 4.348103761672974 + }, + { + "auxiliary_loss_clip": 0.01007306, + "auxiliary_loss_mlp": 0.01002912, + "balance_loss_clip": 1.00096238, + "balance_loss_mlp": 1.00188053, + "epoch": 0.627446264842928, + "flos": 64953210798720.0, + "grad_norm": 0.7302364614484067, + "language_loss": 0.61562598, + "learning_rate": 1.2875139397885786e-06, + "loss": 0.63572812, + "num_input_tokens_seen": 224899685, + "step": 10436, + "time_per_iteration": 3.1371235847473145 + }, + { + "auxiliary_loss_clip": 0.01046629, + "auxiliary_loss_mlp": 0.01033204, + "balance_loss_clip": 1.02768683, + "balance_loss_mlp": 1.02125299, + "epoch": 0.627506388095596, + "flos": 23584261236480.0, + "grad_norm": 1.6024865801345933, + "language_loss": 0.77462709, + "learning_rate": 1.2871500429004523e-06, + "loss": 0.79542542, + "num_input_tokens_seen": 224918650, + "step": 10437, + "time_per_iteration": 2.6207990646362305 + }, + { + "auxiliary_loss_clip": 0.0099974, + "auxiliary_loss_mlp": 0.01011777, + "balance_loss_clip": 1.00291038, + "balance_loss_mlp": 1.01082885, + "epoch": 0.6275665113482639, + "flos": 67583631674880.0, + "grad_norm": 0.7269810299189353, + "language_loss": 0.54335153, + "learning_rate": 1.2867861730436667e-06, + "loss": 0.56346673, + "num_input_tokens_seen": 224981575, + "step": 10438, + "time_per_iteration": 3.072565793991089 + }, + { + "auxiliary_loss_clip": 0.01012919, + "auxiliary_loss_mlp": 0.01043342, + "balance_loss_clip": 1.02079308, + "balance_loss_mlp": 1.03155863, + "epoch": 0.6276266346009319, + "flos": 27636924101760.0, + "grad_norm": 1.7207279390961738, + "language_loss": 0.83940852, + "learning_rate": 1.2864223302320214e-06, + "loss": 0.85997111, + "num_input_tokens_seen": 225000820, + "step": 10439, + "time_per_iteration": 2.794745445251465 + }, + { + "auxiliary_loss_clip": 0.01033621, + "auxiliary_loss_mlp": 0.010454, + "balance_loss_clip": 1.02879477, + "balance_loss_mlp": 1.03284144, + "epoch": 0.6276867578535998, + "flos": 22746142218240.0, + "grad_norm": 2.275412580008307, + "language_loss": 0.80181128, + "learning_rate": 1.2860585144793128e-06, + "loss": 0.82260156, + "num_input_tokens_seen": 225017585, + "step": 10440, + "time_per_iteration": 2.744946002960205 + }, + { + "auxiliary_loss_clip": 0.0101456, + "auxiliary_loss_mlp": 0.01031148, + "balance_loss_clip": 1.02170956, + "balance_loss_mlp": 1.02096152, + "epoch": 0.6277468811062679, + "flos": 24644200694400.0, + "grad_norm": 1.9596746468206934, + "language_loss": 0.74675983, + "learning_rate": 1.285694725799337e-06, + "loss": 0.76721692, + "num_input_tokens_seen": 225039085, + "step": 10441, + "time_per_iteration": 2.894540548324585 + }, + { + "auxiliary_loss_clip": 0.01045361, + "auxiliary_loss_mlp": 0.01031674, + "balance_loss_clip": 1.02247381, + "balance_loss_mlp": 1.01997316, + "epoch": 0.6278070043589358, + "flos": 19678975873920.0, + "grad_norm": 1.770992091690889, + "language_loss": 0.72206378, + "learning_rate": 1.2853309642058884e-06, + "loss": 0.74283415, + "num_input_tokens_seen": 225058105, + "step": 10442, + "time_per_iteration": 2.6188013553619385 + }, + { + "auxiliary_loss_clip": 0.01029894, + "auxiliary_loss_mlp": 0.01031429, + "balance_loss_clip": 1.0234344, + "balance_loss_mlp": 1.01991379, + "epoch": 0.6278671276116038, + "flos": 22121834906880.0, + "grad_norm": 1.456529845758097, + "language_loss": 0.71473765, + "learning_rate": 1.284967229712762e-06, + "loss": 0.73535085, + "num_input_tokens_seen": 225077605, + "step": 10443, + "time_per_iteration": 2.7418413162231445 + }, + { + "auxiliary_loss_clip": 0.01066903, + "auxiliary_loss_mlp": 0.01027615, + "balance_loss_clip": 1.02724361, + "balance_loss_mlp": 1.0162425, + "epoch": 0.6279272508642717, + "flos": 23038562839680.0, + "grad_norm": 1.9941352706885465, + "language_loss": 0.73299301, + "learning_rate": 1.2846035223337492e-06, + "loss": 0.7539382, + "num_input_tokens_seen": 225097775, + "step": 10444, + "time_per_iteration": 4.25124192237854 + }, + { + "auxiliary_loss_clip": 0.01041913, + "auxiliary_loss_mlp": 0.01030955, + "balance_loss_clip": 1.0325706, + "balance_loss_mlp": 1.0197854, + "epoch": 0.6279873741169397, + "flos": 19824090819840.0, + "grad_norm": 2.022966118654341, + "language_loss": 0.72553849, + "learning_rate": 1.2842398420826423e-06, + "loss": 0.7462672, + "num_input_tokens_seen": 225115585, + "step": 10445, + "time_per_iteration": 2.7029476165771484 + }, + { + "auxiliary_loss_clip": 0.01055541, + "auxiliary_loss_mlp": 0.01029899, + "balance_loss_clip": 1.0262202, + "balance_loss_mlp": 1.01902699, + "epoch": 0.6280474973696077, + "flos": 23915393740800.0, + "grad_norm": 1.5235014833830398, + "language_loss": 0.69215798, + "learning_rate": 1.2838761889732331e-06, + "loss": 0.71301234, + "num_input_tokens_seen": 225135575, + "step": 10446, + "time_per_iteration": 2.586463451385498 + }, + { + "auxiliary_loss_clip": 0.01025354, + "auxiliary_loss_mlp": 0.01029925, + "balance_loss_clip": 1.02678323, + "balance_loss_mlp": 1.01729441, + "epoch": 0.6281076206222757, + "flos": 17967976450560.0, + "grad_norm": 1.8380915365353185, + "language_loss": 0.73722279, + "learning_rate": 1.2835125630193102e-06, + "loss": 0.75777555, + "num_input_tokens_seen": 225154230, + "step": 10447, + "time_per_iteration": 2.7819395065307617 + }, + { + "auxiliary_loss_clip": 0.01002466, + "auxiliary_loss_mlp": 0.01003, + "balance_loss_clip": 1.00587511, + "balance_loss_mlp": 1.0019213, + "epoch": 0.6281677438749437, + "flos": 66778370622720.0, + "grad_norm": 0.6810708984554067, + "language_loss": 0.52384353, + "learning_rate": 1.2831489642346626e-06, + "loss": 0.54389822, + "num_input_tokens_seen": 225213650, + "step": 10448, + "time_per_iteration": 3.1598572731018066 + }, + { + "auxiliary_loss_clip": 0.01039568, + "auxiliary_loss_mlp": 0.01047472, + "balance_loss_clip": 1.02413714, + "balance_loss_mlp": 1.03472829, + "epoch": 0.6282278671276116, + "flos": 11656173640320.0, + "grad_norm": 2.2631955233011922, + "language_loss": 0.91004568, + "learning_rate": 1.282785392633079e-06, + "loss": 0.93091607, + "num_input_tokens_seen": 225230135, + "step": 10449, + "time_per_iteration": 2.61386775970459 + }, + { + "auxiliary_loss_clip": 0.01063845, + "auxiliary_loss_mlp": 0.01028869, + "balance_loss_clip": 1.02485776, + "balance_loss_mlp": 1.01861727, + "epoch": 0.6282879903802796, + "flos": 42741597847680.0, + "grad_norm": 1.530816191848619, + "language_loss": 0.60195768, + "learning_rate": 1.2824218482283438e-06, + "loss": 0.62288487, + "num_input_tokens_seen": 225253520, + "step": 10450, + "time_per_iteration": 2.8241829872131348 + }, + { + "auxiliary_loss_clip": 0.01046074, + "auxiliary_loss_mlp": 0.01028834, + "balance_loss_clip": 1.02738643, + "balance_loss_mlp": 1.01816487, + "epoch": 0.6283481136329475, + "flos": 20009210538240.0, + "grad_norm": 1.4919742094648172, + "language_loss": 0.77134323, + "learning_rate": 1.2820583310342452e-06, + "loss": 0.79209232, + "num_input_tokens_seen": 225272460, + "step": 10451, + "time_per_iteration": 2.7457692623138428 + }, + { + "auxiliary_loss_clip": 0.0104153, + "auxiliary_loss_mlp": 0.0102915, + "balance_loss_clip": 1.02306986, + "balance_loss_mlp": 1.01767659, + "epoch": 0.6284082368856155, + "flos": 21904431840000.0, + "grad_norm": 1.7091065206449017, + "language_loss": 0.7756027, + "learning_rate": 1.281694841064566e-06, + "loss": 0.79630953, + "num_input_tokens_seen": 225291700, + "step": 10452, + "time_per_iteration": 2.703059673309326 + }, + { + "auxiliary_loss_clip": 0.01040078, + "auxiliary_loss_mlp": 0.01032014, + "balance_loss_clip": 1.03048825, + "balance_loss_mlp": 1.02084446, + "epoch": 0.6284683601382834, + "flos": 25484187219840.0, + "grad_norm": 1.7437476999755042, + "language_loss": 0.72657919, + "learning_rate": 1.2813313783330904e-06, + "loss": 0.74730009, + "num_input_tokens_seen": 225311470, + "step": 10453, + "time_per_iteration": 2.8312103748321533 + }, + { + "auxiliary_loss_clip": 0.01007683, + "auxiliary_loss_mlp": 0.01035899, + "balance_loss_clip": 1.01959562, + "balance_loss_mlp": 1.02226758, + "epoch": 0.6285284833909515, + "flos": 16538695395840.0, + "grad_norm": 1.586573340465542, + "language_loss": 0.80342853, + "learning_rate": 1.2809679428536013e-06, + "loss": 0.82386434, + "num_input_tokens_seen": 225328385, + "step": 10454, + "time_per_iteration": 2.8003883361816406 + }, + { + "auxiliary_loss_clip": 0.01037998, + "auxiliary_loss_mlp": 0.01029764, + "balance_loss_clip": 1.02862465, + "balance_loss_mlp": 1.01941681, + "epoch": 0.6285886066436194, + "flos": 22820692896000.0, + "grad_norm": 1.9165637210707143, + "language_loss": 0.82074636, + "learning_rate": 1.2806045346398792e-06, + "loss": 0.84142399, + "num_input_tokens_seen": 225348415, + "step": 10455, + "time_per_iteration": 2.7910983562469482 + }, + { + "auxiliary_loss_clip": 0.01014248, + "auxiliary_loss_mlp": 0.00747643, + "balance_loss_clip": 1.02158535, + "balance_loss_mlp": 1.00051308, + "epoch": 0.6286487298962874, + "flos": 24715734629760.0, + "grad_norm": 1.6243450489086457, + "language_loss": 0.81769443, + "learning_rate": 1.280241153705706e-06, + "loss": 0.83531332, + "num_input_tokens_seen": 225367740, + "step": 10456, + "time_per_iteration": 2.919262647628784 + }, + { + "auxiliary_loss_clip": 0.01040984, + "auxiliary_loss_mlp": 0.01029169, + "balance_loss_clip": 1.02762485, + "balance_loss_mlp": 1.01734924, + "epoch": 0.6287088531489553, + "flos": 20740818752640.0, + "grad_norm": 1.558819003636473, + "language_loss": 0.72018164, + "learning_rate": 1.27987780006486e-06, + "loss": 0.74088323, + "num_input_tokens_seen": 225388405, + "step": 10457, + "time_per_iteration": 2.766191005706787 + }, + { + "auxiliary_loss_clip": 0.01054836, + "auxiliary_loss_mlp": 0.01032841, + "balance_loss_clip": 1.02331209, + "balance_loss_mlp": 1.0211103, + "epoch": 0.6287689764016233, + "flos": 23070630706560.0, + "grad_norm": 1.877271553936505, + "language_loss": 0.7946409, + "learning_rate": 1.2795144737311202e-06, + "loss": 0.81551766, + "num_input_tokens_seen": 225408360, + "step": 10458, + "time_per_iteration": 2.73453688621521 + }, + { + "auxiliary_loss_clip": 0.01057232, + "auxiliary_loss_mlp": 0.01030288, + "balance_loss_clip": 1.02640343, + "balance_loss_mlp": 1.01915932, + "epoch": 0.6288290996542913, + "flos": 32233669251840.0, + "grad_norm": 4.033156365514623, + "language_loss": 0.61178589, + "learning_rate": 1.2791511747182635e-06, + "loss": 0.6326611, + "num_input_tokens_seen": 225431310, + "step": 10459, + "time_per_iteration": 2.802022695541382 + }, + { + "auxiliary_loss_clip": 0.01044859, + "auxiliary_loss_mlp": 0.01028239, + "balance_loss_clip": 1.02654088, + "balance_loss_mlp": 1.01772511, + "epoch": 0.6288892229069593, + "flos": 24641327606400.0, + "grad_norm": 9.158346992612232, + "language_loss": 0.78561711, + "learning_rate": 1.2787879030400666e-06, + "loss": 0.80634815, + "num_input_tokens_seen": 225450385, + "step": 10460, + "time_per_iteration": 2.755478620529175 + }, + { + "auxiliary_loss_clip": 0.01026289, + "auxiliary_loss_mlp": 0.01030514, + "balance_loss_clip": 1.02276838, + "balance_loss_mlp": 1.01901662, + "epoch": 0.6289493461596273, + "flos": 17858341163520.0, + "grad_norm": 1.6126296513776959, + "language_loss": 0.74262124, + "learning_rate": 1.2784246587103047e-06, + "loss": 0.76318926, + "num_input_tokens_seen": 225467325, + "step": 10461, + "time_per_iteration": 2.7400014400482178 + }, + { + "auxiliary_loss_clip": 0.01042688, + "auxiliary_loss_mlp": 0.01037506, + "balance_loss_clip": 1.0241456, + "balance_loss_mlp": 1.02639008, + "epoch": 0.6290094694122952, + "flos": 22345379199360.0, + "grad_norm": 1.872128132786842, + "language_loss": 0.70129174, + "learning_rate": 1.2780614417427523e-06, + "loss": 0.7220937, + "num_input_tokens_seen": 225487370, + "step": 10462, + "time_per_iteration": 2.6788904666900635 + }, + { + "auxiliary_loss_clip": 0.01059531, + "auxiliary_loss_mlp": 0.01027107, + "balance_loss_clip": 1.02518737, + "balance_loss_mlp": 1.0175581, + "epoch": 0.6290695926649632, + "flos": 28402431776640.0, + "grad_norm": 1.8411312930687913, + "language_loss": 0.7205162, + "learning_rate": 1.2776982521511821e-06, + "loss": 0.74138254, + "num_input_tokens_seen": 225506915, + "step": 10463, + "time_per_iteration": 2.717029333114624 + }, + { + "auxiliary_loss_clip": 0.01045781, + "auxiliary_loss_mlp": 0.01034718, + "balance_loss_clip": 1.02719748, + "balance_loss_mlp": 1.02354777, + "epoch": 0.6291297159176311, + "flos": 21505464501120.0, + "grad_norm": 1.6221243211786234, + "language_loss": 0.72783518, + "learning_rate": 1.2773350899493665e-06, + "loss": 0.74864018, + "num_input_tokens_seen": 225525670, + "step": 10464, + "time_per_iteration": 2.6703643798828125 + }, + { + "auxiliary_loss_clip": 0.010368, + "auxiliary_loss_mlp": 0.01026542, + "balance_loss_clip": 1.02646351, + "balance_loss_mlp": 1.01634395, + "epoch": 0.6291898391702991, + "flos": 12203308581120.0, + "grad_norm": 1.6638729078580172, + "language_loss": 0.69476068, + "learning_rate": 1.2769719551510768e-06, + "loss": 0.71539414, + "num_input_tokens_seen": 225542235, + "step": 10465, + "time_per_iteration": 2.5832982063293457 + }, + { + "auxiliary_loss_clip": 0.00999744, + "auxiliary_loss_mlp": 0.01006046, + "balance_loss_clip": 1.00298381, + "balance_loss_mlp": 1.00494373, + "epoch": 0.629249962422967, + "flos": 69299479434240.0, + "grad_norm": 0.6799793346985195, + "language_loss": 0.59751827, + "learning_rate": 1.2766088477700832e-06, + "loss": 0.61757618, + "num_input_tokens_seen": 225607185, + "step": 10466, + "time_per_iteration": 3.271010160446167 + }, + { + "auxiliary_loss_clip": 0.01030474, + "auxiliary_loss_mlp": 0.01028001, + "balance_loss_clip": 1.02159214, + "balance_loss_mlp": 1.01797533, + "epoch": 0.6293100856756351, + "flos": 40077888042240.0, + "grad_norm": 1.9391486269165519, + "language_loss": 0.64530301, + "learning_rate": 1.276245767820154e-06, + "loss": 0.66588771, + "num_input_tokens_seen": 225628785, + "step": 10467, + "time_per_iteration": 2.8177008628845215 + }, + { + "auxiliary_loss_clip": 0.00989414, + "auxiliary_loss_mlp": 0.01002188, + "balance_loss_clip": 1.00339818, + "balance_loss_mlp": 1.00125182, + "epoch": 0.629370208928303, + "flos": 67501108177920.0, + "grad_norm": 0.8023877971646673, + "language_loss": 0.56933224, + "learning_rate": 1.2758827153150586e-06, + "loss": 0.58924824, + "num_input_tokens_seen": 225678980, + "step": 10468, + "time_per_iteration": 4.586151123046875 + }, + { + "auxiliary_loss_clip": 0.0097174, + "auxiliary_loss_mlp": 0.01003248, + "balance_loss_clip": 1.00360417, + "balance_loss_mlp": 1.00217497, + "epoch": 0.629430332180971, + "flos": 60660450449280.0, + "grad_norm": 0.740974523704105, + "language_loss": 0.57999557, + "learning_rate": 1.2755196902685626e-06, + "loss": 0.59974545, + "num_input_tokens_seen": 225740295, + "step": 10469, + "time_per_iteration": 4.852872371673584 + }, + { + "auxiliary_loss_clip": 0.01003125, + "auxiliary_loss_mlp": 0.01001546, + "balance_loss_clip": 1.00648093, + "balance_loss_mlp": 1.0001812, + "epoch": 0.6294904554336389, + "flos": 66869764778880.0, + "grad_norm": 0.6926532799877383, + "language_loss": 0.52110171, + "learning_rate": 1.2751566926944329e-06, + "loss": 0.54114842, + "num_input_tokens_seen": 225805615, + "step": 10470, + "time_per_iteration": 3.2957708835601807 + }, + { + "auxiliary_loss_clip": 0.0105363, + "auxiliary_loss_mlp": 0.01029635, + "balance_loss_clip": 1.0257684, + "balance_loss_mlp": 1.01913297, + "epoch": 0.6295505786863069, + "flos": 42522794150400.0, + "grad_norm": 1.7316748400085848, + "language_loss": 0.74476802, + "learning_rate": 1.2747937226064342e-06, + "loss": 0.76560068, + "num_input_tokens_seen": 225826585, + "step": 10471, + "time_per_iteration": 2.865450620651245 + }, + { + "auxiliary_loss_clip": 0.01048408, + "auxiliary_loss_mlp": 0.01029084, + "balance_loss_clip": 1.02938747, + "balance_loss_mlp": 1.01826596, + "epoch": 0.629610701938975, + "flos": 17384140788480.0, + "grad_norm": 1.7563924279049385, + "language_loss": 0.62396359, + "learning_rate": 1.2744307800183297e-06, + "loss": 0.64473855, + "num_input_tokens_seen": 225844095, + "step": 10472, + "time_per_iteration": 2.751567840576172 + }, + { + "auxiliary_loss_clip": 0.01070734, + "auxiliary_loss_mlp": 0.01033524, + "balance_loss_clip": 1.03006852, + "balance_loss_mlp": 1.02234793, + "epoch": 0.6296708251916429, + "flos": 24242934885120.0, + "grad_norm": 1.626920255593424, + "language_loss": 0.69239223, + "learning_rate": 1.2740678649438828e-06, + "loss": 0.71343482, + "num_input_tokens_seen": 225864310, + "step": 10473, + "time_per_iteration": 2.8435685634613037 + }, + { + "auxiliary_loss_clip": 0.01039127, + "auxiliary_loss_mlp": 0.01028896, + "balance_loss_clip": 1.02318001, + "balance_loss_mlp": 1.01805413, + "epoch": 0.6297309484443109, + "flos": 19278536077440.0, + "grad_norm": 1.6018250444920041, + "language_loss": 0.74567175, + "learning_rate": 1.2737049773968554e-06, + "loss": 0.766352, + "num_input_tokens_seen": 225883830, + "step": 10474, + "time_per_iteration": 2.836400270462036 + }, + { + "auxiliary_loss_clip": 0.01039683, + "auxiliary_loss_mlp": 0.00747648, + "balance_loss_clip": 1.02344441, + "balance_loss_mlp": 1.00058472, + "epoch": 0.6297910716969788, + "flos": 30662685043200.0, + "grad_norm": 2.2032484059738446, + "language_loss": 0.66277885, + "learning_rate": 1.2733421173910081e-06, + "loss": 0.6806522, + "num_input_tokens_seen": 225905755, + "step": 10475, + "time_per_iteration": 2.8756346702575684 + }, + { + "auxiliary_loss_clip": 0.01021789, + "auxiliary_loss_mlp": 0.01027999, + "balance_loss_clip": 1.0256741, + "balance_loss_mlp": 1.01808631, + "epoch": 0.6298511949496468, + "flos": 14423018371200.0, + "grad_norm": 1.7818614901545824, + "language_loss": 0.90303409, + "learning_rate": 1.272979284940101e-06, + "loss": 0.92353195, + "num_input_tokens_seen": 225922155, + "step": 10476, + "time_per_iteration": 2.7922263145446777 + }, + { + "auxiliary_loss_clip": 0.01065308, + "auxiliary_loss_mlp": 0.01030061, + "balance_loss_clip": 1.02736986, + "balance_loss_mlp": 1.01991582, + "epoch": 0.6299113182023147, + "flos": 23514163845120.0, + "grad_norm": 1.7452189354324785, + "language_loss": 0.75470138, + "learning_rate": 1.2726164800578913e-06, + "loss": 0.77565503, + "num_input_tokens_seen": 225941060, + "step": 10477, + "time_per_iteration": 2.700345516204834 + }, + { + "auxiliary_loss_clip": 0.01055931, + "auxiliary_loss_mlp": 0.01025514, + "balance_loss_clip": 1.02653646, + "balance_loss_mlp": 1.01451123, + "epoch": 0.6299714414549827, + "flos": 22674500542080.0, + "grad_norm": 1.8139045185995024, + "language_loss": 0.70532215, + "learning_rate": 1.272253702758138e-06, + "loss": 0.72613668, + "num_input_tokens_seen": 225960870, + "step": 10478, + "time_per_iteration": 2.718966484069824 + }, + { + "auxiliary_loss_clip": 0.0105776, + "auxiliary_loss_mlp": 0.01026482, + "balance_loss_clip": 1.02629507, + "balance_loss_mlp": 1.01472163, + "epoch": 0.6300315647076506, + "flos": 14501735026560.0, + "grad_norm": 2.4282946409759, + "language_loss": 0.67547274, + "learning_rate": 1.2718909530545974e-06, + "loss": 0.69631511, + "num_input_tokens_seen": 225977895, + "step": 10479, + "time_per_iteration": 2.672072172164917 + }, + { + "auxiliary_loss_clip": 0.01041854, + "auxiliary_loss_mlp": 0.00747608, + "balance_loss_clip": 1.02497959, + "balance_loss_mlp": 1.0005517, + "epoch": 0.6300916879603187, + "flos": 21871681614720.0, + "grad_norm": 1.7265858073478237, + "language_loss": 0.73719388, + "learning_rate": 1.2715282309610245e-06, + "loss": 0.75508851, + "num_input_tokens_seen": 225997835, + "step": 10480, + "time_per_iteration": 2.852064371109009 + }, + { + "auxiliary_loss_clip": 0.01055155, + "auxiliary_loss_mlp": 0.01030483, + "balance_loss_clip": 1.02612364, + "balance_loss_mlp": 1.01871085, + "epoch": 0.6301518112129866, + "flos": 21834047139840.0, + "grad_norm": 2.1708809501159183, + "language_loss": 0.7904833, + "learning_rate": 1.2711655364911744e-06, + "loss": 0.81133962, + "num_input_tokens_seen": 226017620, + "step": 10481, + "time_per_iteration": 4.243902206420898 + }, + { + "auxiliary_loss_clip": 0.00989914, + "auxiliary_loss_mlp": 0.01001073, + "balance_loss_clip": 1.00244117, + "balance_loss_mlp": 0.99997061, + "epoch": 0.6302119344656546, + "flos": 44334237957120.0, + "grad_norm": 0.8953260463015312, + "language_loss": 0.61870134, + "learning_rate": 1.2708028696588e-06, + "loss": 0.6386112, + "num_input_tokens_seen": 226068755, + "step": 10482, + "time_per_iteration": 2.9740333557128906 + }, + { + "auxiliary_loss_clip": 0.01058814, + "auxiliary_loss_mlp": 0.01031708, + "balance_loss_clip": 1.02648282, + "balance_loss_mlp": 1.01935148, + "epoch": 0.6302720577183225, + "flos": 11217919800960.0, + "grad_norm": 2.138882910201586, + "language_loss": 0.82666171, + "learning_rate": 1.2704402304776541e-06, + "loss": 0.84756696, + "num_input_tokens_seen": 226084395, + "step": 10483, + "time_per_iteration": 2.565833568572998 + }, + { + "auxiliary_loss_clip": 0.01051632, + "auxiliary_loss_mlp": 0.0103453, + "balance_loss_clip": 1.0255754, + "balance_loss_mlp": 1.02403331, + "epoch": 0.6303321809709905, + "flos": 27964932122880.0, + "grad_norm": 1.5574656079982492, + "language_loss": 0.72643661, + "learning_rate": 1.270077618961487e-06, + "loss": 0.74729824, + "num_input_tokens_seen": 226105890, + "step": 10484, + "time_per_iteration": 2.6642253398895264 + }, + { + "auxiliary_loss_clip": 0.01037446, + "auxiliary_loss_mlp": 0.01024398, + "balance_loss_clip": 1.02668464, + "balance_loss_mlp": 1.01342463, + "epoch": 0.6303923042236586, + "flos": 28220759763840.0, + "grad_norm": 1.9642306002320398, + "language_loss": 0.74538213, + "learning_rate": 1.2697150351240506e-06, + "loss": 0.76600063, + "num_input_tokens_seen": 226126760, + "step": 10485, + "time_per_iteration": 2.7709081172943115 + }, + { + "auxiliary_loss_clip": 0.01049216, + "auxiliary_loss_mlp": 0.00747826, + "balance_loss_clip": 1.02757287, + "balance_loss_mlp": 1.0006423, + "epoch": 0.6304524274763265, + "flos": 27631034271360.0, + "grad_norm": 1.603543670990848, + "language_loss": 0.81299096, + "learning_rate": 1.269352478979093e-06, + "loss": 0.83096135, + "num_input_tokens_seen": 226147315, + "step": 10486, + "time_per_iteration": 2.7380077838897705 + }, + { + "auxiliary_loss_clip": 0.01044396, + "auxiliary_loss_mlp": 0.01035582, + "balance_loss_clip": 1.02594066, + "balance_loss_mlp": 1.02502024, + "epoch": 0.6305125507289945, + "flos": 17311313963520.0, + "grad_norm": 1.6617071047505687, + "language_loss": 0.63744974, + "learning_rate": 1.2689899505403628e-06, + "loss": 0.6582495, + "num_input_tokens_seen": 226165935, + "step": 10487, + "time_per_iteration": 2.749865770339966 + }, + { + "auxiliary_loss_clip": 0.01064296, + "auxiliary_loss_mlp": 0.01034407, + "balance_loss_clip": 1.02564073, + "balance_loss_mlp": 1.02398777, + "epoch": 0.6305726739816624, + "flos": 25808280658560.0, + "grad_norm": 1.5814425036270534, + "language_loss": 0.67351449, + "learning_rate": 1.2686274498216065e-06, + "loss": 0.69450152, + "num_input_tokens_seen": 226186890, + "step": 10488, + "time_per_iteration": 2.656324625015259 + }, + { + "auxiliary_loss_clip": 0.01045457, + "auxiliary_loss_mlp": 0.01027391, + "balance_loss_clip": 1.02502251, + "balance_loss_mlp": 1.01667404, + "epoch": 0.6306327972343304, + "flos": 21797454159360.0, + "grad_norm": 1.7654238152496455, + "language_loss": 0.67366344, + "learning_rate": 1.2682649768365706e-06, + "loss": 0.69439197, + "num_input_tokens_seen": 226206710, + "step": 10489, + "time_per_iteration": 2.6770548820495605 + }, + { + "auxiliary_loss_clip": 0.01032565, + "auxiliary_loss_mlp": 0.01037467, + "balance_loss_clip": 1.02508378, + "balance_loss_mlp": 1.0241158, + "epoch": 0.6306929204869983, + "flos": 20777375819520.0, + "grad_norm": 1.9320455036482296, + "language_loss": 0.68859756, + "learning_rate": 1.2679025315990007e-06, + "loss": 0.7092979, + "num_input_tokens_seen": 226225565, + "step": 10490, + "time_per_iteration": 2.713027000427246 + }, + { + "auxiliary_loss_clip": 0.01039718, + "auxiliary_loss_mlp": 0.01035093, + "balance_loss_clip": 1.02309632, + "balance_loss_mlp": 1.02370286, + "epoch": 0.6307530437396663, + "flos": 23654214973440.0, + "grad_norm": 1.9092513241334235, + "language_loss": 0.78105474, + "learning_rate": 1.2675401141226393e-06, + "loss": 0.80180287, + "num_input_tokens_seen": 226243680, + "step": 10491, + "time_per_iteration": 4.5049378871917725 + }, + { + "auxiliary_loss_clip": 0.0103707, + "auxiliary_loss_mlp": 0.01030994, + "balance_loss_clip": 1.02327919, + "balance_loss_mlp": 1.02041411, + "epoch": 0.6308131669923343, + "flos": 24719002767360.0, + "grad_norm": 1.7546841015400607, + "language_loss": 0.55285048, + "learning_rate": 1.2671777244212308e-06, + "loss": 0.57353115, + "num_input_tokens_seen": 226264345, + "step": 10492, + "time_per_iteration": 2.915705442428589 + }, + { + "auxiliary_loss_clip": 0.01066164, + "auxiliary_loss_mlp": 0.01033408, + "balance_loss_clip": 1.02546275, + "balance_loss_mlp": 1.0213623, + "epoch": 0.6308732902450023, + "flos": 22565403959040.0, + "grad_norm": 1.8864480255223661, + "language_loss": 0.63644481, + "learning_rate": 1.2668153625085168e-06, + "loss": 0.65744048, + "num_input_tokens_seen": 226283165, + "step": 10493, + "time_per_iteration": 2.7339887619018555 + }, + { + "auxiliary_loss_clip": 0.01036752, + "auxiliary_loss_mlp": 0.0102522, + "balance_loss_clip": 1.02688396, + "balance_loss_mlp": 1.01399064, + "epoch": 0.6309334134976702, + "flos": 24644200694400.0, + "grad_norm": 1.5128886168750841, + "language_loss": 0.82658565, + "learning_rate": 1.2664530283982367e-06, + "loss": 0.8472054, + "num_input_tokens_seen": 226304080, + "step": 10494, + "time_per_iteration": 2.9093546867370605 + }, + { + "auxiliary_loss_clip": 0.01052024, + "auxiliary_loss_mlp": 0.0103213, + "balance_loss_clip": 1.02987432, + "balance_loss_mlp": 1.02084076, + "epoch": 0.6309935367503382, + "flos": 41427949651200.0, + "grad_norm": 1.7538688334804782, + "language_loss": 0.79206055, + "learning_rate": 1.2660907221041317e-06, + "loss": 0.81290209, + "num_input_tokens_seen": 226325925, + "step": 10495, + "time_per_iteration": 3.0033655166625977 + }, + { + "auxiliary_loss_clip": 0.01035749, + "auxiliary_loss_mlp": 0.01030913, + "balance_loss_clip": 1.02320135, + "balance_loss_mlp": 1.01902187, + "epoch": 0.6310536600030061, + "flos": 15118931445120.0, + "grad_norm": 5.971607297176601, + "language_loss": 0.70413721, + "learning_rate": 1.2657284436399403e-06, + "loss": 0.72480381, + "num_input_tokens_seen": 226344190, + "step": 10496, + "time_per_iteration": 2.828552484512329 + }, + { + "auxiliary_loss_clip": 0.01046716, + "auxiliary_loss_mlp": 0.01032391, + "balance_loss_clip": 1.02575624, + "balance_loss_mlp": 1.02139437, + "epoch": 0.6311137832556741, + "flos": 15231619388160.0, + "grad_norm": 1.9603343124547756, + "language_loss": 0.80198038, + "learning_rate": 1.2653661930193997e-06, + "loss": 0.82277137, + "num_input_tokens_seen": 226361520, + "step": 10497, + "time_per_iteration": 2.785412311553955 + }, + { + "auxiliary_loss_clip": 0.0103432, + "auxiliary_loss_mlp": 0.01030149, + "balance_loss_clip": 1.02294588, + "balance_loss_mlp": 1.01979601, + "epoch": 0.6311739065083422, + "flos": 22018664067840.0, + "grad_norm": 1.963187338595948, + "language_loss": 0.73401618, + "learning_rate": 1.265003970256247e-06, + "loss": 0.75466084, + "num_input_tokens_seen": 226381920, + "step": 10498, + "time_per_iteration": 2.6689393520355225 + }, + { + "auxiliary_loss_clip": 0.01053142, + "auxiliary_loss_mlp": 0.01031427, + "balance_loss_clip": 1.02453458, + "balance_loss_mlp": 1.0205729, + "epoch": 0.6312340297610101, + "flos": 22710770300160.0, + "grad_norm": 1.9049461848248517, + "language_loss": 0.69421792, + "learning_rate": 1.264641775364217e-06, + "loss": 0.71506363, + "num_input_tokens_seen": 226400035, + "step": 10499, + "time_per_iteration": 2.679337739944458 + }, + { + "auxiliary_loss_clip": 0.01055242, + "auxiliary_loss_mlp": 0.01034032, + "balance_loss_clip": 1.02756023, + "balance_loss_mlp": 1.02326167, + "epoch": 0.6312941530136781, + "flos": 24280102483200.0, + "grad_norm": 1.8530716256303494, + "language_loss": 0.69810134, + "learning_rate": 1.2642796083570448e-06, + "loss": 0.71899414, + "num_input_tokens_seen": 226418280, + "step": 10500, + "time_per_iteration": 2.762791872024536 + }, + { + "auxiliary_loss_clip": 0.01065654, + "auxiliary_loss_mlp": 0.01031406, + "balance_loss_clip": 1.02653706, + "balance_loss_mlp": 1.02118409, + "epoch": 0.631354276266346, + "flos": 21725956137600.0, + "grad_norm": 1.809907285871172, + "language_loss": 0.74154949, + "learning_rate": 1.2639174692484634e-06, + "loss": 0.76252007, + "num_input_tokens_seen": 226436650, + "step": 10501, + "time_per_iteration": 2.6043686866760254 + }, + { + "auxiliary_loss_clip": 0.01053748, + "auxiliary_loss_mlp": 0.00747649, + "balance_loss_clip": 1.02506185, + "balance_loss_mlp": 1.00051439, + "epoch": 0.631414399519014, + "flos": 24025100855040.0, + "grad_norm": 1.7145909041800844, + "language_loss": 0.75565493, + "learning_rate": 1.2635553580522053e-06, + "loss": 0.77366889, + "num_input_tokens_seen": 226456275, + "step": 10502, + "time_per_iteration": 2.725769519805908 + }, + { + "auxiliary_loss_clip": 0.01058991, + "auxiliary_loss_mlp": 0.01044246, + "balance_loss_clip": 1.02737737, + "balance_loss_mlp": 1.03254008, + "epoch": 0.6314745227716819, + "flos": 24315797623680.0, + "grad_norm": 1.9811408677622848, + "language_loss": 0.85765946, + "learning_rate": 1.2631932747820022e-06, + "loss": 0.87869191, + "num_input_tokens_seen": 226473610, + "step": 10503, + "time_per_iteration": 2.593721389770508 + }, + { + "auxiliary_loss_clip": 0.01037997, + "auxiliary_loss_mlp": 0.01029544, + "balance_loss_clip": 1.02497661, + "balance_loss_mlp": 1.01821887, + "epoch": 0.6315346460243499, + "flos": 23366391292800.0, + "grad_norm": 2.6010552465509065, + "language_loss": 0.87020642, + "learning_rate": 1.2628312194515838e-06, + "loss": 0.89088184, + "num_input_tokens_seen": 226493665, + "step": 10504, + "time_per_iteration": 2.6372692584991455 + }, + { + "auxiliary_loss_clip": 0.01036991, + "auxiliary_loss_mlp": 0.01034132, + "balance_loss_clip": 1.02535522, + "balance_loss_mlp": 1.02210975, + "epoch": 0.6315947692770179, + "flos": 20260333497600.0, + "grad_norm": 1.744334628886999, + "language_loss": 0.76509511, + "learning_rate": 1.2624691920746793e-06, + "loss": 0.7858063, + "num_input_tokens_seen": 226511625, + "step": 10505, + "time_per_iteration": 2.6552581787109375 + }, + { + "auxiliary_loss_clip": 0.01023131, + "auxiliary_loss_mlp": 0.01033738, + "balance_loss_clip": 1.02391827, + "balance_loss_mlp": 1.0219959, + "epoch": 0.6316548925296859, + "flos": 25265850399360.0, + "grad_norm": 1.8273114488155835, + "language_loss": 0.81950301, + "learning_rate": 1.2621071926650166e-06, + "loss": 0.84007168, + "num_input_tokens_seen": 226530085, + "step": 10506, + "time_per_iteration": 2.7410097122192383 + }, + { + "auxiliary_loss_clip": 0.0106633, + "auxiliary_loss_mlp": 0.0102821, + "balance_loss_clip": 1.02632892, + "balance_loss_mlp": 1.01689076, + "epoch": 0.6317150157823538, + "flos": 22930579578240.0, + "grad_norm": 4.319051187082405, + "language_loss": 0.74222082, + "learning_rate": 1.2617452212363238e-06, + "loss": 0.76316619, + "num_input_tokens_seen": 226548115, + "step": 10507, + "time_per_iteration": 2.566833972930908 + }, + { + "auxiliary_loss_clip": 0.01049839, + "auxiliary_loss_mlp": 0.01035282, + "balance_loss_clip": 1.02718472, + "balance_loss_mlp": 1.02299774, + "epoch": 0.6317751390350218, + "flos": 22527051212160.0, + "grad_norm": 1.5516659862666238, + "language_loss": 0.67536896, + "learning_rate": 1.2613832778023258e-06, + "loss": 0.69622016, + "num_input_tokens_seen": 226567955, + "step": 10508, + "time_per_iteration": 2.632993459701538 + }, + { + "auxiliary_loss_clip": 0.01027421, + "auxiliary_loss_mlp": 0.01029898, + "balance_loss_clip": 1.02317691, + "balance_loss_mlp": 1.01882982, + "epoch": 0.6318352622876897, + "flos": 23294749616640.0, + "grad_norm": 1.7729176794957382, + "language_loss": 0.70558691, + "learning_rate": 1.2610213623767478e-06, + "loss": 0.72616017, + "num_input_tokens_seen": 226588205, + "step": 10509, + "time_per_iteration": 2.7785606384277344 + }, + { + "auxiliary_loss_clip": 0.0105629, + "auxiliary_loss_mlp": 0.01025522, + "balance_loss_clip": 1.02679181, + "balance_loss_mlp": 1.01432824, + "epoch": 0.6318953855403577, + "flos": 20704082117760.0, + "grad_norm": 1.7030663783192979, + "language_loss": 0.79357821, + "learning_rate": 1.2606594749733143e-06, + "loss": 0.81439632, + "num_input_tokens_seen": 226606965, + "step": 10510, + "time_per_iteration": 2.7039108276367188 + }, + { + "auxiliary_loss_clip": 0.01026488, + "auxiliary_loss_mlp": 0.00747822, + "balance_loss_clip": 1.02626252, + "balance_loss_mlp": 1.0005331, + "epoch": 0.6319555087930258, + "flos": 22820046451200.0, + "grad_norm": 1.4570110410893067, + "language_loss": 0.70264655, + "learning_rate": 1.2602976156057469e-06, + "loss": 0.7203896, + "num_input_tokens_seen": 226627845, + "step": 10511, + "time_per_iteration": 2.827500581741333 + }, + { + "auxiliary_loss_clip": 0.0106273, + "auxiliary_loss_mlp": 0.01028358, + "balance_loss_clip": 1.02542675, + "balance_loss_mlp": 1.01779568, + "epoch": 0.6320156320456937, + "flos": 19970929618560.0, + "grad_norm": 1.5853189133265562, + "language_loss": 0.80005705, + "learning_rate": 1.2599357842877684e-06, + "loss": 0.82096797, + "num_input_tokens_seen": 226645855, + "step": 10512, + "time_per_iteration": 2.606851100921631 + }, + { + "auxiliary_loss_clip": 0.01055796, + "auxiliary_loss_mlp": 0.01028615, + "balance_loss_clip": 1.02634335, + "balance_loss_mlp": 1.01717651, + "epoch": 0.6320757552983617, + "flos": 27013406889600.0, + "grad_norm": 2.0991171506716744, + "language_loss": 0.70443124, + "learning_rate": 1.2595739810330994e-06, + "loss": 0.7252754, + "num_input_tokens_seen": 226665375, + "step": 10513, + "time_per_iteration": 2.745224952697754 + }, + { + "auxiliary_loss_clip": 0.01052867, + "auxiliary_loss_mlp": 0.01029448, + "balance_loss_clip": 1.02417123, + "balance_loss_mlp": 1.01736653, + "epoch": 0.6321358785510296, + "flos": 23695943598720.0, + "grad_norm": 1.5744600438728498, + "language_loss": 0.66305017, + "learning_rate": 1.259212205855459e-06, + "loss": 0.6838733, + "num_input_tokens_seen": 226685270, + "step": 10514, + "time_per_iteration": 2.69234561920166 + }, + { + "auxiliary_loss_clip": 0.01025898, + "auxiliary_loss_mlp": 0.01031466, + "balance_loss_clip": 1.02109826, + "balance_loss_mlp": 1.02079129, + "epoch": 0.6321960018036976, + "flos": 25995231970560.0, + "grad_norm": 1.672051482657879, + "language_loss": 0.74507153, + "learning_rate": 1.2588504587685663e-06, + "loss": 0.76564515, + "num_input_tokens_seen": 226705325, + "step": 10515, + "time_per_iteration": 2.8459274768829346 + }, + { + "auxiliary_loss_clip": 0.01043132, + "auxiliary_loss_mlp": 0.01027592, + "balance_loss_clip": 1.02521515, + "balance_loss_mlp": 1.01681542, + "epoch": 0.6322561250563655, + "flos": 22821016118400.0, + "grad_norm": 1.6909638562413447, + "language_loss": 0.89850974, + "learning_rate": 1.2584887397861379e-06, + "loss": 0.91921699, + "num_input_tokens_seen": 226723815, + "step": 10516, + "time_per_iteration": 4.399543285369873 + }, + { + "auxiliary_loss_clip": 0.01068237, + "auxiliary_loss_mlp": 0.01029319, + "balance_loss_clip": 1.02606559, + "balance_loss_mlp": 1.01624155, + "epoch": 0.6323162483090335, + "flos": 18988413926400.0, + "grad_norm": 2.0245295049230827, + "language_loss": 0.82174766, + "learning_rate": 1.2581270489218911e-06, + "loss": 0.84272325, + "num_input_tokens_seen": 226741550, + "step": 10517, + "time_per_iteration": 2.5692012310028076 + }, + { + "auxiliary_loss_clip": 0.01014077, + "auxiliary_loss_mlp": 0.01035478, + "balance_loss_clip": 1.02469873, + "balance_loss_mlp": 1.02476144, + "epoch": 0.6323763715617015, + "flos": 19865173000320.0, + "grad_norm": 1.712094021067693, + "language_loss": 0.77634943, + "learning_rate": 1.257765386189541e-06, + "loss": 0.79684496, + "num_input_tokens_seen": 226761115, + "step": 10518, + "time_per_iteration": 2.78627347946167 + }, + { + "auxiliary_loss_clip": 0.01054261, + "auxiliary_loss_mlp": 0.01031051, + "balance_loss_clip": 1.02593684, + "balance_loss_mlp": 1.02038157, + "epoch": 0.6324364948143695, + "flos": 22782699285120.0, + "grad_norm": 1.5035426881885474, + "language_loss": 0.85213709, + "learning_rate": 1.2574037516028018e-06, + "loss": 0.87299019, + "num_input_tokens_seen": 226782225, + "step": 10519, + "time_per_iteration": 2.8221893310546875 + }, + { + "auxiliary_loss_clip": 0.01037198, + "auxiliary_loss_mlp": 0.01033862, + "balance_loss_clip": 1.02332652, + "balance_loss_mlp": 1.02351522, + "epoch": 0.6324966180670374, + "flos": 22235923480320.0, + "grad_norm": 1.6537304444650653, + "language_loss": 0.72209579, + "learning_rate": 1.2570421451753867e-06, + "loss": 0.74280643, + "num_input_tokens_seen": 226802375, + "step": 10520, + "time_per_iteration": 2.782742977142334 + }, + { + "auxiliary_loss_clip": 0.01051152, + "auxiliary_loss_mlp": 0.01027504, + "balance_loss_clip": 1.02344465, + "balance_loss_mlp": 1.01702547, + "epoch": 0.6325567413197054, + "flos": 21689183589120.0, + "grad_norm": 1.6727915027841087, + "language_loss": 0.71826249, + "learning_rate": 1.2566805669210081e-06, + "loss": 0.73904908, + "num_input_tokens_seen": 226822165, + "step": 10521, + "time_per_iteration": 2.717180013656616 + }, + { + "auxiliary_loss_clip": 0.01030583, + "auxiliary_loss_mlp": 0.01035827, + "balance_loss_clip": 1.02508545, + "balance_loss_mlp": 1.02260709, + "epoch": 0.6326168645723733, + "flos": 19937137898880.0, + "grad_norm": 1.7074752672429077, + "language_loss": 0.71903765, + "learning_rate": 1.256319016853377e-06, + "loss": 0.73970175, + "num_input_tokens_seen": 226841645, + "step": 10522, + "time_per_iteration": 2.667628526687622 + }, + { + "auxiliary_loss_clip": 0.01029247, + "auxiliary_loss_mlp": 0.01031829, + "balance_loss_clip": 1.02779496, + "balance_loss_mlp": 1.02135086, + "epoch": 0.6326769878250413, + "flos": 20230348619520.0, + "grad_norm": 1.7824163035669962, + "language_loss": 0.81935942, + "learning_rate": 1.2559574949862023e-06, + "loss": 0.83997023, + "num_input_tokens_seen": 226860355, + "step": 10523, + "time_per_iteration": 2.7168562412261963 + }, + { + "auxiliary_loss_clip": 0.01053494, + "auxiliary_loss_mlp": 0.01024931, + "balance_loss_clip": 1.02500093, + "balance_loss_mlp": 1.01433289, + "epoch": 0.6327371110777094, + "flos": 20775759707520.0, + "grad_norm": 2.203562974841021, + "language_loss": 0.73518217, + "learning_rate": 1.255596001333195e-06, + "loss": 0.75596642, + "num_input_tokens_seen": 226878390, + "step": 10524, + "time_per_iteration": 2.721684455871582 + }, + { + "auxiliary_loss_clip": 0.01050436, + "auxiliary_loss_mlp": 0.01033164, + "balance_loss_clip": 1.0254482, + "balance_loss_mlp": 1.02045631, + "epoch": 0.6327972343303773, + "flos": 30336544529280.0, + "grad_norm": 2.487595511838623, + "language_loss": 0.84834349, + "learning_rate": 1.2552345359080615e-06, + "loss": 0.86917949, + "num_input_tokens_seen": 226898420, + "step": 10525, + "time_per_iteration": 2.810912609100342 + }, + { + "auxiliary_loss_clip": 0.01029998, + "auxiliary_loss_mlp": 0.01028393, + "balance_loss_clip": 1.02028966, + "balance_loss_mlp": 1.01639509, + "epoch": 0.6328573575830453, + "flos": 17092258871040.0, + "grad_norm": 1.6019915569956984, + "language_loss": 0.67084873, + "learning_rate": 1.2548730987245093e-06, + "loss": 0.6914326, + "num_input_tokens_seen": 226916305, + "step": 10526, + "time_per_iteration": 2.70455002784729 + }, + { + "auxiliary_loss_clip": 0.01057911, + "auxiliary_loss_mlp": 0.01031868, + "balance_loss_clip": 1.0275135, + "balance_loss_mlp": 1.02000642, + "epoch": 0.6329174808357132, + "flos": 25047154442880.0, + "grad_norm": 1.5126163521189882, + "language_loss": 0.7370801, + "learning_rate": 1.254511689796244e-06, + "loss": 0.75797784, + "num_input_tokens_seen": 226937705, + "step": 10527, + "time_per_iteration": 2.6350111961364746 + }, + { + "auxiliary_loss_clip": 0.01053711, + "auxiliary_loss_mlp": 0.01028685, + "balance_loss_clip": 1.02622843, + "balance_loss_mlp": 1.01802206, + "epoch": 0.6329776040883812, + "flos": 16836826279680.0, + "grad_norm": 2.035423067951222, + "language_loss": 0.71824086, + "learning_rate": 1.2541503091369693e-06, + "loss": 0.73906481, + "num_input_tokens_seen": 226954880, + "step": 10528, + "time_per_iteration": 4.191166400909424 + }, + { + "auxiliary_loss_clip": 0.01053978, + "auxiliary_loss_mlp": 0.01029195, + "balance_loss_clip": 1.02455187, + "balance_loss_mlp": 1.01723242, + "epoch": 0.6330377273410491, + "flos": 13516705382400.0, + "grad_norm": 2.8045282903395914, + "language_loss": 0.66417849, + "learning_rate": 1.2537889567603905e-06, + "loss": 0.68501019, + "num_input_tokens_seen": 226972595, + "step": 10529, + "time_per_iteration": 2.572624683380127 + }, + { + "auxiliary_loss_clip": 0.01052445, + "auxiliary_loss_mlp": 0.01029981, + "balance_loss_clip": 1.02412534, + "balance_loss_mlp": 1.01806593, + "epoch": 0.6330978505937171, + "flos": 21538825257600.0, + "grad_norm": 1.8470537114429804, + "language_loss": 0.74974078, + "learning_rate": 1.2534276326802092e-06, + "loss": 0.77056503, + "num_input_tokens_seen": 226991910, + "step": 10530, + "time_per_iteration": 2.650045394897461 + }, + { + "auxiliary_loss_clip": 0.01057639, + "auxiliary_loss_mlp": 0.00747421, + "balance_loss_clip": 1.02789116, + "balance_loss_mlp": 1.00051737, + "epoch": 0.6331579738463851, + "flos": 25009484054400.0, + "grad_norm": 1.4188193576994423, + "language_loss": 0.7370466, + "learning_rate": 1.2530663369101259e-06, + "loss": 0.75509715, + "num_input_tokens_seen": 227010175, + "step": 10531, + "time_per_iteration": 2.662992000579834 + }, + { + "auxiliary_loss_clip": 0.01032598, + "auxiliary_loss_mlp": 0.01027476, + "balance_loss_clip": 1.02525973, + "balance_loss_mlp": 1.01680088, + "epoch": 0.6332180970990531, + "flos": 14976007228800.0, + "grad_norm": 2.1222045507503555, + "language_loss": 0.79812849, + "learning_rate": 1.2527050694638432e-06, + "loss": 0.81872928, + "num_input_tokens_seen": 227025540, + "step": 10532, + "time_per_iteration": 2.649885416030884 + }, + { + "auxiliary_loss_clip": 0.0105297, + "auxiliary_loss_mlp": 0.01028772, + "balance_loss_clip": 1.02486634, + "balance_loss_mlp": 1.0192827, + "epoch": 0.633278220351721, + "flos": 22706963458560.0, + "grad_norm": 1.5806822915769196, + "language_loss": 0.74569201, + "learning_rate": 1.2523438303550582e-06, + "loss": 0.76650947, + "num_input_tokens_seen": 227045520, + "step": 10533, + "time_per_iteration": 2.7129459381103516 + }, + { + "auxiliary_loss_clip": 0.01051306, + "auxiliary_loss_mlp": 0.01033145, + "balance_loss_clip": 1.0284766, + "balance_loss_mlp": 1.02110505, + "epoch": 0.633338343604389, + "flos": 12602922364800.0, + "grad_norm": 2.6552613298824155, + "language_loss": 0.76876783, + "learning_rate": 1.2519826195974706e-06, + "loss": 0.78961235, + "num_input_tokens_seen": 227059420, + "step": 10534, + "time_per_iteration": 2.6508796215057373 + }, + { + "auxiliary_loss_clip": 0.0102678, + "auxiliary_loss_mlp": 0.0103846, + "balance_loss_clip": 1.02312183, + "balance_loss_mlp": 1.02635419, + "epoch": 0.6333984668570569, + "flos": 25960111447680.0, + "grad_norm": 1.5055710202346004, + "language_loss": 0.85772014, + "learning_rate": 1.251621437204777e-06, + "loss": 0.87837255, + "num_input_tokens_seen": 227081310, + "step": 10535, + "time_per_iteration": 2.8050973415374756 + }, + { + "auxiliary_loss_clip": 0.01054813, + "auxiliary_loss_mlp": 0.01032034, + "balance_loss_clip": 1.02454913, + "balance_loss_mlp": 1.02036369, + "epoch": 0.6334585901097249, + "flos": 23659242877440.0, + "grad_norm": 2.7158254658648304, + "language_loss": 0.7664625, + "learning_rate": 1.2512602831906733e-06, + "loss": 0.78733099, + "num_input_tokens_seen": 227100365, + "step": 10536, + "time_per_iteration": 2.745163679122925 + }, + { + "auxiliary_loss_clip": 0.01049243, + "auxiliary_loss_mlp": 0.01030772, + "balance_loss_clip": 1.02548623, + "balance_loss_mlp": 1.0190475, + "epoch": 0.633518713362393, + "flos": 28760496503040.0, + "grad_norm": 1.657729785963284, + "language_loss": 0.59922779, + "learning_rate": 1.250899157568855e-06, + "loss": 0.6200279, + "num_input_tokens_seen": 227119680, + "step": 10537, + "time_per_iteration": 2.763209342956543 + }, + { + "auxiliary_loss_clip": 0.009819, + "auxiliary_loss_mlp": 0.01002979, + "balance_loss_clip": 1.00542808, + "balance_loss_mlp": 1.00194156, + "epoch": 0.6335788366150609, + "flos": 70420322401920.0, + "grad_norm": 0.782103619468738, + "language_loss": 0.52440149, + "learning_rate": 1.2505380603530155e-06, + "loss": 0.54425025, + "num_input_tokens_seen": 227184465, + "step": 10538, + "time_per_iteration": 5.146785259246826 + }, + { + "auxiliary_loss_clip": 0.01047401, + "auxiliary_loss_mlp": 0.01030792, + "balance_loss_clip": 1.02520704, + "balance_loss_mlp": 1.01867986, + "epoch": 0.6336389598677289, + "flos": 23732069702400.0, + "grad_norm": 1.901473034900407, + "language_loss": 0.83408201, + "learning_rate": 1.250176991556848e-06, + "loss": 0.854864, + "num_input_tokens_seen": 227202185, + "step": 10539, + "time_per_iteration": 2.711432695388794 + }, + { + "auxiliary_loss_clip": 0.0103639, + "auxiliary_loss_mlp": 0.01030989, + "balance_loss_clip": 1.0234375, + "balance_loss_mlp": 1.01888371, + "epoch": 0.6336990831203968, + "flos": 29276676898560.0, + "grad_norm": 1.618433976771314, + "language_loss": 0.87138021, + "learning_rate": 1.2498159511940438e-06, + "loss": 0.89205396, + "num_input_tokens_seen": 227222020, + "step": 10540, + "time_per_iteration": 2.7902748584747314 + }, + { + "auxiliary_loss_clip": 0.01039954, + "auxiliary_loss_mlp": 0.01029047, + "balance_loss_clip": 1.02365327, + "balance_loss_mlp": 1.01964772, + "epoch": 0.6337592063730648, + "flos": 29096836479360.0, + "grad_norm": 1.5931428817113809, + "language_loss": 0.72050142, + "learning_rate": 1.2494549392782943e-06, + "loss": 0.74119139, + "num_input_tokens_seen": 227240885, + "step": 10541, + "time_per_iteration": 2.786604881286621 + }, + { + "auxiliary_loss_clip": 0.01059129, + "auxiliary_loss_mlp": 0.0103237, + "balance_loss_clip": 1.02720833, + "balance_loss_mlp": 1.02028787, + "epoch": 0.6338193296257327, + "flos": 34706477249280.0, + "grad_norm": 1.8950413717042027, + "language_loss": 0.84940988, + "learning_rate": 1.2490939558232887e-06, + "loss": 0.87032485, + "num_input_tokens_seen": 227257880, + "step": 10542, + "time_per_iteration": 2.756699323654175 + }, + { + "auxiliary_loss_clip": 0.01055437, + "auxiliary_loss_mlp": 0.01025544, + "balance_loss_clip": 1.02618957, + "balance_loss_mlp": 1.01334929, + "epoch": 0.6338794528784008, + "flos": 16687581269760.0, + "grad_norm": 1.839482152602784, + "language_loss": 0.77678561, + "learning_rate": 1.2487330008427153e-06, + "loss": 0.79759544, + "num_input_tokens_seen": 227274840, + "step": 10543, + "time_per_iteration": 2.617631673812866 + }, + { + "auxiliary_loss_clip": 0.01022349, + "auxiliary_loss_mlp": 0.01034904, + "balance_loss_clip": 1.02516007, + "balance_loss_mlp": 1.02435434, + "epoch": 0.6339395761310687, + "flos": 22346600261760.0, + "grad_norm": 1.6526530755595796, + "language_loss": 0.73359799, + "learning_rate": 1.2483720743502618e-06, + "loss": 0.75417048, + "num_input_tokens_seen": 227294835, + "step": 10544, + "time_per_iteration": 2.7200496196746826 + }, + { + "auxiliary_loss_clip": 0.01038537, + "auxiliary_loss_mlp": 0.01032997, + "balance_loss_clip": 1.02675092, + "balance_loss_mlp": 1.02152908, + "epoch": 0.6339996993837367, + "flos": 18551812112640.0, + "grad_norm": 1.808403823200606, + "language_loss": 0.68759543, + "learning_rate": 1.2480111763596144e-06, + "loss": 0.70831078, + "num_input_tokens_seen": 227314935, + "step": 10545, + "time_per_iteration": 2.7963578701019287 + }, + { + "auxiliary_loss_clip": 0.01041768, + "auxiliary_loss_mlp": 0.01031769, + "balance_loss_clip": 1.02334702, + "balance_loss_mlp": 1.0201695, + "epoch": 0.6340598226364046, + "flos": 12969498614400.0, + "grad_norm": 1.9903206524097345, + "language_loss": 0.71328402, + "learning_rate": 1.2476503068844592e-06, + "loss": 0.7340194, + "num_input_tokens_seen": 227332905, + "step": 10546, + "time_per_iteration": 2.781238079071045 + }, + { + "auxiliary_loss_clip": 0.01053618, + "auxiliary_loss_mlp": 0.01027935, + "balance_loss_clip": 1.02680016, + "balance_loss_mlp": 1.01764154, + "epoch": 0.6341199458890726, + "flos": 26687984647680.0, + "grad_norm": 1.3033384607332308, + "language_loss": 0.78284961, + "learning_rate": 1.2472894659384792e-06, + "loss": 0.80366516, + "num_input_tokens_seen": 227354915, + "step": 10547, + "time_per_iteration": 2.7055721282958984 + }, + { + "auxiliary_loss_clip": 0.01019602, + "auxiliary_loss_mlp": 0.01032903, + "balance_loss_clip": 1.02206874, + "balance_loss_mlp": 1.02163792, + "epoch": 0.6341800691417405, + "flos": 18734274224640.0, + "grad_norm": 1.8341355752448731, + "language_loss": 0.62967104, + "learning_rate": 1.2469286535353578e-06, + "loss": 0.65019608, + "num_input_tokens_seen": 227372990, + "step": 10548, + "time_per_iteration": 2.95082950592041 + }, + { + "auxiliary_loss_clip": 0.01032741, + "auxiliary_loss_mlp": 0.01029755, + "balance_loss_clip": 1.0214479, + "balance_loss_mlp": 1.01887691, + "epoch": 0.6342401923944085, + "flos": 26249443499520.0, + "grad_norm": 1.755206839359631, + "language_loss": 0.61653513, + "learning_rate": 1.2465678696887785e-06, + "loss": 0.63716006, + "num_input_tokens_seen": 227393270, + "step": 10549, + "time_per_iteration": 2.716489315032959 + }, + { + "auxiliary_loss_clip": 0.01025947, + "auxiliary_loss_mlp": 0.01029146, + "balance_loss_clip": 1.02672791, + "balance_loss_mlp": 1.01925755, + "epoch": 0.6343003156470765, + "flos": 24680937329280.0, + "grad_norm": 2.660776605925485, + "language_loss": 0.7327435, + "learning_rate": 1.2462071144124197e-06, + "loss": 0.75329447, + "num_input_tokens_seen": 227413630, + "step": 10550, + "time_per_iteration": 2.756345272064209 + }, + { + "auxiliary_loss_clip": 0.00983681, + "auxiliary_loss_mlp": 0.01009232, + "balance_loss_clip": 1.00678158, + "balance_loss_mlp": 1.00800371, + "epoch": 0.6343604388997445, + "flos": 69805352626560.0, + "grad_norm": 0.6984404130243169, + "language_loss": 0.57711363, + "learning_rate": 1.2458463877199638e-06, + "loss": 0.59704274, + "num_input_tokens_seen": 227476630, + "step": 10551, + "time_per_iteration": 3.24796199798584 + }, + { + "auxiliary_loss_clip": 0.01034191, + "auxiliary_loss_mlp": 0.01026432, + "balance_loss_clip": 1.02557349, + "balance_loss_mlp": 1.01706243, + "epoch": 0.6344205621524125, + "flos": 21982430223360.0, + "grad_norm": 1.7178017166972297, + "language_loss": 0.66629565, + "learning_rate": 1.2454856896250881e-06, + "loss": 0.68690187, + "num_input_tokens_seen": 227496060, + "step": 10552, + "time_per_iteration": 2.7517776489257812 + }, + { + "auxiliary_loss_clip": 0.01032576, + "auxiliary_loss_mlp": 0.01029248, + "balance_loss_clip": 1.02408171, + "balance_loss_mlp": 1.01713073, + "epoch": 0.6344806854050804, + "flos": 20448865008000.0, + "grad_norm": 1.8972409805538486, + "language_loss": 0.81836027, + "learning_rate": 1.24512502014147e-06, + "loss": 0.83897853, + "num_input_tokens_seen": 227513440, + "step": 10553, + "time_per_iteration": 2.751365900039673 + }, + { + "auxiliary_loss_clip": 0.01055353, + "auxiliary_loss_mlp": 0.01028281, + "balance_loss_clip": 1.02456331, + "balance_loss_mlp": 1.01729584, + "epoch": 0.6345408086577484, + "flos": 40510611187200.0, + "grad_norm": 1.756288099451159, + "language_loss": 0.54715502, + "learning_rate": 1.2447643792827879e-06, + "loss": 0.56799132, + "num_input_tokens_seen": 227535395, + "step": 10554, + "time_per_iteration": 2.9020745754241943 + }, + { + "auxiliary_loss_clip": 0.01047256, + "auxiliary_loss_mlp": 0.01030377, + "balance_loss_clip": 1.02692664, + "balance_loss_mlp": 1.01949382, + "epoch": 0.6346009319104163, + "flos": 21361319222400.0, + "grad_norm": 1.7139769370537652, + "language_loss": 0.7075175, + "learning_rate": 1.2444037670627153e-06, + "loss": 0.7282939, + "num_input_tokens_seen": 227554545, + "step": 10555, + "time_per_iteration": 2.7134196758270264 + }, + { + "auxiliary_loss_clip": 0.00987897, + "auxiliary_loss_mlp": 0.01002137, + "balance_loss_clip": 1.00175929, + "balance_loss_mlp": 1.00105858, + "epoch": 0.6346610551630844, + "flos": 71365419100800.0, + "grad_norm": 0.7806741620014928, + "language_loss": 0.55350256, + "learning_rate": 1.2440431834949276e-06, + "loss": 0.57340288, + "num_input_tokens_seen": 227608575, + "step": 10556, + "time_per_iteration": 3.1749398708343506 + }, + { + "auxiliary_loss_clip": 0.01044356, + "auxiliary_loss_mlp": 0.01032133, + "balance_loss_clip": 1.02384043, + "balance_loss_mlp": 1.01984811, + "epoch": 0.6347211784157523, + "flos": 25411504049280.0, + "grad_norm": 1.604251972639413, + "language_loss": 0.67913562, + "learning_rate": 1.2436826285930985e-06, + "loss": 0.69990051, + "num_input_tokens_seen": 227628175, + "step": 10557, + "time_per_iteration": 2.820183753967285 + }, + { + "auxiliary_loss_clip": 0.01035549, + "auxiliary_loss_mlp": 0.01028057, + "balance_loss_clip": 1.02404571, + "balance_loss_mlp": 1.01688671, + "epoch": 0.6347813016684203, + "flos": 15742735966080.0, + "grad_norm": 1.5447860810700715, + "language_loss": 0.70075905, + "learning_rate": 1.2433221023709002e-06, + "loss": 0.72139513, + "num_input_tokens_seen": 227645330, + "step": 10558, + "time_per_iteration": 2.672229766845703 + }, + { + "auxiliary_loss_clip": 0.01034496, + "auxiliary_loss_mlp": 0.0102967, + "balance_loss_clip": 1.02371526, + "balance_loss_mlp": 1.01849461, + "epoch": 0.6348414249210882, + "flos": 21464777370240.0, + "grad_norm": 1.5001791376832694, + "language_loss": 0.78136367, + "learning_rate": 1.2429616048420031e-06, + "loss": 0.80200541, + "num_input_tokens_seen": 227665250, + "step": 10559, + "time_per_iteration": 2.7998745441436768 + }, + { + "auxiliary_loss_clip": 0.01033523, + "auxiliary_loss_mlp": 0.01033557, + "balance_loss_clip": 1.02226138, + "balance_loss_mlp": 1.02167177, + "epoch": 0.6349015481737562, + "flos": 21653057485440.0, + "grad_norm": 1.6569062522767455, + "language_loss": 0.68162763, + "learning_rate": 1.242601136020078e-06, + "loss": 0.70229846, + "num_input_tokens_seen": 227685070, + "step": 10560, + "time_per_iteration": 2.790504217147827 + }, + { + "auxiliary_loss_clip": 0.01037585, + "auxiliary_loss_mlp": 0.01034376, + "balance_loss_clip": 1.02371526, + "balance_loss_mlp": 1.02196026, + "epoch": 0.6349616714264241, + "flos": 22194984954240.0, + "grad_norm": 2.0872598165906235, + "language_loss": 0.76999909, + "learning_rate": 1.2422406959187939e-06, + "loss": 0.79071867, + "num_input_tokens_seen": 227704430, + "step": 10561, + "time_per_iteration": 2.764777421951294 + }, + { + "auxiliary_loss_clip": 0.01042931, + "auxiliary_loss_mlp": 0.01029383, + "balance_loss_clip": 1.02409005, + "balance_loss_mlp": 1.01845717, + "epoch": 0.6350217946790921, + "flos": 25410354814080.0, + "grad_norm": 3.3401564373090773, + "language_loss": 0.71908045, + "learning_rate": 1.2418802845518178e-06, + "loss": 0.73980361, + "num_input_tokens_seen": 227724920, + "step": 10562, + "time_per_iteration": 2.7097349166870117 + }, + { + "auxiliary_loss_clip": 0.01058857, + "auxiliary_loss_mlp": 0.01027038, + "balance_loss_clip": 1.02787924, + "balance_loss_mlp": 1.0147891, + "epoch": 0.63508191793176, + "flos": 19718944732800.0, + "grad_norm": 2.0864143352494637, + "language_loss": 0.81300235, + "learning_rate": 1.2415199019328185e-06, + "loss": 0.83386129, + "num_input_tokens_seen": 227743400, + "step": 10563, + "time_per_iteration": 4.263977766036987 + }, + { + "auxiliary_loss_clip": 0.01038093, + "auxiliary_loss_mlp": 0.01033833, + "balance_loss_clip": 1.02679145, + "balance_loss_mlp": 1.02237082, + "epoch": 0.6351420411844281, + "flos": 18186923802240.0, + "grad_norm": 2.1696682291593214, + "language_loss": 0.81388408, + "learning_rate": 1.2411595480754597e-06, + "loss": 0.83460337, + "num_input_tokens_seen": 227759990, + "step": 10564, + "time_per_iteration": 4.410966873168945 + }, + { + "auxiliary_loss_clip": 0.0104075, + "auxiliary_loss_mlp": 0.01032419, + "balance_loss_clip": 1.02616227, + "balance_loss_mlp": 1.01929975, + "epoch": 0.6352021644370961, + "flos": 33726511422720.0, + "grad_norm": 1.4426501741485074, + "language_loss": 0.72522223, + "learning_rate": 1.240799222993407e-06, + "loss": 0.74595392, + "num_input_tokens_seen": 227780835, + "step": 10565, + "time_per_iteration": 2.7491018772125244 + }, + { + "auxiliary_loss_clip": 0.01057675, + "auxiliary_loss_mlp": 0.01030878, + "balance_loss_clip": 1.0277828, + "balance_loss_mlp": 1.01834285, + "epoch": 0.635262287689764, + "flos": 20374781207040.0, + "grad_norm": 1.88447741146965, + "language_loss": 0.69099772, + "learning_rate": 1.240438926700324e-06, + "loss": 0.71188325, + "num_input_tokens_seen": 227798580, + "step": 10566, + "time_per_iteration": 2.7491910457611084 + }, + { + "auxiliary_loss_clip": 0.01051765, + "auxiliary_loss_mlp": 0.01026668, + "balance_loss_clip": 1.02513206, + "balance_loss_mlp": 1.01640439, + "epoch": 0.635322410942432, + "flos": 27525421307520.0, + "grad_norm": 1.653153123897305, + "language_loss": 0.69733572, + "learning_rate": 1.2400786592098725e-06, + "loss": 0.71811998, + "num_input_tokens_seen": 227819210, + "step": 10567, + "time_per_iteration": 2.7267916202545166 + }, + { + "auxiliary_loss_clip": 0.01052864, + "auxiliary_loss_mlp": 0.01027756, + "balance_loss_clip": 1.02594757, + "balance_loss_mlp": 1.01730108, + "epoch": 0.6353825341950999, + "flos": 21543601766400.0, + "grad_norm": 2.718456519297766, + "language_loss": 0.84645021, + "learning_rate": 1.2397184205357154e-06, + "loss": 0.8672564, + "num_input_tokens_seen": 227838340, + "step": 10568, + "time_per_iteration": 2.6466803550720215 + }, + { + "auxiliary_loss_clip": 0.01009687, + "auxiliary_loss_mlp": 0.01038771, + "balance_loss_clip": 1.02383828, + "balance_loss_mlp": 1.02679634, + "epoch": 0.635442657447768, + "flos": 31759756185600.0, + "grad_norm": 1.605899988828761, + "language_loss": 0.83682787, + "learning_rate": 1.2393582106915113e-06, + "loss": 0.8573125, + "num_input_tokens_seen": 227859170, + "step": 10569, + "time_per_iteration": 2.8115432262420654 + }, + { + "auxiliary_loss_clip": 0.01052013, + "auxiliary_loss_mlp": 0.01024995, + "balance_loss_clip": 1.02483308, + "balance_loss_mlp": 1.01423013, + "epoch": 0.6355027807004359, + "flos": 19828831415040.0, + "grad_norm": 1.7949240582164419, + "language_loss": 0.69115949, + "learning_rate": 1.2389980296909198e-06, + "loss": 0.71192956, + "num_input_tokens_seen": 227878545, + "step": 10570, + "time_per_iteration": 2.597620964050293 + }, + { + "auxiliary_loss_clip": 0.01056124, + "auxiliary_loss_mlp": 0.01031331, + "balance_loss_clip": 1.02492547, + "balance_loss_mlp": 1.01958907, + "epoch": 0.6355629039531039, + "flos": 30372383324160.0, + "grad_norm": 1.6615481221997297, + "language_loss": 0.65733856, + "learning_rate": 1.2386378775476e-06, + "loss": 0.67821312, + "num_input_tokens_seen": 227898875, + "step": 10571, + "time_per_iteration": 2.8083276748657227 + }, + { + "auxiliary_loss_clip": 0.01059869, + "auxiliary_loss_mlp": 0.01027636, + "balance_loss_clip": 1.02816296, + "balance_loss_mlp": 1.0164243, + "epoch": 0.6356230272057718, + "flos": 17932065828480.0, + "grad_norm": 1.6793619386930938, + "language_loss": 0.71138966, + "learning_rate": 1.2382777542752074e-06, + "loss": 0.73226464, + "num_input_tokens_seen": 227917130, + "step": 10572, + "time_per_iteration": 2.6266016960144043 + }, + { + "auxiliary_loss_clip": 0.01033424, + "auxiliary_loss_mlp": 0.01027523, + "balance_loss_clip": 1.02565646, + "balance_loss_mlp": 1.01709831, + "epoch": 0.6356831504584398, + "flos": 25375844822400.0, + "grad_norm": 1.849574843544003, + "language_loss": 0.81476164, + "learning_rate": 1.2379176598873992e-06, + "loss": 0.83537114, + "num_input_tokens_seen": 227939550, + "step": 10573, + "time_per_iteration": 2.9211325645446777 + }, + { + "auxiliary_loss_clip": 0.0104645, + "auxiliary_loss_mlp": 0.01028837, + "balance_loss_clip": 1.02627349, + "balance_loss_mlp": 1.01808989, + "epoch": 0.6357432737111077, + "flos": 46500331720320.0, + "grad_norm": 1.524614557548826, + "language_loss": 0.68529826, + "learning_rate": 1.2375575943978303e-06, + "loss": 0.70605111, + "num_input_tokens_seen": 227962200, + "step": 10574, + "time_per_iteration": 2.9899520874023438 + }, + { + "auxiliary_loss_clip": 0.01064902, + "auxiliary_loss_mlp": 0.01029459, + "balance_loss_clip": 1.02653074, + "balance_loss_mlp": 1.01884985, + "epoch": 0.6358033969637757, + "flos": 17274361847040.0, + "grad_norm": 2.504611521730341, + "language_loss": 0.86785674, + "learning_rate": 1.2371975578201525e-06, + "loss": 0.88880032, + "num_input_tokens_seen": 227979270, + "step": 10575, + "time_per_iteration": 2.610758066177368 + }, + { + "auxiliary_loss_clip": 0.01063704, + "auxiliary_loss_mlp": 0.01029492, + "balance_loss_clip": 1.02600849, + "balance_loss_mlp": 1.01874506, + "epoch": 0.6358635202164437, + "flos": 27125520215040.0, + "grad_norm": 1.5695427646522329, + "language_loss": 0.72169858, + "learning_rate": 1.2368375501680204e-06, + "loss": 0.74263054, + "num_input_tokens_seen": 228000550, + "step": 10576, + "time_per_iteration": 4.310005187988281 + }, + { + "auxiliary_loss_clip": 0.01044547, + "auxiliary_loss_mlp": 0.01029791, + "balance_loss_clip": 1.02502561, + "balance_loss_mlp": 1.01855612, + "epoch": 0.6359236434691117, + "flos": 27525205825920.0, + "grad_norm": 1.522128239481714, + "language_loss": 0.69717723, + "learning_rate": 1.236477571455085e-06, + "loss": 0.7179206, + "num_input_tokens_seen": 228022005, + "step": 10577, + "time_per_iteration": 2.704169750213623 + }, + { + "auxiliary_loss_clip": 0.01023985, + "auxiliary_loss_mlp": 0.01028038, + "balance_loss_clip": 1.02533031, + "balance_loss_mlp": 1.01752949, + "epoch": 0.6359837667217797, + "flos": 39348290989440.0, + "grad_norm": 1.646921478598858, + "language_loss": 0.72416234, + "learning_rate": 1.2361176216949964e-06, + "loss": 0.74468261, + "num_input_tokens_seen": 228043770, + "step": 10578, + "time_per_iteration": 3.044302225112915 + }, + { + "auxiliary_loss_clip": 0.00985429, + "auxiliary_loss_mlp": 0.00747134, + "balance_loss_clip": 1.00698388, + "balance_loss_mlp": 1.00141299, + "epoch": 0.6360438899744476, + "flos": 56413797206400.0, + "grad_norm": 0.7023432798956369, + "language_loss": 0.54493672, + "learning_rate": 1.2357577009014044e-06, + "loss": 0.5622623, + "num_input_tokens_seen": 228104985, + "step": 10579, + "time_per_iteration": 3.3923254013061523 + }, + { + "auxiliary_loss_clip": 0.01039797, + "auxiliary_loss_mlp": 0.01025247, + "balance_loss_clip": 1.02358413, + "balance_loss_mlp": 1.01402342, + "epoch": 0.6361040132271156, + "flos": 24973106555520.0, + "grad_norm": 1.5300166956699481, + "language_loss": 0.7759524, + "learning_rate": 1.2353978090879568e-06, + "loss": 0.79660285, + "num_input_tokens_seen": 228125620, + "step": 10580, + "time_per_iteration": 2.7403159141540527 + }, + { + "auxiliary_loss_clip": 0.01034282, + "auxiliary_loss_mlp": 0.00747616, + "balance_loss_clip": 1.02488029, + "balance_loss_mlp": 1.00054884, + "epoch": 0.6361641364797835, + "flos": 23259198130560.0, + "grad_norm": 2.1420899279178607, + "language_loss": 0.66557693, + "learning_rate": 1.235037946268301e-06, + "loss": 0.68339592, + "num_input_tokens_seen": 228143495, + "step": 10581, + "time_per_iteration": 2.7414064407348633 + }, + { + "auxiliary_loss_clip": 0.01053099, + "auxiliary_loss_mlp": 0.01026386, + "balance_loss_clip": 1.02435923, + "balance_loss_mlp": 1.01606262, + "epoch": 0.6362242597324516, + "flos": 25994513698560.0, + "grad_norm": 1.3719198795852372, + "language_loss": 0.6857217, + "learning_rate": 1.2346781124560828e-06, + "loss": 0.7065165, + "num_input_tokens_seen": 228166500, + "step": 10582, + "time_per_iteration": 2.7020821571350098 + }, + { + "auxiliary_loss_clip": 0.01047197, + "auxiliary_loss_mlp": 0.0103304, + "balance_loss_clip": 1.02664483, + "balance_loss_mlp": 1.02221608, + "epoch": 0.6362843829851195, + "flos": 25703242312320.0, + "grad_norm": 1.8993345763683411, + "language_loss": 0.84587079, + "learning_rate": 1.2343183076649473e-06, + "loss": 0.86667317, + "num_input_tokens_seen": 228185325, + "step": 10583, + "time_per_iteration": 2.72709584236145 + }, + { + "auxiliary_loss_clip": 0.0104562, + "auxiliary_loss_mlp": 0.01026569, + "balance_loss_clip": 1.02640581, + "balance_loss_mlp": 1.01611996, + "epoch": 0.6363445062377875, + "flos": 20522912895360.0, + "grad_norm": 1.6400604631252527, + "language_loss": 0.75191963, + "learning_rate": 1.233958531908538e-06, + "loss": 0.77264154, + "num_input_tokens_seen": 228204050, + "step": 10584, + "time_per_iteration": 2.676575183868408 + }, + { + "auxiliary_loss_clip": 0.01039597, + "auxiliary_loss_mlp": 0.01031198, + "balance_loss_clip": 1.02615333, + "balance_loss_mlp": 1.01896703, + "epoch": 0.6364046294904554, + "flos": 19463799450240.0, + "grad_norm": 1.8232159633106368, + "language_loss": 0.72712016, + "learning_rate": 1.2335987852004985e-06, + "loss": 0.74782813, + "num_input_tokens_seen": 228222430, + "step": 10585, + "time_per_iteration": 4.423760175704956 + }, + { + "auxiliary_loss_clip": 0.01034223, + "auxiliary_loss_mlp": 0.01029238, + "balance_loss_clip": 1.02494884, + "balance_loss_mlp": 1.01874137, + "epoch": 0.6364647527431234, + "flos": 20995892208000.0, + "grad_norm": 1.8375373690789942, + "language_loss": 0.82741147, + "learning_rate": 1.2332390675544697e-06, + "loss": 0.84804612, + "num_input_tokens_seen": 228241925, + "step": 10586, + "time_per_iteration": 2.7207443714141846 + }, + { + "auxiliary_loss_clip": 0.01052594, + "auxiliary_loss_mlp": 0.01025347, + "balance_loss_clip": 1.02499366, + "balance_loss_mlp": 1.01489234, + "epoch": 0.6365248759957913, + "flos": 25770789838080.0, + "grad_norm": 1.459574797359489, + "language_loss": 0.72373188, + "learning_rate": 1.2328793789840918e-06, + "loss": 0.74451131, + "num_input_tokens_seen": 228262535, + "step": 10587, + "time_per_iteration": 2.771029233932495 + }, + { + "auxiliary_loss_clip": 0.01044563, + "auxiliary_loss_mlp": 0.01029913, + "balance_loss_clip": 1.0266695, + "balance_loss_mlp": 1.01961982, + "epoch": 0.6365849992484593, + "flos": 22455589104000.0, + "grad_norm": 2.0088219782969707, + "language_loss": 0.77145088, + "learning_rate": 1.2325197195030058e-06, + "loss": 0.79219562, + "num_input_tokens_seen": 228281340, + "step": 10588, + "time_per_iteration": 2.8587563037872314 + }, + { + "auxiliary_loss_clip": 0.01018677, + "auxiliary_loss_mlp": 0.01028814, + "balance_loss_clip": 1.02394164, + "balance_loss_mlp": 1.0180856, + "epoch": 0.6366451225011273, + "flos": 19025689265280.0, + "grad_norm": 1.8571501295511283, + "language_loss": 0.79785305, + "learning_rate": 1.2321600891248478e-06, + "loss": 0.81832796, + "num_input_tokens_seen": 228300865, + "step": 10589, + "time_per_iteration": 2.830457925796509 + }, + { + "auxiliary_loss_clip": 0.01040414, + "auxiliary_loss_mlp": 0.01028748, + "balance_loss_clip": 1.02361596, + "balance_loss_mlp": 1.01783442, + "epoch": 0.6367052457537953, + "flos": 25228395492480.0, + "grad_norm": 2.0878582259425156, + "language_loss": 0.67258728, + "learning_rate": 1.231800487863257e-06, + "loss": 0.69327891, + "num_input_tokens_seen": 228320815, + "step": 10590, + "time_per_iteration": 2.7450661659240723 + }, + { + "auxiliary_loss_clip": 0.01058877, + "auxiliary_loss_mlp": 0.01030932, + "balance_loss_clip": 1.02615118, + "balance_loss_mlp": 1.01954794, + "epoch": 0.6367653690064633, + "flos": 19208438686080.0, + "grad_norm": 1.685633074622038, + "language_loss": 0.79443955, + "learning_rate": 1.2314409157318685e-06, + "loss": 0.81533772, + "num_input_tokens_seen": 228339065, + "step": 10591, + "time_per_iteration": 2.592941999435425 + }, + { + "auxiliary_loss_clip": 0.01039749, + "auxiliary_loss_mlp": 0.01026374, + "balance_loss_clip": 1.02374387, + "balance_loss_mlp": 1.01618779, + "epoch": 0.6368254922591312, + "flos": 23546806329600.0, + "grad_norm": 1.448638167321253, + "language_loss": 0.89069015, + "learning_rate": 1.231081372744317e-06, + "loss": 0.91135138, + "num_input_tokens_seen": 228359210, + "step": 10592, + "time_per_iteration": 2.77245831489563 + }, + { + "auxiliary_loss_clip": 0.01046665, + "auxiliary_loss_mlp": 0.01023681, + "balance_loss_clip": 1.0221858, + "balance_loss_mlp": 1.01417959, + "epoch": 0.6368856155117992, + "flos": 26467313443200.0, + "grad_norm": 1.4194449170973664, + "language_loss": 0.68116009, + "learning_rate": 1.2307218589142376e-06, + "loss": 0.70186353, + "num_input_tokens_seen": 228379630, + "step": 10593, + "time_per_iteration": 2.6836767196655273 + }, + { + "auxiliary_loss_clip": 0.0100747, + "auxiliary_loss_mlp": 0.01033324, + "balance_loss_clip": 1.01816082, + "balance_loss_mlp": 1.02201724, + "epoch": 0.6369457387644671, + "flos": 33692432394240.0, + "grad_norm": 1.7267471018069207, + "language_loss": 0.63371575, + "learning_rate": 1.2303623742552618e-06, + "loss": 0.65412372, + "num_input_tokens_seen": 228401410, + "step": 10594, + "time_per_iteration": 2.92254638671875 + }, + { + "auxiliary_loss_clip": 0.01002046, + "auxiliary_loss_mlp": 0.01005475, + "balance_loss_clip": 1.00562453, + "balance_loss_mlp": 1.00434256, + "epoch": 0.6370058620171352, + "flos": 70908600908160.0, + "grad_norm": 0.7637956562424588, + "language_loss": 0.54673958, + "learning_rate": 1.230002918781022e-06, + "loss": 0.56681478, + "num_input_tokens_seen": 228470335, + "step": 10595, + "time_per_iteration": 3.4035544395446777 + }, + { + "auxiliary_loss_clip": 0.01066904, + "auxiliary_loss_mlp": 0.01032708, + "balance_loss_clip": 1.02683878, + "balance_loss_mlp": 1.02116835, + "epoch": 0.6370659852698031, + "flos": 21141940907520.0, + "grad_norm": 1.8567697779994896, + "language_loss": 0.66546786, + "learning_rate": 1.2296434925051493e-06, + "loss": 0.68646401, + "num_input_tokens_seen": 228490765, + "step": 10596, + "time_per_iteration": 2.6789286136627197 + }, + { + "auxiliary_loss_clip": 0.01046971, + "auxiliary_loss_mlp": 0.01031145, + "balance_loss_clip": 1.02521539, + "balance_loss_mlp": 1.02004063, + "epoch": 0.6371261085224711, + "flos": 20193288762240.0, + "grad_norm": 2.022969236267269, + "language_loss": 0.78886652, + "learning_rate": 1.2292840954412718e-06, + "loss": 0.80964768, + "num_input_tokens_seen": 228509700, + "step": 10597, + "time_per_iteration": 2.7112176418304443 + }, + { + "auxiliary_loss_clip": 0.01054481, + "auxiliary_loss_mlp": 0.01030575, + "balance_loss_clip": 1.02616048, + "balance_loss_mlp": 1.0208174, + "epoch": 0.637186231775139, + "flos": 19683536901120.0, + "grad_norm": 1.595949162867922, + "language_loss": 0.74764985, + "learning_rate": 1.2289247276030189e-06, + "loss": 0.76850033, + "num_input_tokens_seen": 228529050, + "step": 10598, + "time_per_iteration": 2.777797222137451 + }, + { + "auxiliary_loss_clip": 0.01032702, + "auxiliary_loss_mlp": 0.00747517, + "balance_loss_clip": 1.02397597, + "balance_loss_mlp": 1.00047278, + "epoch": 0.637246355027807, + "flos": 13071196995840.0, + "grad_norm": 3.0949429849467314, + "language_loss": 0.68242669, + "learning_rate": 1.2285653890040176e-06, + "loss": 0.70022887, + "num_input_tokens_seen": 228544665, + "step": 10599, + "time_per_iteration": 2.7807116508483887 + }, + { + "auxiliary_loss_clip": 0.01022676, + "auxiliary_loss_mlp": 0.01032449, + "balance_loss_clip": 1.02255011, + "balance_loss_mlp": 1.02052808, + "epoch": 0.6373064782804749, + "flos": 18222654856320.0, + "grad_norm": 20.480370734558942, + "language_loss": 0.81009924, + "learning_rate": 1.2282060796578942e-06, + "loss": 0.83065045, + "num_input_tokens_seen": 228562060, + "step": 10600, + "time_per_iteration": 2.982860565185547 + }, + { + "auxiliary_loss_clip": 0.01050306, + "auxiliary_loss_mlp": 0.01029231, + "balance_loss_clip": 1.02296436, + "balance_loss_mlp": 1.01872325, + "epoch": 0.637366601533143, + "flos": 24498475217280.0, + "grad_norm": 1.4393082052830046, + "language_loss": 0.79996687, + "learning_rate": 1.2278467995782732e-06, + "loss": 0.82076222, + "num_input_tokens_seen": 228582550, + "step": 10601, + "time_per_iteration": 2.6875362396240234 + }, + { + "auxiliary_loss_clip": 0.01034459, + "auxiliary_loss_mlp": 0.01023467, + "balance_loss_clip": 1.02536106, + "balance_loss_mlp": 1.01289296, + "epoch": 0.6374267247858109, + "flos": 26359042872960.0, + "grad_norm": 1.9885529636355324, + "language_loss": 0.66896063, + "learning_rate": 1.2274875487787797e-06, + "loss": 0.68953991, + "num_input_tokens_seen": 228604960, + "step": 10602, + "time_per_iteration": 2.7192962169647217 + }, + { + "auxiliary_loss_clip": 0.00987128, + "auxiliary_loss_mlp": 0.01029031, + "balance_loss_clip": 1.01904106, + "balance_loss_mlp": 1.01745534, + "epoch": 0.6374868480384789, + "flos": 20371728551040.0, + "grad_norm": 1.6802999691852887, + "language_loss": 0.79707205, + "learning_rate": 1.2271283272730354e-06, + "loss": 0.81723368, + "num_input_tokens_seen": 228622195, + "step": 10603, + "time_per_iteration": 2.918163299560547 + }, + { + "auxiliary_loss_clip": 0.01022337, + "auxiliary_loss_mlp": 0.00747489, + "balance_loss_clip": 1.02314138, + "balance_loss_mlp": 1.00052786, + "epoch": 0.6375469712911469, + "flos": 20996251344000.0, + "grad_norm": 1.7253372290912057, + "language_loss": 0.76855499, + "learning_rate": 1.2267691350746621e-06, + "loss": 0.78625327, + "num_input_tokens_seen": 228639735, + "step": 10604, + "time_per_iteration": 2.706305503845215 + }, + { + "auxiliary_loss_clip": 0.01044345, + "auxiliary_loss_mlp": 0.0102664, + "balance_loss_clip": 1.02400732, + "balance_loss_mlp": 1.01589966, + "epoch": 0.6376070945438148, + "flos": 19715748422400.0, + "grad_norm": 1.6563891922150211, + "language_loss": 0.76915669, + "learning_rate": 1.226409972197281e-06, + "loss": 0.78986645, + "num_input_tokens_seen": 228658195, + "step": 10605, + "time_per_iteration": 2.652113199234009 + }, + { + "auxiliary_loss_clip": 0.0100825, + "auxiliary_loss_mlp": 0.01031599, + "balance_loss_clip": 1.02318227, + "balance_loss_mlp": 1.0177052, + "epoch": 0.6376672177964828, + "flos": 21506757390720.0, + "grad_norm": 1.8211733577690503, + "language_loss": 0.65744269, + "learning_rate": 1.2260508386545106e-06, + "loss": 0.67784119, + "num_input_tokens_seen": 228677415, + "step": 10606, + "time_per_iteration": 2.876072645187378 + }, + { + "auxiliary_loss_clip": 0.01040445, + "auxiliary_loss_mlp": 0.01031862, + "balance_loss_clip": 1.02450907, + "balance_loss_mlp": 1.02215266, + "epoch": 0.6377273410491507, + "flos": 18843873598080.0, + "grad_norm": 1.5472686708488321, + "language_loss": 0.75695306, + "learning_rate": 1.225691734459971e-06, + "loss": 0.77767611, + "num_input_tokens_seen": 228696450, + "step": 10607, + "time_per_iteration": 2.6877963542938232 + }, + { + "auxiliary_loss_clip": 0.01044625, + "auxiliary_loss_mlp": 0.01034017, + "balance_loss_clip": 1.02548218, + "balance_loss_mlp": 1.0232048, + "epoch": 0.6377874643018188, + "flos": 53062970181120.0, + "grad_norm": 1.4846083319671313, + "language_loss": 0.65904915, + "learning_rate": 1.225332659627278e-06, + "loss": 0.67983556, + "num_input_tokens_seen": 228721600, + "step": 10608, + "time_per_iteration": 2.9539613723754883 + }, + { + "auxiliary_loss_clip": 0.00961446, + "auxiliary_loss_mlp": 0.01003808, + "balance_loss_clip": 1.01304936, + "balance_loss_mlp": 1.00244868, + "epoch": 0.6378475875544867, + "flos": 65135026465920.0, + "grad_norm": 0.7273471871019221, + "language_loss": 0.51928967, + "learning_rate": 1.2249736141700475e-06, + "loss": 0.53894222, + "num_input_tokens_seen": 228784535, + "step": 10609, + "time_per_iteration": 3.430729866027832 + }, + { + "auxiliary_loss_clip": 0.01047168, + "auxiliary_loss_mlp": 0.01023164, + "balance_loss_clip": 1.02172494, + "balance_loss_mlp": 1.01357961, + "epoch": 0.6379077108071547, + "flos": 23002759958400.0, + "grad_norm": 1.6593965318403654, + "language_loss": 0.74631482, + "learning_rate": 1.2246145981018965e-06, + "loss": 0.76701814, + "num_input_tokens_seen": 228804110, + "step": 10610, + "time_per_iteration": 4.56810998916626 + }, + { + "auxiliary_loss_clip": 0.00991203, + "auxiliary_loss_mlp": 0.01003512, + "balance_loss_clip": 1.00540614, + "balance_loss_mlp": 1.00239718, + "epoch": 0.6379678340598226, + "flos": 67601947610880.0, + "grad_norm": 0.86860941408354, + "language_loss": 0.6311242, + "learning_rate": 1.2242556114364364e-06, + "loss": 0.65107131, + "num_input_tokens_seen": 228867705, + "step": 10611, + "time_per_iteration": 4.836812734603882 + }, + { + "auxiliary_loss_clip": 0.01053811, + "auxiliary_loss_mlp": 0.01028947, + "balance_loss_clip": 1.02484274, + "balance_loss_mlp": 1.01811719, + "epoch": 0.6380279573124906, + "flos": 29680061610240.0, + "grad_norm": 10.204685468901182, + "language_loss": 0.72000623, + "learning_rate": 1.223896654187282e-06, + "loss": 0.74083382, + "num_input_tokens_seen": 228889215, + "step": 10612, + "time_per_iteration": 2.73356032371521 + }, + { + "auxiliary_loss_clip": 0.00991685, + "auxiliary_loss_mlp": 0.01002002, + "balance_loss_clip": 1.0042311, + "balance_loss_mlp": 1.00091171, + "epoch": 0.6380880805651585, + "flos": 66484046580480.0, + "grad_norm": 0.7088006991295394, + "language_loss": 0.57861745, + "learning_rate": 1.2235377263680446e-06, + "loss": 0.59855437, + "num_input_tokens_seen": 228948465, + "step": 10613, + "time_per_iteration": 3.24054217338562 + }, + { + "auxiliary_loss_clip": 0.01017232, + "auxiliary_loss_mlp": 0.01028764, + "balance_loss_clip": 1.02187407, + "balance_loss_mlp": 1.01730216, + "epoch": 0.6381482038178266, + "flos": 23914998691200.0, + "grad_norm": 1.7841957410002929, + "language_loss": 0.7554431, + "learning_rate": 1.2231788279923334e-06, + "loss": 0.77590305, + "num_input_tokens_seen": 228967955, + "step": 10614, + "time_per_iteration": 2.9685347080230713 + }, + { + "auxiliary_loss_clip": 0.01042279, + "auxiliary_loss_mlp": 0.00747545, + "balance_loss_clip": 1.02501965, + "balance_loss_mlp": 1.00051665, + "epoch": 0.6382083270704945, + "flos": 24243042625920.0, + "grad_norm": 1.7934830203675782, + "language_loss": 0.80115175, + "learning_rate": 1.2228199590737599e-06, + "loss": 0.81905007, + "num_input_tokens_seen": 228985495, + "step": 10615, + "time_per_iteration": 2.850222110748291 + }, + { + "auxiliary_loss_clip": 0.00992136, + "auxiliary_loss_mlp": 0.01003097, + "balance_loss_clip": 1.00547779, + "balance_loss_mlp": 1.00185704, + "epoch": 0.6382684503231625, + "flos": 70775552931840.0, + "grad_norm": 0.6550758704623856, + "language_loss": 0.55634838, + "learning_rate": 1.2224611196259305e-06, + "loss": 0.57630074, + "num_input_tokens_seen": 229052995, + "step": 10616, + "time_per_iteration": 3.412088632583618 + }, + { + "auxiliary_loss_clip": 0.01033082, + "auxiliary_loss_mlp": 0.01032236, + "balance_loss_clip": 1.022048, + "balance_loss_mlp": 1.02117991, + "epoch": 0.6383285735758305, + "flos": 16544836621440.0, + "grad_norm": 1.959639391561221, + "language_loss": 0.84139276, + "learning_rate": 1.2221023096624538e-06, + "loss": 0.86204594, + "num_input_tokens_seen": 229071030, + "step": 10617, + "time_per_iteration": 2.7592320442199707 + }, + { + "auxiliary_loss_clip": 0.01054572, + "auxiliary_loss_mlp": 0.01034383, + "balance_loss_clip": 1.02498317, + "balance_loss_mlp": 1.02288508, + "epoch": 0.6383886968284984, + "flos": 14427651225600.0, + "grad_norm": 2.583577571569376, + "language_loss": 0.87461901, + "learning_rate": 1.221743529196936e-06, + "loss": 0.89550853, + "num_input_tokens_seen": 229088275, + "step": 10618, + "time_per_iteration": 2.7197442054748535 + }, + { + "auxiliary_loss_clip": 0.01018005, + "auxiliary_loss_mlp": 0.01032496, + "balance_loss_clip": 1.02799606, + "balance_loss_mlp": 1.02278638, + "epoch": 0.6384488200811664, + "flos": 17929659617280.0, + "grad_norm": 1.9136085385061534, + "language_loss": 0.73424822, + "learning_rate": 1.2213847782429806e-06, + "loss": 0.75475323, + "num_input_tokens_seen": 229105190, + "step": 10619, + "time_per_iteration": 2.827327251434326 + }, + { + "auxiliary_loss_clip": 0.01039613, + "auxiliary_loss_mlp": 0.01033207, + "balance_loss_clip": 1.02356529, + "balance_loss_mlp": 1.02118468, + "epoch": 0.6385089433338343, + "flos": 18515578268160.0, + "grad_norm": 2.038534910846972, + "language_loss": 0.76253486, + "learning_rate": 1.221026056814193e-06, + "loss": 0.78326303, + "num_input_tokens_seen": 229122290, + "step": 10620, + "time_per_iteration": 2.766780138015747 + }, + { + "auxiliary_loss_clip": 0.01041894, + "auxiliary_loss_mlp": 0.01026682, + "balance_loss_clip": 1.02497745, + "balance_loss_mlp": 1.01614368, + "epoch": 0.6385690665865024, + "flos": 24753620499840.0, + "grad_norm": 2.981958303883768, + "language_loss": 0.7070601, + "learning_rate": 1.2206673649241752e-06, + "loss": 0.72774589, + "num_input_tokens_seen": 229141620, + "step": 10621, + "time_per_iteration": 2.767709970474243 + }, + { + "auxiliary_loss_clip": 0.01037042, + "auxiliary_loss_mlp": 0.01026644, + "balance_loss_clip": 1.0220778, + "balance_loss_mlp": 1.01701784, + "epoch": 0.6386291898391703, + "flos": 20120569678080.0, + "grad_norm": 1.52524225273382, + "language_loss": 0.77892029, + "learning_rate": 1.220308702586529e-06, + "loss": 0.79955715, + "num_input_tokens_seen": 229161570, + "step": 10622, + "time_per_iteration": 2.684302568435669 + }, + { + "auxiliary_loss_clip": 0.01031516, + "auxiliary_loss_mlp": 0.01030261, + "balance_loss_clip": 1.02495968, + "balance_loss_mlp": 1.02021146, + "epoch": 0.6386893130918383, + "flos": 16867278034560.0, + "grad_norm": 1.6905575397787385, + "language_loss": 0.74610102, + "learning_rate": 1.2199500698148546e-06, + "loss": 0.76671886, + "num_input_tokens_seen": 229178465, + "step": 10623, + "time_per_iteration": 2.6741690635681152 + }, + { + "auxiliary_loss_clip": 0.01041069, + "auxiliary_loss_mlp": 0.01026254, + "balance_loss_clip": 1.02418005, + "balance_loss_mlp": 1.0169971, + "epoch": 0.6387494363445062, + "flos": 22966274718720.0, + "grad_norm": 1.45253846402369, + "language_loss": 0.76536667, + "learning_rate": 1.2195914666227527e-06, + "loss": 0.78603995, + "num_input_tokens_seen": 229198975, + "step": 10624, + "time_per_iteration": 4.365555286407471 + }, + { + "auxiliary_loss_clip": 0.01005564, + "auxiliary_loss_mlp": 0.01032923, + "balance_loss_clip": 1.02137864, + "balance_loss_mlp": 1.0216161, + "epoch": 0.6388095595971742, + "flos": 22857716839680.0, + "grad_norm": 1.640008156996928, + "language_loss": 0.80632973, + "learning_rate": 1.21923289302382e-06, + "loss": 0.82671463, + "num_input_tokens_seen": 229218825, + "step": 10625, + "time_per_iteration": 2.7462031841278076 + }, + { + "auxiliary_loss_clip": 0.0104431, + "auxiliary_loss_mlp": 0.01032993, + "balance_loss_clip": 1.02614141, + "balance_loss_mlp": 1.02205002, + "epoch": 0.6388696828498421, + "flos": 17311529445120.0, + "grad_norm": 1.7461943275372482, + "language_loss": 0.73061991, + "learning_rate": 1.218874349031654e-06, + "loss": 0.75139296, + "num_input_tokens_seen": 229236060, + "step": 10626, + "time_per_iteration": 2.685614585876465 + }, + { + "auxiliary_loss_clip": 0.01042075, + "auxiliary_loss_mlp": 0.01028051, + "balance_loss_clip": 1.02338922, + "balance_loss_mlp": 1.01679742, + "epoch": 0.6389298061025102, + "flos": 17128636369920.0, + "grad_norm": 1.9482077920439793, + "language_loss": 0.72293568, + "learning_rate": 1.2185158346598517e-06, + "loss": 0.74363697, + "num_input_tokens_seen": 229255160, + "step": 10627, + "time_per_iteration": 2.671562671661377 + }, + { + "auxiliary_loss_clip": 0.01041509, + "auxiliary_loss_mlp": 0.01030092, + "balance_loss_clip": 1.02759528, + "balance_loss_mlp": 1.01792622, + "epoch": 0.6389899293551781, + "flos": 27710971989120.0, + "grad_norm": 1.6271336670535614, + "language_loss": 0.67006892, + "learning_rate": 1.2181573499220064e-06, + "loss": 0.69078487, + "num_input_tokens_seen": 229278705, + "step": 10628, + "time_per_iteration": 2.8561573028564453 + }, + { + "auxiliary_loss_clip": 0.01059819, + "auxiliary_loss_mlp": 0.01025958, + "balance_loss_clip": 1.02443027, + "balance_loss_mlp": 1.01615262, + "epoch": 0.6390500526078461, + "flos": 21215701486080.0, + "grad_norm": 2.8950681683717834, + "language_loss": 0.677001, + "learning_rate": 1.2177988948317135e-06, + "loss": 0.69785869, + "num_input_tokens_seen": 229299990, + "step": 10629, + "time_per_iteration": 2.6997880935668945 + }, + { + "auxiliary_loss_clip": 0.01029047, + "auxiliary_loss_mlp": 0.01034494, + "balance_loss_clip": 1.02427006, + "balance_loss_mlp": 1.02143514, + "epoch": 0.6391101758605141, + "flos": 21581056673280.0, + "grad_norm": 1.731500201745803, + "language_loss": 0.75237435, + "learning_rate": 1.2174404694025646e-06, + "loss": 0.77300978, + "num_input_tokens_seen": 229319230, + "step": 10630, + "time_per_iteration": 2.7301809787750244 + }, + { + "auxiliary_loss_clip": 0.01033134, + "auxiliary_loss_mlp": 0.01029982, + "balance_loss_clip": 1.02123189, + "balance_loss_mlp": 1.02032018, + "epoch": 0.639170299113182, + "flos": 19900473091200.0, + "grad_norm": 1.5690019391957588, + "language_loss": 0.70299119, + "learning_rate": 1.2170820736481511e-06, + "loss": 0.72362238, + "num_input_tokens_seen": 229338600, + "step": 10631, + "time_per_iteration": 2.7783632278442383 + }, + { + "auxiliary_loss_clip": 0.00990791, + "auxiliary_loss_mlp": 0.01013176, + "balance_loss_clip": 1.00391054, + "balance_loss_mlp": 1.01156652, + "epoch": 0.63923042236585, + "flos": 69877604833920.0, + "grad_norm": 0.7686430513100004, + "language_loss": 0.62930506, + "learning_rate": 1.2167237075820646e-06, + "loss": 0.64934474, + "num_input_tokens_seen": 229402420, + "step": 10632, + "time_per_iteration": 4.982485294342041 + }, + { + "auxiliary_loss_clip": 0.01040305, + "auxiliary_loss_mlp": 0.01030418, + "balance_loss_clip": 1.02350163, + "balance_loss_mlp": 1.01998115, + "epoch": 0.639290545618518, + "flos": 22674823764480.0, + "grad_norm": 1.7403303711409013, + "language_loss": 0.66574931, + "learning_rate": 1.216365371217893e-06, + "loss": 0.68645656, + "num_input_tokens_seen": 229419185, + "step": 10633, + "time_per_iteration": 2.7854959964752197 + }, + { + "auxiliary_loss_clip": 0.01005072, + "auxiliary_loss_mlp": 0.01026626, + "balance_loss_clip": 1.02620816, + "balance_loss_mlp": 1.01650512, + "epoch": 0.639350668871186, + "flos": 19829190551040.0, + "grad_norm": 2.1601725919402956, + "language_loss": 0.8188892, + "learning_rate": 1.216007064569225e-06, + "loss": 0.83920622, + "num_input_tokens_seen": 229436735, + "step": 10634, + "time_per_iteration": 2.865833044052124 + }, + { + "auxiliary_loss_clip": 0.01044267, + "auxiliary_loss_mlp": 0.01029418, + "balance_loss_clip": 1.02507353, + "balance_loss_mlp": 1.01778901, + "epoch": 0.6394107921238539, + "flos": 20553328736640.0, + "grad_norm": 2.428739794945261, + "language_loss": 0.75165117, + "learning_rate": 1.2156487876496483e-06, + "loss": 0.77238804, + "num_input_tokens_seen": 229455595, + "step": 10635, + "time_per_iteration": 2.6768312454223633 + }, + { + "auxiliary_loss_clip": 0.01057055, + "auxiliary_loss_mlp": 0.01029202, + "balance_loss_clip": 1.02710032, + "balance_loss_mlp": 1.01794863, + "epoch": 0.6394709153765219, + "flos": 25774991729280.0, + "grad_norm": 1.7122663813223808, + "language_loss": 0.71401495, + "learning_rate": 1.2152905404727475e-06, + "loss": 0.73487753, + "num_input_tokens_seen": 229476230, + "step": 10636, + "time_per_iteration": 2.683863639831543 + }, + { + "auxiliary_loss_clip": 0.0104635, + "auxiliary_loss_mlp": 0.01031019, + "balance_loss_clip": 1.02591646, + "balance_loss_mlp": 1.01953948, + "epoch": 0.6395310386291898, + "flos": 17530153574400.0, + "grad_norm": 1.95418602195817, + "language_loss": 0.73455626, + "learning_rate": 1.2149323230521085e-06, + "loss": 0.75532997, + "num_input_tokens_seen": 229494300, + "step": 10637, + "time_per_iteration": 2.7230777740478516 + }, + { + "auxiliary_loss_clip": 0.01055837, + "auxiliary_loss_mlp": 0.01028422, + "balance_loss_clip": 1.02596903, + "balance_loss_mlp": 1.01645374, + "epoch": 0.6395911618818578, + "flos": 18588225525120.0, + "grad_norm": 1.8731456595000355, + "language_loss": 0.77638495, + "learning_rate": 1.2145741354013143e-06, + "loss": 0.7972275, + "num_input_tokens_seen": 229512985, + "step": 10638, + "time_per_iteration": 2.634676933288574 + }, + { + "auxiliary_loss_clip": 0.01042332, + "auxiliary_loss_mlp": 0.01028826, + "balance_loss_clip": 1.02462196, + "balance_loss_mlp": 1.01799631, + "epoch": 0.6396512851345257, + "flos": 28366557068160.0, + "grad_norm": 1.4743647635582788, + "language_loss": 0.81771302, + "learning_rate": 1.2142159775339478e-06, + "loss": 0.83842462, + "num_input_tokens_seen": 229534270, + "step": 10639, + "time_per_iteration": 2.691171407699585 + }, + { + "auxiliary_loss_clip": 0.0099146, + "auxiliary_loss_mlp": 0.0100193, + "balance_loss_clip": 1.0048728, + "balance_loss_mlp": 1.0009402, + "epoch": 0.6397114083871938, + "flos": 70724307202560.0, + "grad_norm": 0.8279166087302098, + "language_loss": 0.59042406, + "learning_rate": 1.21385784946359e-06, + "loss": 0.61035794, + "num_input_tokens_seen": 229596455, + "step": 10640, + "time_per_iteration": 3.214984893798828 + }, + { + "auxiliary_loss_clip": 0.01028667, + "auxiliary_loss_mlp": 0.01023249, + "balance_loss_clip": 1.02051544, + "balance_loss_mlp": 1.01324129, + "epoch": 0.6397715316398617, + "flos": 18142537570560.0, + "grad_norm": 2.413687586143014, + "language_loss": 0.7869491, + "learning_rate": 1.2134997512038215e-06, + "loss": 0.80746824, + "num_input_tokens_seen": 229612860, + "step": 10641, + "time_per_iteration": 2.6469030380249023 + }, + { + "auxiliary_loss_clip": 0.01021171, + "auxiliary_loss_mlp": 0.01028713, + "balance_loss_clip": 1.02263165, + "balance_loss_mlp": 1.01690543, + "epoch": 0.6398316548925297, + "flos": 25739512070400.0, + "grad_norm": 1.7037102128731683, + "language_loss": 0.63184029, + "learning_rate": 1.2131416827682209e-06, + "loss": 0.6523391, + "num_input_tokens_seen": 229633960, + "step": 10642, + "time_per_iteration": 2.9391417503356934 + }, + { + "auxiliary_loss_clip": 0.00975764, + "auxiliary_loss_mlp": 0.01006894, + "balance_loss_clip": 1.00146508, + "balance_loss_mlp": 1.00566006, + "epoch": 0.6398917781451977, + "flos": 71214234756480.0, + "grad_norm": 0.943527932668615, + "language_loss": 0.55934709, + "learning_rate": 1.2127836441703667e-06, + "loss": 0.57917368, + "num_input_tokens_seen": 229686730, + "step": 10643, + "time_per_iteration": 3.208986759185791 + }, + { + "auxiliary_loss_clip": 0.01028196, + "auxiliary_loss_mlp": 0.01026254, + "balance_loss_clip": 1.02191818, + "balance_loss_mlp": 1.01482749, + "epoch": 0.6399519013978656, + "flos": 20521835487360.0, + "grad_norm": 1.8743029927953043, + "language_loss": 0.76606309, + "learning_rate": 1.2124256354238358e-06, + "loss": 0.7866075, + "num_input_tokens_seen": 229704800, + "step": 10644, + "time_per_iteration": 2.6565232276916504 + }, + { + "auxiliary_loss_clip": 0.01034457, + "auxiliary_loss_mlp": 0.01029191, + "balance_loss_clip": 1.02651834, + "balance_loss_mlp": 1.01849234, + "epoch": 0.6400120246505336, + "flos": 24460840742400.0, + "grad_norm": 1.4846983999945955, + "language_loss": 0.82798493, + "learning_rate": 1.212067656542203e-06, + "loss": 0.84862143, + "num_input_tokens_seen": 229725265, + "step": 10645, + "time_per_iteration": 2.733356237411499 + }, + { + "auxiliary_loss_clip": 0.01053178, + "auxiliary_loss_mlp": 0.0103478, + "balance_loss_clip": 1.0245676, + "balance_loss_mlp": 1.02243614, + "epoch": 0.6400721479032015, + "flos": 28366090191360.0, + "grad_norm": 1.7035128695828075, + "language_loss": 0.73641598, + "learning_rate": 1.2117097075390447e-06, + "loss": 0.75729549, + "num_input_tokens_seen": 229744840, + "step": 10646, + "time_per_iteration": 2.7627992630004883 + }, + { + "auxiliary_loss_clip": 0.01028325, + "auxiliary_loss_mlp": 0.01030707, + "balance_loss_clip": 1.02296162, + "balance_loss_mlp": 1.01928687, + "epoch": 0.6401322711558696, + "flos": 17816540711040.0, + "grad_norm": 2.1487468736976534, + "language_loss": 0.80021483, + "learning_rate": 1.2113517884279327e-06, + "loss": 0.82080513, + "num_input_tokens_seen": 229759095, + "step": 10647, + "time_per_iteration": 2.6323819160461426 + }, + { + "auxiliary_loss_clip": 0.01015547, + "auxiliary_loss_mlp": 0.01025897, + "balance_loss_clip": 1.02228475, + "balance_loss_mlp": 1.01582384, + "epoch": 0.6401923944085375, + "flos": 26030855283840.0, + "grad_norm": 1.9368392353578396, + "language_loss": 0.75541091, + "learning_rate": 1.2109938992224399e-06, + "loss": 0.77582538, + "num_input_tokens_seen": 229777750, + "step": 10648, + "time_per_iteration": 2.716747283935547 + }, + { + "auxiliary_loss_clip": 0.01035002, + "auxiliary_loss_mlp": 0.01024581, + "balance_loss_clip": 1.02241957, + "balance_loss_mlp": 1.01364338, + "epoch": 0.6402525176612055, + "flos": 23586451966080.0, + "grad_norm": 1.8026049341814827, + "language_loss": 0.78617561, + "learning_rate": 1.210636039936138e-06, + "loss": 0.80677152, + "num_input_tokens_seen": 229796785, + "step": 10649, + "time_per_iteration": 2.6484243869781494 + }, + { + "auxiliary_loss_clip": 0.01018275, + "auxiliary_loss_mlp": 0.01031675, + "balance_loss_clip": 1.02737975, + "balance_loss_mlp": 1.02015328, + "epoch": 0.6403126409138734, + "flos": 18041413806720.0, + "grad_norm": 1.791469701843042, + "language_loss": 0.75577223, + "learning_rate": 1.2102782105825956e-06, + "loss": 0.77627176, + "num_input_tokens_seen": 229815425, + "step": 10650, + "time_per_iteration": 2.8245513439178467 + }, + { + "auxiliary_loss_clip": 0.01065151, + "auxiliary_loss_mlp": 0.0102973, + "balance_loss_clip": 1.02595854, + "balance_loss_mlp": 1.01773775, + "epoch": 0.6403727641665414, + "flos": 21979485308160.0, + "grad_norm": 1.512112211232361, + "language_loss": 0.70791423, + "learning_rate": 1.2099204111753833e-06, + "loss": 0.72886306, + "num_input_tokens_seen": 229834545, + "step": 10651, + "time_per_iteration": 2.6403794288635254 + }, + { + "auxiliary_loss_clip": 0.01034108, + "auxiliary_loss_mlp": 0.01033633, + "balance_loss_clip": 1.02438188, + "balance_loss_mlp": 1.02199221, + "epoch": 0.6404328874192093, + "flos": 24895539135360.0, + "grad_norm": 2.436427008129614, + "language_loss": 0.63802493, + "learning_rate": 1.2095626417280684e-06, + "loss": 0.65870225, + "num_input_tokens_seen": 229849175, + "step": 10652, + "time_per_iteration": 2.928964853286743 + }, + { + "auxiliary_loss_clip": 0.01039106, + "auxiliary_loss_mlp": 0.01026164, + "balance_loss_clip": 1.02346098, + "balance_loss_mlp": 1.01537013, + "epoch": 0.6404930106718774, + "flos": 17597198309760.0, + "grad_norm": 2.437372693476937, + "language_loss": 0.79204983, + "learning_rate": 1.2092049022542168e-06, + "loss": 0.81270254, + "num_input_tokens_seen": 229865400, + "step": 10653, + "time_per_iteration": 2.907460927963257 + }, + { + "auxiliary_loss_clip": 0.01034765, + "auxiliary_loss_mlp": 0.01050539, + "balance_loss_clip": 1.0221374, + "balance_loss_mlp": 1.03588831, + "epoch": 0.6405531339245453, + "flos": 20157880930560.0, + "grad_norm": 2.6078789535011913, + "language_loss": 0.70530796, + "learning_rate": 1.2088471927673952e-06, + "loss": 0.726161, + "num_input_tokens_seen": 229882945, + "step": 10654, + "time_per_iteration": 2.747786045074463 + }, + { + "auxiliary_loss_clip": 0.01058449, + "auxiliary_loss_mlp": 0.01035664, + "balance_loss_clip": 1.02689743, + "balance_loss_mlp": 1.02383208, + "epoch": 0.6406132571772133, + "flos": 21942281796480.0, + "grad_norm": 1.6117765461958222, + "language_loss": 0.7230835, + "learning_rate": 1.2084895132811666e-06, + "loss": 0.74402457, + "num_input_tokens_seen": 229901590, + "step": 10655, + "time_per_iteration": 2.663882255554199 + }, + { + "auxiliary_loss_clip": 0.01036747, + "auxiliary_loss_mlp": 0.01030244, + "balance_loss_clip": 1.02648282, + "balance_loss_mlp": 1.01931286, + "epoch": 0.6406733804298813, + "flos": 28768002445440.0, + "grad_norm": 1.5483156081781342, + "language_loss": 0.82648087, + "learning_rate": 1.2081318638090952e-06, + "loss": 0.8471508, + "num_input_tokens_seen": 229922535, + "step": 10656, + "time_per_iteration": 2.7107114791870117 + }, + { + "auxiliary_loss_clip": 0.01012298, + "auxiliary_loss_mlp": 0.01027779, + "balance_loss_clip": 1.0216949, + "balance_loss_mlp": 1.01714516, + "epoch": 0.6407335036825492, + "flos": 17457183095040.0, + "grad_norm": 2.67893138067129, + "language_loss": 0.72377372, + "learning_rate": 1.2077742443647433e-06, + "loss": 0.74417448, + "num_input_tokens_seen": 229939575, + "step": 10657, + "time_per_iteration": 4.338896036148071 + }, + { + "auxiliary_loss_clip": 0.01032308, + "auxiliary_loss_mlp": 0.01032661, + "balance_loss_clip": 1.02429497, + "balance_loss_mlp": 1.02229571, + "epoch": 0.6407936269352172, + "flos": 22125282612480.0, + "grad_norm": 1.8094222906818715, + "language_loss": 0.77645755, + "learning_rate": 1.2074166549616707e-06, + "loss": 0.79710716, + "num_input_tokens_seen": 229958840, + "step": 10658, + "time_per_iteration": 2.8085198402404785 + }, + { + "auxiliary_loss_clip": 0.01065975, + "auxiliary_loss_mlp": 0.01031668, + "balance_loss_clip": 1.02595806, + "balance_loss_mlp": 1.01977718, + "epoch": 0.6408537501878852, + "flos": 23110635479040.0, + "grad_norm": 2.6863888714043473, + "language_loss": 0.76426321, + "learning_rate": 1.2070590956134386e-06, + "loss": 0.78523964, + "num_input_tokens_seen": 229979680, + "step": 10659, + "time_per_iteration": 4.264851331710815 + }, + { + "auxiliary_loss_clip": 0.01055195, + "auxiliary_loss_mlp": 0.01031203, + "balance_loss_clip": 1.02613878, + "balance_loss_mlp": 1.01994944, + "epoch": 0.6409138734405532, + "flos": 16472440759680.0, + "grad_norm": 1.7195002480835633, + "language_loss": 0.78233218, + "learning_rate": 1.2067015663336046e-06, + "loss": 0.80319619, + "num_input_tokens_seen": 229996830, + "step": 10660, + "time_per_iteration": 2.6006860733032227 + }, + { + "auxiliary_loss_clip": 0.01046258, + "auxiliary_loss_mlp": 0.01033438, + "balance_loss_clip": 1.02545249, + "balance_loss_mlp": 1.02095068, + "epoch": 0.6409739966932211, + "flos": 22777922776320.0, + "grad_norm": 1.929031477918861, + "language_loss": 0.68786788, + "learning_rate": 1.206344067135727e-06, + "loss": 0.70866489, + "num_input_tokens_seen": 230015115, + "step": 10661, + "time_per_iteration": 2.7340612411499023 + }, + { + "auxiliary_loss_clip": 0.01062694, + "auxiliary_loss_mlp": 0.01031422, + "balance_loss_clip": 1.02572751, + "balance_loss_mlp": 1.02142024, + "epoch": 0.6410341199458891, + "flos": 25152049134720.0, + "grad_norm": 1.6613366553583482, + "language_loss": 0.75860333, + "learning_rate": 1.205986598033362e-06, + "loss": 0.77954459, + "num_input_tokens_seen": 230035515, + "step": 10662, + "time_per_iteration": 2.67095947265625 + }, + { + "auxiliary_loss_clip": 0.01047194, + "auxiliary_loss_mlp": 0.01030445, + "balance_loss_clip": 1.02223468, + "balance_loss_mlp": 1.01830935, + "epoch": 0.641094243198557, + "flos": 27046193028480.0, + "grad_norm": 1.9038630176544702, + "language_loss": 0.69706959, + "learning_rate": 1.2056291590400644e-06, + "loss": 0.71784598, + "num_input_tokens_seen": 230054355, + "step": 10663, + "time_per_iteration": 2.743760108947754 + }, + { + "auxiliary_loss_clip": 0.01034655, + "auxiliary_loss_mlp": 0.01035502, + "balance_loss_clip": 1.02494121, + "balance_loss_mlp": 1.0227704, + "epoch": 0.641154366451225, + "flos": 25374551932800.0, + "grad_norm": 2.4026779601539587, + "language_loss": 0.67791939, + "learning_rate": 1.205271750169389e-06, + "loss": 0.69862098, + "num_input_tokens_seen": 230074605, + "step": 10664, + "time_per_iteration": 2.775416612625122 + }, + { + "auxiliary_loss_clip": 0.01038322, + "auxiliary_loss_mlp": 0.0103036, + "balance_loss_clip": 1.02220511, + "balance_loss_mlp": 1.01991713, + "epoch": 0.6412144897038929, + "flos": 25153342024320.0, + "grad_norm": 1.827591572337069, + "language_loss": 0.6631887, + "learning_rate": 1.2049143714348881e-06, + "loss": 0.68387544, + "num_input_tokens_seen": 230093820, + "step": 10665, + "time_per_iteration": 2.714294910430908 + }, + { + "auxiliary_loss_clip": 0.01052764, + "auxiliary_loss_mlp": 0.01028651, + "balance_loss_clip": 1.02486467, + "balance_loss_mlp": 1.017416, + "epoch": 0.641274612956561, + "flos": 23440762402560.0, + "grad_norm": 1.7095886462474241, + "language_loss": 0.643641, + "learning_rate": 1.2045570228501145e-06, + "loss": 0.66445518, + "num_input_tokens_seen": 230114285, + "step": 10666, + "time_per_iteration": 2.6428627967834473 + }, + { + "auxiliary_loss_clip": 0.01054724, + "auxiliary_loss_mlp": 0.01031281, + "balance_loss_clip": 1.02539241, + "balance_loss_mlp": 1.02027774, + "epoch": 0.6413347362092289, + "flos": 19427493778560.0, + "grad_norm": 1.6804607632021602, + "language_loss": 0.71100986, + "learning_rate": 1.2041997044286176e-06, + "loss": 0.73186994, + "num_input_tokens_seen": 230132760, + "step": 10667, + "time_per_iteration": 2.600510358810425 + }, + { + "auxiliary_loss_clip": 0.01009908, + "auxiliary_loss_mlp": 0.00747828, + "balance_loss_clip": 1.02204621, + "balance_loss_mlp": 1.00058484, + "epoch": 0.6413948594618969, + "flos": 17196578945280.0, + "grad_norm": 3.098210429558831, + "language_loss": 0.77702284, + "learning_rate": 1.2038424161839484e-06, + "loss": 0.79460013, + "num_input_tokens_seen": 230149690, + "step": 10668, + "time_per_iteration": 2.7451372146606445 + }, + { + "auxiliary_loss_clip": 0.01058847, + "auxiliary_loss_mlp": 0.01034822, + "balance_loss_clip": 1.02895796, + "balance_loss_mlp": 1.02363431, + "epoch": 0.6414549827145648, + "flos": 22269787027200.0, + "grad_norm": 1.544790103314696, + "language_loss": 0.67786717, + "learning_rate": 1.2034851581296544e-06, + "loss": 0.69880378, + "num_input_tokens_seen": 230166950, + "step": 10669, + "time_per_iteration": 2.7621612548828125 + }, + { + "auxiliary_loss_clip": 0.01063129, + "auxiliary_loss_mlp": 0.01037223, + "balance_loss_clip": 1.02947533, + "balance_loss_mlp": 1.0253675, + "epoch": 0.6415151059672328, + "flos": 19640192163840.0, + "grad_norm": 1.894773117172067, + "language_loss": 0.78356481, + "learning_rate": 1.2031279302792825e-06, + "loss": 0.80456829, + "num_input_tokens_seen": 230184785, + "step": 10670, + "time_per_iteration": 2.770278215408325 + }, + { + "auxiliary_loss_clip": 0.01038874, + "auxiliary_loss_mlp": 0.01032388, + "balance_loss_clip": 1.02658248, + "balance_loss_mlp": 1.02057481, + "epoch": 0.6415752292199008, + "flos": 14865833237760.0, + "grad_norm": 2.2833329561736893, + "language_loss": 0.88597894, + "learning_rate": 1.20277073264638e-06, + "loss": 0.90669161, + "num_input_tokens_seen": 230201385, + "step": 10671, + "time_per_iteration": 4.382790803909302 + }, + { + "auxiliary_loss_clip": 0.0105346, + "auxiliary_loss_mlp": 0.01026581, + "balance_loss_clip": 1.02574766, + "balance_loss_mlp": 1.01600766, + "epoch": 0.6416353524725688, + "flos": 13735580906880.0, + "grad_norm": 1.672908310821083, + "language_loss": 0.69036067, + "learning_rate": 1.2024135652444907e-06, + "loss": 0.71116114, + "num_input_tokens_seen": 230220380, + "step": 10672, + "time_per_iteration": 2.6917502880096436 + }, + { + "auxiliary_loss_clip": 0.01060384, + "auxiliary_loss_mlp": 0.01028749, + "balance_loss_clip": 1.02703583, + "balance_loss_mlp": 1.01574326, + "epoch": 0.6416954757252368, + "flos": 24534924543360.0, + "grad_norm": 1.9946198176647452, + "language_loss": 0.74457788, + "learning_rate": 1.2020564280871593e-06, + "loss": 0.76546919, + "num_input_tokens_seen": 230239845, + "step": 10673, + "time_per_iteration": 2.702794075012207 + }, + { + "auxiliary_loss_clip": 0.01029366, + "auxiliary_loss_mlp": 0.01032713, + "balance_loss_clip": 1.0230633, + "balance_loss_mlp": 1.0203805, + "epoch": 0.6417555989779047, + "flos": 27710002321920.0, + "grad_norm": 2.3777216894141926, + "language_loss": 0.69339836, + "learning_rate": 1.2016993211879283e-06, + "loss": 0.71401918, + "num_input_tokens_seen": 230262420, + "step": 10674, + "time_per_iteration": 2.874067783355713 + }, + { + "auxiliary_loss_clip": 0.01067925, + "auxiliary_loss_mlp": 0.01029031, + "balance_loss_clip": 1.0254246, + "balance_loss_mlp": 1.01697874, + "epoch": 0.6418157222305727, + "flos": 20556632787840.0, + "grad_norm": 2.0667529999815533, + "language_loss": 0.66871202, + "learning_rate": 1.201342244560338e-06, + "loss": 0.68968153, + "num_input_tokens_seen": 230279950, + "step": 10675, + "time_per_iteration": 2.653264284133911 + }, + { + "auxiliary_loss_clip": 0.01067361, + "auxiliary_loss_mlp": 0.0103485, + "balance_loss_clip": 1.02753305, + "balance_loss_mlp": 1.02407289, + "epoch": 0.6418758454832406, + "flos": 22601530062720.0, + "grad_norm": 1.675007861616439, + "language_loss": 0.66197681, + "learning_rate": 1.2009851982179307e-06, + "loss": 0.6829989, + "num_input_tokens_seen": 230299705, + "step": 10676, + "time_per_iteration": 2.6597511768341064 + }, + { + "auxiliary_loss_clip": 0.01068105, + "auxiliary_loss_mlp": 0.01026813, + "balance_loss_clip": 1.02676058, + "balance_loss_mlp": 1.01415277, + "epoch": 0.6419359687359086, + "flos": 27375098889600.0, + "grad_norm": 1.8092146522014996, + "language_loss": 0.75443584, + "learning_rate": 1.2006281821742446e-06, + "loss": 0.77538502, + "num_input_tokens_seen": 230320030, + "step": 10677, + "time_per_iteration": 2.6207659244537354 + }, + { + "auxiliary_loss_clip": 0.00992398, + "auxiliary_loss_mlp": 0.01004176, + "balance_loss_clip": 1.00549293, + "balance_loss_mlp": 1.00306118, + "epoch": 0.6419960919885765, + "flos": 67251924552960.0, + "grad_norm": 0.7680335786463148, + "language_loss": 0.60728264, + "learning_rate": 1.200271196442818e-06, + "loss": 0.62724841, + "num_input_tokens_seen": 230381495, + "step": 10678, + "time_per_iteration": 3.3886959552764893 + }, + { + "auxiliary_loss_clip": 0.01048361, + "auxiliary_loss_mlp": 0.01030603, + "balance_loss_clip": 1.02391624, + "balance_loss_mlp": 1.01954079, + "epoch": 0.6420562152412446, + "flos": 19901873721600.0, + "grad_norm": 1.7669071512002847, + "language_loss": 0.67453051, + "learning_rate": 1.1999142410371875e-06, + "loss": 0.69532013, + "num_input_tokens_seen": 230401385, + "step": 10679, + "time_per_iteration": 4.259036540985107 + }, + { + "auxiliary_loss_clip": 0.01050715, + "auxiliary_loss_mlp": 0.01036043, + "balance_loss_clip": 1.0253706, + "balance_loss_mlp": 1.02319205, + "epoch": 0.6421163384939125, + "flos": 24790177566720.0, + "grad_norm": 1.8221420180093617, + "language_loss": 0.73081559, + "learning_rate": 1.1995573159708897e-06, + "loss": 0.75168318, + "num_input_tokens_seen": 230421340, + "step": 10680, + "time_per_iteration": 2.620772123336792 + }, + { + "auxiliary_loss_clip": 0.01033068, + "auxiliary_loss_mlp": 0.01026773, + "balance_loss_clip": 1.02403307, + "balance_loss_mlp": 1.01659203, + "epoch": 0.6421764617465805, + "flos": 25592816926080.0, + "grad_norm": 1.8043462383252835, + "language_loss": 0.67570287, + "learning_rate": 1.1992004212574582e-06, + "loss": 0.69630134, + "num_input_tokens_seen": 230441270, + "step": 10681, + "time_per_iteration": 2.7212634086608887 + }, + { + "auxiliary_loss_clip": 0.01063487, + "auxiliary_loss_mlp": 0.0102545, + "balance_loss_clip": 1.02541399, + "balance_loss_mlp": 1.0149653, + "epoch": 0.6422365849992484, + "flos": 14134727813760.0, + "grad_norm": 1.6843333751401932, + "language_loss": 0.74948114, + "learning_rate": 1.198843556910427e-06, + "loss": 0.77037048, + "num_input_tokens_seen": 230457455, + "step": 10682, + "time_per_iteration": 2.6188764572143555 + }, + { + "auxiliary_loss_clip": 0.01005224, + "auxiliary_loss_mlp": 0.01026639, + "balance_loss_clip": 1.02270913, + "balance_loss_mlp": 1.01667917, + "epoch": 0.6422967082519164, + "flos": 22383911514240.0, + "grad_norm": 1.6405165917388165, + "language_loss": 0.79172421, + "learning_rate": 1.1984867229433287e-06, + "loss": 0.81204277, + "num_input_tokens_seen": 230478955, + "step": 10683, + "time_per_iteration": 2.778360366821289 + }, + { + "auxiliary_loss_clip": 0.01066554, + "auxiliary_loss_mlp": 0.01034296, + "balance_loss_clip": 1.02672148, + "balance_loss_mlp": 1.02237523, + "epoch": 0.6423568315045844, + "flos": 14647927380480.0, + "grad_norm": 1.660152910088971, + "language_loss": 0.67306292, + "learning_rate": 1.1981299193696941e-06, + "loss": 0.69407141, + "num_input_tokens_seen": 230496425, + "step": 10684, + "time_per_iteration": 2.5183467864990234 + }, + { + "auxiliary_loss_clip": 0.01054837, + "auxiliary_loss_mlp": 0.01030279, + "balance_loss_clip": 1.02503896, + "balance_loss_mlp": 1.01876926, + "epoch": 0.6424169547572524, + "flos": 26833925606400.0, + "grad_norm": 2.0375335540019024, + "language_loss": 0.72104859, + "learning_rate": 1.1977731462030533e-06, + "loss": 0.74189979, + "num_input_tokens_seen": 230516245, + "step": 10685, + "time_per_iteration": 2.8232693672180176 + }, + { + "auxiliary_loss_clip": 0.01027874, + "auxiliary_loss_mlp": 0.01028426, + "balance_loss_clip": 1.02235866, + "balance_loss_mlp": 1.01809025, + "epoch": 0.6424770780099204, + "flos": 22707430335360.0, + "grad_norm": 1.5019290796684495, + "language_loss": 0.75371456, + "learning_rate": 1.197416403456935e-06, + "loss": 0.77427757, + "num_input_tokens_seen": 230534745, + "step": 10686, + "time_per_iteration": 2.669234037399292 + }, + { + "auxiliary_loss_clip": 0.01034457, + "auxiliary_loss_mlp": 0.01031527, + "balance_loss_clip": 1.02428889, + "balance_loss_mlp": 1.01945114, + "epoch": 0.6425372012625883, + "flos": 28469512425600.0, + "grad_norm": 1.9936517503300089, + "language_loss": 0.68256491, + "learning_rate": 1.197059691144867e-06, + "loss": 0.70322478, + "num_input_tokens_seen": 230555895, + "step": 10687, + "time_per_iteration": 2.7119290828704834 + }, + { + "auxiliary_loss_clip": 0.01039223, + "auxiliary_loss_mlp": 0.01031737, + "balance_loss_clip": 1.02487659, + "balance_loss_mlp": 1.0206449, + "epoch": 0.6425973245152563, + "flos": 29351694453120.0, + "grad_norm": 1.798285560284246, + "language_loss": 0.66237867, + "learning_rate": 1.1967030092803767e-06, + "loss": 0.6830883, + "num_input_tokens_seen": 230577460, + "step": 10688, + "time_per_iteration": 2.6729705333709717 + }, + { + "auxiliary_loss_clip": 0.01063836, + "auxiliary_loss_mlp": 0.01027555, + "balance_loss_clip": 1.02460182, + "balance_loss_mlp": 1.01657021, + "epoch": 0.6426574477679242, + "flos": 16430388912000.0, + "grad_norm": 1.6934851170300482, + "language_loss": 0.73280734, + "learning_rate": 1.1963463578769876e-06, + "loss": 0.75372124, + "num_input_tokens_seen": 230595030, + "step": 10689, + "time_per_iteration": 2.7218430042266846 + }, + { + "auxiliary_loss_clip": 0.01046887, + "auxiliary_loss_mlp": 0.01032255, + "balance_loss_clip": 1.02477872, + "balance_loss_mlp": 1.02161598, + "epoch": 0.6427175710205922, + "flos": 21835914647040.0, + "grad_norm": 1.8881374091598535, + "language_loss": 0.71614754, + "learning_rate": 1.195989736948226e-06, + "loss": 0.73693895, + "num_input_tokens_seen": 230615135, + "step": 10690, + "time_per_iteration": 2.630995750427246 + }, + { + "auxiliary_loss_clip": 0.01033376, + "auxiliary_loss_mlp": 0.01027639, + "balance_loss_clip": 1.02218139, + "balance_loss_mlp": 1.01675546, + "epoch": 0.6427776942732601, + "flos": 17786627660160.0, + "grad_norm": 1.8158521687233244, + "language_loss": 0.77934289, + "learning_rate": 1.1956331465076143e-06, + "loss": 0.7999531, + "num_input_tokens_seen": 230631965, + "step": 10691, + "time_per_iteration": 2.66774582862854 + }, + { + "auxiliary_loss_clip": 0.01046033, + "auxiliary_loss_mlp": 0.01028734, + "balance_loss_clip": 1.02602971, + "balance_loss_mlp": 1.01803541, + "epoch": 0.6428378175259282, + "flos": 15085893911040.0, + "grad_norm": 2.0503243516006577, + "language_loss": 0.74381971, + "learning_rate": 1.1952765865686738e-06, + "loss": 0.76456738, + "num_input_tokens_seen": 230649565, + "step": 10692, + "time_per_iteration": 2.7005932331085205 + }, + { + "auxiliary_loss_clip": 0.01054498, + "auxiliary_loss_mlp": 0.0103073, + "balance_loss_clip": 1.02545834, + "balance_loss_mlp": 1.01998377, + "epoch": 0.6428979407785961, + "flos": 23841776816640.0, + "grad_norm": 2.8297659912909054, + "language_loss": 0.61078453, + "learning_rate": 1.1949200571449263e-06, + "loss": 0.63163686, + "num_input_tokens_seen": 230669265, + "step": 10693, + "time_per_iteration": 2.692798614501953 + }, + { + "auxiliary_loss_clip": 0.01036823, + "auxiliary_loss_mlp": 0.01027915, + "balance_loss_clip": 1.02614379, + "balance_loss_mlp": 1.01582098, + "epoch": 0.6429580640312641, + "flos": 32926852892160.0, + "grad_norm": 1.6846807929108307, + "language_loss": 0.59377766, + "learning_rate": 1.1945635582498903e-06, + "loss": 0.61442506, + "num_input_tokens_seen": 230690575, + "step": 10694, + "time_per_iteration": 2.855158567428589 + }, + { + "auxiliary_loss_clip": 0.01038243, + "auxiliary_loss_mlp": 0.01034899, + "balance_loss_clip": 1.02544713, + "balance_loss_mlp": 1.02377117, + "epoch": 0.643018187283932, + "flos": 21068359896960.0, + "grad_norm": 1.4455912756813694, + "language_loss": 0.80100274, + "learning_rate": 1.1942070898970853e-06, + "loss": 0.82173419, + "num_input_tokens_seen": 230709420, + "step": 10695, + "time_per_iteration": 2.8545591831207275 + }, + { + "auxiliary_loss_clip": 0.01065455, + "auxiliary_loss_mlp": 0.01037039, + "balance_loss_clip": 1.02570581, + "balance_loss_mlp": 1.02555895, + "epoch": 0.6430783105366, + "flos": 26724649455360.0, + "grad_norm": 1.8679191255337504, + "language_loss": 0.73202825, + "learning_rate": 1.1938506521000285e-06, + "loss": 0.75305313, + "num_input_tokens_seen": 230729350, + "step": 10696, + "time_per_iteration": 2.7806365489959717 + }, + { + "auxiliary_loss_clip": 0.0102795, + "auxiliary_loss_mlp": 0.01025055, + "balance_loss_clip": 1.02365184, + "balance_loss_mlp": 1.01434374, + "epoch": 0.643138433789268, + "flos": 23696841438720.0, + "grad_norm": 1.5366316660373935, + "language_loss": 0.75161892, + "learning_rate": 1.1934942448722347e-06, + "loss": 0.77214897, + "num_input_tokens_seen": 230749220, + "step": 10697, + "time_per_iteration": 2.8874411582946777 + }, + { + "auxiliary_loss_clip": 0.01038501, + "auxiliary_loss_mlp": 0.01029532, + "balance_loss_clip": 1.02214015, + "balance_loss_mlp": 1.01928592, + "epoch": 0.643198557041936, + "flos": 34202184255360.0, + "grad_norm": 1.4633069468376054, + "language_loss": 0.66342926, + "learning_rate": 1.1931378682272208e-06, + "loss": 0.68410957, + "num_input_tokens_seen": 230770245, + "step": 10698, + "time_per_iteration": 2.9532105922698975 + }, + { + "auxiliary_loss_clip": 0.01007094, + "auxiliary_loss_mlp": 0.01003028, + "balance_loss_clip": 1.00120139, + "balance_loss_mlp": 1.002056, + "epoch": 0.643258680294604, + "flos": 67626473621760.0, + "grad_norm": 0.8476532791011775, + "language_loss": 0.63470387, + "learning_rate": 1.1927815221784996e-06, + "loss": 0.65480506, + "num_input_tokens_seen": 230837030, + "step": 10699, + "time_per_iteration": 3.2881386280059814 + }, + { + "auxiliary_loss_clip": 0.01054204, + "auxiliary_loss_mlp": 0.01024694, + "balance_loss_clip": 1.02563119, + "balance_loss_mlp": 1.01478732, + "epoch": 0.6433188035472719, + "flos": 25185984508800.0, + "grad_norm": 1.7645875139823437, + "language_loss": 0.69485986, + "learning_rate": 1.1924252067395838e-06, + "loss": 0.71564883, + "num_input_tokens_seen": 230856845, + "step": 10700, + "time_per_iteration": 2.716564893722534 + }, + { + "auxiliary_loss_clip": 0.01065705, + "auxiliary_loss_mlp": 0.0102585, + "balance_loss_clip": 1.02564657, + "balance_loss_mlp": 1.0142808, + "epoch": 0.6433789267999399, + "flos": 24973573432320.0, + "grad_norm": 1.521842201917239, + "language_loss": 0.73370159, + "learning_rate": 1.1920689219239855e-06, + "loss": 0.75461721, + "num_input_tokens_seen": 230878785, + "step": 10701, + "time_per_iteration": 2.692289113998413 + }, + { + "auxiliary_loss_clip": 0.01054114, + "auxiliary_loss_mlp": 0.01030168, + "balance_loss_clip": 1.02327323, + "balance_loss_mlp": 1.01752543, + "epoch": 0.6434390500526078, + "flos": 17566028282880.0, + "grad_norm": 3.1437187852671413, + "language_loss": 0.81818634, + "learning_rate": 1.1917126677452144e-06, + "loss": 0.83902913, + "num_input_tokens_seen": 230895445, + "step": 10702, + "time_per_iteration": 2.584162950515747 + }, + { + "auxiliary_loss_clip": 0.01032798, + "auxiliary_loss_mlp": 0.01035611, + "balance_loss_clip": 1.02203083, + "balance_loss_mlp": 1.02453017, + "epoch": 0.6434991733052758, + "flos": 20843594542080.0, + "grad_norm": 2.04848769817103, + "language_loss": 0.74427915, + "learning_rate": 1.1913564442167798e-06, + "loss": 0.76496321, + "num_input_tokens_seen": 230911375, + "step": 10703, + "time_per_iteration": 2.6223392486572266 + }, + { + "auxiliary_loss_clip": 0.00964767, + "auxiliary_loss_mlp": 0.0100353, + "balance_loss_clip": 1.00697875, + "balance_loss_mlp": 1.00220716, + "epoch": 0.6435592965579437, + "flos": 66094596345600.0, + "grad_norm": 0.6581361028440232, + "language_loss": 0.54590821, + "learning_rate": 1.1910002513521898e-06, + "loss": 0.56559116, + "num_input_tokens_seen": 230975990, + "step": 10704, + "time_per_iteration": 4.885262727737427 + }, + { + "auxiliary_loss_clip": 0.0103575, + "auxiliary_loss_mlp": 0.01022093, + "balance_loss_clip": 1.02694869, + "balance_loss_mlp": 1.01213908, + "epoch": 0.6436194198106118, + "flos": 23768842250880.0, + "grad_norm": 1.622068226113614, + "language_loss": 0.76976156, + "learning_rate": 1.1906440891649519e-06, + "loss": 0.79033995, + "num_input_tokens_seen": 230997110, + "step": 10705, + "time_per_iteration": 2.760254144668579 + }, + { + "auxiliary_loss_clip": 0.01028989, + "auxiliary_loss_mlp": 0.01031665, + "balance_loss_clip": 1.02188325, + "balance_loss_mlp": 1.02078724, + "epoch": 0.6436795430632797, + "flos": 20230312705920.0, + "grad_norm": 2.000138783706091, + "language_loss": 0.79253322, + "learning_rate": 1.1902879576685708e-06, + "loss": 0.81313974, + "num_input_tokens_seen": 231015590, + "step": 10706, + "time_per_iteration": 4.318651914596558 + }, + { + "auxiliary_loss_clip": 0.01019853, + "auxiliary_loss_mlp": 0.01032973, + "balance_loss_clip": 1.02001595, + "balance_loss_mlp": 1.02122474, + "epoch": 0.6437396663159477, + "flos": 20301846641280.0, + "grad_norm": 1.910514998935658, + "language_loss": 0.79752076, + "learning_rate": 1.1899318568765518e-06, + "loss": 0.81804895, + "num_input_tokens_seen": 231033800, + "step": 10707, + "time_per_iteration": 2.9089066982269287 + }, + { + "auxiliary_loss_clip": 0.01053242, + "auxiliary_loss_mlp": 0.01027205, + "balance_loss_clip": 1.02454317, + "balance_loss_mlp": 1.0167805, + "epoch": 0.6437997895686156, + "flos": 23878585278720.0, + "grad_norm": 1.6411995328845566, + "language_loss": 0.85623068, + "learning_rate": 1.1895757868023978e-06, + "loss": 0.87703514, + "num_input_tokens_seen": 231053160, + "step": 10708, + "time_per_iteration": 2.6142609119415283 + }, + { + "auxiliary_loss_clip": 0.01022668, + "auxiliary_loss_mlp": 0.01041344, + "balance_loss_clip": 1.02501392, + "balance_loss_mlp": 1.02818954, + "epoch": 0.6438599128212836, + "flos": 18989275852800.0, + "grad_norm": 1.8044638650260494, + "language_loss": 0.65390712, + "learning_rate": 1.1892197474596106e-06, + "loss": 0.67454726, + "num_input_tokens_seen": 231069470, + "step": 10709, + "time_per_iteration": 2.7466206550598145 + }, + { + "auxiliary_loss_clip": 0.01063892, + "auxiliary_loss_mlp": 0.0102765, + "balance_loss_clip": 1.02579081, + "balance_loss_mlp": 1.01726723, + "epoch": 0.6439200360739517, + "flos": 24096347481600.0, + "grad_norm": 1.957044451295428, + "language_loss": 0.80495524, + "learning_rate": 1.1888637388616929e-06, + "loss": 0.82587063, + "num_input_tokens_seen": 231088205, + "step": 10710, + "time_per_iteration": 2.7090883255004883 + }, + { + "auxiliary_loss_clip": 0.01050946, + "auxiliary_loss_mlp": 0.0102588, + "balance_loss_clip": 1.02246356, + "balance_loss_mlp": 1.015378, + "epoch": 0.6439801593266196, + "flos": 31902141697920.0, + "grad_norm": 1.7491518734792086, + "language_loss": 0.66422784, + "learning_rate": 1.1885077610221425e-06, + "loss": 0.68499607, + "num_input_tokens_seen": 231107850, + "step": 10711, + "time_per_iteration": 2.786726951599121 + }, + { + "auxiliary_loss_clip": 0.01030407, + "auxiliary_loss_mlp": 0.01033725, + "balance_loss_clip": 1.02790046, + "balance_loss_mlp": 1.02291846, + "epoch": 0.6440402825792876, + "flos": 27125879351040.0, + "grad_norm": 1.625254253848553, + "language_loss": 0.78771305, + "learning_rate": 1.1881518139544597e-06, + "loss": 0.80835438, + "num_input_tokens_seen": 231127200, + "step": 10712, + "time_per_iteration": 2.780122756958008 + }, + { + "auxiliary_loss_clip": 0.01049163, + "auxiliary_loss_mlp": 0.01032583, + "balance_loss_clip": 1.02316034, + "balance_loss_mlp": 1.02101994, + "epoch": 0.6441004058319555, + "flos": 20667704618880.0, + "grad_norm": 2.632038135202966, + "language_loss": 0.82699484, + "learning_rate": 1.1877958976721417e-06, + "loss": 0.84781229, + "num_input_tokens_seen": 231146360, + "step": 10713, + "time_per_iteration": 2.673031806945801 + }, + { + "auxiliary_loss_clip": 0.01063192, + "auxiliary_loss_mlp": 0.01034671, + "balance_loss_clip": 1.02661991, + "balance_loss_mlp": 1.02478266, + "epoch": 0.6441605290846235, + "flos": 26026006947840.0, + "grad_norm": 1.4450188277704115, + "language_loss": 0.78342724, + "learning_rate": 1.187440012188684e-06, + "loss": 0.80440587, + "num_input_tokens_seen": 231168350, + "step": 10714, + "time_per_iteration": 2.6701321601867676 + }, + { + "auxiliary_loss_clip": 0.01030184, + "auxiliary_loss_mlp": 0.01024825, + "balance_loss_clip": 1.02312171, + "balance_loss_mlp": 1.01513338, + "epoch": 0.6442206523372914, + "flos": 24899489631360.0, + "grad_norm": 1.3651980400495167, + "language_loss": 0.81434071, + "learning_rate": 1.187084157517583e-06, + "loss": 0.83489072, + "num_input_tokens_seen": 231188385, + "step": 10715, + "time_per_iteration": 2.7423696517944336 + }, + { + "auxiliary_loss_clip": 0.01036259, + "auxiliary_loss_mlp": 0.01027069, + "balance_loss_clip": 1.02124178, + "balance_loss_mlp": 1.01634645, + "epoch": 0.6442807755899594, + "flos": 25156322853120.0, + "grad_norm": 1.8915598000969993, + "language_loss": 0.81436479, + "learning_rate": 1.186728333672332e-06, + "loss": 0.83499801, + "num_input_tokens_seen": 231209880, + "step": 10716, + "time_per_iteration": 2.7675116062164307 + }, + { + "auxiliary_loss_clip": 0.01034307, + "auxiliary_loss_mlp": 0.01032784, + "balance_loss_clip": 1.02523708, + "balance_loss_mlp": 1.02070785, + "epoch": 0.6443408988426274, + "flos": 27344503480320.0, + "grad_norm": 1.6118631473861316, + "language_loss": 0.78065777, + "learning_rate": 1.186372540666424e-06, + "loss": 0.80132866, + "num_input_tokens_seen": 231230765, + "step": 10717, + "time_per_iteration": 2.8811633586883545 + }, + { + "auxiliary_loss_clip": 0.01062308, + "auxiliary_loss_mlp": 0.01025964, + "balance_loss_clip": 1.02466989, + "balance_loss_mlp": 1.01622486, + "epoch": 0.6444010220952954, + "flos": 27928339142400.0, + "grad_norm": 1.8375068755365773, + "language_loss": 0.68174964, + "learning_rate": 1.1860167785133513e-06, + "loss": 0.70263237, + "num_input_tokens_seen": 231252350, + "step": 10718, + "time_per_iteration": 4.376072645187378 + }, + { + "auxiliary_loss_clip": 0.00999416, + "auxiliary_loss_mlp": 0.01003721, + "balance_loss_clip": 1.00261378, + "balance_loss_mlp": 1.0027132, + "epoch": 0.6444611453479633, + "flos": 71215024855680.0, + "grad_norm": 0.7627010490576535, + "language_loss": 0.49589884, + "learning_rate": 1.185661047226603e-06, + "loss": 0.51593018, + "num_input_tokens_seen": 231313865, + "step": 10719, + "time_per_iteration": 3.4272196292877197 + }, + { + "auxiliary_loss_clip": 0.01066922, + "auxiliary_loss_mlp": 0.01034051, + "balance_loss_clip": 1.02679074, + "balance_loss_mlp": 1.02243376, + "epoch": 0.6445212686006313, + "flos": 22705131864960.0, + "grad_norm": 1.737747209163731, + "language_loss": 0.78225935, + "learning_rate": 1.18530534681967e-06, + "loss": 0.80326909, + "num_input_tokens_seen": 231331710, + "step": 10720, + "time_per_iteration": 2.580690383911133 + }, + { + "auxiliary_loss_clip": 0.01046207, + "auxiliary_loss_mlp": 0.01030724, + "balance_loss_clip": 1.0265578, + "balance_loss_mlp": 1.01911354, + "epoch": 0.6445813918532992, + "flos": 21178821196800.0, + "grad_norm": 2.197076381674031, + "language_loss": 0.77011275, + "learning_rate": 1.18494967730604e-06, + "loss": 0.79088199, + "num_input_tokens_seen": 231350705, + "step": 10721, + "time_per_iteration": 2.6509456634521484 + }, + { + "auxiliary_loss_clip": 0.01025057, + "auxiliary_loss_mlp": 0.01031252, + "balance_loss_clip": 1.02323103, + "balance_loss_mlp": 1.01982009, + "epoch": 0.6446415151059672, + "flos": 25191910252800.0, + "grad_norm": 2.274311955770831, + "language_loss": 0.72740585, + "learning_rate": 1.1845940386991995e-06, + "loss": 0.74796891, + "num_input_tokens_seen": 231369550, + "step": 10722, + "time_per_iteration": 2.703688859939575 + }, + { + "auxiliary_loss_clip": 0.01062237, + "auxiliary_loss_mlp": 0.010264, + "balance_loss_clip": 1.02545142, + "balance_loss_mlp": 1.01617754, + "epoch": 0.6447016383586353, + "flos": 25302227898240.0, + "grad_norm": 1.5313369494983826, + "language_loss": 0.77694517, + "learning_rate": 1.184238431012635e-06, + "loss": 0.79783154, + "num_input_tokens_seen": 231389285, + "step": 10723, + "time_per_iteration": 2.5775530338287354 + }, + { + "auxiliary_loss_clip": 0.01052359, + "auxiliary_loss_mlp": 0.0103323, + "balance_loss_clip": 1.02432096, + "balance_loss_mlp": 1.02159488, + "epoch": 0.6447617616113032, + "flos": 27703142824320.0, + "grad_norm": 1.5048632037162224, + "language_loss": 0.58389372, + "learning_rate": 1.1838828542598312e-06, + "loss": 0.60474968, + "num_input_tokens_seen": 231408820, + "step": 10724, + "time_per_iteration": 2.678589105606079 + }, + { + "auxiliary_loss_clip": 0.01052316, + "auxiliary_loss_mlp": 0.01026037, + "balance_loss_clip": 1.02600217, + "balance_loss_mlp": 1.01635146, + "epoch": 0.6448218848639712, + "flos": 23039101543680.0, + "grad_norm": 1.653393159296905, + "language_loss": 0.83037907, + "learning_rate": 1.183527308454271e-06, + "loss": 0.85116261, + "num_input_tokens_seen": 231428100, + "step": 10725, + "time_per_iteration": 2.576112747192383 + }, + { + "auxiliary_loss_clip": 0.0103761, + "auxiliary_loss_mlp": 0.01031795, + "balance_loss_clip": 1.0219202, + "balance_loss_mlp": 1.02075589, + "epoch": 0.6448820081166391, + "flos": 24496104919680.0, + "grad_norm": 1.7276217697994702, + "language_loss": 0.82045317, + "learning_rate": 1.1831717936094368e-06, + "loss": 0.84114724, + "num_input_tokens_seen": 231445810, + "step": 10726, + "time_per_iteration": 2.660449504852295 + }, + { + "auxiliary_loss_clip": 0.01047576, + "auxiliary_loss_mlp": 0.01031277, + "balance_loss_clip": 1.02378881, + "balance_loss_mlp": 1.01988065, + "epoch": 0.6449421313693071, + "flos": 22419283432320.0, + "grad_norm": 1.8453383276226247, + "language_loss": 0.81365252, + "learning_rate": 1.1828163097388108e-06, + "loss": 0.83444107, + "num_input_tokens_seen": 231463570, + "step": 10727, + "time_per_iteration": 4.172783136367798 + }, + { + "auxiliary_loss_clip": 0.01059421, + "auxiliary_loss_mlp": 0.01030608, + "balance_loss_clip": 1.026402, + "balance_loss_mlp": 1.01865768, + "epoch": 0.645002254621975, + "flos": 20225715765120.0, + "grad_norm": 2.0509033866383137, + "language_loss": 0.79031551, + "learning_rate": 1.1824608568558717e-06, + "loss": 0.81121582, + "num_input_tokens_seen": 231482155, + "step": 10728, + "time_per_iteration": 2.6315746307373047 + }, + { + "auxiliary_loss_clip": 0.00972487, + "auxiliary_loss_mlp": 0.01033378, + "balance_loss_clip": 1.02038431, + "balance_loss_mlp": 1.02087283, + "epoch": 0.645062377874643, + "flos": 27855440490240.0, + "grad_norm": 1.66375688384234, + "language_loss": 0.74692148, + "learning_rate": 1.1821054349740988e-06, + "loss": 0.76698011, + "num_input_tokens_seen": 231502465, + "step": 10729, + "time_per_iteration": 2.9665751457214355 + }, + { + "auxiliary_loss_clip": 0.01029436, + "auxiliary_loss_mlp": 0.01033607, + "balance_loss_clip": 1.02655101, + "balance_loss_mlp": 1.02197778, + "epoch": 0.645122501127311, + "flos": 25301509626240.0, + "grad_norm": 1.556111561894462, + "language_loss": 0.66413343, + "learning_rate": 1.1817500441069706e-06, + "loss": 0.68476391, + "num_input_tokens_seen": 231522740, + "step": 10730, + "time_per_iteration": 2.7999777793884277 + }, + { + "auxiliary_loss_clip": 0.01006089, + "auxiliary_loss_mlp": 0.01033361, + "balance_loss_clip": 1.02291894, + "balance_loss_mlp": 1.02060616, + "epoch": 0.645182624379979, + "flos": 18807352444800.0, + "grad_norm": 1.549972166353878, + "language_loss": 0.63511246, + "learning_rate": 1.1813946842679614e-06, + "loss": 0.65550697, + "num_input_tokens_seen": 231542050, + "step": 10731, + "time_per_iteration": 2.8396713733673096 + }, + { + "auxiliary_loss_clip": 0.01062509, + "auxiliary_loss_mlp": 0.01034073, + "balance_loss_clip": 1.02441108, + "balance_loss_mlp": 1.02316523, + "epoch": 0.6452427476326469, + "flos": 18332182402560.0, + "grad_norm": 1.5841912333118962, + "language_loss": 0.67886293, + "learning_rate": 1.1810393554705492e-06, + "loss": 0.69982874, + "num_input_tokens_seen": 231560380, + "step": 10732, + "time_per_iteration": 2.689600706100464 + }, + { + "auxiliary_loss_clip": 0.01052716, + "auxiliary_loss_mlp": 0.01029134, + "balance_loss_clip": 1.02469683, + "balance_loss_mlp": 1.01886964, + "epoch": 0.6453028708853149, + "flos": 22784746360320.0, + "grad_norm": 2.263484812444053, + "language_loss": 0.75438011, + "learning_rate": 1.1806840577282055e-06, + "loss": 0.77519858, + "num_input_tokens_seen": 231580810, + "step": 10733, + "time_per_iteration": 2.8046934604644775 + }, + { + "auxiliary_loss_clip": 0.01050047, + "auxiliary_loss_mlp": 0.01038645, + "balance_loss_clip": 1.02483821, + "balance_loss_mlp": 1.02643228, + "epoch": 0.6453629941379828, + "flos": 23945989150080.0, + "grad_norm": 1.7492721863994405, + "language_loss": 0.66995215, + "learning_rate": 1.1803287910544048e-06, + "loss": 0.69083905, + "num_input_tokens_seen": 231600585, + "step": 10734, + "time_per_iteration": 2.8359432220458984 + }, + { + "auxiliary_loss_clip": 0.01062854, + "auxiliary_loss_mlp": 0.01031953, + "balance_loss_clip": 1.02724862, + "balance_loss_mlp": 1.02164197, + "epoch": 0.6454231173906508, + "flos": 17676381841920.0, + "grad_norm": 2.085871299716626, + "language_loss": 0.73824948, + "learning_rate": 1.1799735554626191e-06, + "loss": 0.75919747, + "num_input_tokens_seen": 231618765, + "step": 10735, + "time_per_iteration": 2.7239396572113037 + }, + { + "auxiliary_loss_clip": 0.0100596, + "auxiliary_loss_mlp": 0.00747662, + "balance_loss_clip": 1.02303028, + "balance_loss_mlp": 1.00053203, + "epoch": 0.6454832406433189, + "flos": 23292774368640.0, + "grad_norm": 1.9258462534910157, + "language_loss": 0.7499609, + "learning_rate": 1.1796183509663176e-06, + "loss": 0.76749718, + "num_input_tokens_seen": 231638525, + "step": 10736, + "time_per_iteration": 2.992755889892578 + }, + { + "auxiliary_loss_clip": 0.01055581, + "auxiliary_loss_mlp": 0.01023894, + "balance_loss_clip": 1.0255301, + "balance_loss_mlp": 1.01256895, + "epoch": 0.6455433638959868, + "flos": 20157198572160.0, + "grad_norm": 1.8462838748858936, + "language_loss": 0.70906496, + "learning_rate": 1.1792631775789708e-06, + "loss": 0.72985971, + "num_input_tokens_seen": 231656785, + "step": 10737, + "time_per_iteration": 2.7577733993530273 + }, + { + "auxiliary_loss_clip": 0.00997411, + "auxiliary_loss_mlp": 0.01003515, + "balance_loss_clip": 1.00102377, + "balance_loss_mlp": 1.00254965, + "epoch": 0.6456034871486548, + "flos": 66532922012160.0, + "grad_norm": 0.7856752626433274, + "language_loss": 0.58520365, + "learning_rate": 1.1789080353140464e-06, + "loss": 0.60521293, + "num_input_tokens_seen": 231719075, + "step": 10738, + "time_per_iteration": 3.4272100925445557 + }, + { + "auxiliary_loss_clip": 0.01033035, + "auxiliary_loss_mlp": 0.0102575, + "balance_loss_clip": 1.02484083, + "balance_loss_mlp": 1.01508069, + "epoch": 0.6456636104013227, + "flos": 24206090509440.0, + "grad_norm": 1.697068867414685, + "language_loss": 0.74565434, + "learning_rate": 1.1785529241850118e-06, + "loss": 0.76624215, + "num_input_tokens_seen": 231737810, + "step": 10739, + "time_per_iteration": 2.760842800140381 + }, + { + "auxiliary_loss_clip": 0.01045712, + "auxiliary_loss_mlp": 0.0074772, + "balance_loss_clip": 1.025545, + "balance_loss_mlp": 1.00058866, + "epoch": 0.6457237336539907, + "flos": 23624086440960.0, + "grad_norm": 1.7163809536173855, + "language_loss": 0.71320814, + "learning_rate": 1.1781978442053324e-06, + "loss": 0.7311424, + "num_input_tokens_seen": 231756140, + "step": 10740, + "time_per_iteration": 2.8573052883148193 + }, + { + "auxiliary_loss_clip": 0.00993329, + "auxiliary_loss_mlp": 0.01002336, + "balance_loss_clip": 1.0060122, + "balance_loss_mlp": 1.00129271, + "epoch": 0.6457838569066586, + "flos": 65846023251840.0, + "grad_norm": 0.6746757957725902, + "language_loss": 0.55363065, + "learning_rate": 1.1778427953884733e-06, + "loss": 0.57358724, + "num_input_tokens_seen": 231823665, + "step": 10741, + "time_per_iteration": 3.3766942024230957 + }, + { + "auxiliary_loss_clip": 0.01061769, + "auxiliary_loss_mlp": 0.01030242, + "balance_loss_clip": 1.02509475, + "balance_loss_mlp": 1.01997805, + "epoch": 0.6458439801593266, + "flos": 22381972179840.0, + "grad_norm": 1.5558120307698637, + "language_loss": 0.80814409, + "learning_rate": 1.1774877777478977e-06, + "loss": 0.82906425, + "num_input_tokens_seen": 231844500, + "step": 10742, + "time_per_iteration": 2.6882195472717285 + }, + { + "auxiliary_loss_clip": 0.01037591, + "auxiliary_loss_mlp": 0.01027114, + "balance_loss_clip": 1.02200103, + "balance_loss_mlp": 1.01693976, + "epoch": 0.6459041034119946, + "flos": 24789243813120.0, + "grad_norm": 1.5843313451608119, + "language_loss": 0.81549793, + "learning_rate": 1.1771327912970678e-06, + "loss": 0.83614492, + "num_input_tokens_seen": 231864510, + "step": 10743, + "time_per_iteration": 2.9284005165100098 + }, + { + "auxiliary_loss_clip": 0.01042811, + "auxiliary_loss_mlp": 0.01023592, + "balance_loss_clip": 1.0248065, + "balance_loss_mlp": 1.01270247, + "epoch": 0.6459642266646626, + "flos": 18325358818560.0, + "grad_norm": 1.8642811241092205, + "language_loss": 0.71864933, + "learning_rate": 1.1767778360494453e-06, + "loss": 0.73931336, + "num_input_tokens_seen": 231881555, + "step": 10744, + "time_per_iteration": 2.895358085632324 + }, + { + "auxiliary_loss_clip": 0.01062108, + "auxiliary_loss_mlp": 0.01026596, + "balance_loss_clip": 1.02416527, + "balance_loss_mlp": 1.01647532, + "epoch": 0.6460243499173305, + "flos": 43581368891520.0, + "grad_norm": 1.9538379918687925, + "language_loss": 0.66210681, + "learning_rate": 1.1764229120184896e-06, + "loss": 0.68299389, + "num_input_tokens_seen": 231905945, + "step": 10745, + "time_per_iteration": 2.762701988220215 + }, + { + "auxiliary_loss_clip": 0.01048068, + "auxiliary_loss_mlp": 0.01031066, + "balance_loss_clip": 1.02284622, + "balance_loss_mlp": 1.01991355, + "epoch": 0.6460844731699985, + "flos": 19244026085760.0, + "grad_norm": 2.3499191826374615, + "language_loss": 0.73655427, + "learning_rate": 1.1760680192176597e-06, + "loss": 0.75734568, + "num_input_tokens_seen": 231922535, + "step": 10746, + "time_per_iteration": 2.721468687057495 + }, + { + "auxiliary_loss_clip": 0.01055958, + "auxiliary_loss_mlp": 0.01030301, + "balance_loss_clip": 1.02637219, + "balance_loss_mlp": 1.01964974, + "epoch": 0.6461445964226664, + "flos": 27453348668160.0, + "grad_norm": 1.4533978962764942, + "language_loss": 0.66536206, + "learning_rate": 1.175713157660413e-06, + "loss": 0.6862247, + "num_input_tokens_seen": 231944800, + "step": 10747, + "time_per_iteration": 2.834937810897827 + }, + { + "auxiliary_loss_clip": 0.01035222, + "auxiliary_loss_mlp": 0.01033786, + "balance_loss_clip": 1.02459717, + "balance_loss_mlp": 1.02327228, + "epoch": 0.6462047196753344, + "flos": 20295489934080.0, + "grad_norm": 2.181222553187999, + "language_loss": 0.67554116, + "learning_rate": 1.1753583273602056e-06, + "loss": 0.69623125, + "num_input_tokens_seen": 231962970, + "step": 10748, + "time_per_iteration": 2.709460973739624 + }, + { + "auxiliary_loss_clip": 0.01066097, + "auxiliary_loss_mlp": 0.01039722, + "balance_loss_clip": 1.02575171, + "balance_loss_mlp": 1.02790272, + "epoch": 0.6462648429280025, + "flos": 22018340845440.0, + "grad_norm": 1.6200371747494318, + "language_loss": 0.76235598, + "learning_rate": 1.1750035283304937e-06, + "loss": 0.78341419, + "num_input_tokens_seen": 231981195, + "step": 10749, + "time_per_iteration": 2.6644253730773926 + }, + { + "auxiliary_loss_clip": 0.01023585, + "auxiliary_loss_mlp": 0.01034212, + "balance_loss_clip": 1.02092481, + "balance_loss_mlp": 1.02240992, + "epoch": 0.6463249661806704, + "flos": 27781141207680.0, + "grad_norm": 1.6845502939979928, + "language_loss": 0.76950884, + "learning_rate": 1.17464876058473e-06, + "loss": 0.79008687, + "num_input_tokens_seen": 232001735, + "step": 10750, + "time_per_iteration": 2.902846336364746 + }, + { + "auxiliary_loss_clip": 0.01046089, + "auxiliary_loss_mlp": 0.01030264, + "balance_loss_clip": 1.02514124, + "balance_loss_mlp": 1.01796186, + "epoch": 0.6463850894333384, + "flos": 22050588280320.0, + "grad_norm": 2.1850253016715775, + "language_loss": 0.68897933, + "learning_rate": 1.1742940241363683e-06, + "loss": 0.70974284, + "num_input_tokens_seen": 232019830, + "step": 10751, + "time_per_iteration": 4.744516134262085 + }, + { + "auxiliary_loss_clip": 0.01035391, + "auxiliary_loss_mlp": 0.01030485, + "balance_loss_clip": 1.02351761, + "balance_loss_mlp": 1.01849222, + "epoch": 0.6464452126860063, + "flos": 21106245767040.0, + "grad_norm": 2.3514801341815947, + "language_loss": 0.71232134, + "learning_rate": 1.1739393189988604e-06, + "loss": 0.73298007, + "num_input_tokens_seen": 232039625, + "step": 10752, + "time_per_iteration": 2.9164552688598633 + }, + { + "auxiliary_loss_clip": 0.01029517, + "auxiliary_loss_mlp": 0.01036575, + "balance_loss_clip": 1.02171791, + "balance_loss_mlp": 1.02321792, + "epoch": 0.6465053359386743, + "flos": 16028045694720.0, + "grad_norm": 1.881678253733969, + "language_loss": 0.77944934, + "learning_rate": 1.1735846451856554e-06, + "loss": 0.80011028, + "num_input_tokens_seen": 232055855, + "step": 10753, + "time_per_iteration": 4.626652479171753 + }, + { + "auxiliary_loss_clip": 0.01063609, + "auxiliary_loss_mlp": 0.01038149, + "balance_loss_clip": 1.0249815, + "balance_loss_mlp": 1.02687788, + "epoch": 0.6465654591913422, + "flos": 23398674641280.0, + "grad_norm": 1.991922785642471, + "language_loss": 0.85088968, + "learning_rate": 1.1732300027102041e-06, + "loss": 0.87190723, + "num_input_tokens_seen": 232073475, + "step": 10754, + "time_per_iteration": 2.859562635421753 + }, + { + "auxiliary_loss_clip": 0.01037545, + "auxiliary_loss_mlp": 0.01032729, + "balance_loss_clip": 1.02262902, + "balance_loss_mlp": 1.02163637, + "epoch": 0.6466255824440102, + "flos": 15377273038080.0, + "grad_norm": 2.357425894741987, + "language_loss": 0.59721816, + "learning_rate": 1.1728753915859541e-06, + "loss": 0.61792088, + "num_input_tokens_seen": 232091090, + "step": 10755, + "time_per_iteration": 2.7492854595184326 + }, + { + "auxiliary_loss_clip": 0.01017945, + "auxiliary_loss_mlp": 0.01029887, + "balance_loss_clip": 1.02314591, + "balance_loss_mlp": 1.01819265, + "epoch": 0.6466857056966782, + "flos": 16252846963200.0, + "grad_norm": 2.38676374886498, + "language_loss": 0.68044865, + "learning_rate": 1.1725208118263518e-06, + "loss": 0.70092702, + "num_input_tokens_seen": 232107320, + "step": 10756, + "time_per_iteration": 2.872906446456909 + }, + { + "auxiliary_loss_clip": 0.01024416, + "auxiliary_loss_mlp": 0.01037715, + "balance_loss_clip": 1.0252918, + "balance_loss_mlp": 1.02454829, + "epoch": 0.6467458289493462, + "flos": 21178246579200.0, + "grad_norm": 2.3888657218921314, + "language_loss": 0.73963577, + "learning_rate": 1.172166263444844e-06, + "loss": 0.76025707, + "num_input_tokens_seen": 232123930, + "step": 10757, + "time_per_iteration": 2.809128522872925 + }, + { + "auxiliary_loss_clip": 0.01020838, + "auxiliary_loss_mlp": 0.01031161, + "balance_loss_clip": 1.02578926, + "balance_loss_mlp": 1.0200808, + "epoch": 0.6468059522020141, + "flos": 17968299672960.0, + "grad_norm": 1.4741211371791239, + "language_loss": 0.74450129, + "learning_rate": 1.1718117464548734e-06, + "loss": 0.7650212, + "num_input_tokens_seen": 232142905, + "step": 10758, + "time_per_iteration": 2.807835817337036 + }, + { + "auxiliary_loss_clip": 0.01035407, + "auxiliary_loss_mlp": 0.01033231, + "balance_loss_clip": 1.02679825, + "balance_loss_mlp": 1.02075601, + "epoch": 0.6468660754546821, + "flos": 17890157635200.0, + "grad_norm": 1.9719002013288633, + "language_loss": 0.67612362, + "learning_rate": 1.1714572608698845e-06, + "loss": 0.69681001, + "num_input_tokens_seen": 232162230, + "step": 10759, + "time_per_iteration": 2.7260797023773193 + }, + { + "auxiliary_loss_clip": 0.01026437, + "auxiliary_loss_mlp": 0.0103196, + "balance_loss_clip": 1.02122903, + "balance_loss_mlp": 1.01989615, + "epoch": 0.64692619870735, + "flos": 22600991358720.0, + "grad_norm": 1.5354188242505877, + "language_loss": 0.75319558, + "learning_rate": 1.1711028067033197e-06, + "loss": 0.77377951, + "num_input_tokens_seen": 232182700, + "step": 10760, + "time_per_iteration": 2.717189073562622 + }, + { + "auxiliary_loss_clip": 0.01035077, + "auxiliary_loss_mlp": 0.01033771, + "balance_loss_clip": 1.02098775, + "balance_loss_mlp": 1.02209449, + "epoch": 0.646986321960018, + "flos": 49600786993920.0, + "grad_norm": 1.6304628465772926, + "language_loss": 0.65479583, + "learning_rate": 1.1707483839686194e-06, + "loss": 0.6754843, + "num_input_tokens_seen": 232208235, + "step": 10761, + "time_per_iteration": 2.9901514053344727 + }, + { + "auxiliary_loss_clip": 0.01026033, + "auxiliary_loss_mlp": 0.01026909, + "balance_loss_clip": 1.02564502, + "balance_loss_mlp": 1.01528049, + "epoch": 0.6470464452126861, + "flos": 21908454163200.0, + "grad_norm": 1.8535806745570356, + "language_loss": 0.69357735, + "learning_rate": 1.1703939926792235e-06, + "loss": 0.7141068, + "num_input_tokens_seen": 232228720, + "step": 10762, + "time_per_iteration": 2.8385679721832275 + }, + { + "auxiliary_loss_clip": 0.01066177, + "auxiliary_loss_mlp": 0.01030068, + "balance_loss_clip": 1.02580142, + "balance_loss_mlp": 1.01862454, + "epoch": 0.647106568465354, + "flos": 18106124158080.0, + "grad_norm": 4.075316620182987, + "language_loss": 0.82390726, + "learning_rate": 1.1700396328485705e-06, + "loss": 0.84486967, + "num_input_tokens_seen": 232244655, + "step": 10763, + "time_per_iteration": 2.5385701656341553 + }, + { + "auxiliary_loss_clip": 0.01006438, + "auxiliary_loss_mlp": 0.01002458, + "balance_loss_clip": 1.00082326, + "balance_loss_mlp": 1.00143874, + "epoch": 0.647166691718022, + "flos": 69480038125440.0, + "grad_norm": 0.7098231865533282, + "language_loss": 0.57737714, + "learning_rate": 1.1696853044900978e-06, + "loss": 0.59746611, + "num_input_tokens_seen": 232308685, + "step": 10764, + "time_per_iteration": 3.381915330886841 + }, + { + "auxiliary_loss_clip": 0.0102552, + "auxiliary_loss_mlp": 0.01028612, + "balance_loss_clip": 1.02127123, + "balance_loss_mlp": 1.01728129, + "epoch": 0.6472268149706899, + "flos": 34095170661120.0, + "grad_norm": 1.9817598253362767, + "language_loss": 0.60517073, + "learning_rate": 1.1693310076172413e-06, + "loss": 0.62571204, + "num_input_tokens_seen": 232327520, + "step": 10765, + "time_per_iteration": 2.8121438026428223 + }, + { + "auxiliary_loss_clip": 0.01064115, + "auxiliary_loss_mlp": 0.01027261, + "balance_loss_clip": 1.02541494, + "balance_loss_mlp": 1.01630557, + "epoch": 0.6472869382233579, + "flos": 28111232217600.0, + "grad_norm": 1.7683113843017386, + "language_loss": 0.62676704, + "learning_rate": 1.168976742243437e-06, + "loss": 0.64768082, + "num_input_tokens_seen": 232349025, + "step": 10766, + "time_per_iteration": 4.505815029144287 + }, + { + "auxiliary_loss_clip": 0.01037363, + "auxiliary_loss_mlp": 0.01030475, + "balance_loss_clip": 1.02369928, + "balance_loss_mlp": 1.01823771, + "epoch": 0.6473470614760258, + "flos": 22492146170880.0, + "grad_norm": 1.5083632285199975, + "language_loss": 0.75270808, + "learning_rate": 1.1686225083821174e-06, + "loss": 0.77338642, + "num_input_tokens_seen": 232367835, + "step": 10767, + "time_per_iteration": 2.8027071952819824 + }, + { + "auxiliary_loss_clip": 0.01044669, + "auxiliary_loss_mlp": 0.01035463, + "balance_loss_clip": 1.02453375, + "balance_loss_mlp": 1.02440035, + "epoch": 0.6474071847286939, + "flos": 14538938538240.0, + "grad_norm": 1.8271040224233592, + "language_loss": 0.77976251, + "learning_rate": 1.1682683060467153e-06, + "loss": 0.80056381, + "num_input_tokens_seen": 232385840, + "step": 10768, + "time_per_iteration": 2.681171417236328 + }, + { + "auxiliary_loss_clip": 0.0100951, + "auxiliary_loss_mlp": 0.01028323, + "balance_loss_clip": 1.0243206, + "balance_loss_mlp": 1.01651573, + "epoch": 0.6474673079813618, + "flos": 24098214988800.0, + "grad_norm": 1.6040512095167645, + "language_loss": 0.72049272, + "learning_rate": 1.167914135250663e-06, + "loss": 0.74087107, + "num_input_tokens_seen": 232406205, + "step": 10769, + "time_per_iteration": 2.768714189529419 + }, + { + "auxiliary_loss_clip": 0.01061573, + "auxiliary_loss_mlp": 0.0103255, + "balance_loss_clip": 1.0249095, + "balance_loss_mlp": 1.02157688, + "epoch": 0.6475274312340298, + "flos": 14976186796800.0, + "grad_norm": 1.7967675013916007, + "language_loss": 0.7211597, + "learning_rate": 1.1675599960073895e-06, + "loss": 0.74210089, + "num_input_tokens_seen": 232424995, + "step": 10770, + "time_per_iteration": 2.5669331550598145 + }, + { + "auxiliary_loss_clip": 0.01021402, + "auxiliary_loss_mlp": 0.0103132, + "balance_loss_clip": 1.02358437, + "balance_loss_mlp": 1.01902306, + "epoch": 0.6475875544866977, + "flos": 25045322849280.0, + "grad_norm": 1.5848757581748956, + "language_loss": 0.73024118, + "learning_rate": 1.167205888330325e-06, + "loss": 0.75076836, + "num_input_tokens_seen": 232445870, + "step": 10771, + "time_per_iteration": 2.899336338043213 + }, + { + "auxiliary_loss_clip": 0.01022829, + "auxiliary_loss_mlp": 0.0103425, + "balance_loss_clip": 1.02187037, + "balance_loss_mlp": 1.02237654, + "epoch": 0.6476476777393657, + "flos": 16472153450880.0, + "grad_norm": 2.0359242190895586, + "language_loss": 0.73957336, + "learning_rate": 1.1668518122328958e-06, + "loss": 0.76014417, + "num_input_tokens_seen": 232464285, + "step": 10772, + "time_per_iteration": 2.813546895980835 + }, + { + "auxiliary_loss_clip": 0.01042747, + "auxiliary_loss_mlp": 0.0102709, + "balance_loss_clip": 1.02449739, + "balance_loss_mlp": 1.01737452, + "epoch": 0.6477078009920336, + "flos": 25812267068160.0, + "grad_norm": 1.69277379238291, + "language_loss": 0.82996345, + "learning_rate": 1.1664977677285305e-06, + "loss": 0.85066175, + "num_input_tokens_seen": 232485815, + "step": 10773, + "time_per_iteration": 2.731955051422119 + }, + { + "auxiliary_loss_clip": 0.01049648, + "auxiliary_loss_mlp": 0.00747611, + "balance_loss_clip": 1.02273273, + "balance_loss_mlp": 1.00060034, + "epoch": 0.6477679242447016, + "flos": 17676130446720.0, + "grad_norm": 2.3431131283106126, + "language_loss": 0.78317964, + "learning_rate": 1.1661437548306524e-06, + "loss": 0.80115229, + "num_input_tokens_seen": 232504875, + "step": 10774, + "time_per_iteration": 4.380530118942261 + }, + { + "auxiliary_loss_clip": 0.01054533, + "auxiliary_loss_mlp": 0.01032856, + "balance_loss_clip": 1.02492952, + "balance_loss_mlp": 1.02139425, + "epoch": 0.6478280474973696, + "flos": 21032305620480.0, + "grad_norm": 2.0989859529936714, + "language_loss": 0.68928403, + "learning_rate": 1.1657897735526867e-06, + "loss": 0.71015793, + "num_input_tokens_seen": 232521945, + "step": 10775, + "time_per_iteration": 2.595829725265503 + }, + { + "auxiliary_loss_clip": 0.01028986, + "auxiliary_loss_mlp": 0.01035024, + "balance_loss_clip": 1.02373481, + "balance_loss_mlp": 1.02349663, + "epoch": 0.6478881707500376, + "flos": 21616931381760.0, + "grad_norm": 1.871639432227451, + "language_loss": 0.66026944, + "learning_rate": 1.1654358239080574e-06, + "loss": 0.68090951, + "num_input_tokens_seen": 232541500, + "step": 10776, + "time_per_iteration": 2.7190518379211426 + }, + { + "auxiliary_loss_clip": 0.0103496, + "auxiliary_loss_mlp": 0.01036515, + "balance_loss_clip": 1.02178204, + "balance_loss_mlp": 1.02443957, + "epoch": 0.6479482940027056, + "flos": 18442571875200.0, + "grad_norm": 2.6850393412619944, + "language_loss": 0.79270482, + "learning_rate": 1.1650819059101839e-06, + "loss": 0.81341958, + "num_input_tokens_seen": 232559720, + "step": 10777, + "time_per_iteration": 2.641629934310913 + }, + { + "auxiliary_loss_clip": 0.0105666, + "auxiliary_loss_mlp": 0.01034644, + "balance_loss_clip": 1.02724648, + "balance_loss_mlp": 1.02288389, + "epoch": 0.6480084172553735, + "flos": 22164066322560.0, + "grad_norm": 1.8980391458969321, + "language_loss": 0.73727489, + "learning_rate": 1.1647280195724896e-06, + "loss": 0.75818789, + "num_input_tokens_seen": 232579370, + "step": 10778, + "time_per_iteration": 2.7048556804656982 + }, + { + "auxiliary_loss_clip": 0.01050273, + "auxiliary_loss_mlp": 0.01028004, + "balance_loss_clip": 1.02236867, + "balance_loss_mlp": 1.01683378, + "epoch": 0.6480685405080415, + "flos": 24316228586880.0, + "grad_norm": 1.3793177093290345, + "language_loss": 0.77930605, + "learning_rate": 1.1643741649083923e-06, + "loss": 0.80008876, + "num_input_tokens_seen": 232600495, + "step": 10779, + "time_per_iteration": 2.6791024208068848 + }, + { + "auxiliary_loss_clip": 0.01002544, + "auxiliary_loss_mlp": 0.01002886, + "balance_loss_clip": 1.00633764, + "balance_loss_mlp": 1.00156295, + "epoch": 0.6481286637607094, + "flos": 59891207760000.0, + "grad_norm": 0.7237522749496561, + "language_loss": 0.59424734, + "learning_rate": 1.1640203419313095e-06, + "loss": 0.61430168, + "num_input_tokens_seen": 232663165, + "step": 10780, + "time_per_iteration": 3.19401216506958 + }, + { + "auxiliary_loss_clip": 0.00968153, + "auxiliary_loss_mlp": 0.01029694, + "balance_loss_clip": 1.02139533, + "balance_loss_mlp": 1.01878643, + "epoch": 0.6481887870133775, + "flos": 25484187219840.0, + "grad_norm": 6.843238566425997, + "language_loss": 0.79292643, + "learning_rate": 1.1636665506546599e-06, + "loss": 0.81290495, + "num_input_tokens_seen": 232683385, + "step": 10781, + "time_per_iteration": 3.080543279647827 + }, + { + "auxiliary_loss_clip": 0.01067154, + "auxiliary_loss_mlp": 0.01036577, + "balance_loss_clip": 1.02566743, + "balance_loss_mlp": 1.02361858, + "epoch": 0.6482489102660454, + "flos": 19930206574080.0, + "grad_norm": 3.489455441908943, + "language_loss": 0.79013205, + "learning_rate": 1.1633127910918578e-06, + "loss": 0.81116939, + "num_input_tokens_seen": 232699095, + "step": 10782, + "time_per_iteration": 2.7997756004333496 + }, + { + "auxiliary_loss_clip": 0.01057177, + "auxiliary_loss_mlp": 0.00747807, + "balance_loss_clip": 1.02704418, + "balance_loss_mlp": 1.00059342, + "epoch": 0.6483090335187134, + "flos": 26979471515520.0, + "grad_norm": 3.091588598114111, + "language_loss": 0.63998455, + "learning_rate": 1.1629590632563187e-06, + "loss": 0.65803432, + "num_input_tokens_seen": 232717920, + "step": 10783, + "time_per_iteration": 2.740394115447998 + }, + { + "auxiliary_loss_clip": 0.01067182, + "auxiliary_loss_mlp": 0.01034487, + "balance_loss_clip": 1.02679753, + "balance_loss_mlp": 1.02192235, + "epoch": 0.6483691567713813, + "flos": 25077965333760.0, + "grad_norm": 1.7899877676087983, + "language_loss": 0.88749278, + "learning_rate": 1.1626053671614561e-06, + "loss": 0.90850949, + "num_input_tokens_seen": 232737605, + "step": 10784, + "time_per_iteration": 2.6388697624206543 + }, + { + "auxiliary_loss_clip": 0.01043556, + "auxiliary_loss_mlp": 0.0103101, + "balance_loss_clip": 1.02421045, + "balance_loss_mlp": 1.01913118, + "epoch": 0.6484292800240493, + "flos": 16105972250880.0, + "grad_norm": 2.843241549612124, + "language_loss": 0.72855276, + "learning_rate": 1.1622517028206815e-06, + "loss": 0.74929845, + "num_input_tokens_seen": 232755110, + "step": 10785, + "time_per_iteration": 2.6677749156951904 + }, + { + "auxiliary_loss_clip": 0.01031567, + "auxiliary_loss_mlp": 0.01027195, + "balance_loss_clip": 1.02385283, + "balance_loss_mlp": 1.01644206, + "epoch": 0.6484894032767172, + "flos": 28840398307200.0, + "grad_norm": 1.5994333725303922, + "language_loss": 0.69131011, + "learning_rate": 1.1618980702474071e-06, + "loss": 0.71189773, + "num_input_tokens_seen": 232779040, + "step": 10786, + "time_per_iteration": 2.7441866397857666 + }, + { + "auxiliary_loss_clip": 0.01035777, + "auxiliary_loss_mlp": 0.01028204, + "balance_loss_clip": 1.02591395, + "balance_loss_mlp": 1.01717138, + "epoch": 0.6485495265293852, + "flos": 30227052896640.0, + "grad_norm": 1.7493892859031563, + "language_loss": 0.71333587, + "learning_rate": 1.161544469455041e-06, + "loss": 0.73397577, + "num_input_tokens_seen": 232800515, + "step": 10787, + "time_per_iteration": 2.997485876083374 + }, + { + "auxiliary_loss_clip": 0.0106728, + "auxiliary_loss_mlp": 0.01030989, + "balance_loss_clip": 1.0257529, + "balance_loss_mlp": 1.01920474, + "epoch": 0.6486096497820532, + "flos": 20082181017600.0, + "grad_norm": 2.2528274580838925, + "language_loss": 0.84204054, + "learning_rate": 1.1611909004569934e-06, + "loss": 0.86302328, + "num_input_tokens_seen": 232818450, + "step": 10788, + "time_per_iteration": 3.2034618854522705 + }, + { + "auxiliary_loss_clip": 0.01027357, + "auxiliary_loss_mlp": 0.01027521, + "balance_loss_clip": 1.02465427, + "balance_loss_mlp": 1.01539195, + "epoch": 0.6486697730347212, + "flos": 17129067333120.0, + "grad_norm": 1.8080719220996015, + "language_loss": 0.77287292, + "learning_rate": 1.1608373632666708e-06, + "loss": 0.79342175, + "num_input_tokens_seen": 232834785, + "step": 10789, + "time_per_iteration": 2.714682102203369 + }, + { + "auxiliary_loss_clip": 0.01046374, + "auxiliary_loss_mlp": 0.01029479, + "balance_loss_clip": 1.02237463, + "balance_loss_mlp": 1.01859522, + "epoch": 0.6487298962873892, + "flos": 38911940570880.0, + "grad_norm": 1.54439501213858, + "language_loss": 0.75462449, + "learning_rate": 1.160483857897479e-06, + "loss": 0.77538306, + "num_input_tokens_seen": 232856050, + "step": 10790, + "time_per_iteration": 2.8284285068511963 + }, + { + "auxiliary_loss_clip": 0.01063373, + "auxiliary_loss_mlp": 0.01028876, + "balance_loss_clip": 1.02572715, + "balance_loss_mlp": 1.01875544, + "epoch": 0.6487900195400571, + "flos": 11947840076160.0, + "grad_norm": 2.9947212962301095, + "language_loss": 0.59967297, + "learning_rate": 1.160130384362823e-06, + "loss": 0.62059546, + "num_input_tokens_seen": 232873945, + "step": 10791, + "time_per_iteration": 2.571101188659668 + }, + { + "auxiliary_loss_clip": 0.01034983, + "auxiliary_loss_mlp": 0.0103156, + "balance_loss_clip": 1.02478969, + "balance_loss_mlp": 1.02006817, + "epoch": 0.6488501427927251, + "flos": 22344445445760.0, + "grad_norm": 1.8167485387734765, + "language_loss": 0.85969639, + "learning_rate": 1.1597769426761082e-06, + "loss": 0.8803618, + "num_input_tokens_seen": 232892160, + "step": 10792, + "time_per_iteration": 2.8574435710906982 + }, + { + "auxiliary_loss_clip": 0.01041011, + "auxiliary_loss_mlp": 0.0103363, + "balance_loss_clip": 1.02554178, + "balance_loss_mlp": 1.0219295, + "epoch": 0.648910266045393, + "flos": 22236282616320.0, + "grad_norm": 1.8985593377974295, + "language_loss": 0.78326631, + "learning_rate": 1.159423532850735e-06, + "loss": 0.80401272, + "num_input_tokens_seen": 232911725, + "step": 10793, + "time_per_iteration": 2.889737844467163 + }, + { + "auxiliary_loss_clip": 0.01033846, + "auxiliary_loss_mlp": 0.01026633, + "balance_loss_clip": 1.02497411, + "balance_loss_mlp": 1.01543331, + "epoch": 0.6489703892980611, + "flos": 25301258231040.0, + "grad_norm": 1.8822323086011632, + "language_loss": 0.74983597, + "learning_rate": 1.1590701549001055e-06, + "loss": 0.77044082, + "num_input_tokens_seen": 232929085, + "step": 10794, + "time_per_iteration": 2.8626492023468018 + }, + { + "auxiliary_loss_clip": 0.01052193, + "auxiliary_loss_mlp": 0.00747875, + "balance_loss_clip": 1.0230844, + "balance_loss_mlp": 1.00069499, + "epoch": 0.649030512550729, + "flos": 24571912573440.0, + "grad_norm": 1.6448835565474569, + "language_loss": 0.69969779, + "learning_rate": 1.158716808837621e-06, + "loss": 0.71769845, + "num_input_tokens_seen": 232949455, + "step": 10795, + "time_per_iteration": 2.8171989917755127 + }, + { + "auxiliary_loss_clip": 0.01040841, + "auxiliary_loss_mlp": 0.01030118, + "balance_loss_clip": 1.02377808, + "balance_loss_mlp": 1.01800072, + "epoch": 0.649090635803397, + "flos": 26244702904320.0, + "grad_norm": 1.6631332610545349, + "language_loss": 0.53921843, + "learning_rate": 1.158363494676679e-06, + "loss": 0.559928, + "num_input_tokens_seen": 232969445, + "step": 10796, + "time_per_iteration": 2.905458688735962 + }, + { + "auxiliary_loss_clip": 0.0105322, + "auxiliary_loss_mlp": 0.01027032, + "balance_loss_clip": 1.02472782, + "balance_loss_mlp": 1.01631546, + "epoch": 0.6491507590560649, + "flos": 24937375501440.0, + "grad_norm": 1.3861642747576979, + "language_loss": 0.77463651, + "learning_rate": 1.1580102124306775e-06, + "loss": 0.795439, + "num_input_tokens_seen": 232988900, + "step": 10797, + "time_per_iteration": 2.651341676712036 + }, + { + "auxiliary_loss_clip": 0.01020126, + "auxiliary_loss_mlp": 0.01027973, + "balance_loss_clip": 1.02435422, + "balance_loss_mlp": 1.01786971, + "epoch": 0.6492108823087329, + "flos": 19499781899520.0, + "grad_norm": 2.3502185199941246, + "language_loss": 0.70555162, + "learning_rate": 1.1576569621130134e-06, + "loss": 0.72603267, + "num_input_tokens_seen": 233005060, + "step": 10798, + "time_per_iteration": 2.8894660472869873 + }, + { + "auxiliary_loss_clip": 0.01008438, + "auxiliary_loss_mlp": 0.01027704, + "balance_loss_clip": 1.02071512, + "balance_loss_mlp": 1.01733291, + "epoch": 0.6492710055614008, + "flos": 19719303868800.0, + "grad_norm": 1.9662337646581258, + "language_loss": 0.77148998, + "learning_rate": 1.1573037437370811e-06, + "loss": 0.7918514, + "num_input_tokens_seen": 233023375, + "step": 10799, + "time_per_iteration": 4.549810171127319 + }, + { + "auxiliary_loss_clip": 0.01054997, + "auxiliary_loss_mlp": 0.01032955, + "balance_loss_clip": 1.02410698, + "balance_loss_mlp": 1.02133811, + "epoch": 0.6493311288140688, + "flos": 24317018686080.0, + "grad_norm": 1.9947525034200648, + "language_loss": 0.71853495, + "learning_rate": 1.1569505573162755e-06, + "loss": 0.73941451, + "num_input_tokens_seen": 233043130, + "step": 10800, + "time_per_iteration": 4.496287822723389 + }, + { + "auxiliary_loss_clip": 0.00997196, + "auxiliary_loss_mlp": 0.01006153, + "balance_loss_clip": 1.0014708, + "balance_loss_mlp": 1.00495517, + "epoch": 0.6493912520667368, + "flos": 70934635290240.0, + "grad_norm": 0.7640912576318564, + "language_loss": 0.60217118, + "learning_rate": 1.1565974028639897e-06, + "loss": 0.62220466, + "num_input_tokens_seen": 233110560, + "step": 10801, + "time_per_iteration": 3.316159725189209 + }, + { + "auxiliary_loss_clip": 0.01051377, + "auxiliary_loss_mlp": 0.01035699, + "balance_loss_clip": 1.02545011, + "balance_loss_mlp": 1.02326584, + "epoch": 0.6494513753194048, + "flos": 25337779384320.0, + "grad_norm": 1.7335473514933155, + "language_loss": 0.78740555, + "learning_rate": 1.156244280393614e-06, + "loss": 0.8082763, + "num_input_tokens_seen": 233130080, + "step": 10802, + "time_per_iteration": 2.8515021800994873 + }, + { + "auxiliary_loss_clip": 0.01063176, + "auxiliary_loss_mlp": 0.01034988, + "balance_loss_clip": 1.02326095, + "balance_loss_mlp": 1.02319837, + "epoch": 0.6495114985720728, + "flos": 24681978823680.0, + "grad_norm": 1.7095227519378537, + "language_loss": 0.74640054, + "learning_rate": 1.155891189918541e-06, + "loss": 0.7673822, + "num_input_tokens_seen": 233150235, + "step": 10803, + "time_per_iteration": 2.7293319702148438 + }, + { + "auxiliary_loss_clip": 0.01007351, + "auxiliary_loss_mlp": 0.01031079, + "balance_loss_clip": 1.02240443, + "balance_loss_mlp": 1.01961112, + "epoch": 0.6495716218247407, + "flos": 23651162317440.0, + "grad_norm": 2.050999870583766, + "language_loss": 0.70139265, + "learning_rate": 1.1555381314521578e-06, + "loss": 0.7217769, + "num_input_tokens_seen": 233166710, + "step": 10804, + "time_per_iteration": 3.040571451187134 + }, + { + "auxiliary_loss_clip": 0.01054436, + "auxiliary_loss_mlp": 0.01031303, + "balance_loss_clip": 1.0252049, + "balance_loss_mlp": 1.01907814, + "epoch": 0.6496317450774087, + "flos": 22346169298560.0, + "grad_norm": 1.6725846922378051, + "language_loss": 0.72719175, + "learning_rate": 1.1551851050078537e-06, + "loss": 0.74804914, + "num_input_tokens_seen": 233185445, + "step": 10805, + "time_per_iteration": 2.8232553005218506 + }, + { + "auxiliary_loss_clip": 0.01045183, + "auxiliary_loss_mlp": 0.01027836, + "balance_loss_clip": 1.02544832, + "balance_loss_mlp": 1.01679146, + "epoch": 0.6496918683300766, + "flos": 30518647505280.0, + "grad_norm": 2.0516931324701613, + "language_loss": 0.65715051, + "learning_rate": 1.1548321105990155e-06, + "loss": 0.6778807, + "num_input_tokens_seen": 233205805, + "step": 10806, + "time_per_iteration": 2.8611881732940674 + }, + { + "auxiliary_loss_clip": 0.0104389, + "auxiliary_loss_mlp": 0.00747931, + "balance_loss_clip": 1.02322149, + "balance_loss_mlp": 1.00074649, + "epoch": 0.6497519915827447, + "flos": 12458992567680.0, + "grad_norm": 2.0728522211073312, + "language_loss": 0.78548563, + "learning_rate": 1.1544791482390275e-06, + "loss": 0.80340385, + "num_input_tokens_seen": 233224215, + "step": 10807, + "time_per_iteration": 2.6210808753967285 + }, + { + "auxiliary_loss_clip": 0.00988498, + "auxiliary_loss_mlp": 0.01000908, + "balance_loss_clip": 1.00196302, + "balance_loss_mlp": 0.99982876, + "epoch": 0.6498121148354126, + "flos": 69093748287360.0, + "grad_norm": 0.7927855815584446, + "language_loss": 0.58928382, + "learning_rate": 1.1541262179412745e-06, + "loss": 0.60917795, + "num_input_tokens_seen": 233294440, + "step": 10808, + "time_per_iteration": 3.3853390216827393 + }, + { + "auxiliary_loss_clip": 0.01043484, + "auxiliary_loss_mlp": 0.01023186, + "balance_loss_clip": 1.02571654, + "balance_loss_mlp": 1.01273155, + "epoch": 0.6498722380880806, + "flos": 36897135914880.0, + "grad_norm": 1.7027795879377214, + "language_loss": 0.63353992, + "learning_rate": 1.1537733197191415e-06, + "loss": 0.65420663, + "num_input_tokens_seen": 233316125, + "step": 10809, + "time_per_iteration": 2.7849771976470947 + }, + { + "auxiliary_loss_clip": 0.01051348, + "auxiliary_loss_mlp": 0.00747678, + "balance_loss_clip": 1.024845, + "balance_loss_mlp": 1.00060141, + "epoch": 0.6499323613407485, + "flos": 29017760688000.0, + "grad_norm": 1.7139453775255458, + "language_loss": 0.81477672, + "learning_rate": 1.153420453586008e-06, + "loss": 0.83276701, + "num_input_tokens_seen": 233336140, + "step": 10810, + "time_per_iteration": 2.669290781021118 + }, + { + "auxiliary_loss_clip": 0.01023584, + "auxiliary_loss_mlp": 0.01030059, + "balance_loss_clip": 1.02387249, + "balance_loss_mlp": 1.02015877, + "epoch": 0.6499924845934165, + "flos": 20119240874880.0, + "grad_norm": 1.5881257254579642, + "language_loss": 0.72081971, + "learning_rate": 1.1530676195552561e-06, + "loss": 0.74135613, + "num_input_tokens_seen": 233356095, + "step": 10811, + "time_per_iteration": 2.668055772781372 + }, + { + "auxiliary_loss_clip": 0.01016846, + "auxiliary_loss_mlp": 0.01025767, + "balance_loss_clip": 1.02873182, + "balance_loss_mlp": 1.01521075, + "epoch": 0.6500526078460844, + "flos": 24421338760320.0, + "grad_norm": 1.4951196522656967, + "language_loss": 0.77487296, + "learning_rate": 1.1527148176402649e-06, + "loss": 0.79529905, + "num_input_tokens_seen": 233376830, + "step": 10812, + "time_per_iteration": 2.8713808059692383 + }, + { + "auxiliary_loss_clip": 0.01055753, + "auxiliary_loss_mlp": 0.01031566, + "balance_loss_clip": 1.02591634, + "balance_loss_mlp": 1.02007985, + "epoch": 0.6501127310987524, + "flos": 23331019374720.0, + "grad_norm": 2.1774640124492617, + "language_loss": 0.85229111, + "learning_rate": 1.152362047854413e-06, + "loss": 0.87316436, + "num_input_tokens_seen": 233395275, + "step": 10813, + "time_per_iteration": 4.259962797164917 + }, + { + "auxiliary_loss_clip": 0.01019711, + "auxiliary_loss_mlp": 0.01032122, + "balance_loss_clip": 1.02112412, + "balance_loss_mlp": 1.02027261, + "epoch": 0.6501728543514204, + "flos": 18697824898560.0, + "grad_norm": 1.9313659237526042, + "language_loss": 0.80063426, + "learning_rate": 1.1520093102110764e-06, + "loss": 0.82115257, + "num_input_tokens_seen": 233413345, + "step": 10814, + "time_per_iteration": 2.8094046115875244 + }, + { + "auxiliary_loss_clip": 0.01021917, + "auxiliary_loss_mlp": 0.00747962, + "balance_loss_clip": 1.02595592, + "balance_loss_mlp": 1.00071883, + "epoch": 0.6502329776040884, + "flos": 44199858199680.0, + "grad_norm": 1.625391988528622, + "language_loss": 0.65499699, + "learning_rate": 1.1516566047236328e-06, + "loss": 0.67269576, + "num_input_tokens_seen": 233436105, + "step": 10815, + "time_per_iteration": 3.009305953979492 + }, + { + "auxiliary_loss_clip": 0.01067073, + "auxiliary_loss_mlp": 0.01032804, + "balance_loss_clip": 1.02567768, + "balance_loss_mlp": 1.01977444, + "epoch": 0.6502931008567564, + "flos": 14574741419520.0, + "grad_norm": 2.5211786999952777, + "language_loss": 0.75181073, + "learning_rate": 1.1513039314054546e-06, + "loss": 0.77280951, + "num_input_tokens_seen": 233452320, + "step": 10816, + "time_per_iteration": 2.628145456314087 + }, + { + "auxiliary_loss_clip": 0.01034098, + "auxiliary_loss_mlp": 0.01026602, + "balance_loss_clip": 1.02312279, + "balance_loss_mlp": 1.01567054, + "epoch": 0.6503532241094243, + "flos": 21395003201280.0, + "grad_norm": 1.7226756489924708, + "language_loss": 0.73228008, + "learning_rate": 1.1509512902699174e-06, + "loss": 0.75288707, + "num_input_tokens_seen": 233469920, + "step": 10817, + "time_per_iteration": 2.7942888736724854 + }, + { + "auxiliary_loss_clip": 0.0101788, + "auxiliary_loss_mlp": 0.01035511, + "balance_loss_clip": 1.02012038, + "balance_loss_mlp": 1.0224154, + "epoch": 0.6504133473620923, + "flos": 74740840986240.0, + "grad_norm": 1.4067689071349496, + "language_loss": 0.72024226, + "learning_rate": 1.1505986813303916e-06, + "loss": 0.74077618, + "num_input_tokens_seen": 233499780, + "step": 10818, + "time_per_iteration": 3.0881853103637695 + }, + { + "auxiliary_loss_clip": 0.0103576, + "auxiliary_loss_mlp": 0.0102808, + "balance_loss_clip": 1.02574086, + "balance_loss_mlp": 1.01672494, + "epoch": 0.6504734706147602, + "flos": 19713270384000.0, + "grad_norm": 1.9263309309381438, + "language_loss": 0.64627254, + "learning_rate": 1.150246104600249e-06, + "loss": 0.66691101, + "num_input_tokens_seen": 233518235, + "step": 10819, + "time_per_iteration": 2.7161412239074707 + }, + { + "auxiliary_loss_clip": 0.01027678, + "auxiliary_loss_mlp": 0.01030388, + "balance_loss_clip": 1.02245772, + "balance_loss_mlp": 1.01848459, + "epoch": 0.6505335938674283, + "flos": 25556870390400.0, + "grad_norm": 1.8701204447747337, + "language_loss": 0.83680528, + "learning_rate": 1.14989356009286e-06, + "loss": 0.85738593, + "num_input_tokens_seen": 233535215, + "step": 10820, + "time_per_iteration": 2.7626242637634277 + }, + { + "auxiliary_loss_clip": 0.01058202, + "auxiliary_loss_mlp": 0.01026095, + "balance_loss_clip": 1.02630639, + "balance_loss_mlp": 1.01455545, + "epoch": 0.6505937171200962, + "flos": 17821424960640.0, + "grad_norm": 2.26414707986665, + "language_loss": 0.77373654, + "learning_rate": 1.1495410478215914e-06, + "loss": 0.79457951, + "num_input_tokens_seen": 233552775, + "step": 10821, + "time_per_iteration": 4.184399366378784 + }, + { + "auxiliary_loss_clip": 0.01030635, + "auxiliary_loss_mlp": 0.01026944, + "balance_loss_clip": 1.02437174, + "balance_loss_mlp": 1.01746058, + "epoch": 0.6506538403727642, + "flos": 20668135582080.0, + "grad_norm": 1.549202459706928, + "language_loss": 0.80171901, + "learning_rate": 1.1491885677998126e-06, + "loss": 0.82229483, + "num_input_tokens_seen": 233572080, + "step": 10822, + "time_per_iteration": 2.6984317302703857 + }, + { + "auxiliary_loss_clip": 0.01036858, + "auxiliary_loss_mlp": 0.0102664, + "balance_loss_clip": 1.02727485, + "balance_loss_mlp": 1.01606011, + "epoch": 0.6507139636254321, + "flos": 11721422695680.0, + "grad_norm": 1.8366231916892903, + "language_loss": 0.87080568, + "learning_rate": 1.1488361200408883e-06, + "loss": 0.89144063, + "num_input_tokens_seen": 233589155, + "step": 10823, + "time_per_iteration": 2.6457793712615967 + }, + { + "auxiliary_loss_clip": 0.01063834, + "auxiliary_loss_mlp": 0.01026055, + "balance_loss_clip": 1.02449179, + "balance_loss_mlp": 1.01526022, + "epoch": 0.6507740868781001, + "flos": 26761745226240.0, + "grad_norm": 2.364641854285943, + "language_loss": 0.66484892, + "learning_rate": 1.148483704558183e-06, + "loss": 0.6857478, + "num_input_tokens_seen": 233608180, + "step": 10824, + "time_per_iteration": 2.636646270751953 + }, + { + "auxiliary_loss_clip": 0.01047, + "auxiliary_loss_mlp": 0.01025936, + "balance_loss_clip": 1.02546883, + "balance_loss_mlp": 1.01504064, + "epoch": 0.650834210130768, + "flos": 16471722487680.0, + "grad_norm": 2.4678271012866158, + "language_loss": 0.87366223, + "learning_rate": 1.1481313213650607e-06, + "loss": 0.8943916, + "num_input_tokens_seen": 233625750, + "step": 10825, + "time_per_iteration": 2.6642613410949707 + }, + { + "auxiliary_loss_clip": 0.01043589, + "auxiliary_loss_mlp": 0.01025235, + "balance_loss_clip": 1.02323508, + "balance_loss_mlp": 1.0129323, + "epoch": 0.650894333383436, + "flos": 17128672283520.0, + "grad_norm": 2.5222488150235236, + "language_loss": 0.72775429, + "learning_rate": 1.147778970474885e-06, + "loss": 0.74844253, + "num_input_tokens_seen": 233644235, + "step": 10826, + "time_per_iteration": 2.8706226348876953 + }, + { + "auxiliary_loss_clip": 0.01053661, + "auxiliary_loss_mlp": 0.01025093, + "balance_loss_clip": 1.02532887, + "balance_loss_mlp": 1.01481152, + "epoch": 0.650954456636104, + "flos": 18734238311040.0, + "grad_norm": 1.7564485480222625, + "language_loss": 0.68902445, + "learning_rate": 1.1474266519010157e-06, + "loss": 0.70981193, + "num_input_tokens_seen": 233662845, + "step": 10827, + "time_per_iteration": 2.7987682819366455 + }, + { + "auxiliary_loss_clip": 0.01039661, + "auxiliary_loss_mlp": 0.0102815, + "balance_loss_clip": 1.02294922, + "balance_loss_mlp": 1.01791, + "epoch": 0.651014579888772, + "flos": 24528244613760.0, + "grad_norm": 1.8388287927277254, + "language_loss": 0.76656115, + "learning_rate": 1.1470743656568136e-06, + "loss": 0.78723919, + "num_input_tokens_seen": 233681990, + "step": 10828, + "time_per_iteration": 2.813077688217163 + }, + { + "auxiliary_loss_clip": 0.01052746, + "auxiliary_loss_mlp": 0.01024394, + "balance_loss_clip": 1.02505255, + "balance_loss_mlp": 1.0141722, + "epoch": 0.65107470314144, + "flos": 24061083304320.0, + "grad_norm": 1.765011100249856, + "language_loss": 0.89435977, + "learning_rate": 1.1467221117556362e-06, + "loss": 0.91513109, + "num_input_tokens_seen": 233698930, + "step": 10829, + "time_per_iteration": 2.601614236831665 + }, + { + "auxiliary_loss_clip": 0.01007173, + "auxiliary_loss_mlp": 0.01000733, + "balance_loss_clip": 1.00167632, + "balance_loss_mlp": 0.99965459, + "epoch": 0.6511348263941079, + "flos": 72480734352000.0, + "grad_norm": 0.6736647014889149, + "language_loss": 0.55430031, + "learning_rate": 1.1463698902108428e-06, + "loss": 0.57437944, + "num_input_tokens_seen": 233769825, + "step": 10830, + "time_per_iteration": 3.3014817237854004 + }, + { + "auxiliary_loss_clip": 0.01031088, + "auxiliary_loss_mlp": 0.01030639, + "balance_loss_clip": 1.02279997, + "balance_loss_mlp": 1.01940954, + "epoch": 0.6511949496467759, + "flos": 23367684182400.0, + "grad_norm": 1.84491814621319, + "language_loss": 0.74741626, + "learning_rate": 1.1460177010357878e-06, + "loss": 0.7680335, + "num_input_tokens_seen": 233787095, + "step": 10831, + "time_per_iteration": 2.7554025650024414 + }, + { + "auxiliary_loss_clip": 0.00989456, + "auxiliary_loss_mlp": 0.01004816, + "balance_loss_clip": 1.00332403, + "balance_loss_mlp": 1.00370777, + "epoch": 0.6512550728994438, + "flos": 67333191073920.0, + "grad_norm": 0.640990467470119, + "language_loss": 0.51070178, + "learning_rate": 1.145665544243828e-06, + "loss": 0.53064454, + "num_input_tokens_seen": 233853050, + "step": 10832, + "time_per_iteration": 3.356619119644165 + }, + { + "auxiliary_loss_clip": 0.01045823, + "auxiliary_loss_mlp": 0.01031641, + "balance_loss_clip": 1.02442861, + "balance_loss_mlp": 1.0200541, + "epoch": 0.6513151961521119, + "flos": 21141689512320.0, + "grad_norm": 1.9540877994711263, + "language_loss": 0.8301819, + "learning_rate": 1.145313419848316e-06, + "loss": 0.8509565, + "num_input_tokens_seen": 233871385, + "step": 10833, + "time_per_iteration": 2.659770965576172 + }, + { + "auxiliary_loss_clip": 0.01047548, + "auxiliary_loss_mlp": 0.01029053, + "balance_loss_clip": 1.0278008, + "balance_loss_mlp": 1.01777577, + "epoch": 0.6513753194047798, + "flos": 15158828476800.0, + "grad_norm": 2.0574651865656204, + "language_loss": 0.83550072, + "learning_rate": 1.1449613278626049e-06, + "loss": 0.85626674, + "num_input_tokens_seen": 233888175, + "step": 10834, + "time_per_iteration": 2.6616458892822266 + }, + { + "auxiliary_loss_clip": 0.01050347, + "auxiliary_loss_mlp": 0.01032567, + "balance_loss_clip": 1.02543473, + "balance_loss_mlp": 1.02108169, + "epoch": 0.6514354426574478, + "flos": 30226621933440.0, + "grad_norm": 1.6530863257798354, + "language_loss": 0.77231634, + "learning_rate": 1.1446092683000455e-06, + "loss": 0.79314548, + "num_input_tokens_seen": 233911470, + "step": 10835, + "time_per_iteration": 2.768294334411621 + }, + { + "auxiliary_loss_clip": 0.01041507, + "auxiliary_loss_mlp": 0.01030597, + "balance_loss_clip": 1.02585351, + "balance_loss_mlp": 1.01922429, + "epoch": 0.6514955659101157, + "flos": 24205587719040.0, + "grad_norm": 1.512632717156667, + "language_loss": 0.77395976, + "learning_rate": 1.1442572411739882e-06, + "loss": 0.79468083, + "num_input_tokens_seen": 233932135, + "step": 10836, + "time_per_iteration": 2.70802640914917 + }, + { + "auxiliary_loss_clip": 0.01029238, + "auxiliary_loss_mlp": 0.01029698, + "balance_loss_clip": 1.02457535, + "balance_loss_mlp": 1.01859951, + "epoch": 0.6515556891627837, + "flos": 12377761960320.0, + "grad_norm": 2.1979416369297007, + "language_loss": 0.82203776, + "learning_rate": 1.143905246497783e-06, + "loss": 0.84262717, + "num_input_tokens_seen": 233947880, + "step": 10837, + "time_per_iteration": 2.806102991104126 + }, + { + "auxiliary_loss_clip": 0.01032796, + "auxiliary_loss_mlp": 0.01031497, + "balance_loss_clip": 1.02498794, + "balance_loss_mlp": 1.01954651, + "epoch": 0.6516158124154516, + "flos": 49601217957120.0, + "grad_norm": 2.0806147478870916, + "language_loss": 0.58638722, + "learning_rate": 1.1435532842847758e-06, + "loss": 0.60703015, + "num_input_tokens_seen": 233971475, + "step": 10838, + "time_per_iteration": 2.9239189624786377 + }, + { + "auxiliary_loss_clip": 0.01006838, + "auxiliary_loss_mlp": 0.0100185, + "balance_loss_clip": 1.00095606, + "balance_loss_mlp": 1.00073552, + "epoch": 0.6516759356681197, + "flos": 59702748076800.0, + "grad_norm": 0.7325965549201792, + "language_loss": 0.6105141, + "learning_rate": 1.1432013545483147e-06, + "loss": 0.63060099, + "num_input_tokens_seen": 234030690, + "step": 10839, + "time_per_iteration": 3.20072603225708 + }, + { + "auxiliary_loss_clip": 0.01043065, + "auxiliary_loss_mlp": 0.01027469, + "balance_loss_clip": 1.02494752, + "balance_loss_mlp": 1.01751518, + "epoch": 0.6517360589207876, + "flos": 37450807130880.0, + "grad_norm": 1.6506086394690558, + "language_loss": 0.67220521, + "learning_rate": 1.1428494573017439e-06, + "loss": 0.69291055, + "num_input_tokens_seen": 234052470, + "step": 10840, + "time_per_iteration": 2.7908880710601807 + }, + { + "auxiliary_loss_clip": 0.01022811, + "auxiliary_loss_mlp": 0.01031728, + "balance_loss_clip": 1.02326787, + "balance_loss_mlp": 1.02085567, + "epoch": 0.6517961821734556, + "flos": 25374911068800.0, + "grad_norm": 3.5777575093430514, + "language_loss": 0.73830211, + "learning_rate": 1.1424975925584071e-06, + "loss": 0.75884748, + "num_input_tokens_seen": 234071495, + "step": 10841, + "time_per_iteration": 2.791673421859741 + }, + { + "auxiliary_loss_clip": 0.01063761, + "auxiliary_loss_mlp": 0.01033376, + "balance_loss_clip": 1.02452886, + "balance_loss_mlp": 1.02206898, + "epoch": 0.6518563054261236, + "flos": 28766996864640.0, + "grad_norm": 1.4889536737680236, + "language_loss": 0.62563354, + "learning_rate": 1.142145760331648e-06, + "loss": 0.6466049, + "num_input_tokens_seen": 234092325, + "step": 10842, + "time_per_iteration": 2.610507011413574 + }, + { + "auxiliary_loss_clip": 0.01000408, + "auxiliary_loss_mlp": 0.01003614, + "balance_loss_clip": 1.00413442, + "balance_loss_mlp": 1.00241029, + "epoch": 0.6519164286787915, + "flos": 68924750797440.0, + "grad_norm": 0.8155424280725164, + "language_loss": 0.56163692, + "learning_rate": 1.141793960634807e-06, + "loss": 0.58167708, + "num_input_tokens_seen": 234148005, + "step": 10843, + "time_per_iteration": 2.9932503700256348 + }, + { + "auxiliary_loss_clip": 0.01056375, + "auxiliary_loss_mlp": 0.01031727, + "balance_loss_clip": 1.02541184, + "balance_loss_mlp": 1.01997924, + "epoch": 0.6519765519314595, + "flos": 20441933683200.0, + "grad_norm": 1.7818394219968183, + "language_loss": 0.82802403, + "learning_rate": 1.1414421934812253e-06, + "loss": 0.84890503, + "num_input_tokens_seen": 234164280, + "step": 10844, + "time_per_iteration": 2.554022789001465 + }, + { + "auxiliary_loss_clip": 0.01054449, + "auxiliary_loss_mlp": 0.01029259, + "balance_loss_clip": 1.02551877, + "balance_loss_mlp": 1.01795816, + "epoch": 0.6520366751841274, + "flos": 28402970480640.0, + "grad_norm": 3.6194235155547223, + "language_loss": 0.60040212, + "learning_rate": 1.1410904588842421e-06, + "loss": 0.62123919, + "num_input_tokens_seen": 234185090, + "step": 10845, + "time_per_iteration": 2.6537013053894043 + }, + { + "auxiliary_loss_clip": 0.01055447, + "auxiliary_loss_mlp": 0.01031533, + "balance_loss_clip": 1.02578187, + "balance_loss_mlp": 1.02007127, + "epoch": 0.6520967984367955, + "flos": 22273414300800.0, + "grad_norm": 1.6789288703009808, + "language_loss": 0.79284322, + "learning_rate": 1.140738756857194e-06, + "loss": 0.81371301, + "num_input_tokens_seen": 234204050, + "step": 10846, + "time_per_iteration": 4.224957227706909 + }, + { + "auxiliary_loss_clip": 0.01000213, + "auxiliary_loss_mlp": 0.00999496, + "balance_loss_clip": 1.00365686, + "balance_loss_mlp": 0.9984116, + "epoch": 0.6521569216894634, + "flos": 68917140092160.0, + "grad_norm": 0.733211964414026, + "language_loss": 0.60232186, + "learning_rate": 1.1403870874134192e-06, + "loss": 0.62231898, + "num_input_tokens_seen": 234269790, + "step": 10847, + "time_per_iteration": 4.975739002227783 + }, + { + "auxiliary_loss_clip": 0.01065903, + "auxiliary_loss_mlp": 0.01036353, + "balance_loss_clip": 1.02552724, + "balance_loss_mlp": 1.02518344, + "epoch": 0.6522170449421314, + "flos": 29130520458240.0, + "grad_norm": 1.8808798974908851, + "language_loss": 0.8084479, + "learning_rate": 1.1400354505662514e-06, + "loss": 0.82947046, + "num_input_tokens_seen": 234290135, + "step": 10848, + "time_per_iteration": 2.6725986003875732 + }, + { + "auxiliary_loss_clip": 0.01035917, + "auxiliary_loss_mlp": 0.01033656, + "balance_loss_clip": 1.02489865, + "balance_loss_mlp": 1.02289176, + "epoch": 0.6522771681947993, + "flos": 26651930371200.0, + "grad_norm": 2.2397047803366017, + "language_loss": 0.7444073, + "learning_rate": 1.1396838463290263e-06, + "loss": 0.76510304, + "num_input_tokens_seen": 234309535, + "step": 10849, + "time_per_iteration": 2.6851096153259277 + }, + { + "auxiliary_loss_clip": 0.01019027, + "auxiliary_loss_mlp": 0.01028615, + "balance_loss_clip": 1.02364469, + "balance_loss_mlp": 1.01793385, + "epoch": 0.6523372914474673, + "flos": 25739763465600.0, + "grad_norm": 1.4022225953525354, + "language_loss": 0.68145752, + "learning_rate": 1.1393322747150752e-06, + "loss": 0.70193392, + "num_input_tokens_seen": 234328755, + "step": 10850, + "time_per_iteration": 2.796254873275757 + }, + { + "auxiliary_loss_clip": 0.0104245, + "auxiliary_loss_mlp": 0.00747786, + "balance_loss_clip": 1.02571249, + "balance_loss_mlp": 1.00065041, + "epoch": 0.6523974147001352, + "flos": 24827345164800.0, + "grad_norm": 1.8934398155223915, + "language_loss": 0.6647861, + "learning_rate": 1.1389807357377313e-06, + "loss": 0.68268847, + "num_input_tokens_seen": 234348655, + "step": 10851, + "time_per_iteration": 2.6836462020874023 + }, + { + "auxiliary_loss_clip": 0.01046628, + "auxiliary_loss_mlp": 0.01028688, + "balance_loss_clip": 1.02554703, + "balance_loss_mlp": 1.01794076, + "epoch": 0.6524575379528033, + "flos": 26317637470080.0, + "grad_norm": 2.049360739534877, + "language_loss": 0.73881799, + "learning_rate": 1.1386292294103235e-06, + "loss": 0.75957119, + "num_input_tokens_seen": 234367445, + "step": 10852, + "time_per_iteration": 2.7912797927856445 + }, + { + "auxiliary_loss_clip": 0.0104752, + "auxiliary_loss_mlp": 0.01029731, + "balance_loss_clip": 1.02640295, + "balance_loss_mlp": 1.01755953, + "epoch": 0.6525176612054712, + "flos": 19494143464320.0, + "grad_norm": 1.7358627897725583, + "language_loss": 0.66638333, + "learning_rate": 1.1382777557461812e-06, + "loss": 0.68715584, + "num_input_tokens_seen": 234384825, + "step": 10853, + "time_per_iteration": 2.697779417037964 + }, + { + "auxiliary_loss_clip": 0.00982238, + "auxiliary_loss_mlp": 0.01003361, + "balance_loss_clip": 1.00538921, + "balance_loss_mlp": 1.00206745, + "epoch": 0.6525777844581392, + "flos": 71706894721920.0, + "grad_norm": 0.7251957894397411, + "language_loss": 0.63016081, + "learning_rate": 1.137926314758634e-06, + "loss": 0.65001678, + "num_input_tokens_seen": 234450630, + "step": 10854, + "time_per_iteration": 3.375955104827881 + }, + { + "auxiliary_loss_clip": 0.01049318, + "auxiliary_loss_mlp": 0.01038771, + "balance_loss_clip": 1.0247314, + "balance_loss_mlp": 1.02589011, + "epoch": 0.6526379077108072, + "flos": 26653115520000.0, + "grad_norm": 1.9579860495942147, + "language_loss": 0.77757144, + "learning_rate": 1.1375749064610072e-06, + "loss": 0.79845226, + "num_input_tokens_seen": 234473505, + "step": 10855, + "time_per_iteration": 2.725802421569824 + }, + { + "auxiliary_loss_clip": 0.01023132, + "auxiliary_loss_mlp": 0.01022512, + "balance_loss_clip": 1.02057278, + "balance_loss_mlp": 1.0118134, + "epoch": 0.6526980309634751, + "flos": 22820369673600.0, + "grad_norm": 1.8494413273403343, + "language_loss": 0.79010177, + "learning_rate": 1.1372235308666256e-06, + "loss": 0.81055814, + "num_input_tokens_seen": 234492485, + "step": 10856, + "time_per_iteration": 2.7525365352630615 + }, + { + "auxiliary_loss_clip": 0.01063129, + "auxiliary_loss_mlp": 0.010269, + "balance_loss_clip": 1.02467847, + "balance_loss_mlp": 1.01506233, + "epoch": 0.6527581542161431, + "flos": 28365048696960.0, + "grad_norm": 1.8520306962447908, + "language_loss": 0.73569018, + "learning_rate": 1.136872187988815e-06, + "loss": 0.75659049, + "num_input_tokens_seen": 234512645, + "step": 10857, + "time_per_iteration": 2.8646867275238037 + }, + { + "auxiliary_loss_clip": 0.01039472, + "auxiliary_loss_mlp": 0.01033274, + "balance_loss_clip": 1.02327085, + "balance_loss_mlp": 1.02274144, + "epoch": 0.652818277468811, + "flos": 18369206346240.0, + "grad_norm": 2.290775302712856, + "language_loss": 0.62886548, + "learning_rate": 1.1365208778408965e-06, + "loss": 0.64959294, + "num_input_tokens_seen": 234529310, + "step": 10858, + "time_per_iteration": 2.8323965072631836 + }, + { + "auxiliary_loss_clip": 0.01062481, + "auxiliary_loss_mlp": 0.0103349, + "balance_loss_clip": 1.02472687, + "balance_loss_mlp": 1.02306473, + "epoch": 0.6528784007214791, + "flos": 18036170421120.0, + "grad_norm": 1.646448072953669, + "language_loss": 0.78712153, + "learning_rate": 1.1361696004361939e-06, + "loss": 0.80808127, + "num_input_tokens_seen": 234546685, + "step": 10859, + "time_per_iteration": 2.6496689319610596 + }, + { + "auxiliary_loss_clip": 0.01055673, + "auxiliary_loss_mlp": 0.01026843, + "balance_loss_clip": 1.02513671, + "balance_loss_mlp": 1.01576269, + "epoch": 0.652938523974147, + "flos": 22382008093440.0, + "grad_norm": 1.512707860090252, + "language_loss": 0.67697531, + "learning_rate": 1.1358183557880256e-06, + "loss": 0.69780046, + "num_input_tokens_seen": 234566255, + "step": 10860, + "time_per_iteration": 2.6429529190063477 + }, + { + "auxiliary_loss_clip": 0.01057863, + "auxiliary_loss_mlp": 0.01029158, + "balance_loss_clip": 1.02694058, + "balance_loss_mlp": 1.01808357, + "epoch": 0.652998647226815, + "flos": 16764035368320.0, + "grad_norm": 2.010387970753942, + "language_loss": 0.662489, + "learning_rate": 1.135467143909712e-06, + "loss": 0.68335927, + "num_input_tokens_seen": 234585405, + "step": 10861, + "time_per_iteration": 4.295617341995239 + }, + { + "auxiliary_loss_clip": 0.01044738, + "auxiliary_loss_mlp": 0.01030997, + "balance_loss_clip": 1.02531517, + "balance_loss_mlp": 1.0190165, + "epoch": 0.6530587704794829, + "flos": 35772522019200.0, + "grad_norm": 1.7093370950509006, + "language_loss": 0.6501016, + "learning_rate": 1.135115964814572e-06, + "loss": 0.67085898, + "num_input_tokens_seen": 234608095, + "step": 10862, + "time_per_iteration": 2.8376286029815674 + }, + { + "auxiliary_loss_clip": 0.01040374, + "auxiliary_loss_mlp": 0.01030174, + "balance_loss_clip": 1.02369308, + "balance_loss_mlp": 1.019243, + "epoch": 0.6531188937321509, + "flos": 19316134638720.0, + "grad_norm": 1.5368102653706956, + "language_loss": 0.76866126, + "learning_rate": 1.13476481851592e-06, + "loss": 0.78936678, + "num_input_tokens_seen": 234627335, + "step": 10863, + "time_per_iteration": 2.7468061447143555 + }, + { + "auxiliary_loss_clip": 0.01034024, + "auxiliary_loss_mlp": 0.01031144, + "balance_loss_clip": 1.02356374, + "balance_loss_mlp": 1.02099907, + "epoch": 0.6531790169848188, + "flos": 22893771116160.0, + "grad_norm": 1.6600872261825825, + "language_loss": 0.74893475, + "learning_rate": 1.1344137050270739e-06, + "loss": 0.76958638, + "num_input_tokens_seen": 234646540, + "step": 10864, + "time_per_iteration": 2.69016695022583 + }, + { + "auxiliary_loss_clip": 0.01052882, + "auxiliary_loss_mlp": 0.01030278, + "balance_loss_clip": 1.02529192, + "balance_loss_mlp": 1.01970994, + "epoch": 0.6532391402374869, + "flos": 29563530912000.0, + "grad_norm": 1.8114166189064758, + "language_loss": 0.86489606, + "learning_rate": 1.1340626243613458e-06, + "loss": 0.88572758, + "num_input_tokens_seen": 234665470, + "step": 10865, + "time_per_iteration": 2.662050485610962 + }, + { + "auxiliary_loss_clip": 0.01032095, + "auxiliary_loss_mlp": 0.00747776, + "balance_loss_clip": 1.02322388, + "balance_loss_mlp": 1.00068319, + "epoch": 0.6532992634901548, + "flos": 23105463920640.0, + "grad_norm": 4.389301772309395, + "language_loss": 0.81206167, + "learning_rate": 1.133711576532051e-06, + "loss": 0.82986039, + "num_input_tokens_seen": 234683955, + "step": 10866, + "time_per_iteration": 2.743340015411377 + }, + { + "auxiliary_loss_clip": 0.01045382, + "auxiliary_loss_mlp": 0.01025769, + "balance_loss_clip": 1.02719522, + "balance_loss_mlp": 1.01536846, + "epoch": 0.6533593867428228, + "flos": 26067340523520.0, + "grad_norm": 1.4740695227909062, + "language_loss": 0.82477361, + "learning_rate": 1.1333605615524995e-06, + "loss": 0.84548509, + "num_input_tokens_seen": 234704595, + "step": 10867, + "time_per_iteration": 2.631410598754883 + }, + { + "auxiliary_loss_clip": 0.01034936, + "auxiliary_loss_mlp": 0.01025478, + "balance_loss_clip": 1.02419758, + "balance_loss_mlp": 1.01488602, + "epoch": 0.6534195099954908, + "flos": 21212469262080.0, + "grad_norm": 2.6560105307576674, + "language_loss": 0.80793488, + "learning_rate": 1.1330095794360016e-06, + "loss": 0.82853901, + "num_input_tokens_seen": 234724090, + "step": 10868, + "time_per_iteration": 4.380091428756714 + }, + { + "auxiliary_loss_clip": 0.01036482, + "auxiliary_loss_mlp": 0.01026328, + "balance_loss_clip": 1.02599728, + "balance_loss_mlp": 1.01464534, + "epoch": 0.6534796332481587, + "flos": 19646584784640.0, + "grad_norm": 2.547910212790918, + "language_loss": 0.79380226, + "learning_rate": 1.1326586301958675e-06, + "loss": 0.81443036, + "num_input_tokens_seen": 234742560, + "step": 10869, + "time_per_iteration": 2.7121739387512207 + }, + { + "auxiliary_loss_clip": 0.01057108, + "auxiliary_loss_mlp": 0.01035242, + "balance_loss_clip": 1.02739668, + "balance_loss_mlp": 1.02415538, + "epoch": 0.6535397565008267, + "flos": 24022479162240.0, + "grad_norm": 2.916332359050655, + "language_loss": 0.72473752, + "learning_rate": 1.1323077138454063e-06, + "loss": 0.74566108, + "num_input_tokens_seen": 234762315, + "step": 10870, + "time_per_iteration": 2.7108659744262695 + }, + { + "auxiliary_loss_clip": 0.01028309, + "auxiliary_loss_mlp": 0.0103477, + "balance_loss_clip": 1.02466846, + "balance_loss_mlp": 1.02335608, + "epoch": 0.6535998797534947, + "flos": 24602759377920.0, + "grad_norm": 2.52554336973485, + "language_loss": 0.74598789, + "learning_rate": 1.1319568303979221e-06, + "loss": 0.76661873, + "num_input_tokens_seen": 234781300, + "step": 10871, + "time_per_iteration": 2.7623183727264404 + }, + { + "auxiliary_loss_clip": 0.01041714, + "auxiliary_loss_mlp": 0.00747548, + "balance_loss_clip": 1.02326417, + "balance_loss_mlp": 1.00056481, + "epoch": 0.6536600030061627, + "flos": 23364164649600.0, + "grad_norm": 1.8979247629831149, + "language_loss": 0.55936205, + "learning_rate": 1.1316059798667227e-06, + "loss": 0.57725477, + "num_input_tokens_seen": 234801040, + "step": 10872, + "time_per_iteration": 2.7151429653167725 + }, + { + "auxiliary_loss_clip": 0.01040665, + "auxiliary_loss_mlp": 0.01033477, + "balance_loss_clip": 1.0244689, + "balance_loss_mlp": 1.02270675, + "epoch": 0.6537201262588306, + "flos": 23878477537920.0, + "grad_norm": 1.5775528770018852, + "language_loss": 0.74801666, + "learning_rate": 1.1312551622651112e-06, + "loss": 0.76875806, + "num_input_tokens_seen": 234821415, + "step": 10873, + "time_per_iteration": 2.6840696334838867 + }, + { + "auxiliary_loss_clip": 0.01056165, + "auxiliary_loss_mlp": 0.01029101, + "balance_loss_clip": 1.02689004, + "balance_loss_mlp": 1.01835454, + "epoch": 0.6537802495114986, + "flos": 24354760901760.0, + "grad_norm": 1.4418017162343164, + "language_loss": 0.75395954, + "learning_rate": 1.1309043776063917e-06, + "loss": 0.77481216, + "num_input_tokens_seen": 234843795, + "step": 10874, + "time_per_iteration": 2.7924633026123047 + }, + { + "auxiliary_loss_clip": 0.01026628, + "auxiliary_loss_mlp": 0.01032628, + "balance_loss_clip": 1.02485442, + "balance_loss_mlp": 1.0210886, + "epoch": 0.6538403727641665, + "flos": 27996892248960.0, + "grad_norm": 1.5099969380458715, + "language_loss": 0.8163951, + "learning_rate": 1.1305536259038642e-06, + "loss": 0.83698767, + "num_input_tokens_seen": 234862350, + "step": 10875, + "time_per_iteration": 2.9289650917053223 + }, + { + "auxiliary_loss_clip": 0.01063918, + "auxiliary_loss_mlp": 0.01035532, + "balance_loss_clip": 1.02540672, + "balance_loss_mlp": 1.02549458, + "epoch": 0.6539004960168345, + "flos": 27563594486400.0, + "grad_norm": 1.9810173772791981, + "language_loss": 0.69825339, + "learning_rate": 1.1302029071708314e-06, + "loss": 0.71924794, + "num_input_tokens_seen": 234881790, + "step": 10876, + "time_per_iteration": 2.866217613220215 + }, + { + "auxiliary_loss_clip": 0.00979037, + "auxiliary_loss_mlp": 0.01034011, + "balance_loss_clip": 1.02290428, + "balance_loss_mlp": 1.0226326, + "epoch": 0.6539606192695024, + "flos": 14530067879040.0, + "grad_norm": 1.9880581957063657, + "language_loss": 0.79728055, + "learning_rate": 1.1298522214205908e-06, + "loss": 0.81741101, + "num_input_tokens_seen": 234897775, + "step": 10877, + "time_per_iteration": 3.1876773834228516 + }, + { + "auxiliary_loss_clip": 0.01044359, + "auxiliary_loss_mlp": 0.00747686, + "balance_loss_clip": 1.0244422, + "balance_loss_mlp": 1.00064445, + "epoch": 0.6540207425221705, + "flos": 21616356764160.0, + "grad_norm": 3.137881487436625, + "language_loss": 0.80263925, + "learning_rate": 1.1295015686664408e-06, + "loss": 0.82055974, + "num_input_tokens_seen": 234918395, + "step": 10878, + "time_per_iteration": 3.1219725608825684 + }, + { + "auxiliary_loss_clip": 0.01039932, + "auxiliary_loss_mlp": 0.01029424, + "balance_loss_clip": 1.0241847, + "balance_loss_mlp": 1.01757431, + "epoch": 0.6540808657748384, + "flos": 17668983640320.0, + "grad_norm": 2.1829676734059382, + "language_loss": 0.84464312, + "learning_rate": 1.1291509489216797e-06, + "loss": 0.86533666, + "num_input_tokens_seen": 234936260, + "step": 10879, + "time_per_iteration": 2.6753957271575928 + }, + { + "auxiliary_loss_clip": 0.01045598, + "auxiliary_loss_mlp": 0.0102856, + "balance_loss_clip": 1.02506185, + "balance_loss_mlp": 1.01697338, + "epoch": 0.6541409890275064, + "flos": 14538292093440.0, + "grad_norm": 2.9089970935148566, + "language_loss": 0.71498489, + "learning_rate": 1.128800362199601e-06, + "loss": 0.73572648, + "num_input_tokens_seen": 234952110, + "step": 10880, + "time_per_iteration": 2.8112969398498535 + }, + { + "auxiliary_loss_clip": 0.01024051, + "auxiliary_loss_mlp": 0.01031757, + "balance_loss_clip": 1.02259707, + "balance_loss_mlp": 1.02094436, + "epoch": 0.6542011122801744, + "flos": 17165301177600.0, + "grad_norm": 1.8698474761410313, + "language_loss": 0.84197336, + "learning_rate": 1.1284498085135005e-06, + "loss": 0.86253142, + "num_input_tokens_seen": 234970810, + "step": 10881, + "time_per_iteration": 2.762871265411377 + }, + { + "auxiliary_loss_clip": 0.01030074, + "auxiliary_loss_mlp": 0.01031948, + "balance_loss_clip": 1.02371788, + "balance_loss_mlp": 1.01990819, + "epoch": 0.6542612355328423, + "flos": 18186600579840.0, + "grad_norm": 1.7873457479546886, + "language_loss": 0.77550459, + "learning_rate": 1.1280992878766699e-06, + "loss": 0.79612482, + "num_input_tokens_seen": 234989565, + "step": 10882, + "time_per_iteration": 2.690554141998291 + }, + { + "auxiliary_loss_clip": 0.01067126, + "auxiliary_loss_mlp": 0.01028001, + "balance_loss_clip": 1.02692986, + "balance_loss_mlp": 1.016348, + "epoch": 0.6543213587855103, + "flos": 19792453916160.0, + "grad_norm": 2.240456991329809, + "language_loss": 0.82285184, + "learning_rate": 1.1277488003024024e-06, + "loss": 0.84380311, + "num_input_tokens_seen": 235007955, + "step": 10883, + "time_per_iteration": 2.585482358932495 + }, + { + "auxiliary_loss_clip": 0.01012057, + "auxiliary_loss_mlp": 0.01030697, + "balance_loss_clip": 1.02482045, + "balance_loss_mlp": 1.01931834, + "epoch": 0.6543814820381783, + "flos": 21105096531840.0, + "grad_norm": 2.698340154584688, + "language_loss": 0.85629416, + "learning_rate": 1.127398345803988e-06, + "loss": 0.87672168, + "num_input_tokens_seen": 235024860, + "step": 10884, + "time_per_iteration": 2.7224302291870117 + }, + { + "auxiliary_loss_clip": 0.01039078, + "auxiliary_loss_mlp": 0.01033285, + "balance_loss_clip": 1.02297688, + "balance_loss_mlp": 1.02181673, + "epoch": 0.6544416052908463, + "flos": 20194042947840.0, + "grad_norm": 2.155477865351153, + "language_loss": 0.80281687, + "learning_rate": 1.127047924394715e-06, + "loss": 0.82354045, + "num_input_tokens_seen": 235043815, + "step": 10885, + "time_per_iteration": 2.7512261867523193 + }, + { + "auxiliary_loss_clip": 0.01024162, + "auxiliary_loss_mlp": 0.0102536, + "balance_loss_clip": 1.02255917, + "balance_loss_mlp": 1.01449454, + "epoch": 0.6545017285435142, + "flos": 23368258800000.0, + "grad_norm": 5.387937366466561, + "language_loss": 0.71987331, + "learning_rate": 1.1266975360878722e-06, + "loss": 0.74036855, + "num_input_tokens_seen": 235062985, + "step": 10886, + "time_per_iteration": 2.7358858585357666 + }, + { + "auxiliary_loss_clip": 0.01053814, + "auxiliary_loss_mlp": 0.01028656, + "balance_loss_clip": 1.02619278, + "balance_loss_mlp": 1.01822531, + "epoch": 0.6545618517961822, + "flos": 19134714021120.0, + "grad_norm": 1.7838092311863452, + "language_loss": 0.78245246, + "learning_rate": 1.1263471808967468e-06, + "loss": 0.80327719, + "num_input_tokens_seen": 235081670, + "step": 10887, + "time_per_iteration": 2.6453359127044678 + }, + { + "auxiliary_loss_clip": 0.01033817, + "auxiliary_loss_mlp": 0.01032704, + "balance_loss_clip": 1.02257586, + "balance_loss_mlp": 1.0219996, + "epoch": 0.6546219750488501, + "flos": 14938624149120.0, + "grad_norm": 1.9058647204060293, + "language_loss": 0.78950453, + "learning_rate": 1.1259968588346234e-06, + "loss": 0.8101697, + "num_input_tokens_seen": 235098510, + "step": 10888, + "time_per_iteration": 2.8373260498046875 + }, + { + "auxiliary_loss_clip": 0.01051143, + "auxiliary_loss_mlp": 0.01030777, + "balance_loss_clip": 1.02418363, + "balance_loss_mlp": 1.02090025, + "epoch": 0.6546820983015181, + "flos": 36320518886400.0, + "grad_norm": 1.5402501292870894, + "language_loss": 0.66621602, + "learning_rate": 1.1256465699147874e-06, + "loss": 0.68703526, + "num_input_tokens_seen": 235119990, + "step": 10889, + "time_per_iteration": 2.9870100021362305 + }, + { + "auxiliary_loss_clip": 0.01036232, + "auxiliary_loss_mlp": 0.01037873, + "balance_loss_clip": 1.02290559, + "balance_loss_mlp": 1.02489126, + "epoch": 0.654742221554186, + "flos": 20411446014720.0, + "grad_norm": 1.442669265400328, + "language_loss": 0.79649609, + "learning_rate": 1.1252963141505203e-06, + "loss": 0.81723714, + "num_input_tokens_seen": 235139255, + "step": 10890, + "time_per_iteration": 2.771501064300537 + }, + { + "auxiliary_loss_clip": 0.01051648, + "auxiliary_loss_mlp": 0.00747751, + "balance_loss_clip": 1.0231607, + "balance_loss_mlp": 1.00068331, + "epoch": 0.6548023448068541, + "flos": 24863650836480.0, + "grad_norm": 1.9568473543844542, + "language_loss": 0.65037769, + "learning_rate": 1.1249460915551052e-06, + "loss": 0.66837168, + "num_input_tokens_seen": 235158455, + "step": 10891, + "time_per_iteration": 2.815531015396118 + }, + { + "auxiliary_loss_clip": 0.01047128, + "auxiliary_loss_mlp": 0.01035265, + "balance_loss_clip": 1.02266741, + "balance_loss_mlp": 1.02468562, + "epoch": 0.654862468059522, + "flos": 21427573858560.0, + "grad_norm": 2.1515340397168456, + "language_loss": 0.7953198, + "learning_rate": 1.1245959021418214e-06, + "loss": 0.81614375, + "num_input_tokens_seen": 235177350, + "step": 10892, + "time_per_iteration": 2.783714771270752 + }, + { + "auxiliary_loss_clip": 0.01058423, + "auxiliary_loss_mlp": 0.01028858, + "balance_loss_clip": 1.02733755, + "balance_loss_mlp": 1.0180819, + "epoch": 0.65492259131219, + "flos": 26577846570240.0, + "grad_norm": 1.8449840737937946, + "language_loss": 0.77993, + "learning_rate": 1.1242457459239497e-06, + "loss": 0.80080283, + "num_input_tokens_seen": 235196435, + "step": 10893, + "time_per_iteration": 4.508152484893799 + }, + { + "auxiliary_loss_clip": 0.0106692, + "auxiliary_loss_mlp": 0.01028902, + "balance_loss_clip": 1.02623868, + "balance_loss_mlp": 1.01696908, + "epoch": 0.6549827145648579, + "flos": 21501334437120.0, + "grad_norm": 1.551693975408184, + "language_loss": 0.70067734, + "learning_rate": 1.123895622914766e-06, + "loss": 0.72163558, + "num_input_tokens_seen": 235215430, + "step": 10894, + "time_per_iteration": 2.5975053310394287 + }, + { + "auxiliary_loss_clip": 0.01055873, + "auxiliary_loss_mlp": 0.01029529, + "balance_loss_clip": 1.0249176, + "balance_loss_mlp": 1.01823413, + "epoch": 0.6550428378175259, + "flos": 22594275515520.0, + "grad_norm": 28.05462019515397, + "language_loss": 0.62738866, + "learning_rate": 1.123545533127549e-06, + "loss": 0.64824271, + "num_input_tokens_seen": 235232015, + "step": 10895, + "time_per_iteration": 4.627051830291748 + }, + { + "auxiliary_loss_clip": 0.01049526, + "auxiliary_loss_mlp": 0.01031019, + "balance_loss_clip": 1.0229162, + "balance_loss_mlp": 1.02065349, + "epoch": 0.655102961070194, + "flos": 12823809050880.0, + "grad_norm": 1.7998962263779268, + "language_loss": 0.78834772, + "learning_rate": 1.1231954765755722e-06, + "loss": 0.8091532, + "num_input_tokens_seen": 235248115, + "step": 10896, + "time_per_iteration": 2.7780215740203857 + }, + { + "auxiliary_loss_clip": 0.010431, + "auxiliary_loss_mlp": 0.01029737, + "balance_loss_clip": 1.02529979, + "balance_loss_mlp": 1.01946676, + "epoch": 0.6551630843228619, + "flos": 24791075406720.0, + "grad_norm": 1.3895083192832212, + "language_loss": 0.70479941, + "learning_rate": 1.1228454532721111e-06, + "loss": 0.72552776, + "num_input_tokens_seen": 235270785, + "step": 10897, + "time_per_iteration": 2.821009874343872 + }, + { + "auxiliary_loss_clip": 0.0106357, + "auxiliary_loss_mlp": 0.01030873, + "balance_loss_clip": 1.02439404, + "balance_loss_mlp": 1.02003694, + "epoch": 0.6552232075755299, + "flos": 16724461559040.0, + "grad_norm": 2.104137569691178, + "language_loss": 0.75428176, + "learning_rate": 1.1224954632304391e-06, + "loss": 0.77522618, + "num_input_tokens_seen": 235287905, + "step": 10898, + "time_per_iteration": 2.7326853275299072 + }, + { + "auxiliary_loss_clip": 0.01042551, + "auxiliary_loss_mlp": 0.01030624, + "balance_loss_clip": 1.02443302, + "balance_loss_mlp": 1.01999009, + "epoch": 0.6552833308281978, + "flos": 22016473338240.0, + "grad_norm": 10.08287035408275, + "language_loss": 0.73637676, + "learning_rate": 1.122145506463827e-06, + "loss": 0.75710857, + "num_input_tokens_seen": 235305525, + "step": 10899, + "time_per_iteration": 2.8195810317993164 + }, + { + "auxiliary_loss_clip": 0.01042105, + "auxiliary_loss_mlp": 0.01025051, + "balance_loss_clip": 1.02504945, + "balance_loss_mlp": 1.01476359, + "epoch": 0.6553434540808658, + "flos": 24863399441280.0, + "grad_norm": 2.9452247675750365, + "language_loss": 0.56847972, + "learning_rate": 1.1217955829855443e-06, + "loss": 0.58915126, + "num_input_tokens_seen": 235324415, + "step": 10900, + "time_per_iteration": 2.8476905822753906 + }, + { + "auxiliary_loss_clip": 0.01054975, + "auxiliary_loss_mlp": 0.01032623, + "balance_loss_clip": 1.02673554, + "balance_loss_mlp": 1.02085674, + "epoch": 0.6554035773335337, + "flos": 23221060865280.0, + "grad_norm": 1.7361884679461035, + "language_loss": 0.76945639, + "learning_rate": 1.1214456928088622e-06, + "loss": 0.79033244, + "num_input_tokens_seen": 235341595, + "step": 10901, + "time_per_iteration": 2.9219160079956055 + }, + { + "auxiliary_loss_clip": 0.01061369, + "auxiliary_loss_mlp": 0.01026047, + "balance_loss_clip": 1.02445817, + "balance_loss_mlp": 1.01469278, + "epoch": 0.6554637005862017, + "flos": 22783597125120.0, + "grad_norm": 1.6824199960313353, + "language_loss": 0.73192859, + "learning_rate": 1.1210958359470463e-06, + "loss": 0.75280279, + "num_input_tokens_seen": 235361700, + "step": 10902, + "time_per_iteration": 2.5970911979675293 + }, + { + "auxiliary_loss_clip": 0.01063497, + "auxiliary_loss_mlp": 0.01027172, + "balance_loss_clip": 1.02649546, + "balance_loss_mlp": 1.01657987, + "epoch": 0.6555238238388696, + "flos": 21507224267520.0, + "grad_norm": 1.7231456038620125, + "language_loss": 0.67815346, + "learning_rate": 1.1207460124133645e-06, + "loss": 0.6990602, + "num_input_tokens_seen": 235382065, + "step": 10903, + "time_per_iteration": 2.6884193420410156 + }, + { + "auxiliary_loss_clip": 0.01042351, + "auxiliary_loss_mlp": 0.00747844, + "balance_loss_clip": 1.02307832, + "balance_loss_mlp": 1.00061297, + "epoch": 0.6555839470915377, + "flos": 30519473518080.0, + "grad_norm": 1.783440945452914, + "language_loss": 0.66780436, + "learning_rate": 1.1203962222210832e-06, + "loss": 0.68570632, + "num_input_tokens_seen": 235402130, + "step": 10904, + "time_per_iteration": 2.6862423419952393 + }, + { + "auxiliary_loss_clip": 0.01054083, + "auxiliary_loss_mlp": 0.01032287, + "balance_loss_clip": 1.02407551, + "balance_loss_mlp": 1.02021134, + "epoch": 0.6556440703442056, + "flos": 24642943718400.0, + "grad_norm": 1.7971626373388614, + "language_loss": 0.90146852, + "learning_rate": 1.120046465383464e-06, + "loss": 0.92233229, + "num_input_tokens_seen": 235420435, + "step": 10905, + "time_per_iteration": 2.6261677742004395 + }, + { + "auxiliary_loss_clip": 0.01050594, + "auxiliary_loss_mlp": 0.0102871, + "balance_loss_clip": 1.02376461, + "balance_loss_mlp": 1.01812458, + "epoch": 0.6557041935968736, + "flos": 23732464752000.0, + "grad_norm": 1.7026336519706404, + "language_loss": 0.75512218, + "learning_rate": 1.1196967419137721e-06, + "loss": 0.77591515, + "num_input_tokens_seen": 235439960, + "step": 10906, + "time_per_iteration": 2.7253592014312744 + }, + { + "auxiliary_loss_clip": 0.01066632, + "auxiliary_loss_mlp": 0.01035384, + "balance_loss_clip": 1.02613688, + "balance_loss_mlp": 1.02373695, + "epoch": 0.6557643168495415, + "flos": 11102753819520.0, + "grad_norm": 2.584309955491031, + "language_loss": 0.75097346, + "learning_rate": 1.119347051825267e-06, + "loss": 0.77199364, + "num_input_tokens_seen": 235457495, + "step": 10907, + "time_per_iteration": 2.6842565536499023 + }, + { + "auxiliary_loss_clip": 0.01033636, + "auxiliary_loss_mlp": 0.01027128, + "balance_loss_clip": 1.02508795, + "balance_loss_mlp": 1.01532674, + "epoch": 0.6558244401022095, + "flos": 30191034533760.0, + "grad_norm": 1.6764954481659802, + "language_loss": 0.72417772, + "learning_rate": 1.118997395131211e-06, + "loss": 0.74478537, + "num_input_tokens_seen": 235479525, + "step": 10908, + "time_per_iteration": 4.502645492553711 + }, + { + "auxiliary_loss_clip": 0.01064918, + "auxiliary_loss_mlp": 0.01032449, + "balance_loss_clip": 1.02595353, + "balance_loss_mlp": 1.02056432, + "epoch": 0.6558845633548775, + "flos": 17931060247680.0, + "grad_norm": 4.516645062625464, + "language_loss": 0.80762923, + "learning_rate": 1.118647771844861e-06, + "loss": 0.82860291, + "num_input_tokens_seen": 235496305, + "step": 10909, + "time_per_iteration": 2.557004690170288 + }, + { + "auxiliary_loss_clip": 0.01065418, + "auxiliary_loss_mlp": 0.01034355, + "balance_loss_clip": 1.02601111, + "balance_loss_mlp": 1.02205229, + "epoch": 0.6559446866075455, + "flos": 21904144531200.0, + "grad_norm": 2.011316153175129, + "language_loss": 0.63753587, + "learning_rate": 1.1182981819794767e-06, + "loss": 0.65853357, + "num_input_tokens_seen": 235512545, + "step": 10910, + "time_per_iteration": 2.6020805835723877 + }, + { + "auxiliary_loss_clip": 0.01038885, + "auxiliary_loss_mlp": 0.01033859, + "balance_loss_clip": 1.02336049, + "balance_loss_mlp": 1.02001882, + "epoch": 0.6560048098602135, + "flos": 14127976056960.0, + "grad_norm": 3.3770548203841337, + "language_loss": 0.75883937, + "learning_rate": 1.117948625548313e-06, + "loss": 0.77956676, + "num_input_tokens_seen": 235526045, + "step": 10911, + "time_per_iteration": 2.6314480304718018 + }, + { + "auxiliary_loss_clip": 0.01058471, + "auxiliary_loss_mlp": 0.01026104, + "balance_loss_clip": 1.02259874, + "balance_loss_mlp": 1.01606619, + "epoch": 0.6560649331128814, + "flos": 18807567926400.0, + "grad_norm": 1.5165471010473446, + "language_loss": 0.75315422, + "learning_rate": 1.1175991025646265e-06, + "loss": 0.77399993, + "num_input_tokens_seen": 235545285, + "step": 10912, + "time_per_iteration": 2.6469860076904297 + }, + { + "auxiliary_loss_clip": 0.01030005, + "auxiliary_loss_mlp": 0.00747762, + "balance_loss_clip": 1.02459955, + "balance_loss_mlp": 1.00058806, + "epoch": 0.6561250563655494, + "flos": 17053618815360.0, + "grad_norm": 4.071995743794071, + "language_loss": 0.7732029, + "learning_rate": 1.1172496130416697e-06, + "loss": 0.79098058, + "num_input_tokens_seen": 235563150, + "step": 10913, + "time_per_iteration": 2.8141398429870605 + }, + { + "auxiliary_loss_clip": 0.01029981, + "auxiliary_loss_mlp": 0.01025515, + "balance_loss_clip": 1.02066255, + "balance_loss_mlp": 1.01566792, + "epoch": 0.6561851796182173, + "flos": 22637656166400.0, + "grad_norm": 1.9501724239630096, + "language_loss": 0.71035111, + "learning_rate": 1.1169001569926961e-06, + "loss": 0.73090607, + "num_input_tokens_seen": 235582535, + "step": 10914, + "time_per_iteration": 2.750575304031372 + }, + { + "auxiliary_loss_clip": 0.01034239, + "auxiliary_loss_mlp": 0.01031729, + "balance_loss_clip": 1.02552319, + "balance_loss_mlp": 1.02063012, + "epoch": 0.6562453028708853, + "flos": 19239213663360.0, + "grad_norm": 1.938455094102509, + "language_loss": 0.73753691, + "learning_rate": 1.116550734430958e-06, + "loss": 0.75819653, + "num_input_tokens_seen": 235601490, + "step": 10915, + "time_per_iteration": 2.698807954788208 + }, + { + "auxiliary_loss_clip": 0.01025897, + "auxiliary_loss_mlp": 0.01028076, + "balance_loss_clip": 1.02196693, + "balance_loss_mlp": 1.01653624, + "epoch": 0.6563054261235532, + "flos": 23801305167360.0, + "grad_norm": 1.541041472553337, + "language_loss": 0.79781342, + "learning_rate": 1.1162013453697042e-06, + "loss": 0.81835318, + "num_input_tokens_seen": 235619165, + "step": 10916, + "time_per_iteration": 4.418932676315308 + }, + { + "auxiliary_loss_clip": 0.01030503, + "auxiliary_loss_mlp": 0.01027215, + "balance_loss_clip": 1.02190781, + "balance_loss_mlp": 1.01662874, + "epoch": 0.6563655493762213, + "flos": 19240039676160.0, + "grad_norm": 2.1557299255072437, + "language_loss": 0.76301908, + "learning_rate": 1.1158519898221831e-06, + "loss": 0.78359622, + "num_input_tokens_seen": 235637115, + "step": 10917, + "time_per_iteration": 2.689094305038452 + }, + { + "auxiliary_loss_clip": 0.01062096, + "auxiliary_loss_mlp": 0.00747704, + "balance_loss_clip": 1.02449036, + "balance_loss_mlp": 1.00058675, + "epoch": 0.6564256726288892, + "flos": 25556439427200.0, + "grad_norm": 2.149019853957575, + "language_loss": 0.69609559, + "learning_rate": 1.1155026678016445e-06, + "loss": 0.71419364, + "num_input_tokens_seen": 235656330, + "step": 10918, + "time_per_iteration": 2.736963987350464 + }, + { + "auxiliary_loss_clip": 0.01032117, + "auxiliary_loss_mlp": 0.01032905, + "balance_loss_clip": 1.02534389, + "balance_loss_mlp": 1.02225375, + "epoch": 0.6564857958815572, + "flos": 22200623389440.0, + "grad_norm": 1.546046730787855, + "language_loss": 0.766132, + "learning_rate": 1.115153379321332e-06, + "loss": 0.78678221, + "num_input_tokens_seen": 235674510, + "step": 10919, + "time_per_iteration": 2.7142205238342285 + }, + { + "auxiliary_loss_clip": 0.01000043, + "auxiliary_loss_mlp": 0.00747085, + "balance_loss_clip": 1.0033145, + "balance_loss_mlp": 1.00139761, + "epoch": 0.6565459191342251, + "flos": 58123144604160.0, + "grad_norm": 0.7275250836974222, + "language_loss": 0.53015631, + "learning_rate": 1.1148041243944931e-06, + "loss": 0.54762757, + "num_input_tokens_seen": 235735050, + "step": 10920, + "time_per_iteration": 3.22001314163208 + }, + { + "auxiliary_loss_clip": 0.01052448, + "auxiliary_loss_mlp": 0.01025278, + "balance_loss_clip": 1.02527547, + "balance_loss_mlp": 1.01413822, + "epoch": 0.6566060423868931, + "flos": 30809631582720.0, + "grad_norm": 1.5207659104637388, + "language_loss": 0.65371269, + "learning_rate": 1.1144549030343697e-06, + "loss": 0.67448997, + "num_input_tokens_seen": 235757545, + "step": 10921, + "time_per_iteration": 2.8860766887664795 + }, + { + "auxiliary_loss_clip": 0.01038338, + "auxiliary_loss_mlp": 0.01034486, + "balance_loss_clip": 1.02402174, + "balance_loss_mlp": 1.02082491, + "epoch": 0.6566661656395612, + "flos": 23367432787200.0, + "grad_norm": 1.6761304868477112, + "language_loss": 0.81357086, + "learning_rate": 1.114105715254205e-06, + "loss": 0.83429909, + "num_input_tokens_seen": 235777265, + "step": 10922, + "time_per_iteration": 2.719266176223755 + }, + { + "auxiliary_loss_clip": 0.0101064, + "auxiliary_loss_mlp": 0.00747885, + "balance_loss_clip": 1.02356005, + "balance_loss_mlp": 1.00062227, + "epoch": 0.6567262888922291, + "flos": 25735597488000.0, + "grad_norm": 1.7932663169679648, + "language_loss": 0.71237886, + "learning_rate": 1.1137565610672414e-06, + "loss": 0.72996408, + "num_input_tokens_seen": 235796565, + "step": 10923, + "time_per_iteration": 2.8383870124816895 + }, + { + "auxiliary_loss_clip": 0.01038304, + "auxiliary_loss_mlp": 0.01032166, + "balance_loss_clip": 1.02801371, + "balance_loss_mlp": 1.02060843, + "epoch": 0.6567864121448971, + "flos": 17123716206720.0, + "grad_norm": 1.7995318781396794, + "language_loss": 0.8084991, + "learning_rate": 1.1134074404867169e-06, + "loss": 0.82920378, + "num_input_tokens_seen": 235814805, + "step": 10924, + "time_per_iteration": 2.6814651489257812 + }, + { + "auxiliary_loss_clip": 0.01046354, + "auxiliary_loss_mlp": 0.01026601, + "balance_loss_clip": 1.0233984, + "balance_loss_mlp": 1.0160749, + "epoch": 0.656846535397565, + "flos": 22419319345920.0, + "grad_norm": 1.7850264441803472, + "language_loss": 0.72259676, + "learning_rate": 1.1130583535258717e-06, + "loss": 0.74332631, + "num_input_tokens_seen": 235833405, + "step": 10925, + "time_per_iteration": 2.712371349334717 + }, + { + "auxiliary_loss_clip": 0.01050449, + "auxiliary_loss_mlp": 0.01025223, + "balance_loss_clip": 1.02447844, + "balance_loss_mlp": 1.01444626, + "epoch": 0.656906658650233, + "flos": 17704535126400.0, + "grad_norm": 2.9694838212946393, + "language_loss": 0.73144114, + "learning_rate": 1.112709300197942e-06, + "loss": 0.75219786, + "num_input_tokens_seen": 235848530, + "step": 10926, + "time_per_iteration": 2.687387466430664 + }, + { + "auxiliary_loss_clip": 0.01018761, + "auxiliary_loss_mlp": 0.0102763, + "balance_loss_clip": 1.02446628, + "balance_loss_mlp": 1.01572084, + "epoch": 0.6569667819029009, + "flos": 21175158009600.0, + "grad_norm": 1.7135368284254642, + "language_loss": 0.72306365, + "learning_rate": 1.1123602805161656e-06, + "loss": 0.74352759, + "num_input_tokens_seen": 235867225, + "step": 10927, + "time_per_iteration": 2.909485340118408 + }, + { + "auxiliary_loss_clip": 0.00979678, + "auxiliary_loss_mlp": 0.01008486, + "balance_loss_clip": 1.00321889, + "balance_loss_mlp": 1.00730026, + "epoch": 0.6570269051555689, + "flos": 68761897511040.0, + "grad_norm": 0.7348198911372299, + "language_loss": 0.64470971, + "learning_rate": 1.112011294493775e-06, + "loss": 0.66459137, + "num_input_tokens_seen": 235932925, + "step": 10928, + "time_per_iteration": 3.4134395122528076 + }, + { + "auxiliary_loss_clip": 0.01050158, + "auxiliary_loss_mlp": 0.01028596, + "balance_loss_clip": 1.0232178, + "balance_loss_mlp": 1.0175575, + "epoch": 0.6570870284082369, + "flos": 26319289495680.0, + "grad_norm": 2.7453035258200904, + "language_loss": 0.77986521, + "learning_rate": 1.1116623421440063e-06, + "loss": 0.80065274, + "num_input_tokens_seen": 235952680, + "step": 10929, + "time_per_iteration": 2.863348960876465 + }, + { + "auxiliary_loss_clip": 0.01028466, + "auxiliary_loss_mlp": 0.01029827, + "balance_loss_clip": 1.0223794, + "balance_loss_mlp": 1.01883554, + "epoch": 0.6571471516609049, + "flos": 26174749167360.0, + "grad_norm": 1.7245549039581078, + "language_loss": 0.65154326, + "learning_rate": 1.1113134234800895e-06, + "loss": 0.67212623, + "num_input_tokens_seen": 235972075, + "step": 10930, + "time_per_iteration": 2.859506130218506 + }, + { + "auxiliary_loss_clip": 0.01013737, + "auxiliary_loss_mlp": 0.01031772, + "balance_loss_clip": 1.02104974, + "balance_loss_mlp": 1.02035785, + "epoch": 0.6572072749135728, + "flos": 20376253664640.0, + "grad_norm": 1.5380029826380721, + "language_loss": 0.71039224, + "learning_rate": 1.110964538515258e-06, + "loss": 0.73084736, + "num_input_tokens_seen": 235990340, + "step": 10931, + "time_per_iteration": 2.9864492416381836 + }, + { + "auxiliary_loss_clip": 0.01024385, + "auxiliary_loss_mlp": 0.01032225, + "balance_loss_clip": 1.02447057, + "balance_loss_mlp": 1.02122176, + "epoch": 0.6572673981662408, + "flos": 17128744110720.0, + "grad_norm": 2.4891825933431746, + "language_loss": 0.68711543, + "learning_rate": 1.1106156872627393e-06, + "loss": 0.70768154, + "num_input_tokens_seen": 236007470, + "step": 10932, + "time_per_iteration": 2.8386013507843018 + }, + { + "auxiliary_loss_clip": 0.01038051, + "auxiliary_loss_mlp": 0.00747683, + "balance_loss_clip": 1.02211547, + "balance_loss_mlp": 1.00062442, + "epoch": 0.6573275214189087, + "flos": 41275113281280.0, + "grad_norm": 1.6564690260423522, + "language_loss": 0.8007763, + "learning_rate": 1.1102668697357626e-06, + "loss": 0.81863368, + "num_input_tokens_seen": 236029030, + "step": 10933, + "time_per_iteration": 3.0687711238861084 + }, + { + "auxiliary_loss_clip": 0.01018939, + "auxiliary_loss_mlp": 0.01031067, + "balance_loss_clip": 1.02428985, + "balance_loss_mlp": 1.01967049, + "epoch": 0.6573876446715767, + "flos": 22890143842560.0, + "grad_norm": 1.894756095507535, + "language_loss": 0.74062347, + "learning_rate": 1.1099180859475571e-06, + "loss": 0.76112354, + "num_input_tokens_seen": 236047160, + "step": 10934, + "time_per_iteration": 2.9835684299468994 + }, + { + "auxiliary_loss_clip": 0.01045902, + "auxiliary_loss_mlp": 0.0103058, + "balance_loss_clip": 1.02265334, + "balance_loss_mlp": 1.01878989, + "epoch": 0.6574477679242448, + "flos": 44018150273280.0, + "grad_norm": 1.6429280265698922, + "language_loss": 0.76111108, + "learning_rate": 1.1095693359113454e-06, + "loss": 0.78187591, + "num_input_tokens_seen": 236069215, + "step": 10935, + "time_per_iteration": 2.873793840408325 + }, + { + "auxiliary_loss_clip": 0.01025021, + "auxiliary_loss_mlp": 0.01037977, + "balance_loss_clip": 1.02280784, + "balance_loss_mlp": 1.02560902, + "epoch": 0.6575078911769127, + "flos": 24571517523840.0, + "grad_norm": 1.6302961293263731, + "language_loss": 0.78329432, + "learning_rate": 1.1092206196403538e-06, + "loss": 0.80392432, + "num_input_tokens_seen": 236088335, + "step": 10936, + "time_per_iteration": 2.77820086479187 + }, + { + "auxiliary_loss_clip": 0.01020169, + "auxiliary_loss_mlp": 0.01029505, + "balance_loss_clip": 1.02374935, + "balance_loss_mlp": 1.01876462, + "epoch": 0.6575680144295807, + "flos": 20924035050240.0, + "grad_norm": 1.9609145157061305, + "language_loss": 0.69545668, + "learning_rate": 1.1088719371478056e-06, + "loss": 0.71595347, + "num_input_tokens_seen": 236108540, + "step": 10937, + "time_per_iteration": 2.8583731651306152 + }, + { + "auxiliary_loss_clip": 0.01042604, + "auxiliary_loss_mlp": 0.01029039, + "balance_loss_clip": 1.0246985, + "balance_loss_mlp": 1.01800573, + "epoch": 0.6576281376822486, + "flos": 10925642833920.0, + "grad_norm": 2.254530212071655, + "language_loss": 0.68922079, + "learning_rate": 1.1085232884469236e-06, + "loss": 0.70993721, + "num_input_tokens_seen": 236124495, + "step": 10938, + "time_per_iteration": 2.64274525642395 + }, + { + "auxiliary_loss_clip": 0.01040916, + "auxiliary_loss_mlp": 0.01031843, + "balance_loss_clip": 1.0250752, + "balance_loss_mlp": 1.02035725, + "epoch": 0.6576882609349166, + "flos": 19281552819840.0, + "grad_norm": 2.0032006630315844, + "language_loss": 0.71279752, + "learning_rate": 1.108174673550927e-06, + "loss": 0.73352516, + "num_input_tokens_seen": 236142550, + "step": 10939, + "time_per_iteration": 2.6599538326263428 + }, + { + "auxiliary_loss_clip": 0.01044442, + "auxiliary_loss_mlp": 0.0074768, + "balance_loss_clip": 1.02479327, + "balance_loss_mlp": 1.00063682, + "epoch": 0.6577483841875845, + "flos": 20220544206720.0, + "grad_norm": 4.885962994644147, + "language_loss": 0.77885115, + "learning_rate": 1.107826092473037e-06, + "loss": 0.79677242, + "num_input_tokens_seen": 236156620, + "step": 10940, + "time_per_iteration": 4.385200500488281 + }, + { + "auxiliary_loss_clip": 0.01018948, + "auxiliary_loss_mlp": 0.01030367, + "balance_loss_clip": 1.02342939, + "balance_loss_mlp": 1.01879203, + "epoch": 0.6578085074402525, + "flos": 34751078962560.0, + "grad_norm": 1.870832289181486, + "language_loss": 0.68383312, + "learning_rate": 1.107477545226471e-06, + "loss": 0.70432639, + "num_input_tokens_seen": 236177095, + "step": 10941, + "time_per_iteration": 2.8655049800872803 + }, + { + "auxiliary_loss_clip": 0.01046364, + "auxiliary_loss_mlp": 0.00747667, + "balance_loss_clip": 1.02209747, + "balance_loss_mlp": 1.00057387, + "epoch": 0.6578686306929205, + "flos": 23470998675840.0, + "grad_norm": 1.9622143196370734, + "language_loss": 0.68132961, + "learning_rate": 1.1071290318244448e-06, + "loss": 0.69926989, + "num_input_tokens_seen": 236194695, + "step": 10942, + "time_per_iteration": 4.200013875961304 + }, + { + "auxiliary_loss_clip": 0.01031597, + "auxiliary_loss_mlp": 0.01035142, + "balance_loss_clip": 1.02520156, + "balance_loss_mlp": 1.02242231, + "epoch": 0.6579287539455885, + "flos": 18077073033600.0, + "grad_norm": 2.3658663249823477, + "language_loss": 0.71582443, + "learning_rate": 1.1067805522801753e-06, + "loss": 0.7364918, + "num_input_tokens_seen": 236213885, + "step": 10943, + "time_per_iteration": 2.6855945587158203 + }, + { + "auxiliary_loss_clip": 0.01018299, + "auxiliary_loss_mlp": 0.01026421, + "balance_loss_clip": 1.02281046, + "balance_loss_mlp": 1.01491737, + "epoch": 0.6579888771982564, + "flos": 28661383900800.0, + "grad_norm": 1.6253083927026262, + "language_loss": 0.59185553, + "learning_rate": 1.1064321066068778e-06, + "loss": 0.61230272, + "num_input_tokens_seen": 236237315, + "step": 10944, + "time_per_iteration": 2.8162243366241455 + }, + { + "auxiliary_loss_clip": 0.01057147, + "auxiliary_loss_mlp": 0.01032129, + "balance_loss_clip": 1.02621198, + "balance_loss_mlp": 1.02071476, + "epoch": 0.6580490004509244, + "flos": 25046543911680.0, + "grad_norm": 1.6773953454421962, + "language_loss": 0.72575092, + "learning_rate": 1.1060836948177646e-06, + "loss": 0.74664366, + "num_input_tokens_seen": 236256345, + "step": 10945, + "time_per_iteration": 2.737172842025757 + }, + { + "auxiliary_loss_clip": 0.0104338, + "auxiliary_loss_mlp": 0.01025846, + "balance_loss_clip": 1.02514386, + "balance_loss_mlp": 1.0158565, + "epoch": 0.6581091237035923, + "flos": 43508793461760.0, + "grad_norm": 1.6174833803765132, + "language_loss": 0.7058903, + "learning_rate": 1.105735316926046e-06, + "loss": 0.72658253, + "num_input_tokens_seen": 236281890, + "step": 10946, + "time_per_iteration": 3.062553882598877 + }, + { + "auxiliary_loss_clip": 0.01053755, + "auxiliary_loss_mlp": 0.01030618, + "balance_loss_clip": 1.0257566, + "balance_loss_mlp": 1.01970422, + "epoch": 0.6581692469562603, + "flos": 22415404763520.0, + "grad_norm": 1.9856868322945083, + "language_loss": 0.82124114, + "learning_rate": 1.105386972944934e-06, + "loss": 0.84208488, + "num_input_tokens_seen": 236298370, + "step": 10947, + "time_per_iteration": 2.9149680137634277 + }, + { + "auxiliary_loss_clip": 0.01011555, + "auxiliary_loss_mlp": 0.00747604, + "balance_loss_clip": 1.02145052, + "balance_loss_mlp": 1.00058579, + "epoch": 0.6582293702089284, + "flos": 24859772167680.0, + "grad_norm": 1.5202217678506091, + "language_loss": 0.77228218, + "learning_rate": 1.1050386628876385e-06, + "loss": 0.78987378, + "num_input_tokens_seen": 236317380, + "step": 10948, + "time_per_iteration": 3.001479387283325 + }, + { + "auxiliary_loss_clip": 0.01053745, + "auxiliary_loss_mlp": 0.01024207, + "balance_loss_clip": 1.02591467, + "balance_loss_mlp": 1.0137161, + "epoch": 0.6582894934615963, + "flos": 23039676161280.0, + "grad_norm": 1.658132115368027, + "language_loss": 0.78712898, + "learning_rate": 1.1046903867673655e-06, + "loss": 0.80790848, + "num_input_tokens_seen": 236336210, + "step": 10949, + "time_per_iteration": 2.720518112182617 + }, + { + "auxiliary_loss_clip": 0.00998203, + "auxiliary_loss_mlp": 0.01000999, + "balance_loss_clip": 1.00214767, + "balance_loss_mlp": 0.99992067, + "epoch": 0.6583496167142643, + "flos": 72551980978560.0, + "grad_norm": 0.7352269152402386, + "language_loss": 0.61805463, + "learning_rate": 1.104342144597323e-06, + "loss": 0.63804662, + "num_input_tokens_seen": 236403090, + "step": 10950, + "time_per_iteration": 3.315791130065918 + }, + { + "auxiliary_loss_clip": 0.01050607, + "auxiliary_loss_mlp": 0.01030171, + "balance_loss_clip": 1.0241282, + "balance_loss_mlp": 1.02025914, + "epoch": 0.6584097399669322, + "flos": 13078846592640.0, + "grad_norm": 4.928369586936061, + "language_loss": 0.66941249, + "learning_rate": 1.1039939363907178e-06, + "loss": 0.69022024, + "num_input_tokens_seen": 236420475, + "step": 10951, + "time_per_iteration": 2.660454034805298 + }, + { + "auxiliary_loss_clip": 0.0104734, + "auxiliary_loss_mlp": 0.01027367, + "balance_loss_clip": 1.02319908, + "balance_loss_mlp": 1.01729393, + "epoch": 0.6584698632196002, + "flos": 28693164458880.0, + "grad_norm": 1.3772267523618256, + "language_loss": 0.7625826, + "learning_rate": 1.1036457621607504e-06, + "loss": 0.78332973, + "num_input_tokens_seen": 236441915, + "step": 10952, + "time_per_iteration": 2.7733874320983887 + }, + { + "auxiliary_loss_clip": 0.01063376, + "auxiliary_loss_mlp": 0.01028253, + "balance_loss_clip": 1.02609015, + "balance_loss_mlp": 1.01767278, + "epoch": 0.6585299864722681, + "flos": 14319272914560.0, + "grad_norm": 1.6764606555490513, + "language_loss": 0.73597705, + "learning_rate": 1.1032976219206257e-06, + "loss": 0.7568934, + "num_input_tokens_seen": 236460340, + "step": 10953, + "time_per_iteration": 2.63374924659729 + }, + { + "auxiliary_loss_clip": 0.01032002, + "auxiliary_loss_mlp": 0.01034388, + "balance_loss_clip": 1.02408004, + "balance_loss_mlp": 1.02322388, + "epoch": 0.6585901097249361, + "flos": 26797907243520.0, + "grad_norm": 2.982811934254367, + "language_loss": 0.78735495, + "learning_rate": 1.102949515683546e-06, + "loss": 0.80801886, + "num_input_tokens_seen": 236478280, + "step": 10954, + "time_per_iteration": 2.7917351722717285 + }, + { + "auxiliary_loss_clip": 0.01036935, + "auxiliary_loss_mlp": 0.01031493, + "balance_loss_clip": 1.02194715, + "balance_loss_mlp": 1.0202868, + "epoch": 0.658650232977604, + "flos": 18733124989440.0, + "grad_norm": 2.241822736261962, + "language_loss": 0.69477272, + "learning_rate": 1.1026014434627096e-06, + "loss": 0.71545702, + "num_input_tokens_seen": 236493225, + "step": 10955, + "time_per_iteration": 2.669926881790161 + }, + { + "auxiliary_loss_clip": 0.01031361, + "auxiliary_loss_mlp": 0.01030171, + "balance_loss_clip": 1.02121019, + "balance_loss_mlp": 1.02022958, + "epoch": 0.6587103562302721, + "flos": 24753440931840.0, + "grad_norm": 2.1751106589615077, + "language_loss": 0.80354512, + "learning_rate": 1.1022534052713172e-06, + "loss": 0.8241604, + "num_input_tokens_seen": 236514420, + "step": 10956, + "time_per_iteration": 4.391749143600464 + }, + { + "auxiliary_loss_clip": 0.01054391, + "auxiliary_loss_mlp": 0.01031587, + "balance_loss_clip": 1.02629268, + "balance_loss_mlp": 1.02039909, + "epoch": 0.65877047948294, + "flos": 22346133384960.0, + "grad_norm": 2.0017375753656204, + "language_loss": 0.80976433, + "learning_rate": 1.1019054011225648e-06, + "loss": 0.8306241, + "num_input_tokens_seen": 236532785, + "step": 10957, + "time_per_iteration": 2.7146804332733154 + }, + { + "auxiliary_loss_clip": 0.01041512, + "auxiliary_loss_mlp": 0.0102738, + "balance_loss_clip": 1.02483654, + "balance_loss_mlp": 1.01776624, + "epoch": 0.658830602735608, + "flos": 45180542298240.0, + "grad_norm": 1.6049230496421907, + "language_loss": 0.7580058, + "learning_rate": 1.1015574310296506e-06, + "loss": 0.77869469, + "num_input_tokens_seen": 236553330, + "step": 10958, + "time_per_iteration": 2.832956552505493 + }, + { + "auxiliary_loss_clip": 0.01025296, + "auxiliary_loss_mlp": 0.01037114, + "balance_loss_clip": 1.02357304, + "balance_loss_mlp": 1.02491307, + "epoch": 0.6588907259882759, + "flos": 19901622326400.0, + "grad_norm": 1.605287633126911, + "language_loss": 0.75038302, + "learning_rate": 1.1012094950057678e-06, + "loss": 0.77100712, + "num_input_tokens_seen": 236572960, + "step": 10959, + "time_per_iteration": 2.7379629611968994 + }, + { + "auxiliary_loss_clip": 0.0105259, + "auxiliary_loss_mlp": 0.01023725, + "balance_loss_clip": 1.02500033, + "balance_loss_mlp": 1.01367593, + "epoch": 0.6589508492409439, + "flos": 24133766474880.0, + "grad_norm": 1.54462359831397, + "language_loss": 0.64772642, + "learning_rate": 1.1008615930641107e-06, + "loss": 0.66848952, + "num_input_tokens_seen": 236594090, + "step": 10960, + "time_per_iteration": 2.7862117290496826 + }, + { + "auxiliary_loss_clip": 0.01065905, + "auxiliary_loss_mlp": 0.01032235, + "balance_loss_clip": 1.02550113, + "balance_loss_mlp": 1.0203383, + "epoch": 0.659010972493612, + "flos": 18222906251520.0, + "grad_norm": 2.062794637985675, + "language_loss": 0.81952834, + "learning_rate": 1.1005137252178734e-06, + "loss": 0.84050971, + "num_input_tokens_seen": 236610190, + "step": 10961, + "time_per_iteration": 2.875596046447754 + }, + { + "auxiliary_loss_clip": 0.01029345, + "auxiliary_loss_mlp": 0.01026065, + "balance_loss_clip": 1.02445459, + "balance_loss_mlp": 1.01511586, + "epoch": 0.6590710957462799, + "flos": 27600007898880.0, + "grad_norm": 2.1685699523711364, + "language_loss": 0.73400962, + "learning_rate": 1.1001658914802453e-06, + "loss": 0.75456369, + "num_input_tokens_seen": 236631575, + "step": 10962, + "time_per_iteration": 2.8317291736602783 + }, + { + "auxiliary_loss_clip": 0.01045283, + "auxiliary_loss_mlp": 0.01032105, + "balance_loss_clip": 1.0235157, + "balance_loss_mlp": 1.02134633, + "epoch": 0.6591312189989479, + "flos": 20302959962880.0, + "grad_norm": 3.673522563683037, + "language_loss": 0.7983377, + "learning_rate": 1.0998180918644165e-06, + "loss": 0.81911153, + "num_input_tokens_seen": 236649815, + "step": 10963, + "time_per_iteration": 4.439687490463257 + }, + { + "auxiliary_loss_clip": 0.01012735, + "auxiliary_loss_mlp": 0.00747537, + "balance_loss_clip": 1.02282119, + "balance_loss_mlp": 1.00061667, + "epoch": 0.6591913422516158, + "flos": 12312943868160.0, + "grad_norm": 1.507887830064879, + "language_loss": 0.78559202, + "learning_rate": 1.0994703263835754e-06, + "loss": 0.8031947, + "num_input_tokens_seen": 236668335, + "step": 10964, + "time_per_iteration": 2.8081507682800293 + }, + { + "auxiliary_loss_clip": 0.01021609, + "auxiliary_loss_mlp": 0.01035975, + "balance_loss_clip": 1.02041936, + "balance_loss_mlp": 1.02525818, + "epoch": 0.6592514655042838, + "flos": 25884591102720.0, + "grad_norm": 1.5737717542788234, + "language_loss": 0.74233633, + "learning_rate": 1.0991225950509106e-06, + "loss": 0.76291215, + "num_input_tokens_seen": 236688945, + "step": 10965, + "time_per_iteration": 3.0316996574401855 + }, + { + "auxiliary_loss_clip": 0.01029663, + "auxiliary_loss_mlp": 0.01035377, + "balance_loss_clip": 1.0232712, + "balance_loss_mlp": 1.02290154, + "epoch": 0.6593115887569517, + "flos": 14063624841600.0, + "grad_norm": 1.9675232374852933, + "language_loss": 0.73393649, + "learning_rate": 1.0987748978796067e-06, + "loss": 0.75458694, + "num_input_tokens_seen": 236707055, + "step": 10966, + "time_per_iteration": 2.975403070449829 + }, + { + "auxiliary_loss_clip": 0.01053444, + "auxiliary_loss_mlp": 0.01027918, + "balance_loss_clip": 1.02416015, + "balance_loss_mlp": 1.01665282, + "epoch": 0.6593717120096197, + "flos": 24717925359360.0, + "grad_norm": 1.4675247581498543, + "language_loss": 0.76815122, + "learning_rate": 1.0984272348828487e-06, + "loss": 0.78896487, + "num_input_tokens_seen": 236725900, + "step": 10967, + "time_per_iteration": 2.6985015869140625 + }, + { + "auxiliary_loss_clip": 0.00999436, + "auxiliary_loss_mlp": 0.01002264, + "balance_loss_clip": 1.00316954, + "balance_loss_mlp": 1.00124443, + "epoch": 0.6594318352622877, + "flos": 55558083502080.0, + "grad_norm": 0.6935130284273547, + "language_loss": 0.48521084, + "learning_rate": 1.0980796060738221e-06, + "loss": 0.50522786, + "num_input_tokens_seen": 236788415, + "step": 10968, + "time_per_iteration": 3.2393996715545654 + }, + { + "auxiliary_loss_clip": 0.01011071, + "auxiliary_loss_mlp": 0.01033704, + "balance_loss_clip": 1.02003193, + "balance_loss_mlp": 1.02148485, + "epoch": 0.6594919585149557, + "flos": 17456931699840.0, + "grad_norm": 1.8093793953527464, + "language_loss": 0.79114676, + "learning_rate": 1.0977320114657058e-06, + "loss": 0.81159449, + "num_input_tokens_seen": 236805155, + "step": 10969, + "time_per_iteration": 2.7056777477264404 + }, + { + "auxiliary_loss_clip": 0.01053222, + "auxiliary_loss_mlp": 0.0103026, + "balance_loss_clip": 1.02497327, + "balance_loss_mlp": 1.02003801, + "epoch": 0.6595520817676236, + "flos": 18223229473920.0, + "grad_norm": 2.315228531544688, + "language_loss": 0.65501702, + "learning_rate": 1.0973844510716817e-06, + "loss": 0.67585194, + "num_input_tokens_seen": 236824360, + "step": 10970, + "time_per_iteration": 2.703474283218384 + }, + { + "auxiliary_loss_clip": 0.01047076, + "auxiliary_loss_mlp": 0.01023527, + "balance_loss_clip": 1.02293921, + "balance_loss_mlp": 1.01286411, + "epoch": 0.6596122050202916, + "flos": 22199761463040.0, + "grad_norm": 1.7786960172030746, + "language_loss": 0.76371294, + "learning_rate": 1.0970369249049308e-06, + "loss": 0.78441906, + "num_input_tokens_seen": 236844640, + "step": 10971, + "time_per_iteration": 2.6577346324920654 + }, + { + "auxiliary_loss_clip": 0.00989315, + "auxiliary_loss_mlp": 0.01046803, + "balance_loss_clip": 1.01758742, + "balance_loss_mlp": 1.03348732, + "epoch": 0.6596723282729595, + "flos": 14173834746240.0, + "grad_norm": 2.4799150991382564, + "language_loss": 0.7029736, + "learning_rate": 1.096689432978629e-06, + "loss": 0.72333479, + "num_input_tokens_seen": 236861160, + "step": 10972, + "time_per_iteration": 2.9509472846984863 + }, + { + "auxiliary_loss_clip": 0.01052214, + "auxiliary_loss_mlp": 0.01024685, + "balance_loss_clip": 1.02514148, + "balance_loss_mlp": 1.01332426, + "epoch": 0.6597324515256275, + "flos": 30553193410560.0, + "grad_norm": 1.6697975329053032, + "language_loss": 0.55811262, + "learning_rate": 1.0963419753059556e-06, + "loss": 0.57888162, + "num_input_tokens_seen": 236880465, + "step": 10973, + "time_per_iteration": 2.6808931827545166 + }, + { + "auxiliary_loss_clip": 0.01048416, + "auxiliary_loss_mlp": 0.01036915, + "balance_loss_clip": 1.02712679, + "balance_loss_mlp": 1.02530432, + "epoch": 0.6597925747782956, + "flos": 17639860688640.0, + "grad_norm": 2.0532749700699906, + "language_loss": 0.78624082, + "learning_rate": 1.0959945519000839e-06, + "loss": 0.80709416, + "num_input_tokens_seen": 236897730, + "step": 10974, + "time_per_iteration": 2.6581788063049316 + }, + { + "auxiliary_loss_clip": 0.0105502, + "auxiliary_loss_mlp": 0.01032055, + "balance_loss_clip": 1.02608371, + "balance_loss_mlp": 1.02096891, + "epoch": 0.6598526980309635, + "flos": 22819112697600.0, + "grad_norm": 2.069823586898322, + "language_loss": 0.68554044, + "learning_rate": 1.0956471627741906e-06, + "loss": 0.70641118, + "num_input_tokens_seen": 236917300, + "step": 10975, + "time_per_iteration": 2.5883078575134277 + }, + { + "auxiliary_loss_clip": 0.0104466, + "auxiliary_loss_mlp": 0.01025651, + "balance_loss_clip": 1.02437651, + "balance_loss_mlp": 1.01548803, + "epoch": 0.6599128212836315, + "flos": 21068036674560.0, + "grad_norm": 1.7781248953309963, + "language_loss": 0.70996416, + "learning_rate": 1.0952998079414464e-06, + "loss": 0.73066729, + "num_input_tokens_seen": 236935590, + "step": 10976, + "time_per_iteration": 2.6176793575286865 + }, + { + "auxiliary_loss_clip": 0.01041328, + "auxiliary_loss_mlp": 0.01026748, + "balance_loss_clip": 1.02509367, + "balance_loss_mlp": 1.01598322, + "epoch": 0.6599729445362994, + "flos": 22163527618560.0, + "grad_norm": 1.6805760100766702, + "language_loss": 0.67835665, + "learning_rate": 1.0949524874150243e-06, + "loss": 0.69903737, + "num_input_tokens_seen": 236952830, + "step": 10977, + "time_per_iteration": 2.718606472015381 + }, + { + "auxiliary_loss_clip": 0.0103367, + "auxiliary_loss_mlp": 0.01030539, + "balance_loss_clip": 1.02529287, + "balance_loss_mlp": 1.0185461, + "epoch": 0.6600330677889674, + "flos": 18150079426560.0, + "grad_norm": 2.018275798062351, + "language_loss": 0.81345183, + "learning_rate": 1.0946052012080952e-06, + "loss": 0.83409387, + "num_input_tokens_seen": 236971930, + "step": 10978, + "time_per_iteration": 2.750356435775757 + }, + { + "auxiliary_loss_clip": 0.01037408, + "auxiliary_loss_mlp": 0.01031354, + "balance_loss_clip": 1.02563608, + "balance_loss_mlp": 1.01976085, + "epoch": 0.6600931910416353, + "flos": 18150115340160.0, + "grad_norm": 2.0209801982094175, + "language_loss": 0.6718322, + "learning_rate": 1.0942579493338278e-06, + "loss": 0.69251978, + "num_input_tokens_seen": 236989920, + "step": 10979, + "time_per_iteration": 2.700932264328003 + }, + { + "auxiliary_loss_clip": 0.01036363, + "auxiliary_loss_mlp": 0.01026953, + "balance_loss_clip": 1.02411962, + "balance_loss_mlp": 1.01528275, + "epoch": 0.6601533142943034, + "flos": 17420733768960.0, + "grad_norm": 2.3286173083915647, + "language_loss": 0.73387223, + "learning_rate": 1.0939107318053889e-06, + "loss": 0.7545054, + "num_input_tokens_seen": 237006570, + "step": 10980, + "time_per_iteration": 2.640536308288574 + }, + { + "auxiliary_loss_clip": 0.01024071, + "auxiliary_loss_mlp": 0.01028548, + "balance_loss_clip": 1.02150488, + "balance_loss_mlp": 1.01864123, + "epoch": 0.6602134375469713, + "flos": 28219574615040.0, + "grad_norm": 1.8354930040775004, + "language_loss": 0.7289511, + "learning_rate": 1.0935635486359459e-06, + "loss": 0.74947727, + "num_input_tokens_seen": 237028415, + "step": 10981, + "time_per_iteration": 2.737518787384033 + }, + { + "auxiliary_loss_clip": 0.01011662, + "auxiliary_loss_mlp": 0.01034294, + "balance_loss_clip": 1.02254605, + "balance_loss_mlp": 1.02295709, + "epoch": 0.6602735607996393, + "flos": 29418056830080.0, + "grad_norm": 2.033449738412616, + "language_loss": 0.68479967, + "learning_rate": 1.0932163998386647e-06, + "loss": 0.7052592, + "num_input_tokens_seen": 237046595, + "step": 10982, + "time_per_iteration": 2.7678611278533936 + }, + { + "auxiliary_loss_clip": 0.01052042, + "auxiliary_loss_mlp": 0.01028088, + "balance_loss_clip": 1.0249114, + "balance_loss_mlp": 1.01743615, + "epoch": 0.6603336840523072, + "flos": 18588045957120.0, + "grad_norm": 1.6823789628121435, + "language_loss": 0.69649172, + "learning_rate": 1.0928692854267075e-06, + "loss": 0.71729302, + "num_input_tokens_seen": 237066150, + "step": 10983, + "time_per_iteration": 2.671433687210083 + }, + { + "auxiliary_loss_clip": 0.01050951, + "auxiliary_loss_mlp": 0.01028724, + "balance_loss_clip": 1.02316082, + "balance_loss_mlp": 1.01811409, + "epoch": 0.6603938073049752, + "flos": 33254860913280.0, + "grad_norm": 2.2828294403437277, + "language_loss": 0.70389414, + "learning_rate": 1.092522205413239e-06, + "loss": 0.72469085, + "num_input_tokens_seen": 237087060, + "step": 10984, + "time_per_iteration": 2.8034088611602783 + }, + { + "auxiliary_loss_clip": 0.01029897, + "auxiliary_loss_mlp": 0.01031272, + "balance_loss_clip": 1.02303374, + "balance_loss_mlp": 1.02071047, + "epoch": 0.6604539305576431, + "flos": 17384284442880.0, + "grad_norm": 1.6684101326330365, + "language_loss": 0.84065711, + "learning_rate": 1.0921751598114193e-06, + "loss": 0.86126876, + "num_input_tokens_seen": 237103825, + "step": 10985, + "time_per_iteration": 2.812964677810669 + }, + { + "auxiliary_loss_clip": 0.01054599, + "auxiliary_loss_mlp": 0.0103053, + "balance_loss_clip": 1.02522862, + "balance_loss_mlp": 1.01908636, + "epoch": 0.6605140538103111, + "flos": 21251145231360.0, + "grad_norm": 1.938172952335743, + "language_loss": 0.73665905, + "learning_rate": 1.0918281486344077e-06, + "loss": 0.7575103, + "num_input_tokens_seen": 237121740, + "step": 10986, + "time_per_iteration": 2.837524175643921 + }, + { + "auxiliary_loss_clip": 0.01050336, + "auxiliary_loss_mlp": 0.010273, + "balance_loss_clip": 1.0241313, + "balance_loss_mlp": 1.01621985, + "epoch": 0.6605741770629792, + "flos": 13881701433600.0, + "grad_norm": 1.7163381531979813, + "language_loss": 0.78908318, + "learning_rate": 1.0914811718953636e-06, + "loss": 0.80985951, + "num_input_tokens_seen": 237139565, + "step": 10987, + "time_per_iteration": 4.307261228561401 + }, + { + "auxiliary_loss_clip": 0.00991091, + "auxiliary_loss_mlp": 0.01003554, + "balance_loss_clip": 1.00407517, + "balance_loss_mlp": 1.00236821, + "epoch": 0.6606343003156471, + "flos": 69316215171840.0, + "grad_norm": 0.8568152216968918, + "language_loss": 0.54193294, + "learning_rate": 1.0911342296074454e-06, + "loss": 0.5618794, + "num_input_tokens_seen": 237201055, + "step": 10988, + "time_per_iteration": 4.978988885879517 + }, + { + "auxiliary_loss_clip": 0.01003929, + "auxiliary_loss_mlp": 0.01027425, + "balance_loss_clip": 1.02266645, + "balance_loss_mlp": 1.01714969, + "epoch": 0.6606944235683151, + "flos": 27272394927360.0, + "grad_norm": 1.4191859762491026, + "language_loss": 0.77145505, + "learning_rate": 1.0907873217838077e-06, + "loss": 0.79176855, + "num_input_tokens_seen": 237221805, + "step": 10989, + "time_per_iteration": 2.864204168319702 + }, + { + "auxiliary_loss_clip": 0.01043352, + "auxiliary_loss_mlp": 0.01032359, + "balance_loss_clip": 1.02608871, + "balance_loss_mlp": 1.02201176, + "epoch": 0.660754546820983, + "flos": 13772820332160.0, + "grad_norm": 1.9986764131062706, + "language_loss": 0.77219975, + "learning_rate": 1.0904404484376064e-06, + "loss": 0.79295689, + "num_input_tokens_seen": 237238270, + "step": 10990, + "time_per_iteration": 2.647465229034424 + }, + { + "auxiliary_loss_clip": 0.0106583, + "auxiliary_loss_mlp": 0.01027753, + "balance_loss_clip": 1.02583754, + "balance_loss_mlp": 1.01660109, + "epoch": 0.660814670073651, + "flos": 15705209232000.0, + "grad_norm": 2.020045536447694, + "language_loss": 0.60628307, + "learning_rate": 1.0900936095819937e-06, + "loss": 0.62721884, + "num_input_tokens_seen": 237255400, + "step": 10991, + "time_per_iteration": 2.692650079727173 + }, + { + "auxiliary_loss_clip": 0.01038648, + "auxiliary_loss_mlp": 0.01032209, + "balance_loss_clip": 1.02479827, + "balance_loss_mlp": 1.02027655, + "epoch": 0.6608747933263189, + "flos": 20850023076480.0, + "grad_norm": 2.3084362908139773, + "language_loss": 0.68520367, + "learning_rate": 1.0897468052301234e-06, + "loss": 0.70591217, + "num_input_tokens_seen": 237273105, + "step": 10992, + "time_per_iteration": 2.7426209449768066 + }, + { + "auxiliary_loss_clip": 0.01053453, + "auxiliary_loss_mlp": 0.01029467, + "balance_loss_clip": 1.02365005, + "balance_loss_mlp": 1.01836848, + "epoch": 0.660934916578987, + "flos": 20632117219200.0, + "grad_norm": 1.6801254603963707, + "language_loss": 0.87440896, + "learning_rate": 1.0894000353951444e-06, + "loss": 0.8952381, + "num_input_tokens_seen": 237292650, + "step": 10993, + "time_per_iteration": 2.6083714962005615 + }, + { + "auxiliary_loss_clip": 0.01060143, + "auxiliary_loss_mlp": 0.01028206, + "balance_loss_clip": 1.02649057, + "balance_loss_mlp": 1.01540256, + "epoch": 0.6609950398316549, + "flos": 25113588647040.0, + "grad_norm": 1.9360749876148922, + "language_loss": 0.66688395, + "learning_rate": 1.0890533000902078e-06, + "loss": 0.68776745, + "num_input_tokens_seen": 237312865, + "step": 10994, + "time_per_iteration": 2.776442289352417 + }, + { + "auxiliary_loss_clip": 0.01028046, + "auxiliary_loss_mlp": 0.01031636, + "balance_loss_clip": 1.02468252, + "balance_loss_mlp": 1.01960754, + "epoch": 0.6610551630843229, + "flos": 18661196004480.0, + "grad_norm": 1.7932947422930094, + "language_loss": 0.77250814, + "learning_rate": 1.0887065993284626e-06, + "loss": 0.79310495, + "num_input_tokens_seen": 237331210, + "step": 10995, + "time_per_iteration": 2.7387022972106934 + }, + { + "auxiliary_loss_clip": 0.01045439, + "auxiliary_loss_mlp": 0.01027564, + "balance_loss_clip": 1.02555633, + "balance_loss_mlp": 1.01705527, + "epoch": 0.6611152863369908, + "flos": 23258192549760.0, + "grad_norm": 2.103588708526763, + "language_loss": 0.7432276, + "learning_rate": 1.088359933123053e-06, + "loss": 0.76395762, + "num_input_tokens_seen": 237349455, + "step": 10996, + "time_per_iteration": 2.7487072944641113 + }, + { + "auxiliary_loss_clip": 0.01064321, + "auxiliary_loss_mlp": 0.01031518, + "balance_loss_clip": 1.02658999, + "balance_loss_mlp": 1.02083135, + "epoch": 0.6611754095896588, + "flos": 22159720776960.0, + "grad_norm": 1.9108513221927665, + "language_loss": 0.68777514, + "learning_rate": 1.088013301487126e-06, + "loss": 0.70873362, + "num_input_tokens_seen": 237367100, + "step": 10997, + "time_per_iteration": 2.5950260162353516 + }, + { + "auxiliary_loss_clip": 0.01044014, + "auxiliary_loss_mlp": 0.01028073, + "balance_loss_clip": 1.02412224, + "balance_loss_mlp": 1.01757097, + "epoch": 0.6612355328423267, + "flos": 13991228979840.0, + "grad_norm": 8.544123909761899, + "language_loss": 0.68611848, + "learning_rate": 1.0876667044338269e-06, + "loss": 0.70683932, + "num_input_tokens_seen": 237384840, + "step": 10998, + "time_per_iteration": 2.754547119140625 + }, + { + "auxiliary_loss_clip": 0.0100114, + "auxiliary_loss_mlp": 0.0100084, + "balance_loss_clip": 1.00483966, + "balance_loss_mlp": 0.999749, + "epoch": 0.6612956560949947, + "flos": 61453716359040.0, + "grad_norm": 0.719459119501467, + "language_loss": 0.51120687, + "learning_rate": 1.087320141976297e-06, + "loss": 0.53122663, + "num_input_tokens_seen": 237443355, + "step": 10999, + "time_per_iteration": 3.1995797157287598 + }, + { + "auxiliary_loss_clip": 0.01066691, + "auxiliary_loss_mlp": 0.0074781, + "balance_loss_clip": 1.02679181, + "balance_loss_mlp": 1.00059366, + "epoch": 0.6613557793476627, + "flos": 21616644072960.0, + "grad_norm": 2.7320447002842556, + "language_loss": 0.70429242, + "learning_rate": 1.086973614127679e-06, + "loss": 0.72243738, + "num_input_tokens_seen": 237459205, + "step": 11000, + "time_per_iteration": 2.6264636516571045 + }, + { + "auxiliary_loss_clip": 0.01032355, + "auxiliary_loss_mlp": 0.01030683, + "balance_loss_clip": 1.0241065, + "balance_loss_mlp": 1.02070522, + "epoch": 0.6614159026003307, + "flos": 34020117192960.0, + "grad_norm": 2.992806218376824, + "language_loss": 0.65296745, + "learning_rate": 1.0866271209011133e-06, + "loss": 0.67359781, + "num_input_tokens_seen": 237483580, + "step": 11001, + "time_per_iteration": 2.781369686126709 + }, + { + "auxiliary_loss_clip": 0.0106216, + "auxiliary_loss_mlp": 0.01025483, + "balance_loss_clip": 1.02395105, + "balance_loss_mlp": 1.01467681, + "epoch": 0.6614760258529987, + "flos": 24097281235200.0, + "grad_norm": 2.717685105962639, + "language_loss": 0.73184192, + "learning_rate": 1.086280662309739e-06, + "loss": 0.75271839, + "num_input_tokens_seen": 237502860, + "step": 11002, + "time_per_iteration": 4.358923435211182 + }, + { + "auxiliary_loss_clip": 0.01044745, + "auxiliary_loss_mlp": 0.01027714, + "balance_loss_clip": 1.02228773, + "balance_loss_mlp": 1.01685405, + "epoch": 0.6615361491056666, + "flos": 14903790935040.0, + "grad_norm": 1.9422396177251648, + "language_loss": 0.79307055, + "learning_rate": 1.0859342383666928e-06, + "loss": 0.81379509, + "num_input_tokens_seen": 237521030, + "step": 11003, + "time_per_iteration": 2.8731346130371094 + }, + { + "auxiliary_loss_clip": 0.0105485, + "auxiliary_loss_mlp": 0.01033163, + "balance_loss_clip": 1.02554941, + "balance_loss_mlp": 1.02097344, + "epoch": 0.6615962723583346, + "flos": 15304877176320.0, + "grad_norm": 1.866699865979543, + "language_loss": 0.68597502, + "learning_rate": 1.0855878490851119e-06, + "loss": 0.70685518, + "num_input_tokens_seen": 237539585, + "step": 11004, + "time_per_iteration": 2.8356950283050537 + }, + { + "auxiliary_loss_clip": 0.01056737, + "auxiliary_loss_mlp": 0.01029208, + "balance_loss_clip": 1.02558029, + "balance_loss_mlp": 1.01697171, + "epoch": 0.6616563956110025, + "flos": 18732586285440.0, + "grad_norm": 2.0648833021281794, + "language_loss": 0.69573808, + "learning_rate": 1.085241494478132e-06, + "loss": 0.71659756, + "num_input_tokens_seen": 237557655, + "step": 11005, + "time_per_iteration": 2.7523555755615234 + }, + { + "auxiliary_loss_clip": 0.01044055, + "auxiliary_loss_mlp": 0.01028088, + "balance_loss_clip": 1.02563, + "balance_loss_mlp": 1.01756835, + "epoch": 0.6617165188636706, + "flos": 24495063425280.0, + "grad_norm": 1.559121270465043, + "language_loss": 0.78416872, + "learning_rate": 1.0848951745588855e-06, + "loss": 0.80489016, + "num_input_tokens_seen": 237577000, + "step": 11006, + "time_per_iteration": 2.7284154891967773 + }, + { + "auxiliary_loss_clip": 0.01052831, + "auxiliary_loss_mlp": 0.01029616, + "balance_loss_clip": 1.02482677, + "balance_loss_mlp": 1.0186609, + "epoch": 0.6617766421163385, + "flos": 22379673709440.0, + "grad_norm": 11.505784765453674, + "language_loss": 0.76370841, + "learning_rate": 1.0845488893405068e-06, + "loss": 0.7845329, + "num_input_tokens_seen": 237597960, + "step": 11007, + "time_per_iteration": 2.7530224323272705 + }, + { + "auxiliary_loss_clip": 0.01052911, + "auxiliary_loss_mlp": 0.01027117, + "balance_loss_clip": 1.02536118, + "balance_loss_mlp": 1.01651931, + "epoch": 0.6618367653690065, + "flos": 20850418126080.0, + "grad_norm": 1.534844189451509, + "language_loss": 0.78416234, + "learning_rate": 1.0842026388361248e-06, + "loss": 0.80496264, + "num_input_tokens_seen": 237616385, + "step": 11008, + "time_per_iteration": 2.7670931816101074 + }, + { + "auxiliary_loss_clip": 0.01067565, + "auxiliary_loss_mlp": 0.01027736, + "balance_loss_clip": 1.02670205, + "balance_loss_mlp": 1.01562476, + "epoch": 0.6618968886216744, + "flos": 17712328377600.0, + "grad_norm": 2.0475763102827846, + "language_loss": 0.81625354, + "learning_rate": 1.0838564230588715e-06, + "loss": 0.83720654, + "num_input_tokens_seen": 237634930, + "step": 11009, + "time_per_iteration": 2.684849739074707 + }, + { + "auxiliary_loss_clip": 0.00980825, + "auxiliary_loss_mlp": 0.01001298, + "balance_loss_clip": 1.00681329, + "balance_loss_mlp": 1.00008249, + "epoch": 0.6619570118743424, + "flos": 67035347498880.0, + "grad_norm": 0.9800304312726358, + "language_loss": 0.67379946, + "learning_rate": 1.0835102420218735e-06, + "loss": 0.69362068, + "num_input_tokens_seen": 237693175, + "step": 11010, + "time_per_iteration": 3.2007906436920166 + }, + { + "auxiliary_loss_clip": 0.01054348, + "auxiliary_loss_mlp": 0.0103108, + "balance_loss_clip": 1.0252502, + "balance_loss_mlp": 1.01940298, + "epoch": 0.6620171351270103, + "flos": 18660908695680.0, + "grad_norm": 1.5360733120035004, + "language_loss": 0.71243262, + "learning_rate": 1.0831640957382593e-06, + "loss": 0.73328686, + "num_input_tokens_seen": 237713160, + "step": 11011, + "time_per_iteration": 4.432085752487183 + }, + { + "auxiliary_loss_clip": 0.01055514, + "auxiliary_loss_mlp": 0.01031033, + "balance_loss_clip": 1.02719736, + "balance_loss_mlp": 1.02084017, + "epoch": 0.6620772583796783, + "flos": 24170503109760.0, + "grad_norm": 1.6326296731428298, + "language_loss": 0.72248995, + "learning_rate": 1.0828179842211557e-06, + "loss": 0.74335545, + "num_input_tokens_seen": 237733600, + "step": 11012, + "time_per_iteration": 2.7125821113586426 + }, + { + "auxiliary_loss_clip": 0.01047, + "auxiliary_loss_mlp": 0.01028606, + "balance_loss_clip": 1.0234468, + "balance_loss_mlp": 1.01874769, + "epoch": 0.6621373816323463, + "flos": 23623547736960.0, + "grad_norm": 1.6280837712406189, + "language_loss": 0.79236197, + "learning_rate": 1.0824719074836845e-06, + "loss": 0.81311798, + "num_input_tokens_seen": 237752135, + "step": 11013, + "time_per_iteration": 2.8050403594970703 + }, + { + "auxiliary_loss_clip": 0.01039232, + "auxiliary_loss_mlp": 0.01029114, + "balance_loss_clip": 1.02304482, + "balance_loss_mlp": 1.01793861, + "epoch": 0.6621975048850143, + "flos": 18442212739200.0, + "grad_norm": 1.845455567373693, + "language_loss": 0.70368278, + "learning_rate": 1.082125865538971e-06, + "loss": 0.72436619, + "num_input_tokens_seen": 237770735, + "step": 11014, + "time_per_iteration": 2.743624687194824 + }, + { + "auxiliary_loss_clip": 0.01042767, + "auxiliary_loss_mlp": 0.00747583, + "balance_loss_clip": 1.02612329, + "balance_loss_mlp": 1.00051999, + "epoch": 0.6622576281376823, + "flos": 14063876236800.0, + "grad_norm": 1.8269996618283553, + "language_loss": 0.77128351, + "learning_rate": 1.081779858400137e-06, + "loss": 0.78918701, + "num_input_tokens_seen": 237789005, + "step": 11015, + "time_per_iteration": 2.654448986053467 + }, + { + "auxiliary_loss_clip": 0.01053676, + "auxiliary_loss_mlp": 0.00747636, + "balance_loss_clip": 1.02611554, + "balance_loss_mlp": 1.00056577, + "epoch": 0.6623177513903502, + "flos": 17018965169280.0, + "grad_norm": 4.622406590125527, + "language_loss": 0.82349801, + "learning_rate": 1.0814338860803021e-06, + "loss": 0.84151113, + "num_input_tokens_seen": 237807740, + "step": 11016, + "time_per_iteration": 2.7584164142608643 + }, + { + "auxiliary_loss_clip": 0.01054336, + "auxiliary_loss_mlp": 0.01028832, + "balance_loss_clip": 1.0251081, + "balance_loss_mlp": 1.01760268, + "epoch": 0.6623778746430182, + "flos": 17271021882240.0, + "grad_norm": 2.0681121982743527, + "language_loss": 0.69253236, + "learning_rate": 1.0810879485925864e-06, + "loss": 0.71336401, + "num_input_tokens_seen": 237826340, + "step": 11017, + "time_per_iteration": 2.5521278381347656 + }, + { + "auxiliary_loss_clip": 0.01034376, + "auxiliary_loss_mlp": 0.01033283, + "balance_loss_clip": 1.023211, + "balance_loss_mlp": 1.02139211, + "epoch": 0.6624379978956861, + "flos": 48792688767360.0, + "grad_norm": 1.89250920342068, + "language_loss": 0.77752292, + "learning_rate": 1.0807420459501084e-06, + "loss": 0.79819953, + "num_input_tokens_seen": 237848305, + "step": 11018, + "time_per_iteration": 2.8927485942840576 + }, + { + "auxiliary_loss_clip": 0.01041686, + "auxiliary_loss_mlp": 0.01034764, + "balance_loss_clip": 1.02417147, + "balance_loss_mlp": 1.02379131, + "epoch": 0.6624981211483542, + "flos": 18952431477120.0, + "grad_norm": 2.157038017863698, + "language_loss": 0.82908893, + "learning_rate": 1.0803961781659841e-06, + "loss": 0.8498534, + "num_input_tokens_seen": 237867020, + "step": 11019, + "time_per_iteration": 2.608203411102295 + }, + { + "auxiliary_loss_clip": 0.01045551, + "auxiliary_loss_mlp": 0.00747661, + "balance_loss_clip": 1.02304626, + "balance_loss_mlp": 1.00049973, + "epoch": 0.6625582444010221, + "flos": 23256576437760.0, + "grad_norm": 1.5732722828017927, + "language_loss": 0.71823204, + "learning_rate": 1.080050345253328e-06, + "loss": 0.73616421, + "num_input_tokens_seen": 237886710, + "step": 11020, + "time_per_iteration": 2.7281339168548584 + }, + { + "auxiliary_loss_clip": 0.01039864, + "auxiliary_loss_mlp": 0.01030021, + "balance_loss_clip": 1.02371979, + "balance_loss_mlp": 1.01759958, + "epoch": 0.6626183676536901, + "flos": 21394823633280.0, + "grad_norm": 2.494003227769687, + "language_loss": 0.72498232, + "learning_rate": 1.0797045472252554e-06, + "loss": 0.74568117, + "num_input_tokens_seen": 237904795, + "step": 11021, + "time_per_iteration": 2.6019697189331055 + }, + { + "auxiliary_loss_clip": 0.01038962, + "auxiliary_loss_mlp": 0.01032797, + "balance_loss_clip": 1.02336526, + "balance_loss_mlp": 1.02109694, + "epoch": 0.662678490906358, + "flos": 14571293713920.0, + "grad_norm": 3.156116641965984, + "language_loss": 0.83304286, + "learning_rate": 1.0793587840948793e-06, + "loss": 0.85376048, + "num_input_tokens_seen": 237921320, + "step": 11022, + "time_per_iteration": 2.7528293132781982 + }, + { + "auxiliary_loss_clip": 0.01049116, + "auxiliary_loss_mlp": 0.01031944, + "balance_loss_clip": 1.02542424, + "balance_loss_mlp": 1.01897454, + "epoch": 0.662738614159026, + "flos": 15992350554240.0, + "grad_norm": 2.2010546333966436, + "language_loss": 0.72580004, + "learning_rate": 1.0790130558753099e-06, + "loss": 0.74661064, + "num_input_tokens_seen": 237933525, + "step": 11023, + "time_per_iteration": 2.591060161590576 + }, + { + "auxiliary_loss_clip": 0.01023986, + "auxiliary_loss_mlp": 0.01032235, + "balance_loss_clip": 1.02054381, + "balance_loss_mlp": 1.02104712, + "epoch": 0.6627987374116939, + "flos": 19536338966400.0, + "grad_norm": 25.497495271315483, + "language_loss": 0.7478981, + "learning_rate": 1.0786673625796574e-06, + "loss": 0.76846027, + "num_input_tokens_seen": 237953395, + "step": 11024, + "time_per_iteration": 2.774604082107544 + }, + { + "auxiliary_loss_clip": 0.0103444, + "auxiliary_loss_mlp": 0.01029684, + "balance_loss_clip": 1.02443588, + "balance_loss_mlp": 1.01840055, + "epoch": 0.662858860664362, + "flos": 15702838934400.0, + "grad_norm": 2.2914825452630585, + "language_loss": 0.69547397, + "learning_rate": 1.0783217042210306e-06, + "loss": 0.71611524, + "num_input_tokens_seen": 237971445, + "step": 11025, + "time_per_iteration": 2.697939157485962 + }, + { + "auxiliary_loss_clip": 0.01065187, + "auxiliary_loss_mlp": 0.01028974, + "balance_loss_clip": 1.02655387, + "balance_loss_mlp": 1.01779795, + "epoch": 0.6629189839170299, + "flos": 20154289570560.0, + "grad_norm": 1.509630105422901, + "language_loss": 0.78835708, + "learning_rate": 1.0779760808125379e-06, + "loss": 0.80929869, + "num_input_tokens_seen": 237989965, + "step": 11026, + "time_per_iteration": 2.657871723175049 + }, + { + "auxiliary_loss_clip": 0.01052207, + "auxiliary_loss_mlp": 0.01028727, + "balance_loss_clip": 1.02516973, + "balance_loss_mlp": 1.01834977, + "epoch": 0.6629791071696979, + "flos": 20915415786240.0, + "grad_norm": 1.9136104820050848, + "language_loss": 0.76218569, + "learning_rate": 1.0776304923672842e-06, + "loss": 0.7829951, + "num_input_tokens_seen": 238006820, + "step": 11027, + "time_per_iteration": 2.692150115966797 + }, + { + "auxiliary_loss_clip": 0.0103471, + "auxiliary_loss_mlp": 0.01033058, + "balance_loss_clip": 1.02299035, + "balance_loss_mlp": 1.02117276, + "epoch": 0.6630392304223659, + "flos": 20846898593280.0, + "grad_norm": 3.065111007886345, + "language_loss": 0.70309746, + "learning_rate": 1.0772849388983742e-06, + "loss": 0.72377515, + "num_input_tokens_seen": 238022560, + "step": 11028, + "time_per_iteration": 2.7020630836486816 + }, + { + "auxiliary_loss_clip": 0.01052151, + "auxiliary_loss_mlp": 0.01029138, + "balance_loss_clip": 1.02400351, + "balance_loss_mlp": 1.0191009, + "epoch": 0.6630993536750338, + "flos": 20995820380800.0, + "grad_norm": 1.957247888244163, + "language_loss": 0.79430103, + "learning_rate": 1.0769394204189138e-06, + "loss": 0.8151139, + "num_input_tokens_seen": 238041895, + "step": 11029, + "time_per_iteration": 2.63472056388855 + }, + { + "auxiliary_loss_clip": 0.01064207, + "auxiliary_loss_mlp": 0.01030004, + "balance_loss_clip": 1.02423823, + "balance_loss_mlp": 1.01802945, + "epoch": 0.6631594769277018, + "flos": 18259032355200.0, + "grad_norm": 2.31547915563143, + "language_loss": 0.7626726, + "learning_rate": 1.0765939369420012e-06, + "loss": 0.78361475, + "num_input_tokens_seen": 238060445, + "step": 11030, + "time_per_iteration": 2.5510785579681396 + }, + { + "auxiliary_loss_clip": 0.01060055, + "auxiliary_loss_mlp": 0.01030175, + "balance_loss_clip": 1.02747989, + "balance_loss_mlp": 1.01837921, + "epoch": 0.6632196001803697, + "flos": 17820491207040.0, + "grad_norm": 2.320939476897732, + "language_loss": 0.74818903, + "learning_rate": 1.0762484884807391e-06, + "loss": 0.76909137, + "num_input_tokens_seen": 238077080, + "step": 11031, + "time_per_iteration": 2.6970064640045166 + }, + { + "auxiliary_loss_clip": 0.01054749, + "auxiliary_loss_mlp": 0.0103437, + "balance_loss_clip": 1.02505159, + "balance_loss_mlp": 1.02293169, + "epoch": 0.6632797234330378, + "flos": 12670182581760.0, + "grad_norm": 2.618338840856229, + "language_loss": 0.74653327, + "learning_rate": 1.075903075048228e-06, + "loss": 0.76742446, + "num_input_tokens_seen": 238091045, + "step": 11032, + "time_per_iteration": 2.7404847145080566 + }, + { + "auxiliary_loss_clip": 0.01019962, + "auxiliary_loss_mlp": 0.01030957, + "balance_loss_clip": 1.02277255, + "balance_loss_mlp": 1.02043641, + "epoch": 0.6633398466857057, + "flos": 23584728113280.0, + "grad_norm": 1.6140073854400154, + "language_loss": 0.80449373, + "learning_rate": 1.0755576966575635e-06, + "loss": 0.82500291, + "num_input_tokens_seen": 238110220, + "step": 11033, + "time_per_iteration": 2.98282527923584 + }, + { + "auxiliary_loss_clip": 0.01046481, + "auxiliary_loss_mlp": 0.01027747, + "balance_loss_clip": 1.02590084, + "balance_loss_mlp": 1.01624358, + "epoch": 0.6633999699383737, + "flos": 20631686256000.0, + "grad_norm": 1.6750534662165137, + "language_loss": 0.80364084, + "learning_rate": 1.0752123533218451e-06, + "loss": 0.82438314, + "num_input_tokens_seen": 238130400, + "step": 11034, + "time_per_iteration": 2.899583339691162 + }, + { + "auxiliary_loss_clip": 0.01052664, + "auxiliary_loss_mlp": 0.01027283, + "balance_loss_clip": 1.02550495, + "balance_loss_mlp": 1.01692367, + "epoch": 0.6634600931910416, + "flos": 21797095023360.0, + "grad_norm": 1.7899310915550024, + "language_loss": 0.75609785, + "learning_rate": 1.074867045054166e-06, + "loss": 0.77689731, + "num_input_tokens_seen": 238148165, + "step": 11035, + "time_per_iteration": 4.315999746322632 + }, + { + "auxiliary_loss_clip": 0.0103252, + "auxiliary_loss_mlp": 0.0102457, + "balance_loss_clip": 1.02261782, + "balance_loss_mlp": 1.01333499, + "epoch": 0.6635202164437096, + "flos": 18732873594240.0, + "grad_norm": 2.3671499982131374, + "language_loss": 0.82936895, + "learning_rate": 1.074521771867622e-06, + "loss": 0.84993988, + "num_input_tokens_seen": 238166360, + "step": 11036, + "time_per_iteration": 4.386494159698486 + }, + { + "auxiliary_loss_clip": 0.0100838, + "auxiliary_loss_mlp": 0.0100268, + "balance_loss_clip": 1.00274968, + "balance_loss_mlp": 1.00157166, + "epoch": 0.6635803396963775, + "flos": 60222771227520.0, + "grad_norm": 0.7777744726764898, + "language_loss": 0.52241731, + "learning_rate": 1.0741765337753044e-06, + "loss": 0.54252791, + "num_input_tokens_seen": 238227630, + "step": 11037, + "time_per_iteration": 3.177797317504883 + }, + { + "auxiliary_loss_clip": 0.01015721, + "auxiliary_loss_mlp": 0.01037204, + "balance_loss_clip": 1.02574384, + "balance_loss_mlp": 1.02487803, + "epoch": 0.6636404629490456, + "flos": 29167041611520.0, + "grad_norm": 1.5643998969183113, + "language_loss": 0.78911614, + "learning_rate": 1.0738313307903052e-06, + "loss": 0.80964535, + "num_input_tokens_seen": 238248435, + "step": 11038, + "time_per_iteration": 2.8534865379333496 + }, + { + "auxiliary_loss_clip": 0.0103487, + "auxiliary_loss_mlp": 0.01033292, + "balance_loss_clip": 1.02613664, + "balance_loss_mlp": 1.0217768, + "epoch": 0.6637005862017135, + "flos": 38907702766080.0, + "grad_norm": 1.790165589448605, + "language_loss": 0.64144063, + "learning_rate": 1.073486162925716e-06, + "loss": 0.66212225, + "num_input_tokens_seen": 238268755, + "step": 11039, + "time_per_iteration": 2.9101414680480957 + }, + { + "auxiliary_loss_clip": 0.01030693, + "auxiliary_loss_mlp": 0.01026712, + "balance_loss_clip": 1.02519679, + "balance_loss_mlp": 1.01578093, + "epoch": 0.6637607094543815, + "flos": 22783345729920.0, + "grad_norm": 4.026378517414929, + "language_loss": 0.6404053, + "learning_rate": 1.0731410301946237e-06, + "loss": 0.66097939, + "num_input_tokens_seen": 238290120, + "step": 11040, + "time_per_iteration": 2.7821125984191895 + }, + { + "auxiliary_loss_clip": 0.01025766, + "auxiliary_loss_mlp": 0.01037621, + "balance_loss_clip": 1.02163923, + "balance_loss_mlp": 1.02575946, + "epoch": 0.6638208327070495, + "flos": 18114096977280.0, + "grad_norm": 3.0938194088458406, + "language_loss": 0.71843886, + "learning_rate": 1.0727959326101161e-06, + "loss": 0.73907268, + "num_input_tokens_seen": 238309290, + "step": 11041, + "time_per_iteration": 2.6894779205322266 + }, + { + "auxiliary_loss_clip": 0.01046509, + "auxiliary_loss_mlp": 0.01037197, + "balance_loss_clip": 1.02313781, + "balance_loss_mlp": 1.02426851, + "epoch": 0.6638809559597174, + "flos": 29424880414080.0, + "grad_norm": 2.4044661806099668, + "language_loss": 0.61637819, + "learning_rate": 1.0724508701852806e-06, + "loss": 0.63721514, + "num_input_tokens_seen": 238327280, + "step": 11042, + "time_per_iteration": 2.7587671279907227 + }, + { + "auxiliary_loss_clip": 0.01056621, + "auxiliary_loss_mlp": 0.01027925, + "balance_loss_clip": 1.02467048, + "balance_loss_mlp": 1.01564658, + "epoch": 0.6639410792123854, + "flos": 28072699902720.0, + "grad_norm": 1.933107588915863, + "language_loss": 0.67948091, + "learning_rate": 1.0721058429331998e-06, + "loss": 0.70032644, + "num_input_tokens_seen": 238346330, + "step": 11043, + "time_per_iteration": 2.732978343963623 + }, + { + "auxiliary_loss_clip": 0.01052187, + "auxiliary_loss_mlp": 0.01025468, + "balance_loss_clip": 1.02615094, + "balance_loss_mlp": 1.01596713, + "epoch": 0.6640012024650533, + "flos": 25556367600000.0, + "grad_norm": 1.5804163523743342, + "language_loss": 0.83870435, + "learning_rate": 1.0717608508669587e-06, + "loss": 0.85948086, + "num_input_tokens_seen": 238364650, + "step": 11044, + "time_per_iteration": 2.6703763008117676 + }, + { + "auxiliary_loss_clip": 0.01029221, + "auxiliary_loss_mlp": 0.01027191, + "balance_loss_clip": 1.02330041, + "balance_loss_mlp": 1.01594961, + "epoch": 0.6640613257177214, + "flos": 14866946559360.0, + "grad_norm": 3.000461108590188, + "language_loss": 0.69643378, + "learning_rate": 1.0714158939996392e-06, + "loss": 0.71699798, + "num_input_tokens_seen": 238381630, + "step": 11045, + "time_per_iteration": 2.742177963256836 + }, + { + "auxiliary_loss_clip": 0.0105569, + "auxiliary_loss_mlp": 0.01027872, + "balance_loss_clip": 1.02638149, + "balance_loss_mlp": 1.01674366, + "epoch": 0.6641214489703893, + "flos": 23221096778880.0, + "grad_norm": 1.3752816660644778, + "language_loss": 0.64568782, + "learning_rate": 1.0710709723443235e-06, + "loss": 0.6665234, + "num_input_tokens_seen": 238402595, + "step": 11046, + "time_per_iteration": 2.7721266746520996 + }, + { + "auxiliary_loss_clip": 0.01034435, + "auxiliary_loss_mlp": 0.01025983, + "balance_loss_clip": 1.02525938, + "balance_loss_mlp": 1.01521206, + "epoch": 0.6641815722230573, + "flos": 37742617221120.0, + "grad_norm": 1.5769775115057556, + "language_loss": 0.71085024, + "learning_rate": 1.070726085914088e-06, + "loss": 0.73145437, + "num_input_tokens_seen": 238426860, + "step": 11047, + "time_per_iteration": 2.926011800765991 + }, + { + "auxiliary_loss_clip": 0.01011407, + "auxiliary_loss_mlp": 0.01032473, + "balance_loss_clip": 1.02767646, + "balance_loss_mlp": 1.02076054, + "epoch": 0.6642416954757252, + "flos": 17931132074880.0, + "grad_norm": 1.9704577173858127, + "language_loss": 0.77310908, + "learning_rate": 1.0703812347220126e-06, + "loss": 0.79354787, + "num_input_tokens_seen": 238443990, + "step": 11048, + "time_per_iteration": 2.8275609016418457 + }, + { + "auxiliary_loss_clip": 0.00984271, + "auxiliary_loss_mlp": 0.01002052, + "balance_loss_clip": 1.00737309, + "balance_loss_mlp": 1.00097883, + "epoch": 0.6643018187283932, + "flos": 51995384104320.0, + "grad_norm": 0.7530351249122448, + "language_loss": 0.55050063, + "learning_rate": 1.0700364187811745e-06, + "loss": 0.57036388, + "num_input_tokens_seen": 238503045, + "step": 11049, + "time_per_iteration": 3.345238447189331 + }, + { + "auxiliary_loss_clip": 0.0105552, + "auxiliary_loss_mlp": 0.01029276, + "balance_loss_clip": 1.02628922, + "balance_loss_mlp": 1.01880991, + "epoch": 0.6643619419810611, + "flos": 30226657847040.0, + "grad_norm": 1.7645587424810836, + "language_loss": 0.64505517, + "learning_rate": 1.069691638104648e-06, + "loss": 0.66590309, + "num_input_tokens_seen": 238527320, + "step": 11050, + "time_per_iteration": 4.322990894317627 + }, + { + "auxiliary_loss_clip": 0.01061278, + "auxiliary_loss_mlp": 0.01027607, + "balance_loss_clip": 1.02494955, + "balance_loss_mlp": 1.01739085, + "epoch": 0.6644220652337292, + "flos": 22966131064320.0, + "grad_norm": 2.43752186465122, + "language_loss": 0.78564036, + "learning_rate": 1.0693468927055085e-06, + "loss": 0.80652916, + "num_input_tokens_seen": 238546030, + "step": 11051, + "time_per_iteration": 2.6932849884033203 + }, + { + "auxiliary_loss_clip": 0.01044984, + "auxiliary_loss_mlp": 0.0103077, + "balance_loss_clip": 1.02726817, + "balance_loss_mlp": 1.01946914, + "epoch": 0.6644821884863971, + "flos": 21142228216320.0, + "grad_norm": 5.10557446291787, + "language_loss": 0.85291314, + "learning_rate": 1.0690021825968276e-06, + "loss": 0.87367076, + "num_input_tokens_seen": 238564175, + "step": 11052, + "time_per_iteration": 2.730074167251587 + }, + { + "auxiliary_loss_clip": 0.01017632, + "auxiliary_loss_mlp": 0.01033366, + "balance_loss_clip": 1.02345502, + "balance_loss_mlp": 1.02059901, + "epoch": 0.6645423117390651, + "flos": 20192821885440.0, + "grad_norm": 2.8840450185210096, + "language_loss": 0.74687034, + "learning_rate": 1.0686575077916776e-06, + "loss": 0.76738036, + "num_input_tokens_seen": 238581010, + "step": 11053, + "time_per_iteration": 2.8562405109405518 + }, + { + "auxiliary_loss_clip": 0.01033395, + "auxiliary_loss_mlp": 0.01023831, + "balance_loss_clip": 1.02404046, + "balance_loss_mlp": 1.01363277, + "epoch": 0.6646024349917331, + "flos": 24351959640960.0, + "grad_norm": 1.5117156585003508, + "language_loss": 0.79546219, + "learning_rate": 1.0683128683031278e-06, + "loss": 0.81603444, + "num_input_tokens_seen": 238601365, + "step": 11054, + "time_per_iteration": 2.706912040710449 + }, + { + "auxiliary_loss_clip": 0.01020763, + "auxiliary_loss_mlp": 0.01026524, + "balance_loss_clip": 1.02330077, + "balance_loss_mlp": 1.01634312, + "epoch": 0.664662558244401, + "flos": 18806706000000.0, + "grad_norm": 1.6028028204228493, + "language_loss": 0.74255425, + "learning_rate": 1.0679682641442472e-06, + "loss": 0.76302707, + "num_input_tokens_seen": 238619850, + "step": 11055, + "time_per_iteration": 2.728957176208496 + }, + { + "auxiliary_loss_clip": 0.01029816, + "auxiliary_loss_mlp": 0.01038561, + "balance_loss_clip": 1.02348542, + "balance_loss_mlp": 1.026443, + "epoch": 0.664722681497069, + "flos": 18952790613120.0, + "grad_norm": 1.537438378590107, + "language_loss": 0.72376668, + "learning_rate": 1.0676236953281042e-06, + "loss": 0.74445045, + "num_input_tokens_seen": 238637635, + "step": 11056, + "time_per_iteration": 2.8777475357055664 + }, + { + "auxiliary_loss_clip": 0.0102114, + "auxiliary_loss_mlp": 0.01023321, + "balance_loss_clip": 1.02409017, + "balance_loss_mlp": 1.01265812, + "epoch": 0.6647828047497369, + "flos": 19571279921280.0, + "grad_norm": 1.743580733814967, + "language_loss": 0.69343984, + "learning_rate": 1.0672791618677641e-06, + "loss": 0.71388435, + "num_input_tokens_seen": 238656200, + "step": 11057, + "time_per_iteration": 2.7082929611206055 + }, + { + "auxiliary_loss_clip": 0.01052685, + "auxiliary_loss_mlp": 0.0102718, + "balance_loss_clip": 1.02412581, + "balance_loss_mlp": 1.01585531, + "epoch": 0.664842928002405, + "flos": 23149455102720.0, + "grad_norm": 1.779855482135263, + "language_loss": 0.80300725, + "learning_rate": 1.066934663776291e-06, + "loss": 0.82380593, + "num_input_tokens_seen": 238675005, + "step": 11058, + "time_per_iteration": 4.1954076290130615 + }, + { + "auxiliary_loss_clip": 0.00980072, + "auxiliary_loss_mlp": 0.01002698, + "balance_loss_clip": 1.00367236, + "balance_loss_mlp": 1.00172698, + "epoch": 0.6649030512550729, + "flos": 65244913148160.0, + "grad_norm": 0.8092969965772404, + "language_loss": 0.62620366, + "learning_rate": 1.0665902010667496e-06, + "loss": 0.64603138, + "num_input_tokens_seen": 238731425, + "step": 11059, + "time_per_iteration": 3.1484334468841553 + }, + { + "auxiliary_loss_clip": 0.01052143, + "auxiliary_loss_mlp": 0.01030748, + "balance_loss_clip": 1.02499866, + "balance_loss_mlp": 1.02078784, + "epoch": 0.6649631745077409, + "flos": 20194797133440.0, + "grad_norm": 1.4188009641330157, + "language_loss": 0.78997433, + "learning_rate": 1.0662457737522008e-06, + "loss": 0.81080329, + "num_input_tokens_seen": 238752020, + "step": 11060, + "time_per_iteration": 2.613710403442383 + }, + { + "auxiliary_loss_clip": 0.0103188, + "auxiliary_loss_mlp": 0.01030762, + "balance_loss_clip": 1.02454519, + "balance_loss_mlp": 1.01910889, + "epoch": 0.6650232977604088, + "flos": 17238558965760.0, + "grad_norm": 5.318421086804532, + "language_loss": 0.79058248, + "learning_rate": 1.0659013818457055e-06, + "loss": 0.81120896, + "num_input_tokens_seen": 238769665, + "step": 11061, + "time_per_iteration": 2.705893039703369 + }, + { + "auxiliary_loss_clip": 0.01045284, + "auxiliary_loss_mlp": 0.01026641, + "balance_loss_clip": 1.02746058, + "balance_loss_mlp": 1.01612663, + "epoch": 0.6650834210130768, + "flos": 10006867825920.0, + "grad_norm": 2.22730988961349, + "language_loss": 0.5645659, + "learning_rate": 1.0655570253603243e-06, + "loss": 0.58528519, + "num_input_tokens_seen": 238782180, + "step": 11062, + "time_per_iteration": 2.7316126823425293 + }, + { + "auxiliary_loss_clip": 0.01049843, + "auxiliary_loss_mlp": 0.01029541, + "balance_loss_clip": 1.02379608, + "balance_loss_mlp": 1.01531374, + "epoch": 0.6651435442657447, + "flos": 10452088903680.0, + "grad_norm": 2.440171902387774, + "language_loss": 0.76162219, + "learning_rate": 1.0652127043091144e-06, + "loss": 0.78241605, + "num_input_tokens_seen": 238800315, + "step": 11063, + "time_per_iteration": 2.9060299396514893 + }, + { + "auxiliary_loss_clip": 0.01010286, + "auxiliary_loss_mlp": 0.01036876, + "balance_loss_clip": 1.02445602, + "balance_loss_mlp": 1.02478778, + "epoch": 0.6652036675184128, + "flos": 22344229964160.0, + "grad_norm": 2.412241283573578, + "language_loss": 0.70660257, + "learning_rate": 1.0648684187051316e-06, + "loss": 0.72707415, + "num_input_tokens_seen": 238822250, + "step": 11064, + "time_per_iteration": 2.8113274574279785 + }, + { + "auxiliary_loss_clip": 0.01006555, + "auxiliary_loss_mlp": 0.0099997, + "balance_loss_clip": 1.00103104, + "balance_loss_mlp": 0.99896872, + "epoch": 0.6652637907710807, + "flos": 52909633998720.0, + "grad_norm": 0.8563766985048493, + "language_loss": 0.63165861, + "learning_rate": 1.0645241685614322e-06, + "loss": 0.65172386, + "num_input_tokens_seen": 238877190, + "step": 11065, + "time_per_iteration": 3.0831263065338135 + }, + { + "auxiliary_loss_clip": 0.01046107, + "auxiliary_loss_mlp": 0.01034661, + "balance_loss_clip": 1.02295685, + "balance_loss_mlp": 1.02197742, + "epoch": 0.6653239140237487, + "flos": 23104637907840.0, + "grad_norm": 1.6561834205609063, + "language_loss": 0.6235081, + "learning_rate": 1.0641799538910708e-06, + "loss": 0.64431572, + "num_input_tokens_seen": 238896010, + "step": 11066, + "time_per_iteration": 2.724497079849243 + }, + { + "auxiliary_loss_clip": 0.01023862, + "auxiliary_loss_mlp": 0.01035962, + "balance_loss_clip": 1.02103853, + "balance_loss_mlp": 1.02193725, + "epoch": 0.6653840372764167, + "flos": 25959393175680.0, + "grad_norm": 1.5093728243962372, + "language_loss": 0.69944298, + "learning_rate": 1.0638357747070985e-06, + "loss": 0.72004116, + "num_input_tokens_seen": 238918990, + "step": 11067, + "time_per_iteration": 2.7304821014404297 + }, + { + "auxiliary_loss_clip": 0.00988814, + "auxiliary_loss_mlp": 0.01008581, + "balance_loss_clip": 1.00287771, + "balance_loss_mlp": 1.00737667, + "epoch": 0.6654441605290846, + "flos": 66041985899520.0, + "grad_norm": 0.9079549129607299, + "language_loss": 0.72073233, + "learning_rate": 1.0634916310225684e-06, + "loss": 0.74070632, + "num_input_tokens_seen": 238975735, + "step": 11068, + "time_per_iteration": 3.2397382259368896 + }, + { + "auxiliary_loss_clip": 0.0097905, + "auxiliary_loss_mlp": 0.01001958, + "balance_loss_clip": 1.00280976, + "balance_loss_mlp": 1.00095618, + "epoch": 0.6655042837817526, + "flos": 65196112521600.0, + "grad_norm": 0.7058486556121957, + "language_loss": 0.57800496, + "learning_rate": 1.0631475228505285e-06, + "loss": 0.59781504, + "num_input_tokens_seen": 239042360, + "step": 11069, + "time_per_iteration": 3.3990139961242676 + }, + { + "auxiliary_loss_clip": 0.00985319, + "auxiliary_loss_mlp": 0.01001955, + "balance_loss_clip": 1.00106502, + "balance_loss_mlp": 1.00097108, + "epoch": 0.6655644070344205, + "flos": 69008746752000.0, + "grad_norm": 0.7625939837367542, + "language_loss": 0.635396, + "learning_rate": 1.062803450204029e-06, + "loss": 0.65526867, + "num_input_tokens_seen": 239109410, + "step": 11070, + "time_per_iteration": 3.2654178142547607 + }, + { + "auxiliary_loss_clip": 0.01061492, + "auxiliary_loss_mlp": 0.01026396, + "balance_loss_clip": 1.02323258, + "balance_loss_mlp": 1.01578629, + "epoch": 0.6656245302870886, + "flos": 36315562809600.0, + "grad_norm": 1.6342370266733426, + "language_loss": 0.58647633, + "learning_rate": 1.062459413096116e-06, + "loss": 0.60735512, + "num_input_tokens_seen": 239135345, + "step": 11071, + "time_per_iteration": 2.817368984222412 + }, + { + "auxiliary_loss_clip": 0.01056299, + "auxiliary_loss_mlp": 0.01025995, + "balance_loss_clip": 1.02654815, + "balance_loss_mlp": 1.01548052, + "epoch": 0.6656846535397565, + "flos": 21794832466560.0, + "grad_norm": 2.0199564736908853, + "language_loss": 0.72933209, + "learning_rate": 1.0621154115398364e-06, + "loss": 0.75015497, + "num_input_tokens_seen": 239154340, + "step": 11072, + "time_per_iteration": 2.875349283218384 + }, + { + "auxiliary_loss_clip": 0.010548, + "auxiliary_loss_mlp": 0.0102981, + "balance_loss_clip": 1.02705586, + "balance_loss_mlp": 1.01811504, + "epoch": 0.6657447767924245, + "flos": 37487615592960.0, + "grad_norm": 1.7377326380763727, + "language_loss": 0.70501912, + "learning_rate": 1.0617714455482353e-06, + "loss": 0.72586513, + "num_input_tokens_seen": 239177815, + "step": 11073, + "time_per_iteration": 2.7996394634246826 + }, + { + "auxiliary_loss_clip": 0.01036564, + "auxiliary_loss_mlp": 0.01031525, + "balance_loss_clip": 1.02586794, + "balance_loss_mlp": 1.01989603, + "epoch": 0.6658049000450924, + "flos": 16837688206080.0, + "grad_norm": 1.8814322899230262, + "language_loss": 0.56369489, + "learning_rate": 1.061427515134354e-06, + "loss": 0.5843758, + "num_input_tokens_seen": 239195735, + "step": 11074, + "time_per_iteration": 2.8005411624908447 + }, + { + "auxiliary_loss_clip": 0.01064692, + "auxiliary_loss_mlp": 0.00747682, + "balance_loss_clip": 1.02626002, + "balance_loss_mlp": 1.00053787, + "epoch": 0.6658650232977604, + "flos": 33510975863040.0, + "grad_norm": 1.645389905792931, + "language_loss": 0.72688234, + "learning_rate": 1.061083620311235e-06, + "loss": 0.74500608, + "num_input_tokens_seen": 239217535, + "step": 11075, + "time_per_iteration": 2.646167516708374 + }, + { + "auxiliary_loss_clip": 0.01050746, + "auxiliary_loss_mlp": 0.0102777, + "balance_loss_clip": 1.02407789, + "balance_loss_mlp": 1.01746464, + "epoch": 0.6659251465504283, + "flos": 37706311549440.0, + "grad_norm": 1.4725874262795424, + "language_loss": 0.65812218, + "learning_rate": 1.0607397610919202e-06, + "loss": 0.67890733, + "num_input_tokens_seen": 239241975, + "step": 11076, + "time_per_iteration": 2.714024305343628 + }, + { + "auxiliary_loss_clip": 0.01037304, + "auxiliary_loss_mlp": 0.01032362, + "balance_loss_clip": 1.02238226, + "balance_loss_mlp": 1.02020872, + "epoch": 0.6659852698030964, + "flos": 24893420232960.0, + "grad_norm": 1.6356987366321978, + "language_loss": 0.75021344, + "learning_rate": 1.0603959374894468e-06, + "loss": 0.77091002, + "num_input_tokens_seen": 239262025, + "step": 11077, + "time_per_iteration": 2.7084169387817383 + }, + { + "auxiliary_loss_clip": 0.01042017, + "auxiliary_loss_mlp": 0.01027129, + "balance_loss_clip": 1.02324152, + "balance_loss_mlp": 1.01584637, + "epoch": 0.6660453930557643, + "flos": 24352821567360.0, + "grad_norm": 1.7946954876500738, + "language_loss": 0.67022341, + "learning_rate": 1.0600521495168538e-06, + "loss": 0.69091493, + "num_input_tokens_seen": 239282775, + "step": 11078, + "time_per_iteration": 2.6821322441101074 + }, + { + "auxiliary_loss_clip": 0.01064232, + "auxiliary_loss_mlp": 0.01028626, + "balance_loss_clip": 1.02434397, + "balance_loss_mlp": 1.01705122, + "epoch": 0.6661055163084323, + "flos": 10597814380800.0, + "grad_norm": 2.4561232143574956, + "language_loss": 0.69610524, + "learning_rate": 1.0597083971871783e-06, + "loss": 0.71703386, + "num_input_tokens_seen": 239299775, + "step": 11079, + "time_per_iteration": 2.59477162361145 + }, + { + "auxiliary_loss_clip": 0.01041773, + "auxiliary_loss_mlp": 0.01024906, + "balance_loss_clip": 1.02413082, + "balance_loss_mlp": 1.01467812, + "epoch": 0.6661656395611003, + "flos": 24057491944320.0, + "grad_norm": 1.8509657261763854, + "language_loss": 0.80217886, + "learning_rate": 1.0593646805134544e-06, + "loss": 0.8228457, + "num_input_tokens_seen": 239319660, + "step": 11080, + "time_per_iteration": 2.714597225189209 + }, + { + "auxiliary_loss_clip": 0.01029583, + "auxiliary_loss_mlp": 0.01029409, + "balance_loss_clip": 1.02356434, + "balance_loss_mlp": 1.01892447, + "epoch": 0.6662257628137682, + "flos": 23036192542080.0, + "grad_norm": 1.832795003919192, + "language_loss": 0.78233767, + "learning_rate": 1.0590209995087157e-06, + "loss": 0.80292749, + "num_input_tokens_seen": 239339215, + "step": 11081, + "time_per_iteration": 2.7389185428619385 + }, + { + "auxiliary_loss_clip": 0.01026852, + "auxiliary_loss_mlp": 0.0103502, + "balance_loss_clip": 1.02242112, + "balance_loss_mlp": 1.02139449, + "epoch": 0.6662858860664362, + "flos": 24754446512640.0, + "grad_norm": 1.8485345561707993, + "language_loss": 0.7953887, + "learning_rate": 1.0586773541859946e-06, + "loss": 0.81600738, + "num_input_tokens_seen": 239358545, + "step": 11082, + "time_per_iteration": 4.375001907348633 + }, + { + "auxiliary_loss_clip": 0.01025194, + "auxiliary_loss_mlp": 0.0103034, + "balance_loss_clip": 1.02395725, + "balance_loss_mlp": 1.0203805, + "epoch": 0.6663460093191041, + "flos": 20009066883840.0, + "grad_norm": 1.5418431137341793, + "language_loss": 0.83982754, + "learning_rate": 1.0583337445583234e-06, + "loss": 0.86038285, + "num_input_tokens_seen": 239376665, + "step": 11083, + "time_per_iteration": 4.4596171379089355 + }, + { + "auxiliary_loss_clip": 0.01041442, + "auxiliary_loss_mlp": 0.01033565, + "balance_loss_clip": 1.02933478, + "balance_loss_mlp": 1.02123833, + "epoch": 0.6664061325717722, + "flos": 17821389047040.0, + "grad_norm": 2.276410857942496, + "language_loss": 0.85296267, + "learning_rate": 1.057990170638731e-06, + "loss": 0.87371272, + "num_input_tokens_seen": 239394345, + "step": 11084, + "time_per_iteration": 2.864501714706421 + }, + { + "auxiliary_loss_clip": 0.0104458, + "auxiliary_loss_mlp": 0.01026409, + "balance_loss_clip": 1.02432418, + "balance_loss_mlp": 1.01469076, + "epoch": 0.6664662558244401, + "flos": 18076893465600.0, + "grad_norm": 2.5632970098581214, + "language_loss": 0.72819185, + "learning_rate": 1.0576466324402452e-06, + "loss": 0.74890172, + "num_input_tokens_seen": 239410605, + "step": 11085, + "time_per_iteration": 2.6565866470336914 + }, + { + "auxiliary_loss_clip": 0.01040943, + "auxiliary_loss_mlp": 0.01027732, + "balance_loss_clip": 1.02324176, + "balance_loss_mlp": 1.01664591, + "epoch": 0.6665263790771081, + "flos": 21574197175680.0, + "grad_norm": 1.949581887760537, + "language_loss": 0.80435371, + "learning_rate": 1.057303129975894e-06, + "loss": 0.82504046, + "num_input_tokens_seen": 239427155, + "step": 11086, + "time_per_iteration": 2.745145559310913 + }, + { + "auxiliary_loss_clip": 0.01042885, + "auxiliary_loss_mlp": 0.01032212, + "balance_loss_clip": 1.02407289, + "balance_loss_mlp": 1.02038622, + "epoch": 0.666586502329776, + "flos": 24206629213440.0, + "grad_norm": 1.8918406021904224, + "language_loss": 0.7491262, + "learning_rate": 1.056959663258702e-06, + "loss": 0.7698772, + "num_input_tokens_seen": 239445510, + "step": 11087, + "time_per_iteration": 2.7555081844329834 + }, + { + "auxiliary_loss_clip": 0.01053284, + "auxiliary_loss_mlp": 0.01029173, + "balance_loss_clip": 1.02503049, + "balance_loss_mlp": 1.01815248, + "epoch": 0.666646625582444, + "flos": 22200515648640.0, + "grad_norm": 1.5451216601198987, + "language_loss": 0.65001875, + "learning_rate": 1.0566162323016939e-06, + "loss": 0.67084324, + "num_input_tokens_seen": 239464805, + "step": 11088, + "time_per_iteration": 2.682962417602539 + }, + { + "auxiliary_loss_clip": 0.01050685, + "auxiliary_loss_mlp": 0.01025386, + "balance_loss_clip": 1.02464533, + "balance_loss_mlp": 1.0138402, + "epoch": 0.6667067488351119, + "flos": 18259930195200.0, + "grad_norm": 1.7251741837522545, + "language_loss": 0.64281911, + "learning_rate": 1.0562728371178928e-06, + "loss": 0.66357982, + "num_input_tokens_seen": 239483890, + "step": 11089, + "time_per_iteration": 2.7295894622802734 + }, + { + "auxiliary_loss_clip": 0.01063387, + "auxiliary_loss_mlp": 0.01029567, + "balance_loss_clip": 1.02561915, + "balance_loss_mlp": 1.0187254, + "epoch": 0.66676687208778, + "flos": 17236547804160.0, + "grad_norm": 2.017788887824225, + "language_loss": 0.80880487, + "learning_rate": 1.0559294777203221e-06, + "loss": 0.82973444, + "num_input_tokens_seen": 239500080, + "step": 11090, + "time_per_iteration": 2.6849634647369385 + }, + { + "auxiliary_loss_clip": 0.0104162, + "auxiliary_loss_mlp": 0.01031585, + "balance_loss_clip": 1.02276528, + "balance_loss_mlp": 1.02036786, + "epoch": 0.6668269953404479, + "flos": 19752197748480.0, + "grad_norm": 2.5415561236141313, + "language_loss": 0.77699608, + "learning_rate": 1.0555861541219984e-06, + "loss": 0.79772812, + "num_input_tokens_seen": 239517335, + "step": 11091, + "time_per_iteration": 2.6484482288360596 + }, + { + "auxiliary_loss_clip": 0.01063477, + "auxiliary_loss_mlp": 0.01032612, + "balance_loss_clip": 1.02534676, + "balance_loss_mlp": 1.0212456, + "epoch": 0.6668871185931159, + "flos": 20558428467840.0, + "grad_norm": 1.762547512636011, + "language_loss": 0.79574591, + "learning_rate": 1.0552428663359425e-06, + "loss": 0.81670678, + "num_input_tokens_seen": 239536240, + "step": 11092, + "time_per_iteration": 2.622981071472168 + }, + { + "auxiliary_loss_clip": 0.00985424, + "auxiliary_loss_mlp": 0.01001479, + "balance_loss_clip": 1.00832343, + "balance_loss_mlp": 1.00042367, + "epoch": 0.6669472418457839, + "flos": 58088167735680.0, + "grad_norm": 0.7584287867774424, + "language_loss": 0.57708764, + "learning_rate": 1.0548996143751724e-06, + "loss": 0.59695661, + "num_input_tokens_seen": 239598000, + "step": 11093, + "time_per_iteration": 3.2555887699127197 + }, + { + "auxiliary_loss_clip": 0.01063616, + "auxiliary_loss_mlp": 0.01026901, + "balance_loss_clip": 1.02582991, + "balance_loss_mlp": 1.01624918, + "epoch": 0.6670073650984518, + "flos": 26065113880320.0, + "grad_norm": 1.526177306476999, + "language_loss": 0.76360315, + "learning_rate": 1.054556398252703e-06, + "loss": 0.78450829, + "num_input_tokens_seen": 239617650, + "step": 11094, + "time_per_iteration": 2.6850576400756836 + }, + { + "auxiliary_loss_clip": 0.01065209, + "auxiliary_loss_mlp": 0.01030706, + "balance_loss_clip": 1.02571797, + "balance_loss_mlp": 1.01885629, + "epoch": 0.6670674883511198, + "flos": 32416849635840.0, + "grad_norm": 1.6576255154247543, + "language_loss": 0.73191768, + "learning_rate": 1.05421321798155e-06, + "loss": 0.75287688, + "num_input_tokens_seen": 239639825, + "step": 11095, + "time_per_iteration": 2.8217782974243164 + }, + { + "auxiliary_loss_clip": 0.01049576, + "auxiliary_loss_mlp": 0.01029999, + "balance_loss_clip": 1.02450478, + "balance_loss_mlp": 1.01920485, + "epoch": 0.6671276116037878, + "flos": 18037786533120.0, + "grad_norm": 1.8820551255914906, + "language_loss": 0.73222136, + "learning_rate": 1.053870073574727e-06, + "loss": 0.75301707, + "num_input_tokens_seen": 239656300, + "step": 11096, + "time_per_iteration": 2.8060595989227295 + }, + { + "auxiliary_loss_clip": 0.01025943, + "auxiliary_loss_mlp": 0.0102895, + "balance_loss_clip": 1.02425218, + "balance_loss_mlp": 1.01795912, + "epoch": 0.6671877348564558, + "flos": 23767046570880.0, + "grad_norm": 1.779302757478565, + "language_loss": 0.64397824, + "learning_rate": 1.0535269650452456e-06, + "loss": 0.66452718, + "num_input_tokens_seen": 239676655, + "step": 11097, + "time_per_iteration": 4.477778196334839 + }, + { + "auxiliary_loss_clip": 0.01053116, + "auxiliary_loss_mlp": 0.01032718, + "balance_loss_clip": 1.02391195, + "balance_loss_mlp": 1.02100539, + "epoch": 0.6672478581091237, + "flos": 20918360701440.0, + "grad_norm": 1.9175505860848578, + "language_loss": 0.75583458, + "learning_rate": 1.0531838924061158e-06, + "loss": 0.77669287, + "num_input_tokens_seen": 239695430, + "step": 11098, + "time_per_iteration": 2.6683645248413086 + }, + { + "auxiliary_loss_clip": 0.01066066, + "auxiliary_loss_mlp": 0.01026903, + "balance_loss_clip": 1.02678657, + "balance_loss_mlp": 1.01653826, + "epoch": 0.6673079813617917, + "flos": 27855799626240.0, + "grad_norm": 1.5493913675059556, + "language_loss": 0.74161357, + "learning_rate": 1.0528408556703476e-06, + "loss": 0.76254332, + "num_input_tokens_seen": 239717070, + "step": 11099, + "time_per_iteration": 2.719437837600708 + }, + { + "auxiliary_loss_clip": 0.01050347, + "auxiliary_loss_mlp": 0.01030032, + "balance_loss_clip": 1.0233202, + "balance_loss_mlp": 1.01927936, + "epoch": 0.6673681046144596, + "flos": 21616859554560.0, + "grad_norm": 1.7849673190720696, + "language_loss": 0.78276289, + "learning_rate": 1.0524978548509502e-06, + "loss": 0.80356669, + "num_input_tokens_seen": 239737105, + "step": 11100, + "time_per_iteration": 2.757904529571533 + }, + { + "auxiliary_loss_clip": 0.01062899, + "auxiliary_loss_mlp": 0.01034628, + "balance_loss_clip": 1.02487624, + "balance_loss_mlp": 1.02372682, + "epoch": 0.6674282278671276, + "flos": 20889884194560.0, + "grad_norm": 1.6100935337465643, + "language_loss": 0.6017766, + "learning_rate": 1.0521548899609288e-06, + "loss": 0.62275183, + "num_input_tokens_seen": 239757835, + "step": 11101, + "time_per_iteration": 2.757465362548828 + }, + { + "auxiliary_loss_clip": 0.01050465, + "auxiliary_loss_mlp": 0.01034776, + "balance_loss_clip": 1.02707398, + "balance_loss_mlp": 1.02238417, + "epoch": 0.6674883511197955, + "flos": 23624194181760.0, + "grad_norm": 1.990507796446308, + "language_loss": 0.71379405, + "learning_rate": 1.0518119610132884e-06, + "loss": 0.73464644, + "num_input_tokens_seen": 239775425, + "step": 11102, + "time_per_iteration": 2.7673985958099365 + }, + { + "auxiliary_loss_clip": 0.01054058, + "auxiliary_loss_mlp": 0.01027823, + "balance_loss_clip": 1.02445316, + "balance_loss_mlp": 1.0166533, + "epoch": 0.6675484743724636, + "flos": 19609668581760.0, + "grad_norm": 1.3842978653999871, + "language_loss": 0.84498227, + "learning_rate": 1.051469068021034e-06, + "loss": 0.8658011, + "num_input_tokens_seen": 239794605, + "step": 11103, + "time_per_iteration": 2.6820571422576904 + }, + { + "auxiliary_loss_clip": 0.01042237, + "auxiliary_loss_mlp": 0.01027282, + "balance_loss_clip": 1.02390027, + "balance_loss_mlp": 1.01623774, + "epoch": 0.6676085976251315, + "flos": 14319452482560.0, + "grad_norm": 1.9270230481582893, + "language_loss": 0.78272188, + "learning_rate": 1.0511262109971668e-06, + "loss": 0.80341703, + "num_input_tokens_seen": 239812135, + "step": 11104, + "time_per_iteration": 2.6937758922576904 + }, + { + "auxiliary_loss_clip": 0.01024802, + "auxiliary_loss_mlp": 0.01031954, + "balance_loss_clip": 1.0258913, + "balance_loss_mlp": 1.02133834, + "epoch": 0.6676687208777995, + "flos": 38104596529920.0, + "grad_norm": 1.7416333518180838, + "language_loss": 0.5798645, + "learning_rate": 1.0507833899546889e-06, + "loss": 0.60043204, + "num_input_tokens_seen": 239835845, + "step": 11105, + "time_per_iteration": 4.558672189712524 + }, + { + "auxiliary_loss_clip": 0.01057919, + "auxiliary_loss_mlp": 0.01029654, + "balance_loss_clip": 1.0264312, + "balance_loss_mlp": 1.01731551, + "epoch": 0.6677288441304675, + "flos": 23981576549760.0, + "grad_norm": 1.771852204737986, + "language_loss": 0.72932065, + "learning_rate": 1.0504406049066e-06, + "loss": 0.75019634, + "num_input_tokens_seen": 239853820, + "step": 11106, + "time_per_iteration": 2.762601613998413 + }, + { + "auxiliary_loss_clip": 0.01063504, + "auxiliary_loss_mlp": 0.01030115, + "balance_loss_clip": 1.02487504, + "balance_loss_mlp": 1.01908255, + "epoch": 0.6677889673831354, + "flos": 24170682677760.0, + "grad_norm": 4.837594765465079, + "language_loss": 0.76494819, + "learning_rate": 1.0500978558659e-06, + "loss": 0.78588438, + "num_input_tokens_seen": 239873365, + "step": 11107, + "time_per_iteration": 2.7388341426849365 + }, + { + "auxiliary_loss_clip": 0.01039923, + "auxiliary_loss_mlp": 0.01027558, + "balance_loss_clip": 1.02355123, + "balance_loss_mlp": 1.01656103, + "epoch": 0.6678490906358034, + "flos": 22309648145280.0, + "grad_norm": 2.2007989861369577, + "language_loss": 0.90405655, + "learning_rate": 1.049755142845583e-06, + "loss": 0.92473131, + "num_input_tokens_seen": 239891215, + "step": 11108, + "time_per_iteration": 2.813735246658325 + }, + { + "auxiliary_loss_clip": 0.0103928, + "auxiliary_loss_mlp": 0.01023781, + "balance_loss_clip": 1.0294503, + "balance_loss_mlp": 1.01421404, + "epoch": 0.6679092138884714, + "flos": 36898752026880.0, + "grad_norm": 1.4575612552511377, + "language_loss": 0.82653594, + "learning_rate": 1.049412465858646e-06, + "loss": 0.84716654, + "num_input_tokens_seen": 239913490, + "step": 11109, + "time_per_iteration": 2.9791460037231445 + }, + { + "auxiliary_loss_clip": 0.01043003, + "auxiliary_loss_mlp": 0.01030994, + "balance_loss_clip": 1.02411556, + "balance_loss_mlp": 1.01920414, + "epoch": 0.6679693371411394, + "flos": 18150294908160.0, + "grad_norm": 1.8682154359427112, + "language_loss": 0.69311959, + "learning_rate": 1.0490698249180847e-06, + "loss": 0.71385956, + "num_input_tokens_seen": 239931565, + "step": 11110, + "time_per_iteration": 2.7967369556427 + }, + { + "auxiliary_loss_clip": 0.0104675, + "auxiliary_loss_mlp": 0.01033141, + "balance_loss_clip": 1.02635229, + "balance_loss_mlp": 1.02035618, + "epoch": 0.6680294603938073, + "flos": 27198167472000.0, + "grad_norm": 1.5324836174154963, + "language_loss": 0.73755026, + "learning_rate": 1.04872722003689e-06, + "loss": 0.75834918, + "num_input_tokens_seen": 239952395, + "step": 11111, + "time_per_iteration": 2.8057026863098145 + }, + { + "auxiliary_loss_clip": 0.01060636, + "auxiliary_loss_mlp": 0.01026895, + "balance_loss_clip": 1.02397323, + "balance_loss_mlp": 1.01638079, + "epoch": 0.6680895836464753, + "flos": 21725309692800.0, + "grad_norm": 1.983432094750477, + "language_loss": 0.65199125, + "learning_rate": 1.0483846512280553e-06, + "loss": 0.67286646, + "num_input_tokens_seen": 239968910, + "step": 11112, + "time_per_iteration": 2.637007474899292 + }, + { + "auxiliary_loss_clip": 0.01040621, + "auxiliary_loss_mlp": 0.0102954, + "balance_loss_clip": 1.02342868, + "balance_loss_mlp": 1.01841807, + "epoch": 0.6681497068991432, + "flos": 19646477043840.0, + "grad_norm": 1.958225897547328, + "language_loss": 0.63421988, + "learning_rate": 1.048042118504569e-06, + "loss": 0.65492153, + "num_input_tokens_seen": 239987680, + "step": 11113, + "time_per_iteration": 2.6997923851013184 + }, + { + "auxiliary_loss_clip": 0.01025224, + "auxiliary_loss_mlp": 0.01028315, + "balance_loss_clip": 1.02718163, + "balance_loss_mlp": 1.01804519, + "epoch": 0.6682098301518112, + "flos": 17419153570560.0, + "grad_norm": 1.8156162754588183, + "language_loss": 0.65929627, + "learning_rate": 1.047699621879422e-06, + "loss": 0.67983162, + "num_input_tokens_seen": 240005790, + "step": 11114, + "time_per_iteration": 2.7268669605255127 + }, + { + "auxiliary_loss_clip": 0.01054747, + "auxiliary_loss_mlp": 0.01033014, + "balance_loss_clip": 1.02495253, + "balance_loss_mlp": 1.02260709, + "epoch": 0.6682699534044791, + "flos": 22599016110720.0, + "grad_norm": 1.7383205984412269, + "language_loss": 0.78412712, + "learning_rate": 1.0473571613655998e-06, + "loss": 0.80500478, + "num_input_tokens_seen": 240025895, + "step": 11115, + "time_per_iteration": 2.682236909866333 + }, + { + "auxiliary_loss_clip": 0.01006794, + "auxiliary_loss_mlp": 0.00747753, + "balance_loss_clip": 1.01865268, + "balance_loss_mlp": 1.00054836, + "epoch": 0.6683300766571472, + "flos": 24863686750080.0, + "grad_norm": 1.7529628980632375, + "language_loss": 0.79666817, + "learning_rate": 1.0470147369760896e-06, + "loss": 0.81421363, + "num_input_tokens_seen": 240044880, + "step": 11116, + "time_per_iteration": 2.7968783378601074 + }, + { + "auxiliary_loss_clip": 0.01035528, + "auxiliary_loss_mlp": 0.0103334, + "balance_loss_clip": 1.02510059, + "balance_loss_mlp": 1.02118027, + "epoch": 0.6683901999098151, + "flos": 27126633536640.0, + "grad_norm": 1.7219624754193616, + "language_loss": 0.79528397, + "learning_rate": 1.0466723487238768e-06, + "loss": 0.81597269, + "num_input_tokens_seen": 240065785, + "step": 11117, + "time_per_iteration": 2.845963478088379 + }, + { + "auxiliary_loss_clip": 0.01026043, + "auxiliary_loss_mlp": 0.01032346, + "balance_loss_clip": 1.02496374, + "balance_loss_mlp": 1.02003741, + "epoch": 0.6684503231624831, + "flos": 20739023072640.0, + "grad_norm": 1.6061811975196558, + "language_loss": 0.65468597, + "learning_rate": 1.0463299966219441e-06, + "loss": 0.6752699, + "num_input_tokens_seen": 240085130, + "step": 11118, + "time_per_iteration": 2.719951629638672 + }, + { + "auxiliary_loss_clip": 0.01042141, + "auxiliary_loss_mlp": 0.01027618, + "balance_loss_clip": 1.02461982, + "balance_loss_mlp": 1.01750278, + "epoch": 0.668510446415151, + "flos": 21762189982080.0, + "grad_norm": 2.132589107571072, + "language_loss": 0.68586504, + "learning_rate": 1.0459876806832727e-06, + "loss": 0.70656258, + "num_input_tokens_seen": 240105495, + "step": 11119, + "time_per_iteration": 2.7189269065856934 + }, + { + "auxiliary_loss_clip": 0.0103625, + "auxiliary_loss_mlp": 0.01029404, + "balance_loss_clip": 1.02269959, + "balance_loss_mlp": 1.01768613, + "epoch": 0.668570569667819, + "flos": 30191250015360.0, + "grad_norm": 2.327370351170766, + "language_loss": 0.67222869, + "learning_rate": 1.0456454009208448e-06, + "loss": 0.69288528, + "num_input_tokens_seen": 240125455, + "step": 11120, + "time_per_iteration": 2.757922410964966 + }, + { + "auxiliary_loss_clip": 0.0102972, + "auxiliary_loss_mlp": 0.01030509, + "balance_loss_clip": 1.02310479, + "balance_loss_mlp": 1.01937509, + "epoch": 0.668630692920487, + "flos": 24170646764160.0, + "grad_norm": 1.691754172439797, + "language_loss": 0.72391331, + "learning_rate": 1.045303157347638e-06, + "loss": 0.7445156, + "num_input_tokens_seen": 240143870, + "step": 11121, + "time_per_iteration": 2.753350019454956 + }, + { + "auxiliary_loss_clip": 0.01040157, + "auxiliary_loss_mlp": 0.01036542, + "balance_loss_clip": 1.02257037, + "balance_loss_mlp": 1.02519369, + "epoch": 0.668690816173155, + "flos": 17457147181440.0, + "grad_norm": 3.125652733417521, + "language_loss": 0.70433831, + "learning_rate": 1.0449609499766316e-06, + "loss": 0.72510535, + "num_input_tokens_seen": 240161020, + "step": 11122, + "time_per_iteration": 2.6255087852478027 + }, + { + "auxiliary_loss_clip": 0.01001909, + "auxiliary_loss_mlp": 0.00747674, + "balance_loss_clip": 1.02082503, + "balance_loss_mlp": 1.00052917, + "epoch": 0.668750939425823, + "flos": 25005102595200.0, + "grad_norm": 1.596161155828831, + "language_loss": 0.71509421, + "learning_rate": 1.0446187788208015e-06, + "loss": 0.73259002, + "num_input_tokens_seen": 240179820, + "step": 11123, + "time_per_iteration": 2.9544529914855957 + }, + { + "auxiliary_loss_clip": 0.01048996, + "auxiliary_loss_mlp": 0.01032201, + "balance_loss_clip": 1.02862692, + "balance_loss_mlp": 1.02003574, + "epoch": 0.6688110626784909, + "flos": 24096778444800.0, + "grad_norm": 2.2802194998811527, + "language_loss": 0.79492038, + "learning_rate": 1.0442766438931244e-06, + "loss": 0.81573236, + "num_input_tokens_seen": 240200130, + "step": 11124, + "time_per_iteration": 2.928506851196289 + }, + { + "auxiliary_loss_clip": 0.01050287, + "auxiliary_loss_mlp": 0.01038731, + "balance_loss_clip": 1.02972579, + "balance_loss_mlp": 1.02697146, + "epoch": 0.6688711859311589, + "flos": 21759532375680.0, + "grad_norm": 1.5123283874432947, + "language_loss": 0.74202228, + "learning_rate": 1.0439345452065716e-06, + "loss": 0.76291245, + "num_input_tokens_seen": 240217945, + "step": 11125, + "time_per_iteration": 2.743579626083374 + }, + { + "auxiliary_loss_clip": 0.01021378, + "auxiliary_loss_mlp": 0.01031048, + "balance_loss_clip": 1.0231694, + "balance_loss_mlp": 1.01968741, + "epoch": 0.6689313091838268, + "flos": 22929645824640.0, + "grad_norm": 2.4925635124235868, + "language_loss": 0.66003245, + "learning_rate": 1.043592482774116e-06, + "loss": 0.68055665, + "num_input_tokens_seen": 240237220, + "step": 11126, + "time_per_iteration": 2.7185065746307373 + }, + { + "auxiliary_loss_clip": 0.01044128, + "auxiliary_loss_mlp": 0.01026914, + "balance_loss_clip": 1.02194786, + "balance_loss_mlp": 1.01570797, + "epoch": 0.6689914324364948, + "flos": 20886149180160.0, + "grad_norm": 1.757306899048641, + "language_loss": 0.71071076, + "learning_rate": 1.0432504566087305e-06, + "loss": 0.73142111, + "num_input_tokens_seen": 240256000, + "step": 11127, + "time_per_iteration": 2.662083387374878 + }, + { + "auxiliary_loss_clip": 0.01047072, + "auxiliary_loss_mlp": 0.0103406, + "balance_loss_clip": 1.02537417, + "balance_loss_mlp": 1.02159059, + "epoch": 0.6690515556891627, + "flos": 22748225207040.0, + "grad_norm": 1.9271704041201267, + "language_loss": 0.8004474, + "learning_rate": 1.0429084667233827e-06, + "loss": 0.82125878, + "num_input_tokens_seen": 240275845, + "step": 11128, + "time_per_iteration": 2.7163305282592773 + }, + { + "auxiliary_loss_clip": 0.01065444, + "auxiliary_loss_mlp": 0.01026962, + "balance_loss_clip": 1.02620757, + "balance_loss_mlp": 1.01576233, + "epoch": 0.6691116789418308, + "flos": 23331450337920.0, + "grad_norm": 1.825000764867642, + "language_loss": 0.8085165, + "learning_rate": 1.0425665131310427e-06, + "loss": 0.82944059, + "num_input_tokens_seen": 240294095, + "step": 11129, + "time_per_iteration": 4.336997032165527 + }, + { + "auxiliary_loss_clip": 0.01042738, + "auxiliary_loss_mlp": 0.0103491, + "balance_loss_clip": 1.02296066, + "balance_loss_mlp": 1.02434206, + "epoch": 0.6691718021944987, + "flos": 32447014081920.0, + "grad_norm": 1.5380604253853363, + "language_loss": 0.70386183, + "learning_rate": 1.0422245958446762e-06, + "loss": 0.72463828, + "num_input_tokens_seen": 240313460, + "step": 11130, + "time_per_iteration": 2.7497215270996094 + }, + { + "auxiliary_loss_clip": 0.01037346, + "auxiliary_loss_mlp": 0.01029048, + "balance_loss_clip": 1.02383375, + "balance_loss_mlp": 1.01857567, + "epoch": 0.6692319254471667, + "flos": 23731602825600.0, + "grad_norm": 1.5273016622815023, + "language_loss": 0.69658965, + "learning_rate": 1.0418827148772486e-06, + "loss": 0.71725357, + "num_input_tokens_seen": 240333540, + "step": 11131, + "time_per_iteration": 4.3027403354644775 + }, + { + "auxiliary_loss_clip": 0.01055234, + "auxiliary_loss_mlp": 0.01029785, + "balance_loss_clip": 1.02558351, + "balance_loss_mlp": 1.01717818, + "epoch": 0.6692920486998346, + "flos": 14427902620800.0, + "grad_norm": 2.8209737966601773, + "language_loss": 0.65457755, + "learning_rate": 1.0415408702417243e-06, + "loss": 0.67542768, + "num_input_tokens_seen": 240350085, + "step": 11132, + "time_per_iteration": 2.6253435611724854 + }, + { + "auxiliary_loss_clip": 0.01049578, + "auxiliary_loss_mlp": 0.01031353, + "balance_loss_clip": 1.02267385, + "balance_loss_mlp": 1.01931906, + "epoch": 0.6693521719525026, + "flos": 21507475662720.0, + "grad_norm": 1.702994024884388, + "language_loss": 0.74749041, + "learning_rate": 1.0411990619510661e-06, + "loss": 0.7682997, + "num_input_tokens_seen": 240370015, + "step": 11133, + "time_per_iteration": 2.630847215652466 + }, + { + "auxiliary_loss_clip": 0.01058668, + "auxiliary_loss_mlp": 0.01028309, + "balance_loss_clip": 1.02790606, + "balance_loss_mlp": 1.01605463, + "epoch": 0.6694122952051706, + "flos": 25406943022080.0, + "grad_norm": 2.214809163675833, + "language_loss": 0.66059017, + "learning_rate": 1.0408572900182363e-06, + "loss": 0.6814599, + "num_input_tokens_seen": 240390770, + "step": 11134, + "time_per_iteration": 2.6708593368530273 + }, + { + "auxiliary_loss_clip": 0.01052786, + "auxiliary_loss_mlp": 0.01036658, + "balance_loss_clip": 1.02555704, + "balance_loss_mlp": 1.02347326, + "epoch": 0.6694724184578386, + "flos": 25661729168640.0, + "grad_norm": 1.720653093414925, + "language_loss": 0.76923692, + "learning_rate": 1.0405155544561943e-06, + "loss": 0.79013133, + "num_input_tokens_seen": 240409590, + "step": 11135, + "time_per_iteration": 2.6804497241973877 + }, + { + "auxiliary_loss_clip": 0.01054058, + "auxiliary_loss_mlp": 0.01027507, + "balance_loss_clip": 1.02527273, + "balance_loss_mlp": 1.01624823, + "epoch": 0.6695325417105066, + "flos": 17709311635200.0, + "grad_norm": 1.569231079378904, + "language_loss": 0.74086082, + "learning_rate": 1.040173855277898e-06, + "loss": 0.76167643, + "num_input_tokens_seen": 240428180, + "step": 11136, + "time_per_iteration": 2.5877223014831543 + }, + { + "auxiliary_loss_clip": 0.0106019, + "auxiliary_loss_mlp": 0.0103417, + "balance_loss_clip": 1.0282011, + "balance_loss_mlp": 1.02124143, + "epoch": 0.6695926649631745, + "flos": 24460050643200.0, + "grad_norm": 1.5395406891217844, + "language_loss": 0.61827898, + "learning_rate": 1.0398321924963061e-06, + "loss": 0.63922262, + "num_input_tokens_seen": 240447815, + "step": 11137, + "time_per_iteration": 2.6698415279388428 + }, + { + "auxiliary_loss_clip": 0.01062732, + "auxiliary_loss_mlp": 0.01028406, + "balance_loss_clip": 1.02423549, + "balance_loss_mlp": 1.01703382, + "epoch": 0.6696527882158425, + "flos": 24280138396800.0, + "grad_norm": 1.8363073219359007, + "language_loss": 0.65807915, + "learning_rate": 1.0394905661243724e-06, + "loss": 0.6789906, + "num_input_tokens_seen": 240468635, + "step": 11138, + "time_per_iteration": 2.5508131980895996 + }, + { + "auxiliary_loss_clip": 0.0102785, + "auxiliary_loss_mlp": 0.01029566, + "balance_loss_clip": 1.02292085, + "balance_loss_mlp": 1.01792526, + "epoch": 0.6697129114685104, + "flos": 23002759958400.0, + "grad_norm": 1.7162915386978057, + "language_loss": 0.72691095, + "learning_rate": 1.039148976175053e-06, + "loss": 0.7474851, + "num_input_tokens_seen": 240488550, + "step": 11139, + "time_per_iteration": 2.6879162788391113 + }, + { + "auxiliary_loss_clip": 0.01025893, + "auxiliary_loss_mlp": 0.01031492, + "balance_loss_clip": 1.02213597, + "balance_loss_mlp": 1.02147889, + "epoch": 0.6697730347211784, + "flos": 22638123043200.0, + "grad_norm": 1.6761786376824312, + "language_loss": 0.70397782, + "learning_rate": 1.0388074226613016e-06, + "loss": 0.72455168, + "num_input_tokens_seen": 240508330, + "step": 11140, + "time_per_iteration": 2.609989643096924 + }, + { + "auxiliary_loss_clip": 0.01054025, + "auxiliary_loss_mlp": 0.0102875, + "balance_loss_clip": 1.02322221, + "balance_loss_mlp": 1.01692474, + "epoch": 0.6698331579738463, + "flos": 28877242682880.0, + "grad_norm": 1.9085424992695765, + "language_loss": 0.75923228, + "learning_rate": 1.0384659055960691e-06, + "loss": 0.78006005, + "num_input_tokens_seen": 240528470, + "step": 11141, + "time_per_iteration": 2.5598304271698 + }, + { + "auxiliary_loss_clip": 0.01055273, + "auxiliary_loss_mlp": 0.01031029, + "balance_loss_clip": 1.02584028, + "balance_loss_mlp": 1.01970422, + "epoch": 0.6698932812265144, + "flos": 24207096090240.0, + "grad_norm": 1.7893040532980826, + "language_loss": 0.82085937, + "learning_rate": 1.0381244249923052e-06, + "loss": 0.84172237, + "num_input_tokens_seen": 240547815, + "step": 11142, + "time_per_iteration": 2.655973196029663 + }, + { + "auxiliary_loss_clip": 0.01022924, + "auxiliary_loss_mlp": 0.01029165, + "balance_loss_clip": 1.0252769, + "balance_loss_mlp": 1.01766133, + "epoch": 0.6699534044791823, + "flos": 22090269830400.0, + "grad_norm": 1.485806082468032, + "language_loss": 0.70331717, + "learning_rate": 1.037782980862959e-06, + "loss": 0.72383809, + "num_input_tokens_seen": 240567765, + "step": 11143, + "time_per_iteration": 2.8863472938537598 + }, + { + "auxiliary_loss_clip": 0.01022087, + "auxiliary_loss_mlp": 0.0074766, + "balance_loss_clip": 1.02204943, + "balance_loss_mlp": 1.00060701, + "epoch": 0.6700135277318503, + "flos": 25192377129600.0, + "grad_norm": 1.72697396004918, + "language_loss": 0.70439529, + "learning_rate": 1.0374415732209796e-06, + "loss": 0.72209275, + "num_input_tokens_seen": 240590750, + "step": 11144, + "time_per_iteration": 2.7834866046905518 + }, + { + "auxiliary_loss_clip": 0.01038419, + "auxiliary_loss_mlp": 0.01028579, + "balance_loss_clip": 1.02272105, + "balance_loss_mlp": 1.0176177, + "epoch": 0.6700736509845182, + "flos": 23440187784960.0, + "grad_norm": 1.701440516703643, + "language_loss": 0.74851412, + "learning_rate": 1.0371002020793114e-06, + "loss": 0.76918411, + "num_input_tokens_seen": 240608875, + "step": 11145, + "time_per_iteration": 4.5440967082977295 + }, + { + "auxiliary_loss_clip": 0.01044663, + "auxiliary_loss_mlp": 0.01028832, + "balance_loss_clip": 1.02388883, + "balance_loss_mlp": 1.01717973, + "epoch": 0.6701337742371862, + "flos": 24389953251840.0, + "grad_norm": 1.4290197827330544, + "language_loss": 0.70460331, + "learning_rate": 1.0367588674509008e-06, + "loss": 0.72533834, + "num_input_tokens_seen": 240628565, + "step": 11146, + "time_per_iteration": 2.861536979675293 + }, + { + "auxiliary_loss_clip": 0.01060381, + "auxiliary_loss_mlp": 0.00747645, + "balance_loss_clip": 1.02432275, + "balance_loss_mlp": 1.00056481, + "epoch": 0.6701938974898543, + "flos": 14793652857600.0, + "grad_norm": 1.9264104806300706, + "language_loss": 0.78136992, + "learning_rate": 1.0364175693486905e-06, + "loss": 0.79945028, + "num_input_tokens_seen": 240646325, + "step": 11147, + "time_per_iteration": 2.698270320892334 + }, + { + "auxiliary_loss_clip": 0.01054389, + "auxiliary_loss_mlp": 0.00747633, + "balance_loss_clip": 1.02508724, + "balance_loss_mlp": 1.00060439, + "epoch": 0.6702540207425222, + "flos": 20154002261760.0, + "grad_norm": 1.8806476233866563, + "language_loss": 0.70340025, + "learning_rate": 1.0360763077856218e-06, + "loss": 0.72142053, + "num_input_tokens_seen": 240666145, + "step": 11148, + "time_per_iteration": 2.746408700942993 + }, + { + "auxiliary_loss_clip": 0.01042674, + "auxiliary_loss_mlp": 0.01032734, + "balance_loss_clip": 1.02349257, + "balance_loss_mlp": 1.02179623, + "epoch": 0.6703141439951902, + "flos": 21214157201280.0, + "grad_norm": 1.6887645063716274, + "language_loss": 0.70065904, + "learning_rate": 1.035735082774636e-06, + "loss": 0.72141314, + "num_input_tokens_seen": 240685570, + "step": 11149, + "time_per_iteration": 2.7963953018188477 + }, + { + "auxiliary_loss_clip": 0.01043638, + "auxiliary_loss_mlp": 0.0102375, + "balance_loss_clip": 1.0233804, + "balance_loss_mlp": 1.01371264, + "epoch": 0.6703742672478581, + "flos": 23112538899840.0, + "grad_norm": 1.818659845377225, + "language_loss": 0.73654842, + "learning_rate": 1.0353938943286727e-06, + "loss": 0.75722229, + "num_input_tokens_seen": 240706945, + "step": 11150, + "time_per_iteration": 2.9547383785247803 + }, + { + "auxiliary_loss_clip": 0.01054449, + "auxiliary_loss_mlp": 0.0102971, + "balance_loss_clip": 1.02593565, + "balance_loss_mlp": 1.01889157, + "epoch": 0.6704343905005261, + "flos": 22528918719360.0, + "grad_norm": 2.0969150077098107, + "language_loss": 0.78368503, + "learning_rate": 1.035052742460671e-06, + "loss": 0.80452657, + "num_input_tokens_seen": 240727990, + "step": 11151, + "time_per_iteration": 2.793337345123291 + }, + { + "auxiliary_loss_clip": 0.00963538, + "auxiliary_loss_mlp": 0.01002474, + "balance_loss_clip": 1.00680327, + "balance_loss_mlp": 1.00146699, + "epoch": 0.670494513753194, + "flos": 64793158773120.0, + "grad_norm": 0.7961948004163945, + "language_loss": 0.5547471, + "learning_rate": 1.0347116271835643e-06, + "loss": 0.57440722, + "num_input_tokens_seen": 240790380, + "step": 11152, + "time_per_iteration": 3.4157607555389404 + }, + { + "auxiliary_loss_clip": 0.01043205, + "auxiliary_loss_mlp": 0.01032552, + "balance_loss_clip": 1.02407861, + "balance_loss_mlp": 1.02134037, + "epoch": 0.670554637005862, + "flos": 23511506238720.0, + "grad_norm": 1.6690155848607249, + "language_loss": 0.80995715, + "learning_rate": 1.0343705485102896e-06, + "loss": 0.8307147, + "num_input_tokens_seen": 240811545, + "step": 11153, + "time_per_iteration": 4.324508190155029 + }, + { + "auxiliary_loss_clip": 0.01034594, + "auxiliary_loss_mlp": 0.0074775, + "balance_loss_clip": 1.02557468, + "balance_loss_mlp": 1.00056124, + "epoch": 0.67061476025853, + "flos": 19463404400640.0, + "grad_norm": 1.4791022696671918, + "language_loss": 0.75936794, + "learning_rate": 1.0340295064537814e-06, + "loss": 0.7771914, + "num_input_tokens_seen": 240831380, + "step": 11154, + "time_per_iteration": 2.881340503692627 + }, + { + "auxiliary_loss_clip": 0.01039886, + "auxiliary_loss_mlp": 0.01031793, + "balance_loss_clip": 1.02294374, + "balance_loss_mlp": 1.02009892, + "epoch": 0.670674883511198, + "flos": 20519967980160.0, + "grad_norm": 1.5469004490300033, + "language_loss": 0.75874388, + "learning_rate": 1.0336885010269702e-06, + "loss": 0.77946067, + "num_input_tokens_seen": 240851855, + "step": 11155, + "time_per_iteration": 2.7625038623809814 + }, + { + "auxiliary_loss_clip": 0.01067303, + "auxiliary_loss_mlp": 0.01032584, + "balance_loss_clip": 1.02856445, + "balance_loss_mlp": 1.02163482, + "epoch": 0.6707350067638659, + "flos": 25483971738240.0, + "grad_norm": 2.517532050968355, + "language_loss": 0.82093471, + "learning_rate": 1.0333475322427878e-06, + "loss": 0.84193367, + "num_input_tokens_seen": 240869980, + "step": 11156, + "time_per_iteration": 2.605098009109497 + }, + { + "auxiliary_loss_clip": 0.01063832, + "auxiliary_loss_mlp": 0.01028317, + "balance_loss_clip": 1.02562189, + "balance_loss_mlp": 1.01786852, + "epoch": 0.6707951300165339, + "flos": 22273450214400.0, + "grad_norm": 2.0064258667212393, + "language_loss": 0.75115913, + "learning_rate": 1.033006600114165e-06, + "loss": 0.77208066, + "num_input_tokens_seen": 240888680, + "step": 11157, + "time_per_iteration": 2.561546564102173 + }, + { + "auxiliary_loss_clip": 0.01058738, + "auxiliary_loss_mlp": 0.01033962, + "balance_loss_clip": 1.02825689, + "balance_loss_mlp": 1.02278638, + "epoch": 0.6708552532692018, + "flos": 23984593292160.0, + "grad_norm": 2.6705388615806376, + "language_loss": 0.74530274, + "learning_rate": 1.0326657046540282e-06, + "loss": 0.76622975, + "num_input_tokens_seen": 240909050, + "step": 11158, + "time_per_iteration": 2.567751884460449 + }, + { + "auxiliary_loss_clip": 0.01067257, + "auxiliary_loss_mlp": 0.01032481, + "balance_loss_clip": 1.02745533, + "balance_loss_mlp": 1.02123916, + "epoch": 0.6709153765218698, + "flos": 24937519155840.0, + "grad_norm": 1.5772567762133909, + "language_loss": 0.81560332, + "learning_rate": 1.0323248458753044e-06, + "loss": 0.83660066, + "num_input_tokens_seen": 240930035, + "step": 11159, + "time_per_iteration": 2.5780019760131836 + }, + { + "auxiliary_loss_clip": 0.01041936, + "auxiliary_loss_mlp": 0.01028684, + "balance_loss_clip": 1.02329898, + "balance_loss_mlp": 1.0179019, + "epoch": 0.6709754997745379, + "flos": 17530225401600.0, + "grad_norm": 1.7433793143319343, + "language_loss": 0.77259052, + "learning_rate": 1.0319840237909193e-06, + "loss": 0.79329669, + "num_input_tokens_seen": 240948895, + "step": 11160, + "time_per_iteration": 2.7529611587524414 + }, + { + "auxiliary_loss_clip": 0.01043218, + "auxiliary_loss_mlp": 0.01026401, + "balance_loss_clip": 1.02572107, + "balance_loss_mlp": 1.01591611, + "epoch": 0.6710356230272058, + "flos": 22090880361600.0, + "grad_norm": 1.6647797406030078, + "language_loss": 0.73521459, + "learning_rate": 1.0316432384137978e-06, + "loss": 0.75591075, + "num_input_tokens_seen": 240967770, + "step": 11161, + "time_per_iteration": 2.7064385414123535 + }, + { + "auxiliary_loss_clip": 0.01041488, + "auxiliary_loss_mlp": 0.01036191, + "balance_loss_clip": 1.02278948, + "balance_loss_mlp": 1.02449632, + "epoch": 0.6710957462798738, + "flos": 24206449645440.0, + "grad_norm": 1.6086049865820484, + "language_loss": 0.68227947, + "learning_rate": 1.0313024897568618e-06, + "loss": 0.70305622, + "num_input_tokens_seen": 240988985, + "step": 11162, + "time_per_iteration": 2.6571731567382812 + }, + { + "auxiliary_loss_clip": 0.01036047, + "auxiliary_loss_mlp": 0.01032035, + "balance_loss_clip": 1.02185631, + "balance_loss_mlp": 1.02152109, + "epoch": 0.6711558695325417, + "flos": 19093955063040.0, + "grad_norm": 1.6853402118771315, + "language_loss": 0.70202446, + "learning_rate": 1.030961777833032e-06, + "loss": 0.72270525, + "num_input_tokens_seen": 241005455, + "step": 11163, + "time_per_iteration": 2.6298580169677734 + }, + { + "auxiliary_loss_clip": 0.01063795, + "auxiliary_loss_mlp": 0.01028723, + "balance_loss_clip": 1.02680993, + "balance_loss_mlp": 1.01828647, + "epoch": 0.6712159927852097, + "flos": 25557875971200.0, + "grad_norm": 43.859068361502786, + "language_loss": 0.75542009, + "learning_rate": 1.0306211026552291e-06, + "loss": 0.77634531, + "num_input_tokens_seen": 241026175, + "step": 11164, + "time_per_iteration": 2.6508305072784424 + }, + { + "auxiliary_loss_clip": 0.01063682, + "auxiliary_loss_mlp": 0.01030909, + "balance_loss_clip": 1.02560949, + "balance_loss_mlp": 1.01978111, + "epoch": 0.6712761160378776, + "flos": 22228812587520.0, + "grad_norm": 3.804250485743904, + "language_loss": 0.65232593, + "learning_rate": 1.0302804642363704e-06, + "loss": 0.67327178, + "num_input_tokens_seen": 241044040, + "step": 11165, + "time_per_iteration": 2.5960168838500977 + }, + { + "auxiliary_loss_clip": 0.01062682, + "auxiliary_loss_mlp": 0.01026723, + "balance_loss_clip": 1.0259217, + "balance_loss_mlp": 1.01579773, + "epoch": 0.6713362392905456, + "flos": 22455517276800.0, + "grad_norm": 1.9688624424995715, + "language_loss": 0.71325684, + "learning_rate": 1.0299398625893738e-06, + "loss": 0.73415089, + "num_input_tokens_seen": 241063615, + "step": 11166, + "time_per_iteration": 2.5620687007904053 + }, + { + "auxiliary_loss_clip": 0.01062827, + "auxiliary_loss_mlp": 0.01024576, + "balance_loss_clip": 1.02645612, + "balance_loss_mlp": 1.01464629, + "epoch": 0.6713963625432136, + "flos": 25630200005760.0, + "grad_norm": 1.8577283307024288, + "language_loss": 0.77021581, + "learning_rate": 1.0295992977271546e-06, + "loss": 0.79108989, + "num_input_tokens_seen": 241082520, + "step": 11167, + "time_per_iteration": 2.607454299926758 + }, + { + "auxiliary_loss_clip": 0.01049285, + "auxiliary_loss_mlp": 0.01031412, + "balance_loss_clip": 1.02237582, + "balance_loss_mlp": 1.02099323, + "epoch": 0.6714564857958816, + "flos": 35006475640320.0, + "grad_norm": 1.7778784440805433, + "language_loss": 0.68299329, + "learning_rate": 1.029258769662629e-06, + "loss": 0.70380032, + "num_input_tokens_seen": 241103505, + "step": 11168, + "time_per_iteration": 2.777869462966919 + }, + { + "auxiliary_loss_clip": 0.01024614, + "auxiliary_loss_mlp": 0.01035849, + "balance_loss_clip": 1.02282, + "balance_loss_mlp": 1.02377319, + "epoch": 0.6715166090485495, + "flos": 26279931168000.0, + "grad_norm": 2.0893527525785194, + "language_loss": 0.73664415, + "learning_rate": 1.0289182784087068e-06, + "loss": 0.75724876, + "num_input_tokens_seen": 241122885, + "step": 11169, + "time_per_iteration": 2.7740588188171387 + }, + { + "auxiliary_loss_clip": 0.01054055, + "auxiliary_loss_mlp": 0.01030525, + "balance_loss_clip": 1.02409267, + "balance_loss_mlp": 1.01880085, + "epoch": 0.6715767323012175, + "flos": 15924156583680.0, + "grad_norm": 2.1097359190900855, + "language_loss": 0.76252055, + "learning_rate": 1.0285778239783005e-06, + "loss": 0.78336638, + "num_input_tokens_seen": 241140865, + "step": 11170, + "time_per_iteration": 2.627243995666504 + }, + { + "auxiliary_loss_clip": 0.01045806, + "auxiliary_loss_mlp": 0.01028974, + "balance_loss_clip": 1.02514148, + "balance_loss_mlp": 1.01806617, + "epoch": 0.6716368555538854, + "flos": 17491441691520.0, + "grad_norm": 2.4214314588311625, + "language_loss": 0.74553555, + "learning_rate": 1.0282374063843212e-06, + "loss": 0.76628327, + "num_input_tokens_seen": 241158225, + "step": 11171, + "time_per_iteration": 2.5674009323120117 + }, + { + "auxiliary_loss_clip": 0.01025666, + "auxiliary_loss_mlp": 0.01039558, + "balance_loss_clip": 1.02215052, + "balance_loss_mlp": 1.02698147, + "epoch": 0.6716969788065534, + "flos": 16761521416320.0, + "grad_norm": 1.4593805209433373, + "language_loss": 0.86185569, + "learning_rate": 1.0278970256396762e-06, + "loss": 0.88250792, + "num_input_tokens_seen": 241175215, + "step": 11172, + "time_per_iteration": 2.655068874359131 + }, + { + "auxiliary_loss_clip": 0.01046801, + "auxiliary_loss_mlp": 0.01031641, + "balance_loss_clip": 1.02144134, + "balance_loss_mlp": 1.02013171, + "epoch": 0.6717571020592215, + "flos": 22709800632960.0, + "grad_norm": 1.5029748169429538, + "language_loss": 0.63933069, + "learning_rate": 1.0275566817572733e-06, + "loss": 0.66011512, + "num_input_tokens_seen": 241195250, + "step": 11173, + "time_per_iteration": 2.631863832473755 + }, + { + "auxiliary_loss_clip": 0.01057386, + "auxiliary_loss_mlp": 0.0103515, + "balance_loss_clip": 1.02424932, + "balance_loss_mlp": 1.02224565, + "epoch": 0.6718172253118894, + "flos": 18734094656640.0, + "grad_norm": 2.0664369646837875, + "language_loss": 0.71213114, + "learning_rate": 1.02721637475002e-06, + "loss": 0.73305649, + "num_input_tokens_seen": 241210720, + "step": 11174, + "time_per_iteration": 2.5388035774230957 + }, + { + "auxiliary_loss_clip": 0.01026249, + "auxiliary_loss_mlp": 0.01027031, + "balance_loss_clip": 1.02471888, + "balance_loss_mlp": 1.01700521, + "epoch": 0.6718773485645574, + "flos": 15632526061440.0, + "grad_norm": 2.1717836628183096, + "language_loss": 0.68768966, + "learning_rate": 1.0268761046308178e-06, + "loss": 0.70822251, + "num_input_tokens_seen": 241227395, + "step": 11175, + "time_per_iteration": 2.5984408855438232 + }, + { + "auxiliary_loss_clip": 0.01035106, + "auxiliary_loss_mlp": 0.01029784, + "balance_loss_clip": 1.02471757, + "balance_loss_mlp": 1.01934123, + "epoch": 0.6719374718172253, + "flos": 19354774694400.0, + "grad_norm": 2.0388731008152368, + "language_loss": 0.73480153, + "learning_rate": 1.0265358714125714e-06, + "loss": 0.75545049, + "num_input_tokens_seen": 241246355, + "step": 11176, + "time_per_iteration": 2.613032817840576 + }, + { + "auxiliary_loss_clip": 0.01044051, + "auxiliary_loss_mlp": 0.01026973, + "balance_loss_clip": 1.0247122, + "balance_loss_mlp": 1.01552331, + "epoch": 0.6719975950698933, + "flos": 21981316901760.0, + "grad_norm": 2.018749807857133, + "language_loss": 0.73033094, + "learning_rate": 1.026195675108182e-06, + "loss": 0.75104117, + "num_input_tokens_seen": 241264180, + "step": 11177, + "time_per_iteration": 4.215540409088135 + }, + { + "auxiliary_loss_clip": 0.01063031, + "auxiliary_loss_mlp": 0.01035234, + "balance_loss_clip": 1.02463508, + "balance_loss_mlp": 1.02345645, + "epoch": 0.6720577183225612, + "flos": 25228072270080.0, + "grad_norm": 2.004851100781086, + "language_loss": 0.76417923, + "learning_rate": 1.025855515730551e-06, + "loss": 0.78516191, + "num_input_tokens_seen": 241282245, + "step": 11178, + "time_per_iteration": 2.5577328205108643 + }, + { + "auxiliary_loss_clip": 0.01056257, + "auxiliary_loss_mlp": 0.01030366, + "balance_loss_clip": 1.02663016, + "balance_loss_mlp": 1.01995897, + "epoch": 0.6721178415752292, + "flos": 16945886949120.0, + "grad_norm": 2.1451256000650765, + "language_loss": 0.69450468, + "learning_rate": 1.0255153932925766e-06, + "loss": 0.71537089, + "num_input_tokens_seen": 241300745, + "step": 11179, + "time_per_iteration": 4.2285988330841064 + }, + { + "auxiliary_loss_clip": 0.01016996, + "auxiliary_loss_mlp": 0.01025268, + "balance_loss_clip": 1.02279317, + "balance_loss_mlp": 1.01513517, + "epoch": 0.6721779648278972, + "flos": 21541375123200.0, + "grad_norm": 1.5346567707932013, + "language_loss": 0.7400021, + "learning_rate": 1.0251753078071557e-06, + "loss": 0.76042473, + "num_input_tokens_seen": 241319320, + "step": 11180, + "time_per_iteration": 2.7046868801116943 + }, + { + "auxiliary_loss_clip": 0.0104321, + "auxiliary_loss_mlp": 0.01028671, + "balance_loss_clip": 1.02574778, + "balance_loss_mlp": 1.01806736, + "epoch": 0.6722380880805652, + "flos": 22605444645120.0, + "grad_norm": 1.3657852698407416, + "language_loss": 0.75117034, + "learning_rate": 1.0248352592871848e-06, + "loss": 0.77188909, + "num_input_tokens_seen": 241342225, + "step": 11181, + "time_per_iteration": 2.8253321647644043 + }, + { + "auxiliary_loss_clip": 0.01046179, + "auxiliary_loss_mlp": 0.01025394, + "balance_loss_clip": 1.02575874, + "balance_loss_mlp": 1.01524329, + "epoch": 0.6722982113332331, + "flos": 15925269905280.0, + "grad_norm": 2.3494173761546375, + "language_loss": 0.74261421, + "learning_rate": 1.0244952477455585e-06, + "loss": 0.76332992, + "num_input_tokens_seen": 241358240, + "step": 11182, + "time_per_iteration": 2.6986188888549805 + }, + { + "auxiliary_loss_clip": 0.0105092, + "auxiliary_loss_mlp": 0.01029276, + "balance_loss_clip": 1.02515078, + "balance_loss_mlp": 1.01929212, + "epoch": 0.6723583345859011, + "flos": 20596170683520.0, + "grad_norm": 1.714874882146957, + "language_loss": 0.69800186, + "learning_rate": 1.0241552731951699e-06, + "loss": 0.71880388, + "num_input_tokens_seen": 241378420, + "step": 11183, + "time_per_iteration": 2.7156076431274414 + }, + { + "auxiliary_loss_clip": 0.01023842, + "auxiliary_loss_mlp": 0.01033113, + "balance_loss_clip": 1.02446246, + "balance_loss_mlp": 1.02190089, + "epoch": 0.672418457838569, + "flos": 21725848396800.0, + "grad_norm": 1.444469704811669, + "language_loss": 0.77708566, + "learning_rate": 1.0238153356489112e-06, + "loss": 0.79765522, + "num_input_tokens_seen": 241397185, + "step": 11184, + "time_per_iteration": 2.750061273574829 + }, + { + "auxiliary_loss_clip": 0.01044963, + "auxiliary_loss_mlp": 0.00748021, + "balance_loss_clip": 1.02698803, + "balance_loss_mlp": 1.00060308, + "epoch": 0.672478581091237, + "flos": 21470379891840.0, + "grad_norm": 2.10835170394377, + "language_loss": 0.66096717, + "learning_rate": 1.0234754351196743e-06, + "loss": 0.67889702, + "num_input_tokens_seen": 241415785, + "step": 11185, + "time_per_iteration": 2.7073252201080322 + }, + { + "auxiliary_loss_clip": 0.01029092, + "auxiliary_loss_mlp": 0.01029417, + "balance_loss_clip": 1.02255416, + "balance_loss_mlp": 1.01784217, + "epoch": 0.6725387043439051, + "flos": 30846763267200.0, + "grad_norm": 2.076162478025193, + "language_loss": 0.80551142, + "learning_rate": 1.023135571620345e-06, + "loss": 0.82609653, + "num_input_tokens_seen": 241437390, + "step": 11186, + "time_per_iteration": 2.820380210876465 + }, + { + "auxiliary_loss_clip": 0.01053513, + "auxiliary_loss_mlp": 0.01035449, + "balance_loss_clip": 1.02665377, + "balance_loss_mlp": 1.02580464, + "epoch": 0.672598827596573, + "flos": 24055947659520.0, + "grad_norm": 1.4173339499845083, + "language_loss": 0.80293387, + "learning_rate": 1.022795745163813e-06, + "loss": 0.82382351, + "num_input_tokens_seen": 241458085, + "step": 11187, + "time_per_iteration": 2.5796918869018555 + }, + { + "auxiliary_loss_clip": 0.01026787, + "auxiliary_loss_mlp": 0.01031207, + "balance_loss_clip": 1.03118384, + "balance_loss_mlp": 1.01935732, + "epoch": 0.672658950849241, + "flos": 21871861182720.0, + "grad_norm": 2.325684749604426, + "language_loss": 0.70432758, + "learning_rate": 1.022455955762965e-06, + "loss": 0.72490752, + "num_input_tokens_seen": 241476880, + "step": 11188, + "time_per_iteration": 2.897130012512207 + }, + { + "auxiliary_loss_clip": 0.01005005, + "auxiliary_loss_mlp": 0.01031692, + "balance_loss_clip": 1.02953672, + "balance_loss_mlp": 1.02095127, + "epoch": 0.6727190741019089, + "flos": 23222102359680.0, + "grad_norm": 1.7235103398243927, + "language_loss": 0.75738907, + "learning_rate": 1.0221162034306842e-06, + "loss": 0.77775604, + "num_input_tokens_seen": 241496535, + "step": 11189, + "time_per_iteration": 3.1190624237060547 + }, + { + "auxiliary_loss_clip": 0.0106625, + "auxiliary_loss_mlp": 0.01031713, + "balance_loss_clip": 1.02511382, + "balance_loss_mlp": 1.01889181, + "epoch": 0.6727791973545769, + "flos": 15778610674560.0, + "grad_norm": 2.0681939091132, + "language_loss": 0.7497133, + "learning_rate": 1.0217764881798562e-06, + "loss": 0.77069294, + "num_input_tokens_seen": 241513465, + "step": 11190, + "time_per_iteration": 2.8952841758728027 + }, + { + "auxiliary_loss_clip": 0.00997095, + "auxiliary_loss_mlp": 0.01031043, + "balance_loss_clip": 1.0211339, + "balance_loss_mlp": 1.01945603, + "epoch": 0.6728393206072448, + "flos": 21249852341760.0, + "grad_norm": 1.5141887734740835, + "language_loss": 0.77527034, + "learning_rate": 1.0214368100233612e-06, + "loss": 0.79555166, + "num_input_tokens_seen": 241534125, + "step": 11191, + "time_per_iteration": 2.9476494789123535 + }, + { + "auxiliary_loss_clip": 0.01061996, + "auxiliary_loss_mlp": 0.01024034, + "balance_loss_clip": 1.02502608, + "balance_loss_mlp": 1.01359689, + "epoch": 0.6728994438599128, + "flos": 32123279779200.0, + "grad_norm": 3.641624534117011, + "language_loss": 0.86305261, + "learning_rate": 1.0210971689740802e-06, + "loss": 0.88391292, + "num_input_tokens_seen": 241556340, + "step": 11192, + "time_per_iteration": 4.441463947296143 + }, + { + "auxiliary_loss_clip": 0.01051775, + "auxiliary_loss_mlp": 0.0103242, + "balance_loss_clip": 1.02430773, + "balance_loss_mlp": 1.02020717, + "epoch": 0.6729595671125808, + "flos": 23112359331840.0, + "grad_norm": 2.2186285878760055, + "language_loss": 0.76015812, + "learning_rate": 1.0207575650448923e-06, + "loss": 0.78100014, + "num_input_tokens_seen": 241575185, + "step": 11193, + "time_per_iteration": 2.7997028827667236 + }, + { + "auxiliary_loss_clip": 0.01037692, + "auxiliary_loss_mlp": 0.01028939, + "balance_loss_clip": 1.02797413, + "balance_loss_mlp": 1.01779306, + "epoch": 0.6730196903652488, + "flos": 14611406227200.0, + "grad_norm": 1.9860590555730535, + "language_loss": 0.7889421, + "learning_rate": 1.0204179982486758e-06, + "loss": 0.8096084, + "num_input_tokens_seen": 241592970, + "step": 11194, + "time_per_iteration": 2.764310359954834 + }, + { + "auxiliary_loss_clip": 0.01053292, + "auxiliary_loss_mlp": 0.01028053, + "balance_loss_clip": 1.02421832, + "balance_loss_mlp": 1.01777148, + "epoch": 0.6730798136179167, + "flos": 21105922544640.0, + "grad_norm": 2.3551140945389135, + "language_loss": 0.89901823, + "learning_rate": 1.0200784685983075e-06, + "loss": 0.91983163, + "num_input_tokens_seen": 241610245, + "step": 11195, + "time_per_iteration": 2.770578145980835 + }, + { + "auxiliary_loss_clip": 0.01053228, + "auxiliary_loss_mlp": 0.0102699, + "balance_loss_clip": 1.024616, + "balance_loss_mlp": 1.01661253, + "epoch": 0.6731399368705847, + "flos": 28986267438720.0, + "grad_norm": 2.15232336309846, + "language_loss": 0.72375274, + "learning_rate": 1.019738976106662e-06, + "loss": 0.74455494, + "num_input_tokens_seen": 241630350, + "step": 11196, + "time_per_iteration": 2.642559051513672 + }, + { + "auxiliary_loss_clip": 0.00948861, + "auxiliary_loss_mlp": 0.01004966, + "balance_loss_clip": 1.00497794, + "balance_loss_mlp": 1.00389338, + "epoch": 0.6732000601232526, + "flos": 64743708723840.0, + "grad_norm": 0.7735463283670075, + "language_loss": 0.56551349, + "learning_rate": 1.0193995207866123e-06, + "loss": 0.58505166, + "num_input_tokens_seen": 241692380, + "step": 11197, + "time_per_iteration": 3.3542494773864746 + }, + { + "auxiliary_loss_clip": 0.01043138, + "auxiliary_loss_mlp": 0.01026588, + "balance_loss_clip": 1.02568233, + "balance_loss_mlp": 1.01671147, + "epoch": 0.6732601833759206, + "flos": 17201642762880.0, + "grad_norm": 2.3270375320824583, + "language_loss": 0.75299609, + "learning_rate": 1.0190601026510312e-06, + "loss": 0.77369332, + "num_input_tokens_seen": 241710430, + "step": 11198, + "time_per_iteration": 3.0069799423217773 + }, + { + "auxiliary_loss_clip": 0.01051121, + "auxiliary_loss_mlp": 0.01025845, + "balance_loss_clip": 1.02277255, + "balance_loss_mlp": 1.01462758, + "epoch": 0.6733203066285887, + "flos": 18658861620480.0, + "grad_norm": 2.8314654560585133, + "language_loss": 0.8159219, + "learning_rate": 1.0187207217127892e-06, + "loss": 0.83669162, + "num_input_tokens_seen": 241724775, + "step": 11199, + "time_per_iteration": 4.232370615005493 + }, + { + "auxiliary_loss_clip": 0.01017317, + "auxiliary_loss_mlp": 0.01032461, + "balance_loss_clip": 1.02269268, + "balance_loss_mlp": 1.02048087, + "epoch": 0.6733804298812566, + "flos": 35809330481280.0, + "grad_norm": 1.8798648324919105, + "language_loss": 0.71466225, + "learning_rate": 1.0183813779847552e-06, + "loss": 0.73515999, + "num_input_tokens_seen": 241744440, + "step": 11200, + "time_per_iteration": 2.929408550262451 + }, + { + "auxiliary_loss_clip": 0.01067518, + "auxiliary_loss_mlp": 0.01033329, + "balance_loss_clip": 1.02800083, + "balance_loss_mlp": 1.0218668, + "epoch": 0.6734405531339246, + "flos": 61638833099520.0, + "grad_norm": 1.416259373944242, + "language_loss": 0.64470637, + "learning_rate": 1.0180420714797987e-06, + "loss": 0.66571486, + "num_input_tokens_seen": 241771705, + "step": 11201, + "time_per_iteration": 2.8998043537139893 + }, + { + "auxiliary_loss_clip": 0.01048167, + "auxiliary_loss_mlp": 0.01030433, + "balance_loss_clip": 1.02639675, + "balance_loss_mlp": 1.01867282, + "epoch": 0.6735006763865925, + "flos": 20522338277760.0, + "grad_norm": 1.8269103054482245, + "language_loss": 0.63608581, + "learning_rate": 1.0177028022107856e-06, + "loss": 0.6568718, + "num_input_tokens_seen": 241790830, + "step": 11202, + "time_per_iteration": 2.642112970352173 + }, + { + "auxiliary_loss_clip": 0.01063311, + "auxiliary_loss_mlp": 0.01028407, + "balance_loss_clip": 1.02502704, + "balance_loss_mlp": 1.01796412, + "epoch": 0.6735607996392605, + "flos": 13918869031680.0, + "grad_norm": 2.0097547313528787, + "language_loss": 0.74681908, + "learning_rate": 1.0173635701905796e-06, + "loss": 0.76773626, + "num_input_tokens_seen": 241808165, + "step": 11203, + "time_per_iteration": 2.5035789012908936 + }, + { + "auxiliary_loss_clip": 0.01049392, + "auxiliary_loss_mlp": 0.01031446, + "balance_loss_clip": 1.02677536, + "balance_loss_mlp": 1.01844633, + "epoch": 0.6736209228919284, + "flos": 18807244704000.0, + "grad_norm": 1.8636673167189652, + "language_loss": 0.67904043, + "learning_rate": 1.0170243754320456e-06, + "loss": 0.69984877, + "num_input_tokens_seen": 241826925, + "step": 11204, + "time_per_iteration": 2.643256187438965 + }, + { + "auxiliary_loss_clip": 0.01058432, + "auxiliary_loss_mlp": 0.01030067, + "balance_loss_clip": 1.02711689, + "balance_loss_mlp": 1.01833689, + "epoch": 0.6736810461445965, + "flos": 20373129181440.0, + "grad_norm": 1.4720133763317986, + "language_loss": 0.73986101, + "learning_rate": 1.0166852179480465e-06, + "loss": 0.760746, + "num_input_tokens_seen": 241845525, + "step": 11205, + "time_per_iteration": 2.679717779159546 + }, + { + "auxiliary_loss_clip": 0.01061218, + "auxiliary_loss_mlp": 0.01031189, + "balance_loss_clip": 1.02359295, + "balance_loss_mlp": 1.0206331, + "epoch": 0.6737411693972644, + "flos": 30007530927360.0, + "grad_norm": 1.7150856927874991, + "language_loss": 0.71250641, + "learning_rate": 1.0163460977514416e-06, + "loss": 0.7334305, + "num_input_tokens_seen": 241866815, + "step": 11206, + "time_per_iteration": 2.632801055908203 + }, + { + "auxiliary_loss_clip": 0.01032079, + "auxiliary_loss_mlp": 0.00747796, + "balance_loss_clip": 1.025141, + "balance_loss_mlp": 1.00055385, + "epoch": 0.6738012926499324, + "flos": 25447342844160.0, + "grad_norm": 3.4269168791734756, + "language_loss": 0.67616713, + "learning_rate": 1.016007014855092e-06, + "loss": 0.69396591, + "num_input_tokens_seen": 241887050, + "step": 11207, + "time_per_iteration": 2.669189453125 + }, + { + "auxiliary_loss_clip": 0.0101277, + "auxiliary_loss_mlp": 0.0103109, + "balance_loss_clip": 1.02257538, + "balance_loss_mlp": 1.02067733, + "epoch": 0.6738614159026003, + "flos": 20776873029120.0, + "grad_norm": 2.0328929389656984, + "language_loss": 0.74441445, + "learning_rate": 1.0156679692718553e-06, + "loss": 0.76485306, + "num_input_tokens_seen": 241904280, + "step": 11208, + "time_per_iteration": 2.7835464477539062 + }, + { + "auxiliary_loss_clip": 0.01047089, + "auxiliary_loss_mlp": 0.01034805, + "balance_loss_clip": 1.02292454, + "balance_loss_mlp": 1.02103043, + "epoch": 0.6739215391552683, + "flos": 19566898462080.0, + "grad_norm": 2.219174519223159, + "language_loss": 0.75596637, + "learning_rate": 1.0153289610145867e-06, + "loss": 0.77678531, + "num_input_tokens_seen": 241919190, + "step": 11209, + "time_per_iteration": 2.7048017978668213 + }, + { + "auxiliary_loss_clip": 0.01030842, + "auxiliary_loss_mlp": 0.0103208, + "balance_loss_clip": 1.0245595, + "balance_loss_mlp": 1.02216172, + "epoch": 0.6739816624079362, + "flos": 24388193485440.0, + "grad_norm": 2.9222088242113373, + "language_loss": 0.66445583, + "learning_rate": 1.0149899900961428e-06, + "loss": 0.68508506, + "num_input_tokens_seen": 241940525, + "step": 11210, + "time_per_iteration": 2.640855073928833 + }, + { + "auxiliary_loss_clip": 0.01060402, + "auxiliary_loss_mlp": 0.0102692, + "balance_loss_clip": 1.02371931, + "balance_loss_mlp": 1.01709139, + "epoch": 0.6740417856606042, + "flos": 22528164533760.0, + "grad_norm": 1.9948443326674385, + "language_loss": 0.80211735, + "learning_rate": 1.014651056529377e-06, + "loss": 0.82299054, + "num_input_tokens_seen": 241959290, + "step": 11211, + "time_per_iteration": 2.5157341957092285 + }, + { + "auxiliary_loss_clip": 0.01015115, + "auxiliary_loss_mlp": 0.01031037, + "balance_loss_clip": 1.02238369, + "balance_loss_mlp": 1.01996303, + "epoch": 0.6741019089132723, + "flos": 25775458606080.0, + "grad_norm": 1.3572056457087487, + "language_loss": 0.7656697, + "learning_rate": 1.014312160327143e-06, + "loss": 0.7861312, + "num_input_tokens_seen": 241980715, + "step": 11212, + "time_per_iteration": 2.84751558303833 + }, + { + "auxiliary_loss_clip": 0.01027447, + "auxiliary_loss_mlp": 0.00747699, + "balance_loss_clip": 1.02327573, + "balance_loss_mlp": 1.00047255, + "epoch": 0.6741620321659402, + "flos": 21105671149440.0, + "grad_norm": 1.6735111553035162, + "language_loss": 0.77656692, + "learning_rate": 1.0139733015022905e-06, + "loss": 0.79431838, + "num_input_tokens_seen": 241999985, + "step": 11213, + "time_per_iteration": 2.6376795768737793 + }, + { + "auxiliary_loss_clip": 0.01020048, + "auxiliary_loss_mlp": 0.01032113, + "balance_loss_clip": 1.02607858, + "balance_loss_mlp": 1.02070415, + "epoch": 0.6742221554186082, + "flos": 20740423703040.0, + "grad_norm": 1.8947670205168405, + "language_loss": 0.67408121, + "learning_rate": 1.0136344800676685e-06, + "loss": 0.69460285, + "num_input_tokens_seen": 242018990, + "step": 11214, + "time_per_iteration": 2.6801369190216064 + }, + { + "auxiliary_loss_clip": 0.01062493, + "auxiliary_loss_mlp": 0.00747784, + "balance_loss_clip": 1.02458096, + "balance_loss_mlp": 1.00057757, + "epoch": 0.6742822786712761, + "flos": 37774146384000.0, + "grad_norm": 1.6267491628889335, + "language_loss": 0.72656578, + "learning_rate": 1.0132956960361263e-06, + "loss": 0.7446686, + "num_input_tokens_seen": 242039340, + "step": 11215, + "time_per_iteration": 2.68483567237854 + }, + { + "auxiliary_loss_clip": 0.01051743, + "auxiliary_loss_mlp": 0.00747632, + "balance_loss_clip": 1.02321005, + "balance_loss_mlp": 1.00045061, + "epoch": 0.6743424019239441, + "flos": 37263891732480.0, + "grad_norm": 2.2387651997930558, + "language_loss": 0.67067802, + "learning_rate": 1.0129569494205096e-06, + "loss": 0.68867177, + "num_input_tokens_seen": 242062215, + "step": 11216, + "time_per_iteration": 2.777157783508301 + }, + { + "auxiliary_loss_clip": 0.01007062, + "auxiliary_loss_mlp": 0.01005244, + "balance_loss_clip": 1.00162959, + "balance_loss_mlp": 1.00431979, + "epoch": 0.674402525176612, + "flos": 65997746300160.0, + "grad_norm": 0.6733930865780147, + "language_loss": 0.56267822, + "learning_rate": 1.0126182402336646e-06, + "loss": 0.58280134, + "num_input_tokens_seen": 242131130, + "step": 11217, + "time_per_iteration": 3.206533908843994 + }, + { + "auxiliary_loss_clip": 0.01053021, + "auxiliary_loss_mlp": 0.01030037, + "balance_loss_clip": 1.02461171, + "balance_loss_mlp": 1.01873589, + "epoch": 0.67446264842928, + "flos": 26461208131200.0, + "grad_norm": 2.6925512217992855, + "language_loss": 0.74313283, + "learning_rate": 1.0122795684884363e-06, + "loss": 0.76396346, + "num_input_tokens_seen": 242149720, + "step": 11218, + "time_per_iteration": 2.6094586849212646 + }, + { + "auxiliary_loss_clip": 0.01035136, + "auxiliary_loss_mlp": 0.01048569, + "balance_loss_clip": 1.02622437, + "balance_loss_mlp": 1.03617775, + "epoch": 0.674522771681948, + "flos": 23732392924800.0, + "grad_norm": 1.7103197368694183, + "language_loss": 0.65897769, + "learning_rate": 1.0119409341976639e-06, + "loss": 0.67981482, + "num_input_tokens_seen": 242168875, + "step": 11219, + "time_per_iteration": 2.7224438190460205 + }, + { + "auxiliary_loss_clip": 0.01020292, + "auxiliary_loss_mlp": 0.0103629, + "balance_loss_clip": 1.02062821, + "balance_loss_mlp": 1.02348709, + "epoch": 0.674582894934616, + "flos": 24754338771840.0, + "grad_norm": 1.7132993358678268, + "language_loss": 0.74597943, + "learning_rate": 1.0116023373741904e-06, + "loss": 0.7665453, + "num_input_tokens_seen": 242188465, + "step": 11220, + "time_per_iteration": 2.7303078174591064 + }, + { + "auxiliary_loss_clip": 0.01055521, + "auxiliary_loss_mlp": 0.01028936, + "balance_loss_clip": 1.02631664, + "balance_loss_mlp": 1.01723564, + "epoch": 0.6746430181872839, + "flos": 24826626892800.0, + "grad_norm": 1.62505563397978, + "language_loss": 0.70432281, + "learning_rate": 1.0112637780308554e-06, + "loss": 0.72516739, + "num_input_tokens_seen": 242208675, + "step": 11221, + "time_per_iteration": 2.6403400897979736 + }, + { + "auxiliary_loss_clip": 0.01027568, + "auxiliary_loss_mlp": 0.01029721, + "balance_loss_clip": 1.0232017, + "balance_loss_mlp": 1.01952302, + "epoch": 0.6747031414399519, + "flos": 16873491087360.0, + "grad_norm": 1.8713055978368642, + "language_loss": 0.58261555, + "learning_rate": 1.010925256180498e-06, + "loss": 0.60318851, + "num_input_tokens_seen": 242227440, + "step": 11222, + "time_per_iteration": 2.684796094894409 + }, + { + "auxiliary_loss_clip": 0.01053938, + "auxiliary_loss_mlp": 0.01033702, + "balance_loss_clip": 1.02556407, + "balance_loss_mlp": 1.0225327, + "epoch": 0.6747632646926198, + "flos": 22784925928320.0, + "grad_norm": 1.5998742059623383, + "language_loss": 0.76869178, + "learning_rate": 1.0105867718359528e-06, + "loss": 0.78956819, + "num_input_tokens_seen": 242245240, + "step": 11223, + "time_per_iteration": 2.6354708671569824 + }, + { + "auxiliary_loss_clip": 0.01058734, + "auxiliary_loss_mlp": 0.01030348, + "balance_loss_clip": 1.02793014, + "balance_loss_mlp": 1.01958978, + "epoch": 0.6748233879452878, + "flos": 20046090827520.0, + "grad_norm": 1.8494111947750778, + "language_loss": 0.75282949, + "learning_rate": 1.0102483250100574e-06, + "loss": 0.77372026, + "num_input_tokens_seen": 242263435, + "step": 11224, + "time_per_iteration": 4.234992742538452 + }, + { + "auxiliary_loss_clip": 0.01013384, + "auxiliary_loss_mlp": 0.01023454, + "balance_loss_clip": 1.0235424, + "balance_loss_mlp": 1.01413202, + "epoch": 0.6748835111979558, + "flos": 23002831785600.0, + "grad_norm": 1.6231927519106668, + "language_loss": 0.63408726, + "learning_rate": 1.0099099157156445e-06, + "loss": 0.65445566, + "num_input_tokens_seen": 242282765, + "step": 11225, + "time_per_iteration": 4.462769031524658 + }, + { + "auxiliary_loss_clip": 0.01061135, + "auxiliary_loss_mlp": 0.00747543, + "balance_loss_clip": 1.0253042, + "balance_loss_mlp": 1.000489, + "epoch": 0.6749436344506238, + "flos": 12197311009920.0, + "grad_norm": 1.6674792980836433, + "language_loss": 0.6397115, + "learning_rate": 1.0095715439655462e-06, + "loss": 0.65779829, + "num_input_tokens_seen": 242298980, + "step": 11226, + "time_per_iteration": 2.669318914413452 + }, + { + "auxiliary_loss_clip": 0.01054468, + "auxiliary_loss_mlp": 0.01031053, + "balance_loss_clip": 1.02548265, + "balance_loss_mlp": 1.01957273, + "epoch": 0.6750037577032918, + "flos": 11873720361600.0, + "grad_norm": 2.2690067620402083, + "language_loss": 0.71737468, + "learning_rate": 1.0092332097725945e-06, + "loss": 0.73822993, + "num_input_tokens_seen": 242315420, + "step": 11227, + "time_per_iteration": 2.6186375617980957 + }, + { + "auxiliary_loss_clip": 0.01034312, + "auxiliary_loss_mlp": 0.01028937, + "balance_loss_clip": 1.022156, + "balance_loss_mlp": 1.01685512, + "epoch": 0.6750638809559597, + "flos": 17019611614080.0, + "grad_norm": 2.4227523101979487, + "language_loss": 0.71652347, + "learning_rate": 1.0088949131496183e-06, + "loss": 0.73715597, + "num_input_tokens_seen": 242332805, + "step": 11228, + "time_per_iteration": 2.6724538803100586 + }, + { + "auxiliary_loss_clip": 0.00992336, + "auxiliary_loss_mlp": 0.01003008, + "balance_loss_clip": 1.0052402, + "balance_loss_mlp": 1.00188148, + "epoch": 0.6751240042086277, + "flos": 70951011891840.0, + "grad_norm": 0.7581366045050284, + "language_loss": 0.53317159, + "learning_rate": 1.0085566541094482e-06, + "loss": 0.55312496, + "num_input_tokens_seen": 242396160, + "step": 11229, + "time_per_iteration": 3.3448987007141113 + }, + { + "auxiliary_loss_clip": 0.010556, + "auxiliary_loss_mlp": 0.01026831, + "balance_loss_clip": 1.02639234, + "balance_loss_mlp": 1.01608455, + "epoch": 0.6751841274612956, + "flos": 22675146986880.0, + "grad_norm": 1.759876765595835, + "language_loss": 0.80623972, + "learning_rate": 1.0082184326649072e-06, + "loss": 0.82706404, + "num_input_tokens_seen": 242414660, + "step": 11230, + "time_per_iteration": 2.6945173740386963 + }, + { + "auxiliary_loss_clip": 0.01032122, + "auxiliary_loss_mlp": 0.01027426, + "balance_loss_clip": 1.0220629, + "balance_loss_mlp": 1.0173347, + "epoch": 0.6752442507139637, + "flos": 21288636051840.0, + "grad_norm": 1.4779086024256172, + "language_loss": 0.65819037, + "learning_rate": 1.0078802488288228e-06, + "loss": 0.67878592, + "num_input_tokens_seen": 242434225, + "step": 11231, + "time_per_iteration": 2.790322780609131 + }, + { + "auxiliary_loss_clip": 0.01028668, + "auxiliary_loss_mlp": 0.01033994, + "balance_loss_clip": 1.02739668, + "balance_loss_mlp": 1.02149439, + "epoch": 0.6753043739666316, + "flos": 28256921781120.0, + "grad_norm": 1.8878485725980303, + "language_loss": 0.66204381, + "learning_rate": 1.0075421026140198e-06, + "loss": 0.68267047, + "num_input_tokens_seen": 242454355, + "step": 11232, + "time_per_iteration": 2.8559157848358154 + }, + { + "auxiliary_loss_clip": 0.010154, + "auxiliary_loss_mlp": 0.01025224, + "balance_loss_clip": 1.02044511, + "balance_loss_mlp": 1.01484692, + "epoch": 0.6753644972192996, + "flos": 21360349555200.0, + "grad_norm": 1.7115183118945299, + "language_loss": 0.72552675, + "learning_rate": 1.0072039940333188e-06, + "loss": 0.74593294, + "num_input_tokens_seen": 242474935, + "step": 11233, + "time_per_iteration": 2.632359266281128 + }, + { + "auxiliary_loss_clip": 0.01054565, + "auxiliary_loss_mlp": 0.01027412, + "balance_loss_clip": 1.02522087, + "balance_loss_mlp": 1.01649857, + "epoch": 0.6754246204719675, + "flos": 26541971861760.0, + "grad_norm": 1.669019119938057, + "language_loss": 0.76614463, + "learning_rate": 1.0068659230995418e-06, + "loss": 0.78696442, + "num_input_tokens_seen": 242495530, + "step": 11234, + "time_per_iteration": 2.6609153747558594 + }, + { + "auxiliary_loss_clip": 0.01062005, + "auxiliary_loss_mlp": 0.01029439, + "balance_loss_clip": 1.02479303, + "balance_loss_mlp": 1.01823878, + "epoch": 0.6754847437246355, + "flos": 25556690822400.0, + "grad_norm": 1.4904027578598982, + "language_loss": 0.75194269, + "learning_rate": 1.0065278898255101e-06, + "loss": 0.77285713, + "num_input_tokens_seen": 242514550, + "step": 11235, + "time_per_iteration": 2.5785913467407227 + }, + { + "auxiliary_loss_clip": 0.00999674, + "auxiliary_loss_mlp": 0.01001072, + "balance_loss_clip": 1.00370646, + "balance_loss_mlp": 1.00007045, + "epoch": 0.6755448669773034, + "flos": 59513318726400.0, + "grad_norm": 0.7822568656484263, + "language_loss": 0.51339591, + "learning_rate": 1.0061898942240387e-06, + "loss": 0.53340334, + "num_input_tokens_seen": 242569200, + "step": 11236, + "time_per_iteration": 3.2310774326324463 + }, + { + "auxiliary_loss_clip": 0.0103116, + "auxiliary_loss_mlp": 0.01028544, + "balance_loss_clip": 1.02430272, + "balance_loss_mlp": 1.01607513, + "epoch": 0.6756049902299714, + "flos": 23294534135040.0, + "grad_norm": 2.3792759277193616, + "language_loss": 0.7571162, + "learning_rate": 1.0058519363079464e-06, + "loss": 0.77771324, + "num_input_tokens_seen": 242586950, + "step": 11237, + "time_per_iteration": 2.9036664962768555 + }, + { + "auxiliary_loss_clip": 0.01037431, + "auxiliary_loss_mlp": 0.01034344, + "balance_loss_clip": 1.02473152, + "balance_loss_mlp": 1.02329326, + "epoch": 0.6756651134826394, + "flos": 31575426566400.0, + "grad_norm": 8.418802325045203, + "language_loss": 0.77645814, + "learning_rate": 1.0055140160900482e-06, + "loss": 0.79717588, + "num_input_tokens_seen": 242607380, + "step": 11238, + "time_per_iteration": 2.8034873008728027 + }, + { + "auxiliary_loss_clip": 0.01049744, + "auxiliary_loss_mlp": 0.01030337, + "balance_loss_clip": 1.02405488, + "balance_loss_mlp": 1.01815975, + "epoch": 0.6757252367353074, + "flos": 27272287186560.0, + "grad_norm": 1.736778813339156, + "language_loss": 0.66070551, + "learning_rate": 1.0051761335831587e-06, + "loss": 0.68150628, + "num_input_tokens_seen": 242628025, + "step": 11239, + "time_per_iteration": 4.345174312591553 + }, + { + "auxiliary_loss_clip": 0.0104165, + "auxiliary_loss_mlp": 0.01023991, + "balance_loss_clip": 1.02522469, + "balance_loss_mlp": 1.0131551, + "epoch": 0.6757853599879754, + "flos": 16830900535680.0, + "grad_norm": 2.333353540739139, + "language_loss": 0.83137703, + "learning_rate": 1.0048382888000898e-06, + "loss": 0.85203344, + "num_input_tokens_seen": 242643825, + "step": 11240, + "time_per_iteration": 2.6393680572509766 + }, + { + "auxiliary_loss_clip": 0.01040385, + "auxiliary_loss_mlp": 0.01030616, + "balance_loss_clip": 1.02643013, + "balance_loss_mlp": 1.01762187, + "epoch": 0.6758454832406433, + "flos": 23220055284480.0, + "grad_norm": 6.2592501847181365, + "language_loss": 0.73953974, + "learning_rate": 1.0045004817536525e-06, + "loss": 0.76024973, + "num_input_tokens_seen": 242661820, + "step": 11241, + "time_per_iteration": 2.7010788917541504 + }, + { + "auxiliary_loss_clip": 0.01019788, + "auxiliary_loss_mlp": 0.01033709, + "balance_loss_clip": 1.02690864, + "balance_loss_mlp": 1.02282548, + "epoch": 0.6759056064933113, + "flos": 16289547684480.0, + "grad_norm": 2.13273923358734, + "language_loss": 0.80446744, + "learning_rate": 1.0041627124566572e-06, + "loss": 0.82500243, + "num_input_tokens_seen": 242679890, + "step": 11242, + "time_per_iteration": 2.7549965381622314 + }, + { + "auxiliary_loss_clip": 0.01045384, + "auxiliary_loss_mlp": 0.01033234, + "balance_loss_clip": 1.02244258, + "balance_loss_mlp": 1.02147377, + "epoch": 0.6759657297459792, + "flos": 25922297404800.0, + "grad_norm": 1.7167580353832783, + "language_loss": 0.72416013, + "learning_rate": 1.0038249809219109e-06, + "loss": 0.7449463, + "num_input_tokens_seen": 242699495, + "step": 11243, + "time_per_iteration": 2.701997756958008 + }, + { + "auxiliary_loss_clip": 0.0105318, + "auxiliary_loss_mlp": 0.01032105, + "balance_loss_clip": 1.02514839, + "balance_loss_mlp": 1.02178764, + "epoch": 0.6760258529986473, + "flos": 23000820624000.0, + "grad_norm": 1.6122648536482478, + "language_loss": 0.72407931, + "learning_rate": 1.003487287162221e-06, + "loss": 0.74493217, + "num_input_tokens_seen": 242719500, + "step": 11244, + "time_per_iteration": 2.6073520183563232 + }, + { + "auxiliary_loss_clip": 0.01066135, + "auxiliary_loss_mlp": 0.01035823, + "balance_loss_clip": 1.02661586, + "balance_loss_mlp": 1.02445674, + "epoch": 0.6760859762513152, + "flos": 20959335141120.0, + "grad_norm": 1.8438901330033517, + "language_loss": 0.8570143, + "learning_rate": 1.003149631190393e-06, + "loss": 0.87803388, + "num_input_tokens_seen": 242738325, + "step": 11245, + "time_per_iteration": 2.5874056816101074 + }, + { + "auxiliary_loss_clip": 0.01068029, + "auxiliary_loss_mlp": 0.00747909, + "balance_loss_clip": 1.02632141, + "balance_loss_mlp": 1.00057244, + "epoch": 0.6761460995039832, + "flos": 23622937205760.0, + "grad_norm": 2.404213683001351, + "language_loss": 0.73651242, + "learning_rate": 1.0028120130192327e-06, + "loss": 0.75467181, + "num_input_tokens_seen": 242756620, + "step": 11246, + "time_per_iteration": 2.5812394618988037 + }, + { + "auxiliary_loss_clip": 0.01053825, + "auxiliary_loss_mlp": 0.01024831, + "balance_loss_clip": 1.02486205, + "balance_loss_mlp": 1.01387572, + "epoch": 0.6762062227566511, + "flos": 20770875457920.0, + "grad_norm": 1.9068819217071464, + "language_loss": 0.8761279, + "learning_rate": 1.002474432661539e-06, + "loss": 0.89691454, + "num_input_tokens_seen": 242774505, + "step": 11247, + "time_per_iteration": 4.183326721191406 + }, + { + "auxiliary_loss_clip": 0.00989835, + "auxiliary_loss_mlp": 0.01003568, + "balance_loss_clip": 1.00300598, + "balance_loss_mlp": 1.00239408, + "epoch": 0.6762663460093191, + "flos": 52818099166080.0, + "grad_norm": 0.8236677717111299, + "language_loss": 0.53969193, + "learning_rate": 1.002136890130115e-06, + "loss": 0.55962598, + "num_input_tokens_seen": 242828645, + "step": 11248, + "time_per_iteration": 3.2449889183044434 + }, + { + "auxiliary_loss_clip": 0.01008062, + "auxiliary_loss_mlp": 0.01029652, + "balance_loss_clip": 1.02736568, + "balance_loss_mlp": 1.01951373, + "epoch": 0.676326469261987, + "flos": 23696302734720.0, + "grad_norm": 1.7540990616588568, + "language_loss": 0.7315039, + "learning_rate": 1.001799385437761e-06, + "loss": 0.75188112, + "num_input_tokens_seen": 242850100, + "step": 11249, + "time_per_iteration": 2.8597347736358643 + }, + { + "auxiliary_loss_clip": 0.01053975, + "auxiliary_loss_mlp": 0.01031155, + "balance_loss_clip": 1.02443409, + "balance_loss_mlp": 1.01894224, + "epoch": 0.676386592514655, + "flos": 14063732582400.0, + "grad_norm": 1.9540439933429357, + "language_loss": 0.73669231, + "learning_rate": 1.0014619185972732e-06, + "loss": 0.75754362, + "num_input_tokens_seen": 242867775, + "step": 11250, + "time_per_iteration": 2.5987300872802734 + }, + { + "auxiliary_loss_clip": 0.01066063, + "auxiliary_loss_mlp": 0.01025343, + "balance_loss_clip": 1.02694237, + "balance_loss_mlp": 1.0144887, + "epoch": 0.676446715767323, + "flos": 20412236113920.0, + "grad_norm": 2.997603987406711, + "language_loss": 0.74849999, + "learning_rate": 1.0011244896214497e-06, + "loss": 0.76941407, + "num_input_tokens_seen": 242886865, + "step": 11251, + "time_per_iteration": 2.544940948486328 + }, + { + "auxiliary_loss_clip": 0.01028089, + "auxiliary_loss_mlp": 0.01029128, + "balance_loss_clip": 1.0257287, + "balance_loss_mlp": 1.01802325, + "epoch": 0.676506839019991, + "flos": 21288241002240.0, + "grad_norm": 1.581604850996355, + "language_loss": 0.69987369, + "learning_rate": 1.0007870985230873e-06, + "loss": 0.72044581, + "num_input_tokens_seen": 242906705, + "step": 11252, + "time_per_iteration": 2.646986246109009 + }, + { + "auxiliary_loss_clip": 0.01017197, + "auxiliary_loss_mlp": 0.01031663, + "balance_loss_clip": 1.02540016, + "balance_loss_mlp": 1.02150071, + "epoch": 0.676566962272659, + "flos": 29932477459200.0, + "grad_norm": 1.6069231168666944, + "language_loss": 0.66749847, + "learning_rate": 1.0004497453149765e-06, + "loss": 0.68798709, + "num_input_tokens_seen": 242925215, + "step": 11253, + "time_per_iteration": 2.7121615409851074 + }, + { + "auxiliary_loss_clip": 0.01020059, + "auxiliary_loss_mlp": 0.00747872, + "balance_loss_clip": 1.02186632, + "balance_loss_mlp": 1.00054049, + "epoch": 0.6766270855253269, + "flos": 17931203902080.0, + "grad_norm": 2.5601713520342106, + "language_loss": 0.77166063, + "learning_rate": 1.0001124300099115e-06, + "loss": 0.7893399, + "num_input_tokens_seen": 242944750, + "step": 11254, + "time_per_iteration": 2.645148754119873 + }, + { + "auxiliary_loss_clip": 0.01054374, + "auxiliary_loss_mlp": 0.01028734, + "balance_loss_clip": 1.02510202, + "balance_loss_mlp": 1.01714134, + "epoch": 0.6766872087779949, + "flos": 23104853389440.0, + "grad_norm": 1.978608496702302, + "language_loss": 0.72096181, + "learning_rate": 9.997751526206835e-07, + "loss": 0.74179292, + "num_input_tokens_seen": 242963860, + "step": 11255, + "time_per_iteration": 2.675621509552002 + }, + { + "auxiliary_loss_clip": 0.01013719, + "auxiliary_loss_mlp": 0.00747583, + "balance_loss_clip": 1.02166569, + "balance_loss_mlp": 1.00042522, + "epoch": 0.6767473320306628, + "flos": 26213137827840.0, + "grad_norm": 2.233510374342283, + "language_loss": 0.75398803, + "learning_rate": 9.994379131600828e-07, + "loss": 0.77160096, + "num_input_tokens_seen": 242983050, + "step": 11256, + "time_per_iteration": 2.8013916015625 + }, + { + "auxiliary_loss_clip": 0.01056278, + "auxiliary_loss_mlp": 0.01032729, + "balance_loss_clip": 1.02686381, + "balance_loss_mlp": 1.02123165, + "epoch": 0.6768074552833309, + "flos": 18368739469440.0, + "grad_norm": 2.0776975955369283, + "language_loss": 0.65087092, + "learning_rate": 9.991007116408965e-07, + "loss": 0.67176092, + "num_input_tokens_seen": 243001125, + "step": 11257, + "time_per_iteration": 2.544843912124634 + }, + { + "auxiliary_loss_clip": 0.01028984, + "auxiliary_loss_mlp": 0.0103019, + "balance_loss_clip": 1.02994609, + "balance_loss_mlp": 1.01944327, + "epoch": 0.6768675785359988, + "flos": 23039927556480.0, + "grad_norm": 1.597920233263154, + "language_loss": 0.75568044, + "learning_rate": 9.987635480759109e-07, + "loss": 0.77627212, + "num_input_tokens_seen": 243021865, + "step": 11258, + "time_per_iteration": 2.693721294403076 + }, + { + "auxiliary_loss_clip": 0.01042942, + "auxiliary_loss_mlp": 0.01032256, + "balance_loss_clip": 1.02519667, + "balance_loss_mlp": 1.0218792, + "epoch": 0.6769277017886668, + "flos": 33036524092800.0, + "grad_norm": 1.524234376520656, + "language_loss": 0.66689837, + "learning_rate": 9.984264224779127e-07, + "loss": 0.68765032, + "num_input_tokens_seen": 243042970, + "step": 11259, + "time_per_iteration": 2.7095537185668945 + }, + { + "auxiliary_loss_clip": 0.01042442, + "auxiliary_loss_mlp": 0.01028605, + "balance_loss_clip": 1.02374077, + "balance_loss_mlp": 1.01734543, + "epoch": 0.6769878250413347, + "flos": 20848406964480.0, + "grad_norm": 2.1347619632454014, + "language_loss": 0.85354173, + "learning_rate": 9.980893348596839e-07, + "loss": 0.8742522, + "num_input_tokens_seen": 243058470, + "step": 11260, + "time_per_iteration": 2.5767624378204346 + }, + { + "auxiliary_loss_clip": 0.01036468, + "auxiliary_loss_mlp": 0.01035635, + "balance_loss_clip": 1.02287126, + "balance_loss_mlp": 1.02320755, + "epoch": 0.6770479482940027, + "flos": 15595968994560.0, + "grad_norm": 2.12552028158634, + "language_loss": 0.77451026, + "learning_rate": 9.977522852340081e-07, + "loss": 0.79523134, + "num_input_tokens_seen": 243076630, + "step": 11261, + "time_per_iteration": 2.654700756072998 + }, + { + "auxiliary_loss_clip": 0.01044008, + "auxiliary_loss_mlp": 0.01033304, + "balance_loss_clip": 1.02404213, + "balance_loss_mlp": 1.0220803, + "epoch": 0.6771080715466706, + "flos": 18621011664000.0, + "grad_norm": 1.7733176840950353, + "language_loss": 0.87792927, + "learning_rate": 9.97415273613666e-07, + "loss": 0.89870238, + "num_input_tokens_seen": 243092260, + "step": 11262, + "time_per_iteration": 2.634345293045044 + }, + { + "auxiliary_loss_clip": 0.01047999, + "auxiliary_loss_mlp": 0.01030186, + "balance_loss_clip": 1.02739859, + "balance_loss_mlp": 1.01881957, + "epoch": 0.6771681947993387, + "flos": 12495441893760.0, + "grad_norm": 1.9607257418535748, + "language_loss": 0.73985839, + "learning_rate": 9.97078300011439e-07, + "loss": 0.76064026, + "num_input_tokens_seen": 243109405, + "step": 11263, + "time_per_iteration": 2.6924774646759033 + }, + { + "auxiliary_loss_clip": 0.01056441, + "auxiliary_loss_mlp": 0.0103072, + "balance_loss_clip": 1.02499676, + "balance_loss_mlp": 1.01853669, + "epoch": 0.6772283180520066, + "flos": 22236964974720.0, + "grad_norm": 2.014212326710552, + "language_loss": 0.67958605, + "learning_rate": 9.967413644401016e-07, + "loss": 0.70045757, + "num_input_tokens_seen": 243128135, + "step": 11264, + "time_per_iteration": 2.607461452484131 + }, + { + "auxiliary_loss_clip": 0.01045936, + "auxiliary_loss_mlp": 0.01031217, + "balance_loss_clip": 1.0268178, + "balance_loss_mlp": 1.0194149, + "epoch": 0.6772884413046746, + "flos": 16143139848960.0, + "grad_norm": 2.639594678128143, + "language_loss": 0.73159242, + "learning_rate": 9.964044669124324e-07, + "loss": 0.75236392, + "num_input_tokens_seen": 243146785, + "step": 11265, + "time_per_iteration": 2.6320252418518066 + }, + { + "auxiliary_loss_clip": 0.01025504, + "auxiliary_loss_mlp": 0.01033223, + "balance_loss_clip": 1.02241302, + "balance_loss_mlp": 1.02203536, + "epoch": 0.6773485645573426, + "flos": 19135755515520.0, + "grad_norm": 1.5544120774765935, + "language_loss": 0.61173093, + "learning_rate": 9.96067607441207e-07, + "loss": 0.63231826, + "num_input_tokens_seen": 243165275, + "step": 11266, + "time_per_iteration": 2.654788017272949 + }, + { + "auxiliary_loss_clip": 0.0103759, + "auxiliary_loss_mlp": 0.01029716, + "balance_loss_clip": 1.0265435, + "balance_loss_mlp": 1.01880789, + "epoch": 0.6774086878100105, + "flos": 14136918543360.0, + "grad_norm": 1.8471723390786374, + "language_loss": 0.70584524, + "learning_rate": 9.957307860391976e-07, + "loss": 0.72651833, + "num_input_tokens_seen": 243182845, + "step": 11267, + "time_per_iteration": 2.66680645942688 + }, + { + "auxiliary_loss_clip": 0.01065535, + "auxiliary_loss_mlp": 0.01029371, + "balance_loss_clip": 1.02592242, + "balance_loss_mlp": 1.0180763, + "epoch": 0.6774688110626785, + "flos": 22197067943040.0, + "grad_norm": 2.332842259032272, + "language_loss": 0.71157598, + "learning_rate": 9.953940027191785e-07, + "loss": 0.73252499, + "num_input_tokens_seen": 243201475, + "step": 11268, + "time_per_iteration": 2.5960781574249268 + }, + { + "auxiliary_loss_clip": 0.0103666, + "auxiliary_loss_mlp": 0.01028758, + "balance_loss_clip": 1.02333164, + "balance_loss_mlp": 1.01677132, + "epoch": 0.6775289343153464, + "flos": 23039963470080.0, + "grad_norm": 1.744645731747151, + "language_loss": 0.76886463, + "learning_rate": 9.950572574939194e-07, + "loss": 0.78951877, + "num_input_tokens_seen": 243221850, + "step": 11269, + "time_per_iteration": 2.7670860290527344 + }, + { + "auxiliary_loss_clip": 0.01033836, + "auxiliary_loss_mlp": 0.0103434, + "balance_loss_clip": 1.02426875, + "balance_loss_mlp": 1.02261615, + "epoch": 0.6775890575680145, + "flos": 18293506433280.0, + "grad_norm": 9.217313541967275, + "language_loss": 0.74492812, + "learning_rate": 9.94720550376189e-07, + "loss": 0.76560986, + "num_input_tokens_seen": 243239855, + "step": 11270, + "time_per_iteration": 2.6971182823181152 + }, + { + "auxiliary_loss_clip": 0.01017391, + "auxiliary_loss_mlp": 0.01037946, + "balance_loss_clip": 1.02641296, + "balance_loss_mlp": 1.02593541, + "epoch": 0.6776491808206824, + "flos": 25336450581120.0, + "grad_norm": 1.9510371474027968, + "language_loss": 0.7279256, + "learning_rate": 9.94383881378756e-07, + "loss": 0.74847895, + "num_input_tokens_seen": 243260085, + "step": 11271, + "time_per_iteration": 4.386785507202148 + }, + { + "auxiliary_loss_clip": 0.01064167, + "auxiliary_loss_mlp": 0.01035491, + "balance_loss_clip": 1.0248735, + "balance_loss_mlp": 1.02445197, + "epoch": 0.6777093040733504, + "flos": 26028233591040.0, + "grad_norm": 1.720127059182028, + "language_loss": 0.68095821, + "learning_rate": 9.94047250514387e-07, + "loss": 0.70195484, + "num_input_tokens_seen": 243280065, + "step": 11272, + "time_per_iteration": 4.557191848754883 + }, + { + "auxiliary_loss_clip": 0.01059449, + "auxiliary_loss_mlp": 0.0103281, + "balance_loss_clip": 1.02748466, + "balance_loss_mlp": 1.0198524, + "epoch": 0.6777694273260183, + "flos": 18003599763840.0, + "grad_norm": 3.215923461238428, + "language_loss": 0.74038196, + "learning_rate": 9.937106577958481e-07, + "loss": 0.76130456, + "num_input_tokens_seen": 243297775, + "step": 11273, + "time_per_iteration": 2.590289831161499 + }, + { + "auxiliary_loss_clip": 0.01043965, + "auxiliary_loss_mlp": 0.01035452, + "balance_loss_clip": 1.02488434, + "balance_loss_mlp": 1.02411497, + "epoch": 0.6778295505786863, + "flos": 23441085624960.0, + "grad_norm": 1.5755097300839875, + "language_loss": 0.70345718, + "learning_rate": 9.933741032359015e-07, + "loss": 0.72425139, + "num_input_tokens_seen": 243315760, + "step": 11274, + "time_per_iteration": 2.5597410202026367 + }, + { + "auxiliary_loss_clip": 0.01065864, + "auxiliary_loss_mlp": 0.01030194, + "balance_loss_clip": 1.02634287, + "balance_loss_mlp": 1.01852894, + "epoch": 0.6778896738313542, + "flos": 19098408349440.0, + "grad_norm": 1.5666471889884588, + "language_loss": 0.65558589, + "learning_rate": 9.930375868473093e-07, + "loss": 0.67654651, + "num_input_tokens_seen": 243335715, + "step": 11275, + "time_per_iteration": 2.551301956176758 + }, + { + "auxiliary_loss_clip": 0.01058249, + "auxiliary_loss_mlp": 0.01029898, + "balance_loss_clip": 1.02862167, + "balance_loss_mlp": 1.0194732, + "epoch": 0.6779497970840223, + "flos": 26103933504000.0, + "grad_norm": 1.5179119796461158, + "language_loss": 0.72466415, + "learning_rate": 9.927011086428335e-07, + "loss": 0.74554563, + "num_input_tokens_seen": 243356935, + "step": 11276, + "time_per_iteration": 2.5660605430603027 + }, + { + "auxiliary_loss_clip": 0.01034433, + "auxiliary_loss_mlp": 0.00747783, + "balance_loss_clip": 1.02242804, + "balance_loss_mlp": 1.0004425, + "epoch": 0.6780099203366902, + "flos": 19719232041600.0, + "grad_norm": 1.6116369386135891, + "language_loss": 0.77026272, + "learning_rate": 9.923646686352317e-07, + "loss": 0.78808486, + "num_input_tokens_seen": 243375625, + "step": 11277, + "time_per_iteration": 2.587442636489868 + }, + { + "auxiliary_loss_clip": 0.0104301, + "auxiliary_loss_mlp": 0.01025205, + "balance_loss_clip": 1.024611, + "balance_loss_mlp": 1.01402926, + "epoch": 0.6780700435893582, + "flos": 18214538382720.0, + "grad_norm": 3.1377612307768064, + "language_loss": 0.83608043, + "learning_rate": 9.920282668372627e-07, + "loss": 0.85676259, + "num_input_tokens_seen": 243390195, + "step": 11278, + "time_per_iteration": 2.5414862632751465 + }, + { + "auxiliary_loss_clip": 0.01043707, + "auxiliary_loss_mlp": 0.00747645, + "balance_loss_clip": 1.02658951, + "balance_loss_mlp": 1.00051737, + "epoch": 0.6781301668420262, + "flos": 25376239872000.0, + "grad_norm": 1.5999983352064564, + "language_loss": 0.70633459, + "learning_rate": 9.916919032616844e-07, + "loss": 0.72424817, + "num_input_tokens_seen": 243411690, + "step": 11279, + "time_per_iteration": 2.635864734649658 + }, + { + "auxiliary_loss_clip": 0.01055954, + "auxiliary_loss_mlp": 0.01035061, + "balance_loss_clip": 1.02635646, + "balance_loss_mlp": 1.02304506, + "epoch": 0.6781902900946941, + "flos": 24020432087040.0, + "grad_norm": 2.355011676784213, + "language_loss": 0.74079418, + "learning_rate": 9.913555779212485e-07, + "loss": 0.76170433, + "num_input_tokens_seen": 243430280, + "step": 11280, + "time_per_iteration": 2.596357822418213 + }, + { + "auxiliary_loss_clip": 0.01052461, + "auxiliary_loss_mlp": 0.01029346, + "balance_loss_clip": 1.02426648, + "balance_loss_mlp": 1.01734734, + "epoch": 0.6782504133473621, + "flos": 19646764352640.0, + "grad_norm": 2.135994277515104, + "language_loss": 0.70398557, + "learning_rate": 9.910192908287104e-07, + "loss": 0.72480357, + "num_input_tokens_seen": 243448690, + "step": 11281, + "time_per_iteration": 2.5941152572631836 + }, + { + "auxiliary_loss_clip": 0.01063707, + "auxiliary_loss_mlp": 0.01026414, + "balance_loss_clip": 1.0265801, + "balance_loss_mlp": 1.01607275, + "epoch": 0.67831053660003, + "flos": 24932742647040.0, + "grad_norm": 1.5892394322659178, + "language_loss": 0.63570178, + "learning_rate": 9.906830419968217e-07, + "loss": 0.65660298, + "num_input_tokens_seen": 243470695, + "step": 11282, + "time_per_iteration": 2.6142256259918213 + }, + { + "auxiliary_loss_clip": 0.01020008, + "auxiliary_loss_mlp": 0.01038775, + "balance_loss_clip": 1.02074206, + "balance_loss_mlp": 1.0245477, + "epoch": 0.6783706598526981, + "flos": 31208383440000.0, + "grad_norm": 2.6330567760887478, + "language_loss": 0.74258536, + "learning_rate": 9.90346831438334e-07, + "loss": 0.76317322, + "num_input_tokens_seen": 243493345, + "step": 11283, + "time_per_iteration": 2.8284032344818115 + }, + { + "auxiliary_loss_clip": 0.01052769, + "auxiliary_loss_mlp": 0.01027549, + "balance_loss_clip": 1.02470577, + "balance_loss_mlp": 1.01639056, + "epoch": 0.678430783105366, + "flos": 35441317687680.0, + "grad_norm": 1.6960622164028831, + "language_loss": 0.57567692, + "learning_rate": 9.900106591659948e-07, + "loss": 0.59648013, + "num_input_tokens_seen": 243515670, + "step": 11284, + "time_per_iteration": 2.7466819286346436 + }, + { + "auxiliary_loss_clip": 0.01045794, + "auxiliary_loss_mlp": 0.01026912, + "balance_loss_clip": 1.02691984, + "balance_loss_mlp": 1.01636147, + "epoch": 0.678490906358034, + "flos": 14428800460800.0, + "grad_norm": 1.8940493754002736, + "language_loss": 0.75424767, + "learning_rate": 9.896745251925535e-07, + "loss": 0.77497476, + "num_input_tokens_seen": 243533625, + "step": 11285, + "time_per_iteration": 2.6582725048065186 + }, + { + "auxiliary_loss_clip": 0.01063888, + "auxiliary_loss_mlp": 0.0103224, + "balance_loss_clip": 1.0264895, + "balance_loss_mlp": 1.02140403, + "epoch": 0.6785510296107019, + "flos": 24311236596480.0, + "grad_norm": 1.5901505656867683, + "language_loss": 0.66498291, + "learning_rate": 9.893384295307557e-07, + "loss": 0.6859442, + "num_input_tokens_seen": 243553040, + "step": 11286, + "time_per_iteration": 4.155334949493408 + }, + { + "auxiliary_loss_clip": 0.01042589, + "auxiliary_loss_mlp": 0.01028234, + "balance_loss_clip": 1.02339375, + "balance_loss_mlp": 1.01705766, + "epoch": 0.6786111528633699, + "flos": 26977244872320.0, + "grad_norm": 2.277038716984277, + "language_loss": 0.52781171, + "learning_rate": 9.890023721933447e-07, + "loss": 0.54851991, + "num_input_tokens_seen": 243572590, + "step": 11287, + "time_per_iteration": 2.7002665996551514 + }, + { + "auxiliary_loss_clip": 0.01023285, + "auxiliary_loss_mlp": 0.01033462, + "balance_loss_clip": 1.02403522, + "balance_loss_mlp": 1.0213151, + "epoch": 0.6786712761160378, + "flos": 24317557390080.0, + "grad_norm": 1.7888134942758511, + "language_loss": 0.77040339, + "learning_rate": 9.886663531930655e-07, + "loss": 0.7909708, + "num_input_tokens_seen": 243594140, + "step": 11288, + "time_per_iteration": 2.69573974609375 + }, + { + "auxiliary_loss_clip": 0.01059033, + "auxiliary_loss_mlp": 0.01033733, + "balance_loss_clip": 1.02854228, + "balance_loss_mlp": 1.02255154, + "epoch": 0.6787313993687059, + "flos": 22930435923840.0, + "grad_norm": 1.9425055233504425, + "language_loss": 0.73150587, + "learning_rate": 9.883303725426593e-07, + "loss": 0.75243354, + "num_input_tokens_seen": 243615170, + "step": 11289, + "time_per_iteration": 2.657890558242798 + }, + { + "auxiliary_loss_clip": 0.01065298, + "auxiliary_loss_mlp": 0.01034209, + "balance_loss_clip": 1.02620196, + "balance_loss_mlp": 1.02266991, + "epoch": 0.6787915226213738, + "flos": 26868435598080.0, + "grad_norm": 1.4395494636780097, + "language_loss": 0.80212915, + "learning_rate": 9.879944302548682e-07, + "loss": 0.82312429, + "num_input_tokens_seen": 243635675, + "step": 11290, + "time_per_iteration": 2.5673887729644775 + }, + { + "auxiliary_loss_clip": 0.01052294, + "auxiliary_loss_mlp": 0.01029301, + "balance_loss_clip": 1.02574646, + "balance_loss_mlp": 1.01858449, + "epoch": 0.6788516458740418, + "flos": 20008851402240.0, + "grad_norm": 1.458952929049129, + "language_loss": 0.75001866, + "learning_rate": 9.87658526342428e-07, + "loss": 0.77083457, + "num_input_tokens_seen": 243654950, + "step": 11291, + "time_per_iteration": 2.658439874649048 + }, + { + "auxiliary_loss_clip": 0.01048647, + "auxiliary_loss_mlp": 0.00747699, + "balance_loss_clip": 1.02769971, + "balance_loss_mlp": 1.00048423, + "epoch": 0.6789117691267098, + "flos": 28727099832960.0, + "grad_norm": 1.871974783041325, + "language_loss": 0.75306547, + "learning_rate": 9.873226608180785e-07, + "loss": 0.77102888, + "num_input_tokens_seen": 243674970, + "step": 11292, + "time_per_iteration": 2.6933772563934326 + }, + { + "auxiliary_loss_clip": 0.01026699, + "auxiliary_loss_mlp": 0.01031437, + "balance_loss_clip": 1.02271605, + "balance_loss_mlp": 1.01934922, + "epoch": 0.6789718923793777, + "flos": 23403451150080.0, + "grad_norm": 2.1901658235344716, + "language_loss": 0.84053135, + "learning_rate": 9.869868336945556e-07, + "loss": 0.86111277, + "num_input_tokens_seen": 243693440, + "step": 11293, + "time_per_iteration": 2.665808916091919 + }, + { + "auxiliary_loss_clip": 0.01070152, + "auxiliary_loss_mlp": 0.01037014, + "balance_loss_clip": 1.0282861, + "balance_loss_mlp": 1.02471149, + "epoch": 0.6790320156320457, + "flos": 20448865008000.0, + "grad_norm": 2.2180817875586096, + "language_loss": 0.7931273, + "learning_rate": 9.866510449845929e-07, + "loss": 0.81419897, + "num_input_tokens_seen": 243710055, + "step": 11294, + "time_per_iteration": 2.6603102684020996 + }, + { + "auxiliary_loss_clip": 0.01043137, + "auxiliary_loss_mlp": 0.01024991, + "balance_loss_clip": 1.02524734, + "balance_loss_mlp": 1.01448822, + "epoch": 0.6790921388847136, + "flos": 24167199058560.0, + "grad_norm": 1.6896118349375417, + "language_loss": 0.79168922, + "learning_rate": 9.86315294700924e-07, + "loss": 0.81237054, + "num_input_tokens_seen": 243728635, + "step": 11295, + "time_per_iteration": 4.2383692264556885 + }, + { + "auxiliary_loss_clip": 0.01043093, + "auxiliary_loss_mlp": 0.01028303, + "balance_loss_clip": 1.02635431, + "balance_loss_mlp": 1.01907015, + "epoch": 0.6791522621373817, + "flos": 21908095027200.0, + "grad_norm": 1.658363050696406, + "language_loss": 0.71288675, + "learning_rate": 9.859795828562823e-07, + "loss": 0.73360074, + "num_input_tokens_seen": 243748330, + "step": 11296, + "time_per_iteration": 2.6091434955596924 + }, + { + "auxiliary_loss_clip": 0.01051952, + "auxiliary_loss_mlp": 0.01028621, + "balance_loss_clip": 1.0244875, + "balance_loss_mlp": 1.01839304, + "epoch": 0.6792123853900496, + "flos": 24826519152000.0, + "grad_norm": 1.4891291623213838, + "language_loss": 0.7071377, + "learning_rate": 9.856439094633949e-07, + "loss": 0.72794342, + "num_input_tokens_seen": 243769380, + "step": 11297, + "time_per_iteration": 2.5709400177001953 + }, + { + "auxiliary_loss_clip": 0.01036667, + "auxiliary_loss_mlp": 0.01030174, + "balance_loss_clip": 1.02554977, + "balance_loss_mlp": 1.01824093, + "epoch": 0.6792725086427176, + "flos": 17566279678080.0, + "grad_norm": 2.11844486433558, + "language_loss": 0.66428918, + "learning_rate": 9.853082745349918e-07, + "loss": 0.68495756, + "num_input_tokens_seen": 243785510, + "step": 11298, + "time_per_iteration": 2.6111271381378174 + }, + { + "auxiliary_loss_clip": 0.01055418, + "auxiliary_loss_mlp": 0.01023247, + "balance_loss_clip": 1.02550507, + "balance_loss_mlp": 1.0131681, + "epoch": 0.6793326318953855, + "flos": 26941837040640.0, + "grad_norm": 1.8149703111355306, + "language_loss": 0.7179625, + "learning_rate": 9.84972678083801e-07, + "loss": 0.73874915, + "num_input_tokens_seen": 243805545, + "step": 11299, + "time_per_iteration": 2.565826177597046 + }, + { + "auxiliary_loss_clip": 0.01065857, + "auxiliary_loss_mlp": 0.01029226, + "balance_loss_clip": 1.02654064, + "balance_loss_mlp": 1.01778185, + "epoch": 0.6793927551480535, + "flos": 24318275662080.0, + "grad_norm": 1.4383377941996194, + "language_loss": 0.77166653, + "learning_rate": 9.846371201225488e-07, + "loss": 0.79261738, + "num_input_tokens_seen": 243825185, + "step": 11300, + "time_per_iteration": 2.5503594875335693 + }, + { + "auxiliary_loss_clip": 0.01053424, + "auxiliary_loss_mlp": 0.01030516, + "balance_loss_clip": 1.02504992, + "balance_loss_mlp": 1.01907134, + "epoch": 0.6794528784007214, + "flos": 11436615757440.0, + "grad_norm": 1.8213905137353656, + "language_loss": 0.62870049, + "learning_rate": 9.843016006639577e-07, + "loss": 0.64953989, + "num_input_tokens_seen": 243841600, + "step": 11301, + "time_per_iteration": 2.526695728302002 + }, + { + "auxiliary_loss_clip": 0.01052953, + "auxiliary_loss_mlp": 0.01027231, + "balance_loss_clip": 1.02549803, + "balance_loss_mlp": 1.01710427, + "epoch": 0.6795130016533895, + "flos": 25229688382080.0, + "grad_norm": 1.673310657440983, + "language_loss": 0.8277092, + "learning_rate": 9.839661197207525e-07, + "loss": 0.8485111, + "num_input_tokens_seen": 243862250, + "step": 11302, + "time_per_iteration": 2.5741970539093018 + }, + { + "auxiliary_loss_clip": 0.01054495, + "auxiliary_loss_mlp": 0.01030037, + "balance_loss_clip": 1.02493036, + "balance_loss_mlp": 1.01883173, + "epoch": 0.6795731249060574, + "flos": 18296415434880.0, + "grad_norm": 2.4769461893610574, + "language_loss": 0.69483435, + "learning_rate": 9.83630677305654e-07, + "loss": 0.71567965, + "num_input_tokens_seen": 243880560, + "step": 11303, + "time_per_iteration": 2.6424624919891357 + }, + { + "auxiliary_loss_clip": 0.01036584, + "auxiliary_loss_mlp": 0.01029024, + "balance_loss_clip": 1.02520216, + "balance_loss_mlp": 1.01775885, + "epoch": 0.6796332481587254, + "flos": 20300374183680.0, + "grad_norm": 2.151955013281083, + "language_loss": 0.69772869, + "learning_rate": 9.832952734313813e-07, + "loss": 0.71838474, + "num_input_tokens_seen": 243900635, + "step": 11304, + "time_per_iteration": 2.6374847888946533 + }, + { + "auxiliary_loss_clip": 0.01052511, + "auxiliary_loss_mlp": 0.01033409, + "balance_loss_clip": 1.02572763, + "balance_loss_mlp": 1.02121401, + "epoch": 0.6796933714113934, + "flos": 23586847015680.0, + "grad_norm": 2.0852192066098674, + "language_loss": 0.72572452, + "learning_rate": 9.829599081106536e-07, + "loss": 0.74658376, + "num_input_tokens_seen": 243920160, + "step": 11305, + "time_per_iteration": 2.60846209526062 + }, + { + "auxiliary_loss_clip": 0.01043142, + "auxiliary_loss_mlp": 0.01030712, + "balance_loss_clip": 1.02481318, + "balance_loss_mlp": 1.01963711, + "epoch": 0.6797534946640613, + "flos": 27119917693440.0, + "grad_norm": 1.7663340814260364, + "language_loss": 0.65857989, + "learning_rate": 9.826245813561882e-07, + "loss": 0.67931843, + "num_input_tokens_seen": 243939015, + "step": 11306, + "time_per_iteration": 2.760767698287964 + }, + { + "auxiliary_loss_clip": 0.01042944, + "auxiliary_loss_mlp": 0.01026329, + "balance_loss_clip": 1.02536356, + "balance_loss_mlp": 1.01508129, + "epoch": 0.6798136179167293, + "flos": 22127437428480.0, + "grad_norm": 1.607319009095248, + "language_loss": 0.80060077, + "learning_rate": 9.822892931807021e-07, + "loss": 0.82129359, + "num_input_tokens_seen": 243958470, + "step": 11307, + "time_per_iteration": 2.664119243621826 + }, + { + "auxiliary_loss_clip": 0.01038549, + "auxiliary_loss_mlp": 0.0103578, + "balance_loss_clip": 1.02427471, + "balance_loss_mlp": 1.02343011, + "epoch": 0.6798737411693972, + "flos": 17488640430720.0, + "grad_norm": 1.8889259826543194, + "language_loss": 0.88834596, + "learning_rate": 9.819540435969066e-07, + "loss": 0.90908927, + "num_input_tokens_seen": 243975450, + "step": 11308, + "time_per_iteration": 2.621464490890503 + }, + { + "auxiliary_loss_clip": 0.01026821, + "auxiliary_loss_mlp": 0.01038752, + "balance_loss_clip": 1.02177, + "balance_loss_mlp": 1.02596664, + "epoch": 0.6799338644220653, + "flos": 22892262744960.0, + "grad_norm": 2.0464451152104415, + "language_loss": 0.71225852, + "learning_rate": 9.816188326175154e-07, + "loss": 0.73291421, + "num_input_tokens_seen": 243994355, + "step": 11309, + "time_per_iteration": 2.7612099647521973 + }, + { + "auxiliary_loss_clip": 0.01038711, + "auxiliary_loss_mlp": 0.01035477, + "balance_loss_clip": 1.02871823, + "balance_loss_mlp": 1.02418184, + "epoch": 0.6799939876747332, + "flos": 23180409648000.0, + "grad_norm": 2.030735092072388, + "language_loss": 0.84495932, + "learning_rate": 9.812836602552411e-07, + "loss": 0.8657012, + "num_input_tokens_seen": 244011620, + "step": 11310, + "time_per_iteration": 2.667224168777466 + }, + { + "auxiliary_loss_clip": 0.01043065, + "auxiliary_loss_mlp": 0.01027118, + "balance_loss_clip": 1.02747941, + "balance_loss_mlp": 1.01682985, + "epoch": 0.6800541109274012, + "flos": 19499925553920.0, + "grad_norm": 2.0295452357086545, + "language_loss": 0.83153439, + "learning_rate": 9.80948526522792e-07, + "loss": 0.85223615, + "num_input_tokens_seen": 244029925, + "step": 11311, + "time_per_iteration": 2.5959556102752686 + }, + { + "auxiliary_loss_clip": 0.01009297, + "auxiliary_loss_mlp": 0.01031191, + "balance_loss_clip": 1.02037227, + "balance_loss_mlp": 1.01789892, + "epoch": 0.6801142341800691, + "flos": 22277652105600.0, + "grad_norm": 2.397310498665072, + "language_loss": 0.76113325, + "learning_rate": 9.806134314328767e-07, + "loss": 0.78153813, + "num_input_tokens_seen": 244051225, + "step": 11312, + "time_per_iteration": 2.8144640922546387 + }, + { + "auxiliary_loss_clip": 0.01006894, + "auxiliary_loss_mlp": 0.01001616, + "balance_loss_clip": 1.00144625, + "balance_loss_mlp": 1.00065029, + "epoch": 0.6801743574327371, + "flos": 68714817759360.0, + "grad_norm": 0.6556780742245001, + "language_loss": 0.57271308, + "learning_rate": 9.802783749982038e-07, + "loss": 0.59279817, + "num_input_tokens_seen": 244115930, + "step": 11313, + "time_per_iteration": 3.3328373432159424 + }, + { + "auxiliary_loss_clip": 0.01052295, + "auxiliary_loss_mlp": 0.01026326, + "balance_loss_clip": 1.02353072, + "balance_loss_mlp": 1.015764, + "epoch": 0.680234480685405, + "flos": 29460467813760.0, + "grad_norm": 1.6364836261619844, + "language_loss": 0.68734097, + "learning_rate": 9.799433572314754e-07, + "loss": 0.70812714, + "num_input_tokens_seen": 244137320, + "step": 11314, + "time_per_iteration": 2.626459836959839 + }, + { + "auxiliary_loss_clip": 0.01046894, + "auxiliary_loss_mlp": 0.01027754, + "balance_loss_clip": 1.02098775, + "balance_loss_mlp": 1.01727581, + "epoch": 0.6802946039380731, + "flos": 15916866122880.0, + "grad_norm": 2.374751875588675, + "language_loss": 0.81626701, + "learning_rate": 9.796083781453972e-07, + "loss": 0.83701348, + "num_input_tokens_seen": 244152755, + "step": 11315, + "time_per_iteration": 2.6132853031158447 + }, + { + "auxiliary_loss_clip": 0.01025563, + "auxiliary_loss_mlp": 0.01025492, + "balance_loss_clip": 1.02591491, + "balance_loss_mlp": 1.01444757, + "epoch": 0.680354727190741, + "flos": 22018664067840.0, + "grad_norm": 1.6760566408656856, + "language_loss": 0.70106858, + "learning_rate": 9.792734377526718e-07, + "loss": 0.72157907, + "num_input_tokens_seen": 244171480, + "step": 11316, + "time_per_iteration": 2.7547190189361572 + }, + { + "auxiliary_loss_clip": 0.01054244, + "auxiliary_loss_mlp": 0.01026071, + "balance_loss_clip": 1.02625489, + "balance_loss_mlp": 1.01566958, + "epoch": 0.680414850443409, + "flos": 18441494467200.0, + "grad_norm": 2.1221779786811266, + "language_loss": 0.66782022, + "learning_rate": 9.789385360660003e-07, + "loss": 0.68862343, + "num_input_tokens_seen": 244187920, + "step": 11317, + "time_per_iteration": 2.6492507457733154 + }, + { + "auxiliary_loss_clip": 0.01056681, + "auxiliary_loss_mlp": 0.01039815, + "balance_loss_clip": 1.02781153, + "balance_loss_mlp": 1.02897263, + "epoch": 0.680474973696077, + "flos": 26358611909760.0, + "grad_norm": 1.5708541756007663, + "language_loss": 0.75248128, + "learning_rate": 9.78603673098082e-07, + "loss": 0.77344626, + "num_input_tokens_seen": 244209565, + "step": 11318, + "time_per_iteration": 4.303986072540283 + }, + { + "auxiliary_loss_clip": 0.01032248, + "auxiliary_loss_mlp": 0.01030428, + "balance_loss_clip": 1.02185977, + "balance_loss_mlp": 1.01998544, + "epoch": 0.6805350969487449, + "flos": 18333116156160.0, + "grad_norm": 1.563742164740196, + "language_loss": 0.68192297, + "learning_rate": 9.782688488616143e-07, + "loss": 0.7025497, + "num_input_tokens_seen": 244228015, + "step": 11319, + "time_per_iteration": 4.245357275009155 + }, + { + "auxiliary_loss_clip": 0.01029367, + "auxiliary_loss_mlp": 0.00747811, + "balance_loss_clip": 1.02865458, + "balance_loss_mlp": 1.00053835, + "epoch": 0.6805952202014129, + "flos": 19937497034880.0, + "grad_norm": 1.5465524273717985, + "language_loss": 0.77116871, + "learning_rate": 9.779340633692945e-07, + "loss": 0.78894049, + "num_input_tokens_seen": 244245615, + "step": 11320, + "time_per_iteration": 2.813347578048706 + }, + { + "auxiliary_loss_clip": 0.0104437, + "auxiliary_loss_mlp": 0.0103226, + "balance_loss_clip": 1.02629232, + "balance_loss_mlp": 1.02089357, + "epoch": 0.6806553434540809, + "flos": 25224301342080.0, + "grad_norm": 1.6253164843770587, + "language_loss": 0.74744976, + "learning_rate": 9.77599316633817e-07, + "loss": 0.76821601, + "num_input_tokens_seen": 244263625, + "step": 11321, + "time_per_iteration": 2.6951448917388916 + }, + { + "auxiliary_loss_clip": 0.0104585, + "auxiliary_loss_mlp": 0.01034217, + "balance_loss_clip": 1.02691305, + "balance_loss_mlp": 1.02323771, + "epoch": 0.6807154667067489, + "flos": 17785586165760.0, + "grad_norm": 2.5870235876607044, + "language_loss": 0.72896099, + "learning_rate": 9.772646086678758e-07, + "loss": 0.7497617, + "num_input_tokens_seen": 244282745, + "step": 11322, + "time_per_iteration": 2.586357593536377 + }, + { + "auxiliary_loss_clip": 0.01014453, + "auxiliary_loss_mlp": 0.00747734, + "balance_loss_clip": 1.02235401, + "balance_loss_mlp": 1.00050068, + "epoch": 0.6807755899594168, + "flos": 22199905117440.0, + "grad_norm": 1.7376667685545044, + "language_loss": 0.7815063, + "learning_rate": 9.769299394841638e-07, + "loss": 0.79912817, + "num_input_tokens_seen": 244303770, + "step": 11323, + "time_per_iteration": 2.7069313526153564 + }, + { + "auxiliary_loss_clip": 0.0098182, + "auxiliary_loss_mlp": 0.01001787, + "balance_loss_clip": 1.00543904, + "balance_loss_mlp": 1.00083923, + "epoch": 0.6808357132120848, + "flos": 68631073200000.0, + "grad_norm": 0.7513271973019653, + "language_loss": 0.57130903, + "learning_rate": 9.765953090953714e-07, + "loss": 0.5911451, + "num_input_tokens_seen": 244355910, + "step": 11324, + "time_per_iteration": 3.1154158115386963 + }, + { + "auxiliary_loss_clip": 0.01044731, + "auxiliary_loss_mlp": 0.01032501, + "balance_loss_clip": 1.02573466, + "balance_loss_mlp": 1.02068138, + "epoch": 0.6808958364647527, + "flos": 23843357015040.0, + "grad_norm": 1.9123260939792166, + "language_loss": 0.68351448, + "learning_rate": 9.76260717514186e-07, + "loss": 0.70428681, + "num_input_tokens_seen": 244376610, + "step": 11325, + "time_per_iteration": 2.66049861907959 + }, + { + "auxiliary_loss_clip": 0.01051267, + "auxiliary_loss_mlp": 0.01028811, + "balance_loss_clip": 1.02337277, + "balance_loss_mlp": 1.01716423, + "epoch": 0.6809559597174207, + "flos": 17711717846400.0, + "grad_norm": 2.0158423034009054, + "language_loss": 0.70250571, + "learning_rate": 9.759261647532974e-07, + "loss": 0.72330642, + "num_input_tokens_seen": 244393000, + "step": 11326, + "time_per_iteration": 2.6096911430358887 + }, + { + "auxiliary_loss_clip": 0.01064414, + "auxiliary_loss_mlp": 0.01026453, + "balance_loss_clip": 1.02524674, + "balance_loss_mlp": 1.01555109, + "epoch": 0.6810160829700886, + "flos": 22491894775680.0, + "grad_norm": 1.6931677839473716, + "language_loss": 0.7317639, + "learning_rate": 9.75591650825392e-07, + "loss": 0.75267255, + "num_input_tokens_seen": 244409515, + "step": 11327, + "time_per_iteration": 2.5261995792388916 + }, + { + "auxiliary_loss_clip": 0.01050095, + "auxiliary_loss_mlp": 0.0102767, + "balance_loss_clip": 1.02298403, + "balance_loss_mlp": 1.01675057, + "epoch": 0.6810762062227567, + "flos": 16832875783680.0, + "grad_norm": 1.748135119518857, + "language_loss": 0.77305514, + "learning_rate": 9.752571757431526e-07, + "loss": 0.79383278, + "num_input_tokens_seen": 244427165, + "step": 11328, + "time_per_iteration": 2.6076831817626953 + }, + { + "auxiliary_loss_clip": 0.01064472, + "auxiliary_loss_mlp": 0.01027491, + "balance_loss_clip": 1.02523696, + "balance_loss_mlp": 1.01633906, + "epoch": 0.6811363294754246, + "flos": 12714676554240.0, + "grad_norm": 1.8511970280919785, + "language_loss": 0.6434173, + "learning_rate": 9.74922739519265e-07, + "loss": 0.66433692, + "num_input_tokens_seen": 244445705, + "step": 11329, + "time_per_iteration": 2.5468902587890625 + }, + { + "auxiliary_loss_clip": 0.01016913, + "auxiliary_loss_mlp": 0.00747691, + "balance_loss_clip": 1.02690518, + "balance_loss_mlp": 1.0004766, + "epoch": 0.6811964527280926, + "flos": 17711969241600.0, + "grad_norm": 2.5083563184802182, + "language_loss": 0.79410124, + "learning_rate": 9.745883421664096e-07, + "loss": 0.81174725, + "num_input_tokens_seen": 244460415, + "step": 11330, + "time_per_iteration": 2.887277126312256 + }, + { + "auxiliary_loss_clip": 0.01053546, + "auxiliary_loss_mlp": 0.01026391, + "balance_loss_clip": 1.02524734, + "balance_loss_mlp": 1.01536989, + "epoch": 0.6812565759807605, + "flos": 24863471268480.0, + "grad_norm": 1.7729794990431258, + "language_loss": 0.63829303, + "learning_rate": 9.742539836972665e-07, + "loss": 0.65909243, + "num_input_tokens_seen": 244480555, + "step": 11331, + "time_per_iteration": 3.3280012607574463 + }, + { + "auxiliary_loss_clip": 0.01020594, + "auxiliary_loss_mlp": 0.01036039, + "balance_loss_clip": 1.02536178, + "balance_loss_mlp": 1.02372515, + "epoch": 0.6813166992334285, + "flos": 17166019449600.0, + "grad_norm": 1.6027168843644355, + "language_loss": 0.72532481, + "learning_rate": 9.739196641245148e-07, + "loss": 0.74589115, + "num_input_tokens_seen": 244498540, + "step": 11332, + "time_per_iteration": 2.737435817718506 + }, + { + "auxiliary_loss_clip": 0.01057796, + "auxiliary_loss_mlp": 0.01031721, + "balance_loss_clip": 1.02745509, + "balance_loss_mlp": 1.01999044, + "epoch": 0.6813768224860965, + "flos": 18843550375680.0, + "grad_norm": 2.057617291002613, + "language_loss": 0.7438736, + "learning_rate": 9.735853834608326e-07, + "loss": 0.76476872, + "num_input_tokens_seen": 244517015, + "step": 11333, + "time_per_iteration": 4.355731964111328 + }, + { + "auxiliary_loss_clip": 0.01056595, + "auxiliary_loss_mlp": 0.0102975, + "balance_loss_clip": 1.02579141, + "balance_loss_mlp": 1.01817441, + "epoch": 0.6814369457387645, + "flos": 24532733813760.0, + "grad_norm": 2.1420438271388327, + "language_loss": 0.72114992, + "learning_rate": 9.732511417188963e-07, + "loss": 0.74201339, + "num_input_tokens_seen": 244537450, + "step": 11334, + "time_per_iteration": 2.643279552459717 + }, + { + "auxiliary_loss_clip": 0.01056327, + "auxiliary_loss_mlp": 0.01028132, + "balance_loss_clip": 1.02773428, + "balance_loss_mlp": 1.01739156, + "epoch": 0.6814970689914325, + "flos": 18222978078720.0, + "grad_norm": 1.5955523568563963, + "language_loss": 0.85887843, + "learning_rate": 9.729169389113791e-07, + "loss": 0.87972301, + "num_input_tokens_seen": 244555640, + "step": 11335, + "time_per_iteration": 2.6016290187835693 + }, + { + "auxiliary_loss_clip": 0.01043957, + "auxiliary_loss_mlp": 0.01026457, + "balance_loss_clip": 1.0214591, + "balance_loss_mlp": 1.01668787, + "epoch": 0.6815571922441004, + "flos": 25228790542080.0, + "grad_norm": 1.6243047256245366, + "language_loss": 0.82270634, + "learning_rate": 9.725827750509542e-07, + "loss": 0.84341049, + "num_input_tokens_seen": 244574005, + "step": 11336, + "time_per_iteration": 2.6481776237487793 + }, + { + "auxiliary_loss_clip": 0.01026439, + "auxiliary_loss_mlp": 0.01030993, + "balance_loss_clip": 1.02236772, + "balance_loss_mlp": 1.02032971, + "epoch": 0.6816173154967684, + "flos": 19456078026240.0, + "grad_norm": 1.7727728116558201, + "language_loss": 0.81602108, + "learning_rate": 9.72248650150294e-07, + "loss": 0.83659542, + "num_input_tokens_seen": 244591395, + "step": 11337, + "time_per_iteration": 2.722235918045044 + }, + { + "auxiliary_loss_clip": 0.01023154, + "auxiliary_loss_mlp": 0.01028393, + "balance_loss_clip": 1.02584481, + "balance_loss_mlp": 1.01789033, + "epoch": 0.6816774387494363, + "flos": 17931455297280.0, + "grad_norm": 1.6901509834008266, + "language_loss": 0.72374892, + "learning_rate": 9.719145642220673e-07, + "loss": 0.74426436, + "num_input_tokens_seen": 244610400, + "step": 11338, + "time_per_iteration": 2.7051281929016113 + }, + { + "auxiliary_loss_clip": 0.01015447, + "auxiliary_loss_mlp": 0.01034022, + "balance_loss_clip": 1.02148151, + "balance_loss_mlp": 1.02319813, + "epoch": 0.6817375620021043, + "flos": 22233014478720.0, + "grad_norm": 1.4338762416272917, + "language_loss": 0.77475548, + "learning_rate": 9.715805172789435e-07, + "loss": 0.79525012, + "num_input_tokens_seen": 244630400, + "step": 11339, + "time_per_iteration": 2.848156690597534 + }, + { + "auxiliary_loss_clip": 0.01028367, + "auxiliary_loss_mlp": 0.0103384, + "balance_loss_clip": 1.02302623, + "balance_loss_mlp": 1.02268779, + "epoch": 0.6817976852547722, + "flos": 25374408278400.0, + "grad_norm": 1.9916386566417421, + "language_loss": 0.70663965, + "learning_rate": 9.712465093335901e-07, + "loss": 0.72726178, + "num_input_tokens_seen": 244649155, + "step": 11340, + "time_per_iteration": 2.748206377029419 + }, + { + "auxiliary_loss_clip": 0.01048159, + "auxiliary_loss_mlp": 0.01031433, + "balance_loss_clip": 1.027511, + "balance_loss_mlp": 1.02044761, + "epoch": 0.6818578085074403, + "flos": 22265764704000.0, + "grad_norm": 2.1954394154416823, + "language_loss": 0.83732349, + "learning_rate": 9.709125403986722e-07, + "loss": 0.85811949, + "num_input_tokens_seen": 244665470, + "step": 11341, + "time_per_iteration": 2.7066285610198975 + }, + { + "auxiliary_loss_clip": 0.01035464, + "auxiliary_loss_mlp": 0.01030826, + "balance_loss_clip": 1.02522564, + "balance_loss_mlp": 1.01873183, + "epoch": 0.6819179317601082, + "flos": 19318145800320.0, + "grad_norm": 2.056463223895019, + "language_loss": 0.68607485, + "learning_rate": 9.705786104868531e-07, + "loss": 0.70673776, + "num_input_tokens_seen": 244684390, + "step": 11342, + "time_per_iteration": 2.759589195251465 + }, + { + "auxiliary_loss_clip": 0.01015158, + "auxiliary_loss_mlp": 0.01027829, + "balance_loss_clip": 1.0246377, + "balance_loss_mlp": 1.01703441, + "epoch": 0.6819780550127762, + "flos": 21104126864640.0, + "grad_norm": 1.4960946881948785, + "language_loss": 0.74968314, + "learning_rate": 9.702447196107963e-07, + "loss": 0.77011305, + "num_input_tokens_seen": 244703370, + "step": 11343, + "time_per_iteration": 4.504784107208252 + }, + { + "auxiliary_loss_clip": 0.0102997, + "auxiliary_loss_mlp": 0.01034697, + "balance_loss_clip": 1.02552056, + "balance_loss_mlp": 1.02259696, + "epoch": 0.6820381782654441, + "flos": 29716403195520.0, + "grad_norm": 1.6820631411273246, + "language_loss": 0.79522467, + "learning_rate": 9.699108677831639e-07, + "loss": 0.81587136, + "num_input_tokens_seen": 244723325, + "step": 11344, + "time_per_iteration": 2.8664727210998535 + }, + { + "auxiliary_loss_clip": 0.01034707, + "auxiliary_loss_mlp": 0.010306, + "balance_loss_clip": 1.02488995, + "balance_loss_mlp": 1.01960301, + "epoch": 0.6820983015181121, + "flos": 29242130993280.0, + "grad_norm": 2.4895435933975887, + "language_loss": 0.66355991, + "learning_rate": 9.695770550166136e-07, + "loss": 0.68421304, + "num_input_tokens_seen": 244745650, + "step": 11345, + "time_per_iteration": 2.858762741088867 + }, + { + "auxiliary_loss_clip": 0.01048128, + "auxiliary_loss_mlp": 0.01030765, + "balance_loss_clip": 1.0272212, + "balance_loss_mlp": 1.01922524, + "epoch": 0.6821584247707801, + "flos": 18871775487360.0, + "grad_norm": 2.585966008727193, + "language_loss": 0.6496833, + "learning_rate": 9.692432813238054e-07, + "loss": 0.67047226, + "num_input_tokens_seen": 244760270, + "step": 11346, + "time_per_iteration": 2.724792242050171 + }, + { + "auxiliary_loss_clip": 0.00991846, + "auxiliary_loss_mlp": 0.00747731, + "balance_loss_clip": 1.01682067, + "balance_loss_mlp": 1.00053394, + "epoch": 0.6822185480234481, + "flos": 21324582587520.0, + "grad_norm": 1.7423129308553427, + "language_loss": 0.78483772, + "learning_rate": 9.689095467173952e-07, + "loss": 0.80223352, + "num_input_tokens_seen": 244779565, + "step": 11347, + "time_per_iteration": 2.8529560565948486 + }, + { + "auxiliary_loss_clip": 0.01000256, + "auxiliary_loss_mlp": 0.0100184, + "balance_loss_clip": 1.00436175, + "balance_loss_mlp": 1.0008086, + "epoch": 0.6822786712761161, + "flos": 63488306430720.0, + "grad_norm": 0.730827864240289, + "language_loss": 0.52527285, + "learning_rate": 9.685758512100378e-07, + "loss": 0.54529381, + "num_input_tokens_seen": 244838480, + "step": 11348, + "time_per_iteration": 3.2036867141723633 + }, + { + "auxiliary_loss_clip": 0.01062475, + "auxiliary_loss_mlp": 0.01030021, + "balance_loss_clip": 1.02529263, + "balance_loss_mlp": 1.01956642, + "epoch": 0.682338794528784, + "flos": 21068934514560.0, + "grad_norm": 1.7776008637436171, + "language_loss": 0.79520172, + "learning_rate": 9.682421948143873e-07, + "loss": 0.8161267, + "num_input_tokens_seen": 244855265, + "step": 11349, + "time_per_iteration": 2.6799490451812744 + }, + { + "auxiliary_loss_clip": 0.01059027, + "auxiliary_loss_mlp": 0.01029112, + "balance_loss_clip": 1.02631176, + "balance_loss_mlp": 1.01600528, + "epoch": 0.682398917781452, + "flos": 36283243547520.0, + "grad_norm": 1.77196031069164, + "language_loss": 0.73565418, + "learning_rate": 9.67908577543096e-07, + "loss": 0.75653553, + "num_input_tokens_seen": 244875555, + "step": 11350, + "time_per_iteration": 2.780057907104492 + }, + { + "auxiliary_loss_clip": 0.01064518, + "auxiliary_loss_mlp": 0.01028798, + "balance_loss_clip": 1.02637506, + "balance_loss_mlp": 1.01728797, + "epoch": 0.6824590410341199, + "flos": 24859197550080.0, + "grad_norm": 1.5734675870989074, + "language_loss": 0.79206842, + "learning_rate": 9.675749994088161e-07, + "loss": 0.81300163, + "num_input_tokens_seen": 244895270, + "step": 11351, + "time_per_iteration": 2.5935513973236084 + }, + { + "auxiliary_loss_clip": 0.01054048, + "auxiliary_loss_mlp": 0.01029355, + "balance_loss_clip": 1.02620053, + "balance_loss_mlp": 1.01928818, + "epoch": 0.6825191642867879, + "flos": 22452392793600.0, + "grad_norm": 1.551133236549722, + "language_loss": 0.73193127, + "learning_rate": 9.672414604241954e-07, + "loss": 0.7527653, + "num_input_tokens_seen": 244914535, + "step": 11352, + "time_per_iteration": 2.6238040924072266 + }, + { + "auxiliary_loss_clip": 0.01019609, + "auxiliary_loss_mlp": 0.0103441, + "balance_loss_clip": 1.02373219, + "balance_loss_mlp": 1.02203035, + "epoch": 0.6825792875394558, + "flos": 29424377623680.0, + "grad_norm": 1.8460023039931222, + "language_loss": 0.80243981, + "learning_rate": 9.669079606018814e-07, + "loss": 0.82298005, + "num_input_tokens_seen": 244936095, + "step": 11353, + "time_per_iteration": 2.714658498764038 + }, + { + "auxiliary_loss_clip": 0.01052271, + "auxiliary_loss_mlp": 0.01026102, + "balance_loss_clip": 1.02412045, + "balance_loss_mlp": 1.01477063, + "epoch": 0.6826394107921239, + "flos": 18770974945920.0, + "grad_norm": 1.6232308475209931, + "language_loss": 0.78299642, + "learning_rate": 9.665744999545218e-07, + "loss": 0.8037802, + "num_input_tokens_seen": 244955290, + "step": 11354, + "time_per_iteration": 2.5628693103790283 + }, + { + "auxiliary_loss_clip": 0.01012832, + "auxiliary_loss_mlp": 0.01026085, + "balance_loss_clip": 1.02551281, + "balance_loss_mlp": 1.01577306, + "epoch": 0.6826995340447918, + "flos": 16617591619200.0, + "grad_norm": 2.9216310234579543, + "language_loss": 0.62233961, + "learning_rate": 9.662410784947599e-07, + "loss": 0.64272881, + "num_input_tokens_seen": 244972935, + "step": 11355, + "time_per_iteration": 2.7696430683135986 + }, + { + "auxiliary_loss_clip": 0.01006305, + "auxiliary_loss_mlp": 0.01027581, + "balance_loss_clip": 1.0192318, + "balance_loss_mlp": 1.01607704, + "epoch": 0.6827596572974598, + "flos": 20848299223680.0, + "grad_norm": 1.8074827663201514, + "language_loss": 0.81761897, + "learning_rate": 9.659076962352398e-07, + "loss": 0.83795774, + "num_input_tokens_seen": 244989440, + "step": 11356, + "time_per_iteration": 2.6941845417022705 + }, + { + "auxiliary_loss_clip": 0.01048854, + "auxiliary_loss_mlp": 0.01029629, + "balance_loss_clip": 1.02922201, + "balance_loss_mlp": 1.01806617, + "epoch": 0.6828197805501277, + "flos": 22748081552640.0, + "grad_norm": 1.683121132137117, + "language_loss": 0.78463256, + "learning_rate": 9.655743531886052e-07, + "loss": 0.80541742, + "num_input_tokens_seen": 245007830, + "step": 11357, + "time_per_iteration": 2.6589787006378174 + }, + { + "auxiliary_loss_clip": 0.00987105, + "auxiliary_loss_mlp": 0.01003939, + "balance_loss_clip": 1.0016042, + "balance_loss_mlp": 1.00290203, + "epoch": 0.6828799038027957, + "flos": 71646565829760.0, + "grad_norm": 0.8188783388060886, + "language_loss": 0.59571886, + "learning_rate": 9.65241049367493e-07, + "loss": 0.61562932, + "num_input_tokens_seen": 245070720, + "step": 11358, + "time_per_iteration": 3.239731550216675 + }, + { + "auxiliary_loss_clip": 0.01019552, + "auxiliary_loss_mlp": 0.01041325, + "balance_loss_clip": 1.0194838, + "balance_loss_mlp": 1.02768135, + "epoch": 0.6829400270554637, + "flos": 19829154637440.0, + "grad_norm": 1.8281879676441897, + "language_loss": 0.78484023, + "learning_rate": 9.64907784784544e-07, + "loss": 0.80544901, + "num_input_tokens_seen": 245089070, + "step": 11359, + "time_per_iteration": 2.598057746887207 + }, + { + "auxiliary_loss_clip": 0.01052917, + "auxiliary_loss_mlp": 0.01033606, + "balance_loss_clip": 1.02441049, + "balance_loss_mlp": 1.02254903, + "epoch": 0.6830001503081317, + "flos": 21980634543360.0, + "grad_norm": 1.9073042530774336, + "language_loss": 0.81539059, + "learning_rate": 9.645745594523958e-07, + "loss": 0.83625579, + "num_input_tokens_seen": 245106500, + "step": 11360, + "time_per_iteration": 2.545091152191162 + }, + { + "auxiliary_loss_clip": 0.01050711, + "auxiliary_loss_mlp": 0.01032983, + "balance_loss_clip": 1.02645016, + "balance_loss_mlp": 1.02090132, + "epoch": 0.6830602735607997, + "flos": 24316767290880.0, + "grad_norm": 1.7299421719127777, + "language_loss": 0.75362116, + "learning_rate": 9.642413733836844e-07, + "loss": 0.77445817, + "num_input_tokens_seen": 245125260, + "step": 11361, + "time_per_iteration": 2.6382501125335693 + }, + { + "auxiliary_loss_clip": 0.00993573, + "auxiliary_loss_mlp": 0.01002272, + "balance_loss_clip": 1.00732422, + "balance_loss_mlp": 1.00092506, + "epoch": 0.6831203968134676, + "flos": 57690062323200.0, + "grad_norm": 0.8744192028291959, + "language_loss": 0.59685117, + "learning_rate": 9.639082265910437e-07, + "loss": 0.61680967, + "num_input_tokens_seen": 245188730, + "step": 11362, + "time_per_iteration": 3.2753279209136963 + }, + { + "auxiliary_loss_clip": 0.01041552, + "auxiliary_loss_mlp": 0.01027581, + "balance_loss_clip": 1.02358806, + "balance_loss_mlp": 1.01587486, + "epoch": 0.6831805200661356, + "flos": 14388436552320.0, + "grad_norm": 2.1620728240497655, + "language_loss": 0.75558454, + "learning_rate": 9.635751190871074e-07, + "loss": 0.77627587, + "num_input_tokens_seen": 245205065, + "step": 11363, + "time_per_iteration": 2.606661796569824 + }, + { + "auxiliary_loss_clip": 0.01043402, + "auxiliary_loss_mlp": 0.01029697, + "balance_loss_clip": 1.02467132, + "balance_loss_mlp": 1.01845503, + "epoch": 0.6832406433188035, + "flos": 22820297846400.0, + "grad_norm": 2.7216209876608857, + "language_loss": 0.89606488, + "learning_rate": 9.632420508845063e-07, + "loss": 0.91679585, + "num_input_tokens_seen": 245224265, + "step": 11364, + "time_per_iteration": 2.692298173904419 + }, + { + "auxiliary_loss_clip": 0.01039181, + "auxiliary_loss_mlp": 0.01027832, + "balance_loss_clip": 1.02355266, + "balance_loss_mlp": 1.01749027, + "epoch": 0.6833007665714715, + "flos": 17561718650880.0, + "grad_norm": 2.2018564229410615, + "language_loss": 0.88371295, + "learning_rate": 9.629090219958697e-07, + "loss": 0.90438306, + "num_input_tokens_seen": 245243360, + "step": 11365, + "time_per_iteration": 4.338263273239136 + }, + { + "auxiliary_loss_clip": 0.01029857, + "auxiliary_loss_mlp": 0.0103748, + "balance_loss_clip": 1.02369165, + "balance_loss_mlp": 1.02488577, + "epoch": 0.6833608898241395, + "flos": 22445928345600.0, + "grad_norm": 2.396835613945954, + "language_loss": 0.81390953, + "learning_rate": 9.625760324338272e-07, + "loss": 0.83458287, + "num_input_tokens_seen": 245256350, + "step": 11366, + "time_per_iteration": 4.3464250564575195 + }, + { + "auxiliary_loss_clip": 0.01044121, + "auxiliary_loss_mlp": 0.01028753, + "balance_loss_clip": 1.02466989, + "balance_loss_mlp": 1.01794672, + "epoch": 0.6834210130768075, + "flos": 24534637234560.0, + "grad_norm": 1.379391720901288, + "language_loss": 0.76730782, + "learning_rate": 9.622430822110062e-07, + "loss": 0.78803658, + "num_input_tokens_seen": 245277575, + "step": 11367, + "time_per_iteration": 2.6881701946258545 + }, + { + "auxiliary_loss_clip": 0.01044839, + "auxiliary_loss_mlp": 0.01034324, + "balance_loss_clip": 1.02642775, + "balance_loss_mlp": 1.02293384, + "epoch": 0.6834811363294754, + "flos": 20047132321920.0, + "grad_norm": 1.5228836052867092, + "language_loss": 0.69368857, + "learning_rate": 9.619101713400312e-07, + "loss": 0.71448022, + "num_input_tokens_seen": 245296615, + "step": 11368, + "time_per_iteration": 2.7030837535858154 + }, + { + "auxiliary_loss_clip": 0.01021901, + "auxiliary_loss_mlp": 0.0103282, + "balance_loss_clip": 1.02129376, + "balance_loss_mlp": 1.02188301, + "epoch": 0.6835412595821434, + "flos": 24790752184320.0, + "grad_norm": 1.8866384561448983, + "language_loss": 0.73584294, + "learning_rate": 9.615772998335261e-07, + "loss": 0.75639015, + "num_input_tokens_seen": 245316275, + "step": 11369, + "time_per_iteration": 2.671753406524658 + }, + { + "auxiliary_loss_clip": 0.01053782, + "auxiliary_loss_mlp": 0.0102784, + "balance_loss_clip": 1.02436459, + "balance_loss_mlp": 1.01725984, + "epoch": 0.6836013828348113, + "flos": 19500356517120.0, + "grad_norm": 1.822995963301413, + "language_loss": 0.78929704, + "learning_rate": 9.612444677041138e-07, + "loss": 0.81011319, + "num_input_tokens_seen": 245334595, + "step": 11370, + "time_per_iteration": 2.5631332397460938 + }, + { + "auxiliary_loss_clip": 0.0099839, + "auxiliary_loss_mlp": 0.01002938, + "balance_loss_clip": 1.00223994, + "balance_loss_mlp": 1.00190103, + "epoch": 0.6836615060874793, + "flos": 58363999251840.0, + "grad_norm": 0.7509062749904, + "language_loss": 0.59819776, + "learning_rate": 9.609116749644162e-07, + "loss": 0.61821103, + "num_input_tokens_seen": 245389750, + "step": 11371, + "time_per_iteration": 3.0848660469055176 + }, + { + "auxiliary_loss_clip": 0.01041791, + "auxiliary_loss_mlp": 0.0102554, + "balance_loss_clip": 1.02499735, + "balance_loss_mlp": 1.01548481, + "epoch": 0.6837216293401474, + "flos": 12166895168640.0, + "grad_norm": 1.467515330371912, + "language_loss": 0.63827533, + "learning_rate": 9.605789216270511e-07, + "loss": 0.65894866, + "num_input_tokens_seen": 245407530, + "step": 11372, + "time_per_iteration": 2.6879308223724365 + }, + { + "auxiliary_loss_clip": 0.01054171, + "auxiliary_loss_mlp": 0.01025723, + "balance_loss_clip": 1.02567625, + "balance_loss_mlp": 1.01516736, + "epoch": 0.6837817525928153, + "flos": 22127581082880.0, + "grad_norm": 1.9482533619465787, + "language_loss": 0.7181654, + "learning_rate": 9.602462077046375e-07, + "loss": 0.73896432, + "num_input_tokens_seen": 245427000, + "step": 11373, + "time_per_iteration": 2.546649932861328 + }, + { + "auxiliary_loss_clip": 0.00980281, + "auxiliary_loss_mlp": 0.01002411, + "balance_loss_clip": 1.003474, + "balance_loss_mlp": 1.00136745, + "epoch": 0.6838418758454833, + "flos": 65005928985600.0, + "grad_norm": 1.284965514173848, + "language_loss": 0.56700706, + "learning_rate": 9.599135332097935e-07, + "loss": 0.58683407, + "num_input_tokens_seen": 245491620, + "step": 11374, + "time_per_iteration": 3.3281753063201904 + }, + { + "auxiliary_loss_clip": 0.01054931, + "auxiliary_loss_mlp": 0.01026473, + "balance_loss_clip": 1.02570796, + "balance_loss_mlp": 1.01524377, + "epoch": 0.6839019990981512, + "flos": 21030833162880.0, + "grad_norm": 1.4314571053508456, + "language_loss": 0.73848242, + "learning_rate": 9.595808981551312e-07, + "loss": 0.75929648, + "num_input_tokens_seen": 245511285, + "step": 11375, + "time_per_iteration": 2.5893211364746094 + }, + { + "auxiliary_loss_clip": 0.01045144, + "auxiliary_loss_mlp": 0.01029196, + "balance_loss_clip": 1.02664113, + "balance_loss_mlp": 1.0183301, + "epoch": 0.6839621223508192, + "flos": 24935543907840.0, + "grad_norm": 1.690952422486465, + "language_loss": 0.70544577, + "learning_rate": 9.592483025532651e-07, + "loss": 0.72618914, + "num_input_tokens_seen": 245532910, + "step": 11376, + "time_per_iteration": 2.652879238128662 + }, + { + "auxiliary_loss_clip": 0.01064815, + "auxiliary_loss_mlp": 0.01030918, + "balance_loss_clip": 1.02498484, + "balance_loss_mlp": 1.01976621, + "epoch": 0.6840222456034871, + "flos": 26358827391360.0, + "grad_norm": 1.7818035236224488, + "language_loss": 0.74298143, + "learning_rate": 9.58915746416808e-07, + "loss": 0.76393872, + "num_input_tokens_seen": 245550540, + "step": 11377, + "time_per_iteration": 2.540029764175415 + }, + { + "auxiliary_loss_clip": 0.00991165, + "auxiliary_loss_mlp": 0.01002255, + "balance_loss_clip": 1.00478041, + "balance_loss_mlp": 1.00118256, + "epoch": 0.6840823688561551, + "flos": 65988336936960.0, + "grad_norm": 0.7142626932900753, + "language_loss": 0.56884825, + "learning_rate": 9.585832297583707e-07, + "loss": 0.58878249, + "num_input_tokens_seen": 245619570, + "step": 11378, + "time_per_iteration": 3.281987190246582 + }, + { + "auxiliary_loss_clip": 0.01064543, + "auxiliary_loss_mlp": 0.01030071, + "balance_loss_clip": 1.02607465, + "balance_loss_mlp": 1.01860905, + "epoch": 0.684142492108823, + "flos": 21397588980480.0, + "grad_norm": 1.7222188307752082, + "language_loss": 0.78481007, + "learning_rate": 9.58250752590561e-07, + "loss": 0.80575621, + "num_input_tokens_seen": 245637980, + "step": 11379, + "time_per_iteration": 2.5217509269714355 + }, + { + "auxiliary_loss_clip": 0.01059106, + "auxiliary_loss_mlp": 0.01025357, + "balance_loss_clip": 1.02494097, + "balance_loss_mlp": 1.01626182, + "epoch": 0.6842026153614911, + "flos": 18801426700800.0, + "grad_norm": 1.7000910680568753, + "language_loss": 0.68967533, + "learning_rate": 9.57918314925988e-07, + "loss": 0.71051991, + "num_input_tokens_seen": 245655690, + "step": 11380, + "time_per_iteration": 4.160714387893677 + }, + { + "auxiliary_loss_clip": 0.0104157, + "auxiliary_loss_mlp": 0.01028787, + "balance_loss_clip": 1.02355874, + "balance_loss_mlp": 1.01798677, + "epoch": 0.684262738614159, + "flos": 19646405216640.0, + "grad_norm": 2.2103851897379045, + "language_loss": 0.7829712, + "learning_rate": 9.575859167772568e-07, + "loss": 0.80367482, + "num_input_tokens_seen": 245671525, + "step": 11381, + "time_per_iteration": 2.7774901390075684 + }, + { + "auxiliary_loss_clip": 0.00997812, + "auxiliary_loss_mlp": 0.01002099, + "balance_loss_clip": 1.00225341, + "balance_loss_mlp": 1.00122261, + "epoch": 0.684322861866827, + "flos": 62354462739840.0, + "grad_norm": 0.8900626762491223, + "language_loss": 0.6720646, + "learning_rate": 9.572535581569713e-07, + "loss": 0.69206369, + "num_input_tokens_seen": 245724115, + "step": 11382, + "time_per_iteration": 3.0119967460632324 + }, + { + "auxiliary_loss_clip": 0.01000254, + "auxiliary_loss_mlp": 0.01002438, + "balance_loss_clip": 1.00449193, + "balance_loss_mlp": 1.00151443, + "epoch": 0.6843829851194949, + "flos": 65805048812160.0, + "grad_norm": 0.8474137156953356, + "language_loss": 0.58154017, + "learning_rate": 9.569212390777356e-07, + "loss": 0.60156709, + "num_input_tokens_seen": 245789245, + "step": 11383, + "time_per_iteration": 3.2287373542785645 + }, + { + "auxiliary_loss_clip": 0.01015445, + "auxiliary_loss_mlp": 0.01026962, + "balance_loss_clip": 1.02125406, + "balance_loss_mlp": 1.01573253, + "epoch": 0.6844431083721629, + "flos": 27855153181440.0, + "grad_norm": 1.591585028240997, + "language_loss": 0.79780173, + "learning_rate": 9.565889595521517e-07, + "loss": 0.8182258, + "num_input_tokens_seen": 245812420, + "step": 11384, + "time_per_iteration": 2.7984447479248047 + }, + { + "auxiliary_loss_clip": 0.01055321, + "auxiliary_loss_mlp": 0.01032889, + "balance_loss_clip": 1.02528739, + "balance_loss_mlp": 1.02192807, + "epoch": 0.684503231624831, + "flos": 18255010032000.0, + "grad_norm": 1.75908423864088, + "language_loss": 0.7718178, + "learning_rate": 9.562567195928187e-07, + "loss": 0.79269987, + "num_input_tokens_seen": 245829135, + "step": 11385, + "time_per_iteration": 2.571256399154663 + }, + { + "auxiliary_loss_clip": 0.01034067, + "auxiliary_loss_mlp": 0.01035769, + "balance_loss_clip": 1.02400827, + "balance_loss_mlp": 1.02291226, + "epoch": 0.6845633548774989, + "flos": 17639681120640.0, + "grad_norm": 8.472923497674023, + "language_loss": 0.84200209, + "learning_rate": 9.55924519212335e-07, + "loss": 0.86270046, + "num_input_tokens_seen": 245847140, + "step": 11386, + "time_per_iteration": 2.63688325881958 + }, + { + "auxiliary_loss_clip": 0.01052035, + "auxiliary_loss_mlp": 0.01034306, + "balance_loss_clip": 1.02487361, + "balance_loss_mlp": 1.02430451, + "epoch": 0.6846234781301669, + "flos": 20807576179200.0, + "grad_norm": 4.071005649642153, + "language_loss": 0.83233213, + "learning_rate": 9.555923584232984e-07, + "loss": 0.85319555, + "num_input_tokens_seen": 245862855, + "step": 11387, + "time_per_iteration": 2.665597438812256 + }, + { + "auxiliary_loss_clip": 0.01044107, + "auxiliary_loss_mlp": 0.01023988, + "balance_loss_clip": 1.02269411, + "balance_loss_mlp": 1.01343799, + "epoch": 0.6846836013828348, + "flos": 36101176485120.0, + "grad_norm": 1.5332569927853452, + "language_loss": 0.72165346, + "learning_rate": 9.552602372383047e-07, + "loss": 0.74233443, + "num_input_tokens_seen": 245885415, + "step": 11388, + "time_per_iteration": 2.749882459640503 + }, + { + "auxiliary_loss_clip": 0.01053727, + "auxiliary_loss_mlp": 0.01026767, + "balance_loss_clip": 1.02638459, + "balance_loss_mlp": 1.01657474, + "epoch": 0.6847437246355028, + "flos": 43142468607360.0, + "grad_norm": 2.6895649097396594, + "language_loss": 0.62790477, + "learning_rate": 9.549281556699469e-07, + "loss": 0.64870971, + "num_input_tokens_seen": 245906285, + "step": 11389, + "time_per_iteration": 4.3440563678741455 + }, + { + "auxiliary_loss_clip": 0.00988045, + "auxiliary_loss_mlp": 0.01000399, + "balance_loss_clip": 1.00296497, + "balance_loss_mlp": 0.9994151, + "epoch": 0.6848038478881707, + "flos": 71663729552640.0, + "grad_norm": 0.728518668086981, + "language_loss": 0.55997324, + "learning_rate": 9.54596113730818e-07, + "loss": 0.57985771, + "num_input_tokens_seen": 245967620, + "step": 11390, + "time_per_iteration": 3.2430264949798584 + }, + { + "auxiliary_loss_clip": 0.0102583, + "auxiliary_loss_mlp": 0.00747612, + "balance_loss_clip": 1.02428412, + "balance_loss_mlp": 1.0003891, + "epoch": 0.6848639711408387, + "flos": 19937820257280.0, + "grad_norm": 9.081907820656587, + "language_loss": 0.87716019, + "learning_rate": 9.542641114335109e-07, + "loss": 0.8948946, + "num_input_tokens_seen": 245985075, + "step": 11391, + "time_per_iteration": 2.649832010269165 + }, + { + "auxiliary_loss_clip": 0.01019219, + "auxiliary_loss_mlp": 0.01029533, + "balance_loss_clip": 1.02479064, + "balance_loss_mlp": 1.01848865, + "epoch": 0.6849240943935067, + "flos": 26867501844480.0, + "grad_norm": 1.6137628693548838, + "language_loss": 0.78852618, + "learning_rate": 9.539321487906117e-07, + "loss": 0.80901372, + "num_input_tokens_seen": 246003560, + "step": 11392, + "time_per_iteration": 2.7318460941314697 + }, + { + "auxiliary_loss_clip": 0.01039794, + "auxiliary_loss_mlp": 0.01026409, + "balance_loss_clip": 1.02391648, + "balance_loss_mlp": 1.01567996, + "epoch": 0.6849842176461747, + "flos": 13735365425280.0, + "grad_norm": 1.861296301343471, + "language_loss": 0.70324874, + "learning_rate": 9.536002258147104e-07, + "loss": 0.72391075, + "num_input_tokens_seen": 246019600, + "step": 11393, + "time_per_iteration": 2.7308859825134277 + }, + { + "auxiliary_loss_clip": 0.01021465, + "auxiliary_loss_mlp": 0.01029873, + "balance_loss_clip": 1.02253139, + "balance_loss_mlp": 1.01822662, + "epoch": 0.6850443408988426, + "flos": 24973070641920.0, + "grad_norm": 1.6677730501639023, + "language_loss": 0.64868581, + "learning_rate": 9.532683425183936e-07, + "loss": 0.66919923, + "num_input_tokens_seen": 246038920, + "step": 11394, + "time_per_iteration": 2.7936246395111084 + }, + { + "auxiliary_loss_clip": 0.01036383, + "auxiliary_loss_mlp": 0.00747744, + "balance_loss_clip": 1.02417898, + "balance_loss_mlp": 1.0004642, + "epoch": 0.6851044641515106, + "flos": 27744225004800.0, + "grad_norm": 1.5076561356571068, + "language_loss": 0.80567431, + "learning_rate": 9.529364989142468e-07, + "loss": 0.82351559, + "num_input_tokens_seen": 246060490, + "step": 11395, + "time_per_iteration": 2.7270519733428955 + }, + { + "auxiliary_loss_clip": 0.01025472, + "auxiliary_loss_mlp": 0.01030985, + "balance_loss_clip": 1.0245564, + "balance_loss_mlp": 1.01907611, + "epoch": 0.6851645874041785, + "flos": 24351061800960.0, + "grad_norm": 2.058206994237591, + "language_loss": 0.72926009, + "learning_rate": 9.526046950148527e-07, + "loss": 0.74982464, + "num_input_tokens_seen": 246081465, + "step": 11396, + "time_per_iteration": 2.7101638317108154 + }, + { + "auxiliary_loss_clip": 0.01032073, + "auxiliary_loss_mlp": 0.01025563, + "balance_loss_clip": 1.02514887, + "balance_loss_mlp": 1.01358807, + "epoch": 0.6852247106568465, + "flos": 15077849264640.0, + "grad_norm": 2.6921331343867805, + "language_loss": 0.78967541, + "learning_rate": 9.522729308327931e-07, + "loss": 0.81025183, + "num_input_tokens_seen": 246096110, + "step": 11397, + "time_per_iteration": 2.6139416694641113 + }, + { + "auxiliary_loss_clip": 0.00986582, + "auxiliary_loss_mlp": 0.01027555, + "balance_loss_clip": 1.02014136, + "balance_loss_mlp": 1.01580071, + "epoch": 0.6852848339095146, + "flos": 18770005278720.0, + "grad_norm": 1.9884236366310608, + "language_loss": 0.71737069, + "learning_rate": 9.519412063806493e-07, + "loss": 0.73751211, + "num_input_tokens_seen": 246114785, + "step": 11398, + "time_per_iteration": 2.790327548980713 + }, + { + "auxiliary_loss_clip": 0.01022815, + "auxiliary_loss_mlp": 0.01029157, + "balance_loss_clip": 1.02522826, + "balance_loss_mlp": 1.01876211, + "epoch": 0.6853449571621825, + "flos": 27854363082240.0, + "grad_norm": 2.1719933958915356, + "language_loss": 0.71296918, + "learning_rate": 9.516095216709996e-07, + "loss": 0.73348892, + "num_input_tokens_seen": 246136375, + "step": 11399, + "time_per_iteration": 2.8204538822174072 + }, + { + "auxiliary_loss_clip": 0.0104568, + "auxiliary_loss_mlp": 0.01032924, + "balance_loss_clip": 1.02382863, + "balance_loss_mlp": 1.02175426, + "epoch": 0.6854050804148505, + "flos": 18150510389760.0, + "grad_norm": 2.207746576600165, + "language_loss": 0.70353138, + "learning_rate": 9.512778767164217e-07, + "loss": 0.72431737, + "num_input_tokens_seen": 246155090, + "step": 11400, + "time_per_iteration": 2.630263328552246 + }, + { + "auxiliary_loss_clip": 0.0103065, + "auxiliary_loss_mlp": 0.01039088, + "balance_loss_clip": 1.02626455, + "balance_loss_mlp": 1.02487206, + "epoch": 0.6854652036675184, + "flos": 16326212492160.0, + "grad_norm": 1.9321774885792062, + "language_loss": 0.77936757, + "learning_rate": 9.509462715294927e-07, + "loss": 0.80006492, + "num_input_tokens_seen": 246172645, + "step": 11401, + "time_per_iteration": 2.643425226211548 + }, + { + "auxiliary_loss_clip": 0.0106317, + "auxiliary_loss_mlp": 0.01031884, + "balance_loss_clip": 1.02581167, + "balance_loss_mlp": 1.02143562, + "epoch": 0.6855253269201864, + "flos": 14940814878720.0, + "grad_norm": 2.4709450185995467, + "language_loss": 0.75268316, + "learning_rate": 9.50614706122786e-07, + "loss": 0.77363372, + "num_input_tokens_seen": 246189055, + "step": 11402, + "time_per_iteration": 2.4856069087982178 + }, + { + "auxiliary_loss_clip": 0.01045629, + "auxiliary_loss_mlp": 0.01034533, + "balance_loss_clip": 1.02336991, + "balance_loss_mlp": 1.02224267, + "epoch": 0.6855854501728543, + "flos": 23037736826880.0, + "grad_norm": 2.379020034992819, + "language_loss": 0.72902346, + "learning_rate": 9.502831805088742e-07, + "loss": 0.74982512, + "num_input_tokens_seen": 246207990, + "step": 11403, + "time_per_iteration": 2.666478395462036 + }, + { + "auxiliary_loss_clip": 0.0106186, + "auxiliary_loss_mlp": 0.01027974, + "balance_loss_clip": 1.02493775, + "balance_loss_mlp": 1.01745939, + "epoch": 0.6856455734255223, + "flos": 13253623194240.0, + "grad_norm": 3.589503931550273, + "language_loss": 0.81577837, + "learning_rate": 9.499516947003294e-07, + "loss": 0.83667672, + "num_input_tokens_seen": 246221595, + "step": 11404, + "time_per_iteration": 2.5283284187316895 + }, + { + "auxiliary_loss_clip": 0.01034166, + "auxiliary_loss_mlp": 0.01034666, + "balance_loss_clip": 1.02291, + "balance_loss_mlp": 1.02384174, + "epoch": 0.6857056966781903, + "flos": 23333461499520.0, + "grad_norm": 1.3842398430404985, + "language_loss": 0.7793597, + "learning_rate": 9.496202487097222e-07, + "loss": 0.80004799, + "num_input_tokens_seen": 246242970, + "step": 11405, + "time_per_iteration": 2.8985044956207275 + }, + { + "auxiliary_loss_clip": 0.00997353, + "auxiliary_loss_mlp": 0.01004648, + "balance_loss_clip": 1.00153172, + "balance_loss_mlp": 1.00368202, + "epoch": 0.6857658199308583, + "flos": 61852647784320.0, + "grad_norm": 0.8009761705965227, + "language_loss": 0.61018926, + "learning_rate": 9.492888425496199e-07, + "loss": 0.63020927, + "num_input_tokens_seen": 246300405, + "step": 11406, + "time_per_iteration": 3.3174996376037598 + }, + { + "auxiliary_loss_clip": 0.01034076, + "auxiliary_loss_mlp": 0.01030059, + "balance_loss_clip": 1.02494621, + "balance_loss_mlp": 1.01826894, + "epoch": 0.6858259431835262, + "flos": 16654543735680.0, + "grad_norm": 1.6275091936011736, + "language_loss": 0.76976365, + "learning_rate": 9.489574762325907e-07, + "loss": 0.79040504, + "num_input_tokens_seen": 246318780, + "step": 11407, + "time_per_iteration": 2.845867872238159 + }, + { + "auxiliary_loss_clip": 0.01044952, + "auxiliary_loss_mlp": 0.01032243, + "balance_loss_clip": 1.02495897, + "balance_loss_mlp": 1.02063799, + "epoch": 0.6858860664361942, + "flos": 21872974504320.0, + "grad_norm": 2.6403790348085745, + "language_loss": 0.71591425, + "learning_rate": 9.486261497711991e-07, + "loss": 0.73668611, + "num_input_tokens_seen": 246339405, + "step": 11408, + "time_per_iteration": 2.7946767807006836 + }, + { + "auxiliary_loss_clip": 0.0105484, + "auxiliary_loss_mlp": 0.0102735, + "balance_loss_clip": 1.02551746, + "balance_loss_mlp": 1.01646602, + "epoch": 0.6859461896888621, + "flos": 15267637751040.0, + "grad_norm": 1.8546591138754442, + "language_loss": 0.70209801, + "learning_rate": 9.482948631780087e-07, + "loss": 0.72291994, + "num_input_tokens_seen": 246357055, + "step": 11409, + "time_per_iteration": 2.749804973602295 + }, + { + "auxiliary_loss_clip": 0.01020728, + "auxiliary_loss_mlp": 0.01025929, + "balance_loss_clip": 1.02633286, + "balance_loss_mlp": 1.01608801, + "epoch": 0.6860063129415301, + "flos": 18620293392000.0, + "grad_norm": 1.6510136085642722, + "language_loss": 0.78190196, + "learning_rate": 9.479636164655825e-07, + "loss": 0.80236852, + "num_input_tokens_seen": 246374050, + "step": 11410, + "time_per_iteration": 2.723477363586426 + }, + { + "auxiliary_loss_clip": 0.01056422, + "auxiliary_loss_mlp": 0.01033984, + "balance_loss_clip": 1.02526617, + "balance_loss_mlp": 1.02224243, + "epoch": 0.6860664361941982, + "flos": 23951376190080.0, + "grad_norm": 1.8365407670724299, + "language_loss": 0.71766591, + "learning_rate": 9.476324096464821e-07, + "loss": 0.73856997, + "num_input_tokens_seen": 246392910, + "step": 11411, + "time_per_iteration": 2.6354074478149414 + }, + { + "auxiliary_loss_clip": 0.01015793, + "auxiliary_loss_mlp": 0.01036627, + "balance_loss_clip": 1.02207267, + "balance_loss_mlp": 1.02347779, + "epoch": 0.6861265594468661, + "flos": 20407782827520.0, + "grad_norm": 2.282369204696205, + "language_loss": 0.69828886, + "learning_rate": 9.473012427332654e-07, + "loss": 0.718813, + "num_input_tokens_seen": 246411540, + "step": 11412, + "time_per_iteration": 4.405573129653931 + }, + { + "auxiliary_loss_clip": 0.0106385, + "auxiliary_loss_mlp": 0.01029985, + "balance_loss_clip": 1.0249393, + "balance_loss_mlp": 1.01873755, + "epoch": 0.6861866826995341, + "flos": 11428571111040.0, + "grad_norm": 3.8039886031071273, + "language_loss": 0.71245313, + "learning_rate": 9.469701157384919e-07, + "loss": 0.73339146, + "num_input_tokens_seen": 246423295, + "step": 11413, + "time_per_iteration": 4.050308465957642 + }, + { + "auxiliary_loss_clip": 0.01054639, + "auxiliary_loss_mlp": 0.01027469, + "balance_loss_clip": 1.02572823, + "balance_loss_mlp": 1.016752, + "epoch": 0.686246805952202, + "flos": 15997593939840.0, + "grad_norm": 1.8284435466504068, + "language_loss": 0.73429048, + "learning_rate": 9.466390286747164e-07, + "loss": 0.75511158, + "num_input_tokens_seen": 246441045, + "step": 11414, + "time_per_iteration": 2.571821451187134 + }, + { + "auxiliary_loss_clip": 0.010432, + "auxiliary_loss_mlp": 0.01028727, + "balance_loss_clip": 1.02452004, + "balance_loss_mlp": 1.01721704, + "epoch": 0.68630692920487, + "flos": 19826712512640.0, + "grad_norm": 2.760562652824961, + "language_loss": 0.8689633, + "learning_rate": 9.46307981554495e-07, + "loss": 0.88968253, + "num_input_tokens_seen": 246456905, + "step": 11415, + "time_per_iteration": 2.6144843101501465 + }, + { + "auxiliary_loss_clip": 0.01055519, + "auxiliary_loss_mlp": 0.01033453, + "balance_loss_clip": 1.02521396, + "balance_loss_mlp": 1.02160418, + "epoch": 0.6863670524575379, + "flos": 26286216048000.0, + "grad_norm": 1.7358375765182705, + "language_loss": 0.67100084, + "learning_rate": 9.459769743903801e-07, + "loss": 0.6918906, + "num_input_tokens_seen": 246477545, + "step": 11416, + "time_per_iteration": 2.6233811378479004 + }, + { + "auxiliary_loss_clip": 0.01040903, + "auxiliary_loss_mlp": 0.01033093, + "balance_loss_clip": 1.02537465, + "balance_loss_mlp": 1.02110076, + "epoch": 0.686427175710206, + "flos": 19173138595200.0, + "grad_norm": 1.5539947157704679, + "language_loss": 0.76041901, + "learning_rate": 9.456460071949237e-07, + "loss": 0.78115892, + "num_input_tokens_seen": 246496705, + "step": 11417, + "time_per_iteration": 2.7103195190429688 + }, + { + "auxiliary_loss_clip": 0.0103863, + "auxiliary_loss_mlp": 0.01031184, + "balance_loss_clip": 1.02410173, + "balance_loss_mlp": 1.01977611, + "epoch": 0.6864872989628739, + "flos": 18916628595840.0, + "grad_norm": 1.7941722232284278, + "language_loss": 0.77531445, + "learning_rate": 9.45315079980678e-07, + "loss": 0.79601264, + "num_input_tokens_seen": 246514860, + "step": 11418, + "time_per_iteration": 2.6224539279937744 + }, + { + "auxiliary_loss_clip": 0.01022897, + "auxiliary_loss_mlp": 0.0102781, + "balance_loss_clip": 1.0245986, + "balance_loss_mlp": 1.01680756, + "epoch": 0.6865474222155419, + "flos": 25956196865280.0, + "grad_norm": 1.871502796061382, + "language_loss": 0.76411301, + "learning_rate": 9.449841927601887e-07, + "loss": 0.78462005, + "num_input_tokens_seen": 246536145, + "step": 11419, + "time_per_iteration": 2.69149112701416 + }, + { + "auxiliary_loss_clip": 0.01063081, + "auxiliary_loss_mlp": 0.01031798, + "balance_loss_clip": 1.0258081, + "balance_loss_mlp": 1.02164185, + "epoch": 0.6866075454682098, + "flos": 18478087447680.0, + "grad_norm": 1.6159199201132721, + "language_loss": 0.71368736, + "learning_rate": 9.446533455460044e-07, + "loss": 0.73463619, + "num_input_tokens_seen": 246553265, + "step": 11420, + "time_per_iteration": 2.479250192642212 + }, + { + "auxiliary_loss_clip": 0.01024536, + "auxiliary_loss_mlp": 0.01027441, + "balance_loss_clip": 1.0220654, + "balance_loss_mlp": 1.0165627, + "epoch": 0.6866676687208778, + "flos": 34239998298240.0, + "grad_norm": 1.343542675291084, + "language_loss": 0.7458154, + "learning_rate": 9.443225383506712e-07, + "loss": 0.76633519, + "num_input_tokens_seen": 246575130, + "step": 11421, + "time_per_iteration": 2.8967974185943604 + }, + { + "auxiliary_loss_clip": 0.01051148, + "auxiliary_loss_mlp": 0.0102554, + "balance_loss_clip": 1.02514553, + "balance_loss_mlp": 1.01541317, + "epoch": 0.6867277919735457, + "flos": 21721754246400.0, + "grad_norm": 1.749381012799042, + "language_loss": 0.77064109, + "learning_rate": 9.439917711867338e-07, + "loss": 0.79140794, + "num_input_tokens_seen": 246593095, + "step": 11422, + "time_per_iteration": 2.637434720993042 + }, + { + "auxiliary_loss_clip": 0.01055779, + "auxiliary_loss_mlp": 0.01034654, + "balance_loss_clip": 1.02562487, + "balance_loss_mlp": 1.02272153, + "epoch": 0.6867879152262137, + "flos": 24097999507200.0, + "grad_norm": 1.6742060389810287, + "language_loss": 0.77214175, + "learning_rate": 9.436610440667334e-07, + "loss": 0.79304612, + "num_input_tokens_seen": 246612165, + "step": 11423, + "time_per_iteration": 2.6223630905151367 + }, + { + "auxiliary_loss_clip": 0.01033792, + "auxiliary_loss_mlp": 0.01029215, + "balance_loss_clip": 1.02488375, + "balance_loss_mlp": 1.01815796, + "epoch": 0.6868480384788818, + "flos": 21615818060160.0, + "grad_norm": 1.4051380761272776, + "language_loss": 0.72540832, + "learning_rate": 9.433303570032129e-07, + "loss": 0.74603844, + "num_input_tokens_seen": 246632065, + "step": 11424, + "time_per_iteration": 2.6878280639648438 + }, + { + "auxiliary_loss_clip": 0.01045393, + "auxiliary_loss_mlp": 0.01023631, + "balance_loss_clip": 1.02679706, + "balance_loss_mlp": 1.01309323, + "epoch": 0.6869081617315497, + "flos": 26286144220800.0, + "grad_norm": 1.862751755072846, + "language_loss": 0.6514535, + "learning_rate": 9.429997100087112e-07, + "loss": 0.67214382, + "num_input_tokens_seen": 246651245, + "step": 11425, + "time_per_iteration": 2.7372140884399414 + }, + { + "auxiliary_loss_clip": 0.0103399, + "auxiliary_loss_mlp": 0.01023125, + "balance_loss_clip": 1.02711225, + "balance_loss_mlp": 1.01295638, + "epoch": 0.6869682849842177, + "flos": 21105096531840.0, + "grad_norm": 1.3902100315931925, + "language_loss": 0.72084129, + "learning_rate": 9.426691030957657e-07, + "loss": 0.7414124, + "num_input_tokens_seen": 246672225, + "step": 11426, + "time_per_iteration": 2.6681244373321533 + }, + { + "auxiliary_loss_clip": 0.01015164, + "auxiliary_loss_mlp": 0.01028731, + "balance_loss_clip": 1.02287865, + "balance_loss_mlp": 1.01668477, + "epoch": 0.6870284082368856, + "flos": 17092653920640.0, + "grad_norm": 2.861398283060612, + "language_loss": 0.84929645, + "learning_rate": 9.423385362769136e-07, + "loss": 0.86973536, + "num_input_tokens_seen": 246688385, + "step": 11427, + "time_per_iteration": 2.7482688426971436 + }, + { + "auxiliary_loss_clip": 0.01053099, + "auxiliary_loss_mlp": 0.01027297, + "balance_loss_clip": 1.02572727, + "balance_loss_mlp": 1.01685429, + "epoch": 0.6870885314895536, + "flos": 27308090067840.0, + "grad_norm": 1.5689779451003356, + "language_loss": 0.76048064, + "learning_rate": 9.420080095646909e-07, + "loss": 0.78128457, + "num_input_tokens_seen": 246710730, + "step": 11428, + "time_per_iteration": 4.305830955505371 + }, + { + "auxiliary_loss_clip": 0.01033974, + "auxiliary_loss_mlp": 0.01030979, + "balance_loss_clip": 1.02378988, + "balance_loss_mlp": 1.01933861, + "epoch": 0.6871486547422215, + "flos": 20814543417600.0, + "grad_norm": 1.6924868968233384, + "language_loss": 0.73047352, + "learning_rate": 9.4167752297163e-07, + "loss": 0.75112307, + "num_input_tokens_seen": 246730350, + "step": 11429, + "time_per_iteration": 2.732515335083008 + }, + { + "auxiliary_loss_clip": 0.01040255, + "auxiliary_loss_mlp": 0.01027936, + "balance_loss_clip": 1.02493095, + "balance_loss_mlp": 1.01698637, + "epoch": 0.6872087779948896, + "flos": 30154118330880.0, + "grad_norm": 1.8397831786686445, + "language_loss": 0.83092761, + "learning_rate": 9.413470765102643e-07, + "loss": 0.85160947, + "num_input_tokens_seen": 246751700, + "step": 11430, + "time_per_iteration": 2.7504520416259766 + }, + { + "auxiliary_loss_clip": 0.01053954, + "auxiliary_loss_mlp": 0.010309, + "balance_loss_clip": 1.02500725, + "balance_loss_mlp": 1.02016544, + "epoch": 0.6872689012475575, + "flos": 20704584908160.0, + "grad_norm": 2.053013803635539, + "language_loss": 0.70399368, + "learning_rate": 9.410166701931225e-07, + "loss": 0.72484219, + "num_input_tokens_seen": 246769860, + "step": 11431, + "time_per_iteration": 2.692319631576538 + }, + { + "auxiliary_loss_clip": 0.01041066, + "auxiliary_loss_mlp": 0.00747756, + "balance_loss_clip": 1.02292657, + "balance_loss_mlp": 1.00045061, + "epoch": 0.6873290245002255, + "flos": 25520852027520.0, + "grad_norm": 2.2331557139088836, + "language_loss": 0.79843664, + "learning_rate": 9.406863040327355e-07, + "loss": 0.81632495, + "num_input_tokens_seen": 246789905, + "step": 11432, + "time_per_iteration": 2.74460768699646 + }, + { + "auxiliary_loss_clip": 0.01039711, + "auxiliary_loss_mlp": 0.01024233, + "balance_loss_clip": 1.02342641, + "balance_loss_mlp": 1.01427877, + "epoch": 0.6873891477528934, + "flos": 25191479289600.0, + "grad_norm": 1.4612508440746081, + "language_loss": 0.67645222, + "learning_rate": 9.403559780416295e-07, + "loss": 0.69709164, + "num_input_tokens_seen": 246808815, + "step": 11433, + "time_per_iteration": 2.666505813598633 + }, + { + "auxiliary_loss_clip": 0.01056591, + "auxiliary_loss_mlp": 0.01033777, + "balance_loss_clip": 1.02817249, + "balance_loss_mlp": 1.02257133, + "epoch": 0.6874492710055614, + "flos": 35152380685440.0, + "grad_norm": 1.8834345163943411, + "language_loss": 0.73037302, + "learning_rate": 9.400256922323309e-07, + "loss": 0.75127673, + "num_input_tokens_seen": 246829775, + "step": 11434, + "time_per_iteration": 2.701897621154785 + }, + { + "auxiliary_loss_clip": 0.01036705, + "auxiliary_loss_mlp": 0.01027115, + "balance_loss_clip": 1.02854609, + "balance_loss_mlp": 1.01656508, + "epoch": 0.6875093942582293, + "flos": 17822215059840.0, + "grad_norm": 1.579679616249709, + "language_loss": 0.80411756, + "learning_rate": 9.396954466173657e-07, + "loss": 0.82475579, + "num_input_tokens_seen": 246848045, + "step": 11435, + "time_per_iteration": 2.649996519088745 + }, + { + "auxiliary_loss_clip": 0.01064348, + "auxiliary_loss_mlp": 0.01031756, + "balance_loss_clip": 1.02459192, + "balance_loss_mlp": 1.02025235, + "epoch": 0.6875695175108973, + "flos": 20704548994560.0, + "grad_norm": 6.132707356590368, + "language_loss": 0.81323445, + "learning_rate": 9.393652412092538e-07, + "loss": 0.83419549, + "num_input_tokens_seen": 246866095, + "step": 11436, + "time_per_iteration": 4.118508815765381 + }, + { + "auxiliary_loss_clip": 0.01019805, + "auxiliary_loss_mlp": 0.01035237, + "balance_loss_clip": 1.0198493, + "balance_loss_mlp": 1.02543223, + "epoch": 0.6876296407635654, + "flos": 25374013228800.0, + "grad_norm": 1.677573233113683, + "language_loss": 0.81680298, + "learning_rate": 9.390350760205183e-07, + "loss": 0.83735341, + "num_input_tokens_seen": 246883975, + "step": 11437, + "time_per_iteration": 2.6603176593780518 + }, + { + "auxiliary_loss_clip": 0.01041462, + "auxiliary_loss_mlp": 0.01033207, + "balance_loss_clip": 1.02374959, + "balance_loss_mlp": 1.02115536, + "epoch": 0.6876897640162333, + "flos": 23222317841280.0, + "grad_norm": 4.584837154903369, + "language_loss": 0.78046077, + "learning_rate": 9.387049510636793e-07, + "loss": 0.80120748, + "num_input_tokens_seen": 246901560, + "step": 11438, + "time_per_iteration": 2.706617593765259 + }, + { + "auxiliary_loss_clip": 0.01058826, + "auxiliary_loss_mlp": 0.0102636, + "balance_loss_clip": 1.0238452, + "balance_loss_mlp": 1.01638222, + "epoch": 0.6877498872689013, + "flos": 27124335066240.0, + "grad_norm": 1.649184609128325, + "language_loss": 0.72743905, + "learning_rate": 9.383748663512554e-07, + "loss": 0.74829084, + "num_input_tokens_seen": 246922655, + "step": 11439, + "time_per_iteration": 2.599421501159668 + }, + { + "auxiliary_loss_clip": 0.01051893, + "auxiliary_loss_mlp": 0.01026315, + "balance_loss_clip": 1.02476203, + "balance_loss_mlp": 1.01556826, + "epoch": 0.6878100105215692, + "flos": 11581658876160.0, + "grad_norm": 1.819396137683318, + "language_loss": 0.75505996, + "learning_rate": 9.380448218957623e-07, + "loss": 0.77584207, + "num_input_tokens_seen": 246940100, + "step": 11440, + "time_per_iteration": 2.5809295177459717 + }, + { + "auxiliary_loss_clip": 0.01023694, + "auxiliary_loss_mlp": 0.01031936, + "balance_loss_clip": 1.02274418, + "balance_loss_mlp": 1.02121353, + "epoch": 0.6878701337742372, + "flos": 20303175444480.0, + "grad_norm": 1.6907324649494146, + "language_loss": 0.72028869, + "learning_rate": 9.377148177097167e-07, + "loss": 0.74084502, + "num_input_tokens_seen": 246958545, + "step": 11441, + "time_per_iteration": 2.7884316444396973 + }, + { + "auxiliary_loss_clip": 0.01029345, + "auxiliary_loss_mlp": 0.01039376, + "balance_loss_clip": 1.02312922, + "balance_loss_mlp": 1.02649546, + "epoch": 0.6879302570269051, + "flos": 13840080549120.0, + "grad_norm": 1.8044608967048432, + "language_loss": 0.66648006, + "learning_rate": 9.373848538056317e-07, + "loss": 0.68716729, + "num_input_tokens_seen": 246974805, + "step": 11442, + "time_per_iteration": 2.677147626876831 + }, + { + "auxiliary_loss_clip": 0.01054103, + "auxiliary_loss_mlp": 0.01028224, + "balance_loss_clip": 1.0259397, + "balance_loss_mlp": 1.01804399, + "epoch": 0.6879903802795732, + "flos": 21324654414720.0, + "grad_norm": 2.2280694242642096, + "language_loss": 0.69827819, + "learning_rate": 9.370549301960189e-07, + "loss": 0.71910155, + "num_input_tokens_seen": 246992505, + "step": 11443, + "time_per_iteration": 2.6646742820739746 + }, + { + "auxiliary_loss_clip": 0.01041358, + "auxiliary_loss_mlp": 0.01031353, + "balance_loss_clip": 1.02452826, + "balance_loss_mlp": 1.01999211, + "epoch": 0.6880505035322411, + "flos": 25152049134720.0, + "grad_norm": 1.449016326193159, + "language_loss": 0.76143974, + "learning_rate": 9.367250468933893e-07, + "loss": 0.78216684, + "num_input_tokens_seen": 247013370, + "step": 11444, + "time_per_iteration": 2.7834255695343018 + }, + { + "auxiliary_loss_clip": 0.01061321, + "auxiliary_loss_mlp": 0.01026222, + "balance_loss_clip": 1.02440166, + "balance_loss_mlp": 1.01610661, + "epoch": 0.6881106267849091, + "flos": 23215530170880.0, + "grad_norm": 1.9778628737249386, + "language_loss": 0.76832634, + "learning_rate": 9.363952039102536e-07, + "loss": 0.78920174, + "num_input_tokens_seen": 247029855, + "step": 11445, + "time_per_iteration": 2.5558173656463623 + }, + { + "auxiliary_loss_clip": 0.00996859, + "auxiliary_loss_mlp": 0.01000344, + "balance_loss_clip": 1.00126266, + "balance_loss_mlp": 0.99931842, + "epoch": 0.688170750037577, + "flos": 48484397312640.0, + "grad_norm": 0.8240779652025771, + "language_loss": 0.58364999, + "learning_rate": 9.360654012591183e-07, + "loss": 0.60362196, + "num_input_tokens_seen": 247085030, + "step": 11446, + "time_per_iteration": 3.203810930252075 + }, + { + "auxiliary_loss_clip": 0.01050313, + "auxiliary_loss_mlp": 0.01028633, + "balance_loss_clip": 1.02247643, + "balance_loss_mlp": 1.01751661, + "epoch": 0.688230873290245, + "flos": 22783633038720.0, + "grad_norm": 1.7371903083874738, + "language_loss": 0.75635791, + "learning_rate": 9.357356389524886e-07, + "loss": 0.77714741, + "num_input_tokens_seen": 247104840, + "step": 11447, + "time_per_iteration": 2.617182493209839 + }, + { + "auxiliary_loss_clip": 0.01043152, + "auxiliary_loss_mlp": 0.0103141, + "balance_loss_clip": 1.0231806, + "balance_loss_mlp": 1.02079463, + "epoch": 0.6882909965429129, + "flos": 22455660931200.0, + "grad_norm": 2.002878046878225, + "language_loss": 0.72846115, + "learning_rate": 9.354059170028705e-07, + "loss": 0.74920672, + "num_input_tokens_seen": 247121905, + "step": 11448, + "time_per_iteration": 2.6406538486480713 + }, + { + "auxiliary_loss_clip": 0.01046578, + "auxiliary_loss_mlp": 0.01036755, + "balance_loss_clip": 1.02184856, + "balance_loss_mlp": 1.02392244, + "epoch": 0.688351119795581, + "flos": 26214143408640.0, + "grad_norm": 1.7472013408717222, + "language_loss": 0.74527442, + "learning_rate": 9.350762354227673e-07, + "loss": 0.7661078, + "num_input_tokens_seen": 247142375, + "step": 11449, + "time_per_iteration": 2.802138328552246 + }, + { + "auxiliary_loss_clip": 0.01060944, + "auxiliary_loss_mlp": 0.01027906, + "balance_loss_clip": 1.02450562, + "balance_loss_mlp": 1.01805282, + "epoch": 0.6884112430482489, + "flos": 22565260304640.0, + "grad_norm": 1.8798208267197658, + "language_loss": 0.69959193, + "learning_rate": 9.34746594224679e-07, + "loss": 0.72048044, + "num_input_tokens_seen": 247161095, + "step": 11450, + "time_per_iteration": 2.6775319576263428 + }, + { + "auxiliary_loss_clip": 0.01027313, + "auxiliary_loss_mlp": 0.01031734, + "balance_loss_clip": 1.02383506, + "balance_loss_mlp": 1.01962817, + "epoch": 0.6884713663009169, + "flos": 17341047446400.0, + "grad_norm": 1.6786901061563952, + "language_loss": 0.75739783, + "learning_rate": 9.344169934211068e-07, + "loss": 0.77798831, + "num_input_tokens_seen": 247178565, + "step": 11451, + "time_per_iteration": 2.7594518661499023 + }, + { + "auxiliary_loss_clip": 0.01053248, + "auxiliary_loss_mlp": 0.01027609, + "balance_loss_clip": 1.02479219, + "balance_loss_mlp": 1.01676655, + "epoch": 0.6885314895535849, + "flos": 26470832976000.0, + "grad_norm": 1.3255118820765328, + "language_loss": 0.69198883, + "learning_rate": 9.340874330245505e-07, + "loss": 0.7127974, + "num_input_tokens_seen": 247202345, + "step": 11452, + "time_per_iteration": 2.9099056720733643 + }, + { + "auxiliary_loss_clip": 0.01062159, + "auxiliary_loss_mlp": 0.01031056, + "balance_loss_clip": 1.02453494, + "balance_loss_mlp": 1.01945138, + "epoch": 0.6885916128062528, + "flos": 20521548178560.0, + "grad_norm": 2.2495244217322674, + "language_loss": 0.71881276, + "learning_rate": 9.337579130475042e-07, + "loss": 0.73974496, + "num_input_tokens_seen": 247219240, + "step": 11453, + "time_per_iteration": 2.5358774662017822 + }, + { + "auxiliary_loss_clip": 0.00998412, + "auxiliary_loss_mlp": 0.00746883, + "balance_loss_clip": 1.00279963, + "balance_loss_mlp": 1.00103629, + "epoch": 0.6886517360589208, + "flos": 70715795679360.0, + "grad_norm": 0.7829505848304854, + "language_loss": 0.50720197, + "learning_rate": 9.334284335024644e-07, + "loss": 0.52465492, + "num_input_tokens_seen": 247272010, + "step": 11454, + "time_per_iteration": 3.199280261993408 + }, + { + "auxiliary_loss_clip": 0.01050033, + "auxiliary_loss_mlp": 0.01026161, + "balance_loss_clip": 1.02512693, + "balance_loss_mlp": 1.01604068, + "epoch": 0.6887118593115887, + "flos": 17893533513600.0, + "grad_norm": 1.987526758773716, + "language_loss": 0.75638682, + "learning_rate": 9.330989944019263e-07, + "loss": 0.77714878, + "num_input_tokens_seen": 247290630, + "step": 11455, + "time_per_iteration": 2.6575734615325928 + }, + { + "auxiliary_loss_clip": 0.01040868, + "auxiliary_loss_mlp": 0.01034069, + "balance_loss_clip": 1.02179074, + "balance_loss_mlp": 1.02196896, + "epoch": 0.6887719825642568, + "flos": 17453017117440.0, + "grad_norm": 2.1561686917674425, + "language_loss": 0.72618717, + "learning_rate": 9.327695957583803e-07, + "loss": 0.74693656, + "num_input_tokens_seen": 247304800, + "step": 11456, + "time_per_iteration": 2.6632328033447266 + }, + { + "auxiliary_loss_clip": 0.01038379, + "auxiliary_loss_mlp": 0.01028226, + "balance_loss_clip": 1.02375674, + "balance_loss_mlp": 1.01801538, + "epoch": 0.6888321058169247, + "flos": 23070199743360.0, + "grad_norm": 2.265683242321641, + "language_loss": 0.81133032, + "learning_rate": 9.32440237584319e-07, + "loss": 0.83199632, + "num_input_tokens_seen": 247323450, + "step": 11457, + "time_per_iteration": 2.789212942123413 + }, + { + "auxiliary_loss_clip": 0.01055708, + "auxiliary_loss_mlp": 0.00747679, + "balance_loss_clip": 1.02633715, + "balance_loss_mlp": 1.00044191, + "epoch": 0.6888922290695927, + "flos": 23368833417600.0, + "grad_norm": 1.5638850064566816, + "language_loss": 0.76032209, + "learning_rate": 9.321109198922301e-07, + "loss": 0.77835596, + "num_input_tokens_seen": 247343845, + "step": 11458, + "time_per_iteration": 2.5572140216827393 + }, + { + "auxiliary_loss_clip": 0.01063987, + "auxiliary_loss_mlp": 0.01029096, + "balance_loss_clip": 1.02580452, + "balance_loss_mlp": 1.01870131, + "epoch": 0.6889523523222606, + "flos": 17631636474240.0, + "grad_norm": 3.254650292986324, + "language_loss": 0.68109071, + "learning_rate": 9.31781642694603e-07, + "loss": 0.70202148, + "num_input_tokens_seen": 247356650, + "step": 11459, + "time_per_iteration": 4.264230489730835 + }, + { + "auxiliary_loss_clip": 0.010174, + "auxiliary_loss_mlp": 0.01031181, + "balance_loss_clip": 1.02340364, + "balance_loss_mlp": 1.02064931, + "epoch": 0.6890124755749286, + "flos": 25228144097280.0, + "grad_norm": 2.189307942169068, + "language_loss": 0.68645799, + "learning_rate": 9.314524060039221e-07, + "loss": 0.70694375, + "num_input_tokens_seen": 247377340, + "step": 11460, + "time_per_iteration": 4.562270164489746 + }, + { + "auxiliary_loss_clip": 0.0104009, + "auxiliary_loss_mlp": 0.0103089, + "balance_loss_clip": 1.02748203, + "balance_loss_mlp": 1.01865315, + "epoch": 0.6890725988275965, + "flos": 20230240878720.0, + "grad_norm": 1.6614011317668658, + "language_loss": 0.77171385, + "learning_rate": 9.311232098326731e-07, + "loss": 0.79242361, + "num_input_tokens_seen": 247395805, + "step": 11461, + "time_per_iteration": 2.9007017612457275 + }, + { + "auxiliary_loss_clip": 0.01041283, + "auxiliary_loss_mlp": 0.01031504, + "balance_loss_clip": 1.02395177, + "balance_loss_mlp": 1.02081048, + "epoch": 0.6891327220802645, + "flos": 14535311264640.0, + "grad_norm": 1.9694655002978685, + "language_loss": 0.69725907, + "learning_rate": 9.307940541933401e-07, + "loss": 0.71798694, + "num_input_tokens_seen": 247413165, + "step": 11462, + "time_per_iteration": 2.7716686725616455 + }, + { + "auxiliary_loss_clip": 0.01054915, + "auxiliary_loss_mlp": 0.010225, + "balance_loss_clip": 1.02667379, + "balance_loss_mlp": 1.01188445, + "epoch": 0.6891928453329325, + "flos": 21139139646720.0, + "grad_norm": 1.5858823170359215, + "language_loss": 0.87450129, + "learning_rate": 9.304649390984034e-07, + "loss": 0.89527547, + "num_input_tokens_seen": 247433140, + "step": 11463, + "time_per_iteration": 2.7988901138305664 + }, + { + "auxiliary_loss_clip": 0.01021098, + "auxiliary_loss_mlp": 0.01025854, + "balance_loss_clip": 1.0251013, + "balance_loss_mlp": 1.01667488, + "epoch": 0.6892529685856005, + "flos": 17858520731520.0, + "grad_norm": 1.5501202216434913, + "language_loss": 0.68160665, + "learning_rate": 9.301358645603428e-07, + "loss": 0.70207614, + "num_input_tokens_seen": 247451265, + "step": 11464, + "time_per_iteration": 2.911672353744507 + }, + { + "auxiliary_loss_clip": 0.0105435, + "auxiliary_loss_mlp": 0.01034003, + "balance_loss_clip": 1.02593708, + "balance_loss_mlp": 1.0234468, + "epoch": 0.6893130918382685, + "flos": 29934811843200.0, + "grad_norm": 1.866950718048086, + "language_loss": 0.65399146, + "learning_rate": 9.298068305916373e-07, + "loss": 0.6748749, + "num_input_tokens_seen": 247471645, + "step": 11465, + "time_per_iteration": 2.845046043395996 + }, + { + "auxiliary_loss_clip": 0.01055045, + "auxiliary_loss_mlp": 0.01030751, + "balance_loss_clip": 1.02499986, + "balance_loss_mlp": 1.02005816, + "epoch": 0.6893732150909364, + "flos": 24388516707840.0, + "grad_norm": 1.3052708829848436, + "language_loss": 0.72602457, + "learning_rate": 9.294778372047649e-07, + "loss": 0.7468825, + "num_input_tokens_seen": 247491170, + "step": 11466, + "time_per_iteration": 2.8629648685455322 + }, + { + "auxiliary_loss_clip": 0.01064268, + "auxiliary_loss_mlp": 0.01029577, + "balance_loss_clip": 1.02598619, + "balance_loss_mlp": 1.01920605, + "epoch": 0.6894333383436044, + "flos": 16982874979200.0, + "grad_norm": 1.7298676874704533, + "language_loss": 0.72016442, + "learning_rate": 9.291488844121995e-07, + "loss": 0.74110281, + "num_input_tokens_seen": 247509005, + "step": 11467, + "time_per_iteration": 2.7960422039031982 + }, + { + "auxiliary_loss_clip": 0.01046072, + "auxiliary_loss_mlp": 0.010303, + "balance_loss_clip": 1.02568221, + "balance_loss_mlp": 1.0183847, + "epoch": 0.6894934615962723, + "flos": 18985540838400.0, + "grad_norm": 1.9992709730814273, + "language_loss": 0.81014335, + "learning_rate": 9.288199722264156e-07, + "loss": 0.83090705, + "num_input_tokens_seen": 247527050, + "step": 11468, + "time_per_iteration": 2.9151206016540527 + }, + { + "auxiliary_loss_clip": 0.01065041, + "auxiliary_loss_mlp": 0.01030306, + "balance_loss_clip": 1.02613533, + "balance_loss_mlp": 1.01930869, + "epoch": 0.6895535848489404, + "flos": 34531664734080.0, + "grad_norm": 1.525778825231846, + "language_loss": 0.65996718, + "learning_rate": 9.284911006598875e-07, + "loss": 0.6809206, + "num_input_tokens_seen": 247547765, + "step": 11469, + "time_per_iteration": 2.863222599029541 + }, + { + "auxiliary_loss_clip": 0.00999859, + "auxiliary_loss_mlp": 0.01005217, + "balance_loss_clip": 1.0037334, + "balance_loss_mlp": 1.00434685, + "epoch": 0.6896137081016083, + "flos": 50075852273280.0, + "grad_norm": 0.8047609642283293, + "language_loss": 0.55203617, + "learning_rate": 9.281622697250824e-07, + "loss": 0.57208693, + "num_input_tokens_seen": 247603515, + "step": 11470, + "time_per_iteration": 3.2006516456604004 + }, + { + "auxiliary_loss_clip": 0.01048733, + "auxiliary_loss_mlp": 0.01031122, + "balance_loss_clip": 1.02401638, + "balance_loss_mlp": 1.02259219, + "epoch": 0.6896738313542763, + "flos": 19938215306880.0, + "grad_norm": 1.7364729958971188, + "language_loss": 0.77819371, + "learning_rate": 9.278334794344715e-07, + "loss": 0.79899228, + "num_input_tokens_seen": 247622110, + "step": 11471, + "time_per_iteration": 2.6276895999908447 + }, + { + "auxiliary_loss_clip": 0.01035221, + "auxiliary_loss_mlp": 0.01028492, + "balance_loss_clip": 1.02154851, + "balance_loss_mlp": 1.01753116, + "epoch": 0.6897339546069442, + "flos": 21725489260800.0, + "grad_norm": 1.6642329862259873, + "language_loss": 0.78516555, + "learning_rate": 9.275047298005232e-07, + "loss": 0.8058027, + "num_input_tokens_seen": 247641905, + "step": 11472, + "time_per_iteration": 2.9082789421081543 + }, + { + "auxiliary_loss_clip": 0.01032519, + "auxiliary_loss_mlp": 0.01029744, + "balance_loss_clip": 1.0232085, + "balance_loss_mlp": 1.02003455, + "epoch": 0.6897940778596122, + "flos": 19826497031040.0, + "grad_norm": 1.563728588512237, + "language_loss": 0.76160538, + "learning_rate": 9.271760208357024e-07, + "loss": 0.78222799, + "num_input_tokens_seen": 247660945, + "step": 11473, + "time_per_iteration": 2.7705631256103516 + }, + { + "auxiliary_loss_clip": 0.01017778, + "auxiliary_loss_mlp": 0.01036375, + "balance_loss_clip": 1.02060997, + "balance_loss_mlp": 1.02307093, + "epoch": 0.6898542011122801, + "flos": 17310056987520.0, + "grad_norm": 2.222394994999231, + "language_loss": 0.75504565, + "learning_rate": 9.268473525524751e-07, + "loss": 0.7755872, + "num_input_tokens_seen": 247678395, + "step": 11474, + "time_per_iteration": 2.7823524475097656 + }, + { + "auxiliary_loss_clip": 0.01019214, + "auxiliary_loss_mlp": 0.01028192, + "balance_loss_clip": 1.02875018, + "balance_loss_mlp": 1.01675415, + "epoch": 0.6899143243649482, + "flos": 24754051463040.0, + "grad_norm": 1.5464696372761304, + "language_loss": 0.74143392, + "learning_rate": 9.26518724963303e-07, + "loss": 0.76190794, + "num_input_tokens_seen": 247698380, + "step": 11475, + "time_per_iteration": 2.953702688217163 + }, + { + "auxiliary_loss_clip": 0.01026511, + "auxiliary_loss_mlp": 0.01032835, + "balance_loss_clip": 1.02024245, + "balance_loss_mlp": 1.02165961, + "epoch": 0.6899744476176161, + "flos": 17234536642560.0, + "grad_norm": 2.188062572354352, + "language_loss": 0.88457161, + "learning_rate": 9.261901380806491e-07, + "loss": 0.90516508, + "num_input_tokens_seen": 247716370, + "step": 11476, + "time_per_iteration": 4.418827533721924 + }, + { + "auxiliary_loss_clip": 0.01062754, + "auxiliary_loss_mlp": 0.01032228, + "balance_loss_clip": 1.02491164, + "balance_loss_mlp": 1.02202976, + "epoch": 0.6900345708702841, + "flos": 25410678036480.0, + "grad_norm": 1.3607742245115282, + "language_loss": 0.701424, + "learning_rate": 9.258615919169724e-07, + "loss": 0.72237384, + "num_input_tokens_seen": 247737335, + "step": 11477, + "time_per_iteration": 2.7594776153564453 + }, + { + "auxiliary_loss_clip": 0.01055975, + "auxiliary_loss_mlp": 0.01037089, + "balance_loss_clip": 1.02471328, + "balance_loss_mlp": 1.0259428, + "epoch": 0.6900946941229521, + "flos": 23434190213760.0, + "grad_norm": 2.659401634995249, + "language_loss": 0.68497139, + "learning_rate": 9.255330864847313e-07, + "loss": 0.70590198, + "num_input_tokens_seen": 247756680, + "step": 11478, + "time_per_iteration": 2.7722384929656982 + }, + { + "auxiliary_loss_clip": 0.01055086, + "auxiliary_loss_mlp": 0.01029232, + "balance_loss_clip": 1.02639198, + "balance_loss_mlp": 1.01868773, + "epoch": 0.69015481737562, + "flos": 17820096157440.0, + "grad_norm": 1.6802214920593903, + "language_loss": 0.76493114, + "learning_rate": 9.252046217963843e-07, + "loss": 0.78577429, + "num_input_tokens_seen": 247774265, + "step": 11479, + "time_per_iteration": 2.8202457427978516 + }, + { + "auxiliary_loss_clip": 0.01050635, + "auxiliary_loss_mlp": 0.01027669, + "balance_loss_clip": 1.02330565, + "balance_loss_mlp": 1.01653516, + "epoch": 0.690214940628288, + "flos": 17456500736640.0, + "grad_norm": 2.076476663401908, + "language_loss": 0.78309774, + "learning_rate": 9.248761978643856e-07, + "loss": 0.80388081, + "num_input_tokens_seen": 247792395, + "step": 11480, + "time_per_iteration": 2.7966434955596924 + }, + { + "auxiliary_loss_clip": 0.01017223, + "auxiliary_loss_mlp": 0.01029299, + "balance_loss_clip": 1.02106452, + "balance_loss_mlp": 1.01830173, + "epoch": 0.6902750638809559, + "flos": 29566691308800.0, + "grad_norm": 1.5789812837589443, + "language_loss": 0.75319147, + "learning_rate": 9.245478147011885e-07, + "loss": 0.77365667, + "num_input_tokens_seen": 247811985, + "step": 11481, + "time_per_iteration": 2.8870925903320312 + }, + { + "auxiliary_loss_clip": 0.01034088, + "auxiliary_loss_mlp": 0.01029418, + "balance_loss_clip": 1.0253197, + "balance_loss_mlp": 1.01825404, + "epoch": 0.690335187133624, + "flos": 25557121785600.0, + "grad_norm": 1.6168075848706878, + "language_loss": 0.6921314, + "learning_rate": 9.24219472319246e-07, + "loss": 0.71276641, + "num_input_tokens_seen": 247831880, + "step": 11482, + "time_per_iteration": 2.9343888759613037 + }, + { + "auxiliary_loss_clip": 0.01061685, + "auxiliary_loss_mlp": 0.01026877, + "balance_loss_clip": 1.02403414, + "balance_loss_mlp": 1.01633847, + "epoch": 0.6903953103862919, + "flos": 22488447070080.0, + "grad_norm": 1.457476657834178, + "language_loss": 0.82797861, + "learning_rate": 9.238911707310096e-07, + "loss": 0.8488642, + "num_input_tokens_seen": 247851170, + "step": 11483, + "time_per_iteration": 2.800295352935791 + }, + { + "auxiliary_loss_clip": 0.01062944, + "auxiliary_loss_mlp": 0.01026647, + "balance_loss_clip": 1.02508688, + "balance_loss_mlp": 1.01684785, + "epoch": 0.6904554336389599, + "flos": 26100521712000.0, + "grad_norm": 1.8007811048913265, + "language_loss": 0.65339577, + "learning_rate": 9.235629099489273e-07, + "loss": 0.67429173, + "num_input_tokens_seen": 247868950, + "step": 11484, + "time_per_iteration": 4.260119915008545 + }, + { + "auxiliary_loss_clip": 0.01035495, + "auxiliary_loss_mlp": 0.01038706, + "balance_loss_clip": 1.02287698, + "balance_loss_mlp": 1.02687454, + "epoch": 0.6905155568916278, + "flos": 31171754545920.0, + "grad_norm": 1.6948979822656551, + "language_loss": 0.73954177, + "learning_rate": 9.232346899854479e-07, + "loss": 0.76028383, + "num_input_tokens_seen": 247889805, + "step": 11485, + "time_per_iteration": 2.8880386352539062 + }, + { + "auxiliary_loss_clip": 0.0105575, + "auxiliary_loss_mlp": 0.00747736, + "balance_loss_clip": 1.02622151, + "balance_loss_mlp": 1.00049019, + "epoch": 0.6905756801442958, + "flos": 17639681120640.0, + "grad_norm": 1.8040744297817968, + "language_loss": 0.85394692, + "learning_rate": 9.22906510853017e-07, + "loss": 0.87198186, + "num_input_tokens_seen": 247908585, + "step": 11486, + "time_per_iteration": 2.8332126140594482 + }, + { + "auxiliary_loss_clip": 0.01007818, + "auxiliary_loss_mlp": 0.01032044, + "balance_loss_clip": 1.02347112, + "balance_loss_mlp": 1.02104056, + "epoch": 0.6906358033969637, + "flos": 22343691260160.0, + "grad_norm": 1.5469957420224527, + "language_loss": 0.72562623, + "learning_rate": 9.225783725640786e-07, + "loss": 0.74602485, + "num_input_tokens_seen": 247928480, + "step": 11487, + "time_per_iteration": 2.861845016479492 + }, + { + "auxiliary_loss_clip": 0.00991641, + "auxiliary_loss_mlp": 0.01001563, + "balance_loss_clip": 1.00558436, + "balance_loss_mlp": 1.000615, + "epoch": 0.6906959266496318, + "flos": 69747789081600.0, + "grad_norm": 0.9028397430621313, + "language_loss": 0.66677654, + "learning_rate": 9.222502751310759e-07, + "loss": 0.68670857, + "num_input_tokens_seen": 247988855, + "step": 11488, + "time_per_iteration": 3.226864814758301 + }, + { + "auxiliary_loss_clip": 0.01047511, + "auxiliary_loss_mlp": 0.01031353, + "balance_loss_clip": 1.02606201, + "balance_loss_mlp": 1.01875222, + "epoch": 0.6907560499022997, + "flos": 21434253788160.0, + "grad_norm": 2.538153889555378, + "language_loss": 0.74920607, + "learning_rate": 9.219222185664519e-07, + "loss": 0.76999474, + "num_input_tokens_seen": 248007685, + "step": 11489, + "time_per_iteration": 2.7922611236572266 + }, + { + "auxiliary_loss_clip": 0.01048575, + "auxiliary_loss_mlp": 0.01034394, + "balance_loss_clip": 1.02267742, + "balance_loss_mlp": 1.02293181, + "epoch": 0.6908161731549677, + "flos": 14392207480320.0, + "grad_norm": 1.7984806503975848, + "language_loss": 0.62049764, + "learning_rate": 9.215942028826445e-07, + "loss": 0.64132738, + "num_input_tokens_seen": 248025145, + "step": 11490, + "time_per_iteration": 2.7527124881744385 + }, + { + "auxiliary_loss_clip": 0.01043112, + "auxiliary_loss_mlp": 0.01028499, + "balance_loss_clip": 1.02508545, + "balance_loss_mlp": 1.01788378, + "epoch": 0.6908762964076357, + "flos": 20010970304640.0, + "grad_norm": 1.7219610217832626, + "language_loss": 0.72561383, + "learning_rate": 9.212662280920937e-07, + "loss": 0.74632996, + "num_input_tokens_seen": 248043750, + "step": 11491, + "time_per_iteration": 2.8200607299804688 + }, + { + "auxiliary_loss_clip": 0.01041273, + "auxiliary_loss_mlp": 0.00747674, + "balance_loss_clip": 1.02402592, + "balance_loss_mlp": 1.00041926, + "epoch": 0.6909364196603036, + "flos": 28769079853440.0, + "grad_norm": 1.3131884491064245, + "language_loss": 0.7015962, + "learning_rate": 9.20938294207235e-07, + "loss": 0.71948564, + "num_input_tokens_seen": 248065765, + "step": 11492, + "time_per_iteration": 2.8555963039398193 + }, + { + "auxiliary_loss_clip": 0.01032466, + "auxiliary_loss_mlp": 0.01030451, + "balance_loss_clip": 1.02938747, + "balance_loss_mlp": 1.01876867, + "epoch": 0.6909965429129716, + "flos": 22528128620160.0, + "grad_norm": 1.8139640143712108, + "language_loss": 0.74770731, + "learning_rate": 9.206104012405049e-07, + "loss": 0.76833647, + "num_input_tokens_seen": 248083810, + "step": 11493, + "time_per_iteration": 2.8482425212860107 + }, + { + "auxiliary_loss_clip": 0.01063185, + "auxiliary_loss_mlp": 0.01029137, + "balance_loss_clip": 1.0255295, + "balance_loss_mlp": 1.01850367, + "epoch": 0.6910566661656395, + "flos": 18405942981120.0, + "grad_norm": 1.6353142580402917, + "language_loss": 0.74625778, + "learning_rate": 9.20282549204336e-07, + "loss": 0.7671811, + "num_input_tokens_seen": 248103185, + "step": 11494, + "time_per_iteration": 2.5262703895568848 + }, + { + "auxiliary_loss_clip": 0.01041908, + "auxiliary_loss_mlp": 0.01028026, + "balance_loss_clip": 1.02421641, + "balance_loss_mlp": 1.0170176, + "epoch": 0.6911167894183076, + "flos": 30773972355840.0, + "grad_norm": 1.4693616580345752, + "language_loss": 0.68211335, + "learning_rate": 9.19954738111161e-07, + "loss": 0.70281273, + "num_input_tokens_seen": 248125665, + "step": 11495, + "time_per_iteration": 2.6512153148651123 + }, + { + "auxiliary_loss_clip": 0.01039993, + "auxiliary_loss_mlp": 0.01025776, + "balance_loss_clip": 1.0225656, + "balance_loss_mlp": 1.01495218, + "epoch": 0.6911769126709755, + "flos": 13735724561280.0, + "grad_norm": 1.9679246371904593, + "language_loss": 0.7387495, + "learning_rate": 9.196269679734119e-07, + "loss": 0.75940716, + "num_input_tokens_seen": 248142545, + "step": 11496, + "time_per_iteration": 2.5620994567871094 + }, + { + "auxiliary_loss_clip": 0.01024766, + "auxiliary_loss_mlp": 0.01029299, + "balance_loss_clip": 1.02129817, + "balance_loss_mlp": 1.01905847, + "epoch": 0.6912370359236435, + "flos": 17566854295680.0, + "grad_norm": 1.955267944881928, + "language_loss": 0.80344796, + "learning_rate": 9.19299238803515e-07, + "loss": 0.82398856, + "num_input_tokens_seen": 248160225, + "step": 11497, + "time_per_iteration": 2.5988526344299316 + }, + { + "auxiliary_loss_clip": 0.0102576, + "auxiliary_loss_mlp": 0.01032934, + "balance_loss_clip": 1.02283728, + "balance_loss_mlp": 1.02180624, + "epoch": 0.6912971591763114, + "flos": 22090772620800.0, + "grad_norm": 1.4818446335610111, + "language_loss": 0.80736578, + "learning_rate": 9.189715506138993e-07, + "loss": 0.82795274, + "num_input_tokens_seen": 248180430, + "step": 11498, + "time_per_iteration": 2.677938938140869 + }, + { + "auxiliary_loss_clip": 0.01051343, + "auxiliary_loss_mlp": 0.01027114, + "balance_loss_clip": 1.0242157, + "balance_loss_mlp": 1.01661134, + "epoch": 0.6913572824289794, + "flos": 29971476650880.0, + "grad_norm": 1.7619331966261282, + "language_loss": 0.85858214, + "learning_rate": 9.186439034169915e-07, + "loss": 0.87936676, + "num_input_tokens_seen": 248202365, + "step": 11499, + "time_per_iteration": 2.632974624633789 + }, + { + "auxiliary_loss_clip": 0.01033978, + "auxiliary_loss_mlp": 0.00747535, + "balance_loss_clip": 1.02493036, + "balance_loss_mlp": 1.00043797, + "epoch": 0.6914174056816473, + "flos": 20448936835200.0, + "grad_norm": 1.542592513579856, + "language_loss": 0.75754333, + "learning_rate": 9.183162972252145e-07, + "loss": 0.77535844, + "num_input_tokens_seen": 248221750, + "step": 11500, + "time_per_iteration": 2.606334924697876 + }, + { + "auxiliary_loss_clip": 0.01006062, + "auxiliary_loss_mlp": 0.01034697, + "balance_loss_clip": 1.02298212, + "balance_loss_mlp": 1.0222578, + "epoch": 0.6914775289343154, + "flos": 21282530739840.0, + "grad_norm": 2.2336882702307665, + "language_loss": 0.76990497, + "learning_rate": 9.179887320509921e-07, + "loss": 0.79031253, + "num_input_tokens_seen": 248239535, + "step": 11501, + "time_per_iteration": 2.7687671184539795 + }, + { + "auxiliary_loss_clip": 0.01047048, + "auxiliary_loss_mlp": 0.01035387, + "balance_loss_clip": 1.02419972, + "balance_loss_mlp": 1.02354383, + "epoch": 0.6915376521869833, + "flos": 23878118401920.0, + "grad_norm": 1.880038362187006, + "language_loss": 0.73210967, + "learning_rate": 9.176612079067458e-07, + "loss": 0.7529341, + "num_input_tokens_seen": 248259055, + "step": 11502, + "time_per_iteration": 2.721325159072876 + }, + { + "auxiliary_loss_clip": 0.00997893, + "auxiliary_loss_mlp": 0.01035705, + "balance_loss_clip": 1.02281094, + "balance_loss_mlp": 1.02268159, + "epoch": 0.6915977754396513, + "flos": 11510268595200.0, + "grad_norm": 2.010086845607205, + "language_loss": 0.73209095, + "learning_rate": 9.173337248048953e-07, + "loss": 0.75242692, + "num_input_tokens_seen": 248276765, + "step": 11503, + "time_per_iteration": 2.794837474822998 + }, + { + "auxiliary_loss_clip": 0.01047019, + "auxiliary_loss_mlp": 0.01037345, + "balance_loss_clip": 1.02355349, + "balance_loss_mlp": 1.02572763, + "epoch": 0.6916578986923193, + "flos": 22601278667520.0, + "grad_norm": 1.6979906640343352, + "language_loss": 0.76954126, + "learning_rate": 9.170062827578575e-07, + "loss": 0.79038489, + "num_input_tokens_seen": 248295310, + "step": 11504, + "time_per_iteration": 2.7206337451934814 + }, + { + "auxiliary_loss_clip": 0.01016618, + "auxiliary_loss_mlp": 0.01030925, + "balance_loss_clip": 1.01993704, + "balance_loss_mlp": 1.01883078, + "epoch": 0.6917180219449872, + "flos": 23477355383040.0, + "grad_norm": 1.688557069298829, + "language_loss": 0.7379939, + "learning_rate": 9.166788817780499e-07, + "loss": 0.75846934, + "num_input_tokens_seen": 248315230, + "step": 11505, + "time_per_iteration": 2.788986921310425 + }, + { + "auxiliary_loss_clip": 0.01001109, + "auxiliary_loss_mlp": 0.00747538, + "balance_loss_clip": 1.02114236, + "balance_loss_mlp": 1.00038755, + "epoch": 0.6917781451976552, + "flos": 23732536579200.0, + "grad_norm": 1.8271613842733152, + "language_loss": 0.8809644, + "learning_rate": 9.163515218778886e-07, + "loss": 0.89845085, + "num_input_tokens_seen": 248332980, + "step": 11506, + "time_per_iteration": 2.7921319007873535 + }, + { + "auxiliary_loss_clip": 0.01043915, + "auxiliary_loss_mlp": 0.01026541, + "balance_loss_clip": 1.02651072, + "balance_loss_mlp": 1.01645589, + "epoch": 0.6918382684503231, + "flos": 31466760946560.0, + "grad_norm": 2.0021183739358626, + "language_loss": 0.69867718, + "learning_rate": 9.160242030697856e-07, + "loss": 0.71938169, + "num_input_tokens_seen": 248352865, + "step": 11507, + "time_per_iteration": 4.287466287612915 + }, + { + "auxiliary_loss_clip": 0.0103761, + "auxiliary_loss_mlp": 0.01030121, + "balance_loss_clip": 1.0248059, + "balance_loss_mlp": 1.01903474, + "epoch": 0.6918983917029912, + "flos": 21650471706240.0, + "grad_norm": 1.8318650245756876, + "language_loss": 0.77020723, + "learning_rate": 9.156969253661538e-07, + "loss": 0.79088449, + "num_input_tokens_seen": 248371125, + "step": 11508, + "time_per_iteration": 4.218903303146362 + }, + { + "auxiliary_loss_clip": 0.01044188, + "auxiliary_loss_mlp": 0.01023392, + "balance_loss_clip": 1.02335429, + "balance_loss_mlp": 1.0136348, + "epoch": 0.6919585149556591, + "flos": 25550082720000.0, + "grad_norm": 1.706850944701823, + "language_loss": 0.74999309, + "learning_rate": 9.153696887794027e-07, + "loss": 0.77066886, + "num_input_tokens_seen": 248390455, + "step": 11509, + "time_per_iteration": 2.6624903678894043 + }, + { + "auxiliary_loss_clip": 0.01025726, + "auxiliary_loss_mlp": 0.01034511, + "balance_loss_clip": 1.02710819, + "balance_loss_mlp": 1.02357924, + "epoch": 0.6920186382083271, + "flos": 23659781581440.0, + "grad_norm": 1.693421291970506, + "language_loss": 0.64461565, + "learning_rate": 9.150424933219425e-07, + "loss": 0.66521806, + "num_input_tokens_seen": 248411305, + "step": 11510, + "time_per_iteration": 2.9364423751831055 + }, + { + "auxiliary_loss_clip": 0.01031124, + "auxiliary_loss_mlp": 0.01029792, + "balance_loss_clip": 1.02469909, + "balance_loss_mlp": 1.01776373, + "epoch": 0.692078761460995, + "flos": 19061959023360.0, + "grad_norm": 1.8321929219090873, + "language_loss": 0.75536978, + "learning_rate": 9.147153390061788e-07, + "loss": 0.77597892, + "num_input_tokens_seen": 248430190, + "step": 11511, + "time_per_iteration": 2.800706624984741 + }, + { + "auxiliary_loss_clip": 0.01033076, + "auxiliary_loss_mlp": 0.01026073, + "balance_loss_clip": 1.02464545, + "balance_loss_mlp": 1.0163815, + "epoch": 0.692138884713663, + "flos": 29023291382400.0, + "grad_norm": 1.536584468702138, + "language_loss": 0.62281263, + "learning_rate": 9.143882258445184e-07, + "loss": 0.64340413, + "num_input_tokens_seen": 248450830, + "step": 11512, + "time_per_iteration": 2.7556073665618896 + }, + { + "auxiliary_loss_clip": 0.01033461, + "auxiliary_loss_mlp": 0.01027644, + "balance_loss_clip": 1.02418756, + "balance_loss_mlp": 1.01699841, + "epoch": 0.6921990079663309, + "flos": 14757849976320.0, + "grad_norm": 2.2754099792628395, + "language_loss": 0.82939684, + "learning_rate": 9.140611538493666e-07, + "loss": 0.85000789, + "num_input_tokens_seen": 248468585, + "step": 11513, + "time_per_iteration": 2.677727699279785 + }, + { + "auxiliary_loss_clip": 0.01002976, + "auxiliary_loss_mlp": 0.01030479, + "balance_loss_clip": 1.02376056, + "balance_loss_mlp": 1.02057278, + "epoch": 0.692259131218999, + "flos": 23841848643840.0, + "grad_norm": 1.4090959053936152, + "language_loss": 0.78359926, + "learning_rate": 9.137341230331233e-07, + "loss": 0.80393374, + "num_input_tokens_seen": 248490535, + "step": 11514, + "time_per_iteration": 2.779834508895874 + }, + { + "auxiliary_loss_clip": 0.0101734, + "auxiliary_loss_mlp": 0.01027527, + "balance_loss_clip": 1.0217979, + "balance_loss_mlp": 1.01664865, + "epoch": 0.6923192544716669, + "flos": 19135073157120.0, + "grad_norm": 1.791116340843913, + "language_loss": 0.74786973, + "learning_rate": 9.134071334081907e-07, + "loss": 0.76831841, + "num_input_tokens_seen": 248508575, + "step": 11515, + "time_per_iteration": 2.851532459259033 + }, + { + "auxiliary_loss_clip": 0.01018197, + "auxiliary_loss_mlp": 0.01029713, + "balance_loss_clip": 1.02531242, + "balance_loss_mlp": 1.019485, + "epoch": 0.6923793777243349, + "flos": 28074639237120.0, + "grad_norm": 2.0179954654175676, + "language_loss": 0.53656065, + "learning_rate": 9.130801849869694e-07, + "loss": 0.5570398, + "num_input_tokens_seen": 248527025, + "step": 11516, + "time_per_iteration": 2.80012845993042 + }, + { + "auxiliary_loss_clip": 0.01047878, + "auxiliary_loss_mlp": 0.01030803, + "balance_loss_clip": 1.02391768, + "balance_loss_mlp": 1.01994276, + "epoch": 0.6924395009770029, + "flos": 16581250033920.0, + "grad_norm": 1.667070413851798, + "language_loss": 0.73221922, + "learning_rate": 9.127532777818557e-07, + "loss": 0.7530061, + "num_input_tokens_seen": 248544275, + "step": 11517, + "time_per_iteration": 2.577927589416504 + }, + { + "auxiliary_loss_clip": 0.01065842, + "auxiliary_loss_mlp": 0.01030616, + "balance_loss_clip": 1.02636695, + "balance_loss_mlp": 1.01972675, + "epoch": 0.6924996242296708, + "flos": 16655297921280.0, + "grad_norm": 1.9042936191202608, + "language_loss": 0.76531041, + "learning_rate": 9.124264118052465e-07, + "loss": 0.78627497, + "num_input_tokens_seen": 248561870, + "step": 11518, + "time_per_iteration": 2.5558269023895264 + }, + { + "auxiliary_loss_clip": 0.01056164, + "auxiliary_loss_mlp": 0.01032005, + "balance_loss_clip": 1.02619743, + "balance_loss_mlp": 1.0200361, + "epoch": 0.6925597474823388, + "flos": 34754167532160.0, + "grad_norm": 1.3616478457728518, + "language_loss": 0.64575589, + "learning_rate": 9.120995870695376e-07, + "loss": 0.66663754, + "num_input_tokens_seen": 248588190, + "step": 11519, + "time_per_iteration": 2.7829089164733887 + }, + { + "auxiliary_loss_clip": 0.01035538, + "auxiliary_loss_mlp": 0.01033021, + "balance_loss_clip": 1.02383924, + "balance_loss_mlp": 1.02175617, + "epoch": 0.6926198707350067, + "flos": 21871717528320.0, + "grad_norm": 1.7197357924045198, + "language_loss": 0.62417811, + "learning_rate": 9.117728035871212e-07, + "loss": 0.64486372, + "num_input_tokens_seen": 248606460, + "step": 11520, + "time_per_iteration": 2.6439411640167236 + }, + { + "auxiliary_loss_clip": 0.01028609, + "auxiliary_loss_mlp": 0.01034807, + "balance_loss_clip": 1.02394748, + "balance_loss_mlp": 1.02181947, + "epoch": 0.6926799939876748, + "flos": 13006271162880.0, + "grad_norm": 3.8595857804680493, + "language_loss": 0.77616656, + "learning_rate": 9.114460613703887e-07, + "loss": 0.79680073, + "num_input_tokens_seen": 248623715, + "step": 11521, + "time_per_iteration": 2.6352949142456055 + }, + { + "auxiliary_loss_clip": 0.01048484, + "auxiliary_loss_mlp": 0.01036371, + "balance_loss_clip": 1.02286696, + "balance_loss_mlp": 1.02341318, + "epoch": 0.6927401172403427, + "flos": 16761234107520.0, + "grad_norm": 1.9453785949007372, + "language_loss": 0.82065308, + "learning_rate": 9.111193604317304e-07, + "loss": 0.84150165, + "num_input_tokens_seen": 248640575, + "step": 11522, + "time_per_iteration": 2.5443482398986816 + }, + { + "auxiliary_loss_clip": 0.01057242, + "auxiliary_loss_mlp": 0.01030117, + "balance_loss_clip": 1.02931035, + "balance_loss_mlp": 1.0191493, + "epoch": 0.6928002404930107, + "flos": 25705648523520.0, + "grad_norm": 1.3482622548931071, + "language_loss": 0.76717639, + "learning_rate": 9.107927007835361e-07, + "loss": 0.78805, + "num_input_tokens_seen": 248663535, + "step": 11523, + "time_per_iteration": 4.227794170379639 + }, + { + "auxiliary_loss_clip": 0.0103033, + "auxiliary_loss_mlp": 0.01029382, + "balance_loss_clip": 1.0224849, + "balance_loss_mlp": 1.01946378, + "epoch": 0.6928603637456786, + "flos": 18588261438720.0, + "grad_norm": 2.154335843443647, + "language_loss": 0.68376571, + "learning_rate": 9.104660824381915e-07, + "loss": 0.70436275, + "num_input_tokens_seen": 248681125, + "step": 11524, + "time_per_iteration": 2.6110339164733887 + }, + { + "auxiliary_loss_clip": 0.01034808, + "auxiliary_loss_mlp": 0.01028521, + "balance_loss_clip": 1.0247016, + "balance_loss_mlp": 1.01691031, + "epoch": 0.6929204869983466, + "flos": 22200874784640.0, + "grad_norm": 1.7102680929622036, + "language_loss": 0.64390814, + "learning_rate": 9.101395054080815e-07, + "loss": 0.66454142, + "num_input_tokens_seen": 248700555, + "step": 11525, + "time_per_iteration": 2.6448349952697754 + }, + { + "auxiliary_loss_clip": 0.01022913, + "auxiliary_loss_mlp": 0.01034139, + "balance_loss_clip": 1.02449679, + "balance_loss_mlp": 1.0231297, + "epoch": 0.6929806102510145, + "flos": 17894754576000.0, + "grad_norm": 2.099871602199635, + "language_loss": 0.70713586, + "learning_rate": 9.098129697055907e-07, + "loss": 0.72770643, + "num_input_tokens_seen": 248716095, + "step": 11526, + "time_per_iteration": 2.720616579055786 + }, + { + "auxiliary_loss_clip": 0.01039167, + "auxiliary_loss_mlp": 0.01023961, + "balance_loss_clip": 1.02287006, + "balance_loss_mlp": 1.0140425, + "epoch": 0.6930407335036826, + "flos": 19755178577280.0, + "grad_norm": 1.527048959099474, + "language_loss": 0.76539594, + "learning_rate": 9.094864753431022e-07, + "loss": 0.78602719, + "num_input_tokens_seen": 248735330, + "step": 11527, + "time_per_iteration": 2.7058467864990234 + }, + { + "auxiliary_loss_clip": 0.01034084, + "auxiliary_loss_mlp": 0.01031652, + "balance_loss_clip": 1.02170813, + "balance_loss_mlp": 1.02082741, + "epoch": 0.6931008567563505, + "flos": 21544248211200.0, + "grad_norm": 1.6767334268518264, + "language_loss": 0.79743099, + "learning_rate": 9.091600223329952e-07, + "loss": 0.81808829, + "num_input_tokens_seen": 248754530, + "step": 11528, + "time_per_iteration": 2.6318252086639404 + }, + { + "auxiliary_loss_clip": 0.01050154, + "auxiliary_loss_mlp": 0.01028986, + "balance_loss_clip": 1.02458191, + "balance_loss_mlp": 1.01883519, + "epoch": 0.6931609800090185, + "flos": 26250018117120.0, + "grad_norm": 1.355040647337504, + "language_loss": 0.76015162, + "learning_rate": 9.088336106876491e-07, + "loss": 0.78094304, + "num_input_tokens_seen": 248775825, + "step": 11529, + "time_per_iteration": 2.616382122039795 + }, + { + "auxiliary_loss_clip": 0.01061365, + "auxiliary_loss_mlp": 0.00747544, + "balance_loss_clip": 1.02484322, + "balance_loss_mlp": 1.00045013, + "epoch": 0.6932211032616865, + "flos": 32343376366080.0, + "grad_norm": 1.6269170685067416, + "language_loss": 0.72428972, + "learning_rate": 9.085072404194436e-07, + "loss": 0.74237883, + "num_input_tokens_seen": 248796180, + "step": 11530, + "time_per_iteration": 2.7056171894073486 + }, + { + "auxiliary_loss_clip": 0.01049943, + "auxiliary_loss_mlp": 0.01031831, + "balance_loss_clip": 1.02797401, + "balance_loss_mlp": 1.01891446, + "epoch": 0.6932812265143544, + "flos": 22049079909120.0, + "grad_norm": 2.110101962981763, + "language_loss": 0.78362542, + "learning_rate": 9.081809115407513e-07, + "loss": 0.80444318, + "num_input_tokens_seen": 248814735, + "step": 11531, + "time_per_iteration": 4.2607598304748535 + }, + { + "auxiliary_loss_clip": 0.01052628, + "auxiliary_loss_mlp": 0.01029302, + "balance_loss_clip": 1.02510285, + "balance_loss_mlp": 1.01914573, + "epoch": 0.6933413497670224, + "flos": 26256626219520.0, + "grad_norm": 1.5489406509048063, + "language_loss": 0.69347382, + "learning_rate": 9.078546240639484e-07, + "loss": 0.71429312, + "num_input_tokens_seen": 248839140, + "step": 11532, + "time_per_iteration": 2.7453675270080566 + }, + { + "auxiliary_loss_clip": 0.01038039, + "auxiliary_loss_mlp": 0.01024973, + "balance_loss_clip": 1.02373338, + "balance_loss_mlp": 1.01361203, + "epoch": 0.6934014730196904, + "flos": 19573003774080.0, + "grad_norm": 1.3639302754909082, + "language_loss": 0.67122078, + "learning_rate": 9.075283780014082e-07, + "loss": 0.6918509, + "num_input_tokens_seen": 248858300, + "step": 11533, + "time_per_iteration": 2.842785120010376 + }, + { + "auxiliary_loss_clip": 0.01047299, + "auxiliary_loss_mlp": 0.01031424, + "balance_loss_clip": 1.02768517, + "balance_loss_mlp": 1.01938415, + "epoch": 0.6934615962723584, + "flos": 22119249127680.0, + "grad_norm": 2.227457988955027, + "language_loss": 0.58585054, + "learning_rate": 9.072021733655007e-07, + "loss": 0.60663778, + "num_input_tokens_seen": 248876310, + "step": 11534, + "time_per_iteration": 2.6201353073120117 + }, + { + "auxiliary_loss_clip": 0.01032268, + "auxiliary_loss_mlp": 0.01028015, + "balance_loss_clip": 1.02341044, + "balance_loss_mlp": 1.01687503, + "epoch": 0.6935217195250263, + "flos": 21360816432000.0, + "grad_norm": 1.933231851350839, + "language_loss": 0.70873237, + "learning_rate": 9.068760101685971e-07, + "loss": 0.72933525, + "num_input_tokens_seen": 248895650, + "step": 11535, + "time_per_iteration": 2.649322509765625 + }, + { + "auxiliary_loss_clip": 0.0098836, + "auxiliary_loss_mlp": 0.01001945, + "balance_loss_clip": 1.00234103, + "balance_loss_mlp": 1.00097358, + "epoch": 0.6935818427776943, + "flos": 64063813115520.0, + "grad_norm": 0.7145632735407771, + "language_loss": 0.59065592, + "learning_rate": 9.065498884230638e-07, + "loss": 0.61055899, + "num_input_tokens_seen": 248963920, + "step": 11536, + "time_per_iteration": 3.2957966327667236 + }, + { + "auxiliary_loss_clip": 0.01058612, + "auxiliary_loss_mlp": 0.00747568, + "balance_loss_clip": 1.02755618, + "balance_loss_mlp": 1.00037134, + "epoch": 0.6936419660303622, + "flos": 20302564913280.0, + "grad_norm": 2.249729528991172, + "language_loss": 0.72826719, + "learning_rate": 9.062238081412692e-07, + "loss": 0.74632901, + "num_input_tokens_seen": 248983380, + "step": 11537, + "time_per_iteration": 2.658334732055664 + }, + { + "auxiliary_loss_clip": 0.00996349, + "auxiliary_loss_mlp": 0.00746795, + "balance_loss_clip": 1.00109673, + "balance_loss_mlp": 1.00097656, + "epoch": 0.6937020892830302, + "flos": 67182581347200.0, + "grad_norm": 0.7770465888172057, + "language_loss": 0.55594707, + "learning_rate": 9.058977693355767e-07, + "loss": 0.5733785, + "num_input_tokens_seen": 249044680, + "step": 11538, + "time_per_iteration": 3.140812397003174 + }, + { + "auxiliary_loss_clip": 0.01050561, + "auxiliary_loss_mlp": 0.01032117, + "balance_loss_clip": 1.02464199, + "balance_loss_mlp": 1.02278924, + "epoch": 0.6937622125356981, + "flos": 23878190229120.0, + "grad_norm": 1.4583409887017946, + "language_loss": 0.77658772, + "learning_rate": 9.055717720183505e-07, + "loss": 0.79741454, + "num_input_tokens_seen": 249061060, + "step": 11539, + "time_per_iteration": 2.6218059062957764 + }, + { + "auxiliary_loss_clip": 0.01041661, + "auxiliary_loss_mlp": 0.01023932, + "balance_loss_clip": 1.0243566, + "balance_loss_mlp": 1.01354241, + "epoch": 0.6938223357883662, + "flos": 28730619365760.0, + "grad_norm": 1.8436665771840983, + "language_loss": 0.64387178, + "learning_rate": 9.05245816201953e-07, + "loss": 0.66452771, + "num_input_tokens_seen": 249081430, + "step": 11540, + "time_per_iteration": 2.618549346923828 + }, + { + "auxiliary_loss_clip": 0.010155, + "auxiliary_loss_mlp": 0.0102973, + "balance_loss_clip": 1.02284932, + "balance_loss_mlp": 1.018888, + "epoch": 0.6938824590410341, + "flos": 28655027193600.0, + "grad_norm": 1.506854520283282, + "language_loss": 0.86852932, + "learning_rate": 9.049199018987437e-07, + "loss": 0.8889817, + "num_input_tokens_seen": 249103020, + "step": 11541, + "time_per_iteration": 2.6809439659118652 + }, + { + "auxiliary_loss_clip": 0.01062856, + "auxiliary_loss_mlp": 0.00747705, + "balance_loss_clip": 1.02536988, + "balance_loss_mlp": 1.00036383, + "epoch": 0.6939425822937021, + "flos": 18983062800000.0, + "grad_norm": 1.740422298410307, + "language_loss": 0.84373355, + "learning_rate": 9.04594029121081e-07, + "loss": 0.86183918, + "num_input_tokens_seen": 249120810, + "step": 11542, + "time_per_iteration": 2.5179669857025146 + }, + { + "auxiliary_loss_clip": 0.01053105, + "auxiliary_loss_mlp": 0.01029727, + "balance_loss_clip": 1.02443564, + "balance_loss_mlp": 1.01800311, + "epoch": 0.6940027055463701, + "flos": 23075838178560.0, + "grad_norm": 1.9182805872918178, + "language_loss": 0.7551493, + "learning_rate": 9.04268197881323e-07, + "loss": 0.77597761, + "num_input_tokens_seen": 249138050, + "step": 11543, + "time_per_iteration": 2.606290578842163 + }, + { + "auxiliary_loss_clip": 0.01040997, + "auxiliary_loss_mlp": 0.0102625, + "balance_loss_clip": 1.02344799, + "balance_loss_mlp": 1.01591444, + "epoch": 0.694062828799038, + "flos": 18186564666240.0, + "grad_norm": 2.0160281819610524, + "language_loss": 0.76094025, + "learning_rate": 9.039424081918241e-07, + "loss": 0.78161275, + "num_input_tokens_seen": 249155570, + "step": 11544, + "time_per_iteration": 2.6643643379211426 + }, + { + "auxiliary_loss_clip": 0.01016199, + "auxiliary_loss_mlp": 0.01037936, + "balance_loss_clip": 1.02358186, + "balance_loss_mlp": 1.02571678, + "epoch": 0.694122952051706, + "flos": 17821532701440.0, + "grad_norm": 1.9651653837491276, + "language_loss": 0.71042156, + "learning_rate": 9.036166600649388e-07, + "loss": 0.73096293, + "num_input_tokens_seen": 249172960, + "step": 11545, + "time_per_iteration": 2.6906087398529053 + }, + { + "auxiliary_loss_clip": 0.01051518, + "auxiliary_loss_mlp": 0.01025271, + "balance_loss_clip": 1.02565145, + "balance_loss_mlp": 1.01543665, + "epoch": 0.694183075304374, + "flos": 21215306436480.0, + "grad_norm": 1.521307781028858, + "language_loss": 0.79368013, + "learning_rate": 9.0329095351302e-07, + "loss": 0.814448, + "num_input_tokens_seen": 249192450, + "step": 11546, + "time_per_iteration": 2.695827007293701 + }, + { + "auxiliary_loss_clip": 0.01027034, + "auxiliary_loss_mlp": 0.01029773, + "balance_loss_clip": 1.02216601, + "balance_loss_mlp": 1.01919961, + "epoch": 0.694243198557042, + "flos": 24060508686720.0, + "grad_norm": 2.029400421565112, + "language_loss": 0.78624463, + "learning_rate": 9.029652885484194e-07, + "loss": 0.8068127, + "num_input_tokens_seen": 249214320, + "step": 11547, + "time_per_iteration": 2.8789145946502686 + }, + { + "auxiliary_loss_clip": 0.01045365, + "auxiliary_loss_mlp": 0.00747641, + "balance_loss_clip": 1.02639937, + "balance_loss_mlp": 1.00042009, + "epoch": 0.6943033218097099, + "flos": 21141869080320.0, + "grad_norm": 2.0319840974346266, + "language_loss": 0.80339909, + "learning_rate": 9.026396651834834e-07, + "loss": 0.82132918, + "num_input_tokens_seen": 249230925, + "step": 11548, + "time_per_iteration": 2.6609606742858887 + }, + { + "auxiliary_loss_clip": 0.01006441, + "auxiliary_loss_mlp": 0.00746884, + "balance_loss_clip": 1.00127327, + "balance_loss_mlp": 1.00104833, + "epoch": 0.6943634450623779, + "flos": 57812015975040.0, + "grad_norm": 0.6940639521234606, + "language_loss": 0.53786153, + "learning_rate": 9.023140834305613e-07, + "loss": 0.55539471, + "num_input_tokens_seen": 249293975, + "step": 11549, + "time_per_iteration": 3.2587766647338867 + }, + { + "auxiliary_loss_clip": 0.01044373, + "auxiliary_loss_mlp": 0.01033778, + "balance_loss_clip": 1.02235532, + "balance_loss_mlp": 1.02207792, + "epoch": 0.6944235683150458, + "flos": 30590684231040.0, + "grad_norm": 1.3703613453105337, + "language_loss": 0.73556852, + "learning_rate": 9.01988543302e-07, + "loss": 0.75635004, + "num_input_tokens_seen": 249315285, + "step": 11550, + "time_per_iteration": 2.6510982513427734 + }, + { + "auxiliary_loss_clip": 0.0104866, + "auxiliary_loss_mlp": 0.01034781, + "balance_loss_clip": 1.02955794, + "balance_loss_mlp": 1.02374792, + "epoch": 0.6944836915677138, + "flos": 19719447523200.0, + "grad_norm": 2.1305058967260453, + "language_loss": 0.74171197, + "learning_rate": 9.016630448101425e-07, + "loss": 0.76254642, + "num_input_tokens_seen": 249333505, + "step": 11551, + "time_per_iteration": 2.6698806285858154 + }, + { + "auxiliary_loss_clip": 0.01064023, + "auxiliary_loss_mlp": 0.01035292, + "balance_loss_clip": 1.0262419, + "balance_loss_mlp": 1.02437806, + "epoch": 0.6945438148203817, + "flos": 24863579009280.0, + "grad_norm": 1.4869576169881, + "language_loss": 0.84412014, + "learning_rate": 9.01337587967333e-07, + "loss": 0.86511326, + "num_input_tokens_seen": 249354180, + "step": 11552, + "time_per_iteration": 2.5990915298461914 + }, + { + "auxiliary_loss_clip": 0.01063363, + "auxiliary_loss_mlp": 0.01033642, + "balance_loss_clip": 1.0259198, + "balance_loss_mlp": 1.02315152, + "epoch": 0.6946039380730498, + "flos": 33326646243840.0, + "grad_norm": 2.0877254004290635, + "language_loss": 0.67149699, + "learning_rate": 9.010121727859117e-07, + "loss": 0.69246697, + "num_input_tokens_seen": 249377035, + "step": 11553, + "time_per_iteration": 4.282129526138306 + }, + { + "auxiliary_loss_clip": 0.01049369, + "auxiliary_loss_mlp": 0.01028016, + "balance_loss_clip": 1.02804422, + "balance_loss_mlp": 1.01662564, + "epoch": 0.6946640613257177, + "flos": 20850956830080.0, + "grad_norm": 1.5341593773631763, + "language_loss": 0.79621077, + "learning_rate": 9.006867992782195e-07, + "loss": 0.81698465, + "num_input_tokens_seen": 249396155, + "step": 11554, + "time_per_iteration": 4.316767692565918 + }, + { + "auxiliary_loss_clip": 0.01054092, + "auxiliary_loss_mlp": 0.01029406, + "balance_loss_clip": 1.02469945, + "balance_loss_mlp": 1.01870704, + "epoch": 0.6947241845783857, + "flos": 19354846521600.0, + "grad_norm": 1.7759815909761112, + "language_loss": 0.72707844, + "learning_rate": 9.003614674565934e-07, + "loss": 0.74791348, + "num_input_tokens_seen": 249414555, + "step": 11555, + "time_per_iteration": 2.6292338371276855 + }, + { + "auxiliary_loss_clip": 0.01030819, + "auxiliary_loss_mlp": 0.01024852, + "balance_loss_clip": 1.02358389, + "balance_loss_mlp": 1.01486838, + "epoch": 0.6947843078310536, + "flos": 27120240915840.0, + "grad_norm": 1.655154612865548, + "language_loss": 0.77876532, + "learning_rate": 9.000361773333705e-07, + "loss": 0.79932201, + "num_input_tokens_seen": 249433570, + "step": 11556, + "time_per_iteration": 2.7855849266052246 + }, + { + "auxiliary_loss_clip": 0.0100573, + "auxiliary_loss_mlp": 0.01029947, + "balance_loss_clip": 1.02317381, + "balance_loss_mlp": 1.01921856, + "epoch": 0.6948444310837216, + "flos": 28585109370240.0, + "grad_norm": 3.4515202337282105, + "language_loss": 0.60238463, + "learning_rate": 8.997109289208869e-07, + "loss": 0.6227414, + "num_input_tokens_seen": 249453735, + "step": 11557, + "time_per_iteration": 2.8070497512817383 + }, + { + "auxiliary_loss_clip": 0.01042223, + "auxiliary_loss_mlp": 0.01033749, + "balance_loss_clip": 1.03113723, + "balance_loss_mlp": 1.02328253, + "epoch": 0.6949045543363896, + "flos": 15669262696320.0, + "grad_norm": 2.0341522140824404, + "language_loss": 0.85481167, + "learning_rate": 8.993857222314752e-07, + "loss": 0.87557143, + "num_input_tokens_seen": 249470805, + "step": 11558, + "time_per_iteration": 2.7822937965393066 + }, + { + "auxiliary_loss_clip": 0.01051262, + "auxiliary_loss_mlp": 0.0102883, + "balance_loss_clip": 1.02359128, + "balance_loss_mlp": 1.01712942, + "epoch": 0.6949646775890576, + "flos": 23259413612160.0, + "grad_norm": 1.5510820589882013, + "language_loss": 0.70137632, + "learning_rate": 8.990605572774664e-07, + "loss": 0.72217727, + "num_input_tokens_seen": 249491150, + "step": 11559, + "time_per_iteration": 2.773231029510498 + }, + { + "auxiliary_loss_clip": 0.01030386, + "auxiliary_loss_mlp": 0.01026747, + "balance_loss_clip": 1.02354193, + "balance_loss_mlp": 1.01643527, + "epoch": 0.6950248008417256, + "flos": 22382546797440.0, + "grad_norm": 1.504728760749485, + "language_loss": 0.78532898, + "learning_rate": 8.987354340711921e-07, + "loss": 0.80590028, + "num_input_tokens_seen": 249511560, + "step": 11560, + "time_per_iteration": 2.794490098953247 + }, + { + "auxiliary_loss_clip": 0.01040966, + "auxiliary_loss_mlp": 0.01034185, + "balance_loss_clip": 1.02479005, + "balance_loss_mlp": 1.02398705, + "epoch": 0.6950849240943935, + "flos": 23477355383040.0, + "grad_norm": 1.5006410609192413, + "language_loss": 0.76727521, + "learning_rate": 8.9841035262498e-07, + "loss": 0.78802669, + "num_input_tokens_seen": 249531910, + "step": 11561, + "time_per_iteration": 2.898326873779297 + }, + { + "auxiliary_loss_clip": 0.01060294, + "auxiliary_loss_mlp": 0.0103045, + "balance_loss_clip": 1.02289474, + "balance_loss_mlp": 1.01899409, + "epoch": 0.6951450473470615, + "flos": 17420554200960.0, + "grad_norm": 1.7509354692756176, + "language_loss": 0.78344339, + "learning_rate": 8.980853129511577e-07, + "loss": 0.80435085, + "num_input_tokens_seen": 249550300, + "step": 11562, + "time_per_iteration": 2.763230562210083 + }, + { + "auxiliary_loss_clip": 0.0105323, + "auxiliary_loss_mlp": 0.01027851, + "balance_loss_clip": 1.02405, + "balance_loss_mlp": 1.01698554, + "epoch": 0.6952051705997294, + "flos": 20485745297280.0, + "grad_norm": 2.315942996048734, + "language_loss": 0.6917237, + "learning_rate": 8.977603150620515e-07, + "loss": 0.71253455, + "num_input_tokens_seen": 249567740, + "step": 11563, + "time_per_iteration": 2.5404253005981445 + }, + { + "auxiliary_loss_clip": 0.01040137, + "auxiliary_loss_mlp": 0.01025844, + "balance_loss_clip": 1.02270734, + "balance_loss_mlp": 1.01575923, + "epoch": 0.6952652938523974, + "flos": 13989541040640.0, + "grad_norm": 2.1176248472048713, + "language_loss": 0.73783219, + "learning_rate": 8.974353589699846e-07, + "loss": 0.75849199, + "num_input_tokens_seen": 249582700, + "step": 11564, + "time_per_iteration": 2.5059432983398438 + }, + { + "auxiliary_loss_clip": 0.01036229, + "auxiliary_loss_mlp": 0.01034671, + "balance_loss_clip": 1.02844715, + "balance_loss_mlp": 1.02115273, + "epoch": 0.6953254171050653, + "flos": 30953956429440.0, + "grad_norm": 1.900074544601692, + "language_loss": 0.71779811, + "learning_rate": 8.971104446872785e-07, + "loss": 0.73850715, + "num_input_tokens_seen": 249602920, + "step": 11565, + "time_per_iteration": 2.7216501235961914 + }, + { + "auxiliary_loss_clip": 0.00993056, + "auxiliary_loss_mlp": 0.01001093, + "balance_loss_clip": 1.00654531, + "balance_loss_mlp": 0.99996102, + "epoch": 0.6953855403577334, + "flos": 61670257499520.0, + "grad_norm": 0.9265189835418305, + "language_loss": 0.58473384, + "learning_rate": 8.96785572226255e-07, + "loss": 0.60467535, + "num_input_tokens_seen": 249660400, + "step": 11566, + "time_per_iteration": 3.1184465885162354 + }, + { + "auxiliary_loss_clip": 0.01037791, + "auxiliary_loss_mlp": 0.01028082, + "balance_loss_clip": 1.02643216, + "balance_loss_mlp": 1.01660836, + "epoch": 0.6954456636104013, + "flos": 23039029716480.0, + "grad_norm": 1.8221693443281675, + "language_loss": 0.73767459, + "learning_rate": 8.964607415992338e-07, + "loss": 0.75833333, + "num_input_tokens_seen": 249679335, + "step": 11567, + "time_per_iteration": 2.8289005756378174 + }, + { + "auxiliary_loss_clip": 0.01032021, + "auxiliary_loss_mlp": 0.0103235, + "balance_loss_clip": 1.02109051, + "balance_loss_mlp": 1.02120399, + "epoch": 0.6955057868630693, + "flos": 23918518224000.0, + "grad_norm": 1.2444985352818607, + "language_loss": 0.76971626, + "learning_rate": 8.961359528185313e-07, + "loss": 0.79035997, + "num_input_tokens_seen": 249701805, + "step": 11568, + "time_per_iteration": 2.871044397354126 + }, + { + "auxiliary_loss_clip": 0.01046006, + "auxiliary_loss_mlp": 0.01028583, + "balance_loss_clip": 1.0263778, + "balance_loss_mlp": 1.01861072, + "epoch": 0.6955659101157372, + "flos": 22594634651520.0, + "grad_norm": 2.6292438404769065, + "language_loss": 0.7285651, + "learning_rate": 8.958112058964649e-07, + "loss": 0.74931109, + "num_input_tokens_seen": 249720550, + "step": 11569, + "time_per_iteration": 2.667165994644165 + }, + { + "auxiliary_loss_clip": 0.01049278, + "auxiliary_loss_mlp": 0.01027904, + "balance_loss_clip": 1.02889359, + "balance_loss_mlp": 1.01666856, + "epoch": 0.6956260333684052, + "flos": 24572523104640.0, + "grad_norm": 1.41583337216487, + "language_loss": 0.77082419, + "learning_rate": 8.954865008453471e-07, + "loss": 0.791596, + "num_input_tokens_seen": 249740325, + "step": 11570, + "time_per_iteration": 4.3234593868255615 + }, + { + "auxiliary_loss_clip": 0.01051555, + "auxiliary_loss_mlp": 0.01029287, + "balance_loss_clip": 1.02328372, + "balance_loss_mlp": 1.01878428, + "epoch": 0.6956861566210732, + "flos": 25846058787840.0, + "grad_norm": 2.0220082514677014, + "language_loss": 0.74633366, + "learning_rate": 8.95161837677493e-07, + "loss": 0.76714206, + "num_input_tokens_seen": 249760570, + "step": 11571, + "time_per_iteration": 2.6871469020843506 + }, + { + "auxiliary_loss_clip": 0.01044975, + "auxiliary_loss_mlp": 0.01028643, + "balance_loss_clip": 1.02292991, + "balance_loss_mlp": 1.01830149, + "epoch": 0.6957462798737412, + "flos": 15301393557120.0, + "grad_norm": 1.7247523845698638, + "language_loss": 0.74993378, + "learning_rate": 8.948372164052118e-07, + "loss": 0.77067, + "num_input_tokens_seen": 249778290, + "step": 11572, + "time_per_iteration": 2.575705051422119 + }, + { + "auxiliary_loss_clip": 0.01040929, + "auxiliary_loss_mlp": 0.01025178, + "balance_loss_clip": 1.02288544, + "balance_loss_mlp": 1.0146879, + "epoch": 0.6958064031264092, + "flos": 36246830135040.0, + "grad_norm": 1.7697448451190465, + "language_loss": 0.69954461, + "learning_rate": 8.94512637040814e-07, + "loss": 0.72020566, + "num_input_tokens_seen": 249800925, + "step": 11573, + "time_per_iteration": 2.8700051307678223 + }, + { + "auxiliary_loss_clip": 0.01047774, + "auxiliary_loss_mlp": 0.01034924, + "balance_loss_clip": 1.02705097, + "balance_loss_mlp": 1.02302074, + "epoch": 0.6958665263790771, + "flos": 19208725994880.0, + "grad_norm": 1.7454432188948994, + "language_loss": 0.74975836, + "learning_rate": 8.941880995966095e-07, + "loss": 0.7705853, + "num_input_tokens_seen": 249820500, + "step": 11574, + "time_per_iteration": 2.6145107746124268 + }, + { + "auxiliary_loss_clip": 0.01026191, + "auxiliary_loss_mlp": 0.01031011, + "balance_loss_clip": 1.02167809, + "balance_loss_mlp": 1.02044857, + "epoch": 0.6959266496317451, + "flos": 21795838047360.0, + "grad_norm": 1.5725005691232476, + "language_loss": 0.7429961, + "learning_rate": 8.938636040849014e-07, + "loss": 0.7635681, + "num_input_tokens_seen": 249839845, + "step": 11575, + "time_per_iteration": 2.6926424503326416 + }, + { + "auxiliary_loss_clip": 0.01053248, + "auxiliary_loss_mlp": 0.0102804, + "balance_loss_clip": 1.02428246, + "balance_loss_mlp": 1.01686978, + "epoch": 0.695986772884413, + "flos": 20558248899840.0, + "grad_norm": 1.8842753407926338, + "language_loss": 0.78381079, + "learning_rate": 8.935391505179966e-07, + "loss": 0.8046236, + "num_input_tokens_seen": 249857400, + "step": 11576, + "time_per_iteration": 2.567485809326172 + }, + { + "auxiliary_loss_clip": 0.01025116, + "auxiliary_loss_mlp": 0.0102912, + "balance_loss_clip": 1.025069, + "balance_loss_mlp": 1.01834369, + "epoch": 0.696046896137081, + "flos": 14936217937920.0, + "grad_norm": 2.623404675885966, + "language_loss": 0.56426162, + "learning_rate": 8.932147389081985e-07, + "loss": 0.58480406, + "num_input_tokens_seen": 249871645, + "step": 11577, + "time_per_iteration": 2.6713573932647705 + }, + { + "auxiliary_loss_clip": 0.00987447, + "auxiliary_loss_mlp": 0.01027266, + "balance_loss_clip": 1.02100635, + "balance_loss_mlp": 1.01761055, + "epoch": 0.696107019389749, + "flos": 30740216549760.0, + "grad_norm": 1.3387966131034799, + "language_loss": 0.77026916, + "learning_rate": 8.928903692678081e-07, + "loss": 0.79041636, + "num_input_tokens_seen": 249894215, + "step": 11578, + "time_per_iteration": 2.9226512908935547 + }, + { + "auxiliary_loss_clip": 0.01028686, + "auxiliary_loss_mlp": 0.01030955, + "balance_loss_clip": 1.0241282, + "balance_loss_mlp": 1.02019644, + "epoch": 0.696167142642417, + "flos": 20776729374720.0, + "grad_norm": 2.1761370109378966, + "language_loss": 0.79737896, + "learning_rate": 8.925660416091254e-07, + "loss": 0.8179754, + "num_input_tokens_seen": 249912850, + "step": 11579, + "time_per_iteration": 4.193680047988892 + }, + { + "auxiliary_loss_clip": 0.0102892, + "auxiliary_loss_mlp": 0.01028451, + "balance_loss_clip": 1.02211189, + "balance_loss_mlp": 1.01784134, + "epoch": 0.6962272658950849, + "flos": 22565152563840.0, + "grad_norm": 1.8200981211426575, + "language_loss": 0.72868133, + "learning_rate": 8.922417559444502e-07, + "loss": 0.74925506, + "num_input_tokens_seen": 249932650, + "step": 11580, + "time_per_iteration": 2.705460548400879 + }, + { + "auxiliary_loss_clip": 0.01046087, + "auxiliary_loss_mlp": 0.01027555, + "balance_loss_clip": 1.02578259, + "balance_loss_mlp": 1.0157119, + "epoch": 0.6962873891477529, + "flos": 22200156512640.0, + "grad_norm": 2.017969213113993, + "language_loss": 0.65913087, + "learning_rate": 8.919175122860787e-07, + "loss": 0.67986733, + "num_input_tokens_seen": 249951205, + "step": 11581, + "time_per_iteration": 2.678377151489258 + }, + { + "auxiliary_loss_clip": 0.01065152, + "auxiliary_loss_mlp": 0.01026876, + "balance_loss_clip": 1.02715492, + "balance_loss_mlp": 1.01642776, + "epoch": 0.6963475124004208, + "flos": 12489695717760.0, + "grad_norm": 2.314708979520421, + "language_loss": 0.76304817, + "learning_rate": 8.915933106463056e-07, + "loss": 0.78396845, + "num_input_tokens_seen": 249967045, + "step": 11582, + "time_per_iteration": 2.6290271282196045 + }, + { + "auxiliary_loss_clip": 0.01036751, + "auxiliary_loss_mlp": 0.01027862, + "balance_loss_clip": 1.0223918, + "balance_loss_mlp": 1.01812243, + "epoch": 0.6964076356530888, + "flos": 17165085696000.0, + "grad_norm": 1.9682022084217963, + "language_loss": 0.69753981, + "learning_rate": 8.91269151037425e-07, + "loss": 0.71818602, + "num_input_tokens_seen": 249984565, + "step": 11583, + "time_per_iteration": 2.63517689704895 + }, + { + "auxiliary_loss_clip": 0.01039409, + "auxiliary_loss_mlp": 0.01031391, + "balance_loss_clip": 1.02901721, + "balance_loss_mlp": 1.02047753, + "epoch": 0.6964677589057569, + "flos": 19937317466880.0, + "grad_norm": 1.7893217430866832, + "language_loss": 0.82369518, + "learning_rate": 8.909450334717301e-07, + "loss": 0.84440321, + "num_input_tokens_seen": 250004235, + "step": 11584, + "time_per_iteration": 2.6579761505126953 + }, + { + "auxiliary_loss_clip": 0.01009781, + "auxiliary_loss_mlp": 0.01037764, + "balance_loss_clip": 1.02658224, + "balance_loss_mlp": 1.02528262, + "epoch": 0.6965278821584248, + "flos": 22784064001920.0, + "grad_norm": 3.2862663937068866, + "language_loss": 0.79253912, + "learning_rate": 8.906209579615107e-07, + "loss": 0.81301451, + "num_input_tokens_seen": 250017645, + "step": 11585, + "time_per_iteration": 2.8165509700775146 + }, + { + "auxiliary_loss_clip": 0.01061133, + "auxiliary_loss_mlp": 0.01026115, + "balance_loss_clip": 1.0249635, + "balance_loss_mlp": 1.01603544, + "epoch": 0.6965880054110928, + "flos": 20047563285120.0, + "grad_norm": 1.4783540194575768, + "language_loss": 0.77236664, + "learning_rate": 8.90296924519055e-07, + "loss": 0.79323912, + "num_input_tokens_seen": 250037640, + "step": 11586, + "time_per_iteration": 2.5664243698120117 + }, + { + "auxiliary_loss_clip": 0.01047512, + "auxiliary_loss_mlp": 0.01026292, + "balance_loss_clip": 1.02303505, + "balance_loss_mlp": 1.01630831, + "epoch": 0.6966481286637607, + "flos": 21908238681600.0, + "grad_norm": 1.904075132978134, + "language_loss": 0.78543437, + "learning_rate": 8.899729331566519e-07, + "loss": 0.80617237, + "num_input_tokens_seen": 250056490, + "step": 11587, + "time_per_iteration": 2.649686813354492 + }, + { + "auxiliary_loss_clip": 0.01041973, + "auxiliary_loss_mlp": 0.01028597, + "balance_loss_clip": 1.0257057, + "balance_loss_mlp": 1.01797581, + "epoch": 0.6967082519164287, + "flos": 15633172506240.0, + "grad_norm": 4.8721249755962495, + "language_loss": 0.72975564, + "learning_rate": 8.896489838865857e-07, + "loss": 0.75046134, + "num_input_tokens_seen": 250074285, + "step": 11588, + "time_per_iteration": 2.6640708446502686 + }, + { + "auxiliary_loss_clip": 0.01042712, + "auxiliary_loss_mlp": 0.01023419, + "balance_loss_clip": 1.02525902, + "balance_loss_mlp": 1.01379871, + "epoch": 0.6967683751690966, + "flos": 24024598064640.0, + "grad_norm": 1.7708094491967135, + "language_loss": 0.74863243, + "learning_rate": 8.893250767211413e-07, + "loss": 0.76929379, + "num_input_tokens_seen": 250093350, + "step": 11589, + "time_per_iteration": 2.7430434226989746 + }, + { + "auxiliary_loss_clip": 0.01042647, + "auxiliary_loss_mlp": 0.01030376, + "balance_loss_clip": 1.02500057, + "balance_loss_mlp": 1.01970696, + "epoch": 0.6968284984217646, + "flos": 31024700265600.0, + "grad_norm": 3.6751793147657312, + "language_loss": 0.63713217, + "learning_rate": 8.890012116726012e-07, + "loss": 0.65786242, + "num_input_tokens_seen": 250114170, + "step": 11590, + "time_per_iteration": 2.846538543701172 + }, + { + "auxiliary_loss_clip": 0.00975212, + "auxiliary_loss_mlp": 0.01002362, + "balance_loss_clip": 1.00910711, + "balance_loss_mlp": 1.00099134, + "epoch": 0.6968886216744326, + "flos": 67622990002560.0, + "grad_norm": 0.7599738347224717, + "language_loss": 0.61224985, + "learning_rate": 8.88677388753248e-07, + "loss": 0.6320256, + "num_input_tokens_seen": 250178250, + "step": 11591, + "time_per_iteration": 3.4632856845855713 + }, + { + "auxiliary_loss_clip": 0.0102424, + "auxiliary_loss_mlp": 0.00747586, + "balance_loss_clip": 1.03232205, + "balance_loss_mlp": 1.00038934, + "epoch": 0.6969487449271006, + "flos": 24863686750080.0, + "grad_norm": 1.6777017740263527, + "language_loss": 0.68958282, + "learning_rate": 8.883536079753582e-07, + "loss": 0.70730102, + "num_input_tokens_seen": 250198420, + "step": 11592, + "time_per_iteration": 2.856482982635498 + }, + { + "auxiliary_loss_clip": 0.01024943, + "auxiliary_loss_mlp": 0.01025632, + "balance_loss_clip": 1.0225513, + "balance_loss_mlp": 1.01548755, + "epoch": 0.6970088681797685, + "flos": 28767858791040.0, + "grad_norm": 1.482573406505535, + "language_loss": 0.62447011, + "learning_rate": 8.880298693512109e-07, + "loss": 0.6449759, + "num_input_tokens_seen": 250220650, + "step": 11593, + "time_per_iteration": 2.9135375022888184 + }, + { + "auxiliary_loss_clip": 0.01037874, + "auxiliary_loss_mlp": 0.0102813, + "balance_loss_clip": 1.02332902, + "balance_loss_mlp": 1.0184201, + "epoch": 0.6970689914324365, + "flos": 27308556944640.0, + "grad_norm": 1.3597464582565155, + "language_loss": 0.54524839, + "learning_rate": 8.877061728930832e-07, + "loss": 0.56590843, + "num_input_tokens_seen": 250241750, + "step": 11594, + "time_per_iteration": 2.810802698135376 + }, + { + "auxiliary_loss_clip": 0.01051361, + "auxiliary_loss_mlp": 0.01024369, + "balance_loss_clip": 1.02437329, + "balance_loss_mlp": 1.01421285, + "epoch": 0.6971291146851044, + "flos": 19136258305920.0, + "grad_norm": 2.057408633098086, + "language_loss": 0.76824987, + "learning_rate": 8.87382518613248e-07, + "loss": 0.78900719, + "num_input_tokens_seen": 250259445, + "step": 11595, + "time_per_iteration": 2.558251142501831 + }, + { + "auxiliary_loss_clip": 0.0104813, + "auxiliary_loss_mlp": 0.00747673, + "balance_loss_clip": 1.02826369, + "balance_loss_mlp": 1.00037646, + "epoch": 0.6971892379377724, + "flos": 14610508387200.0, + "grad_norm": 2.8323737513076597, + "language_loss": 0.7161479, + "learning_rate": 8.870589065239793e-07, + "loss": 0.73410594, + "num_input_tokens_seen": 250275640, + "step": 11596, + "time_per_iteration": 2.5785717964172363 + }, + { + "auxiliary_loss_clip": 0.01066368, + "auxiliary_loss_mlp": 0.01030983, + "balance_loss_clip": 1.02819669, + "balance_loss_mlp": 1.01993191, + "epoch": 0.6972493611904405, + "flos": 22307457415680.0, + "grad_norm": 1.6737784687143349, + "language_loss": 0.76410294, + "learning_rate": 8.867353366375492e-07, + "loss": 0.78507644, + "num_input_tokens_seen": 250296435, + "step": 11597, + "time_per_iteration": 2.572458028793335 + }, + { + "auxiliary_loss_clip": 0.01051592, + "auxiliary_loss_mlp": 0.01030214, + "balance_loss_clip": 1.02369988, + "balance_loss_mlp": 1.01966357, + "epoch": 0.6973094844431084, + "flos": 17420374632960.0, + "grad_norm": 1.9742480718058104, + "language_loss": 0.74996161, + "learning_rate": 8.864118089662267e-07, + "loss": 0.77077967, + "num_input_tokens_seen": 250314035, + "step": 11598, + "time_per_iteration": 2.533365249633789 + }, + { + "auxiliary_loss_clip": 0.01048604, + "auxiliary_loss_mlp": 0.01032675, + "balance_loss_clip": 1.02739859, + "balance_loss_mlp": 1.02114773, + "epoch": 0.6973696076957764, + "flos": 27235370983680.0, + "grad_norm": 2.0011911292562656, + "language_loss": 0.89689338, + "learning_rate": 8.860883235222791e-07, + "loss": 0.91770613, + "num_input_tokens_seen": 250332995, + "step": 11599, + "time_per_iteration": 2.673856496810913 + }, + { + "auxiliary_loss_clip": 0.01060576, + "auxiliary_loss_mlp": 0.01036431, + "balance_loss_clip": 1.02846694, + "balance_loss_mlp": 1.02428317, + "epoch": 0.6974297309484443, + "flos": 22018089450240.0, + "grad_norm": 2.0307364763329097, + "language_loss": 0.69727141, + "learning_rate": 8.85764880317974e-07, + "loss": 0.71824151, + "num_input_tokens_seen": 250352120, + "step": 11600, + "time_per_iteration": 4.156175136566162 + }, + { + "auxiliary_loss_clip": 0.01030084, + "auxiliary_loss_mlp": 0.01030633, + "balance_loss_clip": 1.02311134, + "balance_loss_mlp": 1.01948071, + "epoch": 0.6974898542011123, + "flos": 28366449327360.0, + "grad_norm": 1.6824450783591072, + "language_loss": 0.76713419, + "learning_rate": 8.854414793655771e-07, + "loss": 0.7877413, + "num_input_tokens_seen": 250371705, + "step": 11601, + "time_per_iteration": 4.264479875564575 + }, + { + "auxiliary_loss_clip": 0.0104964, + "auxiliary_loss_mlp": 0.00747571, + "balance_loss_clip": 1.02376354, + "balance_loss_mlp": 1.0004065, + "epoch": 0.6975499774537802, + "flos": 15232050351360.0, + "grad_norm": 1.6862318431203633, + "language_loss": 0.72007805, + "learning_rate": 8.851181206773508e-07, + "loss": 0.73805016, + "num_input_tokens_seen": 250390485, + "step": 11602, + "time_per_iteration": 2.58302640914917 + }, + { + "auxiliary_loss_clip": 0.01043804, + "auxiliary_loss_mlp": 0.00747488, + "balance_loss_clip": 1.02511644, + "balance_loss_mlp": 1.00039184, + "epoch": 0.6976101007064482, + "flos": 22157422306560.0, + "grad_norm": 4.731233121097422, + "language_loss": 0.76924092, + "learning_rate": 8.847948042655567e-07, + "loss": 0.78715384, + "num_input_tokens_seen": 250407020, + "step": 11603, + "time_per_iteration": 2.750511407852173 + }, + { + "auxiliary_loss_clip": 0.01017651, + "auxiliary_loss_mlp": 0.01028939, + "balance_loss_clip": 1.02329135, + "balance_loss_mlp": 1.01847839, + "epoch": 0.6976702239591162, + "flos": 22273522041600.0, + "grad_norm": 5.5039459867826706, + "language_loss": 0.62266123, + "learning_rate": 8.844715301424557e-07, + "loss": 0.64312714, + "num_input_tokens_seen": 250425880, + "step": 11604, + "time_per_iteration": 2.7580668926239014 + }, + { + "auxiliary_loss_clip": 0.0104571, + "auxiliary_loss_mlp": 0.01031406, + "balance_loss_clip": 1.02479231, + "balance_loss_mlp": 1.01941359, + "epoch": 0.6977303472117842, + "flos": 25848608653440.0, + "grad_norm": 2.2076924186852627, + "language_loss": 0.81590736, + "learning_rate": 8.841482983203057e-07, + "loss": 0.8366785, + "num_input_tokens_seen": 250442925, + "step": 11605, + "time_per_iteration": 2.653580665588379 + }, + { + "auxiliary_loss_clip": 0.0105322, + "auxiliary_loss_mlp": 0.01031741, + "balance_loss_clip": 1.0250504, + "balance_loss_mlp": 1.02123916, + "epoch": 0.6977904704644521, + "flos": 20959586536320.0, + "grad_norm": 1.8289985403616569, + "language_loss": 0.70397639, + "learning_rate": 8.838251088113638e-07, + "loss": 0.72482598, + "num_input_tokens_seen": 250461220, + "step": 11606, + "time_per_iteration": 2.623150587081909 + }, + { + "auxiliary_loss_clip": 0.01046214, + "auxiliary_loss_mlp": 0.01028206, + "balance_loss_clip": 1.02656603, + "balance_loss_mlp": 1.01695871, + "epoch": 0.6978505937171201, + "flos": 22055041566720.0, + "grad_norm": 1.8306440600910394, + "language_loss": 0.82175422, + "learning_rate": 8.835019616278856e-07, + "loss": 0.84249842, + "num_input_tokens_seen": 250480975, + "step": 11607, + "time_per_iteration": 2.6132562160491943 + }, + { + "auxiliary_loss_clip": 0.01048339, + "auxiliary_loss_mlp": 0.01033293, + "balance_loss_clip": 1.02680588, + "balance_loss_mlp": 1.02150965, + "epoch": 0.697910716969788, + "flos": 20043720529920.0, + "grad_norm": 1.8501057211120726, + "language_loss": 0.78808033, + "learning_rate": 8.831788567821265e-07, + "loss": 0.80889666, + "num_input_tokens_seen": 250497980, + "step": 11608, + "time_per_iteration": 2.702836513519287 + }, + { + "auxiliary_loss_clip": 0.01043979, + "auxiliary_loss_mlp": 0.01026781, + "balance_loss_clip": 1.02461517, + "balance_loss_mlp": 1.0162245, + "epoch": 0.697970840222456, + "flos": 15888245961600.0, + "grad_norm": 1.8433762263987006, + "language_loss": 0.90129364, + "learning_rate": 8.828557942863357e-07, + "loss": 0.92200118, + "num_input_tokens_seen": 250511910, + "step": 11609, + "time_per_iteration": 2.707460403442383 + }, + { + "auxiliary_loss_clip": 0.01036906, + "auxiliary_loss_mlp": 0.01026318, + "balance_loss_clip": 1.02597737, + "balance_loss_mlp": 1.01537442, + "epoch": 0.698030963475124, + "flos": 21215629658880.0, + "grad_norm": 2.200506086007147, + "language_loss": 0.64253473, + "learning_rate": 8.82532774152765e-07, + "loss": 0.663167, + "num_input_tokens_seen": 250531090, + "step": 11610, + "time_per_iteration": 2.770750045776367 + }, + { + "auxiliary_loss_clip": 0.01033576, + "auxiliary_loss_mlp": 0.01026709, + "balance_loss_clip": 1.02480567, + "balance_loss_mlp": 1.01648128, + "epoch": 0.698091086727792, + "flos": 33759728524800.0, + "grad_norm": 1.6807210183486054, + "language_loss": 0.84594357, + "learning_rate": 8.822097963936643e-07, + "loss": 0.86654639, + "num_input_tokens_seen": 250551565, + "step": 11611, + "time_per_iteration": 2.8119988441467285 + }, + { + "auxiliary_loss_clip": 0.01053318, + "auxiliary_loss_mlp": 0.01027774, + "balance_loss_clip": 1.02454472, + "balance_loss_mlp": 1.01693177, + "epoch": 0.69815120998046, + "flos": 15887850912000.0, + "grad_norm": 2.738382477618108, + "language_loss": 0.70779663, + "learning_rate": 8.818868610212793e-07, + "loss": 0.72860765, + "num_input_tokens_seen": 250569625, + "step": 11612, + "time_per_iteration": 2.7091288566589355 + }, + { + "auxiliary_loss_clip": 0.01044389, + "auxiliary_loss_mlp": 0.01028274, + "balance_loss_clip": 1.02327991, + "balance_loss_mlp": 1.01741385, + "epoch": 0.6982113332331279, + "flos": 18947044437120.0, + "grad_norm": 2.4764996251551126, + "language_loss": 0.80938816, + "learning_rate": 8.815639680478573e-07, + "loss": 0.83011484, + "num_input_tokens_seen": 250586960, + "step": 11613, + "time_per_iteration": 2.812077283859253 + }, + { + "auxiliary_loss_clip": 0.01050915, + "auxiliary_loss_mlp": 0.01030929, + "balance_loss_clip": 1.02420866, + "balance_loss_mlp": 1.02097523, + "epoch": 0.6982714564857959, + "flos": 24389594115840.0, + "grad_norm": 1.9053718410395517, + "language_loss": 0.75193191, + "learning_rate": 8.812411174856411e-07, + "loss": 0.77275032, + "num_input_tokens_seen": 250605080, + "step": 11614, + "time_per_iteration": 2.7350828647613525 + }, + { + "auxiliary_loss_clip": 0.0101079, + "auxiliary_loss_mlp": 0.01031232, + "balance_loss_clip": 1.0294044, + "balance_loss_mlp": 1.02039635, + "epoch": 0.6983315797384638, + "flos": 20083725302400.0, + "grad_norm": 3.0966955794987387, + "language_loss": 0.77350837, + "learning_rate": 8.809183093468746e-07, + "loss": 0.79392862, + "num_input_tokens_seen": 250623965, + "step": 11615, + "time_per_iteration": 2.8324437141418457 + }, + { + "auxiliary_loss_clip": 0.01041621, + "auxiliary_loss_mlp": 0.01027608, + "balance_loss_clip": 1.0254606, + "balance_loss_mlp": 1.01724887, + "epoch": 0.6983917029911318, + "flos": 13512431664000.0, + "grad_norm": 2.9904288097901874, + "language_loss": 0.73042476, + "learning_rate": 8.80595543643797e-07, + "loss": 0.75111705, + "num_input_tokens_seen": 250640675, + "step": 11616, + "time_per_iteration": 2.559255361557007 + }, + { + "auxiliary_loss_clip": 0.01063702, + "auxiliary_loss_mlp": 0.01032891, + "balance_loss_clip": 1.02596283, + "balance_loss_mlp": 1.02222157, + "epoch": 0.6984518262437998, + "flos": 22018412672640.0, + "grad_norm": 1.5997943312226182, + "language_loss": 0.8412903, + "learning_rate": 8.802728203886487e-07, + "loss": 0.86225617, + "num_input_tokens_seen": 250660295, + "step": 11617, + "time_per_iteration": 2.67995285987854 + }, + { + "auxiliary_loss_clip": 0.01021834, + "auxiliary_loss_mlp": 0.01037791, + "balance_loss_clip": 1.02269554, + "balance_loss_mlp": 1.02615619, + "epoch": 0.6985119494964678, + "flos": 18770615809920.0, + "grad_norm": 2.8545264278044082, + "language_loss": 0.59232163, + "learning_rate": 8.799501395936682e-07, + "loss": 0.6129179, + "num_input_tokens_seen": 250678155, + "step": 11618, + "time_per_iteration": 4.253488540649414 + }, + { + "auxiliary_loss_clip": 0.01042719, + "auxiliary_loss_mlp": 0.01030094, + "balance_loss_clip": 1.02407587, + "balance_loss_mlp": 1.01986003, + "epoch": 0.6985720727491357, + "flos": 22382834106240.0, + "grad_norm": 2.900713542898425, + "language_loss": 0.83032823, + "learning_rate": 8.796275012710903e-07, + "loss": 0.85105634, + "num_input_tokens_seen": 250697230, + "step": 11619, + "time_per_iteration": 2.623497486114502 + }, + { + "auxiliary_loss_clip": 0.01048474, + "auxiliary_loss_mlp": 0.01026108, + "balance_loss_clip": 1.02351069, + "balance_loss_mlp": 1.01671433, + "epoch": 0.6986321960018037, + "flos": 39567884785920.0, + "grad_norm": 1.714731084207482, + "language_loss": 0.67164493, + "learning_rate": 8.793049054331494e-07, + "loss": 0.69239074, + "num_input_tokens_seen": 250719865, + "step": 11620, + "time_per_iteration": 2.688488483428955 + }, + { + "auxiliary_loss_clip": 0.01011813, + "auxiliary_loss_mlp": 0.01027367, + "balance_loss_clip": 1.02225065, + "balance_loss_mlp": 1.01601839, + "epoch": 0.6986923192544716, + "flos": 17967725055360.0, + "grad_norm": 1.9408375384665333, + "language_loss": 0.72820699, + "learning_rate": 8.789823520920794e-07, + "loss": 0.74859881, + "num_input_tokens_seen": 250736565, + "step": 11621, + "time_per_iteration": 2.71610164642334 + }, + { + "auxiliary_loss_clip": 0.01008728, + "auxiliary_loss_mlp": 0.01036354, + "balance_loss_clip": 1.02259886, + "balance_loss_mlp": 1.02492213, + "epoch": 0.6987524425071396, + "flos": 25594325297280.0, + "grad_norm": 2.2988317418549284, + "language_loss": 0.68492329, + "learning_rate": 8.7865984126011e-07, + "loss": 0.70537412, + "num_input_tokens_seen": 250757235, + "step": 11622, + "time_per_iteration": 2.7779593467712402 + }, + { + "auxiliary_loss_clip": 0.01009395, + "auxiliary_loss_mlp": 0.01022311, + "balance_loss_clip": 1.02362013, + "balance_loss_mlp": 1.0124644, + "epoch": 0.6988125657598077, + "flos": 17530081747200.0, + "grad_norm": 2.194590430392792, + "language_loss": 0.62363034, + "learning_rate": 8.783373729494721e-07, + "loss": 0.64394736, + "num_input_tokens_seen": 250775585, + "step": 11623, + "time_per_iteration": 2.7158572673797607 + }, + { + "auxiliary_loss_clip": 0.01064941, + "auxiliary_loss_mlp": 0.01022737, + "balance_loss_clip": 1.02412891, + "balance_loss_mlp": 1.01144767, + "epoch": 0.6988726890124756, + "flos": 39165721136640.0, + "grad_norm": 1.7869542493277255, + "language_loss": 0.6078769, + "learning_rate": 8.780149471723932e-07, + "loss": 0.62875366, + "num_input_tokens_seen": 250795725, + "step": 11624, + "time_per_iteration": 2.7713139057159424 + }, + { + "auxiliary_loss_clip": 0.01052915, + "auxiliary_loss_mlp": 0.01035111, + "balance_loss_clip": 1.02287006, + "balance_loss_mlp": 1.02406013, + "epoch": 0.6989328122651436, + "flos": 20193468330240.0, + "grad_norm": 1.716808499105213, + "language_loss": 0.78398269, + "learning_rate": 8.776925639411017e-07, + "loss": 0.80486298, + "num_input_tokens_seen": 250814555, + "step": 11625, + "time_per_iteration": 2.5862069129943848 + }, + { + "auxiliary_loss_clip": 0.01025018, + "auxiliary_loss_mlp": 0.01027834, + "balance_loss_clip": 1.02166283, + "balance_loss_mlp": 1.01788044, + "epoch": 0.6989929355178115, + "flos": 21834873152640.0, + "grad_norm": 2.226529660221596, + "language_loss": 0.65819114, + "learning_rate": 8.773702232678188e-07, + "loss": 0.67871958, + "num_input_tokens_seen": 250833105, + "step": 11626, + "time_per_iteration": 4.287192106246948 + }, + { + "auxiliary_loss_clip": 0.0104506, + "auxiliary_loss_mlp": 0.00747728, + "balance_loss_clip": 1.02573824, + "balance_loss_mlp": 1.00045037, + "epoch": 0.6990530587704795, + "flos": 26322880855680.0, + "grad_norm": 1.8353180254342625, + "language_loss": 0.70737469, + "learning_rate": 8.770479251647697e-07, + "loss": 0.72530258, + "num_input_tokens_seen": 250852570, + "step": 11627, + "time_per_iteration": 2.7053751945495605 + }, + { + "auxiliary_loss_clip": 0.01063154, + "auxiliary_loss_mlp": 0.0102626, + "balance_loss_clip": 1.02705193, + "balance_loss_mlp": 1.01684856, + "epoch": 0.6991131820231474, + "flos": 19828975069440.0, + "grad_norm": 1.6446839027211653, + "language_loss": 0.62329233, + "learning_rate": 8.767256696441768e-07, + "loss": 0.64418644, + "num_input_tokens_seen": 250870500, + "step": 11628, + "time_per_iteration": 2.5385067462921143 + }, + { + "auxiliary_loss_clip": 0.01053261, + "auxiliary_loss_mlp": 0.01034169, + "balance_loss_clip": 1.02413309, + "balance_loss_mlp": 1.02292204, + "epoch": 0.6991733052758154, + "flos": 33984817102080.0, + "grad_norm": 2.2778724453893986, + "language_loss": 0.68259227, + "learning_rate": 8.764034567182581e-07, + "loss": 0.70346653, + "num_input_tokens_seen": 250892745, + "step": 11629, + "time_per_iteration": 2.743382453918457 + }, + { + "auxiliary_loss_clip": 0.01064804, + "auxiliary_loss_mlp": 0.0103171, + "balance_loss_clip": 1.02684832, + "balance_loss_mlp": 1.02043295, + "epoch": 0.6992334285284834, + "flos": 15633136592640.0, + "grad_norm": 1.7269430395029006, + "language_loss": 0.72278082, + "learning_rate": 8.760812863992337e-07, + "loss": 0.74374598, + "num_input_tokens_seen": 250910225, + "step": 11630, + "time_per_iteration": 2.5749104022979736 + }, + { + "auxiliary_loss_clip": 0.01063928, + "auxiliary_loss_mlp": 0.0103134, + "balance_loss_clip": 1.02668321, + "balance_loss_mlp": 1.02105224, + "epoch": 0.6992935517811514, + "flos": 21726279360000.0, + "grad_norm": 1.5705243993480882, + "language_loss": 0.73929489, + "learning_rate": 8.757591586993196e-07, + "loss": 0.76024747, + "num_input_tokens_seen": 250929715, + "step": 11631, + "time_per_iteration": 2.6151556968688965 + }, + { + "auxiliary_loss_clip": 0.0105646, + "auxiliary_loss_mlp": 0.01030028, + "balance_loss_clip": 1.02678835, + "balance_loss_mlp": 1.01835132, + "epoch": 0.6993536750338193, + "flos": 20115254465280.0, + "grad_norm": 2.0417784470793623, + "language_loss": 0.89008951, + "learning_rate": 8.7543707363073e-07, + "loss": 0.91095436, + "num_input_tokens_seen": 250944230, + "step": 11632, + "time_per_iteration": 2.536472797393799 + }, + { + "auxiliary_loss_clip": 0.01045626, + "auxiliary_loss_mlp": 0.01032615, + "balance_loss_clip": 1.02707338, + "balance_loss_mlp": 1.02238643, + "epoch": 0.6994137982864873, + "flos": 22010547594240.0, + "grad_norm": 1.5373114627691857, + "language_loss": 0.80024904, + "learning_rate": 8.751150312056792e-07, + "loss": 0.82103145, + "num_input_tokens_seen": 250961865, + "step": 11633, + "time_per_iteration": 2.663428783416748 + }, + { + "auxiliary_loss_clip": 0.01067493, + "auxiliary_loss_mlp": 0.01032996, + "balance_loss_clip": 1.02640295, + "balance_loss_mlp": 1.02120662, + "epoch": 0.6994739215391552, + "flos": 25519020433920.0, + "grad_norm": 1.9659983095732751, + "language_loss": 0.67382956, + "learning_rate": 8.747930314363794e-07, + "loss": 0.69483453, + "num_input_tokens_seen": 250982025, + "step": 11634, + "time_per_iteration": 2.6176536083221436 + }, + { + "auxiliary_loss_clip": 0.00988029, + "auxiliary_loss_mlp": 0.01002705, + "balance_loss_clip": 1.01088405, + "balance_loss_mlp": 1.0016377, + "epoch": 0.6995340447918232, + "flos": 59128357691520.0, + "grad_norm": 0.6856722269439425, + "language_loss": 0.53195262, + "learning_rate": 8.744710743350412e-07, + "loss": 0.55185997, + "num_input_tokens_seen": 251046900, + "step": 11635, + "time_per_iteration": 3.424475908279419 + }, + { + "auxiliary_loss_clip": 0.01043219, + "auxiliary_loss_mlp": 0.01028552, + "balance_loss_clip": 1.02533317, + "balance_loss_mlp": 1.01744795, + "epoch": 0.6995941680444913, + "flos": 17967832796160.0, + "grad_norm": 1.6242909074565637, + "language_loss": 0.82109797, + "learning_rate": 8.741491599138726e-07, + "loss": 0.84181571, + "num_input_tokens_seen": 251065050, + "step": 11636, + "time_per_iteration": 2.600109100341797 + }, + { + "auxiliary_loss_clip": 0.01065659, + "auxiliary_loss_mlp": 0.01028734, + "balance_loss_clip": 1.02658892, + "balance_loss_mlp": 1.01775503, + "epoch": 0.6996542912971592, + "flos": 21980095839360.0, + "grad_norm": 1.8587019097526216, + "language_loss": 0.83198082, + "learning_rate": 8.738272881850801e-07, + "loss": 0.85292476, + "num_input_tokens_seen": 251083355, + "step": 11637, + "time_per_iteration": 2.531186103820801 + }, + { + "auxiliary_loss_clip": 0.01017064, + "auxiliary_loss_mlp": 0.01033771, + "balance_loss_clip": 1.0221312, + "balance_loss_mlp": 1.02285743, + "epoch": 0.6997144145498272, + "flos": 11686158518400.0, + "grad_norm": 2.0395637770724555, + "language_loss": 0.67620283, + "learning_rate": 8.735054591608704e-07, + "loss": 0.69671124, + "num_input_tokens_seen": 251096420, + "step": 11638, + "time_per_iteration": 2.715224266052246 + }, + { + "auxiliary_loss_clip": 0.01059022, + "auxiliary_loss_mlp": 0.01032845, + "balance_loss_clip": 1.02683949, + "balance_loss_mlp": 1.02051854, + "epoch": 0.6997745378024951, + "flos": 29607162958080.0, + "grad_norm": 2.608182856125973, + "language_loss": 0.77879286, + "learning_rate": 8.731836728534459e-07, + "loss": 0.79971147, + "num_input_tokens_seen": 251115410, + "step": 11639, + "time_per_iteration": 2.6438210010528564 + }, + { + "auxiliary_loss_clip": 0.01042079, + "auxiliary_loss_mlp": 0.01035887, + "balance_loss_clip": 1.02422905, + "balance_loss_mlp": 1.0245024, + "epoch": 0.6998346610551631, + "flos": 20886616056960.0, + "grad_norm": 2.19508087765932, + "language_loss": 0.82415843, + "learning_rate": 8.728619292750093e-07, + "loss": 0.8449381, + "num_input_tokens_seen": 251133530, + "step": 11640, + "time_per_iteration": 2.60988450050354 + }, + { + "auxiliary_loss_clip": 0.01033661, + "auxiliary_loss_mlp": 0.01027894, + "balance_loss_clip": 1.02478182, + "balance_loss_mlp": 1.01713514, + "epoch": 0.699894784307831, + "flos": 27163046949120.0, + "grad_norm": 1.6640168862469358, + "language_loss": 0.75270319, + "learning_rate": 8.725402284377619e-07, + "loss": 0.77331871, + "num_input_tokens_seen": 251153985, + "step": 11641, + "time_per_iteration": 2.6978492736816406 + }, + { + "auxiliary_loss_clip": 0.01041407, + "auxiliary_loss_mlp": 0.01021999, + "balance_loss_clip": 1.02505493, + "balance_loss_mlp": 1.01075792, + "epoch": 0.699954907560499, + "flos": 20923640000640.0, + "grad_norm": 2.186420918563222, + "language_loss": 0.779369, + "learning_rate": 8.722185703539022e-07, + "loss": 0.80000305, + "num_input_tokens_seen": 251173225, + "step": 11642, + "time_per_iteration": 2.6444647312164307 + }, + { + "auxiliary_loss_clip": 0.0106071, + "auxiliary_loss_mlp": 0.01032978, + "balance_loss_clip": 1.02780294, + "balance_loss_mlp": 1.02058029, + "epoch": 0.700015030813167, + "flos": 28657792540800.0, + "grad_norm": 2.1431839188848003, + "language_loss": 0.7472899, + "learning_rate": 8.718969550356266e-07, + "loss": 0.76822674, + "num_input_tokens_seen": 251192485, + "step": 11643, + "time_per_iteration": 2.71964168548584 + }, + { + "auxiliary_loss_clip": 0.01034523, + "auxiliary_loss_mlp": 0.01025461, + "balance_loss_clip": 1.02511013, + "balance_loss_mlp": 1.01462483, + "epoch": 0.700075154065835, + "flos": 29205286617600.0, + "grad_norm": 1.7258447753441708, + "language_loss": 0.60306203, + "learning_rate": 8.715753824951315e-07, + "loss": 0.62366182, + "num_input_tokens_seen": 251214965, + "step": 11644, + "time_per_iteration": 2.7587039470672607 + }, + { + "auxiliary_loss_clip": 0.01049682, + "auxiliary_loss_mlp": 0.01024132, + "balance_loss_clip": 1.02306724, + "balance_loss_mlp": 1.0137434, + "epoch": 0.7001352773185029, + "flos": 23112431159040.0, + "grad_norm": 1.5095362121886386, + "language_loss": 0.81561679, + "learning_rate": 8.712538527446119e-07, + "loss": 0.83635497, + "num_input_tokens_seen": 251234500, + "step": 11645, + "time_per_iteration": 2.582540988922119 + }, + { + "auxiliary_loss_clip": 0.01052062, + "auxiliary_loss_mlp": 0.01028984, + "balance_loss_clip": 1.02367043, + "balance_loss_mlp": 1.0180819, + "epoch": 0.7001954005711709, + "flos": 21322858734720.0, + "grad_norm": 1.7462914542412187, + "language_loss": 0.68397999, + "learning_rate": 8.709323657962584e-07, + "loss": 0.70479047, + "num_input_tokens_seen": 251254360, + "step": 11646, + "time_per_iteration": 4.132730007171631 + }, + { + "auxiliary_loss_clip": 0.01048722, + "auxiliary_loss_mlp": 0.01041598, + "balance_loss_clip": 1.02525949, + "balance_loss_mlp": 1.03017163, + "epoch": 0.7002555238238388, + "flos": 24535822383360.0, + "grad_norm": 2.2055124333908016, + "language_loss": 0.71213716, + "learning_rate": 8.706109216622635e-07, + "loss": 0.73304033, + "num_input_tokens_seen": 251274790, + "step": 11647, + "time_per_iteration": 2.591583728790283 + }, + { + "auxiliary_loss_clip": 0.01056593, + "auxiliary_loss_mlp": 0.01032752, + "balance_loss_clip": 1.02706218, + "balance_loss_mlp": 1.02136207, + "epoch": 0.7003156470765068, + "flos": 39056552726400.0, + "grad_norm": 1.7383582888594453, + "language_loss": 0.71623242, + "learning_rate": 8.702895203548155e-07, + "loss": 0.73712587, + "num_input_tokens_seen": 251296275, + "step": 11648, + "time_per_iteration": 2.7118475437164307 + }, + { + "auxiliary_loss_clip": 0.01007206, + "auxiliary_loss_mlp": 0.01031567, + "balance_loss_clip": 1.0199188, + "balance_loss_mlp": 1.02056408, + "epoch": 0.7003757703291749, + "flos": 28804092635520.0, + "grad_norm": 1.6020227024014875, + "language_loss": 0.77032828, + "learning_rate": 8.699681618861014e-07, + "loss": 0.79071605, + "num_input_tokens_seen": 251317375, + "step": 11649, + "time_per_iteration": 4.45177960395813 + }, + { + "auxiliary_loss_clip": 0.01041486, + "auxiliary_loss_mlp": 0.01033205, + "balance_loss_clip": 1.02453804, + "balance_loss_mlp": 1.02262509, + "epoch": 0.7004358935818428, + "flos": 15953854152960.0, + "grad_norm": 4.483552555956694, + "language_loss": 0.78897381, + "learning_rate": 8.69646846268308e-07, + "loss": 0.80972064, + "num_input_tokens_seen": 251333570, + "step": 11650, + "time_per_iteration": 2.646810531616211 + }, + { + "auxiliary_loss_clip": 0.01041644, + "auxiliary_loss_mlp": 0.01023791, + "balance_loss_clip": 1.02351308, + "balance_loss_mlp": 1.01330066, + "epoch": 0.7004960168345108, + "flos": 20411984718720.0, + "grad_norm": 1.8766536318842182, + "language_loss": 0.78164691, + "learning_rate": 8.693255735136194e-07, + "loss": 0.80230123, + "num_input_tokens_seen": 251351070, + "step": 11651, + "time_per_iteration": 2.682725429534912 + }, + { + "auxiliary_loss_clip": 0.01028534, + "auxiliary_loss_mlp": 0.01035634, + "balance_loss_clip": 1.02390885, + "balance_loss_mlp": 1.02395129, + "epoch": 0.7005561400871787, + "flos": 17347547808000.0, + "grad_norm": 1.7165493697026017, + "language_loss": 0.6958921, + "learning_rate": 8.690043436342198e-07, + "loss": 0.71653378, + "num_input_tokens_seen": 251370005, + "step": 11652, + "time_per_iteration": 2.738924026489258 + }, + { + "auxiliary_loss_clip": 0.01053335, + "auxiliary_loss_mlp": 0.01027844, + "balance_loss_clip": 1.02526653, + "balance_loss_mlp": 1.01738381, + "epoch": 0.7006162633398467, + "flos": 25302120157440.0, + "grad_norm": 1.3904660608116401, + "language_loss": 0.74434984, + "learning_rate": 8.686831566422874e-07, + "loss": 0.76516157, + "num_input_tokens_seen": 251391210, + "step": 11653, + "time_per_iteration": 2.6637918949127197 + }, + { + "auxiliary_loss_clip": 0.01036781, + "auxiliary_loss_mlp": 0.01032261, + "balance_loss_clip": 1.02537894, + "balance_loss_mlp": 1.02076316, + "epoch": 0.7006763865925146, + "flos": 20668997508480.0, + "grad_norm": 2.1366265397677866, + "language_loss": 0.70658433, + "learning_rate": 8.68362012550003e-07, + "loss": 0.72727478, + "num_input_tokens_seen": 251411505, + "step": 11654, + "time_per_iteration": 2.6578593254089355 + }, + { + "auxiliary_loss_clip": 0.01023255, + "auxiliary_loss_mlp": 0.01032989, + "balance_loss_clip": 1.02367187, + "balance_loss_mlp": 1.0211401, + "epoch": 0.7007365098451827, + "flos": 20046449963520.0, + "grad_norm": 3.27145066410373, + "language_loss": 0.72962272, + "learning_rate": 8.680409113695453e-07, + "loss": 0.75018513, + "num_input_tokens_seen": 251428975, + "step": 11655, + "time_per_iteration": 2.7556328773498535 + }, + { + "auxiliary_loss_clip": 0.01063655, + "auxiliary_loss_mlp": 0.0103531, + "balance_loss_clip": 1.02973413, + "balance_loss_mlp": 1.02287078, + "epoch": 0.7007966330978506, + "flos": 20777375819520.0, + "grad_norm": 1.8557755590105172, + "language_loss": 0.70689666, + "learning_rate": 8.677198531130889e-07, + "loss": 0.72788632, + "num_input_tokens_seen": 251446940, + "step": 11656, + "time_per_iteration": 2.726867914199829 + }, + { + "auxiliary_loss_clip": 0.01020897, + "auxiliary_loss_mlp": 0.01027733, + "balance_loss_clip": 1.02364147, + "balance_loss_mlp": 1.01783872, + "epoch": 0.7008567563505186, + "flos": 29638189330560.0, + "grad_norm": 1.8249280676647108, + "language_loss": 0.77758133, + "learning_rate": 8.673988377928092e-07, + "loss": 0.79806769, + "num_input_tokens_seen": 251466205, + "step": 11657, + "time_per_iteration": 2.755364179611206 + }, + { + "auxiliary_loss_clip": 0.01068523, + "auxiliary_loss_mlp": 0.01034221, + "balance_loss_clip": 1.0267837, + "balance_loss_mlp": 1.02204418, + "epoch": 0.7009168796031865, + "flos": 17092007475840.0, + "grad_norm": 1.758568577497591, + "language_loss": 0.78017032, + "learning_rate": 8.670778654208797e-07, + "loss": 0.80119777, + "num_input_tokens_seen": 251484820, + "step": 11658, + "time_per_iteration": 2.543614625930786 + }, + { + "auxiliary_loss_clip": 0.01037213, + "auxiliary_loss_mlp": 0.01023974, + "balance_loss_clip": 1.02218723, + "balance_loss_mlp": 1.01293564, + "epoch": 0.7009770028558545, + "flos": 20448972748800.0, + "grad_norm": 1.6933686090955489, + "language_loss": 0.8274076, + "learning_rate": 8.667569360094713e-07, + "loss": 0.84801942, + "num_input_tokens_seen": 251502670, + "step": 11659, + "time_per_iteration": 2.7148711681365967 + }, + { + "auxiliary_loss_clip": 0.01019115, + "auxiliary_loss_mlp": 0.01026433, + "balance_loss_clip": 1.02200234, + "balance_loss_mlp": 1.01631188, + "epoch": 0.7010371261085224, + "flos": 19245139407360.0, + "grad_norm": 2.011608037892093, + "language_loss": 0.69458544, + "learning_rate": 8.664360495707526e-07, + "loss": 0.71504086, + "num_input_tokens_seen": 251521630, + "step": 11660, + "time_per_iteration": 2.6087770462036133 + }, + { + "auxiliary_loss_clip": 0.01065282, + "auxiliary_loss_mlp": 0.01032846, + "balance_loss_clip": 1.02530706, + "balance_loss_mlp": 1.02108002, + "epoch": 0.7010972493611904, + "flos": 22127581082880.0, + "grad_norm": 1.8637999204677418, + "language_loss": 0.8067466, + "learning_rate": 8.661152061168924e-07, + "loss": 0.82772791, + "num_input_tokens_seen": 251540105, + "step": 11661, + "time_per_iteration": 2.6399123668670654 + }, + { + "auxiliary_loss_clip": 0.01052435, + "auxiliary_loss_mlp": 0.01030146, + "balance_loss_clip": 1.02374125, + "balance_loss_mlp": 1.01966751, + "epoch": 0.7011573726138585, + "flos": 31391132860800.0, + "grad_norm": 1.540009707455086, + "language_loss": 0.78701597, + "learning_rate": 8.657944056600579e-07, + "loss": 0.80784178, + "num_input_tokens_seen": 251560530, + "step": 11662, + "time_per_iteration": 2.7388358116149902 + }, + { + "auxiliary_loss_clip": 0.01048109, + "auxiliary_loss_mlp": 0.01031554, + "balance_loss_clip": 1.02327657, + "balance_loss_mlp": 1.01897144, + "epoch": 0.7012174958665264, + "flos": 18150582216960.0, + "grad_norm": 1.771608809674593, + "language_loss": 0.83366388, + "learning_rate": 8.654736482124134e-07, + "loss": 0.85446048, + "num_input_tokens_seen": 251577930, + "step": 11663, + "time_per_iteration": 2.603165864944458 + }, + { + "auxiliary_loss_clip": 0.00998319, + "auxiliary_loss_mlp": 0.01001687, + "balance_loss_clip": 1.00317597, + "balance_loss_mlp": 1.0007571, + "epoch": 0.7012776191191944, + "flos": 60651256567680.0, + "grad_norm": 0.8166915650619967, + "language_loss": 0.53745937, + "learning_rate": 8.651529337861209e-07, + "loss": 0.55745941, + "num_input_tokens_seen": 251638820, + "step": 11664, + "time_per_iteration": 3.176135778427124 + }, + { + "auxiliary_loss_clip": 0.01039813, + "auxiliary_loss_mlp": 0.01028467, + "balance_loss_clip": 1.02357173, + "balance_loss_mlp": 1.01734447, + "epoch": 0.7013377423718623, + "flos": 27198598435200.0, + "grad_norm": 2.2413542142789304, + "language_loss": 0.79275084, + "learning_rate": 8.64832262393344e-07, + "loss": 0.81343359, + "num_input_tokens_seen": 251658070, + "step": 11665, + "time_per_iteration": 4.209030628204346 + }, + { + "auxiliary_loss_clip": 0.0104579, + "auxiliary_loss_mlp": 0.01029821, + "balance_loss_clip": 1.02218759, + "balance_loss_mlp": 1.01889002, + "epoch": 0.7013978656245303, + "flos": 16543543731840.0, + "grad_norm": 2.1991730712862236, + "language_loss": 0.77100492, + "learning_rate": 8.645116340462404e-07, + "loss": 0.79176104, + "num_input_tokens_seen": 251671575, + "step": 11666, + "time_per_iteration": 2.552072048187256 + }, + { + "auxiliary_loss_clip": 0.01048109, + "auxiliary_loss_mlp": 0.01032048, + "balance_loss_clip": 1.02475429, + "balance_loss_mlp": 1.02132535, + "epoch": 0.7014579888771982, + "flos": 23143780753920.0, + "grad_norm": 2.0936600864582036, + "language_loss": 0.80872476, + "learning_rate": 8.641910487569695e-07, + "loss": 0.82952636, + "num_input_tokens_seen": 251689350, + "step": 11667, + "time_per_iteration": 2.58001446723938 + }, + { + "auxiliary_loss_clip": 0.01030057, + "auxiliary_loss_mlp": 0.01032752, + "balance_loss_clip": 1.02213168, + "balance_loss_mlp": 1.02183282, + "epoch": 0.7015181121298663, + "flos": 25082095397760.0, + "grad_norm": 1.9531720961410732, + "language_loss": 0.65199471, + "learning_rate": 8.638705065376879e-07, + "loss": 0.6726228, + "num_input_tokens_seen": 251704635, + "step": 11668, + "time_per_iteration": 2.698103189468384 + }, + { + "auxiliary_loss_clip": 0.01043444, + "auxiliary_loss_mlp": 0.01024391, + "balance_loss_clip": 1.02388358, + "balance_loss_mlp": 1.01338172, + "epoch": 0.7015782353825342, + "flos": 23327894891520.0, + "grad_norm": 3.5385392580028086, + "language_loss": 0.76323789, + "learning_rate": 8.635500074005519e-07, + "loss": 0.78391623, + "num_input_tokens_seen": 251723035, + "step": 11669, + "time_per_iteration": 2.758791208267212 + }, + { + "auxiliary_loss_clip": 0.00987263, + "auxiliary_loss_mlp": 0.01003068, + "balance_loss_clip": 1.00201714, + "balance_loss_mlp": 1.00201344, + "epoch": 0.7016383586352022, + "flos": 70397161107840.0, + "grad_norm": 0.7809608249220947, + "language_loss": 0.54461265, + "learning_rate": 8.632295513577122e-07, + "loss": 0.56451595, + "num_input_tokens_seen": 251791630, + "step": 11670, + "time_per_iteration": 3.389559745788574 + }, + { + "auxiliary_loss_clip": 0.01045286, + "auxiliary_loss_mlp": 0.01036589, + "balance_loss_clip": 1.02699959, + "balance_loss_mlp": 1.02562141, + "epoch": 0.7016984818878701, + "flos": 19792274348160.0, + "grad_norm": 1.8097427550518423, + "language_loss": 0.81949031, + "learning_rate": 8.629091384213218e-07, + "loss": 0.84030908, + "num_input_tokens_seen": 251809840, + "step": 11671, + "time_per_iteration": 2.6900103092193604 + }, + { + "auxiliary_loss_clip": 0.0105546, + "auxiliary_loss_mlp": 0.01026225, + "balance_loss_clip": 1.02691936, + "balance_loss_mlp": 1.0155977, + "epoch": 0.7017586051405381, + "flos": 12896923184640.0, + "grad_norm": 1.8309787116495122, + "language_loss": 0.74957097, + "learning_rate": 8.625887686035313e-07, + "loss": 0.77038789, + "num_input_tokens_seen": 251827550, + "step": 11672, + "time_per_iteration": 2.5969393253326416 + }, + { + "auxiliary_loss_clip": 0.01043844, + "auxiliary_loss_mlp": 0.01029955, + "balance_loss_clip": 1.02223444, + "balance_loss_mlp": 1.01845765, + "epoch": 0.701818728393206, + "flos": 18332828847360.0, + "grad_norm": 1.5652991752793726, + "language_loss": 0.8688938, + "learning_rate": 8.622684419164883e-07, + "loss": 0.88963175, + "num_input_tokens_seen": 251844880, + "step": 11673, + "time_per_iteration": 2.5625197887420654 + }, + { + "auxiliary_loss_clip": 0.0104679, + "auxiliary_loss_mlp": 0.01023292, + "balance_loss_clip": 1.02328086, + "balance_loss_mlp": 1.01235473, + "epoch": 0.701878851645874, + "flos": 17384212615680.0, + "grad_norm": 1.8222614420257297, + "language_loss": 0.73027748, + "learning_rate": 8.619481583723399e-07, + "loss": 0.75097835, + "num_input_tokens_seen": 251861025, + "step": 11674, + "time_per_iteration": 4.167157411575317 + }, + { + "auxiliary_loss_clip": 0.01043323, + "auxiliary_loss_mlp": 0.00747504, + "balance_loss_clip": 1.02509904, + "balance_loss_mlp": 1.00043774, + "epoch": 0.701938974898542, + "flos": 23915501481600.0, + "grad_norm": 1.774764747366991, + "language_loss": 0.72218168, + "learning_rate": 8.616279179832329e-07, + "loss": 0.74009001, + "num_input_tokens_seen": 251880175, + "step": 11675, + "time_per_iteration": 2.5842506885528564 + }, + { + "auxiliary_loss_clip": 0.010274, + "auxiliary_loss_mlp": 0.01027539, + "balance_loss_clip": 1.02402282, + "balance_loss_mlp": 1.01613069, + "epoch": 0.70199909815121, + "flos": 21795586652160.0, + "grad_norm": 1.9507672902060595, + "language_loss": 0.51232147, + "learning_rate": 8.613077207613078e-07, + "loss": 0.53287089, + "num_input_tokens_seen": 251899005, + "step": 11676, + "time_per_iteration": 2.714399576187134 + }, + { + "auxiliary_loss_clip": 0.00987601, + "auxiliary_loss_mlp": 0.00746729, + "balance_loss_clip": 1.00268936, + "balance_loss_mlp": 1.00084531, + "epoch": 0.702059221403878, + "flos": 71715047109120.0, + "grad_norm": 0.7423364396421471, + "language_loss": 0.59226108, + "learning_rate": 8.609875667187079e-07, + "loss": 0.60960436, + "num_input_tokens_seen": 251966790, + "step": 11677, + "time_per_iteration": 3.2719180583953857 + }, + { + "auxiliary_loss_clip": 0.01047856, + "auxiliary_loss_mlp": 0.01030054, + "balance_loss_clip": 1.02289772, + "balance_loss_mlp": 1.01781678, + "epoch": 0.7021193446565459, + "flos": 28111052649600.0, + "grad_norm": 2.187426546002761, + "language_loss": 0.62676549, + "learning_rate": 8.606674558675737e-07, + "loss": 0.64754456, + "num_input_tokens_seen": 251989315, + "step": 11678, + "time_per_iteration": 2.596635580062866 + }, + { + "auxiliary_loss_clip": 0.01062226, + "auxiliary_loss_mlp": 0.01029598, + "balance_loss_clip": 1.02460384, + "balance_loss_mlp": 1.01873815, + "epoch": 0.7021794679092139, + "flos": 22924905229440.0, + "grad_norm": 1.6026191097887503, + "language_loss": 0.79175359, + "learning_rate": 8.603473882200444e-07, + "loss": 0.81267184, + "num_input_tokens_seen": 252006620, + "step": 11679, + "time_per_iteration": 2.4913957118988037 + }, + { + "auxiliary_loss_clip": 0.01031917, + "auxiliary_loss_mlp": 0.01039905, + "balance_loss_clip": 1.02164078, + "balance_loss_mlp": 1.028759, + "epoch": 0.7022395911618818, + "flos": 18077827219200.0, + "grad_norm": 2.156569774081089, + "language_loss": 0.70742255, + "learning_rate": 8.600273637882567e-07, + "loss": 0.72814077, + "num_input_tokens_seen": 252024570, + "step": 11680, + "time_per_iteration": 2.541273593902588 + }, + { + "auxiliary_loss_clip": 0.01022336, + "auxiliary_loss_mlp": 0.01037083, + "balance_loss_clip": 1.02225447, + "balance_loss_mlp": 1.02449417, + "epoch": 0.7022997144145499, + "flos": 16034294661120.0, + "grad_norm": 1.7226589071428782, + "language_loss": 0.74887359, + "learning_rate": 8.597073825843446e-07, + "loss": 0.76946777, + "num_input_tokens_seen": 252042775, + "step": 11681, + "time_per_iteration": 2.654083251953125 + }, + { + "auxiliary_loss_clip": 0.01047233, + "auxiliary_loss_mlp": 0.01031194, + "balance_loss_clip": 1.02635908, + "balance_loss_mlp": 1.02124572, + "epoch": 0.7023598376672178, + "flos": 26468678160000.0, + "grad_norm": 1.9279888359343686, + "language_loss": 0.76894212, + "learning_rate": 8.593874446204434e-07, + "loss": 0.78972638, + "num_input_tokens_seen": 252063690, + "step": 11682, + "time_per_iteration": 2.6700026988983154 + }, + { + "auxiliary_loss_clip": 0.01037029, + "auxiliary_loss_mlp": 0.00747581, + "balance_loss_clip": 1.02616048, + "balance_loss_mlp": 1.00042868, + "epoch": 0.7024199609198858, + "flos": 17055917285760.0, + "grad_norm": 2.1385185738406247, + "language_loss": 0.73113382, + "learning_rate": 8.590675499086841e-07, + "loss": 0.74897987, + "num_input_tokens_seen": 252080335, + "step": 11683, + "time_per_iteration": 2.6963577270507812 + }, + { + "auxiliary_loss_clip": 0.01035004, + "auxiliary_loss_mlp": 0.01026471, + "balance_loss_clip": 1.02654338, + "balance_loss_mlp": 1.01489544, + "epoch": 0.7024800841725537, + "flos": 25849039616640.0, + "grad_norm": 1.7851702946750967, + "language_loss": 0.71951771, + "learning_rate": 8.587476984611976e-07, + "loss": 0.74013257, + "num_input_tokens_seen": 252101075, + "step": 11684, + "time_per_iteration": 2.7650105953216553 + }, + { + "auxiliary_loss_clip": 0.01055344, + "auxiliary_loss_mlp": 0.01032553, + "balance_loss_clip": 1.02644551, + "balance_loss_mlp": 1.02138901, + "epoch": 0.7025402074252217, + "flos": 23513014609920.0, + "grad_norm": 1.731706918190408, + "language_loss": 0.71658599, + "learning_rate": 8.584278902901128e-07, + "loss": 0.7374649, + "num_input_tokens_seen": 252120510, + "step": 11685, + "time_per_iteration": 2.6278648376464844 + }, + { + "auxiliary_loss_clip": 0.01048083, + "auxiliary_loss_mlp": 0.01028761, + "balance_loss_clip": 1.02289164, + "balance_loss_mlp": 1.0183537, + "epoch": 0.7026003306778896, + "flos": 20150985519360.0, + "grad_norm": 1.7375881882177973, + "language_loss": 0.84559631, + "learning_rate": 8.581081254075582e-07, + "loss": 0.86636472, + "num_input_tokens_seen": 252137590, + "step": 11686, + "time_per_iteration": 2.5642054080963135 + }, + { + "auxiliary_loss_clip": 0.00999712, + "auxiliary_loss_mlp": 0.01004625, + "balance_loss_clip": 1.00337863, + "balance_loss_mlp": 1.00368297, + "epoch": 0.7026604539305576, + "flos": 64772400712320.0, + "grad_norm": 0.9847081405936408, + "language_loss": 0.69904619, + "learning_rate": 8.577884038256566e-07, + "loss": 0.71908957, + "num_input_tokens_seen": 252199830, + "step": 11687, + "time_per_iteration": 3.3052382469177246 + }, + { + "auxiliary_loss_clip": 0.01025114, + "auxiliary_loss_mlp": 0.01027902, + "balance_loss_clip": 1.02204204, + "balance_loss_mlp": 1.01673853, + "epoch": 0.7027205771832256, + "flos": 21871466133120.0, + "grad_norm": 1.8356982391235994, + "language_loss": 0.77572006, + "learning_rate": 8.574687255565329e-07, + "loss": 0.79625022, + "num_input_tokens_seen": 252217200, + "step": 11688, + "time_per_iteration": 2.656259298324585 + }, + { + "auxiliary_loss_clip": 0.01062994, + "auxiliary_loss_mlp": 0.0102906, + "balance_loss_clip": 1.0246098, + "balance_loss_mlp": 1.0179739, + "epoch": 0.7027807004358936, + "flos": 23367791923200.0, + "grad_norm": 2.0990335229228037, + "language_loss": 0.68987167, + "learning_rate": 8.571490906123107e-07, + "loss": 0.71079218, + "num_input_tokens_seen": 252236105, + "step": 11689, + "time_per_iteration": 2.5483670234680176 + }, + { + "auxiliary_loss_clip": 0.0104564, + "auxiliary_loss_mlp": 0.01032092, + "balance_loss_clip": 1.02616179, + "balance_loss_mlp": 1.02050543, + "epoch": 0.7028408236885616, + "flos": 15304266645120.0, + "grad_norm": 2.460772246371367, + "language_loss": 0.80011678, + "learning_rate": 8.568294990051086e-07, + "loss": 0.82089412, + "num_input_tokens_seen": 252253315, + "step": 11690, + "time_per_iteration": 2.59509015083313 + }, + { + "auxiliary_loss_clip": 0.01065456, + "auxiliary_loss_mlp": 0.01029311, + "balance_loss_clip": 1.02650571, + "balance_loss_mlp": 1.01870704, + "epoch": 0.7029009469412295, + "flos": 22018197191040.0, + "grad_norm": 2.1455555726445454, + "language_loss": 0.75951201, + "learning_rate": 8.56509950747047e-07, + "loss": 0.78045964, + "num_input_tokens_seen": 252272765, + "step": 11691, + "time_per_iteration": 2.576338529586792 + }, + { + "auxiliary_loss_clip": 0.01038378, + "auxiliary_loss_mlp": 0.01028462, + "balance_loss_clip": 1.02443182, + "balance_loss_mlp": 1.01809716, + "epoch": 0.7029610701938975, + "flos": 21835519597440.0, + "grad_norm": 1.82457299640793, + "language_loss": 0.81565332, + "learning_rate": 8.561904458502429e-07, + "loss": 0.83632177, + "num_input_tokens_seen": 252290510, + "step": 11692, + "time_per_iteration": 2.609780788421631 + }, + { + "auxiliary_loss_clip": 0.01043172, + "auxiliary_loss_mlp": 0.01026591, + "balance_loss_clip": 1.02481914, + "balance_loss_mlp": 1.01556993, + "epoch": 0.7030211934465654, + "flos": 19135647774720.0, + "grad_norm": 1.55771001171639, + "language_loss": 0.76471376, + "learning_rate": 8.558709843268111e-07, + "loss": 0.78541136, + "num_input_tokens_seen": 252309365, + "step": 11693, + "time_per_iteration": 2.6632583141326904 + }, + { + "auxiliary_loss_clip": 0.01044712, + "auxiliary_loss_mlp": 0.01030258, + "balance_loss_clip": 1.02773833, + "balance_loss_mlp": 1.0194931, + "epoch": 0.7030813166992335, + "flos": 38546010766080.0, + "grad_norm": 1.4956266457594674, + "language_loss": 0.68640435, + "learning_rate": 8.55551566188866e-07, + "loss": 0.70715404, + "num_input_tokens_seen": 252333010, + "step": 11694, + "time_per_iteration": 4.472970008850098 + }, + { + "auxiliary_loss_clip": 0.01065152, + "auxiliary_loss_mlp": 0.01026669, + "balance_loss_clip": 1.02571249, + "balance_loss_mlp": 1.01568341, + "epoch": 0.7031414399519014, + "flos": 14720897859840.0, + "grad_norm": 2.760610605594587, + "language_loss": 0.7603237, + "learning_rate": 8.552321914485203e-07, + "loss": 0.78124189, + "num_input_tokens_seen": 252351330, + "step": 11695, + "time_per_iteration": 2.525448799133301 + }, + { + "auxiliary_loss_clip": 0.01049178, + "auxiliary_loss_mlp": 0.01040822, + "balance_loss_clip": 1.02896667, + "balance_loss_mlp": 1.02942562, + "epoch": 0.7032015632045694, + "flos": 14027247342720.0, + "grad_norm": 2.008878477495967, + "language_loss": 0.73555404, + "learning_rate": 8.549128601178852e-07, + "loss": 0.75645399, + "num_input_tokens_seen": 252369580, + "step": 11696, + "time_per_iteration": 4.321025371551514 + }, + { + "auxiliary_loss_clip": 0.01048503, + "auxiliary_loss_mlp": 0.01026383, + "balance_loss_clip": 1.02741027, + "balance_loss_mlp": 1.0148313, + "epoch": 0.7032616864572373, + "flos": 27637175496960.0, + "grad_norm": 1.5792388646657622, + "language_loss": 0.75423223, + "learning_rate": 8.545935722090693e-07, + "loss": 0.77498108, + "num_input_tokens_seen": 252390525, + "step": 11697, + "time_per_iteration": 2.709616184234619 + }, + { + "auxiliary_loss_clip": 0.01012308, + "auxiliary_loss_mlp": 0.01036703, + "balance_loss_clip": 1.02655244, + "balance_loss_mlp": 1.02366781, + "epoch": 0.7033218097099053, + "flos": 17967294092160.0, + "grad_norm": 1.7023637005602497, + "language_loss": 0.80618739, + "learning_rate": 8.542743277341793e-07, + "loss": 0.82667756, + "num_input_tokens_seen": 252407470, + "step": 11698, + "time_per_iteration": 2.8134806156158447 + }, + { + "auxiliary_loss_clip": 0.01038731, + "auxiliary_loss_mlp": 0.01038487, + "balance_loss_clip": 1.02373195, + "balance_loss_mlp": 1.02607119, + "epoch": 0.7033819329625732, + "flos": 19501721233920.0, + "grad_norm": 1.404611903583869, + "language_loss": 0.8479923, + "learning_rate": 8.539551267053222e-07, + "loss": 0.86876446, + "num_input_tokens_seen": 252427025, + "step": 11699, + "time_per_iteration": 2.606098175048828 + }, + { + "auxiliary_loss_clip": 0.01046667, + "auxiliary_loss_mlp": 0.01031914, + "balance_loss_clip": 1.02462006, + "balance_loss_mlp": 1.01952195, + "epoch": 0.7034420562152413, + "flos": 23987645948160.0, + "grad_norm": 1.7586970278274312, + "language_loss": 0.78766268, + "learning_rate": 8.53635969134601e-07, + "loss": 0.80844849, + "num_input_tokens_seen": 252445410, + "step": 11700, + "time_per_iteration": 2.6256494522094727 + }, + { + "auxiliary_loss_clip": 0.01053751, + "auxiliary_loss_mlp": 0.010275, + "balance_loss_clip": 1.02483571, + "balance_loss_mlp": 1.01568079, + "epoch": 0.7035021794679092, + "flos": 35043427756800.0, + "grad_norm": 1.7321185976810796, + "language_loss": 0.74253124, + "learning_rate": 8.533168550341186e-07, + "loss": 0.76334375, + "num_input_tokens_seen": 252463905, + "step": 11701, + "time_per_iteration": 2.6568832397460938 + }, + { + "auxiliary_loss_clip": 0.01056943, + "auxiliary_loss_mlp": 0.01030038, + "balance_loss_clip": 1.02655518, + "balance_loss_mlp": 1.0177176, + "epoch": 0.7035623027205772, + "flos": 10997428164480.0, + "grad_norm": 2.2824782023327668, + "language_loss": 0.84361732, + "learning_rate": 8.529977844159769e-07, + "loss": 0.86448717, + "num_input_tokens_seen": 252478655, + "step": 11702, + "time_per_iteration": 2.5622384548187256 + }, + { + "auxiliary_loss_clip": 0.0106337, + "auxiliary_loss_mlp": 0.01035201, + "balance_loss_clip": 1.02478957, + "balance_loss_mlp": 1.02378058, + "epoch": 0.7036224259732452, + "flos": 23623727304960.0, + "grad_norm": 1.6737581328412308, + "language_loss": 0.6109854, + "learning_rate": 8.526787572922738e-07, + "loss": 0.63197112, + "num_input_tokens_seen": 252498740, + "step": 11703, + "time_per_iteration": 2.554426670074463 + }, + { + "auxiliary_loss_clip": 0.01063137, + "auxiliary_loss_mlp": 0.01026048, + "balance_loss_clip": 1.02399993, + "balance_loss_mlp": 1.01507449, + "epoch": 0.7036825492259131, + "flos": 31686175175040.0, + "grad_norm": 1.9225990646022377, + "language_loss": 0.6091274, + "learning_rate": 8.523597736751067e-07, + "loss": 0.63001931, + "num_input_tokens_seen": 252517800, + "step": 11704, + "time_per_iteration": 2.586171865463257 + }, + { + "auxiliary_loss_clip": 0.01050349, + "auxiliary_loss_mlp": 0.01028351, + "balance_loss_clip": 1.02458489, + "balance_loss_mlp": 1.0188266, + "epoch": 0.7037426724785811, + "flos": 30192866127360.0, + "grad_norm": 1.5450618674760748, + "language_loss": 0.71182537, + "learning_rate": 8.520408335765719e-07, + "loss": 0.73261237, + "num_input_tokens_seen": 252539620, + "step": 11705, + "time_per_iteration": 2.6862008571624756 + }, + { + "auxiliary_loss_clip": 0.01051764, + "auxiliary_loss_mlp": 0.01031161, + "balance_loss_clip": 1.02528274, + "balance_loss_mlp": 1.02017605, + "epoch": 0.703802795731249, + "flos": 24311523905280.0, + "grad_norm": 1.721113231681739, + "language_loss": 0.61859012, + "learning_rate": 8.517219370087645e-07, + "loss": 0.63941944, + "num_input_tokens_seen": 252557300, + "step": 11706, + "time_per_iteration": 2.654371976852417 + }, + { + "auxiliary_loss_clip": 0.01056127, + "auxiliary_loss_mlp": 0.01027852, + "balance_loss_clip": 1.02590394, + "balance_loss_mlp": 1.01702833, + "epoch": 0.7038629189839171, + "flos": 22528954632960.0, + "grad_norm": 1.8234194314939143, + "language_loss": 0.67869639, + "learning_rate": 8.514030839837756e-07, + "loss": 0.6995362, + "num_input_tokens_seen": 252576715, + "step": 11707, + "time_per_iteration": 2.616764783859253 + }, + { + "auxiliary_loss_clip": 0.01061383, + "auxiliary_loss_mlp": 0.01028642, + "balance_loss_clip": 1.02436185, + "balance_loss_mlp": 1.01833653, + "epoch": 0.703923042236585, + "flos": 26250484993920.0, + "grad_norm": 2.2843537232982176, + "language_loss": 0.7620492, + "learning_rate": 8.510842745136974e-07, + "loss": 0.78294945, + "num_input_tokens_seen": 252596190, + "step": 11708, + "time_per_iteration": 2.530674457550049 + }, + { + "auxiliary_loss_clip": 0.01038229, + "auxiliary_loss_mlp": 0.01030439, + "balance_loss_clip": 1.02387285, + "balance_loss_mlp": 1.01951361, + "epoch": 0.703983165489253, + "flos": 19390254353280.0, + "grad_norm": 3.3130770129607203, + "language_loss": 0.72280377, + "learning_rate": 8.50765508610619e-07, + "loss": 0.74349046, + "num_input_tokens_seen": 252613410, + "step": 11709, + "time_per_iteration": 2.6477062702178955 + }, + { + "auxiliary_loss_clip": 0.01052271, + "auxiliary_loss_mlp": 0.01025545, + "balance_loss_clip": 1.0246594, + "balance_loss_mlp": 1.01539445, + "epoch": 0.7040432887419209, + "flos": 16683630773760.0, + "grad_norm": 2.1271175001855944, + "language_loss": 0.78836614, + "learning_rate": 8.504467862866267e-07, + "loss": 0.80914432, + "num_input_tokens_seen": 252629150, + "step": 11710, + "time_per_iteration": 2.5068135261535645 + }, + { + "auxiliary_loss_clip": 0.01054906, + "auxiliary_loss_mlp": 0.01031397, + "balance_loss_clip": 1.02563524, + "balance_loss_mlp": 1.01955938, + "epoch": 0.7041034119945889, + "flos": 21141402203520.0, + "grad_norm": 1.5458517914705003, + "language_loss": 0.77105689, + "learning_rate": 8.501281075538076e-07, + "loss": 0.79191995, + "num_input_tokens_seen": 252648225, + "step": 11711, + "time_per_iteration": 4.229925870895386 + }, + { + "auxiliary_loss_clip": 0.01027556, + "auxiliary_loss_mlp": 0.01029354, + "balance_loss_clip": 1.02269125, + "balance_loss_mlp": 1.0197401, + "epoch": 0.7041635352472568, + "flos": 16910299549440.0, + "grad_norm": 2.2389509190448496, + "language_loss": 0.74178863, + "learning_rate": 8.498094724242457e-07, + "loss": 0.76235771, + "num_input_tokens_seen": 252665380, + "step": 11712, + "time_per_iteration": 2.6704227924346924 + }, + { + "auxiliary_loss_clip": 0.00965845, + "auxiliary_loss_mlp": 0.01005164, + "balance_loss_clip": 1.00145674, + "balance_loss_mlp": 1.00416315, + "epoch": 0.7042236584999249, + "flos": 71681219475840.0, + "grad_norm": 0.884344151547581, + "language_loss": 0.64595449, + "learning_rate": 8.494908809100247e-07, + "loss": 0.66566455, + "num_input_tokens_seen": 252727950, + "step": 11713, + "time_per_iteration": 3.2721967697143555 + }, + { + "auxiliary_loss_clip": 0.01044106, + "auxiliary_loss_mlp": 0.01024225, + "balance_loss_clip": 1.02111244, + "balance_loss_mlp": 1.014027, + "epoch": 0.7042837817525928, + "flos": 28658187590400.0, + "grad_norm": 2.367588986381566, + "language_loss": 0.73038715, + "learning_rate": 8.49172333023225e-07, + "loss": 0.75107044, + "num_input_tokens_seen": 252746770, + "step": 11714, + "time_per_iteration": 2.649656295776367 + }, + { + "auxiliary_loss_clip": 0.01043397, + "auxiliary_loss_mlp": 0.00747781, + "balance_loss_clip": 1.02492714, + "balance_loss_mlp": 1.00054502, + "epoch": 0.7043439050052608, + "flos": 19753562465280.0, + "grad_norm": 1.6769385428721124, + "language_loss": 0.79783857, + "learning_rate": 8.488538287759248e-07, + "loss": 0.81575036, + "num_input_tokens_seen": 252765610, + "step": 11715, + "time_per_iteration": 2.670128583908081 + }, + { + "auxiliary_loss_clip": 0.01038158, + "auxiliary_loss_mlp": 0.01034727, + "balance_loss_clip": 1.02382612, + "balance_loss_mlp": 1.02290773, + "epoch": 0.7044040282579288, + "flos": 11538529620480.0, + "grad_norm": 2.109444874705394, + "language_loss": 0.71549904, + "learning_rate": 8.485353681802037e-07, + "loss": 0.73622787, + "num_input_tokens_seen": 252781610, + "step": 11716, + "time_per_iteration": 2.791747808456421 + }, + { + "auxiliary_loss_clip": 0.01034106, + "auxiliary_loss_mlp": 0.01031164, + "balance_loss_clip": 1.03089714, + "balance_loss_mlp": 1.01979709, + "epoch": 0.7044641515105967, + "flos": 33656126722560.0, + "grad_norm": 4.277165840327747, + "language_loss": 0.6635915, + "learning_rate": 8.482169512481358e-07, + "loss": 0.68424422, + "num_input_tokens_seen": 252800600, + "step": 11717, + "time_per_iteration": 2.886719226837158 + }, + { + "auxiliary_loss_clip": 0.01064568, + "auxiliary_loss_mlp": 0.01030948, + "balance_loss_clip": 1.02599084, + "balance_loss_mlp": 1.02027917, + "epoch": 0.7045242747632647, + "flos": 26723859356160.0, + "grad_norm": 1.4969840482747072, + "language_loss": 0.74350166, + "learning_rate": 8.478985779917967e-07, + "loss": 0.76445687, + "num_input_tokens_seen": 252822310, + "step": 11718, + "time_per_iteration": 2.6340527534484863 + }, + { + "auxiliary_loss_clip": 0.01047448, + "auxiliary_loss_mlp": 0.01030822, + "balance_loss_clip": 1.02295542, + "balance_loss_mlp": 1.02069521, + "epoch": 0.7045843980159326, + "flos": 26797655848320.0, + "grad_norm": 367.5895296786114, + "language_loss": 0.79826224, + "learning_rate": 8.475802484232606e-07, + "loss": 0.81904495, + "num_input_tokens_seen": 252842355, + "step": 11719, + "time_per_iteration": 2.8796026706695557 + }, + { + "auxiliary_loss_clip": 0.01054537, + "auxiliary_loss_mlp": 0.0103453, + "balance_loss_clip": 1.02675962, + "balance_loss_mlp": 1.02325916, + "epoch": 0.7046445212686007, + "flos": 41574824363520.0, + "grad_norm": 1.879855065864206, + "language_loss": 0.65707505, + "learning_rate": 8.472619625545951e-07, + "loss": 0.6779657, + "num_input_tokens_seen": 252866785, + "step": 11720, + "time_per_iteration": 2.865044593811035 + }, + { + "auxiliary_loss_clip": 0.01044453, + "auxiliary_loss_mlp": 0.01030324, + "balance_loss_clip": 1.02574682, + "balance_loss_mlp": 1.01954722, + "epoch": 0.7047046445212686, + "flos": 15560166113280.0, + "grad_norm": 2.665663090397423, + "language_loss": 0.79925883, + "learning_rate": 8.46943720397872e-07, + "loss": 0.82000661, + "num_input_tokens_seen": 252881870, + "step": 11721, + "time_per_iteration": 4.309919357299805 + }, + { + "auxiliary_loss_clip": 0.00981651, + "auxiliary_loss_mlp": 0.01003672, + "balance_loss_clip": 1.00508809, + "balance_loss_mlp": 1.00251532, + "epoch": 0.7047647677739366, + "flos": 70410269571840.0, + "grad_norm": 0.7610524532532887, + "language_loss": 0.64825183, + "learning_rate": 8.466255219651582e-07, + "loss": 0.66810507, + "num_input_tokens_seen": 252951300, + "step": 11722, + "time_per_iteration": 3.3324766159057617 + }, + { + "auxiliary_loss_clip": 0.01041372, + "auxiliary_loss_mlp": 0.01029851, + "balance_loss_clip": 1.02517998, + "balance_loss_mlp": 1.01931858, + "epoch": 0.7048248910266045, + "flos": 23660032976640.0, + "grad_norm": 1.5286691592613413, + "language_loss": 0.65939045, + "learning_rate": 8.463073672685211e-07, + "loss": 0.68010265, + "num_input_tokens_seen": 252971400, + "step": 11723, + "time_per_iteration": 2.6825530529022217 + }, + { + "auxiliary_loss_clip": 0.01032505, + "auxiliary_loss_mlp": 0.01029157, + "balance_loss_clip": 1.0243187, + "balance_loss_mlp": 1.01793349, + "epoch": 0.7048850142792725, + "flos": 21397158017280.0, + "grad_norm": 1.6744940069587346, + "language_loss": 0.80992162, + "learning_rate": 8.459892563200235e-07, + "loss": 0.83053827, + "num_input_tokens_seen": 252989475, + "step": 11724, + "time_per_iteration": 2.685370445251465 + }, + { + "auxiliary_loss_clip": 0.01053029, + "auxiliary_loss_mlp": 0.01031952, + "balance_loss_clip": 1.02424765, + "balance_loss_mlp": 1.02105582, + "epoch": 0.7049451375319404, + "flos": 21648101408640.0, + "grad_norm": 1.6832148514315268, + "language_loss": 0.73021555, + "learning_rate": 8.456711891317296e-07, + "loss": 0.75106537, + "num_input_tokens_seen": 253007220, + "step": 11725, + "time_per_iteration": 2.6797921657562256 + }, + { + "auxiliary_loss_clip": 0.01013798, + "auxiliary_loss_mlp": 0.01030332, + "balance_loss_clip": 1.02115464, + "balance_loss_mlp": 1.01854193, + "epoch": 0.7050052607846085, + "flos": 14866802904960.0, + "grad_norm": 1.9520546216165988, + "language_loss": 0.78611422, + "learning_rate": 8.453531657156998e-07, + "loss": 0.80655551, + "num_input_tokens_seen": 253025410, + "step": 11726, + "time_per_iteration": 2.7681400775909424 + }, + { + "auxiliary_loss_clip": 0.01043394, + "auxiliary_loss_mlp": 0.01029621, + "balance_loss_clip": 1.02390504, + "balance_loss_mlp": 1.01952362, + "epoch": 0.7050653840372764, + "flos": 19241763528960.0, + "grad_norm": 2.0280990328482784, + "language_loss": 0.7073977, + "learning_rate": 8.450351860839931e-07, + "loss": 0.72812778, + "num_input_tokens_seen": 253043305, + "step": 11727, + "time_per_iteration": 2.7726938724517822 + }, + { + "auxiliary_loss_clip": 0.01056655, + "auxiliary_loss_mlp": 0.00747494, + "balance_loss_clip": 1.02220452, + "balance_loss_mlp": 1.00033164, + "epoch": 0.7051255072899444, + "flos": 27780422935680.0, + "grad_norm": 1.4864901877288514, + "language_loss": 0.6913802, + "learning_rate": 8.44717250248668e-07, + "loss": 0.70942175, + "num_input_tokens_seen": 253062790, + "step": 11728, + "time_per_iteration": 2.612417697906494 + }, + { + "auxiliary_loss_clip": 0.01032162, + "auxiliary_loss_mlp": 0.00747651, + "balance_loss_clip": 1.02484715, + "balance_loss_mlp": 1.00036979, + "epoch": 0.7051856305426124, + "flos": 27892033470720.0, + "grad_norm": 1.724192634952849, + "language_loss": 0.73182893, + "learning_rate": 8.443993582217803e-07, + "loss": 0.74962711, + "num_input_tokens_seen": 253082055, + "step": 11729, + "time_per_iteration": 2.71370530128479 + }, + { + "auxiliary_loss_clip": 0.01053878, + "auxiliary_loss_mlp": 0.01027747, + "balance_loss_clip": 1.03038251, + "balance_loss_mlp": 1.01615953, + "epoch": 0.7052457537952803, + "flos": 25043563082880.0, + "grad_norm": 1.5957875989191133, + "language_loss": 0.77968115, + "learning_rate": 8.440815100153862e-07, + "loss": 0.80049735, + "num_input_tokens_seen": 253102575, + "step": 11730, + "time_per_iteration": 2.69228196144104 + }, + { + "auxiliary_loss_clip": 0.01064128, + "auxiliary_loss_mlp": 0.01030314, + "balance_loss_clip": 1.02492809, + "balance_loss_mlp": 1.01920986, + "epoch": 0.7053058770479483, + "flos": 21871717528320.0, + "grad_norm": 2.190791381165566, + "language_loss": 0.62883115, + "learning_rate": 8.437637056415359e-07, + "loss": 0.64977556, + "num_input_tokens_seen": 253121290, + "step": 11731, + "time_per_iteration": 2.6036133766174316 + }, + { + "auxiliary_loss_clip": 0.01010781, + "auxiliary_loss_mlp": 0.01028308, + "balance_loss_clip": 1.02304173, + "balance_loss_mlp": 1.01584494, + "epoch": 0.7053660003006162, + "flos": 16398716094720.0, + "grad_norm": 2.1182405815928242, + "language_loss": 0.74317515, + "learning_rate": 8.434459451122815e-07, + "loss": 0.76356608, + "num_input_tokens_seen": 253139720, + "step": 11732, + "time_per_iteration": 2.7164459228515625 + }, + { + "auxiliary_loss_clip": 0.01048741, + "auxiliary_loss_mlp": 0.01027018, + "balance_loss_clip": 1.02371049, + "balance_loss_mlp": 1.01639664, + "epoch": 0.7054261235532843, + "flos": 22711560399360.0, + "grad_norm": 1.437838638728755, + "language_loss": 0.70912158, + "learning_rate": 8.431282284396735e-07, + "loss": 0.72987926, + "num_input_tokens_seen": 253160250, + "step": 11733, + "time_per_iteration": 2.620150327682495 + }, + { + "auxiliary_loss_clip": 0.01031753, + "auxiliary_loss_mlp": 0.01032866, + "balance_loss_clip": 1.02363968, + "balance_loss_mlp": 1.02172565, + "epoch": 0.7054862468059522, + "flos": 13589711775360.0, + "grad_norm": 1.8979208866323873, + "language_loss": 0.73505342, + "learning_rate": 8.428105556357583e-07, + "loss": 0.75569963, + "num_input_tokens_seen": 253178710, + "step": 11734, + "time_per_iteration": 2.63602352142334 + }, + { + "auxiliary_loss_clip": 0.01027636, + "auxiliary_loss_mlp": 0.01038605, + "balance_loss_clip": 1.02292514, + "balance_loss_mlp": 1.0255338, + "epoch": 0.7055463700586202, + "flos": 15880704105600.0, + "grad_norm": 2.3548546692533425, + "language_loss": 0.68909651, + "learning_rate": 8.424929267125829e-07, + "loss": 0.70975894, + "num_input_tokens_seen": 253194805, + "step": 11735, + "time_per_iteration": 2.6219353675842285 + }, + { + "auxiliary_loss_clip": 0.01034679, + "auxiliary_loss_mlp": 0.01032751, + "balance_loss_clip": 1.02256751, + "balance_loss_mlp": 1.01995993, + "epoch": 0.7056064933112881, + "flos": 23076161400960.0, + "grad_norm": 1.6843272076887517, + "language_loss": 0.72124612, + "learning_rate": 8.421753416821933e-07, + "loss": 0.74192047, + "num_input_tokens_seen": 253213895, + "step": 11736, + "time_per_iteration": 2.6445059776306152 + }, + { + "auxiliary_loss_clip": 0.01040812, + "auxiliary_loss_mlp": 0.01026987, + "balance_loss_clip": 1.0249331, + "balance_loss_mlp": 1.01708078, + "epoch": 0.7056666165639561, + "flos": 24057168721920.0, + "grad_norm": 1.6972036864998536, + "language_loss": 0.69410747, + "learning_rate": 8.41857800556629e-07, + "loss": 0.71478546, + "num_input_tokens_seen": 253231620, + "step": 11737, + "time_per_iteration": 2.649217367172241 + }, + { + "auxiliary_loss_clip": 0.01038592, + "auxiliary_loss_mlp": 0.01035535, + "balance_loss_clip": 1.02769446, + "balance_loss_mlp": 1.02359056, + "epoch": 0.705726739816624, + "flos": 17493237371520.0, + "grad_norm": 4.293423713730972, + "language_loss": 0.68034446, + "learning_rate": 8.415403033479332e-07, + "loss": 0.70108569, + "num_input_tokens_seen": 253249590, + "step": 11738, + "time_per_iteration": 2.6387903690338135 + }, + { + "auxiliary_loss_clip": 0.01063155, + "auxiliary_loss_mlp": 0.01032148, + "balance_loss_clip": 1.02483106, + "balance_loss_mlp": 1.02017927, + "epoch": 0.7057868630692921, + "flos": 51350426472960.0, + "grad_norm": 2.2132293716427878, + "language_loss": 0.74985248, + "learning_rate": 8.41222850068145e-07, + "loss": 0.77080548, + "num_input_tokens_seen": 253273870, + "step": 11739, + "time_per_iteration": 2.9071266651153564 + }, + { + "auxiliary_loss_clip": 0.01035274, + "auxiliary_loss_mlp": 0.00747644, + "balance_loss_clip": 1.02324533, + "balance_loss_mlp": 1.00035, + "epoch": 0.70584698632196, + "flos": 26102963836800.0, + "grad_norm": 1.6455655387840153, + "language_loss": 0.71144295, + "learning_rate": 8.409054407293032e-07, + "loss": 0.72927207, + "num_input_tokens_seen": 253293720, + "step": 11740, + "time_per_iteration": 2.8042662143707275 + }, + { + "auxiliary_loss_clip": 0.01033863, + "auxiliary_loss_mlp": 0.01024986, + "balance_loss_clip": 1.0260787, + "balance_loss_mlp": 1.01493692, + "epoch": 0.705907109574628, + "flos": 21543134889600.0, + "grad_norm": 1.6893573511070177, + "language_loss": 0.81935108, + "learning_rate": 8.405880753434434e-07, + "loss": 0.83993959, + "num_input_tokens_seen": 253313700, + "step": 11741, + "time_per_iteration": 4.2620790004730225 + }, + { + "auxiliary_loss_clip": 0.01042951, + "auxiliary_loss_mlp": 0.01029759, + "balance_loss_clip": 1.02410316, + "balance_loss_mlp": 1.01811194, + "epoch": 0.705967232827296, + "flos": 22710842127360.0, + "grad_norm": 1.9249640235573797, + "language_loss": 0.77999806, + "learning_rate": 8.402707539225993e-07, + "loss": 0.80072516, + "num_input_tokens_seen": 253332425, + "step": 11742, + "time_per_iteration": 2.818939447402954 + }, + { + "auxiliary_loss_clip": 0.01066686, + "auxiliary_loss_mlp": 0.01031419, + "balance_loss_clip": 1.02643991, + "balance_loss_mlp": 1.02016556, + "epoch": 0.7060273560799639, + "flos": 28691225124480.0, + "grad_norm": 1.5632973700333388, + "language_loss": 0.64314747, + "learning_rate": 8.39953476478805e-07, + "loss": 0.66412854, + "num_input_tokens_seen": 253353620, + "step": 11743, + "time_per_iteration": 4.266103029251099 + }, + { + "auxiliary_loss_clip": 0.01034087, + "auxiliary_loss_mlp": 0.01030148, + "balance_loss_clip": 1.02156723, + "balance_loss_mlp": 1.01785183, + "epoch": 0.7060874793326319, + "flos": 15706178899200.0, + "grad_norm": 2.5610050951442935, + "language_loss": 0.65677816, + "learning_rate": 8.396362430240902e-07, + "loss": 0.6774205, + "num_input_tokens_seen": 253370930, + "step": 11744, + "time_per_iteration": 2.6097893714904785 + }, + { + "auxiliary_loss_clip": 0.01050009, + "auxiliary_loss_mlp": 0.01032716, + "balance_loss_clip": 1.02333903, + "balance_loss_mlp": 1.02178431, + "epoch": 0.7061476025852998, + "flos": 21506757390720.0, + "grad_norm": 2.109652963818619, + "language_loss": 0.63549268, + "learning_rate": 8.393190535704857e-07, + "loss": 0.65631998, + "num_input_tokens_seen": 253389810, + "step": 11745, + "time_per_iteration": 2.5883193016052246 + }, + { + "auxiliary_loss_clip": 0.01024763, + "auxiliary_loss_mlp": 0.01030178, + "balance_loss_clip": 1.02123427, + "balance_loss_mlp": 1.01956868, + "epoch": 0.7062077258379679, + "flos": 28181832399360.0, + "grad_norm": 2.125364359241082, + "language_loss": 0.71758324, + "learning_rate": 8.390019081300188e-07, + "loss": 0.73813272, + "num_input_tokens_seen": 253408685, + "step": 11746, + "time_per_iteration": 2.7317397594451904 + }, + { + "auxiliary_loss_clip": 0.01016581, + "auxiliary_loss_mlp": 0.01028059, + "balance_loss_clip": 1.02710462, + "balance_loss_mlp": 1.01709211, + "epoch": 0.7062678490906358, + "flos": 27853680723840.0, + "grad_norm": 1.3662146593965152, + "language_loss": 0.79312658, + "learning_rate": 8.386848067147175e-07, + "loss": 0.813573, + "num_input_tokens_seen": 253429685, + "step": 11747, + "time_per_iteration": 2.7549784183502197 + }, + { + "auxiliary_loss_clip": 0.01049295, + "auxiliary_loss_mlp": 0.0102715, + "balance_loss_clip": 1.02426875, + "balance_loss_mlp": 1.01676631, + "epoch": 0.7063279723433038, + "flos": 23184862934400.0, + "grad_norm": 1.9622920876056493, + "language_loss": 0.65000957, + "learning_rate": 8.383677493366031e-07, + "loss": 0.67077398, + "num_input_tokens_seen": 253448260, + "step": 11748, + "time_per_iteration": 2.7535057067871094 + }, + { + "auxiliary_loss_clip": 0.01021854, + "auxiliary_loss_mlp": 0.01035009, + "balance_loss_clip": 1.02240658, + "balance_loss_mlp": 1.02341008, + "epoch": 0.7063880955959717, + "flos": 20188655907840.0, + "grad_norm": 1.8388244221261705, + "language_loss": 0.79620576, + "learning_rate": 8.380507360077003e-07, + "loss": 0.81677437, + "num_input_tokens_seen": 253467725, + "step": 11749, + "time_per_iteration": 2.6734554767608643 + }, + { + "auxiliary_loss_clip": 0.01008024, + "auxiliary_loss_mlp": 0.01002679, + "balance_loss_clip": 1.00269508, + "balance_loss_mlp": 1.00167775, + "epoch": 0.7064482188486397, + "flos": 63668182763520.0, + "grad_norm": 0.7910377661055539, + "language_loss": 0.53996134, + "learning_rate": 8.377337667400304e-07, + "loss": 0.56006837, + "num_input_tokens_seen": 253526940, + "step": 11750, + "time_per_iteration": 3.272240161895752 + }, + { + "auxiliary_loss_clip": 0.01043626, + "auxiliary_loss_mlp": 0.01033321, + "balance_loss_clip": 1.02520621, + "balance_loss_mlp": 1.02192426, + "epoch": 0.7065083421013076, + "flos": 25191227894400.0, + "grad_norm": 1.9207513731408992, + "language_loss": 0.78823066, + "learning_rate": 8.37416841545612e-07, + "loss": 0.80900013, + "num_input_tokens_seen": 253546160, + "step": 11751, + "time_per_iteration": 2.6397478580474854 + }, + { + "auxiliary_loss_clip": 0.01029606, + "auxiliary_loss_mlp": 0.01024822, + "balance_loss_clip": 1.02379084, + "balance_loss_mlp": 1.01486254, + "epoch": 0.7065684653539757, + "flos": 22893699288960.0, + "grad_norm": 6.699183548564197, + "language_loss": 0.67780584, + "learning_rate": 8.370999604364634e-07, + "loss": 0.69835013, + "num_input_tokens_seen": 253565505, + "step": 11752, + "time_per_iteration": 2.7006337642669678 + }, + { + "auxiliary_loss_clip": 0.01011309, + "auxiliary_loss_mlp": 0.00747689, + "balance_loss_clip": 1.02329528, + "balance_loss_mlp": 1.00037527, + "epoch": 0.7066285886066436, + "flos": 23550254035200.0, + "grad_norm": 1.8619417506806126, + "language_loss": 0.76787698, + "learning_rate": 8.367831234246025e-07, + "loss": 0.78546691, + "num_input_tokens_seen": 253585125, + "step": 11753, + "time_per_iteration": 2.7581942081451416 + }, + { + "auxiliary_loss_clip": 0.01031377, + "auxiliary_loss_mlp": 0.00747391, + "balance_loss_clip": 1.02381778, + "balance_loss_mlp": 1.00034881, + "epoch": 0.7066887118593116, + "flos": 21069293650560.0, + "grad_norm": 1.7657338409372316, + "language_loss": 0.70695341, + "learning_rate": 8.364663305220405e-07, + "loss": 0.7247411, + "num_input_tokens_seen": 253604815, + "step": 11754, + "time_per_iteration": 2.872459650039673 + }, + { + "auxiliary_loss_clip": 0.01019795, + "auxiliary_loss_mlp": 0.01036281, + "balance_loss_clip": 1.02194047, + "balance_loss_mlp": 1.02413321, + "epoch": 0.7067488351119796, + "flos": 21176307244800.0, + "grad_norm": 2.0224460372876805, + "language_loss": 0.89249247, + "learning_rate": 8.361495817407919e-07, + "loss": 0.91305321, + "num_input_tokens_seen": 253622855, + "step": 11755, + "time_per_iteration": 2.8232553005218506 + }, + { + "auxiliary_loss_clip": 0.0104387, + "auxiliary_loss_mlp": 0.00747723, + "balance_loss_clip": 1.02512419, + "balance_loss_mlp": 1.00042844, + "epoch": 0.7068089583646475, + "flos": 20449224144000.0, + "grad_norm": 1.7214288403892017, + "language_loss": 0.79590869, + "learning_rate": 8.358328770928678e-07, + "loss": 0.81382459, + "num_input_tokens_seen": 253642760, + "step": 11756, + "time_per_iteration": 2.927539587020874 + }, + { + "auxiliary_loss_clip": 0.00972822, + "auxiliary_loss_mlp": 0.01002869, + "balance_loss_clip": 1.00624752, + "balance_loss_mlp": 1.00180817, + "epoch": 0.7068690816173155, + "flos": 59109179829120.0, + "grad_norm": 0.8235806123483984, + "language_loss": 0.60314727, + "learning_rate": 8.355162165902785e-07, + "loss": 0.62290418, + "num_input_tokens_seen": 253695685, + "step": 11757, + "time_per_iteration": 3.1089015007019043 + }, + { + "auxiliary_loss_clip": 0.01032284, + "auxiliary_loss_mlp": 0.01030008, + "balance_loss_clip": 1.02420425, + "balance_loss_mlp": 1.01953566, + "epoch": 0.7069292048699835, + "flos": 16251554073600.0, + "grad_norm": 1.6860505767134193, + "language_loss": 0.80327773, + "learning_rate": 8.351996002450307e-07, + "loss": 0.8239007, + "num_input_tokens_seen": 253713305, + "step": 11758, + "time_per_iteration": 4.2753520011901855 + }, + { + "auxiliary_loss_clip": 0.01022397, + "auxiliary_loss_mlp": 0.00747615, + "balance_loss_clip": 1.02305508, + "balance_loss_mlp": 1.00039196, + "epoch": 0.7069893281226515, + "flos": 41172768455040.0, + "grad_norm": 1.8488176050188838, + "language_loss": 0.77586925, + "learning_rate": 8.348830280691304e-07, + "loss": 0.79356933, + "num_input_tokens_seen": 253736100, + "step": 11759, + "time_per_iteration": 2.903934955596924 + }, + { + "auxiliary_loss_clip": 0.0105358, + "auxiliary_loss_mlp": 0.01028969, + "balance_loss_clip": 1.02465117, + "balance_loss_mlp": 1.01799583, + "epoch": 0.7070494513753194, + "flos": 24207275658240.0, + "grad_norm": 1.5091000374168224, + "language_loss": 0.67746383, + "learning_rate": 8.34566500074583e-07, + "loss": 0.69828933, + "num_input_tokens_seen": 253757350, + "step": 11760, + "time_per_iteration": 2.770869493484497 + }, + { + "auxiliary_loss_clip": 0.01026574, + "auxiliary_loss_mlp": 0.01025635, + "balance_loss_clip": 1.02467716, + "balance_loss_mlp": 1.0148108, + "epoch": 0.7071095746279874, + "flos": 20185675079040.0, + "grad_norm": 2.8478892945535748, + "language_loss": 0.79951298, + "learning_rate": 8.342500162733899e-07, + "loss": 0.82003504, + "num_input_tokens_seen": 253772855, + "step": 11761, + "time_per_iteration": 2.616426944732666 + }, + { + "auxiliary_loss_clip": 0.01034639, + "auxiliary_loss_mlp": 0.01034836, + "balance_loss_clip": 1.02177477, + "balance_loss_mlp": 1.02214003, + "epoch": 0.7071696978806553, + "flos": 18183045133440.0, + "grad_norm": 2.417137576491922, + "language_loss": 0.74894524, + "learning_rate": 8.33933576677553e-07, + "loss": 0.76963997, + "num_input_tokens_seen": 253790360, + "step": 11762, + "time_per_iteration": 2.7344298362731934 + }, + { + "auxiliary_loss_clip": 0.01040966, + "auxiliary_loss_mlp": 0.01030738, + "balance_loss_clip": 1.02506566, + "balance_loss_mlp": 1.02021146, + "epoch": 0.7072298211333233, + "flos": 24131719399680.0, + "grad_norm": 1.986008709492259, + "language_loss": 0.76926434, + "learning_rate": 8.336171812990724e-07, + "loss": 0.78998137, + "num_input_tokens_seen": 253810585, + "step": 11763, + "time_per_iteration": 2.659970283508301 + }, + { + "auxiliary_loss_clip": 0.01031408, + "auxiliary_loss_mlp": 0.00747549, + "balance_loss_clip": 1.02311194, + "balance_loss_mlp": 1.00035691, + "epoch": 0.7072899443859912, + "flos": 27198418867200.0, + "grad_norm": 3.5155523844193364, + "language_loss": 0.78904408, + "learning_rate": 8.333008301499453e-07, + "loss": 0.80683362, + "num_input_tokens_seen": 253829080, + "step": 11764, + "time_per_iteration": 2.803338050842285 + }, + { + "auxiliary_loss_clip": 0.01021464, + "auxiliary_loss_mlp": 0.01035765, + "balance_loss_clip": 1.02355564, + "balance_loss_mlp": 1.02395165, + "epoch": 0.7073500676386593, + "flos": 16435596384000.0, + "grad_norm": 1.5475379178521305, + "language_loss": 0.79702109, + "learning_rate": 8.32984523242167e-07, + "loss": 0.81759334, + "num_input_tokens_seen": 253846780, + "step": 11765, + "time_per_iteration": 2.749159574508667 + }, + { + "auxiliary_loss_clip": 0.01062216, + "auxiliary_loss_mlp": 0.01026135, + "balance_loss_clip": 1.02601004, + "balance_loss_mlp": 1.01665759, + "epoch": 0.7074101908913272, + "flos": 27673732563840.0, + "grad_norm": 1.633820536496686, + "language_loss": 0.68505836, + "learning_rate": 8.326682605877324e-07, + "loss": 0.7059418, + "num_input_tokens_seen": 253867075, + "step": 11766, + "time_per_iteration": 2.6649861335754395 + }, + { + "auxiliary_loss_clip": 0.01041392, + "auxiliary_loss_mlp": 0.01033314, + "balance_loss_clip": 1.0227406, + "balance_loss_mlp": 1.02163124, + "epoch": 0.7074703141439952, + "flos": 22238078296320.0, + "grad_norm": 1.8796531767852802, + "language_loss": 0.63954765, + "learning_rate": 8.323520421986352e-07, + "loss": 0.66029471, + "num_input_tokens_seen": 253885790, + "step": 11767, + "time_per_iteration": 2.7748830318450928 + }, + { + "auxiliary_loss_clip": 0.01053396, + "auxiliary_loss_mlp": 0.01026459, + "balance_loss_clip": 1.02457416, + "balance_loss_mlp": 1.01556373, + "epoch": 0.7075304373966632, + "flos": 29643217234560.0, + "grad_norm": 1.4549290936370545, + "language_loss": 0.52895617, + "learning_rate": 8.320358680868646e-07, + "loss": 0.54975474, + "num_input_tokens_seen": 253907070, + "step": 11768, + "time_per_iteration": 4.401324272155762 + }, + { + "auxiliary_loss_clip": 0.01042986, + "auxiliary_loss_mlp": 0.00747654, + "balance_loss_clip": 1.0256778, + "balance_loss_mlp": 1.00036383, + "epoch": 0.7075905606493311, + "flos": 19755214490880.0, + "grad_norm": 1.6006912374338231, + "language_loss": 0.7571795, + "learning_rate": 8.317197382644119e-07, + "loss": 0.77508587, + "num_input_tokens_seen": 253927290, + "step": 11769, + "time_per_iteration": 2.6981723308563232 + }, + { + "auxiliary_loss_clip": 0.00987922, + "auxiliary_loss_mlp": 0.01013443, + "balance_loss_clip": 1.00222135, + "balance_loss_mlp": 1.01247787, + "epoch": 0.7076506839019991, + "flos": 65716132694400.0, + "grad_norm": 0.8460940765326911, + "language_loss": 0.62006807, + "learning_rate": 8.314036527432637e-07, + "loss": 0.64008176, + "num_input_tokens_seen": 253983440, + "step": 11770, + "time_per_iteration": 3.1499125957489014 + }, + { + "auxiliary_loss_clip": 0.01031042, + "auxiliary_loss_mlp": 0.01035444, + "balance_loss_clip": 1.02308607, + "balance_loss_mlp": 1.0241015, + "epoch": 0.707710807154667, + "flos": 23765286804480.0, + "grad_norm": 1.6436774267117678, + "language_loss": 0.76181775, + "learning_rate": 8.310876115354055e-07, + "loss": 0.78248256, + "num_input_tokens_seen": 254003825, + "step": 11771, + "time_per_iteration": 2.7665178775787354 + }, + { + "auxiliary_loss_clip": 0.01052576, + "auxiliary_loss_mlp": 0.01026646, + "balance_loss_clip": 1.02565873, + "balance_loss_mlp": 1.01668584, + "epoch": 0.7077709304073351, + "flos": 21251360712960.0, + "grad_norm": 1.5073435841200082, + "language_loss": 0.71427292, + "learning_rate": 8.307716146528221e-07, + "loss": 0.7350651, + "num_input_tokens_seen": 254023345, + "step": 11772, + "time_per_iteration": 2.598701238632202 + }, + { + "auxiliary_loss_clip": 0.01014509, + "auxiliary_loss_mlp": 0.01029595, + "balance_loss_clip": 1.02314448, + "balance_loss_mlp": 1.018538, + "epoch": 0.707831053660003, + "flos": 20740746925440.0, + "grad_norm": 1.7897969716542326, + "language_loss": 0.69618374, + "learning_rate": 8.30455662107496e-07, + "loss": 0.71662474, + "num_input_tokens_seen": 254041815, + "step": 11773, + "time_per_iteration": 2.729966402053833 + }, + { + "auxiliary_loss_clip": 0.01055077, + "auxiliary_loss_mlp": 0.01033595, + "balance_loss_clip": 1.02577972, + "balance_loss_mlp": 1.02291346, + "epoch": 0.707891176912671, + "flos": 21980993679360.0, + "grad_norm": 1.3771152102748236, + "language_loss": 0.70002079, + "learning_rate": 8.301397539114095e-07, + "loss": 0.72090757, + "num_input_tokens_seen": 254062065, + "step": 11774, + "time_per_iteration": 2.6000711917877197 + }, + { + "auxiliary_loss_clip": 0.01033466, + "auxiliary_loss_mlp": 0.01026831, + "balance_loss_clip": 1.02584195, + "balance_loss_mlp": 1.01683521, + "epoch": 0.7079513001653389, + "flos": 21068970428160.0, + "grad_norm": 1.5141812741134029, + "language_loss": 0.74412942, + "learning_rate": 8.298238900765407e-07, + "loss": 0.76473236, + "num_input_tokens_seen": 254080605, + "step": 11775, + "time_per_iteration": 2.695716381072998 + }, + { + "auxiliary_loss_clip": 0.01024472, + "auxiliary_loss_mlp": 0.00747556, + "balance_loss_clip": 1.02469134, + "balance_loss_mlp": 1.00041461, + "epoch": 0.7080114234180069, + "flos": 18040659621120.0, + "grad_norm": 1.6913434362822009, + "language_loss": 0.86796081, + "learning_rate": 8.295080706148665e-07, + "loss": 0.88568103, + "num_input_tokens_seen": 254098710, + "step": 11776, + "time_per_iteration": 2.8133959770202637 + }, + { + "auxiliary_loss_clip": 0.01044062, + "auxiliary_loss_mlp": 0.01026286, + "balance_loss_clip": 1.02310014, + "balance_loss_mlp": 1.01629043, + "epoch": 0.7080715466706748, + "flos": 15122271409920.0, + "grad_norm": 1.5882011543798196, + "language_loss": 0.74968386, + "learning_rate": 8.291922955383641e-07, + "loss": 0.77038735, + "num_input_tokens_seen": 254117200, + "step": 11777, + "time_per_iteration": 2.6024975776672363 + }, + { + "auxiliary_loss_clip": 0.01051178, + "auxiliary_loss_mlp": 0.01031314, + "balance_loss_clip": 1.02935445, + "balance_loss_mlp": 1.01997113, + "epoch": 0.7081316699233429, + "flos": 14422802889600.0, + "grad_norm": 2.0892950324583617, + "language_loss": 0.82399011, + "learning_rate": 8.288765648590066e-07, + "loss": 0.84481502, + "num_input_tokens_seen": 254132115, + "step": 11778, + "time_per_iteration": 2.640133857727051 + }, + { + "auxiliary_loss_clip": 0.01035894, + "auxiliary_loss_mlp": 0.0102815, + "balance_loss_clip": 1.02269888, + "balance_loss_mlp": 1.01858938, + "epoch": 0.7081917931760108, + "flos": 23222389668480.0, + "grad_norm": 1.4999863855745628, + "language_loss": 0.84676862, + "learning_rate": 8.285608785887673e-07, + "loss": 0.86740899, + "num_input_tokens_seen": 254152285, + "step": 11779, + "time_per_iteration": 2.654536485671997 + }, + { + "auxiliary_loss_clip": 0.01045788, + "auxiliary_loss_mlp": 0.01030314, + "balance_loss_clip": 1.02613306, + "balance_loss_mlp": 1.01959133, + "epoch": 0.7082519164286788, + "flos": 39308429871360.0, + "grad_norm": 2.0773307441218307, + "language_loss": 0.71496117, + "learning_rate": 8.28245236739618e-07, + "loss": 0.73572218, + "num_input_tokens_seen": 254172805, + "step": 11780, + "time_per_iteration": 2.823530435562134 + }, + { + "auxiliary_loss_clip": 0.01010532, + "auxiliary_loss_mlp": 0.01024806, + "balance_loss_clip": 1.02289677, + "balance_loss_mlp": 1.01426148, + "epoch": 0.7083120396813467, + "flos": 21651154064640.0, + "grad_norm": 1.3762375857333304, + "language_loss": 0.72960508, + "learning_rate": 8.279296393235256e-07, + "loss": 0.74995852, + "num_input_tokens_seen": 254191890, + "step": 11781, + "time_per_iteration": 2.8108201026916504 + }, + { + "auxiliary_loss_clip": 0.01053463, + "auxiliary_loss_mlp": 0.01028114, + "balance_loss_clip": 1.02654386, + "balance_loss_mlp": 1.01817155, + "epoch": 0.7083721629340147, + "flos": 17567033863680.0, + "grad_norm": 1.6166308095626207, + "language_loss": 0.77077353, + "learning_rate": 8.276140863524585e-07, + "loss": 0.79158926, + "num_input_tokens_seen": 254210150, + "step": 11782, + "time_per_iteration": 2.60722279548645 + }, + { + "auxiliary_loss_clip": 0.01040517, + "auxiliary_loss_mlp": 0.01027142, + "balance_loss_clip": 1.02462387, + "balance_loss_mlp": 1.01782584, + "epoch": 0.7084322861866827, + "flos": 29350509304320.0, + "grad_norm": 2.69824598183399, + "language_loss": 0.70142388, + "learning_rate": 8.272985778383828e-07, + "loss": 0.7221005, + "num_input_tokens_seen": 254233015, + "step": 11783, + "time_per_iteration": 2.7360708713531494 + }, + { + "auxiliary_loss_clip": 0.01024191, + "auxiliary_loss_mlp": 0.01026188, + "balance_loss_clip": 1.02497351, + "balance_loss_mlp": 1.01549506, + "epoch": 0.7084924094393507, + "flos": 20194294343040.0, + "grad_norm": 1.8907433714773638, + "language_loss": 0.78961504, + "learning_rate": 8.269831137932632e-07, + "loss": 0.81011885, + "num_input_tokens_seen": 254251345, + "step": 11784, + "time_per_iteration": 2.692904233932495 + }, + { + "auxiliary_loss_clip": 0.01061973, + "auxiliary_loss_mlp": 0.01028841, + "balance_loss_clip": 1.02501774, + "balance_loss_mlp": 1.01839793, + "epoch": 0.7085525326920187, + "flos": 23477211728640.0, + "grad_norm": 1.8165182779044553, + "language_loss": 0.77046442, + "learning_rate": 8.266676942290609e-07, + "loss": 0.79137254, + "num_input_tokens_seen": 254269905, + "step": 11785, + "time_per_iteration": 2.5413591861724854 + }, + { + "auxiliary_loss_clip": 0.01035239, + "auxiliary_loss_mlp": 0.0102775, + "balance_loss_clip": 1.02333808, + "balance_loss_mlp": 1.01673532, + "epoch": 0.7086126559446866, + "flos": 25958818558080.0, + "grad_norm": 1.7777952703167836, + "language_loss": 0.78030765, + "learning_rate": 8.26352319157738e-07, + "loss": 0.80093753, + "num_input_tokens_seen": 254289990, + "step": 11786, + "time_per_iteration": 2.7022671699523926 + }, + { + "auxiliary_loss_clip": 0.01064155, + "auxiliary_loss_mlp": 0.01025288, + "balance_loss_clip": 1.02578735, + "balance_loss_mlp": 1.01451111, + "epoch": 0.7086727791973546, + "flos": 26724793109760.0, + "grad_norm": 2.434150095387096, + "language_loss": 0.79120547, + "learning_rate": 8.260369885912526e-07, + "loss": 0.81209987, + "num_input_tokens_seen": 254309085, + "step": 11787, + "time_per_iteration": 2.602001428604126 + }, + { + "auxiliary_loss_clip": 0.01054436, + "auxiliary_loss_mlp": 0.01026945, + "balance_loss_clip": 1.02571321, + "balance_loss_mlp": 1.01645505, + "epoch": 0.7087329024500225, + "flos": 21683365585920.0, + "grad_norm": 10.589678495018545, + "language_loss": 0.76840556, + "learning_rate": 8.257217025415615e-07, + "loss": 0.78921938, + "num_input_tokens_seen": 254327045, + "step": 11788, + "time_per_iteration": 2.683950185775757 + }, + { + "auxiliary_loss_clip": 0.01021736, + "auxiliary_loss_mlp": 0.01034508, + "balance_loss_clip": 1.02135742, + "balance_loss_mlp": 1.02143085, + "epoch": 0.7087930257026905, + "flos": 17931060247680.0, + "grad_norm": 2.577745827376391, + "language_loss": 0.67527616, + "learning_rate": 8.254064610206212e-07, + "loss": 0.69583857, + "num_input_tokens_seen": 254344585, + "step": 11789, + "time_per_iteration": 4.384403228759766 + }, + { + "auxiliary_loss_clip": 0.01014911, + "auxiliary_loss_mlp": 0.01030583, + "balance_loss_clip": 1.02479076, + "balance_loss_mlp": 1.01916885, + "epoch": 0.7088531489553584, + "flos": 18911528864640.0, + "grad_norm": 1.5778291090136507, + "language_loss": 0.77533054, + "learning_rate": 8.250912640403858e-07, + "loss": 0.79578549, + "num_input_tokens_seen": 254362470, + "step": 11790, + "time_per_iteration": 4.537750244140625 + }, + { + "auxiliary_loss_clip": 0.01043867, + "auxiliary_loss_mlp": 0.01029333, + "balance_loss_clip": 1.02379239, + "balance_loss_mlp": 1.01788902, + "epoch": 0.7089132722080265, + "flos": 27380880979200.0, + "grad_norm": 1.6425449626029605, + "language_loss": 0.71759337, + "learning_rate": 8.247761116128085e-07, + "loss": 0.73832536, + "num_input_tokens_seen": 254383190, + "step": 11791, + "time_per_iteration": 2.6639363765716553 + }, + { + "auxiliary_loss_clip": 0.01054936, + "auxiliary_loss_mlp": 0.01032067, + "balance_loss_clip": 1.02594471, + "balance_loss_mlp": 1.02060544, + "epoch": 0.7089733954606944, + "flos": 22162917087360.0, + "grad_norm": 2.088402863138247, + "language_loss": 0.8236953, + "learning_rate": 8.244610037498376e-07, + "loss": 0.84456533, + "num_input_tokens_seen": 254403115, + "step": 11792, + "time_per_iteration": 2.759401321411133 + }, + { + "auxiliary_loss_clip": 0.01027868, + "auxiliary_loss_mlp": 0.01024556, + "balance_loss_clip": 1.02538574, + "balance_loss_mlp": 1.01327896, + "epoch": 0.7090335187133624, + "flos": 24425827960320.0, + "grad_norm": 3.07252165403018, + "language_loss": 0.64443487, + "learning_rate": 8.241459404634232e-07, + "loss": 0.66495907, + "num_input_tokens_seen": 254421875, + "step": 11793, + "time_per_iteration": 2.784141778945923 + }, + { + "auxiliary_loss_clip": 0.01047472, + "auxiliary_loss_mlp": 0.01031691, + "balance_loss_clip": 1.02431643, + "balance_loss_mlp": 1.02037787, + "epoch": 0.7090936419660303, + "flos": 21835232288640.0, + "grad_norm": 2.015631857003463, + "language_loss": 0.70557034, + "learning_rate": 8.238309217655133e-07, + "loss": 0.72636193, + "num_input_tokens_seen": 254440765, + "step": 11794, + "time_per_iteration": 2.7722833156585693 + }, + { + "auxiliary_loss_clip": 0.01042821, + "auxiliary_loss_mlp": 0.01031679, + "balance_loss_clip": 1.02554095, + "balance_loss_mlp": 1.02142763, + "epoch": 0.7091537652186983, + "flos": 20082360585600.0, + "grad_norm": 1.715310803115348, + "language_loss": 0.75987315, + "learning_rate": 8.23515947668052e-07, + "loss": 0.78061819, + "num_input_tokens_seen": 254459480, + "step": 11795, + "time_per_iteration": 2.6900815963745117 + }, + { + "auxiliary_loss_clip": 0.01033963, + "auxiliary_loss_mlp": 0.01033018, + "balance_loss_clip": 1.02538705, + "balance_loss_mlp": 1.0225755, + "epoch": 0.7092138884713663, + "flos": 13151565676800.0, + "grad_norm": 1.975409105927938, + "language_loss": 0.74685991, + "learning_rate": 8.232010181829838e-07, + "loss": 0.76752967, + "num_input_tokens_seen": 254473985, + "step": 11796, + "time_per_iteration": 2.8136720657348633 + }, + { + "auxiliary_loss_clip": 0.01053142, + "auxiliary_loss_mlp": 0.01039234, + "balance_loss_clip": 1.02713275, + "balance_loss_mlp": 1.02522147, + "epoch": 0.7092740117240343, + "flos": 21645982506240.0, + "grad_norm": 2.838553365014113, + "language_loss": 0.74262714, + "learning_rate": 8.228861333222523e-07, + "loss": 0.76355088, + "num_input_tokens_seen": 254492135, + "step": 11797, + "time_per_iteration": 2.5959062576293945 + }, + { + "auxiliary_loss_clip": 0.0102428, + "auxiliary_loss_mlp": 0.01027155, + "balance_loss_clip": 1.0250653, + "balance_loss_mlp": 1.01692736, + "epoch": 0.7093341349767023, + "flos": 21032521102080.0, + "grad_norm": 1.539278998398106, + "language_loss": 0.79331446, + "learning_rate": 8.225712930977953e-07, + "loss": 0.81382883, + "num_input_tokens_seen": 254512865, + "step": 11798, + "time_per_iteration": 2.7432148456573486 + }, + { + "auxiliary_loss_clip": 0.01025805, + "auxiliary_loss_mlp": 0.01032715, + "balance_loss_clip": 1.01968575, + "balance_loss_mlp": 1.02065659, + "epoch": 0.7093942582293702, + "flos": 22017658487040.0, + "grad_norm": 3.246152532102459, + "language_loss": 0.67220747, + "learning_rate": 8.222564975215529e-07, + "loss": 0.69279265, + "num_input_tokens_seen": 254532605, + "step": 11799, + "time_per_iteration": 2.662722587585449 + }, + { + "auxiliary_loss_clip": 0.01063745, + "auxiliary_loss_mlp": 0.01026434, + "balance_loss_clip": 1.02550364, + "balance_loss_mlp": 1.01497781, + "epoch": 0.7094543814820382, + "flos": 27235586465280.0, + "grad_norm": 1.5562850976499287, + "language_loss": 0.81421256, + "learning_rate": 8.219417466054622e-07, + "loss": 0.8351143, + "num_input_tokens_seen": 254553780, + "step": 11800, + "time_per_iteration": 2.669093608856201 + }, + { + "auxiliary_loss_clip": 0.0104045, + "auxiliary_loss_mlp": 0.01026471, + "balance_loss_clip": 1.02349114, + "balance_loss_mlp": 1.01630831, + "epoch": 0.7095145047347061, + "flos": 12089148180480.0, + "grad_norm": 1.8697278700635214, + "language_loss": 0.86687815, + "learning_rate": 8.21627040361459e-07, + "loss": 0.88754737, + "num_input_tokens_seen": 254567510, + "step": 11801, + "time_per_iteration": 2.701813220977783 + }, + { + "auxiliary_loss_clip": 0.01065399, + "auxiliary_loss_mlp": 0.01033941, + "balance_loss_clip": 1.02649355, + "balance_loss_mlp": 1.02323651, + "epoch": 0.7095746279873741, + "flos": 19383789905280.0, + "grad_norm": 2.990062691189164, + "language_loss": 0.76428843, + "learning_rate": 8.213123788014758e-07, + "loss": 0.78528184, + "num_input_tokens_seen": 254585565, + "step": 11802, + "time_per_iteration": 2.7663052082061768 + }, + { + "auxiliary_loss_clip": 0.01047451, + "auxiliary_loss_mlp": 0.0103979, + "balance_loss_clip": 1.02451563, + "balance_loss_mlp": 1.02884054, + "epoch": 0.709634751240042, + "flos": 21360600950400.0, + "grad_norm": 2.212980568224402, + "language_loss": 0.81496513, + "learning_rate": 8.209977619374462e-07, + "loss": 0.8358376, + "num_input_tokens_seen": 254603465, + "step": 11803, + "time_per_iteration": 2.6480140686035156 + }, + { + "auxiliary_loss_clip": 0.0106448, + "auxiliary_loss_mlp": 0.01029738, + "balance_loss_clip": 1.0249095, + "balance_loss_mlp": 1.0185324, + "epoch": 0.7096948744927101, + "flos": 13917037438080.0, + "grad_norm": 1.9757092589371585, + "language_loss": 0.67354774, + "learning_rate": 8.206831897812995e-07, + "loss": 0.69448996, + "num_input_tokens_seen": 254620500, + "step": 11804, + "time_per_iteration": 2.4949111938476562 + }, + { + "auxiliary_loss_clip": 0.01049194, + "auxiliary_loss_mlp": 0.01024868, + "balance_loss_clip": 1.02356493, + "balance_loss_mlp": 1.01545095, + "epoch": 0.709754997745378, + "flos": 30298335436800.0, + "grad_norm": 1.8167151259433907, + "language_loss": 0.78152221, + "learning_rate": 8.203686623449637e-07, + "loss": 0.8022629, + "num_input_tokens_seen": 254638565, + "step": 11805, + "time_per_iteration": 2.7024519443511963 + }, + { + "auxiliary_loss_clip": 0.01040767, + "auxiliary_loss_mlp": 0.00747709, + "balance_loss_clip": 1.02315557, + "balance_loss_mlp": 1.00048113, + "epoch": 0.709815120998046, + "flos": 18515147304960.0, + "grad_norm": 2.068684274671016, + "language_loss": 0.78891921, + "learning_rate": 8.200541796403667e-07, + "loss": 0.806804, + "num_input_tokens_seen": 254657505, + "step": 11806, + "time_per_iteration": 4.234152555465698 + }, + { + "auxiliary_loss_clip": 0.01037065, + "auxiliary_loss_mlp": 0.01034692, + "balance_loss_clip": 1.02283955, + "balance_loss_mlp": 1.02258587, + "epoch": 0.7098752442507139, + "flos": 22272588288000.0, + "grad_norm": 2.6439733398363843, + "language_loss": 0.55834007, + "learning_rate": 8.197397416794332e-07, + "loss": 0.57905763, + "num_input_tokens_seen": 254674730, + "step": 11807, + "time_per_iteration": 2.6422061920166016 + }, + { + "auxiliary_loss_clip": 0.01064731, + "auxiliary_loss_mlp": 0.01034224, + "balance_loss_clip": 1.02384627, + "balance_loss_mlp": 1.0232451, + "epoch": 0.7099353675033819, + "flos": 19275447507840.0, + "grad_norm": 1.9829952321795035, + "language_loss": 0.68743801, + "learning_rate": 8.194253484740882e-07, + "loss": 0.70842755, + "num_input_tokens_seen": 254691665, + "step": 11808, + "time_per_iteration": 2.5080831050872803 + }, + { + "auxiliary_loss_clip": 0.01055347, + "auxiliary_loss_mlp": 0.01030759, + "balance_loss_clip": 1.02554274, + "balance_loss_mlp": 1.01994073, + "epoch": 0.70999549075605, + "flos": 21908525990400.0, + "grad_norm": 1.7621723291634521, + "language_loss": 0.71587729, + "learning_rate": 8.191110000362513e-07, + "loss": 0.73673832, + "num_input_tokens_seen": 254711610, + "step": 11809, + "time_per_iteration": 2.55362606048584 + }, + { + "auxiliary_loss_clip": 0.01006262, + "auxiliary_loss_mlp": 0.01001014, + "balance_loss_clip": 1.00112724, + "balance_loss_mlp": 1.00010777, + "epoch": 0.7100556140087179, + "flos": 70456053456000.0, + "grad_norm": 0.7501742226464505, + "language_loss": 0.59441102, + "learning_rate": 8.187966963778435e-07, + "loss": 0.61448383, + "num_input_tokens_seen": 254772615, + "step": 11810, + "time_per_iteration": 3.212700128555298 + }, + { + "auxiliary_loss_clip": 0.00981504, + "auxiliary_loss_mlp": 0.01041311, + "balance_loss_clip": 1.0202713, + "balance_loss_mlp": 1.02950883, + "epoch": 0.7101157372613859, + "flos": 23039568420480.0, + "grad_norm": 1.5340850759698417, + "language_loss": 0.74062967, + "learning_rate": 8.18482437510784e-07, + "loss": 0.76085782, + "num_input_tokens_seen": 254791375, + "step": 11811, + "time_per_iteration": 2.9452064037323 + }, + { + "auxiliary_loss_clip": 0.01022929, + "auxiliary_loss_mlp": 0.0102562, + "balance_loss_clip": 1.02525544, + "balance_loss_mlp": 1.0158205, + "epoch": 0.7101758605140538, + "flos": 23185329811200.0, + "grad_norm": 1.6515006322755326, + "language_loss": 0.83423883, + "learning_rate": 8.181682234469882e-07, + "loss": 0.85472429, + "num_input_tokens_seen": 254809300, + "step": 11812, + "time_per_iteration": 2.8971707820892334 + }, + { + "auxiliary_loss_clip": 0.01065704, + "auxiliary_loss_mlp": 0.01026784, + "balance_loss_clip": 1.02637601, + "balance_loss_mlp": 1.01558995, + "epoch": 0.7102359837667218, + "flos": 23696123166720.0, + "grad_norm": 1.4438655278421817, + "language_loss": 0.69855517, + "learning_rate": 8.178540541983716e-07, + "loss": 0.71948004, + "num_input_tokens_seen": 254829325, + "step": 11813, + "time_per_iteration": 2.5425961017608643 + }, + { + "auxiliary_loss_clip": 0.01059566, + "auxiliary_loss_mlp": 0.01024916, + "balance_loss_clip": 1.02312803, + "balance_loss_mlp": 1.01490831, + "epoch": 0.7102961070193897, + "flos": 19391116279680.0, + "grad_norm": 2.190662488262257, + "language_loss": 0.81705701, + "learning_rate": 8.175399297768495e-07, + "loss": 0.83790189, + "num_input_tokens_seen": 254847690, + "step": 11814, + "time_per_iteration": 2.5176639556884766 + }, + { + "auxiliary_loss_clip": 0.01063329, + "auxiliary_loss_mlp": 0.01026109, + "balance_loss_clip": 1.02578831, + "balance_loss_mlp": 1.01494491, + "epoch": 0.7103562302720577, + "flos": 21507511576320.0, + "grad_norm": 2.0847297073377145, + "language_loss": 0.75735152, + "learning_rate": 8.172258501943301e-07, + "loss": 0.77824587, + "num_input_tokens_seen": 254865960, + "step": 11815, + "time_per_iteration": 2.545994281768799 + }, + { + "auxiliary_loss_clip": 0.01021507, + "auxiliary_loss_mlp": 0.01026689, + "balance_loss_clip": 1.02382398, + "balance_loss_mlp": 1.01646709, + "epoch": 0.7104163535247257, + "flos": 14535059869440.0, + "grad_norm": 1.8908519798422552, + "language_loss": 0.78499448, + "learning_rate": 8.16911815462725e-07, + "loss": 0.80547649, + "num_input_tokens_seen": 254882815, + "step": 11816, + "time_per_iteration": 4.4173009395599365 + }, + { + "auxiliary_loss_clip": 0.01040779, + "auxiliary_loss_mlp": 0.01034698, + "balance_loss_clip": 1.02522337, + "balance_loss_mlp": 1.02395141, + "epoch": 0.7104764767773937, + "flos": 11400310085760.0, + "grad_norm": 1.782411095781705, + "language_loss": 0.86500454, + "learning_rate": 8.165978255939426e-07, + "loss": 0.88575935, + "num_input_tokens_seen": 254898705, + "step": 11817, + "time_per_iteration": 2.650836944580078 + }, + { + "auxiliary_loss_clip": 0.01023264, + "auxiliary_loss_mlp": 0.01030298, + "balance_loss_clip": 1.02594566, + "balance_loss_mlp": 1.01999211, + "epoch": 0.7105366000300616, + "flos": 11690432236800.0, + "grad_norm": 2.0900269205780018, + "language_loss": 0.84484214, + "learning_rate": 8.162838805998897e-07, + "loss": 0.86537778, + "num_input_tokens_seen": 254913665, + "step": 11818, + "time_per_iteration": 2.7347984313964844 + }, + { + "auxiliary_loss_clip": 0.01063257, + "auxiliary_loss_mlp": 0.01032163, + "balance_loss_clip": 1.02437985, + "balance_loss_mlp": 1.02088547, + "epoch": 0.7105967232827296, + "flos": 19354020508800.0, + "grad_norm": 1.9391046224420607, + "language_loss": 0.75404012, + "learning_rate": 8.159699804924709e-07, + "loss": 0.77499431, + "num_input_tokens_seen": 254932140, + "step": 11819, + "time_per_iteration": 2.5731260776519775 + }, + { + "auxiliary_loss_clip": 0.01020272, + "auxiliary_loss_mlp": 0.01029355, + "balance_loss_clip": 1.02267742, + "balance_loss_mlp": 1.01670718, + "epoch": 0.7106568465353975, + "flos": 22930400010240.0, + "grad_norm": 1.6594833769599227, + "language_loss": 0.70863104, + "learning_rate": 8.156561252835883e-07, + "loss": 0.72912735, + "num_input_tokens_seen": 254951580, + "step": 11820, + "time_per_iteration": 2.752842426300049 + }, + { + "auxiliary_loss_clip": 0.01052965, + "auxiliary_loss_mlp": 0.01026857, + "balance_loss_clip": 1.02537513, + "balance_loss_mlp": 1.01650417, + "epoch": 0.7107169697880655, + "flos": 19099665325440.0, + "grad_norm": 1.7810650054672927, + "language_loss": 0.75444835, + "learning_rate": 8.153423149851449e-07, + "loss": 0.77524656, + "num_input_tokens_seen": 254969425, + "step": 11821, + "time_per_iteration": 2.5914015769958496 + }, + { + "auxiliary_loss_clip": 0.00960673, + "auxiliary_loss_mlp": 0.01005556, + "balance_loss_clip": 1.00582874, + "balance_loss_mlp": 1.0046258, + "epoch": 0.7107770930407336, + "flos": 63638054231040.0, + "grad_norm": 0.8525726383769484, + "language_loss": 0.55077899, + "learning_rate": 8.150285496090388e-07, + "loss": 0.57044125, + "num_input_tokens_seen": 255032680, + "step": 11822, + "time_per_iteration": 3.3123385906219482 + }, + { + "auxiliary_loss_clip": 0.01049504, + "auxiliary_loss_mlp": 0.01028568, + "balance_loss_clip": 1.02395511, + "balance_loss_mlp": 1.01797056, + "epoch": 0.7108372162934015, + "flos": 22054466949120.0, + "grad_norm": 1.9921928832182159, + "language_loss": 0.60282278, + "learning_rate": 8.147148291671688e-07, + "loss": 0.62360346, + "num_input_tokens_seen": 255054400, + "step": 11823, + "time_per_iteration": 2.7297587394714355 + }, + { + "auxiliary_loss_clip": 0.01052573, + "auxiliary_loss_mlp": 0.01030261, + "balance_loss_clip": 1.02521586, + "balance_loss_mlp": 1.02043271, + "epoch": 0.7108973395460695, + "flos": 19135144984320.0, + "grad_norm": 2.126052599755124, + "language_loss": 0.71175009, + "learning_rate": 8.144011536714322e-07, + "loss": 0.73257846, + "num_input_tokens_seen": 255072785, + "step": 11824, + "time_per_iteration": 2.6654117107391357 + }, + { + "auxiliary_loss_clip": 0.01026501, + "auxiliary_loss_mlp": 0.00747463, + "balance_loss_clip": 1.02034092, + "balance_loss_mlp": 1.00034046, + "epoch": 0.7109574627987374, + "flos": 17894431353600.0, + "grad_norm": 1.982013293498137, + "language_loss": 0.72660863, + "learning_rate": 8.140875231337223e-07, + "loss": 0.74434829, + "num_input_tokens_seen": 255091820, + "step": 11825, + "time_per_iteration": 2.6138112545013428 + }, + { + "auxiliary_loss_clip": 0.01036294, + "auxiliary_loss_mlp": 0.01030231, + "balance_loss_clip": 1.0246743, + "balance_loss_mlp": 1.02005053, + "epoch": 0.7110175860514054, + "flos": 28979623422720.0, + "grad_norm": 1.7322737816936662, + "language_loss": 0.78956044, + "learning_rate": 8.137739375659321e-07, + "loss": 0.81022561, + "num_input_tokens_seen": 255111720, + "step": 11826, + "time_per_iteration": 2.6279566287994385 + }, + { + "auxiliary_loss_clip": 0.01052425, + "auxiliary_loss_mlp": 0.0103, + "balance_loss_clip": 1.02591681, + "balance_loss_mlp": 1.01973605, + "epoch": 0.7110777093040733, + "flos": 26173312623360.0, + "grad_norm": 2.056094110311308, + "language_loss": 0.83256006, + "learning_rate": 8.134603969799527e-07, + "loss": 0.85338426, + "num_input_tokens_seen": 255133495, + "step": 11827, + "time_per_iteration": 2.629427909851074 + }, + { + "auxiliary_loss_clip": 0.01030321, + "auxiliary_loss_mlp": 0.0102801, + "balance_loss_clip": 1.02376127, + "balance_loss_mlp": 1.01654792, + "epoch": 0.7111378325567413, + "flos": 26869943969280.0, + "grad_norm": 1.3512526624208794, + "language_loss": 0.62314153, + "learning_rate": 8.131469013876748e-07, + "loss": 0.6437248, + "num_input_tokens_seen": 255156880, + "step": 11828, + "time_per_iteration": 2.6748673915863037 + }, + { + "auxiliary_loss_clip": 0.01061917, + "auxiliary_loss_mlp": 0.01029163, + "balance_loss_clip": 1.02466547, + "balance_loss_mlp": 1.01869655, + "epoch": 0.7111979558094093, + "flos": 27271820309760.0, + "grad_norm": 1.4779646414737573, + "language_loss": 0.71825272, + "learning_rate": 8.128334508009846e-07, + "loss": 0.73916352, + "num_input_tokens_seen": 255178920, + "step": 11829, + "time_per_iteration": 2.5984883308410645 + }, + { + "auxiliary_loss_clip": 0.01061616, + "auxiliary_loss_mlp": 0.01030267, + "balance_loss_clip": 1.02495098, + "balance_loss_mlp": 1.02008665, + "epoch": 0.7112580790620773, + "flos": 25046938961280.0, + "grad_norm": 1.7229039552443493, + "language_loss": 0.80522001, + "learning_rate": 8.125200452317697e-07, + "loss": 0.82613885, + "num_input_tokens_seen": 255198095, + "step": 11830, + "time_per_iteration": 2.6189303398132324 + }, + { + "auxiliary_loss_clip": 0.01046725, + "auxiliary_loss_mlp": 0.01032957, + "balance_loss_clip": 1.02262187, + "balance_loss_mlp": 1.02232313, + "epoch": 0.7113182023147452, + "flos": 21646628951040.0, + "grad_norm": 1.6920835567613024, + "language_loss": 0.8436296, + "learning_rate": 8.122066846919138e-07, + "loss": 0.86442637, + "num_input_tokens_seen": 255215860, + "step": 11831, + "time_per_iteration": 2.6818790435791016 + }, + { + "auxiliary_loss_clip": 0.01038267, + "auxiliary_loss_mlp": 0.01029125, + "balance_loss_clip": 1.02281022, + "balance_loss_mlp": 1.01819921, + "epoch": 0.7113783255674132, + "flos": 20996287257600.0, + "grad_norm": 2.1721160596553615, + "language_loss": 0.76957357, + "learning_rate": 8.118933691932985e-07, + "loss": 0.79024744, + "num_input_tokens_seen": 255235425, + "step": 11832, + "time_per_iteration": 2.7307512760162354 + }, + { + "auxiliary_loss_clip": 0.00997736, + "auxiliary_loss_mlp": 0.01001082, + "balance_loss_clip": 1.00231624, + "balance_loss_mlp": 1.00014055, + "epoch": 0.7114384488200811, + "flos": 66771080161920.0, + "grad_norm": 0.7436780459830133, + "language_loss": 0.56646067, + "learning_rate": 8.115800987478059e-07, + "loss": 0.58644885, + "num_input_tokens_seen": 255291680, + "step": 11833, + "time_per_iteration": 3.1714906692504883 + }, + { + "auxiliary_loss_clip": 0.01020652, + "auxiliary_loss_mlp": 0.01032876, + "balance_loss_clip": 1.023453, + "balance_loss_mlp": 1.02283883, + "epoch": 0.7114985720727491, + "flos": 25010058672000.0, + "grad_norm": 1.5749909307533612, + "language_loss": 0.71044219, + "learning_rate": 8.11266873367315e-07, + "loss": 0.73097742, + "num_input_tokens_seen": 255313880, + "step": 11834, + "time_per_iteration": 2.853273868560791 + }, + { + "auxiliary_loss_clip": 0.01065663, + "auxiliary_loss_mlp": 0.01027253, + "balance_loss_clip": 1.02658617, + "balance_loss_mlp": 1.01616025, + "epoch": 0.7115586953254172, + "flos": 21470128496640.0, + "grad_norm": 2.220396670357165, + "language_loss": 0.79324555, + "learning_rate": 8.10953693063704e-07, + "loss": 0.81417471, + "num_input_tokens_seen": 255332390, + "step": 11835, + "time_per_iteration": 2.5841307640075684 + }, + { + "auxiliary_loss_clip": 0.01049349, + "auxiliary_loss_mlp": 0.01024721, + "balance_loss_clip": 1.02340019, + "balance_loss_mlp": 1.01536894, + "epoch": 0.7116188185780851, + "flos": 28622600190720.0, + "grad_norm": 1.578692956285755, + "language_loss": 0.75870389, + "learning_rate": 8.10640557848848e-07, + "loss": 0.77944458, + "num_input_tokens_seen": 255354025, + "step": 11836, + "time_per_iteration": 2.6593375205993652 + }, + { + "auxiliary_loss_clip": 0.01000321, + "auxiliary_loss_mlp": 0.01033365, + "balance_loss_clip": 1.02392411, + "balance_loss_mlp": 1.02293968, + "epoch": 0.7116789418307531, + "flos": 25293608634240.0, + "grad_norm": 1.67626931506562, + "language_loss": 0.70468628, + "learning_rate": 8.103274677346208e-07, + "loss": 0.72502315, + "num_input_tokens_seen": 255371400, + "step": 11837, + "time_per_iteration": 6.070312738418579 + }, + { + "auxiliary_loss_clip": 0.01055774, + "auxiliary_loss_mlp": 0.01030527, + "balance_loss_clip": 1.02591681, + "balance_loss_mlp": 1.01843286, + "epoch": 0.711739065083421, + "flos": 25557301353600.0, + "grad_norm": 1.8925441189385503, + "language_loss": 0.61703849, + "learning_rate": 8.100144227328958e-07, + "loss": 0.63790143, + "num_input_tokens_seen": 255390710, + "step": 11838, + "time_per_iteration": 2.7050678730010986 + }, + { + "auxiliary_loss_clip": 0.01053887, + "auxiliary_loss_mlp": 0.01027414, + "balance_loss_clip": 1.02686167, + "balance_loss_mlp": 1.01645291, + "epoch": 0.711799188336089, + "flos": 26140993361280.0, + "grad_norm": 2.46624491583042, + "language_loss": 0.67627919, + "learning_rate": 8.097014228555426e-07, + "loss": 0.69709212, + "num_input_tokens_seen": 255408790, + "step": 11839, + "time_per_iteration": 2.7871146202087402 + }, + { + "auxiliary_loss_clip": 0.010633, + "auxiliary_loss_mlp": 0.01027858, + "balance_loss_clip": 1.02630496, + "balance_loss_mlp": 1.01759958, + "epoch": 0.7118593115887569, + "flos": 21140648017920.0, + "grad_norm": 2.273625990211293, + "language_loss": 0.84204739, + "learning_rate": 8.093884681144305e-07, + "loss": 0.86295897, + "num_input_tokens_seen": 255426280, + "step": 11840, + "time_per_iteration": 2.620565176010132 + }, + { + "auxiliary_loss_clip": 0.01040683, + "auxiliary_loss_mlp": 0.01031597, + "balance_loss_clip": 1.02455807, + "balance_loss_mlp": 1.02054, + "epoch": 0.711919434841425, + "flos": 14975684006400.0, + "grad_norm": 1.8454840075237084, + "language_loss": 0.76745772, + "learning_rate": 8.090755585214277e-07, + "loss": 0.78818053, + "num_input_tokens_seen": 255442935, + "step": 11841, + "time_per_iteration": 2.7241084575653076 + }, + { + "auxiliary_loss_clip": 0.0105068, + "auxiliary_loss_mlp": 0.0102815, + "balance_loss_clip": 1.02912927, + "balance_loss_mlp": 1.01701593, + "epoch": 0.7119795580940929, + "flos": 16508997826560.0, + "grad_norm": 2.003430611198508, + "language_loss": 0.75446808, + "learning_rate": 8.087626940883994e-07, + "loss": 0.7752564, + "num_input_tokens_seen": 255460925, + "step": 11842, + "time_per_iteration": 2.7766318321228027 + }, + { + "auxiliary_loss_clip": 0.0100144, + "auxiliary_loss_mlp": 0.01001696, + "balance_loss_clip": 1.00520086, + "balance_loss_mlp": 1.00047457, + "epoch": 0.7120396813467609, + "flos": 66570736055040.0, + "grad_norm": 0.7870786163670259, + "language_loss": 0.61589563, + "learning_rate": 8.084498748272082e-07, + "loss": 0.63592696, + "num_input_tokens_seen": 255521360, + "step": 11843, + "time_per_iteration": 3.25421142578125 + }, + { + "auxiliary_loss_clip": 0.01059773, + "auxiliary_loss_mlp": 0.01028032, + "balance_loss_clip": 1.02412021, + "balance_loss_mlp": 1.01798916, + "epoch": 0.7120998045994288, + "flos": 26432731624320.0, + "grad_norm": 1.549171966432052, + "language_loss": 0.79899728, + "learning_rate": 8.081371007497171e-07, + "loss": 0.81987524, + "num_input_tokens_seen": 255541435, + "step": 11844, + "time_per_iteration": 2.71187162399292 + }, + { + "auxiliary_loss_clip": 0.01008083, + "auxiliary_loss_mlp": 0.01027416, + "balance_loss_clip": 1.01966429, + "balance_loss_mlp": 1.01610351, + "epoch": 0.7121599278520968, + "flos": 16427982700800.0, + "grad_norm": 2.2210534724412128, + "language_loss": 0.79037517, + "learning_rate": 8.078243718677873e-07, + "loss": 0.8107301, + "num_input_tokens_seen": 255558505, + "step": 11845, + "time_per_iteration": 2.798099994659424 + }, + { + "auxiliary_loss_clip": 0.01045011, + "auxiliary_loss_mlp": 0.01029137, + "balance_loss_clip": 1.0250417, + "balance_loss_mlp": 1.01813388, + "epoch": 0.7122200511047647, + "flos": 28949889939840.0, + "grad_norm": 2.9381499657172134, + "language_loss": 0.77783978, + "learning_rate": 8.075116881932762e-07, + "loss": 0.79858124, + "num_input_tokens_seen": 255577815, + "step": 11846, + "time_per_iteration": 2.7242374420166016 + }, + { + "auxiliary_loss_clip": 0.0105061, + "auxiliary_loss_mlp": 0.01028892, + "balance_loss_clip": 1.02483475, + "balance_loss_mlp": 1.01855659, + "epoch": 0.7122801743574327, + "flos": 16471866142080.0, + "grad_norm": 1.8528062115212813, + "language_loss": 0.58561337, + "learning_rate": 8.071990497380421e-07, + "loss": 0.60640842, + "num_input_tokens_seen": 255595885, + "step": 11847, + "time_per_iteration": 2.6683332920074463 + }, + { + "auxiliary_loss_clip": 0.01048982, + "auxiliary_loss_mlp": 0.00747422, + "balance_loss_clip": 1.02356148, + "balance_loss_mlp": 1.00037742, + "epoch": 0.7123402976101008, + "flos": 20631039811200.0, + "grad_norm": 1.3675608820605119, + "language_loss": 0.7122308, + "learning_rate": 8.068864565139395e-07, + "loss": 0.73019481, + "num_input_tokens_seen": 255616750, + "step": 11848, + "time_per_iteration": 2.800635814666748 + }, + { + "auxiliary_loss_clip": 0.0099713, + "auxiliary_loss_mlp": 0.01001673, + "balance_loss_clip": 1.001441, + "balance_loss_mlp": 1.00077271, + "epoch": 0.7124004208627687, + "flos": 62325734837760.0, + "grad_norm": 0.8505338848804346, + "language_loss": 0.63032371, + "learning_rate": 8.065739085328211e-07, + "loss": 0.65031177, + "num_input_tokens_seen": 255677900, + "step": 11849, + "time_per_iteration": 3.205270767211914 + }, + { + "auxiliary_loss_clip": 0.01038424, + "auxiliary_loss_mlp": 0.01033199, + "balance_loss_clip": 1.0232569, + "balance_loss_mlp": 1.02296507, + "epoch": 0.7124605441154367, + "flos": 39675975788160.0, + "grad_norm": 1.5286203575366033, + "language_loss": 0.63925314, + "learning_rate": 8.0626140580654e-07, + "loss": 0.65996939, + "num_input_tokens_seen": 255699140, + "step": 11850, + "time_per_iteration": 2.844723701477051 + }, + { + "auxiliary_loss_clip": 0.01052243, + "auxiliary_loss_mlp": 0.01027968, + "balance_loss_clip": 1.0243485, + "balance_loss_mlp": 1.01749492, + "epoch": 0.7125206673681046, + "flos": 28181868312960.0, + "grad_norm": 1.432298782492065, + "language_loss": 0.70046759, + "learning_rate": 8.05948948346946e-07, + "loss": 0.72126973, + "num_input_tokens_seen": 255719640, + "step": 11851, + "time_per_iteration": 2.6505422592163086 + }, + { + "auxiliary_loss_clip": 0.01054698, + "auxiliary_loss_mlp": 0.01029252, + "balance_loss_clip": 1.0270884, + "balance_loss_mlp": 1.01972091, + "epoch": 0.7125807906207726, + "flos": 26176939896960.0, + "grad_norm": 1.4769126020784602, + "language_loss": 0.83383155, + "learning_rate": 8.056365361658882e-07, + "loss": 0.854671, + "num_input_tokens_seen": 255740450, + "step": 11852, + "time_per_iteration": 2.6297216415405273 + }, + { + "auxiliary_loss_clip": 0.0104869, + "auxiliary_loss_mlp": 0.00747717, + "balance_loss_clip": 1.02323043, + "balance_loss_mlp": 1.00042748, + "epoch": 0.7126409138734405, + "flos": 17157328358400.0, + "grad_norm": 2.8444912157448807, + "language_loss": 0.72772956, + "learning_rate": 8.053241692752126e-07, + "loss": 0.74569362, + "num_input_tokens_seen": 255758070, + "step": 11853, + "time_per_iteration": 4.348237752914429 + }, + { + "auxiliary_loss_clip": 0.01022627, + "auxiliary_loss_mlp": 0.010289, + "balance_loss_clip": 1.02180386, + "balance_loss_mlp": 1.01882648, + "epoch": 0.7127010371261085, + "flos": 18769933451520.0, + "grad_norm": 1.9728187855014778, + "language_loss": 0.92339867, + "learning_rate": 8.050118476867635e-07, + "loss": 0.94391394, + "num_input_tokens_seen": 255775685, + "step": 11854, + "time_per_iteration": 2.6321873664855957 + }, + { + "auxiliary_loss_clip": 0.01053032, + "auxiliary_loss_mlp": 0.01027595, + "balance_loss_clip": 1.02621651, + "balance_loss_mlp": 1.01756942, + "epoch": 0.7127611603787765, + "flos": 20376433232640.0, + "grad_norm": 1.7365372444904121, + "language_loss": 0.79818976, + "learning_rate": 8.046995714123856e-07, + "loss": 0.81899607, + "num_input_tokens_seen": 255794750, + "step": 11855, + "time_per_iteration": 2.637540817260742 + }, + { + "auxiliary_loss_clip": 0.01011451, + "auxiliary_loss_mlp": 0.01033205, + "balance_loss_clip": 1.02069271, + "balance_loss_mlp": 1.02143931, + "epoch": 0.7128212836314445, + "flos": 20449008662400.0, + "grad_norm": 1.8285962749133158, + "language_loss": 0.72753549, + "learning_rate": 8.043873404639192e-07, + "loss": 0.74798203, + "num_input_tokens_seen": 255813325, + "step": 11856, + "time_per_iteration": 2.8249778747558594 + }, + { + "auxiliary_loss_clip": 0.01054223, + "auxiliary_loss_mlp": 0.01028462, + "balance_loss_clip": 1.02555895, + "balance_loss_mlp": 1.01774561, + "epoch": 0.7128814068841124, + "flos": 23440834229760.0, + "grad_norm": 1.644799972641427, + "language_loss": 0.70197237, + "learning_rate": 8.040751548532046e-07, + "loss": 0.72279924, + "num_input_tokens_seen": 255832470, + "step": 11857, + "time_per_iteration": 2.624495506286621 + }, + { + "auxiliary_loss_clip": 0.01049859, + "auxiliary_loss_mlp": 0.01027486, + "balance_loss_clip": 1.02390909, + "balance_loss_mlp": 1.01663196, + "epoch": 0.7129415301367804, + "flos": 18222942165120.0, + "grad_norm": 2.4241253902472093, + "language_loss": 0.84718347, + "learning_rate": 8.03763014592081e-07, + "loss": 0.86795688, + "num_input_tokens_seen": 255849740, + "step": 11858, + "time_per_iteration": 2.627199649810791 + }, + { + "auxiliary_loss_clip": 0.01068121, + "auxiliary_loss_mlp": 0.01030996, + "balance_loss_clip": 1.02783573, + "balance_loss_mlp": 1.02047598, + "epoch": 0.7130016533894483, + "flos": 15523896355200.0, + "grad_norm": 1.6039659257229157, + "language_loss": 0.79989052, + "learning_rate": 8.034509196923829e-07, + "loss": 0.82088172, + "num_input_tokens_seen": 255866975, + "step": 11859, + "time_per_iteration": 2.68845534324646 + }, + { + "auxiliary_loss_clip": 0.0104093, + "auxiliary_loss_mlp": 0.01030615, + "balance_loss_clip": 1.02471852, + "balance_loss_mlp": 1.02082157, + "epoch": 0.7130617766421163, + "flos": 57115668960000.0, + "grad_norm": 1.136211454853853, + "language_loss": 0.6905961, + "learning_rate": 8.031388701659456e-07, + "loss": 0.71131152, + "num_input_tokens_seen": 255892915, + "step": 11860, + "time_per_iteration": 3.0666403770446777 + }, + { + "auxiliary_loss_clip": 0.01052785, + "auxiliary_loss_mlp": 0.01030815, + "balance_loss_clip": 1.02431917, + "balance_loss_mlp": 1.01956177, + "epoch": 0.7131218998947844, + "flos": 19788252024960.0, + "grad_norm": 1.5981304024011962, + "language_loss": 0.64361453, + "learning_rate": 8.028268660246023e-07, + "loss": 0.66445059, + "num_input_tokens_seen": 255911480, + "step": 11861, + "time_per_iteration": 2.6438777446746826 + }, + { + "auxiliary_loss_clip": 0.01049426, + "auxiliary_loss_mlp": 0.0103283, + "balance_loss_clip": 1.02838838, + "balance_loss_mlp": 1.02111721, + "epoch": 0.7131820231474523, + "flos": 26651894457600.0, + "grad_norm": 1.4902257701013495, + "language_loss": 0.67204714, + "learning_rate": 8.025149072801849e-07, + "loss": 0.69286972, + "num_input_tokens_seen": 255931140, + "step": 11862, + "time_per_iteration": 2.671265125274658 + }, + { + "auxiliary_loss_clip": 0.01034781, + "auxiliary_loss_mlp": 0.01035237, + "balance_loss_clip": 1.02309203, + "balance_loss_mlp": 1.02485991, + "epoch": 0.7132421464001203, + "flos": 29205609840000.0, + "grad_norm": 3.281721048131388, + "language_loss": 0.66762275, + "learning_rate": 8.022029939445214e-07, + "loss": 0.68832296, + "num_input_tokens_seen": 255951665, + "step": 11863, + "time_per_iteration": 4.396332263946533 + }, + { + "auxiliary_loss_clip": 0.01021901, + "auxiliary_loss_mlp": 0.01039857, + "balance_loss_clip": 1.02532113, + "balance_loss_mlp": 1.02702379, + "epoch": 0.7133022696527882, + "flos": 23073611535360.0, + "grad_norm": 2.6556114261455126, + "language_loss": 0.65589321, + "learning_rate": 8.018911260294414e-07, + "loss": 0.67651081, + "num_input_tokens_seen": 255970055, + "step": 11864, + "time_per_iteration": 2.899904251098633 + }, + { + "auxiliary_loss_clip": 0.01053966, + "auxiliary_loss_mlp": 0.01027316, + "balance_loss_clip": 1.02491379, + "balance_loss_mlp": 1.01618767, + "epoch": 0.7133623929054562, + "flos": 17457111267840.0, + "grad_norm": 2.2464199101528246, + "language_loss": 0.85519195, + "learning_rate": 8.015793035467697e-07, + "loss": 0.87600482, + "num_input_tokens_seen": 255987720, + "step": 11865, + "time_per_iteration": 2.716975212097168 + }, + { + "auxiliary_loss_clip": 0.01019448, + "auxiliary_loss_mlp": 0.01028263, + "balance_loss_clip": 1.0198828, + "balance_loss_mlp": 1.0162642, + "epoch": 0.7134225161581241, + "flos": 19536554448000.0, + "grad_norm": 2.9560663654839074, + "language_loss": 0.75351715, + "learning_rate": 8.012675265083304e-07, + "loss": 0.77399421, + "num_input_tokens_seen": 256005490, + "step": 11866, + "time_per_iteration": 2.665485382080078 + }, + { + "auxiliary_loss_clip": 0.01023488, + "auxiliary_loss_mlp": 0.0103188, + "balance_loss_clip": 1.02360404, + "balance_loss_mlp": 1.01998329, + "epoch": 0.7134826394107922, + "flos": 26250089944320.0, + "grad_norm": 3.189077444499227, + "language_loss": 0.70090204, + "learning_rate": 8.009557949259464e-07, + "loss": 0.72145569, + "num_input_tokens_seen": 256026030, + "step": 11867, + "time_per_iteration": 2.665274143218994 + }, + { + "auxiliary_loss_clip": 0.0105305, + "auxiliary_loss_mlp": 0.0102595, + "balance_loss_clip": 1.02659392, + "balance_loss_mlp": 1.01590037, + "epoch": 0.7135427626634601, + "flos": 15815311395840.0, + "grad_norm": 1.8934440795213607, + "language_loss": 0.71751487, + "learning_rate": 8.006441088114397e-07, + "loss": 0.73830485, + "num_input_tokens_seen": 256043680, + "step": 11868, + "time_per_iteration": 2.6976702213287354 + }, + { + "auxiliary_loss_clip": 0.01018264, + "auxiliary_loss_mlp": 0.01034652, + "balance_loss_clip": 1.02348495, + "balance_loss_mlp": 1.02118123, + "epoch": 0.7136028859161281, + "flos": 18223409041920.0, + "grad_norm": 2.904710609559468, + "language_loss": 0.66367686, + "learning_rate": 8.003324681766286e-07, + "loss": 0.68420601, + "num_input_tokens_seen": 256059705, + "step": 11869, + "time_per_iteration": 2.6754753589630127 + }, + { + "auxiliary_loss_clip": 0.0103545, + "auxiliary_loss_mlp": 0.01023141, + "balance_loss_clip": 1.02184463, + "balance_loss_mlp": 1.01260936, + "epoch": 0.713663009168796, + "flos": 24314827956480.0, + "grad_norm": 1.5537407117902675, + "language_loss": 0.77772963, + "learning_rate": 8.000208730333298e-07, + "loss": 0.79831553, + "num_input_tokens_seen": 256079785, + "step": 11870, + "time_per_iteration": 2.730914354324341 + }, + { + "auxiliary_loss_clip": 0.01015674, + "auxiliary_loss_mlp": 0.01030849, + "balance_loss_clip": 1.02440715, + "balance_loss_mlp": 1.01930988, + "epoch": 0.713723132421464, + "flos": 26538488242560.0, + "grad_norm": 1.7093236074499856, + "language_loss": 0.81094193, + "learning_rate": 7.997093233933597e-07, + "loss": 0.83140713, + "num_input_tokens_seen": 256099000, + "step": 11871, + "time_per_iteration": 2.7249884605407715 + }, + { + "auxiliary_loss_clip": 0.01033047, + "auxiliary_loss_mlp": 0.01035282, + "balance_loss_clip": 1.02415109, + "balance_loss_mlp": 1.02339125, + "epoch": 0.7137832556741319, + "flos": 19865675790720.0, + "grad_norm": 1.5927819371643959, + "language_loss": 0.78858316, + "learning_rate": 7.993978192685331e-07, + "loss": 0.80926645, + "num_input_tokens_seen": 256117985, + "step": 11872, + "time_per_iteration": 2.6378743648529053 + }, + { + "auxiliary_loss_clip": 0.01054884, + "auxiliary_loss_mlp": 0.01027405, + "balance_loss_clip": 1.0257864, + "balance_loss_mlp": 1.01653934, + "epoch": 0.7138433789267999, + "flos": 21688932193920.0, + "grad_norm": 2.4898541271282317, + "language_loss": 0.84027052, + "learning_rate": 7.990863606706606e-07, + "loss": 0.86109346, + "num_input_tokens_seen": 256134350, + "step": 11873, + "time_per_iteration": 2.5268445014953613 + }, + { + "auxiliary_loss_clip": 0.01024834, + "auxiliary_loss_mlp": 0.01034176, + "balance_loss_clip": 1.02156675, + "balance_loss_mlp": 1.02435923, + "epoch": 0.713903502179468, + "flos": 17602729004160.0, + "grad_norm": 1.8465654002058842, + "language_loss": 0.85999674, + "learning_rate": 7.987749476115539e-07, + "loss": 0.88058686, + "num_input_tokens_seen": 256150610, + "step": 11874, + "time_per_iteration": 2.574876546859741 + }, + { + "auxiliary_loss_clip": 0.01052349, + "auxiliary_loss_mlp": 0.01026994, + "balance_loss_clip": 1.02366519, + "balance_loss_mlp": 1.01606214, + "epoch": 0.7139636254321359, + "flos": 18040336398720.0, + "grad_norm": 1.808028939909326, + "language_loss": 0.83275497, + "learning_rate": 7.984635801030228e-07, + "loss": 0.85354841, + "num_input_tokens_seen": 256168620, + "step": 11875, + "time_per_iteration": 2.5826854705810547 + }, + { + "auxiliary_loss_clip": 0.01040306, + "auxiliary_loss_mlp": 0.0103532, + "balance_loss_clip": 1.02420914, + "balance_loss_mlp": 1.02133679, + "epoch": 0.7140237486848039, + "flos": 23331127115520.0, + "grad_norm": 1.9479774017649747, + "language_loss": 0.69939077, + "learning_rate": 7.981522581568721e-07, + "loss": 0.72014701, + "num_input_tokens_seen": 256186700, + "step": 11876, + "time_per_iteration": 2.660935878753662 + }, + { + "auxiliary_loss_clip": 0.01064753, + "auxiliary_loss_mlp": 0.01029431, + "balance_loss_clip": 1.02607346, + "balance_loss_mlp": 1.0186007, + "epoch": 0.7140838719374718, + "flos": 16837077674880.0, + "grad_norm": 2.179994335708998, + "language_loss": 0.78009391, + "learning_rate": 7.978409817849079e-07, + "loss": 0.8010357, + "num_input_tokens_seen": 256205390, + "step": 11877, + "time_per_iteration": 2.590658187866211 + }, + { + "auxiliary_loss_clip": 0.01053228, + "auxiliary_loss_mlp": 0.01029712, + "balance_loss_clip": 1.02558613, + "balance_loss_mlp": 1.01970994, + "epoch": 0.7141439951901398, + "flos": 21142012734720.0, + "grad_norm": 2.06081072124065, + "language_loss": 0.69696093, + "learning_rate": 7.97529750998934e-07, + "loss": 0.71779031, + "num_input_tokens_seen": 256224575, + "step": 11878, + "time_per_iteration": 2.579271078109741 + }, + { + "auxiliary_loss_clip": 0.01032439, + "auxiliary_loss_mlp": 0.01030879, + "balance_loss_clip": 1.02585578, + "balance_loss_mlp": 1.02094865, + "epoch": 0.7142041184428077, + "flos": 24717709877760.0, + "grad_norm": 1.7443008616176747, + "language_loss": 0.6763823, + "learning_rate": 7.972185658107535e-07, + "loss": 0.69701546, + "num_input_tokens_seen": 256242130, + "step": 11879, + "time_per_iteration": 2.7397477626800537 + }, + { + "auxiliary_loss_clip": 0.01009571, + "auxiliary_loss_mlp": 0.01038879, + "balance_loss_clip": 1.02290487, + "balance_loss_mlp": 1.02646351, + "epoch": 0.7142642416954758, + "flos": 21908202768000.0, + "grad_norm": 1.7052116318179305, + "language_loss": 0.69498694, + "learning_rate": 7.969074262321646e-07, + "loss": 0.71547139, + "num_input_tokens_seen": 256261920, + "step": 11880, + "time_per_iteration": 2.6315276622772217 + }, + { + "auxiliary_loss_clip": 0.01034206, + "auxiliary_loss_mlp": 0.01037076, + "balance_loss_clip": 1.0217768, + "balance_loss_mlp": 1.02541149, + "epoch": 0.7143243649481437, + "flos": 20805636844800.0, + "grad_norm": 2.0040067390111185, + "language_loss": 0.80268514, + "learning_rate": 7.965963322749674e-07, + "loss": 0.82339799, + "num_input_tokens_seen": 256277970, + "step": 11881, + "time_per_iteration": 2.5275793075561523 + }, + { + "auxiliary_loss_clip": 0.01022138, + "auxiliary_loss_mlp": 0.01027817, + "balance_loss_clip": 1.02220571, + "balance_loss_mlp": 1.01749325, + "epoch": 0.7143844882008117, + "flos": 27235011847680.0, + "grad_norm": 2.8683641490272636, + "language_loss": 0.63763797, + "learning_rate": 7.962852839509579e-07, + "loss": 0.6581375, + "num_input_tokens_seen": 256298205, + "step": 11882, + "time_per_iteration": 2.871208906173706 + }, + { + "auxiliary_loss_clip": 0.01066852, + "auxiliary_loss_mlp": 0.01029855, + "balance_loss_clip": 1.02638102, + "balance_loss_mlp": 1.01915598, + "epoch": 0.7144446114534796, + "flos": 17929623703680.0, + "grad_norm": 1.8477777720118091, + "language_loss": 0.68992114, + "learning_rate": 7.959742812719304e-07, + "loss": 0.71088815, + "num_input_tokens_seen": 256316685, + "step": 11883, + "time_per_iteration": 2.658867359161377 + }, + { + "auxiliary_loss_clip": 0.01051057, + "auxiliary_loss_mlp": 0.01031214, + "balance_loss_clip": 1.02419662, + "balance_loss_mlp": 1.01988316, + "epoch": 0.7145047347061476, + "flos": 20740962407040.0, + "grad_norm": 2.580890823529644, + "language_loss": 0.77558196, + "learning_rate": 7.956633242496788e-07, + "loss": 0.79640472, + "num_input_tokens_seen": 256334205, + "step": 11884, + "time_per_iteration": 5.949062824249268 + }, + { + "auxiliary_loss_clip": 0.01057088, + "auxiliary_loss_mlp": 0.01029358, + "balance_loss_clip": 1.02494347, + "balance_loss_mlp": 1.01694214, + "epoch": 0.7145648579588155, + "flos": 21178605715200.0, + "grad_norm": 2.039943583928009, + "language_loss": 0.7401228, + "learning_rate": 7.953524128959954e-07, + "loss": 0.76098728, + "num_input_tokens_seen": 256353340, + "step": 11885, + "time_per_iteration": 2.634752035140991 + }, + { + "auxiliary_loss_clip": 0.00987222, + "auxiliary_loss_mlp": 0.01007743, + "balance_loss_clip": 1.00137329, + "balance_loss_mlp": 1.00664008, + "epoch": 0.7146249812114835, + "flos": 64784539509120.0, + "grad_norm": 0.8931231555439649, + "language_loss": 0.66302246, + "learning_rate": 7.95041547222669e-07, + "loss": 0.68297207, + "num_input_tokens_seen": 256411550, + "step": 11886, + "time_per_iteration": 3.1897456645965576 + }, + { + "auxiliary_loss_clip": 0.01024623, + "auxiliary_loss_mlp": 0.01025543, + "balance_loss_clip": 1.02571106, + "balance_loss_mlp": 1.01415861, + "epoch": 0.7146851044641516, + "flos": 18113881495680.0, + "grad_norm": 1.607943328070737, + "language_loss": 0.74691105, + "learning_rate": 7.947307272414874e-07, + "loss": 0.76741278, + "num_input_tokens_seen": 256430360, + "step": 11887, + "time_per_iteration": 2.70131778717041 + }, + { + "auxiliary_loss_clip": 0.01052224, + "auxiliary_loss_mlp": 0.0102391, + "balance_loss_clip": 1.02443862, + "balance_loss_mlp": 1.01350296, + "epoch": 0.7147452277168195, + "flos": 19243846517760.0, + "grad_norm": 1.481867733796043, + "language_loss": 0.716887, + "learning_rate": 7.944199529642372e-07, + "loss": 0.73764837, + "num_input_tokens_seen": 256449750, + "step": 11888, + "time_per_iteration": 2.6111507415771484 + }, + { + "auxiliary_loss_clip": 0.01046797, + "auxiliary_loss_mlp": 0.01031895, + "balance_loss_clip": 1.02356482, + "balance_loss_mlp": 1.01989627, + "epoch": 0.7148053509694875, + "flos": 23764712186880.0, + "grad_norm": 2.068694445044653, + "language_loss": 0.84306222, + "learning_rate": 7.941092244027041e-07, + "loss": 0.86384916, + "num_input_tokens_seen": 256467330, + "step": 11889, + "time_per_iteration": 2.7528958320617676 + }, + { + "auxiliary_loss_clip": 0.01023894, + "auxiliary_loss_mlp": 0.01027307, + "balance_loss_clip": 1.02555525, + "balance_loss_mlp": 1.01642334, + "epoch": 0.7148654742221554, + "flos": 22485322586880.0, + "grad_norm": 1.827581674503359, + "language_loss": 0.75802982, + "learning_rate": 7.937985415686695e-07, + "loss": 0.7785418, + "num_input_tokens_seen": 256485705, + "step": 11890, + "time_per_iteration": 2.8154456615448 + }, + { + "auxiliary_loss_clip": 0.01018991, + "auxiliary_loss_mlp": 0.01032687, + "balance_loss_clip": 1.02163196, + "balance_loss_mlp": 1.02222681, + "epoch": 0.7149255974748234, + "flos": 24679213476480.0, + "grad_norm": 1.5959848603018705, + "language_loss": 0.74269855, + "learning_rate": 7.934879044739147e-07, + "loss": 0.76321536, + "num_input_tokens_seen": 256504755, + "step": 11891, + "time_per_iteration": 2.7420403957366943 + }, + { + "auxiliary_loss_clip": 0.0102513, + "auxiliary_loss_mlp": 0.01033053, + "balance_loss_clip": 1.02531958, + "balance_loss_mlp": 1.02171612, + "epoch": 0.7149857207274913, + "flos": 18405583845120.0, + "grad_norm": 1.8320979309008374, + "language_loss": 0.68121624, + "learning_rate": 7.931773131302211e-07, + "loss": 0.70179808, + "num_input_tokens_seen": 256523670, + "step": 11892, + "time_per_iteration": 2.8938817977905273 + }, + { + "auxiliary_loss_clip": 0.01039031, + "auxiliary_loss_mlp": 0.01034583, + "balance_loss_clip": 1.02764583, + "balance_loss_mlp": 1.02185774, + "epoch": 0.7150458439801594, + "flos": 24969515195520.0, + "grad_norm": 2.0024251780384645, + "language_loss": 0.73627359, + "learning_rate": 7.928667675493632e-07, + "loss": 0.75700974, + "num_input_tokens_seen": 256542225, + "step": 11893, + "time_per_iteration": 3.048269271850586 + }, + { + "auxiliary_loss_clip": 0.01065207, + "auxiliary_loss_mlp": 0.01028696, + "balance_loss_clip": 1.02568305, + "balance_loss_mlp": 1.01710296, + "epoch": 0.7151059672328273, + "flos": 16690777580160.0, + "grad_norm": 1.9531810525480193, + "language_loss": 0.66641271, + "learning_rate": 7.925562677431185e-07, + "loss": 0.6873517, + "num_input_tokens_seen": 256560730, + "step": 11894, + "time_per_iteration": 2.744133949279785 + }, + { + "auxiliary_loss_clip": 0.01038783, + "auxiliary_loss_mlp": 0.01028045, + "balance_loss_clip": 1.02856708, + "balance_loss_mlp": 1.01683927, + "epoch": 0.7151660904854953, + "flos": 27271820309760.0, + "grad_norm": 1.6846589745709228, + "language_loss": 0.77732503, + "learning_rate": 7.922458137232613e-07, + "loss": 0.79799324, + "num_input_tokens_seen": 256580505, + "step": 11895, + "time_per_iteration": 2.870548725128174 + }, + { + "auxiliary_loss_clip": 0.01055908, + "auxiliary_loss_mlp": 0.01026565, + "balance_loss_clip": 1.02646482, + "balance_loss_mlp": 1.0150677, + "epoch": 0.7152262137381632, + "flos": 18332254229760.0, + "grad_norm": 1.8211635231827308, + "language_loss": 0.69535351, + "learning_rate": 7.919354055015643e-07, + "loss": 0.7161783, + "num_input_tokens_seen": 256597330, + "step": 11896, + "time_per_iteration": 2.671583890914917 + }, + { + "auxiliary_loss_clip": 0.0104299, + "auxiliary_loss_mlp": 0.01039692, + "balance_loss_clip": 1.0236752, + "balance_loss_mlp": 1.02817619, + "epoch": 0.7152863369908312, + "flos": 21799285752960.0, + "grad_norm": 3.7878516551697556, + "language_loss": 0.86413342, + "learning_rate": 7.91625043089798e-07, + "loss": 0.88496017, + "num_input_tokens_seen": 256616030, + "step": 11897, + "time_per_iteration": 2.7768657207489014 + }, + { + "auxiliary_loss_clip": 0.010449, + "auxiliary_loss_mlp": 0.01029932, + "balance_loss_clip": 1.02680182, + "balance_loss_mlp": 1.01878548, + "epoch": 0.7153464602434991, + "flos": 22158427887360.0, + "grad_norm": 2.554915050507671, + "language_loss": 0.78373611, + "learning_rate": 7.913147264997304e-07, + "loss": 0.80448443, + "num_input_tokens_seen": 256635570, + "step": 11898, + "time_per_iteration": 2.7487988471984863 + }, + { + "auxiliary_loss_clip": 0.01038915, + "auxiliary_loss_mlp": 0.01029295, + "balance_loss_clip": 1.02361739, + "balance_loss_mlp": 1.01679611, + "epoch": 0.7154065834961671, + "flos": 24716057852160.0, + "grad_norm": 1.6144786627479533, + "language_loss": 0.7267071, + "learning_rate": 7.910044557431302e-07, + "loss": 0.7473892, + "num_input_tokens_seen": 256655290, + "step": 11899, + "time_per_iteration": 2.716796875 + }, + { + "auxiliary_loss_clip": 0.01052418, + "auxiliary_loss_mlp": 0.01034398, + "balance_loss_clip": 1.02436137, + "balance_loss_mlp": 1.02316213, + "epoch": 0.7154667067488351, + "flos": 22601494149120.0, + "grad_norm": 2.1702056583462865, + "language_loss": 0.76226968, + "learning_rate": 7.906942308317614e-07, + "loss": 0.78313786, + "num_input_tokens_seen": 256671605, + "step": 11900, + "time_per_iteration": 2.65084171295166 + }, + { + "auxiliary_loss_clip": 0.01055035, + "auxiliary_loss_mlp": 0.01031108, + "balance_loss_clip": 1.02638531, + "balance_loss_mlp": 1.020105, + "epoch": 0.7155268300015031, + "flos": 18771154513920.0, + "grad_norm": 1.9611031612261827, + "language_loss": 0.80835462, + "learning_rate": 7.903840517773886e-07, + "loss": 0.82921606, + "num_input_tokens_seen": 256689680, + "step": 11901, + "time_per_iteration": 4.207157135009766 + }, + { + "auxiliary_loss_clip": 0.01032547, + "auxiliary_loss_mlp": 0.01030841, + "balance_loss_clip": 1.02565813, + "balance_loss_mlp": 1.01868188, + "epoch": 0.7155869532541711, + "flos": 18296343607680.0, + "grad_norm": 2.0267163464821514, + "language_loss": 0.81381845, + "learning_rate": 7.900739185917744e-07, + "loss": 0.83445239, + "num_input_tokens_seen": 256707760, + "step": 11902, + "time_per_iteration": 2.7151236534118652 + }, + { + "auxiliary_loss_clip": 0.01025982, + "auxiliary_loss_mlp": 0.01024462, + "balance_loss_clip": 1.0231781, + "balance_loss_mlp": 1.01295209, + "epoch": 0.715647076506839, + "flos": 11980805783040.0, + "grad_norm": 2.112691831191895, + "language_loss": 0.67940664, + "learning_rate": 7.897638312866785e-07, + "loss": 0.69991106, + "num_input_tokens_seen": 256724150, + "step": 11903, + "time_per_iteration": 2.7160580158233643 + }, + { + "auxiliary_loss_clip": 0.01026552, + "auxiliary_loss_mlp": 0.01025522, + "balance_loss_clip": 1.02192938, + "balance_loss_mlp": 1.01529932, + "epoch": 0.715707199759507, + "flos": 18951641377920.0, + "grad_norm": 1.7311200672143325, + "language_loss": 0.76102626, + "learning_rate": 7.894537898738589e-07, + "loss": 0.78154707, + "num_input_tokens_seen": 256742780, + "step": 11904, + "time_per_iteration": 2.8323538303375244 + }, + { + "auxiliary_loss_clip": 0.01043906, + "auxiliary_loss_mlp": 0.01033324, + "balance_loss_clip": 1.02483654, + "balance_loss_mlp": 1.02185583, + "epoch": 0.7157673230121749, + "flos": 15304410299520.0, + "grad_norm": 1.9030624718202662, + "language_loss": 0.72147882, + "learning_rate": 7.891437943650727e-07, + "loss": 0.74225116, + "num_input_tokens_seen": 256761355, + "step": 11905, + "time_per_iteration": 2.631316661834717 + }, + { + "auxiliary_loss_clip": 0.01030337, + "auxiliary_loss_mlp": 0.01029246, + "balance_loss_clip": 1.02355552, + "balance_loss_mlp": 1.01894593, + "epoch": 0.715827446264843, + "flos": 23221850964480.0, + "grad_norm": 1.6096250202548834, + "language_loss": 0.77957404, + "learning_rate": 7.88833844772076e-07, + "loss": 0.80016994, + "num_input_tokens_seen": 256781335, + "step": 11906, + "time_per_iteration": 2.893784284591675 + }, + { + "auxiliary_loss_clip": 0.00986438, + "auxiliary_loss_mlp": 0.01000833, + "balance_loss_clip": 1.00114608, + "balance_loss_mlp": 0.99987954, + "epoch": 0.7158875695175109, + "flos": 60975421833600.0, + "grad_norm": 0.7445135384453665, + "language_loss": 0.5534656, + "learning_rate": 7.885239411066205e-07, + "loss": 0.57333827, + "num_input_tokens_seen": 256838890, + "step": 11907, + "time_per_iteration": 3.1426749229431152 + }, + { + "auxiliary_loss_clip": 0.01044428, + "auxiliary_loss_mlp": 0.01032955, + "balance_loss_clip": 1.02276063, + "balance_loss_mlp": 1.02161825, + "epoch": 0.7159476927701789, + "flos": 17128780024320.0, + "grad_norm": 1.740367023948648, + "language_loss": 0.69670105, + "learning_rate": 7.882140833804593e-07, + "loss": 0.71747482, + "num_input_tokens_seen": 256858145, + "step": 11908, + "time_per_iteration": 2.665071964263916 + }, + { + "auxiliary_loss_clip": 0.010041, + "auxiliary_loss_mlp": 0.01029296, + "balance_loss_clip": 1.01930535, + "balance_loss_mlp": 1.01666594, + "epoch": 0.7160078160228468, + "flos": 22490601886080.0, + "grad_norm": 1.7802346459240537, + "language_loss": 0.71375787, + "learning_rate": 7.879042716053415e-07, + "loss": 0.73409188, + "num_input_tokens_seen": 256878545, + "step": 11909, + "time_per_iteration": 2.7071292400360107 + }, + { + "auxiliary_loss_clip": 0.01054704, + "auxiliary_loss_mlp": 0.01027793, + "balance_loss_clip": 1.02575731, + "balance_loss_mlp": 1.01689172, + "epoch": 0.7160679392755148, + "flos": 30590935626240.0, + "grad_norm": 1.626664616979955, + "language_loss": 0.75139111, + "learning_rate": 7.875945057930144e-07, + "loss": 0.77221608, + "num_input_tokens_seen": 256899920, + "step": 11910, + "time_per_iteration": 4.279602527618408 + }, + { + "auxiliary_loss_clip": 0.0104435, + "auxiliary_loss_mlp": 0.01030376, + "balance_loss_clip": 1.02629006, + "balance_loss_mlp": 1.02082157, + "epoch": 0.7161280625281827, + "flos": 21323648833920.0, + "grad_norm": 1.4781256613813738, + "language_loss": 0.76458848, + "learning_rate": 7.872847859552251e-07, + "loss": 0.78533566, + "num_input_tokens_seen": 256918460, + "step": 11911, + "time_per_iteration": 2.6354658603668213 + }, + { + "auxiliary_loss_clip": 0.01019088, + "auxiliary_loss_mlp": 0.01032633, + "balance_loss_clip": 1.0229255, + "balance_loss_mlp": 1.01937068, + "epoch": 0.7161881857808508, + "flos": 61860078921600.0, + "grad_norm": 2.1791440959195976, + "language_loss": 0.58806044, + "learning_rate": 7.869751121037192e-07, + "loss": 0.60857761, + "num_input_tokens_seen": 256942015, + "step": 11912, + "time_per_iteration": 3.039806604385376 + }, + { + "auxiliary_loss_clip": 0.01054013, + "auxiliary_loss_mlp": 0.01032092, + "balance_loss_clip": 1.02577519, + "balance_loss_mlp": 1.02076113, + "epoch": 0.7162483090335187, + "flos": 20812101292800.0, + "grad_norm": 2.109977119245845, + "language_loss": 0.78473747, + "learning_rate": 7.866654842502376e-07, + "loss": 0.8055985, + "num_input_tokens_seen": 256961065, + "step": 11913, + "time_per_iteration": 2.5915639400482178 + }, + { + "auxiliary_loss_clip": 0.01040455, + "auxiliary_loss_mlp": 0.0102802, + "balance_loss_clip": 1.02418172, + "balance_loss_mlp": 1.01831615, + "epoch": 0.7163084322861867, + "flos": 24097532630400.0, + "grad_norm": 1.7311581738300752, + "language_loss": 0.74343145, + "learning_rate": 7.863559024065234e-07, + "loss": 0.76411617, + "num_input_tokens_seen": 256982165, + "step": 11914, + "time_per_iteration": 2.787684202194214 + }, + { + "auxiliary_loss_clip": 0.01029688, + "auxiliary_loss_mlp": 0.01027886, + "balance_loss_clip": 1.0239321, + "balance_loss_mlp": 1.01746082, + "epoch": 0.7163685555388547, + "flos": 20080888128000.0, + "grad_norm": 1.7136273548123286, + "language_loss": 0.73786539, + "learning_rate": 7.860463665843143e-07, + "loss": 0.75844115, + "num_input_tokens_seen": 256999825, + "step": 11915, + "time_per_iteration": 2.831906318664551 + }, + { + "auxiliary_loss_clip": 0.01064543, + "auxiliary_loss_mlp": 0.0102678, + "balance_loss_clip": 1.02541757, + "balance_loss_mlp": 1.01631331, + "epoch": 0.7164286787915226, + "flos": 17456967613440.0, + "grad_norm": 1.705265018654374, + "language_loss": 0.80729234, + "learning_rate": 7.85736876795349e-07, + "loss": 0.82820559, + "num_input_tokens_seen": 257017450, + "step": 11916, + "time_per_iteration": 2.559797763824463 + }, + { + "auxiliary_loss_clip": 0.00998996, + "auxiliary_loss_mlp": 0.01030514, + "balance_loss_clip": 1.02470231, + "balance_loss_mlp": 1.01981485, + "epoch": 0.7164888020441906, + "flos": 19718908819200.0, + "grad_norm": 1.9356183657507102, + "language_loss": 0.68617815, + "learning_rate": 7.854274330513626e-07, + "loss": 0.70647323, + "num_input_tokens_seen": 257035465, + "step": 11917, + "time_per_iteration": 2.8800220489501953 + }, + { + "auxiliary_loss_clip": 0.01038401, + "auxiliary_loss_mlp": 0.01028763, + "balance_loss_clip": 1.02261007, + "balance_loss_mlp": 1.01712251, + "epoch": 0.7165489252968585, + "flos": 21470523546240.0, + "grad_norm": 1.6368246599703458, + "language_loss": 0.75898743, + "learning_rate": 7.851180353640896e-07, + "loss": 0.77965909, + "num_input_tokens_seen": 257053750, + "step": 11918, + "time_per_iteration": 2.7088141441345215 + }, + { + "auxiliary_loss_clip": 0.00989676, + "auxiliary_loss_mlp": 0.01002241, + "balance_loss_clip": 1.00372148, + "balance_loss_mlp": 1.00126922, + "epoch": 0.7166090485495266, + "flos": 69928060464000.0, + "grad_norm": 0.6506369035727027, + "language_loss": 0.53901875, + "learning_rate": 7.848086837452639e-07, + "loss": 0.55893791, + "num_input_tokens_seen": 257121215, + "step": 11919, + "time_per_iteration": 3.3013646602630615 + }, + { + "auxiliary_loss_clip": 0.01045154, + "auxiliary_loss_mlp": 0.01029731, + "balance_loss_clip": 1.02656269, + "balance_loss_mlp": 1.0193125, + "epoch": 0.7166691718021945, + "flos": 27343892949120.0, + "grad_norm": 2.3954958489142784, + "language_loss": 0.69252217, + "learning_rate": 7.844993782066132e-07, + "loss": 0.71327102, + "num_input_tokens_seen": 257143370, + "step": 11920, + "time_per_iteration": 2.754697561264038 + }, + { + "auxiliary_loss_clip": 0.01043844, + "auxiliary_loss_mlp": 0.01035162, + "balance_loss_clip": 1.02492094, + "balance_loss_mlp": 1.02392662, + "epoch": 0.7167292950548625, + "flos": 30408868563840.0, + "grad_norm": 1.836539511723071, + "language_loss": 0.74988711, + "learning_rate": 7.841901187598678e-07, + "loss": 0.77067715, + "num_input_tokens_seen": 257162160, + "step": 11921, + "time_per_iteration": 2.7360353469848633 + }, + { + "auxiliary_loss_clip": 0.01032918, + "auxiliary_loss_mlp": 0.01030019, + "balance_loss_clip": 1.0268898, + "balance_loss_mlp": 1.01663816, + "epoch": 0.7167894183075304, + "flos": 14571257800320.0, + "grad_norm": 1.8970166468920537, + "language_loss": 0.7508893, + "learning_rate": 7.83880905416755e-07, + "loss": 0.77151871, + "num_input_tokens_seen": 257179300, + "step": 11922, + "time_per_iteration": 2.716356039047241 + }, + { + "auxiliary_loss_clip": 0.00989387, + "auxiliary_loss_mlp": 0.01004175, + "balance_loss_clip": 1.00310183, + "balance_loss_mlp": 1.00322771, + "epoch": 0.7168495415601984, + "flos": 64110674407680.0, + "grad_norm": 0.7615253904181163, + "language_loss": 0.55114198, + "learning_rate": 7.83571738189001e-07, + "loss": 0.5710777, + "num_input_tokens_seen": 257235470, + "step": 11923, + "time_per_iteration": 3.060020923614502 + }, + { + "auxiliary_loss_clip": 0.01033061, + "auxiliary_loss_mlp": 0.01030889, + "balance_loss_clip": 1.02355766, + "balance_loss_mlp": 1.01943934, + "epoch": 0.7169096648128663, + "flos": 24681440119680.0, + "grad_norm": 1.7249128174777164, + "language_loss": 0.77297962, + "learning_rate": 7.832626170883279e-07, + "loss": 0.79361916, + "num_input_tokens_seen": 257255850, + "step": 11924, + "time_per_iteration": 2.8393630981445312 + }, + { + "auxiliary_loss_clip": 0.01034824, + "auxiliary_loss_mlp": 0.01029147, + "balance_loss_clip": 1.02642727, + "balance_loss_mlp": 1.01918709, + "epoch": 0.7169697880655344, + "flos": 20667525050880.0, + "grad_norm": 2.4222500544741155, + "language_loss": 0.67813075, + "learning_rate": 7.829535421264588e-07, + "loss": 0.69877052, + "num_input_tokens_seen": 257275425, + "step": 11925, + "time_per_iteration": 2.786044120788574 + }, + { + "auxiliary_loss_clip": 0.01039402, + "auxiliary_loss_mlp": 0.01024129, + "balance_loss_clip": 1.02476144, + "balance_loss_mlp": 1.01395416, + "epoch": 0.7170299113182023, + "flos": 21032700670080.0, + "grad_norm": 1.4736936117458983, + "language_loss": 0.77545321, + "learning_rate": 7.826445133151133e-07, + "loss": 0.79608852, + "num_input_tokens_seen": 257295740, + "step": 11926, + "time_per_iteration": 2.7255754470825195 + }, + { + "auxiliary_loss_clip": 0.01056955, + "auxiliary_loss_mlp": 0.0074781, + "balance_loss_clip": 1.02620864, + "balance_loss_mlp": 1.00046253, + "epoch": 0.7170900345708703, + "flos": 22893304239360.0, + "grad_norm": 2.0153802964619416, + "language_loss": 0.77135563, + "learning_rate": 7.823355306660093e-07, + "loss": 0.78940326, + "num_input_tokens_seen": 257315970, + "step": 11927, + "time_per_iteration": 2.665315866470337 + }, + { + "auxiliary_loss_clip": 0.01054269, + "auxiliary_loss_mlp": 0.01027534, + "balance_loss_clip": 1.02668405, + "balance_loss_mlp": 1.0158813, + "epoch": 0.7171501578235383, + "flos": 15518688883200.0, + "grad_norm": 1.4748674149797516, + "language_loss": 0.68757749, + "learning_rate": 7.820265941908642e-07, + "loss": 0.70839554, + "num_input_tokens_seen": 257334230, + "step": 11928, + "time_per_iteration": 2.5915374755859375 + }, + { + "auxiliary_loss_clip": 0.01014786, + "auxiliary_loss_mlp": 0.01027652, + "balance_loss_clip": 1.02358615, + "balance_loss_mlp": 1.01704228, + "epoch": 0.7172102810762062, + "flos": 26104292640000.0, + "grad_norm": 1.797534366048621, + "language_loss": 0.65230048, + "learning_rate": 7.817177039013931e-07, + "loss": 0.67272484, + "num_input_tokens_seen": 257352145, + "step": 11929, + "time_per_iteration": 2.842202663421631 + }, + { + "auxiliary_loss_clip": 0.01035316, + "auxiliary_loss_mlp": 0.01029185, + "balance_loss_clip": 1.02192879, + "balance_loss_mlp": 1.01758564, + "epoch": 0.7172704043288742, + "flos": 21506649649920.0, + "grad_norm": 2.2252163823783846, + "language_loss": 0.69598722, + "learning_rate": 7.81408859809308e-07, + "loss": 0.71663225, + "num_input_tokens_seen": 257371460, + "step": 11930, + "time_per_iteration": 2.6887271404266357 + }, + { + "auxiliary_loss_clip": 0.01027753, + "auxiliary_loss_mlp": 0.01026542, + "balance_loss_clip": 1.02136159, + "balance_loss_mlp": 1.01566958, + "epoch": 0.7173305275815421, + "flos": 18770939032320.0, + "grad_norm": 1.6630225596753025, + "language_loss": 0.80561972, + "learning_rate": 7.811000619263219e-07, + "loss": 0.82616258, + "num_input_tokens_seen": 257390800, + "step": 11931, + "time_per_iteration": 5.814584970474243 + }, + { + "auxiliary_loss_clip": 0.01054298, + "auxiliary_loss_mlp": 0.01032265, + "balance_loss_clip": 1.02598965, + "balance_loss_mlp": 1.02211988, + "epoch": 0.7173906508342102, + "flos": 16179876483840.0, + "grad_norm": 1.8767377585607958, + "language_loss": 0.78038186, + "learning_rate": 7.80791310264143e-07, + "loss": 0.80124748, + "num_input_tokens_seen": 257407495, + "step": 11932, + "time_per_iteration": 2.566941738128662 + }, + { + "auxiliary_loss_clip": 0.01052555, + "auxiliary_loss_mlp": 0.01028579, + "balance_loss_clip": 1.02453899, + "balance_loss_mlp": 1.01799297, + "epoch": 0.7174507740868781, + "flos": 26613864933120.0, + "grad_norm": 1.2958896614071507, + "language_loss": 0.74889338, + "learning_rate": 7.804826048344803e-07, + "loss": 0.7697047, + "num_input_tokens_seen": 257429675, + "step": 11933, + "time_per_iteration": 2.6612589359283447 + }, + { + "auxiliary_loss_clip": 0.01069878, + "auxiliary_loss_mlp": 0.01037093, + "balance_loss_clip": 1.02740002, + "balance_loss_mlp": 1.0232048, + "epoch": 0.7175108973395461, + "flos": 18432911116800.0, + "grad_norm": 2.391273404240788, + "language_loss": 0.69601333, + "learning_rate": 7.801739456490388e-07, + "loss": 0.7170831, + "num_input_tokens_seen": 257442765, + "step": 11934, + "time_per_iteration": 2.5923960208892822 + }, + { + "auxiliary_loss_clip": 0.01053306, + "auxiliary_loss_mlp": 0.01030607, + "balance_loss_clip": 1.02455127, + "balance_loss_mlp": 1.01963973, + "epoch": 0.717571020592214, + "flos": 23914962777600.0, + "grad_norm": 2.792698316000176, + "language_loss": 0.86161757, + "learning_rate": 7.798653327195237e-07, + "loss": 0.88245666, + "num_input_tokens_seen": 257459310, + "step": 11935, + "time_per_iteration": 2.6498641967773438 + }, + { + "auxiliary_loss_clip": 0.01016227, + "auxiliary_loss_mlp": 0.01028318, + "balance_loss_clip": 1.02085614, + "balance_loss_mlp": 1.01727939, + "epoch": 0.717631143844882, + "flos": 38256930109440.0, + "grad_norm": 3.0669930608952853, + "language_loss": 0.7373333, + "learning_rate": 7.795567660576388e-07, + "loss": 0.75777876, + "num_input_tokens_seen": 257484750, + "step": 11936, + "time_per_iteration": 2.9469809532165527 + }, + { + "auxiliary_loss_clip": 0.01007328, + "auxiliary_loss_mlp": 0.0100608, + "balance_loss_clip": 1.00194478, + "balance_loss_mlp": 1.00503683, + "epoch": 0.7176912670975499, + "flos": 65515896328320.0, + "grad_norm": 0.7576554768602779, + "language_loss": 0.55896872, + "learning_rate": 7.79248245675082e-07, + "loss": 0.57910281, + "num_input_tokens_seen": 257543110, + "step": 11937, + "time_per_iteration": 3.1951746940612793 + }, + { + "auxiliary_loss_clip": 0.01056487, + "auxiliary_loss_mlp": 0.01031402, + "balance_loss_clip": 1.02669907, + "balance_loss_mlp": 1.01954639, + "epoch": 0.717751390350218, + "flos": 31281066610560.0, + "grad_norm": 1.832594122107357, + "language_loss": 0.54852104, + "learning_rate": 7.789397715835542e-07, + "loss": 0.56939989, + "num_input_tokens_seen": 257567410, + "step": 11938, + "time_per_iteration": 2.7486836910247803 + }, + { + "auxiliary_loss_clip": 0.01049979, + "auxiliary_loss_mlp": 0.0102511, + "balance_loss_clip": 1.02369511, + "balance_loss_mlp": 1.01504326, + "epoch": 0.7178115136028859, + "flos": 19859031774720.0, + "grad_norm": 1.5035519966023851, + "language_loss": 0.76405263, + "learning_rate": 7.786313437947527e-07, + "loss": 0.78480363, + "num_input_tokens_seen": 257586270, + "step": 11939, + "time_per_iteration": 2.635432720184326 + }, + { + "auxiliary_loss_clip": 0.00987828, + "auxiliary_loss_mlp": 0.01001159, + "balance_loss_clip": 1.00214672, + "balance_loss_mlp": 1.00010955, + "epoch": 0.7178716368555539, + "flos": 64348655967360.0, + "grad_norm": 0.756571080797662, + "language_loss": 0.61424303, + "learning_rate": 7.783229623203738e-07, + "loss": 0.63413292, + "num_input_tokens_seen": 257647415, + "step": 11940, + "time_per_iteration": 3.2238950729370117 + }, + { + "auxiliary_loss_clip": 0.0102718, + "auxiliary_loss_mlp": 0.01030494, + "balance_loss_clip": 1.0219357, + "balance_loss_mlp": 1.0200572, + "epoch": 0.7179317601082219, + "flos": 26762607152640.0, + "grad_norm": 2.594119467238957, + "language_loss": 0.59130591, + "learning_rate": 7.780146271721097e-07, + "loss": 0.61188269, + "num_input_tokens_seen": 257669795, + "step": 11941, + "time_per_iteration": 2.8447022438049316 + }, + { + "auxiliary_loss_clip": 0.01043282, + "auxiliary_loss_mlp": 0.01029212, + "balance_loss_clip": 1.02499318, + "balance_loss_mlp": 1.01842928, + "epoch": 0.7179918833608898, + "flos": 23513804709120.0, + "grad_norm": 1.8230480469597854, + "language_loss": 0.79217535, + "learning_rate": 7.777063383616543e-07, + "loss": 0.8129003, + "num_input_tokens_seen": 257687415, + "step": 11942, + "time_per_iteration": 2.770655632019043 + }, + { + "auxiliary_loss_clip": 0.01056292, + "auxiliary_loss_mlp": 0.0103554, + "balance_loss_clip": 1.02696872, + "balance_loss_mlp": 1.02479374, + "epoch": 0.7180520066135578, + "flos": 17165588486400.0, + "grad_norm": 1.7306370369997577, + "language_loss": 0.65862489, + "learning_rate": 7.773980959006968e-07, + "loss": 0.67954326, + "num_input_tokens_seen": 257706215, + "step": 11943, + "time_per_iteration": 2.742856740951538 + }, + { + "auxiliary_loss_clip": 0.01061616, + "auxiliary_loss_mlp": 0.01028932, + "balance_loss_clip": 1.02493, + "balance_loss_mlp": 1.01822698, + "epoch": 0.7181121298662257, + "flos": 17566638814080.0, + "grad_norm": 1.7021273561747325, + "language_loss": 0.78818119, + "learning_rate": 7.770898998009254e-07, + "loss": 0.80908674, + "num_input_tokens_seen": 257724740, + "step": 11944, + "time_per_iteration": 2.5682601928710938 + }, + { + "auxiliary_loss_clip": 0.01035526, + "auxiliary_loss_mlp": 0.00747779, + "balance_loss_clip": 1.02395272, + "balance_loss_mlp": 1.00050867, + "epoch": 0.7181722531188938, + "flos": 11947660508160.0, + "grad_norm": 2.1850340818074208, + "language_loss": 0.6275813, + "learning_rate": 7.767817500740277e-07, + "loss": 0.64541435, + "num_input_tokens_seen": 257742060, + "step": 11945, + "time_per_iteration": 2.628206968307495 + }, + { + "auxiliary_loss_clip": 0.00998805, + "auxiliary_loss_mlp": 0.01002095, + "balance_loss_clip": 1.00265276, + "balance_loss_mlp": 1.00100458, + "epoch": 0.7182323763715617, + "flos": 65503649790720.0, + "grad_norm": 0.7023924781968778, + "language_loss": 0.51100147, + "learning_rate": 7.76473646731689e-07, + "loss": 0.53101051, + "num_input_tokens_seen": 257802250, + "step": 11946, + "time_per_iteration": 3.080045461654663 + }, + { + "auxiliary_loss_clip": 0.01032928, + "auxiliary_loss_mlp": 0.01033376, + "balance_loss_clip": 1.0245297, + "balance_loss_mlp": 1.02066207, + "epoch": 0.7182924996242297, + "flos": 20630932070400.0, + "grad_norm": 3.0042924595839477, + "language_loss": 0.74485159, + "learning_rate": 7.761655897855925e-07, + "loss": 0.76551461, + "num_input_tokens_seen": 257821155, + "step": 11947, + "time_per_iteration": 2.7743513584136963 + }, + { + "auxiliary_loss_clip": 0.01017626, + "auxiliary_loss_mlp": 0.00747606, + "balance_loss_clip": 1.02111602, + "balance_loss_mlp": 1.0004518, + "epoch": 0.7183526228768976, + "flos": 16216433550720.0, + "grad_norm": 1.46931258311251, + "language_loss": 0.72373092, + "learning_rate": 7.758575792474187e-07, + "loss": 0.74138325, + "num_input_tokens_seen": 257839905, + "step": 11948, + "time_per_iteration": 4.311559438705444 + }, + { + "auxiliary_loss_clip": 0.0103963, + "auxiliary_loss_mlp": 0.0103568, + "balance_loss_clip": 1.02339244, + "balance_loss_mlp": 1.024248, + "epoch": 0.7184127461295656, + "flos": 22232655342720.0, + "grad_norm": 1.677477704585451, + "language_loss": 0.71652442, + "learning_rate": 7.755496151288483e-07, + "loss": 0.73727751, + "num_input_tokens_seen": 257860055, + "step": 11949, + "time_per_iteration": 2.672451972961426 + }, + { + "auxiliary_loss_clip": 0.01062047, + "auxiliary_loss_mlp": 0.00747624, + "balance_loss_clip": 1.02486169, + "balance_loss_mlp": 1.00036788, + "epoch": 0.7184728693822335, + "flos": 27344503480320.0, + "grad_norm": 1.9796484356928585, + "language_loss": 0.76076066, + "learning_rate": 7.752416974415598e-07, + "loss": 0.77885747, + "num_input_tokens_seen": 257879315, + "step": 11950, + "time_per_iteration": 2.610025644302368 + }, + { + "auxiliary_loss_clip": 0.01065094, + "auxiliary_loss_mlp": 0.01031639, + "balance_loss_clip": 1.02559984, + "balance_loss_mlp": 1.01987278, + "epoch": 0.7185329926349016, + "flos": 16508530949760.0, + "grad_norm": 2.4016244615914517, + "language_loss": 0.67961591, + "learning_rate": 7.749338261972282e-07, + "loss": 0.70058328, + "num_input_tokens_seen": 257896570, + "step": 11951, + "time_per_iteration": 2.5413920879364014 + }, + { + "auxiliary_loss_clip": 0.01043656, + "auxiliary_loss_mlp": 0.01035317, + "balance_loss_clip": 1.02669907, + "balance_loss_mlp": 1.02324104, + "epoch": 0.7185931158875695, + "flos": 23951052967680.0, + "grad_norm": 1.6206636975707602, + "language_loss": 0.78049111, + "learning_rate": 7.746260014075286e-07, + "loss": 0.80128086, + "num_input_tokens_seen": 257916855, + "step": 11952, + "time_per_iteration": 2.7051522731781006 + }, + { + "auxiliary_loss_clip": 0.01055813, + "auxiliary_loss_mlp": 0.01031262, + "balance_loss_clip": 1.0254035, + "balance_loss_mlp": 1.019943, + "epoch": 0.7186532391402375, + "flos": 26542007775360.0, + "grad_norm": 2.4080264664957105, + "language_loss": 0.7495966, + "learning_rate": 7.743182230841352e-07, + "loss": 0.77046734, + "num_input_tokens_seen": 257937140, + "step": 11953, + "time_per_iteration": 2.735637664794922 + }, + { + "auxiliary_loss_clip": 0.01052107, + "auxiliary_loss_mlp": 0.0102922, + "balance_loss_clip": 1.02341175, + "balance_loss_mlp": 1.0180974, + "epoch": 0.7187133623929055, + "flos": 22383049587840.0, + "grad_norm": 2.1375670479041866, + "language_loss": 0.73366106, + "learning_rate": 7.740104912387164e-07, + "loss": 0.75447434, + "num_input_tokens_seen": 257956785, + "step": 11954, + "time_per_iteration": 2.6900734901428223 + }, + { + "auxiliary_loss_clip": 0.01045691, + "auxiliary_loss_mlp": 0.0103779, + "balance_loss_clip": 1.02683139, + "balance_loss_mlp": 1.0265547, + "epoch": 0.7187734856455734, + "flos": 15779580341760.0, + "grad_norm": 1.6624523187894125, + "language_loss": 0.74252689, + "learning_rate": 7.737028058829425e-07, + "loss": 0.76336169, + "num_input_tokens_seen": 257975455, + "step": 11955, + "time_per_iteration": 2.6888396739959717 + }, + { + "auxiliary_loss_clip": 0.01033505, + "auxiliary_loss_mlp": 0.01035228, + "balance_loss_clip": 1.02471471, + "balance_loss_mlp": 1.0245347, + "epoch": 0.7188336088982414, + "flos": 31759612531200.0, + "grad_norm": 1.6077272761731245, + "language_loss": 0.73372942, + "learning_rate": 7.733951670284817e-07, + "loss": 0.75441676, + "num_input_tokens_seen": 257996850, + "step": 11956, + "time_per_iteration": 2.850658416748047 + }, + { + "auxiliary_loss_clip": 0.0098171, + "auxiliary_loss_mlp": 0.01038585, + "balance_loss_clip": 1.01863289, + "balance_loss_mlp": 1.02548385, + "epoch": 0.7188937321509093, + "flos": 21465208333440.0, + "grad_norm": 1.9401552618008047, + "language_loss": 0.7112965, + "learning_rate": 7.730875746869987e-07, + "loss": 0.73149943, + "num_input_tokens_seen": 258016145, + "step": 11957, + "time_per_iteration": 4.559526205062866 + }, + { + "auxiliary_loss_clip": 0.01010794, + "auxiliary_loss_mlp": 0.01035663, + "balance_loss_clip": 1.02225578, + "balance_loss_mlp": 1.02433252, + "epoch": 0.7189538554035774, + "flos": 27271497087360.0, + "grad_norm": 1.6953643570661905, + "language_loss": 0.73954284, + "learning_rate": 7.727800288701582e-07, + "loss": 0.76000738, + "num_input_tokens_seen": 258035420, + "step": 11958, + "time_per_iteration": 2.879222869873047 + }, + { + "auxiliary_loss_clip": 0.0104663, + "auxiliary_loss_mlp": 0.01033745, + "balance_loss_clip": 1.02284265, + "balance_loss_mlp": 1.02263451, + "epoch": 0.7190139786562453, + "flos": 21580625710080.0, + "grad_norm": 1.5267109862303876, + "language_loss": 0.83947259, + "learning_rate": 7.724725295896215e-07, + "loss": 0.8602764, + "num_input_tokens_seen": 258053520, + "step": 11959, + "time_per_iteration": 2.6313555240631104 + }, + { + "auxiliary_loss_clip": 0.0106869, + "auxiliary_loss_mlp": 0.01031064, + "balance_loss_clip": 1.02875781, + "balance_loss_mlp": 1.01939917, + "epoch": 0.7190741019089133, + "flos": 26721237663360.0, + "grad_norm": 1.537059955812137, + "language_loss": 0.81854445, + "learning_rate": 7.7216507685705e-07, + "loss": 0.83954197, + "num_input_tokens_seen": 258073020, + "step": 11960, + "time_per_iteration": 2.681640863418579 + }, + { + "auxiliary_loss_clip": 0.0103939, + "auxiliary_loss_mlp": 0.01035344, + "balance_loss_clip": 1.02435076, + "balance_loss_mlp": 1.02361369, + "epoch": 0.7191342251615812, + "flos": 26104759516800.0, + "grad_norm": 1.8656902045094872, + "language_loss": 0.77419877, + "learning_rate": 7.718576706841013e-07, + "loss": 0.79494607, + "num_input_tokens_seen": 258093155, + "step": 11961, + "time_per_iteration": 2.740546464920044 + }, + { + "auxiliary_loss_clip": 0.01049292, + "auxiliary_loss_mlp": 0.0102956, + "balance_loss_clip": 1.02398705, + "balance_loss_mlp": 1.01969528, + "epoch": 0.7191943484142492, + "flos": 22967028904320.0, + "grad_norm": 1.4231352222896763, + "language_loss": 0.74903941, + "learning_rate": 7.715503110824326e-07, + "loss": 0.76982796, + "num_input_tokens_seen": 258113905, + "step": 11962, + "time_per_iteration": 2.6923985481262207 + }, + { + "auxiliary_loss_clip": 0.010544, + "auxiliary_loss_mlp": 0.01032046, + "balance_loss_clip": 1.02519953, + "balance_loss_mlp": 1.01973164, + "epoch": 0.7192544716669171, + "flos": 22565332131840.0, + "grad_norm": 1.7155496156965893, + "language_loss": 0.75211763, + "learning_rate": 7.712429980637001e-07, + "loss": 0.77298206, + "num_input_tokens_seen": 258132820, + "step": 11963, + "time_per_iteration": 2.699208974838257 + }, + { + "auxiliary_loss_clip": 0.01028831, + "auxiliary_loss_mlp": 0.01039098, + "balance_loss_clip": 1.02586508, + "balance_loss_mlp": 1.02674246, + "epoch": 0.7193145949195852, + "flos": 18982200873600.0, + "grad_norm": 3.242347794146268, + "language_loss": 0.81289923, + "learning_rate": 7.709357316395564e-07, + "loss": 0.83357859, + "num_input_tokens_seen": 258148055, + "step": 11964, + "time_per_iteration": 2.7087666988372803 + }, + { + "auxiliary_loss_clip": 0.01053156, + "auxiliary_loss_mlp": 0.01033958, + "balance_loss_clip": 1.02491176, + "balance_loss_mlp": 1.02261496, + "epoch": 0.7193747181722531, + "flos": 18004246208640.0, + "grad_norm": 1.6711923734267522, + "language_loss": 0.7484681, + "learning_rate": 7.70628511821652e-07, + "loss": 0.76933926, + "num_input_tokens_seen": 258165995, + "step": 11965, + "time_per_iteration": 2.612755298614502 + }, + { + "auxiliary_loss_clip": 0.01045558, + "auxiliary_loss_mlp": 0.01029978, + "balance_loss_clip": 1.02695394, + "balance_loss_mlp": 1.01843309, + "epoch": 0.7194348414249211, + "flos": 24389414547840.0, + "grad_norm": 1.581452993141075, + "language_loss": 0.77470392, + "learning_rate": 7.703213386216377e-07, + "loss": 0.79545927, + "num_input_tokens_seen": 258186165, + "step": 11966, + "time_per_iteration": 2.7148396968841553 + }, + { + "auxiliary_loss_clip": 0.0103687, + "auxiliary_loss_mlp": 0.01029815, + "balance_loss_clip": 1.02308869, + "balance_loss_mlp": 1.01832926, + "epoch": 0.7194949646775891, + "flos": 22163455791360.0, + "grad_norm": 1.9100682392320412, + "language_loss": 0.72967952, + "learning_rate": 7.700142120511619e-07, + "loss": 0.75034642, + "num_input_tokens_seen": 258204595, + "step": 11967, + "time_per_iteration": 2.739556074142456 + }, + { + "auxiliary_loss_clip": 0.01040066, + "auxiliary_loss_mlp": 0.01028971, + "balance_loss_clip": 1.0264535, + "balance_loss_mlp": 1.0192498, + "epoch": 0.719555087930257, + "flos": 20266366982400.0, + "grad_norm": 9.818103281331608, + "language_loss": 0.81670696, + "learning_rate": 7.6970713212187e-07, + "loss": 0.83739734, + "num_input_tokens_seen": 258223110, + "step": 11968, + "time_per_iteration": 2.681407928466797 + }, + { + "auxiliary_loss_clip": 0.01043208, + "auxiliary_loss_mlp": 0.01026969, + "balance_loss_clip": 1.02526641, + "balance_loss_mlp": 1.01663375, + "epoch": 0.719615211182925, + "flos": 24716309247360.0, + "grad_norm": 2.0695606031810208, + "language_loss": 0.76476359, + "learning_rate": 7.69400098845407e-07, + "loss": 0.78546536, + "num_input_tokens_seen": 258242660, + "step": 11969, + "time_per_iteration": 2.8999829292297363 + }, + { + "auxiliary_loss_clip": 0.01018794, + "auxiliary_loss_mlp": 0.01028967, + "balance_loss_clip": 1.01989913, + "balance_loss_mlp": 1.01701045, + "epoch": 0.719675334435593, + "flos": 20009641501440.0, + "grad_norm": 1.3766407463149213, + "language_loss": 0.70932823, + "learning_rate": 7.69093112233417e-07, + "loss": 0.72980583, + "num_input_tokens_seen": 258261850, + "step": 11970, + "time_per_iteration": 2.7419300079345703 + }, + { + "auxiliary_loss_clip": 0.00986976, + "auxiliary_loss_mlp": 0.00999761, + "balance_loss_clip": 1.00105619, + "balance_loss_mlp": 0.99864686, + "epoch": 0.719735457688261, + "flos": 44199861177600.0, + "grad_norm": 0.9592152769091761, + "language_loss": 0.60839468, + "learning_rate": 7.68786172297538e-07, + "loss": 0.62826204, + "num_input_tokens_seen": 258312570, + "step": 11971, + "time_per_iteration": 3.1402204036712646 + }, + { + "auxiliary_loss_clip": 0.0106923, + "auxiliary_loss_mlp": 0.01032199, + "balance_loss_clip": 1.02772093, + "balance_loss_mlp": 1.02088594, + "epoch": 0.7197955809409289, + "flos": 16802890905600.0, + "grad_norm": 1.7762370662237068, + "language_loss": 0.80199802, + "learning_rate": 7.684792790494105e-07, + "loss": 0.82301235, + "num_input_tokens_seen": 258331600, + "step": 11972, + "time_per_iteration": 2.6018993854522705 + }, + { + "auxiliary_loss_clip": 0.01040316, + "auxiliary_loss_mlp": 0.01031713, + "balance_loss_clip": 1.02273297, + "balance_loss_mlp": 1.02031064, + "epoch": 0.7198557041935969, + "flos": 24535391420160.0, + "grad_norm": 1.760177007974699, + "language_loss": 0.7527684, + "learning_rate": 7.681724325006733e-07, + "loss": 0.7734887, + "num_input_tokens_seen": 258351785, + "step": 11973, + "time_per_iteration": 2.71079421043396 + }, + { + "auxiliary_loss_clip": 0.00971126, + "auxiliary_loss_mlp": 0.01000714, + "balance_loss_clip": 1.0052346, + "balance_loss_mlp": 0.99976683, + "epoch": 0.7199158274462648, + "flos": 70710839602560.0, + "grad_norm": 0.8532042707543324, + "language_loss": 0.57176602, + "learning_rate": 7.6786563266296e-07, + "loss": 0.59148443, + "num_input_tokens_seen": 258404035, + "step": 11974, + "time_per_iteration": 3.1073191165924072 + }, + { + "auxiliary_loss_clip": 0.01044979, + "auxiliary_loss_mlp": 0.01030388, + "balance_loss_clip": 1.02467656, + "balance_loss_mlp": 1.01917088, + "epoch": 0.7199759506989328, + "flos": 29347995352320.0, + "grad_norm": 1.8496924672178323, + "language_loss": 0.60772038, + "learning_rate": 7.675588795479062e-07, + "loss": 0.62847412, + "num_input_tokens_seen": 258424850, + "step": 11975, + "time_per_iteration": 2.8645126819610596 + }, + { + "auxiliary_loss_clip": 0.01048971, + "auxiliary_loss_mlp": 0.01028966, + "balance_loss_clip": 1.02245879, + "balance_loss_mlp": 1.01863694, + "epoch": 0.7200360739516007, + "flos": 24640465680000.0, + "grad_norm": 1.982582946000717, + "language_loss": 0.6774658, + "learning_rate": 7.672521731671425e-07, + "loss": 0.69824517, + "num_input_tokens_seen": 258445485, + "step": 11976, + "time_per_iteration": 2.694666862487793 + }, + { + "auxiliary_loss_clip": 0.01033779, + "auxiliary_loss_mlp": 0.01024207, + "balance_loss_clip": 1.02503729, + "balance_loss_mlp": 1.01419342, + "epoch": 0.7200961972042688, + "flos": 20812855478400.0, + "grad_norm": 1.6598547459513475, + "language_loss": 0.6748879, + "learning_rate": 7.669455135323004e-07, + "loss": 0.69546777, + "num_input_tokens_seen": 258464505, + "step": 11977, + "time_per_iteration": 2.7930479049682617 + }, + { + "auxiliary_loss_clip": 0.01040971, + "auxiliary_loss_mlp": 0.01029853, + "balance_loss_clip": 1.02286804, + "balance_loss_mlp": 1.01917219, + "epoch": 0.7201563204569367, + "flos": 31245910174080.0, + "grad_norm": 1.778224716782109, + "language_loss": 0.75145084, + "learning_rate": 7.666389006550074e-07, + "loss": 0.7721591, + "num_input_tokens_seen": 258487190, + "step": 11978, + "time_per_iteration": 4.440374135971069 + }, + { + "auxiliary_loss_clip": 0.01060314, + "auxiliary_loss_mlp": 0.01032407, + "balance_loss_clip": 1.02309644, + "balance_loss_mlp": 1.02139235, + "epoch": 0.7202164437096047, + "flos": 26651391667200.0, + "grad_norm": 1.7177009433733268, + "language_loss": 0.79029965, + "learning_rate": 7.663323345468908e-07, + "loss": 0.81122684, + "num_input_tokens_seen": 258503790, + "step": 11979, + "time_per_iteration": 4.3622705936431885 + }, + { + "auxiliary_loss_clip": 0.01049912, + "auxiliary_loss_mlp": 0.01027201, + "balance_loss_clip": 1.02370977, + "balance_loss_mlp": 1.01608503, + "epoch": 0.7202765669622727, + "flos": 25959608657280.0, + "grad_norm": 2.2677677889997816, + "language_loss": 0.64947397, + "learning_rate": 7.660258152195767e-07, + "loss": 0.67024505, + "num_input_tokens_seen": 258527335, + "step": 11980, + "time_per_iteration": 2.781935453414917 + }, + { + "auxiliary_loss_clip": 0.01056846, + "auxiliary_loss_mlp": 0.01031955, + "balance_loss_clip": 1.02719474, + "balance_loss_mlp": 1.01970601, + "epoch": 0.7203366902149406, + "flos": 28512354372480.0, + "grad_norm": 2.0332014948041635, + "language_loss": 0.67278099, + "learning_rate": 7.657193426846871e-07, + "loss": 0.69366896, + "num_input_tokens_seen": 258546690, + "step": 11981, + "time_per_iteration": 2.6636741161346436 + }, + { + "auxiliary_loss_clip": 0.01028943, + "auxiliary_loss_mlp": 0.01031844, + "balance_loss_clip": 1.02287364, + "balance_loss_mlp": 1.02017307, + "epoch": 0.7203968134676086, + "flos": 21106030285440.0, + "grad_norm": 1.6175730168937703, + "language_loss": 0.73634803, + "learning_rate": 7.65412916953843e-07, + "loss": 0.75695586, + "num_input_tokens_seen": 258566340, + "step": 11982, + "time_per_iteration": 2.6800615787506104 + }, + { + "auxiliary_loss_clip": 0.01037207, + "auxiliary_loss_mlp": 0.00747751, + "balance_loss_clip": 1.02413344, + "balance_loss_mlp": 1.00040507, + "epoch": 0.7204569367202766, + "flos": 18332146488960.0, + "grad_norm": 1.9062882303404378, + "language_loss": 0.66702199, + "learning_rate": 7.65106538038665e-07, + "loss": 0.68487155, + "num_input_tokens_seen": 258584455, + "step": 11983, + "time_per_iteration": 2.6879193782806396 + }, + { + "auxiliary_loss_clip": 0.01047131, + "auxiliary_loss_mlp": 0.01032954, + "balance_loss_clip": 1.0287509, + "balance_loss_mlp": 1.02154589, + "epoch": 0.7205170599729446, + "flos": 23255103980160.0, + "grad_norm": 1.665947294939103, + "language_loss": 0.66493279, + "learning_rate": 7.648002059507715e-07, + "loss": 0.68573362, + "num_input_tokens_seen": 258604725, + "step": 11984, + "time_per_iteration": 2.865027904510498 + }, + { + "auxiliary_loss_clip": 0.01049572, + "auxiliary_loss_mlp": 0.01028464, + "balance_loss_clip": 1.02434325, + "balance_loss_mlp": 1.01678789, + "epoch": 0.7205771832256125, + "flos": 20120892900480.0, + "grad_norm": 1.5863499540086856, + "language_loss": 0.73671412, + "learning_rate": 7.644939207017771e-07, + "loss": 0.75749445, + "num_input_tokens_seen": 258622885, + "step": 11985, + "time_per_iteration": 2.613452911376953 + }, + { + "auxiliary_loss_clip": 0.0105515, + "auxiliary_loss_mlp": 0.01028083, + "balance_loss_clip": 1.02725542, + "balance_loss_mlp": 1.01772928, + "epoch": 0.7206373064782805, + "flos": 27703250565120.0, + "grad_norm": 1.6786514518350313, + "language_loss": 0.62766182, + "learning_rate": 7.641876823032977e-07, + "loss": 0.64849412, + "num_input_tokens_seen": 258644305, + "step": 11986, + "time_per_iteration": 2.7391443252563477 + }, + { + "auxiliary_loss_clip": 0.01046557, + "auxiliary_loss_mlp": 0.01032143, + "balance_loss_clip": 1.02688384, + "balance_loss_mlp": 1.01979327, + "epoch": 0.7206974297309484, + "flos": 17968156018560.0, + "grad_norm": 1.9254697567462893, + "language_loss": 0.7284435, + "learning_rate": 7.638814907669455e-07, + "loss": 0.7492305, + "num_input_tokens_seen": 258661775, + "step": 11987, + "time_per_iteration": 2.6582143306732178 + }, + { + "auxiliary_loss_clip": 0.01045657, + "auxiliary_loss_mlp": 0.01030601, + "balance_loss_clip": 1.025769, + "balance_loss_mlp": 1.01953244, + "epoch": 0.7207575529836164, + "flos": 16983162288000.0, + "grad_norm": 1.936012593894307, + "language_loss": 0.78307068, + "learning_rate": 7.635753461043301e-07, + "loss": 0.80383325, + "num_input_tokens_seen": 258679830, + "step": 11988, + "time_per_iteration": 2.6825225353240967 + }, + { + "auxiliary_loss_clip": 0.01063188, + "auxiliary_loss_mlp": 0.01029641, + "balance_loss_clip": 1.02568281, + "balance_loss_mlp": 1.01906753, + "epoch": 0.7208176762362843, + "flos": 18727594295040.0, + "grad_norm": 2.3619383370091183, + "language_loss": 0.79178023, + "learning_rate": 7.632692483270618e-07, + "loss": 0.81270856, + "num_input_tokens_seen": 258697415, + "step": 11989, + "time_per_iteration": 2.565305471420288 + }, + { + "auxiliary_loss_clip": 0.01062955, + "auxiliary_loss_mlp": 0.01027393, + "balance_loss_clip": 1.02528131, + "balance_loss_mlp": 1.01664042, + "epoch": 0.7208777994889524, + "flos": 18734489706240.0, + "grad_norm": 1.9742285050715667, + "language_loss": 0.83068168, + "learning_rate": 7.629631974467481e-07, + "loss": 0.85158509, + "num_input_tokens_seen": 258716755, + "step": 11990, + "time_per_iteration": 2.633308172225952 + }, + { + "auxiliary_loss_clip": 0.01034143, + "auxiliary_loss_mlp": 0.01032789, + "balance_loss_clip": 1.02433443, + "balance_loss_mlp": 1.02235866, + "epoch": 0.7209379227416203, + "flos": 14793437376000.0, + "grad_norm": 1.8468727145996462, + "language_loss": 0.76530981, + "learning_rate": 7.626571934749931e-07, + "loss": 0.78597915, + "num_input_tokens_seen": 258733270, + "step": 11991, + "time_per_iteration": 2.7302961349487305 + }, + { + "auxiliary_loss_clip": 0.01024544, + "auxiliary_loss_mlp": 0.01027981, + "balance_loss_clip": 1.02273166, + "balance_loss_mlp": 1.01668048, + "epoch": 0.7209980459942883, + "flos": 29636860527360.0, + "grad_norm": 1.8987407640429803, + "language_loss": 0.72742349, + "learning_rate": 7.623512364234022e-07, + "loss": 0.74794877, + "num_input_tokens_seen": 258755270, + "step": 11992, + "time_per_iteration": 2.8034071922302246 + }, + { + "auxiliary_loss_clip": 0.01050392, + "auxiliary_loss_mlp": 0.01027458, + "balance_loss_clip": 1.02267146, + "balance_loss_mlp": 1.0168066, + "epoch": 0.7210581692469563, + "flos": 23477175815040.0, + "grad_norm": 1.4482536802435295, + "language_loss": 0.66339123, + "learning_rate": 7.620453263035755e-07, + "loss": 0.68416971, + "num_input_tokens_seen": 258775340, + "step": 11993, + "time_per_iteration": 2.656649351119995 + }, + { + "auxiliary_loss_clip": 0.01052974, + "auxiliary_loss_mlp": 0.01028213, + "balance_loss_clip": 1.0239526, + "balance_loss_mlp": 1.01756752, + "epoch": 0.7211182924996242, + "flos": 26099839353600.0, + "grad_norm": 2.0751688316640826, + "language_loss": 0.66026735, + "learning_rate": 7.61739463127115e-07, + "loss": 0.68107921, + "num_input_tokens_seen": 258794580, + "step": 11994, + "time_per_iteration": 2.6660218238830566 + }, + { + "auxiliary_loss_clip": 0.01049027, + "auxiliary_loss_mlp": 0.01031644, + "balance_loss_clip": 1.02271736, + "balance_loss_mlp": 1.0195024, + "epoch": 0.7211784157522922, + "flos": 17712076982400.0, + "grad_norm": 1.740775096724601, + "language_loss": 0.67071068, + "learning_rate": 7.614336469056172e-07, + "loss": 0.69151735, + "num_input_tokens_seen": 258812330, + "step": 11995, + "time_per_iteration": 4.2357892990112305 + }, + { + "auxiliary_loss_clip": 0.01036104, + "auxiliary_loss_mlp": 0.01028996, + "balance_loss_clip": 1.02304196, + "balance_loss_mlp": 1.01730704, + "epoch": 0.7212385390049602, + "flos": 24423637230720.0, + "grad_norm": 1.6140857495163297, + "language_loss": 0.79315645, + "learning_rate": 7.6112787765068e-07, + "loss": 0.81380743, + "num_input_tokens_seen": 258831770, + "step": 11996, + "time_per_iteration": 2.7128589153289795 + }, + { + "auxiliary_loss_clip": 0.01064789, + "auxiliary_loss_mlp": 0.01032367, + "balance_loss_clip": 1.02575147, + "balance_loss_mlp": 1.02160239, + "epoch": 0.7212986622576282, + "flos": 28147250580480.0, + "grad_norm": 2.9406829663723846, + "language_loss": 0.81479836, + "learning_rate": 7.60822155373899e-07, + "loss": 0.83576995, + "num_input_tokens_seen": 258849090, + "step": 11997, + "time_per_iteration": 2.6501941680908203 + }, + { + "auxiliary_loss_clip": 0.0106485, + "auxiliary_loss_mlp": 0.0103022, + "balance_loss_clip": 1.02599192, + "balance_loss_mlp": 1.01891279, + "epoch": 0.7213587855102961, + "flos": 21835770992640.0, + "grad_norm": 2.281476747986967, + "language_loss": 0.66900152, + "learning_rate": 7.605164800868646e-07, + "loss": 0.68995225, + "num_input_tokens_seen": 258868230, + "step": 11998, + "time_per_iteration": 2.6316168308258057 + }, + { + "auxiliary_loss_clip": 0.01062317, + "auxiliary_loss_mlp": 0.01029806, + "balance_loss_clip": 1.0252533, + "balance_loss_mlp": 1.01992369, + "epoch": 0.7214189087629641, + "flos": 14611549881600.0, + "grad_norm": 1.7410310651806407, + "language_loss": 0.72382808, + "learning_rate": 7.602108518011696e-07, + "loss": 0.74474931, + "num_input_tokens_seen": 258885525, + "step": 11999, + "time_per_iteration": 2.5552728176116943 + }, + { + "auxiliary_loss_clip": 0.01043539, + "auxiliary_loss_mlp": 0.01023824, + "balance_loss_clip": 1.02488101, + "balance_loss_mlp": 1.01308274, + "epoch": 0.721479032015632, + "flos": 19390864884480.0, + "grad_norm": 2.184215026728049, + "language_loss": 0.82739627, + "learning_rate": 7.599052705284039e-07, + "loss": 0.84806991, + "num_input_tokens_seen": 258903245, + "step": 12000, + "time_per_iteration": 2.641516923904419 + }, + { + "auxiliary_loss_clip": 0.0105583, + "auxiliary_loss_mlp": 0.0103031, + "balance_loss_clip": 1.0264796, + "balance_loss_mlp": 1.0196228, + "epoch": 0.7215391552683, + "flos": 18512884748160.0, + "grad_norm": 2.1529129821310216, + "language_loss": 0.77180248, + "learning_rate": 7.59599736280154e-07, + "loss": 0.79266387, + "num_input_tokens_seen": 258921245, + "step": 12001, + "time_per_iteration": 2.5926613807678223 + }, + { + "auxiliary_loss_clip": 0.01045028, + "auxiliary_loss_mlp": 0.01035901, + "balance_loss_clip": 1.02423561, + "balance_loss_mlp": 1.02472472, + "epoch": 0.721599278520968, + "flos": 23258731253760.0, + "grad_norm": 1.626971538140541, + "language_loss": 0.81625479, + "learning_rate": 7.592942490680066e-07, + "loss": 0.83706409, + "num_input_tokens_seen": 258939425, + "step": 12002, + "time_per_iteration": 2.752722978591919 + }, + { + "auxiliary_loss_clip": 0.01055267, + "auxiliary_loss_mlp": 0.01027326, + "balance_loss_clip": 1.02631199, + "balance_loss_mlp": 1.01589346, + "epoch": 0.721659401773636, + "flos": 39199045979520.0, + "grad_norm": 1.773954852337339, + "language_loss": 0.62305152, + "learning_rate": 7.589888089035462e-07, + "loss": 0.64387751, + "num_input_tokens_seen": 258960710, + "step": 12003, + "time_per_iteration": 2.808730363845825 + }, + { + "auxiliary_loss_clip": 0.01063314, + "auxiliary_loss_mlp": 0.01031542, + "balance_loss_clip": 1.02479243, + "balance_loss_mlp": 1.02013409, + "epoch": 0.7217195250263039, + "flos": 14939917038720.0, + "grad_norm": 2.106148619319932, + "language_loss": 0.68292993, + "learning_rate": 7.586834157983544e-07, + "loss": 0.70387846, + "num_input_tokens_seen": 258978475, + "step": 12004, + "time_per_iteration": 2.624305009841919 + }, + { + "auxiliary_loss_clip": 0.00991195, + "auxiliary_loss_mlp": 0.01003867, + "balance_loss_clip": 1.00483966, + "balance_loss_mlp": 1.00263333, + "epoch": 0.7217796482789719, + "flos": 70869206666880.0, + "grad_norm": 1.1524830740801977, + "language_loss": 0.54191601, + "learning_rate": 7.583780697640112e-07, + "loss": 0.56186664, + "num_input_tokens_seen": 259037520, + "step": 12005, + "time_per_iteration": 4.8253233432769775 + }, + { + "auxiliary_loss_clip": 0.01037568, + "auxiliary_loss_mlp": 0.01029182, + "balance_loss_clip": 1.02830005, + "balance_loss_mlp": 1.01768398, + "epoch": 0.7218397715316398, + "flos": 37451525402880.0, + "grad_norm": 1.4373460406146694, + "language_loss": 0.63343382, + "learning_rate": 7.580727708120962e-07, + "loss": 0.65410131, + "num_input_tokens_seen": 259061325, + "step": 12006, + "time_per_iteration": 2.787362575531006 + }, + { + "auxiliary_loss_clip": 0.01034614, + "auxiliary_loss_mlp": 0.01031368, + "balance_loss_clip": 1.02312016, + "balance_loss_mlp": 1.02066302, + "epoch": 0.7218998947843078, + "flos": 22710662559360.0, + "grad_norm": 1.7254295310244319, + "language_loss": 0.91827118, + "learning_rate": 7.577675189541865e-07, + "loss": 0.93893099, + "num_input_tokens_seen": 259078135, + "step": 12007, + "time_per_iteration": 2.644943952560425 + }, + { + "auxiliary_loss_clip": 0.01024085, + "auxiliary_loss_mlp": 0.01031459, + "balance_loss_clip": 1.02091265, + "balance_loss_mlp": 1.01856613, + "epoch": 0.7219600180369758, + "flos": 12167182477440.0, + "grad_norm": 1.9244325178811905, + "language_loss": 0.64259517, + "learning_rate": 7.574623142018568e-07, + "loss": 0.66315055, + "num_input_tokens_seen": 259095910, + "step": 12008, + "time_per_iteration": 2.682485580444336 + }, + { + "auxiliary_loss_clip": 0.01054082, + "auxiliary_loss_mlp": 0.01030814, + "balance_loss_clip": 1.02475989, + "balance_loss_mlp": 1.01971507, + "epoch": 0.7220201412896438, + "flos": 22596573985920.0, + "grad_norm": 2.713719835508369, + "language_loss": 0.7813285, + "learning_rate": 7.57157156566681e-07, + "loss": 0.80217743, + "num_input_tokens_seen": 259114225, + "step": 12009, + "time_per_iteration": 2.6187379360198975 + }, + { + "auxiliary_loss_clip": 0.01054628, + "auxiliary_loss_mlp": 0.0103704, + "balance_loss_clip": 1.02569902, + "balance_loss_mlp": 1.02442765, + "epoch": 0.7220802645423118, + "flos": 26718651884160.0, + "grad_norm": 1.946255068749444, + "language_loss": 0.64029074, + "learning_rate": 7.568520460602297e-07, + "loss": 0.66120744, + "num_input_tokens_seen": 259134660, + "step": 12010, + "time_per_iteration": 2.7383785247802734 + }, + { + "auxiliary_loss_clip": 0.01062701, + "auxiliary_loss_mlp": 0.01028386, + "balance_loss_clip": 1.02433395, + "balance_loss_mlp": 1.01727557, + "epoch": 0.7221403877949797, + "flos": 24420548661120.0, + "grad_norm": 2.3370985930545323, + "language_loss": 0.77016371, + "learning_rate": 7.565469826940742e-07, + "loss": 0.79107463, + "num_input_tokens_seen": 259153300, + "step": 12011, + "time_per_iteration": 2.7463576793670654 + }, + { + "auxiliary_loss_clip": 0.01044069, + "auxiliary_loss_mlp": 0.0102969, + "balance_loss_clip": 1.02263188, + "balance_loss_mlp": 1.01886582, + "epoch": 0.7222005110476477, + "flos": 23514379326720.0, + "grad_norm": 1.5649359495974544, + "language_loss": 0.78988755, + "learning_rate": 7.56241966479781e-07, + "loss": 0.81062514, + "num_input_tokens_seen": 259172115, + "step": 12012, + "time_per_iteration": 2.617422103881836 + }, + { + "auxiliary_loss_clip": 0.0104725, + "auxiliary_loss_mlp": 0.01030301, + "balance_loss_clip": 1.02718997, + "balance_loss_mlp": 1.01969123, + "epoch": 0.7222606343003156, + "flos": 23112538899840.0, + "grad_norm": 1.6808943785611712, + "language_loss": 0.75805652, + "learning_rate": 7.559369974289171e-07, + "loss": 0.77883208, + "num_input_tokens_seen": 259191345, + "step": 12013, + "time_per_iteration": 2.7186806201934814 + }, + { + "auxiliary_loss_clip": 0.01063383, + "auxiliary_loss_mlp": 0.01023641, + "balance_loss_clip": 1.02570832, + "balance_loss_mlp": 1.01349664, + "epoch": 0.7223207575529836, + "flos": 24351169541760.0, + "grad_norm": 1.461184477873987, + "language_loss": 0.75750959, + "learning_rate": 7.556320755530484e-07, + "loss": 0.7783798, + "num_input_tokens_seen": 259211700, + "step": 12014, + "time_per_iteration": 2.5662477016448975 + }, + { + "auxiliary_loss_clip": 0.01054197, + "auxiliary_loss_mlp": 0.01030532, + "balance_loss_clip": 1.02452874, + "balance_loss_mlp": 1.01934409, + "epoch": 0.7223808808056515, + "flos": 28330179569280.0, + "grad_norm": 1.7059762429330032, + "language_loss": 0.86699426, + "learning_rate": 7.553272008637346e-07, + "loss": 0.88784152, + "num_input_tokens_seen": 259233825, + "step": 12015, + "time_per_iteration": 2.6540379524230957 + }, + { + "auxiliary_loss_clip": 0.01052871, + "auxiliary_loss_mlp": 0.01033903, + "balance_loss_clip": 1.02580428, + "balance_loss_mlp": 1.02343631, + "epoch": 0.7224410040583196, + "flos": 21069437304960.0, + "grad_norm": 1.817554376273802, + "language_loss": 0.77807361, + "learning_rate": 7.55022373372538e-07, + "loss": 0.79894131, + "num_input_tokens_seen": 259253055, + "step": 12016, + "time_per_iteration": 2.6143136024475098 + }, + { + "auxiliary_loss_clip": 0.01015886, + "auxiliary_loss_mlp": 0.01041597, + "balance_loss_clip": 1.01995504, + "balance_loss_mlp": 1.02942574, + "epoch": 0.7225011273109875, + "flos": 26795429205120.0, + "grad_norm": 1.4393939810015572, + "language_loss": 0.77786791, + "learning_rate": 7.547175930910186e-07, + "loss": 0.79844272, + "num_input_tokens_seen": 259273420, + "step": 12017, + "time_per_iteration": 2.8053243160247803 + }, + { + "auxiliary_loss_clip": 0.01060308, + "auxiliary_loss_mlp": 0.01027584, + "balance_loss_clip": 1.02516723, + "balance_loss_mlp": 1.0176717, + "epoch": 0.7225612505636555, + "flos": 23583578878080.0, + "grad_norm": 2.021334676515381, + "language_loss": 0.73721123, + "learning_rate": 7.54412860030732e-07, + "loss": 0.75809014, + "num_input_tokens_seen": 259291000, + "step": 12018, + "time_per_iteration": 2.566070079803467 + }, + { + "auxiliary_loss_clip": 0.01036491, + "auxiliary_loss_mlp": 0.01030737, + "balance_loss_clip": 1.02969027, + "balance_loss_mlp": 1.0209322, + "epoch": 0.7226213738163234, + "flos": 20777627214720.0, + "grad_norm": 2.8042952632144913, + "language_loss": 0.7756182, + "learning_rate": 7.541081742032347e-07, + "loss": 0.79629052, + "num_input_tokens_seen": 259312390, + "step": 12019, + "time_per_iteration": 2.79128360748291 + }, + { + "auxiliary_loss_clip": 0.01046019, + "auxiliary_loss_mlp": 0.01025122, + "balance_loss_clip": 1.02696705, + "balance_loss_mlp": 1.01373756, + "epoch": 0.7226814970689914, + "flos": 32635832901120.0, + "grad_norm": 1.6697808170470767, + "language_loss": 0.73652881, + "learning_rate": 7.53803535620081e-07, + "loss": 0.75724018, + "num_input_tokens_seen": 259332645, + "step": 12020, + "time_per_iteration": 2.745140552520752 + }, + { + "auxiliary_loss_clip": 0.01045718, + "auxiliary_loss_mlp": 0.01026013, + "balance_loss_clip": 1.02460134, + "balance_loss_mlp": 1.01592231, + "epoch": 0.7227416203216595, + "flos": 22454368041600.0, + "grad_norm": 1.6909077702790727, + "language_loss": 0.77221704, + "learning_rate": 7.534989442928219e-07, + "loss": 0.79293436, + "num_input_tokens_seen": 259353810, + "step": 12021, + "time_per_iteration": 2.666667938232422 + }, + { + "auxiliary_loss_clip": 0.01025349, + "auxiliary_loss_mlp": 0.01031079, + "balance_loss_clip": 1.02248633, + "balance_loss_mlp": 1.01995111, + "epoch": 0.7228017435743274, + "flos": 21652303299840.0, + "grad_norm": 1.5833345622311596, + "language_loss": 0.68171054, + "learning_rate": 7.531944002330073e-07, + "loss": 0.7022748, + "num_input_tokens_seen": 259372460, + "step": 12022, + "time_per_iteration": 2.6394596099853516 + }, + { + "auxiliary_loss_clip": 0.0105501, + "auxiliary_loss_mlp": 0.01028778, + "balance_loss_clip": 1.0260222, + "balance_loss_mlp": 1.01716661, + "epoch": 0.7228618668269954, + "flos": 29533474206720.0, + "grad_norm": 1.7514645860417237, + "language_loss": 0.69269079, + "learning_rate": 7.528899034521858e-07, + "loss": 0.71352869, + "num_input_tokens_seen": 259393275, + "step": 12023, + "time_per_iteration": 2.701493740081787 + }, + { + "auxiliary_loss_clip": 0.01031747, + "auxiliary_loss_mlp": 0.01026676, + "balance_loss_clip": 1.02076626, + "balance_loss_mlp": 1.01516664, + "epoch": 0.7229219900796633, + "flos": 27453815544960.0, + "grad_norm": 1.782501688181484, + "language_loss": 0.70874357, + "learning_rate": 7.525854539619052e-07, + "loss": 0.7293278, + "num_input_tokens_seen": 259416205, + "step": 12024, + "time_per_iteration": 2.7292072772979736 + }, + { + "auxiliary_loss_clip": 0.01032068, + "auxiliary_loss_mlp": 0.01030186, + "balance_loss_clip": 1.02443862, + "balance_loss_mlp": 1.0199759, + "epoch": 0.7229821133323313, + "flos": 16289368116480.0, + "grad_norm": 1.9029104935578156, + "language_loss": 0.75349522, + "learning_rate": 7.522810517737089e-07, + "loss": 0.77411771, + "num_input_tokens_seen": 259433115, + "step": 12025, + "time_per_iteration": 2.780153751373291 + }, + { + "auxiliary_loss_clip": 0.01049581, + "auxiliary_loss_mlp": 0.01027333, + "balance_loss_clip": 1.0235672, + "balance_loss_mlp": 1.01715827, + "epoch": 0.7230422365849992, + "flos": 20412343854720.0, + "grad_norm": 1.9439048961629941, + "language_loss": 0.76144648, + "learning_rate": 7.519766968991395e-07, + "loss": 0.7822156, + "num_input_tokens_seen": 259450475, + "step": 12026, + "time_per_iteration": 5.880870580673218 + }, + { + "auxiliary_loss_clip": 0.01053489, + "auxiliary_loss_mlp": 0.01037557, + "balance_loss_clip": 1.02496064, + "balance_loss_mlp": 1.02719212, + "epoch": 0.7231023598376672, + "flos": 25593499284480.0, + "grad_norm": 2.1690060189621976, + "language_loss": 0.66786885, + "learning_rate": 7.516723893497388e-07, + "loss": 0.68877935, + "num_input_tokens_seen": 259469355, + "step": 12027, + "time_per_iteration": 2.611821174621582 + }, + { + "auxiliary_loss_clip": 0.01028021, + "auxiliary_loss_mlp": 0.01029977, + "balance_loss_clip": 1.02837706, + "balance_loss_mlp": 1.01911688, + "epoch": 0.7231624830903352, + "flos": 25149607009920.0, + "grad_norm": 2.243473003827977, + "language_loss": 0.79306561, + "learning_rate": 7.513681291370469e-07, + "loss": 0.8136456, + "num_input_tokens_seen": 259486565, + "step": 12028, + "time_per_iteration": 2.7532801628112793 + }, + { + "auxiliary_loss_clip": 0.01019346, + "auxiliary_loss_mlp": 0.01026708, + "balance_loss_clip": 1.02227998, + "balance_loss_mlp": 1.01499617, + "epoch": 0.7232226063430032, + "flos": 21725740656000.0, + "grad_norm": 1.6434084737533614, + "language_loss": 0.81976902, + "learning_rate": 7.510639162726e-07, + "loss": 0.84022963, + "num_input_tokens_seen": 259505070, + "step": 12029, + "time_per_iteration": 2.7070553302764893 + }, + { + "auxiliary_loss_clip": 0.00989325, + "auxiliary_loss_mlp": 0.01002261, + "balance_loss_clip": 1.00317299, + "balance_loss_mlp": 1.00119424, + "epoch": 0.7232827295956711, + "flos": 68436798491520.0, + "grad_norm": 0.8173474096666637, + "language_loss": 0.61742294, + "learning_rate": 7.507597507679347e-07, + "loss": 0.63733882, + "num_input_tokens_seen": 259569135, + "step": 12030, + "time_per_iteration": 3.2866690158843994 + }, + { + "auxiliary_loss_clip": 0.01043577, + "auxiliary_loss_mlp": 0.01027569, + "balance_loss_clip": 1.02181363, + "balance_loss_mlp": 1.01542783, + "epoch": 0.7233428528483391, + "flos": 20192642317440.0, + "grad_norm": 1.6626547447338427, + "language_loss": 0.78211272, + "learning_rate": 7.504556326345859e-07, + "loss": 0.80282414, + "num_input_tokens_seen": 259587035, + "step": 12031, + "time_per_iteration": 2.5579235553741455 + }, + { + "auxiliary_loss_clip": 0.0105439, + "auxiliary_loss_mlp": 0.01026819, + "balance_loss_clip": 1.02570009, + "balance_loss_mlp": 1.01580358, + "epoch": 0.723402976101007, + "flos": 23949472769280.0, + "grad_norm": 2.19108095281473, + "language_loss": 0.81525391, + "learning_rate": 7.501515618840834e-07, + "loss": 0.83606601, + "num_input_tokens_seen": 259606140, + "step": 12032, + "time_per_iteration": 2.61326265335083 + }, + { + "auxiliary_loss_clip": 0.01021887, + "auxiliary_loss_mlp": 0.01030527, + "balance_loss_clip": 1.02126861, + "balance_loss_mlp": 1.0191009, + "epoch": 0.723463099353675, + "flos": 20813394182400.0, + "grad_norm": 1.849353399477082, + "language_loss": 0.75253987, + "learning_rate": 7.498475385279592e-07, + "loss": 0.77306402, + "num_input_tokens_seen": 259624275, + "step": 12033, + "time_per_iteration": 2.6052842140197754 + }, + { + "auxiliary_loss_clip": 0.01028271, + "auxiliary_loss_mlp": 0.01025586, + "balance_loss_clip": 1.0225153, + "balance_loss_mlp": 1.01593018, + "epoch": 0.723523222606343, + "flos": 19098013299840.0, + "grad_norm": 1.6746663123347942, + "language_loss": 0.75135034, + "learning_rate": 7.495435625777423e-07, + "loss": 0.77188897, + "num_input_tokens_seen": 259643465, + "step": 12034, + "time_per_iteration": 2.6748008728027344 + }, + { + "auxiliary_loss_clip": 0.01042349, + "auxiliary_loss_mlp": 0.01028476, + "balance_loss_clip": 1.02470422, + "balance_loss_mlp": 1.01858211, + "epoch": 0.723583345859011, + "flos": 26506994993280.0, + "grad_norm": 1.6734165706806032, + "language_loss": 0.8038289, + "learning_rate": 7.492396340449578e-07, + "loss": 0.82453716, + "num_input_tokens_seen": 259662500, + "step": 12035, + "time_per_iteration": 2.8036813735961914 + }, + { + "auxiliary_loss_clip": 0.01002111, + "auxiliary_loss_mlp": 0.01029886, + "balance_loss_clip": 1.02520013, + "balance_loss_mlp": 1.01842356, + "epoch": 0.723643469111679, + "flos": 16033863697920.0, + "grad_norm": 1.7851230118840264, + "language_loss": 0.61078459, + "learning_rate": 7.489357529411326e-07, + "loss": 0.63110459, + "num_input_tokens_seen": 259680140, + "step": 12036, + "time_per_iteration": 2.770545482635498 + }, + { + "auxiliary_loss_clip": 0.01051477, + "auxiliary_loss_mlp": 0.01029113, + "balance_loss_clip": 1.02511251, + "balance_loss_mlp": 1.01963592, + "epoch": 0.7237035923643469, + "flos": 21945549934080.0, + "grad_norm": 1.8085609192632244, + "language_loss": 0.67135227, + "learning_rate": 7.486319192777883e-07, + "loss": 0.69215816, + "num_input_tokens_seen": 259700160, + "step": 12037, + "time_per_iteration": 2.7208967208862305 + }, + { + "auxiliary_loss_clip": 0.01061791, + "auxiliary_loss_mlp": 0.01031556, + "balance_loss_clip": 1.02460194, + "balance_loss_mlp": 1.02081561, + "epoch": 0.7237637156170149, + "flos": 23583112001280.0, + "grad_norm": 2.8958620051949984, + "language_loss": 0.72393501, + "learning_rate": 7.483281330664479e-07, + "loss": 0.7448684, + "num_input_tokens_seen": 259720525, + "step": 12038, + "time_per_iteration": 2.618260383605957 + }, + { + "auxiliary_loss_clip": 0.01062903, + "auxiliary_loss_mlp": 0.01032642, + "balance_loss_clip": 1.02519894, + "balance_loss_mlp": 1.02124, + "epoch": 0.7238238388696828, + "flos": 20594698225920.0, + "grad_norm": 1.6411137312381991, + "language_loss": 0.72340059, + "learning_rate": 7.480243943186293e-07, + "loss": 0.74435604, + "num_input_tokens_seen": 259738680, + "step": 12039, + "time_per_iteration": 2.576803684234619 + }, + { + "auxiliary_loss_clip": 0.01065438, + "auxiliary_loss_mlp": 0.01028536, + "balance_loss_clip": 1.02596509, + "balance_loss_mlp": 1.01816511, + "epoch": 0.7238839621223508, + "flos": 24207024263040.0, + "grad_norm": 1.7327296113142814, + "language_loss": 0.75658298, + "learning_rate": 7.477207030458513e-07, + "loss": 0.7775228, + "num_input_tokens_seen": 259758790, + "step": 12040, + "time_per_iteration": 2.6181769371032715 + }, + { + "auxiliary_loss_clip": 0.01032213, + "auxiliary_loss_mlp": 0.01028171, + "balance_loss_clip": 1.02382052, + "balance_loss_mlp": 1.0173707, + "epoch": 0.7239440853750188, + "flos": 14209745368320.0, + "grad_norm": 2.8965569633587998, + "language_loss": 0.76657593, + "learning_rate": 7.474170592596301e-07, + "loss": 0.78717971, + "num_input_tokens_seen": 259777370, + "step": 12041, + "time_per_iteration": 2.7046995162963867 + }, + { + "auxiliary_loss_clip": 0.01054413, + "auxiliary_loss_mlp": 0.01027058, + "balance_loss_clip": 1.02450049, + "balance_loss_mlp": 1.016675, + "epoch": 0.7240042086276868, + "flos": 21614812479360.0, + "grad_norm": 2.041289674095027, + "language_loss": 0.63644284, + "learning_rate": 7.471134629714797e-07, + "loss": 0.65725756, + "num_input_tokens_seen": 259794665, + "step": 12042, + "time_per_iteration": 2.6743240356445312 + }, + { + "auxiliary_loss_clip": 0.01036944, + "auxiliary_loss_mlp": 0.01029288, + "balance_loss_clip": 1.0263865, + "balance_loss_mlp": 1.01770687, + "epoch": 0.7240643318803547, + "flos": 23331450337920.0, + "grad_norm": 3.3353196159952683, + "language_loss": 0.83730829, + "learning_rate": 7.468099141929116e-07, + "loss": 0.8579706, + "num_input_tokens_seen": 259811110, + "step": 12043, + "time_per_iteration": 4.553791522979736 + }, + { + "auxiliary_loss_clip": 0.01031815, + "auxiliary_loss_mlp": 0.01028567, + "balance_loss_clip": 1.02371013, + "balance_loss_mlp": 1.01701593, + "epoch": 0.7241244551330227, + "flos": 24024849459840.0, + "grad_norm": 1.582065529378799, + "language_loss": 0.64015722, + "learning_rate": 7.465064129354379e-07, + "loss": 0.660761, + "num_input_tokens_seen": 259831080, + "step": 12044, + "time_per_iteration": 2.6590890884399414 + }, + { + "auxiliary_loss_clip": 0.01064803, + "auxiliary_loss_mlp": 0.01029745, + "balance_loss_clip": 1.02687407, + "balance_loss_mlp": 1.01874828, + "epoch": 0.7241845783856906, + "flos": 18730323728640.0, + "grad_norm": 1.4564432778737413, + "language_loss": 0.81590021, + "learning_rate": 7.462029592105658e-07, + "loss": 0.83684576, + "num_input_tokens_seen": 259850135, + "step": 12045, + "time_per_iteration": 2.5948901176452637 + }, + { + "auxiliary_loss_clip": 0.01060702, + "auxiliary_loss_mlp": 0.01025786, + "balance_loss_clip": 1.02391565, + "balance_loss_mlp": 1.01544511, + "epoch": 0.7242447016383586, + "flos": 19498668577920.0, + "grad_norm": 1.6735306269424683, + "language_loss": 0.72081476, + "learning_rate": 7.458995530298034e-07, + "loss": 0.74167967, + "num_input_tokens_seen": 259868185, + "step": 12046, + "time_per_iteration": 2.730393409729004 + }, + { + "auxiliary_loss_clip": 0.01024837, + "auxiliary_loss_mlp": 0.01029041, + "balance_loss_clip": 1.02143097, + "balance_loss_mlp": 1.01750779, + "epoch": 0.7243048248910267, + "flos": 22163491704960.0, + "grad_norm": 5.109801231569089, + "language_loss": 0.7103464, + "learning_rate": 7.455961944046553e-07, + "loss": 0.73088515, + "num_input_tokens_seen": 259887055, + "step": 12047, + "time_per_iteration": 2.717613697052002 + }, + { + "auxiliary_loss_clip": 0.01032459, + "auxiliary_loss_mlp": 0.01033228, + "balance_loss_clip": 1.02385914, + "balance_loss_mlp": 1.02166438, + "epoch": 0.7243649481436946, + "flos": 27672762896640.0, + "grad_norm": 1.6612763350980913, + "language_loss": 0.70289099, + "learning_rate": 7.45292883346627e-07, + "loss": 0.72354788, + "num_input_tokens_seen": 259908295, + "step": 12048, + "time_per_iteration": 2.670133590698242 + }, + { + "auxiliary_loss_clip": 0.00989109, + "auxiliary_loss_mlp": 0.01007038, + "balance_loss_clip": 1.00308073, + "balance_loss_mlp": 1.00590503, + "epoch": 0.7244250713963626, + "flos": 63244545759360.0, + "grad_norm": 0.8311868373461614, + "language_loss": 0.53720653, + "learning_rate": 7.449896198672168e-07, + "loss": 0.55716801, + "num_input_tokens_seen": 259968475, + "step": 12049, + "time_per_iteration": 3.2759034633636475 + }, + { + "auxiliary_loss_clip": 0.01037923, + "auxiliary_loss_mlp": 0.01029347, + "balance_loss_clip": 1.02476072, + "balance_loss_mlp": 1.01708043, + "epoch": 0.7244851946490305, + "flos": 17967114524160.0, + "grad_norm": 2.1393146126224676, + "language_loss": 0.60207403, + "learning_rate": 7.446864039779258e-07, + "loss": 0.62274671, + "num_input_tokens_seen": 259984865, + "step": 12050, + "time_per_iteration": 2.6067633628845215 + }, + { + "auxiliary_loss_clip": 0.00972662, + "auxiliary_loss_mlp": 0.01002102, + "balance_loss_clip": 1.00557232, + "balance_loss_mlp": 1.00093329, + "epoch": 0.7245453179016985, + "flos": 70943649603840.0, + "grad_norm": 0.7181409589771758, + "language_loss": 0.53266317, + "learning_rate": 7.443832356902528e-07, + "loss": 0.55241084, + "num_input_tokens_seen": 260046735, + "step": 12051, + "time_per_iteration": 3.367724895477295 + }, + { + "auxiliary_loss_clip": 0.01047963, + "auxiliary_loss_mlp": 0.01028032, + "balance_loss_clip": 1.02288151, + "balance_loss_mlp": 1.01795244, + "epoch": 0.7246054411543664, + "flos": 24568464867840.0, + "grad_norm": 1.487347202295258, + "language_loss": 0.7221086, + "learning_rate": 7.440801150156927e-07, + "loss": 0.74286854, + "num_input_tokens_seen": 260067950, + "step": 12052, + "time_per_iteration": 2.606985569000244 + }, + { + "auxiliary_loss_clip": 0.01047647, + "auxiliary_loss_mlp": 0.01027966, + "balance_loss_clip": 1.02317047, + "balance_loss_mlp": 1.01628947, + "epoch": 0.7246655644070344, + "flos": 32338312548480.0, + "grad_norm": 1.7405623828804584, + "language_loss": 0.74097407, + "learning_rate": 7.437770419657415e-07, + "loss": 0.76173019, + "num_input_tokens_seen": 260087730, + "step": 12053, + "time_per_iteration": 4.383322715759277 + }, + { + "auxiliary_loss_clip": 0.0102815, + "auxiliary_loss_mlp": 0.01027779, + "balance_loss_clip": 1.02407718, + "balance_loss_mlp": 1.01580477, + "epoch": 0.7247256876597024, + "flos": 21872471713920.0, + "grad_norm": 3.859227567649688, + "language_loss": 0.77792901, + "learning_rate": 7.434740165518898e-07, + "loss": 0.79848832, + "num_input_tokens_seen": 260107760, + "step": 12054, + "time_per_iteration": 2.786909818649292 + }, + { + "auxiliary_loss_clip": 0.01032292, + "auxiliary_loss_mlp": 0.01032798, + "balance_loss_clip": 1.02475381, + "balance_loss_mlp": 1.02199745, + "epoch": 0.7247858109123704, + "flos": 16213093585920.0, + "grad_norm": 2.582288850944899, + "language_loss": 0.68422902, + "learning_rate": 7.431710387856301e-07, + "loss": 0.70487988, + "num_input_tokens_seen": 260123660, + "step": 12055, + "time_per_iteration": 2.6908884048461914 + }, + { + "auxiliary_loss_clip": 0.01032495, + "auxiliary_loss_mlp": 0.01030491, + "balance_loss_clip": 1.02527356, + "balance_loss_mlp": 1.02052462, + "epoch": 0.7248459341650383, + "flos": 20850705434880.0, + "grad_norm": 1.785167605132958, + "language_loss": 0.7380904, + "learning_rate": 7.428681086784496e-07, + "loss": 0.75872028, + "num_input_tokens_seen": 260142690, + "step": 12056, + "time_per_iteration": 2.799166202545166 + }, + { + "auxiliary_loss_clip": 0.01060754, + "auxiliary_loss_mlp": 0.0102481, + "balance_loss_clip": 1.02460778, + "balance_loss_mlp": 1.0143733, + "epoch": 0.7249060574177063, + "flos": 25921794614400.0, + "grad_norm": 1.424835841965354, + "language_loss": 0.70783329, + "learning_rate": 7.425652262418368e-07, + "loss": 0.72868896, + "num_input_tokens_seen": 260162590, + "step": 12057, + "time_per_iteration": 2.710617780685425 + }, + { + "auxiliary_loss_clip": 0.01008331, + "auxiliary_loss_mlp": 0.01041947, + "balance_loss_clip": 1.02277756, + "balance_loss_mlp": 1.03007352, + "epoch": 0.7249661806703742, + "flos": 17345536646400.0, + "grad_norm": 1.6720512091502209, + "language_loss": 0.62875283, + "learning_rate": 7.42262391487277e-07, + "loss": 0.64925563, + "num_input_tokens_seen": 260181065, + "step": 12058, + "time_per_iteration": 2.784844160079956 + }, + { + "auxiliary_loss_clip": 0.01020349, + "auxiliary_loss_mlp": 0.01031427, + "balance_loss_clip": 1.02346158, + "balance_loss_mlp": 1.0203464, + "epoch": 0.7250263039230422, + "flos": 19574153009280.0, + "grad_norm": 2.4054122320033673, + "language_loss": 0.74955875, + "learning_rate": 7.419596044262535e-07, + "loss": 0.77007651, + "num_input_tokens_seen": 260200330, + "step": 12059, + "time_per_iteration": 2.776479721069336 + }, + { + "auxiliary_loss_clip": 0.01053602, + "auxiliary_loss_mlp": 0.01025922, + "balance_loss_clip": 1.02624452, + "balance_loss_mlp": 1.01571774, + "epoch": 0.7250864271757103, + "flos": 21976648133760.0, + "grad_norm": 1.9194997797059405, + "language_loss": 0.79267812, + "learning_rate": 7.416568650702472e-07, + "loss": 0.81347334, + "num_input_tokens_seen": 260219975, + "step": 12060, + "time_per_iteration": 2.682049036026001 + }, + { + "auxiliary_loss_clip": 0.0105322, + "auxiliary_loss_mlp": 0.01025234, + "balance_loss_clip": 1.02539241, + "balance_loss_mlp": 1.01417756, + "epoch": 0.7251465504283782, + "flos": 25012608537600.0, + "grad_norm": 1.8367653054831823, + "language_loss": 0.76756406, + "learning_rate": 7.413541734307393e-07, + "loss": 0.78834862, + "num_input_tokens_seen": 260242025, + "step": 12061, + "time_per_iteration": 2.72462797164917 + }, + { + "auxiliary_loss_clip": 0.01060588, + "auxiliary_loss_mlp": 0.00747518, + "balance_loss_clip": 1.02479517, + "balance_loss_mlp": 1.000453, + "epoch": 0.7252066736810462, + "flos": 16690131135360.0, + "grad_norm": 1.903636750176295, + "language_loss": 0.81218207, + "learning_rate": 7.410515295192068e-07, + "loss": 0.83026308, + "num_input_tokens_seen": 260260015, + "step": 12062, + "time_per_iteration": 2.5652575492858887 + }, + { + "auxiliary_loss_clip": 0.01010486, + "auxiliary_loss_mlp": 0.0102792, + "balance_loss_clip": 1.02241957, + "balance_loss_mlp": 1.01480711, + "epoch": 0.7252667969337141, + "flos": 25703026830720.0, + "grad_norm": 4.484715540414723, + "language_loss": 0.69441521, + "learning_rate": 7.407489333471262e-07, + "loss": 0.71479928, + "num_input_tokens_seen": 260278635, + "step": 12063, + "time_per_iteration": 2.7519891262054443 + }, + { + "auxiliary_loss_clip": 0.01027039, + "auxiliary_loss_mlp": 0.01025568, + "balance_loss_clip": 1.02303529, + "balance_loss_mlp": 1.01483893, + "epoch": 0.7253269201863821, + "flos": 18259930195200.0, + "grad_norm": 1.5530167038603722, + "language_loss": 0.69842601, + "learning_rate": 7.40446384925973e-07, + "loss": 0.71895206, + "num_input_tokens_seen": 260298510, + "step": 12064, + "time_per_iteration": 2.7395687103271484 + }, + { + "auxiliary_loss_clip": 0.0104339, + "auxiliary_loss_mlp": 0.01028562, + "balance_loss_clip": 1.02556646, + "balance_loss_mlp": 1.017416, + "epoch": 0.72538704343905, + "flos": 20411805150720.0, + "grad_norm": 1.717309266998147, + "language_loss": 0.90238869, + "learning_rate": 7.401438842672192e-07, + "loss": 0.92310828, + "num_input_tokens_seen": 260317405, + "step": 12065, + "time_per_iteration": 2.6613235473632812 + }, + { + "auxiliary_loss_clip": 0.00998702, + "auxiliary_loss_mlp": 0.01003231, + "balance_loss_clip": 1.00288343, + "balance_loss_mlp": 1.00222993, + "epoch": 0.725447166691718, + "flos": 70151209706880.0, + "grad_norm": 0.6598365119305593, + "language_loss": 0.56137198, + "learning_rate": 7.398414313823349e-07, + "loss": 0.58139127, + "num_input_tokens_seen": 260388085, + "step": 12066, + "time_per_iteration": 3.3822202682495117 + }, + { + "auxiliary_loss_clip": 0.01024918, + "auxiliary_loss_mlp": 0.01029233, + "balance_loss_clip": 1.02596068, + "balance_loss_mlp": 1.0187428, + "epoch": 0.725507289944386, + "flos": 27052334254080.0, + "grad_norm": 1.8202695650436467, + "language_loss": 0.76799917, + "learning_rate": 7.395390262827897e-07, + "loss": 0.78854072, + "num_input_tokens_seen": 260406165, + "step": 12067, + "time_per_iteration": 2.889826536178589 + }, + { + "auxiliary_loss_clip": 0.0099024, + "auxiliary_loss_mlp": 0.01001811, + "balance_loss_clip": 1.00433421, + "balance_loss_mlp": 1.00078607, + "epoch": 0.725567413197054, + "flos": 62921924778240.0, + "grad_norm": 0.724806031246088, + "language_loss": 0.57071257, + "learning_rate": 7.392366689800515e-07, + "loss": 0.59063309, + "num_input_tokens_seen": 260461365, + "step": 12068, + "time_per_iteration": 3.146444320678711 + }, + { + "auxiliary_loss_clip": 0.00970586, + "auxiliary_loss_mlp": 0.01000505, + "balance_loss_clip": 1.00462127, + "balance_loss_mlp": 0.99925387, + "epoch": 0.7256275364497219, + "flos": 60295957188480.0, + "grad_norm": 0.6595087485685515, + "language_loss": 0.55404139, + "learning_rate": 7.389343594855848e-07, + "loss": 0.57375228, + "num_input_tokens_seen": 260523795, + "step": 12069, + "time_per_iteration": 3.294267177581787 + }, + { + "auxiliary_loss_clip": 0.01031489, + "auxiliary_loss_mlp": 0.01024625, + "balance_loss_clip": 1.0252856, + "balance_loss_mlp": 1.01511812, + "epoch": 0.7256876597023899, + "flos": 24498511130880.0, + "grad_norm": 2.248360975058126, + "language_loss": 0.79887539, + "learning_rate": 7.38632097810854e-07, + "loss": 0.81943655, + "num_input_tokens_seen": 260544765, + "step": 12070, + "time_per_iteration": 2.727675676345825 + }, + { + "auxiliary_loss_clip": 0.01035045, + "auxiliary_loss_mlp": 0.01032983, + "balance_loss_clip": 1.02283478, + "balance_loss_mlp": 1.02272511, + "epoch": 0.7257477829550578, + "flos": 24352749740160.0, + "grad_norm": 1.8869442645906345, + "language_loss": 0.71533316, + "learning_rate": 7.383298839673197e-07, + "loss": 0.73601341, + "num_input_tokens_seen": 260564340, + "step": 12071, + "time_per_iteration": 2.692068099975586 + }, + { + "auxiliary_loss_clip": 0.01062303, + "auxiliary_loss_mlp": 0.01033216, + "balance_loss_clip": 1.0255394, + "balance_loss_mlp": 1.02336359, + "epoch": 0.7258079062077258, + "flos": 17202217380480.0, + "grad_norm": 1.8194958095565776, + "language_loss": 0.70075685, + "learning_rate": 7.380277179664436e-07, + "loss": 0.72171205, + "num_input_tokens_seen": 260582565, + "step": 12072, + "time_per_iteration": 4.198249816894531 + }, + { + "auxiliary_loss_clip": 0.01027269, + "auxiliary_loss_mlp": 0.01029319, + "balance_loss_clip": 1.02279329, + "balance_loss_mlp": 1.01776206, + "epoch": 0.7258680294603939, + "flos": 21580338401280.0, + "grad_norm": 1.841395261946785, + "language_loss": 0.78505075, + "learning_rate": 7.377255998196821e-07, + "loss": 0.80561662, + "num_input_tokens_seen": 260601700, + "step": 12073, + "time_per_iteration": 4.28879976272583 + }, + { + "auxiliary_loss_clip": 0.01044195, + "auxiliary_loss_mlp": 0.01027345, + "balance_loss_clip": 1.02624965, + "balance_loss_mlp": 1.01655674, + "epoch": 0.7259281527130618, + "flos": 34855399036800.0, + "grad_norm": 1.3703164863129615, + "language_loss": 0.70318735, + "learning_rate": 7.374235295384923e-07, + "loss": 0.72390282, + "num_input_tokens_seen": 260623040, + "step": 12074, + "time_per_iteration": 2.7533671855926514 + }, + { + "auxiliary_loss_clip": 0.01040033, + "auxiliary_loss_mlp": 0.01025625, + "balance_loss_clip": 1.02315891, + "balance_loss_mlp": 1.01468778, + "epoch": 0.7259882759657298, + "flos": 25404644551680.0, + "grad_norm": 1.7496763910460364, + "language_loss": 0.74548942, + "learning_rate": 7.371215071343302e-07, + "loss": 0.766146, + "num_input_tokens_seen": 260642735, + "step": 12075, + "time_per_iteration": 2.697906255722046 + }, + { + "auxiliary_loss_clip": 0.01053719, + "auxiliary_loss_mlp": 0.01033447, + "balance_loss_clip": 1.0255692, + "balance_loss_mlp": 1.02213967, + "epoch": 0.7260483992183977, + "flos": 62953630531200.0, + "grad_norm": 1.4370796612443926, + "language_loss": 0.63554025, + "learning_rate": 7.368195326186458e-07, + "loss": 0.65641189, + "num_input_tokens_seen": 260669935, + "step": 12076, + "time_per_iteration": 3.0998597145080566 + }, + { + "auxiliary_loss_clip": 0.01024487, + "auxiliary_loss_mlp": 0.01028947, + "balance_loss_clip": 1.02215528, + "balance_loss_mlp": 1.01820064, + "epoch": 0.7261085224710657, + "flos": 26467528924800.0, + "grad_norm": 1.8789043625129602, + "language_loss": 0.79140675, + "learning_rate": 7.365176060028912e-07, + "loss": 0.81194109, + "num_input_tokens_seen": 260689605, + "step": 12077, + "time_per_iteration": 2.8029377460479736 + }, + { + "auxiliary_loss_clip": 0.01007611, + "auxiliary_loss_mlp": 0.00746851, + "balance_loss_clip": 1.00222754, + "balance_loss_mlp": 1.00090301, + "epoch": 0.7261686457237336, + "flos": 66772732187520.0, + "grad_norm": 0.8906643575450445, + "language_loss": 0.65033114, + "learning_rate": 7.362157272985163e-07, + "loss": 0.66787577, + "num_input_tokens_seen": 260748265, + "step": 12078, + "time_per_iteration": 3.3324666023254395 + }, + { + "auxiliary_loss_clip": 0.00998932, + "auxiliary_loss_mlp": 0.01004443, + "balance_loss_clip": 1.00329828, + "balance_loss_mlp": 1.00350714, + "epoch": 0.7262287689764017, + "flos": 69999594399360.0, + "grad_norm": 1.2364464444949905, + "language_loss": 0.59269679, + "learning_rate": 7.359138965169671e-07, + "loss": 0.6127305, + "num_input_tokens_seen": 260816715, + "step": 12079, + "time_per_iteration": 3.329284191131592 + }, + { + "auxiliary_loss_clip": 0.01020588, + "auxiliary_loss_mlp": 0.01025481, + "balance_loss_clip": 1.0231638, + "balance_loss_mlp": 1.01431143, + "epoch": 0.7262888922290696, + "flos": 23805435231360.0, + "grad_norm": 2.059692586973865, + "language_loss": 0.64802945, + "learning_rate": 7.356121136696895e-07, + "loss": 0.66849017, + "num_input_tokens_seen": 260836765, + "step": 12080, + "time_per_iteration": 2.770277976989746 + }, + { + "auxiliary_loss_clip": 0.01018738, + "auxiliary_loss_mlp": 0.01026531, + "balance_loss_clip": 1.02330649, + "balance_loss_mlp": 1.01529562, + "epoch": 0.7263490154817376, + "flos": 19500320603520.0, + "grad_norm": 3.5098696068709874, + "language_loss": 0.70458025, + "learning_rate": 7.35310378768128e-07, + "loss": 0.72503293, + "num_input_tokens_seen": 260854610, + "step": 12081, + "time_per_iteration": 2.852771282196045 + }, + { + "auxiliary_loss_clip": 0.01066238, + "auxiliary_loss_mlp": 0.01029276, + "balance_loss_clip": 1.02661884, + "balance_loss_mlp": 1.0186187, + "epoch": 0.7264091387344055, + "flos": 16286243633280.0, + "grad_norm": 2.1243034366240243, + "language_loss": 0.8144089, + "learning_rate": 7.350086918237237e-07, + "loss": 0.83536398, + "num_input_tokens_seen": 260871620, + "step": 12082, + "time_per_iteration": 2.5725276470184326 + }, + { + "auxiliary_loss_clip": 0.01053343, + "auxiliary_loss_mlp": 0.01031785, + "balance_loss_clip": 1.02388501, + "balance_loss_mlp": 1.01943469, + "epoch": 0.7264692619870735, + "flos": 24352031468160.0, + "grad_norm": 1.637350713134915, + "language_loss": 0.77202034, + "learning_rate": 7.347070528479158e-07, + "loss": 0.79287165, + "num_input_tokens_seen": 260890490, + "step": 12083, + "time_per_iteration": 2.883704662322998 + }, + { + "auxiliary_loss_clip": 0.01066191, + "auxiliary_loss_mlp": 0.01031254, + "balance_loss_clip": 1.02727151, + "balance_loss_mlp": 1.02007771, + "epoch": 0.7265293852397414, + "flos": 25119478477440.0, + "grad_norm": 1.6245653563540101, + "language_loss": 0.73149407, + "learning_rate": 7.344054618521433e-07, + "loss": 0.75246847, + "num_input_tokens_seen": 260909700, + "step": 12084, + "time_per_iteration": 2.65366530418396 + }, + { + "auxiliary_loss_clip": 0.01066195, + "auxiliary_loss_mlp": 0.01032415, + "balance_loss_clip": 1.02699697, + "balance_loss_mlp": 1.02157319, + "epoch": 0.7265895084924094, + "flos": 22638230784000.0, + "grad_norm": 1.5818513257180509, + "language_loss": 0.7773208, + "learning_rate": 7.34103918847843e-07, + "loss": 0.79830688, + "num_input_tokens_seen": 260929090, + "step": 12085, + "time_per_iteration": 2.6819164752960205 + }, + { + "auxiliary_loss_clip": 0.01054033, + "auxiliary_loss_mlp": 0.01033161, + "balance_loss_clip": 1.02545846, + "balance_loss_mlp": 1.02266467, + "epoch": 0.7266496317450775, + "flos": 23368222886400.0, + "grad_norm": 1.7016583181838851, + "language_loss": 0.7254082, + "learning_rate": 7.338024238464493e-07, + "loss": 0.74628019, + "num_input_tokens_seen": 260946615, + "step": 12086, + "time_per_iteration": 2.691628932952881 + }, + { + "auxiliary_loss_clip": 0.01024445, + "auxiliary_loss_mlp": 0.01041534, + "balance_loss_clip": 1.02224541, + "balance_loss_mlp": 1.0293746, + "epoch": 0.7267097549977454, + "flos": 28074603323520.0, + "grad_norm": 1.5572698040869077, + "language_loss": 0.69720566, + "learning_rate": 7.335009768593938e-07, + "loss": 0.71786547, + "num_input_tokens_seen": 260968515, + "step": 12087, + "time_per_iteration": 2.7825655937194824 + }, + { + "auxiliary_loss_clip": 0.01064658, + "auxiliary_loss_mlp": 0.01034652, + "balance_loss_clip": 1.02534223, + "balance_loss_mlp": 1.02295148, + "epoch": 0.7267698782504134, + "flos": 22195523658240.0, + "grad_norm": 1.8277211081779894, + "language_loss": 0.79129303, + "learning_rate": 7.331995778981088e-07, + "loss": 0.81228614, + "num_input_tokens_seen": 260986790, + "step": 12088, + "time_per_iteration": 2.551802158355713 + }, + { + "auxiliary_loss_clip": 0.01054679, + "auxiliary_loss_mlp": 0.01035821, + "balance_loss_clip": 1.02494884, + "balance_loss_mlp": 1.02518749, + "epoch": 0.7268300015030813, + "flos": 18514859996160.0, + "grad_norm": 1.694048085408465, + "language_loss": 0.7358833, + "learning_rate": 7.328982269740221e-07, + "loss": 0.75678831, + "num_input_tokens_seen": 261004925, + "step": 12089, + "time_per_iteration": 2.7037904262542725 + }, + { + "auxiliary_loss_clip": 0.01039352, + "auxiliary_loss_mlp": 0.01032015, + "balance_loss_clip": 1.02348638, + "balance_loss_mlp": 1.02094102, + "epoch": 0.7268901247557493, + "flos": 23986029836160.0, + "grad_norm": 1.647195193909958, + "language_loss": 0.70912957, + "learning_rate": 7.325969240985616e-07, + "loss": 0.72984332, + "num_input_tokens_seen": 261023895, + "step": 12090, + "time_per_iteration": 2.662224292755127 + }, + { + "auxiliary_loss_clip": 0.01013388, + "auxiliary_loss_mlp": 0.01029203, + "balance_loss_clip": 1.02374148, + "balance_loss_mlp": 1.01794398, + "epoch": 0.7269502480084172, + "flos": 32088087429120.0, + "grad_norm": 1.7982772696891722, + "language_loss": 0.77101815, + "learning_rate": 7.322956692831528e-07, + "loss": 0.791444, + "num_input_tokens_seen": 261045445, + "step": 12091, + "time_per_iteration": 5.313915967941284 + }, + { + "auxiliary_loss_clip": 0.01043959, + "auxiliary_loss_mlp": 0.0074773, + "balance_loss_clip": 1.02247834, + "balance_loss_mlp": 1.00050914, + "epoch": 0.7270103712610853, + "flos": 19062785036160.0, + "grad_norm": 2.2676757661544027, + "language_loss": 0.71720707, + "learning_rate": 7.319944625392205e-07, + "loss": 0.73512399, + "num_input_tokens_seen": 261064275, + "step": 12092, + "time_per_iteration": 2.7136425971984863 + }, + { + "auxiliary_loss_clip": 0.01053901, + "auxiliary_loss_mlp": 0.01026017, + "balance_loss_clip": 1.02609849, + "balance_loss_mlp": 1.01484084, + "epoch": 0.7270704945137532, + "flos": 34532921710080.0, + "grad_norm": 1.7729836439335962, + "language_loss": 0.60723472, + "learning_rate": 7.31693303878184e-07, + "loss": 0.62803388, + "num_input_tokens_seen": 261083310, + "step": 12093, + "time_per_iteration": 2.7856009006500244 + }, + { + "auxiliary_loss_clip": 0.01042039, + "auxiliary_loss_mlp": 0.0103088, + "balance_loss_clip": 1.02381182, + "balance_loss_mlp": 1.01978803, + "epoch": 0.7271306177664212, + "flos": 21507583403520.0, + "grad_norm": 1.9343381173790697, + "language_loss": 0.75531363, + "learning_rate": 7.313921933114644e-07, + "loss": 0.77604282, + "num_input_tokens_seen": 261103460, + "step": 12094, + "time_per_iteration": 2.6368696689605713 + }, + { + "auxiliary_loss_clip": 0.01026706, + "auxiliary_loss_mlp": 0.01026697, + "balance_loss_clip": 1.02165461, + "balance_loss_mlp": 1.01718402, + "epoch": 0.7271907410190891, + "flos": 22272444633600.0, + "grad_norm": 1.9119878802008279, + "language_loss": 0.84958059, + "learning_rate": 7.310911308504808e-07, + "loss": 0.87011468, + "num_input_tokens_seen": 261121375, + "step": 12095, + "time_per_iteration": 2.83738374710083 + }, + { + "auxiliary_loss_clip": 0.01052559, + "auxiliary_loss_mlp": 0.01035779, + "balance_loss_clip": 1.02420557, + "balance_loss_mlp": 1.02380443, + "epoch": 0.7272508642717571, + "flos": 22893124671360.0, + "grad_norm": 1.6849004083086974, + "language_loss": 0.77767038, + "learning_rate": 7.307901165066479e-07, + "loss": 0.79855376, + "num_input_tokens_seen": 261141105, + "step": 12096, + "time_per_iteration": 2.6678590774536133 + }, + { + "auxiliary_loss_clip": 0.01064807, + "auxiliary_loss_mlp": 0.01030818, + "balance_loss_clip": 1.02694035, + "balance_loss_mlp": 1.02029204, + "epoch": 0.727310987524425, + "flos": 11655886331520.0, + "grad_norm": 1.932247174486877, + "language_loss": 0.72082764, + "learning_rate": 7.30489150291381e-07, + "loss": 0.74178386, + "num_input_tokens_seen": 261159255, + "step": 12097, + "time_per_iteration": 2.667541027069092 + }, + { + "auxiliary_loss_clip": 0.0105435, + "auxiliary_loss_mlp": 0.00747675, + "balance_loss_clip": 1.02626467, + "balance_loss_mlp": 1.00047266, + "epoch": 0.727371110777093, + "flos": 24535319592960.0, + "grad_norm": 1.6833000985971094, + "language_loss": 0.76467472, + "learning_rate": 7.301882322160935e-07, + "loss": 0.78269494, + "num_input_tokens_seen": 261177960, + "step": 12098, + "time_per_iteration": 2.7441911697387695 + }, + { + "auxiliary_loss_clip": 0.01043757, + "auxiliary_loss_mlp": 0.01029672, + "balance_loss_clip": 1.02362454, + "balance_loss_mlp": 1.01857364, + "epoch": 0.7274312340297611, + "flos": 74739835405440.0, + "grad_norm": 1.7803787028984113, + "language_loss": 0.67268968, + "learning_rate": 7.298873622921952e-07, + "loss": 0.69342399, + "num_input_tokens_seen": 261205660, + "step": 12099, + "time_per_iteration": 3.15268611907959 + }, + { + "auxiliary_loss_clip": 0.01049325, + "auxiliary_loss_mlp": 0.01038926, + "balance_loss_clip": 1.02286148, + "balance_loss_mlp": 1.02522349, + "epoch": 0.727491357282429, + "flos": 22342865247360.0, + "grad_norm": 1.627866465012292, + "language_loss": 0.72666931, + "learning_rate": 7.29586540531095e-07, + "loss": 0.7475518, + "num_input_tokens_seen": 261225185, + "step": 12100, + "time_per_iteration": 4.486457824707031 + }, + { + "auxiliary_loss_clip": 0.01053659, + "auxiliary_loss_mlp": 0.01031175, + "balance_loss_clip": 1.02494168, + "balance_loss_mlp": 1.02081537, + "epoch": 0.727551480535097, + "flos": 23297550877440.0, + "grad_norm": 1.5775253924280228, + "language_loss": 0.74727261, + "learning_rate": 7.292857669442005e-07, + "loss": 0.76812094, + "num_input_tokens_seen": 261247965, + "step": 12101, + "time_per_iteration": 2.707125663757324 + }, + { + "auxiliary_loss_clip": 0.01027627, + "auxiliary_loss_mlp": 0.01031091, + "balance_loss_clip": 1.02559626, + "balance_loss_mlp": 1.02062392, + "epoch": 0.7276116037877649, + "flos": 21470559459840.0, + "grad_norm": 1.8364337027357778, + "language_loss": 0.82533443, + "learning_rate": 7.289850415429177e-07, + "loss": 0.84592164, + "num_input_tokens_seen": 261267585, + "step": 12102, + "time_per_iteration": 2.7685699462890625 + }, + { + "auxiliary_loss_clip": 0.01051515, + "auxiliary_loss_mlp": 0.01030588, + "balance_loss_clip": 1.02424657, + "balance_loss_mlp": 1.02032375, + "epoch": 0.7276717270404329, + "flos": 21464059098240.0, + "grad_norm": 2.064200986396229, + "language_loss": 0.81533313, + "learning_rate": 7.286843643386495e-07, + "loss": 0.83615416, + "num_input_tokens_seen": 261285200, + "step": 12103, + "time_per_iteration": 2.6929996013641357 + }, + { + "auxiliary_loss_clip": 0.0104527, + "auxiliary_loss_mlp": 0.01023811, + "balance_loss_clip": 1.02657354, + "balance_loss_mlp": 1.01297438, + "epoch": 0.7277318502931008, + "flos": 16837221329280.0, + "grad_norm": 1.511315705020805, + "language_loss": 0.6637342, + "learning_rate": 7.283837353427968e-07, + "loss": 0.684425, + "num_input_tokens_seen": 261303645, + "step": 12104, + "time_per_iteration": 2.80991792678833 + }, + { + "auxiliary_loss_clip": 0.01031301, + "auxiliary_loss_mlp": 0.01030452, + "balance_loss_clip": 1.0244, + "balance_loss_mlp": 1.01995587, + "epoch": 0.7277919735457689, + "flos": 33400550476800.0, + "grad_norm": 1.8236589694538985, + "language_loss": 0.66294593, + "learning_rate": 7.280831545667611e-07, + "loss": 0.68356347, + "num_input_tokens_seen": 261323265, + "step": 12105, + "time_per_iteration": 2.8806817531585693 + }, + { + "auxiliary_loss_clip": 0.01065211, + "auxiliary_loss_mlp": 0.01031403, + "balance_loss_clip": 1.02674162, + "balance_loss_mlp": 1.02003646, + "epoch": 0.7278520967984368, + "flos": 19206499351680.0, + "grad_norm": 32.57802037345474, + "language_loss": 0.75763381, + "learning_rate": 7.27782622021939e-07, + "loss": 0.77859998, + "num_input_tokens_seen": 261339745, + "step": 12106, + "time_per_iteration": 2.5719549655914307 + }, + { + "auxiliary_loss_clip": 0.01054418, + "auxiliary_loss_mlp": 0.0102904, + "balance_loss_clip": 1.02519202, + "balance_loss_mlp": 1.01763773, + "epoch": 0.7279122200511048, + "flos": 34094667870720.0, + "grad_norm": 2.314789856125017, + "language_loss": 0.70503235, + "learning_rate": 7.274821377197273e-07, + "loss": 0.72586697, + "num_input_tokens_seen": 261359310, + "step": 12107, + "time_per_iteration": 2.7848129272460938 + }, + { + "auxiliary_loss_clip": 0.01044644, + "auxiliary_loss_mlp": 0.0102926, + "balance_loss_clip": 1.02317047, + "balance_loss_mlp": 1.01814342, + "epoch": 0.7279723433037727, + "flos": 54599049348480.0, + "grad_norm": 2.837803489912338, + "language_loss": 0.75417024, + "learning_rate": 7.271817016715205e-07, + "loss": 0.77490926, + "num_input_tokens_seen": 261384640, + "step": 12108, + "time_per_iteration": 2.956026554107666 + }, + { + "auxiliary_loss_clip": 0.01063185, + "auxiliary_loss_mlp": 0.01027557, + "balance_loss_clip": 1.02496397, + "balance_loss_mlp": 1.01641643, + "epoch": 0.7280324665564407, + "flos": 36137482156800.0, + "grad_norm": 2.268651474249195, + "language_loss": 0.66649556, + "learning_rate": 7.268813138887124e-07, + "loss": 0.68740296, + "num_input_tokens_seen": 261405290, + "step": 12109, + "time_per_iteration": 2.7409493923187256 + }, + { + "auxiliary_loss_clip": 0.0102876, + "auxiliary_loss_mlp": 0.01032377, + "balance_loss_clip": 1.0235759, + "balance_loss_mlp": 1.02055752, + "epoch": 0.7280925898091086, + "flos": 11618539165440.0, + "grad_norm": 1.9661061605767085, + "language_loss": 0.63364339, + "learning_rate": 7.265809743826912e-07, + "loss": 0.65425479, + "num_input_tokens_seen": 261419710, + "step": 12110, + "time_per_iteration": 2.7803428173065186 + }, + { + "auxiliary_loss_clip": 0.01033331, + "auxiliary_loss_mlp": 0.01023137, + "balance_loss_clip": 1.02304316, + "balance_loss_mlp": 1.01147866, + "epoch": 0.7281527130617766, + "flos": 34277094069120.0, + "grad_norm": 1.6931265594194553, + "language_loss": 0.57906646, + "learning_rate": 7.26280683164847e-07, + "loss": 0.59963113, + "num_input_tokens_seen": 261442385, + "step": 12111, + "time_per_iteration": 2.850094795227051 + }, + { + "auxiliary_loss_clip": 0.01017147, + "auxiliary_loss_mlp": 0.01030187, + "balance_loss_clip": 1.02603912, + "balance_loss_mlp": 1.01879072, + "epoch": 0.7282128363144446, + "flos": 13918043018880.0, + "grad_norm": 2.5692924078261634, + "language_loss": 0.74373245, + "learning_rate": 7.259804402465677e-07, + "loss": 0.76420569, + "num_input_tokens_seen": 261459805, + "step": 12112, + "time_per_iteration": 2.851653575897217 + }, + { + "auxiliary_loss_clip": 0.01050428, + "auxiliary_loss_mlp": 0.0102972, + "balance_loss_clip": 1.02328014, + "balance_loss_mlp": 1.01940227, + "epoch": 0.7282729595671126, + "flos": 20777627214720.0, + "grad_norm": 2.050775955597634, + "language_loss": 0.66768771, + "learning_rate": 7.25680245639237e-07, + "loss": 0.6884892, + "num_input_tokens_seen": 261477175, + "step": 12113, + "time_per_iteration": 2.642059803009033 + }, + { + "auxiliary_loss_clip": 0.01033732, + "auxiliary_loss_mlp": 0.01031072, + "balance_loss_clip": 1.02469993, + "balance_loss_mlp": 1.01975942, + "epoch": 0.7283330828197806, + "flos": 16325422392960.0, + "grad_norm": 1.53599305424487, + "language_loss": 0.73032105, + "learning_rate": 7.253800993542399e-07, + "loss": 0.75096905, + "num_input_tokens_seen": 261494990, + "step": 12114, + "time_per_iteration": 2.7229104042053223 + }, + { + "auxiliary_loss_clip": 0.01033092, + "auxiliary_loss_mlp": 0.01025413, + "balance_loss_clip": 1.02376831, + "balance_loss_mlp": 1.01448727, + "epoch": 0.7283932060724485, + "flos": 27490193043840.0, + "grad_norm": 2.3430643772177024, + "language_loss": 0.68265104, + "learning_rate": 7.250800014029564e-07, + "loss": 0.7032361, + "num_input_tokens_seen": 261514445, + "step": 12115, + "time_per_iteration": 2.8192429542541504 + }, + { + "auxiliary_loss_clip": 0.01064534, + "auxiliary_loss_mlp": 0.01028868, + "balance_loss_clip": 1.02526081, + "balance_loss_mlp": 1.01799583, + "epoch": 0.7284533293251165, + "flos": 18367877543040.0, + "grad_norm": 1.7912678541832636, + "language_loss": 0.59796411, + "learning_rate": 7.247799517967674e-07, + "loss": 0.61889815, + "num_input_tokens_seen": 261533565, + "step": 12116, + "time_per_iteration": 2.6473777294158936 + }, + { + "auxiliary_loss_clip": 0.01052114, + "auxiliary_loss_mlp": 0.01029908, + "balance_loss_clip": 1.02464795, + "balance_loss_mlp": 1.01892281, + "epoch": 0.7285134525777844, + "flos": 21725525174400.0, + "grad_norm": 1.7714277135142718, + "language_loss": 0.72935486, + "learning_rate": 7.2447995054705e-07, + "loss": 0.75017512, + "num_input_tokens_seen": 261553795, + "step": 12117, + "time_per_iteration": 2.6721863746643066 + }, + { + "auxiliary_loss_clip": 0.01046256, + "auxiliary_loss_mlp": 0.01024466, + "balance_loss_clip": 1.02331638, + "balance_loss_mlp": 1.01328397, + "epoch": 0.7285735758304525, + "flos": 20741357456640.0, + "grad_norm": 1.7231999885083433, + "language_loss": 0.69740707, + "learning_rate": 7.241799976651807e-07, + "loss": 0.71811426, + "num_input_tokens_seen": 261572565, + "step": 12118, + "time_per_iteration": 2.6741251945495605 + }, + { + "auxiliary_loss_clip": 0.0101086, + "auxiliary_loss_mlp": 0.01032674, + "balance_loss_clip": 1.02150226, + "balance_loss_mlp": 1.02231479, + "epoch": 0.7286336990831204, + "flos": 17310954827520.0, + "grad_norm": 1.7469128026010547, + "language_loss": 0.83974302, + "learning_rate": 7.238800931625346e-07, + "loss": 0.86017835, + "num_input_tokens_seen": 261590910, + "step": 12119, + "time_per_iteration": 2.7804338932037354 + }, + { + "auxiliary_loss_clip": 0.01064501, + "auxiliary_loss_mlp": 0.0102518, + "balance_loss_clip": 1.02578878, + "balance_loss_mlp": 1.01477277, + "epoch": 0.7286938223357884, + "flos": 19787390098560.0, + "grad_norm": 3.1318014165193224, + "language_loss": 0.82126081, + "learning_rate": 7.235802370504831e-07, + "loss": 0.8421576, + "num_input_tokens_seen": 261606005, + "step": 12120, + "time_per_iteration": 4.272568225860596 + }, + { + "auxiliary_loss_clip": 0.01024694, + "auxiliary_loss_mlp": 0.01036483, + "balance_loss_clip": 1.02276886, + "balance_loss_mlp": 1.02556324, + "epoch": 0.7287539455884563, + "flos": 15340859625600.0, + "grad_norm": 1.875906112720356, + "language_loss": 0.78655624, + "learning_rate": 7.232804293403963e-07, + "loss": 0.80716801, + "num_input_tokens_seen": 261622305, + "step": 12121, + "time_per_iteration": 2.8896121978759766 + }, + { + "auxiliary_loss_clip": 0.01063307, + "auxiliary_loss_mlp": 0.01031827, + "balance_loss_clip": 1.02352905, + "balance_loss_mlp": 1.02060914, + "epoch": 0.7288140688411243, + "flos": 25192484870400.0, + "grad_norm": 1.9633294934516963, + "language_loss": 0.68900311, + "learning_rate": 7.229806700436441e-07, + "loss": 0.7099545, + "num_input_tokens_seen": 261642465, + "step": 12122, + "time_per_iteration": 2.6646034717559814 + }, + { + "auxiliary_loss_clip": 0.01015623, + "auxiliary_loss_mlp": 0.01028378, + "balance_loss_clip": 1.02102172, + "balance_loss_mlp": 1.01826882, + "epoch": 0.7288741920937922, + "flos": 23984162328960.0, + "grad_norm": 1.821533925889875, + "language_loss": 0.86634243, + "learning_rate": 7.226809591715923e-07, + "loss": 0.88678241, + "num_input_tokens_seen": 261661420, + "step": 12123, + "time_per_iteration": 2.809006929397583 + }, + { + "auxiliary_loss_clip": 0.0102813, + "auxiliary_loss_mlp": 0.01028554, + "balance_loss_clip": 1.02210045, + "balance_loss_mlp": 1.01809335, + "epoch": 0.7289343153464602, + "flos": 22744921155840.0, + "grad_norm": 1.6855171841897911, + "language_loss": 0.82943451, + "learning_rate": 7.223812967356065e-07, + "loss": 0.85000134, + "num_input_tokens_seen": 261680865, + "step": 12124, + "time_per_iteration": 2.74361515045166 + }, + { + "auxiliary_loss_clip": 0.01034733, + "auxiliary_loss_mlp": 0.01026796, + "balance_loss_clip": 1.0252037, + "balance_loss_mlp": 1.016186, + "epoch": 0.7289944385991282, + "flos": 24900028335360.0, + "grad_norm": 1.8651285746084558, + "language_loss": 0.67158353, + "learning_rate": 7.220816827470499e-07, + "loss": 0.69219881, + "num_input_tokens_seen": 261701455, + "step": 12125, + "time_per_iteration": 2.869823932647705 + }, + { + "auxiliary_loss_clip": 0.01053669, + "auxiliary_loss_mlp": 0.01034731, + "balance_loss_clip": 1.02452087, + "balance_loss_mlp": 1.02291119, + "epoch": 0.7290545618517962, + "flos": 22967064817920.0, + "grad_norm": 1.8939263338006735, + "language_loss": 0.75244534, + "learning_rate": 7.217821172172855e-07, + "loss": 0.77332938, + "num_input_tokens_seen": 261721260, + "step": 12126, + "time_per_iteration": 2.7757070064544678 + }, + { + "auxiliary_loss_clip": 0.00989942, + "auxiliary_loss_mlp": 0.01005083, + "balance_loss_clip": 1.00387168, + "balance_loss_mlp": 1.00403392, + "epoch": 0.7291146851044642, + "flos": 61901523216000.0, + "grad_norm": 0.942900639701561, + "language_loss": 0.58706319, + "learning_rate": 7.2148260015767e-07, + "loss": 0.60701346, + "num_input_tokens_seen": 261779370, + "step": 12127, + "time_per_iteration": 3.2565667629241943 + }, + { + "auxiliary_loss_clip": 0.01026858, + "auxiliary_loss_mlp": 0.01027501, + "balance_loss_clip": 1.02249742, + "balance_loss_mlp": 1.01748157, + "epoch": 0.7291748083571321, + "flos": 23330947547520.0, + "grad_norm": 2.5225603732409922, + "language_loss": 0.68854505, + "learning_rate": 7.21183131579562e-07, + "loss": 0.70908868, + "num_input_tokens_seen": 261798050, + "step": 12128, + "time_per_iteration": 2.7542426586151123 + }, + { + "auxiliary_loss_clip": 0.0103741, + "auxiliary_loss_mlp": 0.01035385, + "balance_loss_clip": 1.02450931, + "balance_loss_mlp": 1.02332115, + "epoch": 0.7292349316098001, + "flos": 28330000001280.0, + "grad_norm": 2.389144092025668, + "language_loss": 0.65390396, + "learning_rate": 7.20883711494319e-07, + "loss": 0.67463195, + "num_input_tokens_seen": 261817660, + "step": 12129, + "time_per_iteration": 2.762627124786377 + }, + { + "auxiliary_loss_clip": 0.01060675, + "auxiliary_loss_mlp": 0.01028358, + "balance_loss_clip": 1.02401233, + "balance_loss_mlp": 1.01752758, + "epoch": 0.729295054862468, + "flos": 24132222190080.0, + "grad_norm": 1.9404151005700392, + "language_loss": 0.74168086, + "learning_rate": 7.205843399132927e-07, + "loss": 0.76257116, + "num_input_tokens_seen": 261837935, + "step": 12130, + "time_per_iteration": 2.6524972915649414 + }, + { + "auxiliary_loss_clip": 0.01034465, + "auxiliary_loss_mlp": 0.01026758, + "balance_loss_clip": 1.02161324, + "balance_loss_mlp": 1.01558757, + "epoch": 0.7293551781151361, + "flos": 22816239609600.0, + "grad_norm": 1.5203631766151586, + "language_loss": 0.69927222, + "learning_rate": 7.202850168478374e-07, + "loss": 0.71988446, + "num_input_tokens_seen": 261857575, + "step": 12131, + "time_per_iteration": 2.7686827182769775 + }, + { + "auxiliary_loss_clip": 0.01031785, + "auxiliary_loss_mlp": 0.01027347, + "balance_loss_clip": 1.02590275, + "balance_loss_mlp": 1.01757812, + "epoch": 0.729415301367804, + "flos": 22126683242880.0, + "grad_norm": 1.5883311495043722, + "language_loss": 0.7737416, + "learning_rate": 7.199857423093025e-07, + "loss": 0.79433292, + "num_input_tokens_seen": 261877265, + "step": 12132, + "time_per_iteration": 2.847055435180664 + }, + { + "auxiliary_loss_clip": 0.01052574, + "auxiliary_loss_mlp": 0.01034363, + "balance_loss_clip": 1.02469802, + "balance_loss_mlp": 1.02413452, + "epoch": 0.729475424620472, + "flos": 12349608675840.0, + "grad_norm": 2.505201213284287, + "language_loss": 0.78996015, + "learning_rate": 7.196865163090358e-07, + "loss": 0.81082952, + "num_input_tokens_seen": 261893695, + "step": 12133, + "time_per_iteration": 2.712477445602417 + }, + { + "auxiliary_loss_clip": 0.01013024, + "auxiliary_loss_mlp": 0.01025868, + "balance_loss_clip": 1.02066803, + "balance_loss_mlp": 1.01529455, + "epoch": 0.7295355478731399, + "flos": 22195308176640.0, + "grad_norm": 2.2375686376363157, + "language_loss": 0.71980792, + "learning_rate": 7.193873388583846e-07, + "loss": 0.74019682, + "num_input_tokens_seen": 261911825, + "step": 12134, + "time_per_iteration": 2.898615598678589 + }, + { + "auxiliary_loss_clip": 0.01044171, + "auxiliary_loss_mlp": 0.01035914, + "balance_loss_clip": 1.02576649, + "balance_loss_mlp": 1.02494049, + "epoch": 0.7295956711258079, + "flos": 23222030532480.0, + "grad_norm": 1.6590923576383785, + "language_loss": 0.71290684, + "learning_rate": 7.190882099686939e-07, + "loss": 0.73370767, + "num_input_tokens_seen": 261931190, + "step": 12135, + "time_per_iteration": 2.7447094917297363 + }, + { + "auxiliary_loss_clip": 0.01028354, + "auxiliary_loss_mlp": 0.01036044, + "balance_loss_clip": 1.0229919, + "balance_loss_mlp": 1.02489185, + "epoch": 0.7296557943784758, + "flos": 31869104163840.0, + "grad_norm": 1.8783080185315586, + "language_loss": 0.61988926, + "learning_rate": 7.187891296513075e-07, + "loss": 0.64053327, + "num_input_tokens_seen": 261951240, + "step": 12136, + "time_per_iteration": 2.828803539276123 + }, + { + "auxiliary_loss_clip": 0.01050838, + "auxiliary_loss_mlp": 0.0074765, + "balance_loss_clip": 1.02381849, + "balance_loss_mlp": 1.00048637, + "epoch": 0.7297159176311439, + "flos": 26651714889600.0, + "grad_norm": 1.7824552348828253, + "language_loss": 0.7429117, + "learning_rate": 7.184900979175654e-07, + "loss": 0.76089662, + "num_input_tokens_seen": 261971605, + "step": 12137, + "time_per_iteration": 2.718775510787964 + }, + { + "auxiliary_loss_clip": 0.01057376, + "auxiliary_loss_mlp": 0.00747645, + "balance_loss_clip": 1.02829289, + "balance_loss_mlp": 1.00047028, + "epoch": 0.7297760408838118, + "flos": 24749562263040.0, + "grad_norm": 2.881969660740019, + "language_loss": 0.74469405, + "learning_rate": 7.181911147788069e-07, + "loss": 0.76274419, + "num_input_tokens_seen": 261990830, + "step": 12138, + "time_per_iteration": 4.597218751907349 + }, + { + "auxiliary_loss_clip": 0.01033085, + "auxiliary_loss_mlp": 0.01030672, + "balance_loss_clip": 1.02468836, + "balance_loss_mlp": 1.02006185, + "epoch": 0.7298361641364798, + "flos": 18073768982400.0, + "grad_norm": 2.2214316126772093, + "language_loss": 0.7170471, + "learning_rate": 7.178921802463702e-07, + "loss": 0.73768473, + "num_input_tokens_seen": 262008190, + "step": 12139, + "time_per_iteration": 2.790128231048584 + }, + { + "auxiliary_loss_clip": 0.01050066, + "auxiliary_loss_mlp": 0.01024552, + "balance_loss_clip": 1.02506137, + "balance_loss_mlp": 1.01488459, + "epoch": 0.7298962873891478, + "flos": 29895597169920.0, + "grad_norm": 1.4403937355600875, + "language_loss": 0.7332828, + "learning_rate": 7.175932943315898e-07, + "loss": 0.75402898, + "num_input_tokens_seen": 262030460, + "step": 12140, + "time_per_iteration": 2.8200671672821045 + }, + { + "auxiliary_loss_clip": 0.01043758, + "auxiliary_loss_mlp": 0.01031596, + "balance_loss_clip": 1.02534032, + "balance_loss_mlp": 1.02009261, + "epoch": 0.7299564106418157, + "flos": 32266096254720.0, + "grad_norm": 1.5777581451073819, + "language_loss": 0.55828166, + "learning_rate": 7.172944570458003e-07, + "loss": 0.57903522, + "num_input_tokens_seen": 262050830, + "step": 12141, + "time_per_iteration": 2.9283018112182617 + }, + { + "auxiliary_loss_clip": 0.01023896, + "auxiliary_loss_mlp": 0.0102582, + "balance_loss_clip": 1.02225304, + "balance_loss_mlp": 1.01547217, + "epoch": 0.7300165338944837, + "flos": 22930292269440.0, + "grad_norm": 1.5178364715572807, + "language_loss": 0.72436917, + "learning_rate": 7.169956684003342e-07, + "loss": 0.74486625, + "num_input_tokens_seen": 262071245, + "step": 12142, + "time_per_iteration": 2.8528852462768555 + }, + { + "auxiliary_loss_clip": 0.01061495, + "auxiliary_loss_mlp": 0.01030232, + "balance_loss_clip": 1.02503383, + "balance_loss_mlp": 1.02040303, + "epoch": 0.7300766571471516, + "flos": 19828795501440.0, + "grad_norm": 1.7310628592242263, + "language_loss": 0.74097264, + "learning_rate": 7.16696928406521e-07, + "loss": 0.76188993, + "num_input_tokens_seen": 262087525, + "step": 12143, + "time_per_iteration": 2.7733209133148193 + }, + { + "auxiliary_loss_clip": 0.01027088, + "auxiliary_loss_mlp": 0.01028849, + "balance_loss_clip": 1.02222645, + "balance_loss_mlp": 1.01832867, + "epoch": 0.7301367803998197, + "flos": 24347829576960.0, + "grad_norm": 2.0962032047750268, + "language_loss": 0.66865396, + "learning_rate": 7.163982370756882e-07, + "loss": 0.68921328, + "num_input_tokens_seen": 262107355, + "step": 12144, + "time_per_iteration": 2.817122459411621 + }, + { + "auxiliary_loss_clip": 0.01043498, + "auxiliary_loss_mlp": 0.01028786, + "balance_loss_clip": 1.02528846, + "balance_loss_mlp": 1.01830769, + "epoch": 0.7301969036524876, + "flos": 15304518040320.0, + "grad_norm": 3.93697291559793, + "language_loss": 0.78819138, + "learning_rate": 7.160995944191627e-07, + "loss": 0.80891424, + "num_input_tokens_seen": 262125645, + "step": 12145, + "time_per_iteration": 2.639514684677124 + }, + { + "auxiliary_loss_clip": 0.01024132, + "auxiliary_loss_mlp": 0.01032437, + "balance_loss_clip": 1.02336407, + "balance_loss_mlp": 1.02115941, + "epoch": 0.7302570269051556, + "flos": 23507268433920.0, + "grad_norm": 1.8460436348314369, + "language_loss": 0.91088742, + "learning_rate": 7.158010004482702e-07, + "loss": 0.93145311, + "num_input_tokens_seen": 262144075, + "step": 12146, + "time_per_iteration": 2.749307632446289 + }, + { + "auxiliary_loss_clip": 0.01060778, + "auxiliary_loss_mlp": 0.01027462, + "balance_loss_clip": 1.02488852, + "balance_loss_mlp": 1.01727557, + "epoch": 0.7303171501578235, + "flos": 20523056549760.0, + "grad_norm": 1.656383008601094, + "language_loss": 0.6217705, + "learning_rate": 7.155024551743316e-07, + "loss": 0.64265293, + "num_input_tokens_seen": 262165940, + "step": 12147, + "time_per_iteration": 2.687281608581543 + }, + { + "auxiliary_loss_clip": 0.01064231, + "auxiliary_loss_mlp": 0.01034727, + "balance_loss_clip": 1.02650404, + "balance_loss_mlp": 1.02374148, + "epoch": 0.7303772734104915, + "flos": 18332613365760.0, + "grad_norm": 1.9188780137884571, + "language_loss": 0.75635105, + "learning_rate": 7.152039586086693e-07, + "loss": 0.77734059, + "num_input_tokens_seen": 262184520, + "step": 12148, + "time_per_iteration": 4.324657917022705 + }, + { + "auxiliary_loss_clip": 0.0098875, + "auxiliary_loss_mlp": 0.0074667, + "balance_loss_clip": 1.00298357, + "balance_loss_mlp": 1.00074542, + "epoch": 0.7304373966631594, + "flos": 60654776100480.0, + "grad_norm": 0.6833685211154207, + "language_loss": 0.56711203, + "learning_rate": 7.149055107626017e-07, + "loss": 0.58446622, + "num_input_tokens_seen": 262247070, + "step": 12149, + "time_per_iteration": 3.236069440841675 + }, + { + "auxiliary_loss_clip": 0.01042701, + "auxiliary_loss_mlp": 0.0103086, + "balance_loss_clip": 1.02348018, + "balance_loss_mlp": 1.02002406, + "epoch": 0.7304975199158275, + "flos": 19828077229440.0, + "grad_norm": 1.632113768525236, + "language_loss": 0.74168003, + "learning_rate": 7.146071116474451e-07, + "loss": 0.76241565, + "num_input_tokens_seen": 262266605, + "step": 12150, + "time_per_iteration": 2.6718740463256836 + }, + { + "auxiliary_loss_clip": 0.01063361, + "auxiliary_loss_mlp": 0.0103097, + "balance_loss_clip": 1.02428746, + "balance_loss_mlp": 1.01957333, + "epoch": 0.7305576431684954, + "flos": 13223997452160.0, + "grad_norm": 2.251943813214177, + "language_loss": 0.84008342, + "learning_rate": 7.143087612745158e-07, + "loss": 0.8610267, + "num_input_tokens_seen": 262283880, + "step": 12151, + "time_per_iteration": 2.5369558334350586 + }, + { + "auxiliary_loss_clip": 0.01019271, + "auxiliary_loss_mlp": 0.01032783, + "balance_loss_clip": 1.02180505, + "balance_loss_mlp": 1.0214107, + "epoch": 0.7306177664211634, + "flos": 24060472773120.0, + "grad_norm": 1.809445951019591, + "language_loss": 0.78021979, + "learning_rate": 7.14010459655127e-07, + "loss": 0.80074036, + "num_input_tokens_seen": 262304155, + "step": 12152, + "time_per_iteration": 2.7612524032592773 + }, + { + "auxiliary_loss_clip": 0.01036468, + "auxiliary_loss_mlp": 0.01034152, + "balance_loss_clip": 1.02715564, + "balance_loss_mlp": 1.02288055, + "epoch": 0.7306778896738314, + "flos": 27089106802560.0, + "grad_norm": 1.6331575732682053, + "language_loss": 0.79730165, + "learning_rate": 7.137122068005919e-07, + "loss": 0.81800789, + "num_input_tokens_seen": 262325660, + "step": 12153, + "time_per_iteration": 2.8870019912719727 + }, + { + "auxiliary_loss_clip": 0.01053533, + "auxiliary_loss_mlp": 0.01032162, + "balance_loss_clip": 1.02455235, + "balance_loss_mlp": 1.02143955, + "epoch": 0.7307380129264993, + "flos": 16690669839360.0, + "grad_norm": 1.6496322636572476, + "language_loss": 0.67145491, + "learning_rate": 7.134140027222173e-07, + "loss": 0.69231188, + "num_input_tokens_seen": 262344075, + "step": 12154, + "time_per_iteration": 2.738001585006714 + }, + { + "auxiliary_loss_clip": 0.01015377, + "auxiliary_loss_mlp": 0.01029626, + "balance_loss_clip": 1.02405429, + "balance_loss_mlp": 1.01877773, + "epoch": 0.7307981361791673, + "flos": 21725740656000.0, + "grad_norm": 1.8343274770506024, + "language_loss": 0.65825158, + "learning_rate": 7.131158474313128e-07, + "loss": 0.67870164, + "num_input_tokens_seen": 262363305, + "step": 12155, + "time_per_iteration": 2.796015977859497 + }, + { + "auxiliary_loss_clip": 0.01046552, + "auxiliary_loss_mlp": 0.0102609, + "balance_loss_clip": 1.02781606, + "balance_loss_mlp": 1.01542032, + "epoch": 0.7308582594318352, + "flos": 18040659621120.0, + "grad_norm": 1.969744935688386, + "language_loss": 0.81730855, + "learning_rate": 7.128177409391851e-07, + "loss": 0.83803493, + "num_input_tokens_seen": 262380730, + "step": 12156, + "time_per_iteration": 2.653367519378662 + }, + { + "auxiliary_loss_clip": 0.01021817, + "auxiliary_loss_mlp": 0.01030805, + "balance_loss_clip": 1.02237833, + "balance_loss_mlp": 1.02064824, + "epoch": 0.7309183826845033, + "flos": 13844964798720.0, + "grad_norm": 2.042353584320565, + "language_loss": 0.75519168, + "learning_rate": 7.125196832571367e-07, + "loss": 0.77571785, + "num_input_tokens_seen": 262395480, + "step": 12157, + "time_per_iteration": 2.789459466934204 + }, + { + "auxiliary_loss_clip": 0.01050614, + "auxiliary_loss_mlp": 0.01026743, + "balance_loss_clip": 1.0244019, + "balance_loss_mlp": 1.01724839, + "epoch": 0.7309785059371712, + "flos": 17019216564480.0, + "grad_norm": 2.0856924948699205, + "language_loss": 0.72853541, + "learning_rate": 7.122216743964713e-07, + "loss": 0.74930894, + "num_input_tokens_seen": 262413340, + "step": 12158, + "time_per_iteration": 2.643500566482544 + }, + { + "auxiliary_loss_clip": 0.01037231, + "auxiliary_loss_mlp": 0.0102986, + "balance_loss_clip": 1.0230422, + "balance_loss_mlp": 1.01904726, + "epoch": 0.7310386291898392, + "flos": 26502398052480.0, + "grad_norm": 1.9568892455458482, + "language_loss": 0.85762334, + "learning_rate": 7.119237143684896e-07, + "loss": 0.87829423, + "num_input_tokens_seen": 262433455, + "step": 12159, + "time_per_iteration": 2.7570641040802 + }, + { + "auxiliary_loss_clip": 0.01042901, + "auxiliary_loss_mlp": 0.01030112, + "balance_loss_clip": 1.02288628, + "balance_loss_mlp": 1.01867414, + "epoch": 0.7310987524425071, + "flos": 16945922862720.0, + "grad_norm": 4.21740939628761, + "language_loss": 0.7340678, + "learning_rate": 7.116258031844895e-07, + "loss": 0.75479794, + "num_input_tokens_seen": 262450335, + "step": 12160, + "time_per_iteration": 2.7475788593292236 + }, + { + "auxiliary_loss_clip": 0.01054767, + "auxiliary_loss_mlp": 0.01032057, + "balance_loss_clip": 1.02543306, + "balance_loss_mlp": 1.02077985, + "epoch": 0.7311588756951751, + "flos": 13845288021120.0, + "grad_norm": 2.0127733212679515, + "language_loss": 0.72993529, + "learning_rate": 7.113279408557675e-07, + "loss": 0.75080359, + "num_input_tokens_seen": 262468240, + "step": 12161, + "time_per_iteration": 2.7860400676727295 + }, + { + "auxiliary_loss_clip": 0.01031862, + "auxiliary_loss_mlp": 0.00747645, + "balance_loss_clip": 1.02397716, + "balance_loss_mlp": 1.00044942, + "epoch": 0.731218998947843, + "flos": 28767894704640.0, + "grad_norm": 1.969172881354765, + "language_loss": 0.69805658, + "learning_rate": 7.110301273936192e-07, + "loss": 0.7158516, + "num_input_tokens_seen": 262487045, + "step": 12162, + "time_per_iteration": 2.8227264881134033 + }, + { + "auxiliary_loss_clip": 0.01055397, + "auxiliary_loss_mlp": 0.0102851, + "balance_loss_clip": 1.02634001, + "balance_loss_mlp": 1.01718497, + "epoch": 0.7312791222005111, + "flos": 27088783580160.0, + "grad_norm": 1.7153812547215375, + "language_loss": 0.67027462, + "learning_rate": 7.107323628093382e-07, + "loss": 0.69111371, + "num_input_tokens_seen": 262504855, + "step": 12163, + "time_per_iteration": 2.7794137001037598 + }, + { + "auxiliary_loss_clip": 0.01044879, + "auxiliary_loss_mlp": 0.01030792, + "balance_loss_clip": 1.02617371, + "balance_loss_mlp": 1.01950312, + "epoch": 0.731339245453179, + "flos": 20924035050240.0, + "grad_norm": 1.452449527439678, + "language_loss": 0.68192893, + "learning_rate": 7.104346471142153e-07, + "loss": 0.70268559, + "num_input_tokens_seen": 262524920, + "step": 12164, + "time_per_iteration": 2.861366033554077 + }, + { + "auxiliary_loss_clip": 0.01018841, + "auxiliary_loss_mlp": 0.01032457, + "balance_loss_clip": 1.02559018, + "balance_loss_mlp": 1.0222702, + "epoch": 0.731399368705847, + "flos": 23075694524160.0, + "grad_norm": 1.6836159514827655, + "language_loss": 0.73157352, + "learning_rate": 7.101369803195391e-07, + "loss": 0.75208652, + "num_input_tokens_seen": 262545725, + "step": 12165, + "time_per_iteration": 3.0074715614318848 + }, + { + "auxiliary_loss_clip": 0.01056489, + "auxiliary_loss_mlp": 0.01031311, + "balance_loss_clip": 1.02708316, + "balance_loss_mlp": 1.02050495, + "epoch": 0.731459491958515, + "flos": 23582681038080.0, + "grad_norm": 1.7347836349817851, + "language_loss": 0.76476824, + "learning_rate": 7.098393624365988e-07, + "loss": 0.78564626, + "num_input_tokens_seen": 262565480, + "step": 12166, + "time_per_iteration": 2.9076499938964844 + }, + { + "auxiliary_loss_clip": 0.01032356, + "auxiliary_loss_mlp": 0.01028323, + "balance_loss_clip": 1.02437723, + "balance_loss_mlp": 1.01782036, + "epoch": 0.7315196152111829, + "flos": 22379278659840.0, + "grad_norm": 1.6311117394536754, + "language_loss": 0.7966128, + "learning_rate": 7.095417934766781e-07, + "loss": 0.81721961, + "num_input_tokens_seen": 262584145, + "step": 12167, + "time_per_iteration": 2.8709821701049805 + }, + { + "auxiliary_loss_clip": 0.01051515, + "auxiliary_loss_mlp": 0.01036306, + "balance_loss_clip": 1.02516675, + "balance_loss_mlp": 1.02630472, + "epoch": 0.7315797384638509, + "flos": 26177047637760.0, + "grad_norm": 1.5044745674312867, + "language_loss": 0.76803207, + "learning_rate": 7.092442734510622e-07, + "loss": 0.78891027, + "num_input_tokens_seen": 262604045, + "step": 12168, + "time_per_iteration": 5.8994104862213135 + }, + { + "auxiliary_loss_clip": 0.01047052, + "auxiliary_loss_mlp": 0.01042747, + "balance_loss_clip": 1.02339184, + "balance_loss_mlp": 1.0294137, + "epoch": 0.7316398617165188, + "flos": 21506326427520.0, + "grad_norm": 2.6680117739570197, + "language_loss": 0.81717634, + "learning_rate": 7.089468023710326e-07, + "loss": 0.83807433, + "num_input_tokens_seen": 262624540, + "step": 12169, + "time_per_iteration": 2.855224370956421 + }, + { + "auxiliary_loss_clip": 0.01055476, + "auxiliary_loss_mlp": 0.01036593, + "balance_loss_clip": 1.0262599, + "balance_loss_mlp": 1.02532148, + "epoch": 0.7316999849691869, + "flos": 30482557315200.0, + "grad_norm": 1.6062710869091812, + "language_loss": 0.70198452, + "learning_rate": 7.08649380247871e-07, + "loss": 0.72290516, + "num_input_tokens_seen": 262644545, + "step": 12170, + "time_per_iteration": 2.915128707885742 + }, + { + "auxiliary_loss_clip": 0.01062186, + "auxiliary_loss_mlp": 0.01029191, + "balance_loss_clip": 1.02494311, + "balance_loss_mlp": 1.01742458, + "epoch": 0.7317601082218548, + "flos": 21543781334400.0, + "grad_norm": 2.233867860006343, + "language_loss": 0.69545251, + "learning_rate": 7.083520070928533e-07, + "loss": 0.71636629, + "num_input_tokens_seen": 262662570, + "step": 12171, + "time_per_iteration": 2.6928794384002686 + }, + { + "auxiliary_loss_clip": 0.01062743, + "auxiliary_loss_mlp": 0.01032731, + "balance_loss_clip": 1.02582896, + "balance_loss_mlp": 1.02209783, + "epoch": 0.7318202314745228, + "flos": 33251592775680.0, + "grad_norm": 1.4898388407698715, + "language_loss": 0.65692443, + "learning_rate": 7.080546829172564e-07, + "loss": 0.67787915, + "num_input_tokens_seen": 262683245, + "step": 12172, + "time_per_iteration": 2.870335102081299 + }, + { + "auxiliary_loss_clip": 0.01064268, + "auxiliary_loss_mlp": 0.01025671, + "balance_loss_clip": 1.02591944, + "balance_loss_mlp": 1.014889, + "epoch": 0.7318803547271907, + "flos": 20157054917760.0, + "grad_norm": 2.241342866225545, + "language_loss": 0.61341363, + "learning_rate": 7.077574077323564e-07, + "loss": 0.63431305, + "num_input_tokens_seen": 262701585, + "step": 12173, + "time_per_iteration": 2.595529794692993 + }, + { + "auxiliary_loss_clip": 0.01016653, + "auxiliary_loss_mlp": 0.01028675, + "balance_loss_clip": 1.0241971, + "balance_loss_mlp": 1.01766038, + "epoch": 0.7319404779798587, + "flos": 20558536208640.0, + "grad_norm": 2.1219258330034036, + "language_loss": 0.73976517, + "learning_rate": 7.074601815494243e-07, + "loss": 0.76021844, + "num_input_tokens_seen": 262719295, + "step": 12174, + "time_per_iteration": 2.8418636322021484 + }, + { + "auxiliary_loss_clip": 0.0106113, + "auxiliary_loss_mlp": 0.01025314, + "balance_loss_clip": 1.02554929, + "balance_loss_mlp": 1.01531816, + "epoch": 0.7320006012325266, + "flos": 28695391102080.0, + "grad_norm": 1.601358177852493, + "language_loss": 0.8093273, + "learning_rate": 7.071630043797317e-07, + "loss": 0.83019173, + "num_input_tokens_seen": 262739995, + "step": 12175, + "time_per_iteration": 2.707550525665283 + }, + { + "auxiliary_loss_clip": 0.01044304, + "auxiliary_loss_mlp": 0.01028629, + "balance_loss_clip": 1.02573872, + "balance_loss_mlp": 1.01775742, + "epoch": 0.7320607244851947, + "flos": 16362697731840.0, + "grad_norm": 1.9742013659002648, + "language_loss": 0.77430749, + "learning_rate": 7.068658762345488e-07, + "loss": 0.79503685, + "num_input_tokens_seen": 262757680, + "step": 12176, + "time_per_iteration": 2.8637499809265137 + }, + { + "auxiliary_loss_clip": 0.010525, + "auxiliary_loss_mlp": 0.01031348, + "balance_loss_clip": 1.0256108, + "balance_loss_mlp": 1.02066064, + "epoch": 0.7321208477378626, + "flos": 20955097336320.0, + "grad_norm": 1.441434465089893, + "language_loss": 0.7654748, + "learning_rate": 7.065687971251399e-07, + "loss": 0.78631324, + "num_input_tokens_seen": 262776990, + "step": 12177, + "time_per_iteration": 2.722646951675415 + }, + { + "auxiliary_loss_clip": 0.01033171, + "auxiliary_loss_mlp": 0.01029479, + "balance_loss_clip": 1.02446342, + "balance_loss_mlp": 1.01923907, + "epoch": 0.7321809709905306, + "flos": 13845072539520.0, + "grad_norm": 2.433569197189853, + "language_loss": 0.74280834, + "learning_rate": 7.06271767062772e-07, + "loss": 0.76343489, + "num_input_tokens_seen": 262795440, + "step": 12178, + "time_per_iteration": 2.7728023529052734 + }, + { + "auxiliary_loss_clip": 0.01041527, + "auxiliary_loss_mlp": 0.01031783, + "balance_loss_clip": 1.02302015, + "balance_loss_mlp": 1.02112532, + "epoch": 0.7322410942431986, + "flos": 26979938392320.0, + "grad_norm": 1.834676193533482, + "language_loss": 0.8292135, + "learning_rate": 7.059747860587084e-07, + "loss": 0.84994662, + "num_input_tokens_seen": 262816385, + "step": 12179, + "time_per_iteration": 2.7531497478485107 + }, + { + "auxiliary_loss_clip": 0.01024198, + "auxiliary_loss_mlp": 0.01034519, + "balance_loss_clip": 1.02237391, + "balance_loss_mlp": 1.02245545, + "epoch": 0.7323012174958665, + "flos": 17639717034240.0, + "grad_norm": 1.474144799611358, + "language_loss": 0.74566936, + "learning_rate": 7.056778541242115e-07, + "loss": 0.76625657, + "num_input_tokens_seen": 262834955, + "step": 12180, + "time_per_iteration": 2.6924428939819336 + }, + { + "auxiliary_loss_clip": 0.01054934, + "auxiliary_loss_mlp": 0.00747763, + "balance_loss_clip": 1.02404559, + "balance_loss_mlp": 1.00053024, + "epoch": 0.7323613407485345, + "flos": 32342765834880.0, + "grad_norm": 1.9646880364415669, + "language_loss": 0.79564893, + "learning_rate": 7.053809712705396e-07, + "loss": 0.81367588, + "num_input_tokens_seen": 262853555, + "step": 12181, + "time_per_iteration": 2.8482866287231445 + }, + { + "auxiliary_loss_clip": 0.0105541, + "auxiliary_loss_mlp": 0.00747675, + "balance_loss_clip": 1.02563167, + "balance_loss_mlp": 1.00050497, + "epoch": 0.7324214640012024, + "flos": 18362777811840.0, + "grad_norm": 1.885790783498759, + "language_loss": 0.72147739, + "learning_rate": 7.050841375089506e-07, + "loss": 0.73950827, + "num_input_tokens_seen": 262870975, + "step": 12182, + "time_per_iteration": 2.6513028144836426 + }, + { + "auxiliary_loss_clip": 0.01065832, + "auxiliary_loss_mlp": 0.01029623, + "balance_loss_clip": 1.0267117, + "balance_loss_mlp": 1.01909649, + "epoch": 0.7324815872538705, + "flos": 30812289189120.0, + "grad_norm": 1.4976227939046447, + "language_loss": 0.70874566, + "learning_rate": 7.047873528507015e-07, + "loss": 0.72970021, + "num_input_tokens_seen": 262892635, + "step": 12183, + "time_per_iteration": 2.7829220294952393 + }, + { + "auxiliary_loss_clip": 0.01059108, + "auxiliary_loss_mlp": 0.01034164, + "balance_loss_clip": 1.02863479, + "balance_loss_mlp": 1.02238059, + "epoch": 0.7325417105065384, + "flos": 21505069451520.0, + "grad_norm": 1.9182070553808246, + "language_loss": 0.72675395, + "learning_rate": 7.04490617307045e-07, + "loss": 0.74768668, + "num_input_tokens_seen": 262910725, + "step": 12184, + "time_per_iteration": 2.661731481552124 + }, + { + "auxiliary_loss_clip": 0.00988099, + "auxiliary_loss_mlp": 0.01002288, + "balance_loss_clip": 1.00224137, + "balance_loss_mlp": 1.00136435, + "epoch": 0.7326018337592064, + "flos": 67257742556160.0, + "grad_norm": 0.754791635420547, + "language_loss": 0.65154719, + "learning_rate": 7.041939308892344e-07, + "loss": 0.67145109, + "num_input_tokens_seen": 262974150, + "step": 12185, + "time_per_iteration": 5.091768980026245 + }, + { + "auxiliary_loss_clip": 0.01062778, + "auxiliary_loss_mlp": 0.01024462, + "balance_loss_clip": 1.02388382, + "balance_loss_mlp": 1.01351833, + "epoch": 0.7326619570118743, + "flos": 22857070394880.0, + "grad_norm": 1.8329192922473192, + "language_loss": 0.80278939, + "learning_rate": 7.038972936085197e-07, + "loss": 0.8236618, + "num_input_tokens_seen": 262993370, + "step": 12186, + "time_per_iteration": 2.6195387840270996 + }, + { + "auxiliary_loss_clip": 0.01045198, + "auxiliary_loss_mlp": 0.01032323, + "balance_loss_clip": 1.02434361, + "balance_loss_mlp": 1.02076542, + "epoch": 0.7327220802645423, + "flos": 23327499841920.0, + "grad_norm": 2.479611776819264, + "language_loss": 0.73365873, + "learning_rate": 7.036007054761508e-07, + "loss": 0.75443387, + "num_input_tokens_seen": 263012665, + "step": 12187, + "time_per_iteration": 2.6713600158691406 + }, + { + "auxiliary_loss_clip": 0.01065601, + "auxiliary_loss_mlp": 0.01034561, + "balance_loss_clip": 1.02591872, + "balance_loss_mlp": 1.02342725, + "epoch": 0.7327822035172102, + "flos": 23180661043200.0, + "grad_norm": 1.7269749128511063, + "language_loss": 0.88847196, + "learning_rate": 7.033041665033716e-07, + "loss": 0.90947366, + "num_input_tokens_seen": 263031475, + "step": 12188, + "time_per_iteration": 2.5913827419281006 + }, + { + "auxiliary_loss_clip": 0.01018673, + "auxiliary_loss_mlp": 0.01032969, + "balance_loss_clip": 1.02308547, + "balance_loss_mlp": 1.02145922, + "epoch": 0.7328423267698783, + "flos": 21066600130560.0, + "grad_norm": 2.0387729124321345, + "language_loss": 0.74191362, + "learning_rate": 7.030076767014284e-07, + "loss": 0.76243007, + "num_input_tokens_seen": 263051445, + "step": 12189, + "time_per_iteration": 2.812955617904663 + }, + { + "auxiliary_loss_clip": 0.01034229, + "auxiliary_loss_mlp": 0.01028026, + "balance_loss_clip": 1.0255332, + "balance_loss_mlp": 1.01700473, + "epoch": 0.7329024500225462, + "flos": 21689578638720.0, + "grad_norm": 1.619588809244662, + "language_loss": 0.82162875, + "learning_rate": 7.027112360815648e-07, + "loss": 0.8422513, + "num_input_tokens_seen": 263070835, + "step": 12190, + "time_per_iteration": 2.726327896118164 + }, + { + "auxiliary_loss_clip": 0.01031697, + "auxiliary_loss_mlp": 0.01032999, + "balance_loss_clip": 1.0258882, + "balance_loss_mlp": 1.02126241, + "epoch": 0.7329625732752142, + "flos": 24164038661760.0, + "grad_norm": 1.8196824868369352, + "language_loss": 0.71787322, + "learning_rate": 7.024148446550204e-07, + "loss": 0.73852015, + "num_input_tokens_seen": 263090070, + "step": 12191, + "time_per_iteration": 2.83363938331604 + }, + { + "auxiliary_loss_clip": 0.01066001, + "auxiliary_loss_mlp": 0.01034402, + "balance_loss_clip": 1.02721262, + "balance_loss_mlp": 1.02333927, + "epoch": 0.7330226965278822, + "flos": 30077915627520.0, + "grad_norm": 1.5797329891266072, + "language_loss": 0.69382799, + "learning_rate": 7.021185024330361e-07, + "loss": 0.71483207, + "num_input_tokens_seen": 263110030, + "step": 12192, + "time_per_iteration": 2.6948885917663574 + }, + { + "auxiliary_loss_clip": 0.01051998, + "auxiliary_loss_mlp": 0.01029024, + "balance_loss_clip": 1.02458429, + "balance_loss_mlp": 1.01835513, + "epoch": 0.7330828197805501, + "flos": 23368294713600.0, + "grad_norm": 1.4687329586014575, + "language_loss": 0.7292403, + "learning_rate": 7.01822209426848e-07, + "loss": 0.75005054, + "num_input_tokens_seen": 263129735, + "step": 12193, + "time_per_iteration": 2.8049399852752686 + }, + { + "auxiliary_loss_clip": 0.01044044, + "auxiliary_loss_mlp": 0.01030462, + "balance_loss_clip": 1.02303791, + "balance_loss_mlp": 1.01927996, + "epoch": 0.7331429430332181, + "flos": 21032808410880.0, + "grad_norm": 1.8322110490233263, + "language_loss": 0.77209085, + "learning_rate": 7.015259656476911e-07, + "loss": 0.79283589, + "num_input_tokens_seen": 263149100, + "step": 12194, + "time_per_iteration": 2.754024028778076 + }, + { + "auxiliary_loss_clip": 0.01053043, + "auxiliary_loss_mlp": 0.01027278, + "balance_loss_clip": 1.02557719, + "balance_loss_mlp": 1.01604235, + "epoch": 0.733203066285886, + "flos": 14647891466880.0, + "grad_norm": 1.8553318423354666, + "language_loss": 0.70561993, + "learning_rate": 7.012297711067998e-07, + "loss": 0.72642314, + "num_input_tokens_seen": 263166620, + "step": 12195, + "time_per_iteration": 5.498805999755859 + }, + { + "auxiliary_loss_clip": 0.01063644, + "auxiliary_loss_mlp": 0.0103394, + "balance_loss_clip": 1.02550352, + "balance_loss_mlp": 1.02363992, + "epoch": 0.7332631895385541, + "flos": 17165301177600.0, + "grad_norm": 1.9182313800778918, + "language_loss": 0.72225666, + "learning_rate": 7.009336258154057e-07, + "loss": 0.74323255, + "num_input_tokens_seen": 263184780, + "step": 12196, + "time_per_iteration": 2.646540880203247 + }, + { + "auxiliary_loss_clip": 0.01063194, + "auxiliary_loss_mlp": 0.01026397, + "balance_loss_clip": 1.02612948, + "balance_loss_mlp": 1.01526332, + "epoch": 0.733323312791222, + "flos": 28658151676800.0, + "grad_norm": 1.8780786640064155, + "language_loss": 0.7153883, + "learning_rate": 7.006375297847394e-07, + "loss": 0.7362842, + "num_input_tokens_seen": 263204625, + "step": 12197, + "time_per_iteration": 2.70151424407959 + }, + { + "auxiliary_loss_clip": 0.01013822, + "auxiliary_loss_mlp": 0.00747896, + "balance_loss_clip": 1.02330303, + "balance_loss_mlp": 1.0005393, + "epoch": 0.73338343604389, + "flos": 16618417632000.0, + "grad_norm": 2.0475263846577074, + "language_loss": 0.77787721, + "learning_rate": 7.003414830260282e-07, + "loss": 0.79549438, + "num_input_tokens_seen": 263221565, + "step": 12198, + "time_per_iteration": 2.866046190261841 + }, + { + "auxiliary_loss_clip": 0.01008953, + "auxiliary_loss_mlp": 0.01027601, + "balance_loss_clip": 1.02446473, + "balance_loss_mlp": 1.01689613, + "epoch": 0.7334435592965579, + "flos": 21142084561920.0, + "grad_norm": 2.024711998202016, + "language_loss": 0.7458474, + "learning_rate": 7.000454855504974e-07, + "loss": 0.76621294, + "num_input_tokens_seen": 263240620, + "step": 12199, + "time_per_iteration": 2.844287633895874 + }, + { + "auxiliary_loss_clip": 0.01047531, + "auxiliary_loss_mlp": 0.01033881, + "balance_loss_clip": 1.0272752, + "balance_loss_mlp": 1.02242494, + "epoch": 0.7335036825492259, + "flos": 17125332318720.0, + "grad_norm": 2.474509916833651, + "language_loss": 0.77000844, + "learning_rate": 6.997495373693729e-07, + "loss": 0.79082257, + "num_input_tokens_seen": 263254365, + "step": 12200, + "time_per_iteration": 2.7455763816833496 + }, + { + "auxiliary_loss_clip": 0.01034867, + "auxiliary_loss_mlp": 0.01028473, + "balance_loss_clip": 1.0269469, + "balance_loss_mlp": 1.01793504, + "epoch": 0.7335638058018938, + "flos": 23731818307200.0, + "grad_norm": 1.78135321383554, + "language_loss": 0.61407679, + "learning_rate": 6.994536384938754e-07, + "loss": 0.63471019, + "num_input_tokens_seen": 263275880, + "step": 12201, + "time_per_iteration": 2.981708526611328 + }, + { + "auxiliary_loss_clip": 0.01025799, + "auxiliary_loss_mlp": 0.00747607, + "balance_loss_clip": 1.02171707, + "balance_loss_mlp": 1.00046134, + "epoch": 0.7336239290545619, + "flos": 34933289679360.0, + "grad_norm": 1.9237974415779173, + "language_loss": 0.51483399, + "learning_rate": 6.991577889352264e-07, + "loss": 0.5325681, + "num_input_tokens_seen": 263298315, + "step": 12202, + "time_per_iteration": 2.8979196548461914 + }, + { + "auxiliary_loss_clip": 0.01038162, + "auxiliary_loss_mlp": 0.01027495, + "balance_loss_clip": 1.02284884, + "balance_loss_mlp": 1.01700449, + "epoch": 0.7336840523072298, + "flos": 21103049456640.0, + "grad_norm": 1.755437226088887, + "language_loss": 0.68753338, + "learning_rate": 6.98861988704645e-07, + "loss": 0.70818996, + "num_input_tokens_seen": 263318615, + "step": 12203, + "time_per_iteration": 2.719285488128662 + }, + { + "auxiliary_loss_clip": 0.0104621, + "auxiliary_loss_mlp": 0.01032707, + "balance_loss_clip": 1.02619052, + "balance_loss_mlp": 1.02131617, + "epoch": 0.7337441755598978, + "flos": 24024418496640.0, + "grad_norm": 1.8859478332730313, + "language_loss": 0.6594646, + "learning_rate": 6.985662378133474e-07, + "loss": 0.68025374, + "num_input_tokens_seen": 263336705, + "step": 12204, + "time_per_iteration": 2.71152400970459 + }, + { + "auxiliary_loss_clip": 0.01042515, + "auxiliary_loss_mlp": 0.01030456, + "balance_loss_clip": 1.02569675, + "balance_loss_mlp": 1.01975703, + "epoch": 0.7338042988125658, + "flos": 22711309004160.0, + "grad_norm": 2.1673882204744928, + "language_loss": 0.78312004, + "learning_rate": 6.982705362725479e-07, + "loss": 0.80384976, + "num_input_tokens_seen": 263355065, + "step": 12205, + "time_per_iteration": 2.7515687942504883 + }, + { + "auxiliary_loss_clip": 0.01009106, + "auxiliary_loss_mlp": 0.01026828, + "balance_loss_clip": 1.02255678, + "balance_loss_mlp": 1.0166471, + "epoch": 0.7338644220652337, + "flos": 21360996000000.0, + "grad_norm": 1.6558205769457413, + "language_loss": 0.79900301, + "learning_rate": 6.979748840934601e-07, + "loss": 0.81936234, + "num_input_tokens_seen": 263374460, + "step": 12206, + "time_per_iteration": 2.7491092681884766 + }, + { + "auxiliary_loss_clip": 0.01023169, + "auxiliary_loss_mlp": 0.01027552, + "balance_loss_clip": 1.02177715, + "balance_loss_mlp": 1.0163703, + "epoch": 0.7339245453179017, + "flos": 30920236536960.0, + "grad_norm": 1.9023663738442185, + "language_loss": 0.71589726, + "learning_rate": 6.976792812872958e-07, + "loss": 0.73640448, + "num_input_tokens_seen": 263393610, + "step": 12207, + "time_per_iteration": 2.887664318084717 + }, + { + "auxiliary_loss_clip": 0.0098746, + "auxiliary_loss_mlp": 0.01001211, + "balance_loss_clip": 1.00147629, + "balance_loss_mlp": 1.00021577, + "epoch": 0.7339846685705697, + "flos": 67899429072000.0, + "grad_norm": 0.7762667962508208, + "language_loss": 0.5481627, + "learning_rate": 6.97383727865263e-07, + "loss": 0.56804937, + "num_input_tokens_seen": 263450340, + "step": 12208, + "time_per_iteration": 3.3385677337646484 + }, + { + "auxiliary_loss_clip": 0.01061845, + "auxiliary_loss_mlp": 0.0102615, + "balance_loss_clip": 1.0247519, + "balance_loss_mlp": 1.01635695, + "epoch": 0.7340447918232377, + "flos": 22236749493120.0, + "grad_norm": 1.429416678471318, + "language_loss": 0.80676299, + "learning_rate": 6.970882238385703e-07, + "loss": 0.82764292, + "num_input_tokens_seen": 263471735, + "step": 12209, + "time_per_iteration": 2.7542781829833984 + }, + { + "auxiliary_loss_clip": 0.01059395, + "auxiliary_loss_mlp": 0.01026624, + "balance_loss_clip": 1.02280426, + "balance_loss_mlp": 1.01674163, + "epoch": 0.7341049150759056, + "flos": 23764784014080.0, + "grad_norm": 2.034307568031224, + "language_loss": 0.7889092, + "learning_rate": 6.96792769218423e-07, + "loss": 0.80976945, + "num_input_tokens_seen": 263493245, + "step": 12210, + "time_per_iteration": 2.6442177295684814 + }, + { + "auxiliary_loss_clip": 0.01062617, + "auxiliary_loss_mlp": 0.01025947, + "balance_loss_clip": 1.02556562, + "balance_loss_mlp": 1.01525974, + "epoch": 0.7341650383285736, + "flos": 17236547804160.0, + "grad_norm": 2.3940639469651503, + "language_loss": 0.76557255, + "learning_rate": 6.964973640160236e-07, + "loss": 0.78645819, + "num_input_tokens_seen": 263511660, + "step": 12211, + "time_per_iteration": 2.6783130168914795 + }, + { + "auxiliary_loss_clip": 0.010461, + "auxiliary_loss_mlp": 0.01025119, + "balance_loss_clip": 1.02721083, + "balance_loss_mlp": 1.01409805, + "epoch": 0.7342251615812415, + "flos": 23403953940480.0, + "grad_norm": 1.833067561902812, + "language_loss": 0.71970463, + "learning_rate": 6.962020082425748e-07, + "loss": 0.74041677, + "num_input_tokens_seen": 263530875, + "step": 12212, + "time_per_iteration": 2.6675469875335693 + }, + { + "auxiliary_loss_clip": 0.01064567, + "auxiliary_loss_mlp": 0.0103017, + "balance_loss_clip": 1.02612329, + "balance_loss_mlp": 1.0195725, + "epoch": 0.7342852848339095, + "flos": 22747183712640.0, + "grad_norm": 1.5092286476237045, + "language_loss": 0.69013166, + "learning_rate": 6.959067019092766e-07, + "loss": 0.711079, + "num_input_tokens_seen": 263551585, + "step": 12213, + "time_per_iteration": 2.625166893005371 + }, + { + "auxiliary_loss_clip": 0.01006688, + "auxiliary_loss_mlp": 0.01001067, + "balance_loss_clip": 1.0013814, + "balance_loss_mlp": 1.000072, + "epoch": 0.7343454080865774, + "flos": 53942353925760.0, + "grad_norm": 0.7209922865503514, + "language_loss": 0.54300058, + "learning_rate": 6.956114450273276e-07, + "loss": 0.56307811, + "num_input_tokens_seen": 263609545, + "step": 12214, + "time_per_iteration": 3.157543659210205 + }, + { + "auxiliary_loss_clip": 0.01063895, + "auxiliary_loss_mlp": 0.01028001, + "balance_loss_clip": 1.02458453, + "balance_loss_mlp": 1.01761198, + "epoch": 0.7344055313392455, + "flos": 12166859255040.0, + "grad_norm": 2.107415490852763, + "language_loss": 0.70391929, + "learning_rate": 6.953162376079233e-07, + "loss": 0.72483826, + "num_input_tokens_seen": 263627880, + "step": 12215, + "time_per_iteration": 6.166451692581177 + }, + { + "auxiliary_loss_clip": 0.01033429, + "auxiliary_loss_mlp": 0.01025461, + "balance_loss_clip": 1.02325702, + "balance_loss_mlp": 1.01550722, + "epoch": 0.7344656545919134, + "flos": 18550052346240.0, + "grad_norm": 4.0812807585433735, + "language_loss": 0.72715855, + "learning_rate": 6.950210796622573e-07, + "loss": 0.74774742, + "num_input_tokens_seen": 263645665, + "step": 12216, + "time_per_iteration": 2.7743594646453857 + }, + { + "auxiliary_loss_clip": 0.01068132, + "auxiliary_loss_mlp": 0.01038503, + "balance_loss_clip": 1.02602184, + "balance_loss_mlp": 1.02564585, + "epoch": 0.7345257778445814, + "flos": 23661649088640.0, + "grad_norm": 1.853724119652973, + "language_loss": 0.78633565, + "learning_rate": 6.947259712015236e-07, + "loss": 0.80740201, + "num_input_tokens_seen": 263668170, + "step": 12217, + "time_per_iteration": 2.643184185028076 + }, + { + "auxiliary_loss_clip": 0.01032062, + "auxiliary_loss_mlp": 0.01024224, + "balance_loss_clip": 1.02500725, + "balance_loss_mlp": 1.01485431, + "epoch": 0.7345859010972494, + "flos": 13808659127040.0, + "grad_norm": 1.8680463961507419, + "language_loss": 0.77771628, + "learning_rate": 6.94430912236911e-07, + "loss": 0.79827917, + "num_input_tokens_seen": 263684190, + "step": 12218, + "time_per_iteration": 2.7856881618499756 + }, + { + "auxiliary_loss_clip": 0.0101336, + "auxiliary_loss_mlp": 0.01034679, + "balance_loss_clip": 1.0219301, + "balance_loss_mlp": 1.0220542, + "epoch": 0.7346460243499173, + "flos": 22272731942400.0, + "grad_norm": 1.7355558333359709, + "language_loss": 0.72099173, + "learning_rate": 6.941359027796092e-07, + "loss": 0.74147213, + "num_input_tokens_seen": 263702095, + "step": 12219, + "time_per_iteration": 2.825957775115967 + }, + { + "auxiliary_loss_clip": 0.01037846, + "auxiliary_loss_mlp": 0.01029428, + "balance_loss_clip": 1.02282572, + "balance_loss_mlp": 1.01905692, + "epoch": 0.7347061476025853, + "flos": 23255247634560.0, + "grad_norm": 1.6774984772380945, + "language_loss": 0.75018483, + "learning_rate": 6.938409428408061e-07, + "loss": 0.77085757, + "num_input_tokens_seen": 263721385, + "step": 12220, + "time_per_iteration": 2.6735787391662598 + }, + { + "auxiliary_loss_clip": 0.01051128, + "auxiliary_loss_mlp": 0.01027141, + "balance_loss_clip": 1.02359092, + "balance_loss_mlp": 1.01662064, + "epoch": 0.7347662708552533, + "flos": 15267565923840.0, + "grad_norm": 1.7077124153352166, + "language_loss": 0.65765703, + "learning_rate": 6.93546032431684e-07, + "loss": 0.67843974, + "num_input_tokens_seen": 263737835, + "step": 12221, + "time_per_iteration": 2.698791265487671 + }, + { + "auxiliary_loss_clip": 0.01035326, + "auxiliary_loss_mlp": 0.0103335, + "balance_loss_clip": 1.02271485, + "balance_loss_mlp": 1.02216804, + "epoch": 0.7348263941079213, + "flos": 24859987649280.0, + "grad_norm": 1.7399413784933455, + "language_loss": 0.69508898, + "learning_rate": 6.932511715634273e-07, + "loss": 0.71577573, + "num_input_tokens_seen": 263756480, + "step": 12222, + "time_per_iteration": 2.7328548431396484 + }, + { + "auxiliary_loss_clip": 0.0101852, + "auxiliary_loss_mlp": 0.01030121, + "balance_loss_clip": 1.02256727, + "balance_loss_mlp": 1.02040505, + "epoch": 0.7348865173605892, + "flos": 24352103295360.0, + "grad_norm": 1.64337130400418, + "language_loss": 0.65902334, + "learning_rate": 6.92956360247217e-07, + "loss": 0.67950976, + "num_input_tokens_seen": 263776440, + "step": 12223, + "time_per_iteration": 2.8859288692474365 + }, + { + "auxiliary_loss_clip": 0.01044656, + "auxiliary_loss_mlp": 0.01029065, + "balance_loss_clip": 1.02324021, + "balance_loss_mlp": 1.01822305, + "epoch": 0.7349466406132572, + "flos": 20004613597440.0, + "grad_norm": 1.763935183101742, + "language_loss": 0.72172832, + "learning_rate": 6.926615984942332e-07, + "loss": 0.7424655, + "num_input_tokens_seen": 263793700, + "step": 12224, + "time_per_iteration": 2.7028539180755615 + }, + { + "auxiliary_loss_clip": 0.01036809, + "auxiliary_loss_mlp": 0.01029294, + "balance_loss_clip": 1.02750158, + "balance_loss_mlp": 1.018893, + "epoch": 0.7350067638659251, + "flos": 29825068815360.0, + "grad_norm": 1.765259199202, + "language_loss": 0.72622681, + "learning_rate": 6.92366886315652e-07, + "loss": 0.7468878, + "num_input_tokens_seen": 263814620, + "step": 12225, + "time_per_iteration": 2.796692132949829 + }, + { + "auxiliary_loss_clip": 0.01064342, + "auxiliary_loss_mlp": 0.01027092, + "balance_loss_clip": 1.02479792, + "balance_loss_mlp": 1.01560616, + "epoch": 0.7350668871185931, + "flos": 21866150920320.0, + "grad_norm": 1.7347543228457454, + "language_loss": 0.75876415, + "learning_rate": 6.920722237226501e-07, + "loss": 0.77967846, + "num_input_tokens_seen": 263832725, + "step": 12226, + "time_per_iteration": 2.635283946990967 + }, + { + "auxiliary_loss_clip": 0.01035747, + "auxiliary_loss_mlp": 0.01031436, + "balance_loss_clip": 1.02261174, + "balance_loss_mlp": 1.0192287, + "epoch": 0.735127010371261, + "flos": 22566122231040.0, + "grad_norm": 1.5191095221927156, + "language_loss": 0.6691854, + "learning_rate": 6.917776107264008e-07, + "loss": 0.68985718, + "num_input_tokens_seen": 263853850, + "step": 12227, + "time_per_iteration": 2.722323417663574 + }, + { + "auxiliary_loss_clip": 0.01051743, + "auxiliary_loss_mlp": 0.01030655, + "balance_loss_clip": 1.02349901, + "balance_loss_mlp": 1.02033734, + "epoch": 0.7351871336239291, + "flos": 25884339707520.0, + "grad_norm": 1.5208559386088756, + "language_loss": 0.63492513, + "learning_rate": 6.914830473380749e-07, + "loss": 0.65574908, + "num_input_tokens_seen": 263874760, + "step": 12228, + "time_per_iteration": 2.654290199279785 + }, + { + "auxiliary_loss_clip": 0.01042517, + "auxiliary_loss_mlp": 0.01030005, + "balance_loss_clip": 1.02460313, + "balance_loss_mlp": 1.02035546, + "epoch": 0.735247256876597, + "flos": 17932173569280.0, + "grad_norm": 1.4802362554101196, + "language_loss": 0.63582677, + "learning_rate": 6.911885335688427e-07, + "loss": 0.65655196, + "num_input_tokens_seen": 263893390, + "step": 12229, + "time_per_iteration": 2.6406474113464355 + }, + { + "auxiliary_loss_clip": 0.01046714, + "auxiliary_loss_mlp": 0.010331, + "balance_loss_clip": 1.02713084, + "balance_loss_mlp": 1.02178729, + "epoch": 0.735307380129265, + "flos": 28875159694080.0, + "grad_norm": 2.834010076679451, + "language_loss": 0.7339375, + "learning_rate": 6.908940694298726e-07, + "loss": 0.75473559, + "num_input_tokens_seen": 263911180, + "step": 12230, + "time_per_iteration": 2.7820069789886475 + }, + { + "auxiliary_loss_clip": 0.01010017, + "auxiliary_loss_mlp": 0.01025166, + "balance_loss_clip": 1.02299261, + "balance_loss_mlp": 1.01472354, + "epoch": 0.7353675033819329, + "flos": 13625658311040.0, + "grad_norm": 1.938687039538147, + "language_loss": 0.71976209, + "learning_rate": 6.90599654932332e-07, + "loss": 0.74011397, + "num_input_tokens_seen": 263928975, + "step": 12231, + "time_per_iteration": 2.770439624786377 + }, + { + "auxiliary_loss_clip": 0.0105501, + "auxiliary_loss_mlp": 0.01031552, + "balance_loss_clip": 1.02545023, + "balance_loss_mlp": 1.01976824, + "epoch": 0.7354276266346009, + "flos": 19463081178240.0, + "grad_norm": 1.9915305360186903, + "language_loss": 0.63854015, + "learning_rate": 6.903052900873823e-07, + "loss": 0.65940583, + "num_input_tokens_seen": 263944495, + "step": 12232, + "time_per_iteration": 4.320194244384766 + }, + { + "auxiliary_loss_clip": 0.0104463, + "auxiliary_loss_mlp": 0.01030226, + "balance_loss_clip": 1.02520466, + "balance_loss_mlp": 1.01924706, + "epoch": 0.735487749887269, + "flos": 15771858917760.0, + "grad_norm": 1.763985975459211, + "language_loss": 0.75173348, + "learning_rate": 6.900109749061874e-07, + "loss": 0.77248204, + "num_input_tokens_seen": 263961325, + "step": 12233, + "time_per_iteration": 2.701216697692871 + }, + { + "auxiliary_loss_clip": 0.01064161, + "auxiliary_loss_mlp": 0.01026062, + "balance_loss_clip": 1.02622414, + "balance_loss_mlp": 1.0153693, + "epoch": 0.7355478731399369, + "flos": 18260648467200.0, + "grad_norm": 1.5899116421692645, + "language_loss": 0.7325722, + "learning_rate": 6.897167093999079e-07, + "loss": 0.75347441, + "num_input_tokens_seen": 263980445, + "step": 12234, + "time_per_iteration": 2.613809823989868 + }, + { + "auxiliary_loss_clip": 0.01049042, + "auxiliary_loss_mlp": 0.01029287, + "balance_loss_clip": 1.02430964, + "balance_loss_mlp": 1.01795053, + "epoch": 0.7356079963926049, + "flos": 26542043688960.0, + "grad_norm": 2.343489781378075, + "language_loss": 0.59901214, + "learning_rate": 6.894224935797017e-07, + "loss": 0.61979544, + "num_input_tokens_seen": 263999330, + "step": 12235, + "time_per_iteration": 2.7014904022216797 + }, + { + "auxiliary_loss_clip": 0.01038127, + "auxiliary_loss_mlp": 0.01027537, + "balance_loss_clip": 1.02315044, + "balance_loss_mlp": 1.01723742, + "epoch": 0.7356681196452728, + "flos": 10778624467200.0, + "grad_norm": 4.833856349618113, + "language_loss": 0.86216545, + "learning_rate": 6.891283274567259e-07, + "loss": 0.8828221, + "num_input_tokens_seen": 264014150, + "step": 12236, + "time_per_iteration": 2.7682814598083496 + }, + { + "auxiliary_loss_clip": 0.01052892, + "auxiliary_loss_mlp": 0.0074767, + "balance_loss_clip": 1.0248518, + "balance_loss_mlp": 1.00048423, + "epoch": 0.7357282428979408, + "flos": 19718693337600.0, + "grad_norm": 1.6838077045114725, + "language_loss": 0.69351029, + "learning_rate": 6.888342110421364e-07, + "loss": 0.7115159, + "num_input_tokens_seen": 264033140, + "step": 12237, + "time_per_iteration": 2.685551166534424 + }, + { + "auxiliary_loss_clip": 0.00976805, + "auxiliary_loss_mlp": 0.01028751, + "balance_loss_clip": 1.01954079, + "balance_loss_mlp": 1.01786137, + "epoch": 0.7357883661506087, + "flos": 19464014931840.0, + "grad_norm": 1.5737778238379665, + "language_loss": 0.71997017, + "learning_rate": 6.885401443470839e-07, + "loss": 0.74002576, + "num_input_tokens_seen": 264052105, + "step": 12238, + "time_per_iteration": 3.2606656551361084 + }, + { + "auxiliary_loss_clip": 0.01032337, + "auxiliary_loss_mlp": 0.01028313, + "balance_loss_clip": 1.0227201, + "balance_loss_mlp": 1.01677406, + "epoch": 0.7358484894032767, + "flos": 27123006263040.0, + "grad_norm": 1.6037976783411543, + "language_loss": 0.72168374, + "learning_rate": 6.882461273827205e-07, + "loss": 0.7422902, + "num_input_tokens_seen": 264070690, + "step": 12239, + "time_per_iteration": 3.096287727355957 + }, + { + "auxiliary_loss_clip": 0.01031484, + "auxiliary_loss_mlp": 0.01030092, + "balance_loss_clip": 1.02359474, + "balance_loss_mlp": 1.01994753, + "epoch": 0.7359086126559446, + "flos": 24502282058880.0, + "grad_norm": 1.525252236645884, + "language_loss": 0.78900397, + "learning_rate": 6.879521601601954e-07, + "loss": 0.80961978, + "num_input_tokens_seen": 264094225, + "step": 12240, + "time_per_iteration": 2.7498788833618164 + }, + { + "auxiliary_loss_clip": 0.01051523, + "auxiliary_loss_mlp": 0.01033579, + "balance_loss_clip": 1.02473688, + "balance_loss_mlp": 1.02308309, + "epoch": 0.7359687359086127, + "flos": 23331270769920.0, + "grad_norm": 1.7683894432911758, + "language_loss": 0.82756466, + "learning_rate": 6.876582426906565e-07, + "loss": 0.84841573, + "num_input_tokens_seen": 264113190, + "step": 12241, + "time_per_iteration": 2.710130214691162 + }, + { + "auxiliary_loss_clip": 0.01050099, + "auxiliary_loss_mlp": 0.01025376, + "balance_loss_clip": 1.02272773, + "balance_loss_mlp": 1.01503444, + "epoch": 0.7360288591612806, + "flos": 20193396503040.0, + "grad_norm": 1.8298422890196717, + "language_loss": 0.78826833, + "learning_rate": 6.873643749852484e-07, + "loss": 0.80902308, + "num_input_tokens_seen": 264132050, + "step": 12242, + "time_per_iteration": 4.798541307449341 + }, + { + "auxiliary_loss_clip": 0.01013092, + "auxiliary_loss_mlp": 0.01026807, + "balance_loss_clip": 1.02110708, + "balance_loss_mlp": 1.01616752, + "epoch": 0.7360889824139486, + "flos": 24972783333120.0, + "grad_norm": 1.736433468983344, + "language_loss": 0.79379106, + "learning_rate": 6.870705570551145e-07, + "loss": 0.81419003, + "num_input_tokens_seen": 264152800, + "step": 12243, + "time_per_iteration": 3.2503015995025635 + }, + { + "auxiliary_loss_clip": 0.01050553, + "auxiliary_loss_mlp": 0.01028521, + "balance_loss_clip": 1.02349424, + "balance_loss_mlp": 1.01718962, + "epoch": 0.7361491056666165, + "flos": 15012312900480.0, + "grad_norm": 2.493185890056559, + "language_loss": 0.74650139, + "learning_rate": 6.867767889113969e-07, + "loss": 0.76729214, + "num_input_tokens_seen": 264169650, + "step": 12244, + "time_per_iteration": 2.6483616828918457 + }, + { + "auxiliary_loss_clip": 0.01043721, + "auxiliary_loss_mlp": 0.01029435, + "balance_loss_clip": 1.02213335, + "balance_loss_mlp": 1.0184145, + "epoch": 0.7362092289192845, + "flos": 22930400010240.0, + "grad_norm": 1.9940297688660096, + "language_loss": 0.69805568, + "learning_rate": 6.864830705652347e-07, + "loss": 0.71878719, + "num_input_tokens_seen": 264190530, + "step": 12245, + "time_per_iteration": 2.705817937850952 + }, + { + "auxiliary_loss_clip": 0.01030092, + "auxiliary_loss_mlp": 0.01029479, + "balance_loss_clip": 1.02351975, + "balance_loss_mlp": 1.01849389, + "epoch": 0.7362693521719526, + "flos": 20702681487360.0, + "grad_norm": 1.4535407995864136, + "language_loss": 0.73459154, + "learning_rate": 6.861894020277658e-07, + "loss": 0.75518727, + "num_input_tokens_seen": 264210820, + "step": 12246, + "time_per_iteration": 2.698624610900879 + }, + { + "auxiliary_loss_clip": 0.01037155, + "auxiliary_loss_mlp": 0.01019622, + "balance_loss_clip": 1.02153516, + "balance_loss_mlp": 1.00981069, + "epoch": 0.7363294754246205, + "flos": 13111381336320.0, + "grad_norm": 1.983245576810776, + "language_loss": 0.73541558, + "learning_rate": 6.858957833101266e-07, + "loss": 0.75598335, + "num_input_tokens_seen": 264227430, + "step": 12247, + "time_per_iteration": 2.669431209564209 + }, + { + "auxiliary_loss_clip": 0.01053161, + "auxiliary_loss_mlp": 0.01025308, + "balance_loss_clip": 1.0276798, + "balance_loss_mlp": 1.01540232, + "epoch": 0.7363895986772885, + "flos": 14027426910720.0, + "grad_norm": 2.5720229944678947, + "language_loss": 0.73988086, + "learning_rate": 6.856022144234526e-07, + "loss": 0.76066554, + "num_input_tokens_seen": 264245230, + "step": 12248, + "time_per_iteration": 2.5444273948669434 + }, + { + "auxiliary_loss_clip": 0.01041591, + "auxiliary_loss_mlp": 0.01032147, + "balance_loss_clip": 1.02365983, + "balance_loss_mlp": 1.02133465, + "epoch": 0.7364497219299564, + "flos": 19719986227200.0, + "grad_norm": 2.7039005993559835, + "language_loss": 0.73036993, + "learning_rate": 6.853086953788727e-07, + "loss": 0.75110722, + "num_input_tokens_seen": 264263945, + "step": 12249, + "time_per_iteration": 2.7042760848999023 + }, + { + "auxiliary_loss_clip": 0.0103962, + "auxiliary_loss_mlp": 0.01028974, + "balance_loss_clip": 1.02278781, + "balance_loss_mlp": 1.01800036, + "epoch": 0.7365098451826244, + "flos": 21361391049600.0, + "grad_norm": 1.8347531877261096, + "language_loss": 0.77148318, + "learning_rate": 6.850152261875189e-07, + "loss": 0.79216909, + "num_input_tokens_seen": 264281500, + "step": 12250, + "time_per_iteration": 2.8111460208892822 + }, + { + "auxiliary_loss_clip": 0.01022354, + "auxiliary_loss_mlp": 0.0102666, + "balance_loss_clip": 1.02405095, + "balance_loss_mlp": 1.0155437, + "epoch": 0.7365699684352923, + "flos": 23368222886400.0, + "grad_norm": 1.5540521619061907, + "language_loss": 0.71384835, + "learning_rate": 6.8472180686052e-07, + "loss": 0.73433846, + "num_input_tokens_seen": 264301625, + "step": 12251, + "time_per_iteration": 2.818819999694824 + }, + { + "auxiliary_loss_clip": 0.01051994, + "auxiliary_loss_mlp": 0.01027646, + "balance_loss_clip": 1.02502143, + "balance_loss_mlp": 1.01739359, + "epoch": 0.7366300916879603, + "flos": 59524879927680.0, + "grad_norm": 1.4559969705316078, + "language_loss": 0.65476954, + "learning_rate": 6.844284374090015e-07, + "loss": 0.6755659, + "num_input_tokens_seen": 264323975, + "step": 12252, + "time_per_iteration": 3.046220064163208 + }, + { + "auxiliary_loss_clip": 0.0101099, + "auxiliary_loss_mlp": 0.01027209, + "balance_loss_clip": 1.02279866, + "balance_loss_mlp": 1.01595604, + "epoch": 0.7366902149406283, + "flos": 20923137210240.0, + "grad_norm": 2.100407275430403, + "language_loss": 0.79331958, + "learning_rate": 6.841351178440884e-07, + "loss": 0.81370151, + "num_input_tokens_seen": 264343785, + "step": 12253, + "time_per_iteration": 2.8042471408843994 + }, + { + "auxiliary_loss_clip": 0.0105833, + "auxiliary_loss_mlp": 0.00747645, + "balance_loss_clip": 1.02383447, + "balance_loss_mlp": 1.00049567, + "epoch": 0.7367503381932963, + "flos": 17348158339200.0, + "grad_norm": 1.9033977705372715, + "language_loss": 0.76170361, + "learning_rate": 6.83841848176905e-07, + "loss": 0.77976334, + "num_input_tokens_seen": 264361130, + "step": 12254, + "time_per_iteration": 2.579643726348877 + }, + { + "auxiliary_loss_clip": 0.01034227, + "auxiliary_loss_mlp": 0.010306, + "balance_loss_clip": 1.02195406, + "balance_loss_mlp": 1.01860797, + "epoch": 0.7368104614459642, + "flos": 17821317219840.0, + "grad_norm": 2.59524552422777, + "language_loss": 0.70650601, + "learning_rate": 6.835486284185692e-07, + "loss": 0.72715425, + "num_input_tokens_seen": 264376965, + "step": 12255, + "time_per_iteration": 2.669403314590454 + }, + { + "auxiliary_loss_clip": 0.01053198, + "auxiliary_loss_mlp": 0.01027214, + "balance_loss_clip": 1.02550817, + "balance_loss_mlp": 1.0160501, + "epoch": 0.7368705846986322, + "flos": 24606099342720.0, + "grad_norm": 2.460962611929046, + "language_loss": 0.755041, + "learning_rate": 6.832554585802012e-07, + "loss": 0.77584517, + "num_input_tokens_seen": 264396310, + "step": 12256, + "time_per_iteration": 2.6801886558532715 + }, + { + "auxiliary_loss_clip": 0.01052879, + "auxiliary_loss_mlp": 0.01027176, + "balance_loss_clip": 1.02441561, + "balance_loss_mlp": 1.01596999, + "epoch": 0.7369307079513001, + "flos": 34970169968640.0, + "grad_norm": 1.662483779666048, + "language_loss": 0.73681462, + "learning_rate": 6.829623386729182e-07, + "loss": 0.75761515, + "num_input_tokens_seen": 264418085, + "step": 12257, + "time_per_iteration": 2.7383248805999756 + }, + { + "auxiliary_loss_clip": 0.01042632, + "auxiliary_loss_mlp": 0.01033931, + "balance_loss_clip": 1.02169728, + "balance_loss_mlp": 1.02363145, + "epoch": 0.7369908312039681, + "flos": 21214588164480.0, + "grad_norm": 1.6083999289507447, + "language_loss": 0.78142428, + "learning_rate": 6.826692687078362e-07, + "loss": 0.80218995, + "num_input_tokens_seen": 264437595, + "step": 12258, + "time_per_iteration": 2.6624293327331543 + }, + { + "auxiliary_loss_clip": 0.01054401, + "auxiliary_loss_mlp": 0.01030787, + "balance_loss_clip": 1.0260942, + "balance_loss_mlp": 1.02054119, + "epoch": 0.7370509544566362, + "flos": 23623655477760.0, + "grad_norm": 1.4161566242043067, + "language_loss": 0.66255587, + "learning_rate": 6.823762486960674e-07, + "loss": 0.68340772, + "num_input_tokens_seen": 264457385, + "step": 12259, + "time_per_iteration": 2.6731297969818115 + }, + { + "auxiliary_loss_clip": 0.01053335, + "auxiliary_loss_mlp": 0.01028763, + "balance_loss_clip": 1.02605319, + "balance_loss_mlp": 1.01722348, + "epoch": 0.7371110777093041, + "flos": 24827704300800.0, + "grad_norm": 1.6096218355262446, + "language_loss": 0.73469365, + "learning_rate": 6.820832786487225e-07, + "loss": 0.75551462, + "num_input_tokens_seen": 264477205, + "step": 12260, + "time_per_iteration": 2.65354323387146 + }, + { + "auxiliary_loss_clip": 0.01045881, + "auxiliary_loss_mlp": 0.01032533, + "balance_loss_clip": 1.02366936, + "balance_loss_mlp": 1.02147663, + "epoch": 0.7371712009619721, + "flos": 23149491016320.0, + "grad_norm": 1.5200993958936666, + "language_loss": 0.73280239, + "learning_rate": 6.817903585769125e-07, + "loss": 0.75358653, + "num_input_tokens_seen": 264497195, + "step": 12261, + "time_per_iteration": 2.688671588897705 + }, + { + "auxiliary_loss_clip": 0.01042126, + "auxiliary_loss_mlp": 0.01030296, + "balance_loss_clip": 1.02332449, + "balance_loss_mlp": 1.01891708, + "epoch": 0.73723132421464, + "flos": 23112898035840.0, + "grad_norm": 1.8982751707487668, + "language_loss": 0.67224818, + "learning_rate": 6.814974884917438e-07, + "loss": 0.69297242, + "num_input_tokens_seen": 264516950, + "step": 12262, + "time_per_iteration": 2.7105870246887207 + }, + { + "auxiliary_loss_clip": 0.01064103, + "auxiliary_loss_mlp": 0.0102763, + "balance_loss_clip": 1.02513433, + "balance_loss_mlp": 1.01643658, + "epoch": 0.737291447467308, + "flos": 19273328605440.0, + "grad_norm": 1.7682646099532837, + "language_loss": 0.88791561, + "learning_rate": 6.81204668404322e-07, + "loss": 0.90883291, + "num_input_tokens_seen": 264532675, + "step": 12263, + "time_per_iteration": 5.82892632484436 + }, + { + "auxiliary_loss_clip": 0.01057679, + "auxiliary_loss_mlp": 0.01025621, + "balance_loss_clip": 1.02377832, + "balance_loss_mlp": 1.0162394, + "epoch": 0.7373515707199759, + "flos": 25118257415040.0, + "grad_norm": 1.5167979490776617, + "language_loss": 0.66962832, + "learning_rate": 6.809118983257522e-07, + "loss": 0.6904614, + "num_input_tokens_seen": 264555635, + "step": 12264, + "time_per_iteration": 2.7205004692077637 + }, + { + "auxiliary_loss_clip": 0.01059404, + "auxiliary_loss_mlp": 0.01027448, + "balance_loss_clip": 1.02372444, + "balance_loss_mlp": 1.01754165, + "epoch": 0.737411693972644, + "flos": 32408481767040.0, + "grad_norm": 1.7485217763947494, + "language_loss": 0.80233598, + "learning_rate": 6.806191782671356e-07, + "loss": 0.82320446, + "num_input_tokens_seen": 264573140, + "step": 12265, + "time_per_iteration": 2.638507604598999 + }, + { + "auxiliary_loss_clip": 0.01058006, + "auxiliary_loss_mlp": 0.01026821, + "balance_loss_clip": 1.02640617, + "balance_loss_mlp": 1.01615167, + "epoch": 0.7374718172253119, + "flos": 24315797623680.0, + "grad_norm": 1.6452797122489053, + "language_loss": 0.74357998, + "learning_rate": 6.803265082395711e-07, + "loss": 0.76442826, + "num_input_tokens_seen": 264591610, + "step": 12266, + "time_per_iteration": 2.588287830352783 + }, + { + "auxiliary_loss_clip": 0.01054923, + "auxiliary_loss_mlp": 0.01032276, + "balance_loss_clip": 1.0268712, + "balance_loss_mlp": 1.02107584, + "epoch": 0.7375319404779799, + "flos": 27156115624320.0, + "grad_norm": 1.6386293800403564, + "language_loss": 0.73693228, + "learning_rate": 6.800338882541576e-07, + "loss": 0.75780427, + "num_input_tokens_seen": 264611170, + "step": 12267, + "time_per_iteration": 2.6313068866729736 + }, + { + "auxiliary_loss_clip": 0.01029175, + "auxiliary_loss_mlp": 0.01024368, + "balance_loss_clip": 1.02392697, + "balance_loss_mlp": 1.01466417, + "epoch": 0.7375920637306478, + "flos": 18879999701760.0, + "grad_norm": 2.0592169332820447, + "language_loss": 0.8323437, + "learning_rate": 6.797413183219923e-07, + "loss": 0.85287911, + "num_input_tokens_seen": 264629365, + "step": 12268, + "time_per_iteration": 2.6645495891571045 + }, + { + "auxiliary_loss_clip": 0.01060946, + "auxiliary_loss_mlp": 0.01032489, + "balance_loss_clip": 1.02443433, + "balance_loss_mlp": 1.0219512, + "epoch": 0.7376521869833158, + "flos": 15669765486720.0, + "grad_norm": 1.7190164692251542, + "language_loss": 0.72734267, + "learning_rate": 6.794487984541677e-07, + "loss": 0.74827701, + "num_input_tokens_seen": 264647915, + "step": 12269, + "time_per_iteration": 2.54366397857666 + }, + { + "auxiliary_loss_clip": 0.01032295, + "auxiliary_loss_mlp": 0.01035435, + "balance_loss_clip": 1.02068317, + "balance_loss_mlp": 1.02369297, + "epoch": 0.7377123102359837, + "flos": 36971973901440.0, + "grad_norm": 2.108477025089728, + "language_loss": 0.70055866, + "learning_rate": 6.791563286617776e-07, + "loss": 0.72123593, + "num_input_tokens_seen": 264669620, + "step": 12270, + "time_per_iteration": 2.7687225341796875 + }, + { + "auxiliary_loss_clip": 0.01045024, + "auxiliary_loss_mlp": 0.01028716, + "balance_loss_clip": 1.0218302, + "balance_loss_mlp": 1.01885772, + "epoch": 0.7377724334886517, + "flos": 24496284487680.0, + "grad_norm": 1.6059649494435113, + "language_loss": 0.69306988, + "learning_rate": 6.788639089559119e-07, + "loss": 0.71380728, + "num_input_tokens_seen": 264689345, + "step": 12271, + "time_per_iteration": 2.6818981170654297 + }, + { + "auxiliary_loss_clip": 0.01037725, + "auxiliary_loss_mlp": 0.01029737, + "balance_loss_clip": 1.02607679, + "balance_loss_mlp": 1.01834655, + "epoch": 0.7378325567413198, + "flos": 24390025079040.0, + "grad_norm": 1.9492318588378041, + "language_loss": 0.67886746, + "learning_rate": 6.785715393476586e-07, + "loss": 0.69954205, + "num_input_tokens_seen": 264707625, + "step": 12272, + "time_per_iteration": 2.718111515045166 + }, + { + "auxiliary_loss_clip": 0.01039648, + "auxiliary_loss_mlp": 0.01027525, + "balance_loss_clip": 1.02414989, + "balance_loss_mlp": 1.01760042, + "epoch": 0.7378926799939877, + "flos": 17416388223360.0, + "grad_norm": 2.183112821280269, + "language_loss": 0.78106225, + "learning_rate": 6.782792198481049e-07, + "loss": 0.80173397, + "num_input_tokens_seen": 264725575, + "step": 12273, + "time_per_iteration": 2.747321605682373 + }, + { + "auxiliary_loss_clip": 0.01058984, + "auxiliary_loss_mlp": 0.01026152, + "balance_loss_clip": 1.02232385, + "balance_loss_mlp": 1.01590037, + "epoch": 0.7379528032466557, + "flos": 18474208778880.0, + "grad_norm": 1.8810288021303214, + "language_loss": 0.83719039, + "learning_rate": 6.779869504683355e-07, + "loss": 0.85804176, + "num_input_tokens_seen": 264742855, + "step": 12274, + "time_per_iteration": 2.651049852371216 + }, + { + "auxiliary_loss_clip": 0.01047931, + "auxiliary_loss_mlp": 0.00747851, + "balance_loss_clip": 1.02671731, + "balance_loss_mlp": 1.00055218, + "epoch": 0.7380129264993236, + "flos": 17821999578240.0, + "grad_norm": 1.8635415941268636, + "language_loss": 0.73727745, + "learning_rate": 6.776947312194341e-07, + "loss": 0.7552352, + "num_input_tokens_seen": 264761155, + "step": 12275, + "time_per_iteration": 2.6267216205596924 + }, + { + "auxiliary_loss_clip": 0.0102723, + "auxiliary_loss_mlp": 0.01042051, + "balance_loss_clip": 1.02259874, + "balance_loss_mlp": 1.0296948, + "epoch": 0.7380730497519916, + "flos": 22997372918400.0, + "grad_norm": 1.7653756485310328, + "language_loss": 0.73665172, + "learning_rate": 6.774025621124813e-07, + "loss": 0.75734448, + "num_input_tokens_seen": 264780660, + "step": 12276, + "time_per_iteration": 2.7123072147369385 + }, + { + "auxiliary_loss_clip": 0.01063316, + "auxiliary_loss_mlp": 0.01027919, + "balance_loss_clip": 1.02502298, + "balance_loss_mlp": 1.01729727, + "epoch": 0.7381331730046595, + "flos": 20266259241600.0, + "grad_norm": 1.9737745392156871, + "language_loss": 0.77696157, + "learning_rate": 6.771104431585551e-07, + "loss": 0.79787397, + "num_input_tokens_seen": 264798850, + "step": 12277, + "time_per_iteration": 2.7730112075805664 + }, + { + "auxiliary_loss_clip": 0.01061134, + "auxiliary_loss_mlp": 0.01035237, + "balance_loss_clip": 1.02494407, + "balance_loss_mlp": 1.02460337, + "epoch": 0.7381932962573275, + "flos": 19754532132480.0, + "grad_norm": 2.150690607681755, + "language_loss": 0.78503919, + "learning_rate": 6.768183743687338e-07, + "loss": 0.80600291, + "num_input_tokens_seen": 264816795, + "step": 12278, + "time_per_iteration": 2.8623769283294678 + }, + { + "auxiliary_loss_clip": 0.01051155, + "auxiliary_loss_mlp": 0.00747616, + "balance_loss_clip": 1.02302337, + "balance_loss_mlp": 1.00045896, + "epoch": 0.7382534195099955, + "flos": 17305316392320.0, + "grad_norm": 1.9769808492433003, + "language_loss": 0.71709925, + "learning_rate": 6.765263557540921e-07, + "loss": 0.73508704, + "num_input_tokens_seen": 264834105, + "step": 12279, + "time_per_iteration": 4.488002777099609 + }, + { + "auxiliary_loss_clip": 0.01054076, + "auxiliary_loss_mlp": 0.0103021, + "balance_loss_clip": 1.02418995, + "balance_loss_mlp": 1.01857555, + "epoch": 0.7383135427626635, + "flos": 18697358021760.0, + "grad_norm": 2.3210197455687465, + "language_loss": 0.86222285, + "learning_rate": 6.762343873257034e-07, + "loss": 0.88306576, + "num_input_tokens_seen": 264850895, + "step": 12280, + "time_per_iteration": 2.9646825790405273 + }, + { + "auxiliary_loss_clip": 0.01023507, + "auxiliary_loss_mlp": 0.01028346, + "balance_loss_clip": 1.0218575, + "balance_loss_mlp": 1.01690793, + "epoch": 0.7383736660153314, + "flos": 20881300844160.0, + "grad_norm": 1.8377815754158553, + "language_loss": 0.72552466, + "learning_rate": 6.759424690946408e-07, + "loss": 0.74604321, + "num_input_tokens_seen": 264869505, + "step": 12281, + "time_per_iteration": 2.8182058334350586 + }, + { + "auxiliary_loss_clip": 0.01020325, + "auxiliary_loss_mlp": 0.01028302, + "balance_loss_clip": 1.02307844, + "balance_loss_mlp": 1.01751339, + "epoch": 0.7384337892679994, + "flos": 20663215418880.0, + "grad_norm": 1.6762066687177428, + "language_loss": 0.60478842, + "learning_rate": 6.756506010719711e-07, + "loss": 0.62527472, + "num_input_tokens_seen": 264886915, + "step": 12282, + "time_per_iteration": 2.806715250015259 + }, + { + "auxiliary_loss_clip": 0.01033565, + "auxiliary_loss_mlp": 0.01029728, + "balance_loss_clip": 1.02514827, + "balance_loss_mlp": 1.01861727, + "epoch": 0.7384939125206673, + "flos": 29169627390720.0, + "grad_norm": 2.0888457216387257, + "language_loss": 0.67897069, + "learning_rate": 6.753587832687632e-07, + "loss": 0.69960356, + "num_input_tokens_seen": 264910350, + "step": 12283, + "time_per_iteration": 2.7854881286621094 + }, + { + "auxiliary_loss_clip": 0.01064415, + "auxiliary_loss_mlp": 0.00747743, + "balance_loss_clip": 1.02646327, + "balance_loss_mlp": 1.00049579, + "epoch": 0.7385540357733353, + "flos": 36312833376000.0, + "grad_norm": 1.6187487797987108, + "language_loss": 0.75755697, + "learning_rate": 6.750670156960832e-07, + "loss": 0.77567858, + "num_input_tokens_seen": 264930705, + "step": 12284, + "time_per_iteration": 2.8454301357269287 + }, + { + "auxiliary_loss_clip": 0.01043597, + "auxiliary_loss_mlp": 0.01027011, + "balance_loss_clip": 1.02134681, + "balance_loss_mlp": 1.01518536, + "epoch": 0.7386141590260034, + "flos": 20302600826880.0, + "grad_norm": 1.7943179057039331, + "language_loss": 0.69266868, + "learning_rate": 6.747752983649954e-07, + "loss": 0.71337473, + "num_input_tokens_seen": 264946975, + "step": 12285, + "time_per_iteration": 2.7695870399475098 + }, + { + "auxiliary_loss_clip": 0.0104882, + "auxiliary_loss_mlp": 0.01030394, + "balance_loss_clip": 1.02665973, + "balance_loss_mlp": 1.0185442, + "epoch": 0.7386742822786713, + "flos": 25483792170240.0, + "grad_norm": 1.979922922880242, + "language_loss": 0.7982465, + "learning_rate": 6.744836312865602e-07, + "loss": 0.81903869, + "num_input_tokens_seen": 264967665, + "step": 12286, + "time_per_iteration": 2.8368821144104004 + }, + { + "auxiliary_loss_clip": 0.01021976, + "auxiliary_loss_mlp": 0.01025639, + "balance_loss_clip": 1.02408123, + "balance_loss_mlp": 1.01476681, + "epoch": 0.7387344055313393, + "flos": 13771958405760.0, + "grad_norm": 1.7646396150362604, + "language_loss": 0.65329385, + "learning_rate": 6.741920144718396e-07, + "loss": 0.67377001, + "num_input_tokens_seen": 264985480, + "step": 12287, + "time_per_iteration": 2.7720184326171875 + }, + { + "auxiliary_loss_clip": 0.01039372, + "auxiliary_loss_mlp": 0.01025396, + "balance_loss_clip": 1.02319026, + "balance_loss_mlp": 1.01582396, + "epoch": 0.7387945287840072, + "flos": 27855189095040.0, + "grad_norm": 1.8942069443724296, + "language_loss": 0.76813781, + "learning_rate": 6.739004479318903e-07, + "loss": 0.78878558, + "num_input_tokens_seen": 265004790, + "step": 12288, + "time_per_iteration": 2.8032584190368652 + }, + { + "auxiliary_loss_clip": 0.01055796, + "auxiliary_loss_mlp": 0.00747868, + "balance_loss_clip": 1.02570963, + "balance_loss_mlp": 1.00051618, + "epoch": 0.7388546520366752, + "flos": 44233039388160.0, + "grad_norm": 1.7132027046185534, + "language_loss": 0.58287126, + "learning_rate": 6.736089316777684e-07, + "loss": 0.60090792, + "num_input_tokens_seen": 265028790, + "step": 12289, + "time_per_iteration": 2.8516995906829834 + }, + { + "auxiliary_loss_clip": 0.01006665, + "auxiliary_loss_mlp": 0.00746883, + "balance_loss_clip": 1.00141978, + "balance_loss_mlp": 1.00103331, + "epoch": 0.7389147752893431, + "flos": 70680890638080.0, + "grad_norm": 0.6387308914656045, + "language_loss": 0.49270692, + "learning_rate": 6.733174657205287e-07, + "loss": 0.51024234, + "num_input_tokens_seen": 265096660, + "step": 12290, + "time_per_iteration": 5.1063830852508545 + }, + { + "auxiliary_loss_clip": 0.01046022, + "auxiliary_loss_mlp": 0.01029183, + "balance_loss_clip": 1.02498913, + "balance_loss_mlp": 1.01733375, + "epoch": 0.7389748985420111, + "flos": 25994980575360.0, + "grad_norm": 1.6736355497388273, + "language_loss": 0.6731019, + "learning_rate": 6.730260500712237e-07, + "loss": 0.69385403, + "num_input_tokens_seen": 265116375, + "step": 12291, + "time_per_iteration": 2.665022611618042 + }, + { + "auxiliary_loss_clip": 0.00968565, + "auxiliary_loss_mlp": 0.01001876, + "balance_loss_clip": 1.00311089, + "balance_loss_mlp": 1.00090408, + "epoch": 0.7390350217946791, + "flos": 54403661318400.0, + "grad_norm": 1.0061244044653943, + "language_loss": 0.6086058, + "learning_rate": 6.727346847409052e-07, + "loss": 0.6283102, + "num_input_tokens_seen": 265161230, + "step": 12292, + "time_per_iteration": 2.9395720958709717 + }, + { + "auxiliary_loss_clip": 0.01016431, + "auxiliary_loss_mlp": 0.01032888, + "balance_loss_clip": 1.02314973, + "balance_loss_mlp": 1.02253461, + "epoch": 0.7390951450473471, + "flos": 32196968530560.0, + "grad_norm": 1.7538335195737673, + "language_loss": 0.67162359, + "learning_rate": 6.724433697406191e-07, + "loss": 0.6921168, + "num_input_tokens_seen": 265182515, + "step": 12293, + "time_per_iteration": 2.8642585277557373 + }, + { + "auxiliary_loss_clip": 0.01053203, + "auxiliary_loss_mlp": 0.01029101, + "balance_loss_clip": 1.02474439, + "balance_loss_mlp": 1.01838422, + "epoch": 0.739155268300015, + "flos": 16684241304960.0, + "grad_norm": 1.8391838829727123, + "language_loss": 0.83340406, + "learning_rate": 6.721521050814134e-07, + "loss": 0.85422713, + "num_input_tokens_seen": 265198160, + "step": 12294, + "time_per_iteration": 2.5743916034698486 + }, + { + "auxiliary_loss_clip": 0.01031644, + "auxiliary_loss_mlp": 0.01030204, + "balance_loss_clip": 1.0246501, + "balance_loss_mlp": 1.01948726, + "epoch": 0.739215391552683, + "flos": 31649761762560.0, + "grad_norm": 1.667568930774934, + "language_loss": 0.72956312, + "learning_rate": 6.718608907743337e-07, + "loss": 0.75018156, + "num_input_tokens_seen": 265218480, + "step": 12295, + "time_per_iteration": 2.818833827972412 + }, + { + "auxiliary_loss_clip": 0.01052967, + "auxiliary_loss_mlp": 0.01038757, + "balance_loss_clip": 1.02598631, + "balance_loss_mlp": 1.02815914, + "epoch": 0.7392755148053509, + "flos": 29718522097920.0, + "grad_norm": 1.706727016317067, + "language_loss": 0.78743625, + "learning_rate": 6.715697268304215e-07, + "loss": 0.80835354, + "num_input_tokens_seen": 265240165, + "step": 12296, + "time_per_iteration": 2.788628578186035 + }, + { + "auxiliary_loss_clip": 0.01062826, + "auxiliary_loss_mlp": 0.01027139, + "balance_loss_clip": 1.02500105, + "balance_loss_mlp": 1.01602829, + "epoch": 0.7393356380580189, + "flos": 37050475075200.0, + "grad_norm": 2.3935622625895276, + "language_loss": 0.66143829, + "learning_rate": 6.712786132607182e-07, + "loss": 0.68233788, + "num_input_tokens_seen": 265263295, + "step": 12297, + "time_per_iteration": 2.7350337505340576 + }, + { + "auxiliary_loss_clip": 0.01043204, + "auxiliary_loss_mlp": 0.01035302, + "balance_loss_clip": 1.02578497, + "balance_loss_mlp": 1.02403665, + "epoch": 0.739395761310687, + "flos": 19719627091200.0, + "grad_norm": 2.5147394856901495, + "language_loss": 0.68211752, + "learning_rate": 6.709875500762645e-07, + "loss": 0.70290256, + "num_input_tokens_seen": 265282740, + "step": 12298, + "time_per_iteration": 2.7321197986602783 + }, + { + "auxiliary_loss_clip": 0.0103287, + "auxiliary_loss_mlp": 0.01028967, + "balance_loss_clip": 1.02240062, + "balance_loss_mlp": 1.01803565, + "epoch": 0.7394558845633549, + "flos": 11801504067840.0, + "grad_norm": 2.0925093324568733, + "language_loss": 0.74818194, + "learning_rate": 6.706965372880946e-07, + "loss": 0.76880026, + "num_input_tokens_seen": 265300175, + "step": 12299, + "time_per_iteration": 2.673902988433838 + }, + { + "auxiliary_loss_clip": 0.00992128, + "auxiliary_loss_mlp": 0.0100362, + "balance_loss_clip": 1.00526953, + "balance_loss_mlp": 1.00220132, + "epoch": 0.7395160078160229, + "flos": 66195827850240.0, + "grad_norm": 0.7588059517344098, + "language_loss": 0.60805106, + "learning_rate": 6.704055749072455e-07, + "loss": 0.62800848, + "num_input_tokens_seen": 265363275, + "step": 12300, + "time_per_iteration": 3.317671060562134 + }, + { + "auxiliary_loss_clip": 0.01034128, + "auxiliary_loss_mlp": 0.01028081, + "balance_loss_clip": 1.02524233, + "balance_loss_mlp": 1.0170598, + "epoch": 0.7395761310686908, + "flos": 21249708687360.0, + "grad_norm": 1.7387167845998819, + "language_loss": 0.8032496, + "learning_rate": 6.7011466294475e-07, + "loss": 0.82387161, + "num_input_tokens_seen": 265382935, + "step": 12301, + "time_per_iteration": 2.7239291667938232 + }, + { + "auxiliary_loss_clip": 0.01061404, + "auxiliary_loss_mlp": 0.01028125, + "balance_loss_clip": 1.02462757, + "balance_loss_mlp": 1.01809978, + "epoch": 0.7396362543213588, + "flos": 25955299025280.0, + "grad_norm": 1.7119341206894994, + "language_loss": 0.73235571, + "learning_rate": 6.698238014116406e-07, + "loss": 0.75325102, + "num_input_tokens_seen": 265403245, + "step": 12302, + "time_per_iteration": 2.7080078125 + }, + { + "auxiliary_loss_clip": 0.0106285, + "auxiliary_loss_mlp": 0.01035334, + "balance_loss_clip": 1.02474785, + "balance_loss_mlp": 1.02465916, + "epoch": 0.7396963775740267, + "flos": 27377936064000.0, + "grad_norm": 1.9061708888961892, + "language_loss": 0.73596525, + "learning_rate": 6.695329903189451e-07, + "loss": 0.7569471, + "num_input_tokens_seen": 265423105, + "step": 12303, + "time_per_iteration": 2.592207431793213 + }, + { + "auxiliary_loss_clip": 0.01061418, + "auxiliary_loss_mlp": 0.01025467, + "balance_loss_clip": 1.02514017, + "balance_loss_mlp": 1.01565611, + "epoch": 0.7397565008266948, + "flos": 25520133755520.0, + "grad_norm": 1.623902891057925, + "language_loss": 0.54681253, + "learning_rate": 6.692422296776927e-07, + "loss": 0.56768143, + "num_input_tokens_seen": 265443445, + "step": 12304, + "time_per_iteration": 2.6391637325286865 + }, + { + "auxiliary_loss_clip": 0.01040971, + "auxiliary_loss_mlp": 0.01033022, + "balance_loss_clip": 1.02397132, + "balance_loss_mlp": 1.02169716, + "epoch": 0.7398166240793627, + "flos": 23727760070400.0, + "grad_norm": 1.9625893700191415, + "language_loss": 0.84750664, + "learning_rate": 6.689515194989084e-07, + "loss": 0.86824656, + "num_input_tokens_seen": 265462085, + "step": 12305, + "time_per_iteration": 2.8094303607940674 + }, + { + "auxiliary_loss_clip": 0.00989438, + "auxiliary_loss_mlp": 0.01000987, + "balance_loss_clip": 1.00283003, + "balance_loss_mlp": 1.00003898, + "epoch": 0.7398767473320307, + "flos": 67267582882560.0, + "grad_norm": 0.8724263797949408, + "language_loss": 0.57705402, + "learning_rate": 6.68660859793615e-07, + "loss": 0.59695828, + "num_input_tokens_seen": 265521190, + "step": 12306, + "time_per_iteration": 3.341658592224121 + }, + { + "auxiliary_loss_clip": 0.01046645, + "auxiliary_loss_mlp": 0.01025812, + "balance_loss_clip": 1.02680826, + "balance_loss_mlp": 1.01489842, + "epoch": 0.7399368705846986, + "flos": 22018699981440.0, + "grad_norm": 2.0448564898304236, + "language_loss": 0.81552368, + "learning_rate": 6.683702505728355e-07, + "loss": 0.83624828, + "num_input_tokens_seen": 265539705, + "step": 12307, + "time_per_iteration": 2.7312984466552734 + }, + { + "auxiliary_loss_clip": 0.01053521, + "auxiliary_loss_mlp": 0.01029969, + "balance_loss_clip": 1.02711618, + "balance_loss_mlp": 1.01964557, + "epoch": 0.7399969938373666, + "flos": 14173870659840.0, + "grad_norm": 1.7180140740725047, + "language_loss": 0.69766271, + "learning_rate": 6.680796918475893e-07, + "loss": 0.71849763, + "num_input_tokens_seen": 265555855, + "step": 12308, + "time_per_iteration": 2.727745771408081 + }, + { + "auxiliary_loss_clip": 0.01031464, + "auxiliary_loss_mlp": 0.01028317, + "balance_loss_clip": 1.02346778, + "balance_loss_mlp": 1.017856, + "epoch": 0.7400571170900345, + "flos": 25301473712640.0, + "grad_norm": 1.7580380457292917, + "language_loss": 0.815727, + "learning_rate": 6.67789183628896e-07, + "loss": 0.83632481, + "num_input_tokens_seen": 265575455, + "step": 12309, + "time_per_iteration": 2.7774648666381836 + }, + { + "auxiliary_loss_clip": 0.01055061, + "auxiliary_loss_mlp": 0.01032666, + "balance_loss_clip": 1.02576864, + "balance_loss_mlp": 1.02119792, + "epoch": 0.7401172403427025, + "flos": 22711344917760.0, + "grad_norm": 1.6814871251983319, + "language_loss": 0.72757316, + "learning_rate": 6.674987259277692e-07, + "loss": 0.7484504, + "num_input_tokens_seen": 265595250, + "step": 12310, + "time_per_iteration": 4.3020758628845215 + }, + { + "auxiliary_loss_clip": 0.01031816, + "auxiliary_loss_mlp": 0.01036973, + "balance_loss_clip": 1.02636933, + "balance_loss_mlp": 1.0251714, + "epoch": 0.7401773635953706, + "flos": 18067448188800.0, + "grad_norm": 2.4587539011721278, + "language_loss": 0.88373196, + "learning_rate": 6.672083187552239e-07, + "loss": 0.9044199, + "num_input_tokens_seen": 265606945, + "step": 12311, + "time_per_iteration": 2.655963897705078 + }, + { + "auxiliary_loss_clip": 0.01007998, + "auxiliary_loss_mlp": 0.01026307, + "balance_loss_clip": 1.02199924, + "balance_loss_mlp": 1.016258, + "epoch": 0.7402374868480385, + "flos": 22712135016960.0, + "grad_norm": 1.6035222469661279, + "language_loss": 0.80562699, + "learning_rate": 6.669179621222738e-07, + "loss": 0.82597005, + "num_input_tokens_seen": 265626115, + "step": 12312, + "time_per_iteration": 2.8145222663879395 + }, + { + "auxiliary_loss_clip": 0.01008311, + "auxiliary_loss_mlp": 0.01028816, + "balance_loss_clip": 1.01932287, + "balance_loss_mlp": 1.01828933, + "epoch": 0.7402976101007065, + "flos": 22856675345280.0, + "grad_norm": 1.7141365099804968, + "language_loss": 0.78673923, + "learning_rate": 6.666276560399273e-07, + "loss": 0.80711043, + "num_input_tokens_seen": 265646520, + "step": 12313, + "time_per_iteration": 2.8372254371643066 + }, + { + "auxiliary_loss_clip": 0.01013632, + "auxiliary_loss_mlp": 0.01029549, + "balance_loss_clip": 1.02263379, + "balance_loss_mlp": 1.01862395, + "epoch": 0.7403577333533744, + "flos": 12345801834240.0, + "grad_norm": 1.9976966526293447, + "language_loss": 0.78362632, + "learning_rate": 6.663374005191937e-07, + "loss": 0.80405807, + "num_input_tokens_seen": 265661875, + "step": 12314, + "time_per_iteration": 2.699237585067749 + }, + { + "auxiliary_loss_clip": 0.00998386, + "auxiliary_loss_mlp": 0.01002254, + "balance_loss_clip": 1.00330281, + "balance_loss_mlp": 1.00130618, + "epoch": 0.7404178566060424, + "flos": 60327270869760.0, + "grad_norm": 0.8154874226086521, + "language_loss": 0.55175292, + "learning_rate": 6.660471955710809e-07, + "loss": 0.57175934, + "num_input_tokens_seen": 265721255, + "step": 12315, + "time_per_iteration": 3.131941318511963 + }, + { + "auxiliary_loss_clip": 0.01051886, + "auxiliary_loss_mlp": 0.01031823, + "balance_loss_clip": 1.02620173, + "balance_loss_mlp": 1.02108192, + "epoch": 0.7404779798587103, + "flos": 32014650072960.0, + "grad_norm": 1.4652302490263276, + "language_loss": 0.79485261, + "learning_rate": 6.65757041206591e-07, + "loss": 0.81568968, + "num_input_tokens_seen": 265743970, + "step": 12316, + "time_per_iteration": 2.724005937576294 + }, + { + "auxiliary_loss_clip": 0.01050608, + "auxiliary_loss_mlp": 0.01025215, + "balance_loss_clip": 1.0231359, + "balance_loss_mlp": 1.01480842, + "epoch": 0.7405381031113784, + "flos": 12889704551040.0, + "grad_norm": 2.044223329621528, + "language_loss": 0.74824536, + "learning_rate": 6.654669374367275e-07, + "loss": 0.76900357, + "num_input_tokens_seen": 265760890, + "step": 12317, + "time_per_iteration": 2.713360071182251 + }, + { + "auxiliary_loss_clip": 0.0103159, + "auxiliary_loss_mlp": 0.01030723, + "balance_loss_clip": 1.02462125, + "balance_loss_mlp": 1.02094173, + "epoch": 0.7405982263640463, + "flos": 20229127557120.0, + "grad_norm": 1.9533934105527173, + "language_loss": 0.81517327, + "learning_rate": 6.651768842724917e-07, + "loss": 0.83579636, + "num_input_tokens_seen": 265779600, + "step": 12318, + "time_per_iteration": 2.673558235168457 + }, + { + "auxiliary_loss_clip": 0.01036326, + "auxiliary_loss_mlp": 0.01026901, + "balance_loss_clip": 1.02323222, + "balance_loss_mlp": 1.01603532, + "epoch": 0.7406583496167143, + "flos": 17567213431680.0, + "grad_norm": 2.6420132334199424, + "language_loss": 0.76864755, + "learning_rate": 6.648868817248827e-07, + "loss": 0.78927982, + "num_input_tokens_seen": 265797030, + "step": 12319, + "time_per_iteration": 2.817612409591675 + }, + { + "auxiliary_loss_clip": 0.01027879, + "auxiliary_loss_mlp": 0.01029529, + "balance_loss_clip": 1.0220679, + "balance_loss_mlp": 1.01978946, + "epoch": 0.7407184728693822, + "flos": 18295733076480.0, + "grad_norm": 3.9418922923493103, + "language_loss": 0.63984293, + "learning_rate": 6.64596929804897e-07, + "loss": 0.66041708, + "num_input_tokens_seen": 265815055, + "step": 12320, + "time_per_iteration": 2.632777214050293 + }, + { + "auxiliary_loss_clip": 0.01056005, + "auxiliary_loss_mlp": 0.01035313, + "balance_loss_clip": 1.02673149, + "balance_loss_mlp": 1.02402961, + "epoch": 0.7407785961220502, + "flos": 16690562098560.0, + "grad_norm": 2.2381966322185183, + "language_loss": 0.82275414, + "learning_rate": 6.643070285235288e-07, + "loss": 0.84366733, + "num_input_tokens_seen": 265828480, + "step": 12321, + "time_per_iteration": 2.633742570877075 + }, + { + "auxiliary_loss_clip": 0.01046676, + "auxiliary_loss_mlp": 0.01041433, + "balance_loss_clip": 1.02555144, + "balance_loss_mlp": 1.02930331, + "epoch": 0.7408387193747181, + "flos": 22088330496000.0, + "grad_norm": 2.94553192531401, + "language_loss": 0.72036028, + "learning_rate": 6.640171778917727e-07, + "loss": 0.74124134, + "num_input_tokens_seen": 265845825, + "step": 12322, + "time_per_iteration": 2.6385576725006104 + }, + { + "auxiliary_loss_clip": 0.01050307, + "auxiliary_loss_mlp": 0.00747686, + "balance_loss_clip": 1.02341676, + "balance_loss_mlp": 1.00046635, + "epoch": 0.7408988426273861, + "flos": 24236721832320.0, + "grad_norm": 1.7150112746926764, + "language_loss": 0.64171934, + "learning_rate": 6.637273779206183e-07, + "loss": 0.65969926, + "num_input_tokens_seen": 265866335, + "step": 12323, + "time_per_iteration": 2.6907296180725098 + }, + { + "auxiliary_loss_clip": 0.01031265, + "auxiliary_loss_mlp": 0.01025994, + "balance_loss_clip": 1.02271056, + "balance_loss_mlp": 1.01473451, + "epoch": 0.7409589658800542, + "flos": 29023004073600.0, + "grad_norm": 1.4385914901411896, + "language_loss": 0.76197708, + "learning_rate": 6.634376286210559e-07, + "loss": 0.78254968, + "num_input_tokens_seen": 265888945, + "step": 12324, + "time_per_iteration": 2.7583608627319336 + }, + { + "auxiliary_loss_clip": 0.01045789, + "auxiliary_loss_mlp": 0.01025216, + "balance_loss_clip": 1.02698243, + "balance_loss_mlp": 1.01462412, + "epoch": 0.7410190891327221, + "flos": 19351362902400.0, + "grad_norm": 1.5953129570703397, + "language_loss": 0.74874067, + "learning_rate": 6.63147930004073e-07, + "loss": 0.76945078, + "num_input_tokens_seen": 265908030, + "step": 12325, + "time_per_iteration": 2.742692232131958 + }, + { + "auxiliary_loss_clip": 0.01023497, + "auxiliary_loss_mlp": 0.01031556, + "balance_loss_clip": 1.02342367, + "balance_loss_mlp": 1.02025533, + "epoch": 0.7410792123853901, + "flos": 22747650589440.0, + "grad_norm": 1.829440806273393, + "language_loss": 0.68388128, + "learning_rate": 6.628582820806545e-07, + "loss": 0.70443189, + "num_input_tokens_seen": 265927030, + "step": 12326, + "time_per_iteration": 2.7501931190490723 + }, + { + "auxiliary_loss_clip": 0.01034601, + "auxiliary_loss_mlp": 0.01029993, + "balance_loss_clip": 1.02695608, + "balance_loss_mlp": 1.01960969, + "epoch": 0.741139335638058, + "flos": 25372433030400.0, + "grad_norm": 1.6641965305760977, + "language_loss": 0.89809, + "learning_rate": 6.625686848617835e-07, + "loss": 0.91873598, + "num_input_tokens_seen": 265945490, + "step": 12327, + "time_per_iteration": 4.403039216995239 + }, + { + "auxiliary_loss_clip": 0.01063863, + "auxiliary_loss_mlp": 0.01029259, + "balance_loss_clip": 1.02611899, + "balance_loss_mlp": 1.01841068, + "epoch": 0.741199458890726, + "flos": 18585639745920.0, + "grad_norm": 1.652582701606918, + "language_loss": 0.85384476, + "learning_rate": 6.62279138358442e-07, + "loss": 0.87477601, + "num_input_tokens_seen": 265963265, + "step": 12328, + "time_per_iteration": 2.704620599746704 + }, + { + "auxiliary_loss_clip": 0.01048968, + "auxiliary_loss_mlp": 0.0102394, + "balance_loss_clip": 1.02266455, + "balance_loss_mlp": 1.01270461, + "epoch": 0.7412595821433939, + "flos": 22127078292480.0, + "grad_norm": 1.5971506843350174, + "language_loss": 0.66935039, + "learning_rate": 6.619896425816103e-07, + "loss": 0.69007945, + "num_input_tokens_seen": 265982270, + "step": 12329, + "time_per_iteration": 2.64532732963562 + }, + { + "auxiliary_loss_clip": 0.01031806, + "auxiliary_loss_mlp": 0.01038355, + "balance_loss_clip": 1.02514052, + "balance_loss_mlp": 1.02615404, + "epoch": 0.741319705396062, + "flos": 29169699217920.0, + "grad_norm": 1.723771563100533, + "language_loss": 0.66863, + "learning_rate": 6.617001975422647e-07, + "loss": 0.68933165, + "num_input_tokens_seen": 266003835, + "step": 12330, + "time_per_iteration": 2.8094255924224854 + }, + { + "auxiliary_loss_clip": 0.01041584, + "auxiliary_loss_mlp": 0.01032288, + "balance_loss_clip": 1.02887082, + "balance_loss_mlp": 1.01949716, + "epoch": 0.7413798286487299, + "flos": 20667489137280.0, + "grad_norm": 2.160382966765648, + "language_loss": 0.8595624, + "learning_rate": 6.614108032513823e-07, + "loss": 0.88030106, + "num_input_tokens_seen": 266021595, + "step": 12331, + "time_per_iteration": 2.7661221027374268 + }, + { + "auxiliary_loss_clip": 0.00997435, + "auxiliary_loss_mlp": 0.01026368, + "balance_loss_clip": 1.02618337, + "balance_loss_mlp": 1.01542425, + "epoch": 0.7414399519013979, + "flos": 16398895662720.0, + "grad_norm": 2.3492436398681398, + "language_loss": 0.69423294, + "learning_rate": 6.611214597199364e-07, + "loss": 0.71447104, + "num_input_tokens_seen": 266039860, + "step": 12332, + "time_per_iteration": 2.987008571624756 + }, + { + "auxiliary_loss_clip": 0.01064427, + "auxiliary_loss_mlp": 0.01032305, + "balance_loss_clip": 1.02623618, + "balance_loss_mlp": 1.02064049, + "epoch": 0.7415000751540658, + "flos": 25630235919360.0, + "grad_norm": 2.078718283167094, + "language_loss": 0.63469988, + "learning_rate": 6.608321669588984e-07, + "loss": 0.65566725, + "num_input_tokens_seen": 266058050, + "step": 12333, + "time_per_iteration": 2.8207755088806152 + }, + { + "auxiliary_loss_clip": 0.01042721, + "auxiliary_loss_mlp": 0.01031624, + "balance_loss_clip": 1.02711272, + "balance_loss_mlp": 1.02138972, + "epoch": 0.7415601984067338, + "flos": 24499732193280.0, + "grad_norm": 1.6177838012201262, + "language_loss": 0.71349132, + "learning_rate": 6.605429249792387e-07, + "loss": 0.73423475, + "num_input_tokens_seen": 266078060, + "step": 12334, + "time_per_iteration": 2.836254358291626 + }, + { + "auxiliary_loss_clip": 0.01023608, + "auxiliary_loss_mlp": 0.01023944, + "balance_loss_clip": 1.02401686, + "balance_loss_mlp": 1.01388872, + "epoch": 0.7416203216594017, + "flos": 20887154760960.0, + "grad_norm": 1.729751732017792, + "language_loss": 0.82617742, + "learning_rate": 6.602537337919257e-07, + "loss": 0.84665298, + "num_input_tokens_seen": 266097110, + "step": 12335, + "time_per_iteration": 2.7927443981170654 + }, + { + "auxiliary_loss_clip": 0.0106397, + "auxiliary_loss_mlp": 0.01028947, + "balance_loss_clip": 1.02531958, + "balance_loss_mlp": 1.01733017, + "epoch": 0.7416804449120697, + "flos": 15624265933440.0, + "grad_norm": 2.7393893480290297, + "language_loss": 0.74992025, + "learning_rate": 6.599645934079259e-07, + "loss": 0.77084941, + "num_input_tokens_seen": 266110870, + "step": 12336, + "time_per_iteration": 2.7226459980010986 + }, + { + "auxiliary_loss_clip": 0.01018515, + "auxiliary_loss_mlp": 0.01029578, + "balance_loss_clip": 1.02412963, + "balance_loss_mlp": 1.01867652, + "epoch": 0.7417405681647377, + "flos": 17120483982720.0, + "grad_norm": 1.7708233686082184, + "language_loss": 0.73440355, + "learning_rate": 6.596755038382029e-07, + "loss": 0.75488448, + "num_input_tokens_seen": 266127845, + "step": 12337, + "time_per_iteration": 4.511249303817749 + }, + { + "auxiliary_loss_clip": 0.01041032, + "auxiliary_loss_mlp": 0.01030714, + "balance_loss_clip": 1.0244956, + "balance_loss_mlp": 1.02024114, + "epoch": 0.7418006914174057, + "flos": 18880322924160.0, + "grad_norm": 2.2406719421564882, + "language_loss": 0.76750779, + "learning_rate": 6.593864650937186e-07, + "loss": 0.78822529, + "num_input_tokens_seen": 266145400, + "step": 12338, + "time_per_iteration": 2.854433059692383 + }, + { + "auxiliary_loss_clip": 0.01051868, + "auxiliary_loss_mlp": 0.01025272, + "balance_loss_clip": 1.02442122, + "balance_loss_mlp": 1.01564574, + "epoch": 0.7418608146700737, + "flos": 21580733450880.0, + "grad_norm": 1.671564139781895, + "language_loss": 0.72768593, + "learning_rate": 6.590974771854345e-07, + "loss": 0.74845731, + "num_input_tokens_seen": 266164430, + "step": 12339, + "time_per_iteration": 2.7759668827056885 + }, + { + "auxiliary_loss_clip": 0.01044531, + "auxiliary_loss_mlp": 0.01028705, + "balance_loss_clip": 1.02604365, + "balance_loss_mlp": 1.01735079, + "epoch": 0.7419209379227416, + "flos": 22340459036160.0, + "grad_norm": 2.3414483494114657, + "language_loss": 0.7959398, + "learning_rate": 6.588085401243077e-07, + "loss": 0.81667221, + "num_input_tokens_seen": 266183855, + "step": 12340, + "time_per_iteration": 2.81400990486145 + }, + { + "auxiliary_loss_clip": 0.01017947, + "auxiliary_loss_mlp": 0.01033123, + "balance_loss_clip": 1.02098525, + "balance_loss_mlp": 1.02205992, + "epoch": 0.7419810611754096, + "flos": 16762275601920.0, + "grad_norm": 1.4716625355900752, + "language_loss": 0.75753266, + "learning_rate": 6.585196539212958e-07, + "loss": 0.77804333, + "num_input_tokens_seen": 266202085, + "step": 12341, + "time_per_iteration": 2.7876861095428467 + }, + { + "auxiliary_loss_clip": 0.01032496, + "auxiliary_loss_mlp": 0.01034277, + "balance_loss_clip": 1.02344382, + "balance_loss_mlp": 1.02262402, + "epoch": 0.7420411844280775, + "flos": 26212958259840.0, + "grad_norm": 1.4213075596845144, + "language_loss": 0.80113471, + "learning_rate": 6.582308185873535e-07, + "loss": 0.8218025, + "num_input_tokens_seen": 266223445, + "step": 12342, + "time_per_iteration": 2.76955246925354 + }, + { + "auxiliary_loss_clip": 0.01026254, + "auxiliary_loss_mlp": 0.01027168, + "balance_loss_clip": 1.02189803, + "balance_loss_mlp": 1.01674914, + "epoch": 0.7421013076807456, + "flos": 68529371840640.0, + "grad_norm": 2.303000606192894, + "language_loss": 0.77328408, + "learning_rate": 6.57942034133433e-07, + "loss": 0.7938183, + "num_input_tokens_seen": 266246575, + "step": 12343, + "time_per_iteration": 3.1150219440460205 + }, + { + "auxiliary_loss_clip": 0.01035258, + "auxiliary_loss_mlp": 0.01029012, + "balance_loss_clip": 1.02192211, + "balance_loss_mlp": 1.01850414, + "epoch": 0.7421614309334135, + "flos": 24425325169920.0, + "grad_norm": 1.486130744213826, + "language_loss": 0.67684078, + "learning_rate": 6.576533005704843e-07, + "loss": 0.69748348, + "num_input_tokens_seen": 266266055, + "step": 12344, + "time_per_iteration": 2.7343997955322266 + }, + { + "auxiliary_loss_clip": 0.01025649, + "auxiliary_loss_mlp": 0.01030894, + "balance_loss_clip": 1.0265913, + "balance_loss_mlp": 1.01862741, + "epoch": 0.7422215541860815, + "flos": 12311076360960.0, + "grad_norm": 2.3000737768840502, + "language_loss": 0.81251621, + "learning_rate": 6.573646179094572e-07, + "loss": 0.8330816, + "num_input_tokens_seen": 266282240, + "step": 12345, + "time_per_iteration": 2.785858631134033 + }, + { + "auxiliary_loss_clip": 0.01020596, + "auxiliary_loss_mlp": 0.01031183, + "balance_loss_clip": 1.02253532, + "balance_loss_mlp": 1.01984572, + "epoch": 0.7422816774387494, + "flos": 19645579203840.0, + "grad_norm": 1.9702453787312568, + "language_loss": 0.70611823, + "learning_rate": 6.570759861612988e-07, + "loss": 0.72663599, + "num_input_tokens_seen": 266300980, + "step": 12346, + "time_per_iteration": 2.7575249671936035 + }, + { + "auxiliary_loss_clip": 0.01054441, + "auxiliary_loss_mlp": 0.01028895, + "balance_loss_clip": 1.02651989, + "balance_loss_mlp": 1.01820779, + "epoch": 0.7423418006914174, + "flos": 32015978876160.0, + "grad_norm": 1.5708278989419804, + "language_loss": 0.73270237, + "learning_rate": 6.56787405336953e-07, + "loss": 0.75353575, + "num_input_tokens_seen": 266322215, + "step": 12347, + "time_per_iteration": 2.7495102882385254 + }, + { + "auxiliary_loss_clip": 0.0104126, + "auxiliary_loss_mlp": 0.01030474, + "balance_loss_clip": 1.02401435, + "balance_loss_mlp": 1.01932764, + "epoch": 0.7424019239440853, + "flos": 18916951818240.0, + "grad_norm": 3.1457165202324404, + "language_loss": 0.81157041, + "learning_rate": 6.564988754473642e-07, + "loss": 0.83228767, + "num_input_tokens_seen": 266341600, + "step": 12348, + "time_per_iteration": 2.639270544052124 + }, + { + "auxiliary_loss_clip": 0.01061435, + "auxiliary_loss_mlp": 0.01028529, + "balance_loss_clip": 1.02390313, + "balance_loss_mlp": 1.01821125, + "epoch": 0.7424620471967533, + "flos": 35876518871040.0, + "grad_norm": 1.8476700589091586, + "language_loss": 0.72440541, + "learning_rate": 6.562103965034724e-07, + "loss": 0.745305, + "num_input_tokens_seen": 266362895, + "step": 12349, + "time_per_iteration": 2.7298927307128906 + }, + { + "auxiliary_loss_clip": 0.01042848, + "auxiliary_loss_mlp": 0.01033277, + "balance_loss_clip": 1.02344561, + "balance_loss_mlp": 1.02134418, + "epoch": 0.7425221704494213, + "flos": 27016603200000.0, + "grad_norm": 3.6531557331865976, + "language_loss": 0.78417885, + "learning_rate": 6.559219685162165e-07, + "loss": 0.8049401, + "num_input_tokens_seen": 266384015, + "step": 12350, + "time_per_iteration": 2.744189977645874 + }, + { + "auxiliary_loss_clip": 0.0102702, + "auxiliary_loss_mlp": 0.01030393, + "balance_loss_clip": 1.02678514, + "balance_loss_mlp": 1.01954472, + "epoch": 0.7425822937020893, + "flos": 34167135559680.0, + "grad_norm": 4.143817178133297, + "language_loss": 0.75265968, + "learning_rate": 6.556335914965343e-07, + "loss": 0.77323377, + "num_input_tokens_seen": 266405990, + "step": 12351, + "time_per_iteration": 2.9232447147369385 + }, + { + "auxiliary_loss_clip": 0.01007902, + "auxiliary_loss_mlp": 0.01025051, + "balance_loss_clip": 1.02564752, + "balance_loss_mlp": 1.01426804, + "epoch": 0.7426424169547573, + "flos": 21283572234240.0, + "grad_norm": 3.8985636029905337, + "language_loss": 0.8146292, + "learning_rate": 6.553452654553611e-07, + "loss": 0.83495867, + "num_input_tokens_seen": 266424260, + "step": 12352, + "time_per_iteration": 2.864043951034546 + }, + { + "auxiliary_loss_clip": 0.01054906, + "auxiliary_loss_mlp": 0.0103008, + "balance_loss_clip": 1.02673483, + "balance_loss_mlp": 1.01939905, + "epoch": 0.7427025402074252, + "flos": 22448442297600.0, + "grad_norm": 1.6431357904418442, + "language_loss": 0.71588099, + "learning_rate": 6.550569904036307e-07, + "loss": 0.73673081, + "num_input_tokens_seen": 266444580, + "step": 12353, + "time_per_iteration": 2.6827094554901123 + }, + { + "auxiliary_loss_clip": 0.01053766, + "auxiliary_loss_mlp": 0.0103156, + "balance_loss_clip": 1.02617621, + "balance_loss_mlp": 1.02106977, + "epoch": 0.7427626634600932, + "flos": 22524609087360.0, + "grad_norm": 1.8588930166382263, + "language_loss": 0.7234143, + "learning_rate": 6.547687663522739e-07, + "loss": 0.74426758, + "num_input_tokens_seen": 266465640, + "step": 12354, + "time_per_iteration": 2.785813808441162 + }, + { + "auxiliary_loss_clip": 0.00997792, + "auxiliary_loss_mlp": 0.0100228, + "balance_loss_clip": 1.00204611, + "balance_loss_mlp": 1.00130868, + "epoch": 0.7428227867127611, + "flos": 67209477655680.0, + "grad_norm": 0.6992641459060325, + "language_loss": 0.59518957, + "learning_rate": 6.544805933122199e-07, + "loss": 0.61519033, + "num_input_tokens_seen": 266531950, + "step": 12355, + "time_per_iteration": 3.3592119216918945 + }, + { + "auxiliary_loss_clip": 0.01062629, + "auxiliary_loss_mlp": 0.01028973, + "balance_loss_clip": 1.02478313, + "balance_loss_mlp": 1.01803565, + "epoch": 0.7428829099654292, + "flos": 14721221082240.0, + "grad_norm": 3.104625622641202, + "language_loss": 0.67919934, + "learning_rate": 6.541924712943971e-07, + "loss": 0.70011538, + "num_input_tokens_seen": 266550665, + "step": 12356, + "time_per_iteration": 2.6634461879730225 + }, + { + "auxiliary_loss_clip": 0.01051044, + "auxiliary_loss_mlp": 0.00747764, + "balance_loss_clip": 1.02243376, + "balance_loss_mlp": 1.00042379, + "epoch": 0.7429430332180971, + "flos": 48646496413440.0, + "grad_norm": 1.8571960883939158, + "language_loss": 0.71591687, + "learning_rate": 6.539044003097301e-07, + "loss": 0.73390496, + "num_input_tokens_seen": 266572455, + "step": 12357, + "time_per_iteration": 4.682558059692383 + }, + { + "auxiliary_loss_clip": 0.01041572, + "auxiliary_loss_mlp": 0.01023905, + "balance_loss_clip": 1.02574158, + "balance_loss_mlp": 1.01393914, + "epoch": 0.7430031564707651, + "flos": 16764071281920.0, + "grad_norm": 1.7975685357974591, + "language_loss": 0.65670997, + "learning_rate": 6.53616380369143e-07, + "loss": 0.67736477, + "num_input_tokens_seen": 266590895, + "step": 12358, + "time_per_iteration": 4.417608737945557 + }, + { + "auxiliary_loss_clip": 0.01026039, + "auxiliary_loss_mlp": 0.01033541, + "balance_loss_clip": 1.02554607, + "balance_loss_mlp": 1.02160788, + "epoch": 0.743063279723433, + "flos": 23870576545920.0, + "grad_norm": 2.347057665396335, + "language_loss": 0.80993831, + "learning_rate": 6.533284114835591e-07, + "loss": 0.8305341, + "num_input_tokens_seen": 266607660, + "step": 12359, + "time_per_iteration": 2.7962098121643066 + }, + { + "auxiliary_loss_clip": 0.01051321, + "auxiliary_loss_mlp": 0.01026305, + "balance_loss_clip": 1.02357841, + "balance_loss_mlp": 1.01596904, + "epoch": 0.743123402976101, + "flos": 14391704689920.0, + "grad_norm": 1.8084708517886996, + "language_loss": 0.6805867, + "learning_rate": 6.530404936638956e-07, + "loss": 0.70136297, + "num_input_tokens_seen": 266624260, + "step": 12360, + "time_per_iteration": 2.8799266815185547 + }, + { + "auxiliary_loss_clip": 0.01050887, + "auxiliary_loss_mlp": 0.00747809, + "balance_loss_clip": 1.02286923, + "balance_loss_mlp": 1.0004859, + "epoch": 0.7431835262287689, + "flos": 27454318335360.0, + "grad_norm": 1.5638514504134209, + "language_loss": 0.72477144, + "learning_rate": 6.527526269210715e-07, + "loss": 0.74275839, + "num_input_tokens_seen": 266644210, + "step": 12361, + "time_per_iteration": 2.8570024967193604 + }, + { + "auxiliary_loss_clip": 0.01012222, + "auxiliary_loss_mlp": 0.01038718, + "balance_loss_clip": 1.02109778, + "balance_loss_mlp": 1.02635622, + "epoch": 0.743243649481437, + "flos": 20959514709120.0, + "grad_norm": 1.9168756014946213, + "language_loss": 0.56134945, + "learning_rate": 6.524648112660027e-07, + "loss": 0.58185887, + "num_input_tokens_seen": 266664230, + "step": 12362, + "time_per_iteration": 2.8975002765655518 + }, + { + "auxiliary_loss_clip": 0.01025833, + "auxiliary_loss_mlp": 0.01027409, + "balance_loss_clip": 1.02485764, + "balance_loss_mlp": 1.01629221, + "epoch": 0.7433037727341049, + "flos": 22783166161920.0, + "grad_norm": 1.787116844493036, + "language_loss": 0.77009261, + "learning_rate": 6.521770467096039e-07, + "loss": 0.79062498, + "num_input_tokens_seen": 266683270, + "step": 12363, + "time_per_iteration": 2.731623411178589 + }, + { + "auxiliary_loss_clip": 0.0103355, + "auxiliary_loss_mlp": 0.01029176, + "balance_loss_clip": 1.02273536, + "balance_loss_mlp": 1.01877451, + "epoch": 0.7433638959867729, + "flos": 22196708807040.0, + "grad_norm": 1.8331282256370012, + "language_loss": 0.78211254, + "learning_rate": 6.518893332627862e-07, + "loss": 0.80273986, + "num_input_tokens_seen": 266701235, + "step": 12364, + "time_per_iteration": 2.705673933029175 + }, + { + "auxiliary_loss_clip": 0.01051294, + "auxiliary_loss_mlp": 0.01029874, + "balance_loss_clip": 1.02365541, + "balance_loss_mlp": 1.01972938, + "epoch": 0.7434240192394409, + "flos": 23296760778240.0, + "grad_norm": 1.9012956714907567, + "language_loss": 0.78793359, + "learning_rate": 6.516016709364604e-07, + "loss": 0.80874527, + "num_input_tokens_seen": 266721495, + "step": 12365, + "time_per_iteration": 2.642268180847168 + }, + { + "auxiliary_loss_clip": 0.01045486, + "auxiliary_loss_mlp": 0.01033637, + "balance_loss_clip": 1.02506459, + "balance_loss_mlp": 1.02215743, + "epoch": 0.7434841424921088, + "flos": 54009575251200.0, + "grad_norm": 1.5572111521192085, + "language_loss": 0.77179766, + "learning_rate": 6.513140597415346e-07, + "loss": 0.79258889, + "num_input_tokens_seen": 266747400, + "step": 12366, + "time_per_iteration": 3.009843587875366 + }, + { + "auxiliary_loss_clip": 0.01051732, + "auxiliary_loss_mlp": 0.01025166, + "balance_loss_clip": 1.02563012, + "balance_loss_mlp": 1.01605797, + "epoch": 0.7435442657447768, + "flos": 21433966479360.0, + "grad_norm": 1.4254696346179012, + "language_loss": 0.71069801, + "learning_rate": 6.510264996889141e-07, + "loss": 0.73146689, + "num_input_tokens_seen": 266767630, + "step": 12367, + "time_per_iteration": 2.603889226913452 + }, + { + "auxiliary_loss_clip": 0.01034177, + "auxiliary_loss_mlp": 0.01032043, + "balance_loss_clip": 1.02517462, + "balance_loss_mlp": 1.02146935, + "epoch": 0.7436043889974447, + "flos": 24499408970880.0, + "grad_norm": 1.5305773622896748, + "language_loss": 0.74585724, + "learning_rate": 6.507389907895038e-07, + "loss": 0.76651943, + "num_input_tokens_seen": 266788015, + "step": 12368, + "time_per_iteration": 2.7772929668426514 + }, + { + "auxiliary_loss_clip": 0.01051295, + "auxiliary_loss_mlp": 0.01032576, + "balance_loss_clip": 1.02592707, + "balance_loss_mlp": 1.02343273, + "epoch": 0.7436645122501128, + "flos": 40698388512000.0, + "grad_norm": 1.6680611600581234, + "language_loss": 0.69159734, + "learning_rate": 6.50451533054207e-07, + "loss": 0.71243608, + "num_input_tokens_seen": 266809010, + "step": 12369, + "time_per_iteration": 2.7825841903686523 + }, + { + "auxiliary_loss_clip": 0.01041458, + "auxiliary_loss_mlp": 0.00747643, + "balance_loss_clip": 1.02434158, + "balance_loss_mlp": 1.00042081, + "epoch": 0.7437246355027807, + "flos": 18908835344640.0, + "grad_norm": 1.7581832943511573, + "language_loss": 0.75685596, + "learning_rate": 6.501641264939233e-07, + "loss": 0.77474689, + "num_input_tokens_seen": 266825390, + "step": 12370, + "time_per_iteration": 2.6399433612823486 + }, + { + "auxiliary_loss_clip": 0.0106558, + "auxiliary_loss_mlp": 0.01032358, + "balance_loss_clip": 1.02786279, + "balance_loss_mlp": 1.02189744, + "epoch": 0.7437847587554487, + "flos": 21543817248000.0, + "grad_norm": 1.6310523541823208, + "language_loss": 0.78106606, + "learning_rate": 6.498767711195503e-07, + "loss": 0.80204546, + "num_input_tokens_seen": 266844675, + "step": 12371, + "time_per_iteration": 2.693600654602051 + }, + { + "auxiliary_loss_clip": 0.01043697, + "auxiliary_loss_mlp": 0.01022941, + "balance_loss_clip": 1.02509809, + "balance_loss_mlp": 1.01242709, + "epoch": 0.7438448820081166, + "flos": 27782470010880.0, + "grad_norm": 1.7944399948912306, + "language_loss": 0.69376111, + "learning_rate": 6.495894669419857e-07, + "loss": 0.71442759, + "num_input_tokens_seen": 266865160, + "step": 12372, + "time_per_iteration": 2.679910659790039 + }, + { + "auxiliary_loss_clip": 0.0103424, + "auxiliary_loss_mlp": 0.01029724, + "balance_loss_clip": 1.02323651, + "balance_loss_mlp": 1.01907885, + "epoch": 0.7439050052607846, + "flos": 17967832796160.0, + "grad_norm": 2.397184982846289, + "language_loss": 0.75089562, + "learning_rate": 6.493022139721245e-07, + "loss": 0.77153528, + "num_input_tokens_seen": 266883285, + "step": 12373, + "time_per_iteration": 2.6289925575256348 + }, + { + "auxiliary_loss_clip": 0.01010813, + "auxiliary_loss_mlp": 0.01032189, + "balance_loss_clip": 1.0202415, + "balance_loss_mlp": 1.01985097, + "epoch": 0.7439651285134525, + "flos": 22958696949120.0, + "grad_norm": 1.58979111448116, + "language_loss": 0.76856077, + "learning_rate": 6.49015012220858e-07, + "loss": 0.7889908, + "num_input_tokens_seen": 266900960, + "step": 12374, + "time_per_iteration": 4.407110691070557 + }, + { + "auxiliary_loss_clip": 0.00997539, + "auxiliary_loss_mlp": 0.0103004, + "balance_loss_clip": 1.02183366, + "balance_loss_mlp": 1.01915061, + "epoch": 0.7440252517661206, + "flos": 18806777827200.0, + "grad_norm": 2.5279269445955714, + "language_loss": 0.76126546, + "learning_rate": 6.487278616990774e-07, + "loss": 0.78154123, + "num_input_tokens_seen": 266917710, + "step": 12375, + "time_per_iteration": 2.787205457687378 + }, + { + "auxiliary_loss_clip": 0.01048167, + "auxiliary_loss_mlp": 0.01027234, + "balance_loss_clip": 1.02342892, + "balance_loss_mlp": 1.01803732, + "epoch": 0.7440853750187885, + "flos": 20266295155200.0, + "grad_norm": 2.274354450516412, + "language_loss": 0.77056777, + "learning_rate": 6.484407624176733e-07, + "loss": 0.79132181, + "num_input_tokens_seen": 266934220, + "step": 12376, + "time_per_iteration": 2.6922056674957275 + }, + { + "auxiliary_loss_clip": 0.01019531, + "auxiliary_loss_mlp": 0.01027245, + "balance_loss_clip": 1.02029014, + "balance_loss_mlp": 1.01479912, + "epoch": 0.7441454982714565, + "flos": 25337276593920.0, + "grad_norm": 1.6727473278045133, + "language_loss": 0.79659879, + "learning_rate": 6.481537143875296e-07, + "loss": 0.81706655, + "num_input_tokens_seen": 266955210, + "step": 12377, + "time_per_iteration": 2.8169336318969727 + }, + { + "auxiliary_loss_clip": 0.01055131, + "auxiliary_loss_mlp": 0.0102406, + "balance_loss_clip": 1.02612615, + "balance_loss_mlp": 1.01275325, + "epoch": 0.7442056215241245, + "flos": 64480910866560.0, + "grad_norm": 2.1897673529442163, + "language_loss": 0.67094564, + "learning_rate": 6.478667176195322e-07, + "loss": 0.69173753, + "num_input_tokens_seen": 266976555, + "step": 12378, + "time_per_iteration": 3.031196117401123 + }, + { + "auxiliary_loss_clip": 0.01036187, + "auxiliary_loss_mlp": 0.01033452, + "balance_loss_clip": 1.02641535, + "balance_loss_mlp": 1.0214057, + "epoch": 0.7442657447767924, + "flos": 31285376242560.0, + "grad_norm": 7.824926762358727, + "language_loss": 0.71894169, + "learning_rate": 6.475797721245648e-07, + "loss": 0.73963809, + "num_input_tokens_seen": 266997640, + "step": 12379, + "time_per_iteration": 2.8947649002075195 + }, + { + "auxiliary_loss_clip": 0.01026206, + "auxiliary_loss_mlp": 0.00747817, + "balance_loss_clip": 1.02271104, + "balance_loss_mlp": 1.00056982, + "epoch": 0.7443258680294604, + "flos": 20807899401600.0, + "grad_norm": 2.655132888741134, + "language_loss": 0.65384734, + "learning_rate": 6.472928779135085e-07, + "loss": 0.67158753, + "num_input_tokens_seen": 267016165, + "step": 12380, + "time_per_iteration": 2.680845022201538 + }, + { + "auxiliary_loss_clip": 0.01053877, + "auxiliary_loss_mlp": 0.01029962, + "balance_loss_clip": 1.0256294, + "balance_loss_mlp": 1.01905429, + "epoch": 0.7443859912821283, + "flos": 22199833290240.0, + "grad_norm": 1.7558103597364567, + "language_loss": 0.78353214, + "learning_rate": 6.470060349972411e-07, + "loss": 0.80437052, + "num_input_tokens_seen": 267034075, + "step": 12381, + "time_per_iteration": 2.6852145195007324 + }, + { + "auxiliary_loss_clip": 0.01029676, + "auxiliary_loss_mlp": 0.01034766, + "balance_loss_clip": 1.02547121, + "balance_loss_mlp": 1.02210641, + "epoch": 0.7444461145347964, + "flos": 22017838055040.0, + "grad_norm": 1.945223489800388, + "language_loss": 0.73270428, + "learning_rate": 6.467192433866411e-07, + "loss": 0.75334871, + "num_input_tokens_seen": 267053645, + "step": 12382, + "time_per_iteration": 2.752058506011963 + }, + { + "auxiliary_loss_clip": 0.00979004, + "auxiliary_loss_mlp": 0.01005592, + "balance_loss_clip": 1.00293732, + "balance_loss_mlp": 1.00463223, + "epoch": 0.7445062377874643, + "flos": 70559047704960.0, + "grad_norm": 0.6574620555874187, + "language_loss": 0.54680282, + "learning_rate": 6.464325030925831e-07, + "loss": 0.56664878, + "num_input_tokens_seen": 267121830, + "step": 12383, + "time_per_iteration": 3.446207046508789 + }, + { + "auxiliary_loss_clip": 0.01042079, + "auxiliary_loss_mlp": 0.010268, + "balance_loss_clip": 1.02303517, + "balance_loss_mlp": 1.01617193, + "epoch": 0.7445663610401323, + "flos": 22164425458560.0, + "grad_norm": 2.1699682015828765, + "language_loss": 0.7592361, + "learning_rate": 6.461458141259395e-07, + "loss": 0.77992487, + "num_input_tokens_seen": 267141145, + "step": 12384, + "time_per_iteration": 4.390983581542969 + }, + { + "auxiliary_loss_clip": 0.01052782, + "auxiliary_loss_mlp": 0.01024273, + "balance_loss_clip": 1.02469265, + "balance_loss_mlp": 1.01386595, + "epoch": 0.7446264842928002, + "flos": 24170251714560.0, + "grad_norm": 1.8820083576925384, + "language_loss": 0.79540509, + "learning_rate": 6.458591764975823e-07, + "loss": 0.81617564, + "num_input_tokens_seen": 267159280, + "step": 12385, + "time_per_iteration": 2.7053403854370117 + }, + { + "auxiliary_loss_clip": 0.0103705, + "auxiliary_loss_mlp": 0.01031033, + "balance_loss_clip": 1.02591991, + "balance_loss_mlp": 1.0190587, + "epoch": 0.7446866075454682, + "flos": 24134556574080.0, + "grad_norm": 1.8508975515209614, + "language_loss": 0.81470728, + "learning_rate": 6.455725902183813e-07, + "loss": 0.83538806, + "num_input_tokens_seen": 267179390, + "step": 12386, + "time_per_iteration": 2.804323673248291 + }, + { + "auxiliary_loss_clip": 0.01045184, + "auxiliary_loss_mlp": 0.01029333, + "balance_loss_clip": 1.02297032, + "balance_loss_mlp": 1.01889575, + "epoch": 0.7447467307981361, + "flos": 23548063305600.0, + "grad_norm": 1.6899562628408815, + "language_loss": 0.70982891, + "learning_rate": 6.452860552992037e-07, + "loss": 0.73057413, + "num_input_tokens_seen": 267198165, + "step": 12387, + "time_per_iteration": 2.747387647628784 + }, + { + "auxiliary_loss_clip": 0.01033133, + "auxiliary_loss_mlp": 0.01026091, + "balance_loss_clip": 1.02425933, + "balance_loss_mlp": 1.01608956, + "epoch": 0.7448068540508042, + "flos": 19567832215680.0, + "grad_norm": 2.112607483609791, + "language_loss": 0.7038514, + "learning_rate": 6.449995717509138e-07, + "loss": 0.72444367, + "num_input_tokens_seen": 267214520, + "step": 12388, + "time_per_iteration": 2.666799545288086 + }, + { + "auxiliary_loss_clip": 0.01049556, + "auxiliary_loss_mlp": 0.01032229, + "balance_loss_clip": 1.02274668, + "balance_loss_mlp": 1.02179861, + "epoch": 0.7448669773034721, + "flos": 21839721488640.0, + "grad_norm": 1.412121650157794, + "language_loss": 0.85160816, + "learning_rate": 6.447131395843761e-07, + "loss": 0.87242603, + "num_input_tokens_seen": 267236555, + "step": 12389, + "time_per_iteration": 2.6793625354766846 + }, + { + "auxiliary_loss_clip": 0.01019962, + "auxiliary_loss_mlp": 0.0102961, + "balance_loss_clip": 1.02303851, + "balance_loss_mlp": 1.01953661, + "epoch": 0.7449271005561401, + "flos": 25155389099520.0, + "grad_norm": 1.9114961585986052, + "language_loss": 0.79235339, + "learning_rate": 6.444267588104526e-07, + "loss": 0.8128491, + "num_input_tokens_seen": 267254800, + "step": 12390, + "time_per_iteration": 2.7757632732391357 + }, + { + "auxiliary_loss_clip": 0.01040815, + "auxiliary_loss_mlp": 0.01028891, + "balance_loss_clip": 1.02375031, + "balance_loss_mlp": 1.01743507, + "epoch": 0.7449872238088081, + "flos": 22273342473600.0, + "grad_norm": 1.8652592082022164, + "language_loss": 0.84734595, + "learning_rate": 6.441404294400014e-07, + "loss": 0.86804295, + "num_input_tokens_seen": 267274610, + "step": 12391, + "time_per_iteration": 2.6677327156066895 + }, + { + "auxiliary_loss_clip": 0.01061096, + "auxiliary_loss_mlp": 0.01025846, + "balance_loss_clip": 1.02362621, + "balance_loss_mlp": 1.01561213, + "epoch": 0.745047347061476, + "flos": 20594805966720.0, + "grad_norm": 1.9537366190873824, + "language_loss": 0.73784459, + "learning_rate": 6.438541514838811e-07, + "loss": 0.75871402, + "num_input_tokens_seen": 267292600, + "step": 12392, + "time_per_iteration": 2.5927202701568604 + }, + { + "auxiliary_loss_clip": 0.01049835, + "auxiliary_loss_mlp": 0.01031815, + "balance_loss_clip": 1.02472425, + "balance_loss_mlp": 1.02188516, + "epoch": 0.745107470314144, + "flos": 22127545169280.0, + "grad_norm": 1.6149361103808757, + "language_loss": 0.76778525, + "learning_rate": 6.435679249529487e-07, + "loss": 0.78860176, + "num_input_tokens_seen": 267311295, + "step": 12393, + "time_per_iteration": 2.6926939487457275 + }, + { + "auxiliary_loss_clip": 0.01055176, + "auxiliary_loss_mlp": 0.0103043, + "balance_loss_clip": 1.02680039, + "balance_loss_mlp": 1.01867032, + "epoch": 0.745167593566812, + "flos": 22236498097920.0, + "grad_norm": 1.802973927302809, + "language_loss": 0.72623074, + "learning_rate": 6.432817498580552e-07, + "loss": 0.74708676, + "num_input_tokens_seen": 267328390, + "step": 12394, + "time_per_iteration": 2.6760776042938232 + }, + { + "auxiliary_loss_clip": 0.01020426, + "auxiliary_loss_mlp": 0.00747604, + "balance_loss_clip": 1.02961373, + "balance_loss_mlp": 1.00040472, + "epoch": 0.74522771681948, + "flos": 20666232161280.0, + "grad_norm": 1.8913802797169648, + "language_loss": 0.81505144, + "learning_rate": 6.429956262100535e-07, + "loss": 0.83273172, + "num_input_tokens_seen": 267348185, + "step": 12395, + "time_per_iteration": 2.747770309448242 + }, + { + "auxiliary_loss_clip": 0.01054288, + "auxiliary_loss_mlp": 0.01031194, + "balance_loss_clip": 1.02517247, + "balance_loss_mlp": 1.02018487, + "epoch": 0.7452878400721479, + "flos": 21106999952640.0, + "grad_norm": 1.9843746509356668, + "language_loss": 0.71060568, + "learning_rate": 6.427095540197937e-07, + "loss": 0.73146051, + "num_input_tokens_seen": 267367010, + "step": 12396, + "time_per_iteration": 2.688272476196289 + }, + { + "auxiliary_loss_clip": 0.01026987, + "auxiliary_loss_mlp": 0.01027673, + "balance_loss_clip": 1.02622092, + "balance_loss_mlp": 1.01722419, + "epoch": 0.7453479633248159, + "flos": 26688056474880.0, + "grad_norm": 1.7878269201160373, + "language_loss": 0.68327338, + "learning_rate": 6.424235332981245e-07, + "loss": 0.70381993, + "num_input_tokens_seen": 267386605, + "step": 12397, + "time_per_iteration": 2.7702462673187256 + }, + { + "auxiliary_loss_clip": 0.01062476, + "auxiliary_loss_mlp": 0.0103464, + "balance_loss_clip": 1.02437496, + "balance_loss_mlp": 1.02346385, + "epoch": 0.7454080865774838, + "flos": 17016056167680.0, + "grad_norm": 1.9277204630526925, + "language_loss": 0.76987433, + "learning_rate": 6.421375640558908e-07, + "loss": 0.79084539, + "num_input_tokens_seen": 267404135, + "step": 12398, + "time_per_iteration": 2.5663726329803467 + }, + { + "auxiliary_loss_clip": 0.01051591, + "auxiliary_loss_mlp": 0.01024847, + "balance_loss_clip": 1.02478552, + "balance_loss_mlp": 1.01427317, + "epoch": 0.7454682098301518, + "flos": 21323900229120.0, + "grad_norm": 1.7444335092089105, + "language_loss": 0.77624464, + "learning_rate": 6.418516463039363e-07, + "loss": 0.79700899, + "num_input_tokens_seen": 267423120, + "step": 12399, + "time_per_iteration": 2.644815683364868 + }, + { + "auxiliary_loss_clip": 0.01033203, + "auxiliary_loss_mlp": 0.01033868, + "balance_loss_clip": 1.02077615, + "balance_loss_mlp": 1.02394998, + "epoch": 0.7455283330828197, + "flos": 17858341163520.0, + "grad_norm": 2.0131266127521816, + "language_loss": 0.73767704, + "learning_rate": 6.415657800531038e-07, + "loss": 0.75834775, + "num_input_tokens_seen": 267441250, + "step": 12400, + "time_per_iteration": 2.642285108566284 + }, + { + "auxiliary_loss_clip": 0.01050273, + "auxiliary_loss_mlp": 0.01026399, + "balance_loss_clip": 1.02367282, + "balance_loss_mlp": 1.01619482, + "epoch": 0.7455884563354878, + "flos": 30774259664640.0, + "grad_norm": 1.7863767855847608, + "language_loss": 0.82224554, + "learning_rate": 6.412799653142327e-07, + "loss": 0.84301227, + "num_input_tokens_seen": 267462820, + "step": 12401, + "time_per_iteration": 2.7006659507751465 + }, + { + "auxiliary_loss_clip": 0.01030174, + "auxiliary_loss_mlp": 0.01030632, + "balance_loss_clip": 1.02384484, + "balance_loss_mlp": 1.02094674, + "epoch": 0.7456485795881557, + "flos": 23185545292800.0, + "grad_norm": 1.6851681041085262, + "language_loss": 0.64822817, + "learning_rate": 6.409942020981611e-07, + "loss": 0.66883618, + "num_input_tokens_seen": 267483065, + "step": 12402, + "time_per_iteration": 2.7438032627105713 + }, + { + "auxiliary_loss_clip": 0.01028784, + "auxiliary_loss_mlp": 0.01029893, + "balance_loss_clip": 1.02308559, + "balance_loss_mlp": 1.02033281, + "epoch": 0.7457087028408237, + "flos": 38727144074880.0, + "grad_norm": 1.59821865257815, + "language_loss": 0.73241401, + "learning_rate": 6.407084904157265e-07, + "loss": 0.75300074, + "num_input_tokens_seen": 267504825, + "step": 12403, + "time_per_iteration": 2.859100341796875 + }, + { + "auxiliary_loss_clip": 0.00976962, + "auxiliary_loss_mlp": 0.01000295, + "balance_loss_clip": 1.00198221, + "balance_loss_mlp": 0.99948996, + "epoch": 0.7457688260934917, + "flos": 56043737337600.0, + "grad_norm": 0.8299271339476383, + "language_loss": 0.58790338, + "learning_rate": 6.404228302777621e-07, + "loss": 0.60767591, + "num_input_tokens_seen": 267559260, + "step": 12404, + "time_per_iteration": 4.748319387435913 + }, + { + "auxiliary_loss_clip": 0.01060909, + "auxiliary_loss_mlp": 0.01029809, + "balance_loss_clip": 1.02351928, + "balance_loss_mlp": 1.01955068, + "epoch": 0.7458289493461596, + "flos": 20116152305280.0, + "grad_norm": 1.4813472057506958, + "language_loss": 0.77795237, + "learning_rate": 6.401372216950995e-07, + "loss": 0.79885954, + "num_input_tokens_seen": 267578720, + "step": 12405, + "time_per_iteration": 4.262241363525391 + }, + { + "auxiliary_loss_clip": 0.01030697, + "auxiliary_loss_mlp": 0.01033814, + "balance_loss_clip": 1.02109361, + "balance_loss_mlp": 1.02306676, + "epoch": 0.7458890725988276, + "flos": 20193073280640.0, + "grad_norm": 3.419074468244473, + "language_loss": 0.69246089, + "learning_rate": 6.398516646785698e-07, + "loss": 0.71310598, + "num_input_tokens_seen": 267598250, + "step": 12406, + "time_per_iteration": 2.7851157188415527 + }, + { + "auxiliary_loss_clip": 0.00997232, + "auxiliary_loss_mlp": 0.01030145, + "balance_loss_clip": 1.02406311, + "balance_loss_mlp": 1.01815295, + "epoch": 0.7459491958514956, + "flos": 17018749687680.0, + "grad_norm": 1.5564573972515003, + "language_loss": 0.64942002, + "learning_rate": 6.39566159239002e-07, + "loss": 0.66969383, + "num_input_tokens_seen": 267615430, + "step": 12407, + "time_per_iteration": 2.770212173461914 + }, + { + "auxiliary_loss_clip": 0.01027855, + "auxiliary_loss_mlp": 0.01034775, + "balance_loss_clip": 1.02470005, + "balance_loss_mlp": 1.02234125, + "epoch": 0.7460093191041636, + "flos": 25078719519360.0, + "grad_norm": 1.852064630667483, + "language_loss": 0.7219488, + "learning_rate": 6.392807053872212e-07, + "loss": 0.74257511, + "num_input_tokens_seen": 267635075, + "step": 12408, + "time_per_iteration": 2.8071324825286865 + }, + { + "auxiliary_loss_clip": 0.01056867, + "auxiliary_loss_mlp": 0.01029874, + "balance_loss_clip": 1.02585864, + "balance_loss_mlp": 1.01779222, + "epoch": 0.7460694423568315, + "flos": 21908525990400.0, + "grad_norm": 1.8286260435195487, + "language_loss": 0.72712481, + "learning_rate": 6.38995303134053e-07, + "loss": 0.74799216, + "num_input_tokens_seen": 267654105, + "step": 12409, + "time_per_iteration": 2.689439058303833 + }, + { + "auxiliary_loss_clip": 0.01049139, + "auxiliary_loss_mlp": 0.01028789, + "balance_loss_clip": 1.02315557, + "balance_loss_mlp": 1.01963341, + "epoch": 0.7461295656094995, + "flos": 21215737399680.0, + "grad_norm": 1.6089875585816433, + "language_loss": 0.65909529, + "learning_rate": 6.38709952490319e-07, + "loss": 0.6798746, + "num_input_tokens_seen": 267673090, + "step": 12410, + "time_per_iteration": 2.7457613945007324 + }, + { + "auxiliary_loss_clip": 0.0105159, + "auxiliary_loss_mlp": 0.00747727, + "balance_loss_clip": 1.02515793, + "balance_loss_mlp": 1.00046647, + "epoch": 0.7461896888621674, + "flos": 22346851656960.0, + "grad_norm": 2.0671356730947363, + "language_loss": 0.84227741, + "learning_rate": 6.384246534668396e-07, + "loss": 0.8602705, + "num_input_tokens_seen": 267690605, + "step": 12411, + "time_per_iteration": 2.7017252445220947 + }, + { + "auxiliary_loss_clip": 0.01032152, + "auxiliary_loss_mlp": 0.01026163, + "balance_loss_clip": 1.02346563, + "balance_loss_mlp": 1.01488626, + "epoch": 0.7462498121148354, + "flos": 25482930243840.0, + "grad_norm": 1.4734629101712244, + "language_loss": 0.77911931, + "learning_rate": 6.381394060744339e-07, + "loss": 0.79970247, + "num_input_tokens_seen": 267710540, + "step": 12412, + "time_per_iteration": 2.8190948963165283 + }, + { + "auxiliary_loss_clip": 0.01016566, + "auxiliary_loss_mlp": 0.01032052, + "balance_loss_clip": 1.02027202, + "balance_loss_mlp": 1.02099538, + "epoch": 0.7463099353675033, + "flos": 33947936812800.0, + "grad_norm": 1.7954877125764883, + "language_loss": 0.62453353, + "learning_rate": 6.378542103239188e-07, + "loss": 0.64501971, + "num_input_tokens_seen": 267730780, + "step": 12413, + "time_per_iteration": 2.8767590522766113 + }, + { + "auxiliary_loss_clip": 0.00996834, + "auxiliary_loss_mlp": 0.00746847, + "balance_loss_clip": 1.00125206, + "balance_loss_mlp": 1.00087094, + "epoch": 0.7463700586201714, + "flos": 62767723691520.0, + "grad_norm": 0.7387723482477193, + "language_loss": 0.54875755, + "learning_rate": 6.375690662261082e-07, + "loss": 0.5661943, + "num_input_tokens_seen": 267794240, + "step": 12414, + "time_per_iteration": 3.3217546939849854 + }, + { + "auxiliary_loss_clip": 0.01035852, + "auxiliary_loss_mlp": 0.01030161, + "balance_loss_clip": 1.02196383, + "balance_loss_mlp": 1.01891351, + "epoch": 0.7464301818728393, + "flos": 33432654257280.0, + "grad_norm": 1.9549614100577084, + "language_loss": 0.54958963, + "learning_rate": 6.372839737918154e-07, + "loss": 0.5702498, + "num_input_tokens_seen": 267817190, + "step": 12415, + "time_per_iteration": 2.807018756866455 + }, + { + "auxiliary_loss_clip": 0.00998496, + "auxiliary_loss_mlp": 0.01032863, + "balance_loss_clip": 1.02269244, + "balance_loss_mlp": 1.02049506, + "epoch": 0.7464903051255073, + "flos": 26869872142080.0, + "grad_norm": 1.8638008081802528, + "language_loss": 0.75054812, + "learning_rate": 6.369989330318506e-07, + "loss": 0.77086174, + "num_input_tokens_seen": 267836245, + "step": 12416, + "time_per_iteration": 2.876828193664551 + }, + { + "auxiliary_loss_clip": 0.01012899, + "auxiliary_loss_mlp": 0.01041205, + "balance_loss_clip": 1.02084708, + "balance_loss_mlp": 1.02922416, + "epoch": 0.7465504283781753, + "flos": 44086954775040.0, + "grad_norm": 1.4314786465613472, + "language_loss": 0.69582796, + "learning_rate": 6.367139439570233e-07, + "loss": 0.71636903, + "num_input_tokens_seen": 267858310, + "step": 12417, + "time_per_iteration": 2.974034547805786 + }, + { + "auxiliary_loss_clip": 0.01039862, + "auxiliary_loss_mlp": 0.01028562, + "balance_loss_clip": 1.02929211, + "balance_loss_mlp": 1.01731503, + "epoch": 0.7466105516308432, + "flos": 19676102785920.0, + "grad_norm": 2.3344455010559657, + "language_loss": 0.73614579, + "learning_rate": 6.364290065781392e-07, + "loss": 0.75682998, + "num_input_tokens_seen": 267876345, + "step": 12418, + "time_per_iteration": 2.8014512062072754 + }, + { + "auxiliary_loss_clip": 0.01053612, + "auxiliary_loss_mlp": 0.01025465, + "balance_loss_clip": 1.02529144, + "balance_loss_mlp": 1.01495695, + "epoch": 0.7466706748835112, + "flos": 20520722165760.0, + "grad_norm": 3.416989261676016, + "language_loss": 0.69244373, + "learning_rate": 6.361441209060039e-07, + "loss": 0.71323448, + "num_input_tokens_seen": 267896740, + "step": 12419, + "time_per_iteration": 2.7410027980804443 + }, + { + "auxiliary_loss_clip": 0.01059448, + "auxiliary_loss_mlp": 0.01032359, + "balance_loss_clip": 1.0247705, + "balance_loss_mlp": 1.02244687, + "epoch": 0.7467307981361792, + "flos": 21690260997120.0, + "grad_norm": 1.676849085366726, + "language_loss": 0.74519473, + "learning_rate": 6.358592869514216e-07, + "loss": 0.7661128, + "num_input_tokens_seen": 267914765, + "step": 12420, + "time_per_iteration": 2.7676842212677 + }, + { + "auxiliary_loss_clip": 0.01056248, + "auxiliary_loss_mlp": 0.01026623, + "balance_loss_clip": 1.02666664, + "balance_loss_mlp": 1.01564395, + "epoch": 0.7467909213888472, + "flos": 19573686132480.0, + "grad_norm": 1.7264120421796931, + "language_loss": 0.67362702, + "learning_rate": 6.355745047251904e-07, + "loss": 0.69445574, + "num_input_tokens_seen": 267934085, + "step": 12421, + "time_per_iteration": 4.411803245544434 + }, + { + "auxiliary_loss_clip": 0.01047194, + "auxiliary_loss_mlp": 0.01030387, + "balance_loss_clip": 1.02690434, + "balance_loss_mlp": 1.01842391, + "epoch": 0.7468510446415151, + "flos": 23695225326720.0, + "grad_norm": 1.9306896511259146, + "language_loss": 0.7222141, + "learning_rate": 6.352897742381107e-07, + "loss": 0.7429899, + "num_input_tokens_seen": 267955170, + "step": 12422, + "time_per_iteration": 2.7888689041137695 + }, + { + "auxiliary_loss_clip": 0.01023922, + "auxiliary_loss_mlp": 0.01029124, + "balance_loss_clip": 1.02339613, + "balance_loss_mlp": 1.01795363, + "epoch": 0.7469111678941831, + "flos": 29315783831040.0, + "grad_norm": 1.8756467059993651, + "language_loss": 0.74354374, + "learning_rate": 6.350050955009796e-07, + "loss": 0.76407421, + "num_input_tokens_seen": 267974980, + "step": 12423, + "time_per_iteration": 2.8581340312957764 + }, + { + "auxiliary_loss_clip": 0.01051376, + "auxiliary_loss_mlp": 0.01022964, + "balance_loss_clip": 1.02514887, + "balance_loss_mlp": 1.01298583, + "epoch": 0.746971291146851, + "flos": 21798639308160.0, + "grad_norm": 1.3710841642948493, + "language_loss": 0.6782431, + "learning_rate": 6.347204685245929e-07, + "loss": 0.69898647, + "num_input_tokens_seen": 267994985, + "step": 12424, + "time_per_iteration": 2.75785493850708 + }, + { + "auxiliary_loss_clip": 0.01057537, + "auxiliary_loss_mlp": 0.0103226, + "balance_loss_clip": 1.02751994, + "balance_loss_mlp": 1.02104795, + "epoch": 0.747031414399519, + "flos": 36245070368640.0, + "grad_norm": 1.6112304798537138, + "language_loss": 0.7389608, + "learning_rate": 6.344358933197418e-07, + "loss": 0.75985879, + "num_input_tokens_seen": 268014985, + "step": 12425, + "time_per_iteration": 2.986794948577881 + }, + { + "auxiliary_loss_clip": 0.010254, + "auxiliary_loss_mlp": 0.01030337, + "balance_loss_clip": 1.02345228, + "balance_loss_mlp": 1.0183208, + "epoch": 0.7470915376521869, + "flos": 19974916028160.0, + "grad_norm": 1.9357299013610452, + "language_loss": 0.69240594, + "learning_rate": 6.341513698972194e-07, + "loss": 0.71296334, + "num_input_tokens_seen": 268034395, + "step": 12426, + "time_per_iteration": 2.928889036178589 + }, + { + "auxiliary_loss_clip": 0.01028965, + "auxiliary_loss_mlp": 0.01031752, + "balance_loss_clip": 1.02356327, + "balance_loss_mlp": 1.02135682, + "epoch": 0.747151660904855, + "flos": 20084299920000.0, + "grad_norm": 4.6345207324109365, + "language_loss": 0.65300792, + "learning_rate": 6.338668982678139e-07, + "loss": 0.6736151, + "num_input_tokens_seen": 268054485, + "step": 12427, + "time_per_iteration": 2.857099771499634 + }, + { + "auxiliary_loss_clip": 0.01062458, + "auxiliary_loss_mlp": 0.01027836, + "balance_loss_clip": 1.02514708, + "balance_loss_mlp": 1.01642215, + "epoch": 0.7472117841575229, + "flos": 16290373697280.0, + "grad_norm": 1.617926231856209, + "language_loss": 0.74765682, + "learning_rate": 6.335824784423118e-07, + "loss": 0.76855981, + "num_input_tokens_seen": 268072250, + "step": 12428, + "time_per_iteration": 2.541748046875 + }, + { + "auxiliary_loss_clip": 0.01056406, + "auxiliary_loss_mlp": 0.01031912, + "balance_loss_clip": 1.02535367, + "balance_loss_mlp": 1.01931763, + "epoch": 0.7472719074101909, + "flos": 21389939383680.0, + "grad_norm": 2.1665794849721562, + "language_loss": 0.5824939, + "learning_rate": 6.33298110431499e-07, + "loss": 0.6033771, + "num_input_tokens_seen": 268089840, + "step": 12429, + "time_per_iteration": 2.607217788696289 + }, + { + "auxiliary_loss_clip": 0.01057496, + "auxiliary_loss_mlp": 0.01027962, + "balance_loss_clip": 1.02683306, + "balance_loss_mlp": 1.01663709, + "epoch": 0.7473320306628589, + "flos": 29643289061760.0, + "grad_norm": 1.8342656999134048, + "language_loss": 0.60629088, + "learning_rate": 6.330137942461595e-07, + "loss": 0.62714541, + "num_input_tokens_seen": 268109360, + "step": 12430, + "time_per_iteration": 2.7261786460876465 + }, + { + "auxiliary_loss_clip": 0.01042979, + "auxiliary_loss_mlp": 0.01028284, + "balance_loss_clip": 1.02438903, + "balance_loss_mlp": 1.01742375, + "epoch": 0.7473921539155268, + "flos": 24136100858880.0, + "grad_norm": 1.6854230429598702, + "language_loss": 0.75834596, + "learning_rate": 6.327295298970734e-07, + "loss": 0.77905858, + "num_input_tokens_seen": 268131840, + "step": 12431, + "time_per_iteration": 4.515880346298218 + }, + { + "auxiliary_loss_clip": 0.0105443, + "auxiliary_loss_mlp": 0.01026435, + "balance_loss_clip": 1.02492118, + "balance_loss_mlp": 1.01542568, + "epoch": 0.7474522771681948, + "flos": 17487958072320.0, + "grad_norm": 1.9070550767300825, + "language_loss": 0.75529706, + "learning_rate": 6.32445317395021e-07, + "loss": 0.77610564, + "num_input_tokens_seen": 268148300, + "step": 12432, + "time_per_iteration": 2.5643672943115234 + }, + { + "auxiliary_loss_clip": 0.01047233, + "auxiliary_loss_mlp": 0.01034735, + "balance_loss_clip": 1.02552342, + "balance_loss_mlp": 1.02227187, + "epoch": 0.7475124004208628, + "flos": 16727298733440.0, + "grad_norm": 2.2816047933672334, + "language_loss": 0.70129776, + "learning_rate": 6.321611567507787e-07, + "loss": 0.72211742, + "num_input_tokens_seen": 268166450, + "step": 12433, + "time_per_iteration": 2.731667995452881 + }, + { + "auxiliary_loss_clip": 0.01020495, + "auxiliary_loss_mlp": 0.01029247, + "balance_loss_clip": 1.02251792, + "balance_loss_mlp": 1.01742101, + "epoch": 0.7475725236735308, + "flos": 19720237622400.0, + "grad_norm": 1.8832766955971312, + "language_loss": 0.6724869, + "learning_rate": 6.318770479751232e-07, + "loss": 0.69298434, + "num_input_tokens_seen": 268186165, + "step": 12434, + "time_per_iteration": 2.802187442779541 + }, + { + "auxiliary_loss_clip": 0.01059513, + "auxiliary_loss_mlp": 0.01031868, + "balance_loss_clip": 1.02493739, + "balance_loss_mlp": 1.0223608, + "epoch": 0.7476326469261987, + "flos": 26286000566400.0, + "grad_norm": 2.9976456128912483, + "language_loss": 0.79523933, + "learning_rate": 6.315929910788263e-07, + "loss": 0.81615317, + "num_input_tokens_seen": 268208145, + "step": 12435, + "time_per_iteration": 2.616489887237549 + }, + { + "auxiliary_loss_clip": 0.01034794, + "auxiliary_loss_mlp": 0.0102654, + "balance_loss_clip": 1.02544487, + "balance_loss_mlp": 1.01595473, + "epoch": 0.7476927701788667, + "flos": 31831828824960.0, + "grad_norm": 1.555723867914388, + "language_loss": 0.67402101, + "learning_rate": 6.313089860726604e-07, + "loss": 0.69463432, + "num_input_tokens_seen": 268228345, + "step": 12436, + "time_per_iteration": 2.8208811283111572 + }, + { + "auxiliary_loss_clip": 0.01035936, + "auxiliary_loss_mlp": 0.01037669, + "balance_loss_clip": 1.02488136, + "balance_loss_mlp": 1.02639818, + "epoch": 0.7477528934315346, + "flos": 31795487239680.0, + "grad_norm": 3.7286134468543297, + "language_loss": 0.70602548, + "learning_rate": 6.31025032967396e-07, + "loss": 0.72676146, + "num_input_tokens_seen": 268250260, + "step": 12437, + "time_per_iteration": 2.8691065311431885 + }, + { + "auxiliary_loss_clip": 0.01025946, + "auxiliary_loss_mlp": 0.0102934, + "balance_loss_clip": 1.02202249, + "balance_loss_mlp": 1.01935005, + "epoch": 0.7478130166842026, + "flos": 20371979946240.0, + "grad_norm": 1.6658163385901599, + "language_loss": 0.67104, + "learning_rate": 6.307411317737986e-07, + "loss": 0.69159281, + "num_input_tokens_seen": 268268440, + "step": 12438, + "time_per_iteration": 2.909438371658325 + }, + { + "auxiliary_loss_clip": 0.01038608, + "auxiliary_loss_mlp": 0.01030548, + "balance_loss_clip": 1.02277994, + "balance_loss_mlp": 1.02013493, + "epoch": 0.7478731399368705, + "flos": 18148930191360.0, + "grad_norm": 1.7386220846320206, + "language_loss": 0.80449224, + "learning_rate": 6.304572825026344e-07, + "loss": 0.82518375, + "num_input_tokens_seen": 268285765, + "step": 12439, + "time_per_iteration": 2.664097547531128 + }, + { + "auxiliary_loss_clip": 0.01026599, + "auxiliary_loss_mlp": 0.01032162, + "balance_loss_clip": 1.02182329, + "balance_loss_mlp": 1.02148676, + "epoch": 0.7479332631895386, + "flos": 15267889146240.0, + "grad_norm": 2.047804683318934, + "language_loss": 0.71138561, + "learning_rate": 6.301734851646674e-07, + "loss": 0.73197317, + "num_input_tokens_seen": 268304015, + "step": 12440, + "time_per_iteration": 2.88527774810791 + }, + { + "auxiliary_loss_clip": 0.01044389, + "auxiliary_loss_mlp": 0.01027013, + "balance_loss_clip": 1.0276897, + "balance_loss_mlp": 1.01651073, + "epoch": 0.7479933864422065, + "flos": 21142515525120.0, + "grad_norm": 1.8483279992602555, + "language_loss": 0.74001038, + "learning_rate": 6.298897397706597e-07, + "loss": 0.76072443, + "num_input_tokens_seen": 268323290, + "step": 12441, + "time_per_iteration": 2.87609601020813 + }, + { + "auxiliary_loss_clip": 0.01057204, + "auxiliary_loss_mlp": 0.00747796, + "balance_loss_clip": 1.02676487, + "balance_loss_mlp": 1.00041699, + "epoch": 0.7480535096948745, + "flos": 14392027912320.0, + "grad_norm": 2.1896392973960186, + "language_loss": 0.8252691, + "learning_rate": 6.296060463313698e-07, + "loss": 0.84331918, + "num_input_tokens_seen": 268339490, + "step": 12442, + "time_per_iteration": 2.7411112785339355 + }, + { + "auxiliary_loss_clip": 0.0100884, + "auxiliary_loss_mlp": 0.01031545, + "balance_loss_clip": 1.02425992, + "balance_loss_mlp": 1.01937366, + "epoch": 0.7481136329475425, + "flos": 27344683048320.0, + "grad_norm": 1.867587451446154, + "language_loss": 0.6292994, + "learning_rate": 6.293224048575565e-07, + "loss": 0.64970326, + "num_input_tokens_seen": 268359865, + "step": 12443, + "time_per_iteration": 3.0391886234283447 + }, + { + "auxiliary_loss_clip": 0.01024743, + "auxiliary_loss_mlp": 0.01022495, + "balance_loss_clip": 1.02226198, + "balance_loss_mlp": 1.01235652, + "epoch": 0.7481737562002104, + "flos": 19531454716800.0, + "grad_norm": 1.95894209566081, + "language_loss": 0.71490741, + "learning_rate": 6.29038815359975e-07, + "loss": 0.73537976, + "num_input_tokens_seen": 268377065, + "step": 12444, + "time_per_iteration": 2.898660659790039 + }, + { + "auxiliary_loss_clip": 0.01016945, + "auxiliary_loss_mlp": 0.01029337, + "balance_loss_clip": 1.02377343, + "balance_loss_mlp": 1.01823223, + "epoch": 0.7482338794528784, + "flos": 21760035166080.0, + "grad_norm": 1.4579242729525754, + "language_loss": 0.6871143, + "learning_rate": 6.287552778493786e-07, + "loss": 0.70757711, + "num_input_tokens_seen": 268396935, + "step": 12445, + "time_per_iteration": 2.7914953231811523 + }, + { + "auxiliary_loss_clip": 0.01051699, + "auxiliary_loss_mlp": 0.01023694, + "balance_loss_clip": 1.0244627, + "balance_loss_mlp": 1.01341796, + "epoch": 0.7482940027055464, + "flos": 18697358021760.0, + "grad_norm": 1.8798680718251575, + "language_loss": 0.74336398, + "learning_rate": 6.28471792336519e-07, + "loss": 0.76411784, + "num_input_tokens_seen": 268414460, + "step": 12446, + "time_per_iteration": 2.733096122741699 + }, + { + "auxiliary_loss_clip": 0.01045541, + "auxiliary_loss_mlp": 0.00747796, + "balance_loss_clip": 1.02464497, + "balance_loss_mlp": 1.00048065, + "epoch": 0.7483541259582144, + "flos": 15998024903040.0, + "grad_norm": 2.118209428596654, + "language_loss": 0.73295712, + "learning_rate": 6.281883588321475e-07, + "loss": 0.75089049, + "num_input_tokens_seen": 268432225, + "step": 12447, + "time_per_iteration": 2.801959753036499 + }, + { + "auxiliary_loss_clip": 0.0102278, + "auxiliary_loss_mlp": 0.01029176, + "balance_loss_clip": 1.02411556, + "balance_loss_mlp": 1.01893592, + "epoch": 0.7484142492108823, + "flos": 25556295772800.0, + "grad_norm": 2.4926082357160784, + "language_loss": 0.71957183, + "learning_rate": 6.279049773470109e-07, + "loss": 0.74009138, + "num_input_tokens_seen": 268449270, + "step": 12448, + "time_per_iteration": 2.7433876991271973 + }, + { + "auxiliary_loss_clip": 0.01065717, + "auxiliary_loss_mlp": 0.01035839, + "balance_loss_clip": 1.02652812, + "balance_loss_mlp": 1.02521753, + "epoch": 0.7484743724635503, + "flos": 22887737631360.0, + "grad_norm": 2.8910102723843543, + "language_loss": 0.7351467, + "learning_rate": 6.276216478918543e-07, + "loss": 0.75616229, + "num_input_tokens_seen": 268467250, + "step": 12449, + "time_per_iteration": 2.6212916374206543 + }, + { + "auxiliary_loss_clip": 0.01027608, + "auxiliary_loss_mlp": 0.01032901, + "balance_loss_clip": 1.02519202, + "balance_loss_mlp": 1.02125466, + "epoch": 0.7485344957162182, + "flos": 25300288563840.0, + "grad_norm": 2.0033106389425077, + "language_loss": 0.61087787, + "learning_rate": 6.273383704774225e-07, + "loss": 0.6314829, + "num_input_tokens_seen": 268487270, + "step": 12450, + "time_per_iteration": 2.704056978225708 + }, + { + "auxiliary_loss_clip": 0.01060037, + "auxiliary_loss_mlp": 0.01023122, + "balance_loss_clip": 1.0246501, + "balance_loss_mlp": 1.01299489, + "epoch": 0.7485946189688862, + "flos": 27053016612480.0, + "grad_norm": 1.7073447899118006, + "language_loss": 0.70074874, + "learning_rate": 6.270551451144577e-07, + "loss": 0.72158033, + "num_input_tokens_seen": 268508020, + "step": 12451, + "time_per_iteration": 5.881060838699341 + }, + { + "auxiliary_loss_clip": 0.01055865, + "auxiliary_loss_mlp": 0.01030714, + "balance_loss_clip": 1.02526879, + "balance_loss_mlp": 1.01936483, + "epoch": 0.7486547422215541, + "flos": 26906752431360.0, + "grad_norm": 2.204711485190443, + "language_loss": 0.79957426, + "learning_rate": 6.267719718136988e-07, + "loss": 0.82043999, + "num_input_tokens_seen": 268527375, + "step": 12452, + "time_per_iteration": 2.668174982070923 + }, + { + "auxiliary_loss_clip": 0.01070587, + "auxiliary_loss_mlp": 0.01033062, + "balance_loss_clip": 1.02922404, + "balance_loss_mlp": 1.02146316, + "epoch": 0.7487148654742222, + "flos": 22346277039360.0, + "grad_norm": 2.344115783553131, + "language_loss": 0.71284533, + "learning_rate": 6.264888505858843e-07, + "loss": 0.73388177, + "num_input_tokens_seen": 268544870, + "step": 12453, + "time_per_iteration": 2.553175449371338 + }, + { + "auxiliary_loss_clip": 0.01044352, + "auxiliary_loss_mlp": 0.01031693, + "balance_loss_clip": 1.02623284, + "balance_loss_mlp": 1.02093458, + "epoch": 0.7487749887268901, + "flos": 23038814234880.0, + "grad_norm": 1.5644763962372346, + "language_loss": 0.73679161, + "learning_rate": 6.262057814417517e-07, + "loss": 0.75755209, + "num_input_tokens_seen": 268564580, + "step": 12454, + "time_per_iteration": 2.644728660583496 + }, + { + "auxiliary_loss_clip": 0.0097452, + "auxiliary_loss_mlp": 0.01007302, + "balance_loss_clip": 1.00174189, + "balance_loss_mlp": 1.00622272, + "epoch": 0.7488351119795581, + "flos": 71525294536320.0, + "grad_norm": 0.7370871381440269, + "language_loss": 0.59421301, + "learning_rate": 6.259227643920322e-07, + "loss": 0.6140312, + "num_input_tokens_seen": 268629550, + "step": 12455, + "time_per_iteration": 3.347841739654541 + }, + { + "auxiliary_loss_clip": 0.01027603, + "auxiliary_loss_mlp": 0.01029711, + "balance_loss_clip": 1.02254152, + "balance_loss_mlp": 1.01882756, + "epoch": 0.748895235232226, + "flos": 17196255722880.0, + "grad_norm": 2.0105539332549185, + "language_loss": 0.79306507, + "learning_rate": 6.256397994474592e-07, + "loss": 0.81363827, + "num_input_tokens_seen": 268646645, + "step": 12456, + "time_per_iteration": 2.6922738552093506 + }, + { + "auxiliary_loss_clip": 0.0099826, + "auxiliary_loss_mlp": 0.01001078, + "balance_loss_clip": 1.0023787, + "balance_loss_mlp": 1.00010681, + "epoch": 0.748955358484894, + "flos": 58979256336000.0, + "grad_norm": 0.8297881297088928, + "language_loss": 0.61389863, + "learning_rate": 6.25356886618763e-07, + "loss": 0.63389206, + "num_input_tokens_seen": 268702275, + "step": 12457, + "time_per_iteration": 3.135551929473877 + }, + { + "auxiliary_loss_clip": 0.01049132, + "auxiliary_loss_mlp": 0.01034487, + "balance_loss_clip": 1.02723813, + "balance_loss_mlp": 1.02351379, + "epoch": 0.749015481737562, + "flos": 11360413054080.0, + "grad_norm": 1.8865369125525753, + "language_loss": 0.67549908, + "learning_rate": 6.250740259166711e-07, + "loss": 0.69633526, + "num_input_tokens_seen": 268716265, + "step": 12458, + "time_per_iteration": 2.689321756362915 + }, + { + "auxiliary_loss_clip": 0.01018408, + "auxiliary_loss_mlp": 0.010317, + "balance_loss_clip": 1.02253366, + "balance_loss_mlp": 1.02102494, + "epoch": 0.74907560499023, + "flos": 21106497162240.0, + "grad_norm": 1.9603462252116055, + "language_loss": 0.79874319, + "learning_rate": 6.247912173519106e-07, + "loss": 0.81924427, + "num_input_tokens_seen": 268734330, + "step": 12459, + "time_per_iteration": 2.7518250942230225 + }, + { + "auxiliary_loss_clip": 0.01023189, + "auxiliary_loss_mlp": 0.01033851, + "balance_loss_clip": 1.02230668, + "balance_loss_mlp": 1.02210879, + "epoch": 0.749135728242898, + "flos": 22268027260800.0, + "grad_norm": 1.498002116819065, + "language_loss": 0.80554259, + "learning_rate": 6.245084609352043e-07, + "loss": 0.82611299, + "num_input_tokens_seen": 268753500, + "step": 12460, + "time_per_iteration": 2.812523126602173 + }, + { + "auxiliary_loss_clip": 0.01035154, + "auxiliary_loss_mlp": 0.01029728, + "balance_loss_clip": 1.02355099, + "balance_loss_mlp": 1.01797974, + "epoch": 0.7491958514955659, + "flos": 24057527857920.0, + "grad_norm": 1.7535529100033453, + "language_loss": 0.86170697, + "learning_rate": 6.242257566772755e-07, + "loss": 0.88235581, + "num_input_tokens_seen": 268772055, + "step": 12461, + "time_per_iteration": 2.8817527294158936 + }, + { + "auxiliary_loss_clip": 0.01053333, + "auxiliary_loss_mlp": 0.01027714, + "balance_loss_clip": 1.02673316, + "balance_loss_mlp": 1.01737833, + "epoch": 0.7492559747482339, + "flos": 24492118510080.0, + "grad_norm": 2.0089980924388966, + "language_loss": 0.69741505, + "learning_rate": 6.239431045888435e-07, + "loss": 0.71822548, + "num_input_tokens_seen": 268792265, + "step": 12462, + "time_per_iteration": 2.885629653930664 + }, + { + "auxiliary_loss_clip": 0.01062985, + "auxiliary_loss_mlp": 0.01026829, + "balance_loss_clip": 1.02530921, + "balance_loss_mlp": 1.01583815, + "epoch": 0.7493160980009018, + "flos": 27745338326400.0, + "grad_norm": 1.8262397848204195, + "language_loss": 0.70599222, + "learning_rate": 6.236605046806267e-07, + "loss": 0.72689033, + "num_input_tokens_seen": 268812735, + "step": 12463, + "time_per_iteration": 2.635186195373535 + }, + { + "auxiliary_loss_clip": 0.01029698, + "auxiliary_loss_mlp": 0.01032864, + "balance_loss_clip": 1.02419364, + "balance_loss_mlp": 1.02217674, + "epoch": 0.7493762212535698, + "flos": 30226190970240.0, + "grad_norm": 1.8465644402868515, + "language_loss": 0.77338946, + "learning_rate": 6.233779569633419e-07, + "loss": 0.79401505, + "num_input_tokens_seen": 268833090, + "step": 12464, + "time_per_iteration": 2.7541117668151855 + }, + { + "auxiliary_loss_clip": 0.01039466, + "auxiliary_loss_mlp": 0.01023488, + "balance_loss_clip": 1.02240884, + "balance_loss_mlp": 1.01333773, + "epoch": 0.7494363445062378, + "flos": 21944472526080.0, + "grad_norm": 1.666230279861217, + "language_loss": 0.78456062, + "learning_rate": 6.230954614477034e-07, + "loss": 0.80519021, + "num_input_tokens_seen": 268851880, + "step": 12465, + "time_per_iteration": 2.859867811203003 + }, + { + "auxiliary_loss_clip": 0.0103358, + "auxiliary_loss_mlp": 0.01037109, + "balance_loss_clip": 1.02477503, + "balance_loss_mlp": 1.0238111, + "epoch": 0.7494964677589058, + "flos": 12490342162560.0, + "grad_norm": 2.176038712972122, + "language_loss": 0.74099481, + "learning_rate": 6.22813018144422e-07, + "loss": 0.76170164, + "num_input_tokens_seen": 268867910, + "step": 12466, + "time_per_iteration": 2.701533317565918 + }, + { + "auxiliary_loss_clip": 0.01050762, + "auxiliary_loss_mlp": 0.01035871, + "balance_loss_clip": 1.02425468, + "balance_loss_mlp": 1.02479661, + "epoch": 0.7495565910115737, + "flos": 21653057485440.0, + "grad_norm": 3.059035053899831, + "language_loss": 0.66440809, + "learning_rate": 6.22530627064209e-07, + "loss": 0.68527448, + "num_input_tokens_seen": 268887260, + "step": 12467, + "time_per_iteration": 2.6871678829193115 + }, + { + "auxiliary_loss_clip": 0.01019127, + "auxiliary_loss_mlp": 0.00747725, + "balance_loss_clip": 1.02419829, + "balance_loss_mlp": 1.00043964, + "epoch": 0.7496167142642417, + "flos": 15268535591040.0, + "grad_norm": 2.300166024137754, + "language_loss": 0.76723343, + "learning_rate": 6.222482882177735e-07, + "loss": 0.78490198, + "num_input_tokens_seen": 268902520, + "step": 12468, + "time_per_iteration": 5.851422309875488 + }, + { + "auxiliary_loss_clip": 0.01036899, + "auxiliary_loss_mlp": 0.01030694, + "balance_loss_clip": 1.02767682, + "balance_loss_mlp": 1.01972699, + "epoch": 0.7496768375169096, + "flos": 22054933825920.0, + "grad_norm": 2.000152476663072, + "language_loss": 0.69192457, + "learning_rate": 6.219660016158201e-07, + "loss": 0.71260053, + "num_input_tokens_seen": 268920970, + "step": 12469, + "time_per_iteration": 2.738929033279419 + }, + { + "auxiliary_loss_clip": 0.01043458, + "auxiliary_loss_mlp": 0.0102946, + "balance_loss_clip": 1.02458549, + "balance_loss_mlp": 1.01825404, + "epoch": 0.7497369607695776, + "flos": 19057038860160.0, + "grad_norm": 1.9569687267066669, + "language_loss": 0.68870652, + "learning_rate": 6.216837672690543e-07, + "loss": 0.7094357, + "num_input_tokens_seen": 268936600, + "step": 12470, + "time_per_iteration": 2.6817755699157715 + }, + { + "auxiliary_loss_clip": 0.01038432, + "auxiliary_loss_mlp": 0.01036741, + "balance_loss_clip": 1.0248903, + "balance_loss_mlp": 1.02377689, + "epoch": 0.7497970840222457, + "flos": 21617434172160.0, + "grad_norm": 2.2074289306770436, + "language_loss": 0.75031793, + "learning_rate": 6.214015851881793e-07, + "loss": 0.77106965, + "num_input_tokens_seen": 268956560, + "step": 12471, + "time_per_iteration": 2.734896183013916 + }, + { + "auxiliary_loss_clip": 0.01038029, + "auxiliary_loss_mlp": 0.01035015, + "balance_loss_clip": 1.02378678, + "balance_loss_mlp": 1.02271807, + "epoch": 0.7498572072749136, + "flos": 13735580906880.0, + "grad_norm": 2.5776200433610303, + "language_loss": 0.76711684, + "learning_rate": 6.211194553838929e-07, + "loss": 0.78784728, + "num_input_tokens_seen": 268973945, + "step": 12472, + "time_per_iteration": 2.67319655418396 + }, + { + "auxiliary_loss_clip": 0.0105277, + "auxiliary_loss_mlp": 0.00747796, + "balance_loss_clip": 1.02461147, + "balance_loss_mlp": 1.00046802, + "epoch": 0.7499173305275816, + "flos": 22966526113920.0, + "grad_norm": 1.5648244162796516, + "language_loss": 0.84447467, + "learning_rate": 6.208373778668951e-07, + "loss": 0.86248028, + "num_input_tokens_seen": 268993245, + "step": 12473, + "time_per_iteration": 2.6893529891967773 + }, + { + "auxiliary_loss_clip": 0.01030778, + "auxiliary_loss_mlp": 0.01034653, + "balance_loss_clip": 1.02315569, + "balance_loss_mlp": 1.02236891, + "epoch": 0.7499774537802495, + "flos": 22740467869440.0, + "grad_norm": 2.9822794495787974, + "language_loss": 0.73848271, + "learning_rate": 6.205553526478829e-07, + "loss": 0.75913697, + "num_input_tokens_seen": 269012125, + "step": 12474, + "time_per_iteration": 2.793191909790039 + }, + { + "auxiliary_loss_clip": 0.01045786, + "auxiliary_loss_mlp": 0.01033068, + "balance_loss_clip": 1.02534032, + "balance_loss_mlp": 1.02137923, + "epoch": 0.7500375770329175, + "flos": 18296559089280.0, + "grad_norm": 1.6193349734499718, + "language_loss": 0.74302995, + "learning_rate": 6.202733797375492e-07, + "loss": 0.7638185, + "num_input_tokens_seen": 269030545, + "step": 12475, + "time_per_iteration": 2.8203461170196533 + }, + { + "auxiliary_loss_clip": 0.01060104, + "auxiliary_loss_mlp": 0.01035933, + "balance_loss_clip": 1.02659762, + "balance_loss_mlp": 1.02392244, + "epoch": 0.7500977002855854, + "flos": 19169978198400.0, + "grad_norm": 1.693258160972183, + "language_loss": 0.79780507, + "learning_rate": 6.199914591465878e-07, + "loss": 0.8187654, + "num_input_tokens_seen": 269048180, + "step": 12476, + "time_per_iteration": 2.7450623512268066 + }, + { + "auxiliary_loss_clip": 0.01029863, + "auxiliary_loss_mlp": 0.01034837, + "balance_loss_clip": 1.02272844, + "balance_loss_mlp": 1.02361941, + "epoch": 0.7501578235382534, + "flos": 22163886754560.0, + "grad_norm": 1.8438342542436599, + "language_loss": 0.78016824, + "learning_rate": 6.19709590885688e-07, + "loss": 0.80081522, + "num_input_tokens_seen": 269068600, + "step": 12477, + "time_per_iteration": 2.8394644260406494 + }, + { + "auxiliary_loss_clip": 0.00988009, + "auxiliary_loss_mlp": 0.01000912, + "balance_loss_clip": 1.00232983, + "balance_loss_mlp": 0.99998188, + "epoch": 0.7502179467909214, + "flos": 64465040033280.0, + "grad_norm": 0.8012995600751346, + "language_loss": 0.544819, + "learning_rate": 6.194277749655394e-07, + "loss": 0.56470817, + "num_input_tokens_seen": 269119045, + "step": 12478, + "time_per_iteration": 3.319596767425537 + }, + { + "auxiliary_loss_clip": 0.01040265, + "auxiliary_loss_mlp": 0.01028946, + "balance_loss_clip": 1.02437925, + "balance_loss_mlp": 1.0181694, + "epoch": 0.7502780700435894, + "flos": 20478275268480.0, + "grad_norm": 1.5623785209247376, + "language_loss": 0.8017509, + "learning_rate": 6.191460113968272e-07, + "loss": 0.82244307, + "num_input_tokens_seen": 269136755, + "step": 12479, + "time_per_iteration": 4.498440504074097 + }, + { + "auxiliary_loss_clip": 0.01057947, + "auxiliary_loss_mlp": 0.01036499, + "balance_loss_clip": 1.02678275, + "balance_loss_mlp": 1.02399445, + "epoch": 0.7503381932962573, + "flos": 20445273648000.0, + "grad_norm": 2.3726643235810916, + "language_loss": 0.62856162, + "learning_rate": 6.188643001902369e-07, + "loss": 0.64950603, + "num_input_tokens_seen": 269156120, + "step": 12480, + "time_per_iteration": 2.683587074279785 + }, + { + "auxiliary_loss_clip": 0.0103655, + "auxiliary_loss_mlp": 0.01032862, + "balance_loss_clip": 1.02337563, + "balance_loss_mlp": 1.02259171, + "epoch": 0.7503983165489253, + "flos": 22381936266240.0, + "grad_norm": 1.6615396814577024, + "language_loss": 0.7796483, + "learning_rate": 6.185826413564512e-07, + "loss": 0.80034244, + "num_input_tokens_seen": 269175650, + "step": 12481, + "time_per_iteration": 2.7236838340759277 + }, + { + "auxiliary_loss_clip": 0.0102687, + "auxiliary_loss_mlp": 0.0103401, + "balance_loss_clip": 1.02353907, + "balance_loss_mlp": 1.0217793, + "epoch": 0.7504584398015932, + "flos": 24899453717760.0, + "grad_norm": 2.385103009511503, + "language_loss": 0.71341169, + "learning_rate": 6.183010349061501e-07, + "loss": 0.73402047, + "num_input_tokens_seen": 269197080, + "step": 12482, + "time_per_iteration": 2.779405117034912 + }, + { + "auxiliary_loss_clip": 0.01064818, + "auxiliary_loss_mlp": 0.01033745, + "balance_loss_clip": 1.02613091, + "balance_loss_mlp": 1.02250934, + "epoch": 0.7505185630542612, + "flos": 25885237547520.0, + "grad_norm": 2.108239552291244, + "language_loss": 0.70273519, + "learning_rate": 6.180194808500118e-07, + "loss": 0.72372085, + "num_input_tokens_seen": 269218600, + "step": 12483, + "time_per_iteration": 2.7250771522521973 + }, + { + "auxiliary_loss_clip": 0.01063654, + "auxiliary_loss_mlp": 0.01026471, + "balance_loss_clip": 1.0254066, + "balance_loss_mlp": 1.01669574, + "epoch": 0.7505786863069293, + "flos": 23143852581120.0, + "grad_norm": 2.0740378972401263, + "language_loss": 0.74631625, + "learning_rate": 6.177379791987131e-07, + "loss": 0.76721752, + "num_input_tokens_seen": 269239245, + "step": 12484, + "time_per_iteration": 2.7388737201690674 + }, + { + "auxiliary_loss_clip": 0.01041251, + "auxiliary_loss_mlp": 0.01026011, + "balance_loss_clip": 1.02370572, + "balance_loss_mlp": 1.01524663, + "epoch": 0.7506388095595972, + "flos": 16983377769600.0, + "grad_norm": 1.897373068002353, + "language_loss": 0.84831059, + "learning_rate": 6.174565299629295e-07, + "loss": 0.86898315, + "num_input_tokens_seen": 269258520, + "step": 12485, + "time_per_iteration": 2.789372682571411 + }, + { + "auxiliary_loss_clip": 0.01032218, + "auxiliary_loss_mlp": 0.01027723, + "balance_loss_clip": 1.02485657, + "balance_loss_mlp": 1.01721513, + "epoch": 0.7506989328122652, + "flos": 22344984149760.0, + "grad_norm": 2.4961178076852124, + "language_loss": 0.78234255, + "learning_rate": 6.171751331533323e-07, + "loss": 0.80294204, + "num_input_tokens_seen": 269278320, + "step": 12486, + "time_per_iteration": 2.842219352722168 + }, + { + "auxiliary_loss_clip": 0.01050813, + "auxiliary_loss_mlp": 0.0103142, + "balance_loss_clip": 1.02452278, + "balance_loss_mlp": 1.01976728, + "epoch": 0.7507590560649331, + "flos": 25776069137280.0, + "grad_norm": 2.0963389370660477, + "language_loss": 0.72869086, + "learning_rate": 6.168937887805932e-07, + "loss": 0.74951315, + "num_input_tokens_seen": 269298025, + "step": 12487, + "time_per_iteration": 2.6605443954467773 + }, + { + "auxiliary_loss_clip": 0.01037714, + "auxiliary_loss_mlp": 0.01026899, + "balance_loss_clip": 1.02185833, + "balance_loss_mlp": 1.01587868, + "epoch": 0.7508191793176011, + "flos": 24279420124800.0, + "grad_norm": 1.8530248572008854, + "language_loss": 0.67107189, + "learning_rate": 6.166124968553801e-07, + "loss": 0.69171798, + "num_input_tokens_seen": 269316770, + "step": 12488, + "time_per_iteration": 2.801586627960205 + }, + { + "auxiliary_loss_clip": 0.010053, + "auxiliary_loss_mlp": 0.01027692, + "balance_loss_clip": 1.02336836, + "balance_loss_mlp": 1.01621819, + "epoch": 0.750879302570269, + "flos": 19899575251200.0, + "grad_norm": 1.6281477986517285, + "language_loss": 0.77210402, + "learning_rate": 6.163312573883592e-07, + "loss": 0.79243398, + "num_input_tokens_seen": 269334755, + "step": 12489, + "time_per_iteration": 3.012946844100952 + }, + { + "auxiliary_loss_clip": 0.01052751, + "auxiliary_loss_mlp": 0.01027788, + "balance_loss_clip": 1.0255425, + "balance_loss_mlp": 1.01782811, + "epoch": 0.750939425822937, + "flos": 29205681667200.0, + "grad_norm": 1.6660455568019046, + "language_loss": 0.75315869, + "learning_rate": 6.160500703901956e-07, + "loss": 0.77396411, + "num_input_tokens_seen": 269353810, + "step": 12490, + "time_per_iteration": 2.86531400680542 + }, + { + "auxiliary_loss_clip": 0.01064315, + "auxiliary_loss_mlp": 0.01027132, + "balance_loss_clip": 1.02595997, + "balance_loss_mlp": 1.01631951, + "epoch": 0.750999549075605, + "flos": 21142300043520.0, + "grad_norm": 1.5541998370553038, + "language_loss": 0.78325981, + "learning_rate": 6.157689358715527e-07, + "loss": 0.8041743, + "num_input_tokens_seen": 269372910, + "step": 12491, + "time_per_iteration": 2.6170742511749268 + }, + { + "auxiliary_loss_clip": 0.0105113, + "auxiliary_loss_mlp": 0.01030413, + "balance_loss_clip": 1.02383626, + "balance_loss_mlp": 1.02049422, + "epoch": 0.751059672328273, + "flos": 23547740083200.0, + "grad_norm": 1.6828700431360915, + "language_loss": 0.76147127, + "learning_rate": 6.154878538430899e-07, + "loss": 0.78228664, + "num_input_tokens_seen": 269391545, + "step": 12492, + "time_per_iteration": 2.7650156021118164 + }, + { + "auxiliary_loss_clip": 0.01033539, + "auxiliary_loss_mlp": 0.01025501, + "balance_loss_clip": 1.02427638, + "balance_loss_mlp": 1.01496887, + "epoch": 0.7511197955809409, + "flos": 18989742729600.0, + "grad_norm": 1.7403583609048752, + "language_loss": 0.71216583, + "learning_rate": 6.152068243154671e-07, + "loss": 0.73275626, + "num_input_tokens_seen": 269408530, + "step": 12493, + "time_per_iteration": 2.7096829414367676 + }, + { + "auxiliary_loss_clip": 0.01054103, + "auxiliary_loss_mlp": 0.00747656, + "balance_loss_clip": 1.02606702, + "balance_loss_mlp": 1.00038481, + "epoch": 0.7511799188336089, + "flos": 22046961006720.0, + "grad_norm": 1.6187615222005955, + "language_loss": 0.80592954, + "learning_rate": 6.149258472993395e-07, + "loss": 0.82394713, + "num_input_tokens_seen": 269425930, + "step": 12494, + "time_per_iteration": 2.6188395023345947 + }, + { + "auxiliary_loss_clip": 0.01064597, + "auxiliary_loss_mlp": 0.01026262, + "balance_loss_clip": 1.02618742, + "balance_loss_mlp": 1.01541996, + "epoch": 0.7512400420862768, + "flos": 16467125546880.0, + "grad_norm": 2.3154898369511048, + "language_loss": 0.78506827, + "learning_rate": 6.146449228053634e-07, + "loss": 0.80597687, + "num_input_tokens_seen": 269443945, + "step": 12495, + "time_per_iteration": 2.570533275604248 + }, + { + "auxiliary_loss_clip": 0.01065131, + "auxiliary_loss_mlp": 0.00747622, + "balance_loss_clip": 1.02667642, + "balance_loss_mlp": 1.00035739, + "epoch": 0.7513001653389448, + "flos": 20448326304000.0, + "grad_norm": 1.9405648176241805, + "language_loss": 0.70973665, + "learning_rate": 6.143640508441898e-07, + "loss": 0.72786415, + "num_input_tokens_seen": 269463625, + "step": 12496, + "time_per_iteration": 2.7252209186553955 + }, + { + "auxiliary_loss_clip": 0.01018564, + "auxiliary_loss_mlp": 0.01027809, + "balance_loss_clip": 1.0226469, + "balance_loss_mlp": 1.01758695, + "epoch": 0.7513602885916129, + "flos": 23476816679040.0, + "grad_norm": 1.7050812855739346, + "language_loss": 0.78429908, + "learning_rate": 6.140832314264705e-07, + "loss": 0.80476284, + "num_input_tokens_seen": 269483415, + "step": 12497, + "time_per_iteration": 2.804556131362915 + }, + { + "auxiliary_loss_clip": 0.01055083, + "auxiliary_loss_mlp": 0.01032647, + "balance_loss_clip": 1.0254519, + "balance_loss_mlp": 1.02145982, + "epoch": 0.7514204118442808, + "flos": 26797224885120.0, + "grad_norm": 1.6331627261412833, + "language_loss": 0.76933008, + "learning_rate": 6.13802464562855e-07, + "loss": 0.79020739, + "num_input_tokens_seen": 269504635, + "step": 12498, + "time_per_iteration": 4.330719232559204 + }, + { + "auxiliary_loss_clip": 0.0104112, + "auxiliary_loss_mlp": 0.01029463, + "balance_loss_clip": 1.02505898, + "balance_loss_mlp": 1.01959229, + "epoch": 0.7514805350969488, + "flos": 19865639877120.0, + "grad_norm": 3.8829068605578927, + "language_loss": 0.73690224, + "learning_rate": 6.135217502639878e-07, + "loss": 0.757608, + "num_input_tokens_seen": 269523955, + "step": 12499, + "time_per_iteration": 4.248448848724365 + }, + { + "auxiliary_loss_clip": 0.01050214, + "auxiliary_loss_mlp": 0.01025943, + "balance_loss_clip": 1.02294517, + "balance_loss_mlp": 1.01574492, + "epoch": 0.7515406583496167, + "flos": 24571553437440.0, + "grad_norm": 2.122397145344656, + "language_loss": 0.79428768, + "learning_rate": 6.132410885405148e-07, + "loss": 0.81504917, + "num_input_tokens_seen": 269544410, + "step": 12500, + "time_per_iteration": 2.691603183746338 + }, + { + "auxiliary_loss_clip": 0.01062244, + "auxiliary_loss_mlp": 0.01033211, + "balance_loss_clip": 1.0284133, + "balance_loss_mlp": 1.0199672, + "epoch": 0.7516007816022847, + "flos": 20120246455680.0, + "grad_norm": 1.8910549549072802, + "language_loss": 0.7350384, + "learning_rate": 6.129604794030794e-07, + "loss": 0.75599295, + "num_input_tokens_seen": 269563315, + "step": 12501, + "time_per_iteration": 2.677603006362915 + }, + { + "auxiliary_loss_clip": 0.01042826, + "auxiliary_loss_mlp": 0.0102576, + "balance_loss_clip": 1.02436054, + "balance_loss_mlp": 1.01500177, + "epoch": 0.7516609048549526, + "flos": 22784638619520.0, + "grad_norm": 1.8141517709612853, + "language_loss": 0.78378564, + "learning_rate": 6.126799228623207e-07, + "loss": 0.80447149, + "num_input_tokens_seen": 269583950, + "step": 12502, + "time_per_iteration": 2.708364486694336 + }, + { + "auxiliary_loss_clip": 0.01044627, + "auxiliary_loss_mlp": 0.01030496, + "balance_loss_clip": 1.0258503, + "balance_loss_mlp": 1.01988077, + "epoch": 0.7517210281076206, + "flos": 10634012311680.0, + "grad_norm": 2.235719588163508, + "language_loss": 0.70471555, + "learning_rate": 6.123994189288786e-07, + "loss": 0.72546685, + "num_input_tokens_seen": 269600120, + "step": 12503, + "time_per_iteration": 2.618736505508423 + }, + { + "auxiliary_loss_clip": 0.01006092, + "auxiliary_loss_mlp": 0.01002109, + "balance_loss_clip": 1.00075006, + "balance_loss_mlp": 1.00122726, + "epoch": 0.7517811513602886, + "flos": 66052221275520.0, + "grad_norm": 0.9907533288857391, + "language_loss": 0.63989794, + "learning_rate": 6.121189676133903e-07, + "loss": 0.65997994, + "num_input_tokens_seen": 269659815, + "step": 12504, + "time_per_iteration": 3.141357898712158 + }, + { + "auxiliary_loss_clip": 0.01019024, + "auxiliary_loss_mlp": 0.01028034, + "balance_loss_clip": 1.01955366, + "balance_loss_mlp": 1.01744223, + "epoch": 0.7518412746129566, + "flos": 37268345018880.0, + "grad_norm": 1.3842823329434624, + "language_loss": 0.68302613, + "learning_rate": 6.118385689264896e-07, + "loss": 0.70349669, + "num_input_tokens_seen": 269684565, + "step": 12505, + "time_per_iteration": 2.8605871200561523 + }, + { + "auxiliary_loss_clip": 0.0099671, + "auxiliary_loss_mlp": 0.00746699, + "balance_loss_clip": 1.0016216, + "balance_loss_mlp": 1.00070155, + "epoch": 0.7519013978656245, + "flos": 60518567727360.0, + "grad_norm": 0.64761519003511, + "language_loss": 0.55098099, + "learning_rate": 6.11558222878809e-07, + "loss": 0.56841505, + "num_input_tokens_seen": 269752325, + "step": 12506, + "time_per_iteration": 3.307265281677246 + }, + { + "auxiliary_loss_clip": 0.01050666, + "auxiliary_loss_mlp": 0.01034364, + "balance_loss_clip": 1.02596772, + "balance_loss_mlp": 1.02286077, + "epoch": 0.7519615211182925, + "flos": 18806885568000.0, + "grad_norm": 1.8460492340433692, + "language_loss": 0.78481883, + "learning_rate": 6.112779294809796e-07, + "loss": 0.80566913, + "num_input_tokens_seen": 269770630, + "step": 12507, + "time_per_iteration": 2.6843924522399902 + }, + { + "auxiliary_loss_clip": 0.01044698, + "auxiliary_loss_mlp": 0.01030005, + "balance_loss_clip": 1.02744019, + "balance_loss_mlp": 1.01950848, + "epoch": 0.7520216443709604, + "flos": 14575244209920.0, + "grad_norm": 1.8137069257338543, + "language_loss": 0.71213043, + "learning_rate": 6.10997688743631e-07, + "loss": 0.73287737, + "num_input_tokens_seen": 269787280, + "step": 12508, + "time_per_iteration": 2.61063289642334 + }, + { + "auxiliary_loss_clip": 0.01045275, + "auxiliary_loss_mlp": 0.01028886, + "balance_loss_clip": 1.02273226, + "balance_loss_mlp": 1.01788855, + "epoch": 0.7520817676236284, + "flos": 17056599644160.0, + "grad_norm": 1.6793446631507691, + "language_loss": 0.72281522, + "learning_rate": 6.107175006773885e-07, + "loss": 0.7435568, + "num_input_tokens_seen": 269805205, + "step": 12509, + "time_per_iteration": 2.67614483833313 + }, + { + "auxiliary_loss_clip": 0.01067547, + "auxiliary_loss_mlp": 0.01037763, + "balance_loss_clip": 1.02611613, + "balance_loss_mlp": 1.02522767, + "epoch": 0.7521418908762965, + "flos": 25666397936640.0, + "grad_norm": 2.2067805312038606, + "language_loss": 0.62268603, + "learning_rate": 6.104373652928785e-07, + "loss": 0.64373916, + "num_input_tokens_seen": 269824820, + "step": 12510, + "time_per_iteration": 2.624084949493408 + }, + { + "auxiliary_loss_clip": 0.01051578, + "auxiliary_loss_mlp": 0.01026754, + "balance_loss_clip": 1.02536702, + "balance_loss_mlp": 1.01641297, + "epoch": 0.7522020141289644, + "flos": 20886759711360.0, + "grad_norm": 1.5768933233617264, + "language_loss": 0.81287485, + "learning_rate": 6.10157282600722e-07, + "loss": 0.83365816, + "num_input_tokens_seen": 269842825, + "step": 12511, + "time_per_iteration": 2.624743700027466 + }, + { + "auxiliary_loss_clip": 0.01047874, + "auxiliary_loss_mlp": 0.01032717, + "balance_loss_clip": 1.0270685, + "balance_loss_mlp": 1.02067077, + "epoch": 0.7522621373816324, + "flos": 12640305444480.0, + "grad_norm": 1.8445133278660237, + "language_loss": 0.75645435, + "learning_rate": 6.098772526115412e-07, + "loss": 0.77726024, + "num_input_tokens_seen": 269859000, + "step": 12512, + "time_per_iteration": 2.688122510910034 + }, + { + "auxiliary_loss_clip": 0.01049956, + "auxiliary_loss_mlp": 0.01027219, + "balance_loss_clip": 1.02413309, + "balance_loss_mlp": 1.01715803, + "epoch": 0.7523222606343003, + "flos": 25626141768960.0, + "grad_norm": 1.731564277504499, + "language_loss": 0.82319599, + "learning_rate": 6.095972753359537e-07, + "loss": 0.84396774, + "num_input_tokens_seen": 269878895, + "step": 12513, + "time_per_iteration": 2.6398367881774902 + }, + { + "auxiliary_loss_clip": 0.0105651, + "auxiliary_loss_mlp": 0.01030899, + "balance_loss_clip": 1.02646875, + "balance_loss_mlp": 1.01953244, + "epoch": 0.7523823838869683, + "flos": 20448900921600.0, + "grad_norm": 4.121283289941464, + "language_loss": 0.74779689, + "learning_rate": 6.093173507845771e-07, + "loss": 0.76867098, + "num_input_tokens_seen": 269897280, + "step": 12514, + "time_per_iteration": 2.6153059005737305 + }, + { + "auxiliary_loss_clip": 0.01052628, + "auxiliary_loss_mlp": 0.01029964, + "balance_loss_clip": 1.02680159, + "balance_loss_mlp": 1.01996267, + "epoch": 0.7524425071396362, + "flos": 14720610551040.0, + "grad_norm": 1.9849976920540564, + "language_loss": 0.6889655, + "learning_rate": 6.090374789680271e-07, + "loss": 0.70979142, + "num_input_tokens_seen": 269914640, + "step": 12515, + "time_per_iteration": 2.5999743938446045 + }, + { + "auxiliary_loss_clip": 0.01054754, + "auxiliary_loss_mlp": 0.01032249, + "balance_loss_clip": 1.02623439, + "balance_loss_mlp": 1.02197361, + "epoch": 0.7525026303923043, + "flos": 30592048947840.0, + "grad_norm": 1.7065562779474635, + "language_loss": 0.70114988, + "learning_rate": 6.087576598969137e-07, + "loss": 0.72201997, + "num_input_tokens_seen": 269934960, + "step": 12516, + "time_per_iteration": 4.602639675140381 + }, + { + "auxiliary_loss_clip": 0.01021821, + "auxiliary_loss_mlp": 0.01031931, + "balance_loss_clip": 1.02529252, + "balance_loss_mlp": 1.02161956, + "epoch": 0.7525627536449722, + "flos": 24791757765120.0, + "grad_norm": 1.5425013544270576, + "language_loss": 0.89461899, + "learning_rate": 6.084778935818495e-07, + "loss": 0.9151566, + "num_input_tokens_seen": 269956655, + "step": 12517, + "time_per_iteration": 2.7159461975097656 + }, + { + "auxiliary_loss_clip": 0.01045372, + "auxiliary_loss_mlp": 0.01032176, + "balance_loss_clip": 1.02574873, + "balance_loss_mlp": 1.02188778, + "epoch": 0.7526228768976402, + "flos": 20779782030720.0, + "grad_norm": 1.547300396804018, + "language_loss": 0.74617147, + "learning_rate": 6.081981800334437e-07, + "loss": 0.76694691, + "num_input_tokens_seen": 269976835, + "step": 12518, + "time_per_iteration": 2.6918013095855713 + }, + { + "auxiliary_loss_clip": 0.00973921, + "auxiliary_loss_mlp": 0.01007004, + "balance_loss_clip": 1.00614429, + "balance_loss_mlp": 1.00572824, + "epoch": 0.7526830001503081, + "flos": 66559243703040.0, + "grad_norm": 0.7081337951766797, + "language_loss": 0.55725813, + "learning_rate": 6.079185192623017e-07, + "loss": 0.57706738, + "num_input_tokens_seen": 270040630, + "step": 12519, + "time_per_iteration": 3.3584144115448 + }, + { + "auxiliary_loss_clip": 0.01052802, + "auxiliary_loss_mlp": 0.01031379, + "balance_loss_clip": 1.02602029, + "balance_loss_mlp": 1.0215385, + "epoch": 0.7527431234029761, + "flos": 23477894087040.0, + "grad_norm": 1.3915419483377993, + "language_loss": 0.77683353, + "learning_rate": 6.07638911279029e-07, + "loss": 0.79767537, + "num_input_tokens_seen": 270059695, + "step": 12520, + "time_per_iteration": 2.807847023010254 + }, + { + "auxiliary_loss_clip": 0.01044736, + "auxiliary_loss_mlp": 0.01036613, + "balance_loss_clip": 1.02313113, + "balance_loss_mlp": 1.02611029, + "epoch": 0.752803246655644, + "flos": 22049546785920.0, + "grad_norm": 2.1777450057247325, + "language_loss": 0.73848099, + "learning_rate": 6.07359356094229e-07, + "loss": 0.75929445, + "num_input_tokens_seen": 270078420, + "step": 12521, + "time_per_iteration": 2.616598606109619 + }, + { + "auxiliary_loss_clip": 0.01038409, + "auxiliary_loss_mlp": 0.01032205, + "balance_loss_clip": 1.02556074, + "balance_loss_mlp": 1.02018833, + "epoch": 0.752863369908312, + "flos": 30153795108480.0, + "grad_norm": 2.2239468157921594, + "language_loss": 0.67272431, + "learning_rate": 6.070798537185016e-07, + "loss": 0.69343042, + "num_input_tokens_seen": 270097040, + "step": 12522, + "time_per_iteration": 2.79425048828125 + }, + { + "auxiliary_loss_clip": 0.01056749, + "auxiliary_loss_mlp": 0.01037406, + "balance_loss_clip": 1.02755868, + "balance_loss_mlp": 1.02675498, + "epoch": 0.7529234931609801, + "flos": 24567638855040.0, + "grad_norm": 2.0155286176935125, + "language_loss": 0.7821998, + "learning_rate": 6.068004041624453e-07, + "loss": 0.80314136, + "num_input_tokens_seen": 270116365, + "step": 12523, + "time_per_iteration": 2.851533889770508 + }, + { + "auxiliary_loss_clip": 0.01062082, + "auxiliary_loss_mlp": 0.01025685, + "balance_loss_clip": 1.02524829, + "balance_loss_mlp": 1.01570082, + "epoch": 0.752983616413648, + "flos": 23112395245440.0, + "grad_norm": 1.8075916905396412, + "language_loss": 0.80562186, + "learning_rate": 6.065210074366571e-07, + "loss": 0.82649958, + "num_input_tokens_seen": 270135395, + "step": 12524, + "time_per_iteration": 2.7093982696533203 + }, + { + "auxiliary_loss_clip": 0.01053225, + "auxiliary_loss_mlp": 0.00747535, + "balance_loss_clip": 1.02555621, + "balance_loss_mlp": 1.00037122, + "epoch": 0.753043739666316, + "flos": 24316946858880.0, + "grad_norm": 1.7585448286806549, + "language_loss": 0.74179953, + "learning_rate": 6.062416635517326e-07, + "loss": 0.75980717, + "num_input_tokens_seen": 270156425, + "step": 12525, + "time_per_iteration": 2.6599018573760986 + }, + { + "auxiliary_loss_clip": 0.01032694, + "auxiliary_loss_mlp": 0.01028848, + "balance_loss_clip": 1.02476978, + "balance_loss_mlp": 1.01848817, + "epoch": 0.7531038629189839, + "flos": 24243294021120.0, + "grad_norm": 2.333540166820107, + "language_loss": 0.71963906, + "learning_rate": 6.059623725182641e-07, + "loss": 0.74025452, + "num_input_tokens_seen": 270176905, + "step": 12526, + "time_per_iteration": 4.613531827926636 + }, + { + "auxiliary_loss_clip": 0.01040049, + "auxiliary_loss_mlp": 0.01025449, + "balance_loss_clip": 1.02337718, + "balance_loss_mlp": 1.01582265, + "epoch": 0.7531639861716519, + "flos": 30188807890560.0, + "grad_norm": 1.6946441026458532, + "language_loss": 0.72242695, + "learning_rate": 6.056831343468414e-07, + "loss": 0.74308193, + "num_input_tokens_seen": 270196640, + "step": 12527, + "time_per_iteration": 2.771036148071289 + }, + { + "auxiliary_loss_clip": 0.01031873, + "auxiliary_loss_mlp": 0.01023266, + "balance_loss_clip": 1.02535224, + "balance_loss_mlp": 1.0137887, + "epoch": 0.7532241094243198, + "flos": 18223193560320.0, + "grad_norm": 1.7839801496641612, + "language_loss": 0.81329036, + "learning_rate": 6.054039490480539e-07, + "loss": 0.8338418, + "num_input_tokens_seen": 270213905, + "step": 12528, + "time_per_iteration": 2.782855749130249 + }, + { + "auxiliary_loss_clip": 0.01011623, + "auxiliary_loss_mlp": 0.01035094, + "balance_loss_clip": 1.0262748, + "balance_loss_mlp": 1.02335751, + "epoch": 0.7532842326769879, + "flos": 20881049448960.0, + "grad_norm": 1.9829496631854195, + "language_loss": 0.85297251, + "learning_rate": 6.051248166324892e-07, + "loss": 0.87343967, + "num_input_tokens_seen": 270231995, + "step": 12529, + "time_per_iteration": 2.810175895690918 + }, + { + "auxiliary_loss_clip": 0.01037085, + "auxiliary_loss_mlp": 0.01032123, + "balance_loss_clip": 1.02705979, + "balance_loss_mlp": 1.02086985, + "epoch": 0.7533443559296558, + "flos": 18078689145600.0, + "grad_norm": 2.122905360454551, + "language_loss": 0.73945558, + "learning_rate": 6.048457371107303e-07, + "loss": 0.76014769, + "num_input_tokens_seen": 270251480, + "step": 12530, + "time_per_iteration": 2.714236259460449 + }, + { + "auxiliary_loss_clip": 0.00960721, + "auxiliary_loss_mlp": 0.01003382, + "balance_loss_clip": 1.00521326, + "balance_loss_mlp": 1.0022254, + "epoch": 0.7534044791823238, + "flos": 50254830766080.0, + "grad_norm": 0.8261128401011011, + "language_loss": 0.63627213, + "learning_rate": 6.045667104933612e-07, + "loss": 0.65591317, + "num_input_tokens_seen": 270306480, + "step": 12531, + "time_per_iteration": 3.1561851501464844 + }, + { + "auxiliary_loss_clip": 0.01044904, + "auxiliary_loss_mlp": 0.01027303, + "balance_loss_clip": 1.02501082, + "balance_loss_mlp": 1.01612735, + "epoch": 0.7534646024349917, + "flos": 20850274471680.0, + "grad_norm": 3.817092678353933, + "language_loss": 0.69607359, + "learning_rate": 6.042877367909633e-07, + "loss": 0.71679568, + "num_input_tokens_seen": 270324595, + "step": 12532, + "time_per_iteration": 2.7235279083251953 + }, + { + "auxiliary_loss_clip": 0.0103347, + "auxiliary_loss_mlp": 0.01024553, + "balance_loss_clip": 1.025244, + "balance_loss_mlp": 1.01512337, + "epoch": 0.7535247256876597, + "flos": 23071779941760.0, + "grad_norm": 1.7038178237577966, + "language_loss": 0.77325535, + "learning_rate": 6.040088160141132e-07, + "loss": 0.79383552, + "num_input_tokens_seen": 270344375, + "step": 12533, + "time_per_iteration": 2.7883031368255615 + }, + { + "auxiliary_loss_clip": 0.00997501, + "auxiliary_loss_mlp": 0.01001264, + "balance_loss_clip": 1.00175166, + "balance_loss_mlp": 1.00040519, + "epoch": 0.7535848489403276, + "flos": 58623418252800.0, + "grad_norm": 0.7840292337267892, + "language_loss": 0.57269949, + "learning_rate": 6.037299481733886e-07, + "loss": 0.59268713, + "num_input_tokens_seen": 270405235, + "step": 12534, + "time_per_iteration": 3.2457549571990967 + }, + { + "auxiliary_loss_clip": 0.01041668, + "auxiliary_loss_mlp": 0.01024453, + "balance_loss_clip": 1.02399349, + "balance_loss_mlp": 1.01368856, + "epoch": 0.7536449721929956, + "flos": 26577882483840.0, + "grad_norm": 1.4014727627364838, + "language_loss": 0.71087277, + "learning_rate": 6.03451133279365e-07, + "loss": 0.731534, + "num_input_tokens_seen": 270425820, + "step": 12535, + "time_per_iteration": 2.8657684326171875 + }, + { + "auxiliary_loss_clip": 0.01039164, + "auxiliary_loss_mlp": 0.01030145, + "balance_loss_clip": 1.02261019, + "balance_loss_mlp": 1.0186708, + "epoch": 0.7537050954456637, + "flos": 25735992537600.0, + "grad_norm": 1.5450193734116733, + "language_loss": 0.80788511, + "learning_rate": 6.031723713426135e-07, + "loss": 0.82857823, + "num_input_tokens_seen": 270447120, + "step": 12536, + "time_per_iteration": 2.686936616897583 + }, + { + "auxiliary_loss_clip": 0.01026348, + "auxiliary_loss_mlp": 0.0102792, + "balance_loss_clip": 1.02129841, + "balance_loss_mlp": 1.0180316, + "epoch": 0.7537652186983316, + "flos": 30224431203840.0, + "grad_norm": 3.084806160278634, + "language_loss": 0.74718463, + "learning_rate": 6.028936623737067e-07, + "loss": 0.76772738, + "num_input_tokens_seen": 270468680, + "step": 12537, + "time_per_iteration": 2.7334952354431152 + }, + { + "auxiliary_loss_clip": 0.01062683, + "auxiliary_loss_mlp": 0.01028802, + "balance_loss_clip": 1.02428901, + "balance_loss_mlp": 1.01791811, + "epoch": 0.7538253419509996, + "flos": 12641239198080.0, + "grad_norm": 1.564297881453164, + "language_loss": 0.73881382, + "learning_rate": 6.026150063832111e-07, + "loss": 0.75972861, + "num_input_tokens_seen": 270486310, + "step": 12538, + "time_per_iteration": 2.5560641288757324 + }, + { + "auxiliary_loss_clip": 0.01033955, + "auxiliary_loss_mlp": 0.01027489, + "balance_loss_clip": 1.02582872, + "balance_loss_mlp": 1.01646233, + "epoch": 0.7538854652036675, + "flos": 23185976256000.0, + "grad_norm": 1.4879594042043318, + "language_loss": 0.6763829, + "learning_rate": 6.023364033816956e-07, + "loss": 0.6969974, + "num_input_tokens_seen": 270507210, + "step": 12539, + "time_per_iteration": 2.7322349548339844 + }, + { + "auxiliary_loss_clip": 0.01062122, + "auxiliary_loss_mlp": 0.010259, + "balance_loss_clip": 1.02573299, + "balance_loss_mlp": 1.01490343, + "epoch": 0.7539455884563355, + "flos": 23186227651200.0, + "grad_norm": 1.8189269941614576, + "language_loss": 0.74885666, + "learning_rate": 6.020578533797229e-07, + "loss": 0.76973689, + "num_input_tokens_seen": 270525250, + "step": 12540, + "time_per_iteration": 2.601961135864258 + }, + { + "auxiliary_loss_clip": 0.01064705, + "auxiliary_loss_mlp": 0.01027153, + "balance_loss_clip": 1.02479506, + "balance_loss_mlp": 1.01607299, + "epoch": 0.7540057117090034, + "flos": 13181155505280.0, + "grad_norm": 2.530304419503202, + "language_loss": 0.729792, + "learning_rate": 6.017793563878566e-07, + "loss": 0.75071049, + "num_input_tokens_seen": 270539295, + "step": 12541, + "time_per_iteration": 2.5868732929229736 + }, + { + "auxiliary_loss_clip": 0.01061932, + "auxiliary_loss_mlp": 0.01027312, + "balance_loss_clip": 1.0249536, + "balance_loss_mlp": 1.01611209, + "epoch": 0.7540658349616715, + "flos": 45478134478080.0, + "grad_norm": 1.895009890222621, + "language_loss": 0.71901107, + "learning_rate": 6.015009124166576e-07, + "loss": 0.73990357, + "num_input_tokens_seen": 270562815, + "step": 12542, + "time_per_iteration": 2.7975916862487793 + }, + { + "auxiliary_loss_clip": 0.01035735, + "auxiliary_loss_mlp": 0.01024255, + "balance_loss_clip": 1.02158308, + "balance_loss_mlp": 1.01349008, + "epoch": 0.7541259582143394, + "flos": 19930817105280.0, + "grad_norm": 1.958748550666968, + "language_loss": 0.84594512, + "learning_rate": 6.012225214766844e-07, + "loss": 0.86654508, + "num_input_tokens_seen": 270579055, + "step": 12543, + "time_per_iteration": 2.651190996170044 + }, + { + "auxiliary_loss_clip": 0.01035365, + "auxiliary_loss_mlp": 0.0102385, + "balance_loss_clip": 1.029212, + "balance_loss_mlp": 1.0137943, + "epoch": 0.7541860814670074, + "flos": 27198239299200.0, + "grad_norm": 2.382961628214999, + "language_loss": 0.73397034, + "learning_rate": 6.009441835784927e-07, + "loss": 0.7545625, + "num_input_tokens_seen": 270599080, + "step": 12544, + "time_per_iteration": 2.785425901412964 + }, + { + "auxiliary_loss_clip": 0.01055715, + "auxiliary_loss_mlp": 0.01033348, + "balance_loss_clip": 1.02741218, + "balance_loss_mlp": 1.02300715, + "epoch": 0.7542462047196753, + "flos": 21324151624320.0, + "grad_norm": 3.611132859589881, + "language_loss": 0.68228698, + "learning_rate": 6.006658987326383e-07, + "loss": 0.70317757, + "num_input_tokens_seen": 270618715, + "step": 12545, + "time_per_iteration": 4.271802663803101 + }, + { + "auxiliary_loss_clip": 0.01036255, + "auxiliary_loss_mlp": 0.01029744, + "balance_loss_clip": 1.02222109, + "balance_loss_mlp": 1.01845503, + "epoch": 0.7543063279723433, + "flos": 11940944664960.0, + "grad_norm": 2.5269914203949115, + "language_loss": 0.68454373, + "learning_rate": 6.003876669496728e-07, + "loss": 0.70520365, + "num_input_tokens_seen": 270635695, + "step": 12546, + "time_per_iteration": 2.63556170463562 + }, + { + "auxiliary_loss_clip": 0.01052005, + "auxiliary_loss_mlp": 0.01034189, + "balance_loss_clip": 1.02439797, + "balance_loss_mlp": 1.02242303, + "epoch": 0.7543664512250112, + "flos": 22819974624000.0, + "grad_norm": 2.2070882687603497, + "language_loss": 0.73403651, + "learning_rate": 6.00109488240147e-07, + "loss": 0.75489849, + "num_input_tokens_seen": 270654325, + "step": 12547, + "time_per_iteration": 4.254960775375366 + }, + { + "auxiliary_loss_clip": 0.01064624, + "auxiliary_loss_mlp": 0.01024899, + "balance_loss_clip": 1.02623606, + "balance_loss_mlp": 1.01318645, + "epoch": 0.7544265744776792, + "flos": 20923855482240.0, + "grad_norm": 1.854396087907923, + "language_loss": 0.67707223, + "learning_rate": 5.998313626146099e-07, + "loss": 0.69796741, + "num_input_tokens_seen": 270674260, + "step": 12548, + "time_per_iteration": 2.5561628341674805 + }, + { + "auxiliary_loss_clip": 0.01044243, + "auxiliary_loss_mlp": 0.01033041, + "balance_loss_clip": 1.02397323, + "balance_loss_mlp": 1.02198434, + "epoch": 0.7544866977303473, + "flos": 15195493284480.0, + "grad_norm": 2.1076748812811736, + "language_loss": 0.87084937, + "learning_rate": 5.995532900836088e-07, + "loss": 0.89162219, + "num_input_tokens_seen": 270692200, + "step": 12549, + "time_per_iteration": 2.749282121658325 + }, + { + "auxiliary_loss_clip": 0.01021069, + "auxiliary_loss_mlp": 0.01028983, + "balance_loss_clip": 1.02519393, + "balance_loss_mlp": 1.01901674, + "epoch": 0.7545468209830152, + "flos": 27083683848960.0, + "grad_norm": 2.054200708459908, + "language_loss": 0.76962072, + "learning_rate": 5.992752706576865e-07, + "loss": 0.7901212, + "num_input_tokens_seen": 270709675, + "step": 12550, + "time_per_iteration": 2.896962881088257 + }, + { + "auxiliary_loss_clip": 0.01063389, + "auxiliary_loss_mlp": 0.01024038, + "balance_loss_clip": 1.02482486, + "balance_loss_mlp": 1.0135355, + "epoch": 0.7546069442356832, + "flos": 26871703735680.0, + "grad_norm": 1.5915832857479895, + "language_loss": 0.69527698, + "learning_rate": 5.98997304347386e-07, + "loss": 0.71615124, + "num_input_tokens_seen": 270733055, + "step": 12551, + "time_per_iteration": 2.707590341567993 + }, + { + "auxiliary_loss_clip": 0.01040369, + "auxiliary_loss_mlp": 0.01023032, + "balance_loss_clip": 1.02463531, + "balance_loss_mlp": 1.01182675, + "epoch": 0.7546670674883511, + "flos": 15743131015680.0, + "grad_norm": 2.6162119180155834, + "language_loss": 0.86389107, + "learning_rate": 5.987193911632487e-07, + "loss": 0.88452506, + "num_input_tokens_seen": 270749275, + "step": 12552, + "time_per_iteration": 2.6000685691833496 + }, + { + "auxiliary_loss_clip": 0.0105304, + "auxiliary_loss_mlp": 0.01028128, + "balance_loss_clip": 1.02395868, + "balance_loss_mlp": 1.01734519, + "epoch": 0.7547271907410191, + "flos": 23477714519040.0, + "grad_norm": 1.62848879733928, + "language_loss": 0.77704275, + "learning_rate": 5.98441531115812e-07, + "loss": 0.79785448, + "num_input_tokens_seen": 270768230, + "step": 12553, + "time_per_iteration": 2.5917465686798096 + }, + { + "auxiliary_loss_clip": 0.01054421, + "auxiliary_loss_mlp": 0.01032456, + "balance_loss_clip": 1.0264647, + "balance_loss_mlp": 1.02144134, + "epoch": 0.754787313993687, + "flos": 31722804069120.0, + "grad_norm": 2.1671144286947244, + "language_loss": 0.62244153, + "learning_rate": 5.981637242156135e-07, + "loss": 0.64331037, + "num_input_tokens_seen": 270786285, + "step": 12554, + "time_per_iteration": 2.699941635131836 + }, + { + "auxiliary_loss_clip": 0.01041354, + "auxiliary_loss_mlp": 0.01030589, + "balance_loss_clip": 1.02402997, + "balance_loss_mlp": 1.020069, + "epoch": 0.7548474372463551, + "flos": 27563055782400.0, + "grad_norm": 1.546038272700435, + "language_loss": 0.73456824, + "learning_rate": 5.978859704731864e-07, + "loss": 0.75528765, + "num_input_tokens_seen": 270805505, + "step": 12555, + "time_per_iteration": 2.698672294616699 + }, + { + "auxiliary_loss_clip": 0.01048667, + "auxiliary_loss_mlp": 0.01031731, + "balance_loss_clip": 1.02890253, + "balance_loss_mlp": 1.02032828, + "epoch": 0.754907560499023, + "flos": 19318576763520.0, + "grad_norm": 1.8814895010506711, + "language_loss": 0.78694594, + "learning_rate": 5.976082698990645e-07, + "loss": 0.80774999, + "num_input_tokens_seen": 270824610, + "step": 12556, + "time_per_iteration": 2.712144136428833 + }, + { + "auxiliary_loss_clip": 0.00997662, + "auxiliary_loss_mlp": 0.01000477, + "balance_loss_clip": 1.00225639, + "balance_loss_mlp": 0.99964285, + "epoch": 0.754967683751691, + "flos": 69744628684800.0, + "grad_norm": 0.7077595026888615, + "language_loss": 0.50439584, + "learning_rate": 5.973306225037769e-07, + "loss": 0.52437723, + "num_input_tokens_seen": 270886155, + "step": 12557, + "time_per_iteration": 3.210322380065918 + }, + { + "auxiliary_loss_clip": 0.01056024, + "auxiliary_loss_mlp": 0.0103052, + "balance_loss_clip": 1.02716136, + "balance_loss_mlp": 1.0191474, + "epoch": 0.7550278070043589, + "flos": 24421913377920.0, + "grad_norm": 1.4969600332711501, + "language_loss": 0.71720803, + "learning_rate": 5.970530282978525e-07, + "loss": 0.73807347, + "num_input_tokens_seen": 270905325, + "step": 12558, + "time_per_iteration": 2.676461696624756 + }, + { + "auxiliary_loss_clip": 0.01037028, + "auxiliary_loss_mlp": 0.01030312, + "balance_loss_clip": 1.02324557, + "balance_loss_mlp": 1.01848006, + "epoch": 0.7550879302570269, + "flos": 32634611838720.0, + "grad_norm": 1.6328499548943067, + "language_loss": 0.79891396, + "learning_rate": 5.967754872918187e-07, + "loss": 0.81958735, + "num_input_tokens_seen": 270927535, + "step": 12559, + "time_per_iteration": 2.856768846511841 + }, + { + "auxiliary_loss_clip": 0.01025097, + "auxiliary_loss_mlp": 0.01027842, + "balance_loss_clip": 1.02625048, + "balance_loss_mlp": 1.01633215, + "epoch": 0.7551480535096948, + "flos": 21795550738560.0, + "grad_norm": 1.6099187737832221, + "language_loss": 0.78731525, + "learning_rate": 5.96497999496199e-07, + "loss": 0.8078447, + "num_input_tokens_seen": 270946920, + "step": 12560, + "time_per_iteration": 2.88144588470459 + }, + { + "auxiliary_loss_clip": 0.01014279, + "auxiliary_loss_mlp": 0.01034606, + "balance_loss_clip": 1.02175319, + "balance_loss_mlp": 1.02304256, + "epoch": 0.7552081767623628, + "flos": 18515111391360.0, + "grad_norm": 1.6130944121499502, + "language_loss": 0.7040292, + "learning_rate": 5.96220564921515e-07, + "loss": 0.724518, + "num_input_tokens_seen": 270965705, + "step": 12561, + "time_per_iteration": 2.899231433868408 + }, + { + "auxiliary_loss_clip": 0.0103637, + "auxiliary_loss_mlp": 0.00747738, + "balance_loss_clip": 1.02238202, + "balance_loss_mlp": 1.00047505, + "epoch": 0.7552683000150308, + "flos": 27634805199360.0, + "grad_norm": 1.667077749133079, + "language_loss": 0.7546469, + "learning_rate": 5.959431835782889e-07, + "loss": 0.772488, + "num_input_tokens_seen": 270986550, + "step": 12562, + "time_per_iteration": 2.8118765354156494 + }, + { + "auxiliary_loss_clip": 0.01042226, + "auxiliary_loss_mlp": 0.01028429, + "balance_loss_clip": 1.02419007, + "balance_loss_mlp": 1.01750946, + "epoch": 0.7553284232676988, + "flos": 20302924049280.0, + "grad_norm": 3.3421322967929408, + "language_loss": 0.76033151, + "learning_rate": 5.956658554770371e-07, + "loss": 0.78103805, + "num_input_tokens_seen": 271006250, + "step": 12563, + "time_per_iteration": 2.8100497722625732 + }, + { + "auxiliary_loss_clip": 0.01031255, + "auxiliary_loss_mlp": 0.01033532, + "balance_loss_clip": 1.02421343, + "balance_loss_mlp": 1.01967931, + "epoch": 0.7553885465203668, + "flos": 33255471444480.0, + "grad_norm": 2.1907738172339264, + "language_loss": 0.66986078, + "learning_rate": 5.953885806282768e-07, + "loss": 0.6905086, + "num_input_tokens_seen": 271025575, + "step": 12564, + "time_per_iteration": 4.6385815143585205 + }, + { + "auxiliary_loss_clip": 0.01036944, + "auxiliary_loss_mlp": 0.01034901, + "balance_loss_clip": 1.02383924, + "balance_loss_mlp": 1.02274179, + "epoch": 0.7554486697730347, + "flos": 21616249023360.0, + "grad_norm": 3.38045033400975, + "language_loss": 0.68244386, + "learning_rate": 5.951113590425228e-07, + "loss": 0.70316231, + "num_input_tokens_seen": 271045805, + "step": 12565, + "time_per_iteration": 2.7518699169158936 + }, + { + "auxiliary_loss_clip": 0.01046414, + "auxiliary_loss_mlp": 0.01029246, + "balance_loss_clip": 1.02530384, + "balance_loss_mlp": 1.0175035, + "epoch": 0.7555087930257027, + "flos": 27632973605760.0, + "grad_norm": 1.7158103304997623, + "language_loss": 0.74820417, + "learning_rate": 5.94834190730287e-07, + "loss": 0.76896083, + "num_input_tokens_seen": 271066065, + "step": 12566, + "time_per_iteration": 2.742530107498169 + }, + { + "auxiliary_loss_clip": 0.01056192, + "auxiliary_loss_mlp": 0.01034695, + "balance_loss_clip": 1.02570045, + "balance_loss_mlp": 1.02197504, + "epoch": 0.7555689162783706, + "flos": 23621644316160.0, + "grad_norm": 2.183218032423894, + "language_loss": 0.74033451, + "learning_rate": 5.945570757020789e-07, + "loss": 0.76124334, + "num_input_tokens_seen": 271085870, + "step": 12567, + "time_per_iteration": 2.759099006652832 + }, + { + "auxiliary_loss_clip": 0.01062182, + "auxiliary_loss_mlp": 0.01023272, + "balance_loss_clip": 1.02458775, + "balance_loss_mlp": 1.01296067, + "epoch": 0.7556290395310387, + "flos": 24863076218880.0, + "grad_norm": 1.95355233817742, + "language_loss": 0.63128662, + "learning_rate": 5.942800139684073e-07, + "loss": 0.65214121, + "num_input_tokens_seen": 271104260, + "step": 12568, + "time_per_iteration": 2.6055357456207275 + }, + { + "auxiliary_loss_clip": 0.00985296, + "auxiliary_loss_mlp": 0.01033077, + "balance_loss_clip": 1.02473223, + "balance_loss_mlp": 1.0214119, + "epoch": 0.7556891627837066, + "flos": 43543770330240.0, + "grad_norm": 2.197196933685024, + "language_loss": 0.66047668, + "learning_rate": 5.940030055397789e-07, + "loss": 0.68066043, + "num_input_tokens_seen": 271125745, + "step": 12569, + "time_per_iteration": 3.2678215503692627 + }, + { + "auxiliary_loss_clip": 0.01057508, + "auxiliary_loss_mlp": 0.01035112, + "balance_loss_clip": 1.02732325, + "balance_loss_mlp": 1.02313709, + "epoch": 0.7557492860363746, + "flos": 26650924790400.0, + "grad_norm": 1.670127644674919, + "language_loss": 0.67122829, + "learning_rate": 5.93726050426697e-07, + "loss": 0.69215453, + "num_input_tokens_seen": 271147145, + "step": 12570, + "time_per_iteration": 3.053955316543579 + }, + { + "auxiliary_loss_clip": 0.01064957, + "auxiliary_loss_mlp": 0.01028642, + "balance_loss_clip": 1.02611256, + "balance_loss_mlp": 1.01745379, + "epoch": 0.7558094092890425, + "flos": 55182885010560.0, + "grad_norm": 1.980612930614521, + "language_loss": 0.71729821, + "learning_rate": 5.934491486396647e-07, + "loss": 0.73823422, + "num_input_tokens_seen": 271170865, + "step": 12571, + "time_per_iteration": 3.412386417388916 + }, + { + "auxiliary_loss_clip": 0.0101831, + "auxiliary_loss_mlp": 0.01035441, + "balance_loss_clip": 1.02248216, + "balance_loss_mlp": 1.02296615, + "epoch": 0.7558695325417105, + "flos": 23988292392960.0, + "grad_norm": 1.5998838918306064, + "language_loss": 0.73709261, + "learning_rate": 5.931723001891811e-07, + "loss": 0.75763017, + "num_input_tokens_seen": 271191450, + "step": 12572, + "time_per_iteration": 2.8890607357025146 + }, + { + "auxiliary_loss_clip": 0.01045496, + "auxiliary_loss_mlp": 0.01032323, + "balance_loss_clip": 1.02647138, + "balance_loss_mlp": 1.02130198, + "epoch": 0.7559296557943784, + "flos": 14611262572800.0, + "grad_norm": 2.5899529980742746, + "language_loss": 0.76760077, + "learning_rate": 5.928955050857456e-07, + "loss": 0.78837895, + "num_input_tokens_seen": 271207335, + "step": 12573, + "time_per_iteration": 4.443894386291504 + }, + { + "auxiliary_loss_clip": 0.01040942, + "auxiliary_loss_mlp": 0.01027111, + "balance_loss_clip": 1.02807379, + "balance_loss_mlp": 1.01639986, + "epoch": 0.7559897790470465, + "flos": 18550483309440.0, + "grad_norm": 1.5998780473260406, + "language_loss": 0.69414085, + "learning_rate": 5.926187633398527e-07, + "loss": 0.71482134, + "num_input_tokens_seen": 271226895, + "step": 12574, + "time_per_iteration": 2.7408556938171387 + }, + { + "auxiliary_loss_clip": 0.01022692, + "auxiliary_loss_mlp": 0.01031802, + "balance_loss_clip": 1.02071166, + "balance_loss_mlp": 1.02014339, + "epoch": 0.7560499022997144, + "flos": 17967868709760.0, + "grad_norm": 2.274259245670754, + "language_loss": 0.71600151, + "learning_rate": 5.923420749619974e-07, + "loss": 0.7365464, + "num_input_tokens_seen": 271244375, + "step": 12575, + "time_per_iteration": 2.802722454071045 + }, + { + "auxiliary_loss_clip": 0.01061234, + "auxiliary_loss_mlp": 0.00747797, + "balance_loss_clip": 1.02439594, + "balance_loss_mlp": 1.00042725, + "epoch": 0.7561100255523824, + "flos": 15737815802880.0, + "grad_norm": 2.219390382452799, + "language_loss": 0.72290409, + "learning_rate": 5.92065439962673e-07, + "loss": 0.74099439, + "num_input_tokens_seen": 271259530, + "step": 12576, + "time_per_iteration": 2.631742477416992 + }, + { + "auxiliary_loss_clip": 0.01031858, + "auxiliary_loss_mlp": 0.01026972, + "balance_loss_clip": 1.02556872, + "balance_loss_mlp": 1.016011, + "epoch": 0.7561701488050504, + "flos": 15888102307200.0, + "grad_norm": 2.0987903214674466, + "language_loss": 0.67327261, + "learning_rate": 5.917888583523669e-07, + "loss": 0.69386089, + "num_input_tokens_seen": 271276835, + "step": 12577, + "time_per_iteration": 2.7636454105377197 + }, + { + "auxiliary_loss_clip": 0.0104179, + "auxiliary_loss_mlp": 0.0103487, + "balance_loss_clip": 1.02345514, + "balance_loss_mlp": 1.02386737, + "epoch": 0.7562302720577183, + "flos": 20339157893760.0, + "grad_norm": 2.3242244890150383, + "language_loss": 0.78331286, + "learning_rate": 5.915123301415685e-07, + "loss": 0.80407947, + "num_input_tokens_seen": 271296275, + "step": 12578, + "time_per_iteration": 2.7733817100524902 + }, + { + "auxiliary_loss_clip": 0.01051736, + "auxiliary_loss_mlp": 0.0102973, + "balance_loss_clip": 1.0235548, + "balance_loss_mlp": 1.01879871, + "epoch": 0.7562903953103863, + "flos": 20812209033600.0, + "grad_norm": 1.5844681728836028, + "language_loss": 0.75423074, + "learning_rate": 5.912358553407641e-07, + "loss": 0.77504539, + "num_input_tokens_seen": 271315685, + "step": 12579, + "time_per_iteration": 2.7168383598327637 + }, + { + "auxiliary_loss_clip": 0.01025193, + "auxiliary_loss_mlp": 0.01031176, + "balance_loss_clip": 1.0239985, + "balance_loss_mlp": 1.01811624, + "epoch": 0.7563505185630542, + "flos": 37596999484800.0, + "grad_norm": 1.9086674232520509, + "language_loss": 0.62897158, + "learning_rate": 5.90959433960437e-07, + "loss": 0.64953524, + "num_input_tokens_seen": 271336790, + "step": 12580, + "time_per_iteration": 2.8593971729278564 + }, + { + "auxiliary_loss_clip": 0.01016964, + "auxiliary_loss_mlp": 0.010271, + "balance_loss_clip": 1.02191627, + "balance_loss_mlp": 1.0169611, + "epoch": 0.7564106418157223, + "flos": 20230995064320.0, + "grad_norm": 1.6403670979008982, + "language_loss": 0.74807638, + "learning_rate": 5.906830660110691e-07, + "loss": 0.76851702, + "num_input_tokens_seen": 271355470, + "step": 12581, + "time_per_iteration": 2.7290472984313965 + }, + { + "auxiliary_loss_clip": 0.0103352, + "auxiliary_loss_mlp": 0.01029309, + "balance_loss_clip": 1.0252322, + "balance_loss_mlp": 1.01831174, + "epoch": 0.7564707650683902, + "flos": 24754877475840.0, + "grad_norm": 1.5794288469127795, + "language_loss": 0.62616289, + "learning_rate": 5.904067515031412e-07, + "loss": 0.64679122, + "num_input_tokens_seen": 271375810, + "step": 12582, + "time_per_iteration": 2.6985323429107666 + }, + { + "auxiliary_loss_clip": 0.01006053, + "auxiliary_loss_mlp": 0.01001311, + "balance_loss_clip": 1.00092459, + "balance_loss_mlp": 1.00041091, + "epoch": 0.7565308883210582, + "flos": 48530076433920.0, + "grad_norm": 0.9678448112199649, + "language_loss": 0.60763931, + "learning_rate": 5.901304904471307e-07, + "loss": 0.62771297, + "num_input_tokens_seen": 271424775, + "step": 12583, + "time_per_iteration": 2.8545961380004883 + }, + { + "auxiliary_loss_clip": 0.01046991, + "auxiliary_loss_mlp": 0.01033393, + "balance_loss_clip": 1.02743518, + "balance_loss_mlp": 1.02169251, + "epoch": 0.7565910115737261, + "flos": 12495082757760.0, + "grad_norm": 2.6761271159208526, + "language_loss": 0.78963149, + "learning_rate": 5.898542828535125e-07, + "loss": 0.8104353, + "num_input_tokens_seen": 271440500, + "step": 12584, + "time_per_iteration": 2.652392625808716 + }, + { + "auxiliary_loss_clip": 0.01028982, + "auxiliary_loss_mlp": 0.01033325, + "balance_loss_clip": 1.02133071, + "balance_loss_mlp": 1.02114797, + "epoch": 0.7566511348263941, + "flos": 21173003193600.0, + "grad_norm": 2.0760221940940844, + "language_loss": 0.77820766, + "learning_rate": 5.895781287327612e-07, + "loss": 0.79883081, + "num_input_tokens_seen": 271458180, + "step": 12585, + "time_per_iteration": 2.665586471557617 + }, + { + "auxiliary_loss_clip": 0.01067862, + "auxiliary_loss_mlp": 0.01037738, + "balance_loss_clip": 1.02795172, + "balance_loss_mlp": 1.02605009, + "epoch": 0.756711258079062, + "flos": 21754827694080.0, + "grad_norm": 1.8861325485759237, + "language_loss": 0.83041251, + "learning_rate": 5.893020280953493e-07, + "loss": 0.85146844, + "num_input_tokens_seen": 271475730, + "step": 12586, + "time_per_iteration": 2.5852432250976562 + }, + { + "auxiliary_loss_clip": 0.01067345, + "auxiliary_loss_mlp": 0.01028534, + "balance_loss_clip": 1.02729261, + "balance_loss_mlp": 1.01815677, + "epoch": 0.75677138133173, + "flos": 22382905933440.0, + "grad_norm": 2.0477682532975434, + "language_loss": 0.83526421, + "learning_rate": 5.890259809517459e-07, + "loss": 0.85622299, + "num_input_tokens_seen": 271495030, + "step": 12587, + "time_per_iteration": 2.5716793537139893 + }, + { + "auxiliary_loss_clip": 0.01029816, + "auxiliary_loss_mlp": 0.01025034, + "balance_loss_clip": 1.02429891, + "balance_loss_mlp": 1.01432312, + "epoch": 0.756831504584398, + "flos": 22708974620160.0, + "grad_norm": 1.489733727769345, + "language_loss": 0.70818007, + "learning_rate": 5.88749987312418e-07, + "loss": 0.72872859, + "num_input_tokens_seen": 271515355, + "step": 12588, + "time_per_iteration": 2.755892515182495 + }, + { + "auxiliary_loss_clip": 0.01064843, + "auxiliary_loss_mlp": 0.00747804, + "balance_loss_clip": 1.02613974, + "balance_loss_mlp": 1.00040984, + "epoch": 0.756891627837066, + "flos": 24098358643200.0, + "grad_norm": 1.715471858694322, + "language_loss": 0.6900301, + "learning_rate": 5.884740471878327e-07, + "loss": 0.70815659, + "num_input_tokens_seen": 271535090, + "step": 12589, + "time_per_iteration": 2.573558807373047 + }, + { + "auxiliary_loss_clip": 0.01052193, + "auxiliary_loss_mlp": 0.01026483, + "balance_loss_clip": 1.02434099, + "balance_loss_mlp": 1.01577795, + "epoch": 0.756951751089734, + "flos": 19749001438080.0, + "grad_norm": 1.7292772326989703, + "language_loss": 0.92459702, + "learning_rate": 5.881981605884522e-07, + "loss": 0.94538379, + "num_input_tokens_seen": 271551075, + "step": 12590, + "time_per_iteration": 2.6087002754211426 + }, + { + "auxiliary_loss_clip": 0.01034616, + "auxiliary_loss_mlp": 0.01030757, + "balance_loss_clip": 1.02256417, + "balance_loss_mlp": 1.01990914, + "epoch": 0.7570118743424019, + "flos": 35079266551680.0, + "grad_norm": 1.8278196162586666, + "language_loss": 0.65527761, + "learning_rate": 5.879223275247391e-07, + "loss": 0.67593133, + "num_input_tokens_seen": 271571035, + "step": 12591, + "time_per_iteration": 2.719241142272949 + }, + { + "auxiliary_loss_clip": 0.01051881, + "auxiliary_loss_mlp": 0.01024959, + "balance_loss_clip": 1.02514172, + "balance_loss_mlp": 1.01529694, + "epoch": 0.7570719975950699, + "flos": 25594540778880.0, + "grad_norm": 1.5128943047277288, + "language_loss": 0.73555309, + "learning_rate": 5.876465480071528e-07, + "loss": 0.75632143, + "num_input_tokens_seen": 271592950, + "step": 12592, + "time_per_iteration": 2.6491901874542236 + }, + { + "auxiliary_loss_clip": 0.01054434, + "auxiliary_loss_mlp": 0.01035138, + "balance_loss_clip": 1.02526021, + "balance_loss_mlp": 1.02413487, + "epoch": 0.7571321208477378, + "flos": 10816223028480.0, + "grad_norm": 2.313527820243462, + "language_loss": 0.71542639, + "learning_rate": 5.873708220461522e-07, + "loss": 0.73632216, + "num_input_tokens_seen": 271608835, + "step": 12593, + "time_per_iteration": 4.219806432723999 + }, + { + "auxiliary_loss_clip": 0.01064439, + "auxiliary_loss_mlp": 0.01029365, + "balance_loss_clip": 1.02594006, + "balance_loss_mlp": 1.01839781, + "epoch": 0.7571922441004059, + "flos": 18260109763200.0, + "grad_norm": 1.935917811752988, + "language_loss": 0.66084433, + "learning_rate": 5.870951496521903e-07, + "loss": 0.68178236, + "num_input_tokens_seen": 271627730, + "step": 12594, + "time_per_iteration": 4.263274908065796 + }, + { + "auxiliary_loss_clip": 0.01035824, + "auxiliary_loss_mlp": 0.01034461, + "balance_loss_clip": 1.02525067, + "balance_loss_mlp": 1.02330303, + "epoch": 0.7572523673530738, + "flos": 22890502978560.0, + "grad_norm": 1.629422632824441, + "language_loss": 0.80672646, + "learning_rate": 5.86819530835722e-07, + "loss": 0.82742929, + "num_input_tokens_seen": 271646415, + "step": 12595, + "time_per_iteration": 2.793649196624756 + }, + { + "auxiliary_loss_clip": 0.01035794, + "auxiliary_loss_mlp": 0.01030223, + "balance_loss_clip": 1.02735543, + "balance_loss_mlp": 1.01960695, + "epoch": 0.7573124906057418, + "flos": 20996323171200.0, + "grad_norm": 2.63969800174664, + "language_loss": 0.71889579, + "learning_rate": 5.865439656071993e-07, + "loss": 0.73955595, + "num_input_tokens_seen": 271666240, + "step": 12596, + "time_per_iteration": 2.723789930343628 + }, + { + "auxiliary_loss_clip": 0.00976362, + "auxiliary_loss_mlp": 0.01029161, + "balance_loss_clip": 1.02602839, + "balance_loss_mlp": 1.01911151, + "epoch": 0.7573726138584097, + "flos": 20886292834560.0, + "grad_norm": 1.44640522706473, + "language_loss": 0.8018626, + "learning_rate": 5.862684539770706e-07, + "loss": 0.82191777, + "num_input_tokens_seen": 271686370, + "step": 12597, + "time_per_iteration": 3.1363556385040283 + }, + { + "auxiliary_loss_clip": 0.0103818, + "auxiliary_loss_mlp": 0.01027614, + "balance_loss_clip": 1.02783918, + "balance_loss_mlp": 1.01594973, + "epoch": 0.7574327371110777, + "flos": 24530507170560.0, + "grad_norm": 2.783005314126316, + "language_loss": 0.83075434, + "learning_rate": 5.859929959557835e-07, + "loss": 0.85141224, + "num_input_tokens_seen": 271705050, + "step": 12598, + "time_per_iteration": 3.500241279602051 + }, + { + "auxiliary_loss_clip": 0.01042913, + "auxiliary_loss_mlp": 0.0102482, + "balance_loss_clip": 1.02581453, + "balance_loss_mlp": 1.01509845, + "epoch": 0.7574928603637456, + "flos": 23364523785600.0, + "grad_norm": 1.603512225587816, + "language_loss": 0.62214541, + "learning_rate": 5.857175915537845e-07, + "loss": 0.64282274, + "num_input_tokens_seen": 271724915, + "step": 12599, + "time_per_iteration": 2.7323033809661865 + }, + { + "auxiliary_loss_clip": 0.01036535, + "auxiliary_loss_mlp": 0.00747859, + "balance_loss_clip": 1.02455473, + "balance_loss_mlp": 1.00043344, + "epoch": 0.7575529836164137, + "flos": 13516274419200.0, + "grad_norm": 2.5691421211719954, + "language_loss": 0.63253486, + "learning_rate": 5.854422407815161e-07, + "loss": 0.65037882, + "num_input_tokens_seen": 271742410, + "step": 12600, + "time_per_iteration": 2.627790927886963 + }, + { + "auxiliary_loss_clip": 0.01035219, + "auxiliary_loss_mlp": 0.01031357, + "balance_loss_clip": 1.02258599, + "balance_loss_mlp": 1.01982975, + "epoch": 0.7576131068690816, + "flos": 19646584784640.0, + "grad_norm": 2.1464237596232416, + "language_loss": 0.66404498, + "learning_rate": 5.851669436494191e-07, + "loss": 0.68471074, + "num_input_tokens_seen": 271761425, + "step": 12601, + "time_per_iteration": 2.7742679119110107 + }, + { + "auxiliary_loss_clip": 0.01033039, + "auxiliary_loss_mlp": 0.01029294, + "balance_loss_clip": 1.02408397, + "balance_loss_mlp": 1.01945949, + "epoch": 0.7576732301217496, + "flos": 20048245643520.0, + "grad_norm": 1.4745160296333175, + "language_loss": 0.67592174, + "learning_rate": 5.848917001679335e-07, + "loss": 0.69654512, + "num_input_tokens_seen": 271780875, + "step": 12602, + "time_per_iteration": 2.767714738845825 + }, + { + "auxiliary_loss_clip": 0.0105551, + "auxiliary_loss_mlp": 0.0103126, + "balance_loss_clip": 1.02579188, + "balance_loss_mlp": 1.01934552, + "epoch": 0.7577333533744176, + "flos": 15377093470080.0, + "grad_norm": 1.9299448344393224, + "language_loss": 0.66701424, + "learning_rate": 5.846165103474967e-07, + "loss": 0.68788189, + "num_input_tokens_seen": 271799490, + "step": 12603, + "time_per_iteration": 2.639741897583008 + }, + { + "auxiliary_loss_clip": 0.01037907, + "auxiliary_loss_mlp": 0.01029592, + "balance_loss_clip": 1.02286446, + "balance_loss_mlp": 1.01980495, + "epoch": 0.7577934766270855, + "flos": 17894862316800.0, + "grad_norm": 1.903289281616963, + "language_loss": 0.61644113, + "learning_rate": 5.843413741985439e-07, + "loss": 0.63711607, + "num_input_tokens_seen": 271817040, + "step": 12604, + "time_per_iteration": 2.7295939922332764 + }, + { + "auxiliary_loss_clip": 0.010643, + "auxiliary_loss_mlp": 0.01030943, + "balance_loss_clip": 1.02710223, + "balance_loss_mlp": 1.01993966, + "epoch": 0.7578535998797535, + "flos": 21613770984960.0, + "grad_norm": 1.786801446356553, + "language_loss": 0.80162185, + "learning_rate": 5.840662917315076e-07, + "loss": 0.82257426, + "num_input_tokens_seen": 271835480, + "step": 12605, + "time_per_iteration": 2.6868371963500977 + }, + { + "auxiliary_loss_clip": 0.01066871, + "auxiliary_loss_mlp": 0.01026715, + "balance_loss_clip": 1.02644944, + "balance_loss_mlp": 1.01505661, + "epoch": 0.7579137231324214, + "flos": 18478374756480.0, + "grad_norm": 2.7711075670383334, + "language_loss": 0.79374409, + "learning_rate": 5.837912629568198e-07, + "loss": 0.81467998, + "num_input_tokens_seen": 271849835, + "step": 12606, + "time_per_iteration": 2.64156436920166 + }, + { + "auxiliary_loss_clip": 0.01045933, + "auxiliary_loss_mlp": 0.01029384, + "balance_loss_clip": 1.02458394, + "balance_loss_mlp": 1.02015734, + "epoch": 0.7579738463850895, + "flos": 23255032152960.0, + "grad_norm": 1.4720485681588473, + "language_loss": 0.73131078, + "learning_rate": 5.835162878849087e-07, + "loss": 0.75206399, + "num_input_tokens_seen": 271869560, + "step": 12607, + "time_per_iteration": 2.7903099060058594 + }, + { + "auxiliary_loss_clip": 0.01046856, + "auxiliary_loss_mlp": 0.01029079, + "balance_loss_clip": 1.02677035, + "balance_loss_mlp": 1.01768255, + "epoch": 0.7580339696377574, + "flos": 14027031861120.0, + "grad_norm": 1.8384012405990808, + "language_loss": 0.74922836, + "learning_rate": 5.83241366526202e-07, + "loss": 0.7699877, + "num_input_tokens_seen": 271887950, + "step": 12608, + "time_per_iteration": 2.7426962852478027 + }, + { + "auxiliary_loss_clip": 0.01030277, + "auxiliary_loss_mlp": 0.00747563, + "balance_loss_clip": 1.02402186, + "balance_loss_mlp": 1.00037682, + "epoch": 0.7580940928904254, + "flos": 25082777756160.0, + "grad_norm": 1.5655230809805263, + "language_loss": 0.71398312, + "learning_rate": 5.829664988911245e-07, + "loss": 0.73176157, + "num_input_tokens_seen": 271907700, + "step": 12609, + "time_per_iteration": 2.7344112396240234 + }, + { + "auxiliary_loss_clip": 0.01063772, + "auxiliary_loss_mlp": 0.01029091, + "balance_loss_clip": 1.02448106, + "balance_loss_mlp": 1.01715827, + "epoch": 0.7581542161430933, + "flos": 23836425690240.0, + "grad_norm": 1.5667294692002167, + "language_loss": 0.81560165, + "learning_rate": 5.826916849901007e-07, + "loss": 0.83653033, + "num_input_tokens_seen": 271926840, + "step": 12610, + "time_per_iteration": 4.6175055503845215 + }, + { + "auxiliary_loss_clip": 0.01047203, + "auxiliary_loss_mlp": 0.01031815, + "balance_loss_clip": 1.02588892, + "balance_loss_mlp": 1.0203886, + "epoch": 0.7582143393957613, + "flos": 22237000888320.0, + "grad_norm": 1.5343690080866055, + "language_loss": 0.7040633, + "learning_rate": 5.824169248335488e-07, + "loss": 0.72485346, + "num_input_tokens_seen": 271946465, + "step": 12611, + "time_per_iteration": 2.665165424346924 + }, + { + "auxiliary_loss_clip": 0.01063171, + "auxiliary_loss_mlp": 0.0102577, + "balance_loss_clip": 1.02539003, + "balance_loss_mlp": 1.01486874, + "epoch": 0.7582744626484292, + "flos": 21106389421440.0, + "grad_norm": 1.9433725715263275, + "language_loss": 0.708763, + "learning_rate": 5.821422184318893e-07, + "loss": 0.7296524, + "num_input_tokens_seen": 271967295, + "step": 12612, + "time_per_iteration": 2.6211161613464355 + }, + { + "auxiliary_loss_clip": 0.0100599, + "auxiliary_loss_mlp": 0.01039707, + "balance_loss_clip": 1.02345407, + "balance_loss_mlp": 1.02829301, + "epoch": 0.7583345859010973, + "flos": 24604770539520.0, + "grad_norm": 1.581707250137785, + "language_loss": 0.59667897, + "learning_rate": 5.818675657955397e-07, + "loss": 0.61713588, + "num_input_tokens_seen": 271987960, + "step": 12613, + "time_per_iteration": 2.838866710662842 + }, + { + "auxiliary_loss_clip": 0.01037305, + "auxiliary_loss_mlp": 0.01036165, + "balance_loss_clip": 1.02311015, + "balance_loss_mlp": 1.0242027, + "epoch": 0.7583947091537652, + "flos": 33546814657920.0, + "grad_norm": 1.5986162711662903, + "language_loss": 0.59725893, + "learning_rate": 5.815929669349135e-07, + "loss": 0.61799359, + "num_input_tokens_seen": 272011780, + "step": 12614, + "time_per_iteration": 2.7818922996520996 + }, + { + "auxiliary_loss_clip": 0.01027809, + "auxiliary_loss_mlp": 0.01029152, + "balance_loss_clip": 1.02232933, + "balance_loss_mlp": 1.0181911, + "epoch": 0.7584548324064332, + "flos": 20121000641280.0, + "grad_norm": 2.1074028531651594, + "language_loss": 0.73199821, + "learning_rate": 5.813184218604246e-07, + "loss": 0.75256777, + "num_input_tokens_seen": 272030825, + "step": 12615, + "time_per_iteration": 2.749972343444824 + }, + { + "auxiliary_loss_clip": 0.00990184, + "auxiliary_loss_mlp": 0.01002048, + "balance_loss_clip": 1.0039196, + "balance_loss_mlp": 1.00096941, + "epoch": 0.7585149556591012, + "flos": 70402584061440.0, + "grad_norm": 0.8090880644521228, + "language_loss": 0.67712092, + "learning_rate": 5.810439305824828e-07, + "loss": 0.6970433, + "num_input_tokens_seen": 272095825, + "step": 12616, + "time_per_iteration": 3.2521190643310547 + }, + { + "auxiliary_loss_clip": 0.0103259, + "auxiliary_loss_mlp": 0.01032635, + "balance_loss_clip": 1.02694488, + "balance_loss_mlp": 1.0209167, + "epoch": 0.7585750789117691, + "flos": 16143786293760.0, + "grad_norm": 1.8184477181058911, + "language_loss": 0.84865719, + "learning_rate": 5.807694931114979e-07, + "loss": 0.86930937, + "num_input_tokens_seen": 272113950, + "step": 12617, + "time_per_iteration": 2.682651996612549 + }, + { + "auxiliary_loss_clip": 0.01033605, + "auxiliary_loss_mlp": 0.01029564, + "balance_loss_clip": 1.0262543, + "balance_loss_mlp": 1.01956868, + "epoch": 0.7586352021644371, + "flos": 17493165544320.0, + "grad_norm": 8.498622302186918, + "language_loss": 0.74819267, + "learning_rate": 5.804951094578757e-07, + "loss": 0.76882434, + "num_input_tokens_seen": 272130315, + "step": 12618, + "time_per_iteration": 2.7203876972198486 + }, + { + "auxiliary_loss_clip": 0.01047236, + "auxiliary_loss_mlp": 0.01032972, + "balance_loss_clip": 1.02644205, + "balance_loss_mlp": 1.02148652, + "epoch": 0.758695325417105, + "flos": 17275187859840.0, + "grad_norm": 1.9199325556643183, + "language_loss": 0.77172399, + "learning_rate": 5.802207796320209e-07, + "loss": 0.79252607, + "num_input_tokens_seen": 272149080, + "step": 12619, + "time_per_iteration": 2.6727354526519775 + }, + { + "auxiliary_loss_clip": 0.01023629, + "auxiliary_loss_mlp": 0.01032659, + "balance_loss_clip": 1.02258468, + "balance_loss_mlp": 1.02158475, + "epoch": 0.7587554486697731, + "flos": 29495660163840.0, + "grad_norm": 1.9787669054595869, + "language_loss": 0.82447183, + "learning_rate": 5.79946503644337e-07, + "loss": 0.84503472, + "num_input_tokens_seen": 272168285, + "step": 12620, + "time_per_iteration": 2.7550363540649414 + }, + { + "auxiliary_loss_clip": 0.01040829, + "auxiliary_loss_mlp": 0.01034155, + "balance_loss_clip": 1.02374291, + "balance_loss_mlp": 1.02167416, + "epoch": 0.758815571922441, + "flos": 16100800692480.0, + "grad_norm": 2.3928261191320552, + "language_loss": 0.82296968, + "learning_rate": 5.796722815052242e-07, + "loss": 0.8437196, + "num_input_tokens_seen": 272184585, + "step": 12621, + "time_per_iteration": 4.576429843902588 + }, + { + "auxiliary_loss_clip": 0.01044706, + "auxiliary_loss_mlp": 0.01030285, + "balance_loss_clip": 1.02516222, + "balance_loss_mlp": 1.01925194, + "epoch": 0.758875695175109, + "flos": 16143714466560.0, + "grad_norm": 2.149973611645903, + "language_loss": 0.73402584, + "learning_rate": 5.7939811322508e-07, + "loss": 0.75477576, + "num_input_tokens_seen": 272200205, + "step": 12622, + "time_per_iteration": 2.744516372680664 + }, + { + "auxiliary_loss_clip": 0.00996881, + "auxiliary_loss_mlp": 0.01001565, + "balance_loss_clip": 1.00151706, + "balance_loss_mlp": 1.00058794, + "epoch": 0.7589358184277769, + "flos": 68462006860800.0, + "grad_norm": 0.8398412150287777, + "language_loss": 0.60851693, + "learning_rate": 5.791239988143024e-07, + "loss": 0.62850142, + "num_input_tokens_seen": 272259670, + "step": 12623, + "time_per_iteration": 3.184614419937134 + }, + { + "auxiliary_loss_clip": 0.01061319, + "auxiliary_loss_mlp": 0.0102872, + "balance_loss_clip": 1.02526772, + "balance_loss_mlp": 1.01884902, + "epoch": 0.7589959416804449, + "flos": 20047311889920.0, + "grad_norm": 3.6280493995912373, + "language_loss": 0.67216563, + "learning_rate": 5.788499382832847e-07, + "loss": 0.693066, + "num_input_tokens_seen": 272277925, + "step": 12624, + "time_per_iteration": 2.7091591358184814 + }, + { + "auxiliary_loss_clip": 0.01062258, + "auxiliary_loss_mlp": 0.01025546, + "balance_loss_clip": 1.02528572, + "balance_loss_mlp": 1.01451313, + "epoch": 0.7590560649331128, + "flos": 18771800958720.0, + "grad_norm": 1.7516325046060794, + "language_loss": 0.75994521, + "learning_rate": 5.785759316424196e-07, + "loss": 0.78082323, + "num_input_tokens_seen": 272296010, + "step": 12625, + "time_per_iteration": 2.573509931564331 + }, + { + "auxiliary_loss_clip": 0.01035225, + "auxiliary_loss_mlp": 0.01038617, + "balance_loss_clip": 1.0231111, + "balance_loss_mlp": 1.02636266, + "epoch": 0.7591161881857809, + "flos": 29825284296960.0, + "grad_norm": 1.9746803216213415, + "language_loss": 0.62567741, + "learning_rate": 5.783019789020977e-07, + "loss": 0.64641577, + "num_input_tokens_seen": 272318330, + "step": 12626, + "time_per_iteration": 2.719313621520996 + }, + { + "auxiliary_loss_clip": 0.01032043, + "auxiliary_loss_mlp": 0.00747627, + "balance_loss_clip": 1.0309099, + "balance_loss_mlp": 1.00036991, + "epoch": 0.7591763114384488, + "flos": 20302708567680.0, + "grad_norm": 2.070382552473805, + "language_loss": 0.74061173, + "learning_rate": 5.780280800727084e-07, + "loss": 0.75840843, + "num_input_tokens_seen": 272335265, + "step": 12627, + "time_per_iteration": 2.787191152572632 + }, + { + "auxiliary_loss_clip": 0.01056491, + "auxiliary_loss_mlp": 0.01025054, + "balance_loss_clip": 1.0273881, + "balance_loss_mlp": 1.01437879, + "epoch": 0.7592364346911168, + "flos": 20813609664000.0, + "grad_norm": 2.912325727824478, + "language_loss": 0.68658304, + "learning_rate": 5.777542351646356e-07, + "loss": 0.70739853, + "num_input_tokens_seen": 272354795, + "step": 12628, + "time_per_iteration": 2.7177774906158447 + }, + { + "auxiliary_loss_clip": 0.0106029, + "auxiliary_loss_mlp": 0.01031611, + "balance_loss_clip": 1.02843142, + "balance_loss_mlp": 1.01970208, + "epoch": 0.7592965579437848, + "flos": 21251504367360.0, + "grad_norm": 3.5434042953392355, + "language_loss": 0.62709278, + "learning_rate": 5.774804441882648e-07, + "loss": 0.6480118, + "num_input_tokens_seen": 272372875, + "step": 12629, + "time_per_iteration": 2.626065492630005 + }, + { + "auxiliary_loss_clip": 0.01037405, + "auxiliary_loss_mlp": 0.01025362, + "balance_loss_clip": 1.02168894, + "balance_loss_mlp": 1.0152055, + "epoch": 0.7593566811964527, + "flos": 26213604704640.0, + "grad_norm": 1.6655378606726456, + "language_loss": 0.7781384, + "learning_rate": 5.772067071539786e-07, + "loss": 0.79876602, + "num_input_tokens_seen": 272394715, + "step": 12630, + "time_per_iteration": 2.8096072673797607 + }, + { + "auxiliary_loss_clip": 0.01006036, + "auxiliary_loss_mlp": 0.01001079, + "balance_loss_clip": 1.00070465, + "balance_loss_mlp": 1.00013149, + "epoch": 0.7594168044491207, + "flos": 71237255374080.0, + "grad_norm": 0.8128831199054898, + "language_loss": 0.61500543, + "learning_rate": 5.769330240721562e-07, + "loss": 0.63507658, + "num_input_tokens_seen": 272458775, + "step": 12631, + "time_per_iteration": 3.281766414642334 + }, + { + "auxiliary_loss_clip": 0.01035078, + "auxiliary_loss_mlp": 0.00747834, + "balance_loss_clip": 1.024804, + "balance_loss_mlp": 1.00044143, + "epoch": 0.7594769277017887, + "flos": 26613326229120.0, + "grad_norm": 1.9141694761427095, + "language_loss": 0.74371773, + "learning_rate": 5.766593949531767e-07, + "loss": 0.76154685, + "num_input_tokens_seen": 272479355, + "step": 12632, + "time_per_iteration": 2.7707128524780273 + }, + { + "auxiliary_loss_clip": 0.01043029, + "auxiliary_loss_mlp": 0.01028693, + "balance_loss_clip": 1.02504802, + "balance_loss_mlp": 1.01826835, + "epoch": 0.7595370509544567, + "flos": 17595941333760.0, + "grad_norm": 2.038127690214916, + "language_loss": 0.74951315, + "learning_rate": 5.763858198074154e-07, + "loss": 0.77023041, + "num_input_tokens_seen": 272493555, + "step": 12633, + "time_per_iteration": 2.6418192386627197 + }, + { + "auxiliary_loss_clip": 0.01041307, + "auxiliary_loss_mlp": 0.01026056, + "balance_loss_clip": 1.02418208, + "balance_loss_mlp": 1.01592946, + "epoch": 0.7595971742071246, + "flos": 18002953319040.0, + "grad_norm": 2.028711148432269, + "language_loss": 0.73419929, + "learning_rate": 5.76112298645246e-07, + "loss": 0.75487292, + "num_input_tokens_seen": 272508925, + "step": 12634, + "time_per_iteration": 2.647770404815674 + }, + { + "auxiliary_loss_clip": 0.01065167, + "auxiliary_loss_mlp": 0.01029057, + "balance_loss_clip": 1.02669072, + "balance_loss_mlp": 1.01840544, + "epoch": 0.7596572974597926, + "flos": 28840326480000.0, + "grad_norm": 1.702401618190503, + "language_loss": 0.65125847, + "learning_rate": 5.758388314770408e-07, + "loss": 0.67220068, + "num_input_tokens_seen": 272528805, + "step": 12635, + "time_per_iteration": 2.7753796577453613 + }, + { + "auxiliary_loss_clip": 0.01009094, + "auxiliary_loss_mlp": 0.01035549, + "balance_loss_clip": 1.0220201, + "balance_loss_mlp": 1.02220976, + "epoch": 0.7597174207124605, + "flos": 14282823588480.0, + "grad_norm": 2.1512768171308028, + "language_loss": 0.68786091, + "learning_rate": 5.7556541831317e-07, + "loss": 0.70830733, + "num_input_tokens_seen": 272546655, + "step": 12636, + "time_per_iteration": 2.7720229625701904 + }, + { + "auxiliary_loss_clip": 0.0104491, + "auxiliary_loss_mlp": 0.01029047, + "balance_loss_clip": 1.0261867, + "balance_loss_mlp": 1.01880145, + "epoch": 0.7597775439651285, + "flos": 21688932193920.0, + "grad_norm": 1.8137237511099689, + "language_loss": 0.80902654, + "learning_rate": 5.752920591640018e-07, + "loss": 0.82976615, + "num_input_tokens_seen": 272564010, + "step": 12637, + "time_per_iteration": 2.632469654083252 + }, + { + "auxiliary_loss_clip": 0.01052279, + "auxiliary_loss_mlp": 0.01028641, + "balance_loss_clip": 1.02491224, + "balance_loss_mlp": 1.01791191, + "epoch": 0.7598376672177964, + "flos": 36101248312320.0, + "grad_norm": 2.092982908721765, + "language_loss": 0.66345412, + "learning_rate": 5.750187540399017e-07, + "loss": 0.68426335, + "num_input_tokens_seen": 272585840, + "step": 12638, + "time_per_iteration": 2.775538444519043 + }, + { + "auxiliary_loss_clip": 0.01065442, + "auxiliary_loss_mlp": 0.01033164, + "balance_loss_clip": 1.02709091, + "balance_loss_mlp": 1.0215292, + "epoch": 0.7598977904704645, + "flos": 18332326056960.0, + "grad_norm": 2.590319054291025, + "language_loss": 0.64843428, + "learning_rate": 5.747455029512323e-07, + "loss": 0.66942036, + "num_input_tokens_seen": 272602300, + "step": 12639, + "time_per_iteration": 4.230057954788208 + }, + { + "auxiliary_loss_clip": 0.01050097, + "auxiliary_loss_mlp": 0.01027166, + "balance_loss_clip": 1.02329087, + "balance_loss_mlp": 1.01608586, + "epoch": 0.7599579137231324, + "flos": 20192642317440.0, + "grad_norm": 1.96186844776188, + "language_loss": 0.70392978, + "learning_rate": 5.744723059083572e-07, + "loss": 0.72470248, + "num_input_tokens_seen": 272619595, + "step": 12640, + "time_per_iteration": 2.725287914276123 + }, + { + "auxiliary_loss_clip": 0.01046884, + "auxiliary_loss_mlp": 0.01028052, + "balance_loss_clip": 1.0263983, + "balance_loss_mlp": 1.01597571, + "epoch": 0.7600180369758004, + "flos": 24024849459840.0, + "grad_norm": 1.9475342416701376, + "language_loss": 0.67220956, + "learning_rate": 5.741991629216343e-07, + "loss": 0.69295895, + "num_input_tokens_seen": 272638825, + "step": 12641, + "time_per_iteration": 2.7752318382263184 + }, + { + "auxiliary_loss_clip": 0.01052523, + "auxiliary_loss_mlp": 0.01032625, + "balance_loss_clip": 1.02300262, + "balance_loss_mlp": 1.02069807, + "epoch": 0.7600781602284684, + "flos": 18989527248000.0, + "grad_norm": 2.2543172082173935, + "language_loss": 0.66753846, + "learning_rate": 5.73926074001422e-07, + "loss": 0.6883899, + "num_input_tokens_seen": 272657240, + "step": 12642, + "time_per_iteration": 4.343684911727905 + }, + { + "auxiliary_loss_clip": 0.01046509, + "auxiliary_loss_mlp": 0.0102984, + "balance_loss_clip": 1.02798378, + "balance_loss_mlp": 1.01897407, + "epoch": 0.7601382834811363, + "flos": 26067520091520.0, + "grad_norm": 2.869250352899632, + "language_loss": 0.75275606, + "learning_rate": 5.736530391580765e-07, + "loss": 0.77351958, + "num_input_tokens_seen": 272677520, + "step": 12643, + "time_per_iteration": 2.700465440750122 + }, + { + "auxiliary_loss_clip": 0.01036115, + "auxiliary_loss_mlp": 0.01033106, + "balance_loss_clip": 1.02703214, + "balance_loss_mlp": 1.02111316, + "epoch": 0.7601984067338043, + "flos": 18844232734080.0, + "grad_norm": 1.7336486527750004, + "language_loss": 0.78453612, + "learning_rate": 5.733800584019508e-07, + "loss": 0.80522835, + "num_input_tokens_seen": 272696770, + "step": 12644, + "time_per_iteration": 2.7331182956695557 + }, + { + "auxiliary_loss_clip": 0.01034999, + "auxiliary_loss_mlp": 0.01027293, + "balance_loss_clip": 1.02256417, + "balance_loss_mlp": 1.0161593, + "epoch": 0.7602585299864723, + "flos": 24646391424000.0, + "grad_norm": 1.471419688354859, + "language_loss": 0.80367386, + "learning_rate": 5.731071317433957e-07, + "loss": 0.82429677, + "num_input_tokens_seen": 272718340, + "step": 12645, + "time_per_iteration": 2.673398017883301 + }, + { + "auxiliary_loss_clip": 0.01039336, + "auxiliary_loss_mlp": 0.01030115, + "balance_loss_clip": 1.02641582, + "balance_loss_mlp": 1.01901078, + "epoch": 0.7603186532391403, + "flos": 23842100039040.0, + "grad_norm": 1.6183337326350271, + "language_loss": 0.73144984, + "learning_rate": 5.728342591927611e-07, + "loss": 0.75214434, + "num_input_tokens_seen": 272739575, + "step": 12646, + "time_per_iteration": 2.693739414215088 + }, + { + "auxiliary_loss_clip": 0.01052221, + "auxiliary_loss_mlp": 0.01031249, + "balance_loss_clip": 1.02495587, + "balance_loss_mlp": 1.0209074, + "epoch": 0.7603787764918082, + "flos": 22199905117440.0, + "grad_norm": 2.2008627075976777, + "language_loss": 0.68006945, + "learning_rate": 5.725614407603949e-07, + "loss": 0.70090413, + "num_input_tokens_seen": 272758710, + "step": 12647, + "time_per_iteration": 2.606097936630249 + }, + { + "auxiliary_loss_clip": 0.00996882, + "auxiliary_loss_mlp": 0.01001876, + "balance_loss_clip": 1.00138676, + "balance_loss_mlp": 1.00094652, + "epoch": 0.7604388997444762, + "flos": 54086894254080.0, + "grad_norm": 0.6709419230815229, + "language_loss": 0.48956791, + "learning_rate": 5.722886764566415e-07, + "loss": 0.50955546, + "num_input_tokens_seen": 272814855, + "step": 12648, + "time_per_iteration": 3.1897644996643066 + }, + { + "auxiliary_loss_clip": 0.0105021, + "auxiliary_loss_mlp": 0.01030704, + "balance_loss_clip": 1.02424049, + "balance_loss_mlp": 1.02048755, + "epoch": 0.7604990229971441, + "flos": 19681920789120.0, + "grad_norm": 1.5947729393505212, + "language_loss": 0.76330143, + "learning_rate": 5.720159662918451e-07, + "loss": 0.78411061, + "num_input_tokens_seen": 272834400, + "step": 12649, + "time_per_iteration": 2.5655925273895264 + }, + { + "auxiliary_loss_clip": 0.01029558, + "auxiliary_loss_mlp": 0.01028858, + "balance_loss_clip": 1.02468657, + "balance_loss_mlp": 1.01741457, + "epoch": 0.7605591462498121, + "flos": 25228036356480.0, + "grad_norm": 2.2944632272411374, + "language_loss": 0.68513083, + "learning_rate": 5.717433102763462e-07, + "loss": 0.705715, + "num_input_tokens_seen": 272854760, + "step": 12650, + "time_per_iteration": 2.742628574371338 + }, + { + "auxiliary_loss_clip": 0.00997007, + "auxiliary_loss_mlp": 0.01001198, + "balance_loss_clip": 1.00154364, + "balance_loss_mlp": 1.00032222, + "epoch": 0.76061926950248, + "flos": 66783757662720.0, + "grad_norm": 0.7618784345479069, + "language_loss": 0.6279189, + "learning_rate": 5.714707084204838e-07, + "loss": 0.64790094, + "num_input_tokens_seen": 272919030, + "step": 12651, + "time_per_iteration": 3.1778533458709717 + }, + { + "auxiliary_loss_clip": 0.01030946, + "auxiliary_loss_mlp": 0.01029515, + "balance_loss_clip": 1.0235045, + "balance_loss_mlp": 1.01938796, + "epoch": 0.7606793927551481, + "flos": 25338354001920.0, + "grad_norm": 1.4652378099218164, + "language_loss": 0.71318316, + "learning_rate": 5.711981607345951e-07, + "loss": 0.73378778, + "num_input_tokens_seen": 272938925, + "step": 12652, + "time_per_iteration": 2.7460153102874756 + }, + { + "auxiliary_loss_clip": 0.01014131, + "auxiliary_loss_mlp": 0.01034562, + "balance_loss_clip": 1.02496696, + "balance_loss_mlp": 1.02346373, + "epoch": 0.760739516007816, + "flos": 18223624523520.0, + "grad_norm": 1.8420960042249495, + "language_loss": 0.80356139, + "learning_rate": 5.709256672290152e-07, + "loss": 0.82404834, + "num_input_tokens_seen": 272954945, + "step": 12653, + "time_per_iteration": 2.8138325214385986 + }, + { + "auxiliary_loss_clip": 0.01067766, + "auxiliary_loss_mlp": 0.01028791, + "balance_loss_clip": 1.02724016, + "balance_loss_mlp": 1.01816368, + "epoch": 0.760799639260484, + "flos": 22559119079040.0, + "grad_norm": 3.762168278829408, + "language_loss": 0.80302596, + "learning_rate": 5.706532279140785e-07, + "loss": 0.82399154, + "num_input_tokens_seen": 272972855, + "step": 12654, + "time_per_iteration": 2.601738929748535 + }, + { + "auxiliary_loss_clip": 0.01030518, + "auxiliary_loss_mlp": 0.01035085, + "balance_loss_clip": 1.02278018, + "balance_loss_mlp": 1.0237062, + "epoch": 0.760859762513152, + "flos": 22309324922880.0, + "grad_norm": 2.9902743486315817, + "language_loss": 0.7939254, + "learning_rate": 5.703808428001136e-07, + "loss": 0.81458151, + "num_input_tokens_seen": 272989895, + "step": 12655, + "time_per_iteration": 2.672255516052246 + }, + { + "auxiliary_loss_clip": 0.01050781, + "auxiliary_loss_mlp": 0.01022152, + "balance_loss_clip": 1.0250746, + "balance_loss_mlp": 1.01320505, + "epoch": 0.7609198857658199, + "flos": 24863902231680.0, + "grad_norm": 1.5598397614277806, + "language_loss": 0.68594706, + "learning_rate": 5.701085118974505e-07, + "loss": 0.70667636, + "num_input_tokens_seen": 273011695, + "step": 12656, + "time_per_iteration": 2.699235200881958 + }, + { + "auxiliary_loss_clip": 0.01050402, + "auxiliary_loss_mlp": 0.01025711, + "balance_loss_clip": 1.02121377, + "balance_loss_mlp": 1.01432681, + "epoch": 0.760980009018488, + "flos": 16836790366080.0, + "grad_norm": 2.513070517668213, + "language_loss": 0.73072577, + "learning_rate": 5.698362352164164e-07, + "loss": 0.7514869, + "num_input_tokens_seen": 273028815, + "step": 12657, + "time_per_iteration": 2.6326050758361816 + }, + { + "auxiliary_loss_clip": 0.00989436, + "auxiliary_loss_mlp": 0.01000948, + "balance_loss_clip": 1.00354719, + "balance_loss_mlp": 1.00006604, + "epoch": 0.7610401322711559, + "flos": 61230603029760.0, + "grad_norm": 0.8632557790752134, + "language_loss": 0.64942569, + "learning_rate": 5.695640127673347e-07, + "loss": 0.66932952, + "num_input_tokens_seen": 273084080, + "step": 12658, + "time_per_iteration": 4.960347414016724 + }, + { + "auxiliary_loss_clip": 0.01050768, + "auxiliary_loss_mlp": 0.0103083, + "balance_loss_clip": 1.02452731, + "balance_loss_mlp": 1.0207386, + "epoch": 0.7611002555238239, + "flos": 19640730867840.0, + "grad_norm": 1.5921671381860234, + "language_loss": 0.79687965, + "learning_rate": 5.692918445605293e-07, + "loss": 0.81769562, + "num_input_tokens_seen": 273102295, + "step": 12659, + "time_per_iteration": 2.6146793365478516 + }, + { + "auxiliary_loss_clip": 0.01051703, + "auxiliary_loss_mlp": 0.0102612, + "balance_loss_clip": 1.02429783, + "balance_loss_mlp": 1.01566505, + "epoch": 0.7611603787764918, + "flos": 26872206526080.0, + "grad_norm": 1.4870568999095641, + "language_loss": 0.68822443, + "learning_rate": 5.690197306063209e-07, + "loss": 0.70900261, + "num_input_tokens_seen": 273123400, + "step": 12660, + "time_per_iteration": 2.6960415840148926 + }, + { + "auxiliary_loss_clip": 0.01063041, + "auxiliary_loss_mlp": 0.01028617, + "balance_loss_clip": 1.02458382, + "balance_loss_mlp": 1.01825142, + "epoch": 0.7612205020291598, + "flos": 27344252085120.0, + "grad_norm": 1.6996006248135793, + "language_loss": 0.7018857, + "learning_rate": 5.687476709150281e-07, + "loss": 0.72280228, + "num_input_tokens_seen": 273145150, + "step": 12661, + "time_per_iteration": 2.654395580291748 + }, + { + "auxiliary_loss_clip": 0.01052443, + "auxiliary_loss_mlp": 0.01030248, + "balance_loss_clip": 1.02409577, + "balance_loss_mlp": 1.01972795, + "epoch": 0.7612806252818277, + "flos": 29314598682240.0, + "grad_norm": 1.5658435767019323, + "language_loss": 0.83430767, + "learning_rate": 5.68475665496966e-07, + "loss": 0.85513455, + "num_input_tokens_seen": 273165180, + "step": 12662, + "time_per_iteration": 2.7663331031799316 + }, + { + "auxiliary_loss_clip": 0.01041345, + "auxiliary_loss_mlp": 0.01034822, + "balance_loss_clip": 1.02349424, + "balance_loss_mlp": 1.02448058, + "epoch": 0.7613407485344957, + "flos": 19026048401280.0, + "grad_norm": 1.6896517138321672, + "language_loss": 0.69113004, + "learning_rate": 5.682037143624505e-07, + "loss": 0.71189177, + "num_input_tokens_seen": 273184005, + "step": 12663, + "time_per_iteration": 2.6528830528259277 + }, + { + "auxiliary_loss_clip": 0.01052333, + "auxiliary_loss_mlp": 0.0102589, + "balance_loss_clip": 1.02532732, + "balance_loss_mlp": 1.01541185, + "epoch": 0.7614008717871636, + "flos": 23256037733760.0, + "grad_norm": 1.6446130862221564, + "language_loss": 0.70277059, + "learning_rate": 5.67931817521794e-07, + "loss": 0.72355282, + "num_input_tokens_seen": 273203565, + "step": 12664, + "time_per_iteration": 2.7296624183654785 + }, + { + "auxiliary_loss_clip": 0.01058132, + "auxiliary_loss_mlp": 0.01034291, + "balance_loss_clip": 1.02727962, + "balance_loss_mlp": 1.02251339, + "epoch": 0.7614609950398317, + "flos": 21579907438080.0, + "grad_norm": 1.584323533226411, + "language_loss": 0.79191953, + "learning_rate": 5.676599749853066e-07, + "loss": 0.8128438, + "num_input_tokens_seen": 273221645, + "step": 12665, + "time_per_iteration": 2.591614246368408 + }, + { + "auxiliary_loss_clip": 0.01062666, + "auxiliary_loss_mlp": 0.0074755, + "balance_loss_clip": 1.02611685, + "balance_loss_mlp": 1.00042534, + "epoch": 0.7615211182924996, + "flos": 29277897960960.0, + "grad_norm": 1.7329759174714723, + "language_loss": 0.87789011, + "learning_rate": 5.673881867632959e-07, + "loss": 0.89599228, + "num_input_tokens_seen": 273242040, + "step": 12666, + "time_per_iteration": 2.6278181076049805 + }, + { + "auxiliary_loss_clip": 0.0101985, + "auxiliary_loss_mlp": 0.01033264, + "balance_loss_clip": 1.02513862, + "balance_loss_mlp": 1.02123547, + "epoch": 0.7615812415451676, + "flos": 13261129136640.0, + "grad_norm": 1.9722403692489328, + "language_loss": 0.83060539, + "learning_rate": 5.671164528660693e-07, + "loss": 0.85113651, + "num_input_tokens_seen": 273257365, + "step": 12667, + "time_per_iteration": 2.7287657260894775 + }, + { + "auxiliary_loss_clip": 0.01042067, + "auxiliary_loss_mlp": 0.01028762, + "balance_loss_clip": 1.02479231, + "balance_loss_mlp": 1.01910579, + "epoch": 0.7616413647978356, + "flos": 18584741905920.0, + "grad_norm": 1.6155223680066788, + "language_loss": 0.78637493, + "learning_rate": 5.668447733039296e-07, + "loss": 0.80708319, + "num_input_tokens_seen": 273274710, + "step": 12668, + "time_per_iteration": 4.400977373123169 + }, + { + "auxiliary_loss_clip": 0.01027014, + "auxiliary_loss_mlp": 0.01027544, + "balance_loss_clip": 1.02229977, + "balance_loss_mlp": 1.01735806, + "epoch": 0.7617014880505035, + "flos": 18516188799360.0, + "grad_norm": 2.127477459656297, + "language_loss": 0.6463204, + "learning_rate": 5.6657314808718e-07, + "loss": 0.66686606, + "num_input_tokens_seen": 273292870, + "step": 12669, + "time_per_iteration": 2.7116403579711914 + }, + { + "auxiliary_loss_clip": 0.01036375, + "auxiliary_loss_mlp": 0.01036159, + "balance_loss_clip": 1.02314162, + "balance_loss_mlp": 1.02326083, + "epoch": 0.7617616113031715, + "flos": 24973178382720.0, + "grad_norm": 2.1464092148814538, + "language_loss": 0.6633774, + "learning_rate": 5.663015772261202e-07, + "loss": 0.68410277, + "num_input_tokens_seen": 273312375, + "step": 12670, + "time_per_iteration": 2.6501665115356445 + }, + { + "auxiliary_loss_clip": 0.01054471, + "auxiliary_loss_mlp": 0.01033414, + "balance_loss_clip": 1.02520442, + "balance_loss_mlp": 1.02281022, + "epoch": 0.7618217345558395, + "flos": 23295036925440.0, + "grad_norm": 1.780422111653584, + "language_loss": 0.73109275, + "learning_rate": 5.660300607310493e-07, + "loss": 0.7519716, + "num_input_tokens_seen": 273332590, + "step": 12671, + "time_per_iteration": 2.60775089263916 + }, + { + "auxiliary_loss_clip": 0.01021942, + "auxiliary_loss_mlp": 0.01031433, + "balance_loss_clip": 1.02085185, + "balance_loss_mlp": 1.02042425, + "epoch": 0.7618818578085075, + "flos": 25482894330240.0, + "grad_norm": 1.6406970476175586, + "language_loss": 0.7332927, + "learning_rate": 5.657585986122613e-07, + "loss": 0.75382644, + "num_input_tokens_seen": 273352885, + "step": 12672, + "time_per_iteration": 2.790583610534668 + }, + { + "auxiliary_loss_clip": 0.00974086, + "auxiliary_loss_mlp": 0.01001187, + "balance_loss_clip": 1.00093389, + "balance_loss_mlp": 1.00013828, + "epoch": 0.7619419810611754, + "flos": 61151994115200.0, + "grad_norm": 0.7590176291803524, + "language_loss": 0.56718671, + "learning_rate": 5.654871908800506e-07, + "loss": 0.58693945, + "num_input_tokens_seen": 273411730, + "step": 12673, + "time_per_iteration": 3.2235095500946045 + }, + { + "auxiliary_loss_clip": 0.01053131, + "auxiliary_loss_mlp": 0.0102483, + "balance_loss_clip": 1.02440906, + "balance_loss_mlp": 1.01370215, + "epoch": 0.7620021043138434, + "flos": 23258659426560.0, + "grad_norm": 2.7025653740568867, + "language_loss": 0.74453247, + "learning_rate": 5.652158375447102e-07, + "loss": 0.76531202, + "num_input_tokens_seen": 273430020, + "step": 12674, + "time_per_iteration": 2.750849485397339 + }, + { + "auxiliary_loss_clip": 0.01029051, + "auxiliary_loss_mlp": 0.01031035, + "balance_loss_clip": 1.02168214, + "balance_loss_mlp": 1.01973414, + "epoch": 0.7620622275665113, + "flos": 25082490447360.0, + "grad_norm": 1.9047354328267503, + "language_loss": 0.7228964, + "learning_rate": 5.649445386165286e-07, + "loss": 0.74349725, + "num_input_tokens_seen": 273448690, + "step": 12675, + "time_per_iteration": 2.6994919776916504 + }, + { + "auxiliary_loss_clip": 0.01047203, + "auxiliary_loss_mlp": 0.01026359, + "balance_loss_clip": 1.02389765, + "balance_loss_mlp": 1.01649475, + "epoch": 0.7621223508191793, + "flos": 20155007842560.0, + "grad_norm": 2.0708080464499465, + "language_loss": 0.72494149, + "learning_rate": 5.646732941057936e-07, + "loss": 0.74567711, + "num_input_tokens_seen": 273465190, + "step": 12676, + "time_per_iteration": 2.623737096786499 + }, + { + "auxiliary_loss_clip": 0.01039101, + "auxiliary_loss_mlp": 0.00747896, + "balance_loss_clip": 1.02706742, + "balance_loss_mlp": 1.00048816, + "epoch": 0.7621824740718472, + "flos": 18000187971840.0, + "grad_norm": 2.4533786540957006, + "language_loss": 0.53939134, + "learning_rate": 5.644021040227927e-07, + "loss": 0.55726135, + "num_input_tokens_seen": 273478620, + "step": 12677, + "time_per_iteration": 2.714158773422241 + }, + { + "auxiliary_loss_clip": 0.01006673, + "auxiliary_loss_mlp": 0.01033615, + "balance_loss_clip": 1.02200711, + "balance_loss_mlp": 1.02252889, + "epoch": 0.7622425973245153, + "flos": 21725668828800.0, + "grad_norm": 1.832349284562765, + "language_loss": 0.78496385, + "learning_rate": 5.641309683778064e-07, + "loss": 0.80536675, + "num_input_tokens_seen": 273497635, + "step": 12678, + "time_per_iteration": 2.7596535682678223 + }, + { + "auxiliary_loss_clip": 0.01026422, + "auxiliary_loss_mlp": 0.0102986, + "balance_loss_clip": 1.02212644, + "balance_loss_mlp": 1.01842177, + "epoch": 0.7623027205771832, + "flos": 19718549683200.0, + "grad_norm": 1.9892428301102376, + "language_loss": 0.77290529, + "learning_rate": 5.638598871811175e-07, + "loss": 0.79346812, + "num_input_tokens_seen": 273513955, + "step": 12679, + "time_per_iteration": 2.90645432472229 + }, + { + "auxiliary_loss_clip": 0.01053155, + "auxiliary_loss_mlp": 0.01022282, + "balance_loss_clip": 1.02469814, + "balance_loss_mlp": 1.01189899, + "epoch": 0.7623628438298512, + "flos": 23988831096960.0, + "grad_norm": 1.399840306776225, + "language_loss": 0.80010247, + "learning_rate": 5.635888604430059e-07, + "loss": 0.82085687, + "num_input_tokens_seen": 273533970, + "step": 12680, + "time_per_iteration": 2.6677095890045166 + }, + { + "auxiliary_loss_clip": 0.01045328, + "auxiliary_loss_mlp": 0.01029088, + "balance_loss_clip": 1.0257988, + "balance_loss_mlp": 1.01721454, + "epoch": 0.7624229670825191, + "flos": 22345702421760.0, + "grad_norm": 1.7905153050382232, + "language_loss": 0.62700921, + "learning_rate": 5.633178881737493e-07, + "loss": 0.64775336, + "num_input_tokens_seen": 273553090, + "step": 12681, + "time_per_iteration": 2.6741511821746826 + }, + { + "auxiliary_loss_clip": 0.01033054, + "auxiliary_loss_mlp": 0.01028114, + "balance_loss_clip": 1.02583134, + "balance_loss_mlp": 1.01813042, + "epoch": 0.7624830903351871, + "flos": 22711775880960.0, + "grad_norm": 4.757897914430954, + "language_loss": 0.75887805, + "learning_rate": 5.63046970383622e-07, + "loss": 0.77948976, + "num_input_tokens_seen": 273572460, + "step": 12682, + "time_per_iteration": 2.7584738731384277 + }, + { + "auxiliary_loss_clip": 0.01039609, + "auxiliary_loss_mlp": 0.010236, + "balance_loss_clip": 1.02392995, + "balance_loss_mlp": 1.01373506, + "epoch": 0.7625432135878552, + "flos": 25593714766080.0, + "grad_norm": 1.9238143184830845, + "language_loss": 0.68201196, + "learning_rate": 5.627761070828974e-07, + "loss": 0.70264399, + "num_input_tokens_seen": 273592815, + "step": 12683, + "time_per_iteration": 2.7870981693267822 + }, + { + "auxiliary_loss_clip": 0.0102606, + "auxiliary_loss_mlp": 0.00747762, + "balance_loss_clip": 1.0224545, + "balance_loss_mlp": 1.00041199, + "epoch": 0.7626033368405231, + "flos": 23987645948160.0, + "grad_norm": 2.4107862846844714, + "language_loss": 0.83647752, + "learning_rate": 5.625052982818472e-07, + "loss": 0.85421574, + "num_input_tokens_seen": 273611790, + "step": 12684, + "time_per_iteration": 2.6956334114074707 + }, + { + "auxiliary_loss_clip": 0.01042597, + "auxiliary_loss_mlp": 0.01035404, + "balance_loss_clip": 1.024544, + "balance_loss_mlp": 1.02437127, + "epoch": 0.7626634600931911, + "flos": 12599115523200.0, + "grad_norm": 1.7958476716111744, + "language_loss": 0.82594734, + "learning_rate": 5.622345439907396e-07, + "loss": 0.84672737, + "num_input_tokens_seen": 273628340, + "step": 12685, + "time_per_iteration": 2.660809278488159 + }, + { + "auxiliary_loss_clip": 0.01035371, + "auxiliary_loss_mlp": 0.00747575, + "balance_loss_clip": 1.02585018, + "balance_loss_mlp": 1.00039864, + "epoch": 0.762723583345859, + "flos": 26322593546880.0, + "grad_norm": 1.7235418440561288, + "language_loss": 0.77009535, + "learning_rate": 5.619638442198422e-07, + "loss": 0.78792477, + "num_input_tokens_seen": 273646585, + "step": 12686, + "time_per_iteration": 2.9650607109069824 + }, + { + "auxiliary_loss_clip": 0.01009538, + "auxiliary_loss_mlp": 0.01046995, + "balance_loss_clip": 1.02210736, + "balance_loss_mlp": 1.03242218, + "epoch": 0.762783706598527, + "flos": 21907053532800.0, + "grad_norm": 1.9935598593568482, + "language_loss": 0.71883136, + "learning_rate": 5.616931989794198e-07, + "loss": 0.73939669, + "num_input_tokens_seen": 273665410, + "step": 12687, + "time_per_iteration": 4.474161386489868 + }, + { + "auxiliary_loss_clip": 0.01035904, + "auxiliary_loss_mlp": 0.01037331, + "balance_loss_clip": 1.02179003, + "balance_loss_mlp": 1.02436066, + "epoch": 0.7628438298511949, + "flos": 15339782217600.0, + "grad_norm": 2.37785416454252, + "language_loss": 0.64877784, + "learning_rate": 5.614226082797369e-07, + "loss": 0.66951019, + "num_input_tokens_seen": 273683035, + "step": 12688, + "time_per_iteration": 4.290286302566528 + }, + { + "auxiliary_loss_clip": 0.01051274, + "auxiliary_loss_mlp": 0.01024368, + "balance_loss_clip": 1.02477384, + "balance_loss_mlp": 1.01473594, + "epoch": 0.7629039531038629, + "flos": 13006307076480.0, + "grad_norm": 1.9400008876363743, + "language_loss": 0.70906824, + "learning_rate": 5.611520721310515e-07, + "loss": 0.72982466, + "num_input_tokens_seen": 273700130, + "step": 12689, + "time_per_iteration": 2.6056177616119385 + }, + { + "auxiliary_loss_clip": 0.01031206, + "auxiliary_loss_mlp": 0.01036416, + "balance_loss_clip": 1.02456439, + "balance_loss_mlp": 1.02506125, + "epoch": 0.7629640763565309, + "flos": 26171660597760.0, + "grad_norm": 1.8510971351030716, + "language_loss": 0.69522512, + "learning_rate": 5.608815905436238e-07, + "loss": 0.71590126, + "num_input_tokens_seen": 273720310, + "step": 12690, + "time_per_iteration": 2.7821922302246094 + }, + { + "auxiliary_loss_clip": 0.01035213, + "auxiliary_loss_mlp": 0.01037004, + "balance_loss_clip": 1.02176881, + "balance_loss_mlp": 1.02546489, + "epoch": 0.7630241996091989, + "flos": 36793713680640.0, + "grad_norm": 1.468016959765529, + "language_loss": 0.69486129, + "learning_rate": 5.606111635277109e-07, + "loss": 0.7155835, + "num_input_tokens_seen": 273744475, + "step": 12691, + "time_per_iteration": 2.824946165084839 + }, + { + "auxiliary_loss_clip": 0.01051456, + "auxiliary_loss_mlp": 0.01031237, + "balance_loss_clip": 1.02453446, + "balance_loss_mlp": 1.02166498, + "epoch": 0.7630843228618668, + "flos": 21835160461440.0, + "grad_norm": 1.5943058757022845, + "language_loss": 0.81606698, + "learning_rate": 5.603407910935662e-07, + "loss": 0.83689392, + "num_input_tokens_seen": 273764635, + "step": 12692, + "time_per_iteration": 2.7821500301361084 + }, + { + "auxiliary_loss_clip": 0.01037752, + "auxiliary_loss_mlp": 0.01030869, + "balance_loss_clip": 1.02768922, + "balance_loss_mlp": 1.02046776, + "epoch": 0.7631444461145348, + "flos": 12640520926080.0, + "grad_norm": 2.676728419080922, + "language_loss": 0.77057767, + "learning_rate": 5.600704732514438e-07, + "loss": 0.79126388, + "num_input_tokens_seen": 273780115, + "step": 12693, + "time_per_iteration": 2.7579731941223145 + }, + { + "auxiliary_loss_clip": 0.01035251, + "auxiliary_loss_mlp": 0.01030112, + "balance_loss_clip": 1.02721357, + "balance_loss_mlp": 1.01872718, + "epoch": 0.7632045693672027, + "flos": 16836610798080.0, + "grad_norm": 2.168390440295914, + "language_loss": 0.72989613, + "learning_rate": 5.598002100115933e-07, + "loss": 0.75054979, + "num_input_tokens_seen": 273796605, + "step": 12694, + "time_per_iteration": 2.69028639793396 + }, + { + "auxiliary_loss_clip": 0.01050285, + "auxiliary_loss_mlp": 0.01025492, + "balance_loss_clip": 1.02364552, + "balance_loss_mlp": 1.01473355, + "epoch": 0.7632646926198707, + "flos": 22017335264640.0, + "grad_norm": 1.8850804962171295, + "language_loss": 0.70702338, + "learning_rate": 5.595300013842625e-07, + "loss": 0.72778112, + "num_input_tokens_seen": 273816515, + "step": 12695, + "time_per_iteration": 2.6777713298797607 + }, + { + "auxiliary_loss_clip": 0.01062334, + "auxiliary_loss_mlp": 0.01026788, + "balance_loss_clip": 1.02454734, + "balance_loss_mlp": 1.01646447, + "epoch": 0.7633248158725388, + "flos": 23114011357440.0, + "grad_norm": 1.488354895617096, + "language_loss": 0.71954763, + "learning_rate": 5.592598473796985e-07, + "loss": 0.74043888, + "num_input_tokens_seen": 273837060, + "step": 12696, + "time_per_iteration": 2.612558126449585 + }, + { + "auxiliary_loss_clip": 0.01009993, + "auxiliary_loss_mlp": 0.0103447, + "balance_loss_clip": 1.024369, + "balance_loss_mlp": 1.02191162, + "epoch": 0.7633849391252067, + "flos": 10889839952640.0, + "grad_norm": 3.511822388159595, + "language_loss": 0.71232677, + "learning_rate": 5.589897480081453e-07, + "loss": 0.7327714, + "num_input_tokens_seen": 273853365, + "step": 12697, + "time_per_iteration": 2.8484325408935547 + }, + { + "auxiliary_loss_clip": 0.01035034, + "auxiliary_loss_mlp": 0.01029388, + "balance_loss_clip": 1.02665043, + "balance_loss_mlp": 1.01849854, + "epoch": 0.7634450623778747, + "flos": 20994168355200.0, + "grad_norm": 1.9560788285949842, + "language_loss": 0.67034012, + "learning_rate": 5.587197032798461e-07, + "loss": 0.69098431, + "num_input_tokens_seen": 273870750, + "step": 12698, + "time_per_iteration": 2.746136426925659 + }, + { + "auxiliary_loss_clip": 0.01047307, + "auxiliary_loss_mlp": 0.01026545, + "balance_loss_clip": 1.02205026, + "balance_loss_mlp": 1.01598859, + "epoch": 0.7635051856305426, + "flos": 18882046776960.0, + "grad_norm": 1.9904724298323, + "language_loss": 0.72326803, + "learning_rate": 5.5844971320504e-07, + "loss": 0.74400663, + "num_input_tokens_seen": 273890890, + "step": 12699, + "time_per_iteration": 2.635240316390991 + }, + { + "auxiliary_loss_clip": 0.01040099, + "auxiliary_loss_mlp": 0.01027602, + "balance_loss_clip": 1.02309453, + "balance_loss_mlp": 1.01783919, + "epoch": 0.7635653088832106, + "flos": 34786989584640.0, + "grad_norm": 1.564019149969322, + "language_loss": 0.7330386, + "learning_rate": 5.581797777939648e-07, + "loss": 0.75371563, + "num_input_tokens_seen": 273914015, + "step": 12700, + "time_per_iteration": 2.774183988571167 + }, + { + "auxiliary_loss_clip": 0.01062675, + "auxiliary_loss_mlp": 0.01027826, + "balance_loss_clip": 1.02488399, + "balance_loss_mlp": 1.01762772, + "epoch": 0.7636254321358785, + "flos": 23178434400000.0, + "grad_norm": 1.9201206019436252, + "language_loss": 0.69136178, + "learning_rate": 5.579098970568574e-07, + "loss": 0.7122668, + "num_input_tokens_seen": 273927415, + "step": 12701, + "time_per_iteration": 2.5887463092803955 + }, + { + "auxiliary_loss_clip": 0.01045121, + "auxiliary_loss_mlp": 0.01027238, + "balance_loss_clip": 1.02715194, + "balance_loss_mlp": 1.01644909, + "epoch": 0.7636855553885465, + "flos": 21325229032320.0, + "grad_norm": 1.6690484148912008, + "language_loss": 0.64231336, + "learning_rate": 5.576400710039508e-07, + "loss": 0.663037, + "num_input_tokens_seen": 273946690, + "step": 12702, + "time_per_iteration": 2.686389684677124 + }, + { + "auxiliary_loss_clip": 0.01035026, + "auxiliary_loss_mlp": 0.01025032, + "balance_loss_clip": 1.02641082, + "balance_loss_mlp": 1.01444077, + "epoch": 0.7637456786412145, + "flos": 28658079849600.0, + "grad_norm": 1.8518691542229824, + "language_loss": 0.65132093, + "learning_rate": 5.57370299645477e-07, + "loss": 0.67192149, + "num_input_tokens_seen": 273966870, + "step": 12703, + "time_per_iteration": 2.7906227111816406 + }, + { + "auxiliary_loss_clip": 0.01035807, + "auxiliary_loss_mlp": 0.01022343, + "balance_loss_clip": 1.02318847, + "balance_loss_mlp": 1.01200175, + "epoch": 0.7638058018938825, + "flos": 21907269014400.0, + "grad_norm": 3.3589602607395563, + "language_loss": 0.83456564, + "learning_rate": 5.571005829916668e-07, + "loss": 0.85514712, + "num_input_tokens_seen": 273986360, + "step": 12704, + "time_per_iteration": 2.654878854751587 + }, + { + "auxiliary_loss_clip": 0.01038999, + "auxiliary_loss_mlp": 0.0103134, + "balance_loss_clip": 1.02500701, + "balance_loss_mlp": 1.02066433, + "epoch": 0.7638659251465504, + "flos": 29643899592960.0, + "grad_norm": 1.3599401480520241, + "language_loss": 0.67993391, + "learning_rate": 5.568309210527469e-07, + "loss": 0.70063734, + "num_input_tokens_seen": 274009745, + "step": 12705, + "time_per_iteration": 4.478713512420654 + }, + { + "auxiliary_loss_clip": 0.01033668, + "auxiliary_loss_mlp": 0.01027605, + "balance_loss_clip": 1.02236342, + "balance_loss_mlp": 1.01669753, + "epoch": 0.7639260483992184, + "flos": 26141172929280.0, + "grad_norm": 1.813690163176829, + "language_loss": 0.74044734, + "learning_rate": 5.565613138389427e-07, + "loss": 0.76106006, + "num_input_tokens_seen": 274028775, + "step": 12706, + "time_per_iteration": 2.6824254989624023 + }, + { + "auxiliary_loss_clip": 0.01046045, + "auxiliary_loss_mlp": 0.01034293, + "balance_loss_clip": 1.0240519, + "balance_loss_mlp": 1.0235579, + "epoch": 0.7639861716518863, + "flos": 20156695781760.0, + "grad_norm": 1.7268484110187539, + "language_loss": 0.78141999, + "learning_rate": 5.562917613604781e-07, + "loss": 0.80222332, + "num_input_tokens_seen": 274047520, + "step": 12707, + "time_per_iteration": 2.600937604904175 + }, + { + "auxiliary_loss_clip": 0.01041417, + "auxiliary_loss_mlp": 0.01022345, + "balance_loss_clip": 1.02450895, + "balance_loss_mlp": 1.01183653, + "epoch": 0.7640462949045543, + "flos": 18583125793920.0, + "grad_norm": 2.932919213242514, + "language_loss": 0.80204701, + "learning_rate": 5.560222636275751e-07, + "loss": 0.82268465, + "num_input_tokens_seen": 274065350, + "step": 12708, + "time_per_iteration": 2.7083253860473633 + }, + { + "auxiliary_loss_clip": 0.01000159, + "auxiliary_loss_mlp": 0.01000384, + "balance_loss_clip": 1.00452626, + "balance_loss_mlp": 0.99952537, + "epoch": 0.7641064181572224, + "flos": 68321991646080.0, + "grad_norm": 0.8130242550498691, + "language_loss": 0.56341887, + "learning_rate": 5.557528206504521e-07, + "loss": 0.58342433, + "num_input_tokens_seen": 274122315, + "step": 12709, + "time_per_iteration": 3.2144720554351807 + }, + { + "auxiliary_loss_clip": 0.0104805, + "auxiliary_loss_mlp": 0.01032136, + "balance_loss_clip": 1.02346623, + "balance_loss_mlp": 1.02040029, + "epoch": 0.7641665414098903, + "flos": 17968982031360.0, + "grad_norm": 1.6728982632514398, + "language_loss": 0.63667649, + "learning_rate": 5.554834324393271e-07, + "loss": 0.65747839, + "num_input_tokens_seen": 274140555, + "step": 12710, + "time_per_iteration": 2.6439061164855957 + }, + { + "auxiliary_loss_clip": 0.01017361, + "auxiliary_loss_mlp": 0.00747665, + "balance_loss_clip": 1.02347863, + "balance_loss_mlp": 1.00041831, + "epoch": 0.7642266646625583, + "flos": 21252078984960.0, + "grad_norm": 2.1993676987055406, + "language_loss": 0.64546794, + "learning_rate": 5.552140990044154e-07, + "loss": 0.66311824, + "num_input_tokens_seen": 274161125, + "step": 12711, + "time_per_iteration": 2.7190964221954346 + }, + { + "auxiliary_loss_clip": 0.01040972, + "auxiliary_loss_mlp": 0.0102872, + "balance_loss_clip": 1.02389991, + "balance_loss_mlp": 1.01853967, + "epoch": 0.7642867879152262, + "flos": 22747794243840.0, + "grad_norm": 1.5981029711897212, + "language_loss": 0.72980344, + "learning_rate": 5.549448203559293e-07, + "loss": 0.75050032, + "num_input_tokens_seen": 274180835, + "step": 12712, + "time_per_iteration": 2.7202999591827393 + }, + { + "auxiliary_loss_clip": 0.01031837, + "auxiliary_loss_mlp": 0.01027004, + "balance_loss_clip": 1.02539468, + "balance_loss_mlp": 1.01706803, + "epoch": 0.7643469111678942, + "flos": 23332132696320.0, + "grad_norm": 1.6448618817389218, + "language_loss": 0.80655462, + "learning_rate": 5.546755965040804e-07, + "loss": 0.82714295, + "num_input_tokens_seen": 274201190, + "step": 12713, + "time_per_iteration": 2.6731362342834473 + }, + { + "auxiliary_loss_clip": 0.01053544, + "auxiliary_loss_mlp": 0.00747672, + "balance_loss_clip": 1.02519643, + "balance_loss_mlp": 1.00037849, + "epoch": 0.7644070344205621, + "flos": 19857092440320.0, + "grad_norm": 2.0372021568832763, + "language_loss": 0.83569163, + "learning_rate": 5.544064274590776e-07, + "loss": 0.85370386, + "num_input_tokens_seen": 274217595, + "step": 12714, + "time_per_iteration": 2.675590753555298 + }, + { + "auxiliary_loss_clip": 0.01056471, + "auxiliary_loss_mlp": 0.01032259, + "balance_loss_clip": 1.02565122, + "balance_loss_mlp": 1.02111864, + "epoch": 0.7644671576732301, + "flos": 22090628966400.0, + "grad_norm": 1.533397378959716, + "language_loss": 0.72845149, + "learning_rate": 5.541373132311287e-07, + "loss": 0.74933875, + "num_input_tokens_seen": 274237885, + "step": 12715, + "time_per_iteration": 4.436304807662964 + }, + { + "auxiliary_loss_clip": 0.0102683, + "auxiliary_loss_mlp": 0.01025359, + "balance_loss_clip": 1.02322221, + "balance_loss_mlp": 1.01477957, + "epoch": 0.7645272809258981, + "flos": 25481421872640.0, + "grad_norm": 2.7041169999277406, + "language_loss": 0.63252443, + "learning_rate": 5.538682538304376e-07, + "loss": 0.65304637, + "num_input_tokens_seen": 274258820, + "step": 12716, + "time_per_iteration": 2.814342737197876 + }, + { + "auxiliary_loss_clip": 0.01065131, + "auxiliary_loss_mlp": 0.01030115, + "balance_loss_clip": 1.02573085, + "balance_loss_mlp": 1.01867652, + "epoch": 0.7645874041785661, + "flos": 21541877913600.0, + "grad_norm": 1.6111282223500496, + "language_loss": 0.79753828, + "learning_rate": 5.535992492672068e-07, + "loss": 0.81849074, + "num_input_tokens_seen": 274278835, + "step": 12717, + "time_per_iteration": 2.585118055343628 + }, + { + "auxiliary_loss_clip": 0.01063583, + "auxiliary_loss_mlp": 0.01033068, + "balance_loss_clip": 1.02628398, + "balance_loss_mlp": 1.02240479, + "epoch": 0.764647527431234, + "flos": 20630896156800.0, + "grad_norm": 2.3384970603234, + "language_loss": 0.66938674, + "learning_rate": 5.53330299551638e-07, + "loss": 0.69035327, + "num_input_tokens_seen": 274297110, + "step": 12718, + "time_per_iteration": 2.6098275184631348 + }, + { + "auxiliary_loss_clip": 0.01022612, + "auxiliary_loss_mlp": 0.01031917, + "balance_loss_clip": 1.02414513, + "balance_loss_mlp": 1.02237439, + "epoch": 0.764707650683902, + "flos": 21434074220160.0, + "grad_norm": 1.9202181876669289, + "language_loss": 0.77331793, + "learning_rate": 5.530614046939286e-07, + "loss": 0.7938633, + "num_input_tokens_seen": 274315610, + "step": 12719, + "time_per_iteration": 2.770484447479248 + }, + { + "auxiliary_loss_clip": 0.01062446, + "auxiliary_loss_mlp": 0.01027057, + "balance_loss_clip": 1.02491069, + "balance_loss_mlp": 1.01607752, + "epoch": 0.7647677739365699, + "flos": 22711201263360.0, + "grad_norm": 1.7463815781341328, + "language_loss": 0.70225596, + "learning_rate": 5.527925647042754e-07, + "loss": 0.72315103, + "num_input_tokens_seen": 274333975, + "step": 12720, + "time_per_iteration": 2.7031867504119873 + }, + { + "auxiliary_loss_clip": 0.01033142, + "auxiliary_loss_mlp": 0.01030347, + "balance_loss_clip": 1.02505958, + "balance_loss_mlp": 1.01982641, + "epoch": 0.7648278971892379, + "flos": 21324115710720.0, + "grad_norm": 1.62248525616609, + "language_loss": 0.73893195, + "learning_rate": 5.52523779592875e-07, + "loss": 0.75956678, + "num_input_tokens_seen": 274353695, + "step": 12721, + "time_per_iteration": 2.6984293460845947 + }, + { + "auxiliary_loss_clip": 0.01028179, + "auxiliary_loss_mlp": 0.01027141, + "balance_loss_clip": 1.02402472, + "balance_loss_mlp": 1.0164299, + "epoch": 0.764888020441906, + "flos": 20667345482880.0, + "grad_norm": 1.68474105933936, + "language_loss": 0.73759854, + "learning_rate": 5.522550493699163e-07, + "loss": 0.75815165, + "num_input_tokens_seen": 274371120, + "step": 12722, + "time_per_iteration": 2.7794110774993896 + }, + { + "auxiliary_loss_clip": 0.01050116, + "auxiliary_loss_mlp": 0.01030252, + "balance_loss_clip": 1.02336383, + "balance_loss_mlp": 1.01962495, + "epoch": 0.7649481436945739, + "flos": 25082526360960.0, + "grad_norm": 1.775253764783625, + "language_loss": 0.73715758, + "learning_rate": 5.519863740455912e-07, + "loss": 0.75796121, + "num_input_tokens_seen": 274389665, + "step": 12723, + "time_per_iteration": 2.612718105316162 + }, + { + "auxiliary_loss_clip": 0.0106151, + "auxiliary_loss_mlp": 0.01029782, + "balance_loss_clip": 1.02332067, + "balance_loss_mlp": 1.01882124, + "epoch": 0.7650082669472419, + "flos": 24900890261760.0, + "grad_norm": 1.8689679099193488, + "language_loss": 0.73163581, + "learning_rate": 5.517177536300881e-07, + "loss": 0.75254881, + "num_input_tokens_seen": 274408750, + "step": 12724, + "time_per_iteration": 2.571566581726074 + }, + { + "auxiliary_loss_clip": 0.01049515, + "auxiliary_loss_mlp": 0.01025276, + "balance_loss_clip": 1.02464509, + "balance_loss_mlp": 1.01559043, + "epoch": 0.7650683901999098, + "flos": 14647388676480.0, + "grad_norm": 1.8549146300801012, + "language_loss": 0.84136117, + "learning_rate": 5.514491881335935e-07, + "loss": 0.86210912, + "num_input_tokens_seen": 274424600, + "step": 12725, + "time_per_iteration": 2.5797502994537354 + }, + { + "auxiliary_loss_clip": 0.01035336, + "auxiliary_loss_mlp": 0.01035492, + "balance_loss_clip": 1.02569675, + "balance_loss_mlp": 1.02403057, + "epoch": 0.7651285134525778, + "flos": 26352434770560.0, + "grad_norm": 1.8328461783284855, + "language_loss": 0.77418375, + "learning_rate": 5.511806775662901e-07, + "loss": 0.79489207, + "num_input_tokens_seen": 274443075, + "step": 12726, + "time_per_iteration": 2.7564594745635986 + }, + { + "auxiliary_loss_clip": 0.01052457, + "auxiliary_loss_mlp": 0.01030509, + "balance_loss_clip": 1.02469933, + "balance_loss_mlp": 1.02007818, + "epoch": 0.7651886367052457, + "flos": 26646866553600.0, + "grad_norm": 1.6871380997493313, + "language_loss": 0.70284253, + "learning_rate": 5.509122219383615e-07, + "loss": 0.72367221, + "num_input_tokens_seen": 274463240, + "step": 12727, + "time_per_iteration": 2.683833122253418 + }, + { + "auxiliary_loss_clip": 0.01057931, + "auxiliary_loss_mlp": 0.01026818, + "balance_loss_clip": 1.02296102, + "balance_loss_mlp": 1.01706028, + "epoch": 0.7652487599579137, + "flos": 25702847262720.0, + "grad_norm": 1.7098439632937403, + "language_loss": 0.79811907, + "learning_rate": 5.506438212599864e-07, + "loss": 0.81896657, + "num_input_tokens_seen": 274482750, + "step": 12728, + "time_per_iteration": 2.602839231491089 + }, + { + "auxiliary_loss_clip": 0.01062673, + "auxiliary_loss_mlp": 0.01027848, + "balance_loss_clip": 1.02484608, + "balance_loss_mlp": 1.01668978, + "epoch": 0.7653088832105817, + "flos": 28585576247040.0, + "grad_norm": 3.125067609932334, + "language_loss": 0.54936028, + "learning_rate": 5.503754755413424e-07, + "loss": 0.57026541, + "num_input_tokens_seen": 274503545, + "step": 12729, + "time_per_iteration": 2.580078125 + }, + { + "auxiliary_loss_clip": 0.01042669, + "auxiliary_loss_mlp": 0.00747653, + "balance_loss_clip": 1.02448213, + "balance_loss_mlp": 1.00041544, + "epoch": 0.7653690064632497, + "flos": 23366750428800.0, + "grad_norm": 1.6150091172349796, + "language_loss": 0.77719903, + "learning_rate": 5.501071847926055e-07, + "loss": 0.79510224, + "num_input_tokens_seen": 274523825, + "step": 12730, + "time_per_iteration": 2.6676583290100098 + }, + { + "auxiliary_loss_clip": 0.01060359, + "auxiliary_loss_mlp": 0.01036146, + "balance_loss_clip": 1.02999258, + "balance_loss_mlp": 1.02491641, + "epoch": 0.7654291297159176, + "flos": 15773905992960.0, + "grad_norm": 1.732029570307205, + "language_loss": 0.68857062, + "learning_rate": 5.498389490239495e-07, + "loss": 0.70953572, + "num_input_tokens_seen": 274541625, + "step": 12731, + "time_per_iteration": 2.6940505504608154 + }, + { + "auxiliary_loss_clip": 0.01064905, + "auxiliary_loss_mlp": 0.01032501, + "balance_loss_clip": 1.02546144, + "balance_loss_mlp": 1.02183747, + "epoch": 0.7654892529685856, + "flos": 18033800123520.0, + "grad_norm": 2.1491621463010735, + "language_loss": 0.70700777, + "learning_rate": 5.495707682455471e-07, + "loss": 0.72798193, + "num_input_tokens_seen": 274557580, + "step": 12732, + "time_per_iteration": 2.6926944255828857 + }, + { + "auxiliary_loss_clip": 0.01044475, + "auxiliary_loss_mlp": 0.01026309, + "balance_loss_clip": 1.02500999, + "balance_loss_mlp": 1.01515675, + "epoch": 0.7655493762212535, + "flos": 27236017428480.0, + "grad_norm": 1.5284353912374922, + "language_loss": 0.78246081, + "learning_rate": 5.493026424675653e-07, + "loss": 0.80316859, + "num_input_tokens_seen": 274578135, + "step": 12733, + "time_per_iteration": 4.571497201919556 + }, + { + "auxiliary_loss_clip": 0.01048735, + "auxiliary_loss_mlp": 0.01028353, + "balance_loss_clip": 1.02291703, + "balance_loss_mlp": 1.01778495, + "epoch": 0.7656094994739215, + "flos": 20773964027520.0, + "grad_norm": 2.35311138286582, + "language_loss": 0.77454239, + "learning_rate": 5.490345717001726e-07, + "loss": 0.7953133, + "num_input_tokens_seen": 274595655, + "step": 12734, + "time_per_iteration": 2.690368175506592 + }, + { + "auxiliary_loss_clip": 0.01038661, + "auxiliary_loss_mlp": 0.01029096, + "balance_loss_clip": 1.02397037, + "balance_loss_mlp": 1.0173893, + "epoch": 0.7656696227265896, + "flos": 23039245198080.0, + "grad_norm": 1.626472430839466, + "language_loss": 0.73267794, + "learning_rate": 5.48766555953535e-07, + "loss": 0.7533555, + "num_input_tokens_seen": 274616305, + "step": 12735, + "time_per_iteration": 4.369852304458618 + }, + { + "auxiliary_loss_clip": 0.01035844, + "auxiliary_loss_mlp": 0.01030714, + "balance_loss_clip": 1.02498031, + "balance_loss_mlp": 1.01984215, + "epoch": 0.7657297459792575, + "flos": 27525636789120.0, + "grad_norm": 1.4689583282394498, + "language_loss": 0.72817612, + "learning_rate": 5.484985952378145e-07, + "loss": 0.74884164, + "num_input_tokens_seen": 274638110, + "step": 12736, + "time_per_iteration": 2.8297102451324463 + }, + { + "auxiliary_loss_clip": 0.01057441, + "auxiliary_loss_mlp": 0.00747862, + "balance_loss_clip": 1.02613986, + "balance_loss_mlp": 1.00044656, + "epoch": 0.7657898692319255, + "flos": 17128456801920.0, + "grad_norm": 2.037457332806514, + "language_loss": 0.77563459, + "learning_rate": 5.482306895631728e-07, + "loss": 0.79368764, + "num_input_tokens_seen": 274656565, + "step": 12737, + "time_per_iteration": 2.6745383739471436 + }, + { + "auxiliary_loss_clip": 0.01040994, + "auxiliary_loss_mlp": 0.01028207, + "balance_loss_clip": 1.02338648, + "balance_loss_mlp": 1.01694131, + "epoch": 0.7658499924845934, + "flos": 21465747037440.0, + "grad_norm": 1.9583232420579229, + "language_loss": 0.76422447, + "learning_rate": 5.479628389397699e-07, + "loss": 0.78491646, + "num_input_tokens_seen": 274674215, + "step": 12738, + "time_per_iteration": 2.6430013179779053 + }, + { + "auxiliary_loss_clip": 0.01042076, + "auxiliary_loss_mlp": 0.01028065, + "balance_loss_clip": 1.02463627, + "balance_loss_mlp": 1.01670396, + "epoch": 0.7659101157372614, + "flos": 29496665744640.0, + "grad_norm": 1.7963797943505548, + "language_loss": 0.62442058, + "learning_rate": 5.476950433777603e-07, + "loss": 0.64512193, + "num_input_tokens_seen": 274693445, + "step": 12739, + "time_per_iteration": 2.7403082847595215 + }, + { + "auxiliary_loss_clip": 0.01063858, + "auxiliary_loss_mlp": 0.0103297, + "balance_loss_clip": 1.02518082, + "balance_loss_mlp": 1.02196097, + "epoch": 0.7659702389899293, + "flos": 18551812112640.0, + "grad_norm": 2.022119992264782, + "language_loss": 0.79335713, + "learning_rate": 5.474273028873004e-07, + "loss": 0.81432533, + "num_input_tokens_seen": 274712815, + "step": 12740, + "time_per_iteration": 2.600611686706543 + }, + { + "auxiliary_loss_clip": 0.0105235, + "auxiliary_loss_mlp": 0.01032758, + "balance_loss_clip": 1.02421796, + "balance_loss_mlp": 1.02162409, + "epoch": 0.7660303622425974, + "flos": 23549176627200.0, + "grad_norm": 1.85595320810691, + "language_loss": 0.65851796, + "learning_rate": 5.471596174785429e-07, + "loss": 0.67936909, + "num_input_tokens_seen": 274732690, + "step": 12741, + "time_per_iteration": 2.695554256439209 + }, + { + "auxiliary_loss_clip": 0.01035091, + "auxiliary_loss_mlp": 0.0102559, + "balance_loss_clip": 1.02275896, + "balance_loss_mlp": 1.01461649, + "epoch": 0.7660904854952653, + "flos": 18916736336640.0, + "grad_norm": 1.525742968520591, + "language_loss": 0.76040876, + "learning_rate": 5.468919871616386e-07, + "loss": 0.78101557, + "num_input_tokens_seen": 274752460, + "step": 12742, + "time_per_iteration": 2.7200474739074707 + }, + { + "auxiliary_loss_clip": 0.01040244, + "auxiliary_loss_mlp": 0.01031724, + "balance_loss_clip": 1.02469885, + "balance_loss_mlp": 1.02190697, + "epoch": 0.7661506087479333, + "flos": 23147515768320.0, + "grad_norm": 1.3780488766411112, + "language_loss": 0.76363814, + "learning_rate": 5.46624411946736e-07, + "loss": 0.78435779, + "num_input_tokens_seen": 274773070, + "step": 12743, + "time_per_iteration": 2.7805891036987305 + }, + { + "auxiliary_loss_clip": 0.01035974, + "auxiliary_loss_mlp": 0.01029427, + "balance_loss_clip": 1.0223906, + "balance_loss_mlp": 1.01919317, + "epoch": 0.7662107320006012, + "flos": 17565776887680.0, + "grad_norm": 1.8484695102407358, + "language_loss": 0.74752998, + "learning_rate": 5.463568918439805e-07, + "loss": 0.76818395, + "num_input_tokens_seen": 274790220, + "step": 12744, + "time_per_iteration": 2.679659605026245 + }, + { + "auxiliary_loss_clip": 0.01051044, + "auxiliary_loss_mlp": 0.01028286, + "balance_loss_clip": 1.02432704, + "balance_loss_mlp": 1.01765299, + "epoch": 0.7662708552532692, + "flos": 22303075956480.0, + "grad_norm": 2.1477083066811296, + "language_loss": 0.71183264, + "learning_rate": 5.460894268635181e-07, + "loss": 0.73262596, + "num_input_tokens_seen": 274805095, + "step": 12745, + "time_per_iteration": 2.6085822582244873 + }, + { + "auxiliary_loss_clip": 0.01047209, + "auxiliary_loss_mlp": 0.01035886, + "balance_loss_clip": 1.02325654, + "balance_loss_mlp": 1.02385139, + "epoch": 0.7663309785059371, + "flos": 15742053607680.0, + "grad_norm": 2.591100198998092, + "language_loss": 0.76916432, + "learning_rate": 5.458220170154896e-07, + "loss": 0.78999531, + "num_input_tokens_seen": 274821800, + "step": 12746, + "time_per_iteration": 2.6121160984039307 + }, + { + "auxiliary_loss_clip": 0.00979924, + "auxiliary_loss_mlp": 0.01001224, + "balance_loss_clip": 1.00448585, + "balance_loss_mlp": 1.00026417, + "epoch": 0.7663911017586051, + "flos": 62163312514560.0, + "grad_norm": 0.9287883199614735, + "language_loss": 0.56770879, + "learning_rate": 5.455546623100362e-07, + "loss": 0.58752024, + "num_input_tokens_seen": 274886970, + "step": 12747, + "time_per_iteration": 3.321662664413452 + }, + { + "auxiliary_loss_clip": 0.01060705, + "auxiliary_loss_mlp": 0.01028078, + "balance_loss_clip": 1.02511549, + "balance_loss_mlp": 1.0188868, + "epoch": 0.7664512250112732, + "flos": 26506025326080.0, + "grad_norm": 1.4977248469374875, + "language_loss": 0.72272897, + "learning_rate": 5.452873627572956e-07, + "loss": 0.74361682, + "num_input_tokens_seen": 274907240, + "step": 12748, + "time_per_iteration": 2.599825620651245 + }, + { + "auxiliary_loss_clip": 0.01026724, + "auxiliary_loss_mlp": 0.01030806, + "balance_loss_clip": 1.02207947, + "balance_loss_mlp": 1.01980901, + "epoch": 0.7665113482639411, + "flos": 16249542912000.0, + "grad_norm": 3.1530169135909225, + "language_loss": 0.68845844, + "learning_rate": 5.450201183674052e-07, + "loss": 0.70903379, + "num_input_tokens_seen": 274924650, + "step": 12749, + "time_per_iteration": 2.723965883255005 + }, + { + "auxiliary_loss_clip": 0.01052809, + "auxiliary_loss_mlp": 0.01025893, + "balance_loss_clip": 1.02488065, + "balance_loss_mlp": 1.01491332, + "epoch": 0.7665714715166091, + "flos": 27197880163200.0, + "grad_norm": 1.7993843485694736, + "language_loss": 0.7357024, + "learning_rate": 5.447529291504967e-07, + "loss": 0.7564894, + "num_input_tokens_seen": 274944550, + "step": 12750, + "time_per_iteration": 2.6391100883483887 + }, + { + "auxiliary_loss_clip": 0.01049427, + "auxiliary_loss_mlp": 0.01025302, + "balance_loss_clip": 1.02371264, + "balance_loss_mlp": 1.01562262, + "epoch": 0.766631594769277, + "flos": 21067785279360.0, + "grad_norm": 4.886854815919302, + "language_loss": 0.75893658, + "learning_rate": 5.444857951167026e-07, + "loss": 0.77968383, + "num_input_tokens_seen": 274961330, + "step": 12751, + "time_per_iteration": 2.5758495330810547 + }, + { + "auxiliary_loss_clip": 0.01028896, + "auxiliary_loss_mlp": 0.01035331, + "balance_loss_clip": 1.02367902, + "balance_loss_mlp": 1.02411342, + "epoch": 0.766691718021945, + "flos": 24097963593600.0, + "grad_norm": 1.775083326487078, + "language_loss": 0.6155647, + "learning_rate": 5.442187162761537e-07, + "loss": 0.63620704, + "num_input_tokens_seen": 274981655, + "step": 12752, + "time_per_iteration": 4.468302965164185 + }, + { + "auxiliary_loss_clip": 0.01054999, + "auxiliary_loss_mlp": 0.01030546, + "balance_loss_clip": 1.02577293, + "balance_loss_mlp": 1.01891148, + "epoch": 0.7667518412746129, + "flos": 23440654661760.0, + "grad_norm": 1.85335249738716, + "language_loss": 0.69180882, + "learning_rate": 5.439516926389767e-07, + "loss": 0.71266431, + "num_input_tokens_seen": 274999970, + "step": 12753, + "time_per_iteration": 2.6353628635406494 + }, + { + "auxiliary_loss_clip": 0.01052642, + "auxiliary_loss_mlp": 0.01034536, + "balance_loss_clip": 1.02481675, + "balance_loss_mlp": 1.02375937, + "epoch": 0.766811964527281, + "flos": 18148786536960.0, + "grad_norm": 2.1529902041910693, + "language_loss": 0.62300849, + "learning_rate": 5.436847242152971e-07, + "loss": 0.64388025, + "num_input_tokens_seen": 275015805, + "step": 12754, + "time_per_iteration": 2.637748956680298 + }, + { + "auxiliary_loss_clip": 0.0106244, + "auxiliary_loss_mlp": 0.01027617, + "balance_loss_clip": 1.02529359, + "balance_loss_mlp": 1.01744246, + "epoch": 0.7668720877799489, + "flos": 19536051657600.0, + "grad_norm": 3.5472517015773306, + "language_loss": 0.79898459, + "learning_rate": 5.434178110152401e-07, + "loss": 0.81988519, + "num_input_tokens_seen": 275031810, + "step": 12755, + "time_per_iteration": 2.5790982246398926 + }, + { + "auxiliary_loss_clip": 0.01062406, + "auxiliary_loss_mlp": 0.01026165, + "balance_loss_clip": 1.02581728, + "balance_loss_mlp": 1.01607978, + "epoch": 0.7669322110326169, + "flos": 22674320974080.0, + "grad_norm": 1.8208442123759565, + "language_loss": 0.7007761, + "learning_rate": 5.431509530489242e-07, + "loss": 0.72166181, + "num_input_tokens_seen": 275049325, + "step": 12756, + "time_per_iteration": 2.605668544769287 + }, + { + "auxiliary_loss_clip": 0.01052432, + "auxiliary_loss_mlp": 0.01031679, + "balance_loss_clip": 1.02484584, + "balance_loss_mlp": 1.02140892, + "epoch": 0.7669923342852848, + "flos": 26469396432000.0, + "grad_norm": 1.9220114618670017, + "language_loss": 0.69900757, + "learning_rate": 5.428841503264706e-07, + "loss": 0.71984869, + "num_input_tokens_seen": 275070865, + "step": 12757, + "time_per_iteration": 2.6444268226623535 + }, + { + "auxiliary_loss_clip": 0.01045173, + "auxiliary_loss_mlp": 0.01034116, + "balance_loss_clip": 1.02721512, + "balance_loss_mlp": 1.02250457, + "epoch": 0.7670524575379528, + "flos": 22856136641280.0, + "grad_norm": 2.434085820134793, + "language_loss": 0.76068556, + "learning_rate": 5.426174028579955e-07, + "loss": 0.7814784, + "num_input_tokens_seen": 275088015, + "step": 12758, + "time_per_iteration": 2.6639320850372314 + }, + { + "auxiliary_loss_clip": 0.01048944, + "auxiliary_loss_mlp": 0.01031854, + "balance_loss_clip": 1.02280223, + "balance_loss_mlp": 1.02157187, + "epoch": 0.7671125807906207, + "flos": 22452141398400.0, + "grad_norm": 1.579280543409804, + "language_loss": 0.76195329, + "learning_rate": 5.423507106536156e-07, + "loss": 0.78276128, + "num_input_tokens_seen": 275106975, + "step": 12759, + "time_per_iteration": 2.788496732711792 + }, + { + "auxiliary_loss_clip": 0.01040366, + "auxiliary_loss_mlp": 0.01022789, + "balance_loss_clip": 1.02280068, + "balance_loss_mlp": 1.01269221, + "epoch": 0.7671727040432887, + "flos": 35371543518720.0, + "grad_norm": 1.999833664709831, + "language_loss": 0.68656886, + "learning_rate": 5.420840737234425e-07, + "loss": 0.70720041, + "num_input_tokens_seen": 275129560, + "step": 12760, + "time_per_iteration": 2.789795160293579 + }, + { + "auxiliary_loss_clip": 0.01040086, + "auxiliary_loss_mlp": 0.01026612, + "balance_loss_clip": 1.02429652, + "balance_loss_mlp": 1.01612723, + "epoch": 0.7672328272959568, + "flos": 22494947431680.0, + "grad_norm": 1.4967764905432626, + "language_loss": 0.79128271, + "learning_rate": 5.418174920775871e-07, + "loss": 0.81194967, + "num_input_tokens_seen": 275151180, + "step": 12761, + "time_per_iteration": 2.647279739379883 + }, + { + "auxiliary_loss_clip": 0.01035848, + "auxiliary_loss_mlp": 0.0103018, + "balance_loss_clip": 1.02236927, + "balance_loss_mlp": 1.01979089, + "epoch": 0.7672929505486247, + "flos": 22815557251200.0, + "grad_norm": 2.2914567382957167, + "language_loss": 0.66072398, + "learning_rate": 5.415509657261589e-07, + "loss": 0.68138421, + "num_input_tokens_seen": 275170605, + "step": 12762, + "time_per_iteration": 2.708301305770874 + }, + { + "auxiliary_loss_clip": 0.01054047, + "auxiliary_loss_mlp": 0.01026636, + "balance_loss_clip": 1.02579653, + "balance_loss_mlp": 1.01623523, + "epoch": 0.7673530738012927, + "flos": 20338834671360.0, + "grad_norm": 1.8372018835075892, + "language_loss": 0.74133241, + "learning_rate": 5.412844946792639e-07, + "loss": 0.76213926, + "num_input_tokens_seen": 275188750, + "step": 12763, + "time_per_iteration": 4.330575942993164 + }, + { + "auxiliary_loss_clip": 0.01043132, + "auxiliary_loss_mlp": 0.01030586, + "balance_loss_clip": 1.02592945, + "balance_loss_mlp": 1.01993501, + "epoch": 0.7674131970539606, + "flos": 34933576988160.0, + "grad_norm": 2.3735512479513097, + "language_loss": 0.70869076, + "learning_rate": 5.410180789470067e-07, + "loss": 0.72942799, + "num_input_tokens_seen": 275211365, + "step": 12764, + "time_per_iteration": 2.793632745742798 + }, + { + "auxiliary_loss_clip": 0.01053612, + "auxiliary_loss_mlp": 0.01026261, + "balance_loss_clip": 1.02633834, + "balance_loss_mlp": 1.01584244, + "epoch": 0.7674733203066286, + "flos": 28328850766080.0, + "grad_norm": 2.034856894863683, + "language_loss": 0.69194543, + "learning_rate": 5.40751718539491e-07, + "loss": 0.71274412, + "num_input_tokens_seen": 275231670, + "step": 12765, + "time_per_iteration": 2.6767220497131348 + }, + { + "auxiliary_loss_clip": 0.01031272, + "auxiliary_loss_mlp": 0.01025974, + "balance_loss_clip": 1.02253103, + "balance_loss_mlp": 1.01660395, + "epoch": 0.7675334435592965, + "flos": 16289727252480.0, + "grad_norm": 2.2220111330772907, + "language_loss": 0.60766679, + "learning_rate": 5.404854134668162e-07, + "loss": 0.62823927, + "num_input_tokens_seen": 275249425, + "step": 12766, + "time_per_iteration": 2.7316579818725586 + }, + { + "auxiliary_loss_clip": 0.00982409, + "auxiliary_loss_mlp": 0.01001865, + "balance_loss_clip": 1.00534856, + "balance_loss_mlp": 1.00073266, + "epoch": 0.7675935668119646, + "flos": 64826232220800.0, + "grad_norm": 0.7320239944171884, + "language_loss": 0.60785574, + "learning_rate": 5.402191637390803e-07, + "loss": 0.62769848, + "num_input_tokens_seen": 275312485, + "step": 12767, + "time_per_iteration": 3.3881096839904785 + }, + { + "auxiliary_loss_clip": 0.01040808, + "auxiliary_loss_mlp": 0.01022877, + "balance_loss_clip": 1.02525818, + "balance_loss_mlp": 1.01328015, + "epoch": 0.7676536900646325, + "flos": 22675398382080.0, + "grad_norm": 1.8170592105244872, + "language_loss": 0.69661069, + "learning_rate": 5.399529693663801e-07, + "loss": 0.71724749, + "num_input_tokens_seen": 275331680, + "step": 12768, + "time_per_iteration": 2.650840997695923 + }, + { + "auxiliary_loss_clip": 0.0105437, + "auxiliary_loss_mlp": 0.01030701, + "balance_loss_clip": 1.02533603, + "balance_loss_mlp": 1.01953077, + "epoch": 0.7677138133173005, + "flos": 26939682224640.0, + "grad_norm": 1.6747487328239385, + "language_loss": 0.70690507, + "learning_rate": 5.3968683035881e-07, + "loss": 0.72775578, + "num_input_tokens_seen": 275351615, + "step": 12769, + "time_per_iteration": 2.6494104862213135 + }, + { + "auxiliary_loss_clip": 0.01053135, + "auxiliary_loss_mlp": 0.01026115, + "balance_loss_clip": 1.02527308, + "balance_loss_mlp": 1.0153023, + "epoch": 0.7677739365699684, + "flos": 23799545400960.0, + "grad_norm": 2.063118483472454, + "language_loss": 0.80457258, + "learning_rate": 5.394207467264611e-07, + "loss": 0.82536507, + "num_input_tokens_seen": 275368815, + "step": 12770, + "time_per_iteration": 2.668952703475952 + }, + { + "auxiliary_loss_clip": 0.01024402, + "auxiliary_loss_mlp": 0.01033689, + "balance_loss_clip": 1.02215016, + "balance_loss_mlp": 1.0229187, + "epoch": 0.7678340598226364, + "flos": 34455497944320.0, + "grad_norm": 1.526927360311805, + "language_loss": 0.78731734, + "learning_rate": 5.391547184794245e-07, + "loss": 0.80789822, + "num_input_tokens_seen": 275389345, + "step": 12771, + "time_per_iteration": 2.804049491882324 + }, + { + "auxiliary_loss_clip": 0.01060261, + "auxiliary_loss_mlp": 0.01026054, + "balance_loss_clip": 1.02379179, + "balance_loss_mlp": 1.01601076, + "epoch": 0.7678941830753043, + "flos": 23841740903040.0, + "grad_norm": 1.412742945725589, + "language_loss": 0.68318355, + "learning_rate": 5.388887456277876e-07, + "loss": 0.70404667, + "num_input_tokens_seen": 275411240, + "step": 12772, + "time_per_iteration": 2.6159164905548096 + }, + { + "auxiliary_loss_clip": 0.0104617, + "auxiliary_loss_mlp": 0.01023716, + "balance_loss_clip": 1.02384019, + "balance_loss_mlp": 1.01380408, + "epoch": 0.7679543063279723, + "flos": 25410929431680.0, + "grad_norm": 1.559765427345315, + "language_loss": 0.73235118, + "learning_rate": 5.386228281816349e-07, + "loss": 0.75305009, + "num_input_tokens_seen": 275432010, + "step": 12773, + "time_per_iteration": 2.6533265113830566 + }, + { + "auxiliary_loss_clip": 0.01020468, + "auxiliary_loss_mlp": 0.01025538, + "balance_loss_clip": 1.02039576, + "balance_loss_mlp": 1.01595378, + "epoch": 0.7680144295806404, + "flos": 27962382257280.0, + "grad_norm": 1.7053508006272915, + "language_loss": 0.80950963, + "learning_rate": 5.383569661510512e-07, + "loss": 0.82996964, + "num_input_tokens_seen": 275453710, + "step": 12774, + "time_per_iteration": 2.7656548023223877 + }, + { + "auxiliary_loss_clip": 0.01054678, + "auxiliary_loss_mlp": 0.00747547, + "balance_loss_clip": 1.02713466, + "balance_loss_mlp": 1.00033212, + "epoch": 0.7680745528333083, + "flos": 20412810731520.0, + "grad_norm": 1.5567014068295466, + "language_loss": 0.70503742, + "learning_rate": 5.380911595461177e-07, + "loss": 0.72305965, + "num_input_tokens_seen": 275472915, + "step": 12775, + "time_per_iteration": 2.6194722652435303 + }, + { + "auxiliary_loss_clip": 0.00971754, + "auxiliary_loss_mlp": 0.01004216, + "balance_loss_clip": 1.00576282, + "balance_loss_mlp": 1.00314939, + "epoch": 0.7681346760859763, + "flos": 68401103351040.0, + "grad_norm": 0.7061870926078073, + "language_loss": 0.56874168, + "learning_rate": 5.378254083769147e-07, + "loss": 0.58850139, + "num_input_tokens_seen": 275534785, + "step": 12776, + "time_per_iteration": 3.428706645965576 + }, + { + "auxiliary_loss_clip": 0.01045895, + "auxiliary_loss_mlp": 0.01031731, + "balance_loss_clip": 1.02260303, + "balance_loss_mlp": 1.02166355, + "epoch": 0.7681947993386442, + "flos": 21251468453760.0, + "grad_norm": 1.8840898564402546, + "language_loss": 0.73776579, + "learning_rate": 5.375597126535188e-07, + "loss": 0.75854206, + "num_input_tokens_seen": 275553205, + "step": 12777, + "time_per_iteration": 2.7499828338623047 + }, + { + "auxiliary_loss_clip": 0.01034555, + "auxiliary_loss_mlp": 0.01030066, + "balance_loss_clip": 1.0270288, + "balance_loss_mlp": 1.01967728, + "epoch": 0.7682549225913122, + "flos": 21397696721280.0, + "grad_norm": 2.037195426979077, + "language_loss": 0.701451, + "learning_rate": 5.372940723860043e-07, + "loss": 0.72209728, + "num_input_tokens_seen": 275571490, + "step": 12778, + "time_per_iteration": 2.818248987197876 + }, + { + "auxiliary_loss_clip": 0.01052728, + "auxiliary_loss_mlp": 0.01025979, + "balance_loss_clip": 1.02623439, + "balance_loss_mlp": 1.01613808, + "epoch": 0.7683150458439801, + "flos": 23038921975680.0, + "grad_norm": 2.075143867655356, + "language_loss": 0.69959128, + "learning_rate": 5.37028487584446e-07, + "loss": 0.72037834, + "num_input_tokens_seen": 275589665, + "step": 12779, + "time_per_iteration": 2.5893397331237793 + }, + { + "auxiliary_loss_clip": 0.01043317, + "auxiliary_loss_mlp": 0.01027307, + "balance_loss_clip": 1.02511382, + "balance_loss_mlp": 1.01650691, + "epoch": 0.7683751690966482, + "flos": 67332397996800.0, + "grad_norm": 1.6578351869350145, + "language_loss": 0.58687711, + "learning_rate": 5.367629582589133e-07, + "loss": 0.6075834, + "num_input_tokens_seen": 275615605, + "step": 12780, + "time_per_iteration": 3.0275115966796875 + }, + { + "auxiliary_loss_clip": 0.01054428, + "auxiliary_loss_mlp": 0.01034847, + "balance_loss_clip": 1.02468097, + "balance_loss_mlp": 1.02259839, + "epoch": 0.7684352923493161, + "flos": 21798890703360.0, + "grad_norm": 1.7589133483402837, + "language_loss": 0.6763643, + "learning_rate": 5.364974844194759e-07, + "loss": 0.69725704, + "num_input_tokens_seen": 275634965, + "step": 12781, + "time_per_iteration": 4.341970920562744 + }, + { + "auxiliary_loss_clip": 0.01012614, + "auxiliary_loss_mlp": 0.0102682, + "balance_loss_clip": 1.02366018, + "balance_loss_mlp": 1.01725972, + "epoch": 0.7684954156019841, + "flos": 25847603072640.0, + "grad_norm": 1.477841813852753, + "language_loss": 0.79361469, + "learning_rate": 5.362320660762016e-07, + "loss": 0.81400907, + "num_input_tokens_seen": 275655785, + "step": 12782, + "time_per_iteration": 2.8294615745544434 + }, + { + "auxiliary_loss_clip": 0.01035166, + "auxiliary_loss_mlp": 0.01026043, + "balance_loss_clip": 1.0250355, + "balance_loss_mlp": 1.01488483, + "epoch": 0.768555538854652, + "flos": 25447378757760.0, + "grad_norm": 1.62693815821968, + "language_loss": 0.66674602, + "learning_rate": 5.35966703239153e-07, + "loss": 0.68735814, + "num_input_tokens_seen": 275676160, + "step": 12783, + "time_per_iteration": 4.2962422370910645 + }, + { + "auxiliary_loss_clip": 0.01042014, + "auxiliary_loss_mlp": 0.01028862, + "balance_loss_clip": 1.02443933, + "balance_loss_mlp": 1.01762033, + "epoch": 0.76861566210732, + "flos": 19646369303040.0, + "grad_norm": 1.6097085961922337, + "language_loss": 0.69304752, + "learning_rate": 5.357013959183938e-07, + "loss": 0.71375632, + "num_input_tokens_seen": 275695660, + "step": 12784, + "time_per_iteration": 2.6701223850250244 + }, + { + "auxiliary_loss_clip": 0.01024892, + "auxiliary_loss_mlp": 0.01023909, + "balance_loss_clip": 1.02648795, + "balance_loss_mlp": 1.0147115, + "epoch": 0.7686757853599879, + "flos": 22419032037120.0, + "grad_norm": 1.794420917408553, + "language_loss": 0.80776906, + "learning_rate": 5.354361441239843e-07, + "loss": 0.82825702, + "num_input_tokens_seen": 275714025, + "step": 12785, + "time_per_iteration": 2.7452056407928467 + }, + { + "auxiliary_loss_clip": 0.01046085, + "auxiliary_loss_mlp": 0.01031261, + "balance_loss_clip": 1.02312756, + "balance_loss_mlp": 1.01842797, + "epoch": 0.768735908612656, + "flos": 47774262453120.0, + "grad_norm": 2.0610091612896744, + "language_loss": 0.77591306, + "learning_rate": 5.351709478659836e-07, + "loss": 0.79668653, + "num_input_tokens_seen": 275737300, + "step": 12786, + "time_per_iteration": 2.8598954677581787 + }, + { + "auxiliary_loss_clip": 0.01062275, + "auxiliary_loss_mlp": 0.01028071, + "balance_loss_clip": 1.02488315, + "balance_loss_mlp": 1.01756263, + "epoch": 0.7687960318653239, + "flos": 30263179000320.0, + "grad_norm": 2.260762936711769, + "language_loss": 0.58812201, + "learning_rate": 5.349058071544468e-07, + "loss": 0.60902548, + "num_input_tokens_seen": 275757895, + "step": 12787, + "time_per_iteration": 2.5903332233428955 + }, + { + "auxiliary_loss_clip": 0.01033779, + "auxiliary_loss_mlp": 0.01025992, + "balance_loss_clip": 1.02111924, + "balance_loss_mlp": 1.01601362, + "epoch": 0.7688561551179919, + "flos": 19573434737280.0, + "grad_norm": 1.5838034945606063, + "language_loss": 0.75912589, + "learning_rate": 5.346407219994292e-07, + "loss": 0.77972358, + "num_input_tokens_seen": 275776745, + "step": 12788, + "time_per_iteration": 2.6479861736297607 + }, + { + "auxiliary_loss_clip": 0.01022064, + "auxiliary_loss_mlp": 0.00747602, + "balance_loss_clip": 1.02346647, + "balance_loss_mlp": 1.00036192, + "epoch": 0.7689162783706599, + "flos": 22783776693120.0, + "grad_norm": 1.786414895872796, + "language_loss": 0.66512454, + "learning_rate": 5.343756924109821e-07, + "loss": 0.68282121, + "num_input_tokens_seen": 275797205, + "step": 12789, + "time_per_iteration": 2.9487311840057373 + }, + { + "auxiliary_loss_clip": 0.01039831, + "auxiliary_loss_mlp": 0.01032224, + "balance_loss_clip": 1.02239799, + "balance_loss_mlp": 1.01980805, + "epoch": 0.7689764016233278, + "flos": 34204195416960.0, + "grad_norm": 3.254280305181853, + "language_loss": 0.68679965, + "learning_rate": 5.341107183991553e-07, + "loss": 0.70752025, + "num_input_tokens_seen": 275817935, + "step": 12790, + "time_per_iteration": 2.721672534942627 + }, + { + "auxiliary_loss_clip": 0.01042724, + "auxiliary_loss_mlp": 0.01033338, + "balance_loss_clip": 1.02463889, + "balance_loss_mlp": 1.02249622, + "epoch": 0.7690365248759958, + "flos": 17274469587840.0, + "grad_norm": 1.4621947876382642, + "language_loss": 0.68473321, + "learning_rate": 5.338457999739969e-07, + "loss": 0.70549387, + "num_input_tokens_seen": 275837145, + "step": 12791, + "time_per_iteration": 2.6665565967559814 + }, + { + "auxiliary_loss_clip": 0.01051791, + "auxiliary_loss_mlp": 0.01029202, + "balance_loss_clip": 1.02606487, + "balance_loss_mlp": 1.01972485, + "epoch": 0.7690966481286637, + "flos": 18223157646720.0, + "grad_norm": 2.0902139826776787, + "language_loss": 0.79877424, + "learning_rate": 5.335809371455526e-07, + "loss": 0.81958419, + "num_input_tokens_seen": 275855705, + "step": 12792, + "time_per_iteration": 2.6421148777008057 + }, + { + "auxiliary_loss_clip": 0.01038323, + "auxiliary_loss_mlp": 0.00747602, + "balance_loss_clip": 1.02860045, + "balance_loss_mlp": 1.00030625, + "epoch": 0.7691567713813318, + "flos": 21537568281600.0, + "grad_norm": 1.7980245619446478, + "language_loss": 0.72566152, + "learning_rate": 5.333161299238673e-07, + "loss": 0.74352074, + "num_input_tokens_seen": 275873930, + "step": 12793, + "time_per_iteration": 2.649841785430908 + }, + { + "auxiliary_loss_clip": 0.01035153, + "auxiliary_loss_mlp": 0.01032435, + "balance_loss_clip": 1.02722168, + "balance_loss_mlp": 1.0213902, + "epoch": 0.7692168946339997, + "flos": 39379999720320.0, + "grad_norm": 1.6448971674168582, + "language_loss": 0.63151312, + "learning_rate": 5.330513783189803e-07, + "loss": 0.65218902, + "num_input_tokens_seen": 275895895, + "step": 12794, + "time_per_iteration": 2.8914413452148438 + }, + { + "auxiliary_loss_clip": 0.0103448, + "auxiliary_loss_mlp": 0.01031522, + "balance_loss_clip": 1.02310979, + "balance_loss_mlp": 1.02028656, + "epoch": 0.7692770178866677, + "flos": 25009950931200.0, + "grad_norm": 1.5136730483750402, + "language_loss": 0.76416463, + "learning_rate": 5.327866823409319e-07, + "loss": 0.78482461, + "num_input_tokens_seen": 275917825, + "step": 12795, + "time_per_iteration": 2.706835985183716 + }, + { + "auxiliary_loss_clip": 0.01031958, + "auxiliary_loss_mlp": 0.01025181, + "balance_loss_clip": 1.02348089, + "balance_loss_mlp": 1.01451802, + "epoch": 0.7693371411393356, + "flos": 24716273333760.0, + "grad_norm": 1.5388640917435894, + "language_loss": 0.71592838, + "learning_rate": 5.325220419997601e-07, + "loss": 0.73649979, + "num_input_tokens_seen": 275937890, + "step": 12796, + "time_per_iteration": 2.7790133953094482 + }, + { + "auxiliary_loss_clip": 0.01062127, + "auxiliary_loss_mlp": 0.01024482, + "balance_loss_clip": 1.02484632, + "balance_loss_mlp": 1.01392031, + "epoch": 0.7693972643920036, + "flos": 15924803028480.0, + "grad_norm": 2.2211797694628403, + "language_loss": 0.6486423, + "learning_rate": 5.32257457305499e-07, + "loss": 0.66950846, + "num_input_tokens_seen": 275954495, + "step": 12797, + "time_per_iteration": 2.6184394359588623 + }, + { + "auxiliary_loss_clip": 0.01026026, + "auxiliary_loss_mlp": 0.0103306, + "balance_loss_clip": 1.02280545, + "balance_loss_mlp": 1.02154386, + "epoch": 0.7694573876446715, + "flos": 25405901527680.0, + "grad_norm": 1.9878919779881887, + "language_loss": 0.91625786, + "learning_rate": 5.319929282681823e-07, + "loss": 0.93684876, + "num_input_tokens_seen": 275972395, + "step": 12798, + "time_per_iteration": 2.7160141468048096 + }, + { + "auxiliary_loss_clip": 0.01015067, + "auxiliary_loss_mlp": 0.01022434, + "balance_loss_clip": 1.02480662, + "balance_loss_mlp": 1.01239085, + "epoch": 0.7695175108973396, + "flos": 16654220513280.0, + "grad_norm": 1.8625504132851325, + "language_loss": 0.82455599, + "learning_rate": 5.317284548978418e-07, + "loss": 0.84493101, + "num_input_tokens_seen": 275989020, + "step": 12799, + "time_per_iteration": 2.688473701477051 + }, + { + "auxiliary_loss_clip": 0.01010383, + "auxiliary_loss_mlp": 0.01025557, + "balance_loss_clip": 1.02390814, + "balance_loss_mlp": 1.01507211, + "epoch": 0.7695776341500075, + "flos": 13626520237440.0, + "grad_norm": 2.1404621495392306, + "language_loss": 0.78205138, + "learning_rate": 5.314640372045045e-07, + "loss": 0.80241072, + "num_input_tokens_seen": 276006525, + "step": 12800, + "time_per_iteration": 4.470959901809692 + }, + { + "auxiliary_loss_clip": 0.01045607, + "auxiliary_loss_mlp": 0.01026254, + "balance_loss_clip": 1.02477145, + "balance_loss_mlp": 1.01488185, + "epoch": 0.7696377574026755, + "flos": 24276690691200.0, + "grad_norm": 1.5564512229628698, + "language_loss": 0.83998001, + "learning_rate": 5.31199675198198e-07, + "loss": 0.86069858, + "num_input_tokens_seen": 276027130, + "step": 12801, + "time_per_iteration": 2.8225862979888916 + }, + { + "auxiliary_loss_clip": 0.01041192, + "auxiliary_loss_mlp": 0.0102686, + "balance_loss_clip": 1.02488708, + "balance_loss_mlp": 1.01653063, + "epoch": 0.7696978806553435, + "flos": 20923137210240.0, + "grad_norm": 2.0030800441628687, + "language_loss": 0.71771157, + "learning_rate": 5.30935368888947e-07, + "loss": 0.73839211, + "num_input_tokens_seen": 276045715, + "step": 12802, + "time_per_iteration": 2.673287868499756 + }, + { + "auxiliary_loss_clip": 0.0103154, + "auxiliary_loss_mlp": 0.01025684, + "balance_loss_clip": 1.02168608, + "balance_loss_mlp": 1.01525927, + "epoch": 0.7697580039080114, + "flos": 22929609911040.0, + "grad_norm": 1.7214748244001086, + "language_loss": 0.76163489, + "learning_rate": 5.306711182867747e-07, + "loss": 0.78220713, + "num_input_tokens_seen": 276065375, + "step": 12803, + "time_per_iteration": 2.7829291820526123 + }, + { + "auxiliary_loss_clip": 0.00991227, + "auxiliary_loss_mlp": 0.01003935, + "balance_loss_clip": 1.00521898, + "balance_loss_mlp": 1.00305903, + "epoch": 0.7698181271606794, + "flos": 68717654933760.0, + "grad_norm": 0.7882584780037865, + "language_loss": 0.5582087, + "learning_rate": 5.304069234017001e-07, + "loss": 0.57816035, + "num_input_tokens_seen": 276131405, + "step": 12804, + "time_per_iteration": 3.2625443935394287 + }, + { + "auxiliary_loss_clip": 0.00988005, + "auxiliary_loss_mlp": 0.01004341, + "balance_loss_clip": 1.00240064, + "balance_loss_mlp": 1.00343502, + "epoch": 0.7698782504133473, + "flos": 67409716999680.0, + "grad_norm": 0.7858262439186241, + "language_loss": 0.54054654, + "learning_rate": 5.301427842437429e-07, + "loss": 0.56046999, + "num_input_tokens_seen": 276200755, + "step": 12805, + "time_per_iteration": 3.351274251937866 + }, + { + "auxiliary_loss_clip": 0.010364, + "auxiliary_loss_mlp": 0.01031222, + "balance_loss_clip": 1.02767658, + "balance_loss_mlp": 1.02015901, + "epoch": 0.7699383736660154, + "flos": 22488842119680.0, + "grad_norm": 1.76534077533246, + "language_loss": 0.72884667, + "learning_rate": 5.298787008229187e-07, + "loss": 0.74952286, + "num_input_tokens_seen": 276217880, + "step": 12806, + "time_per_iteration": 2.6993162631988525 + }, + { + "auxiliary_loss_clip": 0.01041708, + "auxiliary_loss_mlp": 0.01032206, + "balance_loss_clip": 1.02398765, + "balance_loss_mlp": 1.02151322, + "epoch": 0.7699984969186833, + "flos": 21539723097600.0, + "grad_norm": 1.9219761956863335, + "language_loss": 0.75061762, + "learning_rate": 5.296146731492408e-07, + "loss": 0.77135682, + "num_input_tokens_seen": 276234810, + "step": 12807, + "time_per_iteration": 2.7130515575408936 + }, + { + "auxiliary_loss_clip": 0.01058604, + "auxiliary_loss_mlp": 0.01031713, + "balance_loss_clip": 1.02751184, + "balance_loss_mlp": 1.02019715, + "epoch": 0.7700586201713513, + "flos": 21719096640000.0, + "grad_norm": 2.3211807970508795, + "language_loss": 0.80231512, + "learning_rate": 5.293507012327218e-07, + "loss": 0.82321823, + "num_input_tokens_seen": 276252850, + "step": 12808, + "time_per_iteration": 2.6352412700653076 + }, + { + "auxiliary_loss_clip": 0.01057768, + "auxiliary_loss_mlp": 0.01035105, + "balance_loss_clip": 1.0266397, + "balance_loss_mlp": 1.02371478, + "epoch": 0.7701187434240192, + "flos": 27856015107840.0, + "grad_norm": 1.7940636826483816, + "language_loss": 0.78930831, + "learning_rate": 5.290867850833718e-07, + "loss": 0.81023705, + "num_input_tokens_seen": 276272525, + "step": 12809, + "time_per_iteration": 2.6844394207000732 + }, + { + "auxiliary_loss_clip": 0.01027928, + "auxiliary_loss_mlp": 0.01023888, + "balance_loss_clip": 1.0228318, + "balance_loss_mlp": 1.01442242, + "epoch": 0.7701788666766872, + "flos": 28621307301120.0, + "grad_norm": 1.54265352429791, + "language_loss": 0.70374346, + "learning_rate": 5.288229247111993e-07, + "loss": 0.72426158, + "num_input_tokens_seen": 276294210, + "step": 12810, + "time_per_iteration": 4.415963888168335 + }, + { + "auxiliary_loss_clip": 0.01036873, + "auxiliary_loss_mlp": 0.01037826, + "balance_loss_clip": 1.02244556, + "balance_loss_mlp": 1.02416503, + "epoch": 0.7702389899293551, + "flos": 14246446089600.0, + "grad_norm": 2.638402123028193, + "language_loss": 0.78237802, + "learning_rate": 5.285591201262079e-07, + "loss": 0.80312502, + "num_input_tokens_seen": 276310290, + "step": 12811, + "time_per_iteration": 2.6894338130950928 + }, + { + "auxiliary_loss_clip": 0.00973776, + "auxiliary_loss_mlp": 0.01003777, + "balance_loss_clip": 1.00069726, + "balance_loss_mlp": 1.00275791, + "epoch": 0.7702991131820232, + "flos": 70574128439040.0, + "grad_norm": 0.8097556522867686, + "language_loss": 0.5669229, + "learning_rate": 5.28295371338402e-07, + "loss": 0.58669853, + "num_input_tokens_seen": 276371715, + "step": 12812, + "time_per_iteration": 3.236398696899414 + }, + { + "auxiliary_loss_clip": 0.01027652, + "auxiliary_loss_mlp": 0.01031833, + "balance_loss_clip": 1.02607489, + "balance_loss_mlp": 1.0208236, + "epoch": 0.7703592364346911, + "flos": 25480021242240.0, + "grad_norm": 1.6247426995412058, + "language_loss": 0.71892548, + "learning_rate": 5.280316783577836e-07, + "loss": 0.73952031, + "num_input_tokens_seen": 276389895, + "step": 12813, + "time_per_iteration": 2.7663090229034424 + }, + { + "auxiliary_loss_clip": 0.01054262, + "auxiliary_loss_mlp": 0.01025268, + "balance_loss_clip": 1.02546155, + "balance_loss_mlp": 1.01410985, + "epoch": 0.7704193596873591, + "flos": 19280906375040.0, + "grad_norm": 1.4972602286663903, + "language_loss": 0.66249692, + "learning_rate": 5.27768041194351e-07, + "loss": 0.68329221, + "num_input_tokens_seen": 276408990, + "step": 12814, + "time_per_iteration": 2.6358602046966553 + }, + { + "auxiliary_loss_clip": 0.01038431, + "auxiliary_loss_mlp": 0.01033131, + "balance_loss_clip": 1.02268243, + "balance_loss_mlp": 1.02250314, + "epoch": 0.7704794829400271, + "flos": 23658452778240.0, + "grad_norm": 2.147972223174394, + "language_loss": 0.65736592, + "learning_rate": 5.275044598581018e-07, + "loss": 0.67808151, + "num_input_tokens_seen": 276428190, + "step": 12815, + "time_per_iteration": 2.6879966259002686 + }, + { + "auxiliary_loss_clip": 0.01052614, + "auxiliary_loss_mlp": 0.01026331, + "balance_loss_clip": 1.02474284, + "balance_loss_mlp": 1.01550651, + "epoch": 0.770539606192695, + "flos": 18989311766400.0, + "grad_norm": 2.930568179128498, + "language_loss": 0.65684414, + "learning_rate": 5.272409343590322e-07, + "loss": 0.67763352, + "num_input_tokens_seen": 276446855, + "step": 12816, + "time_per_iteration": 2.652657985687256 + }, + { + "auxiliary_loss_clip": 0.01053451, + "auxiliary_loss_mlp": 0.0102867, + "balance_loss_clip": 1.02643979, + "balance_loss_mlp": 1.01849532, + "epoch": 0.770599729445363, + "flos": 11830160142720.0, + "grad_norm": 2.2389568297949918, + "language_loss": 0.72496319, + "learning_rate": 5.26977464707133e-07, + "loss": 0.7457844, + "num_input_tokens_seen": 276462000, + "step": 12817, + "time_per_iteration": 2.622044324874878 + }, + { + "auxiliary_loss_clip": 0.01022305, + "auxiliary_loss_mlp": 0.0102852, + "balance_loss_clip": 1.02459335, + "balance_loss_mlp": 1.01809537, + "epoch": 0.770659852698031, + "flos": 17822610109440.0, + "grad_norm": 2.970234908272209, + "language_loss": 0.61215788, + "learning_rate": 5.267140509123957e-07, + "loss": 0.63266611, + "num_input_tokens_seen": 276481190, + "step": 12818, + "time_per_iteration": 2.827179431915283 + }, + { + "auxiliary_loss_clip": 0.01049703, + "auxiliary_loss_mlp": 0.01023895, + "balance_loss_clip": 1.02375579, + "balance_loss_mlp": 1.0145967, + "epoch": 0.770719975950699, + "flos": 21871968923520.0, + "grad_norm": 2.036005524424119, + "language_loss": 0.67238033, + "learning_rate": 5.264506929848093e-07, + "loss": 0.69311631, + "num_input_tokens_seen": 276499520, + "step": 12819, + "time_per_iteration": 2.737213373184204 + }, + { + "auxiliary_loss_clip": 0.01064427, + "auxiliary_loss_mlp": 0.01028291, + "balance_loss_clip": 1.02685332, + "balance_loss_mlp": 1.0174433, + "epoch": 0.7707800992033669, + "flos": 21325049464320.0, + "grad_norm": 1.8220227975550294, + "language_loss": 0.57303637, + "learning_rate": 5.261873909343608e-07, + "loss": 0.59396356, + "num_input_tokens_seen": 276519110, + "step": 12820, + "time_per_iteration": 2.594118595123291 + }, + { + "auxiliary_loss_clip": 0.01041779, + "auxiliary_loss_mlp": 0.01027943, + "balance_loss_clip": 1.02484894, + "balance_loss_mlp": 1.01683903, + "epoch": 0.7708402224560349, + "flos": 28179426188160.0, + "grad_norm": 1.6065950528731774, + "language_loss": 0.81013298, + "learning_rate": 5.259241447710343e-07, + "loss": 0.83083022, + "num_input_tokens_seen": 276538805, + "step": 12821, + "time_per_iteration": 2.740201950073242 + }, + { + "auxiliary_loss_clip": 0.01062889, + "auxiliary_loss_mlp": 0.01030296, + "balance_loss_clip": 1.02565193, + "balance_loss_mlp": 1.01952589, + "epoch": 0.7709003457087028, + "flos": 15377057556480.0, + "grad_norm": 2.5833488329426357, + "language_loss": 0.68235779, + "learning_rate": 5.256609545048114e-07, + "loss": 0.70328963, + "num_input_tokens_seen": 276554770, + "step": 12822, + "time_per_iteration": 2.5877785682678223 + }, + { + "auxiliary_loss_clip": 0.01038804, + "auxiliary_loss_mlp": 0.01030398, + "balance_loss_clip": 1.02330816, + "balance_loss_mlp": 1.01950824, + "epoch": 0.7709604689613708, + "flos": 30621854257920.0, + "grad_norm": 1.7809037648496642, + "language_loss": 0.72615063, + "learning_rate": 5.253978201456733e-07, + "loss": 0.74684262, + "num_input_tokens_seen": 276574535, + "step": 12823, + "time_per_iteration": 2.839794158935547 + }, + { + "auxiliary_loss_clip": 0.01056652, + "auxiliary_loss_mlp": 0.01035617, + "balance_loss_clip": 1.02557552, + "balance_loss_mlp": 1.02296245, + "epoch": 0.7710205922140387, + "flos": 20301272023680.0, + "grad_norm": 1.5464639127272255, + "language_loss": 0.76741743, + "learning_rate": 5.251347417035969e-07, + "loss": 0.78834009, + "num_input_tokens_seen": 276592925, + "step": 12824, + "time_per_iteration": 2.6836588382720947 + }, + { + "auxiliary_loss_clip": 0.01044826, + "auxiliary_loss_mlp": 0.01025939, + "balance_loss_clip": 1.02677655, + "balance_loss_mlp": 1.01513231, + "epoch": 0.7710807154667068, + "flos": 19644214487040.0, + "grad_norm": 1.6706263905555672, + "language_loss": 0.71822912, + "learning_rate": 5.248717191885592e-07, + "loss": 0.73893678, + "num_input_tokens_seen": 276610540, + "step": 12825, + "time_per_iteration": 2.7525596618652344 + }, + { + "auxiliary_loss_clip": 0.01061332, + "auxiliary_loss_mlp": 0.01031721, + "balance_loss_clip": 1.02540159, + "balance_loss_mlp": 1.02206492, + "epoch": 0.7711408387193747, + "flos": 20006337450240.0, + "grad_norm": 1.7531679853439017, + "language_loss": 0.73650253, + "learning_rate": 5.246087526105343e-07, + "loss": 0.75743306, + "num_input_tokens_seen": 276629200, + "step": 12826, + "time_per_iteration": 2.5539538860321045 + }, + { + "auxiliary_loss_clip": 0.01064118, + "auxiliary_loss_mlp": 0.01026496, + "balance_loss_clip": 1.02461839, + "balance_loss_mlp": 1.01546907, + "epoch": 0.7712009619720427, + "flos": 24971131307520.0, + "grad_norm": 1.7369095633847427, + "language_loss": 0.81358433, + "learning_rate": 5.243458419794933e-07, + "loss": 0.83449042, + "num_input_tokens_seen": 276648655, + "step": 12827, + "time_per_iteration": 2.5772173404693604 + }, + { + "auxiliary_loss_clip": 0.01005455, + "auxiliary_loss_mlp": 0.01000285, + "balance_loss_clip": 1.00037146, + "balance_loss_mlp": 0.99933112, + "epoch": 0.7712610852247107, + "flos": 63249681404160.0, + "grad_norm": 0.8573855235246531, + "language_loss": 0.55099726, + "learning_rate": 5.240829873054051e-07, + "loss": 0.57105458, + "num_input_tokens_seen": 276716500, + "step": 12828, + "time_per_iteration": 4.984652519226074 + }, + { + "auxiliary_loss_clip": 0.01009856, + "auxiliary_loss_mlp": 0.01028801, + "balance_loss_clip": 1.02010417, + "balance_loss_mlp": 1.01850748, + "epoch": 0.7713212084773786, + "flos": 18697860812160.0, + "grad_norm": 1.7447066601505445, + "language_loss": 0.69921523, + "learning_rate": 5.23820188598238e-07, + "loss": 0.71960181, + "num_input_tokens_seen": 276733535, + "step": 12829, + "time_per_iteration": 2.6523706912994385 + }, + { + "auxiliary_loss_clip": 0.0103977, + "auxiliary_loss_mlp": 0.01030991, + "balance_loss_clip": 1.02775443, + "balance_loss_mlp": 1.01936233, + "epoch": 0.7713813317300466, + "flos": 14173367869440.0, + "grad_norm": 2.6541612900639078, + "language_loss": 0.79739958, + "learning_rate": 5.235574458679579e-07, + "loss": 0.81810713, + "num_input_tokens_seen": 276749575, + "step": 12830, + "time_per_iteration": 2.687385082244873 + }, + { + "auxiliary_loss_clip": 0.01057298, + "auxiliary_loss_mlp": 0.01031383, + "balance_loss_clip": 1.02642441, + "balance_loss_mlp": 1.01912856, + "epoch": 0.7714414549827145, + "flos": 25703960584320.0, + "grad_norm": 2.8608500326905273, + "language_loss": 0.78200346, + "learning_rate": 5.232947591245269e-07, + "loss": 0.8028903, + "num_input_tokens_seen": 276769460, + "step": 12831, + "time_per_iteration": 4.287460565567017 + }, + { + "auxiliary_loss_clip": 0.01036524, + "auxiliary_loss_mlp": 0.01031486, + "balance_loss_clip": 1.02310979, + "balance_loss_mlp": 1.02022099, + "epoch": 0.7715015782353826, + "flos": 30555312312960.0, + "grad_norm": 3.7467162585650833, + "language_loss": 0.61403251, + "learning_rate": 5.230321283779071e-07, + "loss": 0.63471258, + "num_input_tokens_seen": 276790820, + "step": 12832, + "time_per_iteration": 2.738797664642334 + }, + { + "auxiliary_loss_clip": 0.01043598, + "auxiliary_loss_mlp": 0.01033312, + "balance_loss_clip": 1.02477717, + "balance_loss_mlp": 1.02232659, + "epoch": 0.7715617014880505, + "flos": 20229343038720.0, + "grad_norm": 1.5033076455745096, + "language_loss": 0.79572839, + "learning_rate": 5.227695536380572e-07, + "loss": 0.81649745, + "num_input_tokens_seen": 276811345, + "step": 12833, + "time_per_iteration": 2.662578582763672 + }, + { + "auxiliary_loss_clip": 0.00970409, + "auxiliary_loss_mlp": 0.0100593, + "balance_loss_clip": 1.00390041, + "balance_loss_mlp": 1.00493431, + "epoch": 0.7716218247407185, + "flos": 63664770971520.0, + "grad_norm": 0.8550683752045681, + "language_loss": 0.55380011, + "learning_rate": 5.22507034914933e-07, + "loss": 0.57356346, + "num_input_tokens_seen": 276870950, + "step": 12834, + "time_per_iteration": 3.259648323059082 + }, + { + "auxiliary_loss_clip": 0.0101314, + "auxiliary_loss_mlp": 0.01031626, + "balance_loss_clip": 1.02191365, + "balance_loss_mlp": 1.01994967, + "epoch": 0.7716819479933864, + "flos": 19791807471360.0, + "grad_norm": 3.6861806131434673, + "language_loss": 0.72891521, + "learning_rate": 5.222445722184903e-07, + "loss": 0.74936295, + "num_input_tokens_seen": 276890760, + "step": 12835, + "time_per_iteration": 2.7498178482055664 + }, + { + "auxiliary_loss_clip": 0.01031533, + "auxiliary_loss_mlp": 0.00747589, + "balance_loss_clip": 1.02308834, + "balance_loss_mlp": 1.00037837, + "epoch": 0.7717420712460544, + "flos": 18442176825600.0, + "grad_norm": 1.7604832363717335, + "language_loss": 0.70197368, + "learning_rate": 5.219821655586814e-07, + "loss": 0.71976483, + "num_input_tokens_seen": 276909625, + "step": 12836, + "time_per_iteration": 2.737114429473877 + }, + { + "auxiliary_loss_clip": 0.01042111, + "auxiliary_loss_mlp": 0.01028108, + "balance_loss_clip": 1.02549982, + "balance_loss_mlp": 1.01800489, + "epoch": 0.7718021944987223, + "flos": 35189476456320.0, + "grad_norm": 2.369670267318842, + "language_loss": 0.59261668, + "learning_rate": 5.217198149454575e-07, + "loss": 0.61331892, + "num_input_tokens_seen": 276930760, + "step": 12837, + "time_per_iteration": 2.789026975631714 + }, + { + "auxiliary_loss_clip": 0.01000964, + "auxiliary_loss_mlp": 0.01004143, + "balance_loss_clip": 1.00445688, + "balance_loss_mlp": 1.00302792, + "epoch": 0.7718623177513904, + "flos": 67923167961600.0, + "grad_norm": 0.8599920122041804, + "language_loss": 0.55807924, + "learning_rate": 5.214575203887666e-07, + "loss": 0.57813036, + "num_input_tokens_seen": 276989580, + "step": 12838, + "time_per_iteration": 3.174576759338379 + }, + { + "auxiliary_loss_clip": 0.01050374, + "auxiliary_loss_mlp": 0.01025335, + "balance_loss_clip": 1.02406096, + "balance_loss_mlp": 1.0154469, + "epoch": 0.7719224410040583, + "flos": 18581401941120.0, + "grad_norm": 2.150659889662785, + "language_loss": 0.69622201, + "learning_rate": 5.211952818985538e-07, + "loss": 0.71697903, + "num_input_tokens_seen": 277005450, + "step": 12839, + "time_per_iteration": 2.6365106105804443 + }, + { + "auxiliary_loss_clip": 0.01050799, + "auxiliary_loss_mlp": 0.01026455, + "balance_loss_clip": 1.02483845, + "balance_loss_mlp": 1.01614976, + "epoch": 0.7719825642567263, + "flos": 23075802264960.0, + "grad_norm": 1.7440838085552535, + "language_loss": 0.80149853, + "learning_rate": 5.209330994847647e-07, + "loss": 0.82227111, + "num_input_tokens_seen": 277023055, + "step": 12840, + "time_per_iteration": 2.6357598304748535 + }, + { + "auxiliary_loss_clip": 0.01051828, + "auxiliary_loss_mlp": 0.00747584, + "balance_loss_clip": 1.02519548, + "balance_loss_mlp": 1.0003264, + "epoch": 0.7720426875093943, + "flos": 20339086066560.0, + "grad_norm": 1.8037219561886961, + "language_loss": 0.8003636, + "learning_rate": 5.206709731573402e-07, + "loss": 0.81835771, + "num_input_tokens_seen": 277041150, + "step": 12841, + "time_per_iteration": 2.6865928173065186 + }, + { + "auxiliary_loss_clip": 0.01032828, + "auxiliary_loss_mlp": 0.01026151, + "balance_loss_clip": 1.02555895, + "balance_loss_mlp": 1.01529145, + "epoch": 0.7721028107620622, + "flos": 23880704181120.0, + "grad_norm": 1.4657828722776434, + "language_loss": 0.76438034, + "learning_rate": 5.204089029262208e-07, + "loss": 0.7849701, + "num_input_tokens_seen": 277063895, + "step": 12842, + "time_per_iteration": 2.7655906677246094 + }, + { + "auxiliary_loss_clip": 0.0101295, + "auxiliary_loss_mlp": 0.00747788, + "balance_loss_clip": 1.02412832, + "balance_loss_mlp": 1.00037837, + "epoch": 0.7721629340147302, + "flos": 26651571235200.0, + "grad_norm": 1.616225220665253, + "language_loss": 0.68771672, + "learning_rate": 5.201468888013445e-07, + "loss": 0.70532405, + "num_input_tokens_seen": 277084045, + "step": 12843, + "time_per_iteration": 2.8084070682525635 + }, + { + "auxiliary_loss_clip": 0.01038392, + "auxiliary_loss_mlp": 0.01026342, + "balance_loss_clip": 1.02253067, + "balance_loss_mlp": 1.01623869, + "epoch": 0.7722230572673981, + "flos": 21178857110400.0, + "grad_norm": 1.8028185382058086, + "language_loss": 0.73457515, + "learning_rate": 5.198849307926465e-07, + "loss": 0.75522256, + "num_input_tokens_seen": 277102625, + "step": 12844, + "time_per_iteration": 2.6731560230255127 + }, + { + "auxiliary_loss_clip": 0.01046449, + "auxiliary_loss_mlp": 0.01029278, + "balance_loss_clip": 1.02380967, + "balance_loss_mlp": 1.01797688, + "epoch": 0.7722831805200662, + "flos": 27964644814080.0, + "grad_norm": 1.4745443082810314, + "language_loss": 0.71484172, + "learning_rate": 5.196230289100596e-07, + "loss": 0.73559904, + "num_input_tokens_seen": 277123210, + "step": 12845, + "time_per_iteration": 2.6582303047180176 + }, + { + "auxiliary_loss_clip": 0.01060617, + "auxiliary_loss_mlp": 0.01027672, + "balance_loss_clip": 1.02481818, + "balance_loss_mlp": 1.01788521, + "epoch": 0.7723433037727341, + "flos": 33875576864640.0, + "grad_norm": 1.741574192949804, + "language_loss": 0.64559042, + "learning_rate": 5.193611831635159e-07, + "loss": 0.66647339, + "num_input_tokens_seen": 277144895, + "step": 12846, + "time_per_iteration": 2.676030158996582 + }, + { + "auxiliary_loss_clip": 0.00996963, + "auxiliary_loss_mlp": 0.00746804, + "balance_loss_clip": 1.00148034, + "balance_loss_mlp": 1.00079441, + "epoch": 0.7724034270254021, + "flos": 62848271940480.0, + "grad_norm": 0.767496860453017, + "language_loss": 0.61771643, + "learning_rate": 5.19099393562945e-07, + "loss": 0.63515413, + "num_input_tokens_seen": 277205160, + "step": 12847, + "time_per_iteration": 5.096048355102539 + }, + { + "auxiliary_loss_clip": 0.01060088, + "auxiliary_loss_mlp": 0.01026383, + "balance_loss_clip": 1.02311397, + "balance_loss_mlp": 1.0158987, + "epoch": 0.77246355027807, + "flos": 23295467888640.0, + "grad_norm": 1.611520868196418, + "language_loss": 0.78589231, + "learning_rate": 5.188376601182732e-07, + "loss": 0.80675697, + "num_input_tokens_seen": 277223005, + "step": 12848, + "time_per_iteration": 2.659435272216797 + }, + { + "auxiliary_loss_clip": 0.01027279, + "auxiliary_loss_mlp": 0.01032403, + "balance_loss_clip": 1.02317679, + "balance_loss_mlp": 1.02064323, + "epoch": 0.772523673530738, + "flos": 20121287950080.0, + "grad_norm": 1.8992361638922302, + "language_loss": 0.72624511, + "learning_rate": 5.185759828394261e-07, + "loss": 0.74684191, + "num_input_tokens_seen": 277241785, + "step": 12849, + "time_per_iteration": 2.760044574737549 + }, + { + "auxiliary_loss_clip": 0.01062031, + "auxiliary_loss_mlp": 0.01027853, + "balance_loss_clip": 1.02481437, + "balance_loss_mlp": 1.01720786, + "epoch": 0.7725837967834059, + "flos": 17820096157440.0, + "grad_norm": 1.794437025492539, + "language_loss": 0.77884936, + "learning_rate": 5.183143617363261e-07, + "loss": 0.79974818, + "num_input_tokens_seen": 277259050, + "step": 12850, + "time_per_iteration": 2.6197757720947266 + }, + { + "auxiliary_loss_clip": 0.01008684, + "auxiliary_loss_mlp": 0.00747633, + "balance_loss_clip": 1.02111495, + "balance_loss_mlp": 1.00035453, + "epoch": 0.772643920036074, + "flos": 27198921657600.0, + "grad_norm": 1.5562221058166077, + "language_loss": 0.79724473, + "learning_rate": 5.180527968188935e-07, + "loss": 0.81480789, + "num_input_tokens_seen": 277278235, + "step": 12851, + "time_per_iteration": 2.8866400718688965 + }, + { + "auxiliary_loss_clip": 0.01047183, + "auxiliary_loss_mlp": 0.01028376, + "balance_loss_clip": 1.02335727, + "balance_loss_mlp": 1.01655006, + "epoch": 0.7727040432887419, + "flos": 21579512388480.0, + "grad_norm": 1.4121185756049959, + "language_loss": 0.73474765, + "learning_rate": 5.177912880970474e-07, + "loss": 0.75550324, + "num_input_tokens_seen": 277298355, + "step": 12852, + "time_per_iteration": 2.733477830886841 + }, + { + "auxiliary_loss_clip": 0.01061088, + "auxiliary_loss_mlp": 0.01030642, + "balance_loss_clip": 1.02407193, + "balance_loss_mlp": 1.02024734, + "epoch": 0.7727641665414099, + "flos": 22236641752320.0, + "grad_norm": 1.622394690582212, + "language_loss": 0.82605219, + "learning_rate": 5.17529835580704e-07, + "loss": 0.84696949, + "num_input_tokens_seen": 277316095, + "step": 12853, + "time_per_iteration": 2.5504205226898193 + }, + { + "auxiliary_loss_clip": 0.01006047, + "auxiliary_loss_mlp": 0.01000378, + "balance_loss_clip": 1.00084817, + "balance_loss_mlp": 0.99941808, + "epoch": 0.7728242897940779, + "flos": 54832221463680.0, + "grad_norm": 0.7940003297191313, + "language_loss": 0.545048, + "learning_rate": 5.172684392797786e-07, + "loss": 0.56511229, + "num_input_tokens_seen": 277380130, + "step": 12854, + "time_per_iteration": 3.2427306175231934 + }, + { + "auxiliary_loss_clip": 0.01054277, + "auxiliary_loss_mlp": 0.0102921, + "balance_loss_clip": 1.02449846, + "balance_loss_mlp": 1.01728308, + "epoch": 0.7728844130467458, + "flos": 34461962392320.0, + "grad_norm": 1.5514887943424085, + "language_loss": 0.71954668, + "learning_rate": 5.170070992041826e-07, + "loss": 0.74038154, + "num_input_tokens_seen": 277404015, + "step": 12855, + "time_per_iteration": 2.703860282897949 + }, + { + "auxiliary_loss_clip": 0.01062352, + "auxiliary_loss_mlp": 0.01029676, + "balance_loss_clip": 1.02476168, + "balance_loss_mlp": 1.01839352, + "epoch": 0.7729445362994138, + "flos": 18916341287040.0, + "grad_norm": 1.8316758190010178, + "language_loss": 0.67625844, + "learning_rate": 5.167458153638254e-07, + "loss": 0.69717878, + "num_input_tokens_seen": 277421375, + "step": 12856, + "time_per_iteration": 2.634089469909668 + }, + { + "auxiliary_loss_clip": 0.01029139, + "auxiliary_loss_mlp": 0.01028047, + "balance_loss_clip": 1.02261436, + "balance_loss_mlp": 1.01744294, + "epoch": 0.7730046595520818, + "flos": 22200048771840.0, + "grad_norm": 2.109825119067812, + "language_loss": 0.79009229, + "learning_rate": 5.164845877686162e-07, + "loss": 0.81066418, + "num_input_tokens_seen": 277440170, + "step": 12857, + "time_per_iteration": 4.564117193222046 + }, + { + "auxiliary_loss_clip": 0.01021022, + "auxiliary_loss_mlp": 0.0074756, + "balance_loss_clip": 1.03166199, + "balance_loss_mlp": 1.00030947, + "epoch": 0.7730647828047498, + "flos": 13552328695680.0, + "grad_norm": 2.1855361398387587, + "language_loss": 0.78667295, + "learning_rate": 5.162234164284591e-07, + "loss": 0.80435872, + "num_input_tokens_seen": 277456880, + "step": 12858, + "time_per_iteration": 2.850369930267334 + }, + { + "auxiliary_loss_clip": 0.01062278, + "auxiliary_loss_mlp": 0.01028561, + "balance_loss_clip": 1.02444029, + "balance_loss_mlp": 1.01780272, + "epoch": 0.7731249060574177, + "flos": 21976037602560.0, + "grad_norm": 1.8566993223376533, + "language_loss": 0.76875728, + "learning_rate": 5.159623013532591e-07, + "loss": 0.7896657, + "num_input_tokens_seen": 277475365, + "step": 12859, + "time_per_iteration": 2.564047336578369 + }, + { + "auxiliary_loss_clip": 0.01051194, + "auxiliary_loss_mlp": 0.01026318, + "balance_loss_clip": 1.02622211, + "balance_loss_mlp": 1.01706171, + "epoch": 0.7731850293100857, + "flos": 22601817371520.0, + "grad_norm": 1.4571914972081668, + "language_loss": 0.67709315, + "learning_rate": 5.157012425529186e-07, + "loss": 0.69786823, + "num_input_tokens_seen": 277494975, + "step": 12860, + "time_per_iteration": 2.755124568939209 + }, + { + "auxiliary_loss_clip": 0.01062374, + "auxiliary_loss_mlp": 0.01033275, + "balance_loss_clip": 1.02327061, + "balance_loss_mlp": 1.02225447, + "epoch": 0.7732451525627536, + "flos": 14098422142080.0, + "grad_norm": 2.6364371680670793, + "language_loss": 0.74942875, + "learning_rate": 5.154402400373343e-07, + "loss": 0.77038527, + "num_input_tokens_seen": 277510520, + "step": 12861, + "time_per_iteration": 2.564188003540039 + }, + { + "auxiliary_loss_clip": 0.01054193, + "auxiliary_loss_mlp": 0.01023913, + "balance_loss_clip": 1.02559769, + "balance_loss_mlp": 1.01265955, + "epoch": 0.7733052758154216, + "flos": 21470020755840.0, + "grad_norm": 1.9495126181454354, + "language_loss": 0.74584246, + "learning_rate": 5.15179293816405e-07, + "loss": 0.7666235, + "num_input_tokens_seen": 277530505, + "step": 12862, + "time_per_iteration": 2.9474117755889893 + }, + { + "auxiliary_loss_clip": 0.01015001, + "auxiliary_loss_mlp": 0.0102916, + "balance_loss_clip": 1.02168393, + "balance_loss_mlp": 1.0191884, + "epoch": 0.7733653990680895, + "flos": 21394284929280.0, + "grad_norm": 1.5286086959820786, + "language_loss": 0.83025813, + "learning_rate": 5.149184039000256e-07, + "loss": 0.85069978, + "num_input_tokens_seen": 277550810, + "step": 12863, + "time_per_iteration": 2.8003125190734863 + }, + { + "auxiliary_loss_clip": 0.01060309, + "auxiliary_loss_mlp": 0.01030161, + "balance_loss_clip": 1.02352417, + "balance_loss_mlp": 1.02002835, + "epoch": 0.7734255223207576, + "flos": 17676058619520.0, + "grad_norm": 1.6554914972618753, + "language_loss": 0.72871661, + "learning_rate": 5.146575702980898e-07, + "loss": 0.74962133, + "num_input_tokens_seen": 277567680, + "step": 12864, + "time_per_iteration": 2.5346264839172363 + }, + { + "auxiliary_loss_clip": 0.01039636, + "auxiliary_loss_mlp": 0.01028163, + "balance_loss_clip": 1.02353334, + "balance_loss_mlp": 1.01808357, + "epoch": 0.7734856455734255, + "flos": 25230837617280.0, + "grad_norm": 1.750360361978733, + "language_loss": 0.82568836, + "learning_rate": 5.143967930204871e-07, + "loss": 0.84636635, + "num_input_tokens_seen": 277588970, + "step": 12865, + "time_per_iteration": 2.682859420776367 + }, + { + "auxiliary_loss_clip": 0.01066827, + "auxiliary_loss_mlp": 0.01028786, + "balance_loss_clip": 1.02660537, + "balance_loss_mlp": 1.01683545, + "epoch": 0.7735457688260935, + "flos": 23433112805760.0, + "grad_norm": 1.968007949108204, + "language_loss": 0.71996164, + "learning_rate": 5.141360720771077e-07, + "loss": 0.7409178, + "num_input_tokens_seen": 277605450, + "step": 12866, + "time_per_iteration": 2.604640245437622 + }, + { + "auxiliary_loss_clip": 0.01023211, + "auxiliary_loss_mlp": 0.00747655, + "balance_loss_clip": 1.02472222, + "balance_loss_mlp": 1.00032306, + "epoch": 0.7736058920787615, + "flos": 18729246320640.0, + "grad_norm": 3.2148474347998017, + "language_loss": 0.64961624, + "learning_rate": 5.138754074778371e-07, + "loss": 0.6673249, + "num_input_tokens_seen": 277622530, + "step": 12867, + "time_per_iteration": 2.8558969497680664 + }, + { + "auxiliary_loss_clip": 0.01050782, + "auxiliary_loss_mlp": 0.01029867, + "balance_loss_clip": 1.02413261, + "balance_loss_mlp": 1.01984096, + "epoch": 0.7736660153314294, + "flos": 22893304239360.0, + "grad_norm": 1.4340532169442952, + "language_loss": 0.71015847, + "learning_rate": 5.136147992325595e-07, + "loss": 0.7309649, + "num_input_tokens_seen": 277642700, + "step": 12868, + "time_per_iteration": 2.737809419631958 + }, + { + "auxiliary_loss_clip": 0.01056476, + "auxiliary_loss_mlp": 0.01029085, + "balance_loss_clip": 1.02706897, + "balance_loss_mlp": 1.01812327, + "epoch": 0.7737261385840974, + "flos": 13800901789440.0, + "grad_norm": 1.9908447509584775, + "language_loss": 0.77783573, + "learning_rate": 5.133542473511578e-07, + "loss": 0.79869127, + "num_input_tokens_seen": 277660005, + "step": 12869, + "time_per_iteration": 2.623501777648926 + }, + { + "auxiliary_loss_clip": 0.01049146, + "auxiliary_loss_mlp": 0.01026865, + "balance_loss_clip": 1.0230341, + "balance_loss_mlp": 1.01615453, + "epoch": 0.7737862618367654, + "flos": 28730727106560.0, + "grad_norm": 1.5770420408894528, + "language_loss": 0.73694193, + "learning_rate": 5.130937518435124e-07, + "loss": 0.75770199, + "num_input_tokens_seen": 277682890, + "step": 12870, + "time_per_iteration": 2.6631624698638916 + }, + { + "auxiliary_loss_clip": 0.01050996, + "auxiliary_loss_mlp": 0.01030276, + "balance_loss_clip": 1.02362776, + "balance_loss_mlp": 1.01935005, + "epoch": 0.7738463850894334, + "flos": 17018570119680.0, + "grad_norm": 1.8417057723702883, + "language_loss": 0.76395285, + "learning_rate": 5.12833312719501e-07, + "loss": 0.78476554, + "num_input_tokens_seen": 277699330, + "step": 12871, + "time_per_iteration": 2.595088481903076 + }, + { + "auxiliary_loss_clip": 0.01037659, + "auxiliary_loss_mlp": 0.01032014, + "balance_loss_clip": 1.0232054, + "balance_loss_mlp": 1.02190483, + "epoch": 0.7739065083421013, + "flos": 20704010290560.0, + "grad_norm": 1.6048053225203345, + "language_loss": 0.6915037, + "learning_rate": 5.12572929988999e-07, + "loss": 0.71220046, + "num_input_tokens_seen": 277718750, + "step": 12872, + "time_per_iteration": 2.65653133392334 + }, + { + "auxiliary_loss_clip": 0.01063469, + "auxiliary_loss_mlp": 0.01028476, + "balance_loss_clip": 1.02495027, + "balance_loss_mlp": 1.01651287, + "epoch": 0.7739666315947693, + "flos": 20697222620160.0, + "grad_norm": 2.0814502266484864, + "language_loss": 0.85233104, + "learning_rate": 5.123126036618804e-07, + "loss": 0.87325048, + "num_input_tokens_seen": 277734645, + "step": 12873, + "time_per_iteration": 2.551135301589966 + }, + { + "auxiliary_loss_clip": 0.01063217, + "auxiliary_loss_mlp": 0.01027681, + "balance_loss_clip": 1.02513969, + "balance_loss_mlp": 1.01723266, + "epoch": 0.7740267548474372, + "flos": 29570677718400.0, + "grad_norm": 2.501881595812001, + "language_loss": 0.65565681, + "learning_rate": 5.120523337480174e-07, + "loss": 0.67656583, + "num_input_tokens_seen": 277755535, + "step": 12874, + "time_per_iteration": 2.676635265350342 + }, + { + "auxiliary_loss_clip": 0.01028894, + "auxiliary_loss_mlp": 0.01027832, + "balance_loss_clip": 1.02972984, + "balance_loss_mlp": 1.01710916, + "epoch": 0.7740868781001052, + "flos": 23659099223040.0, + "grad_norm": 1.581204324656758, + "language_loss": 0.6215288, + "learning_rate": 5.117921202572785e-07, + "loss": 0.6420961, + "num_input_tokens_seen": 277775585, + "step": 12875, + "time_per_iteration": 2.8117971420288086 + }, + { + "auxiliary_loss_clip": 0.01053648, + "auxiliary_loss_mlp": 0.01028707, + "balance_loss_clip": 1.02532446, + "balance_loss_mlp": 1.01769185, + "epoch": 0.7741470013527731, + "flos": 24717314828160.0, + "grad_norm": 1.9666696952698215, + "language_loss": 0.65391815, + "learning_rate": 5.115319631995318e-07, + "loss": 0.67474174, + "num_input_tokens_seen": 277794795, + "step": 12876, + "time_per_iteration": 4.307993650436401 + }, + { + "auxiliary_loss_clip": 0.01040925, + "auxiliary_loss_mlp": 0.01030807, + "balance_loss_clip": 1.02368999, + "balance_loss_mlp": 1.02030492, + "epoch": 0.7742071246054412, + "flos": 21871645701120.0, + "grad_norm": 2.1044629186498867, + "language_loss": 0.70991373, + "learning_rate": 5.112718625846433e-07, + "loss": 0.73063105, + "num_input_tokens_seen": 277813235, + "step": 12877, + "time_per_iteration": 4.313999891281128 + }, + { + "auxiliary_loss_clip": 0.01027487, + "auxiliary_loss_mlp": 0.0103723, + "balance_loss_clip": 1.02360845, + "balance_loss_mlp": 1.02474833, + "epoch": 0.7742672478581091, + "flos": 22674249146880.0, + "grad_norm": 1.7543131673700874, + "language_loss": 0.8270551, + "learning_rate": 5.110118184224736e-07, + "loss": 0.84770226, + "num_input_tokens_seen": 277832560, + "step": 12878, + "time_per_iteration": 2.8580923080444336 + }, + { + "auxiliary_loss_clip": 0.01045815, + "auxiliary_loss_mlp": 0.01029917, + "balance_loss_clip": 1.02696049, + "balance_loss_mlp": 1.01805592, + "epoch": 0.7743273711107771, + "flos": 18840892769280.0, + "grad_norm": 1.9168213912919951, + "language_loss": 0.73484969, + "learning_rate": 5.10751830722885e-07, + "loss": 0.75560701, + "num_input_tokens_seen": 277850120, + "step": 12879, + "time_per_iteration": 2.686992645263672 + }, + { + "auxiliary_loss_clip": 0.01037865, + "auxiliary_loss_mlp": 0.01025087, + "balance_loss_clip": 1.0221889, + "balance_loss_mlp": 1.01476979, + "epoch": 0.7743874943634451, + "flos": 28729326476160.0, + "grad_norm": 1.8611331784338503, + "language_loss": 0.79470438, + "learning_rate": 5.104918994957364e-07, + "loss": 0.81533384, + "num_input_tokens_seen": 277871020, + "step": 12880, + "time_per_iteration": 2.8068511486053467 + }, + { + "auxiliary_loss_clip": 0.01042069, + "auxiliary_loss_mlp": 0.01029631, + "balance_loss_clip": 1.02533984, + "balance_loss_mlp": 1.01934969, + "epoch": 0.774447617616113, + "flos": 21909639312000.0, + "grad_norm": 1.5531750198280743, + "language_loss": 0.70031697, + "learning_rate": 5.102320247508847e-07, + "loss": 0.72103393, + "num_input_tokens_seen": 277891525, + "step": 12881, + "time_per_iteration": 2.6709816455841064 + }, + { + "auxiliary_loss_clip": 0.01042715, + "auxiliary_loss_mlp": 0.01037847, + "balance_loss_clip": 1.02415287, + "balance_loss_mlp": 1.02611041, + "epoch": 0.774507740868781, + "flos": 19500643825920.0, + "grad_norm": 2.2757703898852375, + "language_loss": 0.84452188, + "learning_rate": 5.099722064981832e-07, + "loss": 0.86532748, + "num_input_tokens_seen": 277910425, + "step": 12882, + "time_per_iteration": 2.6620500087738037 + }, + { + "auxiliary_loss_clip": 0.00982072, + "auxiliary_loss_mlp": 0.01001826, + "balance_loss_clip": 1.0049907, + "balance_loss_mlp": 1.00077713, + "epoch": 0.774567864121449, + "flos": 59426560402560.0, + "grad_norm": 0.7714857594799238, + "language_loss": 0.60518456, + "learning_rate": 5.097124447474858e-07, + "loss": 0.62502348, + "num_input_tokens_seen": 277972795, + "step": 12883, + "time_per_iteration": 3.2019765377044678 + }, + { + "auxiliary_loss_clip": 0.0101835, + "auxiliary_loss_mlp": 0.01036069, + "balance_loss_clip": 1.02401829, + "balance_loss_mlp": 1.02370644, + "epoch": 0.774627987374117, + "flos": 13225326255360.0, + "grad_norm": 1.603729864981099, + "language_loss": 0.72783458, + "learning_rate": 5.094527395086416e-07, + "loss": 0.74837881, + "num_input_tokens_seen": 277990675, + "step": 12884, + "time_per_iteration": 2.7331185340881348 + }, + { + "auxiliary_loss_clip": 0.01052518, + "auxiliary_loss_mlp": 0.01028597, + "balance_loss_clip": 1.025267, + "balance_loss_mlp": 1.01891148, + "epoch": 0.7746881106267849, + "flos": 21394033534080.0, + "grad_norm": 1.6819902344980393, + "language_loss": 0.8100934, + "learning_rate": 5.091930907914986e-07, + "loss": 0.83090454, + "num_input_tokens_seen": 278010050, + "step": 12885, + "time_per_iteration": 2.6726129055023193 + }, + { + "auxiliary_loss_clip": 0.0106081, + "auxiliary_loss_mlp": 0.0102891, + "balance_loss_clip": 1.02390134, + "balance_loss_mlp": 1.01903331, + "epoch": 0.7747482338794529, + "flos": 25629338079360.0, + "grad_norm": 1.8249210252064654, + "language_loss": 0.64332867, + "learning_rate": 5.089334986059029e-07, + "loss": 0.66422588, + "num_input_tokens_seen": 278030660, + "step": 12886, + "time_per_iteration": 2.6169605255126953 + }, + { + "auxiliary_loss_clip": 0.01022441, + "auxiliary_loss_mlp": 0.01026375, + "balance_loss_clip": 1.0216012, + "balance_loss_mlp": 1.01678467, + "epoch": 0.7748083571321208, + "flos": 11546933402880.0, + "grad_norm": 1.985911029925226, + "language_loss": 0.69277787, + "learning_rate": 5.086739629616987e-07, + "loss": 0.71326602, + "num_input_tokens_seen": 278047645, + "step": 12887, + "time_per_iteration": 2.638076066970825 + }, + { + "auxiliary_loss_clip": 0.01051095, + "auxiliary_loss_mlp": 0.0102475, + "balance_loss_clip": 1.02447879, + "balance_loss_mlp": 1.01514757, + "epoch": 0.7748684803847888, + "flos": 19062425900160.0, + "grad_norm": 1.8920209904188123, + "language_loss": 0.70271492, + "learning_rate": 5.084144838687275e-07, + "loss": 0.72347343, + "num_input_tokens_seen": 278066170, + "step": 12888, + "time_per_iteration": 2.6197309494018555 + }, + { + "auxiliary_loss_clip": 0.01051542, + "auxiliary_loss_mlp": 0.01029436, + "balance_loss_clip": 1.02344763, + "balance_loss_mlp": 1.01848662, + "epoch": 0.7749286036374567, + "flos": 22273162905600.0, + "grad_norm": 1.6350031732605341, + "language_loss": 0.81702936, + "learning_rate": 5.081550613368279e-07, + "loss": 0.83783913, + "num_input_tokens_seen": 278085545, + "step": 12889, + "time_per_iteration": 2.664414167404175 + }, + { + "auxiliary_loss_clip": 0.01029276, + "auxiliary_loss_mlp": 0.01028129, + "balance_loss_clip": 1.02376676, + "balance_loss_mlp": 1.01819921, + "epoch": 0.7749887268901248, + "flos": 20192462749440.0, + "grad_norm": 1.8862528415788737, + "language_loss": 0.79399347, + "learning_rate": 5.07895695375838e-07, + "loss": 0.81456757, + "num_input_tokens_seen": 278102995, + "step": 12890, + "time_per_iteration": 2.695544719696045 + }, + { + "auxiliary_loss_clip": 0.01036282, + "auxiliary_loss_mlp": 0.01030779, + "balance_loss_clip": 1.02677989, + "balance_loss_mlp": 1.01973367, + "epoch": 0.7750488501427927, + "flos": 20337541781760.0, + "grad_norm": 2.1236207553213973, + "language_loss": 0.66169155, + "learning_rate": 5.076363859955932e-07, + "loss": 0.6823622, + "num_input_tokens_seen": 278121460, + "step": 12891, + "time_per_iteration": 2.7775747776031494 + }, + { + "auxiliary_loss_clip": 0.01050016, + "auxiliary_loss_mlp": 0.01029094, + "balance_loss_clip": 1.02355814, + "balance_loss_mlp": 1.01836514, + "epoch": 0.7751089733954607, + "flos": 28364043116160.0, + "grad_norm": 1.4842502801293034, + "language_loss": 0.78776443, + "learning_rate": 5.073771332059257e-07, + "loss": 0.80855554, + "num_input_tokens_seen": 278143905, + "step": 12892, + "time_per_iteration": 2.6676204204559326 + }, + { + "auxiliary_loss_clip": 0.01055413, + "auxiliary_loss_mlp": 0.01028109, + "balance_loss_clip": 1.02616596, + "balance_loss_mlp": 1.01699901, + "epoch": 0.7751690966481286, + "flos": 16943803960320.0, + "grad_norm": 2.281771327154613, + "language_loss": 0.67268944, + "learning_rate": 5.071179370166669e-07, + "loss": 0.69352466, + "num_input_tokens_seen": 278160850, + "step": 12893, + "time_per_iteration": 2.5935654640197754 + }, + { + "auxiliary_loss_clip": 0.00998854, + "auxiliary_loss_mlp": 0.01005128, + "balance_loss_clip": 1.00350034, + "balance_loss_mlp": 1.0040493, + "epoch": 0.7752292199007966, + "flos": 65668050339840.0, + "grad_norm": 0.8131748995578619, + "language_loss": 0.58573073, + "learning_rate": 5.068587974376468e-07, + "loss": 0.60577065, + "num_input_tokens_seen": 278219950, + "step": 12894, + "time_per_iteration": 5.133985757827759 + }, + { + "auxiliary_loss_clip": 0.01045516, + "auxiliary_loss_mlp": 0.01031012, + "balance_loss_clip": 1.0266695, + "balance_loss_mlp": 1.01985407, + "epoch": 0.7752893431534646, + "flos": 20594662312320.0, + "grad_norm": 2.050137845082361, + "language_loss": 0.78595138, + "learning_rate": 5.065997144786895e-07, + "loss": 0.80671668, + "num_input_tokens_seen": 278237805, + "step": 12895, + "time_per_iteration": 2.6739466190338135 + }, + { + "auxiliary_loss_clip": 0.01028264, + "auxiliary_loss_mlp": 0.01030107, + "balance_loss_clip": 1.02482724, + "balance_loss_mlp": 1.01782823, + "epoch": 0.7753494664061326, + "flos": 20485350247680.0, + "grad_norm": 1.9099634046386886, + "language_loss": 0.67695069, + "learning_rate": 5.063406881496209e-07, + "loss": 0.69753438, + "num_input_tokens_seen": 278257660, + "step": 12896, + "time_per_iteration": 2.748270273208618 + }, + { + "auxiliary_loss_clip": 0.01040166, + "auxiliary_loss_mlp": 0.01032643, + "balance_loss_clip": 1.0237534, + "balance_loss_mlp": 1.02279019, + "epoch": 0.7754095896588006, + "flos": 20265900105600.0, + "grad_norm": 1.741075827717598, + "language_loss": 0.68889868, + "learning_rate": 5.060817184602629e-07, + "loss": 0.70962679, + "num_input_tokens_seen": 278275110, + "step": 12897, + "time_per_iteration": 2.727724313735962 + }, + { + "auxiliary_loss_clip": 0.01064828, + "auxiliary_loss_mlp": 0.01033826, + "balance_loss_clip": 1.02641153, + "balance_loss_mlp": 1.02221537, + "epoch": 0.7754697129114685, + "flos": 23331091201920.0, + "grad_norm": 1.6707590038675426, + "language_loss": 0.75156522, + "learning_rate": 5.058228054204364e-07, + "loss": 0.77255177, + "num_input_tokens_seen": 278293035, + "step": 12898, + "time_per_iteration": 2.627478837966919 + }, + { + "auxiliary_loss_clip": 0.01052522, + "auxiliary_loss_mlp": 0.00747811, + "balance_loss_clip": 1.02451384, + "balance_loss_mlp": 1.00040436, + "epoch": 0.7755298361641365, + "flos": 17347619635200.0, + "grad_norm": 1.9820145221388443, + "language_loss": 0.70061767, + "learning_rate": 5.055639490399588e-07, + "loss": 0.71862102, + "num_input_tokens_seen": 278311010, + "step": 12899, + "time_per_iteration": 2.6836981773376465 + }, + { + "auxiliary_loss_clip": 0.01024364, + "auxiliary_loss_mlp": 0.01031707, + "balance_loss_clip": 1.02141297, + "balance_loss_mlp": 1.02046597, + "epoch": 0.7755899594168044, + "flos": 19645866512640.0, + "grad_norm": 2.713663812950573, + "language_loss": 0.74844533, + "learning_rate": 5.053051493286453e-07, + "loss": 0.76900601, + "num_input_tokens_seen": 278329900, + "step": 12900, + "time_per_iteration": 2.704848051071167 + }, + { + "auxiliary_loss_clip": 0.01050831, + "auxiliary_loss_mlp": 0.01034393, + "balance_loss_clip": 1.02497721, + "balance_loss_mlp": 1.02464759, + "epoch": 0.7756500826694724, + "flos": 27414457217280.0, + "grad_norm": 1.5194792275698163, + "language_loss": 0.77242267, + "learning_rate": 5.050464062963113e-07, + "loss": 0.79327488, + "num_input_tokens_seen": 278349980, + "step": 12901, + "time_per_iteration": 2.649777889251709 + }, + { + "auxiliary_loss_clip": 0.01055834, + "auxiliary_loss_mlp": 0.01028956, + "balance_loss_clip": 1.02825046, + "balance_loss_mlp": 1.01786995, + "epoch": 0.7757102059221404, + "flos": 28730511624960.0, + "grad_norm": 6.095131108172069, + "language_loss": 0.77429652, + "learning_rate": 5.047877199527666e-07, + "loss": 0.79514444, + "num_input_tokens_seen": 278372485, + "step": 12902, + "time_per_iteration": 2.673323631286621 + }, + { + "auxiliary_loss_clip": 0.01052553, + "auxiliary_loss_mlp": 0.01026515, + "balance_loss_clip": 1.02455902, + "balance_loss_mlp": 1.01622081, + "epoch": 0.7757703291748084, + "flos": 22486795044480.0, + "grad_norm": 2.0863335014638498, + "language_loss": 0.72967583, + "learning_rate": 5.045290903078215e-07, + "loss": 0.75046647, + "num_input_tokens_seen": 278391660, + "step": 12903, + "time_per_iteration": 2.6246254444122314 + }, + { + "auxiliary_loss_clip": 0.01042293, + "auxiliary_loss_mlp": 0.01026389, + "balance_loss_clip": 1.02586222, + "balance_loss_mlp": 1.01651239, + "epoch": 0.7758304524274763, + "flos": 21430159637760.0, + "grad_norm": 3.422909714679528, + "language_loss": 0.76273483, + "learning_rate": 5.042705173712835e-07, + "loss": 0.78342164, + "num_input_tokens_seen": 278409125, + "step": 12904, + "time_per_iteration": 2.670653820037842 + }, + { + "auxiliary_loss_clip": 0.01059207, + "auxiliary_loss_mlp": 0.01023914, + "balance_loss_clip": 1.02436113, + "balance_loss_mlp": 1.0141921, + "epoch": 0.7758905756801443, + "flos": 23659242877440.0, + "grad_norm": 2.082378131889838, + "language_loss": 0.67802125, + "learning_rate": 5.040120011529576e-07, + "loss": 0.69885242, + "num_input_tokens_seen": 278429450, + "step": 12905, + "time_per_iteration": 4.316468000411987 + }, + { + "auxiliary_loss_clip": 0.01052441, + "auxiliary_loss_mlp": 0.00747532, + "balance_loss_clip": 1.02664518, + "balance_loss_mlp": 1.00031686, + "epoch": 0.7759506989328122, + "flos": 28365479660160.0, + "grad_norm": 1.8408351623445465, + "language_loss": 0.67373574, + "learning_rate": 5.037535416626459e-07, + "loss": 0.69173551, + "num_input_tokens_seen": 278449925, + "step": 12906, + "time_per_iteration": 2.705009937286377 + }, + { + "auxiliary_loss_clip": 0.0101973, + "auxiliary_loss_mlp": 0.01028009, + "balance_loss_clip": 1.02062261, + "balance_loss_mlp": 1.01711321, + "epoch": 0.7760108221854802, + "flos": 14902785354240.0, + "grad_norm": 2.127616802933437, + "language_loss": 0.81388217, + "learning_rate": 5.034951389101498e-07, + "loss": 0.83435959, + "num_input_tokens_seen": 278467255, + "step": 12907, + "time_per_iteration": 2.672899007797241 + }, + { + "auxiliary_loss_clip": 0.01047582, + "auxiliary_loss_mlp": 0.01032602, + "balance_loss_clip": 1.02442622, + "balance_loss_mlp": 1.02206993, + "epoch": 0.7760709454381483, + "flos": 14792503622400.0, + "grad_norm": 2.3819999049311544, + "language_loss": 0.67411911, + "learning_rate": 5.032367929052685e-07, + "loss": 0.69492102, + "num_input_tokens_seen": 278484250, + "step": 12908, + "time_per_iteration": 2.650308132171631 + }, + { + "auxiliary_loss_clip": 0.01033548, + "auxiliary_loss_mlp": 0.01034194, + "balance_loss_clip": 1.02561831, + "balance_loss_mlp": 1.02350712, + "epoch": 0.7761310686908162, + "flos": 17379831156480.0, + "grad_norm": 1.6623341258351017, + "language_loss": 0.70392644, + "learning_rate": 5.029785036577976e-07, + "loss": 0.72460389, + "num_input_tokens_seen": 278502740, + "step": 12909, + "time_per_iteration": 2.6318717002868652 + }, + { + "auxiliary_loss_clip": 0.01051347, + "auxiliary_loss_mlp": 0.01028056, + "balance_loss_clip": 1.02534854, + "balance_loss_mlp": 1.01840019, + "epoch": 0.7761911919434842, + "flos": 25556547168000.0, + "grad_norm": 1.5014516818547619, + "language_loss": 0.68061978, + "learning_rate": 5.027202711775324e-07, + "loss": 0.70141381, + "num_input_tokens_seen": 278523890, + "step": 12910, + "time_per_iteration": 2.663477897644043 + }, + { + "auxiliary_loss_clip": 0.01019239, + "auxiliary_loss_mlp": 0.01029789, + "balance_loss_clip": 1.0245657, + "balance_loss_mlp": 1.01979327, + "epoch": 0.7762513151961521, + "flos": 23179763203200.0, + "grad_norm": 1.7253244942864294, + "language_loss": 0.71632051, + "learning_rate": 5.024620954742646e-07, + "loss": 0.7368108, + "num_input_tokens_seen": 278543185, + "step": 12911, + "time_per_iteration": 2.712787628173828 + }, + { + "auxiliary_loss_clip": 0.01066412, + "auxiliary_loss_mlp": 0.00747571, + "balance_loss_clip": 1.02708697, + "balance_loss_mlp": 1.0003463, + "epoch": 0.7763114384488201, + "flos": 21689614552320.0, + "grad_norm": 2.649546323233148, + "language_loss": 0.63449132, + "learning_rate": 5.022039765577836e-07, + "loss": 0.6526311, + "num_input_tokens_seen": 278559220, + "step": 12912, + "time_per_iteration": 2.5695273876190186 + }, + { + "auxiliary_loss_clip": 0.00974272, + "auxiliary_loss_mlp": 0.01000505, + "balance_loss_clip": 1.00128984, + "balance_loss_mlp": 0.99958736, + "epoch": 0.776371561701488, + "flos": 69025554316800.0, + "grad_norm": 0.7732092468327151, + "language_loss": 0.53292811, + "learning_rate": 5.019459144378779e-07, + "loss": 0.55267584, + "num_input_tokens_seen": 278618185, + "step": 12913, + "time_per_iteration": 3.3492674827575684 + }, + { + "auxiliary_loss_clip": 0.01044792, + "auxiliary_loss_mlp": 0.01030725, + "balance_loss_clip": 1.02642608, + "balance_loss_mlp": 1.01977539, + "epoch": 0.776431684954156, + "flos": 22893914770560.0, + "grad_norm": 1.6702602726703089, + "language_loss": 0.61914909, + "learning_rate": 5.016879091243338e-07, + "loss": 0.63990426, + "num_input_tokens_seen": 278636210, + "step": 12914, + "time_per_iteration": 2.6914796829223633 + }, + { + "auxiliary_loss_clip": 0.01041742, + "auxiliary_loss_mlp": 0.01029731, + "balance_loss_clip": 1.02427006, + "balance_loss_mlp": 1.01897824, + "epoch": 0.776491808206824, + "flos": 20261554560000.0, + "grad_norm": 1.7751613624991944, + "language_loss": 0.82126445, + "learning_rate": 5.014299606269339e-07, + "loss": 0.84197921, + "num_input_tokens_seen": 278653305, + "step": 12915, + "time_per_iteration": 2.6399788856506348 + }, + { + "auxiliary_loss_clip": 0.01057265, + "auxiliary_loss_mlp": 0.01031971, + "balance_loss_clip": 1.02690208, + "balance_loss_mlp": 1.02047348, + "epoch": 0.776551931459492, + "flos": 26759051706240.0, + "grad_norm": 1.7551193026455163, + "language_loss": 0.74652332, + "learning_rate": 5.011720689554603e-07, + "loss": 0.76741564, + "num_input_tokens_seen": 278671850, + "step": 12916, + "time_per_iteration": 2.6711599826812744 + }, + { + "auxiliary_loss_clip": 0.01007183, + "auxiliary_loss_mlp": 0.01031295, + "balance_loss_clip": 1.02261019, + "balance_loss_mlp": 1.01947522, + "epoch": 0.7766120547121599, + "flos": 52665080250240.0, + "grad_norm": 1.9424398509059786, + "language_loss": 0.65681732, + "learning_rate": 5.009142341196919e-07, + "loss": 0.67720211, + "num_input_tokens_seen": 278697860, + "step": 12917, + "time_per_iteration": 3.069526433944702 + }, + { + "auxiliary_loss_clip": 0.01052137, + "auxiliary_loss_mlp": 0.01032869, + "balance_loss_clip": 1.0236907, + "balance_loss_mlp": 1.02242613, + "epoch": 0.7766721779648279, + "flos": 25156215112320.0, + "grad_norm": 1.6261012421337862, + "language_loss": 0.64759064, + "learning_rate": 5.006564561294065e-07, + "loss": 0.66844064, + "num_input_tokens_seen": 278720655, + "step": 12918, + "time_per_iteration": 2.7856686115264893 + }, + { + "auxiliary_loss_clip": 0.0106112, + "auxiliary_loss_mlp": 0.01030963, + "balance_loss_clip": 1.02447033, + "balance_loss_mlp": 1.02152109, + "epoch": 0.7767323012174958, + "flos": 23760761690880.0, + "grad_norm": 2.302813195088226, + "language_loss": 0.73803872, + "learning_rate": 5.003987349943777e-07, + "loss": 0.75895953, + "num_input_tokens_seen": 278737375, + "step": 12919, + "time_per_iteration": 2.621443271636963 + }, + { + "auxiliary_loss_clip": 0.01022377, + "auxiliary_loss_mlp": 0.01030136, + "balance_loss_clip": 1.02433562, + "balance_loss_mlp": 1.01922882, + "epoch": 0.7767924244701638, + "flos": 22086642556800.0, + "grad_norm": 1.9438577181856393, + "language_loss": 0.78933036, + "learning_rate": 5.001410707243792e-07, + "loss": 0.80985546, + "num_input_tokens_seen": 278756510, + "step": 12920, + "time_per_iteration": 2.820883274078369 + }, + { + "auxiliary_loss_clip": 0.01053018, + "auxiliary_loss_mlp": 0.01027168, + "balance_loss_clip": 1.02576292, + "balance_loss_mlp": 1.01668358, + "epoch": 0.7768525477228319, + "flos": 21981640124160.0, + "grad_norm": 1.7498774577367653, + "language_loss": 0.70958757, + "learning_rate": 4.998834633291829e-07, + "loss": 0.73038948, + "num_input_tokens_seen": 278775410, + "step": 12921, + "time_per_iteration": 2.6478993892669678 + }, + { + "auxiliary_loss_clip": 0.01057264, + "auxiliary_loss_mlp": 0.01031655, + "balance_loss_clip": 1.02645028, + "balance_loss_mlp": 1.02013958, + "epoch": 0.7769126709754998, + "flos": 21794581071360.0, + "grad_norm": 1.5886825001893663, + "language_loss": 0.75633776, + "learning_rate": 4.996259128185547e-07, + "loss": 0.77722698, + "num_input_tokens_seen": 278794260, + "step": 12922, + "time_per_iteration": 2.6038668155670166 + }, + { + "auxiliary_loss_clip": 0.01017483, + "auxiliary_loss_mlp": 0.01031578, + "balance_loss_clip": 1.02555656, + "balance_loss_mlp": 1.02033114, + "epoch": 0.7769727942281678, + "flos": 20047994248320.0, + "grad_norm": 1.7825539452161498, + "language_loss": 0.80369449, + "learning_rate": 4.993684192022625e-07, + "loss": 0.82418507, + "num_input_tokens_seen": 278813290, + "step": 12923, + "time_per_iteration": 2.7486140727996826 + }, + { + "auxiliary_loss_clip": 0.01034089, + "auxiliary_loss_mlp": 0.01032897, + "balance_loss_clip": 1.0270648, + "balance_loss_mlp": 1.02278781, + "epoch": 0.7770329174808357, + "flos": 21686777377920.0, + "grad_norm": 2.117127742240787, + "language_loss": 0.92375112, + "learning_rate": 4.991109824900699e-07, + "loss": 0.94442105, + "num_input_tokens_seen": 278830610, + "step": 12924, + "time_per_iteration": 4.319290637969971 + }, + { + "auxiliary_loss_clip": 0.01052158, + "auxiliary_loss_mlp": 0.01027791, + "balance_loss_clip": 1.02409315, + "balance_loss_mlp": 1.01721704, + "epoch": 0.7770930407335037, + "flos": 25849255098240.0, + "grad_norm": 1.8817688074802268, + "language_loss": 0.66174585, + "learning_rate": 4.988536026917401e-07, + "loss": 0.68254536, + "num_input_tokens_seen": 278849530, + "step": 12925, + "time_per_iteration": 4.291771650314331 + }, + { + "auxiliary_loss_clip": 0.01033104, + "auxiliary_loss_mlp": 0.01030773, + "balance_loss_clip": 1.02519405, + "balance_loss_mlp": 1.02012146, + "epoch": 0.7771531639861716, + "flos": 24347865490560.0, + "grad_norm": 2.0769077136707654, + "language_loss": 0.71838653, + "learning_rate": 4.985962798170314e-07, + "loss": 0.73902524, + "num_input_tokens_seen": 278869005, + "step": 12926, + "time_per_iteration": 2.7262039184570312 + }, + { + "auxiliary_loss_clip": 0.0105516, + "auxiliary_loss_mlp": 0.01025355, + "balance_loss_clip": 1.02592015, + "balance_loss_mlp": 1.01400065, + "epoch": 0.7772132872388396, + "flos": 25629948610560.0, + "grad_norm": 1.5856038826255001, + "language_loss": 0.65413165, + "learning_rate": 4.983390138757027e-07, + "loss": 0.67493689, + "num_input_tokens_seen": 278888790, + "step": 12927, + "time_per_iteration": 2.6235458850860596 + }, + { + "auxiliary_loss_clip": 0.01042647, + "auxiliary_loss_mlp": 0.01035, + "balance_loss_clip": 1.02470493, + "balance_loss_mlp": 1.02391386, + "epoch": 0.7772734104915076, + "flos": 26067412350720.0, + "grad_norm": 2.029087475230105, + "language_loss": 0.72498721, + "learning_rate": 4.980818048775093e-07, + "loss": 0.74576366, + "num_input_tokens_seen": 278908150, + "step": 12928, + "time_per_iteration": 2.678574562072754 + }, + { + "auxiliary_loss_clip": 0.01022148, + "auxiliary_loss_mlp": 0.01025229, + "balance_loss_clip": 1.02547657, + "balance_loss_mlp": 1.01535225, + "epoch": 0.7773335337441756, + "flos": 22925048883840.0, + "grad_norm": 1.6543485795615018, + "language_loss": 0.74202669, + "learning_rate": 4.978246528322036e-07, + "loss": 0.76250046, + "num_input_tokens_seen": 278927425, + "step": 12929, + "time_per_iteration": 2.7426390647888184 + }, + { + "auxiliary_loss_clip": 0.01032855, + "auxiliary_loss_mlp": 0.01028996, + "balance_loss_clip": 1.02487612, + "balance_loss_mlp": 1.01868391, + "epoch": 0.7773936569968435, + "flos": 20776765288320.0, + "grad_norm": 1.8463416598677447, + "language_loss": 0.77577102, + "learning_rate": 4.975675577495377e-07, + "loss": 0.79638946, + "num_input_tokens_seen": 278946475, + "step": 12930, + "time_per_iteration": 2.7568016052246094 + }, + { + "auxiliary_loss_clip": 0.01063449, + "auxiliary_loss_mlp": 0.01030633, + "balance_loss_clip": 1.02657437, + "balance_loss_mlp": 1.020226, + "epoch": 0.7774537802495115, + "flos": 20372267255040.0, + "grad_norm": 1.7289889233979194, + "language_loss": 0.79615808, + "learning_rate": 4.973105196392613e-07, + "loss": 0.81709886, + "num_input_tokens_seen": 278964345, + "step": 12931, + "time_per_iteration": 2.6105074882507324 + }, + { + "auxiliary_loss_clip": 0.0100015, + "auxiliary_loss_mlp": 0.0100713, + "balance_loss_clip": 1.02101827, + "balance_loss_mlp": 1.00509107, + "epoch": 0.7775139035021794, + "flos": 53912081738880.0, + "grad_norm": 0.8879095974911506, + "language_loss": 0.59838521, + "learning_rate": 4.970535385111199e-07, + "loss": 0.61845797, + "num_input_tokens_seen": 279022380, + "step": 12932, + "time_per_iteration": 3.2419967651367188 + }, + { + "auxiliary_loss_clip": 0.0105318, + "auxiliary_loss_mlp": 0.01032043, + "balance_loss_clip": 1.02580237, + "balance_loss_mlp": 1.02159452, + "epoch": 0.7775740267548474, + "flos": 28842481296000.0, + "grad_norm": 3.4635809813545615, + "language_loss": 0.76132989, + "learning_rate": 4.967966143748595e-07, + "loss": 0.7821821, + "num_input_tokens_seen": 279044275, + "step": 12933, + "time_per_iteration": 2.6542258262634277 + }, + { + "auxiliary_loss_clip": 0.01036113, + "auxiliary_loss_mlp": 0.01029089, + "balance_loss_clip": 1.0248847, + "balance_loss_mlp": 1.01785314, + "epoch": 0.7776341500075155, + "flos": 21872471713920.0, + "grad_norm": 2.260086970960813, + "language_loss": 0.73220146, + "learning_rate": 4.965397472402215e-07, + "loss": 0.75285351, + "num_input_tokens_seen": 279063375, + "step": 12934, + "time_per_iteration": 2.6762866973876953 + }, + { + "auxiliary_loss_clip": 0.01019783, + "auxiliary_loss_mlp": 0.01024273, + "balance_loss_clip": 1.02393055, + "balance_loss_mlp": 1.01335335, + "epoch": 0.7776942732601834, + "flos": 20229845829120.0, + "grad_norm": 1.7283091631468637, + "language_loss": 0.70201576, + "learning_rate": 4.962829371169475e-07, + "loss": 0.72245622, + "num_input_tokens_seen": 279082680, + "step": 12935, + "time_per_iteration": 2.8783514499664307 + }, + { + "auxiliary_loss_clip": 0.01044287, + "auxiliary_loss_mlp": 0.00747708, + "balance_loss_clip": 1.02665305, + "balance_loss_mlp": 1.00039768, + "epoch": 0.7777543965128514, + "flos": 22231829329920.0, + "grad_norm": 1.521480904666277, + "language_loss": 0.83746624, + "learning_rate": 4.960261840147746e-07, + "loss": 0.8553862, + "num_input_tokens_seen": 279099805, + "step": 12936, + "time_per_iteration": 2.669524908065796 + }, + { + "auxiliary_loss_clip": 0.01055953, + "auxiliary_loss_mlp": 0.01029048, + "balance_loss_clip": 1.0259136, + "balance_loss_mlp": 1.01880205, + "epoch": 0.7778145197655193, + "flos": 14501950508160.0, + "grad_norm": 2.0549209320479096, + "language_loss": 0.67809367, + "learning_rate": 4.957694879434397e-07, + "loss": 0.69894367, + "num_input_tokens_seen": 279117975, + "step": 12937, + "time_per_iteration": 2.6031954288482666 + }, + { + "auxiliary_loss_clip": 0.01064284, + "auxiliary_loss_mlp": 0.01025644, + "balance_loss_clip": 1.02558255, + "balance_loss_mlp": 1.01483798, + "epoch": 0.7778746430181873, + "flos": 21140288881920.0, + "grad_norm": 3.7298657955928722, + "language_loss": 0.87425673, + "learning_rate": 4.955128489126777e-07, + "loss": 0.89515597, + "num_input_tokens_seen": 279137255, + "step": 12938, + "time_per_iteration": 2.721587657928467 + }, + { + "auxiliary_loss_clip": 0.01053541, + "auxiliary_loss_mlp": 0.01028213, + "balance_loss_clip": 1.02518487, + "balance_loss_mlp": 1.01765704, + "epoch": 0.7779347662708552, + "flos": 20266366982400.0, + "grad_norm": 1.8955898126703832, + "language_loss": 0.85064256, + "learning_rate": 4.95256266932218e-07, + "loss": 0.87146008, + "num_input_tokens_seen": 279154500, + "step": 12939, + "time_per_iteration": 2.642117500305176 + }, + { + "auxiliary_loss_clip": 0.01060507, + "auxiliary_loss_mlp": 0.00747522, + "balance_loss_clip": 1.02456367, + "balance_loss_mlp": 1.00032413, + "epoch": 0.7779948895235232, + "flos": 19209013303680.0, + "grad_norm": 1.6919337305427948, + "language_loss": 0.69154704, + "learning_rate": 4.949997420117915e-07, + "loss": 0.70962739, + "num_input_tokens_seen": 279173635, + "step": 12940, + "time_per_iteration": 2.5358731746673584 + }, + { + "auxiliary_loss_clip": 0.01033909, + "auxiliary_loss_mlp": 0.01023735, + "balance_loss_clip": 1.02489352, + "balance_loss_mlp": 1.01387024, + "epoch": 0.7780550127761912, + "flos": 23914711382400.0, + "grad_norm": 1.581154337523132, + "language_loss": 0.77873653, + "learning_rate": 4.947432741611255e-07, + "loss": 0.79931295, + "num_input_tokens_seen": 279194430, + "step": 12941, + "time_per_iteration": 4.474688291549683 + }, + { + "auxiliary_loss_clip": 0.01054643, + "auxiliary_loss_mlp": 0.01032802, + "balance_loss_clip": 1.02402937, + "balance_loss_mlp": 1.02042198, + "epoch": 0.7781151360288592, + "flos": 32415951795840.0, + "grad_norm": 2.1623399967823818, + "language_loss": 0.73184806, + "learning_rate": 4.944868633899462e-07, + "loss": 0.7527225, + "num_input_tokens_seen": 279212920, + "step": 12942, + "time_per_iteration": 2.710972547531128 + }, + { + "auxiliary_loss_clip": 0.01016854, + "auxiliary_loss_mlp": 0.01033188, + "balance_loss_clip": 1.02451992, + "balance_loss_mlp": 1.02143979, + "epoch": 0.7781752592815271, + "flos": 22346384780160.0, + "grad_norm": 1.932136903555007, + "language_loss": 0.68040907, + "learning_rate": 4.942305097079751e-07, + "loss": 0.7009095, + "num_input_tokens_seen": 279232310, + "step": 12943, + "time_per_iteration": 2.823760509490967 + }, + { + "auxiliary_loss_clip": 0.00974878, + "auxiliary_loss_mlp": 0.01004783, + "balance_loss_clip": 1.00058126, + "balance_loss_mlp": 1.00375795, + "epoch": 0.7782353825341951, + "flos": 70460183520000.0, + "grad_norm": 0.7838466114600476, + "language_loss": 0.58601904, + "learning_rate": 4.939742131249347e-07, + "loss": 0.60581565, + "num_input_tokens_seen": 279295375, + "step": 12944, + "time_per_iteration": 3.3803977966308594 + }, + { + "auxiliary_loss_clip": 0.010643, + "auxiliary_loss_mlp": 0.01032896, + "balance_loss_clip": 1.02533722, + "balance_loss_mlp": 1.02064157, + "epoch": 0.778295505786863, + "flos": 19062569554560.0, + "grad_norm": 2.3883779107631087, + "language_loss": 0.67413503, + "learning_rate": 4.937179736505428e-07, + "loss": 0.69510698, + "num_input_tokens_seen": 279313660, + "step": 12945, + "time_per_iteration": 2.5590853691101074 + }, + { + "auxiliary_loss_clip": 0.01047492, + "auxiliary_loss_mlp": 0.01031508, + "balance_loss_clip": 1.02423334, + "balance_loss_mlp": 1.02066612, + "epoch": 0.778355629039531, + "flos": 20999734963200.0, + "grad_norm": 1.8358250984958635, + "language_loss": 0.69100893, + "learning_rate": 4.93461791294516e-07, + "loss": 0.71179891, + "num_input_tokens_seen": 279334495, + "step": 12946, + "time_per_iteration": 2.620485782623291 + }, + { + "auxiliary_loss_clip": 0.01063158, + "auxiliary_loss_mlp": 0.0102952, + "balance_loss_clip": 1.02530408, + "balance_loss_mlp": 1.01810026, + "epoch": 0.7784157522921991, + "flos": 21398091770880.0, + "grad_norm": 1.911617129320362, + "language_loss": 0.65544015, + "learning_rate": 4.932056660665689e-07, + "loss": 0.67636693, + "num_input_tokens_seen": 279352985, + "step": 12947, + "time_per_iteration": 2.5447964668273926 + }, + { + "auxiliary_loss_clip": 0.00999179, + "auxiliary_loss_mlp": 0.01033348, + "balance_loss_clip": 1.02145505, + "balance_loss_mlp": 1.02081919, + "epoch": 0.778475875544867, + "flos": 20813861059200.0, + "grad_norm": 1.9170991920331162, + "language_loss": 0.65515542, + "learning_rate": 4.929495979764147e-07, + "loss": 0.67548072, + "num_input_tokens_seen": 279371360, + "step": 12948, + "time_per_iteration": 2.733706474304199 + }, + { + "auxiliary_loss_clip": 0.01063871, + "auxiliary_loss_mlp": 0.01031276, + "balance_loss_clip": 1.02609015, + "balance_loss_mlp": 1.01992774, + "epoch": 0.778535998797535, + "flos": 14355363104640.0, + "grad_norm": 1.6418751794256334, + "language_loss": 0.75011444, + "learning_rate": 4.926935870337625e-07, + "loss": 0.77106595, + "num_input_tokens_seen": 279389400, + "step": 12949, + "time_per_iteration": 2.640744924545288 + }, + { + "auxiliary_loss_clip": 0.01066494, + "auxiliary_loss_mlp": 0.0103128, + "balance_loss_clip": 1.02672005, + "balance_loss_mlp": 1.0202893, + "epoch": 0.7785961220502029, + "flos": 19209552007680.0, + "grad_norm": 1.6695203247483976, + "language_loss": 0.68982285, + "learning_rate": 4.924376332483202e-07, + "loss": 0.71080065, + "num_input_tokens_seen": 279409715, + "step": 12950, + "time_per_iteration": 2.5720605850219727 + }, + { + "auxiliary_loss_clip": 0.01054817, + "auxiliary_loss_mlp": 0.01027029, + "balance_loss_clip": 1.02667248, + "balance_loss_mlp": 1.01627588, + "epoch": 0.7786562453028709, + "flos": 25738757884800.0, + "grad_norm": 2.017903524307049, + "language_loss": 0.72249007, + "learning_rate": 4.921817366297938e-07, + "loss": 0.74330854, + "num_input_tokens_seen": 279427705, + "step": 12951, + "time_per_iteration": 2.655574321746826 + }, + { + "auxiliary_loss_clip": 0.01037027, + "auxiliary_loss_mlp": 0.01030036, + "balance_loss_clip": 1.02251458, + "balance_loss_mlp": 1.01924133, + "epoch": 0.7787163685555388, + "flos": 25739440243200.0, + "grad_norm": 2.169325576386194, + "language_loss": 0.65672779, + "learning_rate": 4.919258971878877e-07, + "loss": 0.67739844, + "num_input_tokens_seen": 279448215, + "step": 12952, + "time_per_iteration": 4.356974124908447 + }, + { + "auxiliary_loss_clip": 0.01035946, + "auxiliary_loss_mlp": 0.01025037, + "balance_loss_clip": 1.0226028, + "balance_loss_mlp": 1.01524436, + "epoch": 0.7787764918082068, + "flos": 22747722416640.0, + "grad_norm": 6.78360312381048, + "language_loss": 0.81424427, + "learning_rate": 4.916701149323022e-07, + "loss": 0.83485413, + "num_input_tokens_seen": 279466260, + "step": 12953, + "time_per_iteration": 2.6375794410705566 + }, + { + "auxiliary_loss_clip": 0.01067261, + "auxiliary_loss_mlp": 0.01031223, + "balance_loss_clip": 1.02753377, + "balance_loss_mlp": 1.02009487, + "epoch": 0.7788366150608748, + "flos": 15190860430080.0, + "grad_norm": 2.085310779491499, + "language_loss": 0.77062869, + "learning_rate": 4.91414389872737e-07, + "loss": 0.79161346, + "num_input_tokens_seen": 279484520, + "step": 12954, + "time_per_iteration": 2.5299532413482666 + }, + { + "auxiliary_loss_clip": 0.01047992, + "auxiliary_loss_mlp": 0.01027185, + "balance_loss_clip": 1.02354622, + "balance_loss_mlp": 1.01693332, + "epoch": 0.7788967383135428, + "flos": 21210242618880.0, + "grad_norm": 1.5903251642542453, + "language_loss": 0.72833002, + "learning_rate": 4.911587220188905e-07, + "loss": 0.74908179, + "num_input_tokens_seen": 279503130, + "step": 12955, + "time_per_iteration": 2.6488687992095947 + }, + { + "auxiliary_loss_clip": 0.01035712, + "auxiliary_loss_mlp": 0.01031062, + "balance_loss_clip": 1.02411008, + "balance_loss_mlp": 1.0201304, + "epoch": 0.7789568615662107, + "flos": 21682970536320.0, + "grad_norm": 1.405030681957046, + "language_loss": 0.68866986, + "learning_rate": 4.909031113804551e-07, + "loss": 0.70933759, + "num_input_tokens_seen": 279521930, + "step": 12956, + "time_per_iteration": 2.676924705505371 + }, + { + "auxiliary_loss_clip": 0.01026524, + "auxiliary_loss_mlp": 0.01028079, + "balance_loss_clip": 1.02311385, + "balance_loss_mlp": 1.01778531, + "epoch": 0.7790169848188787, + "flos": 26360371676160.0, + "grad_norm": 1.4600939408432732, + "language_loss": 0.75653338, + "learning_rate": 4.906475579671252e-07, + "loss": 0.77707946, + "num_input_tokens_seen": 279542375, + "step": 12957, + "time_per_iteration": 2.7229392528533936 + }, + { + "auxiliary_loss_clip": 0.01006324, + "auxiliary_loss_mlp": 0.01024585, + "balance_loss_clip": 1.02704489, + "balance_loss_mlp": 1.01375461, + "epoch": 0.7790771080715466, + "flos": 25516183259520.0, + "grad_norm": 1.765140904576264, + "language_loss": 0.7754252, + "learning_rate": 4.903920617885917e-07, + "loss": 0.79573429, + "num_input_tokens_seen": 279561885, + "step": 12958, + "time_per_iteration": 3.01330304145813 + }, + { + "auxiliary_loss_clip": 0.01048312, + "auxiliary_loss_mlp": 0.01041518, + "balance_loss_clip": 1.02421963, + "balance_loss_mlp": 1.02892995, + "epoch": 0.7791372313242146, + "flos": 16034186920320.0, + "grad_norm": 1.952083299063681, + "language_loss": 0.71541655, + "learning_rate": 4.901366228545418e-07, + "loss": 0.73631477, + "num_input_tokens_seen": 279579965, + "step": 12959, + "time_per_iteration": 2.928388833999634 + }, + { + "auxiliary_loss_clip": 0.01043976, + "auxiliary_loss_mlp": 0.00747592, + "balance_loss_clip": 1.02381265, + "balance_loss_mlp": 1.00031066, + "epoch": 0.7791973545768827, + "flos": 23842207779840.0, + "grad_norm": 1.7552127593774836, + "language_loss": 0.77956605, + "learning_rate": 4.898812411746632e-07, + "loss": 0.79748178, + "num_input_tokens_seen": 279599030, + "step": 12960, + "time_per_iteration": 2.696836471557617 + }, + { + "auxiliary_loss_clip": 0.010567, + "auxiliary_loss_mlp": 0.01033593, + "balance_loss_clip": 1.02752495, + "balance_loss_mlp": 1.0225358, + "epoch": 0.7792574778295506, + "flos": 24168384207360.0, + "grad_norm": 3.8177879036010225, + "language_loss": 0.75612915, + "learning_rate": 4.896259167586385e-07, + "loss": 0.77703208, + "num_input_tokens_seen": 279614400, + "step": 12961, + "time_per_iteration": 2.617814779281616 + }, + { + "auxiliary_loss_clip": 0.01041125, + "auxiliary_loss_mlp": 0.01032973, + "balance_loss_clip": 1.02578855, + "balance_loss_mlp": 1.0233767, + "epoch": 0.7793176010822186, + "flos": 21464921024640.0, + "grad_norm": 1.5611981326654791, + "language_loss": 0.73798847, + "learning_rate": 4.893706496161511e-07, + "loss": 0.75872946, + "num_input_tokens_seen": 279633745, + "step": 12962, + "time_per_iteration": 2.6943647861480713 + }, + { + "auxiliary_loss_clip": 0.01054091, + "auxiliary_loss_mlp": 0.0102628, + "balance_loss_clip": 1.0264647, + "balance_loss_mlp": 1.0154382, + "epoch": 0.7793777243348865, + "flos": 20666699038080.0, + "grad_norm": 1.6787916798286728, + "language_loss": 0.69733971, + "learning_rate": 4.891154397568795e-07, + "loss": 0.71814346, + "num_input_tokens_seen": 279651165, + "step": 12963, + "time_per_iteration": 2.688390016555786 + }, + { + "auxiliary_loss_clip": 0.01052518, + "auxiliary_loss_mlp": 0.0074755, + "balance_loss_clip": 1.02549839, + "balance_loss_mlp": 1.00032687, + "epoch": 0.7794378475875545, + "flos": 27125771610240.0, + "grad_norm": 2.8757605435358347, + "language_loss": 0.63614249, + "learning_rate": 4.888602871905019e-07, + "loss": 0.65414315, + "num_input_tokens_seen": 279671175, + "step": 12964, + "time_per_iteration": 2.701671600341797 + }, + { + "auxiliary_loss_clip": 0.01045224, + "auxiliary_loss_mlp": 0.01031726, + "balance_loss_clip": 1.02557611, + "balance_loss_mlp": 1.02130151, + "epoch": 0.7794979708402224, + "flos": 28074136446720.0, + "grad_norm": 1.7131263845159523, + "language_loss": 0.76950753, + "learning_rate": 4.88605191926694e-07, + "loss": 0.790277, + "num_input_tokens_seen": 279688675, + "step": 12965, + "time_per_iteration": 2.703479528427124 + }, + { + "auxiliary_loss_clip": 0.01040704, + "auxiliary_loss_mlp": 0.01030436, + "balance_loss_clip": 1.02116287, + "balance_loss_mlp": 1.01959991, + "epoch": 0.7795580940928905, + "flos": 26869548919680.0, + "grad_norm": 1.5380794266342337, + "language_loss": 0.72916389, + "learning_rate": 4.883501539751289e-07, + "loss": 0.74987531, + "num_input_tokens_seen": 279710245, + "step": 12966, + "time_per_iteration": 2.681630849838257 + }, + { + "auxiliary_loss_clip": 0.01042853, + "auxiliary_loss_mlp": 0.00747492, + "balance_loss_clip": 1.026829, + "balance_loss_mlp": 1.00037646, + "epoch": 0.7796182173455584, + "flos": 23835384195840.0, + "grad_norm": 1.4462686161233995, + "language_loss": 0.74119508, + "learning_rate": 4.880951733454768e-07, + "loss": 0.75909853, + "num_input_tokens_seen": 279729045, + "step": 12967, + "time_per_iteration": 2.660918712615967 + }, + { + "auxiliary_loss_clip": 0.01064493, + "auxiliary_loss_mlp": 0.01029381, + "balance_loss_clip": 1.02630782, + "balance_loss_mlp": 1.01849079, + "epoch": 0.7796783405982264, + "flos": 19792238434560.0, + "grad_norm": 4.784572027921555, + "language_loss": 0.72343504, + "learning_rate": 4.878402500474073e-07, + "loss": 0.7443738, + "num_input_tokens_seen": 279748350, + "step": 12968, + "time_per_iteration": 2.5591342449188232 + }, + { + "auxiliary_loss_clip": 0.01043723, + "auxiliary_loss_mlp": 0.01030806, + "balance_loss_clip": 1.02575028, + "balance_loss_mlp": 1.02030993, + "epoch": 0.7797384638508943, + "flos": 15450207603840.0, + "grad_norm": 2.818309950182782, + "language_loss": 0.60982758, + "learning_rate": 4.875853840905874e-07, + "loss": 0.63057286, + "num_input_tokens_seen": 279765620, + "step": 12969, + "time_per_iteration": 2.6995246410369873 + }, + { + "auxiliary_loss_clip": 0.01049627, + "auxiliary_loss_mlp": 0.01027968, + "balance_loss_clip": 1.02435625, + "balance_loss_mlp": 1.01843166, + "epoch": 0.7797985871035623, + "flos": 20922742160640.0, + "grad_norm": 1.675521462865303, + "language_loss": 0.70430565, + "learning_rate": 4.873305754846811e-07, + "loss": 0.72508162, + "num_input_tokens_seen": 279782485, + "step": 12970, + "time_per_iteration": 2.6163439750671387 + }, + { + "auxiliary_loss_clip": 0.0102488, + "auxiliary_loss_mlp": 0.0074758, + "balance_loss_clip": 1.02499735, + "balance_loss_mlp": 1.00031197, + "epoch": 0.7798587103562302, + "flos": 36937212514560.0, + "grad_norm": 1.5829576248403143, + "language_loss": 0.72006333, + "learning_rate": 4.870758242393507e-07, + "loss": 0.7377879, + "num_input_tokens_seen": 279804170, + "step": 12971, + "time_per_iteration": 4.369696378707886 + }, + { + "auxiliary_loss_clip": 0.01015079, + "auxiliary_loss_mlp": 0.01026032, + "balance_loss_clip": 1.02165473, + "balance_loss_mlp": 1.01476669, + "epoch": 0.7799188336088982, + "flos": 22419283432320.0, + "grad_norm": 2.2543265292157275, + "language_loss": 0.74166644, + "learning_rate": 4.868211303642578e-07, + "loss": 0.76207757, + "num_input_tokens_seen": 279823730, + "step": 12972, + "time_per_iteration": 4.280784606933594 + }, + { + "auxiliary_loss_clip": 0.01062195, + "auxiliary_loss_mlp": 0.01022989, + "balance_loss_clip": 1.02500105, + "balance_loss_mlp": 1.01187301, + "epoch": 0.7799789568615663, + "flos": 18880466578560.0, + "grad_norm": 1.8978292500303338, + "language_loss": 0.71085215, + "learning_rate": 4.865664938690584e-07, + "loss": 0.731704, + "num_input_tokens_seen": 279843035, + "step": 12973, + "time_per_iteration": 2.595417022705078 + }, + { + "auxiliary_loss_clip": 0.01050207, + "auxiliary_loss_mlp": 0.01026953, + "balance_loss_clip": 1.0249685, + "balance_loss_mlp": 1.01737428, + "epoch": 0.7800390801142342, + "flos": 20262272832000.0, + "grad_norm": 1.8204771951117962, + "language_loss": 0.77622426, + "learning_rate": 4.863119147634089e-07, + "loss": 0.79699588, + "num_input_tokens_seen": 279861450, + "step": 12974, + "time_per_iteration": 2.7459771633148193 + }, + { + "auxiliary_loss_clip": 0.01029784, + "auxiliary_loss_mlp": 0.01025903, + "balance_loss_clip": 1.0230161, + "balance_loss_mlp": 1.0155673, + "epoch": 0.7800992033669022, + "flos": 16690310703360.0, + "grad_norm": 1.8971772240825089, + "language_loss": 0.69194597, + "learning_rate": 4.86057393056964e-07, + "loss": 0.71250284, + "num_input_tokens_seen": 279878660, + "step": 12975, + "time_per_iteration": 2.776026487350464 + }, + { + "auxiliary_loss_clip": 0.01030019, + "auxiliary_loss_mlp": 0.01030555, + "balance_loss_clip": 1.02455282, + "balance_loss_mlp": 1.02052355, + "epoch": 0.7801593266195701, + "flos": 18585208782720.0, + "grad_norm": 2.0095115055991375, + "language_loss": 0.82254446, + "learning_rate": 4.858029287593739e-07, + "loss": 0.84315026, + "num_input_tokens_seen": 279895685, + "step": 12976, + "time_per_iteration": 2.667872190475464 + }, + { + "auxiliary_loss_clip": 0.01043944, + "auxiliary_loss_mlp": 0.00747702, + "balance_loss_clip": 1.02524686, + "balance_loss_mlp": 1.00040674, + "epoch": 0.7802194498722381, + "flos": 25484941405440.0, + "grad_norm": 1.3964155545789578, + "language_loss": 0.66231513, + "learning_rate": 4.85548521880289e-07, + "loss": 0.68023157, + "num_input_tokens_seen": 279917240, + "step": 12977, + "time_per_iteration": 2.8090295791625977 + }, + { + "auxiliary_loss_clip": 0.01040359, + "auxiliary_loss_mlp": 0.01023227, + "balance_loss_clip": 1.0248425, + "balance_loss_mlp": 1.01345778, + "epoch": 0.780279573124906, + "flos": 31176315573120.0, + "grad_norm": 1.4876988913740867, + "language_loss": 0.74623358, + "learning_rate": 4.852941724293554e-07, + "loss": 0.76686943, + "num_input_tokens_seen": 279938665, + "step": 12978, + "time_per_iteration": 2.712514638900757 + }, + { + "auxiliary_loss_clip": 0.01036786, + "auxiliary_loss_mlp": 0.01037833, + "balance_loss_clip": 1.02289391, + "balance_loss_mlp": 1.02542281, + "epoch": 0.780339696377574, + "flos": 26944027770240.0, + "grad_norm": 3.978884553859718, + "language_loss": 0.62407196, + "learning_rate": 4.85039880416219e-07, + "loss": 0.64481813, + "num_input_tokens_seen": 279957965, + "step": 12979, + "time_per_iteration": 2.6929285526275635 + }, + { + "auxiliary_loss_clip": 0.01063258, + "auxiliary_loss_mlp": 0.01029225, + "balance_loss_clip": 1.02617741, + "balance_loss_mlp": 1.01890099, + "epoch": 0.780399819630242, + "flos": 27957426180480.0, + "grad_norm": 1.9887463200953432, + "language_loss": 0.77001476, + "learning_rate": 4.847856458505217e-07, + "loss": 0.79093957, + "num_input_tokens_seen": 279977490, + "step": 12980, + "time_per_iteration": 2.6143581867218018 + }, + { + "auxiliary_loss_clip": 0.01062815, + "auxiliary_loss_mlp": 0.01029227, + "balance_loss_clip": 1.02476239, + "balance_loss_mlp": 1.01942801, + "epoch": 0.78045994288291, + "flos": 22486795044480.0, + "grad_norm": 2.762421148932285, + "language_loss": 0.78078496, + "learning_rate": 4.845314687419046e-07, + "loss": 0.80170536, + "num_input_tokens_seen": 279994220, + "step": 12981, + "time_per_iteration": 2.6567673683166504 + }, + { + "auxiliary_loss_clip": 0.0101821, + "auxiliary_loss_mlp": 0.01032108, + "balance_loss_clip": 1.02370763, + "balance_loss_mlp": 1.02120066, + "epoch": 0.7805200661355779, + "flos": 20850849089280.0, + "grad_norm": 1.659542917826794, + "language_loss": 0.72486359, + "learning_rate": 4.842773491000067e-07, + "loss": 0.74536675, + "num_input_tokens_seen": 280012590, + "step": 12982, + "time_per_iteration": 2.7845263481140137 + }, + { + "auxiliary_loss_clip": 0.01042506, + "auxiliary_loss_mlp": 0.01027289, + "balance_loss_clip": 1.02442443, + "balance_loss_mlp": 1.01726329, + "epoch": 0.7805801893882459, + "flos": 25665966973440.0, + "grad_norm": 1.3529946538945714, + "language_loss": 0.73211873, + "learning_rate": 4.840232869344636e-07, + "loss": 0.75281668, + "num_input_tokens_seen": 280033700, + "step": 12983, + "time_per_iteration": 2.74428653717041 + }, + { + "auxiliary_loss_clip": 0.01042619, + "auxiliary_loss_mlp": 0.01023601, + "balance_loss_clip": 1.02523446, + "balance_loss_mlp": 1.01365328, + "epoch": 0.7806403126409138, + "flos": 11327806483200.0, + "grad_norm": 1.8757943974013265, + "language_loss": 0.75008929, + "learning_rate": 4.837692822549086e-07, + "loss": 0.77075148, + "num_input_tokens_seen": 280052215, + "step": 12984, + "time_per_iteration": 2.7017767429351807 + }, + { + "auxiliary_loss_clip": 0.01028175, + "auxiliary_loss_mlp": 0.01030703, + "balance_loss_clip": 1.01950169, + "balance_loss_mlp": 1.02018905, + "epoch": 0.7807004358935818, + "flos": 19573362910080.0, + "grad_norm": 2.0575777707664598, + "language_loss": 0.81545973, + "learning_rate": 4.835153350709746e-07, + "loss": 0.83604854, + "num_input_tokens_seen": 280070525, + "step": 12985, + "time_per_iteration": 2.7139999866485596 + }, + { + "auxiliary_loss_clip": 0.01041612, + "auxiliary_loss_mlp": 0.01032635, + "balance_loss_clip": 1.0243237, + "balance_loss_mlp": 1.02203178, + "epoch": 0.7807605591462499, + "flos": 19135827342720.0, + "grad_norm": 1.4781336898365722, + "language_loss": 0.77326131, + "learning_rate": 4.832614453922915e-07, + "loss": 0.79400373, + "num_input_tokens_seen": 280089855, + "step": 12986, + "time_per_iteration": 2.828080892562866 + }, + { + "auxiliary_loss_clip": 0.0105088, + "auxiliary_loss_mlp": 0.01026537, + "balance_loss_clip": 1.02359343, + "balance_loss_mlp": 1.01610589, + "epoch": 0.7808206823989178, + "flos": 32374654133760.0, + "grad_norm": 1.6687646213788505, + "language_loss": 0.74033016, + "learning_rate": 4.830076132284859e-07, + "loss": 0.76110429, + "num_input_tokens_seen": 280109960, + "step": 12987, + "time_per_iteration": 2.657614231109619 + }, + { + "auxiliary_loss_clip": 0.00996289, + "auxiliary_loss_mlp": 0.01002757, + "balance_loss_clip": 1.00131011, + "balance_loss_mlp": 1.00180364, + "epoch": 0.7808808056515858, + "flos": 55050235061760.0, + "grad_norm": 0.7359817653764423, + "language_loss": 0.55123645, + "learning_rate": 4.82753838589184e-07, + "loss": 0.57122689, + "num_input_tokens_seen": 280169805, + "step": 12988, + "time_per_iteration": 4.929311990737915 + }, + { + "auxiliary_loss_clip": 0.01027982, + "auxiliary_loss_mlp": 0.01031969, + "balance_loss_clip": 1.02301109, + "balance_loss_mlp": 1.02222359, + "epoch": 0.7809409289042537, + "flos": 12859468277760.0, + "grad_norm": 2.7199302135264607, + "language_loss": 0.80457699, + "learning_rate": 4.82500121484009e-07, + "loss": 0.82517648, + "num_input_tokens_seen": 280184630, + "step": 12989, + "time_per_iteration": 2.606541633605957 + }, + { + "auxiliary_loss_clip": 0.01030725, + "auxiliary_loss_mlp": 0.01027292, + "balance_loss_clip": 1.02370477, + "balance_loss_mlp": 1.01695037, + "epoch": 0.7810010521569217, + "flos": 21687244254720.0, + "grad_norm": 1.5442744029667517, + "language_loss": 0.70257854, + "learning_rate": 4.822464619225806e-07, + "loss": 0.72315866, + "num_input_tokens_seen": 280203880, + "step": 12990, + "time_per_iteration": 2.6536450386047363 + }, + { + "auxiliary_loss_clip": 0.01038732, + "auxiliary_loss_mlp": 0.01026237, + "balance_loss_clip": 1.02361393, + "balance_loss_mlp": 1.01482832, + "epoch": 0.7810611754095896, + "flos": 16757068129920.0, + "grad_norm": 3.459977444378218, + "language_loss": 0.7748462, + "learning_rate": 4.819928599145184e-07, + "loss": 0.79549593, + "num_input_tokens_seen": 280220460, + "step": 12991, + "time_per_iteration": 2.582162618637085 + }, + { + "auxiliary_loss_clip": 0.01028471, + "auxiliary_loss_mlp": 0.01032435, + "balance_loss_clip": 1.02293229, + "balance_loss_mlp": 1.02178943, + "epoch": 0.7811212986622577, + "flos": 43507464658560.0, + "grad_norm": 1.3503825595855432, + "language_loss": 0.66172856, + "learning_rate": 4.817393154694398e-07, + "loss": 0.68233752, + "num_input_tokens_seen": 280242680, + "step": 12992, + "time_per_iteration": 2.8381545543670654 + }, + { + "auxiliary_loss_clip": 0.01063459, + "auxiliary_loss_mlp": 0.01028301, + "balance_loss_clip": 1.02593112, + "balance_loss_mlp": 1.01776338, + "epoch": 0.7811814219149256, + "flos": 21757700782080.0, + "grad_norm": 3.951449873578505, + "language_loss": 0.61817992, + "learning_rate": 4.814858285969578e-07, + "loss": 0.63909757, + "num_input_tokens_seen": 280260655, + "step": 12993, + "time_per_iteration": 2.5748629570007324 + }, + { + "auxiliary_loss_clip": 0.01037353, + "auxiliary_loss_mlp": 0.01028676, + "balance_loss_clip": 1.02301002, + "balance_loss_mlp": 1.01767933, + "epoch": 0.7812415451675936, + "flos": 24061514267520.0, + "grad_norm": 1.3869623241113151, + "language_loss": 0.68383408, + "learning_rate": 4.812323993066862e-07, + "loss": 0.70449436, + "num_input_tokens_seen": 280281185, + "step": 12994, + "time_per_iteration": 2.664597272872925 + }, + { + "auxiliary_loss_clip": 0.01060791, + "auxiliary_loss_mlp": 0.01025556, + "balance_loss_clip": 1.02505422, + "balance_loss_mlp": 1.01551247, + "epoch": 0.7813016684202615, + "flos": 18989706816000.0, + "grad_norm": 1.8679363711372559, + "language_loss": 0.69018668, + "learning_rate": 4.809790276082335e-07, + "loss": 0.71105015, + "num_input_tokens_seen": 280298255, + "step": 12995, + "time_per_iteration": 2.5775225162506104 + }, + { + "auxiliary_loss_clip": 0.01021574, + "auxiliary_loss_mlp": 0.01026621, + "balance_loss_clip": 1.02359772, + "balance_loss_mlp": 1.01710868, + "epoch": 0.7813617916729295, + "flos": 25260786581760.0, + "grad_norm": 1.7320574585414537, + "language_loss": 0.74943888, + "learning_rate": 4.807257135112088e-07, + "loss": 0.76992083, + "num_input_tokens_seen": 280319000, + "step": 12996, + "time_per_iteration": 2.7795469760894775 + }, + { + "auxiliary_loss_clip": 0.01066477, + "auxiliary_loss_mlp": 0.01028514, + "balance_loss_clip": 1.02642345, + "balance_loss_mlp": 1.01736832, + "epoch": 0.7814219149255974, + "flos": 17966037116160.0, + "grad_norm": 2.2770088474670325, + "language_loss": 0.68271708, + "learning_rate": 4.804724570252167e-07, + "loss": 0.70366704, + "num_input_tokens_seen": 280336375, + "step": 12997, + "time_per_iteration": 2.6432125568389893 + }, + { + "auxiliary_loss_clip": 0.01064622, + "auxiliary_loss_mlp": 0.01031482, + "balance_loss_clip": 1.02524781, + "balance_loss_mlp": 1.02001989, + "epoch": 0.7814820381782654, + "flos": 25776176878080.0, + "grad_norm": 1.963993272850784, + "language_loss": 0.82230377, + "learning_rate": 4.802192581598614e-07, + "loss": 0.84326488, + "num_input_tokens_seen": 280358760, + "step": 12998, + "time_per_iteration": 2.783942222595215 + }, + { + "auxiliary_loss_clip": 0.01030777, + "auxiliary_loss_mlp": 0.01031049, + "balance_loss_clip": 1.02055967, + "balance_loss_mlp": 1.0192945, + "epoch": 0.7815421614309335, + "flos": 20519572930560.0, + "grad_norm": 2.077844171970004, + "language_loss": 0.74444354, + "learning_rate": 4.799661169247453e-07, + "loss": 0.76506174, + "num_input_tokens_seen": 280377085, + "step": 12999, + "time_per_iteration": 2.659841775894165 + }, + { + "auxiliary_loss_clip": 0.01048708, + "auxiliary_loss_mlp": 0.01038665, + "balance_loss_clip": 1.02385426, + "balance_loss_mlp": 1.02582037, + "epoch": 0.7816022846836014, + "flos": 21287666384640.0, + "grad_norm": 1.5582555128464761, + "language_loss": 0.84512663, + "learning_rate": 4.797130333294652e-07, + "loss": 0.86600041, + "num_input_tokens_seen": 280395465, + "step": 13000, + "time_per_iteration": 4.277542352676392 + }, + { + "auxiliary_loss_clip": 0.01053496, + "auxiliary_loss_mlp": 0.01026834, + "balance_loss_clip": 1.02515101, + "balance_loss_mlp": 1.01622474, + "epoch": 0.7816624079362694, + "flos": 19208402772480.0, + "grad_norm": 2.5237400636170157, + "language_loss": 0.66122365, + "learning_rate": 4.794600073836192e-07, + "loss": 0.68202686, + "num_input_tokens_seen": 280412775, + "step": 13001, + "time_per_iteration": 2.592928886413574 + }, + { + "auxiliary_loss_clip": 0.01024417, + "auxiliary_loss_mlp": 0.01032215, + "balance_loss_clip": 1.02292919, + "balance_loss_mlp": 1.02167654, + "epoch": 0.7817225311889373, + "flos": 26104687689600.0, + "grad_norm": 2.1601885023757585, + "language_loss": 0.66871881, + "learning_rate": 4.792070390968027e-07, + "loss": 0.68928516, + "num_input_tokens_seen": 280432905, + "step": 13002, + "time_per_iteration": 2.7402193546295166 + }, + { + "auxiliary_loss_clip": 0.0105561, + "auxiliary_loss_mlp": 0.01031046, + "balance_loss_clip": 1.02608418, + "balance_loss_mlp": 1.01930404, + "epoch": 0.7817826544416053, + "flos": 21250929749760.0, + "grad_norm": 2.0736912749554985, + "language_loss": 0.73464262, + "learning_rate": 4.78954128478607e-07, + "loss": 0.75550914, + "num_input_tokens_seen": 280450785, + "step": 13003, + "time_per_iteration": 2.6705574989318848 + }, + { + "auxiliary_loss_clip": 0.01050689, + "auxiliary_loss_mlp": 0.01027605, + "balance_loss_clip": 1.0244627, + "balance_loss_mlp": 1.01751411, + "epoch": 0.7818427776942732, + "flos": 19932181822080.0, + "grad_norm": 1.6550156420728337, + "language_loss": 0.62155688, + "learning_rate": 4.787012755386233e-07, + "loss": 0.64233983, + "num_input_tokens_seen": 280468400, + "step": 13004, + "time_per_iteration": 2.5863287448883057 + }, + { + "auxiliary_loss_clip": 0.01057733, + "auxiliary_loss_mlp": 0.01026428, + "balance_loss_clip": 1.02360618, + "balance_loss_mlp": 1.01691508, + "epoch": 0.7819029009469413, + "flos": 11363753018880.0, + "grad_norm": 1.783225960920773, + "language_loss": 0.82969391, + "learning_rate": 4.784484802864403e-07, + "loss": 0.85053545, + "num_input_tokens_seen": 280483930, + "step": 13005, + "time_per_iteration": 2.620711088180542 + }, + { + "auxiliary_loss_clip": 0.01021965, + "auxiliary_loss_mlp": 0.00747686, + "balance_loss_clip": 1.02092588, + "balance_loss_mlp": 1.00035083, + "epoch": 0.7819630241996092, + "flos": 24279276470400.0, + "grad_norm": 1.8995233098495412, + "language_loss": 0.72782278, + "learning_rate": 4.781957427316432e-07, + "loss": 0.74551928, + "num_input_tokens_seen": 280503465, + "step": 13006, + "time_per_iteration": 2.7922027111053467 + }, + { + "auxiliary_loss_clip": 0.0105437, + "auxiliary_loss_mlp": 0.00747771, + "balance_loss_clip": 1.02563548, + "balance_loss_mlp": 1.00040483, + "epoch": 0.7820231474522772, + "flos": 22708902792960.0, + "grad_norm": 1.6562355915594, + "language_loss": 0.71968567, + "learning_rate": 4.779430628838157e-07, + "loss": 0.73770702, + "num_input_tokens_seen": 280523375, + "step": 13007, + "time_per_iteration": 2.8091487884521484 + }, + { + "auxiliary_loss_clip": 0.01062697, + "auxiliary_loss_mlp": 0.01027681, + "balance_loss_clip": 1.02390862, + "balance_loss_mlp": 1.01670218, + "epoch": 0.7820832707049451, + "flos": 20047419630720.0, + "grad_norm": 1.8505980014029009, + "language_loss": 0.68679827, + "learning_rate": 4.776904407525397e-07, + "loss": 0.7077021, + "num_input_tokens_seen": 280542920, + "step": 13008, + "time_per_iteration": 2.584070920944214 + }, + { + "auxiliary_loss_clip": 0.01041732, + "auxiliary_loss_mlp": 0.01024718, + "balance_loss_clip": 1.02400494, + "balance_loss_mlp": 1.01332128, + "epoch": 0.7821433939576131, + "flos": 27162795553920.0, + "grad_norm": 1.7962138320062189, + "language_loss": 0.69719809, + "learning_rate": 4.774378763473954e-07, + "loss": 0.71786261, + "num_input_tokens_seen": 280561700, + "step": 13009, + "time_per_iteration": 2.6878480911254883 + }, + { + "auxiliary_loss_clip": 0.01024469, + "auxiliary_loss_mlp": 0.01028607, + "balance_loss_clip": 1.02286601, + "balance_loss_mlp": 1.01727045, + "epoch": 0.782203517210281, + "flos": 22602068766720.0, + "grad_norm": 2.2691028675977543, + "language_loss": 0.81363559, + "learning_rate": 4.771853696779586e-07, + "loss": 0.83416641, + "num_input_tokens_seen": 280580605, + "step": 13010, + "time_per_iteration": 2.670398235321045 + }, + { + "auxiliary_loss_clip": 0.01048902, + "auxiliary_loss_mlp": 0.01030047, + "balance_loss_clip": 1.02274501, + "balance_loss_mlp": 1.02018261, + "epoch": 0.782263640462949, + "flos": 29059812535680.0, + "grad_norm": 1.4969926284067765, + "language_loss": 0.62224066, + "learning_rate": 4.76932920753806e-07, + "loss": 0.64303017, + "num_input_tokens_seen": 280601495, + "step": 13011, + "time_per_iteration": 2.6577718257904053 + }, + { + "auxiliary_loss_clip": 0.01050788, + "auxiliary_loss_mlp": 0.01025117, + "balance_loss_clip": 1.02491641, + "balance_loss_mlp": 1.01583648, + "epoch": 0.782323763715617, + "flos": 25299498464640.0, + "grad_norm": 1.653832947202734, + "language_loss": 0.70159447, + "learning_rate": 4.7668052958450913e-07, + "loss": 0.72235352, + "num_input_tokens_seen": 280622760, + "step": 13012, + "time_per_iteration": 2.628977060317993 + }, + { + "auxiliary_loss_clip": 0.01005721, + "auxiliary_loss_mlp": 0.01000756, + "balance_loss_clip": 1.00075829, + "balance_loss_mlp": 0.99982023, + "epoch": 0.782383886968285, + "flos": 65194388668800.0, + "grad_norm": 0.7199040085831336, + "language_loss": 0.55002052, + "learning_rate": 4.764281961796395e-07, + "loss": 0.57008529, + "num_input_tokens_seen": 280687115, + "step": 13013, + "time_per_iteration": 3.2408254146575928 + }, + { + "auxiliary_loss_clip": 0.01037158, + "auxiliary_loss_mlp": 0.0103226, + "balance_loss_clip": 1.02489388, + "balance_loss_mlp": 1.02115011, + "epoch": 0.782444010220953, + "flos": 18405440190720.0, + "grad_norm": 1.765639988165933, + "language_loss": 0.65182567, + "learning_rate": 4.76175920548765e-07, + "loss": 0.67251986, + "num_input_tokens_seen": 280705000, + "step": 13014, + "time_per_iteration": 2.6118714809417725 + }, + { + "auxiliary_loss_clip": 0.00988693, + "auxiliary_loss_mlp": 0.01001677, + "balance_loss_clip": 1.00237393, + "balance_loss_mlp": 1.00049686, + "epoch": 0.7825041334736209, + "flos": 63955003841280.0, + "grad_norm": 0.7176601505790915, + "language_loss": 0.58453453, + "learning_rate": 4.759237027014524e-07, + "loss": 0.60443819, + "num_input_tokens_seen": 280773525, + "step": 13015, + "time_per_iteration": 3.3145039081573486 + }, + { + "auxiliary_loss_clip": 0.01034459, + "auxiliary_loss_mlp": 0.01027272, + "balance_loss_clip": 1.02547002, + "balance_loss_mlp": 1.01756883, + "epoch": 0.7825642567262889, + "flos": 20339373375360.0, + "grad_norm": 1.6216072002141388, + "language_loss": 0.74610901, + "learning_rate": 4.756715426472666e-07, + "loss": 0.76672626, + "num_input_tokens_seen": 280791915, + "step": 13016, + "time_per_iteration": 2.614806652069092 + }, + { + "auxiliary_loss_clip": 0.01063398, + "auxiliary_loss_mlp": 0.01031684, + "balance_loss_clip": 1.02485657, + "balance_loss_mlp": 1.02013278, + "epoch": 0.7826243799789568, + "flos": 20262955190400.0, + "grad_norm": 1.9919843186338417, + "language_loss": 0.74920678, + "learning_rate": 4.7541944039576766e-07, + "loss": 0.77015758, + "num_input_tokens_seen": 280811460, + "step": 13017, + "time_per_iteration": 2.5937373638153076 + }, + { + "auxiliary_loss_clip": 0.01043721, + "auxiliary_loss_mlp": 0.01029384, + "balance_loss_clip": 1.02419448, + "balance_loss_mlp": 1.01814878, + "epoch": 0.7826845032316249, + "flos": 21132926593920.0, + "grad_norm": 2.72729857395406, + "language_loss": 0.75305533, + "learning_rate": 4.7516739595651636e-07, + "loss": 0.77378637, + "num_input_tokens_seen": 280825415, + "step": 13018, + "time_per_iteration": 4.3459250926971436 + }, + { + "auxiliary_loss_clip": 0.01061159, + "auxiliary_loss_mlp": 0.0102526, + "balance_loss_clip": 1.02384615, + "balance_loss_mlp": 1.01474547, + "epoch": 0.7827446264842928, + "flos": 22492253911680.0, + "grad_norm": 1.3835862537378956, + "language_loss": 0.7739284, + "learning_rate": 4.749154093390708e-07, + "loss": 0.79479265, + "num_input_tokens_seen": 280845335, + "step": 13019, + "time_per_iteration": 4.19604754447937 + }, + { + "auxiliary_loss_clip": 0.01022013, + "auxiliary_loss_mlp": 0.01023715, + "balance_loss_clip": 1.0243299, + "balance_loss_mlp": 1.01371288, + "epoch": 0.7828047497369608, + "flos": 28840649702400.0, + "grad_norm": 2.0857840426290912, + "language_loss": 0.67847419, + "learning_rate": 4.746634805529852e-07, + "loss": 0.69893146, + "num_input_tokens_seen": 280867145, + "step": 13020, + "time_per_iteration": 2.771786689758301 + }, + { + "auxiliary_loss_clip": 0.01054501, + "auxiliary_loss_mlp": 0.01028955, + "balance_loss_clip": 1.02735496, + "balance_loss_mlp": 1.01814902, + "epoch": 0.7828648729896287, + "flos": 23257689759360.0, + "grad_norm": 1.8541187089292748, + "language_loss": 0.62365711, + "learning_rate": 4.7441160960781325e-07, + "loss": 0.64449167, + "num_input_tokens_seen": 280886185, + "step": 13021, + "time_per_iteration": 2.6260669231414795 + }, + { + "auxiliary_loss_clip": 0.01059303, + "auxiliary_loss_mlp": 0.01027276, + "balance_loss_clip": 1.02410758, + "balance_loss_mlp": 1.017519, + "epoch": 0.7829249962422967, + "flos": 25265670831360.0, + "grad_norm": 2.0309143627618282, + "language_loss": 0.69235563, + "learning_rate": 4.7415979651310636e-07, + "loss": 0.71322143, + "num_input_tokens_seen": 280907665, + "step": 13022, + "time_per_iteration": 2.594459056854248 + }, + { + "auxiliary_loss_clip": 0.00957784, + "auxiliary_loss_mlp": 0.01002404, + "balance_loss_clip": 1.00355756, + "balance_loss_mlp": 1.00149202, + "epoch": 0.7829851194949646, + "flos": 70722044645760.0, + "grad_norm": 0.6426859013375193, + "language_loss": 0.56173146, + "learning_rate": 4.739080412784131e-07, + "loss": 0.58133328, + "num_input_tokens_seen": 280971405, + "step": 13023, + "time_per_iteration": 3.424766778945923 + }, + { + "auxiliary_loss_clip": 0.01030206, + "auxiliary_loss_mlp": 0.01030492, + "balance_loss_clip": 1.01985657, + "balance_loss_mlp": 1.02022219, + "epoch": 0.7830452427476327, + "flos": 25660795415040.0, + "grad_norm": 1.6380915404372103, + "language_loss": 0.67487121, + "learning_rate": 4.736563439132792e-07, + "loss": 0.6954782, + "num_input_tokens_seen": 280989615, + "step": 13024, + "time_per_iteration": 2.6428308486938477 + }, + { + "auxiliary_loss_clip": 0.01064286, + "auxiliary_loss_mlp": 0.01024383, + "balance_loss_clip": 1.02611148, + "balance_loss_mlp": 1.013273, + "epoch": 0.7831053660003006, + "flos": 22784315397120.0, + "grad_norm": 1.716361655865431, + "language_loss": 0.7764706, + "learning_rate": 4.734047044272498e-07, + "loss": 0.79735732, + "num_input_tokens_seen": 281009450, + "step": 13025, + "time_per_iteration": 2.625394344329834 + }, + { + "auxiliary_loss_clip": 0.01035743, + "auxiliary_loss_mlp": 0.01032808, + "balance_loss_clip": 1.02381349, + "balance_loss_mlp": 1.02250826, + "epoch": 0.7831654892529686, + "flos": 25812267068160.0, + "grad_norm": 1.707279480513865, + "language_loss": 0.77958393, + "learning_rate": 4.731531228298673e-07, + "loss": 0.80026942, + "num_input_tokens_seen": 281028120, + "step": 13026, + "time_per_iteration": 2.6711204051971436 + }, + { + "auxiliary_loss_clip": 0.01050445, + "auxiliary_loss_mlp": 0.01023702, + "balance_loss_clip": 1.02473891, + "balance_loss_mlp": 1.01342654, + "epoch": 0.7832256125056366, + "flos": 20771557816320.0, + "grad_norm": 2.139007831137979, + "language_loss": 0.75472379, + "learning_rate": 4.729015991306715e-07, + "loss": 0.77546531, + "num_input_tokens_seen": 281042130, + "step": 13027, + "time_per_iteration": 2.5907845497131348 + }, + { + "auxiliary_loss_clip": 0.01052164, + "auxiliary_loss_mlp": 0.01027097, + "balance_loss_clip": 1.02500761, + "balance_loss_mlp": 1.01722074, + "epoch": 0.7832857357583045, + "flos": 21506541909120.0, + "grad_norm": 1.9162429556241938, + "language_loss": 0.70580149, + "learning_rate": 4.726501333391997e-07, + "loss": 0.72659409, + "num_input_tokens_seen": 281060945, + "step": 13028, + "time_per_iteration": 2.644667863845825 + }, + { + "auxiliary_loss_clip": 0.01022868, + "auxiliary_loss_mlp": 0.01039173, + "balance_loss_clip": 1.02725744, + "balance_loss_mlp": 1.02802062, + "epoch": 0.7833458590109725, + "flos": 18077791305600.0, + "grad_norm": 2.038518667123853, + "language_loss": 0.69774187, + "learning_rate": 4.7239872546498774e-07, + "loss": 0.71836227, + "num_input_tokens_seen": 281079270, + "step": 13029, + "time_per_iteration": 2.743274450302124 + }, + { + "auxiliary_loss_clip": 0.0103252, + "auxiliary_loss_mlp": 0.01026939, + "balance_loss_clip": 1.02416229, + "balance_loss_mlp": 1.01610303, + "epoch": 0.7834059822636404, + "flos": 28288738252800.0, + "grad_norm": 1.64988312310823, + "language_loss": 0.80945754, + "learning_rate": 4.721473755175698e-07, + "loss": 0.83005214, + "num_input_tokens_seen": 281099500, + "step": 13030, + "time_per_iteration": 2.7288942337036133 + }, + { + "auxiliary_loss_clip": 0.01056027, + "auxiliary_loss_mlp": 0.01027574, + "balance_loss_clip": 1.02590048, + "balance_loss_mlp": 1.01694632, + "epoch": 0.7834661055163085, + "flos": 31686211088640.0, + "grad_norm": 1.6892290225684325, + "language_loss": 0.70508885, + "learning_rate": 4.71896083506476e-07, + "loss": 0.72592485, + "num_input_tokens_seen": 281121250, + "step": 13031, + "time_per_iteration": 2.7074062824249268 + }, + { + "auxiliary_loss_clip": 0.0103189, + "auxiliary_loss_mlp": 0.01031003, + "balance_loss_clip": 1.02432394, + "balance_loss_mlp": 1.02047658, + "epoch": 0.7835262287689764, + "flos": 12933192942720.0, + "grad_norm": 1.9899561520966766, + "language_loss": 0.7879864, + "learning_rate": 4.7164484944123574e-07, + "loss": 0.80861533, + "num_input_tokens_seen": 281138760, + "step": 13032, + "time_per_iteration": 2.661360740661621 + }, + { + "auxiliary_loss_clip": 0.0105813, + "auxiliary_loss_mlp": 0.01033679, + "balance_loss_clip": 1.02792835, + "balance_loss_mlp": 1.0227356, + "epoch": 0.7835863520216444, + "flos": 16143211676160.0, + "grad_norm": 2.5216615957356927, + "language_loss": 0.63091218, + "learning_rate": 4.7139367333137726e-07, + "loss": 0.65183032, + "num_input_tokens_seen": 281157420, + "step": 13033, + "time_per_iteration": 2.5933730602264404 + }, + { + "auxiliary_loss_clip": 0.01051256, + "auxiliary_loss_mlp": 0.01028785, + "balance_loss_clip": 1.02429903, + "balance_loss_mlp": 1.01773465, + "epoch": 0.7836464752743123, + "flos": 11509909459200.0, + "grad_norm": 1.482977473084605, + "language_loss": 0.72234875, + "learning_rate": 4.7114255518642255e-07, + "loss": 0.74314916, + "num_input_tokens_seen": 281174620, + "step": 13034, + "time_per_iteration": 2.5854036808013916 + }, + { + "auxiliary_loss_clip": 0.01064097, + "auxiliary_loss_mlp": 0.00747721, + "balance_loss_clip": 1.02585804, + "balance_loss_mlp": 1.00032187, + "epoch": 0.7837065985269803, + "flos": 18223696350720.0, + "grad_norm": 1.8918012952109475, + "language_loss": 0.7196995, + "learning_rate": 4.7089149501589555e-07, + "loss": 0.73781765, + "num_input_tokens_seen": 281193865, + "step": 13035, + "time_per_iteration": 2.549330949783325 + }, + { + "auxiliary_loss_clip": 0.01063726, + "auxiliary_loss_mlp": 0.01033539, + "balance_loss_clip": 1.02624273, + "balance_loss_mlp": 1.02226758, + "epoch": 0.7837667217796482, + "flos": 24754410599040.0, + "grad_norm": 1.9799267470386053, + "language_loss": 0.66121149, + "learning_rate": 4.7064049282931664e-07, + "loss": 0.68218422, + "num_input_tokens_seen": 281212250, + "step": 13036, + "time_per_iteration": 4.456714391708374 + }, + { + "auxiliary_loss_clip": 0.01055915, + "auxiliary_loss_mlp": 0.01031885, + "balance_loss_clip": 1.02528715, + "balance_loss_mlp": 1.02023864, + "epoch": 0.7838268450323163, + "flos": 22383121415040.0, + "grad_norm": 2.5543148754012845, + "language_loss": 0.72652996, + "learning_rate": 4.703895486362031e-07, + "loss": 0.74740791, + "num_input_tokens_seen": 281230850, + "step": 13037, + "time_per_iteration": 2.6687214374542236 + }, + { + "auxiliary_loss_clip": 0.01020506, + "auxiliary_loss_mlp": 0.01033759, + "balance_loss_clip": 1.02067614, + "balance_loss_mlp": 1.02214217, + "epoch": 0.7838869682849842, + "flos": 19500284689920.0, + "grad_norm": 2.081861589582317, + "language_loss": 0.59915668, + "learning_rate": 4.701386624460717e-07, + "loss": 0.6196993, + "num_input_tokens_seen": 281249810, + "step": 13038, + "time_per_iteration": 2.6961145401000977 + }, + { + "auxiliary_loss_clip": 0.01040893, + "auxiliary_loss_mlp": 0.01026768, + "balance_loss_clip": 1.02477527, + "balance_loss_mlp": 1.01675487, + "epoch": 0.7839470915376522, + "flos": 32892845690880.0, + "grad_norm": 1.502843749086736, + "language_loss": 0.68167996, + "learning_rate": 4.698878342684349e-07, + "loss": 0.70235658, + "num_input_tokens_seen": 281273730, + "step": 13039, + "time_per_iteration": 2.7676920890808105 + }, + { + "auxiliary_loss_clip": 0.01026012, + "auxiliary_loss_mlp": 0.01022095, + "balance_loss_clip": 1.02186406, + "balance_loss_mlp": 1.01315391, + "epoch": 0.7840072147903202, + "flos": 29676003373440.0, + "grad_norm": 1.7849736816845712, + "language_loss": 0.69018435, + "learning_rate": 4.6963706411280537e-07, + "loss": 0.71066546, + "num_input_tokens_seen": 281293670, + "step": 13040, + "time_per_iteration": 2.875683069229126 + }, + { + "auxiliary_loss_clip": 0.01022203, + "auxiliary_loss_mlp": 0.01028864, + "balance_loss_clip": 1.02703977, + "balance_loss_mlp": 1.01746809, + "epoch": 0.7840673380429881, + "flos": 18186744234240.0, + "grad_norm": 1.4012833121900163, + "language_loss": 0.67471349, + "learning_rate": 4.6938635198869116e-07, + "loss": 0.69522417, + "num_input_tokens_seen": 281313070, + "step": 13041, + "time_per_iteration": 2.7296342849731445 + }, + { + "auxiliary_loss_clip": 0.00997597, + "auxiliary_loss_mlp": 0.00746796, + "balance_loss_clip": 1.00214469, + "balance_loss_mlp": 1.00076914, + "epoch": 0.7841274612956561, + "flos": 66346006613760.0, + "grad_norm": 0.6762878361292, + "language_loss": 0.57461476, + "learning_rate": 4.691356979055998e-07, + "loss": 0.59205866, + "num_input_tokens_seen": 281374880, + "step": 13042, + "time_per_iteration": 3.181576728820801 + }, + { + "auxiliary_loss_clip": 0.01036339, + "auxiliary_loss_mlp": 0.01027817, + "balance_loss_clip": 1.02353096, + "balance_loss_mlp": 1.01691496, + "epoch": 0.784187584548324, + "flos": 26648482665600.0, + "grad_norm": 1.7831791075440575, + "language_loss": 0.83955938, + "learning_rate": 4.688851018730369e-07, + "loss": 0.86020094, + "num_input_tokens_seen": 281392620, + "step": 13043, + "time_per_iteration": 2.723369598388672 + }, + { + "auxiliary_loss_clip": 0.01049791, + "auxiliary_loss_mlp": 0.0102638, + "balance_loss_clip": 1.0239743, + "balance_loss_mlp": 1.01582992, + "epoch": 0.7842477078009921, + "flos": 25740158515200.0, + "grad_norm": 1.3849327078014162, + "language_loss": 0.88518536, + "learning_rate": 4.6863456390050425e-07, + "loss": 0.90594709, + "num_input_tokens_seen": 281413140, + "step": 13044, + "time_per_iteration": 2.725630760192871 + }, + { + "auxiliary_loss_clip": 0.01044766, + "auxiliary_loss_mlp": 0.01031706, + "balance_loss_clip": 1.02514625, + "balance_loss_mlp": 1.02076876, + "epoch": 0.78430783105366, + "flos": 21980957765760.0, + "grad_norm": 1.681025143933823, + "language_loss": 0.79032373, + "learning_rate": 4.6838408399750195e-07, + "loss": 0.81108844, + "num_input_tokens_seen": 281430860, + "step": 13045, + "time_per_iteration": 2.6526503562927246 + }, + { + "auxiliary_loss_clip": 0.01040569, + "auxiliary_loss_mlp": 0.01024741, + "balance_loss_clip": 1.0240612, + "balance_loss_mlp": 1.01502573, + "epoch": 0.784367954306328, + "flos": 23842279607040.0, + "grad_norm": 1.5670392767786365, + "language_loss": 0.72285581, + "learning_rate": 4.6813366217352925e-07, + "loss": 0.74350893, + "num_input_tokens_seen": 281451385, + "step": 13046, + "time_per_iteration": 2.6602323055267334 + }, + { + "auxiliary_loss_clip": 0.01019898, + "auxiliary_loss_mlp": 0.01036119, + "balance_loss_clip": 1.0257678, + "balance_loss_mlp": 1.02508032, + "epoch": 0.7844280775589959, + "flos": 24826662806400.0, + "grad_norm": 1.6383046221348077, + "language_loss": 0.63207215, + "learning_rate": 4.678832984380809e-07, + "loss": 0.65263236, + "num_input_tokens_seen": 281472255, + "step": 13047, + "time_per_iteration": 4.434095621109009 + }, + { + "auxiliary_loss_clip": 0.01051598, + "auxiliary_loss_mlp": 0.01025941, + "balance_loss_clip": 1.02476716, + "balance_loss_mlp": 1.01574874, + "epoch": 0.7844882008116639, + "flos": 22455660931200.0, + "grad_norm": 1.6891154082208222, + "language_loss": 0.73172456, + "learning_rate": 4.676329928006515e-07, + "loss": 0.75249994, + "num_input_tokens_seen": 281492860, + "step": 13048, + "time_per_iteration": 2.6178061962127686 + }, + { + "auxiliary_loss_clip": 0.01045571, + "auxiliary_loss_mlp": 0.01029349, + "balance_loss_clip": 1.0269537, + "balance_loss_mlp": 1.01844692, + "epoch": 0.7845483240643318, + "flos": 26104041244800.0, + "grad_norm": 1.772388420070464, + "language_loss": 0.74406242, + "learning_rate": 4.6738274527073243e-07, + "loss": 0.76481164, + "num_input_tokens_seen": 281511815, + "step": 13049, + "time_per_iteration": 2.6646313667297363 + }, + { + "auxiliary_loss_clip": 0.01064192, + "auxiliary_loss_mlp": 0.0102999, + "balance_loss_clip": 1.02458644, + "balance_loss_mlp": 1.01829565, + "epoch": 0.7846084473169999, + "flos": 19354307817600.0, + "grad_norm": 1.6961103305336855, + "language_loss": 0.72370797, + "learning_rate": 4.6713255585781454e-07, + "loss": 0.74464977, + "num_input_tokens_seen": 281530090, + "step": 13050, + "time_per_iteration": 2.6378564834594727 + }, + { + "auxiliary_loss_clip": 0.01047986, + "auxiliary_loss_mlp": 0.01032116, + "balance_loss_clip": 1.02359617, + "balance_loss_mlp": 1.02172089, + "epoch": 0.7846685705696678, + "flos": 23325811902720.0, + "grad_norm": 2.017123062483192, + "language_loss": 0.73918676, + "learning_rate": 4.668824245713825e-07, + "loss": 0.75998783, + "num_input_tokens_seen": 281547075, + "step": 13051, + "time_per_iteration": 2.7661454677581787 + }, + { + "auxiliary_loss_clip": 0.01063965, + "auxiliary_loss_mlp": 0.01033647, + "balance_loss_clip": 1.02605772, + "balance_loss_mlp": 1.02242947, + "epoch": 0.7847286938223358, + "flos": 35809545962880.0, + "grad_norm": 2.1947285248776622, + "language_loss": 0.72737426, + "learning_rate": 4.666323514209227e-07, + "loss": 0.74835038, + "num_input_tokens_seen": 281568080, + "step": 13052, + "time_per_iteration": 2.6972503662109375 + }, + { + "auxiliary_loss_clip": 0.01039926, + "auxiliary_loss_mlp": 0.01029637, + "balance_loss_clip": 1.02533984, + "balance_loss_mlp": 1.01965904, + "epoch": 0.7847888170750038, + "flos": 18478159274880.0, + "grad_norm": 1.914323770080079, + "language_loss": 0.69057322, + "learning_rate": 4.663823364159183e-07, + "loss": 0.71126878, + "num_input_tokens_seen": 281586925, + "step": 13053, + "time_per_iteration": 2.610637664794922 + }, + { + "auxiliary_loss_clip": 0.01042278, + "auxiliary_loss_mlp": 0.01028256, + "balance_loss_clip": 1.02375865, + "balance_loss_mlp": 1.01848125, + "epoch": 0.7848489403276717, + "flos": 25119155255040.0, + "grad_norm": 2.062679818489175, + "language_loss": 0.70208824, + "learning_rate": 4.6613237956584893e-07, + "loss": 0.72279358, + "num_input_tokens_seen": 281603915, + "step": 13054, + "time_per_iteration": 2.63852596282959 + }, + { + "auxiliary_loss_clip": 0.01053851, + "auxiliary_loss_mlp": 0.01033644, + "balance_loss_clip": 1.0253371, + "balance_loss_mlp": 1.02281952, + "epoch": 0.7849090635803397, + "flos": 26502433966080.0, + "grad_norm": 2.266754842874936, + "language_loss": 0.75842464, + "learning_rate": 4.658824808801938e-07, + "loss": 0.77929962, + "num_input_tokens_seen": 281624220, + "step": 13055, + "time_per_iteration": 2.6883974075317383 + }, + { + "auxiliary_loss_clip": 0.01066607, + "auxiliary_loss_mlp": 0.01031113, + "balance_loss_clip": 1.02614021, + "balance_loss_mlp": 1.01992512, + "epoch": 0.7849691868330076, + "flos": 20959658363520.0, + "grad_norm": 1.9524898868662233, + "language_loss": 0.74620426, + "learning_rate": 4.656326403684283e-07, + "loss": 0.7671814, + "num_input_tokens_seen": 281642325, + "step": 13056, + "time_per_iteration": 2.597714424133301 + }, + { + "auxiliary_loss_clip": 0.01006016, + "auxiliary_loss_mlp": 0.01027132, + "balance_loss_clip": 1.02500296, + "balance_loss_mlp": 1.01655221, + "epoch": 0.7850293100856757, + "flos": 26067484177920.0, + "grad_norm": 1.6643132824614744, + "language_loss": 0.70211732, + "learning_rate": 4.6538285804002744e-07, + "loss": 0.72244883, + "num_input_tokens_seen": 281663065, + "step": 13057, + "time_per_iteration": 2.8055450916290283 + }, + { + "auxiliary_loss_clip": 0.01028308, + "auxiliary_loss_mlp": 0.01034245, + "balance_loss_clip": 1.02760923, + "balance_loss_mlp": 1.02368295, + "epoch": 0.7850894333383436, + "flos": 22491894775680.0, + "grad_norm": 1.7650668115428694, + "language_loss": 0.76423949, + "learning_rate": 4.6513313390446175e-07, + "loss": 0.78486502, + "num_input_tokens_seen": 281681005, + "step": 13058, + "time_per_iteration": 2.696023941040039 + }, + { + "auxiliary_loss_clip": 0.01051657, + "auxiliary_loss_mlp": 0.0102985, + "balance_loss_clip": 1.02538395, + "balance_loss_mlp": 1.01879358, + "epoch": 0.7851495565910116, + "flos": 20558643949440.0, + "grad_norm": 2.6969715666930734, + "language_loss": 0.70769495, + "learning_rate": 4.6488346797120146e-07, + "loss": 0.72851002, + "num_input_tokens_seen": 281697965, + "step": 13059, + "time_per_iteration": 2.6312241554260254 + }, + { + "auxiliary_loss_clip": 0.01032065, + "auxiliary_loss_mlp": 0.01036615, + "balance_loss_clip": 1.02389359, + "balance_loss_mlp": 1.02462244, + "epoch": 0.7852096798436795, + "flos": 15924838942080.0, + "grad_norm": 2.0667248654080814, + "language_loss": 0.76231247, + "learning_rate": 4.646338602497144e-07, + "loss": 0.78299928, + "num_input_tokens_seen": 281716035, + "step": 13060, + "time_per_iteration": 2.657397508621216 + }, + { + "auxiliary_loss_clip": 0.01035561, + "auxiliary_loss_mlp": 0.0102632, + "balance_loss_clip": 1.02531481, + "balance_loss_mlp": 1.01552534, + "epoch": 0.7852698030963475, + "flos": 19062282245760.0, + "grad_norm": 1.9861038527642152, + "language_loss": 0.77082497, + "learning_rate": 4.643843107494654e-07, + "loss": 0.79144382, + "num_input_tokens_seen": 281732815, + "step": 13061, + "time_per_iteration": 2.5986647605895996 + }, + { + "auxiliary_loss_clip": 0.01031842, + "auxiliary_loss_mlp": 0.01031598, + "balance_loss_clip": 1.02327621, + "balance_loss_mlp": 1.02010036, + "epoch": 0.7853299263490154, + "flos": 24644380262400.0, + "grad_norm": 1.9374591905984182, + "language_loss": 0.74541718, + "learning_rate": 4.641348194799164e-07, + "loss": 0.76605159, + "num_input_tokens_seen": 281751980, + "step": 13062, + "time_per_iteration": 2.7084107398986816 + }, + { + "auxiliary_loss_clip": 0.01048519, + "auxiliary_loss_mlp": 0.01030942, + "balance_loss_clip": 1.02282631, + "balance_loss_mlp": 1.02042139, + "epoch": 0.7853900496016835, + "flos": 22017981709440.0, + "grad_norm": 1.5696403605902671, + "language_loss": 0.68363053, + "learning_rate": 4.638853864505297e-07, + "loss": 0.7044251, + "num_input_tokens_seen": 281772670, + "step": 13063, + "time_per_iteration": 2.7057740688323975 + }, + { + "auxiliary_loss_clip": 0.01048625, + "auxiliary_loss_mlp": 0.01029018, + "balance_loss_clip": 1.02603436, + "balance_loss_mlp": 1.01884317, + "epoch": 0.7854501728543514, + "flos": 30227412032640.0, + "grad_norm": 2.3250402575347393, + "language_loss": 0.73199421, + "learning_rate": 4.636360116707625e-07, + "loss": 0.75277066, + "num_input_tokens_seen": 281792930, + "step": 13064, + "time_per_iteration": 2.74401593208313 + }, + { + "auxiliary_loss_clip": 0.01035206, + "auxiliary_loss_mlp": 0.01032394, + "balance_loss_clip": 1.02604032, + "balance_loss_mlp": 1.021451, + "epoch": 0.7855102961070194, + "flos": 18843694030080.0, + "grad_norm": 1.9384525719227157, + "language_loss": 0.67835641, + "learning_rate": 4.633866951500718e-07, + "loss": 0.69903237, + "num_input_tokens_seen": 281811805, + "step": 13065, + "time_per_iteration": 2.7445223331451416 + }, + { + "auxiliary_loss_clip": 0.01056086, + "auxiliary_loss_mlp": 0.01031507, + "balance_loss_clip": 1.02824259, + "balance_loss_mlp": 1.0210166, + "epoch": 0.7855704193596874, + "flos": 22309971367680.0, + "grad_norm": 2.0923411416234883, + "language_loss": 0.76262897, + "learning_rate": 4.6313743689791196e-07, + "loss": 0.78350496, + "num_input_tokens_seen": 281831885, + "step": 13066, + "time_per_iteration": 5.903966903686523 + }, + { + "auxiliary_loss_clip": 0.01006078, + "auxiliary_loss_mlp": 0.01001792, + "balance_loss_clip": 1.00095987, + "balance_loss_mlp": 1.00098193, + "epoch": 0.7856305426123553, + "flos": 60004434407040.0, + "grad_norm": 0.7060441020287839, + "language_loss": 0.53422374, + "learning_rate": 4.628882369237346e-07, + "loss": 0.55430239, + "num_input_tokens_seen": 281900310, + "step": 13067, + "time_per_iteration": 3.236726999282837 + }, + { + "auxiliary_loss_clip": 0.01023774, + "auxiliary_loss_mlp": 0.01027876, + "balance_loss_clip": 1.0256052, + "balance_loss_mlp": 1.01722455, + "epoch": 0.7856906658650233, + "flos": 21868593045120.0, + "grad_norm": 1.5577690757329827, + "language_loss": 0.67663133, + "learning_rate": 4.62639095236989e-07, + "loss": 0.69714773, + "num_input_tokens_seen": 281918870, + "step": 13068, + "time_per_iteration": 2.898750066757202 + }, + { + "auxiliary_loss_clip": 0.01031793, + "auxiliary_loss_mlp": 0.01025298, + "balance_loss_clip": 1.02599788, + "balance_loss_mlp": 1.01543343, + "epoch": 0.7857507891176913, + "flos": 23622937205760.0, + "grad_norm": 2.4259784757530696, + "language_loss": 0.68015766, + "learning_rate": 4.6239001184712267e-07, + "loss": 0.70072854, + "num_input_tokens_seen": 281936905, + "step": 13069, + "time_per_iteration": 2.740966796875 + }, + { + "auxiliary_loss_clip": 0.01054256, + "auxiliary_loss_mlp": 0.01031268, + "balance_loss_clip": 1.0263555, + "balance_loss_mlp": 1.0207237, + "epoch": 0.7858109123703593, + "flos": 25520061928320.0, + "grad_norm": 1.4482894046123984, + "language_loss": 0.76860225, + "learning_rate": 4.6214098676358195e-07, + "loss": 0.78945744, + "num_input_tokens_seen": 281955625, + "step": 13070, + "time_per_iteration": 2.6722843647003174 + }, + { + "auxiliary_loss_clip": 0.01005857, + "auxiliary_loss_mlp": 0.01037228, + "balance_loss_clip": 1.01980519, + "balance_loss_mlp": 1.02542627, + "epoch": 0.7858710356230272, + "flos": 17457398576640.0, + "grad_norm": 1.8076184157666637, + "language_loss": 0.65816587, + "learning_rate": 4.618920199958083e-07, + "loss": 0.67859674, + "num_input_tokens_seen": 281973285, + "step": 13071, + "time_per_iteration": 2.689516544342041 + }, + { + "auxiliary_loss_clip": 0.01013389, + "auxiliary_loss_mlp": 0.01026442, + "balance_loss_clip": 1.02122521, + "balance_loss_mlp": 1.01619577, + "epoch": 0.7859311588756952, + "flos": 24679680353280.0, + "grad_norm": 1.5334094578348985, + "language_loss": 0.73814094, + "learning_rate": 4.616431115532442e-07, + "loss": 0.75853932, + "num_input_tokens_seen": 281991410, + "step": 13072, + "time_per_iteration": 2.7649528980255127 + }, + { + "auxiliary_loss_clip": 0.01057392, + "auxiliary_loss_mlp": 0.01032162, + "balance_loss_clip": 1.02906275, + "balance_loss_mlp": 1.02024698, + "epoch": 0.7859912821283631, + "flos": 21799142098560.0, + "grad_norm": 1.7078192302547408, + "language_loss": 0.7097702, + "learning_rate": 4.613942614453268e-07, + "loss": 0.73066574, + "num_input_tokens_seen": 282010845, + "step": 13073, + "time_per_iteration": 2.6128828525543213 + }, + { + "auxiliary_loss_clip": 0.01044511, + "auxiliary_loss_mlp": 0.01028826, + "balance_loss_clip": 1.02713335, + "balance_loss_mlp": 1.01735234, + "epoch": 0.7860514053810311, + "flos": 20847293642880.0, + "grad_norm": 1.9966174085644823, + "language_loss": 0.7665363, + "learning_rate": 4.611454696814938e-07, + "loss": 0.78726971, + "num_input_tokens_seen": 282029635, + "step": 13074, + "time_per_iteration": 2.7044565677642822 + }, + { + "auxiliary_loss_clip": 0.01016154, + "auxiliary_loss_mlp": 0.01027646, + "balance_loss_clip": 1.02089357, + "balance_loss_mlp": 1.01748323, + "epoch": 0.786111528633699, + "flos": 24315689882880.0, + "grad_norm": 1.6587983056568814, + "language_loss": 0.74845421, + "learning_rate": 4.608967362711782e-07, + "loss": 0.76889229, + "num_input_tokens_seen": 282050285, + "step": 13075, + "time_per_iteration": 2.7082929611206055 + }, + { + "auxiliary_loss_clip": 0.01036111, + "auxiliary_loss_mlp": 0.01026048, + "balance_loss_clip": 1.02669859, + "balance_loss_mlp": 1.01611757, + "epoch": 0.7861716518863671, + "flos": 24353180703360.0, + "grad_norm": 5.597930623007102, + "language_loss": 0.68854469, + "learning_rate": 4.6064806122381283e-07, + "loss": 0.70916629, + "num_input_tokens_seen": 282071040, + "step": 13076, + "time_per_iteration": 2.7588675022125244 + }, + { + "auxiliary_loss_clip": 0.01046673, + "auxiliary_loss_mlp": 0.01028758, + "balance_loss_clip": 1.02355993, + "balance_loss_mlp": 1.01821995, + "epoch": 0.786231775139035, + "flos": 14022399006720.0, + "grad_norm": 1.9363410847545937, + "language_loss": 0.79944193, + "learning_rate": 4.603994445488282e-07, + "loss": 0.82019621, + "num_input_tokens_seen": 282086610, + "step": 13077, + "time_per_iteration": 2.5963971614837646 + }, + { + "auxiliary_loss_clip": 0.0105421, + "auxiliary_loss_mlp": 0.01029154, + "balance_loss_clip": 1.02668285, + "balance_loss_mlp": 1.01840186, + "epoch": 0.786291898391703, + "flos": 33724248865920.0, + "grad_norm": 1.5153922196279153, + "language_loss": 0.70771229, + "learning_rate": 4.6015088625564956e-07, + "loss": 0.7285459, + "num_input_tokens_seen": 282107440, + "step": 13078, + "time_per_iteration": 2.753413200378418 + }, + { + "auxiliary_loss_clip": 0.01046907, + "auxiliary_loss_mlp": 0.0102939, + "balance_loss_clip": 1.02469528, + "balance_loss_mlp": 1.01888156, + "epoch": 0.786352021644371, + "flos": 25811476968960.0, + "grad_norm": 1.8425632960394767, + "language_loss": 0.81539118, + "learning_rate": 4.599023863537039e-07, + "loss": 0.8361541, + "num_input_tokens_seen": 282127290, + "step": 13079, + "time_per_iteration": 2.6333634853363037 + }, + { + "auxiliary_loss_clip": 0.0102905, + "auxiliary_loss_mlp": 0.01025637, + "balance_loss_clip": 1.02643442, + "balance_loss_mlp": 1.01506925, + "epoch": 0.7864121448970389, + "flos": 28910818920960.0, + "grad_norm": 1.4125324415098606, + "language_loss": 0.68335366, + "learning_rate": 4.596539448524146e-07, + "loss": 0.70390052, + "num_input_tokens_seen": 282147505, + "step": 13080, + "time_per_iteration": 2.7948694229125977 + }, + { + "auxiliary_loss_clip": 0.01053912, + "auxiliary_loss_mlp": 0.01029156, + "balance_loss_clip": 1.02608657, + "balance_loss_mlp": 1.01835561, + "epoch": 0.7864722681497069, + "flos": 19208833735680.0, + "grad_norm": 2.279252246116686, + "language_loss": 0.69375551, + "learning_rate": 4.594055617612016e-07, + "loss": 0.71458626, + "num_input_tokens_seen": 282166450, + "step": 13081, + "time_per_iteration": 2.6750664710998535 + }, + { + "auxiliary_loss_clip": 0.01043044, + "auxiliary_loss_mlp": 0.01032725, + "balance_loss_clip": 1.02523744, + "balance_loss_mlp": 1.02261043, + "epoch": 0.7865323914023749, + "flos": 21871573873920.0, + "grad_norm": 1.62694074258465, + "language_loss": 0.68602824, + "learning_rate": 4.591572370894838e-07, + "loss": 0.70678598, + "num_input_tokens_seen": 282186465, + "step": 13082, + "time_per_iteration": 2.7199840545654297 + }, + { + "auxiliary_loss_clip": 0.01038676, + "auxiliary_loss_mlp": 0.010331, + "balance_loss_clip": 1.02379286, + "balance_loss_mlp": 1.02214491, + "epoch": 0.7865925146550429, + "flos": 25520313323520.0, + "grad_norm": 1.7718188335237117, + "language_loss": 0.66231143, + "learning_rate": 4.589089708466789e-07, + "loss": 0.68302917, + "num_input_tokens_seen": 282207180, + "step": 13083, + "time_per_iteration": 4.937463283538818 + }, + { + "auxiliary_loss_clip": 0.01044463, + "auxiliary_loss_mlp": 0.01030519, + "balance_loss_clip": 1.02588761, + "balance_loss_mlp": 1.01923025, + "epoch": 0.7866526379077108, + "flos": 19097366855040.0, + "grad_norm": 2.243363658541004, + "language_loss": 0.75122976, + "learning_rate": 4.5866076304220015e-07, + "loss": 0.77197957, + "num_input_tokens_seen": 282225865, + "step": 13084, + "time_per_iteration": 2.6203227043151855 + }, + { + "auxiliary_loss_clip": 0.01035966, + "auxiliary_loss_mlp": 0.01031755, + "balance_loss_clip": 1.02412963, + "balance_loss_mlp": 1.02069283, + "epoch": 0.7867127611603788, + "flos": 16173771171840.0, + "grad_norm": 1.9026397716035093, + "language_loss": 0.70716226, + "learning_rate": 4.584126136854591e-07, + "loss": 0.72783947, + "num_input_tokens_seen": 282242895, + "step": 13085, + "time_per_iteration": 2.6225602626800537 + }, + { + "auxiliary_loss_clip": 0.01043959, + "auxiliary_loss_mlp": 0.01028237, + "balance_loss_clip": 1.02424288, + "balance_loss_mlp": 1.01717448, + "epoch": 0.7867728844130467, + "flos": 20773640805120.0, + "grad_norm": 1.7788334652328173, + "language_loss": 0.72197437, + "learning_rate": 4.5816452278586617e-07, + "loss": 0.74269629, + "num_input_tokens_seen": 282260425, + "step": 13086, + "time_per_iteration": 2.654608964920044 + }, + { + "auxiliary_loss_clip": 0.01061064, + "auxiliary_loss_mlp": 0.01025087, + "balance_loss_clip": 1.02399588, + "balance_loss_mlp": 1.01491821, + "epoch": 0.7868330076657147, + "flos": 21760106993280.0, + "grad_norm": 1.835601320931333, + "language_loss": 0.74725944, + "learning_rate": 4.5791649035282965e-07, + "loss": 0.76812094, + "num_input_tokens_seen": 282279335, + "step": 13087, + "time_per_iteration": 2.7683916091918945 + }, + { + "auxiliary_loss_clip": 0.01039839, + "auxiliary_loss_mlp": 0.01024637, + "balance_loss_clip": 1.02330542, + "balance_loss_mlp": 1.01499891, + "epoch": 0.7868931309183826, + "flos": 25700692446720.0, + "grad_norm": 1.5779489552098669, + "language_loss": 0.71396166, + "learning_rate": 4.5766851639575456e-07, + "loss": 0.73460644, + "num_input_tokens_seen": 282299905, + "step": 13088, + "time_per_iteration": 2.8536553382873535 + }, + { + "auxiliary_loss_clip": 0.01006032, + "auxiliary_loss_mlp": 0.01002166, + "balance_loss_clip": 1.00084555, + "balance_loss_mlp": 1.00129604, + "epoch": 0.7869532541710507, + "flos": 64644883430400.0, + "grad_norm": 0.6784745867789648, + "language_loss": 0.55494797, + "learning_rate": 4.574206009240431e-07, + "loss": 0.57502997, + "num_input_tokens_seen": 282367620, + "step": 13089, + "time_per_iteration": 3.2881722450256348 + }, + { + "auxiliary_loss_clip": 0.00995907, + "auxiliary_loss_mlp": 0.00999856, + "balance_loss_clip": 1.00094247, + "balance_loss_mlp": 0.99894953, + "epoch": 0.7870133774237186, + "flos": 67453600440960.0, + "grad_norm": 0.7221530868613365, + "language_loss": 0.49977064, + "learning_rate": 4.571727439470976e-07, + "loss": 0.51972818, + "num_input_tokens_seen": 282435695, + "step": 13090, + "time_per_iteration": 3.3788599967956543 + }, + { + "auxiliary_loss_clip": 0.01053374, + "auxiliary_loss_mlp": 0.01024348, + "balance_loss_clip": 1.02634943, + "balance_loss_mlp": 1.01467443, + "epoch": 0.7870735006763866, + "flos": 26068310190720.0, + "grad_norm": 1.4924683405660892, + "language_loss": 0.83788681, + "learning_rate": 4.5692494547431583e-07, + "loss": 0.85866404, + "num_input_tokens_seen": 282456025, + "step": 13091, + "time_per_iteration": 2.671776056289673 + }, + { + "auxiliary_loss_clip": 0.00997122, + "auxiliary_loss_mlp": 0.0100205, + "balance_loss_clip": 1.00170791, + "balance_loss_mlp": 1.00117958, + "epoch": 0.7871336239290546, + "flos": 70289572896000.0, + "grad_norm": 0.7098899212543692, + "language_loss": 0.63979661, + "learning_rate": 4.566772055150947e-07, + "loss": 0.65978837, + "num_input_tokens_seen": 282520995, + "step": 13092, + "time_per_iteration": 3.2219138145446777 + }, + { + "auxiliary_loss_clip": 0.01046601, + "auxiliary_loss_mlp": 0.01030927, + "balance_loss_clip": 1.02719724, + "balance_loss_mlp": 1.0203414, + "epoch": 0.7871937471817225, + "flos": 15778574760960.0, + "grad_norm": 2.211054981460252, + "language_loss": 0.79010439, + "learning_rate": 4.564295240788285e-07, + "loss": 0.81087971, + "num_input_tokens_seen": 282539355, + "step": 13093, + "time_per_iteration": 2.8050570487976074 + }, + { + "auxiliary_loss_clip": 0.01043306, + "auxiliary_loss_mlp": 0.0102786, + "balance_loss_clip": 1.02705407, + "balance_loss_mlp": 1.01747072, + "epoch": 0.7872538704343905, + "flos": 20485242506880.0, + "grad_norm": 1.8804303142710852, + "language_loss": 0.75622189, + "learning_rate": 4.561819011749106e-07, + "loss": 0.77693355, + "num_input_tokens_seen": 282555735, + "step": 13094, + "time_per_iteration": 4.462306022644043 + }, + { + "auxiliary_loss_clip": 0.01011334, + "auxiliary_loss_mlp": 0.01039868, + "balance_loss_clip": 1.02104545, + "balance_loss_mlp": 1.02792907, + "epoch": 0.7873139936870585, + "flos": 25082670015360.0, + "grad_norm": 1.5860076717642337, + "language_loss": 0.79871333, + "learning_rate": 4.5593433681272884e-07, + "loss": 0.81922537, + "num_input_tokens_seen": 282574550, + "step": 13095, + "time_per_iteration": 2.916395425796509 + }, + { + "auxiliary_loss_clip": 0.01053363, + "auxiliary_loss_mlp": 0.01028811, + "balance_loss_clip": 1.02508664, + "balance_loss_mlp": 1.01795673, + "epoch": 0.7873741169397265, + "flos": 30883176679680.0, + "grad_norm": 1.7707286596775875, + "language_loss": 0.6815362, + "learning_rate": 4.556868310016715e-07, + "loss": 0.70235795, + "num_input_tokens_seen": 282596520, + "step": 13096, + "time_per_iteration": 2.7582955360412598 + }, + { + "auxiliary_loss_clip": 0.01033628, + "auxiliary_loss_mlp": 0.01025228, + "balance_loss_clip": 1.02145076, + "balance_loss_mlp": 1.01593566, + "epoch": 0.7874342401923944, + "flos": 46791962242560.0, + "grad_norm": 1.4147870902458632, + "language_loss": 0.70087767, + "learning_rate": 4.55439383751125e-07, + "loss": 0.72146618, + "num_input_tokens_seen": 282620560, + "step": 13097, + "time_per_iteration": 3.2411839962005615 + }, + { + "auxiliary_loss_clip": 0.01045451, + "auxiliary_loss_mlp": 0.01032173, + "balance_loss_clip": 1.02736282, + "balance_loss_mlp": 1.02169466, + "epoch": 0.7874943634450624, + "flos": 23584548545280.0, + "grad_norm": 1.5603039353959813, + "language_loss": 0.80485594, + "learning_rate": 4.5519199507047126e-07, + "loss": 0.82563221, + "num_input_tokens_seen": 282639830, + "step": 13098, + "time_per_iteration": 2.8629064559936523 + }, + { + "auxiliary_loss_clip": 0.01021637, + "auxiliary_loss_mlp": 0.01027585, + "balance_loss_clip": 1.02244544, + "balance_loss_mlp": 1.01754212, + "epoch": 0.7875544866977303, + "flos": 20191169859840.0, + "grad_norm": 1.8648695269590991, + "language_loss": 0.74236989, + "learning_rate": 4.5494466496909177e-07, + "loss": 0.76286209, + "num_input_tokens_seen": 282660130, + "step": 13099, + "time_per_iteration": 2.746480703353882 + }, + { + "auxiliary_loss_clip": 0.01039346, + "auxiliary_loss_mlp": 0.01023866, + "balance_loss_clip": 1.02308965, + "balance_loss_mlp": 1.01329231, + "epoch": 0.7876146099503983, + "flos": 22602571557120.0, + "grad_norm": 1.610868269543931, + "language_loss": 0.77918923, + "learning_rate": 4.5469739345636603e-07, + "loss": 0.79982138, + "num_input_tokens_seen": 282681125, + "step": 13100, + "time_per_iteration": 2.6856181621551514 + }, + { + "auxiliary_loss_clip": 0.01051132, + "auxiliary_loss_mlp": 0.00747661, + "balance_loss_clip": 1.02467823, + "balance_loss_mlp": 1.00035834, + "epoch": 0.7876747332030662, + "flos": 10705833555840.0, + "grad_norm": 2.187526014193395, + "language_loss": 0.66253757, + "learning_rate": 4.5445018054167007e-07, + "loss": 0.68052554, + "num_input_tokens_seen": 282696690, + "step": 13101, + "time_per_iteration": 2.584425449371338 + }, + { + "auxiliary_loss_clip": 0.01043461, + "auxiliary_loss_mlp": 0.01029946, + "balance_loss_clip": 1.02574444, + "balance_loss_mlp": 1.020105, + "epoch": 0.7877348564557343, + "flos": 38399315621760.0, + "grad_norm": 1.4883834198741437, + "language_loss": 0.77764475, + "learning_rate": 4.5420302623437745e-07, + "loss": 0.79837877, + "num_input_tokens_seen": 282721210, + "step": 13102, + "time_per_iteration": 3.00131893157959 + }, + { + "auxiliary_loss_clip": 0.01048331, + "auxiliary_loss_mlp": 0.01032456, + "balance_loss_clip": 1.02438247, + "balance_loss_mlp": 1.02257943, + "epoch": 0.7877949797084022, + "flos": 18329524796160.0, + "grad_norm": 1.840722441149677, + "language_loss": 0.8204149, + "learning_rate": 4.5395593054386093e-07, + "loss": 0.84122282, + "num_input_tokens_seen": 282738505, + "step": 13103, + "time_per_iteration": 2.7421021461486816 + }, + { + "auxiliary_loss_clip": 0.01054167, + "auxiliary_loss_mlp": 0.01031351, + "balance_loss_clip": 1.02575326, + "balance_loss_mlp": 1.02007401, + "epoch": 0.7878551029610702, + "flos": 25806736373760.0, + "grad_norm": 2.4036322571005693, + "language_loss": 0.80239224, + "learning_rate": 4.537088934794913e-07, + "loss": 0.82324743, + "num_input_tokens_seen": 282756895, + "step": 13104, + "time_per_iteration": 2.6366889476776123 + }, + { + "auxiliary_loss_clip": 0.0106404, + "auxiliary_loss_mlp": 0.01035038, + "balance_loss_clip": 1.0260359, + "balance_loss_mlp": 1.02405906, + "epoch": 0.7879152262137382, + "flos": 22342685679360.0, + "grad_norm": 1.5627188318374368, + "language_loss": 0.74393439, + "learning_rate": 4.5346191505063515e-07, + "loss": 0.76492518, + "num_input_tokens_seen": 282774955, + "step": 13105, + "time_per_iteration": 2.6005170345306396 + }, + { + "auxiliary_loss_clip": 0.01009233, + "auxiliary_loss_mlp": 0.01036753, + "balance_loss_clip": 1.02272844, + "balance_loss_mlp": 1.02625632, + "epoch": 0.7879753494664061, + "flos": 24785329230720.0, + "grad_norm": 1.8524863317620779, + "language_loss": 0.75713575, + "learning_rate": 4.5321499526665776e-07, + "loss": 0.77759558, + "num_input_tokens_seen": 282793165, + "step": 13106, + "time_per_iteration": 2.8793177604675293 + }, + { + "auxiliary_loss_clip": 0.01024458, + "auxiliary_loss_mlp": 0.01033666, + "balance_loss_clip": 1.02631104, + "balance_loss_mlp": 1.02370596, + "epoch": 0.7880354727190741, + "flos": 16909078487040.0, + "grad_norm": 3.4137031568389333, + "language_loss": 0.73381984, + "learning_rate": 4.5296813413692337e-07, + "loss": 0.75440109, + "num_input_tokens_seen": 282809820, + "step": 13107, + "time_per_iteration": 2.6794705390930176 + }, + { + "auxiliary_loss_clip": 0.01061918, + "auxiliary_loss_mlp": 0.01031368, + "balance_loss_clip": 1.02464604, + "balance_loss_mlp": 1.02076435, + "epoch": 0.7880955959717421, + "flos": 22230500526720.0, + "grad_norm": 1.7246994350801503, + "language_loss": 0.73531568, + "learning_rate": 4.5272133167079165e-07, + "loss": 0.75624859, + "num_input_tokens_seen": 282828600, + "step": 13108, + "time_per_iteration": 2.5784666538238525 + }, + { + "auxiliary_loss_clip": 0.01006065, + "auxiliary_loss_mlp": 0.01001287, + "balance_loss_clip": 1.00068593, + "balance_loss_mlp": 1.00045824, + "epoch": 0.7881557192244101, + "flos": 69183200131200.0, + "grad_norm": 0.8887069226082028, + "language_loss": 0.60367846, + "learning_rate": 4.5247458787762216e-07, + "loss": 0.623752, + "num_input_tokens_seen": 282882775, + "step": 13109, + "time_per_iteration": 3.134000778198242 + }, + { + "auxiliary_loss_clip": 0.01016724, + "auxiliary_loss_mlp": 0.01028022, + "balance_loss_clip": 1.02317357, + "balance_loss_mlp": 1.01785314, + "epoch": 0.788215842477078, + "flos": 24935436167040.0, + "grad_norm": 1.5748649433256083, + "language_loss": 0.72029209, + "learning_rate": 4.5222790276677126e-07, + "loss": 0.74073958, + "num_input_tokens_seen": 282902680, + "step": 13110, + "time_per_iteration": 2.695272922515869 + }, + { + "auxiliary_loss_clip": 0.01016951, + "auxiliary_loss_mlp": 0.0102405, + "balance_loss_clip": 1.02332115, + "balance_loss_mlp": 1.01477528, + "epoch": 0.788275965729746, + "flos": 26106483369600.0, + "grad_norm": 1.3941556131245567, + "language_loss": 0.75327069, + "learning_rate": 4.5198127634759455e-07, + "loss": 0.77368069, + "num_input_tokens_seen": 282923625, + "step": 13111, + "time_per_iteration": 2.768832206726074 + }, + { + "auxiliary_loss_clip": 0.01044591, + "auxiliary_loss_mlp": 0.01033873, + "balance_loss_clip": 1.02388787, + "balance_loss_mlp": 1.02317357, + "epoch": 0.7883360889824139, + "flos": 21214803646080.0, + "grad_norm": 1.8775392643412874, + "language_loss": 0.61691296, + "learning_rate": 4.5173470862944206e-07, + "loss": 0.63769764, + "num_input_tokens_seen": 282941955, + "step": 13112, + "time_per_iteration": 2.6215925216674805 + }, + { + "auxiliary_loss_clip": 0.01042604, + "auxiliary_loss_mlp": 0.01027095, + "balance_loss_clip": 1.0250175, + "balance_loss_mlp": 1.0159961, + "epoch": 0.7883962122350819, + "flos": 21142551438720.0, + "grad_norm": 2.1696911132734678, + "language_loss": 0.67222011, + "learning_rate": 4.514881996216644e-07, + "loss": 0.69291711, + "num_input_tokens_seen": 282961280, + "step": 13113, + "time_per_iteration": 4.296723127365112 + }, + { + "auxiliary_loss_clip": 0.01026124, + "auxiliary_loss_mlp": 0.0103134, + "balance_loss_clip": 1.02392864, + "balance_loss_mlp": 1.02058756, + "epoch": 0.7884563354877498, + "flos": 15302901928320.0, + "grad_norm": 2.099121316795292, + "language_loss": 0.57800752, + "learning_rate": 4.5124174933361e-07, + "loss": 0.59858215, + "num_input_tokens_seen": 282978210, + "step": 13114, + "time_per_iteration": 4.31870174407959 + }, + { + "auxiliary_loss_clip": 0.01024504, + "auxiliary_loss_mlp": 0.01028018, + "balance_loss_clip": 1.02681708, + "balance_loss_mlp": 1.0169611, + "epoch": 0.7885164587404179, + "flos": 24388301226240.0, + "grad_norm": 1.703578785466801, + "language_loss": 0.66964865, + "learning_rate": 4.5099535777462306e-07, + "loss": 0.69017386, + "num_input_tokens_seen": 282998845, + "step": 13115, + "time_per_iteration": 2.784102439880371 + }, + { + "auxiliary_loss_clip": 0.01037299, + "auxiliary_loss_mlp": 0.01028822, + "balance_loss_clip": 1.02392006, + "balance_loss_mlp": 1.01722932, + "epoch": 0.7885765819930858, + "flos": 14385886686720.0, + "grad_norm": 1.9404772181486283, + "language_loss": 0.88592219, + "learning_rate": 4.50749024954048e-07, + "loss": 0.90658343, + "num_input_tokens_seen": 283015200, + "step": 13116, + "time_per_iteration": 2.774693489074707 + }, + { + "auxiliary_loss_clip": 0.01047546, + "auxiliary_loss_mlp": 0.01032602, + "balance_loss_clip": 1.02595842, + "balance_loss_mlp": 1.02038956, + "epoch": 0.7886367052457538, + "flos": 18259930195200.0, + "grad_norm": 1.6176601070959475, + "language_loss": 0.72698438, + "learning_rate": 4.505027508812245e-07, + "loss": 0.74778581, + "num_input_tokens_seen": 283033680, + "step": 13117, + "time_per_iteration": 2.7911489009857178 + }, + { + "auxiliary_loss_clip": 0.0104893, + "auxiliary_loss_mlp": 0.010236, + "balance_loss_clip": 1.02478456, + "balance_loss_mlp": 1.01417029, + "epoch": 0.7886968284984217, + "flos": 15305092657920.0, + "grad_norm": 1.4652145848838425, + "language_loss": 0.79974699, + "learning_rate": 4.502565355654926e-07, + "loss": 0.82047236, + "num_input_tokens_seen": 283050620, + "step": 13118, + "time_per_iteration": 2.790299415588379 + }, + { + "auxiliary_loss_clip": 0.01051226, + "auxiliary_loss_mlp": 0.01023214, + "balance_loss_clip": 1.02433133, + "balance_loss_mlp": 1.01297355, + "epoch": 0.7887569517510897, + "flos": 21215450090880.0, + "grad_norm": 1.7801695191949316, + "language_loss": 0.73287141, + "learning_rate": 4.500103790161878e-07, + "loss": 0.75361586, + "num_input_tokens_seen": 283070215, + "step": 13119, + "time_per_iteration": 2.766245126724243 + }, + { + "auxiliary_loss_clip": 0.0104613, + "auxiliary_loss_mlp": 0.01025101, + "balance_loss_clip": 1.02324224, + "balance_loss_mlp": 1.01435995, + "epoch": 0.7888170750037578, + "flos": 22711237176960.0, + "grad_norm": 1.428043529264383, + "language_loss": 0.71754009, + "learning_rate": 4.4976428124264454e-07, + "loss": 0.73825234, + "num_input_tokens_seen": 283091485, + "step": 13120, + "time_per_iteration": 2.643251895904541 + }, + { + "auxiliary_loss_clip": 0.01034516, + "auxiliary_loss_mlp": 0.00747528, + "balance_loss_clip": 1.02193356, + "balance_loss_mlp": 1.00034547, + "epoch": 0.7888771982564257, + "flos": 36429148592640.0, + "grad_norm": 1.603437153611058, + "language_loss": 0.7875942, + "learning_rate": 4.4951824225419564e-07, + "loss": 0.80541468, + "num_input_tokens_seen": 283115040, + "step": 13121, + "time_per_iteration": 2.862217664718628 + }, + { + "auxiliary_loss_clip": 0.01047209, + "auxiliary_loss_mlp": 0.01027748, + "balance_loss_clip": 1.02316535, + "balance_loss_mlp": 1.01689434, + "epoch": 0.7889373215090937, + "flos": 27309993488640.0, + "grad_norm": 1.3863583828582393, + "language_loss": 0.80272275, + "learning_rate": 4.4927226206017057e-07, + "loss": 0.82347226, + "num_input_tokens_seen": 283136925, + "step": 13122, + "time_per_iteration": 2.76294207572937 + }, + { + "auxiliary_loss_clip": 0.01033342, + "auxiliary_loss_mlp": 0.01023422, + "balance_loss_clip": 1.02364326, + "balance_loss_mlp": 1.013677, + "epoch": 0.7889974447617616, + "flos": 19829010983040.0, + "grad_norm": 1.8210269939336055, + "language_loss": 0.78044629, + "learning_rate": 4.4902634066989597e-07, + "loss": 0.80101383, + "num_input_tokens_seen": 283155725, + "step": 13123, + "time_per_iteration": 2.7885921001434326 + }, + { + "auxiliary_loss_clip": 0.01040249, + "auxiliary_loss_mlp": 0.01030711, + "balance_loss_clip": 1.02799571, + "balance_loss_mlp": 1.01983929, + "epoch": 0.7890575680144296, + "flos": 17271201450240.0, + "grad_norm": 1.8577501647083723, + "language_loss": 0.66921794, + "learning_rate": 4.487804780926985e-07, + "loss": 0.68992758, + "num_input_tokens_seen": 283173845, + "step": 13124, + "time_per_iteration": 2.697603225708008 + }, + { + "auxiliary_loss_clip": 0.01037001, + "auxiliary_loss_mlp": 0.01020632, + "balance_loss_clip": 1.0229876, + "balance_loss_mlp": 1.00945592, + "epoch": 0.7891176912670975, + "flos": 27600151553280.0, + "grad_norm": 3.271405083293185, + "language_loss": 0.72801048, + "learning_rate": 4.4853467433790036e-07, + "loss": 0.74858683, + "num_input_tokens_seen": 283191985, + "step": 13125, + "time_per_iteration": 2.717923402786255 + }, + { + "auxiliary_loss_clip": 0.01045396, + "auxiliary_loss_mlp": 0.01026292, + "balance_loss_clip": 1.0250721, + "balance_loss_mlp": 1.01564622, + "epoch": 0.7891778145197655, + "flos": 22711668140160.0, + "grad_norm": 1.8179928179406182, + "language_loss": 0.72306442, + "learning_rate": 4.4828892941482267e-07, + "loss": 0.74378133, + "num_input_tokens_seen": 283210855, + "step": 13126, + "time_per_iteration": 2.703578233718872 + }, + { + "auxiliary_loss_clip": 0.0104165, + "auxiliary_loss_mlp": 0.01026698, + "balance_loss_clip": 1.02431607, + "balance_loss_mlp": 1.01565337, + "epoch": 0.7892379377724335, + "flos": 17310775259520.0, + "grad_norm": 1.9108945673119275, + "language_loss": 0.77063608, + "learning_rate": 4.480432433327845e-07, + "loss": 0.79131961, + "num_input_tokens_seen": 283229665, + "step": 13127, + "time_per_iteration": 2.6358797550201416 + }, + { + "auxiliary_loss_clip": 0.01046016, + "auxiliary_loss_mlp": 0.01032355, + "balance_loss_clip": 1.02490425, + "balance_loss_mlp": 1.02127433, + "epoch": 0.7892980610251015, + "flos": 25775674087680.0, + "grad_norm": 1.8202345523018928, + "language_loss": 0.85931802, + "learning_rate": 4.47797616101103e-07, + "loss": 0.88010174, + "num_input_tokens_seen": 283248615, + "step": 13128, + "time_per_iteration": 2.6311376094818115 + }, + { + "auxiliary_loss_clip": 0.01047661, + "auxiliary_loss_mlp": 0.01028732, + "balance_loss_clip": 1.02266288, + "balance_loss_mlp": 1.01905775, + "epoch": 0.7893581842777694, + "flos": 21579943351680.0, + "grad_norm": 2.308190335374825, + "language_loss": 0.69022012, + "learning_rate": 4.475520477290904e-07, + "loss": 0.71098399, + "num_input_tokens_seen": 283267135, + "step": 13129, + "time_per_iteration": 2.6128041744232178 + }, + { + "auxiliary_loss_clip": 0.00996599, + "auxiliary_loss_mlp": 0.01011052, + "balance_loss_clip": 1.00117469, + "balance_loss_mlp": 1.01012838, + "epoch": 0.7894183075304374, + "flos": 69016468176000.0, + "grad_norm": 0.7189074140850461, + "language_loss": 0.61560208, + "learning_rate": 4.473065382260597e-07, + "loss": 0.63567859, + "num_input_tokens_seen": 283328940, + "step": 13130, + "time_per_iteration": 3.230109930038452 + }, + { + "auxiliary_loss_clip": 0.01053885, + "auxiliary_loss_mlp": 0.01027636, + "balance_loss_clip": 1.02678716, + "balance_loss_mlp": 1.01763427, + "epoch": 0.7894784307831053, + "flos": 24243258107520.0, + "grad_norm": 1.5437701903190737, + "language_loss": 0.73830199, + "learning_rate": 4.4706108760132124e-07, + "loss": 0.75911725, + "num_input_tokens_seen": 283350000, + "step": 13131, + "time_per_iteration": 4.398151397705078 + }, + { + "auxiliary_loss_clip": 0.01038512, + "auxiliary_loss_mlp": 0.01023146, + "balance_loss_clip": 1.02441335, + "balance_loss_mlp": 1.01083755, + "epoch": 0.7895385540357733, + "flos": 20266546550400.0, + "grad_norm": 3.086788287496045, + "language_loss": 0.69440126, + "learning_rate": 4.4681569586418153e-07, + "loss": 0.7150178, + "num_input_tokens_seen": 283368020, + "step": 13132, + "time_per_iteration": 2.6284356117248535 + }, + { + "auxiliary_loss_clip": 0.01056516, + "auxiliary_loss_mlp": 0.01034511, + "balance_loss_clip": 1.02630615, + "balance_loss_mlp": 1.02297711, + "epoch": 0.7895986772884414, + "flos": 20996574566400.0, + "grad_norm": 2.2283456788090983, + "language_loss": 0.62001431, + "learning_rate": 4.465703630239468e-07, + "loss": 0.64092457, + "num_input_tokens_seen": 283387030, + "step": 13133, + "time_per_iteration": 2.626506805419922 + }, + { + "auxiliary_loss_clip": 0.0103526, + "auxiliary_loss_mlp": 0.01032768, + "balance_loss_clip": 1.02574766, + "balance_loss_mlp": 1.02128863, + "epoch": 0.7896588005411093, + "flos": 18657999694080.0, + "grad_norm": 2.860915942984869, + "language_loss": 0.80527985, + "learning_rate": 4.463250890899195e-07, + "loss": 0.82596016, + "num_input_tokens_seen": 283402090, + "step": 13134, + "time_per_iteration": 2.6833159923553467 + }, + { + "auxiliary_loss_clip": 0.01046809, + "auxiliary_loss_mlp": 0.01025525, + "balance_loss_clip": 1.02225649, + "balance_loss_mlp": 1.0148375, + "epoch": 0.7897189237937773, + "flos": 18405907067520.0, + "grad_norm": 4.213384554150275, + "language_loss": 0.80068707, + "learning_rate": 4.460798740713998e-07, + "loss": 0.82141048, + "num_input_tokens_seen": 283421035, + "step": 13135, + "time_per_iteration": 2.592087745666504 + }, + { + "auxiliary_loss_clip": 0.01051934, + "auxiliary_loss_mlp": 0.01027836, + "balance_loss_clip": 1.02481246, + "balance_loss_mlp": 1.0170536, + "epoch": 0.7897790470464452, + "flos": 23731602825600.0, + "grad_norm": 1.5304291290311773, + "language_loss": 0.72420108, + "learning_rate": 4.4583471797768733e-07, + "loss": 0.74499881, + "num_input_tokens_seen": 283441830, + "step": 13136, + "time_per_iteration": 2.6504762172698975 + }, + { + "auxiliary_loss_clip": 0.01067941, + "auxiliary_loss_mlp": 0.01028938, + "balance_loss_clip": 1.02678013, + "balance_loss_mlp": 1.01780963, + "epoch": 0.7898391702991132, + "flos": 15918949111680.0, + "grad_norm": 2.2171495648914394, + "language_loss": 0.7092427, + "learning_rate": 4.455896208180778e-07, + "loss": 0.7302115, + "num_input_tokens_seen": 283459540, + "step": 13137, + "time_per_iteration": 2.574512243270874 + }, + { + "auxiliary_loss_clip": 0.01062861, + "auxiliary_loss_mlp": 0.01029782, + "balance_loss_clip": 1.02634764, + "balance_loss_mlp": 1.01800466, + "epoch": 0.7898992935517811, + "flos": 19829046896640.0, + "grad_norm": 1.63820134375005, + "language_loss": 0.74044645, + "learning_rate": 4.4534458260186645e-07, + "loss": 0.7613728, + "num_input_tokens_seen": 283478790, + "step": 13138, + "time_per_iteration": 2.586052179336548 + }, + { + "auxiliary_loss_clip": 0.01035211, + "auxiliary_loss_mlp": 0.01028299, + "balance_loss_clip": 1.02738595, + "balance_loss_mlp": 1.01796937, + "epoch": 0.7899594168044491, + "flos": 16216253982720.0, + "grad_norm": 1.935740397715786, + "language_loss": 0.67904723, + "learning_rate": 4.4509960333834426e-07, + "loss": 0.69968235, + "num_input_tokens_seen": 283495720, + "step": 13139, + "time_per_iteration": 2.748852491378784 + }, + { + "auxiliary_loss_clip": 0.00997894, + "auxiliary_loss_mlp": 0.01000027, + "balance_loss_clip": 1.00195205, + "balance_loss_mlp": 0.99915713, + "epoch": 0.790019540057117, + "flos": 68331005959680.0, + "grad_norm": 0.9187409225274432, + "language_loss": 0.60197151, + "learning_rate": 4.448546830368003e-07, + "loss": 0.62195075, + "num_input_tokens_seen": 283558795, + "step": 13140, + "time_per_iteration": 3.2812774181365967 + }, + { + "auxiliary_loss_clip": 0.01065062, + "auxiliary_loss_mlp": 0.01031741, + "balance_loss_clip": 1.02691793, + "balance_loss_mlp": 1.02074373, + "epoch": 0.7900796633097851, + "flos": 30332773601280.0, + "grad_norm": 1.895479295901361, + "language_loss": 0.76051629, + "learning_rate": 4.4460982170652304e-07, + "loss": 0.78148431, + "num_input_tokens_seen": 283579305, + "step": 13141, + "time_per_iteration": 4.425227880477905 + }, + { + "auxiliary_loss_clip": 0.01052859, + "auxiliary_loss_mlp": 0.01031811, + "balance_loss_clip": 1.02461112, + "balance_loss_mlp": 1.02127266, + "epoch": 0.790139786562453, + "flos": 22126790983680.0, + "grad_norm": 1.8196472692635552, + "language_loss": 0.68442237, + "learning_rate": 4.4436501935679694e-07, + "loss": 0.7052691, + "num_input_tokens_seen": 283597840, + "step": 13142, + "time_per_iteration": 2.656571626663208 + }, + { + "auxiliary_loss_clip": 0.00959432, + "auxiliary_loss_mlp": 0.01002937, + "balance_loss_clip": 1.00254369, + "balance_loss_mlp": 1.00189972, + "epoch": 0.790199909815121, + "flos": 58207284213120.0, + "grad_norm": 0.8391013267737062, + "language_loss": 0.6000967, + "learning_rate": 4.441202759969049e-07, + "loss": 0.61972046, + "num_input_tokens_seen": 283647950, + "step": 13143, + "time_per_iteration": 3.1499876976013184 + }, + { + "auxiliary_loss_clip": 0.01038255, + "auxiliary_loss_mlp": 0.01031261, + "balance_loss_clip": 1.02756703, + "balance_loss_mlp": 1.02004337, + "epoch": 0.7902600330677889, + "flos": 34533316759680.0, + "grad_norm": 1.4894411494852422, + "language_loss": 0.74448609, + "learning_rate": 4.4387559163612875e-07, + "loss": 0.76518124, + "num_input_tokens_seen": 283670645, + "step": 13144, + "time_per_iteration": 2.773223876953125 + }, + { + "auxiliary_loss_clip": 0.01055172, + "auxiliary_loss_mlp": 0.01031173, + "balance_loss_clip": 1.02539957, + "balance_loss_mlp": 1.019508, + "epoch": 0.7903201563204569, + "flos": 22346384780160.0, + "grad_norm": 1.985037517942039, + "language_loss": 0.82828194, + "learning_rate": 4.4363096628374605e-07, + "loss": 0.84914541, + "num_input_tokens_seen": 283688830, + "step": 13145, + "time_per_iteration": 2.710523843765259 + }, + { + "auxiliary_loss_clip": 0.01049037, + "auxiliary_loss_mlp": 0.01026216, + "balance_loss_clip": 1.02351248, + "balance_loss_mlp": 1.01656008, + "epoch": 0.790380279573125, + "flos": 22053533195520.0, + "grad_norm": 1.7589640856780295, + "language_loss": 0.72230744, + "learning_rate": 4.4338639994903235e-07, + "loss": 0.74305999, + "num_input_tokens_seen": 283708625, + "step": 13146, + "time_per_iteration": 2.7425568103790283 + }, + { + "auxiliary_loss_clip": 0.01064047, + "auxiliary_loss_mlp": 0.01027839, + "balance_loss_clip": 1.02501059, + "balance_loss_mlp": 1.01787305, + "epoch": 0.7904404028257929, + "flos": 20302600826880.0, + "grad_norm": 1.8009010447214104, + "language_loss": 0.75675416, + "learning_rate": 4.4314189264126246e-07, + "loss": 0.77767301, + "num_input_tokens_seen": 283725710, + "step": 13147, + "time_per_iteration": 2.6871626377105713 + }, + { + "auxiliary_loss_clip": 0.01051412, + "auxiliary_loss_mlp": 0.01033214, + "balance_loss_clip": 1.0243082, + "balance_loss_mlp": 1.02219296, + "epoch": 0.7905005260784609, + "flos": 20008923229440.0, + "grad_norm": 1.7076610841216067, + "language_loss": 0.72301245, + "learning_rate": 4.428974443697087e-07, + "loss": 0.7438587, + "num_input_tokens_seen": 283744150, + "step": 13148, + "time_per_iteration": 2.6348819732666016 + }, + { + "auxiliary_loss_clip": 0.01045553, + "auxiliary_loss_mlp": 0.01028871, + "balance_loss_clip": 1.02225518, + "balance_loss_mlp": 1.01757622, + "epoch": 0.7905606493311288, + "flos": 26905926418560.0, + "grad_norm": 1.6449087602283208, + "language_loss": 0.71404219, + "learning_rate": 4.4265305514363913e-07, + "loss": 0.73478645, + "num_input_tokens_seen": 283764170, + "step": 13149, + "time_per_iteration": 2.656748056411743 + }, + { + "auxiliary_loss_clip": 0.01023775, + "auxiliary_loss_mlp": 0.0103084, + "balance_loss_clip": 1.02178979, + "balance_loss_mlp": 1.01748848, + "epoch": 0.7906207725837968, + "flos": 23696230907520.0, + "grad_norm": 3.1036156749607806, + "language_loss": 0.65342957, + "learning_rate": 4.424087249723225e-07, + "loss": 0.67397577, + "num_input_tokens_seen": 283784305, + "step": 13150, + "time_per_iteration": 3.007391929626465 + }, + { + "auxiliary_loss_clip": 0.01061238, + "auxiliary_loss_mlp": 0.01028585, + "balance_loss_clip": 1.02485013, + "balance_loss_mlp": 1.01856542, + "epoch": 0.7906808958364647, + "flos": 20848837927680.0, + "grad_norm": 1.6239293439629252, + "language_loss": 0.6996032, + "learning_rate": 4.421644538650231e-07, + "loss": 0.72050142, + "num_input_tokens_seen": 283804040, + "step": 13151, + "time_per_iteration": 2.6079485416412354 + }, + { + "auxiliary_loss_clip": 0.01044043, + "auxiliary_loss_mlp": 0.01035373, + "balance_loss_clip": 1.02488112, + "balance_loss_mlp": 1.02383971, + "epoch": 0.7907410190891327, + "flos": 40735196974080.0, + "grad_norm": 1.575154633751921, + "language_loss": 0.70074284, + "learning_rate": 4.4192024183100306e-07, + "loss": 0.72153705, + "num_input_tokens_seen": 283827120, + "step": 13152, + "time_per_iteration": 2.8602919578552246 + }, + { + "auxiliary_loss_clip": 0.01030826, + "auxiliary_loss_mlp": 0.00747443, + "balance_loss_clip": 1.02447963, + "balance_loss_mlp": 1.0003581, + "epoch": 0.7908011423418007, + "flos": 13261165050240.0, + "grad_norm": 1.7677873499221959, + "language_loss": 0.72818899, + "learning_rate": 4.4167608887952367e-07, + "loss": 0.74597168, + "num_input_tokens_seen": 283844820, + "step": 13153, + "time_per_iteration": 2.6572468280792236 + }, + { + "auxiliary_loss_clip": 0.01061909, + "auxiliary_loss_mlp": 0.01025383, + "balance_loss_clip": 1.02388394, + "balance_loss_mlp": 1.01477933, + "epoch": 0.7908612655944687, + "flos": 19754747614080.0, + "grad_norm": 1.6544443558354676, + "language_loss": 0.79025006, + "learning_rate": 4.4143199501984306e-07, + "loss": 0.81112301, + "num_input_tokens_seen": 283862870, + "step": 13154, + "time_per_iteration": 2.5812203884124756 + }, + { + "auxiliary_loss_clip": 0.01056768, + "auxiliary_loss_mlp": 0.01027194, + "balance_loss_clip": 1.02539146, + "balance_loss_mlp": 1.01503491, + "epoch": 0.7909213888471366, + "flos": 21287738211840.0, + "grad_norm": 3.2386449760567935, + "language_loss": 0.70527166, + "learning_rate": 4.411879602612185e-07, + "loss": 0.72611129, + "num_input_tokens_seen": 283882405, + "step": 13155, + "time_per_iteration": 2.62544584274292 + }, + { + "auxiliary_loss_clip": 0.01063016, + "auxiliary_loss_mlp": 0.01023771, + "balance_loss_clip": 1.02518106, + "balance_loss_mlp": 1.01351893, + "epoch": 0.7909815120998046, + "flos": 22528882805760.0, + "grad_norm": 2.5486167298413624, + "language_loss": 0.76653999, + "learning_rate": 4.4094398461290174e-07, + "loss": 0.78740788, + "num_input_tokens_seen": 283902070, + "step": 13156, + "time_per_iteration": 2.6086199283599854 + }, + { + "auxiliary_loss_clip": 0.01024481, + "auxiliary_loss_mlp": 0.01031029, + "balance_loss_clip": 1.02014196, + "balance_loss_mlp": 1.02053261, + "epoch": 0.7910416353524725, + "flos": 26727702111360.0, + "grad_norm": 1.8016437180718734, + "language_loss": 0.65698314, + "learning_rate": 4.4070006808414526e-07, + "loss": 0.67753822, + "num_input_tokens_seen": 283924100, + "step": 13157, + "time_per_iteration": 2.7836008071899414 + }, + { + "auxiliary_loss_clip": 0.0104931, + "auxiliary_loss_mlp": 0.01032869, + "balance_loss_clip": 1.02384806, + "balance_loss_mlp": 1.02143729, + "epoch": 0.7911017586051405, + "flos": 24644847139200.0, + "grad_norm": 5.39381669140406, + "language_loss": 0.74291062, + "learning_rate": 4.4045621068419894e-07, + "loss": 0.76373243, + "num_input_tokens_seen": 283944955, + "step": 13158, + "time_per_iteration": 2.6335387229919434 + }, + { + "auxiliary_loss_clip": 0.010495, + "auxiliary_loss_mlp": 0.0102753, + "balance_loss_clip": 1.02344334, + "balance_loss_mlp": 1.0178982, + "epoch": 0.7911618818578086, + "flos": 17565489578880.0, + "grad_norm": 2.642422003262614, + "language_loss": 0.67617047, + "learning_rate": 4.40212412422309e-07, + "loss": 0.69694078, + "num_input_tokens_seen": 283963125, + "step": 13159, + "time_per_iteration": 2.710951805114746 + }, + { + "auxiliary_loss_clip": 0.01051222, + "auxiliary_loss_mlp": 0.01030305, + "balance_loss_clip": 1.02461219, + "balance_loss_mlp": 1.02005863, + "epoch": 0.7912220051104765, + "flos": 16721660298240.0, + "grad_norm": 1.8982640399508073, + "language_loss": 0.66879201, + "learning_rate": 4.399686733077206e-07, + "loss": 0.68960726, + "num_input_tokens_seen": 283982850, + "step": 13160, + "time_per_iteration": 2.6244802474975586 + }, + { + "auxiliary_loss_clip": 0.0103623, + "auxiliary_loss_mlp": 0.0102659, + "balance_loss_clip": 1.02195024, + "balance_loss_mlp": 1.01771474, + "epoch": 0.7912821283631445, + "flos": 13698736531200.0, + "grad_norm": 2.0916856127514323, + "language_loss": 0.72997177, + "learning_rate": 4.3972499334967694e-07, + "loss": 0.75059998, + "num_input_tokens_seen": 283998275, + "step": 13161, + "time_per_iteration": 5.894730806350708 + }, + { + "auxiliary_loss_clip": 0.01032811, + "auxiliary_loss_mlp": 0.01028033, + "balance_loss_clip": 1.02321339, + "balance_loss_mlp": 1.01689255, + "epoch": 0.7913422516158124, + "flos": 23769021818880.0, + "grad_norm": 1.7267278573175495, + "language_loss": 0.73158628, + "learning_rate": 4.39481372557418e-07, + "loss": 0.75219476, + "num_input_tokens_seen": 284018750, + "step": 13162, + "time_per_iteration": 2.6647086143493652 + }, + { + "auxiliary_loss_clip": 0.01044484, + "auxiliary_loss_mlp": 0.01027863, + "balance_loss_clip": 1.02574873, + "balance_loss_mlp": 1.01717019, + "epoch": 0.7914023748684804, + "flos": 19938251220480.0, + "grad_norm": 1.7704695191891502, + "language_loss": 0.72140056, + "learning_rate": 4.392378109401811e-07, + "loss": 0.74212408, + "num_input_tokens_seen": 284037850, + "step": 13163, + "time_per_iteration": 2.6611273288726807 + }, + { + "auxiliary_loss_clip": 0.01024786, + "auxiliary_loss_mlp": 0.01028195, + "balance_loss_clip": 1.02160406, + "balance_loss_mlp": 1.01673269, + "epoch": 0.7914624981211483, + "flos": 20594805966720.0, + "grad_norm": 1.7935921869387348, + "language_loss": 0.698493, + "learning_rate": 4.3899430850720296e-07, + "loss": 0.71902281, + "num_input_tokens_seen": 284056380, + "step": 13164, + "time_per_iteration": 2.7275383472442627 + }, + { + "auxiliary_loss_clip": 0.01033806, + "auxiliary_loss_mlp": 0.01029504, + "balance_loss_clip": 1.02597523, + "balance_loss_mlp": 1.01888859, + "epoch": 0.7915226213738163, + "flos": 21799465320960.0, + "grad_norm": 1.9133001550421167, + "language_loss": 0.66270667, + "learning_rate": 4.387508652677177e-07, + "loss": 0.68333977, + "num_input_tokens_seen": 284074945, + "step": 13165, + "time_per_iteration": 2.8250715732574463 + }, + { + "auxiliary_loss_clip": 0.0101955, + "auxiliary_loss_mlp": 0.01024614, + "balance_loss_clip": 1.02519512, + "balance_loss_mlp": 1.01467252, + "epoch": 0.7915827446264843, + "flos": 16288362535680.0, + "grad_norm": 2.489057255610784, + "language_loss": 0.72532481, + "learning_rate": 4.385074812309557e-07, + "loss": 0.74576646, + "num_input_tokens_seen": 284092070, + "step": 13166, + "time_per_iteration": 2.7907373905181885 + }, + { + "auxiliary_loss_clip": 0.0106171, + "auxiliary_loss_mlp": 0.01031667, + "balance_loss_clip": 1.02372181, + "balance_loss_mlp": 1.02050316, + "epoch": 0.7916428678791523, + "flos": 25702595867520.0, + "grad_norm": 1.814296098555218, + "language_loss": 0.77311957, + "learning_rate": 4.382641564061462e-07, + "loss": 0.79405338, + "num_input_tokens_seen": 284112255, + "step": 13167, + "time_per_iteration": 2.624721050262451 + }, + { + "auxiliary_loss_clip": 0.01030959, + "auxiliary_loss_mlp": 0.01027178, + "balance_loss_clip": 1.02451587, + "balance_loss_mlp": 1.01771891, + "epoch": 0.7917029911318202, + "flos": 23878513451520.0, + "grad_norm": 2.148292869304729, + "language_loss": 0.84343559, + "learning_rate": 4.3802089080251713e-07, + "loss": 0.86401695, + "num_input_tokens_seen": 284132330, + "step": 13168, + "time_per_iteration": 2.69421124458313 + }, + { + "auxiliary_loss_clip": 0.01062878, + "auxiliary_loss_mlp": 0.01025158, + "balance_loss_clip": 1.02574384, + "balance_loss_mlp": 1.01484632, + "epoch": 0.7917631143844882, + "flos": 21646593037440.0, + "grad_norm": 1.612046210627558, + "language_loss": 0.72414786, + "learning_rate": 4.3777768442929155e-07, + "loss": 0.7450282, + "num_input_tokens_seen": 284150640, + "step": 13169, + "time_per_iteration": 2.6127960681915283 + }, + { + "auxiliary_loss_clip": 0.01065293, + "auxiliary_loss_mlp": 0.01028626, + "balance_loss_clip": 1.02626252, + "balance_loss_mlp": 1.0175221, + "epoch": 0.7918232376371561, + "flos": 38874198355200.0, + "grad_norm": 2.203844972004173, + "language_loss": 0.66781896, + "learning_rate": 4.3753453729569287e-07, + "loss": 0.68875813, + "num_input_tokens_seen": 284171910, + "step": 13170, + "time_per_iteration": 2.700368642807007 + }, + { + "auxiliary_loss_clip": 0.01050334, + "auxiliary_loss_mlp": 0.01023294, + "balance_loss_clip": 1.02346659, + "balance_loss_mlp": 1.01331639, + "epoch": 0.7918833608898241, + "flos": 20775544225920.0, + "grad_norm": 1.8142267793331248, + "language_loss": 0.70586777, + "learning_rate": 4.372914494109412e-07, + "loss": 0.72660404, + "num_input_tokens_seen": 284191340, + "step": 13171, + "time_per_iteration": 2.624250888824463 + }, + { + "auxiliary_loss_clip": 0.01051176, + "auxiliary_loss_mlp": 0.01025521, + "balance_loss_clip": 1.02425265, + "balance_loss_mlp": 1.01513767, + "epoch": 0.7919434841424922, + "flos": 33910122769920.0, + "grad_norm": 1.561155840341799, + "language_loss": 0.66715777, + "learning_rate": 4.370484207842553e-07, + "loss": 0.68792468, + "num_input_tokens_seen": 284212495, + "step": 13172, + "time_per_iteration": 2.7306160926818848 + }, + { + "auxiliary_loss_clip": 0.01036769, + "auxiliary_loss_mlp": 0.01028185, + "balance_loss_clip": 1.02367866, + "balance_loss_mlp": 1.01715803, + "epoch": 0.7920036073951601, + "flos": 21064660796160.0, + "grad_norm": 1.93642373037581, + "language_loss": 0.79582608, + "learning_rate": 4.3680545142484893e-07, + "loss": 0.81647563, + "num_input_tokens_seen": 284230825, + "step": 13173, + "time_per_iteration": 2.661060094833374 + }, + { + "auxiliary_loss_clip": 0.01023065, + "auxiliary_loss_mlp": 0.0102854, + "balance_loss_clip": 1.02205813, + "balance_loss_mlp": 1.01757836, + "epoch": 0.7920637306478281, + "flos": 23655974739840.0, + "grad_norm": 1.8506891018040983, + "language_loss": 0.76683736, + "learning_rate": 4.365625413419365e-07, + "loss": 0.7873534, + "num_input_tokens_seen": 284250365, + "step": 13174, + "time_per_iteration": 2.690073013305664 + }, + { + "auxiliary_loss_clip": 0.01033695, + "auxiliary_loss_mlp": 0.01029519, + "balance_loss_clip": 1.02126336, + "balance_loss_mlp": 1.01933265, + "epoch": 0.792123853900496, + "flos": 27195438038400.0, + "grad_norm": 1.5159390916935942, + "language_loss": 0.71663797, + "learning_rate": 4.363196905447297e-07, + "loss": 0.73727012, + "num_input_tokens_seen": 284269635, + "step": 13175, + "time_per_iteration": 2.675776481628418 + }, + { + "auxiliary_loss_clip": 0.01048724, + "auxiliary_loss_mlp": 0.01025472, + "balance_loss_clip": 1.02268696, + "balance_loss_mlp": 1.01470757, + "epoch": 0.792183977153164, + "flos": 19098659744640.0, + "grad_norm": 2.4002511069774775, + "language_loss": 0.59293103, + "learning_rate": 4.360768990424364e-07, + "loss": 0.61367297, + "num_input_tokens_seen": 284288380, + "step": 13176, + "time_per_iteration": 2.6007871627807617 + }, + { + "auxiliary_loss_clip": 0.01065527, + "auxiliary_loss_mlp": 0.01028813, + "balance_loss_clip": 1.02811849, + "balance_loss_mlp": 1.01828122, + "epoch": 0.7922441004058319, + "flos": 17128851851520.0, + "grad_norm": 1.8474366935733755, + "language_loss": 0.73810494, + "learning_rate": 4.3583416684426376e-07, + "loss": 0.75904834, + "num_input_tokens_seen": 284306920, + "step": 13177, + "time_per_iteration": 2.6395626068115234 + }, + { + "auxiliary_loss_clip": 0.01044223, + "auxiliary_loss_mlp": 0.01032279, + "balance_loss_clip": 1.02428055, + "balance_loss_mlp": 1.02196717, + "epoch": 0.7923042236585, + "flos": 17821640442240.0, + "grad_norm": 1.6552858329425681, + "language_loss": 0.63976032, + "learning_rate": 4.355914939594174e-07, + "loss": 0.66052532, + "num_input_tokens_seen": 284324700, + "step": 13178, + "time_per_iteration": 4.419967412948608 + }, + { + "auxiliary_loss_clip": 0.01039526, + "auxiliary_loss_mlp": 0.01027604, + "balance_loss_clip": 1.02333546, + "balance_loss_mlp": 1.01806116, + "epoch": 0.7923643469111679, + "flos": 29935206892800.0, + "grad_norm": 1.4417500040675555, + "language_loss": 0.68550968, + "learning_rate": 4.3534888039709726e-07, + "loss": 0.70618099, + "num_input_tokens_seen": 284345985, + "step": 13179, + "time_per_iteration": 2.752922534942627 + }, + { + "auxiliary_loss_clip": 0.01061433, + "auxiliary_loss_mlp": 0.01025102, + "balance_loss_clip": 1.02420127, + "balance_loss_mlp": 1.01484418, + "epoch": 0.7924244701638359, + "flos": 22674716023680.0, + "grad_norm": 1.8686954289685613, + "language_loss": 0.73972952, + "learning_rate": 4.3510632616650444e-07, + "loss": 0.7605949, + "num_input_tokens_seen": 284364475, + "step": 13180, + "time_per_iteration": 2.586230754852295 + }, + { + "auxiliary_loss_clip": 0.0105686, + "auxiliary_loss_mlp": 0.01030537, + "balance_loss_clip": 1.02702117, + "balance_loss_mlp": 1.01924229, + "epoch": 0.7924845934165038, + "flos": 17968156018560.0, + "grad_norm": 2.0212259504495393, + "language_loss": 0.81660891, + "learning_rate": 4.3486383127683646e-07, + "loss": 0.83748293, + "num_input_tokens_seen": 284382125, + "step": 13181, + "time_per_iteration": 2.646103858947754 + }, + { + "auxiliary_loss_clip": 0.01036825, + "auxiliary_loss_mlp": 0.01033013, + "balance_loss_clip": 1.02291632, + "balance_loss_mlp": 1.02213466, + "epoch": 0.7925447166691718, + "flos": 23476960333440.0, + "grad_norm": 1.9589145212831858, + "language_loss": 0.7774083, + "learning_rate": 4.346213957372895e-07, + "loss": 0.79810667, + "num_input_tokens_seen": 284401585, + "step": 13182, + "time_per_iteration": 2.867971181869507 + }, + { + "auxiliary_loss_clip": 0.01046574, + "auxiliary_loss_mlp": 0.01031984, + "balance_loss_clip": 1.02481031, + "balance_loss_mlp": 1.01965213, + "epoch": 0.7926048399218397, + "flos": 20447572118400.0, + "grad_norm": 1.7429352554189608, + "language_loss": 0.74025548, + "learning_rate": 4.34379019557056e-07, + "loss": 0.76104105, + "num_input_tokens_seen": 284419125, + "step": 13183, + "time_per_iteration": 2.708219289779663 + }, + { + "auxiliary_loss_clip": 0.01037022, + "auxiliary_loss_mlp": 0.01026397, + "balance_loss_clip": 1.02436972, + "balance_loss_mlp": 1.01464903, + "epoch": 0.7926649631745077, + "flos": 37160038535040.0, + "grad_norm": 1.838420784514378, + "language_loss": 0.67851043, + "learning_rate": 4.341367027453264e-07, + "loss": 0.6991446, + "num_input_tokens_seen": 284440445, + "step": 13184, + "time_per_iteration": 3.0331597328186035 + }, + { + "auxiliary_loss_clip": 0.01038388, + "auxiliary_loss_mlp": 0.010316, + "balance_loss_clip": 1.02837384, + "balance_loss_mlp": 1.0208528, + "epoch": 0.7927250864271758, + "flos": 17018606033280.0, + "grad_norm": 1.791867006001555, + "language_loss": 0.70784307, + "learning_rate": 4.338944453112907e-07, + "loss": 0.72854298, + "num_input_tokens_seen": 284459370, + "step": 13185, + "time_per_iteration": 2.7565207481384277 + }, + { + "auxiliary_loss_clip": 0.01055839, + "auxiliary_loss_mlp": 0.01028499, + "balance_loss_clip": 1.02712369, + "balance_loss_mlp": 1.01728749, + "epoch": 0.7927852096798437, + "flos": 17749208666880.0, + "grad_norm": 1.9225618709828447, + "language_loss": 0.65231824, + "learning_rate": 4.3365224726413375e-07, + "loss": 0.67316169, + "num_input_tokens_seen": 284477525, + "step": 13186, + "time_per_iteration": 2.5732927322387695 + }, + { + "auxiliary_loss_clip": 0.01045041, + "auxiliary_loss_mlp": 0.01029905, + "balance_loss_clip": 1.02377081, + "balance_loss_mlp": 1.01945591, + "epoch": 0.7928453329325117, + "flos": 23838436851840.0, + "grad_norm": 1.5047682839576626, + "language_loss": 0.76659131, + "learning_rate": 4.334101086130408e-07, + "loss": 0.78734076, + "num_input_tokens_seen": 284496590, + "step": 13187, + "time_per_iteration": 2.7449629306793213 + }, + { + "auxiliary_loss_clip": 0.01039571, + "auxiliary_loss_mlp": 0.01027256, + "balance_loss_clip": 1.02326179, + "balance_loss_mlp": 1.01687896, + "epoch": 0.7929054561851796, + "flos": 17454920538240.0, + "grad_norm": 1.9583788930624304, + "language_loss": 0.72478688, + "learning_rate": 4.3316802936719334e-07, + "loss": 0.74545521, + "num_input_tokens_seen": 284511470, + "step": 13188, + "time_per_iteration": 2.6321332454681396 + }, + { + "auxiliary_loss_clip": 0.0106404, + "auxiliary_loss_mlp": 0.00747806, + "balance_loss_clip": 1.02488458, + "balance_loss_mlp": 1.00038385, + "epoch": 0.7929655794378476, + "flos": 21981280988160.0, + "grad_norm": 3.9298479953901952, + "language_loss": 0.6305635, + "learning_rate": 4.329260095357725e-07, + "loss": 0.648682, + "num_input_tokens_seen": 284531125, + "step": 13189, + "time_per_iteration": 4.32234787940979 + }, + { + "auxiliary_loss_clip": 0.01020282, + "auxiliary_loss_mlp": 0.01028757, + "balance_loss_clip": 1.02361786, + "balance_loss_mlp": 1.01879072, + "epoch": 0.7930257026905155, + "flos": 17273930883840.0, + "grad_norm": 2.1341917249138778, + "language_loss": 0.72512627, + "learning_rate": 4.3268404912795307e-07, + "loss": 0.74561667, + "num_input_tokens_seen": 284549340, + "step": 13190, + "time_per_iteration": 2.742474317550659 + }, + { + "auxiliary_loss_clip": 0.01048029, + "auxiliary_loss_mlp": 0.01024728, + "balance_loss_clip": 1.02372575, + "balance_loss_mlp": 1.01599574, + "epoch": 0.7930858259431836, + "flos": 27300584125440.0, + "grad_norm": 1.715943815424869, + "language_loss": 0.73173159, + "learning_rate": 4.3244214815291166e-07, + "loss": 0.75245917, + "num_input_tokens_seen": 284567060, + "step": 13191, + "time_per_iteration": 2.693126678466797 + }, + { + "auxiliary_loss_clip": 0.01051942, + "auxiliary_loss_mlp": 0.01035086, + "balance_loss_clip": 1.02427745, + "balance_loss_mlp": 1.02400517, + "epoch": 0.7931459491958515, + "flos": 19863736456320.0, + "grad_norm": 1.7528366159656432, + "language_loss": 0.69212461, + "learning_rate": 4.322003066198219e-07, + "loss": 0.71299487, + "num_input_tokens_seen": 284586600, + "step": 13192, + "time_per_iteration": 2.8024773597717285 + }, + { + "auxiliary_loss_clip": 0.01025394, + "auxiliary_loss_mlp": 0.01032497, + "balance_loss_clip": 1.02246165, + "balance_loss_mlp": 1.02153599, + "epoch": 0.7932060724485195, + "flos": 23147120718720.0, + "grad_norm": 1.489367477069992, + "language_loss": 0.74606442, + "learning_rate": 4.3195852453785274e-07, + "loss": 0.76664329, + "num_input_tokens_seen": 284605715, + "step": 13193, + "time_per_iteration": 2.9295949935913086 + }, + { + "auxiliary_loss_clip": 0.01053215, + "auxiliary_loss_mlp": 0.01030927, + "balance_loss_clip": 1.02574658, + "balance_loss_mlp": 1.01948261, + "epoch": 0.7932661957011874, + "flos": 29934847756800.0, + "grad_norm": 2.0339418565545015, + "language_loss": 0.7180813, + "learning_rate": 4.317168019161741e-07, + "loss": 0.73892272, + "num_input_tokens_seen": 284628540, + "step": 13194, + "time_per_iteration": 2.833251476287842 + }, + { + "auxiliary_loss_clip": 0.01065846, + "auxiliary_loss_mlp": 0.01031767, + "balance_loss_clip": 1.02607465, + "balance_loss_mlp": 1.02041817, + "epoch": 0.7933263189538554, + "flos": 22559119079040.0, + "grad_norm": 2.0776588597044015, + "language_loss": 0.69990325, + "learning_rate": 4.314751387639517e-07, + "loss": 0.72087938, + "num_input_tokens_seen": 284646040, + "step": 13195, + "time_per_iteration": 2.762227773666382 + }, + { + "auxiliary_loss_clip": 0.01023946, + "auxiliary_loss_mlp": 0.01027543, + "balance_loss_clip": 1.02607501, + "balance_loss_mlp": 1.01694524, + "epoch": 0.7933864422065233, + "flos": 25479051575040.0, + "grad_norm": 1.616589204618958, + "language_loss": 0.7765981, + "learning_rate": 4.3123353509034844e-07, + "loss": 0.797113, + "num_input_tokens_seen": 284665110, + "step": 13196, + "time_per_iteration": 3.014801263809204 + }, + { + "auxiliary_loss_clip": 0.01033849, + "auxiliary_loss_mlp": 0.01036539, + "balance_loss_clip": 1.02526355, + "balance_loss_mlp": 1.02616811, + "epoch": 0.7934465654591913, + "flos": 33583156243200.0, + "grad_norm": 1.8952796802024938, + "language_loss": 0.68846875, + "learning_rate": 4.309919909045268e-07, + "loss": 0.70917267, + "num_input_tokens_seen": 284686515, + "step": 13197, + "time_per_iteration": 3.0661263465881348 + }, + { + "auxiliary_loss_clip": 0.01050168, + "auxiliary_loss_mlp": 0.01026185, + "balance_loss_clip": 1.02399683, + "balance_loss_mlp": 1.01615369, + "epoch": 0.7935066887118594, + "flos": 31432538263680.0, + "grad_norm": 1.7564479976877487, + "language_loss": 0.65010607, + "learning_rate": 4.30750506215646e-07, + "loss": 0.67086959, + "num_input_tokens_seen": 284707300, + "step": 13198, + "time_per_iteration": 2.890831470489502 + }, + { + "auxiliary_loss_clip": 0.01019569, + "auxiliary_loss_mlp": 0.01037802, + "balance_loss_clip": 1.0239327, + "balance_loss_mlp": 1.02550578, + "epoch": 0.7935668119645273, + "flos": 14682616940160.0, + "grad_norm": 1.9535276061290134, + "language_loss": 0.7227633, + "learning_rate": 4.30509081032864e-07, + "loss": 0.74333704, + "num_input_tokens_seen": 284723545, + "step": 13199, + "time_per_iteration": 3.1400604248046875 + }, + { + "auxiliary_loss_clip": 0.01031087, + "auxiliary_loss_mlp": 0.01029108, + "balance_loss_clip": 1.02125275, + "balance_loss_mlp": 1.01861787, + "epoch": 0.7936269352171953, + "flos": 18004246208640.0, + "grad_norm": 1.824441794130631, + "language_loss": 0.80633891, + "learning_rate": 4.302677153653349e-07, + "loss": 0.82694089, + "num_input_tokens_seen": 284742650, + "step": 13200, + "time_per_iteration": 2.9610788822174072 + }, + { + "auxiliary_loss_clip": 0.01051794, + "auxiliary_loss_mlp": 0.01028319, + "balance_loss_clip": 1.02580369, + "balance_loss_mlp": 1.01834083, + "epoch": 0.7936870584698632, + "flos": 18880215183360.0, + "grad_norm": 1.7267739590597861, + "language_loss": 0.77505219, + "learning_rate": 4.3002640922221077e-07, + "loss": 0.79585332, + "num_input_tokens_seen": 284760955, + "step": 13201, + "time_per_iteration": 2.9005372524261475 + }, + { + "auxiliary_loss_clip": 0.01060456, + "auxiliary_loss_mlp": 0.01027519, + "balance_loss_clip": 1.02381742, + "balance_loss_mlp": 1.01757145, + "epoch": 0.7937471817225312, + "flos": 23367001824000.0, + "grad_norm": 1.6239384950132105, + "language_loss": 0.67009163, + "learning_rate": 4.2978516261264296e-07, + "loss": 0.69097137, + "num_input_tokens_seen": 284780745, + "step": 13202, + "time_per_iteration": 2.823133707046509 + }, + { + "auxiliary_loss_clip": 0.01051336, + "auxiliary_loss_mlp": 0.01032581, + "balance_loss_clip": 1.02469802, + "balance_loss_mlp": 1.02176905, + "epoch": 0.7938073049751991, + "flos": 22674428714880.0, + "grad_norm": 1.9605711792283151, + "language_loss": 0.74997234, + "learning_rate": 4.2954397554577884e-07, + "loss": 0.7708115, + "num_input_tokens_seen": 284799000, + "step": 13203, + "time_per_iteration": 2.7590909004211426 + }, + { + "auxiliary_loss_clip": 0.01010979, + "auxiliary_loss_mlp": 0.01027629, + "balance_loss_clip": 1.02464569, + "balance_loss_mlp": 1.01756132, + "epoch": 0.7938674282278672, + "flos": 22851431959680.0, + "grad_norm": 1.6425298958378363, + "language_loss": 0.65900409, + "learning_rate": 4.293028480307643e-07, + "loss": 0.67939019, + "num_input_tokens_seen": 284817450, + "step": 13204, + "time_per_iteration": 2.8976423740386963 + }, + { + "auxiliary_loss_clip": 0.01010096, + "auxiliary_loss_mlp": 0.01026916, + "balance_loss_clip": 1.02269399, + "balance_loss_mlp": 1.01674724, + "epoch": 0.7939275514805351, + "flos": 27012509049600.0, + "grad_norm": 1.3309749504946977, + "language_loss": 0.79419887, + "learning_rate": 4.290617800767438e-07, + "loss": 0.814569, + "num_input_tokens_seen": 284838865, + "step": 13205, + "time_per_iteration": 2.888810157775879 + }, + { + "auxiliary_loss_clip": 0.01023003, + "auxiliary_loss_mlp": 0.01024297, + "balance_loss_clip": 1.02035165, + "balance_loss_mlp": 1.01363945, + "epoch": 0.7939876747332031, + "flos": 21142838747520.0, + "grad_norm": 1.8065151906051862, + "language_loss": 0.7773453, + "learning_rate": 4.28820771692858e-07, + "loss": 0.79781824, + "num_input_tokens_seen": 284857975, + "step": 13206, + "time_per_iteration": 2.773984670639038 + }, + { + "auxiliary_loss_clip": 0.0103937, + "auxiliary_loss_mlp": 0.01036743, + "balance_loss_clip": 1.02471256, + "balance_loss_mlp": 1.02447045, + "epoch": 0.794047797985871, + "flos": 23289075267840.0, + "grad_norm": 2.1528341209231043, + "language_loss": 0.79540944, + "learning_rate": 4.285798228882456e-07, + "loss": 0.81617063, + "num_input_tokens_seen": 284877145, + "step": 13207, + "time_per_iteration": 2.6746573448181152 + }, + { + "auxiliary_loss_clip": 0.01031875, + "auxiliary_loss_mlp": 0.01030621, + "balance_loss_clip": 1.02451277, + "balance_loss_mlp": 1.02015448, + "epoch": 0.794107921238539, + "flos": 24608074590720.0, + "grad_norm": 1.6548401332233325, + "language_loss": 0.83950651, + "learning_rate": 4.2833893367204375e-07, + "loss": 0.86013138, + "num_input_tokens_seen": 284895560, + "step": 13208, + "time_per_iteration": 4.319191217422485 + }, + { + "auxiliary_loss_clip": 0.00970401, + "auxiliary_loss_mlp": 0.0101208, + "balance_loss_clip": 1.00378585, + "balance_loss_mlp": 1.01111436, + "epoch": 0.7941680444912069, + "flos": 64093690252800.0, + "grad_norm": 0.9590614362535855, + "language_loss": 0.58346581, + "learning_rate": 4.280981040533875e-07, + "loss": 0.60329062, + "num_input_tokens_seen": 284963135, + "step": 13209, + "time_per_iteration": 4.967225551605225 + }, + { + "auxiliary_loss_clip": 0.0102516, + "auxiliary_loss_mlp": 0.01027806, + "balance_loss_clip": 1.02322698, + "balance_loss_mlp": 1.0164628, + "epoch": 0.794228167743875, + "flos": 24388839930240.0, + "grad_norm": 17.769308412872547, + "language_loss": 0.63164461, + "learning_rate": 4.2785733404140825e-07, + "loss": 0.65217423, + "num_input_tokens_seen": 284981755, + "step": 13210, + "time_per_iteration": 2.8731536865234375 + }, + { + "auxiliary_loss_clip": 0.01053666, + "auxiliary_loss_mlp": 0.01028082, + "balance_loss_clip": 1.02591562, + "balance_loss_mlp": 1.01799655, + "epoch": 0.794288290996543, + "flos": 28512498026880.0, + "grad_norm": 1.753846988059043, + "language_loss": 0.69300222, + "learning_rate": 4.2761662364523676e-07, + "loss": 0.71381974, + "num_input_tokens_seen": 285003060, + "step": 13211, + "time_per_iteration": 2.695650815963745 + }, + { + "auxiliary_loss_clip": 0.01054026, + "auxiliary_loss_mlp": 0.01035668, + "balance_loss_clip": 1.02480078, + "balance_loss_mlp": 1.02418196, + "epoch": 0.7943484142492109, + "flos": 25922117836800.0, + "grad_norm": 7.832803789417135, + "language_loss": 0.72247666, + "learning_rate": 4.2737597287400074e-07, + "loss": 0.74337363, + "num_input_tokens_seen": 285021640, + "step": 13212, + "time_per_iteration": 2.7830629348754883 + }, + { + "auxiliary_loss_clip": 0.01048904, + "auxiliary_loss_mlp": 0.01024487, + "balance_loss_clip": 1.02389812, + "balance_loss_mlp": 1.01452708, + "epoch": 0.7944085375018789, + "flos": 23915286000000.0, + "grad_norm": 1.8643519730412348, + "language_loss": 0.80585718, + "learning_rate": 4.271353817368246e-07, + "loss": 0.82659107, + "num_input_tokens_seen": 285040490, + "step": 13213, + "time_per_iteration": 2.667436122894287 + }, + { + "auxiliary_loss_clip": 0.0105586, + "auxiliary_loss_mlp": 0.01027975, + "balance_loss_clip": 1.02664566, + "balance_loss_mlp": 1.01679921, + "epoch": 0.7944686607545468, + "flos": 20229953569920.0, + "grad_norm": 2.155421876784512, + "language_loss": 0.68119228, + "learning_rate": 4.268948502428327e-07, + "loss": 0.7020306, + "num_input_tokens_seen": 285059270, + "step": 13214, + "time_per_iteration": 2.6787571907043457 + }, + { + "auxiliary_loss_clip": 0.01060526, + "auxiliary_loss_mlp": 0.01025557, + "balance_loss_clip": 1.02428198, + "balance_loss_mlp": 1.0153048, + "epoch": 0.7945287840072148, + "flos": 21980993679360.0, + "grad_norm": 1.7877513468131276, + "language_loss": 0.72451138, + "learning_rate": 4.2665437840114535e-07, + "loss": 0.74537218, + "num_input_tokens_seen": 285075390, + "step": 13215, + "time_per_iteration": 2.628471612930298 + }, + { + "auxiliary_loss_clip": 0.01014821, + "auxiliary_loss_mlp": 0.01029724, + "balance_loss_clip": 1.02301645, + "balance_loss_mlp": 1.01848269, + "epoch": 0.7945889072598827, + "flos": 26397718842240.0, + "grad_norm": 1.6109580275835857, + "language_loss": 0.78776252, + "learning_rate": 4.2641396622088253e-07, + "loss": 0.80820799, + "num_input_tokens_seen": 285096290, + "step": 13216, + "time_per_iteration": 2.7868874073028564 + }, + { + "auxiliary_loss_clip": 0.01045159, + "auxiliary_loss_mlp": 0.01029195, + "balance_loss_clip": 1.02315402, + "balance_loss_mlp": 1.01838851, + "epoch": 0.7946490305125508, + "flos": 25810255906560.0, + "grad_norm": 1.6937516144198543, + "language_loss": 0.73918533, + "learning_rate": 4.261736137111598e-07, + "loss": 0.75992888, + "num_input_tokens_seen": 285116020, + "step": 13217, + "time_per_iteration": 2.649486780166626 + }, + { + "auxiliary_loss_clip": 0.01033559, + "auxiliary_loss_mlp": 0.01028479, + "balance_loss_clip": 1.02293944, + "balance_loss_mlp": 1.01817954, + "epoch": 0.7947091537652187, + "flos": 15960965045760.0, + "grad_norm": 1.8329607417765088, + "language_loss": 0.74149591, + "learning_rate": 4.259333208810907e-07, + "loss": 0.76211631, + "num_input_tokens_seen": 285133510, + "step": 13218, + "time_per_iteration": 2.647700786590576 + }, + { + "auxiliary_loss_clip": 0.010518, + "auxiliary_loss_mlp": 0.01033506, + "balance_loss_clip": 1.02330804, + "balance_loss_mlp": 1.02236557, + "epoch": 0.7947692770178867, + "flos": 18587866389120.0, + "grad_norm": 1.8226229183385483, + "language_loss": 0.83645892, + "learning_rate": 4.2569308773978817e-07, + "loss": 0.85731196, + "num_input_tokens_seen": 285151690, + "step": 13219, + "time_per_iteration": 2.605067253112793 + }, + { + "auxiliary_loss_clip": 0.01050043, + "auxiliary_loss_mlp": 0.01035923, + "balance_loss_clip": 1.02541447, + "balance_loss_mlp": 1.02316761, + "epoch": 0.7948294002705546, + "flos": 20442220992000.0, + "grad_norm": 1.8084206464175763, + "language_loss": 0.75209618, + "learning_rate": 4.2545291429636123e-07, + "loss": 0.77295589, + "num_input_tokens_seen": 285170485, + "step": 13220, + "time_per_iteration": 2.6551308631896973 + }, + { + "auxiliary_loss_clip": 0.01033854, + "auxiliary_loss_mlp": 0.01034281, + "balance_loss_clip": 1.0235064, + "balance_loss_mlp": 1.0230459, + "epoch": 0.7948895235232226, + "flos": 38181194282880.0, + "grad_norm": 4.002947019767397, + "language_loss": 0.72666085, + "learning_rate": 4.252128005599176e-07, + "loss": 0.74734217, + "num_input_tokens_seen": 285191050, + "step": 13221, + "time_per_iteration": 2.790562868118286 + }, + { + "auxiliary_loss_clip": 0.01052759, + "auxiliary_loss_mlp": 0.01028942, + "balance_loss_clip": 1.02636349, + "balance_loss_mlp": 1.01922619, + "epoch": 0.7949496467758905, + "flos": 15559806977280.0, + "grad_norm": 2.05190736538559, + "language_loss": 0.74599624, + "learning_rate": 4.249727465395634e-07, + "loss": 0.76681322, + "num_input_tokens_seen": 285208750, + "step": 13222, + "time_per_iteration": 2.6330041885375977 + }, + { + "auxiliary_loss_clip": 0.00987864, + "auxiliary_loss_mlp": 0.00999948, + "balance_loss_clip": 1.00221801, + "balance_loss_mlp": 0.99915522, + "epoch": 0.7950097700285585, + "flos": 70897036728960.0, + "grad_norm": 0.7683188721798202, + "language_loss": 0.67062032, + "learning_rate": 4.247327522443993e-07, + "loss": 0.69049847, + "num_input_tokens_seen": 285264605, + "step": 13223, + "time_per_iteration": 3.1032378673553467 + }, + { + "auxiliary_loss_clip": 0.01051002, + "auxiliary_loss_mlp": 0.01026818, + "balance_loss_clip": 1.02487564, + "balance_loss_mlp": 1.01653636, + "epoch": 0.7950698932812266, + "flos": 23951627585280.0, + "grad_norm": 1.5908994995588175, + "language_loss": 0.70947683, + "learning_rate": 4.2449281768352717e-07, + "loss": 0.73025501, + "num_input_tokens_seen": 285283940, + "step": 13224, + "time_per_iteration": 2.63904070854187 + }, + { + "auxiliary_loss_clip": 0.01006828, + "auxiliary_loss_mlp": 0.01004624, + "balance_loss_clip": 1.00158417, + "balance_loss_mlp": 1.00371826, + "epoch": 0.7951300165338945, + "flos": 60282561415680.0, + "grad_norm": 0.66948186135837, + "language_loss": 0.55043244, + "learning_rate": 4.2425294286604527e-07, + "loss": 0.57054698, + "num_input_tokens_seen": 285349525, + "step": 13225, + "time_per_iteration": 4.977863311767578 + }, + { + "auxiliary_loss_clip": 0.01037484, + "auxiliary_loss_mlp": 0.01021855, + "balance_loss_clip": 1.02178288, + "balance_loss_mlp": 1.01221681, + "epoch": 0.7951901397865625, + "flos": 22819004956800.0, + "grad_norm": 3.8057648636205177, + "language_loss": 0.64738822, + "learning_rate": 4.2401312780105034e-07, + "loss": 0.66798162, + "num_input_tokens_seen": 285367355, + "step": 13226, + "time_per_iteration": 2.7229654788970947 + }, + { + "auxiliary_loss_clip": 0.01022817, + "auxiliary_loss_mlp": 0.01034356, + "balance_loss_clip": 1.0250572, + "balance_loss_mlp": 1.02412128, + "epoch": 0.7952502630392304, + "flos": 35695672871040.0, + "grad_norm": 1.9871881158416551, + "language_loss": 0.70514953, + "learning_rate": 4.237733724976349e-07, + "loss": 0.72572124, + "num_input_tokens_seen": 285386190, + "step": 13227, + "time_per_iteration": 2.856451988220215 + }, + { + "auxiliary_loss_clip": 0.01014635, + "auxiliary_loss_mlp": 0.01026024, + "balance_loss_clip": 1.02129173, + "balance_loss_mlp": 1.01673138, + "epoch": 0.7953103862918984, + "flos": 25629840869760.0, + "grad_norm": 2.121395805933463, + "language_loss": 0.69312769, + "learning_rate": 4.2353367696489184e-07, + "loss": 0.71353424, + "num_input_tokens_seen": 285406150, + "step": 13228, + "time_per_iteration": 2.7218830585479736 + }, + { + "auxiliary_loss_clip": 0.01013414, + "auxiliary_loss_mlp": 0.01039455, + "balance_loss_clip": 1.0210191, + "balance_loss_mlp": 1.02810073, + "epoch": 0.7953705095445663, + "flos": 40551980676480.0, + "grad_norm": 1.4701715508656588, + "language_loss": 0.7084744, + "learning_rate": 4.232940412119095e-07, + "loss": 0.72900307, + "num_input_tokens_seen": 285429900, + "step": 13229, + "time_per_iteration": 2.8877406120300293 + }, + { + "auxiliary_loss_clip": 0.01056464, + "auxiliary_loss_mlp": 0.01031722, + "balance_loss_clip": 1.02798808, + "balance_loss_mlp": 1.02069557, + "epoch": 0.7954306327972344, + "flos": 27636672706560.0, + "grad_norm": 2.0339256846628326, + "language_loss": 0.71889615, + "learning_rate": 4.2305446524777457e-07, + "loss": 0.73977804, + "num_input_tokens_seen": 285452555, + "step": 13230, + "time_per_iteration": 2.701356887817383 + }, + { + "auxiliary_loss_clip": 0.00986679, + "auxiliary_loss_mlp": 0.01001474, + "balance_loss_clip": 1.00180864, + "balance_loss_mlp": 1.00065708, + "epoch": 0.7954907560499023, + "flos": 59504055995520.0, + "grad_norm": 0.8939851315617798, + "language_loss": 0.63586473, + "learning_rate": 4.2281494908157247e-07, + "loss": 0.65574622, + "num_input_tokens_seen": 285515700, + "step": 13231, + "time_per_iteration": 3.2882747650146484 + }, + { + "auxiliary_loss_clip": 0.01040493, + "auxiliary_loss_mlp": 0.01024388, + "balance_loss_clip": 1.02399278, + "balance_loss_mlp": 1.01483357, + "epoch": 0.7955508793025703, + "flos": 20120533764480.0, + "grad_norm": 1.6059293038453615, + "language_loss": 0.69565833, + "learning_rate": 4.2257549272238566e-07, + "loss": 0.71630716, + "num_input_tokens_seen": 285533910, + "step": 13232, + "time_per_iteration": 2.8183770179748535 + }, + { + "auxiliary_loss_clip": 0.01051438, + "auxiliary_loss_mlp": 0.01025653, + "balance_loss_clip": 1.02377677, + "balance_loss_mlp": 1.01523447, + "epoch": 0.7956110025552382, + "flos": 26505378881280.0, + "grad_norm": 1.5502474575028176, + "language_loss": 0.78364992, + "learning_rate": 4.223360961792952e-07, + "loss": 0.80442077, + "num_input_tokens_seen": 285554080, + "step": 13233, + "time_per_iteration": 2.6703250408172607 + }, + { + "auxiliary_loss_clip": 0.01052847, + "auxiliary_loss_mlp": 0.01027794, + "balance_loss_clip": 1.02483582, + "balance_loss_mlp": 1.01776814, + "epoch": 0.7956711258079062, + "flos": 22565475786240.0, + "grad_norm": 1.915042695461717, + "language_loss": 0.78870094, + "learning_rate": 4.220967594613769e-07, + "loss": 0.80950731, + "num_input_tokens_seen": 285572325, + "step": 13234, + "time_per_iteration": 2.632999897003174 + }, + { + "auxiliary_loss_clip": 0.01043144, + "auxiliary_loss_mlp": 0.00747534, + "balance_loss_clip": 1.02671051, + "balance_loss_mlp": 1.00027311, + "epoch": 0.7957312490605741, + "flos": 17379005143680.0, + "grad_norm": 1.7880246801047028, + "language_loss": 0.70106465, + "learning_rate": 4.218574825777077e-07, + "loss": 0.71897149, + "num_input_tokens_seen": 285589770, + "step": 13235, + "time_per_iteration": 2.6355631351470947 + }, + { + "auxiliary_loss_clip": 0.01026287, + "auxiliary_loss_mlp": 0.01030504, + "balance_loss_clip": 1.02315092, + "balance_loss_mlp": 1.01947713, + "epoch": 0.7957913723132422, + "flos": 22491427898880.0, + "grad_norm": 2.3543229488854247, + "language_loss": 0.67767769, + "learning_rate": 4.2161826553736145e-07, + "loss": 0.69824558, + "num_input_tokens_seen": 285610065, + "step": 13236, + "time_per_iteration": 4.507572412490845 + }, + { + "auxiliary_loss_clip": 0.01012331, + "auxiliary_loss_mlp": 0.01025151, + "balance_loss_clip": 1.02293873, + "balance_loss_mlp": 1.01523924, + "epoch": 0.7958514955659101, + "flos": 22638087129600.0, + "grad_norm": 1.622968671121497, + "language_loss": 0.7529062, + "learning_rate": 4.2137910834940826e-07, + "loss": 0.77328104, + "num_input_tokens_seen": 285628480, + "step": 13237, + "time_per_iteration": 2.9076716899871826 + }, + { + "auxiliary_loss_clip": 0.01053489, + "auxiliary_loss_mlp": 0.01032821, + "balance_loss_clip": 1.02573967, + "balance_loss_mlp": 1.02168119, + "epoch": 0.7959116188185781, + "flos": 20704225772160.0, + "grad_norm": 2.0539623541402676, + "language_loss": 0.7133351, + "learning_rate": 4.211400110229175e-07, + "loss": 0.73419821, + "num_input_tokens_seen": 285647805, + "step": 13238, + "time_per_iteration": 2.692636013031006 + }, + { + "auxiliary_loss_clip": 0.01051762, + "auxiliary_loss_mlp": 0.01024567, + "balance_loss_clip": 1.02373135, + "balance_loss_mlp": 1.01411819, + "epoch": 0.7959717420712461, + "flos": 19024683684480.0, + "grad_norm": 1.9054348345545669, + "language_loss": 0.7327832, + "learning_rate": 4.2090097356695684e-07, + "loss": 0.75354648, + "num_input_tokens_seen": 285665505, + "step": 13239, + "time_per_iteration": 2.685145378112793 + }, + { + "auxiliary_loss_clip": 0.01065497, + "auxiliary_loss_mlp": 0.01031556, + "balance_loss_clip": 1.02621007, + "balance_loss_mlp": 1.02125669, + "epoch": 0.796031865323914, + "flos": 26356636661760.0, + "grad_norm": 3.734721925188749, + "language_loss": 0.69013166, + "learning_rate": 4.2066199599058814e-07, + "loss": 0.71110225, + "num_input_tokens_seen": 285685855, + "step": 13240, + "time_per_iteration": 2.7237112522125244 + }, + { + "auxiliary_loss_clip": 0.01000111, + "auxiliary_loss_mlp": 0.01001373, + "balance_loss_clip": 1.00427675, + "balance_loss_mlp": 1.00052094, + "epoch": 0.796091988576582, + "flos": 62069440320000.0, + "grad_norm": 0.8861670934898203, + "language_loss": 0.58647126, + "learning_rate": 4.2042307830287526e-07, + "loss": 0.60648614, + "num_input_tokens_seen": 285735710, + "step": 13241, + "time_per_iteration": 3.0535383224487305 + }, + { + "auxiliary_loss_clip": 0.01033754, + "auxiliary_loss_mlp": 0.01027921, + "balance_loss_clip": 1.0267148, + "balance_loss_mlp": 1.01844382, + "epoch": 0.7961521118292499, + "flos": 39020103400320.0, + "grad_norm": 1.8261217406882035, + "language_loss": 0.63936305, + "learning_rate": 4.201842205128772e-07, + "loss": 0.65997982, + "num_input_tokens_seen": 285757045, + "step": 13242, + "time_per_iteration": 2.9101932048797607 + }, + { + "auxiliary_loss_clip": 0.0106374, + "auxiliary_loss_mlp": 0.01033322, + "balance_loss_clip": 1.02623117, + "balance_loss_mlp": 1.02201509, + "epoch": 0.796212235081918, + "flos": 21762836426880.0, + "grad_norm": 3.0666101220990476, + "language_loss": 0.75745988, + "learning_rate": 4.199454226296526e-07, + "loss": 0.77843058, + "num_input_tokens_seen": 285776050, + "step": 13243, + "time_per_iteration": 2.6502974033355713 + }, + { + "auxiliary_loss_clip": 0.01036689, + "auxiliary_loss_mlp": 0.01028278, + "balance_loss_clip": 1.02691281, + "balance_loss_mlp": 1.01734662, + "epoch": 0.7962723583345859, + "flos": 21178857110400.0, + "grad_norm": 1.6994056198698404, + "language_loss": 0.79308945, + "learning_rate": 4.1970668466225565e-07, + "loss": 0.81373918, + "num_input_tokens_seen": 285796830, + "step": 13244, + "time_per_iteration": 2.8660929203033447 + }, + { + "auxiliary_loss_clip": 0.01054103, + "auxiliary_loss_mlp": 0.01029738, + "balance_loss_clip": 1.02447557, + "balance_loss_mlp": 1.0188185, + "epoch": 0.7963324815872539, + "flos": 17128636369920.0, + "grad_norm": 2.1410009418584024, + "language_loss": 0.68202776, + "learning_rate": 4.1946800661973934e-07, + "loss": 0.7028662, + "num_input_tokens_seen": 285814755, + "step": 13245, + "time_per_iteration": 2.6272940635681152 + }, + { + "auxiliary_loss_clip": 0.01041072, + "auxiliary_loss_mlp": 0.01032306, + "balance_loss_clip": 1.02389097, + "balance_loss_mlp": 1.02157712, + "epoch": 0.7963926048399218, + "flos": 21397481239680.0, + "grad_norm": 1.3874488220881376, + "language_loss": 0.79157448, + "learning_rate": 4.192293885111549e-07, + "loss": 0.81230819, + "num_input_tokens_seen": 285834255, + "step": 13246, + "time_per_iteration": 2.755237579345703 + }, + { + "auxiliary_loss_clip": 0.01043759, + "auxiliary_loss_mlp": 0.01029294, + "balance_loss_clip": 1.02419841, + "balance_loss_mlp": 1.01807022, + "epoch": 0.7964527280925898, + "flos": 25184188828800.0, + "grad_norm": 2.103657654664841, + "language_loss": 0.66430092, + "learning_rate": 4.1899083034555007e-07, + "loss": 0.68503153, + "num_input_tokens_seen": 285853540, + "step": 13247, + "time_per_iteration": 2.7221031188964844 + }, + { + "auxiliary_loss_clip": 0.01040074, + "auxiliary_loss_mlp": 0.01027194, + "balance_loss_clip": 1.02339542, + "balance_loss_mlp": 1.01748407, + "epoch": 0.7965128513452577, + "flos": 27015884928000.0, + "grad_norm": 1.9153555715265433, + "language_loss": 0.71409148, + "learning_rate": 4.1875233213197123e-07, + "loss": 0.73476416, + "num_input_tokens_seen": 285872705, + "step": 13248, + "time_per_iteration": 2.6754183769226074 + }, + { + "auxiliary_loss_clip": 0.01045537, + "auxiliary_loss_mlp": 0.01027193, + "balance_loss_clip": 1.02542198, + "balance_loss_mlp": 1.01594615, + "epoch": 0.7965729745979258, + "flos": 24419578993920.0, + "grad_norm": 2.988413152688075, + "language_loss": 0.76306957, + "learning_rate": 4.1851389387946255e-07, + "loss": 0.78379685, + "num_input_tokens_seen": 285890290, + "step": 13249, + "time_per_iteration": 2.6955461502075195 + }, + { + "auxiliary_loss_clip": 0.01044517, + "auxiliary_loss_mlp": 0.01029477, + "balance_loss_clip": 1.02789176, + "balance_loss_mlp": 1.01884937, + "epoch": 0.7966330978505937, + "flos": 18840389978880.0, + "grad_norm": 2.1079810824247107, + "language_loss": 0.61573291, + "learning_rate": 4.1827551559706674e-07, + "loss": 0.63647282, + "num_input_tokens_seen": 285909190, + "step": 13250, + "time_per_iteration": 2.8103208541870117 + }, + { + "auxiliary_loss_clip": 0.01040112, + "auxiliary_loss_mlp": 0.01024413, + "balance_loss_clip": 1.02406931, + "balance_loss_mlp": 1.01364255, + "epoch": 0.7966932211032617, + "flos": 13152319862400.0, + "grad_norm": 2.0566577991366572, + "language_loss": 0.71709692, + "learning_rate": 4.180371972938206e-07, + "loss": 0.73774219, + "num_input_tokens_seen": 285927570, + "step": 13251, + "time_per_iteration": 2.728269100189209 + }, + { + "auxiliary_loss_clip": 0.01066036, + "auxiliary_loss_mlp": 0.01028655, + "balance_loss_clip": 1.02657723, + "balance_loss_mlp": 1.01688886, + "epoch": 0.7967533443559297, + "flos": 23949760078080.0, + "grad_norm": 2.033405294687538, + "language_loss": 0.72396779, + "learning_rate": 4.177989389787624e-07, + "loss": 0.74491465, + "num_input_tokens_seen": 285945810, + "step": 13252, + "time_per_iteration": 2.7055671215057373 + }, + { + "auxiliary_loss_clip": 0.01061097, + "auxiliary_loss_mlp": 0.01028042, + "balance_loss_clip": 1.02595615, + "balance_loss_mlp": 1.01748586, + "epoch": 0.7968134676085976, + "flos": 30368791964160.0, + "grad_norm": 1.7277537029666064, + "language_loss": 0.65873098, + "learning_rate": 4.175607406609278e-07, + "loss": 0.67962235, + "num_input_tokens_seen": 285964235, + "step": 13253, + "time_per_iteration": 2.62605357170105 + }, + { + "auxiliary_loss_clip": 0.01038533, + "auxiliary_loss_mlp": 0.01030101, + "balance_loss_clip": 1.02813148, + "balance_loss_mlp": 1.01875234, + "epoch": 0.7968735908612656, + "flos": 23075048079360.0, + "grad_norm": 1.4923710685344085, + "language_loss": 0.67682612, + "learning_rate": 4.1732260234934767e-07, + "loss": 0.69751251, + "num_input_tokens_seen": 285983710, + "step": 13254, + "time_per_iteration": 2.768156051635742 + }, + { + "auxiliary_loss_clip": 0.01049441, + "auxiliary_loss_mlp": 0.01031838, + "balance_loss_clip": 1.02326441, + "balance_loss_mlp": 1.02168775, + "epoch": 0.7969337141139335, + "flos": 23582250074880.0, + "grad_norm": 2.32751326712209, + "language_loss": 0.6991837, + "learning_rate": 4.1708452405305314e-07, + "loss": 0.71999651, + "num_input_tokens_seen": 286003425, + "step": 13255, + "time_per_iteration": 4.279808282852173 + }, + { + "auxiliary_loss_clip": 0.01059477, + "auxiliary_loss_mlp": 0.01027333, + "balance_loss_clip": 1.02291393, + "balance_loss_mlp": 1.01764131, + "epoch": 0.7969938373666016, + "flos": 19755860935680.0, + "grad_norm": 1.837317005115558, + "language_loss": 0.79283434, + "learning_rate": 4.168465057810733e-07, + "loss": 0.81370246, + "num_input_tokens_seen": 286020130, + "step": 13256, + "time_per_iteration": 4.2826128005981445 + }, + { + "auxiliary_loss_clip": 0.01052256, + "auxiliary_loss_mlp": 0.01030271, + "balance_loss_clip": 1.02556586, + "balance_loss_mlp": 1.01945233, + "epoch": 0.7970539606192695, + "flos": 24134089697280.0, + "grad_norm": 1.7941842766957141, + "language_loss": 0.65852982, + "learning_rate": 4.166085475424315e-07, + "loss": 0.67935503, + "num_input_tokens_seen": 286040230, + "step": 13257, + "time_per_iteration": 2.6644461154937744 + }, + { + "auxiliary_loss_clip": 0.01042117, + "auxiliary_loss_mlp": 0.01033182, + "balance_loss_clip": 1.02452075, + "balance_loss_mlp": 1.02224481, + "epoch": 0.7971140838719375, + "flos": 17968622895360.0, + "grad_norm": 1.7625739948533394, + "language_loss": 0.71839547, + "learning_rate": 4.163706493461523e-07, + "loss": 0.73914844, + "num_input_tokens_seen": 286059475, + "step": 13258, + "time_per_iteration": 2.7256741523742676 + }, + { + "auxiliary_loss_clip": 0.01052662, + "auxiliary_loss_mlp": 0.01030542, + "balance_loss_clip": 1.02368593, + "balance_loss_mlp": 1.01987243, + "epoch": 0.7971742071246054, + "flos": 19169547235200.0, + "grad_norm": 3.261025427731728, + "language_loss": 0.68798369, + "learning_rate": 4.1613281120125655e-07, + "loss": 0.70881575, + "num_input_tokens_seen": 286077820, + "step": 13259, + "time_per_iteration": 2.625244379043579 + }, + { + "auxiliary_loss_clip": 0.0104942, + "auxiliary_loss_mlp": 0.01026643, + "balance_loss_clip": 1.02391863, + "balance_loss_mlp": 1.01730931, + "epoch": 0.7972343303772734, + "flos": 27125951178240.0, + "grad_norm": 1.8074021765972441, + "language_loss": 0.73566657, + "learning_rate": 4.158950331167641e-07, + "loss": 0.75642717, + "num_input_tokens_seen": 286097285, + "step": 13260, + "time_per_iteration": 2.8502070903778076 + }, + { + "auxiliary_loss_clip": 0.01033504, + "auxiliary_loss_mlp": 0.01026422, + "balance_loss_clip": 1.0204829, + "balance_loss_mlp": 1.01680732, + "epoch": 0.7972944536299413, + "flos": 20996646393600.0, + "grad_norm": 1.7249988804774565, + "language_loss": 0.78420472, + "learning_rate": 4.1565731510169065e-07, + "loss": 0.80480397, + "num_input_tokens_seen": 286116000, + "step": 13261, + "time_per_iteration": 2.8292527198791504 + }, + { + "auxiliary_loss_clip": 0.01049393, + "auxiliary_loss_mlp": 0.01028064, + "balance_loss_clip": 1.02531075, + "balance_loss_mlp": 1.01926017, + "epoch": 0.7973545768826094, + "flos": 21580015178880.0, + "grad_norm": 1.4400247089494906, + "language_loss": 0.7615096, + "learning_rate": 4.154196571650501e-07, + "loss": 0.78228408, + "num_input_tokens_seen": 286135110, + "step": 13262, + "time_per_iteration": 2.6804845333099365 + }, + { + "auxiliary_loss_clip": 0.01039264, + "auxiliary_loss_mlp": 0.0102759, + "balance_loss_clip": 1.02766347, + "balance_loss_mlp": 1.0156219, + "epoch": 0.7974147001352773, + "flos": 20558536208640.0, + "grad_norm": 2.349209047225744, + "language_loss": 0.70428598, + "learning_rate": 4.1518205931585524e-07, + "loss": 0.72495449, + "num_input_tokens_seen": 286152835, + "step": 13263, + "time_per_iteration": 2.687584400177002 + }, + { + "auxiliary_loss_clip": 0.01056948, + "auxiliary_loss_mlp": 0.01031449, + "balance_loss_clip": 1.02583909, + "balance_loss_mlp": 1.02014792, + "epoch": 0.7974748233879453, + "flos": 20996790048000.0, + "grad_norm": 1.7563253120677798, + "language_loss": 0.70946419, + "learning_rate": 4.149445215631153e-07, + "loss": 0.73034811, + "num_input_tokens_seen": 286171785, + "step": 13264, + "time_per_iteration": 2.6101527214050293 + }, + { + "auxiliary_loss_clip": 0.01061377, + "auxiliary_loss_mlp": 0.01030027, + "balance_loss_clip": 1.02598059, + "balance_loss_mlp": 1.01998949, + "epoch": 0.7975349466406133, + "flos": 22565188477440.0, + "grad_norm": 1.6754120519096323, + "language_loss": 0.7649743, + "learning_rate": 4.1470704391583776e-07, + "loss": 0.78588837, + "num_input_tokens_seen": 286190420, + "step": 13265, + "time_per_iteration": 2.552429437637329 + }, + { + "auxiliary_loss_clip": 0.01034598, + "auxiliary_loss_mlp": 0.01027922, + "balance_loss_clip": 1.02563906, + "balance_loss_mlp": 1.01807487, + "epoch": 0.7975950698932812, + "flos": 21689542725120.0, + "grad_norm": 2.10080202330415, + "language_loss": 0.75732625, + "learning_rate": 4.144696263830285e-07, + "loss": 0.77795142, + "num_input_tokens_seen": 286210105, + "step": 13266, + "time_per_iteration": 2.672889232635498 + }, + { + "auxiliary_loss_clip": 0.01040659, + "auxiliary_loss_mlp": 0.01026458, + "balance_loss_clip": 1.02411675, + "balance_loss_mlp": 1.01632571, + "epoch": 0.7976551931459492, + "flos": 19604568850560.0, + "grad_norm": 3.338306920192241, + "language_loss": 0.84376788, + "learning_rate": 4.1423226897369015e-07, + "loss": 0.86443913, + "num_input_tokens_seen": 286228180, + "step": 13267, + "time_per_iteration": 2.6072630882263184 + }, + { + "auxiliary_loss_clip": 0.0105245, + "auxiliary_loss_mlp": 0.01031378, + "balance_loss_clip": 1.02526128, + "balance_loss_mlp": 1.02086377, + "epoch": 0.7977153163986171, + "flos": 21687603390720.0, + "grad_norm": 2.166490065881918, + "language_loss": 0.75426745, + "learning_rate": 4.139949716968223e-07, + "loss": 0.77510571, + "num_input_tokens_seen": 286247305, + "step": 13268, + "time_per_iteration": 2.616424560546875 + }, + { + "auxiliary_loss_clip": 0.01064106, + "auxiliary_loss_mlp": 0.01025929, + "balance_loss_clip": 1.02726495, + "balance_loss_mlp": 1.01593935, + "epoch": 0.7977754396512852, + "flos": 23476780765440.0, + "grad_norm": 1.4956115339431133, + "language_loss": 0.78064525, + "learning_rate": 4.1375773456142403e-07, + "loss": 0.80154562, + "num_input_tokens_seen": 286268145, + "step": 13269, + "time_per_iteration": 2.7080979347229004 + }, + { + "auxiliary_loss_clip": 0.01042057, + "auxiliary_loss_mlp": 0.01033929, + "balance_loss_clip": 1.02137351, + "balance_loss_mlp": 1.02342033, + "epoch": 0.7978355629039531, + "flos": 22382223575040.0, + "grad_norm": 1.7295698486230369, + "language_loss": 0.82440948, + "learning_rate": 4.135205575764922e-07, + "loss": 0.84516931, + "num_input_tokens_seen": 286286775, + "step": 13270, + "time_per_iteration": 2.5829734802246094 + }, + { + "auxiliary_loss_clip": 0.01026962, + "auxiliary_loss_mlp": 0.01029278, + "balance_loss_clip": 1.02389157, + "balance_loss_mlp": 1.0177151, + "epoch": 0.7978956861566211, + "flos": 20266331068800.0, + "grad_norm": 1.7498563000239384, + "language_loss": 0.59580386, + "learning_rate": 4.1328344075101905e-07, + "loss": 0.61636627, + "num_input_tokens_seen": 286305590, + "step": 13271, + "time_per_iteration": 2.704069137573242 + }, + { + "auxiliary_loss_clip": 0.01043257, + "auxiliary_loss_mlp": 0.01028765, + "balance_loss_clip": 1.02474785, + "balance_loss_mlp": 1.01829231, + "epoch": 0.797955809409289, + "flos": 28112417366400.0, + "grad_norm": 1.539416270600162, + "language_loss": 0.73141235, + "learning_rate": 4.130463840939975e-07, + "loss": 0.75213259, + "num_input_tokens_seen": 286328050, + "step": 13272, + "time_per_iteration": 2.696199417114258 + }, + { + "auxiliary_loss_clip": 0.00997206, + "auxiliary_loss_mlp": 0.01031022, + "balance_loss_clip": 1.02046418, + "balance_loss_mlp": 1.01963711, + "epoch": 0.798015932661957, + "flos": 15559591495680.0, + "grad_norm": 1.8159370440904048, + "language_loss": 0.71533501, + "learning_rate": 4.128093876144161e-07, + "loss": 0.73561728, + "num_input_tokens_seen": 286345265, + "step": 13273, + "time_per_iteration": 4.614300012588501 + }, + { + "auxiliary_loss_clip": 0.01038896, + "auxiliary_loss_mlp": 0.01031742, + "balance_loss_clip": 1.02458739, + "balance_loss_mlp": 1.02043462, + "epoch": 0.7980760559146249, + "flos": 23951196622080.0, + "grad_norm": 1.6613677774140316, + "language_loss": 0.75864983, + "learning_rate": 4.1257245132126117e-07, + "loss": 0.77935612, + "num_input_tokens_seen": 286364465, + "step": 13274, + "time_per_iteration": 2.8116695880889893 + }, + { + "auxiliary_loss_clip": 0.01014124, + "auxiliary_loss_mlp": 0.01026252, + "balance_loss_clip": 1.02277625, + "balance_loss_mlp": 1.01661408, + "epoch": 0.798136179167293, + "flos": 28038082170240.0, + "grad_norm": 1.3224390390233196, + "language_loss": 0.77696764, + "learning_rate": 4.12335575223518e-07, + "loss": 0.79737139, + "num_input_tokens_seen": 286385565, + "step": 13275, + "time_per_iteration": 2.792285680770874 + }, + { + "auxiliary_loss_clip": 0.01054683, + "auxiliary_loss_mlp": 0.01033256, + "balance_loss_clip": 1.02592146, + "balance_loss_mlp": 1.0222764, + "epoch": 0.7981963024199609, + "flos": 35984538046080.0, + "grad_norm": 2.558944807207892, + "language_loss": 0.63997757, + "learning_rate": 4.1209875933016877e-07, + "loss": 0.66085696, + "num_input_tokens_seen": 286403950, + "step": 13276, + "time_per_iteration": 2.7861175537109375 + }, + { + "auxiliary_loss_clip": 0.01025327, + "auxiliary_loss_mlp": 0.01028258, + "balance_loss_clip": 1.02247572, + "balance_loss_mlp": 1.01838183, + "epoch": 0.7982564256726289, + "flos": 25884914325120.0, + "grad_norm": 1.5669324102538391, + "language_loss": 0.61042988, + "learning_rate": 4.118620036501945e-07, + "loss": 0.63096571, + "num_input_tokens_seen": 286426160, + "step": 13277, + "time_per_iteration": 2.8661608695983887 + }, + { + "auxiliary_loss_clip": 0.01037388, + "auxiliary_loss_mlp": 0.01029355, + "balance_loss_clip": 1.02514672, + "balance_loss_mlp": 1.01873302, + "epoch": 0.7983165489252969, + "flos": 25739152934400.0, + "grad_norm": 2.2712279101649893, + "language_loss": 0.80017304, + "learning_rate": 4.1162530819257227e-07, + "loss": 0.82084048, + "num_input_tokens_seen": 286446610, + "step": 13278, + "time_per_iteration": 2.7543628215789795 + }, + { + "auxiliary_loss_clip": 0.01044833, + "auxiliary_loss_mlp": 0.01034609, + "balance_loss_clip": 1.02581346, + "balance_loss_mlp": 1.02353489, + "epoch": 0.7983766721779648, + "flos": 21908202768000.0, + "grad_norm": 1.8041799795946507, + "language_loss": 0.63490027, + "learning_rate": 4.113886729662768e-07, + "loss": 0.65569472, + "num_input_tokens_seen": 286465460, + "step": 13279, + "time_per_iteration": 2.6975152492523193 + }, + { + "auxiliary_loss_clip": 0.01047087, + "auxiliary_loss_mlp": 0.01023991, + "balance_loss_clip": 1.023036, + "balance_loss_mlp": 1.01490736, + "epoch": 0.7984367954306328, + "flos": 29347420734720.0, + "grad_norm": 1.8046205637562958, + "language_loss": 0.70836598, + "learning_rate": 4.111520979802825e-07, + "loss": 0.72907674, + "num_input_tokens_seen": 286485720, + "step": 13280, + "time_per_iteration": 2.719937562942505 + }, + { + "auxiliary_loss_clip": 0.01027505, + "auxiliary_loss_mlp": 0.01034684, + "balance_loss_clip": 1.02400839, + "balance_loss_mlp": 1.02154732, + "epoch": 0.7984969186833007, + "flos": 31357772104320.0, + "grad_norm": 1.8784737441274049, + "language_loss": 0.62331814, + "learning_rate": 4.1091558324355955e-07, + "loss": 0.64393997, + "num_input_tokens_seen": 286507465, + "step": 13281, + "time_per_iteration": 2.7937676906585693 + }, + { + "auxiliary_loss_clip": 0.01049863, + "auxiliary_loss_mlp": 0.0103259, + "balance_loss_clip": 1.02322388, + "balance_loss_mlp": 1.02167082, + "epoch": 0.7985570419359688, + "flos": 24312924535680.0, + "grad_norm": 2.7868246954837197, + "language_loss": 0.80211431, + "learning_rate": 4.1067912876507683e-07, + "loss": 0.8229388, + "num_input_tokens_seen": 286526345, + "step": 13282, + "time_per_iteration": 2.7361810207366943 + }, + { + "auxiliary_loss_clip": 0.01025077, + "auxiliary_loss_mlp": 0.00747613, + "balance_loss_clip": 1.02095997, + "balance_loss_mlp": 1.00032783, + "epoch": 0.7986171651886367, + "flos": 15742233175680.0, + "grad_norm": 1.9342084450212553, + "language_loss": 0.71428496, + "learning_rate": 4.10442734553802e-07, + "loss": 0.73201191, + "num_input_tokens_seen": 286544095, + "step": 13283, + "time_per_iteration": 4.536837100982666 + }, + { + "auxiliary_loss_clip": 0.01049295, + "auxiliary_loss_mlp": 0.01025051, + "balance_loss_clip": 1.02292943, + "balance_loss_mlp": 1.01563358, + "epoch": 0.7986772884413047, + "flos": 11619401091840.0, + "grad_norm": 1.8874368934122536, + "language_loss": 0.73118287, + "learning_rate": 4.102064006186967e-07, + "loss": 0.75192636, + "num_input_tokens_seen": 286560960, + "step": 13284, + "time_per_iteration": 2.6103856563568115 + }, + { + "auxiliary_loss_clip": 0.01029092, + "auxiliary_loss_mlp": 0.01029111, + "balance_loss_clip": 1.02073956, + "balance_loss_mlp": 1.01967502, + "epoch": 0.7987374116939726, + "flos": 22091059929600.0, + "grad_norm": 2.0851845874199624, + "language_loss": 0.70410752, + "learning_rate": 4.0997012696872415e-07, + "loss": 0.7246896, + "num_input_tokens_seen": 286579865, + "step": 13285, + "time_per_iteration": 2.6287384033203125 + }, + { + "auxiliary_loss_clip": 0.01031148, + "auxiliary_loss_mlp": 0.01026822, + "balance_loss_clip": 1.02107143, + "balance_loss_mlp": 1.01682591, + "epoch": 0.7987975349466406, + "flos": 17890696339200.0, + "grad_norm": 2.5801921960792615, + "language_loss": 0.73655218, + "learning_rate": 4.097339136128437e-07, + "loss": 0.75713187, + "num_input_tokens_seen": 286597295, + "step": 13286, + "time_per_iteration": 2.630526065826416 + }, + { + "auxiliary_loss_clip": 0.01043216, + "auxiliary_loss_mlp": 0.0102719, + "balance_loss_clip": 1.0259192, + "balance_loss_mlp": 1.01680112, + "epoch": 0.7988576581993085, + "flos": 19719232041600.0, + "grad_norm": 1.7424479772167585, + "language_loss": 0.74958766, + "learning_rate": 4.0949776056001296e-07, + "loss": 0.77029169, + "num_input_tokens_seen": 286616270, + "step": 13287, + "time_per_iteration": 2.691523551940918 + }, + { + "auxiliary_loss_clip": 0.01042648, + "auxiliary_loss_mlp": 0.01028391, + "balance_loss_clip": 1.02558923, + "balance_loss_mlp": 1.01812673, + "epoch": 0.7989177814519766, + "flos": 28036358317440.0, + "grad_norm": 1.5323362037079875, + "language_loss": 0.61849278, + "learning_rate": 4.092616678191863e-07, + "loss": 0.63920319, + "num_input_tokens_seen": 286638315, + "step": 13288, + "time_per_iteration": 2.7275707721710205 + }, + { + "auxiliary_loss_clip": 0.01052022, + "auxiliary_loss_mlp": 0.01027382, + "balance_loss_clip": 1.02621591, + "balance_loss_mlp": 1.01748157, + "epoch": 0.7989779047046445, + "flos": 28871029630080.0, + "grad_norm": 2.226832060350969, + "language_loss": 0.69767118, + "learning_rate": 4.090256353993169e-07, + "loss": 0.71846521, + "num_input_tokens_seen": 286658630, + "step": 13289, + "time_per_iteration": 2.6366631984710693 + }, + { + "auxiliary_loss_clip": 0.01032349, + "auxiliary_loss_mlp": 0.01027865, + "balance_loss_clip": 1.02596068, + "balance_loss_mlp": 1.01722574, + "epoch": 0.7990380279573125, + "flos": 18186887888640.0, + "grad_norm": 2.1369568757107755, + "language_loss": 0.62668401, + "learning_rate": 4.0878966330935506e-07, + "loss": 0.64728618, + "num_input_tokens_seen": 286676870, + "step": 13290, + "time_per_iteration": 2.719390392303467 + }, + { + "auxiliary_loss_clip": 0.0105288, + "auxiliary_loss_mlp": 0.01026912, + "balance_loss_clip": 1.02433145, + "balance_loss_mlp": 1.01559877, + "epoch": 0.7990981512099805, + "flos": 20879936127360.0, + "grad_norm": 1.874359724034776, + "language_loss": 0.71092564, + "learning_rate": 4.08553751558248e-07, + "loss": 0.73172355, + "num_input_tokens_seen": 286694300, + "step": 13291, + "time_per_iteration": 2.769448757171631 + }, + { + "auxiliary_loss_clip": 0.01030614, + "auxiliary_loss_mlp": 0.01024757, + "balance_loss_clip": 1.02360821, + "balance_loss_mlp": 1.01509476, + "epoch": 0.7991582744626484, + "flos": 26099911180800.0, + "grad_norm": 1.4963053566344224, + "language_loss": 0.63511437, + "learning_rate": 4.083179001549422e-07, + "loss": 0.65566802, + "num_input_tokens_seen": 286714545, + "step": 13292, + "time_per_iteration": 2.7306854724884033 + }, + { + "auxiliary_loss_clip": 0.0105239, + "auxiliary_loss_mlp": 0.01026355, + "balance_loss_clip": 1.02588224, + "balance_loss_mlp": 1.01678801, + "epoch": 0.7992183977153164, + "flos": 35295843605760.0, + "grad_norm": 1.6475645763982045, + "language_loss": 0.56025237, + "learning_rate": 4.0808210910838105e-07, + "loss": 0.58103979, + "num_input_tokens_seen": 286734525, + "step": 13293, + "time_per_iteration": 2.7945873737335205 + }, + { + "auxiliary_loss_clip": 0.01044308, + "auxiliary_loss_mlp": 0.01027408, + "balance_loss_clip": 1.02738476, + "balance_loss_mlp": 1.01663125, + "epoch": 0.7992785209679844, + "flos": 51853426577280.0, + "grad_norm": 2.3236986496316225, + "language_loss": 0.71536142, + "learning_rate": 4.0784637842750704e-07, + "loss": 0.73607862, + "num_input_tokens_seen": 286753430, + "step": 13294, + "time_per_iteration": 2.8509767055511475 + }, + { + "auxiliary_loss_clip": 0.01022053, + "auxiliary_loss_mlp": 0.01029986, + "balance_loss_clip": 1.02168417, + "balance_loss_mlp": 1.01885772, + "epoch": 0.7993386442206524, + "flos": 22565116650240.0, + "grad_norm": 2.1093530444439863, + "language_loss": 0.72250307, + "learning_rate": 4.0761070812125675e-07, + "loss": 0.74302346, + "num_input_tokens_seen": 286771915, + "step": 13295, + "time_per_iteration": 2.6631362438201904 + }, + { + "auxiliary_loss_clip": 0.01026544, + "auxiliary_loss_mlp": 0.01031696, + "balance_loss_clip": 1.02475965, + "balance_loss_mlp": 1.02196813, + "epoch": 0.7993987674733203, + "flos": 18800277465600.0, + "grad_norm": 1.6797123696061906, + "language_loss": 0.7649526, + "learning_rate": 4.0737509819856797e-07, + "loss": 0.78553504, + "num_input_tokens_seen": 286789835, + "step": 13296, + "time_per_iteration": 2.5952584743499756 + }, + { + "auxiliary_loss_clip": 0.0098241, + "auxiliary_loss_mlp": 0.01001177, + "balance_loss_clip": 1.00579131, + "balance_loss_mlp": 1.00018787, + "epoch": 0.7994588907259883, + "flos": 69421720394880.0, + "grad_norm": 0.6915493335634657, + "language_loss": 0.60815513, + "learning_rate": 4.0713954866837573e-07, + "loss": 0.62799108, + "num_input_tokens_seen": 286855580, + "step": 13297, + "time_per_iteration": 3.3303513526916504 + }, + { + "auxiliary_loss_clip": 0.01041889, + "auxiliary_loss_mlp": 0.01028642, + "balance_loss_clip": 1.0249927, + "balance_loss_mlp": 1.0188787, + "epoch": 0.7995190139786562, + "flos": 13480327883520.0, + "grad_norm": 2.027827116279966, + "language_loss": 0.70267522, + "learning_rate": 4.0690405953961073e-07, + "loss": 0.72338057, + "num_input_tokens_seen": 286874360, + "step": 13298, + "time_per_iteration": 2.6500790119171143 + }, + { + "auxiliary_loss_clip": 0.01026724, + "auxiliary_loss_mlp": 0.01034904, + "balance_loss_clip": 1.02364051, + "balance_loss_mlp": 1.02237546, + "epoch": 0.7995791372313242, + "flos": 21652842003840.0, + "grad_norm": 2.099242180637433, + "language_loss": 0.75867039, + "learning_rate": 4.066686308212037e-07, + "loss": 0.77928668, + "num_input_tokens_seen": 286891950, + "step": 13299, + "time_per_iteration": 2.831421375274658 + }, + { + "auxiliary_loss_clip": 0.01034646, + "auxiliary_loss_mlp": 0.01026428, + "balance_loss_clip": 1.02209234, + "balance_loss_mlp": 1.01677203, + "epoch": 0.7996392604839921, + "flos": 26068130622720.0, + "grad_norm": 1.6037213768301772, + "language_loss": 0.77732009, + "learning_rate": 4.064332625220828e-07, + "loss": 0.79793084, + "num_input_tokens_seen": 286911725, + "step": 13300, + "time_per_iteration": 2.6901941299438477 + }, + { + "auxiliary_loss_clip": 0.0101519, + "auxiliary_loss_mlp": 0.01033362, + "balance_loss_clip": 1.01942956, + "balance_loss_mlp": 1.02005768, + "epoch": 0.7996993837366602, + "flos": 24606889441920.0, + "grad_norm": 1.6492553449244267, + "language_loss": 0.63843393, + "learning_rate": 4.0619795465117115e-07, + "loss": 0.65891945, + "num_input_tokens_seen": 286931400, + "step": 13301, + "time_per_iteration": 2.732790946960449 + }, + { + "auxiliary_loss_clip": 0.01050749, + "auxiliary_loss_mlp": 0.01029631, + "balance_loss_clip": 1.02469766, + "balance_loss_mlp": 1.01900971, + "epoch": 0.7997595069893281, + "flos": 20992049452800.0, + "grad_norm": 1.5481211818792837, + "language_loss": 0.71782732, + "learning_rate": 4.059627072173928e-07, + "loss": 0.73863113, + "num_input_tokens_seen": 286949795, + "step": 13302, + "time_per_iteration": 2.7606382369995117 + }, + { + "auxiliary_loss_clip": 0.01066207, + "auxiliary_loss_mlp": 0.00747759, + "balance_loss_clip": 1.02712774, + "balance_loss_mlp": 1.00042379, + "epoch": 0.7998196302419961, + "flos": 24426510318720.0, + "grad_norm": 2.1706928988675713, + "language_loss": 0.83390319, + "learning_rate": 4.057275202296684e-07, + "loss": 0.85204291, + "num_input_tokens_seen": 286968805, + "step": 13303, + "time_per_iteration": 5.929046869277954 + }, + { + "auxiliary_loss_clip": 0.01059785, + "auxiliary_loss_mlp": 0.01028153, + "balance_loss_clip": 1.02499843, + "balance_loss_mlp": 1.01881266, + "epoch": 0.7998797534946641, + "flos": 30264651457920.0, + "grad_norm": 2.0193982797778096, + "language_loss": 0.58897507, + "learning_rate": 4.054923936969166e-07, + "loss": 0.60985446, + "num_input_tokens_seen": 286990235, + "step": 13304, + "time_per_iteration": 2.6464715003967285 + }, + { + "auxiliary_loss_clip": 0.01060902, + "auxiliary_loss_mlp": 0.01024414, + "balance_loss_clip": 1.02272856, + "balance_loss_mlp": 1.01338732, + "epoch": 0.799939876747332, + "flos": 23513984277120.0, + "grad_norm": 1.5878607293984208, + "language_loss": 0.69193995, + "learning_rate": 4.0525732762805265e-07, + "loss": 0.71279317, + "num_input_tokens_seen": 287011060, + "step": 13305, + "time_per_iteration": 2.6162877082824707 + }, + { + "auxiliary_loss_clip": 0.01033154, + "auxiliary_loss_mlp": 0.01029137, + "balance_loss_clip": 1.02615941, + "balance_loss_mlp": 1.01962411, + "epoch": 0.8, + "flos": 19318109886720.0, + "grad_norm": 1.589209983593091, + "language_loss": 0.69311935, + "learning_rate": 4.0502232203199107e-07, + "loss": 0.71374226, + "num_input_tokens_seen": 287029215, + "step": 13306, + "time_per_iteration": 2.7062435150146484 + }, + { + "auxiliary_loss_clip": 0.01053983, + "auxiliary_loss_mlp": 0.01030581, + "balance_loss_clip": 1.02588928, + "balance_loss_mlp": 1.02057886, + "epoch": 0.800060123252668, + "flos": 32412432263040.0, + "grad_norm": 1.4740419588026148, + "language_loss": 0.69488508, + "learning_rate": 4.0478737691764286e-07, + "loss": 0.71573073, + "num_input_tokens_seen": 287050855, + "step": 13307, + "time_per_iteration": 2.713879346847534 + }, + { + "auxiliary_loss_clip": 0.01036986, + "auxiliary_loss_mlp": 0.01031088, + "balance_loss_clip": 1.02347207, + "balance_loss_mlp": 1.02110386, + "epoch": 0.800120246505336, + "flos": 20010611168640.0, + "grad_norm": 1.917993972692778, + "language_loss": 0.76621985, + "learning_rate": 4.0455249229391677e-07, + "loss": 0.78690064, + "num_input_tokens_seen": 287069915, + "step": 13308, + "time_per_iteration": 2.6284360885620117 + }, + { + "auxiliary_loss_clip": 0.01022092, + "auxiliary_loss_mlp": 0.01029668, + "balance_loss_clip": 1.02403796, + "balance_loss_mlp": 1.01818204, + "epoch": 0.8001803697580039, + "flos": 31868278151040.0, + "grad_norm": 1.4622017772665459, + "language_loss": 0.78789181, + "learning_rate": 4.0431766816972e-07, + "loss": 0.80840939, + "num_input_tokens_seen": 287091450, + "step": 13309, + "time_per_iteration": 2.7759933471679688 + }, + { + "auxiliary_loss_clip": 0.01006766, + "auxiliary_loss_mlp": 0.01000415, + "balance_loss_clip": 1.00181866, + "balance_loss_mlp": 0.99955064, + "epoch": 0.8002404930106719, + "flos": 63392066916480.0, + "grad_norm": 0.9946328886916279, + "language_loss": 0.6470865, + "learning_rate": 4.040829045539571e-07, + "loss": 0.66715837, + "num_input_tokens_seen": 287148365, + "step": 13310, + "time_per_iteration": 3.0829684734344482 + }, + { + "auxiliary_loss_clip": 0.01051213, + "auxiliary_loss_mlp": 0.01032362, + "balance_loss_clip": 1.02492428, + "balance_loss_mlp": 1.02203822, + "epoch": 0.8003006162633398, + "flos": 27855476403840.0, + "grad_norm": 3.9390744725571274, + "language_loss": 0.82797825, + "learning_rate": 4.0384820145553156e-07, + "loss": 0.84881401, + "num_input_tokens_seen": 287168280, + "step": 13311, + "time_per_iteration": 2.720644235610962 + }, + { + "auxiliary_loss_clip": 0.01052159, + "auxiliary_loss_mlp": 0.01030117, + "balance_loss_clip": 1.0253247, + "balance_loss_mlp": 1.0196979, + "epoch": 0.8003607395160078, + "flos": 18223337214720.0, + "grad_norm": 2.0403194961548903, + "language_loss": 0.66529983, + "learning_rate": 4.0361355888334116e-07, + "loss": 0.6861226, + "num_input_tokens_seen": 287185980, + "step": 13312, + "time_per_iteration": 2.6106531620025635 + }, + { + "auxiliary_loss_clip": 0.01065939, + "auxiliary_loss_mlp": 0.01030028, + "balance_loss_clip": 1.02716875, + "balance_loss_mlp": 1.01853681, + "epoch": 0.8004208627686757, + "flos": 20886975192960.0, + "grad_norm": 1.5718372912639185, + "language_loss": 0.75286913, + "learning_rate": 4.033789768462843e-07, + "loss": 0.7738288, + "num_input_tokens_seen": 287203875, + "step": 13313, + "time_per_iteration": 2.6648972034454346 + }, + { + "auxiliary_loss_clip": 0.01048007, + "auxiliary_loss_mlp": 0.01028044, + "balance_loss_clip": 1.023031, + "balance_loss_mlp": 1.01729119, + "epoch": 0.8004809860213438, + "flos": 26436143416320.0, + "grad_norm": 1.368391025580254, + "language_loss": 0.75702453, + "learning_rate": 4.031444553532575e-07, + "loss": 0.777785, + "num_input_tokens_seen": 287226445, + "step": 13314, + "time_per_iteration": 2.667212963104248 + }, + { + "auxiliary_loss_clip": 0.00970892, + "auxiliary_loss_mlp": 0.01002306, + "balance_loss_clip": 1.00428796, + "balance_loss_mlp": 1.00126302, + "epoch": 0.8005411092740117, + "flos": 63648612829440.0, + "grad_norm": 0.7942121483719243, + "language_loss": 0.53764319, + "learning_rate": 4.029099944131522e-07, + "loss": 0.55737519, + "num_input_tokens_seen": 287286240, + "step": 13315, + "time_per_iteration": 3.179429054260254 + }, + { + "auxiliary_loss_clip": 0.0104193, + "auxiliary_loss_mlp": 0.01026119, + "balance_loss_clip": 1.02500963, + "balance_loss_mlp": 1.01601052, + "epoch": 0.8006012325266797, + "flos": 36138056774400.0, + "grad_norm": 3.4257360357103113, + "language_loss": 0.71530247, + "learning_rate": 4.026755940348603e-07, + "loss": 0.73598301, + "num_input_tokens_seen": 287310265, + "step": 13316, + "time_per_iteration": 2.783946990966797 + }, + { + "auxiliary_loss_clip": 0.01045327, + "auxiliary_loss_mlp": 0.01030609, + "balance_loss_clip": 1.02634978, + "balance_loss_mlp": 1.02014875, + "epoch": 0.8006613557793477, + "flos": 33838947970560.0, + "grad_norm": 2.9976335274635226, + "language_loss": 0.6451785, + "learning_rate": 4.024412542272706e-07, + "loss": 0.66593784, + "num_input_tokens_seen": 287331610, + "step": 13317, + "time_per_iteration": 2.7551372051239014 + }, + { + "auxiliary_loss_clip": 0.01006222, + "auxiliary_loss_mlp": 0.01000592, + "balance_loss_clip": 1.0010165, + "balance_loss_mlp": 0.99967974, + "epoch": 0.8007214790320156, + "flos": 67348310699520.0, + "grad_norm": 0.7691611529302832, + "language_loss": 0.59039181, + "learning_rate": 4.0220697499926783e-07, + "loss": 0.61045992, + "num_input_tokens_seen": 287394795, + "step": 13318, + "time_per_iteration": 3.2042558193206787 + }, + { + "auxiliary_loss_clip": 0.01031339, + "auxiliary_loss_mlp": 0.01023678, + "balance_loss_clip": 1.02460408, + "balance_loss_mlp": 1.01376557, + "epoch": 0.8007816022846836, + "flos": 23185653033600.0, + "grad_norm": 2.80041831514432, + "language_loss": 0.66240084, + "learning_rate": 4.019727563597366e-07, + "loss": 0.68295097, + "num_input_tokens_seen": 287414595, + "step": 13319, + "time_per_iteration": 4.447602033615112 + }, + { + "auxiliary_loss_clip": 0.01063976, + "auxiliary_loss_mlp": 0.00747623, + "balance_loss_clip": 1.02421272, + "balance_loss_mlp": 1.00038159, + "epoch": 0.8008417255373516, + "flos": 21981388728960.0, + "grad_norm": 1.8988358708323956, + "language_loss": 0.74111277, + "learning_rate": 4.0173859831755873e-07, + "loss": 0.75922877, + "num_input_tokens_seen": 287434395, + "step": 13320, + "time_per_iteration": 2.636871576309204 + }, + { + "auxiliary_loss_clip": 0.01054798, + "auxiliary_loss_mlp": 0.01024754, + "balance_loss_clip": 1.02550924, + "balance_loss_mlp": 1.01390624, + "epoch": 0.8009018487900196, + "flos": 16727334647040.0, + "grad_norm": 1.9585476094876886, + "language_loss": 0.80099308, + "learning_rate": 4.015045008816138e-07, + "loss": 0.82178855, + "num_input_tokens_seen": 287450590, + "step": 13321, + "time_per_iteration": 2.647672176361084 + }, + { + "auxiliary_loss_clip": 0.00982393, + "auxiliary_loss_mlp": 0.01030324, + "balance_loss_clip": 1.0169189, + "balance_loss_mlp": 1.01997018, + "epoch": 0.8009619720426875, + "flos": 20813609664000.0, + "grad_norm": 1.8427617822570164, + "language_loss": 0.65919536, + "learning_rate": 4.0127046406077825e-07, + "loss": 0.6793226, + "num_input_tokens_seen": 287468455, + "step": 13322, + "time_per_iteration": 2.725735902786255 + }, + { + "auxiliary_loss_clip": 0.01051979, + "auxiliary_loss_mlp": 0.01024522, + "balance_loss_clip": 1.02482378, + "balance_loss_mlp": 1.01465154, + "epoch": 0.8010220952953555, + "flos": 17931096161280.0, + "grad_norm": 1.6793164053343417, + "language_loss": 0.77924746, + "learning_rate": 4.010364878639265e-07, + "loss": 0.80001247, + "num_input_tokens_seen": 287486485, + "step": 13323, + "time_per_iteration": 2.582599639892578 + }, + { + "auxiliary_loss_clip": 0.01064197, + "auxiliary_loss_mlp": 0.0102617, + "balance_loss_clip": 1.02504885, + "balance_loss_mlp": 1.01560235, + "epoch": 0.8010822185480234, + "flos": 24572235795840.0, + "grad_norm": 4.514130922240067, + "language_loss": 0.71371049, + "learning_rate": 4.00802572299932e-07, + "loss": 0.73461413, + "num_input_tokens_seen": 287503940, + "step": 13324, + "time_per_iteration": 2.668311357498169 + }, + { + "auxiliary_loss_clip": 0.0101863, + "auxiliary_loss_mlp": 0.01028947, + "balance_loss_clip": 1.02148914, + "balance_loss_mlp": 1.01819396, + "epoch": 0.8011423418006914, + "flos": 21829988903040.0, + "grad_norm": 1.7426191845380232, + "language_loss": 0.76512396, + "learning_rate": 4.005687173776635e-07, + "loss": 0.78559971, + "num_input_tokens_seen": 287521660, + "step": 13325, + "time_per_iteration": 2.6468982696533203 + }, + { + "auxiliary_loss_clip": 0.01036881, + "auxiliary_loss_mlp": 0.01025433, + "balance_loss_clip": 1.02172697, + "balance_loss_mlp": 1.01637268, + "epoch": 0.8012024650533593, + "flos": 23915178259200.0, + "grad_norm": 1.6923217779586, + "language_loss": 0.79646778, + "learning_rate": 4.003349231059898e-07, + "loss": 0.81709087, + "num_input_tokens_seen": 287541505, + "step": 13326, + "time_per_iteration": 2.6301307678222656 + }, + { + "auxiliary_loss_clip": 0.01050248, + "auxiliary_loss_mlp": 0.01030761, + "balance_loss_clip": 1.02400661, + "balance_loss_mlp": 1.02087224, + "epoch": 0.8012625883060274, + "flos": 23587062497280.0, + "grad_norm": 1.9816010764734244, + "language_loss": 0.66166079, + "learning_rate": 4.001011894937765e-07, + "loss": 0.6824708, + "num_input_tokens_seen": 287560015, + "step": 13327, + "time_per_iteration": 2.6090147495269775 + }, + { + "auxiliary_loss_clip": 0.01049329, + "auxiliary_loss_mlp": 0.0102741, + "balance_loss_clip": 1.02462196, + "balance_loss_mlp": 1.01779008, + "epoch": 0.8013227115586953, + "flos": 20813932886400.0, + "grad_norm": 1.7152925641570498, + "language_loss": 0.73397094, + "learning_rate": 3.9986751654988636e-07, + "loss": 0.75473833, + "num_input_tokens_seen": 287579150, + "step": 13328, + "time_per_iteration": 2.6193926334381104 + }, + { + "auxiliary_loss_clip": 0.01010434, + "auxiliary_loss_mlp": 0.01032964, + "balance_loss_clip": 1.02330065, + "balance_loss_mlp": 1.02123964, + "epoch": 0.8013828348113633, + "flos": 15888317788800.0, + "grad_norm": 2.4117272678178785, + "language_loss": 0.73769927, + "learning_rate": 3.996339042831798e-07, + "loss": 0.75813323, + "num_input_tokens_seen": 287597420, + "step": 13329, + "time_per_iteration": 2.682343006134033 + }, + { + "auxiliary_loss_clip": 0.00998177, + "auxiliary_loss_mlp": 0.01001234, + "balance_loss_clip": 1.00245523, + "balance_loss_mlp": 1.00032806, + "epoch": 0.8014429580640313, + "flos": 71062981562880.0, + "grad_norm": 0.7287629803032413, + "language_loss": 0.52944845, + "learning_rate": 3.9940035270251605e-07, + "loss": 0.54944259, + "num_input_tokens_seen": 287667280, + "step": 13330, + "time_per_iteration": 3.2506659030914307 + }, + { + "auxiliary_loss_clip": 0.01044542, + "auxiliary_loss_mlp": 0.01032992, + "balance_loss_clip": 1.02463317, + "balance_loss_mlp": 1.02098763, + "epoch": 0.8015030813166992, + "flos": 23076340968960.0, + "grad_norm": 2.008344458045579, + "language_loss": 0.7283529, + "learning_rate": 3.991668618167519e-07, + "loss": 0.74912816, + "num_input_tokens_seen": 287687375, + "step": 13331, + "time_per_iteration": 4.4990808963775635 + }, + { + "auxiliary_loss_clip": 0.01050861, + "auxiliary_loss_mlp": 0.0102379, + "balance_loss_clip": 1.02437341, + "balance_loss_mlp": 1.01465249, + "epoch": 0.8015632045693672, + "flos": 21872328059520.0, + "grad_norm": 1.833388554988262, + "language_loss": 0.77208447, + "learning_rate": 3.989334316347401e-07, + "loss": 0.79283106, + "num_input_tokens_seen": 287707895, + "step": 13332, + "time_per_iteration": 2.667879343032837 + }, + { + "auxiliary_loss_clip": 0.01063101, + "auxiliary_loss_mlp": 0.01025664, + "balance_loss_clip": 1.026196, + "balance_loss_mlp": 1.01493573, + "epoch": 0.8016233278220352, + "flos": 23656728925440.0, + "grad_norm": 1.8803038904228138, + "language_loss": 0.83138722, + "learning_rate": 3.987000621653338e-07, + "loss": 0.85227489, + "num_input_tokens_seen": 287723990, + "step": 13333, + "time_per_iteration": 2.628544330596924 + }, + { + "auxiliary_loss_clip": 0.01041495, + "auxiliary_loss_mlp": 0.01025446, + "balance_loss_clip": 1.02356672, + "balance_loss_mlp": 1.01450276, + "epoch": 0.8016834510747032, + "flos": 16253170185600.0, + "grad_norm": 1.6211821595602718, + "language_loss": 0.73685652, + "learning_rate": 3.9846675341738133e-07, + "loss": 0.75752592, + "num_input_tokens_seen": 287742380, + "step": 13334, + "time_per_iteration": 2.6111629009246826 + }, + { + "auxiliary_loss_clip": 0.0102488, + "auxiliary_loss_mlp": 0.01033163, + "balance_loss_clip": 1.02306819, + "balance_loss_mlp": 1.02185559, + "epoch": 0.8017435743273711, + "flos": 12276027665280.0, + "grad_norm": 2.6710530279921962, + "language_loss": 0.74516279, + "learning_rate": 3.9823350539972967e-07, + "loss": 0.76574314, + "num_input_tokens_seen": 287760130, + "step": 13335, + "time_per_iteration": 2.6254074573516846 + }, + { + "auxiliary_loss_clip": 0.01025145, + "auxiliary_loss_mlp": 0.01028157, + "balance_loss_clip": 1.02198458, + "balance_loss_mlp": 1.01712465, + "epoch": 0.8018036975800391, + "flos": 17196112068480.0, + "grad_norm": 1.9853892705654257, + "language_loss": 0.75584859, + "learning_rate": 3.9800031812122416e-07, + "loss": 0.77638161, + "num_input_tokens_seen": 287777565, + "step": 13336, + "time_per_iteration": 2.7096433639526367 + }, + { + "auxiliary_loss_clip": 0.01038472, + "auxiliary_loss_mlp": 0.01032997, + "balance_loss_clip": 1.02729499, + "balance_loss_mlp": 1.02175522, + "epoch": 0.801863820832707, + "flos": 20631865824000.0, + "grad_norm": 2.2365902870014147, + "language_loss": 0.75435829, + "learning_rate": 3.977671915907068e-07, + "loss": 0.77507293, + "num_input_tokens_seen": 287796310, + "step": 13337, + "time_per_iteration": 2.69486927986145 + }, + { + "auxiliary_loss_clip": 0.01011539, + "auxiliary_loss_mlp": 0.00747729, + "balance_loss_clip": 1.02616072, + "balance_loss_mlp": 1.00036597, + "epoch": 0.801923944085375, + "flos": 30445569285120.0, + "grad_norm": 1.6319622452264426, + "language_loss": 0.79933232, + "learning_rate": 3.9753412581701883e-07, + "loss": 0.81692505, + "num_input_tokens_seen": 287817330, + "step": 13338, + "time_per_iteration": 2.866793394088745 + }, + { + "auxiliary_loss_clip": 0.01024387, + "auxiliary_loss_mlp": 0.01026817, + "balance_loss_clip": 1.02148139, + "balance_loss_mlp": 1.01564145, + "epoch": 0.801984067338043, + "flos": 20010575255040.0, + "grad_norm": 1.8098104691413062, + "language_loss": 0.74500495, + "learning_rate": 3.9730112080899733e-07, + "loss": 0.765517, + "num_input_tokens_seen": 287835095, + "step": 13339, + "time_per_iteration": 2.782289981842041 + }, + { + "auxiliary_loss_clip": 0.01049072, + "auxiliary_loss_mlp": 0.010259, + "balance_loss_clip": 1.02438867, + "balance_loss_mlp": 1.01639938, + "epoch": 0.802044190590711, + "flos": 22784028088320.0, + "grad_norm": 1.5996538905535205, + "language_loss": 0.78923684, + "learning_rate": 3.970681765754775e-07, + "loss": 0.80998659, + "num_input_tokens_seen": 287854595, + "step": 13340, + "time_per_iteration": 2.6573994159698486 + }, + { + "auxiliary_loss_clip": 0.01035044, + "auxiliary_loss_mlp": 0.01026039, + "balance_loss_clip": 1.02685893, + "balance_loss_mlp": 1.01630592, + "epoch": 0.8021043138433789, + "flos": 27600115639680.0, + "grad_norm": 1.696229482284831, + "language_loss": 0.67745352, + "learning_rate": 3.968352931252936e-07, + "loss": 0.69806433, + "num_input_tokens_seen": 287876960, + "step": 13341, + "time_per_iteration": 2.699444055557251 + }, + { + "auxiliary_loss_clip": 0.00989331, + "auxiliary_loss_mlp": 0.01005149, + "balance_loss_clip": 1.0035485, + "balance_loss_mlp": 1.00419581, + "epoch": 0.8021644370960469, + "flos": 62063730057600.0, + "grad_norm": 0.8045098831352728, + "language_loss": 0.61635017, + "learning_rate": 3.9660247046727547e-07, + "loss": 0.63629496, + "num_input_tokens_seen": 287936530, + "step": 13342, + "time_per_iteration": 3.162923574447632 + }, + { + "auxiliary_loss_clip": 0.01045245, + "auxiliary_loss_mlp": 0.01031028, + "balance_loss_clip": 1.02703953, + "balance_loss_mlp": 1.01935172, + "epoch": 0.8022245603487148, + "flos": 23361794352000.0, + "grad_norm": 2.246980604003997, + "language_loss": 0.64035219, + "learning_rate": 3.963697086102522e-07, + "loss": 0.66111493, + "num_input_tokens_seen": 287954285, + "step": 13343, + "time_per_iteration": 2.7072277069091797 + }, + { + "auxiliary_loss_clip": 0.01038627, + "auxiliary_loss_mlp": 0.01022481, + "balance_loss_clip": 1.02394974, + "balance_loss_mlp": 1.01335597, + "epoch": 0.8022846836013828, + "flos": 10853354712960.0, + "grad_norm": 1.973988675363026, + "language_loss": 0.68707258, + "learning_rate": 3.96137007563051e-07, + "loss": 0.70768368, + "num_input_tokens_seen": 287971595, + "step": 13344, + "time_per_iteration": 2.744814157485962 + }, + { + "auxiliary_loss_clip": 0.01053027, + "auxiliary_loss_mlp": 0.01023956, + "balance_loss_clip": 1.02575862, + "balance_loss_mlp": 1.01403785, + "epoch": 0.8023448068540509, + "flos": 29240443054080.0, + "grad_norm": 1.7009778291084976, + "language_loss": 0.70806766, + "learning_rate": 3.9590436733449506e-07, + "loss": 0.72883749, + "num_input_tokens_seen": 287992540, + "step": 13345, + "time_per_iteration": 2.7082536220550537 + }, + { + "auxiliary_loss_clip": 0.00986717, + "auxiliary_loss_mlp": 0.01001888, + "balance_loss_clip": 1.00145125, + "balance_loss_mlp": 1.00096405, + "epoch": 0.8024049301067188, + "flos": 64153588181760.0, + "grad_norm": 0.8769780861295274, + "language_loss": 0.62991357, + "learning_rate": 3.956717879334059e-07, + "loss": 0.64979959, + "num_input_tokens_seen": 288052810, + "step": 13346, + "time_per_iteration": 3.3222081661224365 + }, + { + "auxiliary_loss_clip": 0.01042587, + "auxiliary_loss_mlp": 0.0103034, + "balance_loss_clip": 1.02588582, + "balance_loss_mlp": 1.01993883, + "epoch": 0.8024650533593868, + "flos": 28585360765440.0, + "grad_norm": 2.0729242745674155, + "language_loss": 0.72516406, + "learning_rate": 3.9543926936860327e-07, + "loss": 0.7458933, + "num_input_tokens_seen": 288073045, + "step": 13347, + "time_per_iteration": 2.6522529125213623 + }, + { + "auxiliary_loss_clip": 0.01053955, + "auxiliary_loss_mlp": 0.01027784, + "balance_loss_clip": 1.02595949, + "balance_loss_mlp": 1.0168941, + "epoch": 0.8025251766120547, + "flos": 16982264448000.0, + "grad_norm": 2.9481008323265816, + "language_loss": 0.72863638, + "learning_rate": 3.9520681164890493e-07, + "loss": 0.74945378, + "num_input_tokens_seen": 288091165, + "step": 13348, + "time_per_iteration": 2.5718119144439697 + }, + { + "auxiliary_loss_clip": 0.01033596, + "auxiliary_loss_mlp": 0.01024482, + "balance_loss_clip": 1.02566624, + "balance_loss_mlp": 1.01455784, + "epoch": 0.8025852998647227, + "flos": 22163671272960.0, + "grad_norm": 1.5988267081245635, + "language_loss": 0.75669914, + "learning_rate": 3.9497441478312444e-07, + "loss": 0.77727997, + "num_input_tokens_seen": 288110595, + "step": 13349, + "time_per_iteration": 4.310225248336792 + }, + { + "auxiliary_loss_clip": 0.01064163, + "auxiliary_loss_mlp": 0.01033056, + "balance_loss_clip": 1.02687252, + "balance_loss_mlp": 1.02337074, + "epoch": 0.8026454231173906, + "flos": 22017012042240.0, + "grad_norm": 3.346045919809628, + "language_loss": 0.83641827, + "learning_rate": 3.947420787800755e-07, + "loss": 0.85739052, + "num_input_tokens_seen": 288128995, + "step": 13350, + "time_per_iteration": 4.1491405963897705 + }, + { + "auxiliary_loss_clip": 0.01057031, + "auxiliary_loss_mlp": 0.01034251, + "balance_loss_clip": 1.02957141, + "balance_loss_mlp": 1.02404094, + "epoch": 0.8027055463700586, + "flos": 22491320158080.0, + "grad_norm": 1.6540720640242486, + "language_loss": 0.71066809, + "learning_rate": 3.945098036485679e-07, + "loss": 0.73158091, + "num_input_tokens_seen": 288149265, + "step": 13351, + "time_per_iteration": 2.671844482421875 + }, + { + "auxiliary_loss_clip": 0.01019266, + "auxiliary_loss_mlp": 0.01023667, + "balance_loss_clip": 1.02183425, + "balance_loss_mlp": 1.01336122, + "epoch": 0.8027656696227266, + "flos": 28912901909760.0, + "grad_norm": 2.465736749499154, + "language_loss": 0.62026209, + "learning_rate": 3.9427758939740885e-07, + "loss": 0.6406914, + "num_input_tokens_seen": 288170745, + "step": 13352, + "time_per_iteration": 2.723675012588501 + }, + { + "auxiliary_loss_clip": 0.01052183, + "auxiliary_loss_mlp": 0.01029882, + "balance_loss_clip": 1.0258683, + "balance_loss_mlp": 1.01983893, + "epoch": 0.8028257928753946, + "flos": 18589374760320.0, + "grad_norm": 1.85611919385589, + "language_loss": 0.7684747, + "learning_rate": 3.940454360354046e-07, + "loss": 0.78929532, + "num_input_tokens_seen": 288189415, + "step": 13353, + "time_per_iteration": 2.6002986431121826 + }, + { + "auxiliary_loss_clip": 0.01014783, + "auxiliary_loss_mlp": 0.01031448, + "balance_loss_clip": 1.02536774, + "balance_loss_mlp": 1.01942563, + "epoch": 0.8028859161280625, + "flos": 19130009339520.0, + "grad_norm": 2.075590351560145, + "language_loss": 0.73480225, + "learning_rate": 3.938133435713582e-07, + "loss": 0.75526464, + "num_input_tokens_seen": 288206900, + "step": 13354, + "time_per_iteration": 2.7871155738830566 + }, + { + "auxiliary_loss_clip": 0.01026924, + "auxiliary_loss_mlp": 0.01033785, + "balance_loss_clip": 1.02361083, + "balance_loss_mlp": 1.02262688, + "epoch": 0.8029460393807305, + "flos": 20229881742720.0, + "grad_norm": 1.8836465137573477, + "language_loss": 0.65706968, + "learning_rate": 3.935813120140714e-07, + "loss": 0.6776768, + "num_input_tokens_seen": 288224800, + "step": 13355, + "time_per_iteration": 2.637511968612671 + }, + { + "auxiliary_loss_clip": 0.01022056, + "auxiliary_loss_mlp": 0.01031807, + "balance_loss_clip": 1.02136827, + "balance_loss_mlp": 1.01996946, + "epoch": 0.8030061626333984, + "flos": 49783320933120.0, + "grad_norm": 4.120722558469588, + "language_loss": 0.69157219, + "learning_rate": 3.9334934137234235e-07, + "loss": 0.71211088, + "num_input_tokens_seen": 288249400, + "step": 13356, + "time_per_iteration": 2.9586033821105957 + }, + { + "auxiliary_loss_clip": 0.01025391, + "auxiliary_loss_mlp": 0.01028102, + "balance_loss_clip": 1.02730918, + "balance_loss_mlp": 1.01762342, + "epoch": 0.8030662858860664, + "flos": 21615243442560.0, + "grad_norm": 1.4706702991153677, + "language_loss": 0.77255082, + "learning_rate": 3.931174316549666e-07, + "loss": 0.79308575, + "num_input_tokens_seen": 288268780, + "step": 13357, + "time_per_iteration": 2.690027952194214 + }, + { + "auxiliary_loss_clip": 0.01026705, + "auxiliary_loss_mlp": 0.01029951, + "balance_loss_clip": 1.02209783, + "balance_loss_mlp": 1.01860785, + "epoch": 0.8031264091387345, + "flos": 25630056351360.0, + "grad_norm": 1.389743861821363, + "language_loss": 0.76944643, + "learning_rate": 3.9288558287073937e-07, + "loss": 0.79001296, + "num_input_tokens_seen": 288290830, + "step": 13358, + "time_per_iteration": 2.707869529724121 + }, + { + "auxiliary_loss_clip": 0.01050278, + "auxiliary_loss_mlp": 0.01029642, + "balance_loss_clip": 1.02361834, + "balance_loss_mlp": 1.01947963, + "epoch": 0.8031865323914024, + "flos": 19646225648640.0, + "grad_norm": 1.9057762133938383, + "language_loss": 0.84409559, + "learning_rate": 3.9265379502845143e-07, + "loss": 0.86489475, + "num_input_tokens_seen": 288308865, + "step": 13359, + "time_per_iteration": 2.6129813194274902 + }, + { + "auxiliary_loss_clip": 0.01042782, + "auxiliary_loss_mlp": 0.01025499, + "balance_loss_clip": 1.02643299, + "balance_loss_mlp": 1.0159502, + "epoch": 0.8032466556440704, + "flos": 26169110732160.0, + "grad_norm": 1.771463500214766, + "language_loss": 0.73186827, + "learning_rate": 3.924220681368928e-07, + "loss": 0.75255108, + "num_input_tokens_seen": 288327325, + "step": 13360, + "time_per_iteration": 2.7521393299102783 + }, + { + "auxiliary_loss_clip": 0.01062878, + "auxiliary_loss_mlp": 0.0102372, + "balance_loss_clip": 1.02523446, + "balance_loss_mlp": 1.01359284, + "epoch": 0.8033067788967383, + "flos": 25520026014720.0, + "grad_norm": 1.7380602309379922, + "language_loss": 0.69609427, + "learning_rate": 3.921904022048512e-07, + "loss": 0.71696019, + "num_input_tokens_seen": 288347285, + "step": 13361, + "time_per_iteration": 2.6219091415405273 + }, + { + "auxiliary_loss_clip": 0.01065369, + "auxiliary_loss_mlp": 0.01032943, + "balance_loss_clip": 1.02595329, + "balance_loss_mlp": 1.02183843, + "epoch": 0.8033669021494063, + "flos": 24024274842240.0, + "grad_norm": 1.7789809946820734, + "language_loss": 0.70362449, + "learning_rate": 3.919587972411098e-07, + "loss": 0.72460765, + "num_input_tokens_seen": 288367785, + "step": 13362, + "time_per_iteration": 2.576556444168091 + }, + { + "auxiliary_loss_clip": 0.01069018, + "auxiliary_loss_mlp": 0.01033688, + "balance_loss_clip": 1.0273031, + "balance_loss_mlp": 1.02143919, + "epoch": 0.8034270254020742, + "flos": 13588059749760.0, + "grad_norm": 2.336600983714033, + "language_loss": 0.78165048, + "learning_rate": 3.91727253254452e-07, + "loss": 0.80267751, + "num_input_tokens_seen": 288384135, + "step": 13363, + "time_per_iteration": 2.5237040519714355 + }, + { + "auxiliary_loss_clip": 0.01050744, + "auxiliary_loss_mlp": 0.01030174, + "balance_loss_clip": 1.02390075, + "balance_loss_mlp": 1.01962447, + "epoch": 0.8034871486547422, + "flos": 27412661537280.0, + "grad_norm": 2.6178597852943755, + "language_loss": 0.74613529, + "learning_rate": 3.9149577025365787e-07, + "loss": 0.76694453, + "num_input_tokens_seen": 288403805, + "step": 13364, + "time_per_iteration": 2.6532602310180664 + }, + { + "auxiliary_loss_clip": 0.01055248, + "auxiliary_loss_mlp": 0.01027148, + "balance_loss_clip": 1.02818751, + "balance_loss_mlp": 1.01767087, + "epoch": 0.8035472719074102, + "flos": 32598593475840.0, + "grad_norm": 1.9094296923470648, + "language_loss": 0.60295969, + "learning_rate": 3.9126434824750596e-07, + "loss": 0.62378359, + "num_input_tokens_seen": 288424895, + "step": 13365, + "time_per_iteration": 2.68400239944458 + }, + { + "auxiliary_loss_clip": 0.01038812, + "auxiliary_loss_mlp": 0.01032402, + "balance_loss_clip": 1.02250838, + "balance_loss_mlp": 1.02105951, + "epoch": 0.8036073951600782, + "flos": 21287989607040.0, + "grad_norm": 2.35283086700854, + "language_loss": 0.66222441, + "learning_rate": 3.910329872447706e-07, + "loss": 0.68293655, + "num_input_tokens_seen": 288443865, + "step": 13366, + "time_per_iteration": 2.6530842781066895 + }, + { + "auxiliary_loss_clip": 0.01060607, + "auxiliary_loss_mlp": 0.0102739, + "balance_loss_clip": 1.02453494, + "balance_loss_mlp": 1.01732874, + "epoch": 0.8036675184127461, + "flos": 18113845582080.0, + "grad_norm": 2.2112842173680245, + "language_loss": 0.74801958, + "learning_rate": 3.908016872542259e-07, + "loss": 0.76889944, + "num_input_tokens_seen": 288461065, + "step": 13367, + "time_per_iteration": 4.250295639038086 + }, + { + "auxiliary_loss_clip": 0.01060283, + "auxiliary_loss_mlp": 0.01023315, + "balance_loss_clip": 1.0241884, + "balance_loss_mlp": 1.01338434, + "epoch": 0.8037276416654141, + "flos": 26030280666240.0, + "grad_norm": 1.8988792000386854, + "language_loss": 0.74145305, + "learning_rate": 3.905704482846428e-07, + "loss": 0.76228899, + "num_input_tokens_seen": 288481865, + "step": 13368, + "time_per_iteration": 2.69397234916687 + }, + { + "auxiliary_loss_clip": 0.01064497, + "auxiliary_loss_mlp": 0.01030078, + "balance_loss_clip": 1.02604461, + "balance_loss_mlp": 1.01926589, + "epoch": 0.803787764918082, + "flos": 18802180886400.0, + "grad_norm": 2.015729419418405, + "language_loss": 0.69950712, + "learning_rate": 3.90339270344789e-07, + "loss": 0.7204529, + "num_input_tokens_seen": 288499345, + "step": 13369, + "time_per_iteration": 2.5981533527374268 + }, + { + "auxiliary_loss_clip": 0.01041583, + "auxiliary_loss_mlp": 0.01027191, + "balance_loss_clip": 1.02432215, + "balance_loss_mlp": 1.01779735, + "epoch": 0.80384788817075, + "flos": 20225787592320.0, + "grad_norm": 1.7397568942262318, + "language_loss": 0.73894149, + "learning_rate": 3.901081534434312e-07, + "loss": 0.75962919, + "num_input_tokens_seen": 288517660, + "step": 13370, + "time_per_iteration": 2.6800429821014404 + }, + { + "auxiliary_loss_clip": 0.0103811, + "auxiliary_loss_mlp": 0.01032095, + "balance_loss_clip": 1.02297366, + "balance_loss_mlp": 1.02046585, + "epoch": 0.8039080114234181, + "flos": 18515290959360.0, + "grad_norm": 2.3924899125943027, + "language_loss": 0.87062138, + "learning_rate": 3.898770975893342e-07, + "loss": 0.89132333, + "num_input_tokens_seen": 288534180, + "step": 13371, + "time_per_iteration": 2.6247122287750244 + }, + { + "auxiliary_loss_clip": 0.01054228, + "auxiliary_loss_mlp": 0.01032135, + "balance_loss_clip": 1.02367806, + "balance_loss_mlp": 1.02072716, + "epoch": 0.803968134676086, + "flos": 22382510883840.0, + "grad_norm": 1.9780974800146054, + "language_loss": 0.74674094, + "learning_rate": 3.89646102791259e-07, + "loss": 0.76760453, + "num_input_tokens_seen": 288553350, + "step": 13372, + "time_per_iteration": 2.6015732288360596 + }, + { + "auxiliary_loss_clip": 0.01024421, + "auxiliary_loss_mlp": 0.01028314, + "balance_loss_clip": 1.0259465, + "balance_loss_mlp": 1.01674509, + "epoch": 0.804028257928754, + "flos": 23842566915840.0, + "grad_norm": 1.891418493611005, + "language_loss": 0.7874732, + "learning_rate": 3.894151690579646e-07, + "loss": 0.80800056, + "num_input_tokens_seen": 288571325, + "step": 13373, + "time_per_iteration": 2.785490036010742 + }, + { + "auxiliary_loss_clip": 0.01032136, + "auxiliary_loss_mlp": 0.01032127, + "balance_loss_clip": 1.02160943, + "balance_loss_mlp": 1.02182746, + "epoch": 0.8040883811814219, + "flos": 23550720912000.0, + "grad_norm": 1.470423249600706, + "language_loss": 0.74487239, + "learning_rate": 3.8918429639820815e-07, + "loss": 0.76551509, + "num_input_tokens_seen": 288592100, + "step": 13374, + "time_per_iteration": 2.673828125 + }, + { + "auxiliary_loss_clip": 0.01011606, + "auxiliary_loss_mlp": 0.01035943, + "balance_loss_clip": 1.01981044, + "balance_loss_mlp": 1.02364063, + "epoch": 0.8041485044340899, + "flos": 19026263882880.0, + "grad_norm": 1.9033586233873845, + "language_loss": 0.6852451, + "learning_rate": 3.889534848207452e-07, + "loss": 0.70572054, + "num_input_tokens_seen": 288612305, + "step": 13375, + "time_per_iteration": 2.727635145187378 + }, + { + "auxiliary_loss_clip": 0.0098196, + "auxiliary_loss_mlp": 0.01000923, + "balance_loss_clip": 1.00578022, + "balance_loss_mlp": 1.00013673, + "epoch": 0.8042086276867578, + "flos": 70005663797760.0, + "grad_norm": 0.722183118348265, + "language_loss": 0.55667388, + "learning_rate": 3.887227343343271e-07, + "loss": 0.57650268, + "num_input_tokens_seen": 288676015, + "step": 13376, + "time_per_iteration": 3.383531093597412 + }, + { + "auxiliary_loss_clip": 0.01006022, + "auxiliary_loss_mlp": 0.01030508, + "balance_loss_clip": 1.02066445, + "balance_loss_mlp": 1.01872444, + "epoch": 0.8042687509394258, + "flos": 21872435800320.0, + "grad_norm": 3.675584394733518, + "language_loss": 0.73027527, + "learning_rate": 3.8849204494770425e-07, + "loss": 0.75064057, + "num_input_tokens_seen": 288696455, + "step": 13377, + "time_per_iteration": 2.7443931102752686 + }, + { + "auxiliary_loss_clip": 0.0105045, + "auxiliary_loss_mlp": 0.01028119, + "balance_loss_clip": 1.02308917, + "balance_loss_mlp": 1.01725316, + "epoch": 0.8043288741920938, + "flos": 26614870513920.0, + "grad_norm": 2.1207472752637355, + "language_loss": 0.70258462, + "learning_rate": 3.8826141666962567e-07, + "loss": 0.72337031, + "num_input_tokens_seen": 288715560, + "step": 13378, + "time_per_iteration": 2.6892876625061035 + }, + { + "auxiliary_loss_clip": 0.01052147, + "auxiliary_loss_mlp": 0.01024646, + "balance_loss_clip": 1.0241878, + "balance_loss_mlp": 1.01409602, + "epoch": 0.8043889974447618, + "flos": 33403387651200.0, + "grad_norm": 1.4062174675361954, + "language_loss": 0.69418061, + "learning_rate": 3.880308495088347e-07, + "loss": 0.71494859, + "num_input_tokens_seen": 288739485, + "step": 13379, + "time_per_iteration": 4.502264499664307 + }, + { + "auxiliary_loss_clip": 0.01066236, + "auxiliary_loss_mlp": 0.01029922, + "balance_loss_clip": 1.02657962, + "balance_loss_mlp": 1.01812613, + "epoch": 0.8044491206974297, + "flos": 20375966355840.0, + "grad_norm": 1.6185790531501842, + "language_loss": 0.76428413, + "learning_rate": 3.8780034347407533e-07, + "loss": 0.78524572, + "num_input_tokens_seen": 288757420, + "step": 13380, + "time_per_iteration": 2.543252944946289 + }, + { + "auxiliary_loss_clip": 0.01013899, + "auxiliary_loss_mlp": 0.01024796, + "balance_loss_clip": 1.02119637, + "balance_loss_mlp": 1.01482356, + "epoch": 0.8045092439500977, + "flos": 23403810286080.0, + "grad_norm": 1.789927027014696, + "language_loss": 0.69220012, + "learning_rate": 3.875698985740887e-07, + "loss": 0.71258712, + "num_input_tokens_seen": 288775535, + "step": 13381, + "time_per_iteration": 2.7130002975463867 + }, + { + "auxiliary_loss_clip": 0.0105473, + "auxiliary_loss_mlp": 0.01032537, + "balance_loss_clip": 1.02645493, + "balance_loss_mlp": 1.02150416, + "epoch": 0.8045693672027656, + "flos": 24097245321600.0, + "grad_norm": 1.8029835777976053, + "language_loss": 0.63922501, + "learning_rate": 3.873395148176135e-07, + "loss": 0.66009766, + "num_input_tokens_seen": 288795035, + "step": 13382, + "time_per_iteration": 2.6573448181152344 + }, + { + "auxiliary_loss_clip": 0.01042439, + "auxiliary_loss_mlp": 0.01031765, + "balance_loss_clip": 1.02607036, + "balance_loss_mlp": 1.02253211, + "epoch": 0.8046294904554336, + "flos": 27707165147520.0, + "grad_norm": 2.1837583267840706, + "language_loss": 0.76780713, + "learning_rate": 3.8710919221338487e-07, + "loss": 0.78854918, + "num_input_tokens_seen": 288816270, + "step": 13383, + "time_per_iteration": 2.7489547729492188 + }, + { + "auxiliary_loss_clip": 0.0104689, + "auxiliary_loss_mlp": 0.01031598, + "balance_loss_clip": 1.02373087, + "balance_loss_mlp": 1.02024913, + "epoch": 0.8046896137081017, + "flos": 24972998814720.0, + "grad_norm": 2.0107718489612854, + "language_loss": 0.6988548, + "learning_rate": 3.868789307701381e-07, + "loss": 0.71963966, + "num_input_tokens_seen": 288836050, + "step": 13384, + "time_per_iteration": 2.658719301223755 + }, + { + "auxiliary_loss_clip": 0.01051819, + "auxiliary_loss_mlp": 0.01033702, + "balance_loss_clip": 1.02270508, + "balance_loss_mlp": 1.02237749, + "epoch": 0.8047497369607696, + "flos": 17675484001920.0, + "grad_norm": 5.1196686623988406, + "language_loss": 0.79554307, + "learning_rate": 3.8664873049660375e-07, + "loss": 0.81639826, + "num_input_tokens_seen": 288852900, + "step": 13385, + "time_per_iteration": 2.60368275642395 + }, + { + "auxiliary_loss_clip": 0.01062619, + "auxiliary_loss_mlp": 0.01029225, + "balance_loss_clip": 1.02496004, + "balance_loss_mlp": 1.01843655, + "epoch": 0.8048098602134376, + "flos": 22382079920640.0, + "grad_norm": 1.823699118782922, + "language_loss": 0.71945286, + "learning_rate": 3.864185914015108e-07, + "loss": 0.74037135, + "num_input_tokens_seen": 288872625, + "step": 13386, + "time_per_iteration": 2.606811761856079 + }, + { + "auxiliary_loss_clip": 0.00974716, + "auxiliary_loss_mlp": 0.01001309, + "balance_loss_clip": 1.00183153, + "balance_loss_mlp": 1.00045109, + "epoch": 0.8048699834661055, + "flos": 71200949702400.0, + "grad_norm": 0.6698479269212256, + "language_loss": 0.51230276, + "learning_rate": 3.861885134935865e-07, + "loss": 0.53206301, + "num_input_tokens_seen": 288939180, + "step": 13387, + "time_per_iteration": 3.317103624343872 + }, + { + "auxiliary_loss_clip": 0.01062806, + "auxiliary_loss_mlp": 0.01030196, + "balance_loss_clip": 1.02482569, + "balance_loss_mlp": 1.01835215, + "epoch": 0.8049301067187735, + "flos": 23660320285440.0, + "grad_norm": 1.8854306319976328, + "language_loss": 0.74007136, + "learning_rate": 3.859584967815559e-07, + "loss": 0.76100135, + "num_input_tokens_seen": 288958925, + "step": 13388, + "time_per_iteration": 2.592047691345215 + }, + { + "auxiliary_loss_clip": 0.01034136, + "auxiliary_loss_mlp": 0.01024166, + "balance_loss_clip": 1.02640963, + "balance_loss_mlp": 1.01417589, + "epoch": 0.8049902299714414, + "flos": 24426330750720.0, + "grad_norm": 1.5695581546261184, + "language_loss": 0.71654904, + "learning_rate": 3.857285412741411e-07, + "loss": 0.73713207, + "num_input_tokens_seen": 288980935, + "step": 13389, + "time_per_iteration": 2.730541467666626 + }, + { + "auxiliary_loss_clip": 0.01045931, + "auxiliary_loss_mlp": 0.01037072, + "balance_loss_clip": 1.02771735, + "balance_loss_mlp": 1.02587235, + "epoch": 0.8050503532241094, + "flos": 17492626840320.0, + "grad_norm": 2.1229078351337987, + "language_loss": 0.82975656, + "learning_rate": 3.8549864698006097e-07, + "loss": 0.85058659, + "num_input_tokens_seen": 288996780, + "step": 13390, + "time_per_iteration": 2.6459197998046875 + }, + { + "auxiliary_loss_clip": 0.00995579, + "auxiliary_loss_mlp": 0.01003099, + "balance_loss_clip": 1.00083053, + "balance_loss_mlp": 1.00225246, + "epoch": 0.8051104764767774, + "flos": 57658030369920.0, + "grad_norm": 0.7900714615949621, + "language_loss": 0.5551098, + "learning_rate": 3.8526881390803424e-07, + "loss": 0.57509661, + "num_input_tokens_seen": 289057590, + "step": 13391, + "time_per_iteration": 3.2433431148529053 + }, + { + "auxiliary_loss_clip": 0.01050355, + "auxiliary_loss_mlp": 0.01026765, + "balance_loss_clip": 1.02498031, + "balance_loss_mlp": 1.01657224, + "epoch": 0.8051705997294454, + "flos": 18003456109440.0, + "grad_norm": 1.520381622996144, + "language_loss": 0.84492481, + "learning_rate": 3.850390420667762e-07, + "loss": 0.86569601, + "num_input_tokens_seen": 289076285, + "step": 13392, + "time_per_iteration": 2.6668503284454346 + }, + { + "auxiliary_loss_clip": 0.01029329, + "auxiliary_loss_mlp": 0.01027397, + "balance_loss_clip": 1.02215159, + "balance_loss_mlp": 1.01708555, + "epoch": 0.8052307229821133, + "flos": 26397754755840.0, + "grad_norm": 1.473677305800731, + "language_loss": 0.70129782, + "learning_rate": 3.8480933146499914e-07, + "loss": 0.72186506, + "num_input_tokens_seen": 289097585, + "step": 13393, + "time_per_iteration": 2.7510170936584473 + }, + { + "auxiliary_loss_clip": 0.01053847, + "auxiliary_loss_mlp": 0.01027662, + "balance_loss_clip": 1.02578783, + "balance_loss_mlp": 1.01704001, + "epoch": 0.8052908462347813, + "flos": 21757018423680.0, + "grad_norm": 1.8439838123472863, + "language_loss": 0.76302934, + "learning_rate": 3.84579682111414e-07, + "loss": 0.78384441, + "num_input_tokens_seen": 289116890, + "step": 13394, + "time_per_iteration": 2.7256715297698975 + }, + { + "auxiliary_loss_clip": 0.01065618, + "auxiliary_loss_mlp": 0.01028829, + "balance_loss_clip": 1.02765989, + "balance_loss_mlp": 1.01895261, + "epoch": 0.8053509694874492, + "flos": 25442279026560.0, + "grad_norm": 1.703191826603364, + "language_loss": 0.65030855, + "learning_rate": 3.843500940147304e-07, + "loss": 0.67125309, + "num_input_tokens_seen": 289136670, + "step": 13395, + "time_per_iteration": 2.632227897644043 + }, + { + "auxiliary_loss_clip": 0.00996392, + "auxiliary_loss_mlp": 0.01004492, + "balance_loss_clip": 1.0012641, + "balance_loss_mlp": 1.00365186, + "epoch": 0.8054110927401172, + "flos": 57668122091520.0, + "grad_norm": 0.7559796615980958, + "language_loss": 0.57389456, + "learning_rate": 3.8412056718365206e-07, + "loss": 0.59390342, + "num_input_tokens_seen": 289200150, + "step": 13396, + "time_per_iteration": 4.8928062915802 + }, + { + "auxiliary_loss_clip": 0.01052998, + "auxiliary_loss_mlp": 0.01032212, + "balance_loss_clip": 1.0254966, + "balance_loss_mlp": 1.02106047, + "epoch": 0.8054712159927853, + "flos": 19276201693440.0, + "grad_norm": 1.6649076400408058, + "language_loss": 0.77413261, + "learning_rate": 3.8389110162688353e-07, + "loss": 0.7949847, + "num_input_tokens_seen": 289218125, + "step": 13397, + "time_per_iteration": 2.7595465183258057 + }, + { + "auxiliary_loss_clip": 0.01054952, + "auxiliary_loss_mlp": 0.01024613, + "balance_loss_clip": 1.02857709, + "balance_loss_mlp": 1.01455188, + "epoch": 0.8055313392454532, + "flos": 17967617314560.0, + "grad_norm": 1.8420126782267487, + "language_loss": 0.70261538, + "learning_rate": 3.836616973531266e-07, + "loss": 0.72341102, + "num_input_tokens_seen": 289237115, + "step": 13398, + "time_per_iteration": 4.195930004119873 + }, + { + "auxiliary_loss_clip": 0.01040892, + "auxiliary_loss_mlp": 0.01029707, + "balance_loss_clip": 1.02433372, + "balance_loss_mlp": 1.02014637, + "epoch": 0.8055914624981212, + "flos": 13478352635520.0, + "grad_norm": 2.118561675426086, + "language_loss": 0.6900447, + "learning_rate": 3.834323543710805e-07, + "loss": 0.71075076, + "num_input_tokens_seen": 289253635, + "step": 13399, + "time_per_iteration": 2.6280996799468994 + }, + { + "auxiliary_loss_clip": 0.01062744, + "auxiliary_loss_mlp": 0.0103058, + "balance_loss_clip": 1.02589428, + "balance_loss_mlp": 1.02091765, + "epoch": 0.8056515857507891, + "flos": 13224787551360.0, + "grad_norm": 2.541858788930348, + "language_loss": 0.72126424, + "learning_rate": 3.8320307268944153e-07, + "loss": 0.74219751, + "num_input_tokens_seen": 289270085, + "step": 13400, + "time_per_iteration": 2.574489116668701 + }, + { + "auxiliary_loss_clip": 0.01047957, + "auxiliary_loss_mlp": 0.0102992, + "balance_loss_clip": 1.02209246, + "balance_loss_mlp": 1.01975739, + "epoch": 0.8057117090034571, + "flos": 23878190229120.0, + "grad_norm": 1.7191086766175883, + "language_loss": 0.63831341, + "learning_rate": 3.829738523169037e-07, + "loss": 0.65909219, + "num_input_tokens_seen": 289289645, + "step": 13401, + "time_per_iteration": 2.5878353118896484 + }, + { + "auxiliary_loss_clip": 0.0104994, + "auxiliary_loss_mlp": 0.01027841, + "balance_loss_clip": 1.02350581, + "balance_loss_mlp": 1.01760709, + "epoch": 0.805771832256125, + "flos": 21214300855680.0, + "grad_norm": 2.922864404715402, + "language_loss": 0.83880067, + "learning_rate": 3.8274469326215985e-07, + "loss": 0.85957849, + "num_input_tokens_seen": 289306630, + "step": 13402, + "time_per_iteration": 2.6031200885772705 + }, + { + "auxiliary_loss_clip": 0.01020464, + "auxiliary_loss_mlp": 0.01030478, + "balance_loss_clip": 1.02508688, + "balance_loss_mlp": 1.02012444, + "epoch": 0.805831955508793, + "flos": 17566818382080.0, + "grad_norm": 1.7449903612922983, + "language_loss": 0.67834806, + "learning_rate": 3.8251559553389876e-07, + "loss": 0.69885749, + "num_input_tokens_seen": 289324960, + "step": 13403, + "time_per_iteration": 2.770642042160034 + }, + { + "auxiliary_loss_clip": 0.01023837, + "auxiliary_loss_mlp": 0.00747236, + "balance_loss_clip": 1.02384615, + "balance_loss_mlp": 1.00030696, + "epoch": 0.805892078761461, + "flos": 26907542530560.0, + "grad_norm": 1.546826598495259, + "language_loss": 0.84714949, + "learning_rate": 3.822865591408084e-07, + "loss": 0.86486018, + "num_input_tokens_seen": 289344980, + "step": 13404, + "time_per_iteration": 2.7415783405303955 + }, + { + "auxiliary_loss_clip": 0.01020372, + "auxiliary_loss_mlp": 0.0102736, + "balance_loss_clip": 1.02341521, + "balance_loss_mlp": 1.01799655, + "epoch": 0.805952202014129, + "flos": 31506442496640.0, + "grad_norm": 2.6386679667418034, + "language_loss": 0.70158136, + "learning_rate": 3.820575840915743e-07, + "loss": 0.72205877, + "num_input_tokens_seen": 289367500, + "step": 13405, + "time_per_iteration": 2.776188611984253 + }, + { + "auxiliary_loss_clip": 0.01051315, + "auxiliary_loss_mlp": 0.01025422, + "balance_loss_clip": 1.02429795, + "balance_loss_mlp": 1.01559949, + "epoch": 0.8060123252667969, + "flos": 24389953251840.0, + "grad_norm": 2.3686434434781414, + "language_loss": 0.75464612, + "learning_rate": 3.818286703948788e-07, + "loss": 0.77541351, + "num_input_tokens_seen": 289385930, + "step": 13406, + "time_per_iteration": 2.640559196472168 + }, + { + "auxiliary_loss_clip": 0.01056006, + "auxiliary_loss_mlp": 0.01035419, + "balance_loss_clip": 1.02708673, + "balance_loss_mlp": 1.02427351, + "epoch": 0.8060724485194649, + "flos": 23479941162240.0, + "grad_norm": 5.724510176453215, + "language_loss": 0.76270837, + "learning_rate": 3.815998180594018e-07, + "loss": 0.78362262, + "num_input_tokens_seen": 289408025, + "step": 13407, + "time_per_iteration": 2.6250312328338623 + }, + { + "auxiliary_loss_clip": 0.0103202, + "auxiliary_loss_mlp": 0.00747566, + "balance_loss_clip": 1.02140856, + "balance_loss_mlp": 1.00035262, + "epoch": 0.8061325717721328, + "flos": 18624495283200.0, + "grad_norm": 1.6246064593841798, + "language_loss": 0.73624551, + "learning_rate": 3.81371027093822e-07, + "loss": 0.75404137, + "num_input_tokens_seen": 289426575, + "step": 13408, + "time_per_iteration": 2.744319438934326 + }, + { + "auxiliary_loss_clip": 0.01035348, + "auxiliary_loss_mlp": 0.01028873, + "balance_loss_clip": 1.02282524, + "balance_loss_mlp": 1.01754808, + "epoch": 0.8061926950248008, + "flos": 23582752865280.0, + "grad_norm": 1.8339936286816516, + "language_loss": 0.70508909, + "learning_rate": 3.8114229750681523e-07, + "loss": 0.72573131, + "num_input_tokens_seen": 289447760, + "step": 13409, + "time_per_iteration": 2.717334747314453 + }, + { + "auxiliary_loss_clip": 0.01061504, + "auxiliary_loss_mlp": 0.0102389, + "balance_loss_clip": 1.02365756, + "balance_loss_mlp": 1.01334572, + "epoch": 0.8062528182774689, + "flos": 11143333209600.0, + "grad_norm": 2.979597048854504, + "language_loss": 0.76527667, + "learning_rate": 3.809136293070545e-07, + "loss": 0.78613061, + "num_input_tokens_seen": 289463920, + "step": 13410, + "time_per_iteration": 2.526111364364624 + }, + { + "auxiliary_loss_clip": 0.01052514, + "auxiliary_loss_mlp": 0.01030244, + "balance_loss_clip": 1.02561688, + "balance_loss_mlp": 1.02014136, + "epoch": 0.8063129415301368, + "flos": 22346815743360.0, + "grad_norm": 1.6747725251109815, + "language_loss": 0.68407583, + "learning_rate": 3.806850225032117e-07, + "loss": 0.70490336, + "num_input_tokens_seen": 289482635, + "step": 13411, + "time_per_iteration": 2.618713140487671 + }, + { + "auxiliary_loss_clip": 0.01033058, + "auxiliary_loss_mlp": 0.01025433, + "balance_loss_clip": 1.02352214, + "balance_loss_mlp": 1.01503217, + "epoch": 0.8063730647828048, + "flos": 23988400133760.0, + "grad_norm": 1.625766612664839, + "language_loss": 0.68209463, + "learning_rate": 3.804564771039551e-07, + "loss": 0.70267951, + "num_input_tokens_seen": 289502040, + "step": 13412, + "time_per_iteration": 2.645440101623535 + }, + { + "auxiliary_loss_clip": 0.01056252, + "auxiliary_loss_mlp": 0.01030246, + "balance_loss_clip": 1.02674818, + "balance_loss_mlp": 1.01813483, + "epoch": 0.8064331880354727, + "flos": 21321494017920.0, + "grad_norm": 1.7345590024158226, + "language_loss": 0.81412256, + "learning_rate": 3.8022799311795064e-07, + "loss": 0.83498752, + "num_input_tokens_seen": 289520740, + "step": 13413, + "time_per_iteration": 4.3156983852386475 + }, + { + "auxiliary_loss_clip": 0.01044851, + "auxiliary_loss_mlp": 0.0103228, + "balance_loss_clip": 1.0227344, + "balance_loss_mlp": 1.02153921, + "epoch": 0.8064933112881407, + "flos": 19682890456320.0, + "grad_norm": 2.14583934437654, + "language_loss": 0.85205591, + "learning_rate": 3.7999957055386303e-07, + "loss": 0.87282729, + "num_input_tokens_seen": 289535840, + "step": 13414, + "time_per_iteration": 2.566555976867676 + }, + { + "auxiliary_loss_clip": 0.01034472, + "auxiliary_loss_mlp": 0.01028337, + "balance_loss_clip": 1.0223639, + "balance_loss_mlp": 1.01787615, + "epoch": 0.8065534345408086, + "flos": 19279721226240.0, + "grad_norm": 1.76266371559064, + "language_loss": 0.67009979, + "learning_rate": 3.7977120942035467e-07, + "loss": 0.69072789, + "num_input_tokens_seen": 289555205, + "step": 13415, + "time_per_iteration": 2.6205246448516846 + }, + { + "auxiliary_loss_clip": 0.0102748, + "auxiliary_loss_mlp": 0.01023273, + "balance_loss_clip": 1.02343488, + "balance_loss_mlp": 1.0133611, + "epoch": 0.8066135577934767, + "flos": 19677718897920.0, + "grad_norm": 1.535062677216916, + "language_loss": 0.76400101, + "learning_rate": 3.7954290972608383e-07, + "loss": 0.78450853, + "num_input_tokens_seen": 289573000, + "step": 13416, + "time_per_iteration": 2.672968864440918 + }, + { + "auxiliary_loss_clip": 0.0104581, + "auxiliary_loss_mlp": 0.01032407, + "balance_loss_clip": 1.02335548, + "balance_loss_mlp": 1.02197075, + "epoch": 0.8066736810461446, + "flos": 21143592933120.0, + "grad_norm": 1.4665349099231555, + "language_loss": 0.65233898, + "learning_rate": 3.793146714797086e-07, + "loss": 0.67312121, + "num_input_tokens_seen": 289592625, + "step": 13417, + "time_per_iteration": 2.56998348236084 + }, + { + "auxiliary_loss_clip": 0.01025836, + "auxiliary_loss_mlp": 0.01044302, + "balance_loss_clip": 1.02266514, + "balance_loss_mlp": 1.03286982, + "epoch": 0.8067338042988126, + "flos": 22598261925120.0, + "grad_norm": 1.4753542608080672, + "language_loss": 0.8063941, + "learning_rate": 3.7908649468988306e-07, + "loss": 0.82709545, + "num_input_tokens_seen": 289610780, + "step": 13418, + "time_per_iteration": 2.6566011905670166 + }, + { + "auxiliary_loss_clip": 0.01044334, + "auxiliary_loss_mlp": 0.01027431, + "balance_loss_clip": 1.02520227, + "balance_loss_mlp": 1.0165292, + "epoch": 0.8067939275514805, + "flos": 16508423208960.0, + "grad_norm": 1.737901224319396, + "language_loss": 0.8494395, + "learning_rate": 3.7885837936526066e-07, + "loss": 0.87015712, + "num_input_tokens_seen": 289628890, + "step": 13419, + "time_per_iteration": 2.6259605884552 + }, + { + "auxiliary_loss_clip": 0.01024015, + "auxiliary_loss_mlp": 0.0074774, + "balance_loss_clip": 1.02375984, + "balance_loss_mlp": 1.00040102, + "epoch": 0.8068540508041485, + "flos": 28541836460160.0, + "grad_norm": 1.7046363482012232, + "language_loss": 0.76031756, + "learning_rate": 3.7863032551449047e-07, + "loss": 0.7780351, + "num_input_tokens_seen": 289647220, + "step": 13420, + "time_per_iteration": 2.7448182106018066 + }, + { + "auxiliary_loss_clip": 0.01041122, + "auxiliary_loss_mlp": 0.007476, + "balance_loss_clip": 1.02130389, + "balance_loss_mlp": 1.00032902, + "epoch": 0.8069141740568164, + "flos": 21652482867840.0, + "grad_norm": 1.74812431117957, + "language_loss": 0.78448319, + "learning_rate": 3.784023331462207e-07, + "loss": 0.80237043, + "num_input_tokens_seen": 289665800, + "step": 13421, + "time_per_iteration": 2.631899833679199 + }, + { + "auxiliary_loss_clip": 0.01036062, + "auxiliary_loss_mlp": 0.01024034, + "balance_loss_clip": 1.02775288, + "balance_loss_mlp": 1.01335239, + "epoch": 0.8069742973094844, + "flos": 17529327561600.0, + "grad_norm": 4.459153819588588, + "language_loss": 0.79484129, + "learning_rate": 3.78174402269098e-07, + "loss": 0.8154422, + "num_input_tokens_seen": 289682705, + "step": 13422, + "time_per_iteration": 2.668172597885132 + }, + { + "auxiliary_loss_clip": 0.01061502, + "auxiliary_loss_mlp": 0.01032273, + "balance_loss_clip": 1.02487576, + "balance_loss_mlp": 1.02211678, + "epoch": 0.8070344205621525, + "flos": 23367037737600.0, + "grad_norm": 1.4785166415633448, + "language_loss": 0.68081093, + "learning_rate": 3.7794653289176347e-07, + "loss": 0.70174879, + "num_input_tokens_seen": 289702920, + "step": 13423, + "time_per_iteration": 2.561037302017212 + }, + { + "auxiliary_loss_clip": 0.01045875, + "auxiliary_loss_mlp": 0.01033773, + "balance_loss_clip": 1.02643776, + "balance_loss_mlp": 1.02259707, + "epoch": 0.8070945438148204, + "flos": 22930184528640.0, + "grad_norm": 1.7015629332538726, + "language_loss": 0.80210841, + "learning_rate": 3.7771872502285904e-07, + "loss": 0.82290488, + "num_input_tokens_seen": 289723280, + "step": 13424, + "time_per_iteration": 2.7838218212127686 + }, + { + "auxiliary_loss_clip": 0.01052388, + "auxiliary_loss_mlp": 0.01024609, + "balance_loss_clip": 1.02383852, + "balance_loss_mlp": 1.01438689, + "epoch": 0.8071546670674884, + "flos": 25300683613440.0, + "grad_norm": 1.5171634461608394, + "language_loss": 0.7903713, + "learning_rate": 3.774909786710232e-07, + "loss": 0.81114125, + "num_input_tokens_seen": 289743475, + "step": 13425, + "time_per_iteration": 2.691695213317871 + }, + { + "auxiliary_loss_clip": 0.01032959, + "auxiliary_loss_mlp": 0.01029125, + "balance_loss_clip": 1.02272892, + "balance_loss_mlp": 1.01883769, + "epoch": 0.8072147903201563, + "flos": 18113701927680.0, + "grad_norm": 2.235376748049951, + "language_loss": 0.74875724, + "learning_rate": 3.772632938448923e-07, + "loss": 0.76937807, + "num_input_tokens_seen": 289761400, + "step": 13426, + "time_per_iteration": 4.333250999450684 + }, + { + "auxiliary_loss_clip": 0.01050924, + "auxiliary_loss_mlp": 0.01022449, + "balance_loss_clip": 1.0236485, + "balance_loss_mlp": 1.01239979, + "epoch": 0.8072749135728243, + "flos": 26688164215680.0, + "grad_norm": 2.5363400335469892, + "language_loss": 0.73348224, + "learning_rate": 3.770356705530997e-07, + "loss": 0.75421602, + "num_input_tokens_seen": 289781025, + "step": 13427, + "time_per_iteration": 2.761294364929199 + }, + { + "auxiliary_loss_clip": 0.01015736, + "auxiliary_loss_mlp": 0.01033338, + "balance_loss_clip": 1.02680123, + "balance_loss_mlp": 1.0219419, + "epoch": 0.8073350368254922, + "flos": 19240291071360.0, + "grad_norm": 1.8031279099095703, + "language_loss": 0.70255136, + "learning_rate": 3.768081088042774e-07, + "loss": 0.72304213, + "num_input_tokens_seen": 289798380, + "step": 13428, + "time_per_iteration": 2.7679011821746826 + }, + { + "auxiliary_loss_clip": 0.0103708, + "auxiliary_loss_mlp": 0.01027951, + "balance_loss_clip": 1.02269316, + "balance_loss_mlp": 1.01831889, + "epoch": 0.8073951600781603, + "flos": 13334530579200.0, + "grad_norm": 2.401079844973969, + "language_loss": 0.74930215, + "learning_rate": 3.765806086070544e-07, + "loss": 0.76995242, + "num_input_tokens_seen": 289814515, + "step": 13429, + "time_per_iteration": 2.6363677978515625 + }, + { + "auxiliary_loss_clip": 0.01050622, + "auxiliary_loss_mlp": 0.01026714, + "balance_loss_clip": 1.02493477, + "balance_loss_mlp": 1.01675415, + "epoch": 0.8074552833308282, + "flos": 22853191726080.0, + "grad_norm": 2.1407242842030114, + "language_loss": 0.66497838, + "learning_rate": 3.763531699700568e-07, + "loss": 0.68575168, + "num_input_tokens_seen": 289834315, + "step": 13430, + "time_per_iteration": 2.6646487712860107 + }, + { + "auxiliary_loss_clip": 0.01024981, + "auxiliary_loss_mlp": 0.01024563, + "balance_loss_clip": 1.02218354, + "balance_loss_mlp": 1.01401925, + "epoch": 0.8075154065834962, + "flos": 20339409288960.0, + "grad_norm": 1.6207414718065563, + "language_loss": 0.80072129, + "learning_rate": 3.7612579290190994e-07, + "loss": 0.82121676, + "num_input_tokens_seen": 289853770, + "step": 13431, + "time_per_iteration": 2.7775213718414307 + }, + { + "auxiliary_loss_clip": 0.01040928, + "auxiliary_loss_mlp": 0.01023598, + "balance_loss_clip": 1.02513611, + "balance_loss_mlp": 1.01313722, + "epoch": 0.8075755298361641, + "flos": 21908059113600.0, + "grad_norm": 3.7439736728947532, + "language_loss": 0.80051595, + "learning_rate": 3.7589847741123593e-07, + "loss": 0.82116121, + "num_input_tokens_seen": 289870480, + "step": 13432, + "time_per_iteration": 2.6661806106567383 + }, + { + "auxiliary_loss_clip": 0.0103582, + "auxiliary_loss_mlp": 0.01030679, + "balance_loss_clip": 1.02629554, + "balance_loss_mlp": 1.02017117, + "epoch": 0.8076356530888321, + "flos": 15669298609920.0, + "grad_norm": 2.1441746237628823, + "language_loss": 0.70064175, + "learning_rate": 3.7567122350665415e-07, + "loss": 0.72130668, + "num_input_tokens_seen": 289888275, + "step": 13433, + "time_per_iteration": 2.697958469390869 + }, + { + "auxiliary_loss_clip": 0.01042367, + "auxiliary_loss_mlp": 0.01024366, + "balance_loss_clip": 1.0260253, + "balance_loss_mlp": 1.01496077, + "epoch": 0.8076957763415, + "flos": 37777414521600.0, + "grad_norm": 1.5700479700928547, + "language_loss": 0.72684765, + "learning_rate": 3.754440311967828e-07, + "loss": 0.74751496, + "num_input_tokens_seen": 289911495, + "step": 13434, + "time_per_iteration": 2.82011342048645 + }, + { + "auxiliary_loss_clip": 0.01025612, + "auxiliary_loss_mlp": 0.01028175, + "balance_loss_clip": 1.02484894, + "balance_loss_mlp": 1.01756525, + "epoch": 0.807755899594168, + "flos": 19610781903360.0, + "grad_norm": 1.863312553655215, + "language_loss": 0.68101722, + "learning_rate": 3.752169004902361e-07, + "loss": 0.70155513, + "num_input_tokens_seen": 289930045, + "step": 13435, + "time_per_iteration": 2.720484495162964 + }, + { + "auxiliary_loss_clip": 0.01025098, + "auxiliary_loss_mlp": 0.01033377, + "balance_loss_clip": 1.0266397, + "balance_loss_mlp": 1.02157509, + "epoch": 0.8078160228468361, + "flos": 23294893271040.0, + "grad_norm": 1.477566432507798, + "language_loss": 0.74921536, + "learning_rate": 3.749898313956279e-07, + "loss": 0.76980007, + "num_input_tokens_seen": 289950815, + "step": 13436, + "time_per_iteration": 2.7295522689819336 + }, + { + "auxiliary_loss_clip": 0.01058001, + "auxiliary_loss_mlp": 0.01024548, + "balance_loss_clip": 1.02283502, + "balance_loss_mlp": 1.01440334, + "epoch": 0.807876146099504, + "flos": 27162651899520.0, + "grad_norm": 1.8979172417514019, + "language_loss": 0.70467007, + "learning_rate": 3.747628239215674e-07, + "loss": 0.72549558, + "num_input_tokens_seen": 289971730, + "step": 13437, + "time_per_iteration": 2.619130849838257 + }, + { + "auxiliary_loss_clip": 0.01034622, + "auxiliary_loss_mlp": 0.01028796, + "balance_loss_clip": 1.02665472, + "balance_loss_mlp": 1.01920843, + "epoch": 0.807936269352172, + "flos": 27160030206720.0, + "grad_norm": 1.618502023803574, + "language_loss": 0.72682309, + "learning_rate": 3.745358780766636e-07, + "loss": 0.74745727, + "num_input_tokens_seen": 289992995, + "step": 13438, + "time_per_iteration": 2.6930339336395264 + }, + { + "auxiliary_loss_clip": 0.01038811, + "auxiliary_loss_mlp": 0.01026143, + "balance_loss_clip": 1.02347469, + "balance_loss_mlp": 1.01587355, + "epoch": 0.8079963926048399, + "flos": 20740423703040.0, + "grad_norm": 1.819073628572734, + "language_loss": 0.77267629, + "learning_rate": 3.7430899386952344e-07, + "loss": 0.79332584, + "num_input_tokens_seen": 290009405, + "step": 13439, + "time_per_iteration": 2.652207374572754 + }, + { + "auxiliary_loss_clip": 0.01061523, + "auxiliary_loss_mlp": 0.01029211, + "balance_loss_clip": 1.02441776, + "balance_loss_mlp": 1.01875687, + "epoch": 0.8080565158575079, + "flos": 25009663622400.0, + "grad_norm": 1.38073655128471, + "language_loss": 0.7881093, + "learning_rate": 3.7408217130874786e-07, + "loss": 0.80901664, + "num_input_tokens_seen": 290031085, + "step": 13440, + "time_per_iteration": 2.5917322635650635 + }, + { + "auxiliary_loss_clip": 0.01042766, + "auxiliary_loss_mlp": 0.00747738, + "balance_loss_clip": 1.02436185, + "balance_loss_mlp": 1.00038886, + "epoch": 0.8081166391101758, + "flos": 18698076293760.0, + "grad_norm": 3.0149219885379277, + "language_loss": 0.58990073, + "learning_rate": 3.7385541040293946e-07, + "loss": 0.60780573, + "num_input_tokens_seen": 290048670, + "step": 13441, + "time_per_iteration": 2.5658106803894043 + }, + { + "auxiliary_loss_clip": 0.01048179, + "auxiliary_loss_mlp": 0.01028606, + "balance_loss_clip": 1.02333498, + "balance_loss_mlp": 1.01788902, + "epoch": 0.8081767623628439, + "flos": 19828651847040.0, + "grad_norm": 2.2332943991701355, + "language_loss": 0.76045156, + "learning_rate": 3.7362871116069684e-07, + "loss": 0.78121936, + "num_input_tokens_seen": 290064085, + "step": 13442, + "time_per_iteration": 2.6251516342163086 + }, + { + "auxiliary_loss_clip": 0.01041823, + "auxiliary_loss_mlp": 0.01028196, + "balance_loss_clip": 1.0250442, + "balance_loss_mlp": 1.01806331, + "epoch": 0.8082368856155118, + "flos": 35772952982400.0, + "grad_norm": 1.428301045688071, + "language_loss": 0.70341706, + "learning_rate": 3.734020735906169e-07, + "loss": 0.72411728, + "num_input_tokens_seen": 290086255, + "step": 13443, + "time_per_iteration": 2.756040096282959 + }, + { + "auxiliary_loss_clip": 0.01028955, + "auxiliary_loss_mlp": 0.01036303, + "balance_loss_clip": 1.02521062, + "balance_loss_mlp": 1.02599096, + "epoch": 0.8082970088681798, + "flos": 17198015489280.0, + "grad_norm": 1.9052603866152986, + "language_loss": 0.82473516, + "learning_rate": 3.7317549770129286e-07, + "loss": 0.8453877, + "num_input_tokens_seen": 290103995, + "step": 13444, + "time_per_iteration": 4.3638646602630615 + }, + { + "auxiliary_loss_clip": 0.00970826, + "auxiliary_loss_mlp": 0.00746589, + "balance_loss_clip": 1.00596499, + "balance_loss_mlp": 1.00042653, + "epoch": 0.8083571321208477, + "flos": 63555207511680.0, + "grad_norm": 0.8389537248517296, + "language_loss": 0.53651547, + "learning_rate": 3.7294898350131754e-07, + "loss": 0.55368966, + "num_input_tokens_seen": 290157245, + "step": 13445, + "time_per_iteration": 4.730676651000977 + }, + { + "auxiliary_loss_clip": 0.01027437, + "auxiliary_loss_mlp": 0.01029046, + "balance_loss_clip": 1.0234977, + "balance_loss_mlp": 1.01735115, + "epoch": 0.8084172553735157, + "flos": 17930701111680.0, + "grad_norm": 2.142494701031491, + "language_loss": 0.72215867, + "learning_rate": 3.7272253099927964e-07, + "loss": 0.74272346, + "num_input_tokens_seen": 290174970, + "step": 13446, + "time_per_iteration": 2.6871578693389893 + }, + { + "auxiliary_loss_clip": 0.01037064, + "auxiliary_loss_mlp": 0.01031932, + "balance_loss_clip": 1.0242064, + "balance_loss_mlp": 1.02045166, + "epoch": 0.8084773786261836, + "flos": 24097999507200.0, + "grad_norm": 1.7701733233483896, + "language_loss": 0.71225488, + "learning_rate": 3.7249614020376606e-07, + "loss": 0.73294479, + "num_input_tokens_seen": 290194395, + "step": 13447, + "time_per_iteration": 2.6396803855895996 + }, + { + "auxiliary_loss_clip": 0.01005626, + "auxiliary_loss_mlp": 0.01034269, + "balance_loss_clip": 1.02284956, + "balance_loss_mlp": 1.02182364, + "epoch": 0.8085375018788516, + "flos": 15588211656960.0, + "grad_norm": 2.1343284784267857, + "language_loss": 0.74769115, + "learning_rate": 3.7226981112336197e-07, + "loss": 0.76809013, + "num_input_tokens_seen": 290209200, + "step": 13448, + "time_per_iteration": 2.734875440597534 + }, + { + "auxiliary_loss_clip": 0.01005172, + "auxiliary_loss_mlp": 0.0100042, + "balance_loss_clip": 1.00035322, + "balance_loss_mlp": 0.99947256, + "epoch": 0.8085976251315197, + "flos": 67561296393600.0, + "grad_norm": 0.7393294483184201, + "language_loss": 0.63856077, + "learning_rate": 3.7204354376665024e-07, + "loss": 0.65861672, + "num_input_tokens_seen": 290274565, + "step": 13449, + "time_per_iteration": 3.1856424808502197 + }, + { + "auxiliary_loss_clip": 0.01053215, + "auxiliary_loss_mlp": 0.01026865, + "balance_loss_clip": 1.02523756, + "balance_loss_mlp": 1.01605272, + "epoch": 0.8086577483841876, + "flos": 22561453463040.0, + "grad_norm": 4.626367389409792, + "language_loss": 0.74078107, + "learning_rate": 3.718173381422105e-07, + "loss": 0.7615819, + "num_input_tokens_seen": 290293630, + "step": 13450, + "time_per_iteration": 2.572721481323242 + }, + { + "auxiliary_loss_clip": 0.0104072, + "auxiliary_loss_mlp": 0.00747637, + "balance_loss_clip": 1.02314103, + "balance_loss_mlp": 1.00034189, + "epoch": 0.8087178716368556, + "flos": 17968084191360.0, + "grad_norm": 1.8595117166662583, + "language_loss": 0.74005771, + "learning_rate": 3.7159119425861986e-07, + "loss": 0.75794125, + "num_input_tokens_seen": 290311450, + "step": 13451, + "time_per_iteration": 2.626795530319214 + }, + { + "auxiliary_loss_clip": 0.01036351, + "auxiliary_loss_mlp": 0.01027765, + "balance_loss_clip": 1.02273607, + "balance_loss_mlp": 1.01507509, + "epoch": 0.8087779948895235, + "flos": 21719527603200.0, + "grad_norm": 1.6647209888939913, + "language_loss": 0.80252808, + "learning_rate": 3.713651121244543e-07, + "loss": 0.82316923, + "num_input_tokens_seen": 290330165, + "step": 13452, + "time_per_iteration": 2.861525774002075 + }, + { + "auxiliary_loss_clip": 0.01053321, + "auxiliary_loss_mlp": 0.01032691, + "balance_loss_clip": 1.0249536, + "balance_loss_mlp": 1.02189648, + "epoch": 0.8088381181421915, + "flos": 29092885983360.0, + "grad_norm": 1.5486519892397712, + "language_loss": 0.7820164, + "learning_rate": 3.711390917482875e-07, + "loss": 0.80287653, + "num_input_tokens_seen": 290350815, + "step": 13453, + "time_per_iteration": 2.6547064781188965 + }, + { + "auxiliary_loss_clip": 0.01013387, + "auxiliary_loss_mlp": 0.01028509, + "balance_loss_clip": 1.02070785, + "balance_loss_mlp": 1.01749444, + "epoch": 0.8088982413948594, + "flos": 22198432659840.0, + "grad_norm": 2.646529076905948, + "language_loss": 0.77528751, + "learning_rate": 3.709131331386892e-07, + "loss": 0.79570645, + "num_input_tokens_seen": 290367380, + "step": 13454, + "time_per_iteration": 2.7617318630218506 + }, + { + "auxiliary_loss_clip": 0.01030874, + "auxiliary_loss_mlp": 0.01027117, + "balance_loss_clip": 1.02416301, + "balance_loss_mlp": 1.01678181, + "epoch": 0.8089583646475275, + "flos": 28036717453440.0, + "grad_norm": 2.8436142136295, + "language_loss": 0.76506698, + "learning_rate": 3.7068723630422795e-07, + "loss": 0.78564692, + "num_input_tokens_seen": 290387965, + "step": 13455, + "time_per_iteration": 2.868824005126953 + }, + { + "auxiliary_loss_clip": 0.01034906, + "auxiliary_loss_mlp": 0.01029466, + "balance_loss_clip": 1.02184796, + "balance_loss_mlp": 1.01808167, + "epoch": 0.8090184879001954, + "flos": 16617735273600.0, + "grad_norm": 6.139452518298683, + "language_loss": 0.79166287, + "learning_rate": 3.70461401253471e-07, + "loss": 0.81230664, + "num_input_tokens_seen": 290404150, + "step": 13456, + "time_per_iteration": 2.7563693523406982 + }, + { + "auxiliary_loss_clip": 0.01061756, + "auxiliary_loss_mlp": 0.01033846, + "balance_loss_clip": 1.02595127, + "balance_loss_mlp": 1.02368367, + "epoch": 0.8090786111528634, + "flos": 27340804379520.0, + "grad_norm": 2.185282813541294, + "language_loss": 0.71732509, + "learning_rate": 3.702356279949801e-07, + "loss": 0.73828113, + "num_input_tokens_seen": 290422370, + "step": 13457, + "time_per_iteration": 2.617185354232788 + }, + { + "auxiliary_loss_clip": 0.01039536, + "auxiliary_loss_mlp": 0.01027874, + "balance_loss_clip": 1.02357221, + "balance_loss_mlp": 1.01857555, + "epoch": 0.8091387344055313, + "flos": 21105742976640.0, + "grad_norm": 1.8537218384779333, + "language_loss": 0.72783482, + "learning_rate": 3.700099165373176e-07, + "loss": 0.74850887, + "num_input_tokens_seen": 290442645, + "step": 13458, + "time_per_iteration": 2.6410841941833496 + }, + { + "auxiliary_loss_clip": 0.0105152, + "auxiliary_loss_mlp": 0.01028411, + "balance_loss_clip": 1.02456069, + "balance_loss_mlp": 1.01789069, + "epoch": 0.8091988576581993, + "flos": 11655060318720.0, + "grad_norm": 2.158734699830138, + "language_loss": 0.78992426, + "learning_rate": 3.6978426688904275e-07, + "loss": 0.81072354, + "num_input_tokens_seen": 290458520, + "step": 13459, + "time_per_iteration": 2.695988655090332 + }, + { + "auxiliary_loss_clip": 0.01045123, + "auxiliary_loss_mlp": 0.01025873, + "balance_loss_clip": 1.02681482, + "balance_loss_mlp": 1.01487637, + "epoch": 0.8092589809108672, + "flos": 22963329803520.0, + "grad_norm": 1.9877333228757295, + "language_loss": 0.7994625, + "learning_rate": 3.695586790587113e-07, + "loss": 0.82017243, + "num_input_tokens_seen": 290474465, + "step": 13460, + "time_per_iteration": 2.634974241256714 + }, + { + "auxiliary_loss_clip": 0.01038025, + "auxiliary_loss_mlp": 0.01030554, + "balance_loss_clip": 1.02235317, + "balance_loss_mlp": 1.01935434, + "epoch": 0.8093191041635353, + "flos": 13260985482240.0, + "grad_norm": 1.6576959889903633, + "language_loss": 0.84561801, + "learning_rate": 3.693331530548789e-07, + "loss": 0.86630386, + "num_input_tokens_seen": 290492060, + "step": 13461, + "time_per_iteration": 4.435650587081909 + }, + { + "auxiliary_loss_clip": 0.01052508, + "auxiliary_loss_mlp": 0.01034591, + "balance_loss_clip": 1.02487898, + "balance_loss_mlp": 1.02374268, + "epoch": 0.8093792274162032, + "flos": 25516003691520.0, + "grad_norm": 1.8058928864926906, + "language_loss": 0.76641279, + "learning_rate": 3.69107688886096e-07, + "loss": 0.78728378, + "num_input_tokens_seen": 290511510, + "step": 13462, + "time_per_iteration": 2.676255702972412 + }, + { + "auxiliary_loss_clip": 0.01044589, + "auxiliary_loss_mlp": 0.01032078, + "balance_loss_clip": 1.02614295, + "balance_loss_mlp": 1.0209502, + "epoch": 0.8094393506688712, + "flos": 23546483107200.0, + "grad_norm": 2.192949602396234, + "language_loss": 0.82801497, + "learning_rate": 3.6888228656091357e-07, + "loss": 0.8487817, + "num_input_tokens_seen": 290530035, + "step": 13463, + "time_per_iteration": 2.716461658477783 + }, + { + "auxiliary_loss_clip": 0.01061695, + "auxiliary_loss_mlp": 0.01031967, + "balance_loss_clip": 1.02609849, + "balance_loss_mlp": 1.02253199, + "epoch": 0.8094994739215392, + "flos": 17055917285760.0, + "grad_norm": 1.8015171594747514, + "language_loss": 0.61902767, + "learning_rate": 3.686569460878779e-07, + "loss": 0.63996434, + "num_input_tokens_seen": 290548245, + "step": 13464, + "time_per_iteration": 2.5930967330932617 + }, + { + "auxiliary_loss_clip": 0.01059954, + "auxiliary_loss_mlp": 0.01026155, + "balance_loss_clip": 1.02456009, + "balance_loss_mlp": 1.01683903, + "epoch": 0.8095595971742071, + "flos": 23551223702400.0, + "grad_norm": 1.6738219027125174, + "language_loss": 0.61834341, + "learning_rate": 3.684316674755341e-07, + "loss": 0.6392045, + "num_input_tokens_seen": 290568625, + "step": 13465, + "time_per_iteration": 2.5823585987091064 + }, + { + "auxiliary_loss_clip": 0.01054624, + "auxiliary_loss_mlp": 0.01033355, + "balance_loss_clip": 1.02838576, + "balance_loss_mlp": 1.02291203, + "epoch": 0.8096197204268751, + "flos": 20373201008640.0, + "grad_norm": 1.718573966641393, + "language_loss": 0.81693661, + "learning_rate": 3.682064507324256e-07, + "loss": 0.83781636, + "num_input_tokens_seen": 290586575, + "step": 13466, + "time_per_iteration": 2.613602876663208 + }, + { + "auxiliary_loss_clip": 0.01046342, + "auxiliary_loss_mlp": 0.00747664, + "balance_loss_clip": 1.02681231, + "balance_loss_mlp": 1.00038934, + "epoch": 0.809679843679543, + "flos": 27818775682560.0, + "grad_norm": 1.7221102771430266, + "language_loss": 0.761307, + "learning_rate": 3.6798129586709204e-07, + "loss": 0.77924699, + "num_input_tokens_seen": 290606790, + "step": 13467, + "time_per_iteration": 2.6920127868652344 + }, + { + "auxiliary_loss_clip": 0.01025107, + "auxiliary_loss_mlp": 0.01023414, + "balance_loss_clip": 1.01990199, + "balance_loss_mlp": 1.01329315, + "epoch": 0.8097399669322111, + "flos": 22014103040640.0, + "grad_norm": 1.9806475986088488, + "language_loss": 0.79271787, + "learning_rate": 3.6775620288807073e-07, + "loss": 0.8132031, + "num_input_tokens_seen": 290625525, + "step": 13468, + "time_per_iteration": 2.680887460708618 + }, + { + "auxiliary_loss_clip": 0.01048098, + "auxiliary_loss_mlp": 0.0102735, + "balance_loss_clip": 1.02282321, + "balance_loss_mlp": 1.01772428, + "epoch": 0.809800090184879, + "flos": 18988988544000.0, + "grad_norm": 1.8093076921354208, + "language_loss": 0.68066955, + "learning_rate": 3.675311718038978e-07, + "loss": 0.701424, + "num_input_tokens_seen": 290644935, + "step": 13469, + "time_per_iteration": 2.5526371002197266 + }, + { + "auxiliary_loss_clip": 0.00978504, + "auxiliary_loss_mlp": 0.01021874, + "balance_loss_clip": 1.00320721, + "balance_loss_mlp": 1.02087831, + "epoch": 0.809860213437547, + "flos": 66099516508800.0, + "grad_norm": 0.7006651606479375, + "language_loss": 0.54689372, + "learning_rate": 3.6730620262310683e-07, + "loss": 0.56689751, + "num_input_tokens_seen": 290710735, + "step": 13470, + "time_per_iteration": 3.3410961627960205 + }, + { + "auxiliary_loss_clip": 0.01061116, + "auxiliary_loss_mlp": 0.0102649, + "balance_loss_clip": 1.02413499, + "balance_loss_mlp": 1.01722741, + "epoch": 0.8099203366902149, + "flos": 20882485992960.0, + "grad_norm": 1.6323245819633168, + "language_loss": 0.69367671, + "learning_rate": 3.670812953542279e-07, + "loss": 0.71455276, + "num_input_tokens_seen": 290729565, + "step": 13471, + "time_per_iteration": 2.517859697341919 + }, + { + "auxiliary_loss_clip": 0.01053959, + "auxiliary_loss_mlp": 0.01024126, + "balance_loss_clip": 1.02648377, + "balance_loss_mlp": 1.01411843, + "epoch": 0.8099804599428829, + "flos": 26030927111040.0, + "grad_norm": 1.7322412559988432, + "language_loss": 0.79384023, + "learning_rate": 3.6685645000579003e-07, + "loss": 0.81462109, + "num_input_tokens_seen": 290749360, + "step": 13472, + "time_per_iteration": 2.6060380935668945 + }, + { + "auxiliary_loss_clip": 0.00995986, + "auxiliary_loss_mlp": 0.01001404, + "balance_loss_clip": 1.00096047, + "balance_loss_mlp": 1.0005579, + "epoch": 0.8100405831955508, + "flos": 69303573584640.0, + "grad_norm": 0.753368530485972, + "language_loss": 0.57787633, + "learning_rate": 3.666316665863201e-07, + "loss": 0.59785026, + "num_input_tokens_seen": 290812145, + "step": 13473, + "time_per_iteration": 3.136176109313965 + }, + { + "auxiliary_loss_clip": 0.01017278, + "auxiliary_loss_mlp": 0.01024312, + "balance_loss_clip": 1.02426744, + "balance_loss_mlp": 1.01350558, + "epoch": 0.8101007064482189, + "flos": 15012492468480.0, + "grad_norm": 2.010555822348409, + "language_loss": 0.74545783, + "learning_rate": 3.664069451043399e-07, + "loss": 0.76587373, + "num_input_tokens_seen": 290829845, + "step": 13474, + "time_per_iteration": 4.499842405319214 + }, + { + "auxiliary_loss_clip": 0.01056957, + "auxiliary_loss_mlp": 0.01035316, + "balance_loss_clip": 1.02740073, + "balance_loss_mlp": 1.02449155, + "epoch": 0.8101608297008868, + "flos": 21067210661760.0, + "grad_norm": 1.6622487606309204, + "language_loss": 0.78828824, + "learning_rate": 3.661822855683723e-07, + "loss": 0.80921102, + "num_input_tokens_seen": 290848815, + "step": 13475, + "time_per_iteration": 2.616732597351074 + }, + { + "auxiliary_loss_clip": 0.01050996, + "auxiliary_loss_mlp": 0.01032704, + "balance_loss_clip": 1.02427089, + "balance_loss_mlp": 1.02315557, + "epoch": 0.8102209529535548, + "flos": 23731279603200.0, + "grad_norm": 1.6989859723443432, + "language_loss": 0.75238669, + "learning_rate": 3.659576879869364e-07, + "loss": 0.77322376, + "num_input_tokens_seen": 290868580, + "step": 13476, + "time_per_iteration": 2.668051242828369 + }, + { + "auxiliary_loss_clip": 0.01042223, + "auxiliary_loss_mlp": 0.01033713, + "balance_loss_clip": 1.02369511, + "balance_loss_mlp": 1.02216768, + "epoch": 0.8102810762062228, + "flos": 10955879107200.0, + "grad_norm": 1.89278147552493, + "language_loss": 0.73333514, + "learning_rate": 3.657331523685485e-07, + "loss": 0.75409448, + "num_input_tokens_seen": 290883540, + "step": 13477, + "time_per_iteration": 2.623474359512329 + }, + { + "auxiliary_loss_clip": 0.01034173, + "auxiliary_loss_mlp": 0.01033294, + "balance_loss_clip": 1.02458966, + "balance_loss_mlp": 1.02342951, + "epoch": 0.8103411994588907, + "flos": 14648825220480.0, + "grad_norm": 2.4947540464965936, + "language_loss": 0.70244431, + "learning_rate": 3.6550867872172365e-07, + "loss": 0.72311902, + "num_input_tokens_seen": 290901560, + "step": 13478, + "time_per_iteration": 2.6389048099517822 + }, + { + "auxiliary_loss_clip": 0.01005684, + "auxiliary_loss_mlp": 0.0100034, + "balance_loss_clip": 1.00063539, + "balance_loss_mlp": 0.99946958, + "epoch": 0.8104013227115587, + "flos": 59153314665600.0, + "grad_norm": 0.6859265153701224, + "language_loss": 0.52167034, + "learning_rate": 3.6528426705497293e-07, + "loss": 0.54173064, + "num_input_tokens_seen": 290959185, + "step": 13479, + "time_per_iteration": 3.0838735103607178 + }, + { + "auxiliary_loss_clip": 0.01005577, + "auxiliary_loss_mlp": 0.01028757, + "balance_loss_clip": 1.0209291, + "balance_loss_mlp": 1.01846325, + "epoch": 0.8104614459642266, + "flos": 19828687760640.0, + "grad_norm": 1.4711478245110228, + "language_loss": 0.71305633, + "learning_rate": 3.650599173768072e-07, + "loss": 0.73339969, + "num_input_tokens_seen": 290979585, + "step": 13480, + "time_per_iteration": 2.7220394611358643 + }, + { + "auxiliary_loss_clip": 0.01061489, + "auxiliary_loss_mlp": 0.01025785, + "balance_loss_clip": 1.02430761, + "balance_loss_mlp": 1.0161171, + "epoch": 0.8105215692168947, + "flos": 25374264624000.0, + "grad_norm": 1.7194694381534192, + "language_loss": 0.79536062, + "learning_rate": 3.648356296957327e-07, + "loss": 0.8162334, + "num_input_tokens_seen": 291000865, + "step": 13481, + "time_per_iteration": 2.7718260288238525 + }, + { + "auxiliary_loss_clip": 0.01041389, + "auxiliary_loss_mlp": 0.01028302, + "balance_loss_clip": 1.02490854, + "balance_loss_mlp": 1.01799071, + "epoch": 0.8105816924695626, + "flos": 20481722974080.0, + "grad_norm": 2.7501497988745536, + "language_loss": 0.72474545, + "learning_rate": 3.646114040202548e-07, + "loss": 0.74544227, + "num_input_tokens_seen": 291018285, + "step": 13482, + "time_per_iteration": 2.7481443881988525 + }, + { + "auxiliary_loss_clip": 0.0100475, + "auxiliary_loss_mlp": 0.01023839, + "balance_loss_clip": 1.02270401, + "balance_loss_mlp": 1.01337218, + "epoch": 0.8106418157222306, + "flos": 14538687143040.0, + "grad_norm": 2.05548221226079, + "language_loss": 0.65560663, + "learning_rate": 3.6438724035887705e-07, + "loss": 0.67589259, + "num_input_tokens_seen": 291035745, + "step": 13483, + "time_per_iteration": 2.927321434020996 + }, + { + "auxiliary_loss_clip": 0.01031765, + "auxiliary_loss_mlp": 0.01023399, + "balance_loss_clip": 1.02143002, + "balance_loss_mlp": 1.0124259, + "epoch": 0.8107019389748985, + "flos": 22564470205440.0, + "grad_norm": 2.063936121625773, + "language_loss": 0.76076555, + "learning_rate": 3.641631387200992e-07, + "loss": 0.78131717, + "num_input_tokens_seen": 291053280, + "step": 13484, + "time_per_iteration": 2.682607889175415 + }, + { + "auxiliary_loss_clip": 0.01041534, + "auxiliary_loss_mlp": 0.01028484, + "balance_loss_clip": 1.02430773, + "balance_loss_mlp": 1.01685548, + "epoch": 0.8107620622275665, + "flos": 19609560840960.0, + "grad_norm": 1.4607105690131035, + "language_loss": 0.72018701, + "learning_rate": 3.639390991124183e-07, + "loss": 0.74088717, + "num_input_tokens_seen": 291072855, + "step": 13485, + "time_per_iteration": 2.8327527046203613 + }, + { + "auxiliary_loss_clip": 0.01009673, + "auxiliary_loss_mlp": 0.01031962, + "balance_loss_clip": 1.01846933, + "balance_loss_mlp": 1.02055407, + "epoch": 0.8108221854802344, + "flos": 16143498984960.0, + "grad_norm": 1.7884561936701064, + "language_loss": 0.76023698, + "learning_rate": 3.637151215443308e-07, + "loss": 0.7806533, + "num_input_tokens_seen": 291090285, + "step": 13486, + "time_per_iteration": 2.9138827323913574 + }, + { + "auxiliary_loss_clip": 0.01044095, + "auxiliary_loss_mlp": 0.01029461, + "balance_loss_clip": 1.02517247, + "balance_loss_mlp": 1.01929235, + "epoch": 0.8108823087329025, + "flos": 21106209853440.0, + "grad_norm": 2.887894034080833, + "language_loss": 0.71803725, + "learning_rate": 3.6349120602433045e-07, + "loss": 0.73877275, + "num_input_tokens_seen": 291107675, + "step": 13487, + "time_per_iteration": 2.660220146179199 + }, + { + "auxiliary_loss_clip": 0.01006716, + "auxiliary_loss_mlp": 0.01027692, + "balance_loss_clip": 1.02450752, + "balance_loss_mlp": 1.01757157, + "epoch": 0.8109424319855704, + "flos": 29199648182400.0, + "grad_norm": 1.7234130253196382, + "language_loss": 0.8434813, + "learning_rate": 3.6326735256090715e-07, + "loss": 0.86382538, + "num_input_tokens_seen": 291126900, + "step": 13488, + "time_per_iteration": 2.852874755859375 + }, + { + "auxiliary_loss_clip": 0.01063538, + "auxiliary_loss_mlp": 0.01030527, + "balance_loss_clip": 1.0254494, + "balance_loss_mlp": 1.01965499, + "epoch": 0.8110025552382384, + "flos": 23111856541440.0, + "grad_norm": 1.988841766858879, + "language_loss": 0.73766798, + "learning_rate": 3.630435611625502e-07, + "loss": 0.75860864, + "num_input_tokens_seen": 291145285, + "step": 13489, + "time_per_iteration": 2.5869240760803223 + }, + { + "auxiliary_loss_clip": 0.01022603, + "auxiliary_loss_mlp": 0.00747637, + "balance_loss_clip": 1.02592337, + "balance_loss_mlp": 1.00034738, + "epoch": 0.8110626784909064, + "flos": 22379961018240.0, + "grad_norm": 1.612912688081541, + "language_loss": 0.7131443, + "learning_rate": 3.628198318377453e-07, + "loss": 0.73084676, + "num_input_tokens_seen": 291163485, + "step": 13490, + "time_per_iteration": 2.8383312225341797 + }, + { + "auxiliary_loss_clip": 0.01028959, + "auxiliary_loss_mlp": 0.01036117, + "balance_loss_clip": 1.02440214, + "balance_loss_mlp": 1.02409482, + "epoch": 0.8111228017435743, + "flos": 23368043318400.0, + "grad_norm": 2.3058065687489773, + "language_loss": 0.71756983, + "learning_rate": 3.625961645949762e-07, + "loss": 0.73822057, + "num_input_tokens_seen": 291182215, + "step": 13491, + "time_per_iteration": 2.760863780975342 + }, + { + "auxiliary_loss_clip": 0.01061415, + "auxiliary_loss_mlp": 0.01026525, + "balance_loss_clip": 1.0240798, + "balance_loss_mlp": 1.01652896, + "epoch": 0.8111829249962423, + "flos": 21286553063040.0, + "grad_norm": 1.3729277529530144, + "language_loss": 0.67663455, + "learning_rate": 3.623725594427245e-07, + "loss": 0.69751394, + "num_input_tokens_seen": 291203145, + "step": 13492, + "time_per_iteration": 5.774450063705444 + }, + { + "auxiliary_loss_clip": 0.01023637, + "auxiliary_loss_mlp": 0.0102989, + "balance_loss_clip": 1.02475929, + "balance_loss_mlp": 1.01911378, + "epoch": 0.8112430482489102, + "flos": 22345558767360.0, + "grad_norm": 1.9050194545819559, + "language_loss": 0.72287679, + "learning_rate": 3.6214901638947006e-07, + "loss": 0.74341202, + "num_input_tokens_seen": 291220600, + "step": 13493, + "time_per_iteration": 2.664137125015259 + }, + { + "auxiliary_loss_clip": 0.01041859, + "auxiliary_loss_mlp": 0.01039269, + "balance_loss_clip": 1.02124119, + "balance_loss_mlp": 1.02803993, + "epoch": 0.8113031715015783, + "flos": 31138321962240.0, + "grad_norm": 1.9056244559298112, + "language_loss": 0.70787781, + "learning_rate": 3.619255354436885e-07, + "loss": 0.72868913, + "num_input_tokens_seen": 291241195, + "step": 13494, + "time_per_iteration": 2.707463502883911 + }, + { + "auxiliary_loss_clip": 0.01054814, + "auxiliary_loss_mlp": 0.01031422, + "balance_loss_clip": 1.02614224, + "balance_loss_mlp": 1.01957309, + "epoch": 0.8113632947542462, + "flos": 25335445000320.0, + "grad_norm": 2.2045916711754625, + "language_loss": 0.76275069, + "learning_rate": 3.6170211661385543e-07, + "loss": 0.78361309, + "num_input_tokens_seen": 291258715, + "step": 13495, + "time_per_iteration": 2.672966957092285 + }, + { + "auxiliary_loss_clip": 0.0104122, + "auxiliary_loss_mlp": 0.01027967, + "balance_loss_clip": 1.02344179, + "balance_loss_mlp": 1.01732171, + "epoch": 0.8114234180069142, + "flos": 28439168411520.0, + "grad_norm": 1.9786056680498632, + "language_loss": 0.8024168, + "learning_rate": 3.614787599084417e-07, + "loss": 0.82310867, + "num_input_tokens_seen": 291278030, + "step": 13496, + "time_per_iteration": 2.644406318664551 + }, + { + "auxiliary_loss_clip": 0.01048595, + "auxiliary_loss_mlp": 0.01029846, + "balance_loss_clip": 1.02218044, + "balance_loss_mlp": 1.01862288, + "epoch": 0.8114835412595821, + "flos": 20338870584960.0, + "grad_norm": 2.000333567107523, + "language_loss": 0.71560836, + "learning_rate": 3.6125546533591787e-07, + "loss": 0.73639274, + "num_input_tokens_seen": 291296740, + "step": 13497, + "time_per_iteration": 2.5911145210266113 + }, + { + "auxiliary_loss_clip": 0.01024456, + "auxiliary_loss_mlp": 0.01028156, + "balance_loss_clip": 1.02228487, + "balance_loss_mlp": 1.01826119, + "epoch": 0.8115436645122501, + "flos": 22490889194880.0, + "grad_norm": 2.6793372443943655, + "language_loss": 0.77050781, + "learning_rate": 3.610322329047508e-07, + "loss": 0.79103386, + "num_input_tokens_seen": 291318730, + "step": 13498, + "time_per_iteration": 2.6858808994293213 + }, + { + "auxiliary_loss_clip": 0.01062902, + "auxiliary_loss_mlp": 0.0103073, + "balance_loss_clip": 1.02551627, + "balance_loss_mlp": 1.02040064, + "epoch": 0.811603787764918, + "flos": 13845288021120.0, + "grad_norm": 2.1446197363614727, + "language_loss": 0.83515769, + "learning_rate": 3.608090626234055e-07, + "loss": 0.85609406, + "num_input_tokens_seen": 291336755, + "step": 13499, + "time_per_iteration": 2.535648822784424 + }, + { + "auxiliary_loss_clip": 0.01028778, + "auxiliary_loss_mlp": 0.01027801, + "balance_loss_clip": 1.02360749, + "balance_loss_mlp": 1.01661897, + "epoch": 0.8116639110175861, + "flos": 21614632911360.0, + "grad_norm": 1.5013748836369025, + "language_loss": 0.76110566, + "learning_rate": 3.6058595450034603e-07, + "loss": 0.7816714, + "num_input_tokens_seen": 291356795, + "step": 13500, + "time_per_iteration": 2.6670475006103516 + }, + { + "auxiliary_loss_clip": 0.00986879, + "auxiliary_loss_mlp": 0.01001216, + "balance_loss_clip": 1.00167441, + "balance_loss_mlp": 1.00028598, + "epoch": 0.811724034270254, + "flos": 64459799625600.0, + "grad_norm": 0.80579173133963, + "language_loss": 0.59948486, + "learning_rate": 3.603629085440303e-07, + "loss": 0.61936581, + "num_input_tokens_seen": 291416005, + "step": 13501, + "time_per_iteration": 3.3259949684143066 + }, + { + "auxiliary_loss_clip": 0.01049962, + "auxiliary_loss_mlp": 0.01023831, + "balance_loss_clip": 1.02484775, + "balance_loss_mlp": 1.01400208, + "epoch": 0.811784157522922, + "flos": 24754123290240.0, + "grad_norm": 1.4579248030576983, + "language_loss": 0.7886734, + "learning_rate": 3.6013992476291753e-07, + "loss": 0.80941129, + "num_input_tokens_seen": 291434870, + "step": 13502, + "time_per_iteration": 2.6454954147338867 + }, + { + "auxiliary_loss_clip": 0.01034401, + "auxiliary_loss_mlp": 0.01032355, + "balance_loss_clip": 1.02217937, + "balance_loss_mlp": 1.02098799, + "epoch": 0.81184428077559, + "flos": 12167146563840.0, + "grad_norm": 1.7747061724617796, + "language_loss": 0.70850277, + "learning_rate": 3.599170031654635e-07, + "loss": 0.72917032, + "num_input_tokens_seen": 291452230, + "step": 13503, + "time_per_iteration": 2.644819498062134 + }, + { + "auxiliary_loss_clip": 0.01035258, + "auxiliary_loss_mlp": 0.01027624, + "balance_loss_clip": 1.02373838, + "balance_loss_mlp": 1.01602483, + "epoch": 0.8119044040282579, + "flos": 44422037775360.0, + "grad_norm": 3.5199463995732385, + "language_loss": 0.68013853, + "learning_rate": 3.5969414376012065e-07, + "loss": 0.70076734, + "num_input_tokens_seen": 291477425, + "step": 13504, + "time_per_iteration": 2.858473062515259 + }, + { + "auxiliary_loss_clip": 0.01043465, + "auxiliary_loss_mlp": 0.01029506, + "balance_loss_clip": 1.02523482, + "balance_loss_mlp": 1.01792502, + "epoch": 0.8119645272809259, + "flos": 52155507957120.0, + "grad_norm": 2.2958474349594313, + "language_loss": 0.74858826, + "learning_rate": 3.594713465553403e-07, + "loss": 0.76931798, + "num_input_tokens_seen": 291501070, + "step": 13505, + "time_per_iteration": 2.910135269165039 + }, + { + "auxiliary_loss_clip": 0.01043015, + "auxiliary_loss_mlp": 0.01026668, + "balance_loss_clip": 1.02550983, + "balance_loss_mlp": 1.01516473, + "epoch": 0.8120246505335939, + "flos": 30232978640640.0, + "grad_norm": 1.8679088225597975, + "language_loss": 0.72825855, + "learning_rate": 3.5924861155957123e-07, + "loss": 0.74895537, + "num_input_tokens_seen": 291524945, + "step": 13506, + "time_per_iteration": 2.7003045082092285 + }, + { + "auxiliary_loss_clip": 0.01066732, + "auxiliary_loss_mlp": 0.01031208, + "balance_loss_clip": 1.02653003, + "balance_loss_mlp": 1.02026451, + "epoch": 0.8120847737862619, + "flos": 22127652910080.0, + "grad_norm": 2.169385012891237, + "language_loss": 0.76319063, + "learning_rate": 3.590259387812593e-07, + "loss": 0.78417003, + "num_input_tokens_seen": 291544605, + "step": 13507, + "time_per_iteration": 2.540571689605713 + }, + { + "auxiliary_loss_clip": 0.01062392, + "auxiliary_loss_mlp": 0.01023198, + "balance_loss_clip": 1.02362323, + "balance_loss_mlp": 1.01261258, + "epoch": 0.8121448970389298, + "flos": 23295180579840.0, + "grad_norm": 1.637735121013287, + "language_loss": 0.70537031, + "learning_rate": 3.5880332822884783e-07, + "loss": 0.72622621, + "num_input_tokens_seen": 291563850, + "step": 13508, + "time_per_iteration": 2.605832815170288 + }, + { + "auxiliary_loss_clip": 0.01051904, + "auxiliary_loss_mlp": 0.01029983, + "balance_loss_clip": 1.02525425, + "balance_loss_mlp": 1.01978481, + "epoch": 0.8122050202915978, + "flos": 22164138149760.0, + "grad_norm": 1.9116974256762151, + "language_loss": 0.75993979, + "learning_rate": 3.585807799107785e-07, + "loss": 0.78075868, + "num_input_tokens_seen": 291581730, + "step": 13509, + "time_per_iteration": 4.312496900558472 + }, + { + "auxiliary_loss_clip": 0.01063061, + "auxiliary_loss_mlp": 0.01032783, + "balance_loss_clip": 1.02561045, + "balance_loss_mlp": 1.02164936, + "epoch": 0.8122651435442657, + "flos": 23258946735360.0, + "grad_norm": 1.7904397595017318, + "language_loss": 0.76790088, + "learning_rate": 3.58358293835491e-07, + "loss": 0.78885925, + "num_input_tokens_seen": 291601225, + "step": 13510, + "time_per_iteration": 2.588557243347168 + }, + { + "auxiliary_loss_clip": 0.01051234, + "auxiliary_loss_mlp": 0.01031024, + "balance_loss_clip": 1.02381039, + "balance_loss_mlp": 1.01987171, + "epoch": 0.8123252667969337, + "flos": 16140015365760.0, + "grad_norm": 1.6470384452763052, + "language_loss": 0.69872433, + "learning_rate": 3.581358700114212e-07, + "loss": 0.71954691, + "num_input_tokens_seen": 291616995, + "step": 13511, + "time_per_iteration": 2.532470703125 + }, + { + "auxiliary_loss_clip": 0.01044621, + "auxiliary_loss_mlp": 0.01032369, + "balance_loss_clip": 1.02589917, + "balance_loss_mlp": 1.02169406, + "epoch": 0.8123853900496016, + "flos": 21245399055360.0, + "grad_norm": 1.6264329891039437, + "language_loss": 0.79508758, + "learning_rate": 3.57913508447004e-07, + "loss": 0.81585747, + "num_input_tokens_seen": 291636145, + "step": 13512, + "time_per_iteration": 2.6486639976501465 + }, + { + "auxiliary_loss_clip": 0.01048866, + "auxiliary_loss_mlp": 0.01028227, + "balance_loss_clip": 1.02305543, + "balance_loss_mlp": 1.01833892, + "epoch": 0.8124455133022697, + "flos": 64377596373120.0, + "grad_norm": 1.6219878656158817, + "language_loss": 0.63611078, + "learning_rate": 3.5769120915067076e-07, + "loss": 0.65688181, + "num_input_tokens_seen": 291662440, + "step": 13513, + "time_per_iteration": 2.928096055984497 + }, + { + "auxiliary_loss_clip": 0.0101863, + "auxiliary_loss_mlp": 0.0103096, + "balance_loss_clip": 1.02350879, + "balance_loss_mlp": 1.01995659, + "epoch": 0.8125056365549376, + "flos": 23842207779840.0, + "grad_norm": 1.6785507697566389, + "language_loss": 0.715119, + "learning_rate": 3.5746897213085194e-07, + "loss": 0.7356149, + "num_input_tokens_seen": 291680950, + "step": 13514, + "time_per_iteration": 2.716073513031006 + }, + { + "auxiliary_loss_clip": 0.0103058, + "auxiliary_loss_mlp": 0.01026107, + "balance_loss_clip": 1.02473819, + "balance_loss_mlp": 1.01603353, + "epoch": 0.8125657598076056, + "flos": 23550325862400.0, + "grad_norm": 1.603605077088115, + "language_loss": 0.63187295, + "learning_rate": 3.5724679739597364e-07, + "loss": 0.65243983, + "num_input_tokens_seen": 291702395, + "step": 13515, + "time_per_iteration": 2.699659585952759 + }, + { + "auxiliary_loss_clip": 0.01056296, + "auxiliary_loss_mlp": 0.00747465, + "balance_loss_clip": 1.02251494, + "balance_loss_mlp": 1.00031805, + "epoch": 0.8126258830602736, + "flos": 20704225772160.0, + "grad_norm": 2.8737181957095608, + "language_loss": 0.75171721, + "learning_rate": 3.570246849544616e-07, + "loss": 0.76975477, + "num_input_tokens_seen": 291721135, + "step": 13516, + "time_per_iteration": 2.7246155738830566 + }, + { + "auxiliary_loss_clip": 0.01016233, + "auxiliary_loss_mlp": 0.01028545, + "balance_loss_clip": 1.02569258, + "balance_loss_mlp": 1.01853108, + "epoch": 0.8126860063129415, + "flos": 23618160696960.0, + "grad_norm": 1.591575967304665, + "language_loss": 0.91376436, + "learning_rate": 3.5680263481473907e-07, + "loss": 0.93421209, + "num_input_tokens_seen": 291741235, + "step": 13517, + "time_per_iteration": 2.8370487689971924 + }, + { + "auxiliary_loss_clip": 0.01056408, + "auxiliary_loss_mlp": 0.00747605, + "balance_loss_clip": 1.02777863, + "balance_loss_mlp": 1.00041127, + "epoch": 0.8127461295656095, + "flos": 25007149670400.0, + "grad_norm": 1.4545022036431647, + "language_loss": 0.78581637, + "learning_rate": 3.565806469852244e-07, + "loss": 0.80385649, + "num_input_tokens_seen": 291761430, + "step": 13518, + "time_per_iteration": 2.684455394744873 + }, + { + "auxiliary_loss_clip": 0.01046933, + "auxiliary_loss_mlp": 0.01026392, + "balance_loss_clip": 1.02368474, + "balance_loss_mlp": 1.01708794, + "epoch": 0.8128062528182775, + "flos": 27342169096320.0, + "grad_norm": 1.5722840630386112, + "language_loss": 0.79055643, + "learning_rate": 3.56358721474336e-07, + "loss": 0.81128973, + "num_input_tokens_seen": 291781755, + "step": 13519, + "time_per_iteration": 2.8879566192626953 + }, + { + "auxiliary_loss_clip": 0.01062327, + "auxiliary_loss_mlp": 0.01032716, + "balance_loss_clip": 1.02430403, + "balance_loss_mlp": 1.02238059, + "epoch": 0.8128663760709455, + "flos": 26506312634880.0, + "grad_norm": 1.706852444558557, + "language_loss": 0.70762372, + "learning_rate": 3.561368582904905e-07, + "loss": 0.7285741, + "num_input_tokens_seen": 291804410, + "step": 13520, + "time_per_iteration": 2.5937092304229736 + }, + { + "auxiliary_loss_clip": 0.01043584, + "auxiliary_loss_mlp": 0.01026114, + "balance_loss_clip": 1.02537012, + "balance_loss_mlp": 1.01553416, + "epoch": 0.8129264993236134, + "flos": 17931239815680.0, + "grad_norm": 1.4065882181719522, + "language_loss": 0.72714269, + "learning_rate": 3.5591505744209925e-07, + "loss": 0.74783969, + "num_input_tokens_seen": 291823285, + "step": 13521, + "time_per_iteration": 4.491543292999268 + }, + { + "auxiliary_loss_clip": 0.01052635, + "auxiliary_loss_mlp": 0.01029067, + "balance_loss_clip": 1.02411723, + "balance_loss_mlp": 1.01855314, + "epoch": 0.8129866225762814, + "flos": 26177694082560.0, + "grad_norm": 1.8839092825777681, + "language_loss": 0.69743317, + "learning_rate": 3.5569331893757394e-07, + "loss": 0.71825022, + "num_input_tokens_seen": 291845305, + "step": 13522, + "time_per_iteration": 2.7075347900390625 + }, + { + "auxiliary_loss_clip": 0.01049137, + "auxiliary_loss_mlp": 0.01029181, + "balance_loss_clip": 1.02419019, + "balance_loss_mlp": 1.01971579, + "epoch": 0.8130467458289493, + "flos": 21032197879680.0, + "grad_norm": 1.4988961601344377, + "language_loss": 0.70325863, + "learning_rate": 3.554716427853233e-07, + "loss": 0.72404182, + "num_input_tokens_seen": 291863715, + "step": 13523, + "time_per_iteration": 2.697901487350464 + }, + { + "auxiliary_loss_clip": 0.01049524, + "auxiliary_loss_mlp": 0.01027606, + "balance_loss_clip": 1.02273154, + "balance_loss_mlp": 1.01728284, + "epoch": 0.8131068690816173, + "flos": 15487051979520.0, + "grad_norm": 2.046558462713289, + "language_loss": 0.70627499, + "learning_rate": 3.5525002899375256e-07, + "loss": 0.72704631, + "num_input_tokens_seen": 291880735, + "step": 13524, + "time_per_iteration": 2.550269842147827 + }, + { + "auxiliary_loss_clip": 0.01050101, + "auxiliary_loss_mlp": 0.01025352, + "balance_loss_clip": 1.02336979, + "balance_loss_mlp": 1.01588631, + "epoch": 0.8131669923342852, + "flos": 29351227576320.0, + "grad_norm": 1.7370336122778576, + "language_loss": 0.62432456, + "learning_rate": 3.550284775712653e-07, + "loss": 0.64507914, + "num_input_tokens_seen": 291900535, + "step": 13525, + "time_per_iteration": 2.6387956142425537 + }, + { + "auxiliary_loss_clip": 0.01034422, + "auxiliary_loss_mlp": 0.01029823, + "balance_loss_clip": 1.0253942, + "balance_loss_mlp": 1.02008307, + "epoch": 0.8132271155869533, + "flos": 35256162055680.0, + "grad_norm": 1.8859078638116942, + "language_loss": 0.6550864, + "learning_rate": 3.548069885262628e-07, + "loss": 0.67572892, + "num_input_tokens_seen": 291919760, + "step": 13526, + "time_per_iteration": 2.7531204223632812 + }, + { + "auxiliary_loss_clip": 0.01040438, + "auxiliary_loss_mlp": 0.01026398, + "balance_loss_clip": 1.02477932, + "balance_loss_mlp": 1.01690888, + "epoch": 0.8132872388396212, + "flos": 27781895393280.0, + "grad_norm": 1.5342007411459482, + "language_loss": 0.75104886, + "learning_rate": 3.5458556186714473e-07, + "loss": 0.77171719, + "num_input_tokens_seen": 291938915, + "step": 13527, + "time_per_iteration": 2.7284395694732666 + }, + { + "auxiliary_loss_clip": 0.01060259, + "auxiliary_loss_mlp": 0.01027055, + "balance_loss_clip": 1.02416706, + "balance_loss_mlp": 1.01718414, + "epoch": 0.8133473620922892, + "flos": 27819601695360.0, + "grad_norm": 1.6909788688372036, + "language_loss": 0.70754707, + "learning_rate": 3.5436419760230706e-07, + "loss": 0.7284202, + "num_input_tokens_seen": 291958145, + "step": 13528, + "time_per_iteration": 2.6045520305633545 + }, + { + "auxiliary_loss_clip": 0.01061693, + "auxiliary_loss_mlp": 0.01028904, + "balance_loss_clip": 1.02397037, + "balance_loss_mlp": 1.01925433, + "epoch": 0.8134074853449572, + "flos": 18989527248000.0, + "grad_norm": 1.9243634500043187, + "language_loss": 0.68765008, + "learning_rate": 3.5414289574014357e-07, + "loss": 0.70855612, + "num_input_tokens_seen": 291976860, + "step": 13529, + "time_per_iteration": 2.845423936843872 + }, + { + "auxiliary_loss_clip": 0.01042049, + "auxiliary_loss_mlp": 0.01026522, + "balance_loss_clip": 1.02352345, + "balance_loss_mlp": 1.01664603, + "epoch": 0.8134676085976251, + "flos": 24242863057920.0, + "grad_norm": 1.375325411068059, + "language_loss": 0.77371144, + "learning_rate": 3.5392165628904635e-07, + "loss": 0.79439712, + "num_input_tokens_seen": 291998085, + "step": 13530, + "time_per_iteration": 2.5977859497070312 + }, + { + "auxiliary_loss_clip": 0.01052778, + "auxiliary_loss_mlp": 0.010301, + "balance_loss_clip": 1.02550089, + "balance_loss_mlp": 1.01947284, + "epoch": 0.8135277318502931, + "flos": 19062389986560.0, + "grad_norm": 2.2934307501545135, + "language_loss": 0.81923091, + "learning_rate": 3.537004792574052e-07, + "loss": 0.84005964, + "num_input_tokens_seen": 292016585, + "step": 13531, + "time_per_iteration": 2.526431083679199 + }, + { + "auxiliary_loss_clip": 0.01037236, + "auxiliary_loss_mlp": 0.01028409, + "balance_loss_clip": 1.02264106, + "balance_loss_mlp": 1.01590443, + "epoch": 0.813587855102961, + "flos": 17269728992640.0, + "grad_norm": 1.9326616447357263, + "language_loss": 0.71488106, + "learning_rate": 3.534793646536065e-07, + "loss": 0.73553753, + "num_input_tokens_seen": 292033255, + "step": 13532, + "time_per_iteration": 2.5214695930480957 + }, + { + "auxiliary_loss_clip": 0.01029566, + "auxiliary_loss_mlp": 0.01024998, + "balance_loss_clip": 1.02305031, + "balance_loss_mlp": 1.01518118, + "epoch": 0.8136479783556291, + "flos": 20157593621760.0, + "grad_norm": 4.096952223265699, + "language_loss": 0.76437378, + "learning_rate": 3.5325831248603533e-07, + "loss": 0.78491944, + "num_input_tokens_seen": 292051800, + "step": 13533, + "time_per_iteration": 2.7021865844726562 + }, + { + "auxiliary_loss_clip": 0.01065989, + "auxiliary_loss_mlp": 0.00747639, + "balance_loss_clip": 1.02590084, + "balance_loss_mlp": 1.00036788, + "epoch": 0.813708101608297, + "flos": 22052348046720.0, + "grad_norm": 1.6182815548298115, + "language_loss": 0.76487893, + "learning_rate": 3.5303732276307495e-07, + "loss": 0.78301513, + "num_input_tokens_seen": 292072215, + "step": 13534, + "time_per_iteration": 2.5920701026916504 + }, + { + "auxiliary_loss_clip": 0.01042625, + "auxiliary_loss_mlp": 0.010258, + "balance_loss_clip": 1.02478194, + "balance_loss_mlp": 1.0166688, + "epoch": 0.813768224860965, + "flos": 16173412035840.0, + "grad_norm": 2.048586637979777, + "language_loss": 0.93199039, + "learning_rate": 3.5281639549310336e-07, + "loss": 0.95267463, + "num_input_tokens_seen": 292088830, + "step": 13535, + "time_per_iteration": 2.5884039402008057 + }, + { + "auxiliary_loss_clip": 0.01032354, + "auxiliary_loss_mlp": 0.0102392, + "balance_loss_clip": 1.02663255, + "balance_loss_mlp": 1.0139122, + "epoch": 0.8138283481136329, + "flos": 24352318776960.0, + "grad_norm": 2.4247931642157976, + "language_loss": 0.70369804, + "learning_rate": 3.52595530684499e-07, + "loss": 0.72426081, + "num_input_tokens_seen": 292109225, + "step": 13536, + "time_per_iteration": 2.766072988510132 + }, + { + "auxiliary_loss_clip": 0.01025745, + "auxiliary_loss_mlp": 0.01030086, + "balance_loss_clip": 1.02333093, + "balance_loss_mlp": 1.01959538, + "epoch": 0.8138884713663009, + "flos": 25516362827520.0, + "grad_norm": 1.7599373640811697, + "language_loss": 0.75712502, + "learning_rate": 3.5237472834563775e-07, + "loss": 0.77768338, + "num_input_tokens_seen": 292129660, + "step": 13537, + "time_per_iteration": 2.767195224761963 + }, + { + "auxiliary_loss_clip": 0.0103473, + "auxiliary_loss_mlp": 0.01028294, + "balance_loss_clip": 1.02301574, + "balance_loss_mlp": 1.0174644, + "epoch": 0.8139485946189688, + "flos": 22454368041600.0, + "grad_norm": 1.862523235155295, + "language_loss": 0.76357299, + "learning_rate": 3.5215398848489163e-07, + "loss": 0.78420323, + "num_input_tokens_seen": 292149090, + "step": 13538, + "time_per_iteration": 2.776007652282715 + }, + { + "auxiliary_loss_clip": 0.01050684, + "auxiliary_loss_mlp": 0.01026869, + "balance_loss_clip": 1.02389622, + "balance_loss_mlp": 1.01678443, + "epoch": 0.8140087178716369, + "flos": 21250391045760.0, + "grad_norm": 1.5435700079105623, + "language_loss": 0.78040123, + "learning_rate": 3.5193331111063176e-07, + "loss": 0.80117667, + "num_input_tokens_seen": 292169260, + "step": 13539, + "time_per_iteration": 5.944421052932739 + }, + { + "auxiliary_loss_clip": 0.01030116, + "auxiliary_loss_mlp": 0.01029303, + "balance_loss_clip": 1.03135252, + "balance_loss_mlp": 1.0198679, + "epoch": 0.8140688411243048, + "flos": 39415730774400.0, + "grad_norm": 4.11862251213617, + "language_loss": 0.66262084, + "learning_rate": 3.5171269623122533e-07, + "loss": 0.68321502, + "num_input_tokens_seen": 292188145, + "step": 13540, + "time_per_iteration": 2.9301352500915527 + }, + { + "auxiliary_loss_clip": 0.01053928, + "auxiliary_loss_mlp": 0.0102969, + "balance_loss_clip": 1.02623951, + "balance_loss_mlp": 1.02012324, + "epoch": 0.8141289643769728, + "flos": 25415885508480.0, + "grad_norm": 2.1262369832263612, + "language_loss": 0.67159498, + "learning_rate": 3.5149214385503913e-07, + "loss": 0.69243121, + "num_input_tokens_seen": 292212135, + "step": 13541, + "time_per_iteration": 2.645411729812622 + }, + { + "auxiliary_loss_clip": 0.01060205, + "auxiliary_loss_mlp": 0.01030562, + "balance_loss_clip": 1.02374244, + "balance_loss_mlp": 1.02001214, + "epoch": 0.8141890876296408, + "flos": 12568053237120.0, + "grad_norm": 1.8753294863675818, + "language_loss": 0.69052362, + "learning_rate": 3.512716539904355e-07, + "loss": 0.71143132, + "num_input_tokens_seen": 292230645, + "step": 13542, + "time_per_iteration": 2.590285539627075 + }, + { + "auxiliary_loss_clip": 0.01064095, + "auxiliary_loss_mlp": 0.01029403, + "balance_loss_clip": 1.02485287, + "balance_loss_mlp": 1.01851296, + "epoch": 0.8142492108823087, + "flos": 14967172483200.0, + "grad_norm": 3.037508823558399, + "language_loss": 0.80032229, + "learning_rate": 3.5105122664577613e-07, + "loss": 0.82125717, + "num_input_tokens_seen": 292243540, + "step": 13543, + "time_per_iteration": 2.6014037132263184 + }, + { + "auxiliary_loss_clip": 0.01037542, + "auxiliary_loss_mlp": 0.01035773, + "balance_loss_clip": 1.02769256, + "balance_loss_mlp": 1.02425718, + "epoch": 0.8143093341349767, + "flos": 12422004537600.0, + "grad_norm": 6.072372822262152, + "language_loss": 0.78346187, + "learning_rate": 3.5083086182942003e-07, + "loss": 0.80419493, + "num_input_tokens_seen": 292261715, + "step": 13544, + "time_per_iteration": 2.6979427337646484 + }, + { + "auxiliary_loss_clip": 0.01066797, + "auxiliary_loss_mlp": 0.01032441, + "balance_loss_clip": 1.02579069, + "balance_loss_mlp": 1.02020478, + "epoch": 0.8143694573876447, + "flos": 11910564737280.0, + "grad_norm": 6.856459869980073, + "language_loss": 0.73610878, + "learning_rate": 3.5061055954972264e-07, + "loss": 0.75710118, + "num_input_tokens_seen": 292275080, + "step": 13545, + "time_per_iteration": 2.557903289794922 + }, + { + "auxiliary_loss_clip": 0.01049632, + "auxiliary_loss_mlp": 0.0102597, + "balance_loss_clip": 1.02386189, + "balance_loss_mlp": 1.01630199, + "epoch": 0.8144295806403127, + "flos": 21212900225280.0, + "grad_norm": 2.2009188753672664, + "language_loss": 0.76801455, + "learning_rate": 3.5039031981503776e-07, + "loss": 0.78877056, + "num_input_tokens_seen": 292294635, + "step": 13546, + "time_per_iteration": 2.578599214553833 + }, + { + "auxiliary_loss_clip": 0.01053431, + "auxiliary_loss_mlp": 0.01025912, + "balance_loss_clip": 1.02608275, + "balance_loss_mlp": 1.0163219, + "epoch": 0.8144897038929806, + "flos": 19865280741120.0, + "grad_norm": 2.0170516180663625, + "language_loss": 0.70499527, + "learning_rate": 3.501701426337178e-07, + "loss": 0.72578871, + "num_input_tokens_seen": 292312695, + "step": 13547, + "time_per_iteration": 2.5694241523742676 + }, + { + "auxiliary_loss_clip": 0.01064509, + "auxiliary_loss_mlp": 0.01034966, + "balance_loss_clip": 1.0262866, + "balance_loss_mlp": 1.02292037, + "epoch": 0.8145498271456486, + "flos": 24571733005440.0, + "grad_norm": 1.8882352938588842, + "language_loss": 0.7046451, + "learning_rate": 3.49950028014111e-07, + "loss": 0.72563988, + "num_input_tokens_seen": 292332005, + "step": 13548, + "time_per_iteration": 2.5872855186462402 + }, + { + "auxiliary_loss_clip": 0.0105608, + "auxiliary_loss_mlp": 0.01031377, + "balance_loss_clip": 1.02714157, + "balance_loss_mlp": 1.01979566, + "epoch": 0.8146099503983165, + "flos": 20193037367040.0, + "grad_norm": 2.1622819985537953, + "language_loss": 0.76949275, + "learning_rate": 3.4972997596456444e-07, + "loss": 0.79036736, + "num_input_tokens_seen": 292348365, + "step": 13549, + "time_per_iteration": 2.5275256633758545 + }, + { + "auxiliary_loss_clip": 0.01064061, + "auxiliary_loss_mlp": 0.01026893, + "balance_loss_clip": 1.02745748, + "balance_loss_mlp": 1.01636088, + "epoch": 0.8146700736509845, + "flos": 19536949497600.0, + "grad_norm": 3.815973847025921, + "language_loss": 0.70931447, + "learning_rate": 3.4950998649342233e-07, + "loss": 0.73022401, + "num_input_tokens_seen": 292368050, + "step": 13550, + "time_per_iteration": 2.4960756301879883 + }, + { + "auxiliary_loss_clip": 0.01048521, + "auxiliary_loss_mlp": 0.01024223, + "balance_loss_clip": 1.0242064, + "balance_loss_mlp": 1.01464415, + "epoch": 0.8147301969036524, + "flos": 18041341979520.0, + "grad_norm": 1.7245716547181056, + "language_loss": 0.72214603, + "learning_rate": 3.4929005960902826e-07, + "loss": 0.74287349, + "num_input_tokens_seen": 292385315, + "step": 13551, + "time_per_iteration": 2.5679399967193604 + }, + { + "auxiliary_loss_clip": 0.01027625, + "auxiliary_loss_mlp": 0.01030565, + "balance_loss_clip": 1.0260632, + "balance_loss_mlp": 1.01876366, + "epoch": 0.8147903201563205, + "flos": 18004713085440.0, + "grad_norm": 1.9517845082160918, + "language_loss": 0.68710017, + "learning_rate": 3.4907019531971926e-07, + "loss": 0.70768201, + "num_input_tokens_seen": 292403375, + "step": 13552, + "time_per_iteration": 2.7828783988952637 + }, + { + "auxiliary_loss_clip": 0.0106026, + "auxiliary_loss_mlp": 0.01038939, + "balance_loss_clip": 1.02333117, + "balance_loss_mlp": 1.02833521, + "epoch": 0.8148504434089884, + "flos": 20259327916800.0, + "grad_norm": 5.412827170533328, + "language_loss": 0.82308245, + "learning_rate": 3.4885039363383407e-07, + "loss": 0.84407437, + "num_input_tokens_seen": 292419260, + "step": 13553, + "time_per_iteration": 2.755066156387329 + }, + { + "auxiliary_loss_clip": 0.01047849, + "auxiliary_loss_mlp": 0.01027504, + "balance_loss_clip": 1.0222168, + "balance_loss_mlp": 1.01709747, + "epoch": 0.8149105666616564, + "flos": 12494723621760.0, + "grad_norm": 5.281353684267827, + "language_loss": 0.68067694, + "learning_rate": 3.4863065455970795e-07, + "loss": 0.70143056, + "num_input_tokens_seen": 292436095, + "step": 13554, + "time_per_iteration": 2.6021597385406494 + }, + { + "auxiliary_loss_clip": 0.01031128, + "auxiliary_loss_mlp": 0.01032167, + "balance_loss_clip": 1.02288187, + "balance_loss_mlp": 1.02001309, + "epoch": 0.8149706899143244, + "flos": 32523683662080.0, + "grad_norm": 1.675718678741458, + "language_loss": 0.6637826, + "learning_rate": 3.484109781056723e-07, + "loss": 0.68441546, + "num_input_tokens_seen": 292457190, + "step": 13555, + "time_per_iteration": 2.694152593612671 + }, + { + "auxiliary_loss_clip": 0.01053507, + "auxiliary_loss_mlp": 0.01030768, + "balance_loss_clip": 1.02425146, + "balance_loss_mlp": 1.01983643, + "epoch": 0.8150308131669923, + "flos": 19386088375680.0, + "grad_norm": 2.090461653287941, + "language_loss": 0.73186588, + "learning_rate": 3.4819136428005844e-07, + "loss": 0.75270867, + "num_input_tokens_seen": 292474300, + "step": 13556, + "time_per_iteration": 4.4299187660217285 + }, + { + "auxiliary_loss_clip": 0.01052288, + "auxiliary_loss_mlp": 0.0102397, + "balance_loss_clip": 1.02627611, + "balance_loss_mlp": 1.0146364, + "epoch": 0.8150909364196604, + "flos": 17421380213760.0, + "grad_norm": 2.1067023590641423, + "language_loss": 0.80610752, + "learning_rate": 3.4797181309119307e-07, + "loss": 0.82687008, + "num_input_tokens_seen": 292492420, + "step": 13557, + "time_per_iteration": 2.604872465133667 + }, + { + "auxiliary_loss_clip": 0.0104358, + "auxiliary_loss_mlp": 0.01028508, + "balance_loss_clip": 1.02539146, + "balance_loss_mlp": 1.01814294, + "epoch": 0.8151510596723283, + "flos": 27162795553920.0, + "grad_norm": 1.5633533309980732, + "language_loss": 0.66160512, + "learning_rate": 3.4775232454740255e-07, + "loss": 0.68232602, + "num_input_tokens_seen": 292512895, + "step": 13558, + "time_per_iteration": 2.79183292388916 + }, + { + "auxiliary_loss_clip": 0.01006107, + "auxiliary_loss_mlp": 0.01000361, + "balance_loss_clip": 1.00100565, + "balance_loss_mlp": 0.99948484, + "epoch": 0.8152111829249963, + "flos": 64219052718720.0, + "grad_norm": 0.8082821901731474, + "language_loss": 0.56925571, + "learning_rate": 3.4753289865700896e-07, + "loss": 0.58932042, + "num_input_tokens_seen": 292566580, + "step": 13559, + "time_per_iteration": 3.1490917205810547 + }, + { + "auxiliary_loss_clip": 0.00988693, + "auxiliary_loss_mlp": 0.01010112, + "balance_loss_clip": 1.00276208, + "balance_loss_mlp": 1.00923026, + "epoch": 0.8152713061776642, + "flos": 67072012306560.0, + "grad_norm": 0.6944174241548188, + "language_loss": 0.55270696, + "learning_rate": 3.473135354283334e-07, + "loss": 0.57269502, + "num_input_tokens_seen": 292621490, + "step": 13560, + "time_per_iteration": 3.0949127674102783 + }, + { + "auxiliary_loss_clip": 0.01038654, + "auxiliary_loss_mlp": 0.01025108, + "balance_loss_clip": 1.02327538, + "balance_loss_mlp": 1.0156424, + "epoch": 0.8153314294303322, + "flos": 14391130072320.0, + "grad_norm": 1.6291199746596596, + "language_loss": 0.67702997, + "learning_rate": 3.470942348696948e-07, + "loss": 0.6976676, + "num_input_tokens_seen": 292638660, + "step": 13561, + "time_per_iteration": 2.667693614959717 + }, + { + "auxiliary_loss_clip": 0.01056158, + "auxiliary_loss_mlp": 0.01031726, + "balance_loss_clip": 1.02633977, + "balance_loss_mlp": 1.02122378, + "epoch": 0.8153915526830001, + "flos": 25623520076160.0, + "grad_norm": 1.5950808265632557, + "language_loss": 0.81762409, + "learning_rate": 3.468749969894085e-07, + "loss": 0.83850294, + "num_input_tokens_seen": 292658545, + "step": 13562, + "time_per_iteration": 2.812425136566162 + }, + { + "auxiliary_loss_clip": 0.0104305, + "auxiliary_loss_mlp": 0.01027913, + "balance_loss_clip": 1.02509832, + "balance_loss_mlp": 1.01753592, + "epoch": 0.8154516759356681, + "flos": 23369156640000.0, + "grad_norm": 1.4680630377164323, + "language_loss": 0.71763074, + "learning_rate": 3.4665582179578734e-07, + "loss": 0.73834032, + "num_input_tokens_seen": 292678460, + "step": 13563, + "time_per_iteration": 3.0045182704925537 + }, + { + "auxiliary_loss_clip": 0.00987169, + "auxiliary_loss_mlp": 0.01028867, + "balance_loss_clip": 1.02466607, + "balance_loss_mlp": 1.017465, + "epoch": 0.815511799188336, + "flos": 28149189914880.0, + "grad_norm": 2.4797794854773363, + "language_loss": 0.69736326, + "learning_rate": 3.4643670929714387e-07, + "loss": 0.71752357, + "num_input_tokens_seen": 292699815, + "step": 13564, + "time_per_iteration": 3.1757466793060303 + }, + { + "auxiliary_loss_clip": 0.01042044, + "auxiliary_loss_mlp": 0.01024366, + "balance_loss_clip": 1.02490449, + "balance_loss_mlp": 1.01413202, + "epoch": 0.8155719224410041, + "flos": 16983413683200.0, + "grad_norm": 1.7226083095675186, + "language_loss": 0.70257431, + "learning_rate": 3.462176595017854e-07, + "loss": 0.72323841, + "num_input_tokens_seen": 292717370, + "step": 13565, + "time_per_iteration": 3.0219225883483887 + }, + { + "auxiliary_loss_clip": 0.01051246, + "auxiliary_loss_mlp": 0.01030277, + "balance_loss_clip": 1.02472699, + "balance_loss_mlp": 1.02018547, + "epoch": 0.815632045693672, + "flos": 24681727428480.0, + "grad_norm": 2.0203261687530647, + "language_loss": 0.78855217, + "learning_rate": 3.459986724180188e-07, + "loss": 0.8093673, + "num_input_tokens_seen": 292737110, + "step": 13566, + "time_per_iteration": 2.9876840114593506 + }, + { + "auxiliary_loss_clip": 0.01040246, + "auxiliary_loss_mlp": 0.01028104, + "balance_loss_clip": 1.02469635, + "balance_loss_mlp": 1.0187943, + "epoch": 0.81569216894634, + "flos": 19938323047680.0, + "grad_norm": 1.696778485928496, + "language_loss": 0.82517141, + "learning_rate": 3.457797480541491e-07, + "loss": 0.84585488, + "num_input_tokens_seen": 292756510, + "step": 13567, + "time_per_iteration": 2.8663699626922607 + }, + { + "auxiliary_loss_clip": 0.01060502, + "auxiliary_loss_mlp": 0.010246, + "balance_loss_clip": 1.0255115, + "balance_loss_mlp": 1.0156002, + "epoch": 0.8157522921990079, + "flos": 21799393493760.0, + "grad_norm": 2.132106443866976, + "language_loss": 0.79771996, + "learning_rate": 3.455608864184771e-07, + "loss": 0.81857097, + "num_input_tokens_seen": 292776710, + "step": 13568, + "time_per_iteration": 4.571023225784302 + }, + { + "auxiliary_loss_clip": 0.01042663, + "auxiliary_loss_mlp": 0.01026012, + "balance_loss_clip": 1.0265913, + "balance_loss_mlp": 1.01656497, + "epoch": 0.8158124154516759, + "flos": 18508323720960.0, + "grad_norm": 1.9033121147693401, + "language_loss": 0.7766434, + "learning_rate": 3.453420875193016e-07, + "loss": 0.79733014, + "num_input_tokens_seen": 292794350, + "step": 13569, + "time_per_iteration": 2.716212272644043 + }, + { + "auxiliary_loss_clip": 0.01061026, + "auxiliary_loss_mlp": 0.01032754, + "balance_loss_clip": 1.02486634, + "balance_loss_mlp": 1.0234201, + "epoch": 0.815872538704344, + "flos": 26830801123200.0, + "grad_norm": 3.7995681124670813, + "language_loss": 0.59263378, + "learning_rate": 3.451233513649199e-07, + "loss": 0.61357152, + "num_input_tokens_seen": 292814005, + "step": 13570, + "time_per_iteration": 2.6207492351531982 + }, + { + "auxiliary_loss_clip": 0.01050023, + "auxiliary_loss_mlp": 0.01034146, + "balance_loss_clip": 1.02384007, + "balance_loss_mlp": 1.02308941, + "epoch": 0.8159326619570119, + "flos": 21725704742400.0, + "grad_norm": 1.7877550421050905, + "language_loss": 0.82321155, + "learning_rate": 3.4490467796362687e-07, + "loss": 0.84405321, + "num_input_tokens_seen": 292833485, + "step": 13571, + "time_per_iteration": 2.6387434005737305 + }, + { + "auxiliary_loss_clip": 0.01034611, + "auxiliary_loss_mlp": 0.01038027, + "balance_loss_clip": 1.02354598, + "balance_loss_mlp": 1.02675557, + "epoch": 0.8159927852096799, + "flos": 13840726993920.0, + "grad_norm": 2.388159334626591, + "language_loss": 0.78462601, + "learning_rate": 3.446860673237142e-07, + "loss": 0.80535233, + "num_input_tokens_seen": 292848045, + "step": 13572, + "time_per_iteration": 2.7023136615753174 + }, + { + "auxiliary_loss_clip": 0.0106223, + "auxiliary_loss_mlp": 0.01028586, + "balance_loss_clip": 1.0243597, + "balance_loss_mlp": 1.01865566, + "epoch": 0.8160529084623478, + "flos": 24499516711680.0, + "grad_norm": 1.6003441491712795, + "language_loss": 0.64743531, + "learning_rate": 3.4446751945347186e-07, + "loss": 0.66834342, + "num_input_tokens_seen": 292869965, + "step": 13573, + "time_per_iteration": 2.6175191402435303 + }, + { + "auxiliary_loss_clip": 0.01034826, + "auxiliary_loss_mlp": 0.01028501, + "balance_loss_clip": 1.02728558, + "balance_loss_mlp": 1.01892281, + "epoch": 0.8161130317150158, + "flos": 24826339584000.0, + "grad_norm": 1.6842073979633336, + "language_loss": 0.75226408, + "learning_rate": 3.442490343611868e-07, + "loss": 0.77289736, + "num_input_tokens_seen": 292889680, + "step": 13574, + "time_per_iteration": 2.795830488204956 + }, + { + "auxiliary_loss_clip": 0.01054685, + "auxiliary_loss_mlp": 0.01030643, + "balance_loss_clip": 1.02559936, + "balance_loss_mlp": 1.02002144, + "epoch": 0.8161731549676837, + "flos": 30956542208640.0, + "grad_norm": 2.730973486399243, + "language_loss": 0.59680223, + "learning_rate": 3.4403061205514485e-07, + "loss": 0.61765552, + "num_input_tokens_seen": 292912360, + "step": 13575, + "time_per_iteration": 2.732332944869995 + }, + { + "auxiliary_loss_clip": 0.00989284, + "auxiliary_loss_mlp": 0.01033054, + "balance_loss_clip": 1.02074087, + "balance_loss_mlp": 1.02143073, + "epoch": 0.8162332782203517, + "flos": 18551991680640.0, + "grad_norm": 1.9595941102970682, + "language_loss": 0.74597621, + "learning_rate": 3.4381225254362736e-07, + "loss": 0.76619959, + "num_input_tokens_seen": 292928325, + "step": 13576, + "time_per_iteration": 2.826364040374756 + }, + { + "auxiliary_loss_clip": 0.00988423, + "auxiliary_loss_mlp": 0.01001904, + "balance_loss_clip": 1.00315261, + "balance_loss_mlp": 1.00098038, + "epoch": 0.8162934014730197, + "flos": 70386853904640.0, + "grad_norm": 0.8343311076202484, + "language_loss": 0.58685553, + "learning_rate": 3.435939558349155e-07, + "loss": 0.60675883, + "num_input_tokens_seen": 292992795, + "step": 13577, + "time_per_iteration": 3.35968279838562 + }, + { + "auxiliary_loss_clip": 0.00998824, + "auxiliary_loss_mlp": 0.01028454, + "balance_loss_clip": 1.01982617, + "balance_loss_mlp": 1.01825535, + "epoch": 0.8163535247256877, + "flos": 21214839559680.0, + "grad_norm": 1.57831270598274, + "language_loss": 0.71166146, + "learning_rate": 3.4337572193728747e-07, + "loss": 0.73193425, + "num_input_tokens_seen": 293011950, + "step": 13578, + "time_per_iteration": 3.079333543777466 + }, + { + "auxiliary_loss_clip": 0.01026695, + "auxiliary_loss_mlp": 0.0102782, + "balance_loss_clip": 1.02370858, + "balance_loss_mlp": 1.01762795, + "epoch": 0.8164136479783556, + "flos": 21098847565440.0, + "grad_norm": 1.6699469768615116, + "language_loss": 0.7308507, + "learning_rate": 3.431575508590172e-07, + "loss": 0.75139588, + "num_input_tokens_seen": 293030175, + "step": 13579, + "time_per_iteration": 3.121445894241333 + }, + { + "auxiliary_loss_clip": 0.0106286, + "auxiliary_loss_mlp": 0.01023324, + "balance_loss_clip": 1.02435303, + "balance_loss_mlp": 1.01350701, + "epoch": 0.8164737712310236, + "flos": 21720640924800.0, + "grad_norm": 2.087338929533717, + "language_loss": 0.79153985, + "learning_rate": 3.4293944260837873e-07, + "loss": 0.81240177, + "num_input_tokens_seen": 293047980, + "step": 13580, + "time_per_iteration": 2.8820812702178955 + }, + { + "auxiliary_loss_clip": 0.01024521, + "auxiliary_loss_mlp": 0.01030629, + "balance_loss_clip": 1.02220631, + "balance_loss_mlp": 1.02025127, + "epoch": 0.8165338944836915, + "flos": 19536805843200.0, + "grad_norm": 2.3576322888910974, + "language_loss": 0.68726313, + "learning_rate": 3.4272139719364314e-07, + "loss": 0.70781463, + "num_input_tokens_seen": 293067030, + "step": 13581, + "time_per_iteration": 2.8209080696105957 + }, + { + "auxiliary_loss_clip": 0.01062359, + "auxiliary_loss_mlp": 0.01023898, + "balance_loss_clip": 1.02552593, + "balance_loss_mlp": 1.01418817, + "epoch": 0.8165940177363595, + "flos": 22928568416640.0, + "grad_norm": 1.5834778960922504, + "language_loss": 0.60052645, + "learning_rate": 3.4250341462307786e-07, + "loss": 0.62138909, + "num_input_tokens_seen": 293085575, + "step": 13582, + "time_per_iteration": 2.682086706161499 + }, + { + "auxiliary_loss_clip": 0.01028545, + "auxiliary_loss_mlp": 0.00747493, + "balance_loss_clip": 1.02354717, + "balance_loss_mlp": 1.00031948, + "epoch": 0.8166541409890276, + "flos": 23370377702400.0, + "grad_norm": 1.3799248848360959, + "language_loss": 0.82265639, + "learning_rate": 3.4228549490494897e-07, + "loss": 0.84041679, + "num_input_tokens_seen": 293108200, + "step": 13583, + "time_per_iteration": 2.9291646480560303 + }, + { + "auxiliary_loss_clip": 0.01037195, + "auxiliary_loss_mlp": 0.01023415, + "balance_loss_clip": 1.0220418, + "balance_loss_mlp": 1.01340783, + "epoch": 0.8167142642416955, + "flos": 18441997257600.0, + "grad_norm": 1.8318898444911527, + "language_loss": 0.74259424, + "learning_rate": 3.4206763804752093e-07, + "loss": 0.76320034, + "num_input_tokens_seen": 293126020, + "step": 13584, + "time_per_iteration": 2.725356340408325 + }, + { + "auxiliary_loss_clip": 0.01053017, + "auxiliary_loss_mlp": 0.01024709, + "balance_loss_clip": 1.02617049, + "balance_loss_mlp": 1.01444554, + "epoch": 0.8167743874943635, + "flos": 21214983214080.0, + "grad_norm": 1.842360520817778, + "language_loss": 0.74768877, + "learning_rate": 3.4184984405905405e-07, + "loss": 0.76846606, + "num_input_tokens_seen": 293144620, + "step": 13585, + "time_per_iteration": 2.613053798675537 + }, + { + "auxiliary_loss_clip": 0.01043688, + "auxiliary_loss_mlp": 0.01030449, + "balance_loss_clip": 1.02603793, + "balance_loss_mlp": 1.01979768, + "epoch": 0.8168345107470314, + "flos": 18697681244160.0, + "grad_norm": 1.9499896154089729, + "language_loss": 0.69574302, + "learning_rate": 3.416321129478068e-07, + "loss": 0.71648443, + "num_input_tokens_seen": 293162850, + "step": 13586, + "time_per_iteration": 2.682504177093506 + }, + { + "auxiliary_loss_clip": 0.01003607, + "auxiliary_loss_mlp": 0.0103387, + "balance_loss_clip": 1.0222857, + "balance_loss_mlp": 1.02379107, + "epoch": 0.8168946339996994, + "flos": 16253098358400.0, + "grad_norm": 1.482452557425525, + "language_loss": 0.60743213, + "learning_rate": 3.4141444472203594e-07, + "loss": 0.6278069, + "num_input_tokens_seen": 293181620, + "step": 13587, + "time_per_iteration": 4.512277126312256 + }, + { + "auxiliary_loss_clip": 0.01054622, + "auxiliary_loss_mlp": 0.01033477, + "balance_loss_clip": 1.02541852, + "balance_loss_mlp": 1.02262926, + "epoch": 0.8169547572523673, + "flos": 26941585645440.0, + "grad_norm": 2.3777443836245666, + "language_loss": 0.69002455, + "learning_rate": 3.4119683938999624e-07, + "loss": 0.71090555, + "num_input_tokens_seen": 293200270, + "step": 13588, + "time_per_iteration": 2.7990713119506836 + }, + { + "auxiliary_loss_clip": 0.01038764, + "auxiliary_loss_mlp": 0.01033096, + "balance_loss_clip": 1.02440596, + "balance_loss_mlp": 1.02073359, + "epoch": 0.8170148805050353, + "flos": 18952323736320.0, + "grad_norm": 2.3319060607810376, + "language_loss": 0.72760844, + "learning_rate": 3.4097929695993854e-07, + "loss": 0.74832702, + "num_input_tokens_seen": 293218960, + "step": 13589, + "time_per_iteration": 2.685039758682251 + }, + { + "auxiliary_loss_clip": 0.01050679, + "auxiliary_loss_mlp": 0.01026826, + "balance_loss_clip": 1.02482951, + "balance_loss_mlp": 1.01661563, + "epoch": 0.8170750037577033, + "flos": 21834909066240.0, + "grad_norm": 1.6754414532913207, + "language_loss": 0.73353487, + "learning_rate": 3.4076181744011166e-07, + "loss": 0.75430995, + "num_input_tokens_seen": 293236450, + "step": 13590, + "time_per_iteration": 2.76203989982605 + }, + { + "auxiliary_loss_clip": 0.01065229, + "auxiliary_loss_mlp": 0.01028874, + "balance_loss_clip": 1.02548099, + "balance_loss_mlp": 1.01725125, + "epoch": 0.8171351270103713, + "flos": 33507169021440.0, + "grad_norm": 1.7569196237863514, + "language_loss": 0.65411115, + "learning_rate": 3.4054440083876345e-07, + "loss": 0.67505217, + "num_input_tokens_seen": 293256480, + "step": 13591, + "time_per_iteration": 2.6284947395324707 + }, + { + "auxiliary_loss_clip": 0.01063325, + "auxiliary_loss_mlp": 0.01028343, + "balance_loss_clip": 1.02418399, + "balance_loss_mlp": 1.01748896, + "epoch": 0.8171952502630392, + "flos": 22708184520960.0, + "grad_norm": 2.4759939116267757, + "language_loss": 0.67615271, + "learning_rate": 3.403270471641373e-07, + "loss": 0.69706935, + "num_input_tokens_seen": 293274960, + "step": 13592, + "time_per_iteration": 2.7091236114501953 + }, + { + "auxiliary_loss_clip": 0.01035228, + "auxiliary_loss_mlp": 0.01028342, + "balance_loss_clip": 1.02225399, + "balance_loss_mlp": 1.01754808, + "epoch": 0.8172553735157072, + "flos": 26723715701760.0, + "grad_norm": 3.6452325714682066, + "language_loss": 0.66511893, + "learning_rate": 3.401097564244759e-07, + "loss": 0.68575466, + "num_input_tokens_seen": 293295945, + "step": 13593, + "time_per_iteration": 2.680837392807007 + }, + { + "auxiliary_loss_clip": 0.0104917, + "auxiliary_loss_mlp": 0.0102555, + "balance_loss_clip": 1.02247834, + "balance_loss_mlp": 1.0161984, + "epoch": 0.8173154967683751, + "flos": 15961072786560.0, + "grad_norm": 2.007278918463013, + "language_loss": 0.69505525, + "learning_rate": 3.398925286280188e-07, + "loss": 0.71580243, + "num_input_tokens_seen": 293313300, + "step": 13594, + "time_per_iteration": 2.6014597415924072 + }, + { + "auxiliary_loss_clip": 0.01063462, + "auxiliary_loss_mlp": 0.01030321, + "balance_loss_clip": 1.02494502, + "balance_loss_mlp": 1.02014017, + "epoch": 0.8173756200210431, + "flos": 25986720447360.0, + "grad_norm": 1.8005845721181595, + "language_loss": 0.65504426, + "learning_rate": 3.3967536378300456e-07, + "loss": 0.675982, + "num_input_tokens_seen": 293333085, + "step": 13595, + "time_per_iteration": 2.785104513168335 + }, + { + "auxiliary_loss_clip": 0.01027099, + "auxiliary_loss_mlp": 0.01027737, + "balance_loss_clip": 1.02556825, + "balance_loss_mlp": 1.01669216, + "epoch": 0.8174357432737112, + "flos": 25664422688640.0, + "grad_norm": 1.4329962355052344, + "language_loss": 0.78706336, + "learning_rate": 3.394582618976658e-07, + "loss": 0.8076117, + "num_input_tokens_seen": 293351895, + "step": 13596, + "time_per_iteration": 2.7107372283935547 + }, + { + "auxiliary_loss_clip": 0.01034319, + "auxiliary_loss_mlp": 0.01024058, + "balance_loss_clip": 1.0214684, + "balance_loss_mlp": 1.01362133, + "epoch": 0.8174958665263791, + "flos": 21835088634240.0, + "grad_norm": 10.341341578554456, + "language_loss": 0.57856619, + "learning_rate": 3.392412229802362e-07, + "loss": 0.59914994, + "num_input_tokens_seen": 293371165, + "step": 13597, + "time_per_iteration": 2.754833698272705 + }, + { + "auxiliary_loss_clip": 0.01025872, + "auxiliary_loss_mlp": 0.01027203, + "balance_loss_clip": 1.02852404, + "balance_loss_mlp": 1.01741612, + "epoch": 0.8175559897790471, + "flos": 22455517276800.0, + "grad_norm": 1.4828410452548617, + "language_loss": 0.82181585, + "learning_rate": 3.390242470389462e-07, + "loss": 0.84234667, + "num_input_tokens_seen": 293391150, + "step": 13598, + "time_per_iteration": 2.8084664344787598 + }, + { + "auxiliary_loss_clip": 0.01006611, + "auxiliary_loss_mlp": 0.01028471, + "balance_loss_clip": 1.02682996, + "balance_loss_mlp": 1.01850522, + "epoch": 0.817616113031715, + "flos": 23615790399360.0, + "grad_norm": 1.750469814736823, + "language_loss": 0.82267922, + "learning_rate": 3.3880733408202277e-07, + "loss": 0.8430301, + "num_input_tokens_seen": 293409440, + "step": 13599, + "time_per_iteration": 2.8390238285064697 + }, + { + "auxiliary_loss_clip": 0.01010599, + "auxiliary_loss_mlp": 0.01034365, + "balance_loss_clip": 1.02038705, + "balance_loss_mlp": 1.0233202, + "epoch": 0.817676236284383, + "flos": 27672260106240.0, + "grad_norm": 1.8570812714868723, + "language_loss": 0.83555257, + "learning_rate": 3.3859048411769186e-07, + "loss": 0.85600227, + "num_input_tokens_seen": 293428995, + "step": 13600, + "time_per_iteration": 2.799837112426758 + }, + { + "auxiliary_loss_clip": 0.01031749, + "auxiliary_loss_mlp": 0.01030177, + "balance_loss_clip": 1.02478027, + "balance_loss_mlp": 1.01962674, + "epoch": 0.8177363595370509, + "flos": 24681009156480.0, + "grad_norm": 1.797269041429543, + "language_loss": 0.7388708, + "learning_rate": 3.383736971541766e-07, + "loss": 0.75949007, + "num_input_tokens_seen": 293449155, + "step": 13601, + "time_per_iteration": 2.7507708072662354 + }, + { + "auxiliary_loss_clip": 0.01034797, + "auxiliary_loss_mlp": 0.01030824, + "balance_loss_clip": 1.0253799, + "balance_loss_mlp": 1.02001762, + "epoch": 0.817796482789719, + "flos": 17346326745600.0, + "grad_norm": 2.458978175827863, + "language_loss": 0.68582809, + "learning_rate": 3.3815697319969737e-07, + "loss": 0.70648426, + "num_input_tokens_seen": 293466125, + "step": 13602, + "time_per_iteration": 2.650562286376953 + }, + { + "auxiliary_loss_clip": 0.01023464, + "auxiliary_loss_mlp": 0.01029349, + "balance_loss_clip": 1.02221251, + "balance_loss_mlp": 1.01884675, + "epoch": 0.8178566060423869, + "flos": 17778475272960.0, + "grad_norm": 1.9614019148480535, + "language_loss": 0.83697343, + "learning_rate": 3.379403122624718e-07, + "loss": 0.85750157, + "num_input_tokens_seen": 293481345, + "step": 13603, + "time_per_iteration": 4.485063076019287 + }, + { + "auxiliary_loss_clip": 0.01023262, + "auxiliary_loss_mlp": 0.01022827, + "balance_loss_clip": 1.02588439, + "balance_loss_mlp": 1.01305175, + "epoch": 0.8179167292950549, + "flos": 24973250209920.0, + "grad_norm": 1.571995117764631, + "language_loss": 0.69253248, + "learning_rate": 3.377237143507159e-07, + "loss": 0.71299332, + "num_input_tokens_seen": 293502330, + "step": 13604, + "time_per_iteration": 2.757350206375122 + }, + { + "auxiliary_loss_clip": 0.01034336, + "auxiliary_loss_mlp": 0.01031493, + "balance_loss_clip": 1.02595949, + "balance_loss_mlp": 1.02099633, + "epoch": 0.8179768525477228, + "flos": 22856783086080.0, + "grad_norm": 2.2042852951125744, + "language_loss": 0.74162626, + "learning_rate": 3.3750717947264406e-07, + "loss": 0.76228452, + "num_input_tokens_seen": 293521415, + "step": 13605, + "time_per_iteration": 2.7718350887298584 + }, + { + "auxiliary_loss_clip": 0.01034392, + "auxiliary_loss_mlp": 0.0103856, + "balance_loss_clip": 1.02783704, + "balance_loss_mlp": 1.02803373, + "epoch": 0.8180369758003908, + "flos": 18515147304960.0, + "grad_norm": 1.849503725691562, + "language_loss": 0.74114847, + "learning_rate": 3.372907076364666e-07, + "loss": 0.76187801, + "num_input_tokens_seen": 293539245, + "step": 13606, + "time_per_iteration": 2.9842963218688965 + }, + { + "auxiliary_loss_clip": 0.01062396, + "auxiliary_loss_mlp": 0.01025671, + "balance_loss_clip": 1.02571893, + "balance_loss_mlp": 1.01567566, + "epoch": 0.8180970990530587, + "flos": 33182105915520.0, + "grad_norm": 1.7695449445214917, + "language_loss": 0.65563512, + "learning_rate": 3.370742988503916e-07, + "loss": 0.6765157, + "num_input_tokens_seen": 293560640, + "step": 13607, + "time_per_iteration": 2.8185994625091553 + }, + { + "auxiliary_loss_clip": 0.01042062, + "auxiliary_loss_mlp": 0.01023445, + "balance_loss_clip": 1.02547157, + "balance_loss_mlp": 1.0127759, + "epoch": 0.8181572223057267, + "flos": 25010022758400.0, + "grad_norm": 1.7070521155982674, + "language_loss": 0.69989204, + "learning_rate": 3.3685795312262634e-07, + "loss": 0.72054714, + "num_input_tokens_seen": 293579465, + "step": 13608, + "time_per_iteration": 2.7775166034698486 + }, + { + "auxiliary_loss_clip": 0.01045722, + "auxiliary_loss_mlp": 0.01032444, + "balance_loss_clip": 1.02255082, + "balance_loss_mlp": 1.02234054, + "epoch": 0.8182173455583948, + "flos": 28548731871360.0, + "grad_norm": 1.8978825925463567, + "language_loss": 0.79824686, + "learning_rate": 3.366416704613735e-07, + "loss": 0.81902856, + "num_input_tokens_seen": 293600540, + "step": 13609, + "time_per_iteration": 2.707714557647705 + }, + { + "auxiliary_loss_clip": 0.00985363, + "auxiliary_loss_mlp": 0.01003748, + "balance_loss_clip": 1.00823271, + "balance_loss_mlp": 1.00289011, + "epoch": 0.8182774688110627, + "flos": 72028043245440.0, + "grad_norm": 0.7486179305503775, + "language_loss": 0.55922186, + "learning_rate": 3.3642545087483544e-07, + "loss": 0.57911289, + "num_input_tokens_seen": 293665160, + "step": 13610, + "time_per_iteration": 3.5631768703460693 + }, + { + "auxiliary_loss_clip": 0.00997457, + "auxiliary_loss_mlp": 0.00747376, + "balance_loss_clip": 1.01977313, + "balance_loss_mlp": 1.00032759, + "epoch": 0.8183375920637307, + "flos": 19755358145280.0, + "grad_norm": 1.9948682983464283, + "language_loss": 0.77811527, + "learning_rate": 3.362092943712107e-07, + "loss": 0.79556364, + "num_input_tokens_seen": 293683995, + "step": 13611, + "time_per_iteration": 2.8590028285980225 + }, + { + "auxiliary_loss_clip": 0.01034179, + "auxiliary_loss_mlp": 0.01035142, + "balance_loss_clip": 1.0234468, + "balance_loss_mlp": 1.0230242, + "epoch": 0.8183977153163986, + "flos": 22341895580160.0, + "grad_norm": 1.7654005112138744, + "language_loss": 0.77352226, + "learning_rate": 3.3599320095869745e-07, + "loss": 0.7942155, + "num_input_tokens_seen": 293704115, + "step": 13612, + "time_per_iteration": 2.82308292388916 + }, + { + "auxiliary_loss_clip": 0.01026466, + "auxiliary_loss_mlp": 0.01021853, + "balance_loss_clip": 1.02247679, + "balance_loss_mlp": 1.01176798, + "epoch": 0.8184578385690666, + "flos": 17712472032000.0, + "grad_norm": 2.768627002330617, + "language_loss": 0.86045074, + "learning_rate": 3.3577717064548793e-07, + "loss": 0.880934, + "num_input_tokens_seen": 293722225, + "step": 13613, + "time_per_iteration": 2.702057361602783 + }, + { + "auxiliary_loss_clip": 0.01053096, + "auxiliary_loss_mlp": 0.0103775, + "balance_loss_clip": 1.02622628, + "balance_loss_mlp": 1.02769458, + "epoch": 0.8185179618217345, + "flos": 25701159323520.0, + "grad_norm": 2.9990232854895016, + "language_loss": 0.72910452, + "learning_rate": 3.355612034397746e-07, + "loss": 0.75001299, + "num_input_tokens_seen": 293743995, + "step": 13614, + "time_per_iteration": 2.75329852104187 + }, + { + "auxiliary_loss_clip": 0.01038093, + "auxiliary_loss_mlp": 0.01033268, + "balance_loss_clip": 1.02373648, + "balance_loss_mlp": 1.02261615, + "epoch": 0.8185780850744026, + "flos": 25960326929280.0, + "grad_norm": 1.8175482736487587, + "language_loss": 0.80915391, + "learning_rate": 3.353452993497479e-07, + "loss": 0.82986748, + "num_input_tokens_seen": 293764935, + "step": 13615, + "time_per_iteration": 2.6922030448913574 + }, + { + "auxiliary_loss_clip": 0.01047417, + "auxiliary_loss_mlp": 0.01030039, + "balance_loss_clip": 1.0229969, + "balance_loss_mlp": 1.01920915, + "epoch": 0.8186382083270705, + "flos": 25228431406080.0, + "grad_norm": 1.9326111120781957, + "language_loss": 0.7580632, + "learning_rate": 3.3512945838359375e-07, + "loss": 0.7788378, + "num_input_tokens_seen": 293784035, + "step": 13616, + "time_per_iteration": 4.551297187805176 + }, + { + "auxiliary_loss_clip": 0.01019239, + "auxiliary_loss_mlp": 0.0103503, + "balance_loss_clip": 1.01923358, + "balance_loss_mlp": 1.02293062, + "epoch": 0.8186983315797385, + "flos": 22415009713920.0, + "grad_norm": 1.8590674981113142, + "language_loss": 0.75118768, + "learning_rate": 3.349136805494979e-07, + "loss": 0.77173036, + "num_input_tokens_seen": 293803360, + "step": 13617, + "time_per_iteration": 2.690807819366455 + }, + { + "auxiliary_loss_clip": 0.01038324, + "auxiliary_loss_mlp": 0.01028608, + "balance_loss_clip": 1.02260518, + "balance_loss_mlp": 1.01895189, + "epoch": 0.8187584548324064, + "flos": 22018017623040.0, + "grad_norm": 1.8684329730001912, + "language_loss": 0.68291605, + "learning_rate": 3.346979658556415e-07, + "loss": 0.70358539, + "num_input_tokens_seen": 293821325, + "step": 13618, + "time_per_iteration": 2.7949435710906982 + }, + { + "auxiliary_loss_clip": 0.01045036, + "auxiliary_loss_mlp": 0.01028899, + "balance_loss_clip": 1.02603018, + "balance_loss_mlp": 1.01828349, + "epoch": 0.8188185780850744, + "flos": 29241664116480.0, + "grad_norm": 1.707297742814655, + "language_loss": 0.69760287, + "learning_rate": 3.344823143102058e-07, + "loss": 0.71834219, + "num_input_tokens_seen": 293840315, + "step": 13619, + "time_per_iteration": 2.8605129718780518 + }, + { + "auxiliary_loss_clip": 0.01017643, + "auxiliary_loss_mlp": 0.01031722, + "balance_loss_clip": 1.02807653, + "balance_loss_mlp": 1.02082634, + "epoch": 0.8188787013377423, + "flos": 20696504348160.0, + "grad_norm": 2.412607960251, + "language_loss": 0.73704553, + "learning_rate": 3.3426672592136694e-07, + "loss": 0.75753915, + "num_input_tokens_seen": 293855685, + "step": 13620, + "time_per_iteration": 2.7890031337738037 + }, + { + "auxiliary_loss_clip": 0.01033151, + "auxiliary_loss_mlp": 0.00747533, + "balance_loss_clip": 1.02171886, + "balance_loss_mlp": 1.00039721, + "epoch": 0.8189388245904103, + "flos": 23732967542400.0, + "grad_norm": 1.5180742425840996, + "language_loss": 0.76045579, + "learning_rate": 3.340512006973011e-07, + "loss": 0.77826267, + "num_input_tokens_seen": 293875540, + "step": 13621, + "time_per_iteration": 2.7625181674957275 + }, + { + "auxiliary_loss_clip": 0.01030949, + "auxiliary_loss_mlp": 0.0102357, + "balance_loss_clip": 1.01994133, + "balance_loss_mlp": 1.01309776, + "epoch": 0.8189989478430784, + "flos": 28255090187520.0, + "grad_norm": 2.159414116624181, + "language_loss": 0.65652364, + "learning_rate": 3.3383573864618076e-07, + "loss": 0.67706883, + "num_input_tokens_seen": 293896570, + "step": 13622, + "time_per_iteration": 2.7693395614624023 + }, + { + "auxiliary_loss_clip": 0.01065171, + "auxiliary_loss_mlp": 0.01026454, + "balance_loss_clip": 1.02709782, + "balance_loss_mlp": 1.01533151, + "epoch": 0.8190590710957463, + "flos": 21397696721280.0, + "grad_norm": 2.435944080209482, + "language_loss": 0.75084788, + "learning_rate": 3.3362033977617653e-07, + "loss": 0.77176416, + "num_input_tokens_seen": 293914680, + "step": 13623, + "time_per_iteration": 2.5936973094940186 + }, + { + "auxiliary_loss_clip": 0.01038608, + "auxiliary_loss_mlp": 0.01033832, + "balance_loss_clip": 1.02264166, + "balance_loss_mlp": 1.02225661, + "epoch": 0.8191191943484143, + "flos": 38796451367040.0, + "grad_norm": 3.370591817601013, + "language_loss": 0.63083291, + "learning_rate": 3.3340500409545527e-07, + "loss": 0.65155733, + "num_input_tokens_seen": 293936480, + "step": 13624, + "time_per_iteration": 2.9319236278533936 + }, + { + "auxiliary_loss_clip": 0.01060694, + "auxiliary_loss_mlp": 0.01030083, + "balance_loss_clip": 1.02503347, + "balance_loss_mlp": 1.02037358, + "epoch": 0.8191793176010822, + "flos": 25446516831360.0, + "grad_norm": 1.5949587249429487, + "language_loss": 0.78423309, + "learning_rate": 3.3318973161218386e-07, + "loss": 0.80514085, + "num_input_tokens_seen": 293957815, + "step": 13625, + "time_per_iteration": 2.641127824783325 + }, + { + "auxiliary_loss_clip": 0.01052238, + "auxiliary_loss_mlp": 0.00747803, + "balance_loss_clip": 1.02321053, + "balance_loss_mlp": 1.00043631, + "epoch": 0.8192394408537502, + "flos": 25083029151360.0, + "grad_norm": 2.2248759476203466, + "language_loss": 0.7586205, + "learning_rate": 3.329745223345244e-07, + "loss": 0.77662086, + "num_input_tokens_seen": 293975440, + "step": 13626, + "time_per_iteration": 2.6983306407928467 + }, + { + "auxiliary_loss_clip": 0.0105112, + "auxiliary_loss_mlp": 0.01032384, + "balance_loss_clip": 1.02487981, + "balance_loss_mlp": 1.02248979, + "epoch": 0.8192995641064181, + "flos": 27673732563840.0, + "grad_norm": 1.6268231906639308, + "language_loss": 0.73488933, + "learning_rate": 3.3275937627063823e-07, + "loss": 0.75572437, + "num_input_tokens_seen": 293997540, + "step": 13627, + "time_per_iteration": 2.7901947498321533 + }, + { + "auxiliary_loss_clip": 0.01061778, + "auxiliary_loss_mlp": 0.01031131, + "balance_loss_clip": 1.02506328, + "balance_loss_mlp": 1.01998544, + "epoch": 0.8193596873590862, + "flos": 21288492397440.0, + "grad_norm": 1.7588371602212791, + "language_loss": 0.68920004, + "learning_rate": 3.3254429342868353e-07, + "loss": 0.71012914, + "num_input_tokens_seen": 294017030, + "step": 13628, + "time_per_iteration": 2.5502052307128906 + }, + { + "auxiliary_loss_clip": 0.0103885, + "auxiliary_loss_mlp": 0.0103374, + "balance_loss_clip": 1.02426314, + "balance_loss_mlp": 1.02147973, + "epoch": 0.8194198106117541, + "flos": 17492626840320.0, + "grad_norm": 1.6649611146559729, + "language_loss": 0.85608482, + "learning_rate": 3.323292738168171e-07, + "loss": 0.87681079, + "num_input_tokens_seen": 294035700, + "step": 13629, + "time_per_iteration": 2.5959908962249756 + }, + { + "auxiliary_loss_clip": 0.0105998, + "auxiliary_loss_mlp": 0.01022284, + "balance_loss_clip": 1.02387738, + "balance_loss_mlp": 1.01197207, + "epoch": 0.8194799338644221, + "flos": 15267925059840.0, + "grad_norm": 2.1981155883249315, + "language_loss": 0.74253201, + "learning_rate": 3.3211431744319084e-07, + "loss": 0.76335466, + "num_input_tokens_seen": 294049730, + "step": 13630, + "time_per_iteration": 2.6407575607299805 + }, + { + "auxiliary_loss_clip": 0.01047318, + "auxiliary_loss_mlp": 0.01030805, + "balance_loss_clip": 1.02417755, + "balance_loss_mlp": 1.01953983, + "epoch": 0.81954005711709, + "flos": 14718814871040.0, + "grad_norm": 3.7014112922069105, + "language_loss": 0.7189405, + "learning_rate": 3.31899424315957e-07, + "loss": 0.73972172, + "num_input_tokens_seen": 294066545, + "step": 13631, + "time_per_iteration": 2.577498435974121 + }, + { + "auxiliary_loss_clip": 0.010624, + "auxiliary_loss_mlp": 0.01027016, + "balance_loss_clip": 1.02483821, + "balance_loss_mlp": 1.01691341, + "epoch": 0.819600180369758, + "flos": 23074042498560.0, + "grad_norm": 1.5065400878705004, + "language_loss": 0.76814294, + "learning_rate": 3.3168459444326447e-07, + "loss": 0.78903711, + "num_input_tokens_seen": 294087455, + "step": 13632, + "time_per_iteration": 2.5851871967315674 + }, + { + "auxiliary_loss_clip": 0.01033168, + "auxiliary_loss_mlp": 0.01030795, + "balance_loss_clip": 1.02114058, + "balance_loss_mlp": 1.02047694, + "epoch": 0.8196603036224259, + "flos": 27599792417280.0, + "grad_norm": 1.906889243305762, + "language_loss": 0.6580193, + "learning_rate": 3.314698278332588e-07, + "loss": 0.6786589, + "num_input_tokens_seen": 294107480, + "step": 13633, + "time_per_iteration": 2.684965133666992 + }, + { + "auxiliary_loss_clip": 0.01043732, + "auxiliary_loss_mlp": 0.0103117, + "balance_loss_clip": 1.02324283, + "balance_loss_mlp": 1.02138257, + "epoch": 0.8197204268750939, + "flos": 28582020800640.0, + "grad_norm": 2.1082698090903107, + "language_loss": 0.75736725, + "learning_rate": 3.3125512449408513e-07, + "loss": 0.77811635, + "num_input_tokens_seen": 294130115, + "step": 13634, + "time_per_iteration": 4.267121076583862 + }, + { + "auxiliary_loss_clip": 0.01009787, + "auxiliary_loss_mlp": 0.00747506, + "balance_loss_clip": 1.025051, + "balance_loss_mlp": 1.00036836, + "epoch": 0.819780550127762, + "flos": 23258300290560.0, + "grad_norm": 2.297687551358933, + "language_loss": 0.81595743, + "learning_rate": 3.310404844338841e-07, + "loss": 0.83353037, + "num_input_tokens_seen": 294148495, + "step": 13635, + "time_per_iteration": 4.483858346939087 + }, + { + "auxiliary_loss_clip": 0.01045486, + "auxiliary_loss_mlp": 0.01029332, + "balance_loss_clip": 1.02152133, + "balance_loss_mlp": 1.01810241, + "epoch": 0.8198406733804299, + "flos": 26685255214080.0, + "grad_norm": 1.5560547962593654, + "language_loss": 0.76140314, + "learning_rate": 3.308259076607949e-07, + "loss": 0.78215134, + "num_input_tokens_seen": 294169595, + "step": 13636, + "time_per_iteration": 2.7373404502868652 + }, + { + "auxiliary_loss_clip": 0.01036653, + "auxiliary_loss_mlp": 0.01030416, + "balance_loss_clip": 1.0283103, + "balance_loss_mlp": 1.01949668, + "epoch": 0.8199007966330979, + "flos": 20084084438400.0, + "grad_norm": 1.9000794419924143, + "language_loss": 0.81196249, + "learning_rate": 3.3061139418295445e-07, + "loss": 0.83263314, + "num_input_tokens_seen": 294183885, + "step": 13637, + "time_per_iteration": 2.7264492511749268 + }, + { + "auxiliary_loss_clip": 0.01050993, + "auxiliary_loss_mlp": 0.01024895, + "balance_loss_clip": 1.0245409, + "balance_loss_mlp": 1.01509631, + "epoch": 0.8199609198857658, + "flos": 31902788142720.0, + "grad_norm": 2.148652072511, + "language_loss": 0.7096498, + "learning_rate": 3.3039694400849725e-07, + "loss": 0.73040867, + "num_input_tokens_seen": 294200150, + "step": 13638, + "time_per_iteration": 2.722573757171631 + }, + { + "auxiliary_loss_clip": 0.01018674, + "auxiliary_loss_mlp": 0.01032494, + "balance_loss_clip": 1.02197456, + "balance_loss_mlp": 1.01966095, + "epoch": 0.8200210431384338, + "flos": 26470150617600.0, + "grad_norm": 2.020141053778561, + "language_loss": 0.79417109, + "learning_rate": 3.3018255714555564e-07, + "loss": 0.81468272, + "num_input_tokens_seen": 294220385, + "step": 13639, + "time_per_iteration": 2.7608230113983154 + }, + { + "auxiliary_loss_clip": 0.01019076, + "auxiliary_loss_mlp": 0.01026342, + "balance_loss_clip": 1.02182448, + "balance_loss_mlp": 1.01552331, + "epoch": 0.8200811663911017, + "flos": 22091454979200.0, + "grad_norm": 1.5982764556459368, + "language_loss": 0.792225, + "learning_rate": 3.299682336022589e-07, + "loss": 0.81267917, + "num_input_tokens_seen": 294239355, + "step": 13640, + "time_per_iteration": 2.7189722061157227 + }, + { + "auxiliary_loss_clip": 0.01027227, + "auxiliary_loss_mlp": 0.01032054, + "balance_loss_clip": 1.02166522, + "balance_loss_mlp": 1.02112305, + "epoch": 0.8201412896437698, + "flos": 37593659520000.0, + "grad_norm": 1.8626931787667094, + "language_loss": 0.63131392, + "learning_rate": 3.297539733867336e-07, + "loss": 0.65190673, + "num_input_tokens_seen": 294259395, + "step": 13641, + "time_per_iteration": 2.8318939208984375 + }, + { + "auxiliary_loss_clip": 0.01017478, + "auxiliary_loss_mlp": 0.01028313, + "balance_loss_clip": 1.02407384, + "balance_loss_mlp": 1.01710784, + "epoch": 0.8202014128964377, + "flos": 19646333389440.0, + "grad_norm": 1.7650275214030193, + "language_loss": 0.73520851, + "learning_rate": 3.295397765071055e-07, + "loss": 0.75566649, + "num_input_tokens_seen": 294277365, + "step": 13642, + "time_per_iteration": 2.7393319606781006 + }, + { + "auxiliary_loss_clip": 0.01043742, + "auxiliary_loss_mlp": 0.01030361, + "balance_loss_clip": 1.02632833, + "balance_loss_mlp": 1.02012694, + "epoch": 0.8202615361491057, + "flos": 31467335564160.0, + "grad_norm": 1.6353596982426133, + "language_loss": 0.70302403, + "learning_rate": 3.2932564297149615e-07, + "loss": 0.72376513, + "num_input_tokens_seen": 294297555, + "step": 13643, + "time_per_iteration": 2.919959545135498 + }, + { + "auxiliary_loss_clip": 0.0105136, + "auxiliary_loss_mlp": 0.01028096, + "balance_loss_clip": 1.02531695, + "balance_loss_mlp": 1.01789212, + "epoch": 0.8203216594017736, + "flos": 24715555061760.0, + "grad_norm": 1.6367587201767115, + "language_loss": 0.65238214, + "learning_rate": 3.291115727880256e-07, + "loss": 0.67317671, + "num_input_tokens_seen": 294317600, + "step": 13644, + "time_per_iteration": 2.710907220840454 + }, + { + "auxiliary_loss_clip": 0.01023684, + "auxiliary_loss_mlp": 0.01029981, + "balance_loss_clip": 1.02375174, + "balance_loss_mlp": 1.01969361, + "epoch": 0.8203817826544416, + "flos": 26031824951040.0, + "grad_norm": 1.420127760318771, + "language_loss": 0.70777738, + "learning_rate": 3.2889756596481234e-07, + "loss": 0.72831398, + "num_input_tokens_seen": 294340215, + "step": 13645, + "time_per_iteration": 2.7238690853118896 + }, + { + "auxiliary_loss_clip": 0.01041259, + "auxiliary_loss_mlp": 0.01026535, + "balance_loss_clip": 1.02554083, + "balance_loss_mlp": 1.01628888, + "epoch": 0.8204419059071095, + "flos": 25954544839680.0, + "grad_norm": 2.2859831670840456, + "language_loss": 0.71517026, + "learning_rate": 3.286836225099707e-07, + "loss": 0.73584819, + "num_input_tokens_seen": 294358590, + "step": 13646, + "time_per_iteration": 2.688321352005005 + }, + { + "auxiliary_loss_clip": 0.0104109, + "auxiliary_loss_mlp": 0.01028682, + "balance_loss_clip": 1.02448535, + "balance_loss_mlp": 1.01787019, + "epoch": 0.8205020291597775, + "flos": 23580059345280.0, + "grad_norm": 2.1003515705628484, + "language_loss": 0.78540349, + "learning_rate": 3.284697424316132e-07, + "loss": 0.8061012, + "num_input_tokens_seen": 294375825, + "step": 13647, + "time_per_iteration": 2.691154718399048 + }, + { + "auxiliary_loss_clip": 0.01059994, + "auxiliary_loss_mlp": 0.01028244, + "balance_loss_clip": 1.02552044, + "balance_loss_mlp": 1.01848114, + "epoch": 0.8205621524124456, + "flos": 26799164219520.0, + "grad_norm": 1.4259935010822626, + "language_loss": 0.67977732, + "learning_rate": 3.2825592573785034e-07, + "loss": 0.70065975, + "num_input_tokens_seen": 294398500, + "step": 13648, + "time_per_iteration": 2.675875663757324 + }, + { + "auxiliary_loss_clip": 0.01033382, + "auxiliary_loss_mlp": 0.01024056, + "balance_loss_clip": 1.02107739, + "balance_loss_mlp": 1.01329112, + "epoch": 0.8206222756651135, + "flos": 27527863432320.0, + "grad_norm": 1.7588389012707264, + "language_loss": 0.80565512, + "learning_rate": 3.28042172436791e-07, + "loss": 0.82622945, + "num_input_tokens_seen": 294418840, + "step": 13649, + "time_per_iteration": 2.913309335708618 + }, + { + "auxiliary_loss_clip": 0.01044496, + "auxiliary_loss_mlp": 0.01030384, + "balance_loss_clip": 1.02596152, + "balance_loss_mlp": 1.01916623, + "epoch": 0.8206823989177815, + "flos": 21178605715200.0, + "grad_norm": 1.58420816460914, + "language_loss": 0.68970734, + "learning_rate": 3.278284825365396e-07, + "loss": 0.71045613, + "num_input_tokens_seen": 294438215, + "step": 13650, + "time_per_iteration": 4.5688934326171875 + }, + { + "auxiliary_loss_clip": 0.01045795, + "auxiliary_loss_mlp": 0.01026827, + "balance_loss_clip": 1.02656054, + "balance_loss_mlp": 1.01549613, + "epoch": 0.8207425221704494, + "flos": 11509622150400.0, + "grad_norm": 2.1063783547937582, + "language_loss": 0.60134852, + "learning_rate": 3.276148560452001e-07, + "loss": 0.62207472, + "num_input_tokens_seen": 294455260, + "step": 13651, + "time_per_iteration": 2.665187358856201 + }, + { + "auxiliary_loss_clip": 0.01026876, + "auxiliary_loss_mlp": 0.00747638, + "balance_loss_clip": 1.02380037, + "balance_loss_mlp": 1.00037408, + "epoch": 0.8208026454231174, + "flos": 19791987039360.0, + "grad_norm": 2.2901609211287175, + "language_loss": 0.7212922, + "learning_rate": 3.2740129297087293e-07, + "loss": 0.73903733, + "num_input_tokens_seen": 294473205, + "step": 13652, + "time_per_iteration": 2.725879669189453 + }, + { + "auxiliary_loss_clip": 0.01038902, + "auxiliary_loss_mlp": 0.01026261, + "balance_loss_clip": 1.02472138, + "balance_loss_mlp": 1.01708186, + "epoch": 0.8208627686757853, + "flos": 15667538843520.0, + "grad_norm": 1.720394317536929, + "language_loss": 0.72860253, + "learning_rate": 3.271877933216558e-07, + "loss": 0.74925411, + "num_input_tokens_seen": 294490645, + "step": 13653, + "time_per_iteration": 2.6618494987487793 + }, + { + "auxiliary_loss_clip": 0.01022731, + "auxiliary_loss_mlp": 0.01029111, + "balance_loss_clip": 1.02388942, + "balance_loss_mlp": 1.01737463, + "epoch": 0.8209228919284534, + "flos": 37482659516160.0, + "grad_norm": 3.7034192059503366, + "language_loss": 0.6270408, + "learning_rate": 3.269743571056451e-07, + "loss": 0.64755929, + "num_input_tokens_seen": 294513500, + "step": 13654, + "time_per_iteration": 2.9022629261016846 + }, + { + "auxiliary_loss_clip": 0.01033504, + "auxiliary_loss_mlp": 0.01023374, + "balance_loss_clip": 1.02381861, + "balance_loss_mlp": 1.01347995, + "epoch": 0.8209830151811213, + "flos": 23112969863040.0, + "grad_norm": 1.4278825998735885, + "language_loss": 0.69715792, + "learning_rate": 3.2676098433093447e-07, + "loss": 0.71772665, + "num_input_tokens_seen": 294535710, + "step": 13655, + "time_per_iteration": 2.7348625659942627 + }, + { + "auxiliary_loss_clip": 0.01042361, + "auxiliary_loss_mlp": 0.01031471, + "balance_loss_clip": 1.02567148, + "balance_loss_mlp": 1.02114749, + "epoch": 0.8210431384337893, + "flos": 21288169175040.0, + "grad_norm": 1.9150844648407728, + "language_loss": 0.8178333, + "learning_rate": 3.265476750056162e-07, + "loss": 0.83857161, + "num_input_tokens_seen": 294554055, + "step": 13656, + "time_per_iteration": 2.6626365184783936 + }, + { + "auxiliary_loss_clip": 0.01039277, + "auxiliary_loss_mlp": 0.01028676, + "balance_loss_clip": 1.0248152, + "balance_loss_mlp": 1.0182631, + "epoch": 0.8211032616864572, + "flos": 11502403516800.0, + "grad_norm": 2.360038065369338, + "language_loss": 0.73979652, + "learning_rate": 3.2633442913777654e-07, + "loss": 0.76047605, + "num_input_tokens_seen": 294570390, + "step": 13657, + "time_per_iteration": 2.6280517578125 + }, + { + "auxiliary_loss_clip": 0.01028327, + "auxiliary_loss_mlp": 0.01028175, + "balance_loss_clip": 1.02219176, + "balance_loss_mlp": 1.01801288, + "epoch": 0.8211633849391252, + "flos": 29821477455360.0, + "grad_norm": 1.9695356936028547, + "language_loss": 0.55501592, + "learning_rate": 3.2612124673550325e-07, + "loss": 0.57558095, + "num_input_tokens_seen": 294593050, + "step": 13658, + "time_per_iteration": 2.7243635654449463 + }, + { + "auxiliary_loss_clip": 0.01000048, + "auxiliary_loss_mlp": 0.01030575, + "balance_loss_clip": 1.02212167, + "balance_loss_mlp": 1.02023375, + "epoch": 0.8212235081917931, + "flos": 13115439573120.0, + "grad_norm": 2.7770514113274687, + "language_loss": 0.79437864, + "learning_rate": 3.259081278068805e-07, + "loss": 0.81468487, + "num_input_tokens_seen": 294608550, + "step": 13659, + "time_per_iteration": 2.8248748779296875 + }, + { + "auxiliary_loss_clip": 0.01046329, + "auxiliary_loss_mlp": 0.01023095, + "balance_loss_clip": 1.02291822, + "balance_loss_mlp": 1.01416659, + "epoch": 0.8212836314444611, + "flos": 40515351782400.0, + "grad_norm": 1.4863327213501867, + "language_loss": 0.59887069, + "learning_rate": 3.256950723599887e-07, + "loss": 0.61956489, + "num_input_tokens_seen": 294630380, + "step": 13660, + "time_per_iteration": 2.867931604385376 + }, + { + "auxiliary_loss_clip": 0.01046185, + "auxiliary_loss_mlp": 0.01029818, + "balance_loss_clip": 1.02322221, + "balance_loss_mlp": 1.01862419, + "epoch": 0.8213437546971292, + "flos": 18770543982720.0, + "grad_norm": 1.8353630833954642, + "language_loss": 0.72495031, + "learning_rate": 3.254820804029075e-07, + "loss": 0.74571037, + "num_input_tokens_seen": 294648655, + "step": 13661, + "time_per_iteration": 2.7554163932800293 + }, + { + "auxiliary_loss_clip": 0.01043797, + "auxiliary_loss_mlp": 0.01031603, + "balance_loss_clip": 1.02338922, + "balance_loss_mlp": 1.0208205, + "epoch": 0.8214038779497971, + "flos": 19682279925120.0, + "grad_norm": 2.4312016169785426, + "language_loss": 0.75023651, + "learning_rate": 3.252691519437143e-07, + "loss": 0.77099055, + "num_input_tokens_seen": 294666915, + "step": 13662, + "time_per_iteration": 2.578233003616333 + }, + { + "auxiliary_loss_clip": 0.01007557, + "auxiliary_loss_mlp": 0.01000531, + "balance_loss_clip": 1.00230408, + "balance_loss_mlp": 0.99963111, + "epoch": 0.8214640012024651, + "flos": 71602969697280.0, + "grad_norm": 0.7437622005888421, + "language_loss": 0.54054737, + "learning_rate": 3.250562869904825e-07, + "loss": 0.56062829, + "num_input_tokens_seen": 294731545, + "step": 13663, + "time_per_iteration": 4.975616216659546 + }, + { + "auxiliary_loss_clip": 0.01012565, + "auxiliary_loss_mlp": 0.01028024, + "balance_loss_clip": 1.0203824, + "balance_loss_mlp": 1.01735473, + "epoch": 0.821524124455133, + "flos": 14757203531520.0, + "grad_norm": 2.5564615687934356, + "language_loss": 0.65425742, + "learning_rate": 3.248434855512838e-07, + "loss": 0.67466331, + "num_input_tokens_seen": 294748745, + "step": 13664, + "time_per_iteration": 2.661177396774292 + }, + { + "auxiliary_loss_clip": 0.01035337, + "auxiliary_loss_mlp": 0.01026454, + "balance_loss_clip": 1.02318525, + "balance_loss_mlp": 1.01717365, + "epoch": 0.821584247707801, + "flos": 25082274965760.0, + "grad_norm": 1.5813199786282521, + "language_loss": 0.75054866, + "learning_rate": 3.246307476341881e-07, + "loss": 0.77116656, + "num_input_tokens_seen": 294768955, + "step": 13665, + "time_per_iteration": 2.75138258934021 + }, + { + "auxiliary_loss_clip": 0.01053786, + "auxiliary_loss_mlp": 0.00747733, + "balance_loss_clip": 1.02626824, + "balance_loss_mlp": 1.00041342, + "epoch": 0.8216443709604689, + "flos": 36830701710720.0, + "grad_norm": 1.985913456256623, + "language_loss": 0.65108037, + "learning_rate": 3.2441807324726256e-07, + "loss": 0.66909564, + "num_input_tokens_seen": 294789250, + "step": 13666, + "time_per_iteration": 2.691107749938965 + }, + { + "auxiliary_loss_clip": 0.01019936, + "auxiliary_loss_mlp": 0.01028049, + "balance_loss_clip": 1.02432525, + "balance_loss_mlp": 1.01836371, + "epoch": 0.821704494213137, + "flos": 25081808088960.0, + "grad_norm": 1.6388652768400331, + "language_loss": 0.77208984, + "learning_rate": 3.2420546239857174e-07, + "loss": 0.7925697, + "num_input_tokens_seen": 294809760, + "step": 13667, + "time_per_iteration": 2.775285005569458 + }, + { + "auxiliary_loss_clip": 0.01033267, + "auxiliary_loss_mlp": 0.01033568, + "balance_loss_clip": 1.02501726, + "balance_loss_mlp": 1.02328587, + "epoch": 0.8217646174658049, + "flos": 14356117290240.0, + "grad_norm": 1.922385923228633, + "language_loss": 0.77172661, + "learning_rate": 3.239929150961773e-07, + "loss": 0.792395, + "num_input_tokens_seen": 294826495, + "step": 13668, + "time_per_iteration": 2.6590473651885986 + }, + { + "auxiliary_loss_clip": 0.01019584, + "auxiliary_loss_mlp": 0.01027552, + "balance_loss_clip": 1.02340484, + "balance_loss_mlp": 1.01777053, + "epoch": 0.8218247407184729, + "flos": 22090557139200.0, + "grad_norm": 2.750598742043329, + "language_loss": 0.73439437, + "learning_rate": 3.2378043134813984e-07, + "loss": 0.75486577, + "num_input_tokens_seen": 294845370, + "step": 13669, + "time_per_iteration": 2.7929131984710693 + }, + { + "auxiliary_loss_clip": 0.01049894, + "auxiliary_loss_mlp": 0.01023353, + "balance_loss_clip": 1.02384198, + "balance_loss_mlp": 1.01366138, + "epoch": 0.8218848639711408, + "flos": 16764035368320.0, + "grad_norm": 1.7782518325020948, + "language_loss": 0.79148459, + "learning_rate": 3.235680111625161e-07, + "loss": 0.81221712, + "num_input_tokens_seen": 294863740, + "step": 13670, + "time_per_iteration": 2.6018364429473877 + }, + { + "auxiliary_loss_clip": 0.01054171, + "auxiliary_loss_mlp": 0.01035442, + "balance_loss_clip": 1.02552104, + "balance_loss_mlp": 1.0247426, + "epoch": 0.8219449872238088, + "flos": 25994801007360.0, + "grad_norm": 8.44763064103336, + "language_loss": 0.74980724, + "learning_rate": 3.2335565454736123e-07, + "loss": 0.77070343, + "num_input_tokens_seen": 294882815, + "step": 13671, + "time_per_iteration": 2.7119383811950684 + }, + { + "auxiliary_loss_clip": 0.01055687, + "auxiliary_loss_mlp": 0.0102956, + "balance_loss_clip": 1.02547479, + "balance_loss_mlp": 1.01838374, + "epoch": 0.8220051104764767, + "flos": 20778094091520.0, + "grad_norm": 1.755760538486745, + "language_loss": 0.76422071, + "learning_rate": 3.23143361510728e-07, + "loss": 0.78507316, + "num_input_tokens_seen": 294901985, + "step": 13672, + "time_per_iteration": 2.6391749382019043 + }, + { + "auxiliary_loss_clip": 0.01017474, + "auxiliary_loss_mlp": 0.0103722, + "balance_loss_clip": 1.02265048, + "balance_loss_mlp": 1.02522206, + "epoch": 0.8220652337291448, + "flos": 14574849160320.0, + "grad_norm": 2.0633086739111364, + "language_loss": 0.74807417, + "learning_rate": 3.2293113206066733e-07, + "loss": 0.76862109, + "num_input_tokens_seen": 294919705, + "step": 13673, + "time_per_iteration": 2.7798237800598145 + }, + { + "auxiliary_loss_clip": 0.01043306, + "auxiliary_loss_mlp": 0.01029202, + "balance_loss_clip": 1.02534437, + "balance_loss_mlp": 1.01824093, + "epoch": 0.8221253569818128, + "flos": 23805866194560.0, + "grad_norm": 1.714397164833982, + "language_loss": 0.79305953, + "learning_rate": 3.227189662052254e-07, + "loss": 0.8137846, + "num_input_tokens_seen": 294939900, + "step": 13674, + "time_per_iteration": 2.679588794708252 + }, + { + "auxiliary_loss_clip": 0.01035128, + "auxiliary_loss_mlp": 0.01028286, + "balance_loss_clip": 1.02146637, + "balance_loss_mlp": 1.0182308, + "epoch": 0.8221854802344807, + "flos": 21288241002240.0, + "grad_norm": 1.8754080889411022, + "language_loss": 0.70151198, + "learning_rate": 3.225068639524484e-07, + "loss": 0.72214609, + "num_input_tokens_seen": 294959110, + "step": 13675, + "time_per_iteration": 2.6183502674102783 + }, + { + "auxiliary_loss_clip": 0.01043056, + "auxiliary_loss_mlp": 0.01032095, + "balance_loss_clip": 1.0236237, + "balance_loss_mlp": 1.02190304, + "epoch": 0.8222456034871487, + "flos": 20956785275520.0, + "grad_norm": 1.5325257800537917, + "language_loss": 0.74488366, + "learning_rate": 3.2229482531037965e-07, + "loss": 0.76563519, + "num_input_tokens_seen": 294978660, + "step": 13676, + "time_per_iteration": 2.60764741897583 + }, + { + "auxiliary_loss_clip": 0.01041376, + "auxiliary_loss_mlp": 0.01028049, + "balance_loss_clip": 1.02473879, + "balance_loss_mlp": 1.01818442, + "epoch": 0.8223057267398166, + "flos": 21397517153280.0, + "grad_norm": 1.7229925913398645, + "language_loss": 0.80656993, + "learning_rate": 3.2208285028705893e-07, + "loss": 0.82726425, + "num_input_tokens_seen": 294998075, + "step": 13677, + "time_per_iteration": 2.6652379035949707 + }, + { + "auxiliary_loss_clip": 0.01045323, + "auxiliary_loss_mlp": 0.01031868, + "balance_loss_clip": 1.023628, + "balance_loss_mlp": 1.0210495, + "epoch": 0.8223658499924846, + "flos": 15268212368640.0, + "grad_norm": 1.7596430493968815, + "language_loss": 0.69938767, + "learning_rate": 3.218709388905245e-07, + "loss": 0.72015953, + "num_input_tokens_seen": 295015950, + "step": 13678, + "time_per_iteration": 2.5843746662139893 + }, + { + "auxiliary_loss_clip": 0.0106116, + "auxiliary_loss_mlp": 0.01030144, + "balance_loss_clip": 1.02460206, + "balance_loss_mlp": 1.02017784, + "epoch": 0.8224259732451525, + "flos": 31249537447680.0, + "grad_norm": 1.5682365753954282, + "language_loss": 0.71045953, + "learning_rate": 3.216590911288133e-07, + "loss": 0.73137259, + "num_input_tokens_seen": 295036800, + "step": 13679, + "time_per_iteration": 2.6258018016815186 + }, + { + "auxiliary_loss_clip": 0.01034381, + "auxiliary_loss_mlp": 0.01030136, + "balance_loss_clip": 1.02204454, + "balance_loss_mlp": 1.01906133, + "epoch": 0.8224860964978206, + "flos": 21574628138880.0, + "grad_norm": 1.9356190317361017, + "language_loss": 0.69534516, + "learning_rate": 3.214473070099564e-07, + "loss": 0.7159903, + "num_input_tokens_seen": 295055300, + "step": 13680, + "time_per_iteration": 2.633310556411743 + }, + { + "auxiliary_loss_clip": 0.01031534, + "auxiliary_loss_mlp": 0.01027306, + "balance_loss_clip": 1.0243907, + "balance_loss_mlp": 1.01781118, + "epoch": 0.8225462197504885, + "flos": 25483217552640.0, + "grad_norm": 1.8233580243189098, + "language_loss": 0.59972179, + "learning_rate": 3.21235586541986e-07, + "loss": 0.62031019, + "num_input_tokens_seen": 295076420, + "step": 13681, + "time_per_iteration": 5.9760870933532715 + }, + { + "auxiliary_loss_clip": 0.01036048, + "auxiliary_loss_mlp": 0.01030347, + "balance_loss_clip": 1.02272892, + "balance_loss_mlp": 1.01936805, + "epoch": 0.8226063430031565, + "flos": 39385458587520.0, + "grad_norm": 1.6597413262218264, + "language_loss": 0.69593942, + "learning_rate": 3.2102392973293047e-07, + "loss": 0.7166034, + "num_input_tokens_seen": 295100540, + "step": 13682, + "time_per_iteration": 2.813983201980591 + }, + { + "auxiliary_loss_clip": 0.01062866, + "auxiliary_loss_mlp": 0.0103167, + "balance_loss_clip": 1.02478957, + "balance_loss_mlp": 1.02026796, + "epoch": 0.8226664662558244, + "flos": 22815269942400.0, + "grad_norm": 1.7915872227876426, + "language_loss": 0.79360056, + "learning_rate": 3.20812336590816e-07, + "loss": 0.81454599, + "num_input_tokens_seen": 295120180, + "step": 13683, + "time_per_iteration": 2.6259524822235107 + }, + { + "auxiliary_loss_clip": 0.0105819, + "auxiliary_loss_mlp": 0.01028892, + "balance_loss_clip": 1.02402103, + "balance_loss_mlp": 1.01969528, + "epoch": 0.8227265895084924, + "flos": 25665607837440.0, + "grad_norm": 1.924504981097571, + "language_loss": 0.86425304, + "learning_rate": 3.206008071236661e-07, + "loss": 0.88512391, + "num_input_tokens_seen": 295138530, + "step": 13684, + "time_per_iteration": 2.6643006801605225 + }, + { + "auxiliary_loss_clip": 0.01058594, + "auxiliary_loss_mlp": 0.01023734, + "balance_loss_clip": 1.02382398, + "balance_loss_mlp": 1.01366091, + "epoch": 0.8227867127611603, + "flos": 26179274280960.0, + "grad_norm": 1.5398757957300357, + "language_loss": 0.79844666, + "learning_rate": 3.2038934133950157e-07, + "loss": 0.81926996, + "num_input_tokens_seen": 295160260, + "step": 13685, + "time_per_iteration": 2.742701292037964 + }, + { + "auxiliary_loss_clip": 0.01030932, + "auxiliary_loss_mlp": 0.01027273, + "balance_loss_clip": 1.02386868, + "balance_loss_mlp": 1.01645494, + "epoch": 0.8228468360138284, + "flos": 22018053536640.0, + "grad_norm": 1.5956130381773594, + "language_loss": 0.69014418, + "learning_rate": 3.2017793924634194e-07, + "loss": 0.71072626, + "num_input_tokens_seen": 295177055, + "step": 13686, + "time_per_iteration": 2.817230224609375 + }, + { + "auxiliary_loss_clip": 0.01032458, + "auxiliary_loss_mlp": 0.01031776, + "balance_loss_clip": 1.02360332, + "balance_loss_mlp": 1.02127981, + "epoch": 0.8229069592664963, + "flos": 14903359971840.0, + "grad_norm": 2.1605913192747135, + "language_loss": 0.78262311, + "learning_rate": 3.1996660085220263e-07, + "loss": 0.80326539, + "num_input_tokens_seen": 295193870, + "step": 13687, + "time_per_iteration": 2.731088161468506 + }, + { + "auxiliary_loss_clip": 0.01050672, + "auxiliary_loss_mlp": 0.01028888, + "balance_loss_clip": 1.02365565, + "balance_loss_mlp": 1.01834428, + "epoch": 0.8229670825191643, + "flos": 15669478177920.0, + "grad_norm": 3.2325067305882125, + "language_loss": 0.72094691, + "learning_rate": 3.1975532616509825e-07, + "loss": 0.74174249, + "num_input_tokens_seen": 295211040, + "step": 13688, + "time_per_iteration": 2.633502244949341 + }, + { + "auxiliary_loss_clip": 0.01061787, + "auxiliary_loss_mlp": 0.00747548, + "balance_loss_clip": 1.02562428, + "balance_loss_mlp": 1.0003407, + "epoch": 0.8230272057718323, + "flos": 23183498217600.0, + "grad_norm": 2.1699619211113474, + "language_loss": 0.7337749, + "learning_rate": 3.1954411519304025e-07, + "loss": 0.75186819, + "num_input_tokens_seen": 295231300, + "step": 13689, + "time_per_iteration": 2.5896472930908203 + }, + { + "auxiliary_loss_clip": 0.01051018, + "auxiliary_loss_mlp": 0.01029312, + "balance_loss_clip": 1.02354372, + "balance_loss_mlp": 1.01849413, + "epoch": 0.8230873290245002, + "flos": 21032413361280.0, + "grad_norm": 1.9097382276518011, + "language_loss": 0.689147, + "learning_rate": 3.1933296794403887e-07, + "loss": 0.70995033, + "num_input_tokens_seen": 295251045, + "step": 13690, + "time_per_iteration": 2.6399428844451904 + }, + { + "auxiliary_loss_clip": 0.01011985, + "auxiliary_loss_mlp": 0.01036284, + "balance_loss_clip": 1.02096462, + "balance_loss_mlp": 1.02442908, + "epoch": 0.8231474522771682, + "flos": 21250139650560.0, + "grad_norm": 2.5411582091552067, + "language_loss": 0.85231328, + "learning_rate": 3.191218844260988e-07, + "loss": 0.87279594, + "num_input_tokens_seen": 295270225, + "step": 13691, + "time_per_iteration": 2.776036262512207 + }, + { + "auxiliary_loss_clip": 0.01054741, + "auxiliary_loss_mlp": 0.01030134, + "balance_loss_clip": 1.02634168, + "balance_loss_mlp": 1.02028155, + "epoch": 0.8232075755298361, + "flos": 23842028211840.0, + "grad_norm": 2.6521917076501302, + "language_loss": 0.76905072, + "learning_rate": 3.189108646472252e-07, + "loss": 0.78989941, + "num_input_tokens_seen": 295288950, + "step": 13692, + "time_per_iteration": 2.6046199798583984 + }, + { + "auxiliary_loss_clip": 0.01050041, + "auxiliary_loss_mlp": 0.01024602, + "balance_loss_clip": 1.02409339, + "balance_loss_mlp": 1.01449251, + "epoch": 0.8232676987825042, + "flos": 21653955325440.0, + "grad_norm": 1.5476071324375618, + "language_loss": 0.71479034, + "learning_rate": 3.186999086154205e-07, + "loss": 0.73553681, + "num_input_tokens_seen": 295309405, + "step": 13693, + "time_per_iteration": 2.7355458736419678 + }, + { + "auxiliary_loss_clip": 0.01022716, + "auxiliary_loss_mlp": 0.01028855, + "balance_loss_clip": 1.02084517, + "balance_loss_mlp": 1.01946735, + "epoch": 0.8233278220351721, + "flos": 26322701287680.0, + "grad_norm": 1.3831574121285386, + "language_loss": 0.83777207, + "learning_rate": 3.1848901633868355e-07, + "loss": 0.85828781, + "num_input_tokens_seen": 295331115, + "step": 13694, + "time_per_iteration": 2.7148025035858154 + }, + { + "auxiliary_loss_clip": 0.01015799, + "auxiliary_loss_mlp": 0.01027992, + "balance_loss_clip": 1.02682304, + "balance_loss_mlp": 1.01679206, + "epoch": 0.8233879452878401, + "flos": 21725812483200.0, + "grad_norm": 1.7107830167579092, + "language_loss": 0.77015126, + "learning_rate": 3.182781878250118e-07, + "loss": 0.79058921, + "num_input_tokens_seen": 295350495, + "step": 13695, + "time_per_iteration": 2.804316282272339 + }, + { + "auxiliary_loss_clip": 0.01041128, + "auxiliary_loss_mlp": 0.01031683, + "balance_loss_clip": 1.02511394, + "balance_loss_mlp": 1.02186024, + "epoch": 0.823448068540508, + "flos": 20557746109440.0, + "grad_norm": 1.8844533603325697, + "language_loss": 0.80979037, + "learning_rate": 3.1806742308239985e-07, + "loss": 0.83051848, + "num_input_tokens_seen": 295368225, + "step": 13696, + "time_per_iteration": 2.618267774581909 + }, + { + "auxiliary_loss_clip": 0.00997048, + "auxiliary_loss_mlp": 0.01000058, + "balance_loss_clip": 1.00137496, + "balance_loss_mlp": 0.99921131, + "epoch": 0.823508191793176, + "flos": 67273688194560.0, + "grad_norm": 0.7624363459059081, + "language_loss": 0.63852555, + "learning_rate": 3.178567221188393e-07, + "loss": 0.65849662, + "num_input_tokens_seen": 295430035, + "step": 13697, + "time_per_iteration": 3.228553533554077 + }, + { + "auxiliary_loss_clip": 0.0102756, + "auxiliary_loss_mlp": 0.01021862, + "balance_loss_clip": 1.02304935, + "balance_loss_mlp": 1.01267695, + "epoch": 0.8235683150458439, + "flos": 17928402641280.0, + "grad_norm": 1.9639377530808082, + "language_loss": 0.73059344, + "learning_rate": 3.1764608494232037e-07, + "loss": 0.75108773, + "num_input_tokens_seen": 295447765, + "step": 13698, + "time_per_iteration": 4.636750221252441 + }, + { + "auxiliary_loss_clip": 0.01011317, + "auxiliary_loss_mlp": 0.01034306, + "balance_loss_clip": 1.01882672, + "balance_loss_mlp": 1.02121711, + "epoch": 0.823628438298512, + "flos": 18916089891840.0, + "grad_norm": 1.7971492190666867, + "language_loss": 0.71929795, + "learning_rate": 3.174355115608305e-07, + "loss": 0.7397542, + "num_input_tokens_seen": 295464810, + "step": 13699, + "time_per_iteration": 2.9296276569366455 + }, + { + "auxiliary_loss_clip": 0.01041408, + "auxiliary_loss_mlp": 0.01025519, + "balance_loss_clip": 1.02551782, + "balance_loss_mlp": 1.01562488, + "epoch": 0.8236885615511799, + "flos": 18696460181760.0, + "grad_norm": 1.9185201952995516, + "language_loss": 0.81903934, + "learning_rate": 3.1722500198235526e-07, + "loss": 0.83970863, + "num_input_tokens_seen": 295482605, + "step": 13700, + "time_per_iteration": 2.7690632343292236 + }, + { + "auxiliary_loss_clip": 0.01041184, + "auxiliary_loss_mlp": 0.01035037, + "balance_loss_clip": 1.02363157, + "balance_loss_mlp": 1.02490973, + "epoch": 0.8237486848038479, + "flos": 23695009845120.0, + "grad_norm": 1.6128816453765378, + "language_loss": 0.72649741, + "learning_rate": 3.170145562148763e-07, + "loss": 0.74725968, + "num_input_tokens_seen": 295503780, + "step": 13701, + "time_per_iteration": 2.791707992553711 + }, + { + "auxiliary_loss_clip": 0.01050927, + "auxiliary_loss_mlp": 0.01030631, + "balance_loss_clip": 1.0227685, + "balance_loss_mlp": 1.02014613, + "epoch": 0.8238088080565159, + "flos": 23441301106560.0, + "grad_norm": 1.848869732421822, + "language_loss": 0.6917274, + "learning_rate": 3.1680417426637384e-07, + "loss": 0.71254301, + "num_input_tokens_seen": 295522035, + "step": 13702, + "time_per_iteration": 2.788670539855957 + }, + { + "auxiliary_loss_clip": 0.01019452, + "auxiliary_loss_mlp": 0.0103071, + "balance_loss_clip": 1.02449834, + "balance_loss_mlp": 1.01990354, + "epoch": 0.8238689313091838, + "flos": 22746537267840.0, + "grad_norm": 1.7637294668992767, + "language_loss": 0.75023669, + "learning_rate": 3.1659385614482603e-07, + "loss": 0.77073836, + "num_input_tokens_seen": 295541190, + "step": 13703, + "time_per_iteration": 2.946643829345703 + }, + { + "auxiliary_loss_clip": 0.01063797, + "auxiliary_loss_mlp": 0.01032052, + "balance_loss_clip": 1.02427721, + "balance_loss_mlp": 1.02082276, + "epoch": 0.8239290545618518, + "flos": 25630092264960.0, + "grad_norm": 1.7924044646776822, + "language_loss": 0.69965708, + "learning_rate": 3.1638360185820755e-07, + "loss": 0.72061557, + "num_input_tokens_seen": 295558860, + "step": 13704, + "time_per_iteration": 2.599252939224243 + }, + { + "auxiliary_loss_clip": 0.01059314, + "auxiliary_loss_mlp": 0.01027006, + "balance_loss_clip": 1.02289176, + "balance_loss_mlp": 1.01674783, + "epoch": 0.8239891778145197, + "flos": 26026473824640.0, + "grad_norm": 2.6654232533779334, + "language_loss": 0.6401329, + "learning_rate": 3.161734114144916e-07, + "loss": 0.6609962, + "num_input_tokens_seen": 295578155, + "step": 13705, + "time_per_iteration": 2.6367037296295166 + }, + { + "auxiliary_loss_clip": 0.01062278, + "auxiliary_loss_mlp": 0.01029883, + "balance_loss_clip": 1.02419746, + "balance_loss_mlp": 1.01876628, + "epoch": 0.8240493010671878, + "flos": 21833257040640.0, + "grad_norm": 1.521584117522585, + "language_loss": 0.69400513, + "learning_rate": 3.1596328482164915e-07, + "loss": 0.71492672, + "num_input_tokens_seen": 295599170, + "step": 13706, + "time_per_iteration": 2.5658645629882812 + }, + { + "auxiliary_loss_clip": 0.01035052, + "auxiliary_loss_mlp": 0.01034003, + "balance_loss_clip": 1.02425981, + "balance_loss_mlp": 1.02270794, + "epoch": 0.8241094243198557, + "flos": 18551919853440.0, + "grad_norm": 1.6348415616100562, + "language_loss": 0.69866753, + "learning_rate": 3.157532220876475e-07, + "loss": 0.71935809, + "num_input_tokens_seen": 295617465, + "step": 13707, + "time_per_iteration": 2.6875054836273193 + }, + { + "auxiliary_loss_clip": 0.01027595, + "auxiliary_loss_mlp": 0.0102965, + "balance_loss_clip": 1.02340353, + "balance_loss_mlp": 1.01804543, + "epoch": 0.8241695475725237, + "flos": 25447163276160.0, + "grad_norm": 1.9720726155979411, + "language_loss": 0.78970766, + "learning_rate": 3.1554322322045226e-07, + "loss": 0.81028008, + "num_input_tokens_seen": 295634960, + "step": 13708, + "time_per_iteration": 2.704789400100708 + }, + { + "auxiliary_loss_clip": 0.01042937, + "auxiliary_loss_mlp": 0.01027038, + "balance_loss_clip": 1.02186763, + "balance_loss_mlp": 1.01577878, + "epoch": 0.8242296708251916, + "flos": 18989670902400.0, + "grad_norm": 2.0261353111218856, + "language_loss": 0.68809283, + "learning_rate": 3.1533328822802664e-07, + "loss": 0.70879257, + "num_input_tokens_seen": 295652725, + "step": 13709, + "time_per_iteration": 2.7171380519866943 + }, + { + "auxiliary_loss_clip": 0.01014781, + "auxiliary_loss_mlp": 0.01029721, + "balance_loss_clip": 1.02018583, + "balance_loss_mlp": 1.01980901, + "epoch": 0.8242897940778596, + "flos": 22600883617920.0, + "grad_norm": 2.3866341249234746, + "language_loss": 0.82381946, + "learning_rate": 3.151234171183319e-07, + "loss": 0.84426451, + "num_input_tokens_seen": 295671195, + "step": 13710, + "time_per_iteration": 4.623475551605225 + }, + { + "auxiliary_loss_clip": 0.0105137, + "auxiliary_loss_mlp": 0.01029733, + "balance_loss_clip": 1.02425909, + "balance_loss_mlp": 1.01905787, + "epoch": 0.8243499173305275, + "flos": 21468153248640.0, + "grad_norm": 1.8663289940347618, + "language_loss": 0.78683448, + "learning_rate": 3.149136098993257e-07, + "loss": 0.8076455, + "num_input_tokens_seen": 295689130, + "step": 13711, + "time_per_iteration": 2.668116569519043 + }, + { + "auxiliary_loss_clip": 0.01025675, + "auxiliary_loss_mlp": 0.01026208, + "balance_loss_clip": 1.02161264, + "balance_loss_mlp": 1.01558638, + "epoch": 0.8244100405831956, + "flos": 20010359773440.0, + "grad_norm": 1.691807805633522, + "language_loss": 0.65527993, + "learning_rate": 3.1470386657896473e-07, + "loss": 0.67579877, + "num_input_tokens_seen": 295706385, + "step": 13712, + "time_per_iteration": 2.6727821826934814 + }, + { + "auxiliary_loss_clip": 0.01041633, + "auxiliary_loss_mlp": 0.01027337, + "balance_loss_clip": 1.02375388, + "balance_loss_mlp": 1.01732397, + "epoch": 0.8244701638358635, + "flos": 26430684549120.0, + "grad_norm": 1.7382621442569492, + "language_loss": 0.74200553, + "learning_rate": 3.14494187165202e-07, + "loss": 0.76269519, + "num_input_tokens_seen": 295727925, + "step": 13713, + "time_per_iteration": 2.6632843017578125 + }, + { + "auxiliary_loss_clip": 0.01050887, + "auxiliary_loss_mlp": 0.01023795, + "balance_loss_clip": 1.02375531, + "balance_loss_mlp": 1.01378179, + "epoch": 0.8245302870885315, + "flos": 17640004343040.0, + "grad_norm": 2.29328367002977, + "language_loss": 0.81262672, + "learning_rate": 3.1428457166598833e-07, + "loss": 0.83337361, + "num_input_tokens_seen": 295744420, + "step": 13714, + "time_per_iteration": 2.7291455268859863 + }, + { + "auxiliary_loss_clip": 0.01053524, + "auxiliary_loss_mlp": 0.0103304, + "balance_loss_clip": 1.02691722, + "balance_loss_mlp": 1.02181661, + "epoch": 0.8245904103411995, + "flos": 26209510554240.0, + "grad_norm": 1.720436848163961, + "language_loss": 0.66811049, + "learning_rate": 3.1407502008927235e-07, + "loss": 0.68897611, + "num_input_tokens_seen": 295765105, + "step": 13715, + "time_per_iteration": 2.7533674240112305 + }, + { + "auxiliary_loss_clip": 0.01033086, + "auxiliary_loss_mlp": 0.01029614, + "balance_loss_clip": 1.02588308, + "balance_loss_mlp": 1.01902866, + "epoch": 0.8246505335938674, + "flos": 24205084928640.0, + "grad_norm": 2.377800783140673, + "language_loss": 0.74852616, + "learning_rate": 3.1386553244300086e-07, + "loss": 0.76915318, + "num_input_tokens_seen": 295784200, + "step": 13716, + "time_per_iteration": 2.6940722465515137 + }, + { + "auxiliary_loss_clip": 0.00967118, + "auxiliary_loss_mlp": 0.01000876, + "balance_loss_clip": 1.00369716, + "balance_loss_mlp": 0.99986285, + "epoch": 0.8247106568465354, + "flos": 67092195749760.0, + "grad_norm": 0.7264387637487045, + "language_loss": 0.58931398, + "learning_rate": 3.136561087351175e-07, + "loss": 0.60899401, + "num_input_tokens_seen": 295846555, + "step": 13717, + "time_per_iteration": 3.372781753540039 + }, + { + "auxiliary_loss_clip": 0.01046416, + "auxiliary_loss_mlp": 0.00747464, + "balance_loss_clip": 1.02457118, + "balance_loss_mlp": 1.00031638, + "epoch": 0.8247707800992033, + "flos": 12568232805120.0, + "grad_norm": 1.9548370766654626, + "language_loss": 0.79480147, + "learning_rate": 3.1344674897356373e-07, + "loss": 0.81274033, + "num_input_tokens_seen": 295863425, + "step": 13718, + "time_per_iteration": 2.612032175064087 + }, + { + "auxiliary_loss_clip": 0.01039928, + "auxiliary_loss_mlp": 0.01027337, + "balance_loss_clip": 1.02346349, + "balance_loss_mlp": 1.01734161, + "epoch": 0.8248309033518714, + "flos": 15923617879680.0, + "grad_norm": 1.5799975900136405, + "language_loss": 0.68351108, + "learning_rate": 3.132374531662778e-07, + "loss": 0.7041837, + "num_input_tokens_seen": 295880925, + "step": 13719, + "time_per_iteration": 2.5729176998138428 + }, + { + "auxiliary_loss_clip": 0.01036396, + "auxiliary_loss_mlp": 0.01029564, + "balance_loss_clip": 1.02236247, + "balance_loss_mlp": 1.01727331, + "epoch": 0.8248910266045393, + "flos": 17564735393280.0, + "grad_norm": 6.204319207575934, + "language_loss": 0.69804251, + "learning_rate": 3.13028221321197e-07, + "loss": 0.71870208, + "num_input_tokens_seen": 295898205, + "step": 13720, + "time_per_iteration": 2.6270034313201904 + }, + { + "auxiliary_loss_clip": 0.00995552, + "auxiliary_loss_mlp": 0.01024528, + "balance_loss_clip": 1.02586389, + "balance_loss_mlp": 1.01407337, + "epoch": 0.8249511498572073, + "flos": 28619655275520.0, + "grad_norm": 1.5463254731439489, + "language_loss": 0.75989628, + "learning_rate": 3.1281905344625467e-07, + "loss": 0.78009713, + "num_input_tokens_seen": 295918130, + "step": 13721, + "time_per_iteration": 2.8426201343536377 + }, + { + "auxiliary_loss_clip": 0.01018534, + "auxiliary_loss_mlp": 0.01024247, + "balance_loss_clip": 1.02925134, + "balance_loss_mlp": 1.01410282, + "epoch": 0.8250112731098752, + "flos": 25556583081600.0, + "grad_norm": 1.7538082128749353, + "language_loss": 0.77489531, + "learning_rate": 3.1260994954938305e-07, + "loss": 0.79532313, + "num_input_tokens_seen": 295937760, + "step": 13722, + "time_per_iteration": 2.9703400135040283 + }, + { + "auxiliary_loss_clip": 0.01061605, + "auxiliary_loss_mlp": 0.01026539, + "balance_loss_clip": 1.02573621, + "balance_loss_mlp": 1.01690662, + "epoch": 0.8250713963625432, + "flos": 27746164339200.0, + "grad_norm": 1.8519678354816498, + "language_loss": 0.62580431, + "learning_rate": 3.1240090963851205e-07, + "loss": 0.64668578, + "num_input_tokens_seen": 295957585, + "step": 13723, + "time_per_iteration": 2.697526454925537 + }, + { + "auxiliary_loss_clip": 0.01063265, + "auxiliary_loss_mlp": 0.01029343, + "balance_loss_clip": 1.02559328, + "balance_loss_mlp": 1.01874542, + "epoch": 0.8251315196152111, + "flos": 21610610588160.0, + "grad_norm": 1.5162964323205617, + "language_loss": 0.74471402, + "learning_rate": 3.121919337215666e-07, + "loss": 0.76564014, + "num_input_tokens_seen": 295977135, + "step": 13724, + "time_per_iteration": 2.7346410751342773 + }, + { + "auxiliary_loss_clip": 0.01028824, + "auxiliary_loss_mlp": 0.01029914, + "balance_loss_clip": 1.02331853, + "balance_loss_mlp": 1.01885724, + "epoch": 0.8251916428678792, + "flos": 28579363194240.0, + "grad_norm": 1.8666739644071204, + "language_loss": 0.64604253, + "learning_rate": 3.1198302180647253e-07, + "loss": 0.66662991, + "num_input_tokens_seen": 295996265, + "step": 13725, + "time_per_iteration": 2.8297762870788574 + }, + { + "auxiliary_loss_clip": 0.01032694, + "auxiliary_loss_mlp": 0.01024813, + "balance_loss_clip": 1.02143264, + "balance_loss_mlp": 1.01450729, + "epoch": 0.8252517661205471, + "flos": 23075191733760.0, + "grad_norm": 1.5365356114272788, + "language_loss": 0.81742787, + "learning_rate": 3.1177417390115125e-07, + "loss": 0.83800298, + "num_input_tokens_seen": 296014745, + "step": 13726, + "time_per_iteration": 2.723803758621216 + }, + { + "auxiliary_loss_clip": 0.01042373, + "auxiliary_loss_mlp": 0.01027609, + "balance_loss_clip": 1.01999462, + "balance_loss_mlp": 1.01788771, + "epoch": 0.8253118893732151, + "flos": 31759576617600.0, + "grad_norm": 1.5575626117646042, + "language_loss": 0.70364892, + "learning_rate": 3.1156539001352286e-07, + "loss": 0.72434872, + "num_input_tokens_seen": 296036960, + "step": 13727, + "time_per_iteration": 2.7452518939971924 + }, + { + "auxiliary_loss_clip": 0.01053237, + "auxiliary_loss_mlp": 0.01027839, + "balance_loss_clip": 1.02503419, + "balance_loss_mlp": 1.01688361, + "epoch": 0.8253720126258831, + "flos": 18296415434880.0, + "grad_norm": 2.250672609054617, + "language_loss": 0.62817198, + "learning_rate": 3.113566701515036e-07, + "loss": 0.64898276, + "num_input_tokens_seen": 296056540, + "step": 13728, + "time_per_iteration": 4.327796697616577 + }, + { + "auxiliary_loss_clip": 0.01046304, + "auxiliary_loss_mlp": 0.01027404, + "balance_loss_clip": 1.02650774, + "balance_loss_mlp": 1.01683569, + "epoch": 0.825432135878551, + "flos": 26797332625920.0, + "grad_norm": 1.644501871782893, + "language_loss": 0.71326393, + "learning_rate": 3.111480143230092e-07, + "loss": 0.73400104, + "num_input_tokens_seen": 296077950, + "step": 13729, + "time_per_iteration": 4.354540586471558 + }, + { + "auxiliary_loss_clip": 0.009888, + "auxiliary_loss_mlp": 0.01002065, + "balance_loss_clip": 1.00287557, + "balance_loss_mlp": 1.00121295, + "epoch": 0.825492259131219, + "flos": 54219116217600.0, + "grad_norm": 1.165614893261506, + "language_loss": 0.62715769, + "learning_rate": 3.109394225359514e-07, + "loss": 0.64706635, + "num_input_tokens_seen": 296127060, + "step": 13730, + "time_per_iteration": 3.1698384284973145 + }, + { + "auxiliary_loss_clip": 0.01013897, + "auxiliary_loss_mlp": 0.01034232, + "balance_loss_clip": 1.0234108, + "balance_loss_mlp": 1.02321708, + "epoch": 0.825552382383887, + "flos": 43756145493120.0, + "grad_norm": 2.3324631771745716, + "language_loss": 0.63039494, + "learning_rate": 3.1073089479823945e-07, + "loss": 0.65087628, + "num_input_tokens_seen": 296147775, + "step": 13731, + "time_per_iteration": 2.8838865756988525 + }, + { + "auxiliary_loss_clip": 0.01019483, + "auxiliary_loss_mlp": 0.00747731, + "balance_loss_clip": 1.01966429, + "balance_loss_mlp": 1.00044382, + "epoch": 0.825612505636555, + "flos": 12602814624000.0, + "grad_norm": 2.0605829330218204, + "language_loss": 0.70016575, + "learning_rate": 3.105224311177812e-07, + "loss": 0.71783793, + "num_input_tokens_seen": 296163560, + "step": 13732, + "time_per_iteration": 2.683166742324829 + }, + { + "auxiliary_loss_clip": 0.0105398, + "auxiliary_loss_mlp": 0.01033379, + "balance_loss_clip": 1.02471066, + "balance_loss_mlp": 1.0223999, + "epoch": 0.8256726288892229, + "flos": 17595618111360.0, + "grad_norm": 2.3177307019249027, + "language_loss": 0.7097615, + "learning_rate": 3.103140315024817e-07, + "loss": 0.73063517, + "num_input_tokens_seen": 296178730, + "step": 13733, + "time_per_iteration": 2.606804370880127 + }, + { + "auxiliary_loss_clip": 0.01058361, + "auxiliary_loss_mlp": 0.01028607, + "balance_loss_clip": 1.02310002, + "balance_loss_mlp": 1.01833749, + "epoch": 0.8257327521418909, + "flos": 23805794367360.0, + "grad_norm": 1.423097697496915, + "language_loss": 0.82275259, + "learning_rate": 3.1010569596024437e-07, + "loss": 0.84362221, + "num_input_tokens_seen": 296200175, + "step": 13734, + "time_per_iteration": 2.6581709384918213 + }, + { + "auxiliary_loss_clip": 0.01030067, + "auxiliary_loss_mlp": 0.01028372, + "balance_loss_clip": 1.02102709, + "balance_loss_mlp": 1.01780999, + "epoch": 0.8257928753945588, + "flos": 19281121856640.0, + "grad_norm": 1.848633996366454, + "language_loss": 0.82710153, + "learning_rate": 3.098974244989676e-07, + "loss": 0.84768593, + "num_input_tokens_seen": 296219305, + "step": 13735, + "time_per_iteration": 2.728036403656006 + }, + { + "auxiliary_loss_clip": 0.01055814, + "auxiliary_loss_mlp": 0.01025493, + "balance_loss_clip": 1.02737737, + "balance_loss_mlp": 1.01626635, + "epoch": 0.8258529986472268, + "flos": 18478841633280.0, + "grad_norm": 1.8603095420047444, + "language_loss": 0.70666277, + "learning_rate": 3.096892171265497e-07, + "loss": 0.72747588, + "num_input_tokens_seen": 296236945, + "step": 13736, + "time_per_iteration": 2.8066792488098145 + }, + { + "auxiliary_loss_clip": 0.00997869, + "auxiliary_loss_mlp": 0.01002512, + "balance_loss_clip": 1.00255203, + "balance_loss_mlp": 1.00161791, + "epoch": 0.8259131218998947, + "flos": 62137957512960.0, + "grad_norm": 0.9667236582186046, + "language_loss": 0.67978853, + "learning_rate": 3.0948107385088665e-07, + "loss": 0.69979239, + "num_input_tokens_seen": 296294685, + "step": 13737, + "time_per_iteration": 3.135263442993164 + }, + { + "auxiliary_loss_clip": 0.01036993, + "auxiliary_loss_mlp": 0.0102953, + "balance_loss_clip": 1.0221175, + "balance_loss_mlp": 1.01971269, + "epoch": 0.8259732451525628, + "flos": 22159038418560.0, + "grad_norm": 2.055813934076619, + "language_loss": 0.69422662, + "learning_rate": 3.0927299467987e-07, + "loss": 0.71489185, + "num_input_tokens_seen": 296314790, + "step": 13738, + "time_per_iteration": 2.8130431175231934 + }, + { + "auxiliary_loss_clip": 0.01045484, + "auxiliary_loss_mlp": 0.01030769, + "balance_loss_clip": 1.02623975, + "balance_loss_mlp": 1.01892555, + "epoch": 0.8260333684052307, + "flos": 38361645233280.0, + "grad_norm": 2.60367759558749, + "language_loss": 0.63081169, + "learning_rate": 3.090649796213911e-07, + "loss": 0.65157425, + "num_input_tokens_seen": 296335355, + "step": 13739, + "time_per_iteration": 2.886481285095215 + }, + { + "auxiliary_loss_clip": 0.00988429, + "auxiliary_loss_mlp": 0.01002002, + "balance_loss_clip": 1.00285685, + "balance_loss_mlp": 1.00117385, + "epoch": 0.8260934916578987, + "flos": 62185611882240.0, + "grad_norm": 0.8293979911325614, + "language_loss": 0.59355056, + "learning_rate": 3.0885702868333853e-07, + "loss": 0.61345482, + "num_input_tokens_seen": 296399885, + "step": 13740, + "time_per_iteration": 3.2805392742156982 + }, + { + "auxiliary_loss_clip": 0.01066063, + "auxiliary_loss_mlp": 0.01028556, + "balance_loss_clip": 1.02619195, + "balance_loss_mlp": 1.01684356, + "epoch": 0.8261536149105667, + "flos": 22565475786240.0, + "grad_norm": 1.9265668824056823, + "language_loss": 0.75258183, + "learning_rate": 3.086491418735959e-07, + "loss": 0.77352798, + "num_input_tokens_seen": 296417660, + "step": 13741, + "time_per_iteration": 2.552332878112793 + }, + { + "auxiliary_loss_clip": 0.01051021, + "auxiliary_loss_mlp": 0.01026995, + "balance_loss_clip": 1.02441239, + "balance_loss_mlp": 1.01705885, + "epoch": 0.8262137381632346, + "flos": 32525479342080.0, + "grad_norm": 2.2751446406031537, + "language_loss": 0.62204051, + "learning_rate": 3.0844131920004726e-07, + "loss": 0.6428206, + "num_input_tokens_seen": 296438255, + "step": 13742, + "time_per_iteration": 2.7246224880218506 + }, + { + "auxiliary_loss_clip": 0.01023152, + "auxiliary_loss_mlp": 0.01034914, + "balance_loss_clip": 1.02467465, + "balance_loss_mlp": 1.02253962, + "epoch": 0.8262738614159026, + "flos": 14136451666560.0, + "grad_norm": 2.437848656545222, + "language_loss": 0.66115338, + "learning_rate": 3.0823356067057327e-07, + "loss": 0.68173409, + "num_input_tokens_seen": 296454485, + "step": 13743, + "time_per_iteration": 2.728543996810913 + }, + { + "auxiliary_loss_clip": 0.01038446, + "auxiliary_loss_mlp": 0.01031073, + "balance_loss_clip": 1.02274394, + "balance_loss_mlp": 1.0201714, + "epoch": 0.8263339846685706, + "flos": 19825347795840.0, + "grad_norm": 1.8709234146496216, + "language_loss": 0.6665765, + "learning_rate": 3.0802586629305283e-07, + "loss": 0.68727171, + "num_input_tokens_seen": 296473740, + "step": 13744, + "time_per_iteration": 4.491167306900024 + }, + { + "auxiliary_loss_clip": 0.0103371, + "auxiliary_loss_mlp": 0.01029173, + "balance_loss_clip": 1.02568483, + "balance_loss_mlp": 1.01932025, + "epoch": 0.8263941079212386, + "flos": 22745962650240.0, + "grad_norm": 1.7414702084587488, + "language_loss": 0.75416982, + "learning_rate": 3.078182360753612e-07, + "loss": 0.77479863, + "num_input_tokens_seen": 296493355, + "step": 13745, + "time_per_iteration": 2.895913600921631 + }, + { + "auxiliary_loss_clip": 0.01033789, + "auxiliary_loss_mlp": 0.00747615, + "balance_loss_clip": 1.02281547, + "balance_loss_mlp": 1.00036407, + "epoch": 0.8264542311739065, + "flos": 20120641505280.0, + "grad_norm": 2.1010501933459116, + "language_loss": 0.78740144, + "learning_rate": 3.076106700253709e-07, + "loss": 0.80521548, + "num_input_tokens_seen": 296510520, + "step": 13746, + "time_per_iteration": 2.92665958404541 + }, + { + "auxiliary_loss_clip": 0.01056305, + "auxiliary_loss_mlp": 0.01031017, + "balance_loss_clip": 1.02695811, + "balance_loss_mlp": 1.0198946, + "epoch": 0.8265143544265745, + "flos": 16837149502080.0, + "grad_norm": 2.7360565852294383, + "language_loss": 0.68393445, + "learning_rate": 3.0740316815095415e-07, + "loss": 0.70480764, + "num_input_tokens_seen": 296528265, + "step": 13747, + "time_per_iteration": 2.792121171951294 + }, + { + "auxiliary_loss_clip": 0.01046305, + "auxiliary_loss_mlp": 0.01036896, + "balance_loss_clip": 1.02246547, + "balance_loss_mlp": 1.02461135, + "epoch": 0.8265744776792424, + "flos": 22018592240640.0, + "grad_norm": 3.889467235839489, + "language_loss": 0.75518024, + "learning_rate": 3.0719573045997835e-07, + "loss": 0.77601218, + "num_input_tokens_seen": 296547810, + "step": 13748, + "time_per_iteration": 2.6871583461761475 + }, + { + "auxiliary_loss_clip": 0.0102977, + "auxiliary_loss_mlp": 0.01033562, + "balance_loss_clip": 1.02507377, + "balance_loss_mlp": 1.02430582, + "epoch": 0.8266346009319104, + "flos": 19244852098560.0, + "grad_norm": 1.9878499016652786, + "language_loss": 0.63851404, + "learning_rate": 3.069883569603102e-07, + "loss": 0.65914732, + "num_input_tokens_seen": 296565940, + "step": 13749, + "time_per_iteration": 2.7565572261810303 + }, + { + "auxiliary_loss_clip": 0.01036569, + "auxiliary_loss_mlp": 0.01027077, + "balance_loss_clip": 1.02157521, + "balance_loss_mlp": 1.01756394, + "epoch": 0.8266947241845783, + "flos": 24166768095360.0, + "grad_norm": 2.0181024934929224, + "language_loss": 0.73616779, + "learning_rate": 3.067810476598132e-07, + "loss": 0.75680423, + "num_input_tokens_seen": 296585090, + "step": 13750, + "time_per_iteration": 2.727031707763672 + }, + { + "auxiliary_loss_clip": 0.01051929, + "auxiliary_loss_mlp": 0.01031808, + "balance_loss_clip": 1.02551425, + "balance_loss_mlp": 1.02140105, + "epoch": 0.8267548474372464, + "flos": 21105814803840.0, + "grad_norm": 2.2778610098085217, + "language_loss": 0.65672904, + "learning_rate": 3.065738025663496e-07, + "loss": 0.67756641, + "num_input_tokens_seen": 296604950, + "step": 13751, + "time_per_iteration": 2.6447246074676514 + }, + { + "auxiliary_loss_clip": 0.01032079, + "auxiliary_loss_mlp": 0.01028249, + "balance_loss_clip": 1.01997614, + "balance_loss_mlp": 1.01886117, + "epoch": 0.8268149706899143, + "flos": 39968288668800.0, + "grad_norm": 1.5868890636271475, + "language_loss": 0.6091522, + "learning_rate": 3.0636662168777607e-07, + "loss": 0.6297555, + "num_input_tokens_seen": 296627780, + "step": 13752, + "time_per_iteration": 2.836127996444702 + }, + { + "auxiliary_loss_clip": 0.00997033, + "auxiliary_loss_mlp": 0.01001301, + "balance_loss_clip": 1.00183916, + "balance_loss_mlp": 1.00046062, + "epoch": 0.8268750939425823, + "flos": 65782423244160.0, + "grad_norm": 0.7778135283952392, + "language_loss": 0.57457554, + "learning_rate": 3.0615950503194986e-07, + "loss": 0.59455884, + "num_input_tokens_seen": 296683850, + "step": 13753, + "time_per_iteration": 3.2240772247314453 + }, + { + "auxiliary_loss_clip": 0.00961565, + "auxiliary_loss_mlp": 0.00746679, + "balance_loss_clip": 1.00620508, + "balance_loss_mlp": 1.00065446, + "epoch": 0.8269352171952503, + "flos": 52981455242880.0, + "grad_norm": 0.7010692111870922, + "language_loss": 0.54975915, + "learning_rate": 3.0595245260672563e-07, + "loss": 0.5668416, + "num_input_tokens_seen": 296741420, + "step": 13754, + "time_per_iteration": 3.42203426361084 + }, + { + "auxiliary_loss_clip": 0.01019629, + "auxiliary_loss_mlp": 0.01032369, + "balance_loss_clip": 1.01993442, + "balance_loss_mlp": 1.02291608, + "epoch": 0.8269953404479182, + "flos": 23076125487360.0, + "grad_norm": 1.8970206434979708, + "language_loss": 0.69050109, + "learning_rate": 3.0574546441995354e-07, + "loss": 0.71102107, + "num_input_tokens_seen": 296759620, + "step": 13755, + "time_per_iteration": 2.836690664291382 + }, + { + "auxiliary_loss_clip": 0.0101806, + "auxiliary_loss_mlp": 0.01026725, + "balance_loss_clip": 1.0232929, + "balance_loss_mlp": 1.0170151, + "epoch": 0.8270554637005862, + "flos": 14209996763520.0, + "grad_norm": 1.9533667258229677, + "language_loss": 0.70042926, + "learning_rate": 3.0553854047948324e-07, + "loss": 0.72087711, + "num_input_tokens_seen": 296777275, + "step": 13756, + "time_per_iteration": 2.848069190979004 + }, + { + "auxiliary_loss_clip": 0.01054022, + "auxiliary_loss_mlp": 0.01032182, + "balance_loss_clip": 1.02653575, + "balance_loss_mlp": 1.02185202, + "epoch": 0.8271155869532542, + "flos": 21762046327680.0, + "grad_norm": 2.359738709471985, + "language_loss": 0.72620797, + "learning_rate": 3.053316807931623e-07, + "loss": 0.74706995, + "num_input_tokens_seen": 296796655, + "step": 13757, + "time_per_iteration": 2.652174472808838 + }, + { + "auxiliary_loss_clip": 0.01053631, + "auxiliary_loss_mlp": 0.0102915, + "balance_loss_clip": 1.0249722, + "balance_loss_mlp": 1.01733673, + "epoch": 0.8271757102059222, + "flos": 15120475729920.0, + "grad_norm": 2.0356830674306714, + "language_loss": 0.69465905, + "learning_rate": 3.0512488536883283e-07, + "loss": 0.71548682, + "num_input_tokens_seen": 296813705, + "step": 13758, + "time_per_iteration": 4.6738152503967285 + }, + { + "auxiliary_loss_clip": 0.01037132, + "auxiliary_loss_mlp": 0.010241, + "balance_loss_clip": 1.02240419, + "balance_loss_mlp": 1.01426542, + "epoch": 0.8272358334585901, + "flos": 24133730561280.0, + "grad_norm": 1.9545720741959811, + "language_loss": 0.70026624, + "learning_rate": 3.0491815421433775e-07, + "loss": 0.72087854, + "num_input_tokens_seen": 296833985, + "step": 13759, + "time_per_iteration": 2.731525182723999 + }, + { + "auxiliary_loss_clip": 0.01042624, + "auxiliary_loss_mlp": 0.01030364, + "balance_loss_clip": 1.02639174, + "balance_loss_mlp": 1.01998663, + "epoch": 0.8272959567112581, + "flos": 18990712396800.0, + "grad_norm": 1.701690401712203, + "language_loss": 0.70962614, + "learning_rate": 3.047114873375161e-07, + "loss": 0.73035598, + "num_input_tokens_seen": 296850150, + "step": 13760, + "time_per_iteration": 2.677216053009033 + }, + { + "auxiliary_loss_clip": 0.01016229, + "auxiliary_loss_mlp": 0.0102487, + "balance_loss_clip": 1.02198839, + "balance_loss_mlp": 1.01476669, + "epoch": 0.827356079963926, + "flos": 20631614428800.0, + "grad_norm": 1.718586990908923, + "language_loss": 0.77732182, + "learning_rate": 3.0450488474620505e-07, + "loss": 0.79773283, + "num_input_tokens_seen": 296869585, + "step": 13761, + "time_per_iteration": 2.8191583156585693 + }, + { + "auxiliary_loss_clip": 0.01030193, + "auxiliary_loss_mlp": 0.01028113, + "balance_loss_clip": 1.02504253, + "balance_loss_mlp": 1.01862383, + "epoch": 0.827416203216594, + "flos": 22416625825920.0, + "grad_norm": 1.6834480166360368, + "language_loss": 0.69916934, + "learning_rate": 3.042983464482387e-07, + "loss": 0.71975243, + "num_input_tokens_seen": 296887710, + "step": 13762, + "time_per_iteration": 2.78633975982666 + }, + { + "auxiliary_loss_clip": 0.01015221, + "auxiliary_loss_mlp": 0.01024971, + "balance_loss_clip": 1.02354288, + "balance_loss_mlp": 1.01486158, + "epoch": 0.827476326469262, + "flos": 19026192055680.0, + "grad_norm": 1.8699945901335469, + "language_loss": 0.69832754, + "learning_rate": 3.0409187245144853e-07, + "loss": 0.7187295, + "num_input_tokens_seen": 296906265, + "step": 13763, + "time_per_iteration": 2.795600414276123 + }, + { + "auxiliary_loss_clip": 0.00988708, + "auxiliary_loss_mlp": 0.01001517, + "balance_loss_clip": 1.01181841, + "balance_loss_mlp": 1.00054586, + "epoch": 0.82753644972193, + "flos": 68500575089280.0, + "grad_norm": 0.8424238819695186, + "language_loss": 0.65186113, + "learning_rate": 3.038854627636651e-07, + "loss": 0.67176342, + "num_input_tokens_seen": 296971290, + "step": 13764, + "time_per_iteration": 3.392045736312866 + }, + { + "auxiliary_loss_clip": 0.01053782, + "auxiliary_loss_mlp": 0.01029975, + "balance_loss_clip": 1.02605176, + "balance_loss_mlp": 1.01931739, + "epoch": 0.8275965729745979, + "flos": 18405404277120.0, + "grad_norm": 2.152851282342262, + "language_loss": 0.77922195, + "learning_rate": 3.0367911739271423e-07, + "loss": 0.80005956, + "num_input_tokens_seen": 296989060, + "step": 13765, + "time_per_iteration": 2.602905035018921 + }, + { + "auxiliary_loss_clip": 0.01006635, + "auxiliary_loss_mlp": 0.01028525, + "balance_loss_clip": 1.02162564, + "balance_loss_mlp": 1.01697373, + "epoch": 0.8276566962272659, + "flos": 28512067063680.0, + "grad_norm": 1.5802726024534883, + "language_loss": 0.62197411, + "learning_rate": 3.034728363464214e-07, + "loss": 0.64232576, + "num_input_tokens_seen": 297011300, + "step": 13766, + "time_per_iteration": 2.8894152641296387 + }, + { + "auxiliary_loss_clip": 0.01030984, + "auxiliary_loss_mlp": 0.01031154, + "balance_loss_clip": 1.02321362, + "balance_loss_mlp": 1.02055049, + "epoch": 0.8277168194799339, + "flos": 20230240878720.0, + "grad_norm": 1.7478188016503509, + "language_loss": 0.82477468, + "learning_rate": 3.03266619632609e-07, + "loss": 0.84539604, + "num_input_tokens_seen": 297030350, + "step": 13767, + "time_per_iteration": 2.723792552947998 + }, + { + "auxiliary_loss_clip": 0.01046443, + "auxiliary_loss_mlp": 0.01030951, + "balance_loss_clip": 1.02844167, + "balance_loss_mlp": 1.02037764, + "epoch": 0.8277769427326018, + "flos": 28476623318400.0, + "grad_norm": 1.6322348280979047, + "language_loss": 0.69164026, + "learning_rate": 3.030604672590964e-07, + "loss": 0.71241415, + "num_input_tokens_seen": 297049710, + "step": 13768, + "time_per_iteration": 2.844207286834717 + }, + { + "auxiliary_loss_clip": 0.00989404, + "auxiliary_loss_mlp": 0.01029584, + "balance_loss_clip": 1.01831603, + "balance_loss_mlp": 1.01902843, + "epoch": 0.8278370659852698, + "flos": 27197628768000.0, + "grad_norm": 1.717021019444298, + "language_loss": 0.74219966, + "learning_rate": 3.028543792337006e-07, + "loss": 0.76238954, + "num_input_tokens_seen": 297070510, + "step": 13769, + "time_per_iteration": 2.9184179306030273 + }, + { + "auxiliary_loss_clip": 0.01040784, + "auxiliary_loss_mlp": 0.0102826, + "balance_loss_clip": 1.02353549, + "balance_loss_mlp": 1.01794267, + "epoch": 0.8278971892379378, + "flos": 37816126404480.0, + "grad_norm": 1.816878707261284, + "language_loss": 0.73810339, + "learning_rate": 3.0264835556423675e-07, + "loss": 0.75879389, + "num_input_tokens_seen": 297092585, + "step": 13770, + "time_per_iteration": 3.004669666290283 + }, + { + "auxiliary_loss_clip": 0.0103306, + "auxiliary_loss_mlp": 0.01027645, + "balance_loss_clip": 1.02404976, + "balance_loss_mlp": 1.01679659, + "epoch": 0.8279573124906058, + "flos": 22560160573440.0, + "grad_norm": 1.9826865941900993, + "language_loss": 0.75750792, + "learning_rate": 3.0244239625851785e-07, + "loss": 0.77811497, + "num_input_tokens_seen": 297110055, + "step": 13771, + "time_per_iteration": 2.913241386413574 + }, + { + "auxiliary_loss_clip": 0.01060922, + "auxiliary_loss_mlp": 0.01026935, + "balance_loss_clip": 1.02430725, + "balance_loss_mlp": 1.01689124, + "epoch": 0.8280174357432737, + "flos": 36064619418240.0, + "grad_norm": 2.0026101858936416, + "language_loss": 0.72892261, + "learning_rate": 3.0223650132435284e-07, + "loss": 0.74980116, + "num_input_tokens_seen": 297132170, + "step": 13772, + "time_per_iteration": 2.8721888065338135 + }, + { + "auxiliary_loss_clip": 0.01040152, + "auxiliary_loss_mlp": 0.01027026, + "balance_loss_clip": 1.0241549, + "balance_loss_mlp": 1.0165658, + "epoch": 0.8280775589959417, + "flos": 22961067246720.0, + "grad_norm": 2.154093325356248, + "language_loss": 0.74216652, + "learning_rate": 3.0203067076955035e-07, + "loss": 0.76283824, + "num_input_tokens_seen": 297149515, + "step": 13773, + "time_per_iteration": 4.43292498588562 + }, + { + "auxiliary_loss_clip": 0.01020752, + "auxiliary_loss_mlp": 0.01033027, + "balance_loss_clip": 1.02434182, + "balance_loss_mlp": 1.02300191, + "epoch": 0.8281376822486096, + "flos": 26063282286720.0, + "grad_norm": 1.9556617867063408, + "language_loss": 0.75938296, + "learning_rate": 3.01824904601915e-07, + "loss": 0.7799207, + "num_input_tokens_seen": 297170320, + "step": 13774, + "time_per_iteration": 2.981168508529663 + }, + { + "auxiliary_loss_clip": 0.01034677, + "auxiliary_loss_mlp": 0.0074751, + "balance_loss_clip": 1.02626562, + "balance_loss_mlp": 1.00035775, + "epoch": 0.8281978055012776, + "flos": 20667776446080.0, + "grad_norm": 1.594759655389137, + "language_loss": 0.75140595, + "learning_rate": 3.01619202829249e-07, + "loss": 0.76922786, + "num_input_tokens_seen": 297189935, + "step": 13775, + "time_per_iteration": 4.390661954879761 + }, + { + "auxiliary_loss_clip": 0.01063539, + "auxiliary_loss_mlp": 0.0102741, + "balance_loss_clip": 1.0238328, + "balance_loss_mlp": 1.01603699, + "epoch": 0.8282579287539455, + "flos": 29315281040640.0, + "grad_norm": 2.1741591520895196, + "language_loss": 0.73965877, + "learning_rate": 3.01413565459353e-07, + "loss": 0.76056826, + "num_input_tokens_seen": 297210885, + "step": 13776, + "time_per_iteration": 2.746417999267578 + }, + { + "auxiliary_loss_clip": 0.01003676, + "auxiliary_loss_mlp": 0.01027227, + "balance_loss_clip": 1.01805806, + "balance_loss_mlp": 1.0158006, + "epoch": 0.8283180520066136, + "flos": 15706178899200.0, + "grad_norm": 2.1012250350937074, + "language_loss": 0.77452427, + "learning_rate": 3.0120799250002483e-07, + "loss": 0.7948333, + "num_input_tokens_seen": 297228500, + "step": 13777, + "time_per_iteration": 2.967634677886963 + }, + { + "auxiliary_loss_clip": 0.01049944, + "auxiliary_loss_mlp": 0.01025551, + "balance_loss_clip": 1.02523232, + "balance_loss_mlp": 1.01613331, + "epoch": 0.8283781752592815, + "flos": 24791470456320.0, + "grad_norm": 1.4991750836472701, + "language_loss": 0.82288557, + "learning_rate": 3.010024839590604e-07, + "loss": 0.84364051, + "num_input_tokens_seen": 297249470, + "step": 13778, + "time_per_iteration": 2.8899128437042236 + }, + { + "auxiliary_loss_clip": 0.01042289, + "auxiliary_loss_mlp": 0.01020815, + "balance_loss_clip": 1.02108848, + "balance_loss_mlp": 1.01072407, + "epoch": 0.8284382985119495, + "flos": 18982811404800.0, + "grad_norm": 1.7927224316032326, + "language_loss": 0.74375951, + "learning_rate": 3.0079703984425187e-07, + "loss": 0.76439047, + "num_input_tokens_seen": 297265970, + "step": 13779, + "time_per_iteration": 2.8784775733947754 + }, + { + "auxiliary_loss_clip": 0.00978747, + "auxiliary_loss_mlp": 0.01002387, + "balance_loss_clip": 1.00290346, + "balance_loss_mlp": 1.00151694, + "epoch": 0.8284984217646175, + "flos": 61034460814080.0, + "grad_norm": 0.7962372268050625, + "language_loss": 0.56740785, + "learning_rate": 3.0059166016338954e-07, + "loss": 0.58721924, + "num_input_tokens_seen": 297325525, + "step": 13780, + "time_per_iteration": 3.412470579147339 + }, + { + "auxiliary_loss_clip": 0.0102732, + "auxiliary_loss_mlp": 0.01026026, + "balance_loss_clip": 1.02472484, + "balance_loss_mlp": 1.01522565, + "epoch": 0.8285585450172854, + "flos": 19714635100800.0, + "grad_norm": 1.7917100283695417, + "language_loss": 0.79827642, + "learning_rate": 3.0038634492426205e-07, + "loss": 0.81880987, + "num_input_tokens_seen": 297345025, + "step": 13781, + "time_per_iteration": 2.870575428009033 + }, + { + "auxiliary_loss_clip": 0.01021943, + "auxiliary_loss_mlp": 0.01030723, + "balance_loss_clip": 1.02517986, + "balance_loss_mlp": 1.01960135, + "epoch": 0.8286186682699535, + "flos": 21688896280320.0, + "grad_norm": 1.8474539611892276, + "language_loss": 0.75689876, + "learning_rate": 3.001810941346543e-07, + "loss": 0.77742541, + "num_input_tokens_seen": 297363570, + "step": 13782, + "time_per_iteration": 2.794095754623413 + }, + { + "auxiliary_loss_clip": 0.01049367, + "auxiliary_loss_mlp": 0.01026878, + "balance_loss_clip": 1.0227375, + "balance_loss_mlp": 1.01679921, + "epoch": 0.8286787915226214, + "flos": 25775566346880.0, + "grad_norm": 1.5422741342055162, + "language_loss": 0.76132411, + "learning_rate": 2.9997590780234983e-07, + "loss": 0.78208655, + "num_input_tokens_seen": 297385385, + "step": 13783, + "time_per_iteration": 2.748300552368164 + }, + { + "auxiliary_loss_clip": 0.0106195, + "auxiliary_loss_mlp": 0.0102443, + "balance_loss_clip": 1.02503109, + "balance_loss_mlp": 1.01433861, + "epoch": 0.8287389147752894, + "flos": 21288348743040.0, + "grad_norm": 1.5418421948453913, + "language_loss": 0.73914337, + "learning_rate": 2.997707859351304e-07, + "loss": 0.76000714, + "num_input_tokens_seen": 297403950, + "step": 13784, + "time_per_iteration": 2.6170177459716797 + }, + { + "auxiliary_loss_clip": 0.01054362, + "auxiliary_loss_mlp": 0.01032853, + "balance_loss_clip": 1.02461898, + "balance_loss_mlp": 1.02160525, + "epoch": 0.8287990380279573, + "flos": 33544875323520.0, + "grad_norm": 1.741035129287112, + "language_loss": 0.69506365, + "learning_rate": 2.99565728540772e-07, + "loss": 0.71593583, + "num_input_tokens_seen": 297424565, + "step": 13785, + "time_per_iteration": 2.7598767280578613 + }, + { + "auxiliary_loss_clip": 0.01044221, + "auxiliary_loss_mlp": 0.01031815, + "balance_loss_clip": 1.02712345, + "balance_loss_mlp": 1.02151573, + "epoch": 0.8288591612806253, + "flos": 22966346545920.0, + "grad_norm": 1.388811044686552, + "language_loss": 0.68548465, + "learning_rate": 2.993607356270516e-07, + "loss": 0.70624501, + "num_input_tokens_seen": 297445180, + "step": 13786, + "time_per_iteration": 2.6589066982269287 + }, + { + "auxiliary_loss_clip": 0.01023776, + "auxiliary_loss_mlp": 0.01032474, + "balance_loss_clip": 1.02404571, + "balance_loss_mlp": 1.02138114, + "epoch": 0.8289192845332932, + "flos": 18588979710720.0, + "grad_norm": 2.0264776510517453, + "language_loss": 0.77119803, + "learning_rate": 2.991558072017426e-07, + "loss": 0.79176056, + "num_input_tokens_seen": 297463790, + "step": 13787, + "time_per_iteration": 2.645843029022217 + }, + { + "auxiliary_loss_clip": 0.01043859, + "auxiliary_loss_mlp": 0.0103018, + "balance_loss_clip": 1.02515876, + "balance_loss_mlp": 1.02062559, + "epoch": 0.8289794077859612, + "flos": 15450423085440.0, + "grad_norm": 1.6456152008358182, + "language_loss": 0.80389476, + "learning_rate": 2.989509432726163e-07, + "loss": 0.82463515, + "num_input_tokens_seen": 297480100, + "step": 13788, + "time_per_iteration": 2.696707248687744 + }, + { + "auxiliary_loss_clip": 0.0104117, + "auxiliary_loss_mlp": 0.01032147, + "balance_loss_clip": 1.02499068, + "balance_loss_mlp": 1.02214539, + "epoch": 0.8290395310386292, + "flos": 28877853214080.0, + "grad_norm": 1.7162554144903157, + "language_loss": 0.71290386, + "learning_rate": 2.9874614384744014e-07, + "loss": 0.73363703, + "num_input_tokens_seen": 297499890, + "step": 13789, + "time_per_iteration": 2.7445225715637207 + }, + { + "auxiliary_loss_clip": 0.0102494, + "auxiliary_loss_mlp": 0.01028707, + "balance_loss_clip": 1.01967096, + "balance_loss_mlp": 1.01797867, + "epoch": 0.8290996542912972, + "flos": 36576274700160.0, + "grad_norm": 1.6307800832388775, + "language_loss": 0.68370038, + "learning_rate": 2.985414089339813e-07, + "loss": 0.70423687, + "num_input_tokens_seen": 297521440, + "step": 13790, + "time_per_iteration": 2.7462520599365234 + }, + { + "auxiliary_loss_clip": 0.01053593, + "auxiliary_loss_mlp": 0.01024902, + "balance_loss_clip": 1.02498174, + "balance_loss_mlp": 1.01345181, + "epoch": 0.8291597775439651, + "flos": 23623009032960.0, + "grad_norm": 1.7806220670088493, + "language_loss": 0.77523667, + "learning_rate": 2.9833673854000265e-07, + "loss": 0.79602158, + "num_input_tokens_seen": 297539920, + "step": 13791, + "time_per_iteration": 2.5617501735687256 + }, + { + "auxiliary_loss_clip": 0.01033374, + "auxiliary_loss_mlp": 0.01023143, + "balance_loss_clip": 1.02281821, + "balance_loss_mlp": 1.0127064, + "epoch": 0.8292199007966331, + "flos": 21397481239680.0, + "grad_norm": 1.3883733992397955, + "language_loss": 0.69947207, + "learning_rate": 2.981321326732651e-07, + "loss": 0.72003722, + "num_input_tokens_seen": 297560000, + "step": 13792, + "time_per_iteration": 4.410595178604126 + }, + { + "auxiliary_loss_clip": 0.01041009, + "auxiliary_loss_mlp": 0.01030812, + "balance_loss_clip": 1.02406108, + "balance_loss_mlp": 1.02033353, + "epoch": 0.829280024049301, + "flos": 28767607395840.0, + "grad_norm": 1.548463995777138, + "language_loss": 0.64801276, + "learning_rate": 2.9792759134152736e-07, + "loss": 0.66873097, + "num_input_tokens_seen": 297579300, + "step": 13793, + "time_per_iteration": 2.6758577823638916 + }, + { + "auxiliary_loss_clip": 0.01006576, + "auxiliary_loss_mlp": 0.01031607, + "balance_loss_clip": 1.02210689, + "balance_loss_mlp": 1.01943624, + "epoch": 0.829340147301969, + "flos": 19938071652480.0, + "grad_norm": 1.6669161484815354, + "language_loss": 0.66396213, + "learning_rate": 2.977231145525461e-07, + "loss": 0.68434393, + "num_input_tokens_seen": 297598095, + "step": 13794, + "time_per_iteration": 2.7252259254455566 + }, + { + "auxiliary_loss_clip": 0.01060973, + "auxiliary_loss_mlp": 0.01032912, + "balance_loss_clip": 1.02382088, + "balance_loss_mlp": 1.02199864, + "epoch": 0.829400270554637, + "flos": 25228575060480.0, + "grad_norm": 1.880441253765868, + "language_loss": 0.66349941, + "learning_rate": 2.975187023140757e-07, + "loss": 0.68443829, + "num_input_tokens_seen": 297615955, + "step": 13795, + "time_per_iteration": 2.5803446769714355 + }, + { + "auxiliary_loss_clip": 0.00976413, + "auxiliary_loss_mlp": 0.01037359, + "balance_loss_clip": 1.02130938, + "balance_loss_mlp": 1.02525961, + "epoch": 0.829460393807305, + "flos": 24463570176000.0, + "grad_norm": 1.7591112157152733, + "language_loss": 0.66243249, + "learning_rate": 2.973143546338661e-07, + "loss": 0.68257022, + "num_input_tokens_seen": 297636285, + "step": 13796, + "time_per_iteration": 3.0359442234039307 + }, + { + "auxiliary_loss_clip": 0.0101105, + "auxiliary_loss_mlp": 0.01029585, + "balance_loss_clip": 1.02099657, + "balance_loss_mlp": 1.01871276, + "epoch": 0.829520517059973, + "flos": 15122486891520.0, + "grad_norm": 1.7607157479796778, + "language_loss": 0.71544796, + "learning_rate": 2.971100715196666e-07, + "loss": 0.73585439, + "num_input_tokens_seen": 297653315, + "step": 13797, + "time_per_iteration": 2.9382078647613525 + }, + { + "auxiliary_loss_clip": 0.0099591, + "auxiliary_loss_mlp": 0.0103018, + "balance_loss_clip": 1.02358651, + "balance_loss_mlp": 1.01991045, + "epoch": 0.8295806403126409, + "flos": 21579979265280.0, + "grad_norm": 2.2866648304119876, + "language_loss": 0.72131884, + "learning_rate": 2.969058529792243e-07, + "loss": 0.74157977, + "num_input_tokens_seen": 297673480, + "step": 13798, + "time_per_iteration": 2.8739681243896484 + }, + { + "auxiliary_loss_clip": 0.01017945, + "auxiliary_loss_mlp": 0.01028657, + "balance_loss_clip": 1.01899254, + "balance_loss_mlp": 1.01851845, + "epoch": 0.8296407635653089, + "flos": 21726566668800.0, + "grad_norm": 1.7137659822585487, + "language_loss": 0.75935113, + "learning_rate": 2.967016990202822e-07, + "loss": 0.77981716, + "num_input_tokens_seen": 297693250, + "step": 13799, + "time_per_iteration": 2.8167808055877686 + }, + { + "auxiliary_loss_clip": 0.01063184, + "auxiliary_loss_mlp": 0.01032013, + "balance_loss_clip": 1.02702641, + "balance_loss_mlp": 1.02217865, + "epoch": 0.8297008868179768, + "flos": 11181147252480.0, + "grad_norm": 1.742658734286453, + "language_loss": 0.6732651, + "learning_rate": 2.9649760965058245e-07, + "loss": 0.69421709, + "num_input_tokens_seen": 297710975, + "step": 13800, + "time_per_iteration": 2.661558151245117 + }, + { + "auxiliary_loss_clip": 0.01026803, + "auxiliary_loss_mlp": 0.01031119, + "balance_loss_clip": 1.02639031, + "balance_loss_mlp": 1.01977026, + "epoch": 0.8297610100706448, + "flos": 20664041431680.0, + "grad_norm": 7.88002704108163, + "language_loss": 0.74404752, + "learning_rate": 2.9629358487786515e-07, + "loss": 0.76462668, + "num_input_tokens_seen": 297730860, + "step": 13801, + "time_per_iteration": 2.74997615814209 + }, + { + "auxiliary_loss_clip": 0.01020269, + "auxiliary_loss_mlp": 0.01024014, + "balance_loss_clip": 1.02348018, + "balance_loss_mlp": 1.01408386, + "epoch": 0.8298211333233128, + "flos": 20376325491840.0, + "grad_norm": 1.6610549003393191, + "language_loss": 0.73448509, + "learning_rate": 2.9608962470986476e-07, + "loss": 0.75492787, + "num_input_tokens_seen": 297749765, + "step": 13802, + "time_per_iteration": 2.78265643119812 + }, + { + "auxiliary_loss_clip": 0.01038755, + "auxiliary_loss_mlp": 0.01029774, + "balance_loss_clip": 1.02166104, + "balance_loss_mlp": 1.01937926, + "epoch": 0.8298812565759808, + "flos": 21508696725120.0, + "grad_norm": 1.4793259812632167, + "language_loss": 0.74490082, + "learning_rate": 2.9588572915431644e-07, + "loss": 0.76558608, + "num_input_tokens_seen": 297770380, + "step": 13803, + "time_per_iteration": 4.785612344741821 + }, + { + "auxiliary_loss_clip": 0.01051749, + "auxiliary_loss_mlp": 0.01027832, + "balance_loss_clip": 1.0259763, + "balance_loss_mlp": 1.01804543, + "epoch": 0.8299413798286487, + "flos": 22818681734400.0, + "grad_norm": 1.8279155507886353, + "language_loss": 0.79482967, + "learning_rate": 2.9568189821895215e-07, + "loss": 0.81562543, + "num_input_tokens_seen": 297789440, + "step": 13804, + "time_per_iteration": 2.70652174949646 + }, + { + "auxiliary_loss_clip": 0.01060715, + "auxiliary_loss_mlp": 0.01028962, + "balance_loss_clip": 1.02446079, + "balance_loss_mlp": 1.01926482, + "epoch": 0.8300015030813167, + "flos": 29679199683840.0, + "grad_norm": 1.8391480256088921, + "language_loss": 0.73115271, + "learning_rate": 2.954781319115016e-07, + "loss": 0.75204945, + "num_input_tokens_seen": 297810425, + "step": 13805, + "time_per_iteration": 2.6285600662231445 + }, + { + "auxiliary_loss_clip": 0.01052662, + "auxiliary_loss_mlp": 0.00747584, + "balance_loss_clip": 1.02438498, + "balance_loss_mlp": 1.00038564, + "epoch": 0.8300616263339846, + "flos": 19719483436800.0, + "grad_norm": 5.049549452796134, + "language_loss": 0.77299035, + "learning_rate": 2.952744302396906e-07, + "loss": 0.7909928, + "num_input_tokens_seen": 297827680, + "step": 13806, + "time_per_iteration": 2.6004366874694824 + }, + { + "auxiliary_loss_clip": 0.01055468, + "auxiliary_loss_mlp": 0.01028613, + "balance_loss_clip": 1.02630746, + "balance_loss_mlp": 1.01744342, + "epoch": 0.8301217495866526, + "flos": 19901945548800.0, + "grad_norm": 1.643603533993515, + "language_loss": 0.63091969, + "learning_rate": 2.950707932112444e-07, + "loss": 0.65176052, + "num_input_tokens_seen": 297848005, + "step": 13807, + "time_per_iteration": 2.650177478790283 + }, + { + "auxiliary_loss_clip": 0.01053598, + "auxiliary_loss_mlp": 0.01025572, + "balance_loss_clip": 1.02742982, + "balance_loss_mlp": 1.01523662, + "epoch": 0.8301818728393207, + "flos": 19715784336000.0, + "grad_norm": 2.0530821299114277, + "language_loss": 0.73216993, + "learning_rate": 2.948672208338847e-07, + "loss": 0.75296164, + "num_input_tokens_seen": 297866730, + "step": 13808, + "time_per_iteration": 2.611694097518921 + }, + { + "auxiliary_loss_clip": 0.01042086, + "auxiliary_loss_mlp": 0.01034685, + "balance_loss_clip": 1.02618611, + "balance_loss_mlp": 1.02277017, + "epoch": 0.8302419960919886, + "flos": 28293658416000.0, + "grad_norm": 1.8702283522665557, + "language_loss": 0.66482437, + "learning_rate": 2.9466371311533046e-07, + "loss": 0.68559206, + "num_input_tokens_seen": 297886390, + "step": 13809, + "time_per_iteration": 2.6688225269317627 + }, + { + "auxiliary_loss_clip": 0.01062966, + "auxiliary_loss_mlp": 0.0102381, + "balance_loss_clip": 1.02486634, + "balance_loss_mlp": 1.01410019, + "epoch": 0.8303021193446566, + "flos": 18223444955520.0, + "grad_norm": 2.2759691503942845, + "language_loss": 0.73823714, + "learning_rate": 2.9446027006329896e-07, + "loss": 0.75910485, + "num_input_tokens_seen": 297905110, + "step": 13810, + "time_per_iteration": 2.483163595199585 + }, + { + "auxiliary_loss_clip": 0.01029833, + "auxiliary_loss_mlp": 0.01030155, + "balance_loss_clip": 1.02371705, + "balance_loss_mlp": 1.02093434, + "epoch": 0.8303622425973245, + "flos": 23111425578240.0, + "grad_norm": 1.5672546652687456, + "language_loss": 0.81352133, + "learning_rate": 2.94256891685505e-07, + "loss": 0.83412123, + "num_input_tokens_seen": 297925460, + "step": 13811, + "time_per_iteration": 2.701218605041504 + }, + { + "auxiliary_loss_clip": 0.01028228, + "auxiliary_loss_mlp": 0.01039575, + "balance_loss_clip": 1.02427959, + "balance_loss_mlp": 1.02821422, + "epoch": 0.8304223658499925, + "flos": 19572860119680.0, + "grad_norm": 1.9542772712748187, + "language_loss": 0.73381865, + "learning_rate": 2.9405357798966156e-07, + "loss": 0.75449669, + "num_input_tokens_seen": 297941760, + "step": 13812, + "time_per_iteration": 2.6616742610931396 + }, + { + "auxiliary_loss_clip": 0.01043018, + "auxiliary_loss_mlp": 0.01027529, + "balance_loss_clip": 1.02691698, + "balance_loss_mlp": 1.01774776, + "epoch": 0.8304824891026604, + "flos": 24426115269120.0, + "grad_norm": 1.5168638504756282, + "language_loss": 0.78233397, + "learning_rate": 2.9385032898347664e-07, + "loss": 0.80303949, + "num_input_tokens_seen": 297959745, + "step": 13813, + "time_per_iteration": 2.6180479526519775 + }, + { + "auxiliary_loss_clip": 0.0101163, + "auxiliary_loss_mlp": 0.00747706, + "balance_loss_clip": 1.02363205, + "balance_loss_mlp": 1.00038123, + "epoch": 0.8305426123553284, + "flos": 22381792611840.0, + "grad_norm": 2.4067044660169947, + "language_loss": 0.71115327, + "learning_rate": 2.93647144674658e-07, + "loss": 0.72874659, + "num_input_tokens_seen": 297977665, + "step": 13814, + "time_per_iteration": 2.8416526317596436 + }, + { + "auxiliary_loss_clip": 0.01068471, + "auxiliary_loss_mlp": 0.01038129, + "balance_loss_clip": 1.02647412, + "balance_loss_mlp": 1.02554631, + "epoch": 0.8306027356079964, + "flos": 14903575453440.0, + "grad_norm": 2.1678522359158543, + "language_loss": 0.67901033, + "learning_rate": 2.9344402507091116e-07, + "loss": 0.70007634, + "num_input_tokens_seen": 297993525, + "step": 13815, + "time_per_iteration": 2.5597074031829834 + }, + { + "auxiliary_loss_clip": 0.010522, + "auxiliary_loss_mlp": 0.01027364, + "balance_loss_clip": 1.02491522, + "balance_loss_mlp": 1.01732707, + "epoch": 0.8306628588606644, + "flos": 19644573623040.0, + "grad_norm": 2.88210304785957, + "language_loss": 0.75452185, + "learning_rate": 2.9324097017993745e-07, + "loss": 0.77531749, + "num_input_tokens_seen": 298012920, + "step": 13816, + "time_per_iteration": 2.626063108444214 + }, + { + "auxiliary_loss_clip": 0.01025949, + "auxiliary_loss_mlp": 0.01028938, + "balance_loss_clip": 1.02173519, + "balance_loss_mlp": 1.01966929, + "epoch": 0.8307229821133323, + "flos": 24389737770240.0, + "grad_norm": 1.594305931468568, + "language_loss": 0.81283569, + "learning_rate": 2.930379800094371e-07, + "loss": 0.83338463, + "num_input_tokens_seen": 298033310, + "step": 13817, + "time_per_iteration": 2.7613565921783447 + }, + { + "auxiliary_loss_clip": 0.01056324, + "auxiliary_loss_mlp": 0.01032321, + "balance_loss_clip": 1.02789211, + "balance_loss_mlp": 1.02132416, + "epoch": 0.8307831053660003, + "flos": 20996933702400.0, + "grad_norm": 1.6518609229078953, + "language_loss": 0.77984726, + "learning_rate": 2.9283505456710875e-07, + "loss": 0.80073369, + "num_input_tokens_seen": 298053530, + "step": 13818, + "time_per_iteration": 2.615025043487549 + }, + { + "auxiliary_loss_clip": 0.01047498, + "auxiliary_loss_mlp": 0.01032322, + "balance_loss_clip": 1.02763438, + "balance_loss_mlp": 1.02158725, + "epoch": 0.8308432286186682, + "flos": 21397301671680.0, + "grad_norm": 1.9486136177951925, + "language_loss": 0.82163966, + "learning_rate": 2.926321938606453e-07, + "loss": 0.84243786, + "num_input_tokens_seen": 298069305, + "step": 13819, + "time_per_iteration": 2.7529850006103516 + }, + { + "auxiliary_loss_clip": 0.00997195, + "auxiliary_loss_mlp": 0.01001248, + "balance_loss_clip": 1.00186169, + "balance_loss_mlp": 1.00035381, + "epoch": 0.8309033518713362, + "flos": 62533656714240.0, + "grad_norm": 0.7663824141886638, + "language_loss": 0.56277502, + "learning_rate": 2.924293978977399e-07, + "loss": 0.5827595, + "num_input_tokens_seen": 298125830, + "step": 13820, + "time_per_iteration": 3.3085310459136963 + }, + { + "auxiliary_loss_clip": 0.0104808, + "auxiliary_loss_mlp": 0.01022927, + "balance_loss_clip": 1.02310646, + "balance_loss_mlp": 1.01297319, + "epoch": 0.8309634751240043, + "flos": 16979104051200.0, + "grad_norm": 1.985267813663572, + "language_loss": 0.68300915, + "learning_rate": 2.922266666860831e-07, + "loss": 0.70371926, + "num_input_tokens_seen": 298142320, + "step": 13821, + "time_per_iteration": 4.378791332244873 + }, + { + "auxiliary_loss_clip": 0.00989479, + "auxiliary_loss_mlp": 0.01035402, + "balance_loss_clip": 1.0188601, + "balance_loss_mlp": 1.023386, + "epoch": 0.8310235983766722, + "flos": 22674464628480.0, + "grad_norm": 1.726914230310284, + "language_loss": 0.69091243, + "learning_rate": 2.920240002333625e-07, + "loss": 0.71116126, + "num_input_tokens_seen": 298161845, + "step": 13822, + "time_per_iteration": 4.613468170166016 + }, + { + "auxiliary_loss_clip": 0.01013379, + "auxiliary_loss_mlp": 0.01032315, + "balance_loss_clip": 1.02486181, + "balance_loss_mlp": 1.02257562, + "epoch": 0.8310837216293402, + "flos": 30811463176320.0, + "grad_norm": 1.717911548979755, + "language_loss": 0.62246513, + "learning_rate": 2.918213985472631e-07, + "loss": 0.64292204, + "num_input_tokens_seen": 298184165, + "step": 13823, + "time_per_iteration": 2.9634721279144287 + }, + { + "auxiliary_loss_clip": 0.0098778, + "auxiliary_loss_mlp": 0.01001791, + "balance_loss_clip": 1.00218058, + "balance_loss_mlp": 1.00097394, + "epoch": 0.8311438448820081, + "flos": 71276074997760.0, + "grad_norm": 0.862225360766018, + "language_loss": 0.62014925, + "learning_rate": 2.916188616354669e-07, + "loss": 0.64004505, + "num_input_tokens_seen": 298251720, + "step": 13824, + "time_per_iteration": 3.3799140453338623 + }, + { + "auxiliary_loss_clip": 0.01062568, + "auxiliary_loss_mlp": 0.01027167, + "balance_loss_clip": 1.02622974, + "balance_loss_mlp": 1.0171895, + "epoch": 0.8312039681346761, + "flos": 20887082933760.0, + "grad_norm": 1.4976776459509997, + "language_loss": 0.74498975, + "learning_rate": 2.914163895056552e-07, + "loss": 0.76588714, + "num_input_tokens_seen": 298271910, + "step": 13825, + "time_per_iteration": 2.787410259246826 + }, + { + "auxiliary_loss_clip": 0.01015259, + "auxiliary_loss_mlp": 0.00747584, + "balance_loss_clip": 1.02417016, + "balance_loss_mlp": 1.00031102, + "epoch": 0.831264091387344, + "flos": 17017528625280.0, + "grad_norm": 1.9172574899359325, + "language_loss": 0.80016124, + "learning_rate": 2.9121398216550486e-07, + "loss": 0.81778955, + "num_input_tokens_seen": 298288105, + "step": 13826, + "time_per_iteration": 2.9172937870025635 + }, + { + "auxiliary_loss_clip": 0.01060901, + "auxiliary_loss_mlp": 0.01026891, + "balance_loss_clip": 1.02413023, + "balance_loss_mlp": 1.01673436, + "epoch": 0.831324214640012, + "flos": 24419578993920.0, + "grad_norm": 2.6779136416983342, + "language_loss": 0.68048763, + "learning_rate": 2.910116396226914e-07, + "loss": 0.70136553, + "num_input_tokens_seen": 298307600, + "step": 13827, + "time_per_iteration": 2.965047597885132 + }, + { + "auxiliary_loss_clip": 0.01049651, + "auxiliary_loss_mlp": 0.0102487, + "balance_loss_clip": 1.02250862, + "balance_loss_mlp": 1.01560128, + "epoch": 0.83138433789268, + "flos": 13545576938880.0, + "grad_norm": 1.8130818839425247, + "language_loss": 0.73654228, + "learning_rate": 2.9080936188488834e-07, + "loss": 0.7572875, + "num_input_tokens_seen": 298323055, + "step": 13828, + "time_per_iteration": 2.957409143447876 + }, + { + "auxiliary_loss_clip": 0.01022836, + "auxiliary_loss_mlp": 0.010312, + "balance_loss_clip": 1.02093792, + "balance_loss_mlp": 1.02017307, + "epoch": 0.831444461145348, + "flos": 44492386561920.0, + "grad_norm": 1.514341317768582, + "language_loss": 0.6725775, + "learning_rate": 2.906071489597657e-07, + "loss": 0.69311792, + "num_input_tokens_seen": 298346950, + "step": 13829, + "time_per_iteration": 3.056121587753296 + }, + { + "auxiliary_loss_clip": 0.01044031, + "auxiliary_loss_mlp": 0.0102444, + "balance_loss_clip": 1.02514684, + "balance_loss_mlp": 1.01349616, + "epoch": 0.8315045843980159, + "flos": 22705024124160.0, + "grad_norm": 1.5565148032579825, + "language_loss": 0.82557023, + "learning_rate": 2.9040500085499054e-07, + "loss": 0.84625494, + "num_input_tokens_seen": 298366315, + "step": 13830, + "time_per_iteration": 3.0267751216888428 + }, + { + "auxiliary_loss_clip": 0.01051454, + "auxiliary_loss_mlp": 0.01027918, + "balance_loss_clip": 1.02500868, + "balance_loss_mlp": 1.01773715, + "epoch": 0.8315647076506839, + "flos": 16873491087360.0, + "grad_norm": 3.4455473017372835, + "language_loss": 0.74302566, + "learning_rate": 2.9020291757822925e-07, + "loss": 0.76381934, + "num_input_tokens_seen": 298385185, + "step": 13831, + "time_per_iteration": 2.88012433052063 + }, + { + "auxiliary_loss_clip": 0.0106326, + "auxiliary_loss_mlp": 0.01028305, + "balance_loss_clip": 1.02597857, + "balance_loss_mlp": 1.01752269, + "epoch": 0.8316248309033518, + "flos": 13808730954240.0, + "grad_norm": 1.8734646074459875, + "language_loss": 0.71343684, + "learning_rate": 2.9000089913714523e-07, + "loss": 0.73435247, + "num_input_tokens_seen": 298402335, + "step": 13832, + "time_per_iteration": 2.804105520248413 + }, + { + "auxiliary_loss_clip": 0.0103672, + "auxiliary_loss_mlp": 0.01029961, + "balance_loss_clip": 1.0221951, + "balance_loss_mlp": 1.01964903, + "epoch": 0.8316849541560198, + "flos": 23512511819520.0, + "grad_norm": 1.6712220891688352, + "language_loss": 0.84462464, + "learning_rate": 2.897989455393979e-07, + "loss": 0.86529148, + "num_input_tokens_seen": 298423370, + "step": 13833, + "time_per_iteration": 2.9528090953826904 + }, + { + "auxiliary_loss_clip": 0.0104583, + "auxiliary_loss_mlp": 0.01032094, + "balance_loss_clip": 1.02640772, + "balance_loss_mlp": 1.02153802, + "epoch": 0.8317450774086879, + "flos": 23771356202880.0, + "grad_norm": 1.8624592056176388, + "language_loss": 0.7637288, + "learning_rate": 2.8959705679264625e-07, + "loss": 0.78450799, + "num_input_tokens_seen": 298444835, + "step": 13834, + "time_per_iteration": 2.9015650749206543 + }, + { + "auxiliary_loss_clip": 0.01058278, + "auxiliary_loss_mlp": 0.00747501, + "balance_loss_clip": 1.02308524, + "balance_loss_mlp": 1.00034237, + "epoch": 0.8318052006613558, + "flos": 16215535710720.0, + "grad_norm": 1.9478426926154317, + "language_loss": 0.79973, + "learning_rate": 2.893952329045459e-07, + "loss": 0.81778783, + "num_input_tokens_seen": 298461845, + "step": 13835, + "time_per_iteration": 2.6490345001220703 + }, + { + "auxiliary_loss_clip": 0.01055707, + "auxiliary_loss_mlp": 0.01032086, + "balance_loss_clip": 1.02678096, + "balance_loss_mlp": 1.02054024, + "epoch": 0.8318653239140238, + "flos": 19974556892160.0, + "grad_norm": 1.9982718604439489, + "language_loss": 0.80893165, + "learning_rate": 2.8919347388274905e-07, + "loss": 0.82980955, + "num_input_tokens_seen": 298479095, + "step": 13836, + "time_per_iteration": 2.938779592514038 + }, + { + "auxiliary_loss_clip": 0.01041777, + "auxiliary_loss_mlp": 0.01027695, + "balance_loss_clip": 1.02490902, + "balance_loss_mlp": 1.01775861, + "epoch": 0.8319254471666917, + "flos": 17704714694400.0, + "grad_norm": 2.052961320261103, + "language_loss": 0.77590513, + "learning_rate": 2.8899177973490727e-07, + "loss": 0.79659986, + "num_input_tokens_seen": 298494475, + "step": 13837, + "time_per_iteration": 2.911574125289917 + }, + { + "auxiliary_loss_clip": 0.01065807, + "auxiliary_loss_mlp": 0.01028187, + "balance_loss_clip": 1.0256964, + "balance_loss_mlp": 1.01635551, + "epoch": 0.8319855704193597, + "flos": 19536554448000.0, + "grad_norm": 1.6596674734011678, + "language_loss": 0.83231312, + "learning_rate": 2.887901504686685e-07, + "loss": 0.85325307, + "num_input_tokens_seen": 298513185, + "step": 13838, + "time_per_iteration": 2.908322334289551 + }, + { + "auxiliary_loss_clip": 0.01028757, + "auxiliary_loss_mlp": 0.01034856, + "balance_loss_clip": 1.01986384, + "balance_loss_mlp": 1.02345347, + "epoch": 0.8320456936720276, + "flos": 21178067011200.0, + "grad_norm": 1.881572170972399, + "language_loss": 0.74466467, + "learning_rate": 2.885885860916795e-07, + "loss": 0.76530081, + "num_input_tokens_seen": 298531885, + "step": 13839, + "time_per_iteration": 4.768864631652832 + }, + { + "auxiliary_loss_clip": 0.01046382, + "auxiliary_loss_mlp": 0.01027755, + "balance_loss_clip": 1.02346063, + "balance_loss_mlp": 1.01716352, + "epoch": 0.8321058169246957, + "flos": 33250874503680.0, + "grad_norm": 1.8577305987131452, + "language_loss": 0.68327749, + "learning_rate": 2.8838708661158253e-07, + "loss": 0.70401883, + "num_input_tokens_seen": 298554905, + "step": 13840, + "time_per_iteration": 2.8534605503082275 + }, + { + "auxiliary_loss_clip": 0.01007318, + "auxiliary_loss_mlp": 0.01031663, + "balance_loss_clip": 1.02221048, + "balance_loss_mlp": 1.01995087, + "epoch": 0.8321659401773636, + "flos": 14208129256320.0, + "grad_norm": 2.164683091624516, + "language_loss": 0.79030985, + "learning_rate": 2.8818565203601843e-07, + "loss": 0.8106997, + "num_input_tokens_seen": 298571185, + "step": 13841, + "time_per_iteration": 2.7116053104400635 + }, + { + "auxiliary_loss_clip": 0.0102381, + "auxiliary_loss_mlp": 0.01026323, + "balance_loss_clip": 1.02587366, + "balance_loss_mlp": 1.01583219, + "epoch": 0.8322260634300316, + "flos": 15158253859200.0, + "grad_norm": 1.7774381432844644, + "language_loss": 0.68719292, + "learning_rate": 2.879842823726262e-07, + "loss": 0.70769429, + "num_input_tokens_seen": 298588505, + "step": 13842, + "time_per_iteration": 2.766796112060547 + }, + { + "auxiliary_loss_clip": 0.01043469, + "auxiliary_loss_mlp": 0.01026191, + "balance_loss_clip": 1.02569699, + "balance_loss_mlp": 1.01534843, + "epoch": 0.8322861866826995, + "flos": 25300827267840.0, + "grad_norm": 2.0984393199574747, + "language_loss": 0.73305583, + "learning_rate": 2.8778297762904124e-07, + "loss": 0.75375247, + "num_input_tokens_seen": 298609295, + "step": 13843, + "time_per_iteration": 2.816976547241211 + }, + { + "auxiliary_loss_clip": 0.01030996, + "auxiliary_loss_mlp": 0.01024797, + "balance_loss_clip": 1.02512097, + "balance_loss_mlp": 1.01482534, + "epoch": 0.8323463099353675, + "flos": 17019360218880.0, + "grad_norm": 1.7634732735274268, + "language_loss": 0.77599752, + "learning_rate": 2.875817378128975e-07, + "loss": 0.7965554, + "num_input_tokens_seen": 298625765, + "step": 13844, + "time_per_iteration": 2.8890953063964844 + }, + { + "auxiliary_loss_clip": 0.00986741, + "auxiliary_loss_mlp": 0.00999768, + "balance_loss_clip": 1.00184906, + "balance_loss_mlp": 0.99896353, + "epoch": 0.8324064331880354, + "flos": 55607889709440.0, + "grad_norm": 0.7721799753692135, + "language_loss": 0.55295944, + "learning_rate": 2.8738056293182624e-07, + "loss": 0.5728246, + "num_input_tokens_seen": 298683005, + "step": 13845, + "time_per_iteration": 3.1103427410125732 + }, + { + "auxiliary_loss_clip": 0.01054817, + "auxiliary_loss_mlp": 0.01039872, + "balance_loss_clip": 1.02578449, + "balance_loss_mlp": 1.02867842, + "epoch": 0.8324665564407034, + "flos": 26138623063680.0, + "grad_norm": 1.5336655367021534, + "language_loss": 0.75459796, + "learning_rate": 2.871794529934555e-07, + "loss": 0.77554488, + "num_input_tokens_seen": 298703060, + "step": 13846, + "time_per_iteration": 2.70111083984375 + }, + { + "auxiliary_loss_clip": 0.01008527, + "auxiliary_loss_mlp": 0.01027839, + "balance_loss_clip": 1.01993549, + "balance_loss_mlp": 1.01516664, + "epoch": 0.8325266796933715, + "flos": 22049187649920.0, + "grad_norm": 1.7976515847354846, + "language_loss": 0.78873968, + "learning_rate": 2.8697840800541115e-07, + "loss": 0.80910325, + "num_input_tokens_seen": 298721765, + "step": 13847, + "time_per_iteration": 2.8109118938446045 + }, + { + "auxiliary_loss_clip": 0.01014509, + "auxiliary_loss_mlp": 0.01023854, + "balance_loss_clip": 1.02764082, + "balance_loss_mlp": 1.01393569, + "epoch": 0.8325868029460394, + "flos": 22816634659200.0, + "grad_norm": 1.9526041743002012, + "language_loss": 0.74456561, + "learning_rate": 2.867774279753175e-07, + "loss": 0.7649492, + "num_input_tokens_seen": 298740825, + "step": 13848, + "time_per_iteration": 2.9194531440734863 + }, + { + "auxiliary_loss_clip": 0.01052903, + "auxiliary_loss_mlp": 0.01025384, + "balance_loss_clip": 1.02586818, + "balance_loss_mlp": 1.01526332, + "epoch": 0.8326469261987074, + "flos": 14757454926720.0, + "grad_norm": 1.7127504169195262, + "language_loss": 0.63579416, + "learning_rate": 2.8657651291079554e-07, + "loss": 0.65657705, + "num_input_tokens_seen": 298758515, + "step": 13849, + "time_per_iteration": 2.6356780529022217 + }, + { + "auxiliary_loss_clip": 0.01034705, + "auxiliary_loss_mlp": 0.01026986, + "balance_loss_clip": 1.02186191, + "balance_loss_mlp": 1.01622748, + "epoch": 0.8327070494513753, + "flos": 22926126291840.0, + "grad_norm": 2.2325413420571105, + "language_loss": 0.79102242, + "learning_rate": 2.863756628194638e-07, + "loss": 0.81163943, + "num_input_tokens_seen": 298776375, + "step": 13850, + "time_per_iteration": 2.712822437286377 + }, + { + "auxiliary_loss_clip": 0.01023664, + "auxiliary_loss_mlp": 0.01030349, + "balance_loss_clip": 1.02112472, + "balance_loss_mlp": 1.02115774, + "epoch": 0.8327671727040433, + "flos": 20665334321280.0, + "grad_norm": 1.7153047439197369, + "language_loss": 0.78058785, + "learning_rate": 2.8617487770893877e-07, + "loss": 0.80112803, + "num_input_tokens_seen": 298795135, + "step": 13851, + "time_per_iteration": 4.428085088729858 + }, + { + "auxiliary_loss_clip": 0.00997779, + "auxiliary_loss_mlp": 0.0100011, + "balance_loss_clip": 1.00236225, + "balance_loss_mlp": 0.99921554, + "epoch": 0.8328272959567112, + "flos": 56060760384000.0, + "grad_norm": 0.7608624588780055, + "language_loss": 0.5582031, + "learning_rate": 2.859741575868344e-07, + "loss": 0.57818198, + "num_input_tokens_seen": 298855475, + "step": 13852, + "time_per_iteration": 3.3306195735931396 + }, + { + "auxiliary_loss_clip": 0.01051095, + "auxiliary_loss_mlp": 0.01026966, + "balance_loss_clip": 1.02509129, + "balance_loss_mlp": 1.01692271, + "epoch": 0.8328874192093793, + "flos": 32303084284800.0, + "grad_norm": 1.4788248149475856, + "language_loss": 0.67099822, + "learning_rate": 2.8577350246076125e-07, + "loss": 0.69177878, + "num_input_tokens_seen": 298875875, + "step": 13853, + "time_per_iteration": 2.816209316253662 + }, + { + "auxiliary_loss_clip": 0.01039988, + "auxiliary_loss_mlp": 0.0102925, + "balance_loss_clip": 1.02512622, + "balance_loss_mlp": 1.0186348, + "epoch": 0.8329475424620472, + "flos": 23512691387520.0, + "grad_norm": 1.4352490899905335, + "language_loss": 0.78160781, + "learning_rate": 2.855729123383286e-07, + "loss": 0.80230021, + "num_input_tokens_seen": 298895950, + "step": 13854, + "time_per_iteration": 2.982044219970703 + }, + { + "auxiliary_loss_clip": 0.01006311, + "auxiliary_loss_mlp": 0.01000793, + "balance_loss_clip": 1.00123835, + "balance_loss_mlp": 0.99995309, + "epoch": 0.8330076657147152, + "flos": 67840680378240.0, + "grad_norm": 0.7605447658503626, + "language_loss": 0.58682096, + "learning_rate": 2.8537238722714295e-07, + "loss": 0.60689199, + "num_input_tokens_seen": 298955770, + "step": 13855, + "time_per_iteration": 3.2337024211883545 + }, + { + "auxiliary_loss_clip": 0.01050209, + "auxiliary_loss_mlp": 0.01024681, + "balance_loss_clip": 1.02447438, + "balance_loss_mlp": 1.01456618, + "epoch": 0.8330677889673831, + "flos": 22892801448960.0, + "grad_norm": 1.6807716549032936, + "language_loss": 0.71858883, + "learning_rate": 2.8517192713480853e-07, + "loss": 0.7393378, + "num_input_tokens_seen": 298976545, + "step": 13856, + "time_per_iteration": 2.860360622406006 + }, + { + "auxiliary_loss_clip": 0.01051362, + "auxiliary_loss_mlp": 0.01024238, + "balance_loss_clip": 1.02457845, + "balance_loss_mlp": 1.01424277, + "epoch": 0.8331279122200511, + "flos": 27345042184320.0, + "grad_norm": 1.5945537152948788, + "language_loss": 0.75468475, + "learning_rate": 2.8497153206892677e-07, + "loss": 0.77544081, + "num_input_tokens_seen": 298996750, + "step": 13857, + "time_per_iteration": 2.81001615524292 + }, + { + "auxiliary_loss_clip": 0.01017505, + "auxiliary_loss_mlp": 0.01022439, + "balance_loss_clip": 1.02384663, + "balance_loss_mlp": 1.01376045, + "epoch": 0.833188035472719, + "flos": 19938179393280.0, + "grad_norm": 2.4391768598290082, + "language_loss": 0.73305279, + "learning_rate": 2.847712020370958e-07, + "loss": 0.75345218, + "num_input_tokens_seen": 299014895, + "step": 13858, + "time_per_iteration": 2.7753520011901855 + }, + { + "auxiliary_loss_clip": 0.01064609, + "auxiliary_loss_mlp": 0.01031031, + "balance_loss_clip": 1.02487481, + "balance_loss_mlp": 1.02031469, + "epoch": 0.833248158725387, + "flos": 15232624968960.0, + "grad_norm": 2.2121933976959802, + "language_loss": 0.73480082, + "learning_rate": 2.8457093704691316e-07, + "loss": 0.75575721, + "num_input_tokens_seen": 299032855, + "step": 13859, + "time_per_iteration": 2.7131996154785156 + }, + { + "auxiliary_loss_clip": 0.01046726, + "auxiliary_loss_mlp": 0.01022656, + "balance_loss_clip": 1.02292693, + "balance_loss_mlp": 1.0133518, + "epoch": 0.8333082819780551, + "flos": 24535535074560.0, + "grad_norm": 1.7003314339735198, + "language_loss": 0.78888404, + "learning_rate": 2.8437073710597205e-07, + "loss": 0.80957782, + "num_input_tokens_seen": 299052055, + "step": 13860, + "time_per_iteration": 2.724210023880005 + }, + { + "auxiliary_loss_clip": 0.01003454, + "auxiliary_loss_mlp": 0.01024591, + "balance_loss_clip": 1.02685857, + "balance_loss_mlp": 1.01462531, + "epoch": 0.833368405230723, + "flos": 31467407391360.0, + "grad_norm": 1.3544981150891926, + "language_loss": 0.82126433, + "learning_rate": 2.841706022218644e-07, + "loss": 0.84154475, + "num_input_tokens_seen": 299075285, + "step": 13861, + "time_per_iteration": 3.1018567085266113 + }, + { + "auxiliary_loss_clip": 0.01063757, + "auxiliary_loss_mlp": 0.01031316, + "balance_loss_clip": 1.02690434, + "balance_loss_mlp": 1.02086186, + "epoch": 0.833428528483391, + "flos": 14902713527040.0, + "grad_norm": 1.9594654005667695, + "language_loss": 0.790712, + "learning_rate": 2.839705324021806e-07, + "loss": 0.81166273, + "num_input_tokens_seen": 299092520, + "step": 13862, + "time_per_iteration": 2.903358221054077 + }, + { + "auxiliary_loss_clip": 0.01051409, + "auxiliary_loss_mlp": 0.01030911, + "balance_loss_clip": 1.02386892, + "balance_loss_mlp": 1.02056968, + "epoch": 0.8334886517360589, + "flos": 22199833290240.0, + "grad_norm": 3.3950197078082494, + "language_loss": 0.75189126, + "learning_rate": 2.83770527654505e-07, + "loss": 0.7727145, + "num_input_tokens_seen": 299109450, + "step": 13863, + "time_per_iteration": 2.955256462097168 + }, + { + "auxiliary_loss_clip": 0.01001324, + "auxiliary_loss_mlp": 0.00747577, + "balance_loss_clip": 1.01979733, + "balance_loss_mlp": 1.00042582, + "epoch": 0.8335487749887269, + "flos": 30372562892160.0, + "grad_norm": 2.3796151217033774, + "language_loss": 0.75092208, + "learning_rate": 2.835705879864232e-07, + "loss": 0.76841104, + "num_input_tokens_seen": 299129540, + "step": 13864, + "time_per_iteration": 2.91709566116333 + }, + { + "auxiliary_loss_clip": 0.01040946, + "auxiliary_loss_mlp": 0.01031908, + "balance_loss_clip": 1.02440953, + "balance_loss_mlp": 1.02139401, + "epoch": 0.8336088982413948, + "flos": 24681152810880.0, + "grad_norm": 1.7278362964310405, + "language_loss": 0.69530022, + "learning_rate": 2.833707134055168e-07, + "loss": 0.71602875, + "num_input_tokens_seen": 299148670, + "step": 13865, + "time_per_iteration": 2.6462182998657227 + }, + { + "auxiliary_loss_clip": 0.01053017, + "auxiliary_loss_mlp": 0.01030164, + "balance_loss_clip": 1.02556241, + "balance_loss_mlp": 1.01994801, + "epoch": 0.8336690214940629, + "flos": 38177207873280.0, + "grad_norm": 1.884081253883522, + "language_loss": 0.75581521, + "learning_rate": 2.831709039193653e-07, + "loss": 0.77664703, + "num_input_tokens_seen": 299169330, + "step": 13866, + "time_per_iteration": 2.748507499694824 + }, + { + "auxiliary_loss_clip": 0.00989048, + "auxiliary_loss_mlp": 0.01001347, + "balance_loss_clip": 1.00324488, + "balance_loss_mlp": 1.00013065, + "epoch": 0.8337291447467308, + "flos": 55565119589760.0, + "grad_norm": 0.8668726617099136, + "language_loss": 0.63087189, + "learning_rate": 2.8297115953554465e-07, + "loss": 0.65077585, + "num_input_tokens_seen": 299220980, + "step": 13867, + "time_per_iteration": 3.1051840782165527 + }, + { + "auxiliary_loss_clip": 0.01039164, + "auxiliary_loss_mlp": 0.01029698, + "balance_loss_clip": 1.02512789, + "balance_loss_mlp": 1.02053642, + "epoch": 0.8337892679993988, + "flos": 24133550993280.0, + "grad_norm": 1.5978847741903979, + "language_loss": 0.72050697, + "learning_rate": 2.827714802616301e-07, + "loss": 0.74119562, + "num_input_tokens_seen": 299240130, + "step": 13868, + "time_per_iteration": 4.263440370559692 + }, + { + "auxiliary_loss_clip": 0.01044495, + "auxiliary_loss_mlp": 0.01028237, + "balance_loss_clip": 1.02725136, + "balance_loss_mlp": 1.01797938, + "epoch": 0.8338493912520667, + "flos": 28183915388160.0, + "grad_norm": 1.4297518940839313, + "language_loss": 0.8043865, + "learning_rate": 2.8257186610519325e-07, + "loss": 0.82511389, + "num_input_tokens_seen": 299260705, + "step": 13869, + "time_per_iteration": 4.328808546066284 + }, + { + "auxiliary_loss_clip": 0.01055312, + "auxiliary_loss_mlp": 0.01031006, + "balance_loss_clip": 1.02688122, + "balance_loss_mlp": 1.02052772, + "epoch": 0.8339095145047347, + "flos": 22158356060160.0, + "grad_norm": 1.5645688551892774, + "language_loss": 0.82352924, + "learning_rate": 2.823723170738028e-07, + "loss": 0.84439242, + "num_input_tokens_seen": 299278925, + "step": 13870, + "time_per_iteration": 2.6757473945617676 + }, + { + "auxiliary_loss_clip": 0.01035018, + "auxiliary_loss_mlp": 0.01027509, + "balance_loss_clip": 1.02491176, + "balance_loss_mlp": 1.01641071, + "epoch": 0.8339696377574026, + "flos": 17307112072320.0, + "grad_norm": 2.9910933527786905, + "language_loss": 0.70518291, + "learning_rate": 2.821728331750264e-07, + "loss": 0.72580814, + "num_input_tokens_seen": 299291580, + "step": 13871, + "time_per_iteration": 2.718770742416382 + }, + { + "auxiliary_loss_clip": 0.01051719, + "auxiliary_loss_mlp": 0.0103124, + "balance_loss_clip": 1.02585399, + "balance_loss_mlp": 1.02105331, + "epoch": 0.8340297610100706, + "flos": 20668351063680.0, + "grad_norm": 1.8298310860744873, + "language_loss": 0.68820459, + "learning_rate": 2.8197341441642853e-07, + "loss": 0.70903409, + "num_input_tokens_seen": 299310385, + "step": 13872, + "time_per_iteration": 2.754190683364868 + }, + { + "auxiliary_loss_clip": 0.01040278, + "auxiliary_loss_mlp": 0.0102132, + "balance_loss_clip": 1.02402115, + "balance_loss_mlp": 1.01165259, + "epoch": 0.8340898842627387, + "flos": 20515442866560.0, + "grad_norm": 1.7496668637626642, + "language_loss": 0.73233479, + "learning_rate": 2.817740608055712e-07, + "loss": 0.75295079, + "num_input_tokens_seen": 299327660, + "step": 13873, + "time_per_iteration": 2.7099363803863525 + }, + { + "auxiliary_loss_clip": 0.010423, + "auxiliary_loss_mlp": 0.01033572, + "balance_loss_clip": 1.0238502, + "balance_loss_mlp": 1.02133548, + "epoch": 0.8341500075154066, + "flos": 21425850005760.0, + "grad_norm": 2.9775382108804878, + "language_loss": 0.75726628, + "learning_rate": 2.81574772350013e-07, + "loss": 0.77802503, + "num_input_tokens_seen": 299343685, + "step": 13874, + "time_per_iteration": 2.761125087738037 + }, + { + "auxiliary_loss_clip": 0.01031945, + "auxiliary_loss_mlp": 0.0102308, + "balance_loss_clip": 1.0224607, + "balance_loss_mlp": 1.01275063, + "epoch": 0.8342101307680746, + "flos": 22090988102400.0, + "grad_norm": 1.687834921533048, + "language_loss": 0.66011453, + "learning_rate": 2.813755490573118e-07, + "loss": 0.68066478, + "num_input_tokens_seen": 299363305, + "step": 13875, + "time_per_iteration": 2.718590497970581 + }, + { + "auxiliary_loss_clip": 0.01015282, + "auxiliary_loss_mlp": 0.01033549, + "balance_loss_clip": 1.02254748, + "balance_loss_mlp": 1.0231781, + "epoch": 0.8342702540207425, + "flos": 21871466133120.0, + "grad_norm": 1.726449234750805, + "language_loss": 0.79453999, + "learning_rate": 2.8117639093502243e-07, + "loss": 0.81502831, + "num_input_tokens_seen": 299382630, + "step": 13876, + "time_per_iteration": 2.7996299266815186 + }, + { + "auxiliary_loss_clip": 0.0104458, + "auxiliary_loss_mlp": 0.01031119, + "balance_loss_clip": 1.0228976, + "balance_loss_mlp": 1.01976478, + "epoch": 0.8343303772734105, + "flos": 22528487756160.0, + "grad_norm": 1.81406957178159, + "language_loss": 0.87156665, + "learning_rate": 2.8097729799069615e-07, + "loss": 0.89232361, + "num_input_tokens_seen": 299402385, + "step": 13877, + "time_per_iteration": 2.7564406394958496 + }, + { + "auxiliary_loss_clip": 0.01013961, + "auxiliary_loss_mlp": 0.0102605, + "balance_loss_clip": 1.02151418, + "balance_loss_mlp": 1.01613164, + "epoch": 0.8343905005260784, + "flos": 14939773384320.0, + "grad_norm": 1.7045088189461586, + "language_loss": 0.69353056, + "learning_rate": 2.807782702318828e-07, + "loss": 0.71393073, + "num_input_tokens_seen": 299419820, + "step": 13878, + "time_per_iteration": 2.7926790714263916 + }, + { + "auxiliary_loss_clip": 0.01040864, + "auxiliary_loss_mlp": 0.01026078, + "balance_loss_clip": 1.02482593, + "balance_loss_mlp": 1.01627874, + "epoch": 0.8344506237787465, + "flos": 15012456554880.0, + "grad_norm": 1.756196064099369, + "language_loss": 0.79047203, + "learning_rate": 2.805793076661309e-07, + "loss": 0.81114149, + "num_input_tokens_seen": 299436265, + "step": 13879, + "time_per_iteration": 2.6732161045074463 + }, + { + "auxiliary_loss_clip": 0.01023871, + "auxiliary_loss_mlp": 0.01028247, + "balance_loss_clip": 1.02721047, + "balance_loss_mlp": 1.01891303, + "epoch": 0.8345107470314144, + "flos": 17560389847680.0, + "grad_norm": 1.9619859064161793, + "language_loss": 0.83242911, + "learning_rate": 2.803804103009828e-07, + "loss": 0.85295033, + "num_input_tokens_seen": 299451660, + "step": 13880, + "time_per_iteration": 2.79229474067688 + }, + { + "auxiliary_loss_clip": 0.01042713, + "auxiliary_loss_mlp": 0.01030428, + "balance_loss_clip": 1.02444792, + "balance_loss_mlp": 1.02031922, + "epoch": 0.8345708702840824, + "flos": 25187277398400.0, + "grad_norm": 1.4866174133160632, + "language_loss": 0.78342736, + "learning_rate": 2.80181578143982e-07, + "loss": 0.80415875, + "num_input_tokens_seen": 299472070, + "step": 13881, + "time_per_iteration": 2.71659255027771 + }, + { + "auxiliary_loss_clip": 0.01026787, + "auxiliary_loss_mlp": 0.0102373, + "balance_loss_clip": 1.02517617, + "balance_loss_mlp": 1.01440239, + "epoch": 0.8346309935367503, + "flos": 15083559527040.0, + "grad_norm": 2.7421669108340825, + "language_loss": 0.78107172, + "learning_rate": 2.7998281120266807e-07, + "loss": 0.80157691, + "num_input_tokens_seen": 299486725, + "step": 13882, + "time_per_iteration": 2.6496598720550537 + }, + { + "auxiliary_loss_clip": 0.01021856, + "auxiliary_loss_mlp": 0.0104127, + "balance_loss_clip": 1.02166331, + "balance_loss_mlp": 1.02977204, + "epoch": 0.8346911167894183, + "flos": 22930615491840.0, + "grad_norm": 1.877903782517666, + "language_loss": 0.80070931, + "learning_rate": 2.79784109484579e-07, + "loss": 0.82134056, + "num_input_tokens_seen": 299505435, + "step": 13883, + "time_per_iteration": 2.717749834060669 + }, + { + "auxiliary_loss_clip": 0.01052901, + "auxiliary_loss_mlp": 0.01031653, + "balance_loss_clip": 1.02426088, + "balance_loss_mlp": 1.02063215, + "epoch": 0.8347512400420862, + "flos": 20193037367040.0, + "grad_norm": 13.485643687053612, + "language_loss": 0.74362636, + "learning_rate": 2.795854729972482e-07, + "loss": 0.76447189, + "num_input_tokens_seen": 299523555, + "step": 13884, + "time_per_iteration": 2.7476072311401367 + }, + { + "auxiliary_loss_clip": 0.0104916, + "auxiliary_loss_mlp": 0.01036678, + "balance_loss_clip": 1.02751279, + "balance_loss_mlp": 1.02467334, + "epoch": 0.8348113632947542, + "flos": 25954832148480.0, + "grad_norm": 2.0601793894810254, + "language_loss": 0.7027868, + "learning_rate": 2.7938690174820913e-07, + "loss": 0.72364521, + "num_input_tokens_seen": 299541660, + "step": 13885, + "time_per_iteration": 2.719287395477295 + }, + { + "auxiliary_loss_clip": 0.01032683, + "auxiliary_loss_mlp": 0.0102774, + "balance_loss_clip": 1.02450347, + "balance_loss_mlp": 1.017452, + "epoch": 0.8348714865474223, + "flos": 34204554552960.0, + "grad_norm": 1.6267244004003982, + "language_loss": 0.69840276, + "learning_rate": 2.791883957449912e-07, + "loss": 0.71900702, + "num_input_tokens_seen": 299562465, + "step": 13886, + "time_per_iteration": 4.52210807800293 + }, + { + "auxiliary_loss_clip": 0.01024978, + "auxiliary_loss_mlp": 0.0102845, + "balance_loss_clip": 1.02238822, + "balance_loss_mlp": 1.01676202, + "epoch": 0.8349316098000902, + "flos": 24390132819840.0, + "grad_norm": 1.4810593800827876, + "language_loss": 0.78982174, + "learning_rate": 2.7898995499512134e-07, + "loss": 0.81035602, + "num_input_tokens_seen": 299582700, + "step": 13887, + "time_per_iteration": 2.8318965435028076 + }, + { + "auxiliary_loss_clip": 0.01047199, + "auxiliary_loss_mlp": 0.00747682, + "balance_loss_clip": 1.0268085, + "balance_loss_mlp": 1.00044394, + "epoch": 0.8349917330527582, + "flos": 23032744836480.0, + "grad_norm": 2.069873664137941, + "language_loss": 0.64171934, + "learning_rate": 2.7879157950612467e-07, + "loss": 0.65966809, + "num_input_tokens_seen": 299600310, + "step": 13888, + "time_per_iteration": 2.766899347305298 + }, + { + "auxiliary_loss_clip": 0.01043343, + "auxiliary_loss_mlp": 0.01026106, + "balance_loss_clip": 1.02443552, + "balance_loss_mlp": 1.01597309, + "epoch": 0.8350518563054261, + "flos": 13625873792640.0, + "grad_norm": 2.2610240024766513, + "language_loss": 0.66457891, + "learning_rate": 2.785932692855244e-07, + "loss": 0.68527335, + "num_input_tokens_seen": 299617025, + "step": 13889, + "time_per_iteration": 2.65335750579834 + }, + { + "auxiliary_loss_clip": 0.01040866, + "auxiliary_loss_mlp": 0.01026963, + "balance_loss_clip": 1.02093565, + "balance_loss_mlp": 1.01720011, + "epoch": 0.8351119795580941, + "flos": 21579799697280.0, + "grad_norm": 2.7406817510894745, + "language_loss": 0.68696201, + "learning_rate": 2.783950243408399e-07, + "loss": 0.70764023, + "num_input_tokens_seen": 299633050, + "step": 13890, + "time_per_iteration": 2.611694574356079 + }, + { + "auxiliary_loss_clip": 0.01042971, + "auxiliary_loss_mlp": 0.01032523, + "balance_loss_clip": 1.02499962, + "balance_loss_mlp": 1.02190101, + "epoch": 0.835172102810762, + "flos": 20038297576320.0, + "grad_norm": 2.4213264072678027, + "language_loss": 0.59204805, + "learning_rate": 2.7819684467958817e-07, + "loss": 0.61280298, + "num_input_tokens_seen": 299646445, + "step": 13891, + "time_per_iteration": 2.6342616081237793 + }, + { + "auxiliary_loss_clip": 0.01053217, + "auxiliary_loss_mlp": 0.01027197, + "balance_loss_clip": 1.02553749, + "balance_loss_mlp": 1.01724267, + "epoch": 0.8352322260634301, + "flos": 25111577485440.0, + "grad_norm": 2.2698846991711457, + "language_loss": 0.7163316, + "learning_rate": 2.779987303092846e-07, + "loss": 0.73713577, + "num_input_tokens_seen": 299662665, + "step": 13892, + "time_per_iteration": 2.6534922122955322 + }, + { + "auxiliary_loss_clip": 0.01059852, + "auxiliary_loss_mlp": 0.01027384, + "balance_loss_clip": 1.02435017, + "balance_loss_mlp": 1.01742387, + "epoch": 0.835292349316098, + "flos": 24863758577280.0, + "grad_norm": 1.5342859705009477, + "language_loss": 0.6596095, + "learning_rate": 2.7780068123744207e-07, + "loss": 0.68048179, + "num_input_tokens_seen": 299683585, + "step": 13893, + "time_per_iteration": 2.587766647338867 + }, + { + "auxiliary_loss_clip": 0.01030867, + "auxiliary_loss_mlp": 0.0102337, + "balance_loss_clip": 1.02205181, + "balance_loss_mlp": 1.01328444, + "epoch": 0.835352472568766, + "flos": 19865568049920.0, + "grad_norm": 1.9434298004983863, + "language_loss": 0.78236645, + "learning_rate": 2.7760269747156996e-07, + "loss": 0.80290878, + "num_input_tokens_seen": 299702680, + "step": 13894, + "time_per_iteration": 2.617143154144287 + }, + { + "auxiliary_loss_clip": 0.01042051, + "auxiliary_loss_mlp": 0.01025988, + "balance_loss_clip": 1.02318847, + "balance_loss_mlp": 1.01569998, + "epoch": 0.8354125958214339, + "flos": 22054754257920.0, + "grad_norm": 1.6804845496384877, + "language_loss": 0.72583723, + "learning_rate": 2.7740477901917625e-07, + "loss": 0.74651766, + "num_input_tokens_seen": 299721050, + "step": 13895, + "time_per_iteration": 2.725360155105591 + }, + { + "auxiliary_loss_clip": 0.01054383, + "auxiliary_loss_mlp": 0.01035796, + "balance_loss_clip": 1.02657807, + "balance_loss_mlp": 1.02432799, + "epoch": 0.8354727190741019, + "flos": 21397804462080.0, + "grad_norm": 2.2013337907118062, + "language_loss": 0.7170732, + "learning_rate": 2.772069258877667e-07, + "loss": 0.73797506, + "num_input_tokens_seen": 299738255, + "step": 13896, + "time_per_iteration": 2.620807409286499 + }, + { + "auxiliary_loss_clip": 0.01049226, + "auxiliary_loss_mlp": 0.0102547, + "balance_loss_clip": 1.02395964, + "balance_loss_mlp": 1.01542091, + "epoch": 0.8355328423267698, + "flos": 50840997834240.0, + "grad_norm": 2.157774371097841, + "language_loss": 0.59115052, + "learning_rate": 2.770091380848423e-07, + "loss": 0.61189747, + "num_input_tokens_seen": 299761315, + "step": 13897, + "time_per_iteration": 2.808810234069824 + }, + { + "auxiliary_loss_clip": 0.01006594, + "auxiliary_loss_mlp": 0.00746613, + "balance_loss_clip": 1.00162792, + "balance_loss_mlp": 1.00058162, + "epoch": 0.8355929655794379, + "flos": 65551052764800.0, + "grad_norm": 0.6995594284990684, + "language_loss": 0.57671475, + "learning_rate": 2.7681141561790423e-07, + "loss": 0.59424686, + "num_input_tokens_seen": 299828735, + "step": 13898, + "time_per_iteration": 3.226376533508301 + }, + { + "auxiliary_loss_clip": 0.01045931, + "auxiliary_loss_mlp": 0.01030588, + "balance_loss_clip": 1.02556872, + "balance_loss_mlp": 1.01963902, + "epoch": 0.8356530888321058, + "flos": 19170516902400.0, + "grad_norm": 1.8097954491496642, + "language_loss": 0.80286491, + "learning_rate": 2.7661375849444967e-07, + "loss": 0.82363009, + "num_input_tokens_seen": 299848395, + "step": 13899, + "time_per_iteration": 4.334767580032349 + }, + { + "auxiliary_loss_clip": 0.01062679, + "auxiliary_loss_mlp": 0.01029989, + "balance_loss_clip": 1.0252291, + "balance_loss_mlp": 1.02026737, + "epoch": 0.8357132120847738, + "flos": 44126672238720.0, + "grad_norm": 1.7578190405648082, + "language_loss": 0.69086921, + "learning_rate": 2.764161667219749e-07, + "loss": 0.71179593, + "num_input_tokens_seen": 299871665, + "step": 13900, + "time_per_iteration": 2.777292251586914 + }, + { + "auxiliary_loss_clip": 0.01043797, + "auxiliary_loss_mlp": 0.01030627, + "balance_loss_clip": 1.02643085, + "balance_loss_mlp": 1.02063179, + "epoch": 0.8357733353374418, + "flos": 24389701856640.0, + "grad_norm": 1.3784329325311786, + "language_loss": 0.71269375, + "learning_rate": 2.762186403079716e-07, + "loss": 0.73343801, + "num_input_tokens_seen": 299891960, + "step": 13901, + "time_per_iteration": 2.6533467769622803 + }, + { + "auxiliary_loss_clip": 0.01019579, + "auxiliary_loss_mlp": 0.01036288, + "balance_loss_clip": 1.0230149, + "balance_loss_mlp": 1.02526152, + "epoch": 0.8358334585901097, + "flos": 20916313626240.0, + "grad_norm": 2.123417248364483, + "language_loss": 0.80173254, + "learning_rate": 2.7602117925992963e-07, + "loss": 0.82229114, + "num_input_tokens_seen": 299905070, + "step": 13902, + "time_per_iteration": 2.7104735374450684 + }, + { + "auxiliary_loss_clip": 0.01049987, + "auxiliary_loss_mlp": 0.01028785, + "balance_loss_clip": 1.0247519, + "balance_loss_mlp": 1.01879549, + "epoch": 0.8358935818427777, + "flos": 19244169740160.0, + "grad_norm": 1.5534717366035897, + "language_loss": 0.62709522, + "learning_rate": 2.758237835853379e-07, + "loss": 0.64788294, + "num_input_tokens_seen": 299925130, + "step": 13903, + "time_per_iteration": 2.6541576385498047 + }, + { + "auxiliary_loss_clip": 0.01035907, + "auxiliary_loss_mlp": 0.01035725, + "balance_loss_clip": 1.0230844, + "balance_loss_mlp": 1.02488852, + "epoch": 0.8359537050954456, + "flos": 24134053783680.0, + "grad_norm": 1.8799202944234086, + "language_loss": 0.74040318, + "learning_rate": 2.7562645329168054e-07, + "loss": 0.76111954, + "num_input_tokens_seen": 299943845, + "step": 13904, + "time_per_iteration": 2.66214656829834 + }, + { + "auxiliary_loss_clip": 0.01034647, + "auxiliary_loss_mlp": 0.01028784, + "balance_loss_clip": 1.02246737, + "balance_loss_mlp": 1.0182991, + "epoch": 0.8360138283481137, + "flos": 16180415187840.0, + "grad_norm": 1.7180308498377557, + "language_loss": 0.72741807, + "learning_rate": 2.7542918838644104e-07, + "loss": 0.74805236, + "num_input_tokens_seen": 299961620, + "step": 13905, + "time_per_iteration": 2.6549019813537598 + }, + { + "auxiliary_loss_clip": 0.01047775, + "auxiliary_loss_mlp": 0.01034477, + "balance_loss_clip": 1.02409935, + "balance_loss_mlp": 1.02544713, + "epoch": 0.8360739516007816, + "flos": 22198899536640.0, + "grad_norm": 2.3766152447698743, + "language_loss": 0.66387892, + "learning_rate": 2.752319888771e-07, + "loss": 0.68470144, + "num_input_tokens_seen": 299982170, + "step": 13906, + "time_per_iteration": 2.7038981914520264 + }, + { + "auxiliary_loss_clip": 0.01050121, + "auxiliary_loss_mlp": 0.01025493, + "balance_loss_clip": 1.02387512, + "balance_loss_mlp": 1.01530051, + "epoch": 0.8361340748534496, + "flos": 20923137210240.0, + "grad_norm": 1.4017606094437036, + "language_loss": 0.74110985, + "learning_rate": 2.7503485477113475e-07, + "loss": 0.76186597, + "num_input_tokens_seen": 300001330, + "step": 13907, + "time_per_iteration": 2.6525826454162598 + }, + { + "auxiliary_loss_clip": 0.01033034, + "auxiliary_loss_mlp": 0.01029061, + "balance_loss_clip": 1.02444696, + "balance_loss_mlp": 1.01818299, + "epoch": 0.8361941981061175, + "flos": 26173599932160.0, + "grad_norm": 2.3704602484360806, + "language_loss": 0.75204039, + "learning_rate": 2.7483778607602005e-07, + "loss": 0.77266127, + "num_input_tokens_seen": 300020645, + "step": 13908, + "time_per_iteration": 2.7220146656036377 + }, + { + "auxiliary_loss_clip": 0.01053507, + "auxiliary_loss_mlp": 0.01029423, + "balance_loss_clip": 1.02520704, + "balance_loss_mlp": 1.01800847, + "epoch": 0.8362543213587855, + "flos": 24419363512320.0, + "grad_norm": 2.0580916090066896, + "language_loss": 0.71522504, + "learning_rate": 2.7464078279922964e-07, + "loss": 0.73605442, + "num_input_tokens_seen": 300039945, + "step": 13909, + "time_per_iteration": 2.6293983459472656 + }, + { + "auxiliary_loss_clip": 0.01063624, + "auxiliary_loss_mlp": 0.00747703, + "balance_loss_clip": 1.02498889, + "balance_loss_mlp": 1.00041986, + "epoch": 0.8363144446114534, + "flos": 17202396948480.0, + "grad_norm": 2.631579758051844, + "language_loss": 0.73164463, + "learning_rate": 2.744438449482338e-07, + "loss": 0.74975783, + "num_input_tokens_seen": 300058260, + "step": 13910, + "time_per_iteration": 2.569051504135132 + }, + { + "auxiliary_loss_clip": 0.01046249, + "auxiliary_loss_mlp": 0.00747753, + "balance_loss_clip": 1.02416575, + "balance_loss_mlp": 1.000422, + "epoch": 0.8363745678641215, + "flos": 19279398003840.0, + "grad_norm": 1.79465666704753, + "language_loss": 0.73288143, + "learning_rate": 2.742469725305001e-07, + "loss": 0.75082147, + "num_input_tokens_seen": 300076720, + "step": 13911, + "time_per_iteration": 2.610595941543579 + }, + { + "auxiliary_loss_clip": 0.0104482, + "auxiliary_loss_mlp": 0.0103358, + "balance_loss_clip": 1.02567458, + "balance_loss_mlp": 1.02305388, + "epoch": 0.8364346911167894, + "flos": 11874869596800.0, + "grad_norm": 2.174106118928338, + "language_loss": 0.78981674, + "learning_rate": 2.740501655534946e-07, + "loss": 0.81060076, + "num_input_tokens_seen": 300092950, + "step": 13912, + "time_per_iteration": 2.6262729167938232 + }, + { + "auxiliary_loss_clip": 0.01053646, + "auxiliary_loss_mlp": 0.01029144, + "balance_loss_clip": 1.02587652, + "balance_loss_mlp": 1.01945806, + "epoch": 0.8364948143694574, + "flos": 20225212974720.0, + "grad_norm": 1.7214887573189475, + "language_loss": 0.78670895, + "learning_rate": 2.738534240246797e-07, + "loss": 0.80753684, + "num_input_tokens_seen": 300110950, + "step": 13913, + "time_per_iteration": 2.6411609649658203 + }, + { + "auxiliary_loss_clip": 0.01049967, + "auxiliary_loss_mlp": 0.010282, + "balance_loss_clip": 1.02327836, + "balance_loss_mlp": 1.01741171, + "epoch": 0.8365549376221254, + "flos": 21612909058560.0, + "grad_norm": 1.9466998671389688, + "language_loss": 0.73380625, + "learning_rate": 2.736567479515153e-07, + "loss": 0.75458789, + "num_input_tokens_seen": 300128705, + "step": 13914, + "time_per_iteration": 2.625349521636963 + }, + { + "auxiliary_loss_clip": 0.01025345, + "auxiliary_loss_mlp": 0.01032927, + "balance_loss_clip": 1.02707529, + "balance_loss_mlp": 1.0221858, + "epoch": 0.8366150608747933, + "flos": 23294210912640.0, + "grad_norm": 2.407362165984367, + "language_loss": 0.71217734, + "learning_rate": 2.7346013734146025e-07, + "loss": 0.73276007, + "num_input_tokens_seen": 300148635, + "step": 13915, + "time_per_iteration": 4.374583959579468 + }, + { + "auxiliary_loss_clip": 0.01032715, + "auxiliary_loss_mlp": 0.01027784, + "balance_loss_clip": 1.02437067, + "balance_loss_mlp": 1.01762724, + "epoch": 0.8366751841274613, + "flos": 15267673664640.0, + "grad_norm": 1.8085885941506297, + "language_loss": 0.72198033, + "learning_rate": 2.7326359220197035e-07, + "loss": 0.7425853, + "num_input_tokens_seen": 300165490, + "step": 13916, + "time_per_iteration": 4.2734785079956055 + }, + { + "auxiliary_loss_clip": 0.01034105, + "auxiliary_loss_mlp": 0.00747683, + "balance_loss_clip": 1.02592945, + "balance_loss_mlp": 1.00039864, + "epoch": 0.8367353073801292, + "flos": 13224931205760.0, + "grad_norm": 1.9005070500698291, + "language_loss": 0.75099397, + "learning_rate": 2.7306711254049755e-07, + "loss": 0.76881182, + "num_input_tokens_seen": 300182130, + "step": 13917, + "time_per_iteration": 2.7381505966186523 + }, + { + "auxiliary_loss_clip": 0.01058698, + "auxiliary_loss_mlp": 0.0102811, + "balance_loss_clip": 1.02491558, + "balance_loss_mlp": 1.01835871, + "epoch": 0.8367954306327973, + "flos": 24205084928640.0, + "grad_norm": 11.59805485901157, + "language_loss": 0.79288054, + "learning_rate": 2.728706983644933e-07, + "loss": 0.8137486, + "num_input_tokens_seen": 300203050, + "step": 13918, + "time_per_iteration": 2.5512566566467285 + }, + { + "auxiliary_loss_clip": 0.01023864, + "auxiliary_loss_mlp": 0.01028022, + "balance_loss_clip": 1.02644968, + "balance_loss_mlp": 1.01804984, + "epoch": 0.8368555538854652, + "flos": 24534744975360.0, + "grad_norm": 1.657884817267501, + "language_loss": 0.68108451, + "learning_rate": 2.7267434968140457e-07, + "loss": 0.70160335, + "num_input_tokens_seen": 300224380, + "step": 13919, + "time_per_iteration": 2.7682459354400635 + }, + { + "auxiliary_loss_clip": 0.0104411, + "auxiliary_loss_mlp": 0.01034485, + "balance_loss_clip": 1.02184987, + "balance_loss_mlp": 1.02307665, + "epoch": 0.8369156771381332, + "flos": 20259363830400.0, + "grad_norm": 1.735256386468595, + "language_loss": 0.73696363, + "learning_rate": 2.7247806649867835e-07, + "loss": 0.75774956, + "num_input_tokens_seen": 300242915, + "step": 13920, + "time_per_iteration": 2.597569465637207 + }, + { + "auxiliary_loss_clip": 0.01038618, + "auxiliary_loss_mlp": 0.01031287, + "balance_loss_clip": 1.02169466, + "balance_loss_mlp": 1.02062988, + "epoch": 0.8369758003908011, + "flos": 21835555511040.0, + "grad_norm": 1.709092457642163, + "language_loss": 0.68974477, + "learning_rate": 2.722818488237566e-07, + "loss": 0.71044385, + "num_input_tokens_seen": 300261905, + "step": 13921, + "time_per_iteration": 2.6744916439056396 + }, + { + "auxiliary_loss_clip": 0.01053068, + "auxiliary_loss_mlp": 0.01030501, + "balance_loss_clip": 1.02518129, + "balance_loss_mlp": 1.02003479, + "epoch": 0.8370359236434691, + "flos": 21719312121600.0, + "grad_norm": 1.8906494693313534, + "language_loss": 0.85244834, + "learning_rate": 2.720856966640801e-07, + "loss": 0.8732841, + "num_input_tokens_seen": 300281145, + "step": 13922, + "time_per_iteration": 2.6165964603424072 + }, + { + "auxiliary_loss_clip": 0.01029358, + "auxiliary_loss_mlp": 0.00747577, + "balance_loss_clip": 1.02441585, + "balance_loss_mlp": 1.00040686, + "epoch": 0.837096046896137, + "flos": 23148880485120.0, + "grad_norm": 1.6236387573128253, + "language_loss": 0.71690577, + "learning_rate": 2.71889610027088e-07, + "loss": 0.73467517, + "num_input_tokens_seen": 300301610, + "step": 13923, + "time_per_iteration": 2.712918758392334 + }, + { + "auxiliary_loss_clip": 0.01042929, + "auxiliary_loss_mlp": 0.01028292, + "balance_loss_clip": 1.02564335, + "balance_loss_mlp": 1.01694369, + "epoch": 0.8371561701488051, + "flos": 24492872695680.0, + "grad_norm": 2.454077255367681, + "language_loss": 0.76339215, + "learning_rate": 2.7169358892021433e-07, + "loss": 0.78410435, + "num_input_tokens_seen": 300319420, + "step": 13924, + "time_per_iteration": 2.76863956451416 + }, + { + "auxiliary_loss_clip": 0.01039599, + "auxiliary_loss_mlp": 0.01023666, + "balance_loss_clip": 1.02371705, + "balance_loss_mlp": 1.01386058, + "epoch": 0.837216293401473, + "flos": 29206723161600.0, + "grad_norm": 1.5930358673476839, + "language_loss": 0.64236283, + "learning_rate": 2.7149763335089293e-07, + "loss": 0.66299546, + "num_input_tokens_seen": 300341325, + "step": 13925, + "time_per_iteration": 2.747795581817627 + }, + { + "auxiliary_loss_clip": 0.01046595, + "auxiliary_loss_mlp": 0.01029185, + "balance_loss_clip": 1.02696657, + "balance_loss_mlp": 1.01868296, + "epoch": 0.837276416654141, + "flos": 25265275781760.0, + "grad_norm": 1.6380070466624856, + "language_loss": 0.7429769, + "learning_rate": 2.713017433265543e-07, + "loss": 0.7637347, + "num_input_tokens_seen": 300361620, + "step": 13926, + "time_per_iteration": 2.7134931087493896 + }, + { + "auxiliary_loss_clip": 0.01052614, + "auxiliary_loss_mlp": 0.01033024, + "balance_loss_clip": 1.02579618, + "balance_loss_mlp": 1.02234268, + "epoch": 0.837336539906809, + "flos": 13882024656000.0, + "grad_norm": 1.893738566597803, + "language_loss": 0.71106362, + "learning_rate": 2.711059188546274e-07, + "loss": 0.73192, + "num_input_tokens_seen": 300378675, + "step": 13927, + "time_per_iteration": 2.6166930198669434 + }, + { + "auxiliary_loss_clip": 0.00975108, + "auxiliary_loss_mlp": 0.0099987, + "balance_loss_clip": 1.00203013, + "balance_loss_mlp": 0.9990055, + "epoch": 0.8373966631594769, + "flos": 68870599044480.0, + "grad_norm": 0.7152326268961658, + "language_loss": 0.58887464, + "learning_rate": 2.7091015994253695e-07, + "loss": 0.6086244, + "num_input_tokens_seen": 300449740, + "step": 13928, + "time_per_iteration": 3.427431583404541 + }, + { + "auxiliary_loss_clip": 0.01028159, + "auxiliary_loss_mlp": 0.01035944, + "balance_loss_clip": 1.02569103, + "balance_loss_mlp": 1.02488732, + "epoch": 0.8374567864121449, + "flos": 20448972748800.0, + "grad_norm": 2.133327269302302, + "language_loss": 0.69884849, + "learning_rate": 2.707144665977068e-07, + "loss": 0.71948951, + "num_input_tokens_seen": 300470000, + "step": 13929, + "time_per_iteration": 2.708735942840576 + }, + { + "auxiliary_loss_clip": 0.01052561, + "auxiliary_loss_mlp": 0.01027052, + "balance_loss_clip": 1.02403176, + "balance_loss_mlp": 1.01587605, + "epoch": 0.8375169096648128, + "flos": 41904197101440.0, + "grad_norm": 1.7154424828647616, + "language_loss": 0.67210674, + "learning_rate": 2.705188388275574e-07, + "loss": 0.69290292, + "num_input_tokens_seen": 300494975, + "step": 13930, + "time_per_iteration": 2.8164472579956055 + }, + { + "auxiliary_loss_clip": 0.01021715, + "auxiliary_loss_mlp": 0.01024632, + "balance_loss_clip": 1.02558851, + "balance_loss_mlp": 1.01457119, + "epoch": 0.8375770329174809, + "flos": 20009354192640.0, + "grad_norm": 1.5849409272634367, + "language_loss": 0.71420205, + "learning_rate": 2.703232766395067e-07, + "loss": 0.73466551, + "num_input_tokens_seen": 300513175, + "step": 13931, + "time_per_iteration": 2.719820976257324 + }, + { + "auxiliary_loss_clip": 0.0102369, + "auxiliary_loss_mlp": 0.0102666, + "balance_loss_clip": 1.02077734, + "balance_loss_mlp": 1.01640224, + "epoch": 0.8376371561701488, + "flos": 22783597125120.0, + "grad_norm": 1.5830213701431604, + "language_loss": 0.71798331, + "learning_rate": 2.701277800409705e-07, + "loss": 0.73848677, + "num_input_tokens_seen": 300533770, + "step": 13932, + "time_per_iteration": 2.7401273250579834 + }, + { + "auxiliary_loss_clip": 0.01001106, + "auxiliary_loss_mlp": 0.01028584, + "balance_loss_clip": 1.02387953, + "balance_loss_mlp": 1.01944661, + "epoch": 0.8376972794228168, + "flos": 23914459987200.0, + "grad_norm": 2.0597923386864343, + "language_loss": 0.66646469, + "learning_rate": 2.699323490393628e-07, + "loss": 0.68676162, + "num_input_tokens_seen": 300552995, + "step": 13933, + "time_per_iteration": 4.730133295059204 + }, + { + "auxiliary_loss_clip": 0.0103366, + "auxiliary_loss_mlp": 0.01035925, + "balance_loss_clip": 1.02388048, + "balance_loss_mlp": 1.0254823, + "epoch": 0.8377574026754847, + "flos": 13734718980480.0, + "grad_norm": 1.9272061278175416, + "language_loss": 0.76700711, + "learning_rate": 2.697369836420933e-07, + "loss": 0.78770292, + "num_input_tokens_seen": 300570275, + "step": 13934, + "time_per_iteration": 2.687028169631958 + }, + { + "auxiliary_loss_clip": 0.01053126, + "auxiliary_loss_mlp": 0.01025908, + "balance_loss_clip": 1.02792478, + "balance_loss_mlp": 1.01593065, + "epoch": 0.8378175259281527, + "flos": 21651333632640.0, + "grad_norm": 1.5639917497096258, + "language_loss": 0.77303016, + "learning_rate": 2.6954168385657115e-07, + "loss": 0.7938205, + "num_input_tokens_seen": 300590875, + "step": 13935, + "time_per_iteration": 2.9716289043426514 + }, + { + "auxiliary_loss_clip": 0.01033398, + "auxiliary_loss_mlp": 0.01026483, + "balance_loss_clip": 1.02553082, + "balance_loss_mlp": 1.01573586, + "epoch": 0.8378776491808206, + "flos": 15448806973440.0, + "grad_norm": 2.2377034572343395, + "language_loss": 0.55862117, + "learning_rate": 2.6934644969020135e-07, + "loss": 0.57921994, + "num_input_tokens_seen": 300607490, + "step": 13936, + "time_per_iteration": 2.705672025680542 + }, + { + "auxiliary_loss_clip": 0.01043309, + "auxiliary_loss_mlp": 0.01028551, + "balance_loss_clip": 1.0218904, + "balance_loss_mlp": 1.01792347, + "epoch": 0.8379377724334887, + "flos": 14720395069440.0, + "grad_norm": 1.7014558519180325, + "language_loss": 0.8954311, + "learning_rate": 2.691512811503882e-07, + "loss": 0.91614974, + "num_input_tokens_seen": 300623635, + "step": 13937, + "time_per_iteration": 2.6276907920837402 + }, + { + "auxiliary_loss_clip": 0.01053007, + "auxiliary_loss_mlp": 0.01027425, + "balance_loss_clip": 1.02551055, + "balance_loss_mlp": 1.0170958, + "epoch": 0.8379978956861566, + "flos": 24535247765760.0, + "grad_norm": 1.716100854928126, + "language_loss": 0.81943941, + "learning_rate": 2.689561782445313e-07, + "loss": 0.84024376, + "num_input_tokens_seen": 300643835, + "step": 13938, + "time_per_iteration": 2.6457278728485107 + }, + { + "auxiliary_loss_clip": 0.01052733, + "auxiliary_loss_mlp": 0.01027151, + "balance_loss_clip": 1.02436709, + "balance_loss_mlp": 1.01624346, + "epoch": 0.8380580189388246, + "flos": 18952611045120.0, + "grad_norm": 1.6939181695813361, + "language_loss": 0.70592892, + "learning_rate": 2.6876114098002965e-07, + "loss": 0.72672772, + "num_input_tokens_seen": 300662500, + "step": 13939, + "time_per_iteration": 2.6465654373168945 + }, + { + "auxiliary_loss_clip": 0.01030595, + "auxiliary_loss_mlp": 0.01033223, + "balance_loss_clip": 1.02407312, + "balance_loss_mlp": 1.02248847, + "epoch": 0.8381181421914926, + "flos": 26540283922560.0, + "grad_norm": 1.8034832639636103, + "language_loss": 0.7644515, + "learning_rate": 2.6856616936428e-07, + "loss": 0.78508973, + "num_input_tokens_seen": 300681480, + "step": 13940, + "time_per_iteration": 2.7161121368408203 + }, + { + "auxiliary_loss_clip": 0.01044692, + "auxiliary_loss_mlp": 0.01030111, + "balance_loss_clip": 1.02265644, + "balance_loss_mlp": 1.019835, + "epoch": 0.8381782654441605, + "flos": 23291481479040.0, + "grad_norm": 1.586361595978233, + "language_loss": 0.76607627, + "learning_rate": 2.6837126340467374e-07, + "loss": 0.78682435, + "num_input_tokens_seen": 300699165, + "step": 13941, + "time_per_iteration": 2.6003005504608154 + }, + { + "auxiliary_loss_clip": 0.01026194, + "auxiliary_loss_mlp": 0.01027796, + "balance_loss_clip": 1.02584636, + "balance_loss_mlp": 1.01665044, + "epoch": 0.8382383886968285, + "flos": 26758800311040.0, + "grad_norm": 2.282498183151096, + "language_loss": 0.73771042, + "learning_rate": 2.6817642310860276e-07, + "loss": 0.75825024, + "num_input_tokens_seen": 300714615, + "step": 13942, + "time_per_iteration": 2.7272188663482666 + }, + { + "auxiliary_loss_clip": 0.01023373, + "auxiliary_loss_mlp": 0.01034227, + "balance_loss_clip": 1.02544856, + "balance_loss_mlp": 1.02212083, + "epoch": 0.8382985119494964, + "flos": 26104544035200.0, + "grad_norm": 1.4378206060383887, + "language_loss": 0.79515386, + "learning_rate": 2.679816484834554e-07, + "loss": 0.81572986, + "num_input_tokens_seen": 300734860, + "step": 13943, + "time_per_iteration": 2.7887823581695557 + }, + { + "auxiliary_loss_clip": 0.01010862, + "auxiliary_loss_mlp": 0.01028041, + "balance_loss_clip": 1.02126479, + "balance_loss_mlp": 1.0171926, + "epoch": 0.8383586352021645, + "flos": 16435129507200.0, + "grad_norm": 1.8786541990457815, + "language_loss": 0.85143191, + "learning_rate": 2.6778693953661766e-07, + "loss": 0.87182099, + "num_input_tokens_seen": 300752735, + "step": 13944, + "time_per_iteration": 2.7634496688842773 + }, + { + "auxiliary_loss_clip": 0.00984647, + "auxiliary_loss_mlp": 0.00746761, + "balance_loss_clip": 1.00112271, + "balance_loss_mlp": 1.00080538, + "epoch": 0.8384187584548324, + "flos": 64195532288640.0, + "grad_norm": 0.619538794361321, + "language_loss": 0.50302023, + "learning_rate": 2.6759229627547263e-07, + "loss": 0.5203343, + "num_input_tokens_seen": 300820760, + "step": 13945, + "time_per_iteration": 3.3952558040618896 + }, + { + "auxiliary_loss_clip": 0.010183, + "auxiliary_loss_mlp": 0.01029297, + "balance_loss_clip": 1.02532685, + "balance_loss_mlp": 1.018592, + "epoch": 0.8384788817075004, + "flos": 22382905933440.0, + "grad_norm": 1.7771740005404968, + "language_loss": 0.65636599, + "learning_rate": 2.673977187074017e-07, + "loss": 0.67684191, + "num_input_tokens_seen": 300840025, + "step": 13946, + "time_per_iteration": 4.506995677947998 + }, + { + "auxiliary_loss_clip": 0.01014519, + "auxiliary_loss_mlp": 0.01026894, + "balance_loss_clip": 1.02302063, + "balance_loss_mlp": 1.01590884, + "epoch": 0.8385390049601683, + "flos": 29496845312640.0, + "grad_norm": 1.6635744256224216, + "language_loss": 0.67304379, + "learning_rate": 2.672032068397829e-07, + "loss": 0.6934579, + "num_input_tokens_seen": 300860380, + "step": 13947, + "time_per_iteration": 2.791078805923462 + }, + { + "auxiliary_loss_clip": 0.01035975, + "auxiliary_loss_mlp": 0.01029561, + "balance_loss_clip": 1.02401781, + "balance_loss_mlp": 1.01758099, + "epoch": 0.8385991282128363, + "flos": 32707797799680.0, + "grad_norm": 1.5632535834152088, + "language_loss": 0.69819319, + "learning_rate": 2.6700876067999176e-07, + "loss": 0.71884859, + "num_input_tokens_seen": 300881895, + "step": 13948, + "time_per_iteration": 2.8031625747680664 + }, + { + "auxiliary_loss_clip": 0.01038414, + "auxiliary_loss_mlp": 0.01028637, + "balance_loss_clip": 1.02365208, + "balance_loss_mlp": 1.01982713, + "epoch": 0.8386592514655042, + "flos": 25441022050560.0, + "grad_norm": 1.8462829671812284, + "language_loss": 0.85277128, + "learning_rate": 2.6681438023540194e-07, + "loss": 0.87344176, + "num_input_tokens_seen": 300901575, + "step": 13949, + "time_per_iteration": 2.7162673473358154 + }, + { + "auxiliary_loss_clip": 0.01045318, + "auxiliary_loss_mlp": 0.01025067, + "balance_loss_clip": 1.02825725, + "balance_loss_mlp": 1.01463056, + "epoch": 0.8387193747181723, + "flos": 22015898720640.0, + "grad_norm": 1.769165440959685, + "language_loss": 0.70884752, + "learning_rate": 2.66620065513385e-07, + "loss": 0.72955132, + "num_input_tokens_seen": 300919735, + "step": 13950, + "time_per_iteration": 2.744537591934204 + }, + { + "auxiliary_loss_clip": 0.01042839, + "auxiliary_loss_mlp": 0.01026929, + "balance_loss_clip": 1.02429426, + "balance_loss_mlp": 1.01651025, + "epoch": 0.8387794979708402, + "flos": 18150223080960.0, + "grad_norm": 1.693748812261547, + "language_loss": 0.64756501, + "learning_rate": 2.6642581652130913e-07, + "loss": 0.66826272, + "num_input_tokens_seen": 300939150, + "step": 13951, + "time_per_iteration": 2.6522011756896973 + }, + { + "auxiliary_loss_clip": 0.01052639, + "auxiliary_loss_mlp": 0.01026817, + "balance_loss_clip": 1.02557921, + "balance_loss_mlp": 1.01670814, + "epoch": 0.8388396212235082, + "flos": 25411216740480.0, + "grad_norm": 1.4639579172902222, + "language_loss": 0.7019679, + "learning_rate": 2.662316332665393e-07, + "loss": 0.72276247, + "num_input_tokens_seen": 300959730, + "step": 13952, + "time_per_iteration": 2.669952869415283 + }, + { + "auxiliary_loss_clip": 0.01050353, + "auxiliary_loss_mlp": 0.0102649, + "balance_loss_clip": 1.02527583, + "balance_loss_mlp": 1.01666677, + "epoch": 0.8388997444761762, + "flos": 22273055164800.0, + "grad_norm": 3.7712961097199305, + "language_loss": 0.72839898, + "learning_rate": 2.6603751575643987e-07, + "loss": 0.74916738, + "num_input_tokens_seen": 300976120, + "step": 13953, + "time_per_iteration": 2.6426963806152344 + }, + { + "auxiliary_loss_clip": 0.0099316, + "auxiliary_loss_mlp": 0.01028938, + "balance_loss_clip": 1.02142072, + "balance_loss_mlp": 1.01763105, + "epoch": 0.8389598677288441, + "flos": 19573219255680.0, + "grad_norm": 1.8761883541452244, + "language_loss": 0.68166935, + "learning_rate": 2.6584346399837176e-07, + "loss": 0.70189035, + "num_input_tokens_seen": 300995080, + "step": 13954, + "time_per_iteration": 2.740213632583618 + }, + { + "auxiliary_loss_clip": 0.01044105, + "auxiliary_loss_mlp": 0.01029486, + "balance_loss_clip": 1.02663493, + "balance_loss_mlp": 1.01960909, + "epoch": 0.8390199909815121, + "flos": 17384715406080.0, + "grad_norm": 1.9298266668721566, + "language_loss": 0.73420089, + "learning_rate": 2.656494779996932e-07, + "loss": 0.75493681, + "num_input_tokens_seen": 301012920, + "step": 13955, + "time_per_iteration": 2.6206393241882324 + }, + { + "auxiliary_loss_clip": 0.01000913, + "auxiliary_loss_mlp": 0.01027129, + "balance_loss_clip": 1.02062571, + "balance_loss_mlp": 1.01640666, + "epoch": 0.83908011423418, + "flos": 24639639667200.0, + "grad_norm": 2.309598491828318, + "language_loss": 0.66177309, + "learning_rate": 2.6545555776775995e-07, + "loss": 0.68205357, + "num_input_tokens_seen": 301028875, + "step": 13956, + "time_per_iteration": 2.8587396144866943 + }, + { + "auxiliary_loss_clip": 0.01053388, + "auxiliary_loss_mlp": 0.01033239, + "balance_loss_clip": 1.02509332, + "balance_loss_mlp": 1.02246273, + "epoch": 0.8391402374868481, + "flos": 24718356322560.0, + "grad_norm": 1.7190114620229997, + "language_loss": 0.79738462, + "learning_rate": 2.6526170330992667e-07, + "loss": 0.81825089, + "num_input_tokens_seen": 301050115, + "step": 13957, + "time_per_iteration": 2.6238467693328857 + }, + { + "auxiliary_loss_clip": 0.009607, + "auxiliary_loss_mlp": 0.01011141, + "balance_loss_clip": 1.00422168, + "balance_loss_mlp": 1.00992489, + "epoch": 0.839200360739516, + "flos": 56871695784960.0, + "grad_norm": 0.7568052148730076, + "language_loss": 0.53364486, + "learning_rate": 2.6506791463354283e-07, + "loss": 0.55336326, + "num_input_tokens_seen": 301114155, + "step": 13958, + "time_per_iteration": 3.3948121070861816 + }, + { + "auxiliary_loss_clip": 0.0105199, + "auxiliary_loss_mlp": 0.01029737, + "balance_loss_clip": 1.02545762, + "balance_loss_mlp": 1.01863885, + "epoch": 0.839260483992184, + "flos": 18332792933760.0, + "grad_norm": 1.8803139823449555, + "language_loss": 0.73606205, + "learning_rate": 2.648741917459574e-07, + "loss": 0.75687927, + "num_input_tokens_seen": 301133150, + "step": 13959, + "time_per_iteration": 2.5993731021881104 + }, + { + "auxiliary_loss_clip": 0.01031742, + "auxiliary_loss_mlp": 0.01024406, + "balance_loss_clip": 1.02556372, + "balance_loss_mlp": 1.01457143, + "epoch": 0.8393206072448519, + "flos": 27087921653760.0, + "grad_norm": 1.7510596813032149, + "language_loss": 0.55658114, + "learning_rate": 2.646805346545169e-07, + "loss": 0.5771426, + "num_input_tokens_seen": 301153600, + "step": 13960, + "time_per_iteration": 2.7317309379577637 + }, + { + "auxiliary_loss_clip": 0.00979694, + "auxiliary_loss_mlp": 0.01000487, + "balance_loss_clip": 1.00340211, + "balance_loss_mlp": 0.99952698, + "epoch": 0.8393807304975199, + "flos": 61521192057600.0, + "grad_norm": 0.7719129130951333, + "language_loss": 0.60728633, + "learning_rate": 2.6448694336656397e-07, + "loss": 0.62708819, + "num_input_tokens_seen": 301214335, + "step": 13961, + "time_per_iteration": 3.410351276397705 + }, + { + "auxiliary_loss_clip": 0.01007435, + "auxiliary_loss_mlp": 0.01034009, + "balance_loss_clip": 1.01844883, + "balance_loss_mlp": 1.02324486, + "epoch": 0.8394408537501878, + "flos": 14894848448640.0, + "grad_norm": 2.0739772204837763, + "language_loss": 0.68228447, + "learning_rate": 2.642934178894405e-07, + "loss": 0.70269895, + "num_input_tokens_seen": 301228960, + "step": 13962, + "time_per_iteration": 4.337152004241943 + }, + { + "auxiliary_loss_clip": 0.01034248, + "auxiliary_loss_mlp": 0.01025845, + "balance_loss_clip": 1.02486491, + "balance_loss_mlp": 1.01572442, + "epoch": 0.8395009770028559, + "flos": 17412186332160.0, + "grad_norm": 2.2405373588429605, + "language_loss": 0.73308837, + "learning_rate": 2.640999582304841e-07, + "loss": 0.75368935, + "num_input_tokens_seen": 301245875, + "step": 13963, + "time_per_iteration": 4.243620872497559 + }, + { + "auxiliary_loss_clip": 0.01037404, + "auxiliary_loss_mlp": 0.01033081, + "balance_loss_clip": 1.02264857, + "balance_loss_mlp": 1.0230974, + "epoch": 0.8395611002555238, + "flos": 27924747782400.0, + "grad_norm": 1.764928710204359, + "language_loss": 0.76463491, + "learning_rate": 2.6390656439703173e-07, + "loss": 0.78533971, + "num_input_tokens_seen": 301265550, + "step": 13964, + "time_per_iteration": 2.6519062519073486 + }, + { + "auxiliary_loss_clip": 0.01045258, + "auxiliary_loss_mlp": 0.01034183, + "balance_loss_clip": 1.02624047, + "balance_loss_mlp": 1.02286434, + "epoch": 0.8396212235081918, + "flos": 11100922225920.0, + "grad_norm": 1.8264469088767172, + "language_loss": 0.7771306, + "learning_rate": 2.637132363964161e-07, + "loss": 0.797925, + "num_input_tokens_seen": 301282035, + "step": 13965, + "time_per_iteration": 2.6772093772888184 + }, + { + "auxiliary_loss_clip": 0.01049531, + "auxiliary_loss_mlp": 0.01028396, + "balance_loss_clip": 1.02360642, + "balance_loss_mlp": 1.01860261, + "epoch": 0.8396813467608598, + "flos": 35735641729920.0, + "grad_norm": 2.4417249469597135, + "language_loss": 0.66026783, + "learning_rate": 2.635199742359684e-07, + "loss": 0.68104708, + "num_input_tokens_seen": 301305210, + "step": 13966, + "time_per_iteration": 2.7734432220458984 + }, + { + "auxiliary_loss_clip": 0.0104092, + "auxiliary_loss_mlp": 0.01030484, + "balance_loss_clip": 1.024122, + "balance_loss_mlp": 1.02039313, + "epoch": 0.8397414700135277, + "flos": 26176724415360.0, + "grad_norm": 1.7173471711123753, + "language_loss": 0.74666691, + "learning_rate": 2.633267779230177e-07, + "loss": 0.76738095, + "num_input_tokens_seen": 301324885, + "step": 13967, + "time_per_iteration": 2.7607061862945557 + }, + { + "auxiliary_loss_clip": 0.01041275, + "auxiliary_loss_mlp": 0.01027491, + "balance_loss_clip": 1.02494192, + "balance_loss_mlp": 1.01733446, + "epoch": 0.8398015932661957, + "flos": 18333116156160.0, + "grad_norm": 2.058846538422308, + "language_loss": 0.82934642, + "learning_rate": 2.6313364746488974e-07, + "loss": 0.85003412, + "num_input_tokens_seen": 301343070, + "step": 13968, + "time_per_iteration": 2.755115509033203 + }, + { + "auxiliary_loss_clip": 0.01037329, + "auxiliary_loss_mlp": 0.01031266, + "balance_loss_clip": 1.02731657, + "balance_loss_mlp": 1.02061427, + "epoch": 0.8398617165188637, + "flos": 17379507934080.0, + "grad_norm": 1.894255367151854, + "language_loss": 0.7725755, + "learning_rate": 2.629405828689075e-07, + "loss": 0.79326141, + "num_input_tokens_seen": 301359280, + "step": 13969, + "time_per_iteration": 2.7504398822784424 + }, + { + "auxiliary_loss_clip": 0.01044279, + "auxiliary_loss_mlp": 0.01026595, + "balance_loss_clip": 1.02421749, + "balance_loss_mlp": 1.01497817, + "epoch": 0.8399218397715317, + "flos": 22929681738240.0, + "grad_norm": 2.068381293994915, + "language_loss": 0.77204347, + "learning_rate": 2.627475841423923e-07, + "loss": 0.79275227, + "num_input_tokens_seen": 301376465, + "step": 13970, + "time_per_iteration": 2.6612184047698975 + }, + { + "auxiliary_loss_clip": 0.01041392, + "auxiliary_loss_mlp": 0.01031784, + "balance_loss_clip": 1.02427375, + "balance_loss_mlp": 1.02188349, + "epoch": 0.8399819630241996, + "flos": 23149562843520.0, + "grad_norm": 2.0998738260720606, + "language_loss": 0.72387993, + "learning_rate": 2.625546512926633e-07, + "loss": 0.74461174, + "num_input_tokens_seen": 301396000, + "step": 13971, + "time_per_iteration": 2.663249969482422 + }, + { + "auxiliary_loss_clip": 0.01038508, + "auxiliary_loss_mlp": 0.01026211, + "balance_loss_clip": 1.02270997, + "balance_loss_mlp": 1.01525533, + "epoch": 0.8400420862768676, + "flos": 16397423205120.0, + "grad_norm": 2.441172407069201, + "language_loss": 0.77519941, + "learning_rate": 2.623617843270358e-07, + "loss": 0.79584658, + "num_input_tokens_seen": 301413160, + "step": 13972, + "time_per_iteration": 2.6398513317108154 + }, + { + "auxiliary_loss_clip": 0.00997437, + "auxiliary_loss_mlp": 0.01035388, + "balance_loss_clip": 1.02091551, + "balance_loss_mlp": 1.02378249, + "epoch": 0.8401022095295355, + "flos": 21287486816640.0, + "grad_norm": 1.3253443963433857, + "language_loss": 0.68507636, + "learning_rate": 2.6216898325282333e-07, + "loss": 0.70540452, + "num_input_tokens_seen": 301433325, + "step": 13973, + "time_per_iteration": 2.9279937744140625 + }, + { + "auxiliary_loss_clip": 0.0104464, + "auxiliary_loss_mlp": 0.01023688, + "balance_loss_clip": 1.0261426, + "balance_loss_mlp": 1.01298904, + "epoch": 0.8401623327822035, + "flos": 17311313963520.0, + "grad_norm": 1.8563713443216194, + "language_loss": 0.78015614, + "learning_rate": 2.619762480773382e-07, + "loss": 0.80083936, + "num_input_tokens_seen": 301450265, + "step": 13974, + "time_per_iteration": 2.67648983001709 + }, + { + "auxiliary_loss_clip": 0.01054113, + "auxiliary_loss_mlp": 0.01026889, + "balance_loss_clip": 1.0268302, + "balance_loss_mlp": 1.01619053, + "epoch": 0.8402224560348714, + "flos": 22236677665920.0, + "grad_norm": 1.5612551351939254, + "language_loss": 0.7237637, + "learning_rate": 2.617835788078868e-07, + "loss": 0.74457371, + "num_input_tokens_seen": 301470760, + "step": 13975, + "time_per_iteration": 2.7269506454467773 + }, + { + "auxiliary_loss_clip": 0.01039102, + "auxiliary_loss_mlp": 0.0102409, + "balance_loss_clip": 1.02357411, + "balance_loss_mlp": 1.0138979, + "epoch": 0.8402825792875395, + "flos": 20229953569920.0, + "grad_norm": 1.655541276231117, + "language_loss": 0.72605073, + "learning_rate": 2.6159097545177645e-07, + "loss": 0.7466827, + "num_input_tokens_seen": 301489425, + "step": 13976, + "time_per_iteration": 2.6581804752349854 + }, + { + "auxiliary_loss_clip": 0.01060269, + "auxiliary_loss_mlp": 0.0074751, + "balance_loss_clip": 1.02420533, + "balance_loss_mlp": 1.00034583, + "epoch": 0.8403427025402074, + "flos": 23289973107840.0, + "grad_norm": 1.6400125138553079, + "language_loss": 0.7187736, + "learning_rate": 2.61398438016311e-07, + "loss": 0.73685145, + "num_input_tokens_seen": 301508885, + "step": 13977, + "time_per_iteration": 2.7793896198272705 + }, + { + "auxiliary_loss_clip": 0.01048282, + "auxiliary_loss_mlp": 0.01026685, + "balance_loss_clip": 1.02133012, + "balance_loss_mlp": 1.01685047, + "epoch": 0.8404028257928754, + "flos": 32675586278400.0, + "grad_norm": 1.5003758168662507, + "language_loss": 0.68738407, + "learning_rate": 2.6120596650879043e-07, + "loss": 0.70813376, + "num_input_tokens_seen": 301533780, + "step": 13978, + "time_per_iteration": 2.7190606594085693 + }, + { + "auxiliary_loss_clip": 0.01023429, + "auxiliary_loss_mlp": 0.01026724, + "balance_loss_clip": 1.0213058, + "balance_loss_mlp": 1.01675749, + "epoch": 0.8404629490455434, + "flos": 16180522928640.0, + "grad_norm": 1.6930240040082305, + "language_loss": 0.77936757, + "learning_rate": 2.610135609365145e-07, + "loss": 0.79986906, + "num_input_tokens_seen": 301551775, + "step": 13979, + "time_per_iteration": 2.743216037750244 + }, + { + "auxiliary_loss_clip": 0.01053176, + "auxiliary_loss_mlp": 0.0102839, + "balance_loss_clip": 1.02687788, + "balance_loss_mlp": 1.01779795, + "epoch": 0.8405230722982113, + "flos": 15194451790080.0, + "grad_norm": 1.809776764706287, + "language_loss": 0.78140879, + "learning_rate": 2.60821221306778e-07, + "loss": 0.80222452, + "num_input_tokens_seen": 301570495, + "step": 13980, + "time_per_iteration": 4.333677291870117 + }, + { + "auxiliary_loss_clip": 0.01025797, + "auxiliary_loss_mlp": 0.01024865, + "balance_loss_clip": 1.02270246, + "balance_loss_mlp": 1.0148809, + "epoch": 0.8405831955508793, + "flos": 27812418975360.0, + "grad_norm": 1.6253622072356402, + "language_loss": 0.86530167, + "learning_rate": 2.606289476268757e-07, + "loss": 0.88580829, + "num_input_tokens_seen": 301591705, + "step": 13981, + "time_per_iteration": 2.7746529579162598 + }, + { + "auxiliary_loss_clip": 0.01049975, + "auxiliary_loss_mlp": 0.0103272, + "balance_loss_clip": 1.02391016, + "balance_loss_mlp": 1.02225983, + "epoch": 0.8406433188035473, + "flos": 23769452782080.0, + "grad_norm": 3.041417725492904, + "language_loss": 0.68171954, + "learning_rate": 2.6043673990409745e-07, + "loss": 0.70254648, + "num_input_tokens_seen": 301611670, + "step": 13982, + "time_per_iteration": 2.6233229637145996 + }, + { + "auxiliary_loss_clip": 0.01016281, + "auxiliary_loss_mlp": 0.01034699, + "balance_loss_clip": 1.02402818, + "balance_loss_mlp": 1.02251005, + "epoch": 0.8407034420562153, + "flos": 29205681667200.0, + "grad_norm": 1.6815180964179577, + "language_loss": 0.67837167, + "learning_rate": 2.602445981457324e-07, + "loss": 0.69888139, + "num_input_tokens_seen": 301632540, + "step": 13983, + "time_per_iteration": 2.945626974105835 + }, + { + "auxiliary_loss_clip": 0.01021265, + "auxiliary_loss_mlp": 0.01028894, + "balance_loss_clip": 1.02040648, + "balance_loss_mlp": 1.01780713, + "epoch": 0.8407635653088832, + "flos": 26360084367360.0, + "grad_norm": 1.6712110031114367, + "language_loss": 0.78510809, + "learning_rate": 2.6005252235906684e-07, + "loss": 0.8056097, + "num_input_tokens_seen": 301651480, + "step": 13984, + "time_per_iteration": 2.7709155082702637 + }, + { + "auxiliary_loss_clip": 0.0104937, + "auxiliary_loss_mlp": 0.01026406, + "balance_loss_clip": 1.02300847, + "balance_loss_mlp": 1.01650596, + "epoch": 0.8408236885615512, + "flos": 21468799693440.0, + "grad_norm": 2.088628005629009, + "language_loss": 0.60406393, + "learning_rate": 2.598605125513842e-07, + "loss": 0.62482172, + "num_input_tokens_seen": 301670010, + "step": 13985, + "time_per_iteration": 2.6876635551452637 + }, + { + "auxiliary_loss_clip": 0.01017719, + "auxiliary_loss_mlp": 0.0102681, + "balance_loss_clip": 1.02280951, + "balance_loss_mlp": 1.01602745, + "epoch": 0.8408838118142191, + "flos": 22963724853120.0, + "grad_norm": 2.030062007433132, + "language_loss": 0.82134622, + "learning_rate": 2.5966856872996467e-07, + "loss": 0.84179151, + "num_input_tokens_seen": 301689785, + "step": 13986, + "time_per_iteration": 2.7879319190979004 + }, + { + "auxiliary_loss_clip": 0.01043135, + "auxiliary_loss_mlp": 0.00747623, + "balance_loss_clip": 1.02620625, + "balance_loss_mlp": 1.00039637, + "epoch": 0.8409439350668871, + "flos": 26800026145920.0, + "grad_norm": 1.773842457036343, + "language_loss": 0.65805745, + "learning_rate": 2.5947669090208755e-07, + "loss": 0.67596501, + "num_input_tokens_seen": 301712225, + "step": 13987, + "time_per_iteration": 2.7540316581726074 + }, + { + "auxiliary_loss_clip": 0.01061774, + "auxiliary_loss_mlp": 0.00747681, + "balance_loss_clip": 1.02526212, + "balance_loss_mlp": 1.0004611, + "epoch": 0.841004058319555, + "flos": 26578672583040.0, + "grad_norm": 8.776505390044886, + "language_loss": 0.67465001, + "learning_rate": 2.5928487907502906e-07, + "loss": 0.69274455, + "num_input_tokens_seen": 301730955, + "step": 13988, + "time_per_iteration": 2.5948855876922607 + }, + { + "auxiliary_loss_clip": 0.01058382, + "auxiliary_loss_mlp": 0.01035899, + "balance_loss_clip": 1.02864957, + "balance_loss_mlp": 1.02421641, + "epoch": 0.8410641815722231, + "flos": 14501878680960.0, + "grad_norm": 1.9757598977934918, + "language_loss": 0.81131601, + "learning_rate": 2.590931332560622e-07, + "loss": 0.83225888, + "num_input_tokens_seen": 301746930, + "step": 13989, + "time_per_iteration": 2.692272663116455 + }, + { + "auxiliary_loss_clip": 0.01045667, + "auxiliary_loss_mlp": 0.01023259, + "balance_loss_clip": 1.02192354, + "balance_loss_mlp": 1.01230323, + "epoch": 0.841124304824891, + "flos": 29166682475520.0, + "grad_norm": 1.7034670690246436, + "language_loss": 0.75307167, + "learning_rate": 2.5890145345245826e-07, + "loss": 0.77376086, + "num_input_tokens_seen": 301766945, + "step": 13990, + "time_per_iteration": 2.751396417617798 + }, + { + "auxiliary_loss_clip": 0.01048649, + "auxiliary_loss_mlp": 0.01030416, + "balance_loss_clip": 1.02339792, + "balance_loss_mlp": 1.02058697, + "epoch": 0.841184428077559, + "flos": 22412028885120.0, + "grad_norm": 1.9039531823655937, + "language_loss": 0.8072018, + "learning_rate": 2.5870983967148597e-07, + "loss": 0.82799244, + "num_input_tokens_seen": 301785460, + "step": 13991, + "time_per_iteration": 2.708651065826416 + }, + { + "auxiliary_loss_clip": 0.01030222, + "auxiliary_loss_mlp": 0.01032364, + "balance_loss_clip": 1.02470469, + "balance_loss_mlp": 1.02260637, + "epoch": 0.841244551330227, + "flos": 22962791099520.0, + "grad_norm": 2.563952110105526, + "language_loss": 0.70954883, + "learning_rate": 2.585182919204105e-07, + "loss": 0.73017472, + "num_input_tokens_seen": 301804180, + "step": 13992, + "time_per_iteration": 2.729210615158081 + }, + { + "auxiliary_loss_clip": 0.01024828, + "auxiliary_loss_mlp": 0.01020723, + "balance_loss_clip": 1.02252221, + "balance_loss_mlp": 1.01056039, + "epoch": 0.8413046745828949, + "flos": 21032736583680.0, + "grad_norm": 1.7518341026358226, + "language_loss": 0.76611805, + "learning_rate": 2.583268102064959e-07, + "loss": 0.78657359, + "num_input_tokens_seen": 301823670, + "step": 13993, + "time_per_iteration": 4.443119764328003 + }, + { + "auxiliary_loss_clip": 0.01051945, + "auxiliary_loss_mlp": 0.01029853, + "balance_loss_clip": 1.02265167, + "balance_loss_mlp": 1.01798546, + "epoch": 0.841364797835563, + "flos": 27052082858880.0, + "grad_norm": 2.0526904222338693, + "language_loss": 0.74488652, + "learning_rate": 2.5813539453700393e-07, + "loss": 0.76570451, + "num_input_tokens_seen": 301845890, + "step": 13994, + "time_per_iteration": 2.6582913398742676 + }, + { + "auxiliary_loss_clip": 0.01048207, + "auxiliary_loss_mlp": 0.0102632, + "balance_loss_clip": 1.02357495, + "balance_loss_mlp": 1.01703334, + "epoch": 0.8414249210882309, + "flos": 17895688329600.0, + "grad_norm": 1.5364260286735283, + "language_loss": 0.59488928, + "learning_rate": 2.5794404491919163e-07, + "loss": 0.61563456, + "num_input_tokens_seen": 301863985, + "step": 13995, + "time_per_iteration": 2.709963083267212 + }, + { + "auxiliary_loss_clip": 0.0105065, + "auxiliary_loss_mlp": 0.01029179, + "balance_loss_clip": 1.02479362, + "balance_loss_mlp": 1.01855111, + "epoch": 0.8414850443408989, + "flos": 25441201618560.0, + "grad_norm": 1.7128089237210629, + "language_loss": 0.71680844, + "learning_rate": 2.577527613603163e-07, + "loss": 0.73760664, + "num_input_tokens_seen": 301882765, + "step": 13996, + "time_per_iteration": 2.773510694503784 + }, + { + "auxiliary_loss_clip": 0.01034487, + "auxiliary_loss_mlp": 0.01025476, + "balance_loss_clip": 1.02239478, + "balance_loss_mlp": 1.01539648, + "epoch": 0.8415451675935668, + "flos": 23220055284480.0, + "grad_norm": 1.7667675114546089, + "language_loss": 0.64003843, + "learning_rate": 2.5756154386763017e-07, + "loss": 0.66063809, + "num_input_tokens_seen": 301902720, + "step": 13997, + "time_per_iteration": 2.727691888809204 + }, + { + "auxiliary_loss_clip": 0.01046338, + "auxiliary_loss_mlp": 0.01031687, + "balance_loss_clip": 1.02734423, + "balance_loss_mlp": 1.0203979, + "epoch": 0.8416052908462348, + "flos": 18546496899840.0, + "grad_norm": 2.0776554530350144, + "language_loss": 0.82170606, + "learning_rate": 2.5737039244838565e-07, + "loss": 0.84248638, + "num_input_tokens_seen": 301921245, + "step": 13998, + "time_per_iteration": 2.7235190868377686 + }, + { + "auxiliary_loss_clip": 0.01053819, + "auxiliary_loss_mlp": 0.00747737, + "balance_loss_clip": 1.02675223, + "balance_loss_mlp": 1.00042951, + "epoch": 0.8416654140989027, + "flos": 26105190480000.0, + "grad_norm": 1.5474013775110234, + "language_loss": 0.80095685, + "learning_rate": 2.5717930710982984e-07, + "loss": 0.81897247, + "num_input_tokens_seen": 301942320, + "step": 13999, + "time_per_iteration": 2.7529714107513428 + }, + { + "auxiliary_loss_clip": 0.01048365, + "auxiliary_loss_mlp": 0.01028525, + "balance_loss_clip": 1.02358365, + "balance_loss_mlp": 1.01736128, + "epoch": 0.8417255373515707, + "flos": 26433270328320.0, + "grad_norm": 2.07605290896217, + "language_loss": 0.66812563, + "learning_rate": 2.569882878592096e-07, + "loss": 0.68889451, + "num_input_tokens_seen": 301963110, + "step": 14000, + "time_per_iteration": 2.6786270141601562 + }, + { + "auxiliary_loss_clip": 0.01053825, + "auxiliary_loss_mlp": 0.01027917, + "balance_loss_clip": 1.02504754, + "balance_loss_mlp": 1.01687205, + "epoch": 0.8417856606042387, + "flos": 24717745791360.0, + "grad_norm": 1.4389661726222593, + "language_loss": 0.79210776, + "learning_rate": 2.5679733470376885e-07, + "loss": 0.81292522, + "num_input_tokens_seen": 301984915, + "step": 14001, + "time_per_iteration": 2.704929828643799 + }, + { + "auxiliary_loss_clip": 0.01006775, + "auxiliary_loss_mlp": 0.0102984, + "balance_loss_clip": 1.02414954, + "balance_loss_mlp": 1.01937938, + "epoch": 0.8418457838569067, + "flos": 20850849089280.0, + "grad_norm": 1.5957436968629433, + "language_loss": 0.78633928, + "learning_rate": 2.5660644765074703e-07, + "loss": 0.80670542, + "num_input_tokens_seen": 302004095, + "step": 14002, + "time_per_iteration": 2.7765703201293945 + }, + { + "auxiliary_loss_clip": 0.01020518, + "auxiliary_loss_mlp": 0.00747482, + "balance_loss_clip": 1.02377033, + "balance_loss_mlp": 1.00037074, + "epoch": 0.8419059071095746, + "flos": 28660629715200.0, + "grad_norm": 1.468659259477111, + "language_loss": 0.7838009, + "learning_rate": 2.5641562670738334e-07, + "loss": 0.80148089, + "num_input_tokens_seen": 302027250, + "step": 14003, + "time_per_iteration": 2.8741424083709717 + }, + { + "auxiliary_loss_clip": 0.01042656, + "auxiliary_loss_mlp": 0.01025492, + "balance_loss_clip": 1.02571237, + "balance_loss_mlp": 1.0150075, + "epoch": 0.8419660303622426, + "flos": 21653596189440.0, + "grad_norm": 1.7902716283279425, + "language_loss": 0.6577605, + "learning_rate": 2.5622487188091436e-07, + "loss": 0.67844194, + "num_input_tokens_seen": 302046950, + "step": 14004, + "time_per_iteration": 2.977581024169922 + }, + { + "auxiliary_loss_clip": 0.01047646, + "auxiliary_loss_mlp": 0.0103129, + "balance_loss_clip": 1.02279544, + "balance_loss_mlp": 1.02014971, + "epoch": 0.8420261536149106, + "flos": 25301114576640.0, + "grad_norm": 2.4771739733962166, + "language_loss": 0.7570805, + "learning_rate": 2.560341831785724e-07, + "loss": 0.77786982, + "num_input_tokens_seen": 302065470, + "step": 14005, + "time_per_iteration": 2.84389328956604 + }, + { + "auxiliary_loss_clip": 0.01022906, + "auxiliary_loss_mlp": 0.00747564, + "balance_loss_clip": 1.02073991, + "balance_loss_mlp": 1.00038445, + "epoch": 0.8420862768675785, + "flos": 18763397176320.0, + "grad_norm": 1.9675684903192137, + "language_loss": 0.77826047, + "learning_rate": 2.5584356060758906e-07, + "loss": 0.79596514, + "num_input_tokens_seen": 302083190, + "step": 14006, + "time_per_iteration": 2.73941707611084 + }, + { + "auxiliary_loss_clip": 0.01051486, + "auxiliary_loss_mlp": 0.01033408, + "balance_loss_clip": 1.02462316, + "balance_loss_mlp": 1.0230484, + "epoch": 0.8421464001202466, + "flos": 18328052338560.0, + "grad_norm": 1.7656658425599299, + "language_loss": 0.77127451, + "learning_rate": 2.556530041751932e-07, + "loss": 0.79212344, + "num_input_tokens_seen": 302098820, + "step": 14007, + "time_per_iteration": 2.590104818344116 + }, + { + "auxiliary_loss_clip": 0.0103416, + "auxiliary_loss_mlp": 0.01027005, + "balance_loss_clip": 1.02440691, + "balance_loss_mlp": 1.01639581, + "epoch": 0.8422065233729145, + "flos": 31537181560320.0, + "grad_norm": 1.8248678302654477, + "language_loss": 0.65716183, + "learning_rate": 2.554625138886102e-07, + "loss": 0.67777348, + "num_input_tokens_seen": 302117075, + "step": 14008, + "time_per_iteration": 2.9294626712799072 + }, + { + "auxiliary_loss_clip": 0.00997553, + "auxiliary_loss_mlp": 0.0100151, + "balance_loss_clip": 1.00194776, + "balance_loss_mlp": 1.00060368, + "epoch": 0.8422666466255825, + "flos": 64298128510080.0, + "grad_norm": 0.7108373520578816, + "language_loss": 0.56934756, + "learning_rate": 2.552720897550631e-07, + "loss": 0.58933818, + "num_input_tokens_seen": 302179735, + "step": 14009, + "time_per_iteration": 3.380751132965088 + }, + { + "auxiliary_loss_clip": 0.009925, + "auxiliary_loss_mlp": 0.01032057, + "balance_loss_clip": 1.02013683, + "balance_loss_mlp": 1.0221988, + "epoch": 0.8423267698782504, + "flos": 24316731377280.0, + "grad_norm": 1.2870940917736202, + "language_loss": 0.78002733, + "learning_rate": 2.5508173178177304e-07, + "loss": 0.80027294, + "num_input_tokens_seen": 302202055, + "step": 14010, + "time_per_iteration": 6.430105209350586 + }, + { + "auxiliary_loss_clip": 0.01066543, + "auxiliary_loss_mlp": 0.0103597, + "balance_loss_clip": 1.02701044, + "balance_loss_mlp": 1.02460289, + "epoch": 0.8423868931309184, + "flos": 18296092212480.0, + "grad_norm": 1.6200433198504987, + "language_loss": 0.72159636, + "learning_rate": 2.548914399759592e-07, + "loss": 0.74262148, + "num_input_tokens_seen": 302221360, + "step": 14011, + "time_per_iteration": 2.577928304672241 + }, + { + "auxiliary_loss_clip": 0.01053836, + "auxiliary_loss_mlp": 0.01035359, + "balance_loss_clip": 1.02642488, + "balance_loss_mlp": 1.02492821, + "epoch": 0.8424470163835863, + "flos": 23550218121600.0, + "grad_norm": 2.014689374197973, + "language_loss": 0.84237134, + "learning_rate": 2.5470121434483636e-07, + "loss": 0.86326325, + "num_input_tokens_seen": 302240715, + "step": 14012, + "time_per_iteration": 2.849360942840576 + }, + { + "auxiliary_loss_clip": 0.01056037, + "auxiliary_loss_mlp": 0.01026187, + "balance_loss_clip": 1.02318382, + "balance_loss_mlp": 1.01759243, + "epoch": 0.8425071396362543, + "flos": 23769488695680.0, + "grad_norm": 1.8319369754419508, + "language_loss": 0.68253708, + "learning_rate": 2.5451105489561884e-07, + "loss": 0.70335937, + "num_input_tokens_seen": 302260950, + "step": 14013, + "time_per_iteration": 2.6672439575195312 + }, + { + "auxiliary_loss_clip": 0.01066986, + "auxiliary_loss_mlp": 0.01028058, + "balance_loss_clip": 1.02693331, + "balance_loss_mlp": 1.01707935, + "epoch": 0.8425672628889223, + "flos": 16178906816640.0, + "grad_norm": 2.7565382697718217, + "language_loss": 0.78668487, + "learning_rate": 2.5432096163551644e-07, + "loss": 0.80763537, + "num_input_tokens_seen": 302277500, + "step": 14014, + "time_per_iteration": 2.5179243087768555 + }, + { + "auxiliary_loss_clip": 0.01029537, + "auxiliary_loss_mlp": 0.00747422, + "balance_loss_clip": 1.02313566, + "balance_loss_mlp": 1.00041842, + "epoch": 0.8426273861415903, + "flos": 23149131880320.0, + "grad_norm": 1.6371695259641474, + "language_loss": 0.67399931, + "learning_rate": 2.5413093457173884e-07, + "loss": 0.69176888, + "num_input_tokens_seen": 302297930, + "step": 14015, + "time_per_iteration": 2.765690326690674 + }, + { + "auxiliary_loss_clip": 0.01061539, + "auxiliary_loss_mlp": 0.01026049, + "balance_loss_clip": 1.02491736, + "balance_loss_mlp": 1.0150758, + "epoch": 0.8426875093942582, + "flos": 17457757712640.0, + "grad_norm": 1.7933774670422784, + "language_loss": 0.75509548, + "learning_rate": 2.5394097371149036e-07, + "loss": 0.77597135, + "num_input_tokens_seen": 302315735, + "step": 14016, + "time_per_iteration": 2.5626628398895264 + }, + { + "auxiliary_loss_clip": 0.01036741, + "auxiliary_loss_mlp": 0.01031998, + "balance_loss_clip": 1.02375317, + "balance_loss_mlp": 1.02143598, + "epoch": 0.8427476326469262, + "flos": 19640551299840.0, + "grad_norm": 2.302949581365801, + "language_loss": 0.79173827, + "learning_rate": 2.5375107906197544e-07, + "loss": 0.81242573, + "num_input_tokens_seen": 302332790, + "step": 14017, + "time_per_iteration": 2.6050596237182617 + }, + { + "auxiliary_loss_clip": 0.01039012, + "auxiliary_loss_mlp": 0.01029223, + "balance_loss_clip": 1.0242393, + "balance_loss_mlp": 1.01909041, + "epoch": 0.8428077558995941, + "flos": 11941160146560.0, + "grad_norm": 2.423571862174154, + "language_loss": 0.62647915, + "learning_rate": 2.5356125063039525e-07, + "loss": 0.64716148, + "num_input_tokens_seen": 302346490, + "step": 14018, + "time_per_iteration": 2.5781474113464355 + }, + { + "auxiliary_loss_clip": 0.01053292, + "auxiliary_loss_mlp": 0.0102948, + "balance_loss_clip": 1.02577233, + "balance_loss_mlp": 1.01950848, + "epoch": 0.8428678791522621, + "flos": 10451729767680.0, + "grad_norm": 1.8192882846637117, + "language_loss": 0.78912318, + "learning_rate": 2.5337148842394687e-07, + "loss": 0.80995089, + "num_input_tokens_seen": 302363235, + "step": 14019, + "time_per_iteration": 2.5965845584869385 + }, + { + "auxiliary_loss_clip": 0.01016085, + "auxiliary_loss_mlp": 0.0103391, + "balance_loss_clip": 1.01994872, + "balance_loss_mlp": 1.02143478, + "epoch": 0.8429280024049302, + "flos": 28767248259840.0, + "grad_norm": 1.7287234598348356, + "language_loss": 0.78360635, + "learning_rate": 2.531817924498265e-07, + "loss": 0.8041063, + "num_input_tokens_seen": 302383270, + "step": 14020, + "time_per_iteration": 2.7695412635803223 + }, + { + "auxiliary_loss_clip": 0.01052598, + "auxiliary_loss_mlp": 0.01024603, + "balance_loss_clip": 1.0251689, + "balance_loss_mlp": 1.01457715, + "epoch": 0.8429881256575981, + "flos": 19537093152000.0, + "grad_norm": 2.2902994518262907, + "language_loss": 0.71133137, + "learning_rate": 2.5299216271522805e-07, + "loss": 0.73210335, + "num_input_tokens_seen": 302401355, + "step": 14021, + "time_per_iteration": 2.667205572128296 + }, + { + "auxiliary_loss_clip": 0.01033822, + "auxiliary_loss_mlp": 0.01035007, + "balance_loss_clip": 1.02437449, + "balance_loss_mlp": 1.02411079, + "epoch": 0.8430482489102661, + "flos": 24790931752320.0, + "grad_norm": 1.7024570030495465, + "language_loss": 0.69399345, + "learning_rate": 2.5280259922734125e-07, + "loss": 0.71468174, + "num_input_tokens_seen": 302419515, + "step": 14022, + "time_per_iteration": 2.7511658668518066 + }, + { + "auxiliary_loss_clip": 0.01010496, + "auxiliary_loss_mlp": 0.01031645, + "balance_loss_clip": 1.02455521, + "balance_loss_mlp": 1.02025414, + "epoch": 0.843108372162934, + "flos": 21544248211200.0, + "grad_norm": 1.760057876047836, + "language_loss": 0.71827137, + "learning_rate": 2.526131019933553e-07, + "loss": 0.73869276, + "num_input_tokens_seen": 302438280, + "step": 14023, + "time_per_iteration": 2.858804225921631 + }, + { + "auxiliary_loss_clip": 0.01051521, + "auxiliary_loss_mlp": 0.01032067, + "balance_loss_clip": 1.02544284, + "balance_loss_mlp": 1.02130198, + "epoch": 0.843168495415602, + "flos": 24608792862720.0, + "grad_norm": 1.4116206187044167, + "language_loss": 0.66709983, + "learning_rate": 2.524236710204559e-07, + "loss": 0.68793571, + "num_input_tokens_seen": 302460860, + "step": 14024, + "time_per_iteration": 2.818449020385742 + }, + { + "auxiliary_loss_clip": 0.01050332, + "auxiliary_loss_mlp": 0.01028514, + "balance_loss_clip": 1.02445161, + "balance_loss_mlp": 1.0183996, + "epoch": 0.8432286186682699, + "flos": 15122738286720.0, + "grad_norm": 2.063375425696657, + "language_loss": 0.81024182, + "learning_rate": 2.522343063158261e-07, + "loss": 0.83103025, + "num_input_tokens_seen": 302476980, + "step": 14025, + "time_per_iteration": 2.6627583503723145 + }, + { + "auxiliary_loss_clip": 0.01047257, + "auxiliary_loss_mlp": 0.01026898, + "balance_loss_clip": 1.02286601, + "balance_loss_mlp": 1.01830268, + "epoch": 0.843288741920938, + "flos": 20301882554880.0, + "grad_norm": 1.4644960506433367, + "language_loss": 0.77636838, + "learning_rate": 2.5204500788664606e-07, + "loss": 0.7971099, + "num_input_tokens_seen": 302496380, + "step": 14026, + "time_per_iteration": 2.694608688354492 + }, + { + "auxiliary_loss_clip": 0.01032323, + "auxiliary_loss_mlp": 0.01028981, + "balance_loss_clip": 1.02132583, + "balance_loss_mlp": 1.0183773, + "epoch": 0.8433488651736059, + "flos": 23332096782720.0, + "grad_norm": 1.384283747822171, + "language_loss": 0.82626808, + "learning_rate": 2.518557757400945e-07, + "loss": 0.84688115, + "num_input_tokens_seen": 302516845, + "step": 14027, + "time_per_iteration": 2.7612290382385254 + }, + { + "auxiliary_loss_clip": 0.01040094, + "auxiliary_loss_mlp": 0.01024907, + "balance_loss_clip": 1.02313662, + "balance_loss_mlp": 1.01519775, + "epoch": 0.8434089884262739, + "flos": 39458105844480.0, + "grad_norm": 1.3545080563583765, + "language_loss": 0.56383818, + "learning_rate": 2.5166660988334754e-07, + "loss": 0.58448821, + "num_input_tokens_seen": 302538865, + "step": 14028, + "time_per_iteration": 4.559312105178833 + }, + { + "auxiliary_loss_clip": 0.01034412, + "auxiliary_loss_mlp": 0.01025628, + "balance_loss_clip": 1.02192354, + "balance_loss_mlp": 1.01579964, + "epoch": 0.8434691116789418, + "flos": 23768842250880.0, + "grad_norm": 2.2009125283731166, + "language_loss": 0.63777781, + "learning_rate": 2.51477510323578e-07, + "loss": 0.65837824, + "num_input_tokens_seen": 302557970, + "step": 14029, + "time_per_iteration": 2.710559129714966 + }, + { + "auxiliary_loss_clip": 0.01057876, + "auxiliary_loss_mlp": 0.01027108, + "balance_loss_clip": 1.02449429, + "balance_loss_mlp": 1.01820338, + "epoch": 0.8435292349316098, + "flos": 22671411972480.0, + "grad_norm": 1.5434316588178971, + "language_loss": 0.75314522, + "learning_rate": 2.51288477067956e-07, + "loss": 0.7739951, + "num_input_tokens_seen": 302578915, + "step": 14030, + "time_per_iteration": 2.6063170433044434 + }, + { + "auxiliary_loss_clip": 0.01042644, + "auxiliary_loss_mlp": 0.01026902, + "balance_loss_clip": 1.02653313, + "balance_loss_mlp": 1.01716828, + "epoch": 0.8435893581842777, + "flos": 18843622202880.0, + "grad_norm": 1.7308174180318834, + "language_loss": 0.83329332, + "learning_rate": 2.510995101236502e-07, + "loss": 0.85398877, + "num_input_tokens_seen": 302596300, + "step": 14031, + "time_per_iteration": 2.633549451828003 + }, + { + "auxiliary_loss_clip": 0.01036706, + "auxiliary_loss_mlp": 0.01025418, + "balance_loss_clip": 1.02181411, + "balance_loss_mlp": 1.01592255, + "epoch": 0.8436494814369457, + "flos": 20704225772160.0, + "grad_norm": 1.7820330465155336, + "language_loss": 0.80342525, + "learning_rate": 2.509106094978266e-07, + "loss": 0.82404649, + "num_input_tokens_seen": 302614975, + "step": 14032, + "time_per_iteration": 2.6260433197021484 + }, + { + "auxiliary_loss_clip": 0.01029406, + "auxiliary_loss_mlp": 0.01028638, + "balance_loss_clip": 1.02228189, + "balance_loss_mlp": 1.01647854, + "epoch": 0.8437096046896138, + "flos": 22674177319680.0, + "grad_norm": 2.0174599629530885, + "language_loss": 0.75290972, + "learning_rate": 2.507217751976478e-07, + "loss": 0.77349019, + "num_input_tokens_seen": 302636415, + "step": 14033, + "time_per_iteration": 2.713947296142578 + }, + { + "auxiliary_loss_clip": 0.01023845, + "auxiliary_loss_mlp": 0.01030266, + "balance_loss_clip": 1.02185214, + "balance_loss_mlp": 1.02068198, + "epoch": 0.8437697279422817, + "flos": 16180127879040.0, + "grad_norm": 1.7047881662186317, + "language_loss": 0.83467919, + "learning_rate": 2.505330072302743e-07, + "loss": 0.85522032, + "num_input_tokens_seen": 302653605, + "step": 14034, + "time_per_iteration": 2.657792806625366 + }, + { + "auxiliary_loss_clip": 0.01026995, + "auxiliary_loss_mlp": 0.01026688, + "balance_loss_clip": 1.02325845, + "balance_loss_mlp": 1.01533914, + "epoch": 0.8438298511949497, + "flos": 28765847629440.0, + "grad_norm": 1.4635251908632758, + "language_loss": 0.7831735, + "learning_rate": 2.503443056028656e-07, + "loss": 0.80371034, + "num_input_tokens_seen": 302673965, + "step": 14035, + "time_per_iteration": 2.8000385761260986 + }, + { + "auxiliary_loss_clip": 0.01044679, + "auxiliary_loss_mlp": 0.01028722, + "balance_loss_clip": 1.02345073, + "balance_loss_mlp": 1.01854742, + "epoch": 0.8438899744476176, + "flos": 33724284779520.0, + "grad_norm": 1.4900523807897594, + "language_loss": 0.7210784, + "learning_rate": 2.501556703225751e-07, + "loss": 0.74181235, + "num_input_tokens_seen": 302695560, + "step": 14036, + "time_per_iteration": 2.7148122787475586 + }, + { + "auxiliary_loss_clip": 0.01056965, + "auxiliary_loss_mlp": 0.01020686, + "balance_loss_clip": 1.02418399, + "balance_loss_mlp": 1.01243639, + "epoch": 0.8439500977002856, + "flos": 25110787386240.0, + "grad_norm": 1.7034899120199245, + "language_loss": 0.69992083, + "learning_rate": 2.49967101396557e-07, + "loss": 0.7206974, + "num_input_tokens_seen": 302713480, + "step": 14037, + "time_per_iteration": 2.6111369132995605 + }, + { + "auxiliary_loss_clip": 0.0106063, + "auxiliary_loss_mlp": 0.01024924, + "balance_loss_clip": 1.02463007, + "balance_loss_mlp": 1.015167, + "epoch": 0.8440102209529535, + "flos": 32850362880000.0, + "grad_norm": 1.6056735276226282, + "language_loss": 0.68872738, + "learning_rate": 2.4977859883196227e-07, + "loss": 0.70958292, + "num_input_tokens_seen": 302736860, + "step": 14038, + "time_per_iteration": 2.7179930210113525 + }, + { + "auxiliary_loss_clip": 0.00999068, + "auxiliary_loss_mlp": 0.01033898, + "balance_loss_clip": 1.01912665, + "balance_loss_mlp": 1.023211, + "epoch": 0.8440703442056215, + "flos": 23730202195200.0, + "grad_norm": 1.52794930214534, + "language_loss": 0.765607, + "learning_rate": 2.49590162635938e-07, + "loss": 0.78593665, + "num_input_tokens_seen": 302757745, + "step": 14039, + "time_per_iteration": 2.882087469100952 + }, + { + "auxiliary_loss_clip": 0.01066092, + "auxiliary_loss_mlp": 0.0102556, + "balance_loss_clip": 1.02686906, + "balance_loss_mlp": 1.01532054, + "epoch": 0.8441304674582895, + "flos": 20193719725440.0, + "grad_norm": 2.375708529045588, + "language_loss": 0.79338813, + "learning_rate": 2.4940179281563046e-07, + "loss": 0.81430465, + "num_input_tokens_seen": 302774885, + "step": 14040, + "time_per_iteration": 4.323674201965332 + }, + { + "auxiliary_loss_clip": 0.01030288, + "auxiliary_loss_mlp": 0.01031704, + "balance_loss_clip": 1.02459002, + "balance_loss_mlp": 1.02090406, + "epoch": 0.8441905907109575, + "flos": 20219897761920.0, + "grad_norm": 1.8632875992862243, + "language_loss": 0.69280005, + "learning_rate": 2.492134893781821e-07, + "loss": 0.71342003, + "num_input_tokens_seen": 302791035, + "step": 14041, + "time_per_iteration": 2.706644058227539 + }, + { + "auxiliary_loss_clip": 0.01039043, + "auxiliary_loss_mlp": 0.01029689, + "balance_loss_clip": 1.02272725, + "balance_loss_mlp": 1.01926434, + "epoch": 0.8442507139636254, + "flos": 13516453987200.0, + "grad_norm": 1.6699100323185887, + "language_loss": 0.68689406, + "learning_rate": 2.490252523307341e-07, + "loss": 0.7075814, + "num_input_tokens_seen": 302808650, + "step": 14042, + "time_per_iteration": 2.7355892658233643 + }, + { + "auxiliary_loss_clip": 0.01037451, + "auxiliary_loss_mlp": 0.01030779, + "balance_loss_clip": 1.02250195, + "balance_loss_mlp": 1.02128434, + "epoch": 0.8443108372162934, + "flos": 18220212731520.0, + "grad_norm": 1.9643542077579605, + "language_loss": 0.74837101, + "learning_rate": 2.4883708168042373e-07, + "loss": 0.76905328, + "num_input_tokens_seen": 302824605, + "step": 14043, + "time_per_iteration": 2.855236530303955 + }, + { + "auxiliary_loss_clip": 0.01059618, + "auxiliary_loss_mlp": 0.00747638, + "balance_loss_clip": 1.0240736, + "balance_loss_mlp": 1.00039101, + "epoch": 0.8443709604689613, + "flos": 16105110324480.0, + "grad_norm": 2.3129211782981187, + "language_loss": 0.72380149, + "learning_rate": 2.486489774343865e-07, + "loss": 0.74187404, + "num_input_tokens_seen": 302840170, + "step": 14044, + "time_per_iteration": 2.559263229370117 + }, + { + "auxiliary_loss_clip": 0.01040686, + "auxiliary_loss_mlp": 0.010251, + "balance_loss_clip": 1.02428687, + "balance_loss_mlp": 1.01480007, + "epoch": 0.8444310837216293, + "flos": 18512130562560.0, + "grad_norm": 1.5852968036416544, + "language_loss": 0.74767727, + "learning_rate": 2.484609395997559e-07, + "loss": 0.7683351, + "num_input_tokens_seen": 302858320, + "step": 14045, + "time_per_iteration": 2.7818031311035156 + }, + { + "auxiliary_loss_clip": 0.01037884, + "auxiliary_loss_mlp": 0.00747613, + "balance_loss_clip": 1.02214015, + "balance_loss_mlp": 1.00043082, + "epoch": 0.8444912069742974, + "flos": 14939845211520.0, + "grad_norm": 2.497440027422692, + "language_loss": 0.78392327, + "learning_rate": 2.4827296818366216e-07, + "loss": 0.8017782, + "num_input_tokens_seen": 302875255, + "step": 14046, + "time_per_iteration": 2.7141995429992676 + }, + { + "auxiliary_loss_clip": 0.01037514, + "auxiliary_loss_mlp": 0.01032928, + "balance_loss_clip": 1.02389157, + "balance_loss_mlp": 1.0205425, + "epoch": 0.8445513302269653, + "flos": 20120318282880.0, + "grad_norm": 1.733604090761504, + "language_loss": 0.78089285, + "learning_rate": 2.4808506319323255e-07, + "loss": 0.80159736, + "num_input_tokens_seen": 302894690, + "step": 14047, + "time_per_iteration": 2.675633192062378 + }, + { + "auxiliary_loss_clip": 0.01044361, + "auxiliary_loss_mlp": 0.01026511, + "balance_loss_clip": 1.02740455, + "balance_loss_mlp": 1.0168786, + "epoch": 0.8446114534796333, + "flos": 31170928533120.0, + "grad_norm": 1.8749140065805663, + "language_loss": 0.71881771, + "learning_rate": 2.478972246355935e-07, + "loss": 0.73952639, + "num_input_tokens_seen": 302912405, + "step": 14048, + "time_per_iteration": 2.9273521900177 + }, + { + "auxiliary_loss_clip": 0.00992523, + "auxiliary_loss_mlp": 0.01033297, + "balance_loss_clip": 1.0246402, + "balance_loss_mlp": 1.02274108, + "epoch": 0.8446715767323012, + "flos": 23948323534080.0, + "grad_norm": 1.5909055345654275, + "language_loss": 0.73808157, + "learning_rate": 2.477094525178667e-07, + "loss": 0.75833976, + "num_input_tokens_seen": 302932525, + "step": 14049, + "time_per_iteration": 3.2556216716766357 + }, + { + "auxiliary_loss_clip": 0.00997887, + "auxiliary_loss_mlp": 0.0074655, + "balance_loss_clip": 1.00226271, + "balance_loss_mlp": 1.00043488, + "epoch": 0.8447316999849692, + "flos": 67984897484160.0, + "grad_norm": 0.8056031851297264, + "language_loss": 0.60711312, + "learning_rate": 2.475217468471729e-07, + "loss": 0.6245575, + "num_input_tokens_seen": 302991285, + "step": 14050, + "time_per_iteration": 3.3868539333343506 + }, + { + "auxiliary_loss_clip": 0.01035864, + "auxiliary_loss_mlp": 0.00747638, + "balance_loss_clip": 1.02319002, + "balance_loss_mlp": 1.00042963, + "epoch": 0.8447918232376371, + "flos": 22418924296320.0, + "grad_norm": 2.3339015197678714, + "language_loss": 0.72342193, + "learning_rate": 2.473341076306303e-07, + "loss": 0.74125695, + "num_input_tokens_seen": 303009515, + "step": 14051, + "time_per_iteration": 2.723301410675049 + }, + { + "auxiliary_loss_clip": 0.01050285, + "auxiliary_loss_mlp": 0.01025764, + "balance_loss_clip": 1.02402568, + "balance_loss_mlp": 1.01578569, + "epoch": 0.8448519464903052, + "flos": 23694147918720.0, + "grad_norm": 2.941795266849169, + "language_loss": 0.74636841, + "learning_rate": 2.471465348753547e-07, + "loss": 0.76712894, + "num_input_tokens_seen": 303026905, + "step": 14052, + "time_per_iteration": 2.6886672973632812 + }, + { + "auxiliary_loss_clip": 0.01038605, + "auxiliary_loss_mlp": 0.01026243, + "balance_loss_clip": 1.0251739, + "balance_loss_mlp": 1.0171833, + "epoch": 0.8449120697429731, + "flos": 13735904129280.0, + "grad_norm": 1.7206699795377516, + "language_loss": 0.73941755, + "learning_rate": 2.469590285884575e-07, + "loss": 0.76006603, + "num_input_tokens_seen": 303045245, + "step": 14053, + "time_per_iteration": 2.693082332611084 + }, + { + "auxiliary_loss_clip": 0.01042388, + "auxiliary_loss_mlp": 0.01023515, + "balance_loss_clip": 1.0226295, + "balance_loss_mlp": 1.01332831, + "epoch": 0.8449721929956411, + "flos": 20886795624960.0, + "grad_norm": 1.73296531084079, + "language_loss": 0.74009585, + "learning_rate": 2.467715887770494e-07, + "loss": 0.76075488, + "num_input_tokens_seen": 303065205, + "step": 14054, + "time_per_iteration": 3.0050482749938965 + }, + { + "auxiliary_loss_clip": 0.01053293, + "auxiliary_loss_mlp": 0.01026414, + "balance_loss_clip": 1.02589273, + "balance_loss_mlp": 1.01592386, + "epoch": 0.845032316248309, + "flos": 33216939129600.0, + "grad_norm": 1.5815063655736799, + "language_loss": 0.78322917, + "learning_rate": 2.4658421544823895e-07, + "loss": 0.80402631, + "num_input_tokens_seen": 303088250, + "step": 14055, + "time_per_iteration": 2.8130099773406982 + }, + { + "auxiliary_loss_clip": 0.01051451, + "auxiliary_loss_mlp": 0.0103222, + "balance_loss_clip": 1.02561116, + "balance_loss_mlp": 1.02244508, + "epoch": 0.845092439500977, + "flos": 23585230903680.0, + "grad_norm": 2.399862298493095, + "language_loss": 0.72649825, + "learning_rate": 2.463969086091302e-07, + "loss": 0.74733496, + "num_input_tokens_seen": 303109280, + "step": 14056, + "time_per_iteration": 2.6293067932128906 + }, + { + "auxiliary_loss_clip": 0.01051514, + "auxiliary_loss_mlp": 0.01030475, + "balance_loss_clip": 1.02561533, + "balance_loss_mlp": 1.01940608, + "epoch": 0.8451525627536449, + "flos": 13333920048000.0, + "grad_norm": 2.093960953317878, + "language_loss": 0.67151499, + "learning_rate": 2.4620966826682686e-07, + "loss": 0.69233483, + "num_input_tokens_seen": 303126075, + "step": 14057, + "time_per_iteration": 4.1990580558776855 + }, + { + "auxiliary_loss_clip": 0.01015922, + "auxiliary_loss_mlp": 0.01025897, + "balance_loss_clip": 1.02283621, + "balance_loss_mlp": 1.01572251, + "epoch": 0.8452126860063129, + "flos": 27817985583360.0, + "grad_norm": 1.6226804428613897, + "language_loss": 0.77641129, + "learning_rate": 2.460224944284284e-07, + "loss": 0.79682946, + "num_input_tokens_seen": 303146920, + "step": 14058, + "time_per_iteration": 2.9112143516540527 + }, + { + "auxiliary_loss_clip": 0.01061532, + "auxiliary_loss_mlp": 0.01031351, + "balance_loss_clip": 1.02378273, + "balance_loss_mlp": 1.02107549, + "epoch": 0.845272809258981, + "flos": 27124694202240.0, + "grad_norm": 1.5596854929173618, + "language_loss": 0.70012128, + "learning_rate": 2.45835387101033e-07, + "loss": 0.72105008, + "num_input_tokens_seen": 303167885, + "step": 14059, + "time_per_iteration": 2.6858901977539062 + }, + { + "auxiliary_loss_clip": 0.01063162, + "auxiliary_loss_mlp": 0.01033256, + "balance_loss_clip": 1.02458596, + "balance_loss_mlp": 1.02171087, + "epoch": 0.8453329325116489, + "flos": 18332577452160.0, + "grad_norm": 2.767758312738026, + "language_loss": 0.57556599, + "learning_rate": 2.4564834629173516e-07, + "loss": 0.5965302, + "num_input_tokens_seen": 303185000, + "step": 14060, + "time_per_iteration": 2.641756296157837 + }, + { + "auxiliary_loss_clip": 0.01039351, + "auxiliary_loss_mlp": 0.01030702, + "balance_loss_clip": 1.02163744, + "balance_loss_mlp": 1.01953244, + "epoch": 0.8453930557643169, + "flos": 22675254727680.0, + "grad_norm": 1.580967796479277, + "language_loss": 0.75923359, + "learning_rate": 2.454613720076277e-07, + "loss": 0.77993417, + "num_input_tokens_seen": 303205210, + "step": 14061, + "time_per_iteration": 2.663728952407837 + }, + { + "auxiliary_loss_clip": 0.01032125, + "auxiliary_loss_mlp": 0.010243, + "balance_loss_clip": 1.02289414, + "balance_loss_mlp": 1.01391697, + "epoch": 0.8454531790169848, + "flos": 22487261921280.0, + "grad_norm": 2.387118825640658, + "language_loss": 0.71091187, + "learning_rate": 2.452744642558013e-07, + "loss": 0.73147613, + "num_input_tokens_seen": 303224655, + "step": 14062, + "time_per_iteration": 2.6777617931365967 + }, + { + "auxiliary_loss_clip": 0.00974082, + "auxiliary_loss_mlp": 0.0100212, + "balance_loss_clip": 1.00682569, + "balance_loss_mlp": 1.0008564, + "epoch": 0.8455133022696528, + "flos": 58277848481280.0, + "grad_norm": 0.6384941067516335, + "language_loss": 0.52665865, + "learning_rate": 2.450876230433432e-07, + "loss": 0.54642069, + "num_input_tokens_seen": 303289645, + "step": 14063, + "time_per_iteration": 3.4485037326812744 + }, + { + "auxiliary_loss_clip": 0.01028782, + "auxiliary_loss_mlp": 0.01021478, + "balance_loss_clip": 1.02538919, + "balance_loss_mlp": 1.01222098, + "epoch": 0.8455734255223207, + "flos": 21361283308800.0, + "grad_norm": 1.7396941568633206, + "language_loss": 0.82187021, + "learning_rate": 2.449008483773378e-07, + "loss": 0.84237283, + "num_input_tokens_seen": 303308350, + "step": 14064, + "time_per_iteration": 3.1981115341186523 + }, + { + "auxiliary_loss_clip": 0.01054983, + "auxiliary_loss_mlp": 0.01029587, + "balance_loss_clip": 1.02732444, + "balance_loss_mlp": 1.01847672, + "epoch": 0.8456335487749888, + "flos": 20449260057600.0, + "grad_norm": 2.2255992872061157, + "language_loss": 0.72545683, + "learning_rate": 2.447141402648685e-07, + "loss": 0.74630255, + "num_input_tokens_seen": 303325230, + "step": 14065, + "time_per_iteration": 2.6050329208374023 + }, + { + "auxiliary_loss_clip": 0.01027123, + "auxiliary_loss_mlp": 0.01023925, + "balance_loss_clip": 1.02304387, + "balance_loss_mlp": 1.0146625, + "epoch": 0.8456936720276567, + "flos": 28840901097600.0, + "grad_norm": 1.486452560324199, + "language_loss": 0.77317131, + "learning_rate": 2.445274987130146e-07, + "loss": 0.7936818, + "num_input_tokens_seen": 303345810, + "step": 14066, + "time_per_iteration": 2.738241672515869 + }, + { + "auxiliary_loss_clip": 0.01026032, + "auxiliary_loss_mlp": 0.01027319, + "balance_loss_clip": 1.02683365, + "balance_loss_mlp": 1.01744866, + "epoch": 0.8457537952803247, + "flos": 22672884430080.0, + "grad_norm": 1.5233297453856371, + "language_loss": 0.69931531, + "learning_rate": 2.4434092372885363e-07, + "loss": 0.71984881, + "num_input_tokens_seen": 303365140, + "step": 14067, + "time_per_iteration": 2.661290168762207 + }, + { + "auxiliary_loss_clip": 0.01024269, + "auxiliary_loss_mlp": 0.01021825, + "balance_loss_clip": 1.02223325, + "balance_loss_mlp": 1.01176357, + "epoch": 0.8458139185329926, + "flos": 33802929607680.0, + "grad_norm": 1.6072944972563608, + "language_loss": 0.70366639, + "learning_rate": 2.4415441531946144e-07, + "loss": 0.72412729, + "num_input_tokens_seen": 303386150, + "step": 14068, + "time_per_iteration": 2.8509840965270996 + }, + { + "auxiliary_loss_clip": 0.00966958, + "auxiliary_loss_mlp": 0.01006481, + "balance_loss_clip": 1.00298738, + "balance_loss_mlp": 1.00547397, + "epoch": 0.8458740417856606, + "flos": 70295929603200.0, + "grad_norm": 0.6970502614427376, + "language_loss": 0.60499513, + "learning_rate": 2.4396797349190976e-07, + "loss": 0.62472957, + "num_input_tokens_seen": 303453770, + "step": 14069, + "time_per_iteration": 3.3779685497283936 + }, + { + "auxiliary_loss_clip": 0.01044079, + "auxiliary_loss_mlp": 0.01025299, + "balance_loss_clip": 1.02623916, + "balance_loss_mlp": 1.01549435, + "epoch": 0.8459341650383285, + "flos": 24170862245760.0, + "grad_norm": 1.4571994770212775, + "language_loss": 0.74701381, + "learning_rate": 2.4378159825326804e-07, + "loss": 0.76770759, + "num_input_tokens_seen": 303474520, + "step": 14070, + "time_per_iteration": 2.7063589096069336 + }, + { + "auxiliary_loss_clip": 0.01016855, + "auxiliary_loss_mlp": 0.01028375, + "balance_loss_clip": 1.02317643, + "balance_loss_mlp": 1.01844525, + "epoch": 0.8459942882909965, + "flos": 38181158369280.0, + "grad_norm": 1.5999767519903845, + "language_loss": 0.67309201, + "learning_rate": 2.435952896106039e-07, + "loss": 0.69354439, + "num_input_tokens_seen": 303497345, + "step": 14071, + "time_per_iteration": 2.9428346157073975 + }, + { + "auxiliary_loss_clip": 0.00997228, + "auxiliary_loss_mlp": 0.0074654, + "balance_loss_clip": 1.00221777, + "balance_loss_mlp": 1.00038779, + "epoch": 0.8460544115436646, + "flos": 64118252177280.0, + "grad_norm": 0.7306947767982767, + "language_loss": 0.61101526, + "learning_rate": 2.4340904757098313e-07, + "loss": 0.62845296, + "num_input_tokens_seen": 303554890, + "step": 14072, + "time_per_iteration": 3.0658891201019287 + }, + { + "auxiliary_loss_clip": 0.01035648, + "auxiliary_loss_mlp": 0.01031355, + "balance_loss_clip": 1.02667379, + "balance_loss_mlp": 1.01956534, + "epoch": 0.8461145347963325, + "flos": 24170826332160.0, + "grad_norm": 2.340023775826269, + "language_loss": 0.72666132, + "learning_rate": 2.4322287214146664e-07, + "loss": 0.74733126, + "num_input_tokens_seen": 303574380, + "step": 14073, + "time_per_iteration": 2.70440411567688 + }, + { + "auxiliary_loss_clip": 0.01045979, + "auxiliary_loss_mlp": 0.01029922, + "balance_loss_clip": 1.02673745, + "balance_loss_mlp": 1.01873994, + "epoch": 0.8461746580490005, + "flos": 34893787697280.0, + "grad_norm": 1.8702250239787994, + "language_loss": 0.78063047, + "learning_rate": 2.430367633291155e-07, + "loss": 0.80138952, + "num_input_tokens_seen": 303594910, + "step": 14074, + "time_per_iteration": 2.887725353240967 + }, + { + "auxiliary_loss_clip": 0.01052135, + "auxiliary_loss_mlp": 0.01031378, + "balance_loss_clip": 1.02545571, + "balance_loss_mlp": 1.02085185, + "epoch": 0.8462347813016684, + "flos": 25557014044800.0, + "grad_norm": 2.0664980192376126, + "language_loss": 0.75402236, + "learning_rate": 2.4285072114098583e-07, + "loss": 0.77485752, + "num_input_tokens_seen": 303613520, + "step": 14075, + "time_per_iteration": 3.000908374786377 + }, + { + "auxiliary_loss_clip": 0.01039458, + "auxiliary_loss_mlp": 0.01024986, + "balance_loss_clip": 1.0237354, + "balance_loss_mlp": 1.01453185, + "epoch": 0.8462949045543364, + "flos": 21325336773120.0, + "grad_norm": 2.105910951367576, + "language_loss": 0.72745645, + "learning_rate": 2.4266474558413355e-07, + "loss": 0.74810094, + "num_input_tokens_seen": 303631225, + "step": 14076, + "time_per_iteration": 4.410984754562378 + }, + { + "auxiliary_loss_clip": 0.01045522, + "auxiliary_loss_mlp": 0.01033095, + "balance_loss_clip": 1.02569652, + "balance_loss_mlp": 1.02265263, + "epoch": 0.8463550278070043, + "flos": 22637440684800.0, + "grad_norm": 2.037729853794339, + "language_loss": 0.77266431, + "learning_rate": 2.4247883666560945e-07, + "loss": 0.79345047, + "num_input_tokens_seen": 303649175, + "step": 14077, + "time_per_iteration": 2.841088056564331 + }, + { + "auxiliary_loss_clip": 0.01031133, + "auxiliary_loss_mlp": 0.01030716, + "balance_loss_clip": 1.0244385, + "balance_loss_mlp": 1.02034509, + "epoch": 0.8464151510596724, + "flos": 13005588804480.0, + "grad_norm": 2.1078003105748895, + "language_loss": 0.74978447, + "learning_rate": 2.422929943924643e-07, + "loss": 0.77040297, + "num_input_tokens_seen": 303665915, + "step": 14078, + "time_per_iteration": 2.695354700088501 + }, + { + "auxiliary_loss_clip": 0.01020537, + "auxiliary_loss_mlp": 0.01023158, + "balance_loss_clip": 1.0228374, + "balance_loss_mlp": 1.01303101, + "epoch": 0.8464752743123403, + "flos": 15704921923200.0, + "grad_norm": 2.1793817829105255, + "language_loss": 0.84900665, + "learning_rate": 2.4210721877174565e-07, + "loss": 0.8694436, + "num_input_tokens_seen": 303679985, + "step": 14079, + "time_per_iteration": 2.7534725666046143 + }, + { + "auxiliary_loss_clip": 0.01036928, + "auxiliary_loss_mlp": 0.01031119, + "balance_loss_clip": 1.02685761, + "balance_loss_mlp": 1.01973486, + "epoch": 0.8465353975650083, + "flos": 21653955325440.0, + "grad_norm": 2.29527901438646, + "language_loss": 0.58319932, + "learning_rate": 2.419215098104965e-07, + "loss": 0.60387981, + "num_input_tokens_seen": 303698470, + "step": 14080, + "time_per_iteration": 2.7869582176208496 + }, + { + "auxiliary_loss_clip": 0.01043207, + "auxiliary_loss_mlp": 0.01028599, + "balance_loss_clip": 1.02463102, + "balance_loss_mlp": 1.01782286, + "epoch": 0.8465955208176762, + "flos": 18515650095360.0, + "grad_norm": 2.2402457803735003, + "language_loss": 0.66510046, + "learning_rate": 2.4173586751576014e-07, + "loss": 0.68581843, + "num_input_tokens_seen": 303716415, + "step": 14081, + "time_per_iteration": 2.7400906085968018 + }, + { + "auxiliary_loss_clip": 0.01046267, + "auxiliary_loss_mlp": 0.01028011, + "balance_loss_clip": 1.02297568, + "balance_loss_mlp": 1.01721644, + "epoch": 0.8466556440703442, + "flos": 24200559815040.0, + "grad_norm": 2.5417696417933233, + "language_loss": 0.73096979, + "learning_rate": 2.41550291894576e-07, + "loss": 0.7517125, + "num_input_tokens_seen": 303734490, + "step": 14082, + "time_per_iteration": 2.6441233158111572 + }, + { + "auxiliary_loss_clip": 0.01009916, + "auxiliary_loss_mlp": 0.01029081, + "balance_loss_clip": 1.01972377, + "balance_loss_mlp": 1.01785731, + "epoch": 0.8467157673230121, + "flos": 20375894528640.0, + "grad_norm": 2.1688817086273717, + "language_loss": 0.7619679, + "learning_rate": 2.413647829539809e-07, + "loss": 0.78235781, + "num_input_tokens_seen": 303752310, + "step": 14083, + "time_per_iteration": 2.9072229862213135 + }, + { + "auxiliary_loss_clip": 0.0101435, + "auxiliary_loss_mlp": 0.01029564, + "balance_loss_clip": 1.02187037, + "balance_loss_mlp": 1.01783931, + "epoch": 0.8467758905756801, + "flos": 28473642489600.0, + "grad_norm": 1.7242469975767545, + "language_loss": 0.66062778, + "learning_rate": 2.411793407010092e-07, + "loss": 0.68106699, + "num_input_tokens_seen": 303776065, + "step": 14084, + "time_per_iteration": 2.803501605987549 + }, + { + "auxiliary_loss_clip": 0.01022981, + "auxiliary_loss_mlp": 0.01027188, + "balance_loss_clip": 1.02555895, + "balance_loss_mlp": 1.01706743, + "epoch": 0.8468360138283482, + "flos": 11692551139200.0, + "grad_norm": 2.1461186494722186, + "language_loss": 0.69485223, + "learning_rate": 2.409939651426938e-07, + "loss": 0.71535397, + "num_input_tokens_seen": 303793500, + "step": 14085, + "time_per_iteration": 2.772778034210205 + }, + { + "auxiliary_loss_clip": 0.01023249, + "auxiliary_loss_mlp": 0.01029742, + "balance_loss_clip": 1.0246557, + "balance_loss_mlp": 1.01960325, + "epoch": 0.8468961370810161, + "flos": 24607859109120.0, + "grad_norm": 1.7688574876760195, + "language_loss": 0.70786762, + "learning_rate": 2.408086562860634e-07, + "loss": 0.72839749, + "num_input_tokens_seen": 303814835, + "step": 14086, + "time_per_iteration": 2.80318284034729 + }, + { + "auxiliary_loss_clip": 0.01042452, + "auxiliary_loss_mlp": 0.01028045, + "balance_loss_clip": 1.02224576, + "balance_loss_mlp": 1.01748919, + "epoch": 0.8469562603336841, + "flos": 19609812236160.0, + "grad_norm": 1.8316313789332899, + "language_loss": 0.74780941, + "learning_rate": 2.4062341413814445e-07, + "loss": 0.76851439, + "num_input_tokens_seen": 303834505, + "step": 14087, + "time_per_iteration": 4.598698854446411 + }, + { + "auxiliary_loss_clip": 0.01033415, + "auxiliary_loss_mlp": 0.01023851, + "balance_loss_clip": 1.02494299, + "balance_loss_mlp": 1.01335442, + "epoch": 0.847016383586352, + "flos": 22638949056000.0, + "grad_norm": 1.4676995918225606, + "language_loss": 0.74270594, + "learning_rate": 2.4043823870596227e-07, + "loss": 0.7632786, + "num_input_tokens_seen": 303855050, + "step": 14088, + "time_per_iteration": 2.684699535369873 + }, + { + "auxiliary_loss_clip": 0.01052996, + "auxiliary_loss_mlp": 0.01029783, + "balance_loss_clip": 1.02467465, + "balance_loss_mlp": 1.01936412, + "epoch": 0.84707650683902, + "flos": 20960161153920.0, + "grad_norm": 1.8666392444309297, + "language_loss": 0.72057021, + "learning_rate": 2.402531299965387e-07, + "loss": 0.74139798, + "num_input_tokens_seen": 303875635, + "step": 14089, + "time_per_iteration": 2.6507656574249268 + }, + { + "auxiliary_loss_clip": 0.01059807, + "auxiliary_loss_mlp": 0.01024775, + "balance_loss_clip": 1.02504599, + "balance_loss_mlp": 1.01507115, + "epoch": 0.8471366300916879, + "flos": 24093007516800.0, + "grad_norm": 1.6044390067370369, + "language_loss": 0.7921167, + "learning_rate": 2.400680880168928e-07, + "loss": 0.81296253, + "num_input_tokens_seen": 303896750, + "step": 14090, + "time_per_iteration": 2.5040230751037598 + }, + { + "auxiliary_loss_clip": 0.0100552, + "auxiliary_loss_mlp": 0.01034649, + "balance_loss_clip": 1.02047968, + "balance_loss_mlp": 1.02311003, + "epoch": 0.847196753344356, + "flos": 18332900674560.0, + "grad_norm": 2.034087172278395, + "language_loss": 0.77368081, + "learning_rate": 2.3988311277404085e-07, + "loss": 0.79408246, + "num_input_tokens_seen": 303915435, + "step": 14091, + "time_per_iteration": 2.7419016361236572 + }, + { + "auxiliary_loss_clip": 0.01006066, + "auxiliary_loss_mlp": 0.01002125, + "balance_loss_clip": 1.00090897, + "balance_loss_mlp": 1.00129604, + "epoch": 0.8472568765970239, + "flos": 49567536956160.0, + "grad_norm": 0.8290792949985248, + "language_loss": 0.59426308, + "learning_rate": 2.396982042749982e-07, + "loss": 0.61434507, + "num_input_tokens_seen": 303977245, + "step": 14092, + "time_per_iteration": 3.2450225353240967 + }, + { + "auxiliary_loss_clip": 0.01034325, + "auxiliary_loss_mlp": 0.01031998, + "balance_loss_clip": 1.02189362, + "balance_loss_mlp": 1.02063191, + "epoch": 0.8473169998496919, + "flos": 19279074781440.0, + "grad_norm": 1.9595290427253775, + "language_loss": 0.70540649, + "learning_rate": 2.395133625267756e-07, + "loss": 0.72606969, + "num_input_tokens_seen": 303996055, + "step": 14093, + "time_per_iteration": 2.621873140335083 + }, + { + "auxiliary_loss_clip": 0.01059227, + "auxiliary_loss_mlp": 0.01023871, + "balance_loss_clip": 1.02434027, + "balance_loss_mlp": 1.01451921, + "epoch": 0.8473771231023598, + "flos": 17675555829120.0, + "grad_norm": 2.0023991257239055, + "language_loss": 0.83385849, + "learning_rate": 2.3932858753638263e-07, + "loss": 0.85468942, + "num_input_tokens_seen": 304012205, + "step": 14094, + "time_per_iteration": 2.540144920349121 + }, + { + "auxiliary_loss_clip": 0.01040813, + "auxiliary_loss_mlp": 0.01026759, + "balance_loss_clip": 1.02475333, + "balance_loss_mlp": 1.01740086, + "epoch": 0.8474372463550278, + "flos": 26359761144960.0, + "grad_norm": 1.7136987926718037, + "language_loss": 0.71319199, + "learning_rate": 2.3914387931082626e-07, + "loss": 0.73386765, + "num_input_tokens_seen": 304033475, + "step": 14095, + "time_per_iteration": 2.678382158279419 + }, + { + "auxiliary_loss_clip": 0.01048206, + "auxiliary_loss_mlp": 0.00747533, + "balance_loss_clip": 1.02306306, + "balance_loss_mlp": 1.00039864, + "epoch": 0.8474973696076957, + "flos": 23402050519680.0, + "grad_norm": 3.1990471997176373, + "language_loss": 0.80622506, + "learning_rate": 2.3895923785711105e-07, + "loss": 0.82418245, + "num_input_tokens_seen": 304051845, + "step": 14096, + "time_per_iteration": 2.7512125968933105 + }, + { + "auxiliary_loss_clip": 0.01051767, + "auxiliary_loss_mlp": 0.0102895, + "balance_loss_clip": 1.02380061, + "balance_loss_mlp": 1.01773834, + "epoch": 0.8475574928603637, + "flos": 25075666863360.0, + "grad_norm": 1.843320165064238, + "language_loss": 0.77465367, + "learning_rate": 2.387746631822374e-07, + "loss": 0.79546082, + "num_input_tokens_seen": 304069965, + "step": 14097, + "time_per_iteration": 2.6456799507141113 + }, + { + "auxiliary_loss_clip": 0.01031455, + "auxiliary_loss_mlp": 0.0102439, + "balance_loss_clip": 1.02529442, + "balance_loss_mlp": 1.01454282, + "epoch": 0.8476176161130318, + "flos": 19966691813760.0, + "grad_norm": 1.6528065590628143, + "language_loss": 0.80136263, + "learning_rate": 2.385901552932048e-07, + "loss": 0.82192111, + "num_input_tokens_seen": 304086805, + "step": 14098, + "time_per_iteration": 2.6728968620300293 + }, + { + "auxiliary_loss_clip": 0.01043228, + "auxiliary_loss_mlp": 0.00747616, + "balance_loss_clip": 1.02282059, + "balance_loss_mlp": 1.00037646, + "epoch": 0.8476777393656997, + "flos": 21285834791040.0, + "grad_norm": 3.233617413071021, + "language_loss": 0.71757305, + "learning_rate": 2.3840571419701062e-07, + "loss": 0.7354815, + "num_input_tokens_seen": 304105865, + "step": 14099, + "time_per_iteration": 2.6703102588653564 + }, + { + "auxiliary_loss_clip": 0.010474, + "auxiliary_loss_mlp": 0.010287, + "balance_loss_clip": 1.02250338, + "balance_loss_mlp": 1.01741672, + "epoch": 0.8477378626183677, + "flos": 29971476650880.0, + "grad_norm": 2.3018538387976766, + "language_loss": 0.63628954, + "learning_rate": 2.3822133990064787e-07, + "loss": 0.65705055, + "num_input_tokens_seen": 304128300, + "step": 14100, + "time_per_iteration": 2.709291696548462 + }, + { + "auxiliary_loss_clip": 0.01053534, + "auxiliary_loss_mlp": 0.01030418, + "balance_loss_clip": 1.02498221, + "balance_loss_mlp": 1.0196358, + "epoch": 0.8477979858710356, + "flos": 24237727413120.0, + "grad_norm": 1.8433435205225706, + "language_loss": 0.73835683, + "learning_rate": 2.380370324111085e-07, + "loss": 0.75919628, + "num_input_tokens_seen": 304143695, + "step": 14101, + "time_per_iteration": 2.642305612564087 + }, + { + "auxiliary_loss_clip": 0.01051071, + "auxiliary_loss_mlp": 0.01025676, + "balance_loss_clip": 1.02405369, + "balance_loss_mlp": 1.0159781, + "epoch": 0.8478581091237036, + "flos": 25593678852480.0, + "grad_norm": 2.2126252636013732, + "language_loss": 0.7140882, + "learning_rate": 2.3785279173538163e-07, + "loss": 0.73485565, + "num_input_tokens_seen": 304165800, + "step": 14102, + "time_per_iteration": 2.6563801765441895 + }, + { + "auxiliary_loss_clip": 0.01034958, + "auxiliary_loss_mlp": 0.01032363, + "balance_loss_clip": 1.02263343, + "balance_loss_mlp": 1.02073383, + "epoch": 0.8479182323763715, + "flos": 12057116227200.0, + "grad_norm": 2.21492148449228, + "language_loss": 0.81382573, + "learning_rate": 2.3766861788045366e-07, + "loss": 0.83449894, + "num_input_tokens_seen": 304182910, + "step": 14103, + "time_per_iteration": 2.6440207958221436 + }, + { + "auxiliary_loss_clip": 0.01061261, + "auxiliary_loss_mlp": 0.01027947, + "balance_loss_clip": 1.02473712, + "balance_loss_mlp": 1.01803493, + "epoch": 0.8479783556290396, + "flos": 21433391861760.0, + "grad_norm": 3.670474344388176, + "language_loss": 0.78315663, + "learning_rate": 2.374845108533079e-07, + "loss": 0.80404878, + "num_input_tokens_seen": 304200175, + "step": 14104, + "time_per_iteration": 4.311907529830933 + }, + { + "auxiliary_loss_clip": 0.01056261, + "auxiliary_loss_mlp": 0.0103196, + "balance_loss_clip": 1.02728295, + "balance_loss_mlp": 1.0210464, + "epoch": 0.8480384788817075, + "flos": 19642634288640.0, + "grad_norm": 4.0898284078049665, + "language_loss": 0.78718424, + "learning_rate": 2.3730047066092607e-07, + "loss": 0.80806649, + "num_input_tokens_seen": 304217775, + "step": 14105, + "time_per_iteration": 2.6128110885620117 + }, + { + "auxiliary_loss_clip": 0.01040248, + "auxiliary_loss_mlp": 0.01035015, + "balance_loss_clip": 1.02503622, + "balance_loss_mlp": 1.02246189, + "epoch": 0.8480986021343755, + "flos": 22489201255680.0, + "grad_norm": 1.723729034229637, + "language_loss": 0.50072318, + "learning_rate": 2.3711649731028749e-07, + "loss": 0.52147585, + "num_input_tokens_seen": 304235760, + "step": 14106, + "time_per_iteration": 2.630174398422241 + }, + { + "auxiliary_loss_clip": 0.01033933, + "auxiliary_loss_mlp": 0.01030231, + "balance_loss_clip": 1.02578044, + "balance_loss_mlp": 1.02015233, + "epoch": 0.8481587253870434, + "flos": 22090557139200.0, + "grad_norm": 2.0148267189184823, + "language_loss": 0.75442648, + "learning_rate": 2.3693259080836792e-07, + "loss": 0.7750681, + "num_input_tokens_seen": 304253985, + "step": 14107, + "time_per_iteration": 2.7149600982666016 + }, + { + "auxiliary_loss_clip": 0.01029668, + "auxiliary_loss_mlp": 0.01024452, + "balance_loss_clip": 1.0230273, + "balance_loss_mlp": 1.01439095, + "epoch": 0.8482188486397114, + "flos": 33582689366400.0, + "grad_norm": 1.6022699412265677, + "language_loss": 0.73498464, + "learning_rate": 2.3674875116214087e-07, + "loss": 0.75552583, + "num_input_tokens_seen": 304276785, + "step": 14108, + "time_per_iteration": 2.828494071960449 + }, + { + "auxiliary_loss_clip": 0.01059073, + "auxiliary_loss_mlp": 0.01024452, + "balance_loss_clip": 1.0238955, + "balance_loss_mlp": 1.013026, + "epoch": 0.8482789718923793, + "flos": 20919402195840.0, + "grad_norm": 1.6762160403051343, + "language_loss": 0.72441751, + "learning_rate": 2.3656497837857836e-07, + "loss": 0.74525279, + "num_input_tokens_seen": 304296310, + "step": 14109, + "time_per_iteration": 2.630319118499756 + }, + { + "auxiliary_loss_clip": 0.00992162, + "auxiliary_loss_mlp": 0.01030051, + "balance_loss_clip": 1.02086198, + "balance_loss_mlp": 1.01922095, + "epoch": 0.8483390951450474, + "flos": 12896204912640.0, + "grad_norm": 2.2674480263143564, + "language_loss": 0.73656362, + "learning_rate": 2.3638127246464811e-07, + "loss": 0.75678575, + "num_input_tokens_seen": 304311715, + "step": 14110, + "time_per_iteration": 2.7626984119415283 + }, + { + "auxiliary_loss_clip": 0.01010818, + "auxiliary_loss_mlp": 0.01029954, + "balance_loss_clip": 1.02473152, + "balance_loss_mlp": 1.02029204, + "epoch": 0.8483992183977154, + "flos": 25081628520960.0, + "grad_norm": 1.5848192272925699, + "language_loss": 0.76363897, + "learning_rate": 2.3619763342731658e-07, + "loss": 0.78404665, + "num_input_tokens_seen": 304331910, + "step": 14111, + "time_per_iteration": 2.7849721908569336 + }, + { + "auxiliary_loss_clip": 0.01060631, + "auxiliary_loss_mlp": 0.01024513, + "balance_loss_clip": 1.02481008, + "balance_loss_mlp": 1.01470804, + "epoch": 0.8484593416503833, + "flos": 25557445008000.0, + "grad_norm": 1.9487538248419358, + "language_loss": 0.67475641, + "learning_rate": 2.3601406127354772e-07, + "loss": 0.69560784, + "num_input_tokens_seen": 304351405, + "step": 14112, + "time_per_iteration": 2.645735025405884 + }, + { + "auxiliary_loss_clip": 0.01044478, + "auxiliary_loss_mlp": 0.0103103, + "balance_loss_clip": 1.02146399, + "balance_loss_mlp": 1.0207839, + "epoch": 0.8485194649030513, + "flos": 27198454780800.0, + "grad_norm": 1.9360304883289041, + "language_loss": 0.73688972, + "learning_rate": 2.3583055601030312e-07, + "loss": 0.75764477, + "num_input_tokens_seen": 304372935, + "step": 14113, + "time_per_iteration": 2.6509289741516113 + }, + { + "auxiliary_loss_clip": 0.01034559, + "auxiliary_loss_mlp": 0.01028286, + "balance_loss_clip": 1.02766299, + "balance_loss_mlp": 1.01800442, + "epoch": 0.8485795881557192, + "flos": 24205910941440.0, + "grad_norm": 5.06864928725629, + "language_loss": 0.66407984, + "learning_rate": 2.3564711764454003e-07, + "loss": 0.6847083, + "num_input_tokens_seen": 304393070, + "step": 14114, + "time_per_iteration": 2.769353151321411 + }, + { + "auxiliary_loss_clip": 0.0106539, + "auxiliary_loss_mlp": 0.01032295, + "balance_loss_clip": 1.02710724, + "balance_loss_mlp": 1.02151847, + "epoch": 0.8486397114083872, + "flos": 21141653598720.0, + "grad_norm": 1.5920248663263201, + "language_loss": 0.78856933, + "learning_rate": 2.3546374618321495e-07, + "loss": 0.80954623, + "num_input_tokens_seen": 304411195, + "step": 14115, + "time_per_iteration": 2.6492857933044434 + }, + { + "auxiliary_loss_clip": 0.01061784, + "auxiliary_loss_mlp": 0.01029247, + "balance_loss_clip": 1.02511811, + "balance_loss_mlp": 1.0192306, + "epoch": 0.8486998346610551, + "flos": 19974772373760.0, + "grad_norm": 1.7603984749226484, + "language_loss": 0.78905833, + "learning_rate": 2.3528044163328187e-07, + "loss": 0.80996865, + "num_input_tokens_seen": 304429425, + "step": 14116, + "time_per_iteration": 2.5823965072631836 + }, + { + "auxiliary_loss_clip": 0.01054383, + "auxiliary_loss_mlp": 0.01026657, + "balance_loss_clip": 1.02598882, + "balance_loss_mlp": 1.0158627, + "epoch": 0.8487599579137232, + "flos": 19792310261760.0, + "grad_norm": 2.0274831311477337, + "language_loss": 0.68302661, + "learning_rate": 2.3509720400169076e-07, + "loss": 0.70383704, + "num_input_tokens_seen": 304447460, + "step": 14117, + "time_per_iteration": 2.5892767906188965 + }, + { + "auxiliary_loss_clip": 0.0104139, + "auxiliary_loss_mlp": 0.01025333, + "balance_loss_clip": 1.02317834, + "balance_loss_mlp": 1.0146637, + "epoch": 0.8488200811663911, + "flos": 26396030903040.0, + "grad_norm": 1.9936918530382217, + "language_loss": 0.65482169, + "learning_rate": 2.3491403329539096e-07, + "loss": 0.67548895, + "num_input_tokens_seen": 304468230, + "step": 14118, + "time_per_iteration": 2.748264789581299 + }, + { + "auxiliary_loss_clip": 0.01025745, + "auxiliary_loss_mlp": 0.01027492, + "balance_loss_clip": 1.02308452, + "balance_loss_mlp": 1.01746702, + "epoch": 0.8488802044190591, + "flos": 16359285939840.0, + "grad_norm": 1.6688756077947111, + "language_loss": 0.73429543, + "learning_rate": 2.3473092952132757e-07, + "loss": 0.75482774, + "num_input_tokens_seen": 304484860, + "step": 14119, + "time_per_iteration": 2.66438627243042 + }, + { + "auxiliary_loss_clip": 0.01030088, + "auxiliary_loss_mlp": 0.01026209, + "balance_loss_clip": 1.02294755, + "balance_loss_mlp": 1.01518786, + "epoch": 0.848940327671727, + "flos": 19208869649280.0, + "grad_norm": 1.713945421706629, + "language_loss": 0.77955925, + "learning_rate": 2.345478926864446e-07, + "loss": 0.80012226, + "num_input_tokens_seen": 304503575, + "step": 14120, + "time_per_iteration": 2.6991984844207764 + }, + { + "auxiliary_loss_clip": 0.01056159, + "auxiliary_loss_mlp": 0.01028114, + "balance_loss_clip": 1.0283581, + "balance_loss_mlp": 1.01734996, + "epoch": 0.849000450924395, + "flos": 21871178824320.0, + "grad_norm": 2.5379548084432715, + "language_loss": 0.7555632, + "learning_rate": 2.3436492279768227e-07, + "loss": 0.77640593, + "num_input_tokens_seen": 304525005, + "step": 14121, + "time_per_iteration": 2.707047939300537 + }, + { + "auxiliary_loss_clip": 0.0096679, + "auxiliary_loss_mlp": 0.01001986, + "balance_loss_clip": 1.0029428, + "balance_loss_mlp": 1.00118721, + "epoch": 0.8490605741770629, + "flos": 71166475624320.0, + "grad_norm": 0.8213368201785833, + "language_loss": 0.60116804, + "learning_rate": 2.3418201986197883e-07, + "loss": 0.62085569, + "num_input_tokens_seen": 304585220, + "step": 14122, + "time_per_iteration": 5.118768692016602 + }, + { + "auxiliary_loss_clip": 0.01052425, + "auxiliary_loss_mlp": 0.01030285, + "balance_loss_clip": 1.02564526, + "balance_loss_mlp": 1.02022398, + "epoch": 0.849120697429731, + "flos": 24973357950720.0, + "grad_norm": 2.0744016357381736, + "language_loss": 0.80411464, + "learning_rate": 2.3399918388627048e-07, + "loss": 0.82494175, + "num_input_tokens_seen": 304604665, + "step": 14123, + "time_per_iteration": 2.7747440338134766 + }, + { + "auxiliary_loss_clip": 0.010499, + "auxiliary_loss_mlp": 0.01023472, + "balance_loss_clip": 1.02444637, + "balance_loss_mlp": 1.01364923, + "epoch": 0.8491808206823989, + "flos": 23032277959680.0, + "grad_norm": 3.4434779861453526, + "language_loss": 0.8323071, + "learning_rate": 2.3381641487749016e-07, + "loss": 0.85304081, + "num_input_tokens_seen": 304620600, + "step": 14124, + "time_per_iteration": 2.732565402984619 + }, + { + "auxiliary_loss_clip": 0.01021891, + "auxiliary_loss_mlp": 0.01033713, + "balance_loss_clip": 1.02587759, + "balance_loss_mlp": 1.02250779, + "epoch": 0.8492409439350669, + "flos": 23878549365120.0, + "grad_norm": 1.8334441155554246, + "language_loss": 0.71681309, + "learning_rate": 2.3363371284256805e-07, + "loss": 0.73736918, + "num_input_tokens_seen": 304639540, + "step": 14125, + "time_per_iteration": 2.896867036819458 + }, + { + "auxiliary_loss_clip": 0.01065575, + "auxiliary_loss_mlp": 0.01032713, + "balance_loss_clip": 1.02565169, + "balance_loss_mlp": 1.02131104, + "epoch": 0.8493010671877349, + "flos": 22419893963520.0, + "grad_norm": 1.6999933226981363, + "language_loss": 0.73536527, + "learning_rate": 2.3345107778843288e-07, + "loss": 0.75634813, + "num_input_tokens_seen": 304660595, + "step": 14126, + "time_per_iteration": 2.757697582244873 + }, + { + "auxiliary_loss_clip": 0.01014765, + "auxiliary_loss_mlp": 0.01034079, + "balance_loss_clip": 1.02195323, + "balance_loss_mlp": 1.0230161, + "epoch": 0.8493611904404028, + "flos": 17529435302400.0, + "grad_norm": 2.1303711005536234, + "language_loss": 0.67627311, + "learning_rate": 2.3326850972200928e-07, + "loss": 0.69676155, + "num_input_tokens_seen": 304679580, + "step": 14127, + "time_per_iteration": 2.7671666145324707 + }, + { + "auxiliary_loss_clip": 0.01027099, + "auxiliary_loss_mlp": 0.00747777, + "balance_loss_clip": 1.02261448, + "balance_loss_mlp": 1.00043857, + "epoch": 0.8494213136930708, + "flos": 19462937523840.0, + "grad_norm": 1.7661172895803754, + "language_loss": 0.68677175, + "learning_rate": 2.330860086502211e-07, + "loss": 0.70452052, + "num_input_tokens_seen": 304698385, + "step": 14128, + "time_per_iteration": 2.7517621517181396 + }, + { + "auxiliary_loss_clip": 0.01033339, + "auxiliary_loss_mlp": 0.01031633, + "balance_loss_clip": 1.02240956, + "balance_loss_mlp": 1.02095151, + "epoch": 0.8494814369457387, + "flos": 18770292587520.0, + "grad_norm": 1.9939617714287683, + "language_loss": 0.77857387, + "learning_rate": 2.3290357457998855e-07, + "loss": 0.7992236, + "num_input_tokens_seen": 304715430, + "step": 14129, + "time_per_iteration": 2.69380259513855 + }, + { + "auxiliary_loss_clip": 0.01004888, + "auxiliary_loss_mlp": 0.01030945, + "balance_loss_clip": 1.02477932, + "balance_loss_mlp": 1.01995969, + "epoch": 0.8495415601984068, + "flos": 23331486251520.0, + "grad_norm": 1.6435576654811055, + "language_loss": 0.68582284, + "learning_rate": 2.3272120751823031e-07, + "loss": 0.70618117, + "num_input_tokens_seen": 304734345, + "step": 14130, + "time_per_iteration": 2.7713828086853027 + }, + { + "auxiliary_loss_clip": 0.01052283, + "auxiliary_loss_mlp": 0.01026858, + "balance_loss_clip": 1.02542925, + "balance_loss_mlp": 1.01664805, + "epoch": 0.8496016834510747, + "flos": 26612859352320.0, + "grad_norm": 1.7714682666856665, + "language_loss": 0.71112543, + "learning_rate": 2.3253890747186e-07, + "loss": 0.7319169, + "num_input_tokens_seen": 304755030, + "step": 14131, + "time_per_iteration": 2.68198299407959 + }, + { + "auxiliary_loss_clip": 0.01036526, + "auxiliary_loss_mlp": 0.01026563, + "balance_loss_clip": 1.02772033, + "balance_loss_mlp": 1.01624513, + "epoch": 0.8496618067037427, + "flos": 25480380378240.0, + "grad_norm": 1.887930786378812, + "language_loss": 0.68145216, + "learning_rate": 2.3235667444779162e-07, + "loss": 0.70208311, + "num_input_tokens_seen": 304774320, + "step": 14132, + "time_per_iteration": 2.754568576812744 + }, + { + "auxiliary_loss_clip": 0.01058399, + "auxiliary_loss_mlp": 0.01029541, + "balance_loss_clip": 1.02283275, + "balance_loss_mlp": 1.01987922, + "epoch": 0.8497219299564106, + "flos": 25374587846400.0, + "grad_norm": 1.6789927674158123, + "language_loss": 0.70325661, + "learning_rate": 2.3217450845293564e-07, + "loss": 0.72413594, + "num_input_tokens_seen": 304795355, + "step": 14133, + "time_per_iteration": 2.696920871734619 + }, + { + "auxiliary_loss_clip": 0.00968641, + "auxiliary_loss_mlp": 0.00746424, + "balance_loss_clip": 1.00282145, + "balance_loss_mlp": 1.00039184, + "epoch": 0.8497820532090786, + "flos": 67780279658880.0, + "grad_norm": 0.728566735807018, + "language_loss": 0.57598519, + "learning_rate": 2.3199240949419918e-07, + "loss": 0.59313583, + "num_input_tokens_seen": 304863915, + "step": 14134, + "time_per_iteration": 5.129170179367065 + }, + { + "auxiliary_loss_clip": 0.01034583, + "auxiliary_loss_mlp": 0.01027044, + "balance_loss_clip": 1.0258894, + "balance_loss_mlp": 1.01590943, + "epoch": 0.8498421764617465, + "flos": 23440546920960.0, + "grad_norm": 2.297862788463867, + "language_loss": 0.78858894, + "learning_rate": 2.3181037757848787e-07, + "loss": 0.80920517, + "num_input_tokens_seen": 304881555, + "step": 14135, + "time_per_iteration": 2.764021873474121 + }, + { + "auxiliary_loss_clip": 0.01055379, + "auxiliary_loss_mlp": 0.01026069, + "balance_loss_clip": 1.02603102, + "balance_loss_mlp": 1.01518512, + "epoch": 0.8499022997144146, + "flos": 17712615686400.0, + "grad_norm": 2.1190962187643905, + "language_loss": 0.62916136, + "learning_rate": 2.316284127127044e-07, + "loss": 0.64997584, + "num_input_tokens_seen": 304898760, + "step": 14136, + "time_per_iteration": 2.8043789863586426 + }, + { + "auxiliary_loss_clip": 0.01054666, + "auxiliary_loss_mlp": 0.0102765, + "balance_loss_clip": 1.02615786, + "balance_loss_mlp": 1.0169034, + "epoch": 0.8499624229670825, + "flos": 18588512833920.0, + "grad_norm": 1.9716884831031214, + "language_loss": 0.84157455, + "learning_rate": 2.3144651490374835e-07, + "loss": 0.86239767, + "num_input_tokens_seen": 304915465, + "step": 14137, + "time_per_iteration": 2.7282681465148926 + }, + { + "auxiliary_loss_clip": 0.01034592, + "auxiliary_loss_mlp": 0.01021839, + "balance_loss_clip": 1.02761304, + "balance_loss_mlp": 1.01257622, + "epoch": 0.8500225462197505, + "flos": 24345854328960.0, + "grad_norm": 2.5118179746715823, + "language_loss": 0.7842325, + "learning_rate": 2.3126468415851773e-07, + "loss": 0.80479687, + "num_input_tokens_seen": 304933190, + "step": 14138, + "time_per_iteration": 2.6981217861175537 + }, + { + "auxiliary_loss_clip": 0.01052438, + "auxiliary_loss_mlp": 0.01023564, + "balance_loss_clip": 1.02527452, + "balance_loss_mlp": 1.0135324, + "epoch": 0.8500826694724185, + "flos": 16545518979840.0, + "grad_norm": 1.6548850139972302, + "language_loss": 0.64653313, + "learning_rate": 2.310829204839073e-07, + "loss": 0.66729307, + "num_input_tokens_seen": 304951110, + "step": 14139, + "time_per_iteration": 2.5948026180267334 + }, + { + "auxiliary_loss_clip": 0.01032116, + "auxiliary_loss_mlp": 0.01026426, + "balance_loss_clip": 1.02513146, + "balance_loss_mlp": 1.01640654, + "epoch": 0.8501427927250864, + "flos": 16289404030080.0, + "grad_norm": 1.5731251181294819, + "language_loss": 0.70911372, + "learning_rate": 2.3090122388681043e-07, + "loss": 0.72969913, + "num_input_tokens_seen": 304969095, + "step": 14140, + "time_per_iteration": 2.6804568767547607 + }, + { + "auxiliary_loss_clip": 0.01028914, + "auxiliary_loss_mlp": 0.0103015, + "balance_loss_clip": 1.02451766, + "balance_loss_mlp": 1.01880765, + "epoch": 0.8502029159777544, + "flos": 26687912820480.0, + "grad_norm": 2.1532408923988124, + "language_loss": 0.63987637, + "learning_rate": 2.3071959437411648e-07, + "loss": 0.66046703, + "num_input_tokens_seen": 304989315, + "step": 14141, + "time_per_iteration": 2.7358295917510986 + }, + { + "auxiliary_loss_clip": 0.01032776, + "auxiliary_loss_mlp": 0.01029737, + "balance_loss_clip": 1.02549171, + "balance_loss_mlp": 1.01927042, + "epoch": 0.8502630392304223, + "flos": 35590778179200.0, + "grad_norm": 1.481750031326812, + "language_loss": 0.70730686, + "learning_rate": 2.3053803195271214e-07, + "loss": 0.72793198, + "num_input_tokens_seen": 305011020, + "step": 14142, + "time_per_iteration": 2.792872905731201 + }, + { + "auxiliary_loss_clip": 0.01011761, + "auxiliary_loss_mlp": 0.01026599, + "balance_loss_clip": 1.02075613, + "balance_loss_mlp": 1.01688945, + "epoch": 0.8503231624830904, + "flos": 21649466125440.0, + "grad_norm": 2.505783319312354, + "language_loss": 0.65184021, + "learning_rate": 2.3035653662948375e-07, + "loss": 0.67222381, + "num_input_tokens_seen": 305033550, + "step": 14143, + "time_per_iteration": 2.8112785816192627 + }, + { + "auxiliary_loss_clip": 0.01036135, + "auxiliary_loss_mlp": 0.00747661, + "balance_loss_clip": 1.02532864, + "balance_loss_mlp": 1.00040364, + "epoch": 0.8503832857357583, + "flos": 22417451838720.0, + "grad_norm": 1.9617187760888748, + "language_loss": 0.67817366, + "learning_rate": 2.3017510841131216e-07, + "loss": 0.6960116, + "num_input_tokens_seen": 305052885, + "step": 14144, + "time_per_iteration": 2.753058671951294 + }, + { + "auxiliary_loss_clip": 0.00995274, + "auxiliary_loss_mlp": 0.0103574, + "balance_loss_clip": 1.02131212, + "balance_loss_mlp": 1.02454603, + "epoch": 0.8504434089884263, + "flos": 18697968552960.0, + "grad_norm": 1.814497332281299, + "language_loss": 0.65063059, + "learning_rate": 2.299937473050777e-07, + "loss": 0.6709407, + "num_input_tokens_seen": 305071995, + "step": 14145, + "time_per_iteration": 2.807392120361328 + }, + { + "auxiliary_loss_clip": 0.01039774, + "auxiliary_loss_mlp": 0.01029842, + "balance_loss_clip": 1.02338958, + "balance_loss_mlp": 1.01923192, + "epoch": 0.8505035322410942, + "flos": 20007989475840.0, + "grad_norm": 1.9247736225743517, + "language_loss": 0.85686809, + "learning_rate": 2.2981245331765842e-07, + "loss": 0.87756419, + "num_input_tokens_seen": 305090190, + "step": 14146, + "time_per_iteration": 2.802750587463379 + }, + { + "auxiliary_loss_clip": 0.01059548, + "auxiliary_loss_mlp": 0.01024463, + "balance_loss_clip": 1.02332056, + "balance_loss_mlp": 1.01426458, + "epoch": 0.8505636554937622, + "flos": 20812173120000.0, + "grad_norm": 1.8230309855671576, + "language_loss": 0.83892119, + "learning_rate": 2.2963122645592814e-07, + "loss": 0.85976136, + "num_input_tokens_seen": 305109355, + "step": 14147, + "time_per_iteration": 2.6667962074279785 + }, + { + "auxiliary_loss_clip": 0.01048007, + "auxiliary_loss_mlp": 0.01029607, + "balance_loss_clip": 1.02384162, + "balance_loss_mlp": 1.0188365, + "epoch": 0.8506237787464301, + "flos": 14174445277440.0, + "grad_norm": 2.6383381041086578, + "language_loss": 0.85046792, + "learning_rate": 2.2945006672675894e-07, + "loss": 0.87124407, + "num_input_tokens_seen": 305124165, + "step": 14148, + "time_per_iteration": 2.620907783508301 + }, + { + "auxiliary_loss_clip": 0.0103779, + "auxiliary_loss_mlp": 0.01028201, + "balance_loss_clip": 1.02282166, + "balance_loss_mlp": 1.01776457, + "epoch": 0.8506839019990982, + "flos": 23258372117760.0, + "grad_norm": 1.5884197566442222, + "language_loss": 0.71673357, + "learning_rate": 2.292689741370204e-07, + "loss": 0.7373935, + "num_input_tokens_seen": 305143940, + "step": 14149, + "time_per_iteration": 2.6457598209381104 + }, + { + "auxiliary_loss_clip": 0.01042017, + "auxiliary_loss_mlp": 0.01027986, + "balance_loss_clip": 1.02461767, + "balance_loss_mlp": 1.01773441, + "epoch": 0.8507440252517661, + "flos": 23659206963840.0, + "grad_norm": 1.693974509232271, + "language_loss": 0.76112616, + "learning_rate": 2.290879486935804e-07, + "loss": 0.7818262, + "num_input_tokens_seen": 305163505, + "step": 14150, + "time_per_iteration": 2.7735211849212646 + }, + { + "auxiliary_loss_clip": 0.01019547, + "auxiliary_loss_mlp": 0.01030729, + "balance_loss_clip": 1.02300692, + "balance_loss_mlp": 1.02026248, + "epoch": 0.8508041485044341, + "flos": 18661339658880.0, + "grad_norm": 1.5544745904647415, + "language_loss": 0.72524196, + "learning_rate": 2.2890699040330231e-07, + "loss": 0.74574471, + "num_input_tokens_seen": 305182325, + "step": 14151, + "time_per_iteration": 5.852767467498779 + }, + { + "auxiliary_loss_clip": 0.0096422, + "auxiliary_loss_mlp": 0.01003545, + "balance_loss_clip": 1.00721765, + "balance_loss_mlp": 1.00271654, + "epoch": 0.8508642717571021, + "flos": 52510918055040.0, + "grad_norm": 0.8805220483301854, + "language_loss": 0.59607196, + "learning_rate": 2.2872609927304909e-07, + "loss": 0.6157496, + "num_input_tokens_seen": 305230775, + "step": 14152, + "time_per_iteration": 3.2555384635925293 + }, + { + "auxiliary_loss_clip": 0.00986754, + "auxiliary_loss_mlp": 0.01000845, + "balance_loss_clip": 1.00134051, + "balance_loss_mlp": 1.00001049, + "epoch": 0.85092439500977, + "flos": 69297145050240.0, + "grad_norm": 0.6884025092940385, + "language_loss": 0.61180484, + "learning_rate": 2.285452753096797e-07, + "loss": 0.63168085, + "num_input_tokens_seen": 305296000, + "step": 14153, + "time_per_iteration": 3.377431631088257 + }, + { + "auxiliary_loss_clip": 0.01053361, + "auxiliary_loss_mlp": 0.01029139, + "balance_loss_clip": 1.02572501, + "balance_loss_mlp": 1.01878023, + "epoch": 0.850984518262438, + "flos": 24389737770240.0, + "grad_norm": 1.544451285864772, + "language_loss": 0.80818355, + "learning_rate": 2.2836451852005067e-07, + "loss": 0.82900858, + "num_input_tokens_seen": 305314705, + "step": 14154, + "time_per_iteration": 2.6907856464385986 + }, + { + "auxiliary_loss_clip": 0.01029475, + "auxiliary_loss_mlp": 0.01028374, + "balance_loss_clip": 1.02467895, + "balance_loss_mlp": 1.01931977, + "epoch": 0.851044641515106, + "flos": 23294821443840.0, + "grad_norm": 1.6444190484389039, + "language_loss": 0.7960484, + "learning_rate": 2.281838289110165e-07, + "loss": 0.81662691, + "num_input_tokens_seen": 305333870, + "step": 14155, + "time_per_iteration": 2.689356565475464 + }, + { + "auxiliary_loss_clip": 0.01032027, + "auxiliary_loss_mlp": 0.01026587, + "balance_loss_clip": 1.02274799, + "balance_loss_mlp": 1.0159595, + "epoch": 0.851104764767774, + "flos": 22050085489920.0, + "grad_norm": 1.738183351230754, + "language_loss": 0.70420718, + "learning_rate": 2.2800320648942904e-07, + "loss": 0.72479331, + "num_input_tokens_seen": 305352780, + "step": 14156, + "time_per_iteration": 2.7187209129333496 + }, + { + "auxiliary_loss_clip": 0.01030366, + "auxiliary_loss_mlp": 0.01027262, + "balance_loss_clip": 1.02513289, + "balance_loss_mlp": 1.01737309, + "epoch": 0.8511648880204419, + "flos": 20704728562560.0, + "grad_norm": 2.0667351298519416, + "language_loss": 0.73772305, + "learning_rate": 2.278226512621386e-07, + "loss": 0.75829935, + "num_input_tokens_seen": 305371370, + "step": 14157, + "time_per_iteration": 2.7389018535614014 + }, + { + "auxiliary_loss_clip": 0.00996362, + "auxiliary_loss_mlp": 0.01023174, + "balance_loss_clip": 1.02529025, + "balance_loss_mlp": 1.01414967, + "epoch": 0.8512250112731099, + "flos": 24024669891840.0, + "grad_norm": 1.989620582441011, + "language_loss": 0.79201543, + "learning_rate": 2.2764216323598995e-07, + "loss": 0.8122108, + "num_input_tokens_seen": 305387955, + "step": 14158, + "time_per_iteration": 2.8278939723968506 + }, + { + "auxiliary_loss_clip": 0.01045291, + "auxiliary_loss_mlp": 0.01031996, + "balance_loss_clip": 1.02311921, + "balance_loss_mlp": 1.01980066, + "epoch": 0.8512851345257778, + "flos": 22015467757440.0, + "grad_norm": 1.8779562367883191, + "language_loss": 0.78605497, + "learning_rate": 2.27461742417828e-07, + "loss": 0.80682778, + "num_input_tokens_seen": 305406285, + "step": 14159, + "time_per_iteration": 2.6347129344940186 + }, + { + "auxiliary_loss_clip": 0.01041033, + "auxiliary_loss_mlp": 0.01028246, + "balance_loss_clip": 1.02522945, + "balance_loss_mlp": 1.01753461, + "epoch": 0.8513452577784458, + "flos": 14830209924480.0, + "grad_norm": 1.6777626429721666, + "language_loss": 0.71452004, + "learning_rate": 2.2728138881449488e-07, + "loss": 0.7352128, + "num_input_tokens_seen": 305424500, + "step": 14160, + "time_per_iteration": 2.7472052574157715 + }, + { + "auxiliary_loss_clip": 0.01055884, + "auxiliary_loss_mlp": 0.01028294, + "balance_loss_clip": 1.02529669, + "balance_loss_mlp": 1.01693332, + "epoch": 0.8514053810311137, + "flos": 33035662166400.0, + "grad_norm": 2.286334138478006, + "language_loss": 0.70613384, + "learning_rate": 2.2710110243282866e-07, + "loss": 0.72697568, + "num_input_tokens_seen": 305442990, + "step": 14161, + "time_per_iteration": 2.7185120582580566 + }, + { + "auxiliary_loss_clip": 0.01049756, + "auxiliary_loss_mlp": 0.01026578, + "balance_loss_clip": 1.02153277, + "balance_loss_mlp": 1.01673734, + "epoch": 0.8514655042837818, + "flos": 27564456412800.0, + "grad_norm": 1.980835540875647, + "language_loss": 0.77862519, + "learning_rate": 2.2692088327966653e-07, + "loss": 0.79938853, + "num_input_tokens_seen": 305463065, + "step": 14162, + "time_per_iteration": 2.712886333465576 + }, + { + "auxiliary_loss_clip": 0.01053844, + "auxiliary_loss_mlp": 0.01034999, + "balance_loss_clip": 1.02562022, + "balance_loss_mlp": 1.02416873, + "epoch": 0.8515256275364497, + "flos": 35556052705920.0, + "grad_norm": 2.0789248470702653, + "language_loss": 0.77402461, + "learning_rate": 2.2674073136184235e-07, + "loss": 0.79491293, + "num_input_tokens_seen": 305489070, + "step": 14163, + "time_per_iteration": 2.7628629207611084 + }, + { + "auxiliary_loss_clip": 0.00987918, + "auxiliary_loss_mlp": 0.01001923, + "balance_loss_clip": 1.00289857, + "balance_loss_mlp": 1.00099277, + "epoch": 0.8515857507891177, + "flos": 70207372621440.0, + "grad_norm": 0.7017857673866994, + "language_loss": 0.55017406, + "learning_rate": 2.2656064668618735e-07, + "loss": 0.57007247, + "num_input_tokens_seen": 305551490, + "step": 14164, + "time_per_iteration": 3.3024654388427734 + }, + { + "auxiliary_loss_clip": 0.01051766, + "auxiliary_loss_mlp": 0.01029221, + "balance_loss_clip": 1.02356958, + "balance_loss_mlp": 1.01850998, + "epoch": 0.8516458740417857, + "flos": 22675290641280.0, + "grad_norm": 1.7160847858058295, + "language_loss": 0.7254023, + "learning_rate": 2.2638062925953005e-07, + "loss": 0.74621212, + "num_input_tokens_seen": 305570535, + "step": 14165, + "time_per_iteration": 2.6829030513763428 + }, + { + "auxiliary_loss_clip": 0.01031401, + "auxiliary_loss_mlp": 0.01026282, + "balance_loss_clip": 1.02474141, + "balance_loss_mlp": 1.01594663, + "epoch": 0.8517059972944536, + "flos": 22747435107840.0, + "grad_norm": 1.910911016479939, + "language_loss": 0.67294741, + "learning_rate": 2.26200679088697e-07, + "loss": 0.69352424, + "num_input_tokens_seen": 305590800, + "step": 14166, + "time_per_iteration": 2.7446701526641846 + }, + { + "auxiliary_loss_clip": 0.01033506, + "auxiliary_loss_mlp": 0.010276, + "balance_loss_clip": 1.02014756, + "balance_loss_mlp": 1.01751471, + "epoch": 0.8517661205471216, + "flos": 21689147675520.0, + "grad_norm": 1.9155272589789683, + "language_loss": 0.73347247, + "learning_rate": 2.260207961805125e-07, + "loss": 0.75408351, + "num_input_tokens_seen": 305609495, + "step": 14167, + "time_per_iteration": 2.7345962524414062 + }, + { + "auxiliary_loss_clip": 0.01061751, + "auxiliary_loss_mlp": 0.01027331, + "balance_loss_clip": 1.02506244, + "balance_loss_mlp": 1.01787162, + "epoch": 0.8518262437997896, + "flos": 25374839241600.0, + "grad_norm": 1.6919122510147664, + "language_loss": 0.80626971, + "learning_rate": 2.258409805417969e-07, + "loss": 0.82716054, + "num_input_tokens_seen": 305629420, + "step": 14168, + "time_per_iteration": 2.608800172805786 + }, + { + "auxiliary_loss_clip": 0.01059529, + "auxiliary_loss_mlp": 0.01024338, + "balance_loss_clip": 1.02341175, + "balance_loss_mlp": 1.01459861, + "epoch": 0.8518863670524576, + "flos": 27235406897280.0, + "grad_norm": 1.8831524288418753, + "language_loss": 0.76231623, + "learning_rate": 2.2566123217936893e-07, + "loss": 0.7831549, + "num_input_tokens_seen": 305649835, + "step": 14169, + "time_per_iteration": 4.353280305862427 + }, + { + "auxiliary_loss_clip": 0.01064914, + "auxiliary_loss_mlp": 0.0102838, + "balance_loss_clip": 1.02626014, + "balance_loss_mlp": 1.01795459, + "epoch": 0.8519464903051255, + "flos": 20959514709120.0, + "grad_norm": 1.623249484547916, + "language_loss": 0.63649929, + "learning_rate": 2.254815511000452e-07, + "loss": 0.6574322, + "num_input_tokens_seen": 305668840, + "step": 14170, + "time_per_iteration": 2.6012516021728516 + }, + { + "auxiliary_loss_clip": 0.01039469, + "auxiliary_loss_mlp": 0.01023746, + "balance_loss_clip": 1.02133119, + "balance_loss_mlp": 1.01361346, + "epoch": 0.8520066135577935, + "flos": 18441745862400.0, + "grad_norm": 2.4154084091142343, + "language_loss": 0.86638862, + "learning_rate": 2.253019373106384e-07, + "loss": 0.88702083, + "num_input_tokens_seen": 305686955, + "step": 14171, + "time_per_iteration": 2.5718586444854736 + }, + { + "auxiliary_loss_clip": 0.01044458, + "auxiliary_loss_mlp": 0.01028734, + "balance_loss_clip": 1.02336168, + "balance_loss_mlp": 1.01814806, + "epoch": 0.8520667368104614, + "flos": 29130233149440.0, + "grad_norm": 1.767159159390105, + "language_loss": 0.5479486, + "learning_rate": 2.2512239081796003e-07, + "loss": 0.56868047, + "num_input_tokens_seen": 305706290, + "step": 14172, + "time_per_iteration": 2.6551899909973145 + }, + { + "auxiliary_loss_clip": 0.01034536, + "auxiliary_loss_mlp": 0.01026301, + "balance_loss_clip": 1.02159405, + "balance_loss_mlp": 1.01730669, + "epoch": 0.8521268600631294, + "flos": 16034366488320.0, + "grad_norm": 2.387888347524841, + "language_loss": 0.70020151, + "learning_rate": 2.2494291162881862e-07, + "loss": 0.72080994, + "num_input_tokens_seen": 305723835, + "step": 14173, + "time_per_iteration": 2.587785005569458 + }, + { + "auxiliary_loss_clip": 0.01040789, + "auxiliary_loss_mlp": 0.00747572, + "balance_loss_clip": 1.02330995, + "balance_loss_mlp": 1.0004214, + "epoch": 0.8521869833157973, + "flos": 22454870832000.0, + "grad_norm": 2.4105055952844565, + "language_loss": 0.76590347, + "learning_rate": 2.247634997500205e-07, + "loss": 0.78378701, + "num_input_tokens_seen": 305741655, + "step": 14174, + "time_per_iteration": 2.6359124183654785 + }, + { + "auxiliary_loss_clip": 0.010245, + "auxiliary_loss_mlp": 0.00747542, + "balance_loss_clip": 1.02231181, + "balance_loss_mlp": 1.00041389, + "epoch": 0.8522471065684654, + "flos": 24972029147520.0, + "grad_norm": 1.5528418490125853, + "language_loss": 0.82234108, + "learning_rate": 2.245841551883676e-07, + "loss": 0.84006149, + "num_input_tokens_seen": 305761890, + "step": 14175, + "time_per_iteration": 2.6795272827148438 + }, + { + "auxiliary_loss_clip": 0.01064816, + "auxiliary_loss_mlp": 0.01029317, + "balance_loss_clip": 1.02643228, + "balance_loss_mlp": 1.01825476, + "epoch": 0.8523072298211333, + "flos": 17710604524800.0, + "grad_norm": 2.3092955056219617, + "language_loss": 0.6580537, + "learning_rate": 2.2440487795066153e-07, + "loss": 0.67899501, + "num_input_tokens_seen": 305779190, + "step": 14176, + "time_per_iteration": 2.538454055786133 + }, + { + "auxiliary_loss_clip": 0.01035844, + "auxiliary_loss_mlp": 0.00747466, + "balance_loss_clip": 1.02220726, + "balance_loss_mlp": 1.00039244, + "epoch": 0.8523673530738013, + "flos": 25446193608960.0, + "grad_norm": 1.8055147248894914, + "language_loss": 0.78464246, + "learning_rate": 2.2422566804370068e-07, + "loss": 0.80247551, + "num_input_tokens_seen": 305799870, + "step": 14177, + "time_per_iteration": 2.6577420234680176 + }, + { + "auxiliary_loss_clip": 0.01041829, + "auxiliary_loss_mlp": 0.01027795, + "balance_loss_clip": 1.02464676, + "balance_loss_mlp": 1.0175308, + "epoch": 0.8524274763264693, + "flos": 31429593348480.0, + "grad_norm": 3.6919423308444257, + "language_loss": 0.73586762, + "learning_rate": 2.2404652547428026e-07, + "loss": 0.75656384, + "num_input_tokens_seen": 305819695, + "step": 14178, + "time_per_iteration": 2.6939477920532227 + }, + { + "auxiliary_loss_clip": 0.01033985, + "auxiliary_loss_mlp": 0.01032701, + "balance_loss_clip": 1.02608347, + "balance_loss_mlp": 1.02238369, + "epoch": 0.8524875995791372, + "flos": 17712651600000.0, + "grad_norm": 1.8182545901803155, + "language_loss": 0.74812555, + "learning_rate": 2.238674502491935e-07, + "loss": 0.76879251, + "num_input_tokens_seen": 305837270, + "step": 14179, + "time_per_iteration": 2.67763614654541 + }, + { + "auxiliary_loss_clip": 0.01061458, + "auxiliary_loss_mlp": 0.01022668, + "balance_loss_clip": 1.02607703, + "balance_loss_mlp": 1.012887, + "epoch": 0.8525477228318052, + "flos": 21687316081920.0, + "grad_norm": 1.910468823008229, + "language_loss": 0.81700319, + "learning_rate": 2.2368844237523165e-07, + "loss": 0.83784443, + "num_input_tokens_seen": 305855250, + "step": 14180, + "time_per_iteration": 2.6358144283294678 + }, + { + "auxiliary_loss_clip": 0.01008729, + "auxiliary_loss_mlp": 0.01031193, + "balance_loss_clip": 1.0228107, + "balance_loss_mlp": 1.02073216, + "epoch": 0.8526078460844732, + "flos": 24827057856000.0, + "grad_norm": 2.742436834202009, + "language_loss": 0.61161399, + "learning_rate": 2.235095018591815e-07, + "loss": 0.63201314, + "num_input_tokens_seen": 305875660, + "step": 14181, + "time_per_iteration": 4.580879211425781 + }, + { + "auxiliary_loss_clip": 0.01060854, + "auxiliary_loss_mlp": 0.01025583, + "balance_loss_clip": 1.02538943, + "balance_loss_mlp": 1.01590335, + "epoch": 0.8526679693371412, + "flos": 13516418073600.0, + "grad_norm": 2.1487071421837975, + "language_loss": 0.72286862, + "learning_rate": 2.2333062870782894e-07, + "loss": 0.74373299, + "num_input_tokens_seen": 305892415, + "step": 14182, + "time_per_iteration": 2.5999786853790283 + }, + { + "auxiliary_loss_clip": 0.01019534, + "auxiliary_loss_mlp": 0.01031032, + "balance_loss_clip": 1.02372932, + "balance_loss_mlp": 1.02080345, + "epoch": 0.8527280925898091, + "flos": 23514092017920.0, + "grad_norm": 1.8930633091867093, + "language_loss": 0.7089147, + "learning_rate": 2.2315182292795697e-07, + "loss": 0.72942036, + "num_input_tokens_seen": 305912665, + "step": 14183, + "time_per_iteration": 2.8097660541534424 + }, + { + "auxiliary_loss_clip": 0.0104105, + "auxiliary_loss_mlp": 0.01025379, + "balance_loss_clip": 1.02662373, + "balance_loss_mlp": 1.01589596, + "epoch": 0.8527882158424771, + "flos": 20303031790080.0, + "grad_norm": 1.802024744280887, + "language_loss": 0.72892594, + "learning_rate": 2.2297308452634644e-07, + "loss": 0.74959022, + "num_input_tokens_seen": 305931515, + "step": 14184, + "time_per_iteration": 2.6523594856262207 + }, + { + "auxiliary_loss_clip": 0.01062659, + "auxiliary_loss_mlp": 0.01029113, + "balance_loss_clip": 1.0255506, + "balance_loss_mlp": 1.01878405, + "epoch": 0.852848339095145, + "flos": 17202504689280.0, + "grad_norm": 1.7841733159723006, + "language_loss": 0.7688061, + "learning_rate": 2.2279441350977457e-07, + "loss": 0.78972387, + "num_input_tokens_seen": 305949965, + "step": 14185, + "time_per_iteration": 2.7277467250823975 + }, + { + "auxiliary_loss_clip": 0.01021117, + "auxiliary_loss_mlp": 0.010297, + "balance_loss_clip": 1.02052259, + "balance_loss_mlp": 1.01792777, + "epoch": 0.852908462347813, + "flos": 18368990864640.0, + "grad_norm": 2.1666842180768597, + "language_loss": 0.79649413, + "learning_rate": 2.2261580988501637e-07, + "loss": 0.8170023, + "num_input_tokens_seen": 305967820, + "step": 14186, + "time_per_iteration": 2.686965227127075 + }, + { + "auxiliary_loss_clip": 0.01038844, + "auxiliary_loss_mlp": 0.01025074, + "balance_loss_clip": 1.02200079, + "balance_loss_mlp": 1.01435685, + "epoch": 0.8529685856004809, + "flos": 18624890332800.0, + "grad_norm": 1.7642964680697495, + "language_loss": 0.62827075, + "learning_rate": 2.224372736588449e-07, + "loss": 0.64890993, + "num_input_tokens_seen": 305985505, + "step": 14187, + "time_per_iteration": 2.6508383750915527 + }, + { + "auxiliary_loss_clip": 0.01010538, + "auxiliary_loss_mlp": 0.01026873, + "balance_loss_clip": 1.02108359, + "balance_loss_mlp": 1.01559544, + "epoch": 0.853028708853149, + "flos": 29607665748480.0, + "grad_norm": 1.7849259369508244, + "language_loss": 0.76940668, + "learning_rate": 2.2225880483803005e-07, + "loss": 0.7897808, + "num_input_tokens_seen": 306005220, + "step": 14188, + "time_per_iteration": 2.772031307220459 + }, + { + "auxiliary_loss_clip": 0.01051316, + "auxiliary_loss_mlp": 0.01025824, + "balance_loss_clip": 1.02415407, + "balance_loss_mlp": 1.01483846, + "epoch": 0.8530888321058169, + "flos": 26353153042560.0, + "grad_norm": 1.5361498975622916, + "language_loss": 0.78414506, + "learning_rate": 2.2208040342933932e-07, + "loss": 0.80491638, + "num_input_tokens_seen": 306023785, + "step": 14189, + "time_per_iteration": 2.605057954788208 + }, + { + "auxiliary_loss_clip": 0.01035814, + "auxiliary_loss_mlp": 0.01028326, + "balance_loss_clip": 1.02127457, + "balance_loss_mlp": 1.01743007, + "epoch": 0.8531489553584849, + "flos": 20521979141760.0, + "grad_norm": 2.0572692325632618, + "language_loss": 0.79323995, + "learning_rate": 2.2190206943953793e-07, + "loss": 0.81388134, + "num_input_tokens_seen": 306041600, + "step": 14190, + "time_per_iteration": 2.629464626312256 + }, + { + "auxiliary_loss_clip": 0.01021317, + "auxiliary_loss_mlp": 0.01027503, + "balance_loss_clip": 1.02442431, + "balance_loss_mlp": 1.01687551, + "epoch": 0.8532090786111529, + "flos": 20704297599360.0, + "grad_norm": 2.152147250211599, + "language_loss": 0.76173925, + "learning_rate": 2.2172380287538894e-07, + "loss": 0.78222746, + "num_input_tokens_seen": 306060345, + "step": 14191, + "time_per_iteration": 2.7226765155792236 + }, + { + "auxiliary_loss_clip": 0.01053554, + "auxiliary_loss_mlp": 0.01025627, + "balance_loss_clip": 1.02660632, + "balance_loss_mlp": 1.01550603, + "epoch": 0.8532692018638208, + "flos": 19828903242240.0, + "grad_norm": 2.2411724914905107, + "language_loss": 0.69038451, + "learning_rate": 2.2154560374365073e-07, + "loss": 0.71117634, + "num_input_tokens_seen": 306078285, + "step": 14192, + "time_per_iteration": 2.628567695617676 + }, + { + "auxiliary_loss_clip": 0.01056833, + "auxiliary_loss_mlp": 0.01033693, + "balance_loss_clip": 1.02524257, + "balance_loss_mlp": 1.02126575, + "epoch": 0.8533293251164888, + "flos": 20996790048000.0, + "grad_norm": 2.2238391196423475, + "language_loss": 0.631019, + "learning_rate": 2.2136747205108164e-07, + "loss": 0.65192425, + "num_input_tokens_seen": 306093760, + "step": 14193, + "time_per_iteration": 2.5453073978424072 + }, + { + "auxiliary_loss_clip": 0.01033712, + "auxiliary_loss_mlp": 0.01027584, + "balance_loss_clip": 1.0238682, + "balance_loss_mlp": 1.01775479, + "epoch": 0.8533894483691568, + "flos": 22419606654720.0, + "grad_norm": 1.7089078669039015, + "language_loss": 0.76670527, + "learning_rate": 2.211894078044365e-07, + "loss": 0.78731823, + "num_input_tokens_seen": 306112595, + "step": 14194, + "time_per_iteration": 2.6335861682891846 + }, + { + "auxiliary_loss_clip": 0.01063755, + "auxiliary_loss_mlp": 0.0102481, + "balance_loss_clip": 1.02575588, + "balance_loss_mlp": 1.01535642, + "epoch": 0.8534495716218248, + "flos": 21616536332160.0, + "grad_norm": 1.6647247524470572, + "language_loss": 0.69431496, + "learning_rate": 2.2101141101046705e-07, + "loss": 0.71520066, + "num_input_tokens_seen": 306131800, + "step": 14195, + "time_per_iteration": 2.5528194904327393 + }, + { + "auxiliary_loss_clip": 0.01043418, + "auxiliary_loss_mlp": 0.01030301, + "balance_loss_clip": 1.02530134, + "balance_loss_mlp": 1.01943564, + "epoch": 0.8535096948744927, + "flos": 22346277039360.0, + "grad_norm": 1.9961056436181206, + "language_loss": 0.85525632, + "learning_rate": 2.2083348167592343e-07, + "loss": 0.87599349, + "num_input_tokens_seen": 306150590, + "step": 14196, + "time_per_iteration": 2.6336655616760254 + }, + { + "auxiliary_loss_clip": 0.00986976, + "auxiliary_loss_mlp": 0.01001214, + "balance_loss_clip": 1.00129628, + "balance_loss_mlp": 1.00033748, + "epoch": 0.8535698181271607, + "flos": 52762507891200.0, + "grad_norm": 0.7576845798693149, + "language_loss": 0.55042696, + "learning_rate": 2.2065561980755243e-07, + "loss": 0.57030886, + "num_input_tokens_seen": 306205850, + "step": 14197, + "time_per_iteration": 3.1909191608428955 + }, + { + "auxiliary_loss_clip": 0.0102884, + "auxiliary_loss_mlp": 0.00747516, + "balance_loss_clip": 1.02357674, + "balance_loss_mlp": 1.00034988, + "epoch": 0.8536299413798286, + "flos": 19062892776960.0, + "grad_norm": 1.5315622329612268, + "language_loss": 0.81710494, + "learning_rate": 2.2047782541209826e-07, + "loss": 0.83486855, + "num_input_tokens_seen": 306225220, + "step": 14198, + "time_per_iteration": 6.016807317733765 + }, + { + "auxiliary_loss_clip": 0.01060923, + "auxiliary_loss_mlp": 0.0102371, + "balance_loss_clip": 1.0247612, + "balance_loss_mlp": 1.01478112, + "epoch": 0.8536900646324966, + "flos": 49344743871360.0, + "grad_norm": 1.3896383480005965, + "language_loss": 0.68482912, + "learning_rate": 2.203000984963035e-07, + "loss": 0.70567542, + "num_input_tokens_seen": 306249865, + "step": 14199, + "time_per_iteration": 2.8236353397369385 + }, + { + "auxiliary_loss_clip": 0.01026205, + "auxiliary_loss_mlp": 0.01025096, + "balance_loss_clip": 1.02214146, + "balance_loss_mlp": 1.01617324, + "epoch": 0.8537501878851645, + "flos": 21762333636480.0, + "grad_norm": 1.6914916477844246, + "language_loss": 0.86536932, + "learning_rate": 2.201224390669072e-07, + "loss": 0.88588232, + "num_input_tokens_seen": 306270215, + "step": 14200, + "time_per_iteration": 2.7836039066314697 + }, + { + "auxiliary_loss_clip": 0.0103428, + "auxiliary_loss_mlp": 0.0102263, + "balance_loss_clip": 1.02599394, + "balance_loss_mlp": 1.01344526, + "epoch": 0.8538103111378326, + "flos": 22269176496000.0, + "grad_norm": 1.8833983505694607, + "language_loss": 0.77912474, + "learning_rate": 2.1994484713064666e-07, + "loss": 0.79969382, + "num_input_tokens_seen": 306288960, + "step": 14201, + "time_per_iteration": 2.7387192249298096 + }, + { + "auxiliary_loss_clip": 0.01042908, + "auxiliary_loss_mlp": 0.01025209, + "balance_loss_clip": 1.02569818, + "balance_loss_mlp": 1.0153209, + "epoch": 0.8538704343905005, + "flos": 20303929630080.0, + "grad_norm": 2.201721881259682, + "language_loss": 0.6889199, + "learning_rate": 2.19767322694256e-07, + "loss": 0.70960104, + "num_input_tokens_seen": 306308735, + "step": 14202, + "time_per_iteration": 2.6943299770355225 + }, + { + "auxiliary_loss_clip": 0.01054217, + "auxiliary_loss_mlp": 0.01029788, + "balance_loss_clip": 1.02671432, + "balance_loss_mlp": 1.01892257, + "epoch": 0.8539305576431685, + "flos": 24755164784640.0, + "grad_norm": 1.8841079640996607, + "language_loss": 0.80198902, + "learning_rate": 2.195898657644666e-07, + "loss": 0.82282913, + "num_input_tokens_seen": 306329015, + "step": 14203, + "time_per_iteration": 2.6237642765045166 + }, + { + "auxiliary_loss_clip": 0.01043526, + "auxiliary_loss_mlp": 0.01030465, + "balance_loss_clip": 1.0240624, + "balance_loss_mlp": 1.01903844, + "epoch": 0.8539906808958365, + "flos": 26687625511680.0, + "grad_norm": 1.9350478107595561, + "language_loss": 0.65740108, + "learning_rate": 2.1941247634800808e-07, + "loss": 0.678141, + "num_input_tokens_seen": 306349085, + "step": 14204, + "time_per_iteration": 2.733746290206909 + }, + { + "auxiliary_loss_clip": 0.01061968, + "auxiliary_loss_mlp": 0.01032853, + "balance_loss_clip": 1.02428579, + "balance_loss_mlp": 1.02236867, + "epoch": 0.8540508041485044, + "flos": 13365521038080.0, + "grad_norm": 4.200883201235786, + "language_loss": 0.59684908, + "learning_rate": 2.1923515445160667e-07, + "loss": 0.61779726, + "num_input_tokens_seen": 306365385, + "step": 14205, + "time_per_iteration": 2.5218100547790527 + }, + { + "auxiliary_loss_clip": 0.01039845, + "auxiliary_loss_mlp": 0.01024234, + "balance_loss_clip": 1.02366066, + "balance_loss_mlp": 1.01408315, + "epoch": 0.8541109274011724, + "flos": 32780876019840.0, + "grad_norm": 2.3899935516018944, + "language_loss": 0.72188032, + "learning_rate": 2.1905790008198655e-07, + "loss": 0.74252105, + "num_input_tokens_seen": 306384585, + "step": 14206, + "time_per_iteration": 2.7523739337921143 + }, + { + "auxiliary_loss_clip": 0.01054408, + "auxiliary_loss_mlp": 0.01027827, + "balance_loss_clip": 1.02597547, + "balance_loss_mlp": 1.01783156, + "epoch": 0.8541710506538404, + "flos": 17639286071040.0, + "grad_norm": 2.8284191971710504, + "language_loss": 0.75759757, + "learning_rate": 2.1888071324586987e-07, + "loss": 0.77841985, + "num_input_tokens_seen": 306401565, + "step": 14207, + "time_per_iteration": 2.6870129108428955 + }, + { + "auxiliary_loss_clip": 0.01063529, + "auxiliary_loss_mlp": 0.01029137, + "balance_loss_clip": 1.02538228, + "balance_loss_mlp": 1.01810384, + "epoch": 0.8542311739065084, + "flos": 20263062931200.0, + "grad_norm": 1.6980938410640036, + "language_loss": 0.85049713, + "learning_rate": 2.1870359394997485e-07, + "loss": 0.87142372, + "num_input_tokens_seen": 306419995, + "step": 14208, + "time_per_iteration": 2.62261962890625 + }, + { + "auxiliary_loss_clip": 0.01042228, + "auxiliary_loss_mlp": 0.01029233, + "balance_loss_clip": 1.02544284, + "balance_loss_mlp": 1.01936877, + "epoch": 0.8542912971591763, + "flos": 17785657992960.0, + "grad_norm": 1.4649725956890214, + "language_loss": 0.6578958, + "learning_rate": 2.1852654220101785e-07, + "loss": 0.67861044, + "num_input_tokens_seen": 306439240, + "step": 14209, + "time_per_iteration": 2.635484218597412 + }, + { + "auxiliary_loss_clip": 0.01015142, + "auxiliary_loss_mlp": 0.01027102, + "balance_loss_clip": 1.02195776, + "balance_loss_mlp": 1.01722574, + "epoch": 0.8543514204118443, + "flos": 26979507429120.0, + "grad_norm": 1.9397437541765008, + "language_loss": 0.71000016, + "learning_rate": 2.1834955800571287e-07, + "loss": 0.73042262, + "num_input_tokens_seen": 306458425, + "step": 14210, + "time_per_iteration": 2.763266086578369 + }, + { + "auxiliary_loss_clip": 0.01041055, + "auxiliary_loss_mlp": 0.01028223, + "balance_loss_clip": 1.02390337, + "balance_loss_mlp": 1.01800084, + "epoch": 0.8544115436645122, + "flos": 24024598064640.0, + "grad_norm": 1.3435680736791515, + "language_loss": 0.70272809, + "learning_rate": 2.1817264137077141e-07, + "loss": 0.72342086, + "num_input_tokens_seen": 306477210, + "step": 14211, + "time_per_iteration": 2.684089422225952 + }, + { + "auxiliary_loss_clip": 0.01043035, + "auxiliary_loss_mlp": 0.01031182, + "balance_loss_clip": 1.02435136, + "balance_loss_mlp": 1.02086425, + "epoch": 0.8544716669171802, + "flos": 16617986668800.0, + "grad_norm": 2.115073608026624, + "language_loss": 0.81285858, + "learning_rate": 2.1799579230290166e-07, + "loss": 0.83360076, + "num_input_tokens_seen": 306495820, + "step": 14212, + "time_per_iteration": 2.6158952713012695 + }, + { + "auxiliary_loss_clip": 0.01025778, + "auxiliary_loss_mlp": 0.01031545, + "balance_loss_clip": 1.02161932, + "balance_loss_mlp": 1.01951623, + "epoch": 0.8545317901698481, + "flos": 40005779489280.0, + "grad_norm": 2.320783834493071, + "language_loss": 0.66224885, + "learning_rate": 2.178190108088105e-07, + "loss": 0.68282205, + "num_input_tokens_seen": 306516420, + "step": 14213, + "time_per_iteration": 2.8232836723327637 + }, + { + "auxiliary_loss_clip": 0.01059764, + "auxiliary_loss_mlp": 0.01022291, + "balance_loss_clip": 1.02380621, + "balance_loss_mlp": 1.01261711, + "epoch": 0.8545919134225162, + "flos": 19902520166400.0, + "grad_norm": 1.642187496542145, + "language_loss": 0.78447556, + "learning_rate": 2.1764229689520098e-07, + "loss": 0.80529606, + "num_input_tokens_seen": 306534785, + "step": 14214, + "time_per_iteration": 2.6930930614471436 + }, + { + "auxiliary_loss_clip": 0.01043592, + "auxiliary_loss_mlp": 0.01026747, + "balance_loss_clip": 1.02377987, + "balance_loss_mlp": 1.01536226, + "epoch": 0.8546520366751841, + "flos": 18952970181120.0, + "grad_norm": 2.411063757218914, + "language_loss": 0.66819918, + "learning_rate": 2.1746565056877397e-07, + "loss": 0.68890262, + "num_input_tokens_seen": 306552440, + "step": 14215, + "time_per_iteration": 2.641956090927124 + }, + { + "auxiliary_loss_clip": 0.01061926, + "auxiliary_loss_mlp": 0.01026054, + "balance_loss_clip": 1.02551532, + "balance_loss_mlp": 1.01601672, + "epoch": 0.8547121599278521, + "flos": 35621445415680.0, + "grad_norm": 1.7780799577707025, + "language_loss": 0.62197381, + "learning_rate": 2.172890718362279e-07, + "loss": 0.64285362, + "num_input_tokens_seen": 306573600, + "step": 14216, + "time_per_iteration": 4.567246437072754 + }, + { + "auxiliary_loss_clip": 0.01025804, + "auxiliary_loss_mlp": 0.01028921, + "balance_loss_clip": 1.02266026, + "balance_loss_mlp": 1.01857913, + "epoch": 0.8547722831805201, + "flos": 16910048154240.0, + "grad_norm": 1.8571467028847755, + "language_loss": 0.65608311, + "learning_rate": 2.17112560704259e-07, + "loss": 0.67663038, + "num_input_tokens_seen": 306592840, + "step": 14217, + "time_per_iteration": 2.6655821800231934 + }, + { + "auxiliary_loss_clip": 0.01050782, + "auxiliary_loss_mlp": 0.01029462, + "balance_loss_clip": 1.02592826, + "balance_loss_mlp": 1.0197885, + "epoch": 0.854832406433188, + "flos": 23002616304000.0, + "grad_norm": 1.489820462102936, + "language_loss": 0.65292543, + "learning_rate": 2.1693611717956072e-07, + "loss": 0.67372787, + "num_input_tokens_seen": 306613210, + "step": 14218, + "time_per_iteration": 2.663867950439453 + }, + { + "auxiliary_loss_clip": 0.01046275, + "auxiliary_loss_mlp": 0.01030345, + "balance_loss_clip": 1.02257538, + "balance_loss_mlp": 1.01934814, + "epoch": 0.854892529685856, + "flos": 20412595249920.0, + "grad_norm": 1.822341984018746, + "language_loss": 0.70030558, + "learning_rate": 2.167597412688238e-07, + "loss": 0.72107184, + "num_input_tokens_seen": 306631620, + "step": 14219, + "time_per_iteration": 2.763835906982422 + }, + { + "auxiliary_loss_clip": 0.01038003, + "auxiliary_loss_mlp": 0.01032504, + "balance_loss_clip": 1.02255559, + "balance_loss_mlp": 1.02160871, + "epoch": 0.854952652938524, + "flos": 16398716094720.0, + "grad_norm": 2.9031740956349865, + "language_loss": 0.67312968, + "learning_rate": 2.1658343297873549e-07, + "loss": 0.69383478, + "num_input_tokens_seen": 306646695, + "step": 14220, + "time_per_iteration": 2.7490150928497314 + }, + { + "auxiliary_loss_clip": 0.01059534, + "auxiliary_loss_mlp": 0.01027337, + "balance_loss_clip": 1.02386856, + "balance_loss_mlp": 1.01757383, + "epoch": 0.855012776191192, + "flos": 21178677542400.0, + "grad_norm": 2.143548272228308, + "language_loss": 0.71773171, + "learning_rate": 2.164071923159827e-07, + "loss": 0.73860043, + "num_input_tokens_seen": 306665465, + "step": 14221, + "time_per_iteration": 2.6447176933288574 + }, + { + "auxiliary_loss_clip": 0.0103073, + "auxiliary_loss_mlp": 0.01036044, + "balance_loss_clip": 1.02303743, + "balance_loss_mlp": 1.02508831, + "epoch": 0.8550728994438599, + "flos": 26140993361280.0, + "grad_norm": 1.6895432465499518, + "language_loss": 0.59832686, + "learning_rate": 2.1623101928724763e-07, + "loss": 0.61899459, + "num_input_tokens_seen": 306685950, + "step": 14222, + "time_per_iteration": 2.739595651626587 + }, + { + "auxiliary_loss_clip": 0.01031331, + "auxiliary_loss_mlp": 0.01029518, + "balance_loss_clip": 1.0218147, + "balance_loss_mlp": 1.01906371, + "epoch": 0.8551330226965279, + "flos": 22786793435520.0, + "grad_norm": 1.7631326529296765, + "language_loss": 0.84249717, + "learning_rate": 2.1605491389921093e-07, + "loss": 0.86310565, + "num_input_tokens_seen": 306705740, + "step": 14223, + "time_per_iteration": 2.645602226257324 + }, + { + "auxiliary_loss_clip": 0.01051329, + "auxiliary_loss_mlp": 0.0102936, + "balance_loss_clip": 1.02511382, + "balance_loss_mlp": 1.01939416, + "epoch": 0.8551931459491958, + "flos": 22419032037120.0, + "grad_norm": 1.516243280222302, + "language_loss": 0.74145114, + "learning_rate": 2.158788761585515e-07, + "loss": 0.76225805, + "num_input_tokens_seen": 306725065, + "step": 14224, + "time_per_iteration": 2.646109104156494 + }, + { + "auxiliary_loss_clip": 0.01034153, + "auxiliary_loss_mlp": 0.00747597, + "balance_loss_clip": 1.02202106, + "balance_loss_mlp": 1.00046301, + "epoch": 0.8552532692018638, + "flos": 19573183342080.0, + "grad_norm": 2.3656377621176476, + "language_loss": 0.75360179, + "learning_rate": 2.1570290607194307e-07, + "loss": 0.77141929, + "num_input_tokens_seen": 306743630, + "step": 14225, + "time_per_iteration": 2.7318036556243896 + }, + { + "auxiliary_loss_clip": 0.00999362, + "auxiliary_loss_mlp": 0.01035943, + "balance_loss_clip": 1.02270555, + "balance_loss_mlp": 1.02592897, + "epoch": 0.8553133924545318, + "flos": 26432767537920.0, + "grad_norm": 2.34951032803329, + "language_loss": 0.76809782, + "learning_rate": 2.1552700364605925e-07, + "loss": 0.78845084, + "num_input_tokens_seen": 306763105, + "step": 14226, + "time_per_iteration": 2.802703857421875 + }, + { + "auxiliary_loss_clip": 0.01064268, + "auxiliary_loss_mlp": 0.01031844, + "balance_loss_clip": 1.02542508, + "balance_loss_mlp": 1.02089477, + "epoch": 0.8553735157071998, + "flos": 16362446336640.0, + "grad_norm": 1.8371473820610371, + "language_loss": 0.54425704, + "learning_rate": 2.153511688875702e-07, + "loss": 0.56521821, + "num_input_tokens_seen": 306779875, + "step": 14227, + "time_per_iteration": 2.603931427001953 + }, + { + "auxiliary_loss_clip": 0.0103619, + "auxiliary_loss_mlp": 0.00747635, + "balance_loss_clip": 1.02555919, + "balance_loss_mlp": 1.00034297, + "epoch": 0.8554336389598677, + "flos": 20887334328960.0, + "grad_norm": 1.8593151063845126, + "language_loss": 0.65377557, + "learning_rate": 2.151754018031442e-07, + "loss": 0.67161381, + "num_input_tokens_seen": 306800015, + "step": 14228, + "time_per_iteration": 2.7171261310577393 + }, + { + "auxiliary_loss_clip": 0.01035676, + "auxiliary_loss_mlp": 0.0103039, + "balance_loss_clip": 1.02731705, + "balance_loss_mlp": 1.01938105, + "epoch": 0.8554937622125357, + "flos": 21284721469440.0, + "grad_norm": 3.4961919922733937, + "language_loss": 0.74463284, + "learning_rate": 2.1499970239944542e-07, + "loss": 0.76529348, + "num_input_tokens_seen": 306814160, + "step": 14229, + "time_per_iteration": 4.34982442855835 + }, + { + "auxiliary_loss_clip": 0.01049397, + "auxiliary_loss_mlp": 0.01025441, + "balance_loss_clip": 1.02357531, + "balance_loss_mlp": 1.016029, + "epoch": 0.8555538854652037, + "flos": 22413178120320.0, + "grad_norm": 3.370263185155941, + "language_loss": 0.72995663, + "learning_rate": 2.1482407068313724e-07, + "loss": 0.750705, + "num_input_tokens_seen": 306833310, + "step": 14230, + "time_per_iteration": 2.6088082790374756 + }, + { + "auxiliary_loss_clip": 0.01045428, + "auxiliary_loss_mlp": 0.01026933, + "balance_loss_clip": 1.02235651, + "balance_loss_mlp": 1.01667488, + "epoch": 0.8556140087178716, + "flos": 20193719725440.0, + "grad_norm": 1.6874253124961545, + "language_loss": 0.82469547, + "learning_rate": 2.1464850666087897e-07, + "loss": 0.84541905, + "num_input_tokens_seen": 306851345, + "step": 14231, + "time_per_iteration": 2.599452257156372 + }, + { + "auxiliary_loss_clip": 0.01052576, + "auxiliary_loss_mlp": 0.0103254, + "balance_loss_clip": 1.0256809, + "balance_loss_mlp": 1.02116752, + "epoch": 0.8556741319705397, + "flos": 22638123043200.0, + "grad_norm": 1.9177203019985032, + "language_loss": 0.67656493, + "learning_rate": 2.1447301033932796e-07, + "loss": 0.69741607, + "num_input_tokens_seen": 306871040, + "step": 14232, + "time_per_iteration": 2.702242374420166 + }, + { + "auxiliary_loss_clip": 0.01041526, + "auxiliary_loss_mlp": 0.0102866, + "balance_loss_clip": 1.02407002, + "balance_loss_mlp": 1.01788318, + "epoch": 0.8557342552232076, + "flos": 23549320281600.0, + "grad_norm": 1.6008749826694333, + "language_loss": 0.6726228, + "learning_rate": 2.1429758172513955e-07, + "loss": 0.69332463, + "num_input_tokens_seen": 306891625, + "step": 14233, + "time_per_iteration": 2.6476550102233887 + }, + { + "auxiliary_loss_clip": 0.01050945, + "auxiliary_loss_mlp": 0.01027757, + "balance_loss_clip": 1.02418303, + "balance_loss_mlp": 1.01791596, + "epoch": 0.8557943784758756, + "flos": 19609884063360.0, + "grad_norm": 1.7359316626114036, + "language_loss": 0.76368588, + "learning_rate": 2.1412222082496556e-07, + "loss": 0.78447294, + "num_input_tokens_seen": 306910020, + "step": 14234, + "time_per_iteration": 2.726144313812256 + }, + { + "auxiliary_loss_clip": 0.00977021, + "auxiliary_loss_mlp": 0.01002907, + "balance_loss_clip": 1.00156951, + "balance_loss_mlp": 1.00213802, + "epoch": 0.8558545017285435, + "flos": 70641891446400.0, + "grad_norm": 0.7607778289849596, + "language_loss": 0.58007652, + "learning_rate": 2.1394692764545684e-07, + "loss": 0.59987581, + "num_input_tokens_seen": 306969505, + "step": 14235, + "time_per_iteration": 3.2006313800811768 + }, + { + "auxiliary_loss_clip": 0.00987398, + "auxiliary_loss_mlp": 0.01003481, + "balance_loss_clip": 1.0013144, + "balance_loss_mlp": 1.00267053, + "epoch": 0.8559146249812115, + "flos": 56649983086080.0, + "grad_norm": 0.7847272662620948, + "language_loss": 0.56688857, + "learning_rate": 2.1377170219325858e-07, + "loss": 0.58679736, + "num_input_tokens_seen": 307027710, + "step": 14236, + "time_per_iteration": 3.118037700653076 + }, + { + "auxiliary_loss_clip": 0.0103968, + "auxiliary_loss_mlp": 0.01028306, + "balance_loss_clip": 1.02311933, + "balance_loss_mlp": 1.01808989, + "epoch": 0.8559747482338794, + "flos": 22888240421760.0, + "grad_norm": 1.64602198574813, + "language_loss": 0.70452356, + "learning_rate": 2.1359654447501673e-07, + "loss": 0.72520339, + "num_input_tokens_seen": 307045515, + "step": 14237, + "time_per_iteration": 2.616605043411255 + }, + { + "auxiliary_loss_clip": 0.01039472, + "auxiliary_loss_mlp": 0.01024926, + "balance_loss_clip": 1.02332759, + "balance_loss_mlp": 1.01574087, + "epoch": 0.8560348714865474, + "flos": 22601925112320.0, + "grad_norm": 2.371480627923604, + "language_loss": 0.6383816, + "learning_rate": 2.1342145449737314e-07, + "loss": 0.65902555, + "num_input_tokens_seen": 307064470, + "step": 14238, + "time_per_iteration": 2.6613755226135254 + }, + { + "auxiliary_loss_clip": 0.01057515, + "auxiliary_loss_mlp": 0.01030721, + "balance_loss_clip": 1.02395606, + "balance_loss_mlp": 1.02242434, + "epoch": 0.8560949947392154, + "flos": 17931455297280.0, + "grad_norm": 1.4222750365214232, + "language_loss": 0.69304937, + "learning_rate": 2.1324643226696648e-07, + "loss": 0.71393168, + "num_input_tokens_seen": 307083900, + "step": 14239, + "time_per_iteration": 2.5921099185943604 + }, + { + "auxiliary_loss_clip": 0.01062714, + "auxiliary_loss_mlp": 0.01028335, + "balance_loss_clip": 1.02425075, + "balance_loss_mlp": 1.01804781, + "epoch": 0.8561551179918834, + "flos": 31026208636800.0, + "grad_norm": 1.9749945690309736, + "language_loss": 0.66680121, + "learning_rate": 2.1307147779043455e-07, + "loss": 0.68771172, + "num_input_tokens_seen": 307104590, + "step": 14240, + "time_per_iteration": 2.6273083686828613 + }, + { + "auxiliary_loss_clip": 0.01025935, + "auxiliary_loss_mlp": 0.0102917, + "balance_loss_clip": 1.02215838, + "balance_loss_mlp": 1.0169692, + "epoch": 0.8562152412445513, + "flos": 30665198995200.0, + "grad_norm": 2.227565551459624, + "language_loss": 0.62211013, + "learning_rate": 2.1289659107441182e-07, + "loss": 0.64266115, + "num_input_tokens_seen": 307125580, + "step": 14241, + "time_per_iteration": 2.8837008476257324 + }, + { + "auxiliary_loss_clip": 0.01063731, + "auxiliary_loss_mlp": 0.01033756, + "balance_loss_clip": 1.02436924, + "balance_loss_mlp": 1.02233028, + "epoch": 0.8562753644972193, + "flos": 31576144838400.0, + "grad_norm": 1.4543446634040347, + "language_loss": 0.74402308, + "learning_rate": 2.1272177212552855e-07, + "loss": 0.76499802, + "num_input_tokens_seen": 307147625, + "step": 14242, + "time_per_iteration": 2.6648645401000977 + }, + { + "auxiliary_loss_clip": 0.01007512, + "auxiliary_loss_mlp": 0.01043314, + "balance_loss_clip": 1.02991819, + "balance_loss_mlp": 1.03168535, + "epoch": 0.8563354877498872, + "flos": 26213640618240.0, + "grad_norm": 1.8195703303853328, + "language_loss": 0.76465452, + "learning_rate": 2.1254702095041498e-07, + "loss": 0.78516275, + "num_input_tokens_seen": 307164665, + "step": 14243, + "time_per_iteration": 2.889492988586426 + }, + { + "auxiliary_loss_clip": 0.01042719, + "auxiliary_loss_mlp": 0.00747564, + "balance_loss_clip": 1.02501917, + "balance_loss_mlp": 1.00042367, + "epoch": 0.8563956110025552, + "flos": 24134341092480.0, + "grad_norm": 1.9895213622786279, + "language_loss": 0.6804409, + "learning_rate": 2.123723375556974e-07, + "loss": 0.69834375, + "num_input_tokens_seen": 307182530, + "step": 14244, + "time_per_iteration": 4.4410319328308105 + }, + { + "auxiliary_loss_clip": 0.00997238, + "auxiliary_loss_mlp": 0.01002629, + "balance_loss_clip": 1.00180244, + "balance_loss_mlp": 1.00170505, + "epoch": 0.8564557342552233, + "flos": 56271986311680.0, + "grad_norm": 0.7538564370598868, + "language_loss": 0.58501518, + "learning_rate": 2.1219772194800046e-07, + "loss": 0.60501385, + "num_input_tokens_seen": 307241240, + "step": 14245, + "time_per_iteration": 4.739208459854126 + }, + { + "auxiliary_loss_clip": 0.01053699, + "auxiliary_loss_mlp": 0.01025748, + "balance_loss_clip": 1.02485275, + "balance_loss_mlp": 1.01504898, + "epoch": 0.8565158575078912, + "flos": 23440618748160.0, + "grad_norm": 1.9956073670172625, + "language_loss": 0.77314156, + "learning_rate": 2.1202317413394488e-07, + "loss": 0.79393601, + "num_input_tokens_seen": 307261485, + "step": 14246, + "time_per_iteration": 2.7092151641845703 + }, + { + "auxiliary_loss_clip": 0.01032324, + "auxiliary_loss_mlp": 0.01025224, + "balance_loss_clip": 1.01968217, + "balance_loss_mlp": 1.01488304, + "epoch": 0.8565759807605592, + "flos": 20375930442240.0, + "grad_norm": 2.0219946179533976, + "language_loss": 0.81647032, + "learning_rate": 2.1184869412014938e-07, + "loss": 0.83704579, + "num_input_tokens_seen": 307279160, + "step": 14247, + "time_per_iteration": 2.7444376945495605 + }, + { + "auxiliary_loss_clip": 0.01043435, + "auxiliary_loss_mlp": 0.01028057, + "balance_loss_clip": 1.02543151, + "balance_loss_mlp": 1.01766229, + "epoch": 0.8566361040132271, + "flos": 18807101049600.0, + "grad_norm": 3.4850781658946897, + "language_loss": 0.77686828, + "learning_rate": 2.1167428191323112e-07, + "loss": 0.79758322, + "num_input_tokens_seen": 307297920, + "step": 14248, + "time_per_iteration": 2.643519639968872 + }, + { + "auxiliary_loss_clip": 0.01010234, + "auxiliary_loss_mlp": 0.01032784, + "balance_loss_clip": 1.01914191, + "balance_loss_mlp": 1.02066052, + "epoch": 0.8566962272658951, + "flos": 24535355506560.0, + "grad_norm": 2.7090595786699203, + "language_loss": 0.77835476, + "learning_rate": 2.1149993751980278e-07, + "loss": 0.79878497, + "num_input_tokens_seen": 307318320, + "step": 14249, + "time_per_iteration": 2.7600533962249756 + }, + { + "auxiliary_loss_clip": 0.0103447, + "auxiliary_loss_mlp": 0.01029505, + "balance_loss_clip": 1.02143908, + "balance_loss_mlp": 1.0192709, + "epoch": 0.856756350518563, + "flos": 23178506227200.0, + "grad_norm": 1.673114900290099, + "language_loss": 0.7818774, + "learning_rate": 2.1132566094647597e-07, + "loss": 0.80251712, + "num_input_tokens_seen": 307336720, + "step": 14250, + "time_per_iteration": 2.761035680770874 + }, + { + "auxiliary_loss_clip": 0.01039614, + "auxiliary_loss_mlp": 0.01024473, + "balance_loss_clip": 1.02445281, + "balance_loss_mlp": 1.01519835, + "epoch": 0.856816473771231, + "flos": 20808581760000.0, + "grad_norm": 1.8224000959341105, + "language_loss": 0.79983544, + "learning_rate": 2.1115145219985942e-07, + "loss": 0.82047635, + "num_input_tokens_seen": 307354120, + "step": 14251, + "time_per_iteration": 2.7066891193389893 + }, + { + "auxiliary_loss_clip": 0.01029199, + "auxiliary_loss_mlp": 0.01024463, + "balance_loss_clip": 1.02342927, + "balance_loss_mlp": 1.01480138, + "epoch": 0.856876597023899, + "flos": 20228157889920.0, + "grad_norm": 2.018135883324487, + "language_loss": 0.61659116, + "learning_rate": 2.1097731128656005e-07, + "loss": 0.63712776, + "num_input_tokens_seen": 307373165, + "step": 14252, + "time_per_iteration": 2.69693660736084 + }, + { + "auxiliary_loss_clip": 0.01049411, + "auxiliary_loss_mlp": 0.01027764, + "balance_loss_clip": 1.03039694, + "balance_loss_mlp": 1.01638532, + "epoch": 0.856936720276567, + "flos": 18296128126080.0, + "grad_norm": 1.7660731379915486, + "language_loss": 0.69981515, + "learning_rate": 2.1080323821317924e-07, + "loss": 0.7205869, + "num_input_tokens_seen": 307391000, + "step": 14253, + "time_per_iteration": 2.642371892929077 + }, + { + "auxiliary_loss_clip": 0.00989628, + "auxiliary_loss_mlp": 0.01000648, + "balance_loss_clip": 1.00391603, + "balance_loss_mlp": 0.99964064, + "epoch": 0.8569968435292349, + "flos": 69878394933120.0, + "grad_norm": 0.7907678725049541, + "language_loss": 0.59234536, + "learning_rate": 2.1062923298631907e-07, + "loss": 0.61224812, + "num_input_tokens_seen": 307452865, + "step": 14254, + "time_per_iteration": 3.2811293601989746 + }, + { + "auxiliary_loss_clip": 0.01040163, + "auxiliary_loss_mlp": 0.0102911, + "balance_loss_clip": 1.0234493, + "balance_loss_mlp": 1.01798773, + "epoch": 0.8570569667819029, + "flos": 25848572739840.0, + "grad_norm": 1.6506496408316653, + "language_loss": 0.80661064, + "learning_rate": 2.1045529561257825e-07, + "loss": 0.82730341, + "num_input_tokens_seen": 307471940, + "step": 14255, + "time_per_iteration": 2.644240379333496 + }, + { + "auxiliary_loss_clip": 0.01060215, + "auxiliary_loss_mlp": 0.01024223, + "balance_loss_clip": 1.02485132, + "balance_loss_mlp": 1.01456153, + "epoch": 0.8571170900345708, + "flos": 23257115141760.0, + "grad_norm": 2.3120893715516986, + "language_loss": 0.67513913, + "learning_rate": 2.1028142609855126e-07, + "loss": 0.69598353, + "num_input_tokens_seen": 307488745, + "step": 14256, + "time_per_iteration": 2.7111687660217285 + }, + { + "auxiliary_loss_clip": 0.01052155, + "auxiliary_loss_mlp": 0.01025523, + "balance_loss_clip": 1.02553988, + "balance_loss_mlp": 1.01552773, + "epoch": 0.8571772132872388, + "flos": 18917670090240.0, + "grad_norm": 1.492298774764259, + "language_loss": 0.6965546, + "learning_rate": 2.1010762445083218e-07, + "loss": 0.71733135, + "num_input_tokens_seen": 307506855, + "step": 14257, + "time_per_iteration": 2.6976356506347656 + }, + { + "auxiliary_loss_clip": 0.01023241, + "auxiliary_loss_mlp": 0.01029834, + "balance_loss_clip": 1.02070475, + "balance_loss_mlp": 1.01928973, + "epoch": 0.8572373365399069, + "flos": 33250120318080.0, + "grad_norm": 2.193737979721997, + "language_loss": 0.77135956, + "learning_rate": 2.0993389067601197e-07, + "loss": 0.79189026, + "num_input_tokens_seen": 307526115, + "step": 14258, + "time_per_iteration": 2.7488386631011963 + }, + { + "auxiliary_loss_clip": 0.01049025, + "auxiliary_loss_mlp": 0.0074739, + "balance_loss_clip": 1.02398312, + "balance_loss_mlp": 1.00041008, + "epoch": 0.8572974597925748, + "flos": 23327535755520.0, + "grad_norm": 1.5241149856469085, + "language_loss": 0.67918235, + "learning_rate": 2.0976022478067735e-07, + "loss": 0.69714653, + "num_input_tokens_seen": 307545230, + "step": 14259, + "time_per_iteration": 2.641559600830078 + }, + { + "auxiliary_loss_clip": 0.01045521, + "auxiliary_loss_mlp": 0.01027207, + "balance_loss_clip": 1.02067757, + "balance_loss_mlp": 1.01634097, + "epoch": 0.8573575830452428, + "flos": 24535858296960.0, + "grad_norm": 1.8017580563938045, + "language_loss": 0.77105081, + "learning_rate": 2.0958662677141437e-07, + "loss": 0.79177803, + "num_input_tokens_seen": 307564900, + "step": 14260, + "time_per_iteration": 2.659391403198242 + }, + { + "auxiliary_loss_clip": 0.01034699, + "auxiliary_loss_mlp": 0.0102631, + "balance_loss_clip": 1.02223825, + "balance_loss_mlp": 1.01559329, + "epoch": 0.8574177062979107, + "flos": 24165403378560.0, + "grad_norm": 1.834368818879591, + "language_loss": 0.74412656, + "learning_rate": 2.09413096654806e-07, + "loss": 0.76473665, + "num_input_tokens_seen": 307583500, + "step": 14261, + "time_per_iteration": 2.6815032958984375 + }, + { + "auxiliary_loss_clip": 0.01048286, + "auxiliary_loss_mlp": 0.01030071, + "balance_loss_clip": 1.02647376, + "balance_loss_mlp": 1.01871634, + "epoch": 0.8574778295505787, + "flos": 17930737025280.0, + "grad_norm": 1.6317490083173032, + "language_loss": 0.78624451, + "learning_rate": 2.0923963443743276e-07, + "loss": 0.80702806, + "num_input_tokens_seen": 307601430, + "step": 14262, + "time_per_iteration": 2.5403292179107666 + }, + { + "auxiliary_loss_clip": 0.0102659, + "auxiliary_loss_mlp": 0.01027298, + "balance_loss_clip": 1.02368879, + "balance_loss_mlp": 1.01809525, + "epoch": 0.8575379528032466, + "flos": 21580697537280.0, + "grad_norm": 1.545346163550128, + "language_loss": 0.67726284, + "learning_rate": 2.0906624012587203e-07, + "loss": 0.69780171, + "num_input_tokens_seen": 307621495, + "step": 14263, + "time_per_iteration": 4.532073736190796 + }, + { + "auxiliary_loss_clip": 0.01030683, + "auxiliary_loss_mlp": 0.00747592, + "balance_loss_clip": 1.02395725, + "balance_loss_mlp": 1.00043464, + "epoch": 0.8575980760559146, + "flos": 21761579450880.0, + "grad_norm": 1.3780893126380442, + "language_loss": 0.797764, + "learning_rate": 2.088929137266986e-07, + "loss": 0.81554675, + "num_input_tokens_seen": 307640840, + "step": 14264, + "time_per_iteration": 2.7279419898986816 + }, + { + "auxiliary_loss_clip": 0.01025259, + "auxiliary_loss_mlp": 0.01030144, + "balance_loss_clip": 1.02239156, + "balance_loss_mlp": 1.0193851, + "epoch": 0.8576581993085826, + "flos": 34386442047360.0, + "grad_norm": 1.2996739420290104, + "language_loss": 0.69525504, + "learning_rate": 2.0871965524648582e-07, + "loss": 0.71580911, + "num_input_tokens_seen": 307663820, + "step": 14265, + "time_per_iteration": 2.8041164875030518 + }, + { + "auxiliary_loss_clip": 0.0105829, + "auxiliary_loss_mlp": 0.01021799, + "balance_loss_clip": 1.02410841, + "balance_loss_mlp": 1.01276255, + "epoch": 0.8577183225612506, + "flos": 23222497409280.0, + "grad_norm": 1.729002631913617, + "language_loss": 0.66251302, + "learning_rate": 2.085464646918027e-07, + "loss": 0.68331391, + "num_input_tokens_seen": 307682385, + "step": 14266, + "time_per_iteration": 2.5573384761810303 + }, + { + "auxiliary_loss_clip": 0.01043043, + "auxiliary_loss_mlp": 0.01028507, + "balance_loss_clip": 1.0263592, + "balance_loss_mlp": 1.01862442, + "epoch": 0.8577784458139185, + "flos": 28804164462720.0, + "grad_norm": 1.6102149383166073, + "language_loss": 0.75654322, + "learning_rate": 2.0837334206921731e-07, + "loss": 0.77725875, + "num_input_tokens_seen": 307704680, + "step": 14267, + "time_per_iteration": 2.749216079711914 + }, + { + "auxiliary_loss_clip": 0.01049062, + "auxiliary_loss_mlp": 0.01023302, + "balance_loss_clip": 1.02411962, + "balance_loss_mlp": 1.01387262, + "epoch": 0.8578385690665865, + "flos": 19755573626880.0, + "grad_norm": 1.8552247197444431, + "language_loss": 0.87763345, + "learning_rate": 2.082002873852946e-07, + "loss": 0.89835715, + "num_input_tokens_seen": 307723245, + "step": 14268, + "time_per_iteration": 2.6804299354553223 + }, + { + "auxiliary_loss_clip": 0.0105246, + "auxiliary_loss_mlp": 0.0103025, + "balance_loss_clip": 1.02503347, + "balance_loss_mlp": 1.01980186, + "epoch": 0.8578986923192544, + "flos": 20704082117760.0, + "grad_norm": 1.880714016697568, + "language_loss": 0.73074603, + "learning_rate": 2.0802730064659667e-07, + "loss": 0.75157309, + "num_input_tokens_seen": 307742510, + "step": 14269, + "time_per_iteration": 2.6534738540649414 + }, + { + "auxiliary_loss_clip": 0.01051505, + "auxiliary_loss_mlp": 0.01029023, + "balance_loss_clip": 1.02478266, + "balance_loss_mlp": 1.01878285, + "epoch": 0.8579588155719224, + "flos": 36101715189120.0, + "grad_norm": 1.7082025473767595, + "language_loss": 0.66721803, + "learning_rate": 2.0785438185968252e-07, + "loss": 0.68802333, + "num_input_tokens_seen": 307766030, + "step": 14270, + "time_per_iteration": 2.792271375656128 + }, + { + "auxiliary_loss_clip": 0.01028824, + "auxiliary_loss_mlp": 0.01022855, + "balance_loss_clip": 1.01867771, + "balance_loss_mlp": 1.01284766, + "epoch": 0.8580189388245905, + "flos": 22853479034880.0, + "grad_norm": 1.5446631984903738, + "language_loss": 0.73547864, + "learning_rate": 2.0768153103110997e-07, + "loss": 0.75599545, + "num_input_tokens_seen": 307785800, + "step": 14271, + "time_per_iteration": 2.665673017501831 + }, + { + "auxiliary_loss_clip": 0.00969318, + "auxiliary_loss_mlp": 0.00746542, + "balance_loss_clip": 1.0042187, + "balance_loss_mlp": 1.00060797, + "epoch": 0.8580790620772584, + "flos": 69642104290560.0, + "grad_norm": 0.7876753984058418, + "language_loss": 0.59522903, + "learning_rate": 2.0750874816743358e-07, + "loss": 0.61238766, + "num_input_tokens_seen": 307850995, + "step": 14272, + "time_per_iteration": 3.461603879928589 + }, + { + "auxiliary_loss_clip": 0.01037223, + "auxiliary_loss_mlp": 0.01037723, + "balance_loss_clip": 1.02275193, + "balance_loss_mlp": 1.02554607, + "epoch": 0.8581391853299264, + "flos": 13334243270400.0, + "grad_norm": 1.828285809691846, + "language_loss": 0.7529614, + "learning_rate": 2.0733603327520499e-07, + "loss": 0.77371091, + "num_input_tokens_seen": 307868585, + "step": 14273, + "time_per_iteration": 3.318028211593628 + }, + { + "auxiliary_loss_clip": 0.01050982, + "auxiliary_loss_mlp": 0.0102842, + "balance_loss_clip": 1.02456403, + "balance_loss_mlp": 1.01840019, + "epoch": 0.8581993085825943, + "flos": 19645651031040.0, + "grad_norm": 1.8037343375941004, + "language_loss": 0.82024306, + "learning_rate": 2.0716338636097385e-07, + "loss": 0.84103709, + "num_input_tokens_seen": 307886820, + "step": 14274, + "time_per_iteration": 2.6148524284362793 + }, + { + "auxiliary_loss_clip": 0.00996992, + "auxiliary_loss_mlp": 0.00999956, + "balance_loss_clip": 1.00146127, + "balance_loss_mlp": 0.99911517, + "epoch": 0.8582594318352623, + "flos": 55825077294720.0, + "grad_norm": 0.8006625704009244, + "language_loss": 0.60858226, + "learning_rate": 2.0699080743128672e-07, + "loss": 0.62855172, + "num_input_tokens_seen": 307944020, + "step": 14275, + "time_per_iteration": 3.2444701194763184 + }, + { + "auxiliary_loss_clip": 0.01053587, + "auxiliary_loss_mlp": 0.01024955, + "balance_loss_clip": 1.02607954, + "balance_loss_mlp": 1.01437497, + "epoch": 0.8583195550879302, + "flos": 24279563779200.0, + "grad_norm": 2.149938232933907, + "language_loss": 0.58855546, + "learning_rate": 2.0681829649268768e-07, + "loss": 0.60934085, + "num_input_tokens_seen": 307961055, + "step": 14276, + "time_per_iteration": 2.6522114276885986 + }, + { + "auxiliary_loss_clip": 0.01043208, + "auxiliary_loss_mlp": 0.0103226, + "balance_loss_clip": 1.02558231, + "balance_loss_mlp": 1.02201426, + "epoch": 0.8583796783405983, + "flos": 13444129952640.0, + "grad_norm": 2.010260875186316, + "language_loss": 0.76296365, + "learning_rate": 2.0664585355171838e-07, + "loss": 0.78371835, + "num_input_tokens_seen": 307978690, + "step": 14277, + "time_per_iteration": 4.381303548812866 + }, + { + "auxiliary_loss_clip": 0.01041188, + "auxiliary_loss_mlp": 0.01025373, + "balance_loss_clip": 1.02395511, + "balance_loss_mlp": 1.01475155, + "epoch": 0.8584398015932662, + "flos": 16180271533440.0, + "grad_norm": 1.642442829128693, + "language_loss": 0.83490747, + "learning_rate": 2.0647347861491803e-07, + "loss": 0.85557306, + "num_input_tokens_seen": 307995870, + "step": 14278, + "time_per_iteration": 2.761622905731201 + }, + { + "auxiliary_loss_clip": 0.01045287, + "auxiliary_loss_mlp": 0.01030154, + "balance_loss_clip": 1.02634156, + "balance_loss_mlp": 1.01876318, + "epoch": 0.8584999248459342, + "flos": 17450431338240.0, + "grad_norm": 1.9068181527400294, + "language_loss": 0.74661505, + "learning_rate": 2.0630117168882366e-07, + "loss": 0.76736945, + "num_input_tokens_seen": 308013645, + "step": 14279, + "time_per_iteration": 2.7613370418548584 + }, + { + "auxiliary_loss_clip": 0.01062554, + "auxiliary_loss_mlp": 0.01028575, + "balance_loss_clip": 1.02621329, + "balance_loss_mlp": 1.0184896, + "epoch": 0.8585600480986021, + "flos": 23441013797760.0, + "grad_norm": 3.6506201196511796, + "language_loss": 0.66112232, + "learning_rate": 2.0612893277996845e-07, + "loss": 0.68203366, + "num_input_tokens_seen": 308032490, + "step": 14280, + "time_per_iteration": 2.606318235397339 + }, + { + "auxiliary_loss_clip": 0.01049592, + "auxiliary_loss_mlp": 0.01024454, + "balance_loss_clip": 1.0237807, + "balance_loss_mlp": 1.01496458, + "epoch": 0.8586201713512701, + "flos": 19937927998080.0, + "grad_norm": 2.0040295524654868, + "language_loss": 0.6239028, + "learning_rate": 2.0595676189488343e-07, + "loss": 0.64464319, + "num_input_tokens_seen": 308052110, + "step": 14281, + "time_per_iteration": 2.6848087310791016 + }, + { + "auxiliary_loss_clip": 0.01043566, + "auxiliary_loss_mlp": 0.007477, + "balance_loss_clip": 1.02595639, + "balance_loss_mlp": 1.00045085, + "epoch": 0.858680294603938, + "flos": 15304769435520.0, + "grad_norm": 1.779208471854324, + "language_loss": 0.73100996, + "learning_rate": 2.0578465904009845e-07, + "loss": 0.74892265, + "num_input_tokens_seen": 308070660, + "step": 14282, + "time_per_iteration": 2.874408483505249 + }, + { + "auxiliary_loss_clip": 0.01031779, + "auxiliary_loss_mlp": 0.01021905, + "balance_loss_clip": 1.02084017, + "balance_loss_mlp": 1.01242161, + "epoch": 0.858740417856606, + "flos": 22711237176960.0, + "grad_norm": 1.621180080223055, + "language_loss": 0.75575614, + "learning_rate": 2.0561262422213832e-07, + "loss": 0.77629304, + "num_input_tokens_seen": 308089520, + "step": 14283, + "time_per_iteration": 2.6946804523468018 + }, + { + "auxiliary_loss_clip": 0.01044341, + "auxiliary_loss_mlp": 0.01027046, + "balance_loss_clip": 1.02184165, + "balance_loss_mlp": 1.01685989, + "epoch": 0.8588005411092741, + "flos": 34054303962240.0, + "grad_norm": 2.1844829808104773, + "language_loss": 0.59826601, + "learning_rate": 2.0544065744752736e-07, + "loss": 0.61897987, + "num_input_tokens_seen": 308111545, + "step": 14284, + "time_per_iteration": 2.844724178314209 + }, + { + "auxiliary_loss_clip": 0.01040738, + "auxiliary_loss_mlp": 0.01028219, + "balance_loss_clip": 1.02548134, + "balance_loss_mlp": 1.0184145, + "epoch": 0.858860664361942, + "flos": 28913584268160.0, + "grad_norm": 1.7789159977797957, + "language_loss": 0.75753617, + "learning_rate": 2.0526875872278749e-07, + "loss": 0.77822572, + "num_input_tokens_seen": 308129690, + "step": 14285, + "time_per_iteration": 2.731679677963257 + }, + { + "auxiliary_loss_clip": 0.01057229, + "auxiliary_loss_mlp": 0.01028595, + "balance_loss_clip": 1.02964449, + "balance_loss_mlp": 1.01747251, + "epoch": 0.85892078761461, + "flos": 19792525743360.0, + "grad_norm": 2.107677288393164, + "language_loss": 0.74445033, + "learning_rate": 2.0509692805443524e-07, + "loss": 0.7653085, + "num_input_tokens_seen": 308147410, + "step": 14286, + "time_per_iteration": 2.8117141723632812 + }, + { + "auxiliary_loss_clip": 0.00987009, + "auxiliary_loss_mlp": 0.00746531, + "balance_loss_clip": 1.0020442, + "balance_loss_mlp": 1.0004853, + "epoch": 0.8589809108672779, + "flos": 67106630039040.0, + "grad_norm": 1.2326786173306394, + "language_loss": 0.49490213, + "learning_rate": 2.0492516544898718e-07, + "loss": 0.51223755, + "num_input_tokens_seen": 308204875, + "step": 14287, + "time_per_iteration": 3.2084238529205322 + }, + { + "auxiliary_loss_clip": 0.0105389, + "auxiliary_loss_mlp": 0.01026915, + "balance_loss_clip": 1.02591431, + "balance_loss_mlp": 1.01667511, + "epoch": 0.8590410341199459, + "flos": 29716259541120.0, + "grad_norm": 1.8051225228622894, + "language_loss": 0.78994596, + "learning_rate": 2.0475347091295704e-07, + "loss": 0.81075406, + "num_input_tokens_seen": 308225690, + "step": 14288, + "time_per_iteration": 2.669896364212036 + }, + { + "auxiliary_loss_clip": 0.01035869, + "auxiliary_loss_mlp": 0.01029915, + "balance_loss_clip": 1.02719283, + "balance_loss_mlp": 1.01892376, + "epoch": 0.8591011573726138, + "flos": 23987430466560.0, + "grad_norm": 2.4837042573101598, + "language_loss": 0.80890489, + "learning_rate": 2.045818444528553e-07, + "loss": 0.82956278, + "num_input_tokens_seen": 308245255, + "step": 14289, + "time_per_iteration": 2.744187593460083 + }, + { + "auxiliary_loss_clip": 0.01050793, + "auxiliary_loss_mlp": 0.01026824, + "balance_loss_clip": 1.02526212, + "balance_loss_mlp": 1.01676893, + "epoch": 0.8591612806252819, + "flos": 14428656806400.0, + "grad_norm": 2.192810555840397, + "language_loss": 0.6516484, + "learning_rate": 2.0441028607518973e-07, + "loss": 0.67242455, + "num_input_tokens_seen": 308261755, + "step": 14290, + "time_per_iteration": 2.7606587409973145 + }, + { + "auxiliary_loss_clip": 0.01042404, + "auxiliary_loss_mlp": 0.0102681, + "balance_loss_clip": 1.0250268, + "balance_loss_mlp": 1.01617026, + "epoch": 0.8592214038779498, + "flos": 31577150419200.0, + "grad_norm": 1.9253269647399196, + "language_loss": 0.555318, + "learning_rate": 2.0423879578646642e-07, + "loss": 0.57601017, + "num_input_tokens_seen": 308285145, + "step": 14291, + "time_per_iteration": 2.759392738342285 + }, + { + "auxiliary_loss_clip": 0.01053275, + "auxiliary_loss_mlp": 0.0102898, + "balance_loss_clip": 1.02495956, + "balance_loss_mlp": 1.01863897, + "epoch": 0.8592815271306178, + "flos": 17457290835840.0, + "grad_norm": 2.8588911281326688, + "language_loss": 0.71292138, + "learning_rate": 2.0406737359318792e-07, + "loss": 0.73374391, + "num_input_tokens_seen": 308304130, + "step": 14292, + "time_per_iteration": 5.989933252334595 + }, + { + "auxiliary_loss_clip": 0.01052178, + "auxiliary_loss_mlp": 0.01026832, + "balance_loss_clip": 1.02441621, + "balance_loss_mlp": 1.01641905, + "epoch": 0.8593416503832857, + "flos": 25411360394880.0, + "grad_norm": 1.6811921142440758, + "language_loss": 0.71304834, + "learning_rate": 2.038960195018542e-07, + "loss": 0.7338385, + "num_input_tokens_seen": 308324670, + "step": 14293, + "time_per_iteration": 2.7913146018981934 + }, + { + "auxiliary_loss_clip": 0.01042062, + "auxiliary_loss_mlp": 0.01029826, + "balance_loss_clip": 1.02582037, + "balance_loss_mlp": 1.01943088, + "epoch": 0.8594017736359537, + "flos": 20996646393600.0, + "grad_norm": 1.4616137176741926, + "language_loss": 0.68949485, + "learning_rate": 2.0372473351896358e-07, + "loss": 0.71021372, + "num_input_tokens_seen": 308344215, + "step": 14294, + "time_per_iteration": 2.6993885040283203 + }, + { + "auxiliary_loss_clip": 0.01058067, + "auxiliary_loss_mlp": 0.01032525, + "balance_loss_clip": 1.02295184, + "balance_loss_mlp": 1.02263105, + "epoch": 0.8594618968886216, + "flos": 22091059929600.0, + "grad_norm": 2.0375223229502533, + "language_loss": 0.78182447, + "learning_rate": 2.0355351565101087e-07, + "loss": 0.80273044, + "num_input_tokens_seen": 308360520, + "step": 14295, + "time_per_iteration": 2.68434739112854 + }, + { + "auxiliary_loss_clip": 0.01036658, + "auxiliary_loss_mlp": 0.01036536, + "balance_loss_clip": 1.02311039, + "balance_loss_mlp": 1.024019, + "epoch": 0.8595220201412896, + "flos": 11656245467520.0, + "grad_norm": 5.02836092601782, + "language_loss": 0.69752067, + "learning_rate": 2.0338236590448975e-07, + "loss": 0.7182526, + "num_input_tokens_seen": 308376865, + "step": 14296, + "time_per_iteration": 2.731557607650757 + }, + { + "auxiliary_loss_clip": 0.0103249, + "auxiliary_loss_mlp": 0.01030157, + "balance_loss_clip": 1.02225113, + "balance_loss_mlp": 1.01911223, + "epoch": 0.8595821433939577, + "flos": 25040366772480.0, + "grad_norm": 2.2084988284160887, + "language_loss": 0.79322624, + "learning_rate": 2.0321128428588842e-07, + "loss": 0.81385273, + "num_input_tokens_seen": 308395870, + "step": 14297, + "time_per_iteration": 2.7683956623077393 + }, + { + "auxiliary_loss_clip": 0.0104377, + "auxiliary_loss_mlp": 0.01024829, + "balance_loss_clip": 1.02130437, + "balance_loss_mlp": 1.01572728, + "epoch": 0.8596422666466256, + "flos": 28511528359680.0, + "grad_norm": 1.8237427889254698, + "language_loss": 0.68015981, + "learning_rate": 2.030402708016954e-07, + "loss": 0.70084578, + "num_input_tokens_seen": 308417250, + "step": 14298, + "time_per_iteration": 2.7673299312591553 + }, + { + "auxiliary_loss_clip": 0.01033032, + "auxiliary_loss_mlp": 0.01031811, + "balance_loss_clip": 1.021595, + "balance_loss_mlp": 1.02185082, + "epoch": 0.8597023898992936, + "flos": 13589137157760.0, + "grad_norm": 6.356594159390487, + "language_loss": 0.68483615, + "learning_rate": 2.0286932545839576e-07, + "loss": 0.70548457, + "num_input_tokens_seen": 308434565, + "step": 14299, + "time_per_iteration": 2.7811028957366943 + }, + { + "auxiliary_loss_clip": 0.0103516, + "auxiliary_loss_mlp": 0.01035354, + "balance_loss_clip": 1.02652955, + "balance_loss_mlp": 1.02494144, + "epoch": 0.8597625131519615, + "flos": 32300821728000.0, + "grad_norm": 2.8415788818359498, + "language_loss": 0.7132296, + "learning_rate": 2.0269844826247096e-07, + "loss": 0.73393476, + "num_input_tokens_seen": 308450040, + "step": 14300, + "time_per_iteration": 2.806370258331299 + }, + { + "auxiliary_loss_clip": 0.01034522, + "auxiliary_loss_mlp": 0.0103363, + "balance_loss_clip": 1.02205253, + "balance_loss_mlp": 1.0225141, + "epoch": 0.8598226364046295, + "flos": 28730367970560.0, + "grad_norm": 1.5774472568526285, + "language_loss": 0.69474697, + "learning_rate": 2.0252763922040116e-07, + "loss": 0.71542853, + "num_input_tokens_seen": 308470545, + "step": 14301, + "time_per_iteration": 2.7284040451049805 + }, + { + "auxiliary_loss_clip": 0.01016012, + "auxiliary_loss_mlp": 0.01028808, + "balance_loss_clip": 1.02450359, + "balance_loss_mlp": 1.01844919, + "epoch": 0.8598827596572974, + "flos": 21871825269120.0, + "grad_norm": 2.5324903734236397, + "language_loss": 0.74258184, + "learning_rate": 2.023568983386641e-07, + "loss": 0.76303005, + "num_input_tokens_seen": 308490020, + "step": 14302, + "time_per_iteration": 2.844219446182251 + }, + { + "auxiliary_loss_clip": 0.01047396, + "auxiliary_loss_mlp": 0.01029439, + "balance_loss_clip": 1.02296185, + "balance_loss_mlp": 1.0200994, + "epoch": 0.8599428829099655, + "flos": 23767297966080.0, + "grad_norm": 1.7082915556000957, + "language_loss": 0.835783, + "learning_rate": 2.02186225623733e-07, + "loss": 0.85655135, + "num_input_tokens_seen": 308509065, + "step": 14303, + "time_per_iteration": 2.6614773273468018 + }, + { + "auxiliary_loss_clip": 0.01047294, + "auxiliary_loss_mlp": 0.01032067, + "balance_loss_clip": 1.02343643, + "balance_loss_mlp": 1.02078366, + "epoch": 0.8600030061626334, + "flos": 16212770363520.0, + "grad_norm": 2.0025961681801845, + "language_loss": 0.77332664, + "learning_rate": 2.0201562108208025e-07, + "loss": 0.79412025, + "num_input_tokens_seen": 308524725, + "step": 14304, + "time_per_iteration": 2.640737295150757 + }, + { + "auxiliary_loss_clip": 0.01063596, + "auxiliary_loss_mlp": 0.0103167, + "balance_loss_clip": 1.02581728, + "balance_loss_mlp": 1.02053595, + "epoch": 0.8600631294153014, + "flos": 15669370437120.0, + "grad_norm": 1.9713336396662582, + "language_loss": 0.53502315, + "learning_rate": 2.0184508472017537e-07, + "loss": 0.55597579, + "num_input_tokens_seen": 308543525, + "step": 14305, + "time_per_iteration": 2.7511556148529053 + }, + { + "auxiliary_loss_clip": 0.01059766, + "auxiliary_loss_mlp": 0.01025837, + "balance_loss_clip": 1.02401662, + "balance_loss_mlp": 1.01509023, + "epoch": 0.8601232526679693, + "flos": 17493093717120.0, + "grad_norm": 2.6131765758433065, + "language_loss": 0.83832693, + "learning_rate": 2.0167461654448558e-07, + "loss": 0.85918289, + "num_input_tokens_seen": 308557995, + "step": 14306, + "time_per_iteration": 2.7335751056671143 + }, + { + "auxiliary_loss_clip": 0.01049505, + "auxiliary_loss_mlp": 0.0074759, + "balance_loss_clip": 1.02425015, + "balance_loss_mlp": 1.00040042, + "epoch": 0.8601833759206373, + "flos": 26985935963520.0, + "grad_norm": 1.2852488387807857, + "language_loss": 0.71297491, + "learning_rate": 2.01504216561474e-07, + "loss": 0.73094589, + "num_input_tokens_seen": 308582750, + "step": 14307, + "time_per_iteration": 2.758772611618042 + }, + { + "auxiliary_loss_clip": 0.01048835, + "auxiliary_loss_mlp": 0.00747821, + "balance_loss_clip": 1.02353215, + "balance_loss_mlp": 1.00045776, + "epoch": 0.8602434991733052, + "flos": 25229760209280.0, + "grad_norm": 2.2676596889577247, + "language_loss": 0.63696796, + "learning_rate": 2.0133388477760316e-07, + "loss": 0.65493453, + "num_input_tokens_seen": 308603770, + "step": 14308, + "time_per_iteration": 2.717501401901245 + }, + { + "auxiliary_loss_clip": 0.00988044, + "auxiliary_loss_mlp": 0.01001433, + "balance_loss_clip": 1.00227296, + "balance_loss_mlp": 1.00056863, + "epoch": 0.8603036224259732, + "flos": 71015363107200.0, + "grad_norm": 0.6250404769649682, + "language_loss": 0.4844383, + "learning_rate": 2.0116362119933172e-07, + "loss": 0.50433308, + "num_input_tokens_seen": 308667735, + "step": 14309, + "time_per_iteration": 3.335200548171997 + }, + { + "auxiliary_loss_clip": 0.01013055, + "auxiliary_loss_mlp": 0.01034777, + "balance_loss_clip": 1.02403831, + "balance_loss_mlp": 1.02355981, + "epoch": 0.8603637456786413, + "flos": 20300625578880.0, + "grad_norm": 1.744748545122878, + "language_loss": 0.6703673, + "learning_rate": 2.0099342583311563e-07, + "loss": 0.69084567, + "num_input_tokens_seen": 308686300, + "step": 14310, + "time_per_iteration": 4.885660886764526 + }, + { + "auxiliary_loss_clip": 0.00995644, + "auxiliary_loss_mlp": 0.01029904, + "balance_loss_clip": 1.02005386, + "balance_loss_mlp": 1.01948524, + "epoch": 0.8604238689313092, + "flos": 21835842819840.0, + "grad_norm": 1.6947065967093884, + "language_loss": 0.78274608, + "learning_rate": 2.0082329868540905e-07, + "loss": 0.80300152, + "num_input_tokens_seen": 308705825, + "step": 14311, + "time_per_iteration": 3.210631847381592 + }, + { + "auxiliary_loss_clip": 0.01050583, + "auxiliary_loss_mlp": 0.01027978, + "balance_loss_clip": 1.02413869, + "balance_loss_mlp": 1.01797616, + "epoch": 0.8604839921839772, + "flos": 18004210295040.0, + "grad_norm": 2.0365806177772416, + "language_loss": 0.71764183, + "learning_rate": 2.006532397626639e-07, + "loss": 0.7384274, + "num_input_tokens_seen": 308723340, + "step": 14312, + "time_per_iteration": 2.749987840652466 + }, + { + "auxiliary_loss_clip": 0.01031763, + "auxiliary_loss_mlp": 0.01028475, + "balance_loss_clip": 1.02152634, + "balance_loss_mlp": 1.01801395, + "epoch": 0.8605441154366451, + "flos": 16252164604800.0, + "grad_norm": 1.8960807812884088, + "language_loss": 0.77175575, + "learning_rate": 2.0048324907132797e-07, + "loss": 0.79235816, + "num_input_tokens_seen": 308741280, + "step": 14313, + "time_per_iteration": 2.5955288410186768 + }, + { + "auxiliary_loss_clip": 0.01033903, + "auxiliary_loss_mlp": 0.01029217, + "balance_loss_clip": 1.02225089, + "balance_loss_mlp": 1.01787412, + "epoch": 0.8606042386893131, + "flos": 32267065921920.0, + "grad_norm": 1.607536963634083, + "language_loss": 0.72877717, + "learning_rate": 2.003133266178474e-07, + "loss": 0.74940836, + "num_input_tokens_seen": 308762875, + "step": 14314, + "time_per_iteration": 2.7189440727233887 + }, + { + "auxiliary_loss_clip": 0.01038612, + "auxiliary_loss_mlp": 0.01029946, + "balance_loss_clip": 1.02234006, + "balance_loss_mlp": 1.01974785, + "epoch": 0.860664361941981, + "flos": 20229774001920.0, + "grad_norm": 2.0277092290432055, + "language_loss": 0.69431233, + "learning_rate": 2.001434724086657e-07, + "loss": 0.71499789, + "num_input_tokens_seen": 308780315, + "step": 14315, + "time_per_iteration": 2.709923505783081 + }, + { + "auxiliary_loss_clip": 0.01049914, + "auxiliary_loss_mlp": 0.01032016, + "balance_loss_clip": 1.02420998, + "balance_loss_mlp": 1.0222888, + "epoch": 0.8607244851946491, + "flos": 25191622944000.0, + "grad_norm": 1.7079614916332868, + "language_loss": 0.72227591, + "learning_rate": 1.9997368645022418e-07, + "loss": 0.74309528, + "num_input_tokens_seen": 308799435, + "step": 14316, + "time_per_iteration": 2.6098575592041016 + }, + { + "auxiliary_loss_clip": 0.01043511, + "auxiliary_loss_mlp": 0.01026188, + "balance_loss_clip": 1.0266726, + "balance_loss_mlp": 1.01615667, + "epoch": 0.860784608447317, + "flos": 20482082110080.0, + "grad_norm": 1.6532083154440713, + "language_loss": 0.82826626, + "learning_rate": 1.9980396874896056e-07, + "loss": 0.84896326, + "num_input_tokens_seen": 308817730, + "step": 14317, + "time_per_iteration": 2.7259762287139893 + }, + { + "auxiliary_loss_clip": 0.01041168, + "auxiliary_loss_mlp": 0.0102961, + "balance_loss_clip": 1.02418721, + "balance_loss_mlp": 1.01919079, + "epoch": 0.860844731699985, + "flos": 50476037696640.0, + "grad_norm": 1.5663818524257134, + "language_loss": 0.66731739, + "learning_rate": 1.996343193113108e-07, + "loss": 0.68802518, + "num_input_tokens_seen": 308841735, + "step": 14318, + "time_per_iteration": 2.9232428073883057 + }, + { + "auxiliary_loss_clip": 0.01048895, + "auxiliary_loss_mlp": 0.01023539, + "balance_loss_clip": 1.02429342, + "balance_loss_mlp": 1.01441348, + "epoch": 0.8609048549526529, + "flos": 41172768455040.0, + "grad_norm": 1.5279275734655036, + "language_loss": 0.71751392, + "learning_rate": 1.9946473814370911e-07, + "loss": 0.73823828, + "num_input_tokens_seen": 308865050, + "step": 14319, + "time_per_iteration": 2.800924777984619 + }, + { + "auxiliary_loss_clip": 0.01042562, + "auxiliary_loss_mlp": 0.00747664, + "balance_loss_clip": 1.02681553, + "balance_loss_mlp": 1.00039959, + "epoch": 0.8609649782053209, + "flos": 23951196622080.0, + "grad_norm": 1.9655034259768962, + "language_loss": 0.6704582, + "learning_rate": 1.992952252525839e-07, + "loss": 0.68836045, + "num_input_tokens_seen": 308885375, + "step": 14320, + "time_per_iteration": 2.69132137298584 + }, + { + "auxiliary_loss_clip": 0.01037578, + "auxiliary_loss_mlp": 0.01033237, + "balance_loss_clip": 1.023031, + "balance_loss_mlp": 1.02108932, + "epoch": 0.8610251014579888, + "flos": 23112574813440.0, + "grad_norm": 2.506581630815103, + "language_loss": 0.79999685, + "learning_rate": 1.9912578064436446e-07, + "loss": 0.82070494, + "num_input_tokens_seen": 308904700, + "step": 14321, + "time_per_iteration": 2.699263095855713 + }, + { + "auxiliary_loss_clip": 0.01044823, + "auxiliary_loss_mlp": 0.00747605, + "balance_loss_clip": 1.02217412, + "balance_loss_mlp": 1.00032699, + "epoch": 0.8610852247106568, + "flos": 19426811420160.0, + "grad_norm": 1.7548137318351182, + "language_loss": 0.70908087, + "learning_rate": 1.9895640432547567e-07, + "loss": 0.72700512, + "num_input_tokens_seen": 308922985, + "step": 14322, + "time_per_iteration": 2.83980655670166 + }, + { + "auxiliary_loss_clip": 0.01038718, + "auxiliary_loss_mlp": 0.01033293, + "balance_loss_clip": 1.02301788, + "balance_loss_mlp": 1.02157474, + "epoch": 0.8611453479633249, + "flos": 19312076401920.0, + "grad_norm": 3.0160059299885758, + "language_loss": 0.55946594, + "learning_rate": 1.9878709630234102e-07, + "loss": 0.58018607, + "num_input_tokens_seen": 308940765, + "step": 14323, + "time_per_iteration": 2.6454827785491943 + }, + { + "auxiliary_loss_clip": 0.01028622, + "auxiliary_loss_mlp": 0.01024509, + "balance_loss_clip": 1.02312279, + "balance_loss_mlp": 1.01451945, + "epoch": 0.8612054712159928, + "flos": 23253667436160.0, + "grad_norm": 1.6007580579929905, + "language_loss": 0.75617468, + "learning_rate": 1.986178565813801e-07, + "loss": 0.77670598, + "num_input_tokens_seen": 308960110, + "step": 14324, + "time_per_iteration": 4.464059114456177 + }, + { + "auxiliary_loss_clip": 0.01016404, + "auxiliary_loss_mlp": 0.01032694, + "balance_loss_clip": 1.02325964, + "balance_loss_mlp": 1.02005219, + "epoch": 0.8612655944686608, + "flos": 16028440744320.0, + "grad_norm": 1.9369163292818892, + "language_loss": 0.66437256, + "learning_rate": 1.9844868516901036e-07, + "loss": 0.68486345, + "num_input_tokens_seen": 308976665, + "step": 14325, + "time_per_iteration": 2.746931314468384 + }, + { + "auxiliary_loss_clip": 0.01054728, + "auxiliary_loss_mlp": 0.01030606, + "balance_loss_clip": 1.02695346, + "balance_loss_mlp": 1.02002025, + "epoch": 0.8613257177213287, + "flos": 22492720788480.0, + "grad_norm": 1.8126659905467726, + "language_loss": 0.64787126, + "learning_rate": 1.982795820716472e-07, + "loss": 0.66872466, + "num_input_tokens_seen": 308997015, + "step": 14326, + "time_per_iteration": 2.7265267372131348 + }, + { + "auxiliary_loss_clip": 0.01033085, + "auxiliary_loss_mlp": 0.01029266, + "balance_loss_clip": 1.02021337, + "balance_loss_mlp": 1.01841831, + "epoch": 0.8613858409739967, + "flos": 17238056175360.0, + "grad_norm": 1.850594514375382, + "language_loss": 0.84297442, + "learning_rate": 1.9811054729570253e-07, + "loss": 0.86359799, + "num_input_tokens_seen": 309015250, + "step": 14327, + "time_per_iteration": 2.645836591720581 + }, + { + "auxiliary_loss_clip": 0.01051425, + "auxiliary_loss_mlp": 0.01030125, + "balance_loss_clip": 1.02400708, + "balance_loss_mlp": 1.01956344, + "epoch": 0.8614459642266646, + "flos": 22821123859200.0, + "grad_norm": 1.9078966677955964, + "language_loss": 0.74935699, + "learning_rate": 1.9794158084758661e-07, + "loss": 0.77017248, + "num_input_tokens_seen": 309034140, + "step": 14328, + "time_per_iteration": 2.6239025592803955 + }, + { + "auxiliary_loss_clip": 0.01050859, + "auxiliary_loss_mlp": 0.01023894, + "balance_loss_clip": 1.02406573, + "balance_loss_mlp": 1.01376724, + "epoch": 0.8615060874793327, + "flos": 26504301473280.0, + "grad_norm": 2.051973292889068, + "language_loss": 0.80148846, + "learning_rate": 1.9777268273370673e-07, + "loss": 0.822236, + "num_input_tokens_seen": 309055075, + "step": 14329, + "time_per_iteration": 2.6214733123779297 + }, + { + "auxiliary_loss_clip": 0.01043553, + "auxiliary_loss_mlp": 0.01026942, + "balance_loss_clip": 1.02638459, + "balance_loss_mlp": 1.01645136, + "epoch": 0.8615662107320006, + "flos": 24061011477120.0, + "grad_norm": 2.2029183152837595, + "language_loss": 0.76861656, + "learning_rate": 1.9760385296046757e-07, + "loss": 0.78932142, + "num_input_tokens_seen": 309074650, + "step": 14330, + "time_per_iteration": 2.6953186988830566 + }, + { + "auxiliary_loss_clip": 0.01051489, + "auxiliary_loss_mlp": 0.01024624, + "balance_loss_clip": 1.02462399, + "balance_loss_mlp": 1.01470017, + "epoch": 0.8616263339846686, + "flos": 24165044242560.0, + "grad_norm": 2.033340619736946, + "language_loss": 0.64730483, + "learning_rate": 1.974350915342702e-07, + "loss": 0.66806591, + "num_input_tokens_seen": 309094385, + "step": 14331, + "time_per_iteration": 2.628356695175171 + }, + { + "auxiliary_loss_clip": 0.01043101, + "auxiliary_loss_mlp": 0.0103106, + "balance_loss_clip": 1.02672064, + "balance_loss_mlp": 1.0213083, + "epoch": 0.8616864572373365, + "flos": 21724340025600.0, + "grad_norm": 1.6259715137519972, + "language_loss": 0.76004463, + "learning_rate": 1.9726639846151506e-07, + "loss": 0.78078628, + "num_input_tokens_seen": 309111815, + "step": 14332, + "time_per_iteration": 2.6276369094848633 + }, + { + "auxiliary_loss_clip": 0.01054651, + "auxiliary_loss_mlp": 0.01028118, + "balance_loss_clip": 1.02560914, + "balance_loss_mlp": 1.01673388, + "epoch": 0.8617465804900045, + "flos": 23766651521280.0, + "grad_norm": 1.610522637378325, + "language_loss": 0.66816527, + "learning_rate": 1.9709777374859904e-07, + "loss": 0.68899298, + "num_input_tokens_seen": 309131385, + "step": 14333, + "time_per_iteration": 2.672131299972534 + }, + { + "auxiliary_loss_clip": 0.0104192, + "auxiliary_loss_mlp": 0.01032622, + "balance_loss_clip": 1.02522504, + "balance_loss_mlp": 1.02003932, + "epoch": 0.8618067037426724, + "flos": 37703941251840.0, + "grad_norm": 1.699193359714115, + "language_loss": 0.61895949, + "learning_rate": 1.969292174019157e-07, + "loss": 0.63970488, + "num_input_tokens_seen": 309155020, + "step": 14334, + "time_per_iteration": 2.7522261142730713 + }, + { + "auxiliary_loss_clip": 0.01027962, + "auxiliary_loss_mlp": 0.01042964, + "balance_loss_clip": 1.02439642, + "balance_loss_mlp": 1.03088248, + "epoch": 0.8618668269953405, + "flos": 21471026336640.0, + "grad_norm": 1.905975967326966, + "language_loss": 0.69108164, + "learning_rate": 1.967607294278577e-07, + "loss": 0.71179092, + "num_input_tokens_seen": 309172865, + "step": 14335, + "time_per_iteration": 2.676199436187744 + }, + { + "auxiliary_loss_clip": 0.01053953, + "auxiliary_loss_mlp": 0.01029819, + "balance_loss_clip": 1.02594244, + "balance_loss_mlp": 1.01949584, + "epoch": 0.8619269502480085, + "flos": 22232691256320.0, + "grad_norm": 1.4922329071918485, + "language_loss": 0.82720792, + "learning_rate": 1.965923098328135e-07, + "loss": 0.84804571, + "num_input_tokens_seen": 309193575, + "step": 14336, + "time_per_iteration": 2.580556869506836 + }, + { + "auxiliary_loss_clip": 0.01064257, + "auxiliary_loss_mlp": 0.01030365, + "balance_loss_clip": 1.02550447, + "balance_loss_mlp": 1.01941562, + "epoch": 0.8619870735006764, + "flos": 22710626645760.0, + "grad_norm": 1.710440237292466, + "language_loss": 0.67433077, + "learning_rate": 1.9642395862316907e-07, + "loss": 0.69527698, + "num_input_tokens_seen": 309212680, + "step": 14337, + "time_per_iteration": 2.618710517883301 + }, + { + "auxiliary_loss_clip": 0.0101846, + "auxiliary_loss_mlp": 0.01031495, + "balance_loss_clip": 1.01983833, + "balance_loss_mlp": 1.02086186, + "epoch": 0.8620471967533444, + "flos": 37520293991040.0, + "grad_norm": 1.4386298217128974, + "language_loss": 0.67176795, + "learning_rate": 1.962556758053089e-07, + "loss": 0.69226754, + "num_input_tokens_seen": 309234485, + "step": 14338, + "time_per_iteration": 2.832008123397827 + }, + { + "auxiliary_loss_clip": 0.01043449, + "auxiliary_loss_mlp": 0.01030353, + "balance_loss_clip": 1.02629089, + "balance_loss_mlp": 1.02035761, + "epoch": 0.8621073200060123, + "flos": 19682459493120.0, + "grad_norm": 2.05478653233147, + "language_loss": 0.61643857, + "learning_rate": 1.9608746138561448e-07, + "loss": 0.63717663, + "num_input_tokens_seen": 309253630, + "step": 14339, + "time_per_iteration": 5.913698434829712 + }, + { + "auxiliary_loss_clip": 0.0104221, + "auxiliary_loss_mlp": 0.00747635, + "balance_loss_clip": 1.02451658, + "balance_loss_mlp": 1.00045228, + "epoch": 0.8621674432586803, + "flos": 14536855549440.0, + "grad_norm": 1.8354138549544632, + "language_loss": 0.6261096, + "learning_rate": 1.9591931537046458e-07, + "loss": 0.64400804, + "num_input_tokens_seen": 309270950, + "step": 14340, + "time_per_iteration": 2.6448183059692383 + }, + { + "auxiliary_loss_clip": 0.01017347, + "auxiliary_loss_mlp": 0.01021925, + "balance_loss_clip": 1.02375793, + "balance_loss_mlp": 1.01242399, + "epoch": 0.8622275665113482, + "flos": 20740100480640.0, + "grad_norm": 1.6819244603259305, + "language_loss": 0.80241394, + "learning_rate": 1.9575123776623493e-07, + "loss": 0.82280666, + "num_input_tokens_seen": 309288780, + "step": 14341, + "time_per_iteration": 2.73471736907959 + }, + { + "auxiliary_loss_clip": 0.01051891, + "auxiliary_loss_mlp": 0.01028317, + "balance_loss_clip": 1.02544236, + "balance_loss_mlp": 1.01875031, + "epoch": 0.8622876897640163, + "flos": 24715914197760.0, + "grad_norm": 6.869293062862502, + "language_loss": 0.7478618, + "learning_rate": 1.9558322857929887e-07, + "loss": 0.76866388, + "num_input_tokens_seen": 309310875, + "step": 14342, + "time_per_iteration": 2.7066071033477783 + }, + { + "auxiliary_loss_clip": 0.01023919, + "auxiliary_loss_mlp": 0.0102583, + "balance_loss_clip": 1.02379513, + "balance_loss_mlp": 1.01458895, + "epoch": 0.8623478130166842, + "flos": 17457362663040.0, + "grad_norm": 1.694393997326262, + "language_loss": 0.68283021, + "learning_rate": 1.95415287816028e-07, + "loss": 0.70332772, + "num_input_tokens_seen": 309329900, + "step": 14343, + "time_per_iteration": 2.7031877040863037 + }, + { + "auxiliary_loss_clip": 0.01051034, + "auxiliary_loss_mlp": 0.01036599, + "balance_loss_clip": 1.02325201, + "balance_loss_mlp": 1.02514303, + "epoch": 0.8624079362693522, + "flos": 18109176814080.0, + "grad_norm": 1.788277374921194, + "language_loss": 0.67553782, + "learning_rate": 1.9524741548278967e-07, + "loss": 0.69641417, + "num_input_tokens_seen": 309347870, + "step": 14344, + "time_per_iteration": 2.6725401878356934 + }, + { + "auxiliary_loss_clip": 0.01018767, + "auxiliary_loss_mlp": 0.01039476, + "balance_loss_clip": 1.02276909, + "balance_loss_mlp": 1.0281868, + "epoch": 0.8624680595220201, + "flos": 30666455971200.0, + "grad_norm": 1.5533182707072966, + "language_loss": 0.81283861, + "learning_rate": 1.9507961158595054e-07, + "loss": 0.83342099, + "num_input_tokens_seen": 309371695, + "step": 14345, + "time_per_iteration": 2.747190237045288 + }, + { + "auxiliary_loss_clip": 0.01056548, + "auxiliary_loss_mlp": 0.01031115, + "balance_loss_clip": 1.02737641, + "balance_loss_mlp": 1.01965261, + "epoch": 0.8625281827746881, + "flos": 37998588516480.0, + "grad_norm": 2.3863268001561324, + "language_loss": 0.50712669, + "learning_rate": 1.9491187613187355e-07, + "loss": 0.52800333, + "num_input_tokens_seen": 309394645, + "step": 14346, + "time_per_iteration": 2.8108882904052734 + }, + { + "auxiliary_loss_clip": 0.00987295, + "auxiliary_loss_mlp": 0.01025505, + "balance_loss_clip": 1.02166796, + "balance_loss_mlp": 1.01431155, + "epoch": 0.862588306027356, + "flos": 26249730808320.0, + "grad_norm": 1.4332617568791959, + "language_loss": 0.75091422, + "learning_rate": 1.9474420912691913e-07, + "loss": 0.77104223, + "num_input_tokens_seen": 309413170, + "step": 14347, + "time_per_iteration": 2.824472188949585 + }, + { + "auxiliary_loss_clip": 0.01034999, + "auxiliary_loss_mlp": 0.01028541, + "balance_loss_clip": 1.02384925, + "balance_loss_mlp": 1.01740074, + "epoch": 0.862648429280024, + "flos": 25878809013120.0, + "grad_norm": 2.061385314182715, + "language_loss": 0.8059572, + "learning_rate": 1.945766105774449e-07, + "loss": 0.82659262, + "num_input_tokens_seen": 309431315, + "step": 14348, + "time_per_iteration": 2.6741292476654053 + }, + { + "auxiliary_loss_clip": 0.01047569, + "auxiliary_loss_mlp": 0.01026817, + "balance_loss_clip": 1.02235723, + "balance_loss_mlp": 1.01690507, + "epoch": 0.862708552532692, + "flos": 37816413713280.0, + "grad_norm": 2.9855587795908733, + "language_loss": 0.66253287, + "learning_rate": 1.9440908048980665e-07, + "loss": 0.68327677, + "num_input_tokens_seen": 309453020, + "step": 14349, + "time_per_iteration": 2.7389934062957764 + }, + { + "auxiliary_loss_clip": 0.0104947, + "auxiliary_loss_mlp": 0.01031141, + "balance_loss_clip": 1.0233779, + "balance_loss_mlp": 1.02052569, + "epoch": 0.86276867578536, + "flos": 19091800247040.0, + "grad_norm": 3.5962717328716605, + "language_loss": 0.69598591, + "learning_rate": 1.942416188703573e-07, + "loss": 0.71679205, + "num_input_tokens_seen": 309469780, + "step": 14350, + "time_per_iteration": 2.6143314838409424 + }, + { + "auxiliary_loss_clip": 0.01032345, + "auxiliary_loss_mlp": 0.01029085, + "balance_loss_clip": 1.02195787, + "balance_loss_mlp": 1.01869011, + "epoch": 0.862828799038028, + "flos": 22164281804160.0, + "grad_norm": 1.7689147667406222, + "language_loss": 0.76598978, + "learning_rate": 1.9407422572544618e-07, + "loss": 0.78660411, + "num_input_tokens_seen": 309489610, + "step": 14351, + "time_per_iteration": 2.631514072418213 + }, + { + "auxiliary_loss_clip": 0.01050813, + "auxiliary_loss_mlp": 0.01028351, + "balance_loss_clip": 1.02452826, + "balance_loss_mlp": 1.01886189, + "epoch": 0.8628889222906959, + "flos": 23145576433920.0, + "grad_norm": 1.8769700783338663, + "language_loss": 0.84696889, + "learning_rate": 1.9390690106142204e-07, + "loss": 0.8677606, + "num_input_tokens_seen": 309508295, + "step": 14352, + "time_per_iteration": 2.630570888519287 + }, + { + "auxiliary_loss_clip": 0.00997097, + "auxiliary_loss_mlp": 0.01002286, + "balance_loss_clip": 1.00158787, + "balance_loss_mlp": 1.00144577, + "epoch": 0.8629490455433639, + "flos": 57817762151040.0, + "grad_norm": 0.7846114569491752, + "language_loss": 0.61969048, + "learning_rate": 1.9373964488462913e-07, + "loss": 0.63968432, + "num_input_tokens_seen": 309567960, + "step": 14353, + "time_per_iteration": 3.193582057952881 + }, + { + "auxiliary_loss_clip": 0.01060771, + "auxiliary_loss_mlp": 0.01026442, + "balance_loss_clip": 1.02496803, + "balance_loss_mlp": 1.01669097, + "epoch": 0.8630091687960318, + "flos": 15919667383680.0, + "grad_norm": 1.6866866732665649, + "language_loss": 0.8197853, + "learning_rate": 1.9357245720140948e-07, + "loss": 0.84065747, + "num_input_tokens_seen": 309586050, + "step": 14354, + "time_per_iteration": 2.579019546508789 + }, + { + "auxiliary_loss_clip": 0.0104063, + "auxiliary_loss_mlp": 0.01025025, + "balance_loss_clip": 1.02386129, + "balance_loss_mlp": 1.01437402, + "epoch": 0.8630692920486999, + "flos": 17961691570560.0, + "grad_norm": 1.8646464262858944, + "language_loss": 0.85220945, + "learning_rate": 1.934053380181031e-07, + "loss": 0.87286603, + "num_input_tokens_seen": 309602910, + "step": 14355, + "time_per_iteration": 2.8444504737854004 + }, + { + "auxiliary_loss_clip": 0.01021527, + "auxiliary_loss_mlp": 0.01029132, + "balance_loss_clip": 1.02229333, + "balance_loss_mlp": 1.01787817, + "epoch": 0.8631294153013678, + "flos": 22455158140800.0, + "grad_norm": 10.873920111930252, + "language_loss": 0.58781809, + "learning_rate": 1.9323828734104763e-07, + "loss": 0.60832465, + "num_input_tokens_seen": 309621175, + "step": 14356, + "time_per_iteration": 2.8616838455200195 + }, + { + "auxiliary_loss_clip": 0.01020744, + "auxiliary_loss_mlp": 0.0102993, + "balance_loss_clip": 1.02343321, + "balance_loss_mlp": 1.01862276, + "epoch": 0.8631895385540358, + "flos": 16837005847680.0, + "grad_norm": 1.5672448935694643, + "language_loss": 0.77041841, + "learning_rate": 1.9307130517657756e-07, + "loss": 0.79092515, + "num_input_tokens_seen": 309639395, + "step": 14357, + "time_per_iteration": 2.7170517444610596 + }, + { + "auxiliary_loss_clip": 0.01052567, + "auxiliary_loss_mlp": 0.01027268, + "balance_loss_clip": 1.02493215, + "balance_loss_mlp": 1.01680768, + "epoch": 0.8632496618067037, + "flos": 18697214367360.0, + "grad_norm": 2.2268287593018194, + "language_loss": 0.77618289, + "learning_rate": 1.9290439153102468e-07, + "loss": 0.79698122, + "num_input_tokens_seen": 309657265, + "step": 14358, + "time_per_iteration": 4.9390692710876465 + }, + { + "auxiliary_loss_clip": 0.01022568, + "auxiliary_loss_mlp": 0.01029257, + "balance_loss_clip": 1.02131236, + "balance_loss_mlp": 1.0174911, + "epoch": 0.8633097850593717, + "flos": 24279922915200.0, + "grad_norm": 1.3455890701186664, + "language_loss": 0.74919403, + "learning_rate": 1.9273754641071816e-07, + "loss": 0.76971221, + "num_input_tokens_seen": 309678610, + "step": 14359, + "time_per_iteration": 3.0048463344573975 + }, + { + "auxiliary_loss_clip": 0.01001053, + "auxiliary_loss_mlp": 0.01024839, + "balance_loss_clip": 1.02062476, + "balance_loss_mlp": 1.01446736, + "epoch": 0.8633699083120396, + "flos": 21178569801600.0, + "grad_norm": 1.9233270776375158, + "language_loss": 0.70589703, + "learning_rate": 1.9257076982198517e-07, + "loss": 0.726156, + "num_input_tokens_seen": 309697710, + "step": 14360, + "time_per_iteration": 2.986529588699341 + }, + { + "auxiliary_loss_clip": 0.01033651, + "auxiliary_loss_mlp": 0.01030333, + "balance_loss_clip": 1.02497745, + "balance_loss_mlp": 1.01918054, + "epoch": 0.8634300315647077, + "flos": 19244888012160.0, + "grad_norm": 1.9939878144977885, + "language_loss": 0.7634418, + "learning_rate": 1.9240406177114953e-07, + "loss": 0.7840817, + "num_input_tokens_seen": 309715985, + "step": 14361, + "time_per_iteration": 2.6659598350524902 + }, + { + "auxiliary_loss_clip": 0.01006785, + "auxiliary_loss_mlp": 0.01002152, + "balance_loss_clip": 1.00169313, + "balance_loss_mlp": 1.00130546, + "epoch": 0.8634901548173756, + "flos": 66195648282240.0, + "grad_norm": 0.9619689299928973, + "language_loss": 0.58820808, + "learning_rate": 1.922374222645329e-07, + "loss": 0.60829747, + "num_input_tokens_seen": 309779930, + "step": 14362, + "time_per_iteration": 3.214907169342041 + }, + { + "auxiliary_loss_clip": 0.01006812, + "auxiliary_loss_mlp": 0.01030228, + "balance_loss_clip": 1.02765632, + "balance_loss_mlp": 1.01910543, + "epoch": 0.8635502780700436, + "flos": 24789531121920.0, + "grad_norm": 1.8505910211058558, + "language_loss": 0.80502546, + "learning_rate": 1.9207085130845524e-07, + "loss": 0.82539588, + "num_input_tokens_seen": 309800580, + "step": 14363, + "time_per_iteration": 2.8958396911621094 + }, + { + "auxiliary_loss_clip": 0.01035289, + "auxiliary_loss_mlp": 0.01034671, + "balance_loss_clip": 1.02182889, + "balance_loss_mlp": 1.02201128, + "epoch": 0.8636104013227116, + "flos": 25189970918400.0, + "grad_norm": 2.4807568569327616, + "language_loss": 0.72905064, + "learning_rate": 1.9190434890923112e-07, + "loss": 0.74975026, + "num_input_tokens_seen": 309821725, + "step": 14364, + "time_per_iteration": 3.112250328063965 + }, + { + "auxiliary_loss_clip": 0.0103527, + "auxiliary_loss_mlp": 0.0102787, + "balance_loss_clip": 1.02112412, + "balance_loss_mlp": 1.01698005, + "epoch": 0.8636705245753795, + "flos": 23878441624320.0, + "grad_norm": 1.5772155679248268, + "language_loss": 0.71621704, + "learning_rate": 1.917379150731755e-07, + "loss": 0.73684841, + "num_input_tokens_seen": 309841565, + "step": 14365, + "time_per_iteration": 2.8025054931640625 + }, + { + "auxiliary_loss_clip": 0.01037774, + "auxiliary_loss_mlp": 0.01044207, + "balance_loss_clip": 1.02388227, + "balance_loss_mlp": 1.03117776, + "epoch": 0.8637306478280475, + "flos": 23110455911040.0, + "grad_norm": 2.0302096819200233, + "language_loss": 0.71325994, + "learning_rate": 1.915715498065993e-07, + "loss": 0.73407972, + "num_input_tokens_seen": 309858635, + "step": 14366, + "time_per_iteration": 2.676546335220337 + }, + { + "auxiliary_loss_clip": 0.01040312, + "auxiliary_loss_mlp": 0.01022703, + "balance_loss_clip": 1.02459526, + "balance_loss_mlp": 1.01293361, + "epoch": 0.8637907710807154, + "flos": 21906802137600.0, + "grad_norm": 1.8039553609000467, + "language_loss": 0.81421697, + "learning_rate": 1.9140525311581146e-07, + "loss": 0.83484715, + "num_input_tokens_seen": 309877885, + "step": 14367, + "time_per_iteration": 2.6801300048828125 + }, + { + "auxiliary_loss_clip": 0.01044998, + "auxiliary_loss_mlp": 0.01029179, + "balance_loss_clip": 1.02600396, + "balance_loss_mlp": 1.01775837, + "epoch": 0.8638508943333835, + "flos": 23580526222080.0, + "grad_norm": 2.5324328931316695, + "language_loss": 0.61924577, + "learning_rate": 1.9123902500711743e-07, + "loss": 0.63998759, + "num_input_tokens_seen": 309893140, + "step": 14368, + "time_per_iteration": 2.629816770553589 + }, + { + "auxiliary_loss_clip": 0.01052168, + "auxiliary_loss_mlp": 0.01029243, + "balance_loss_clip": 1.02568603, + "balance_loss_mlp": 1.01923513, + "epoch": 0.8639110175860514, + "flos": 25775853655680.0, + "grad_norm": 2.2984551049073256, + "language_loss": 0.76519829, + "learning_rate": 1.91072865486821e-07, + "loss": 0.78601247, + "num_input_tokens_seen": 309914175, + "step": 14369, + "time_per_iteration": 2.6470916271209717 + }, + { + "auxiliary_loss_clip": 0.01044989, + "auxiliary_loss_mlp": 0.01032929, + "balance_loss_clip": 1.02568555, + "balance_loss_mlp": 1.02154493, + "epoch": 0.8639711408387194, + "flos": 23369443948800.0, + "grad_norm": 2.0318832887962026, + "language_loss": 0.64305347, + "learning_rate": 1.9090677456122294e-07, + "loss": 0.66383266, + "num_input_tokens_seen": 309932395, + "step": 14370, + "time_per_iteration": 2.9238038063049316 + }, + { + "auxiliary_loss_clip": 0.01010724, + "auxiliary_loss_mlp": 0.01030869, + "balance_loss_clip": 1.02838278, + "balance_loss_mlp": 1.01965737, + "epoch": 0.8640312640913873, + "flos": 22127221946880.0, + "grad_norm": 1.7447011303888054, + "language_loss": 0.66025472, + "learning_rate": 1.907407522366209e-07, + "loss": 0.68067062, + "num_input_tokens_seen": 309951720, + "step": 14371, + "time_per_iteration": 4.5511884689331055 + }, + { + "auxiliary_loss_clip": 0.00987331, + "auxiliary_loss_mlp": 0.01000052, + "balance_loss_clip": 1.00190639, + "balance_loss_mlp": 0.99918216, + "epoch": 0.8640913873440553, + "flos": 57571735944960.0, + "grad_norm": 0.860194740852869, + "language_loss": 0.56870091, + "learning_rate": 1.905747985193107e-07, + "loss": 0.58857477, + "num_input_tokens_seen": 310006120, + "step": 14372, + "time_per_iteration": 3.1327311992645264 + }, + { + "auxiliary_loss_clip": 0.0106099, + "auxiliary_loss_mlp": 0.01032955, + "balance_loss_clip": 1.0259552, + "balance_loss_mlp": 1.02193975, + "epoch": 0.8641515105967232, + "flos": 23987430466560.0, + "grad_norm": 1.961906868704689, + "language_loss": 0.79333836, + "learning_rate": 1.9040891341558597e-07, + "loss": 0.81427777, + "num_input_tokens_seen": 310026740, + "step": 14373, + "time_per_iteration": 2.6047141551971436 + }, + { + "auxiliary_loss_clip": 0.01062473, + "auxiliary_loss_mlp": 0.01023818, + "balance_loss_clip": 1.02486157, + "balance_loss_mlp": 1.01370263, + "epoch": 0.8642116338493913, + "flos": 19062749122560.0, + "grad_norm": 1.7547719891937053, + "language_loss": 0.63811195, + "learning_rate": 1.9024309693173656e-07, + "loss": 0.65897489, + "num_input_tokens_seen": 310044135, + "step": 14374, + "time_per_iteration": 2.6041133403778076 + }, + { + "auxiliary_loss_clip": 0.01034213, + "auxiliary_loss_mlp": 0.01029354, + "balance_loss_clip": 1.024266, + "balance_loss_mlp": 1.01894069, + "epoch": 0.8642717571020592, + "flos": 18254148105600.0, + "grad_norm": 1.8084134077937133, + "language_loss": 0.77141011, + "learning_rate": 1.9007734907404993e-07, + "loss": 0.79204583, + "num_input_tokens_seen": 310061560, + "step": 14375, + "time_per_iteration": 2.633657693862915 + }, + { + "auxiliary_loss_clip": 0.01019812, + "auxiliary_loss_mlp": 0.00747493, + "balance_loss_clip": 1.02323937, + "balance_loss_mlp": 1.00035417, + "epoch": 0.8643318803547272, + "flos": 57663270777600.0, + "grad_norm": 1.7715261934376159, + "language_loss": 0.6078161, + "learning_rate": 1.899116698488117e-07, + "loss": 0.62548912, + "num_input_tokens_seen": 310087310, + "step": 14376, + "time_per_iteration": 3.0939459800720215 + }, + { + "auxiliary_loss_clip": 0.01016205, + "auxiliary_loss_mlp": 0.01033024, + "balance_loss_clip": 1.02185726, + "balance_loss_mlp": 1.02296829, + "epoch": 0.8643920036073952, + "flos": 19609524927360.0, + "grad_norm": 1.866321463420537, + "language_loss": 0.66543901, + "learning_rate": 1.8974605926230457e-07, + "loss": 0.68593132, + "num_input_tokens_seen": 310106260, + "step": 14377, + "time_per_iteration": 2.6867313385009766 + }, + { + "auxiliary_loss_clip": 0.01034354, + "auxiliary_loss_mlp": 0.01034351, + "balance_loss_clip": 1.02118289, + "balance_loss_mlp": 1.02284157, + "epoch": 0.8644521268600631, + "flos": 20850346298880.0, + "grad_norm": 1.621721486080577, + "language_loss": 0.70185244, + "learning_rate": 1.8958051732080804e-07, + "loss": 0.72253948, + "num_input_tokens_seen": 310125305, + "step": 14378, + "time_per_iteration": 2.6608569622039795 + }, + { + "auxiliary_loss_clip": 0.00996768, + "auxiliary_loss_mlp": 0.01000419, + "balance_loss_clip": 1.00172329, + "balance_loss_mlp": 0.99959624, + "epoch": 0.8645122501127311, + "flos": 66719550101760.0, + "grad_norm": 0.7988555318915908, + "language_loss": 0.60256886, + "learning_rate": 1.894150440305995e-07, + "loss": 0.62254071, + "num_input_tokens_seen": 310189270, + "step": 14379, + "time_per_iteration": 3.2361176013946533 + }, + { + "auxiliary_loss_clip": 0.01028631, + "auxiliary_loss_mlp": 0.0102562, + "balance_loss_clip": 1.02186286, + "balance_loss_mlp": 1.0155822, + "epoch": 0.864572373365399, + "flos": 21690009601920.0, + "grad_norm": 1.522743147299508, + "language_loss": 0.74429494, + "learning_rate": 1.8924963939795478e-07, + "loss": 0.76483744, + "num_input_tokens_seen": 310208395, + "step": 14380, + "time_per_iteration": 2.652711868286133 + }, + { + "auxiliary_loss_clip": 0.01027133, + "auxiliary_loss_mlp": 0.01030166, + "balance_loss_clip": 1.0208602, + "balance_loss_mlp": 1.01916349, + "epoch": 0.8646324966180671, + "flos": 20266402896000.0, + "grad_norm": 2.2037304835818485, + "language_loss": 0.75228989, + "learning_rate": 1.8908430342914473e-07, + "loss": 0.77286291, + "num_input_tokens_seen": 310227415, + "step": 14381, + "time_per_iteration": 2.655275344848633 + }, + { + "auxiliary_loss_clip": 0.01040974, + "auxiliary_loss_mlp": 0.01031968, + "balance_loss_clip": 1.0248456, + "balance_loss_mlp": 1.02176976, + "epoch": 0.864692619870735, + "flos": 11946188050560.0, + "grad_norm": 2.3784432538656866, + "language_loss": 0.8444289, + "learning_rate": 1.8891903613043892e-07, + "loss": 0.86515832, + "num_input_tokens_seen": 310242625, + "step": 14382, + "time_per_iteration": 2.6431331634521484 + }, + { + "auxiliary_loss_clip": 0.01055607, + "auxiliary_loss_mlp": 0.01030769, + "balance_loss_clip": 1.02671599, + "balance_loss_mlp": 1.01992679, + "epoch": 0.864752743123403, + "flos": 21470703114240.0, + "grad_norm": 1.7227568002051117, + "language_loss": 0.75656688, + "learning_rate": 1.8875383750810504e-07, + "loss": 0.77743065, + "num_input_tokens_seen": 310260585, + "step": 14383, + "time_per_iteration": 2.6352250576019287 + }, + { + "auxiliary_loss_clip": 0.01044423, + "auxiliary_loss_mlp": 0.01031802, + "balance_loss_clip": 1.02724218, + "balance_loss_mlp": 1.021294, + "epoch": 0.8648128663760709, + "flos": 19530018172800.0, + "grad_norm": 1.8093715978549272, + "language_loss": 0.85297024, + "learning_rate": 1.8858870756840738e-07, + "loss": 0.87373257, + "num_input_tokens_seen": 310277210, + "step": 14384, + "time_per_iteration": 2.6778976917266846 + }, + { + "auxiliary_loss_clip": 0.01048539, + "auxiliary_loss_mlp": 0.01028652, + "balance_loss_clip": 1.02259111, + "balance_loss_mlp": 1.0190556, + "epoch": 0.8648729896287389, + "flos": 21287953693440.0, + "grad_norm": 1.7735474516060437, + "language_loss": 0.80707914, + "learning_rate": 1.884236463176072e-07, + "loss": 0.82785106, + "num_input_tokens_seen": 310296610, + "step": 14385, + "time_per_iteration": 2.6421451568603516 + }, + { + "auxiliary_loss_clip": 0.01047072, + "auxiliary_loss_mlp": 0.0103045, + "balance_loss_clip": 1.02775216, + "balance_loss_mlp": 1.01952469, + "epoch": 0.8649331128814068, + "flos": 24604483230720.0, + "grad_norm": 1.9810667703473568, + "language_loss": 0.7255646, + "learning_rate": 1.8825865376196437e-07, + "loss": 0.7463398, + "num_input_tokens_seen": 310316830, + "step": 14386, + "time_per_iteration": 6.183815002441406 + }, + { + "auxiliary_loss_clip": 0.01046472, + "auxiliary_loss_mlp": 0.01034166, + "balance_loss_clip": 1.02401435, + "balance_loss_mlp": 1.0230732, + "epoch": 0.8649932361340749, + "flos": 15377811742080.0, + "grad_norm": 1.8107251939646745, + "language_loss": 0.81959873, + "learning_rate": 1.8809372990773476e-07, + "loss": 0.84040511, + "num_input_tokens_seen": 310334355, + "step": 14387, + "time_per_iteration": 2.602745294570923 + }, + { + "auxiliary_loss_clip": 0.01059818, + "auxiliary_loss_mlp": 0.01022909, + "balance_loss_clip": 1.02506363, + "balance_loss_mlp": 1.01321745, + "epoch": 0.8650533593867428, + "flos": 19901227276800.0, + "grad_norm": 1.8866597891172407, + "language_loss": 0.67865336, + "learning_rate": 1.8792887476117224e-07, + "loss": 0.69948059, + "num_input_tokens_seen": 310352900, + "step": 14388, + "time_per_iteration": 2.6026666164398193 + }, + { + "auxiliary_loss_clip": 0.01031986, + "auxiliary_loss_mlp": 0.01030418, + "balance_loss_clip": 1.02675498, + "balance_loss_mlp": 1.02093506, + "epoch": 0.8651134826394108, + "flos": 25626931868160.0, + "grad_norm": 1.4948894176651988, + "language_loss": 0.90265834, + "learning_rate": 1.877640883285283e-07, + "loss": 0.92328238, + "num_input_tokens_seen": 310372855, + "step": 14389, + "time_per_iteration": 2.8976478576660156 + }, + { + "auxiliary_loss_clip": 0.01024939, + "auxiliary_loss_mlp": 0.00747536, + "balance_loss_clip": 1.02779138, + "balance_loss_mlp": 1.00048947, + "epoch": 0.8651736058920788, + "flos": 18734525619840.0, + "grad_norm": 1.5070031131872597, + "language_loss": 0.70858425, + "learning_rate": 1.8759937061605212e-07, + "loss": 0.72630906, + "num_input_tokens_seen": 310391595, + "step": 14390, + "time_per_iteration": 2.7052857875823975 + }, + { + "auxiliary_loss_clip": 0.0106252, + "auxiliary_loss_mlp": 0.01033546, + "balance_loss_clip": 1.02453864, + "balance_loss_mlp": 1.023139, + "epoch": 0.8652337291447467, + "flos": 20776765288320.0, + "grad_norm": 1.549441292177802, + "language_loss": 0.82233649, + "learning_rate": 1.8743472162998941e-07, + "loss": 0.84329718, + "num_input_tokens_seen": 310410090, + "step": 14391, + "time_per_iteration": 2.566939353942871 + }, + { + "auxiliary_loss_clip": 0.0098087, + "auxiliary_loss_mlp": 0.01000639, + "balance_loss_clip": 1.00412846, + "balance_loss_mlp": 0.99956656, + "epoch": 0.8652938523974147, + "flos": 64227887464320.0, + "grad_norm": 0.7959253867094931, + "language_loss": 0.68010676, + "learning_rate": 1.8727014137658337e-07, + "loss": 0.69992185, + "num_input_tokens_seen": 310470055, + "step": 14392, + "time_per_iteration": 3.164334535598755 + }, + { + "auxiliary_loss_clip": 0.01055427, + "auxiliary_loss_mlp": 0.01029438, + "balance_loss_clip": 1.02569652, + "balance_loss_mlp": 1.01813698, + "epoch": 0.8653539756500827, + "flos": 18040587793920.0, + "grad_norm": 2.1033015517578146, + "language_loss": 0.75610811, + "learning_rate": 1.8710562986207523e-07, + "loss": 0.7769568, + "num_input_tokens_seen": 310487665, + "step": 14393, + "time_per_iteration": 2.585587739944458 + }, + { + "auxiliary_loss_clip": 0.01038633, + "auxiliary_loss_mlp": 0.0103058, + "balance_loss_clip": 1.02243745, + "balance_loss_mlp": 1.02026832, + "epoch": 0.8654140989027507, + "flos": 17382416935680.0, + "grad_norm": 1.7452140180276399, + "language_loss": 0.73531455, + "learning_rate": 1.8694118709270357e-07, + "loss": 0.75600672, + "num_input_tokens_seen": 310506130, + "step": 14394, + "time_per_iteration": 2.618107318878174 + }, + { + "auxiliary_loss_clip": 0.01050206, + "auxiliary_loss_mlp": 0.01025906, + "balance_loss_clip": 1.02369595, + "balance_loss_mlp": 1.01502228, + "epoch": 0.8654742221554186, + "flos": 53284862448000.0, + "grad_norm": 2.1204174073571953, + "language_loss": 0.65468192, + "learning_rate": 1.867768130747036e-07, + "loss": 0.67544299, + "num_input_tokens_seen": 310532445, + "step": 14395, + "time_per_iteration": 2.881598472595215 + }, + { + "auxiliary_loss_clip": 0.01042733, + "auxiliary_loss_mlp": 0.01031892, + "balance_loss_clip": 1.0227356, + "balance_loss_mlp": 1.02146077, + "epoch": 0.8655343454080866, + "flos": 23914711382400.0, + "grad_norm": 1.6936859515567517, + "language_loss": 0.67938387, + "learning_rate": 1.8661250781430838e-07, + "loss": 0.70013011, + "num_input_tokens_seen": 310552300, + "step": 14396, + "time_per_iteration": 2.588554859161377 + }, + { + "auxiliary_loss_clip": 0.0105569, + "auxiliary_loss_mlp": 0.01028296, + "balance_loss_clip": 1.02721941, + "balance_loss_mlp": 1.01784778, + "epoch": 0.8655944686607545, + "flos": 24097209408000.0, + "grad_norm": 2.0780236923023114, + "language_loss": 0.69466662, + "learning_rate": 1.8644827131774954e-07, + "loss": 0.71550649, + "num_input_tokens_seen": 310572710, + "step": 14397, + "time_per_iteration": 2.6874518394470215 + }, + { + "auxiliary_loss_clip": 0.01042718, + "auxiliary_loss_mlp": 0.01028613, + "balance_loss_clip": 1.02508473, + "balance_loss_mlp": 1.01871252, + "epoch": 0.8656545919134225, + "flos": 23112718467840.0, + "grad_norm": 2.0723792072786615, + "language_loss": 0.64121389, + "learning_rate": 1.86284103591253e-07, + "loss": 0.66192722, + "num_input_tokens_seen": 310592460, + "step": 14398, + "time_per_iteration": 2.9127159118652344 + }, + { + "auxiliary_loss_clip": 0.01027794, + "auxiliary_loss_mlp": 0.01031595, + "balance_loss_clip": 1.02442205, + "balance_loss_mlp": 1.02015066, + "epoch": 0.8657147151660904, + "flos": 21141761339520.0, + "grad_norm": 1.9286575161796617, + "language_loss": 0.76241863, + "learning_rate": 1.8612000464104517e-07, + "loss": 0.78301251, + "num_input_tokens_seen": 310609375, + "step": 14399, + "time_per_iteration": 2.677795648574829 + }, + { + "auxiliary_loss_clip": 0.01051622, + "auxiliary_loss_mlp": 0.01024998, + "balance_loss_clip": 1.02570724, + "balance_loss_mlp": 1.01546776, + "epoch": 0.8657748384187585, + "flos": 16289439943680.0, + "grad_norm": 2.8006134720214, + "language_loss": 0.93518615, + "learning_rate": 1.8595597447334855e-07, + "loss": 0.95595241, + "num_input_tokens_seen": 310627405, + "step": 14400, + "time_per_iteration": 2.607518196105957 + }, + { + "auxiliary_loss_clip": 0.00996065, + "auxiliary_loss_mlp": 0.01033043, + "balance_loss_clip": 1.02283669, + "balance_loss_mlp": 1.0224452, + "epoch": 0.8658349616714264, + "flos": 30843890179200.0, + "grad_norm": 1.9593280431422293, + "language_loss": 0.67712492, + "learning_rate": 1.8579201309438353e-07, + "loss": 0.69741595, + "num_input_tokens_seen": 310649945, + "step": 14401, + "time_per_iteration": 2.970477342605591 + }, + { + "auxiliary_loss_clip": 0.01053379, + "auxiliary_loss_mlp": 0.01027626, + "balance_loss_clip": 1.02511621, + "balance_loss_mlp": 1.01670623, + "epoch": 0.8658950849240944, + "flos": 18952862440320.0, + "grad_norm": 2.3662059869315115, + "language_loss": 0.7429117, + "learning_rate": 1.8562812051036714e-07, + "loss": 0.7637217, + "num_input_tokens_seen": 310668285, + "step": 14402, + "time_per_iteration": 2.595890998840332 + }, + { + "auxiliary_loss_clip": 0.00988007, + "auxiliary_loss_mlp": 0.01027927, + "balance_loss_clip": 1.02313066, + "balance_loss_mlp": 1.01796675, + "epoch": 0.8659552081767624, + "flos": 23364344217600.0, + "grad_norm": 2.082427616231175, + "language_loss": 0.75020039, + "learning_rate": 1.8546429672751397e-07, + "loss": 0.77035975, + "num_input_tokens_seen": 310687015, + "step": 14403, + "time_per_iteration": 2.849864959716797 + }, + { + "auxiliary_loss_clip": 0.01034327, + "auxiliary_loss_mlp": 0.01029085, + "balance_loss_clip": 1.02246523, + "balance_loss_mlp": 1.01741385, + "epoch": 0.8660153314294303, + "flos": 23841992298240.0, + "grad_norm": 2.054534273950104, + "language_loss": 0.73488855, + "learning_rate": 1.853005417520368e-07, + "loss": 0.75552273, + "num_input_tokens_seen": 310707580, + "step": 14404, + "time_per_iteration": 2.6802964210510254 + }, + { + "auxiliary_loss_clip": 0.0103288, + "auxiliary_loss_mlp": 0.01030834, + "balance_loss_clip": 1.02668285, + "balance_loss_mlp": 1.01984918, + "epoch": 0.8660754546820983, + "flos": 23112467072640.0, + "grad_norm": 1.9159183129141388, + "language_loss": 0.70421457, + "learning_rate": 1.851368555901447e-07, + "loss": 0.72485173, + "num_input_tokens_seen": 310727300, + "step": 14405, + "time_per_iteration": 4.470105171203613 + }, + { + "auxiliary_loss_clip": 0.01053616, + "auxiliary_loss_mlp": 0.00747686, + "balance_loss_clip": 1.02499676, + "balance_loss_mlp": 1.00048602, + "epoch": 0.8661355779347663, + "flos": 14391991998720.0, + "grad_norm": 1.8264609390647082, + "language_loss": 0.66450369, + "learning_rate": 1.8497323824804467e-07, + "loss": 0.68251669, + "num_input_tokens_seen": 310744935, + "step": 14406, + "time_per_iteration": 2.637256145477295 + }, + { + "auxiliary_loss_clip": 0.01041392, + "auxiliary_loss_mlp": 0.01024003, + "balance_loss_clip": 1.0252018, + "balance_loss_mlp": 1.01447797, + "epoch": 0.8661957011874343, + "flos": 21870137329920.0, + "grad_norm": 1.6759317586706697, + "language_loss": 0.83034259, + "learning_rate": 1.8480968973194177e-07, + "loss": 0.85099649, + "num_input_tokens_seen": 310765085, + "step": 14407, + "time_per_iteration": 2.8757824897766113 + }, + { + "auxiliary_loss_clip": 0.01051693, + "auxiliary_loss_mlp": 0.01033736, + "balance_loss_clip": 1.02518296, + "balance_loss_mlp": 1.02353215, + "epoch": 0.8662558244401022, + "flos": 21835160461440.0, + "grad_norm": 1.660034527286891, + "language_loss": 0.69961798, + "learning_rate": 1.8464621004803748e-07, + "loss": 0.72047222, + "num_input_tokens_seen": 310783260, + "step": 14408, + "time_per_iteration": 2.6030056476593018 + }, + { + "auxiliary_loss_clip": 0.01049461, + "auxiliary_loss_mlp": 0.01026482, + "balance_loss_clip": 1.02464855, + "balance_loss_mlp": 1.01671326, + "epoch": 0.8663159476927702, + "flos": 17384104874880.0, + "grad_norm": 1.9225112354795173, + "language_loss": 0.77103132, + "learning_rate": 1.844827992025304e-07, + "loss": 0.79179072, + "num_input_tokens_seen": 310801970, + "step": 14409, + "time_per_iteration": 2.6641108989715576 + }, + { + "auxiliary_loss_clip": 0.01055354, + "auxiliary_loss_mlp": 0.01032653, + "balance_loss_clip": 1.02674818, + "balance_loss_mlp": 1.02123308, + "epoch": 0.8663760709454381, + "flos": 22747722416640.0, + "grad_norm": 1.6511385605268063, + "language_loss": 0.77003187, + "learning_rate": 1.8431945720161757e-07, + "loss": 0.79091191, + "num_input_tokens_seen": 310822070, + "step": 14410, + "time_per_iteration": 2.7149741649627686 + }, + { + "auxiliary_loss_clip": 0.01023296, + "auxiliary_loss_mlp": 0.01029122, + "balance_loss_clip": 1.02227139, + "balance_loss_mlp": 1.01826227, + "epoch": 0.8664361941981061, + "flos": 17376850327680.0, + "grad_norm": 2.1705156496862656, + "language_loss": 0.77562034, + "learning_rate": 1.8415618405149315e-07, + "loss": 0.79614449, + "num_input_tokens_seen": 310838355, + "step": 14411, + "time_per_iteration": 2.6368939876556396 + }, + { + "auxiliary_loss_clip": 0.01037362, + "auxiliary_loss_mlp": 0.01029309, + "balance_loss_clip": 1.02201664, + "balance_loss_mlp": 1.01986766, + "epoch": 0.866496317450774, + "flos": 16034438315520.0, + "grad_norm": 3.3586461072768254, + "language_loss": 0.73835105, + "learning_rate": 1.8399297975834794e-07, + "loss": 0.75901777, + "num_input_tokens_seen": 310856055, + "step": 14412, + "time_per_iteration": 2.6903042793273926 + }, + { + "auxiliary_loss_clip": 0.01041988, + "auxiliary_loss_mlp": 0.00747469, + "balance_loss_clip": 1.02160537, + "balance_loss_mlp": 1.00043499, + "epoch": 0.8665564407034421, + "flos": 20814830726400.0, + "grad_norm": 1.7314908826342987, + "language_loss": 0.69892275, + "learning_rate": 1.83829844328371e-07, + "loss": 0.71681738, + "num_input_tokens_seen": 310876695, + "step": 14413, + "time_per_iteration": 2.6265203952789307 + }, + { + "auxiliary_loss_clip": 0.01050753, + "auxiliary_loss_mlp": 0.01028267, + "balance_loss_clip": 1.02458298, + "balance_loss_mlp": 1.0182054, + "epoch": 0.86661656395611, + "flos": 15815167741440.0, + "grad_norm": 1.9025112480803759, + "language_loss": 0.62455434, + "learning_rate": 1.8366677776774874e-07, + "loss": 0.64534456, + "num_input_tokens_seen": 310893880, + "step": 14414, + "time_per_iteration": 2.569837808609009 + }, + { + "auxiliary_loss_clip": 0.01032365, + "auxiliary_loss_mlp": 0.00747534, + "balance_loss_clip": 1.02499831, + "balance_loss_mlp": 1.00038314, + "epoch": 0.866676687208778, + "flos": 23036910814080.0, + "grad_norm": 1.5833625727053258, + "language_loss": 0.64011049, + "learning_rate": 1.8350378008266377e-07, + "loss": 0.65790957, + "num_input_tokens_seen": 310914145, + "step": 14415, + "time_per_iteration": 2.711552143096924 + }, + { + "auxiliary_loss_clip": 0.00978966, + "auxiliary_loss_mlp": 0.01002826, + "balance_loss_clip": 1.00319993, + "balance_loss_mlp": 1.00200391, + "epoch": 0.866736810461446, + "flos": 63802275212160.0, + "grad_norm": 0.7943997373527233, + "language_loss": 0.60457337, + "learning_rate": 1.8334085127929754e-07, + "loss": 0.62439126, + "num_input_tokens_seen": 310972825, + "step": 14416, + "time_per_iteration": 3.3310418128967285 + }, + { + "auxiliary_loss_clip": 0.01054006, + "auxiliary_loss_mlp": 0.00747702, + "balance_loss_clip": 1.02484167, + "balance_loss_mlp": 1.0004108, + "epoch": 0.8667969337141139, + "flos": 20449367798400.0, + "grad_norm": 2.0117026578459445, + "language_loss": 0.74695158, + "learning_rate": 1.831779913638285e-07, + "loss": 0.76496869, + "num_input_tokens_seen": 310992050, + "step": 14417, + "time_per_iteration": 2.649470567703247 + }, + { + "auxiliary_loss_clip": 0.01039563, + "auxiliary_loss_mlp": 0.01032261, + "balance_loss_clip": 1.02427125, + "balance_loss_mlp": 1.02205658, + "epoch": 0.866857056966782, + "flos": 21653703930240.0, + "grad_norm": 1.4809787854122192, + "language_loss": 0.74867356, + "learning_rate": 1.830152003424319e-07, + "loss": 0.76939183, + "num_input_tokens_seen": 311011105, + "step": 14418, + "time_per_iteration": 2.7173562049865723 + }, + { + "auxiliary_loss_clip": 0.01050538, + "auxiliary_loss_mlp": 0.0103028, + "balance_loss_clip": 1.02440488, + "balance_loss_mlp": 1.02032566, + "epoch": 0.8669171802194499, + "flos": 22852832590080.0, + "grad_norm": 1.5742890529286673, + "language_loss": 0.68335891, + "learning_rate": 1.8285247822128126e-07, + "loss": 0.70416707, + "num_input_tokens_seen": 311032080, + "step": 14419, + "time_per_iteration": 4.352175951004028 + }, + { + "auxiliary_loss_clip": 0.01051352, + "auxiliary_loss_mlp": 0.01025437, + "balance_loss_clip": 1.02390599, + "balance_loss_mlp": 1.01607919, + "epoch": 0.8669773034721179, + "flos": 18734166483840.0, + "grad_norm": 1.6711734626274577, + "language_loss": 0.79065049, + "learning_rate": 1.826898250065465e-07, + "loss": 0.81141835, + "num_input_tokens_seen": 311049735, + "step": 14420, + "time_per_iteration": 2.6378726959228516 + }, + { + "auxiliary_loss_clip": 0.01044433, + "auxiliary_loss_mlp": 0.01024396, + "balance_loss_clip": 1.02344751, + "balance_loss_mlp": 1.01397729, + "epoch": 0.8670374267247858, + "flos": 18916018064640.0, + "grad_norm": 1.5645671209563998, + "language_loss": 0.83838528, + "learning_rate": 1.8252724070439586e-07, + "loss": 0.85907358, + "num_input_tokens_seen": 311067675, + "step": 14421, + "time_per_iteration": 2.695176124572754 + }, + { + "auxiliary_loss_clip": 0.00990139, + "auxiliary_loss_mlp": 0.01001508, + "balance_loss_clip": 1.0038681, + "balance_loss_mlp": 1.00058365, + "epoch": 0.8670975499774538, + "flos": 48814527214080.0, + "grad_norm": 0.7070069794226038, + "language_loss": 0.49092489, + "learning_rate": 1.823647253209941e-07, + "loss": 0.51084137, + "num_input_tokens_seen": 311126605, + "step": 14422, + "time_per_iteration": 3.2100353240966797 + }, + { + "auxiliary_loss_clip": 0.01041635, + "auxiliary_loss_mlp": 0.00747524, + "balance_loss_clip": 1.02466452, + "balance_loss_mlp": 1.00043726, + "epoch": 0.8671576732301217, + "flos": 26136145025280.0, + "grad_norm": 1.5928863706727605, + "language_loss": 0.73186433, + "learning_rate": 1.8220227886250417e-07, + "loss": 0.74975592, + "num_input_tokens_seen": 311147325, + "step": 14423, + "time_per_iteration": 2.714367628097534 + }, + { + "auxiliary_loss_clip": 0.01013749, + "auxiliary_loss_mlp": 0.0102305, + "balance_loss_clip": 1.01942039, + "balance_loss_mlp": 1.01257753, + "epoch": 0.8672177964827897, + "flos": 18367446579840.0, + "grad_norm": 1.5022380059866756, + "language_loss": 0.76848829, + "learning_rate": 1.8203990133508684e-07, + "loss": 0.78885627, + "num_input_tokens_seen": 311165385, + "step": 14424, + "time_per_iteration": 2.6945221424102783 + }, + { + "auxiliary_loss_clip": 0.01020219, + "auxiliary_loss_mlp": 0.01033286, + "balance_loss_clip": 1.02081037, + "balance_loss_mlp": 1.02270603, + "epoch": 0.8672779197354576, + "flos": 28545355992960.0, + "grad_norm": 2.500121422042657, + "language_loss": 0.71148348, + "learning_rate": 1.8187759274489767e-07, + "loss": 0.73201853, + "num_input_tokens_seen": 311185860, + "step": 14425, + "time_per_iteration": 2.754941463470459 + }, + { + "auxiliary_loss_clip": 0.01053707, + "auxiliary_loss_mlp": 0.0102699, + "balance_loss_clip": 1.02562332, + "balance_loss_mlp": 1.01580262, + "epoch": 0.8673380429881257, + "flos": 22382474970240.0, + "grad_norm": 1.8509137680134886, + "language_loss": 0.67964566, + "learning_rate": 1.817153530980926e-07, + "loss": 0.70045269, + "num_input_tokens_seen": 311205810, + "step": 14426, + "time_per_iteration": 2.651360034942627 + }, + { + "auxiliary_loss_clip": 0.01024076, + "auxiliary_loss_mlp": 0.01025717, + "balance_loss_clip": 1.02501702, + "balance_loss_mlp": 1.01486313, + "epoch": 0.8673981662407936, + "flos": 20996430912000.0, + "grad_norm": 1.7299985734765593, + "language_loss": 0.70610744, + "learning_rate": 1.815531824008234e-07, + "loss": 0.72660536, + "num_input_tokens_seen": 311226080, + "step": 14427, + "time_per_iteration": 2.6893255710601807 + }, + { + "auxiliary_loss_clip": 0.01035471, + "auxiliary_loss_mlp": 0.01025246, + "balance_loss_clip": 1.02697206, + "balance_loss_mlp": 1.01505947, + "epoch": 0.8674582894934616, + "flos": 24426797627520.0, + "grad_norm": 1.66653842737746, + "language_loss": 0.68065965, + "learning_rate": 1.8139108065924004e-07, + "loss": 0.70126683, + "num_input_tokens_seen": 311246380, + "step": 14428, + "time_per_iteration": 2.7526485919952393 + }, + { + "auxiliary_loss_clip": 0.01042858, + "auxiliary_loss_mlp": 0.01025026, + "balance_loss_clip": 1.0257591, + "balance_loss_mlp": 1.01480401, + "epoch": 0.8675184127461296, + "flos": 20737514701440.0, + "grad_norm": 1.8573161657400667, + "language_loss": 0.70208579, + "learning_rate": 1.812290478794889e-07, + "loss": 0.72276461, + "num_input_tokens_seen": 311266465, + "step": 14429, + "time_per_iteration": 2.6707115173339844 + }, + { + "auxiliary_loss_clip": 0.01043228, + "auxiliary_loss_mlp": 0.01027569, + "balance_loss_clip": 1.02480006, + "balance_loss_mlp": 1.01718032, + "epoch": 0.8675785359987975, + "flos": 19135647774720.0, + "grad_norm": 2.1726559255587716, + "language_loss": 0.66652894, + "learning_rate": 1.810670840677151e-07, + "loss": 0.68723691, + "num_input_tokens_seen": 311285075, + "step": 14430, + "time_per_iteration": 2.6250436305999756 + }, + { + "auxiliary_loss_clip": 0.01022271, + "auxiliary_loss_mlp": 0.01036128, + "balance_loss_clip": 1.02478862, + "balance_loss_mlp": 1.0248214, + "epoch": 0.8676386592514655, + "flos": 22710662559360.0, + "grad_norm": 1.9532321115455682, + "language_loss": 0.68961996, + "learning_rate": 1.8090518923005948e-07, + "loss": 0.71020395, + "num_input_tokens_seen": 311303230, + "step": 14431, + "time_per_iteration": 2.891974925994873 + }, + { + "auxiliary_loss_clip": 0.01047112, + "auxiliary_loss_mlp": 0.01029782, + "balance_loss_clip": 1.02348709, + "balance_loss_mlp": 1.01831949, + "epoch": 0.8676987825041335, + "flos": 14209853109120.0, + "grad_norm": 2.3375152211674113, + "language_loss": 0.63216841, + "learning_rate": 1.8074336337266116e-07, + "loss": 0.65293741, + "num_input_tokens_seen": 311318070, + "step": 14432, + "time_per_iteration": 2.630800247192383 + }, + { + "auxiliary_loss_clip": 0.01054563, + "auxiliary_loss_mlp": 0.01036249, + "balance_loss_clip": 1.02640927, + "balance_loss_mlp": 1.02594936, + "epoch": 0.8677589057568015, + "flos": 13589927256960.0, + "grad_norm": 1.8447474888125042, + "language_loss": 0.78360385, + "learning_rate": 1.8058160650165656e-07, + "loss": 0.80451202, + "num_input_tokens_seen": 311334885, + "step": 14433, + "time_per_iteration": 4.200012922286987 + }, + { + "auxiliary_loss_clip": 0.00988263, + "auxiliary_loss_mlp": 0.01000727, + "balance_loss_clip": 1.00276327, + "balance_loss_mlp": 0.99986821, + "epoch": 0.8678190290094694, + "flos": 68933657370240.0, + "grad_norm": 0.7054495698161912, + "language_loss": 0.58498287, + "learning_rate": 1.804199186231805e-07, + "loss": 0.60487282, + "num_input_tokens_seen": 311399780, + "step": 14434, + "time_per_iteration": 4.899771451950073 + }, + { + "auxiliary_loss_clip": 0.01040789, + "auxiliary_loss_mlp": 0.01027556, + "balance_loss_clip": 1.0253737, + "balance_loss_mlp": 1.0185858, + "epoch": 0.8678791522621374, + "flos": 32557726776960.0, + "grad_norm": 1.772532016290786, + "language_loss": 0.79758763, + "learning_rate": 1.802582997433628e-07, + "loss": 0.81827116, + "num_input_tokens_seen": 311419610, + "step": 14435, + "time_per_iteration": 2.765850305557251 + }, + { + "auxiliary_loss_clip": 0.01041154, + "auxiliary_loss_mlp": 0.00747761, + "balance_loss_clip": 1.02238405, + "balance_loss_mlp": 1.00042105, + "epoch": 0.8679392755148053, + "flos": 35042637657600.0, + "grad_norm": 2.023719178912007, + "language_loss": 0.62101239, + "learning_rate": 1.8009674986833322e-07, + "loss": 0.63890153, + "num_input_tokens_seen": 311440045, + "step": 14436, + "time_per_iteration": 2.7522294521331787 + }, + { + "auxiliary_loss_clip": 0.01044612, + "auxiliary_loss_mlp": 0.01029132, + "balance_loss_clip": 1.02666879, + "balance_loss_mlp": 1.01771832, + "epoch": 0.8679993987674733, + "flos": 18552494471040.0, + "grad_norm": 2.446967584424186, + "language_loss": 0.7089411, + "learning_rate": 1.7993526900421706e-07, + "loss": 0.72967851, + "num_input_tokens_seen": 311456660, + "step": 14437, + "time_per_iteration": 2.676802396774292 + }, + { + "auxiliary_loss_clip": 0.01031959, + "auxiliary_loss_mlp": 0.01023274, + "balance_loss_clip": 1.02522063, + "balance_loss_mlp": 1.01272976, + "epoch": 0.8680595220201412, + "flos": 27454390162560.0, + "grad_norm": 1.9353442660935452, + "language_loss": 0.80536085, + "learning_rate": 1.797738571571381e-07, + "loss": 0.82591319, + "num_input_tokens_seen": 311475460, + "step": 14438, + "time_per_iteration": 2.7725725173950195 + }, + { + "auxiliary_loss_clip": 0.01048924, + "auxiliary_loss_mlp": 0.01022966, + "balance_loss_clip": 1.02459633, + "balance_loss_mlp": 1.01345921, + "epoch": 0.8681196452728093, + "flos": 19208797822080.0, + "grad_norm": 1.8692759879011396, + "language_loss": 0.67199343, + "learning_rate": 1.7961251433321656e-07, + "loss": 0.69271231, + "num_input_tokens_seen": 311494575, + "step": 14439, + "time_per_iteration": 2.665616273880005 + }, + { + "auxiliary_loss_clip": 0.01049972, + "auxiliary_loss_mlp": 0.0103307, + "balance_loss_clip": 1.02448726, + "balance_loss_mlp": 1.02348566, + "epoch": 0.8681797685254772, + "flos": 37560442417920.0, + "grad_norm": 1.5360274712616773, + "language_loss": 0.64217275, + "learning_rate": 1.7945124053857085e-07, + "loss": 0.66300321, + "num_input_tokens_seen": 311515805, + "step": 14440, + "time_per_iteration": 2.7806484699249268 + }, + { + "auxiliary_loss_clip": 0.01050146, + "auxiliary_loss_mlp": 0.01028689, + "balance_loss_clip": 1.02460885, + "balance_loss_mlp": 1.01872897, + "epoch": 0.8682398917781452, + "flos": 23289937194240.0, + "grad_norm": 1.6614079823090477, + "language_loss": 0.65323114, + "learning_rate": 1.7929003577931722e-07, + "loss": 0.67401952, + "num_input_tokens_seen": 311536000, + "step": 14441, + "time_per_iteration": 2.6959428787231445 + }, + { + "auxiliary_loss_clip": 0.01051523, + "auxiliary_loss_mlp": 0.01025141, + "balance_loss_clip": 1.02641797, + "balance_loss_mlp": 1.01569986, + "epoch": 0.8683000150308132, + "flos": 21872794936320.0, + "grad_norm": 1.4474799701613257, + "language_loss": 0.66165572, + "learning_rate": 1.7912890006156722e-07, + "loss": 0.68242234, + "num_input_tokens_seen": 311556220, + "step": 14442, + "time_per_iteration": 2.679971694946289 + }, + { + "auxiliary_loss_clip": 0.01035023, + "auxiliary_loss_mlp": 0.01030119, + "balance_loss_clip": 1.02353501, + "balance_loss_mlp": 1.01829362, + "epoch": 0.8683601382834811, + "flos": 14647209108480.0, + "grad_norm": 1.7335185696776103, + "language_loss": 0.72146004, + "learning_rate": 1.7896783339143195e-07, + "loss": 0.74211144, + "num_input_tokens_seen": 311572530, + "step": 14443, + "time_per_iteration": 2.6421165466308594 + }, + { + "auxiliary_loss_clip": 0.01063466, + "auxiliary_loss_mlp": 0.01029008, + "balance_loss_clip": 1.02578163, + "balance_loss_mlp": 1.01841056, + "epoch": 0.8684202615361492, + "flos": 26359904799360.0, + "grad_norm": 1.6292526704923318, + "language_loss": 0.83398193, + "learning_rate": 1.7880683577501877e-07, + "loss": 0.85490668, + "num_input_tokens_seen": 311591105, + "step": 14444, + "time_per_iteration": 2.635467052459717 + }, + { + "auxiliary_loss_clip": 0.01034037, + "auxiliary_loss_mlp": 0.01027353, + "balance_loss_clip": 1.02659559, + "balance_loss_mlp": 1.01661205, + "epoch": 0.8684803847888171, + "flos": 20704010290560.0, + "grad_norm": 1.7922124813178355, + "language_loss": 0.77017975, + "learning_rate": 1.7864590721843342e-07, + "loss": 0.79079366, + "num_input_tokens_seen": 311608350, + "step": 14445, + "time_per_iteration": 2.7031145095825195 + }, + { + "auxiliary_loss_clip": 0.01053886, + "auxiliary_loss_mlp": 0.01030621, + "balance_loss_clip": 1.02582192, + "balance_loss_mlp": 1.02004719, + "epoch": 0.8685405080414851, + "flos": 22638123043200.0, + "grad_norm": 1.9824645449197893, + "language_loss": 0.67682749, + "learning_rate": 1.7848504772777728e-07, + "loss": 0.69767261, + "num_input_tokens_seen": 311626380, + "step": 14446, + "time_per_iteration": 2.620983123779297 + }, + { + "auxiliary_loss_clip": 0.01046084, + "auxiliary_loss_mlp": 0.01028617, + "balance_loss_clip": 1.02411723, + "balance_loss_mlp": 1.01790059, + "epoch": 0.868600631294153, + "flos": 24822065865600.0, + "grad_norm": 1.883541846513052, + "language_loss": 0.82735765, + "learning_rate": 1.7832425730915102e-07, + "loss": 0.84810472, + "num_input_tokens_seen": 311644345, + "step": 14447, + "time_per_iteration": 2.728989601135254 + }, + { + "auxiliary_loss_clip": 0.00991835, + "auxiliary_loss_mlp": 0.01025764, + "balance_loss_clip": 1.02148986, + "balance_loss_mlp": 1.01582193, + "epoch": 0.868660754546821, + "flos": 25113983696640.0, + "grad_norm": 1.8309759857076713, + "language_loss": 0.73976737, + "learning_rate": 1.781635359686515e-07, + "loss": 0.75994337, + "num_input_tokens_seen": 311663340, + "step": 14448, + "time_per_iteration": 2.842597007751465 + }, + { + "auxiliary_loss_clip": 0.01034637, + "auxiliary_loss_mlp": 0.01031756, + "balance_loss_clip": 1.022012, + "balance_loss_mlp": 1.02049685, + "epoch": 0.8687208777994889, + "flos": 12677832178560.0, + "grad_norm": 2.0728840970540925, + "language_loss": 0.80540484, + "learning_rate": 1.7800288371237303e-07, + "loss": 0.82606876, + "num_input_tokens_seen": 311679860, + "step": 14449, + "time_per_iteration": 2.62101674079895 + }, + { + "auxiliary_loss_clip": 0.00978662, + "auxiliary_loss_mlp": 0.01005053, + "balance_loss_clip": 1.00329828, + "balance_loss_mlp": 1.00406313, + "epoch": 0.8687810010521569, + "flos": 65617235573760.0, + "grad_norm": 0.8055729126445282, + "language_loss": 0.6058228, + "learning_rate": 1.7784230054640758e-07, + "loss": 0.62565994, + "num_input_tokens_seen": 311738135, + "step": 14450, + "time_per_iteration": 3.1735267639160156 + }, + { + "auxiliary_loss_clip": 0.01035033, + "auxiliary_loss_mlp": 0.01032176, + "balance_loss_clip": 1.02591586, + "balance_loss_mlp": 1.02185285, + "epoch": 0.8688411243048249, + "flos": 24244012293120.0, + "grad_norm": 1.6144543951506056, + "language_loss": 0.75957799, + "learning_rate": 1.7768178647684517e-07, + "loss": 0.78025013, + "num_input_tokens_seen": 311756975, + "step": 14451, + "time_per_iteration": 2.6931262016296387 + }, + { + "auxiliary_loss_clip": 0.01050122, + "auxiliary_loss_mlp": 0.0102336, + "balance_loss_clip": 1.02452457, + "balance_loss_mlp": 1.01335824, + "epoch": 0.8689012475574929, + "flos": 18221828843520.0, + "grad_norm": 2.8399042297923156, + "language_loss": 0.71812809, + "learning_rate": 1.7752134150977205e-07, + "loss": 0.73886287, + "num_input_tokens_seen": 311771830, + "step": 14452, + "time_per_iteration": 4.403767347335815 + }, + { + "auxiliary_loss_clip": 0.01044056, + "auxiliary_loss_mlp": 0.00747583, + "balance_loss_clip": 1.02547932, + "balance_loss_mlp": 1.00037086, + "epoch": 0.8689613708101608, + "flos": 19646728439040.0, + "grad_norm": 2.0467799880187703, + "language_loss": 0.72475433, + "learning_rate": 1.7736096565127201e-07, + "loss": 0.74267066, + "num_input_tokens_seen": 311790130, + "step": 14453, + "time_per_iteration": 2.6543972492218018 + }, + { + "auxiliary_loss_clip": 0.01047269, + "auxiliary_loss_mlp": 0.01032017, + "balance_loss_clip": 1.02345586, + "balance_loss_mlp": 1.02168751, + "epoch": 0.8690214940628288, + "flos": 11728749070080.0, + "grad_norm": 2.381695650182082, + "language_loss": 0.74251562, + "learning_rate": 1.7720065890742664e-07, + "loss": 0.76330853, + "num_input_tokens_seen": 311808360, + "step": 14454, + "time_per_iteration": 2.59719181060791 + }, + { + "auxiliary_loss_clip": 0.0106229, + "auxiliary_loss_mlp": 0.01028743, + "balance_loss_clip": 1.02549911, + "balance_loss_mlp": 1.01875925, + "epoch": 0.8690816173154968, + "flos": 34936450076160.0, + "grad_norm": 2.0292051567663827, + "language_loss": 0.58968186, + "learning_rate": 1.7704042128431552e-07, + "loss": 0.61059213, + "num_input_tokens_seen": 311831325, + "step": 14455, + "time_per_iteration": 2.6839473247528076 + }, + { + "auxiliary_loss_clip": 0.01041587, + "auxiliary_loss_mlp": 0.01022967, + "balance_loss_clip": 1.02334559, + "balance_loss_mlp": 1.01285183, + "epoch": 0.8691417405681647, + "flos": 11614804151040.0, + "grad_norm": 2.3659389248115583, + "language_loss": 0.80121553, + "learning_rate": 1.7688025278801378e-07, + "loss": 0.82186103, + "num_input_tokens_seen": 311848090, + "step": 14456, + "time_per_iteration": 2.630765676498413 + }, + { + "auxiliary_loss_clip": 0.01004379, + "auxiliary_loss_mlp": 0.01037424, + "balance_loss_clip": 1.02300811, + "balance_loss_mlp": 1.02385843, + "epoch": 0.8692018638208328, + "flos": 24608038677120.0, + "grad_norm": 2.1123745283177335, + "language_loss": 0.74457359, + "learning_rate": 1.7672015342459568e-07, + "loss": 0.76499164, + "num_input_tokens_seen": 311867855, + "step": 14457, + "time_per_iteration": 2.8404600620269775 + }, + { + "auxiliary_loss_clip": 0.01013102, + "auxiliary_loss_mlp": 0.01025864, + "balance_loss_clip": 1.02427983, + "balance_loss_mlp": 1.01619649, + "epoch": 0.8692619870735007, + "flos": 25995124229760.0, + "grad_norm": 1.4891526121945888, + "language_loss": 0.78187048, + "learning_rate": 1.765601232001328e-07, + "loss": 0.80226016, + "num_input_tokens_seen": 311888675, + "step": 14458, + "time_per_iteration": 3.2514700889587402 + }, + { + "auxiliary_loss_clip": 0.01051416, + "auxiliary_loss_mlp": 0.01029867, + "balance_loss_clip": 1.02372336, + "balance_loss_mlp": 1.01863146, + "epoch": 0.8693221103261687, + "flos": 18041808856320.0, + "grad_norm": 1.7613323426416565, + "language_loss": 0.71145183, + "learning_rate": 1.7640016212069187e-07, + "loss": 0.73226464, + "num_input_tokens_seen": 311907310, + "step": 14459, + "time_per_iteration": 2.6693060398101807 + }, + { + "auxiliary_loss_clip": 0.01037025, + "auxiliary_loss_mlp": 0.01029244, + "balance_loss_clip": 1.02317202, + "balance_loss_mlp": 1.0202558, + "epoch": 0.8693822335788366, + "flos": 27492347859840.0, + "grad_norm": 1.3464067479722894, + "language_loss": 0.73901308, + "learning_rate": 1.762402701923398e-07, + "loss": 0.75967574, + "num_input_tokens_seen": 311929635, + "step": 14460, + "time_per_iteration": 2.7096540927886963 + }, + { + "auxiliary_loss_clip": 0.01043264, + "auxiliary_loss_mlp": 0.01037176, + "balance_loss_clip": 1.0246377, + "balance_loss_mlp": 1.02587461, + "epoch": 0.8694423568315046, + "flos": 24097712198400.0, + "grad_norm": 3.128534185179726, + "language_loss": 0.6471402, + "learning_rate": 1.7608044742113947e-07, + "loss": 0.66794461, + "num_input_tokens_seen": 311948800, + "step": 14461, + "time_per_iteration": 2.6681456565856934 + }, + { + "auxiliary_loss_clip": 0.01044458, + "auxiliary_loss_mlp": 0.0103079, + "balance_loss_clip": 1.02192163, + "balance_loss_mlp": 1.02029943, + "epoch": 0.8695024800841725, + "flos": 18362131367040.0, + "grad_norm": 2.2703063851192966, + "language_loss": 0.82833862, + "learning_rate": 1.7592069381315123e-07, + "loss": 0.84909111, + "num_input_tokens_seen": 311964090, + "step": 14462, + "time_per_iteration": 2.664994478225708 + }, + { + "auxiliary_loss_clip": 0.01048664, + "auxiliary_loss_mlp": 0.01032908, + "balance_loss_clip": 1.02257371, + "balance_loss_mlp": 1.02188158, + "epoch": 0.8695626033368405, + "flos": 14027750133120.0, + "grad_norm": 1.762812076817835, + "language_loss": 0.65256381, + "learning_rate": 1.757610093744335e-07, + "loss": 0.67337954, + "num_input_tokens_seen": 311981460, + "step": 14463, + "time_per_iteration": 2.587820291519165 + }, + { + "auxiliary_loss_clip": 0.01048285, + "auxiliary_loss_mlp": 0.01033616, + "balance_loss_clip": 1.02834809, + "balance_loss_mlp": 1.02248812, + "epoch": 0.8696227265895085, + "flos": 16836862193280.0, + "grad_norm": 2.197012247700408, + "language_loss": 0.66638935, + "learning_rate": 1.7560139411104058e-07, + "loss": 0.68720841, + "num_input_tokens_seen": 312000115, + "step": 14464, + "time_per_iteration": 2.7828497886657715 + }, + { + "auxiliary_loss_clip": 0.01042446, + "auxiliary_loss_mlp": 0.01031491, + "balance_loss_clip": 1.02395999, + "balance_loss_mlp": 1.02104247, + "epoch": 0.8696828498421765, + "flos": 21799070271360.0, + "grad_norm": 2.118848444547487, + "language_loss": 0.62444293, + "learning_rate": 1.7544184802902607e-07, + "loss": 0.64518237, + "num_input_tokens_seen": 312020770, + "step": 14465, + "time_per_iteration": 2.680534601211548 + }, + { + "auxiliary_loss_clip": 0.0104704, + "auxiliary_loss_mlp": 0.01034514, + "balance_loss_clip": 1.02335358, + "balance_loss_mlp": 1.02515638, + "epoch": 0.8697429730948444, + "flos": 22894812610560.0, + "grad_norm": 1.56048004164274, + "language_loss": 0.84512019, + "learning_rate": 1.7528237113443934e-07, + "loss": 0.8659358, + "num_input_tokens_seen": 312041870, + "step": 14466, + "time_per_iteration": 4.4136061668396 + }, + { + "auxiliary_loss_clip": 0.01039013, + "auxiliary_loss_mlp": 0.01039054, + "balance_loss_clip": 1.0264076, + "balance_loss_mlp": 1.02631664, + "epoch": 0.8698030963475124, + "flos": 24717458482560.0, + "grad_norm": 2.6131532649774543, + "language_loss": 0.61884528, + "learning_rate": 1.7512296343332779e-07, + "loss": 0.63962597, + "num_input_tokens_seen": 312058210, + "step": 14467, + "time_per_iteration": 2.774785280227661 + }, + { + "auxiliary_loss_clip": 0.01057635, + "auxiliary_loss_mlp": 0.01025236, + "balance_loss_clip": 1.02334678, + "balance_loss_mlp": 1.01615834, + "epoch": 0.8698632196001803, + "flos": 28442221067520.0, + "grad_norm": 1.3112032179300213, + "language_loss": 0.68698484, + "learning_rate": 1.7496362493173655e-07, + "loss": 0.7078135, + "num_input_tokens_seen": 312082665, + "step": 14468, + "time_per_iteration": 2.6834065914154053 + }, + { + "auxiliary_loss_clip": 0.01038634, + "auxiliary_loss_mlp": 0.01026365, + "balance_loss_clip": 1.02378702, + "balance_loss_mlp": 1.0168581, + "epoch": 0.8699233428528483, + "flos": 27636457224960.0, + "grad_norm": 1.44607034888762, + "language_loss": 0.70701718, + "learning_rate": 1.7480435563570773e-07, + "loss": 0.72766721, + "num_input_tokens_seen": 312101960, + "step": 14469, + "time_per_iteration": 2.6914944648742676 + }, + { + "auxiliary_loss_clip": 0.01046037, + "auxiliary_loss_mlp": 0.01026296, + "balance_loss_clip": 1.02298653, + "balance_loss_mlp": 1.01677108, + "epoch": 0.8699834661055164, + "flos": 20045659864320.0, + "grad_norm": 1.916195919576415, + "language_loss": 0.84142244, + "learning_rate": 1.7464515555128024e-07, + "loss": 0.86214578, + "num_input_tokens_seen": 312117125, + "step": 14470, + "time_per_iteration": 2.586425304412842 + }, + { + "auxiliary_loss_clip": 0.01035658, + "auxiliary_loss_mlp": 0.0102949, + "balance_loss_clip": 1.02355433, + "balance_loss_mlp": 1.01843917, + "epoch": 0.8700435893581843, + "flos": 23732787974400.0, + "grad_norm": 1.6590269118775571, + "language_loss": 0.73032743, + "learning_rate": 1.7448602468449148e-07, + "loss": 0.75097895, + "num_input_tokens_seen": 312135775, + "step": 14471, + "time_per_iteration": 2.6512951850891113 + }, + { + "auxiliary_loss_clip": 0.01061332, + "auxiliary_loss_mlp": 0.01026612, + "balance_loss_clip": 1.02560258, + "balance_loss_mlp": 1.01699734, + "epoch": 0.8701037126108523, + "flos": 23548422441600.0, + "grad_norm": 1.3718024493779941, + "language_loss": 0.79006565, + "learning_rate": 1.7432696304137573e-07, + "loss": 0.81094503, + "num_input_tokens_seen": 312156070, + "step": 14472, + "time_per_iteration": 2.5956871509552 + }, + { + "auxiliary_loss_clip": 0.01052645, + "auxiliary_loss_mlp": 0.00747746, + "balance_loss_clip": 1.02553952, + "balance_loss_mlp": 1.00043726, + "epoch": 0.8701638358635202, + "flos": 18843442634880.0, + "grad_norm": 1.888540163217799, + "language_loss": 0.73160696, + "learning_rate": 1.741679706279644e-07, + "loss": 0.74961078, + "num_input_tokens_seen": 312174380, + "step": 14473, + "time_per_iteration": 2.6510519981384277 + }, + { + "auxiliary_loss_clip": 0.01064274, + "auxiliary_loss_mlp": 0.01025539, + "balance_loss_clip": 1.02630925, + "balance_loss_mlp": 1.01534092, + "epoch": 0.8702239591161882, + "flos": 27928339142400.0, + "grad_norm": 1.5301948865358224, + "language_loss": 0.72291458, + "learning_rate": 1.7400904745028644e-07, + "loss": 0.74381268, + "num_input_tokens_seen": 312195130, + "step": 14474, + "time_per_iteration": 2.731167793273926 + }, + { + "auxiliary_loss_clip": 0.01040846, + "auxiliary_loss_mlp": 0.01033138, + "balance_loss_clip": 1.02364278, + "balance_loss_mlp": 1.02198625, + "epoch": 0.8702840823688561, + "flos": 17233997938560.0, + "grad_norm": 2.907176545119769, + "language_loss": 0.67362475, + "learning_rate": 1.7385019351436925e-07, + "loss": 0.69436455, + "num_input_tokens_seen": 312212300, + "step": 14475, + "time_per_iteration": 2.7540223598480225 + }, + { + "auxiliary_loss_clip": 0.01059872, + "auxiliary_loss_mlp": 0.01024266, + "balance_loss_clip": 1.02240157, + "balance_loss_mlp": 1.01379395, + "epoch": 0.8703442056215241, + "flos": 19427565605760.0, + "grad_norm": 1.6570242378431068, + "language_loss": 0.77776301, + "learning_rate": 1.736914088262349e-07, + "loss": 0.79860437, + "num_input_tokens_seen": 312231735, + "step": 14476, + "time_per_iteration": 2.583355188369751 + }, + { + "auxiliary_loss_clip": 0.01043709, + "auxiliary_loss_mlp": 0.01032638, + "balance_loss_clip": 1.02265477, + "balance_loss_mlp": 1.02202845, + "epoch": 0.8704043288741921, + "flos": 22273845264000.0, + "grad_norm": 1.5519753164248373, + "language_loss": 0.71999896, + "learning_rate": 1.7353269339190525e-07, + "loss": 0.74076241, + "num_input_tokens_seen": 312253060, + "step": 14477, + "time_per_iteration": 2.726472854614258 + }, + { + "auxiliary_loss_clip": 0.01054023, + "auxiliary_loss_mlp": 0.01025216, + "balance_loss_clip": 1.02585816, + "balance_loss_mlp": 1.01500559, + "epoch": 0.8704644521268601, + "flos": 16648725732480.0, + "grad_norm": 1.7955274135659054, + "language_loss": 0.59505534, + "learning_rate": 1.7337404721739946e-07, + "loss": 0.61584771, + "num_input_tokens_seen": 312269460, + "step": 14478, + "time_per_iteration": 2.741302967071533 + }, + { + "auxiliary_loss_clip": 0.01051324, + "auxiliary_loss_mlp": 0.01025909, + "balance_loss_clip": 1.02688098, + "balance_loss_mlp": 1.01663446, + "epoch": 0.870524575379528, + "flos": 24280210224000.0, + "grad_norm": 1.6161316138251451, + "language_loss": 0.71765101, + "learning_rate": 1.732154703087323e-07, + "loss": 0.73842335, + "num_input_tokens_seen": 312289830, + "step": 14479, + "time_per_iteration": 2.7004966735839844 + }, + { + "auxiliary_loss_clip": 0.01039947, + "auxiliary_loss_mlp": 0.01029843, + "balance_loss_clip": 1.02401137, + "balance_loss_mlp": 1.01945972, + "epoch": 0.870584698632196, + "flos": 28768684803840.0, + "grad_norm": 1.577721215105259, + "language_loss": 0.71057749, + "learning_rate": 1.7305696267191805e-07, + "loss": 0.73127538, + "num_input_tokens_seen": 312311320, + "step": 14480, + "time_per_iteration": 5.989829778671265 + }, + { + "auxiliary_loss_clip": 0.01014277, + "auxiliary_loss_mlp": 0.01026736, + "balance_loss_clip": 1.02192545, + "balance_loss_mlp": 1.01653194, + "epoch": 0.8706448218848639, + "flos": 32449635774720.0, + "grad_norm": 1.5710872289927926, + "language_loss": 0.69833887, + "learning_rate": 1.728985243129666e-07, + "loss": 0.71874893, + "num_input_tokens_seen": 312332095, + "step": 14481, + "time_per_iteration": 2.860856533050537 + }, + { + "auxiliary_loss_clip": 0.01050706, + "auxiliary_loss_mlp": 0.01027746, + "balance_loss_clip": 1.02414739, + "balance_loss_mlp": 1.01770854, + "epoch": 0.8707049451375319, + "flos": 22748009725440.0, + "grad_norm": 1.7843408516387615, + "language_loss": 0.76866084, + "learning_rate": 1.7274015523788643e-07, + "loss": 0.7894454, + "num_input_tokens_seen": 312351225, + "step": 14482, + "time_per_iteration": 2.636859893798828 + }, + { + "auxiliary_loss_clip": 0.01039458, + "auxiliary_loss_mlp": 0.01028859, + "balance_loss_clip": 1.02454507, + "balance_loss_mlp": 1.01912546, + "epoch": 0.8707650683902, + "flos": 15851976203520.0, + "grad_norm": 1.777494750224651, + "language_loss": 0.76574814, + "learning_rate": 1.7258185545268234e-07, + "loss": 0.78643131, + "num_input_tokens_seen": 312369730, + "step": 14483, + "time_per_iteration": 2.652923107147217 + }, + { + "auxiliary_loss_clip": 0.01053833, + "auxiliary_loss_mlp": 0.01033301, + "balance_loss_clip": 1.0248301, + "balance_loss_mlp": 1.02166009, + "epoch": 0.8708251916428679, + "flos": 16468131127680.0, + "grad_norm": 2.3058879176035045, + "language_loss": 0.62158871, + "learning_rate": 1.7242362496335749e-07, + "loss": 0.64246005, + "num_input_tokens_seen": 312386780, + "step": 14484, + "time_per_iteration": 2.6056289672851562 + }, + { + "auxiliary_loss_clip": 0.01061462, + "auxiliary_loss_mlp": 0.01028294, + "balance_loss_clip": 1.02535582, + "balance_loss_mlp": 1.01831007, + "epoch": 0.8708853148955359, + "flos": 15377847655680.0, + "grad_norm": 1.7595609919595812, + "language_loss": 0.68124664, + "learning_rate": 1.7226546377591222e-07, + "loss": 0.70214421, + "num_input_tokens_seen": 312404875, + "step": 14485, + "time_per_iteration": 2.593522310256958 + }, + { + "auxiliary_loss_clip": 0.01003323, + "auxiliary_loss_mlp": 0.00747657, + "balance_loss_clip": 1.02016044, + "balance_loss_mlp": 1.00042963, + "epoch": 0.8709454381482038, + "flos": 30551325903360.0, + "grad_norm": 1.9383214519282947, + "language_loss": 0.6269905, + "learning_rate": 1.7210737189634373e-07, + "loss": 0.64450026, + "num_input_tokens_seen": 312425280, + "step": 14486, + "time_per_iteration": 2.8254504203796387 + }, + { + "auxiliary_loss_clip": 0.01064861, + "auxiliary_loss_mlp": 0.0103023, + "balance_loss_clip": 1.02432179, + "balance_loss_mlp": 1.01861858, + "epoch": 0.8710055614008718, + "flos": 22601422321920.0, + "grad_norm": 2.570231719382706, + "language_loss": 0.61625683, + "learning_rate": 1.7194934933064653e-07, + "loss": 0.63720775, + "num_input_tokens_seen": 312443835, + "step": 14487, + "time_per_iteration": 2.6040098667144775 + }, + { + "auxiliary_loss_clip": 0.01041132, + "auxiliary_loss_mlp": 0.00747462, + "balance_loss_clip": 1.02508736, + "balance_loss_mlp": 1.00038588, + "epoch": 0.8710656846535397, + "flos": 18443146492800.0, + "grad_norm": 1.9341426065361438, + "language_loss": 0.67629635, + "learning_rate": 1.7179139608481318e-07, + "loss": 0.69418222, + "num_input_tokens_seen": 312460830, + "step": 14488, + "time_per_iteration": 2.649214506149292 + }, + { + "auxiliary_loss_clip": 0.0104522, + "auxiliary_loss_mlp": 0.00747419, + "balance_loss_clip": 1.02651787, + "balance_loss_mlp": 1.0003916, + "epoch": 0.8711258079062077, + "flos": 16503862181760.0, + "grad_norm": 3.11723500342469, + "language_loss": 0.86038107, + "learning_rate": 1.716335121648338e-07, + "loss": 0.87830746, + "num_input_tokens_seen": 312477575, + "step": 14489, + "time_per_iteration": 2.6468896865844727 + }, + { + "auxiliary_loss_clip": 0.01055259, + "auxiliary_loss_mlp": 0.0103, + "balance_loss_clip": 1.02556348, + "balance_loss_mlp": 1.01875257, + "epoch": 0.8711859311588757, + "flos": 15663336952320.0, + "grad_norm": 2.2844663799780482, + "language_loss": 0.75661421, + "learning_rate": 1.7147569757669445e-07, + "loss": 0.77746671, + "num_input_tokens_seen": 312492140, + "step": 14490, + "time_per_iteration": 2.576594591140747 + }, + { + "auxiliary_loss_clip": 0.01052839, + "auxiliary_loss_mlp": 0.01025531, + "balance_loss_clip": 1.02489591, + "balance_loss_mlp": 1.01474285, + "epoch": 0.8712460544115437, + "flos": 15557544420480.0, + "grad_norm": 2.5195641331744367, + "language_loss": 0.76587963, + "learning_rate": 1.7131795232638012e-07, + "loss": 0.78666329, + "num_input_tokens_seen": 312508400, + "step": 14491, + "time_per_iteration": 2.654024600982666 + }, + { + "auxiliary_loss_clip": 0.01036684, + "auxiliary_loss_mlp": 0.01024132, + "balance_loss_clip": 1.02987123, + "balance_loss_mlp": 1.01448846, + "epoch": 0.8713061776642116, + "flos": 16763568491520.0, + "grad_norm": 1.8063553456031072, + "language_loss": 0.67360538, + "learning_rate": 1.711602764198723e-07, + "loss": 0.69421351, + "num_input_tokens_seen": 312525915, + "step": 14492, + "time_per_iteration": 2.8133997917175293 + }, + { + "auxiliary_loss_clip": 0.01049364, + "auxiliary_loss_mlp": 0.0102617, + "balance_loss_clip": 1.02459741, + "balance_loss_mlp": 1.01692486, + "epoch": 0.8713663009168796, + "flos": 24279887001600.0, + "grad_norm": 1.8972076208783386, + "language_loss": 0.69733292, + "learning_rate": 1.7100266986314992e-07, + "loss": 0.71808827, + "num_input_tokens_seen": 312544735, + "step": 14493, + "time_per_iteration": 2.712247133255005 + }, + { + "auxiliary_loss_clip": 0.01065048, + "auxiliary_loss_mlp": 0.01030955, + "balance_loss_clip": 1.02709174, + "balance_loss_mlp": 1.0197736, + "epoch": 0.8714264241695475, + "flos": 23795594904960.0, + "grad_norm": 2.877690030501303, + "language_loss": 0.89456666, + "learning_rate": 1.7084513266218936e-07, + "loss": 0.91552663, + "num_input_tokens_seen": 312557910, + "step": 14494, + "time_per_iteration": 2.5365819931030273 + }, + { + "auxiliary_loss_clip": 0.01036191, + "auxiliary_loss_mlp": 0.01027352, + "balance_loss_clip": 1.02890122, + "balance_loss_mlp": 1.01770222, + "epoch": 0.8714865474222155, + "flos": 37997942071680.0, + "grad_norm": 2.137354143405612, + "language_loss": 0.59132099, + "learning_rate": 1.7068766482296514e-07, + "loss": 0.61195642, + "num_input_tokens_seen": 312580360, + "step": 14495, + "time_per_iteration": 2.8598945140838623 + }, + { + "auxiliary_loss_clip": 0.0103182, + "auxiliary_loss_mlp": 0.01032111, + "balance_loss_clip": 1.02391613, + "balance_loss_mlp": 1.02197242, + "epoch": 0.8715466706748836, + "flos": 22455696844800.0, + "grad_norm": 1.917593864502431, + "language_loss": 0.81128514, + "learning_rate": 1.7053026635144762e-07, + "loss": 0.83192438, + "num_input_tokens_seen": 312597550, + "step": 14496, + "time_per_iteration": 2.6662070751190186 + }, + { + "auxiliary_loss_clip": 0.01033997, + "auxiliary_loss_mlp": 0.0103138, + "balance_loss_clip": 1.02421522, + "balance_loss_mlp": 1.01984656, + "epoch": 0.8716067939275515, + "flos": 21215126868480.0, + "grad_norm": 6.372674166648907, + "language_loss": 0.78854197, + "learning_rate": 1.7037293725360624e-07, + "loss": 0.80919576, + "num_input_tokens_seen": 312616435, + "step": 14497, + "time_per_iteration": 2.8010215759277344 + }, + { + "auxiliary_loss_clip": 0.01063553, + "auxiliary_loss_mlp": 0.01030555, + "balance_loss_clip": 1.02552247, + "balance_loss_mlp": 1.01910472, + "epoch": 0.8716669171802195, + "flos": 22997732054400.0, + "grad_norm": 2.000748881638305, + "language_loss": 0.67151904, + "learning_rate": 1.70215677535406e-07, + "loss": 0.69246006, + "num_input_tokens_seen": 312632770, + "step": 14498, + "time_per_iteration": 2.603503942489624 + }, + { + "auxiliary_loss_clip": 0.0102773, + "auxiliary_loss_mlp": 0.01025184, + "balance_loss_clip": 1.02160156, + "balance_loss_mlp": 1.01520634, + "epoch": 0.8717270404328874, + "flos": 29784058462080.0, + "grad_norm": 1.5921754017125749, + "language_loss": 0.56903374, + "learning_rate": 1.700584872028108e-07, + "loss": 0.58956283, + "num_input_tokens_seen": 312651900, + "step": 14499, + "time_per_iteration": 2.7620813846588135 + }, + { + "auxiliary_loss_clip": 0.01025266, + "auxiliary_loss_mlp": 0.0103093, + "balance_loss_clip": 1.0232172, + "balance_loss_mlp": 1.01979613, + "epoch": 0.8717871636855554, + "flos": 22018125363840.0, + "grad_norm": 2.1069274952621724, + "language_loss": 0.79851246, + "learning_rate": 1.6990136626178097e-07, + "loss": 0.81907439, + "num_input_tokens_seen": 312671380, + "step": 14500, + "time_per_iteration": 4.57778787612915 + }, + { + "auxiliary_loss_clip": 0.01055284, + "auxiliary_loss_mlp": 0.0102864, + "balance_loss_clip": 1.02829456, + "balance_loss_mlp": 1.01826918, + "epoch": 0.8718472869382233, + "flos": 16654256426880.0, + "grad_norm": 2.969629977092775, + "language_loss": 0.72911555, + "learning_rate": 1.6974431471827466e-07, + "loss": 0.74995482, + "num_input_tokens_seen": 312689215, + "step": 14501, + "time_per_iteration": 2.6557350158691406 + }, + { + "auxiliary_loss_clip": 0.0102379, + "auxiliary_loss_mlp": 0.01030285, + "balance_loss_clip": 1.02256441, + "balance_loss_mlp": 1.01835251, + "epoch": 0.8719074101908914, + "flos": 19495328613120.0, + "grad_norm": 1.8756508683132895, + "language_loss": 0.64712095, + "learning_rate": 1.695873325782482e-07, + "loss": 0.66766173, + "num_input_tokens_seen": 312706400, + "step": 14502, + "time_per_iteration": 2.687039852142334 + }, + { + "auxiliary_loss_clip": 0.0103677, + "auxiliary_loss_mlp": 0.01035901, + "balance_loss_clip": 1.02322435, + "balance_loss_mlp": 1.02413464, + "epoch": 0.8719675334435593, + "flos": 33070890430080.0, + "grad_norm": 1.7686151458086878, + "language_loss": 0.69073981, + "learning_rate": 1.6943041984765262e-07, + "loss": 0.71146655, + "num_input_tokens_seen": 312727985, + "step": 14503, + "time_per_iteration": 2.848447322845459 + }, + { + "auxiliary_loss_clip": 0.01044398, + "auxiliary_loss_mlp": 0.01028144, + "balance_loss_clip": 1.02663684, + "balance_loss_mlp": 1.01755214, + "epoch": 0.8720276566962273, + "flos": 13626268842240.0, + "grad_norm": 3.43539180397454, + "language_loss": 0.69469744, + "learning_rate": 1.6927357653243912e-07, + "loss": 0.71542287, + "num_input_tokens_seen": 312745025, + "step": 14504, + "time_per_iteration": 2.767183303833008 + }, + { + "auxiliary_loss_clip": 0.01054086, + "auxiliary_loss_mlp": 0.00747723, + "balance_loss_clip": 1.02636909, + "balance_loss_mlp": 1.00038803, + "epoch": 0.8720877799488952, + "flos": 23514163845120.0, + "grad_norm": 1.8380083366955096, + "language_loss": 0.70242858, + "learning_rate": 1.691168026385552e-07, + "loss": 0.72044671, + "num_input_tokens_seen": 312764170, + "step": 14505, + "time_per_iteration": 2.6408088207244873 + }, + { + "auxiliary_loss_clip": 0.0104107, + "auxiliary_loss_mlp": 0.01027245, + "balance_loss_clip": 1.02419114, + "balance_loss_mlp": 1.01785767, + "epoch": 0.8721479032015632, + "flos": 20814148368000.0, + "grad_norm": 1.5094435622933826, + "language_loss": 0.78247994, + "learning_rate": 1.6896009817194545e-07, + "loss": 0.80316311, + "num_input_tokens_seen": 312783830, + "step": 14506, + "time_per_iteration": 2.7830207347869873 + }, + { + "auxiliary_loss_clip": 0.01042712, + "auxiliary_loss_mlp": 0.0102705, + "balance_loss_clip": 1.02401876, + "balance_loss_mlp": 1.01625538, + "epoch": 0.8722080264542311, + "flos": 19463655795840.0, + "grad_norm": 2.454677409131219, + "language_loss": 0.74052513, + "learning_rate": 1.6880346313855221e-07, + "loss": 0.76122272, + "num_input_tokens_seen": 312802015, + "step": 14507, + "time_per_iteration": 2.6567232608795166 + }, + { + "auxiliary_loss_clip": 0.01003164, + "auxiliary_loss_mlp": 0.01039743, + "balance_loss_clip": 1.02096605, + "balance_loss_mlp": 1.02671933, + "epoch": 0.8722681497068991, + "flos": 21761866759680.0, + "grad_norm": 2.0438124544538927, + "language_loss": 0.72132254, + "learning_rate": 1.686468975443156e-07, + "loss": 0.74175155, + "num_input_tokens_seen": 312820650, + "step": 14508, + "time_per_iteration": 2.8457980155944824 + }, + { + "auxiliary_loss_clip": 0.01046524, + "auxiliary_loss_mlp": 0.01034713, + "balance_loss_clip": 1.0270071, + "balance_loss_mlp": 1.02351928, + "epoch": 0.8723282729595672, + "flos": 28877134942080.0, + "grad_norm": 1.6778322026279775, + "language_loss": 0.68584085, + "learning_rate": 1.6849040139517202e-07, + "loss": 0.70665324, + "num_input_tokens_seen": 312841310, + "step": 14509, + "time_per_iteration": 2.736628770828247 + }, + { + "auxiliary_loss_clip": 0.01043004, + "auxiliary_loss_mlp": 0.01030921, + "balance_loss_clip": 1.02487803, + "balance_loss_mlp": 1.02064478, + "epoch": 0.8723883962122351, + "flos": 26469145036800.0, + "grad_norm": 2.5522488395531036, + "language_loss": 0.58603686, + "learning_rate": 1.683339746970558e-07, + "loss": 0.60677612, + "num_input_tokens_seen": 312862100, + "step": 14510, + "time_per_iteration": 2.7041361331939697 + }, + { + "auxiliary_loss_clip": 0.01065858, + "auxiliary_loss_mlp": 0.01029145, + "balance_loss_clip": 1.02561235, + "balance_loss_mlp": 1.01742721, + "epoch": 0.8724485194649031, + "flos": 20521476351360.0, + "grad_norm": 2.76872285527669, + "language_loss": 0.67207551, + "learning_rate": 1.6817761745589865e-07, + "loss": 0.69302547, + "num_input_tokens_seen": 312880220, + "step": 14511, + "time_per_iteration": 2.5366334915161133 + }, + { + "auxiliary_loss_clip": 0.01015255, + "auxiliary_loss_mlp": 0.01033527, + "balance_loss_clip": 1.02541661, + "balance_loss_mlp": 1.02236903, + "epoch": 0.872508642717571, + "flos": 24353360271360.0, + "grad_norm": 2.168125477581147, + "language_loss": 0.81674987, + "learning_rate": 1.6802132967763027e-07, + "loss": 0.83723772, + "num_input_tokens_seen": 312900765, + "step": 14512, + "time_per_iteration": 2.8470101356506348 + }, + { + "auxiliary_loss_clip": 0.00989024, + "auxiliary_loss_mlp": 0.01000446, + "balance_loss_clip": 1.00292087, + "balance_loss_mlp": 0.99955195, + "epoch": 0.872568765970239, + "flos": 61410012485760.0, + "grad_norm": 0.7931328467017336, + "language_loss": 0.58638859, + "learning_rate": 1.6786511136817617e-07, + "loss": 0.60628331, + "num_input_tokens_seen": 312955840, + "step": 14513, + "time_per_iteration": 3.0879805088043213 + }, + { + "auxiliary_loss_clip": 0.01053195, + "auxiliary_loss_mlp": 0.01026085, + "balance_loss_clip": 1.02583313, + "balance_loss_mlp": 1.01564193, + "epoch": 0.8726288892229069, + "flos": 22598046443520.0, + "grad_norm": 1.757197605211471, + "language_loss": 0.76883811, + "learning_rate": 1.6770896253346112e-07, + "loss": 0.78963089, + "num_input_tokens_seen": 312973565, + "step": 14514, + "time_per_iteration": 4.296548843383789 + }, + { + "auxiliary_loss_clip": 0.01056184, + "auxiliary_loss_mlp": 0.01026151, + "balance_loss_clip": 1.02655494, + "balance_loss_mlp": 1.01602411, + "epoch": 0.872689012475575, + "flos": 25885201633920.0, + "grad_norm": 2.0022085225282207, + "language_loss": 0.65365833, + "learning_rate": 1.675528831794055e-07, + "loss": 0.67448163, + "num_input_tokens_seen": 312994660, + "step": 14515, + "time_per_iteration": 2.6585326194763184 + }, + { + "auxiliary_loss_clip": 0.01048313, + "auxiliary_loss_mlp": 0.01032584, + "balance_loss_clip": 1.02373588, + "balance_loss_mlp": 1.02125311, + "epoch": 0.8727491357282429, + "flos": 21506721477120.0, + "grad_norm": 2.034855179444793, + "language_loss": 0.78992617, + "learning_rate": 1.6739687331192842e-07, + "loss": 0.81073511, + "num_input_tokens_seen": 313009860, + "step": 14516, + "time_per_iteration": 2.6297717094421387 + }, + { + "auxiliary_loss_clip": 0.01064393, + "auxiliary_loss_mlp": 0.01026436, + "balance_loss_clip": 1.02635503, + "balance_loss_mlp": 1.01530743, + "epoch": 0.8728092589809109, + "flos": 19207504932480.0, + "grad_norm": 2.3674714542920663, + "language_loss": 0.71897942, + "learning_rate": 1.672409329369453e-07, + "loss": 0.73988771, + "num_input_tokens_seen": 313027025, + "step": 14517, + "time_per_iteration": 2.543414831161499 + }, + { + "auxiliary_loss_clip": 0.01027854, + "auxiliary_loss_mlp": 0.01022915, + "balance_loss_clip": 1.02232003, + "balance_loss_mlp": 1.01329494, + "epoch": 0.8728693822335788, + "flos": 20595308757120.0, + "grad_norm": 3.065955029010592, + "language_loss": 0.72223926, + "learning_rate": 1.6708506206036966e-07, + "loss": 0.74274695, + "num_input_tokens_seen": 313046830, + "step": 14518, + "time_per_iteration": 2.6949241161346436 + }, + { + "auxiliary_loss_clip": 0.01033924, + "auxiliary_loss_mlp": 0.01033682, + "balance_loss_clip": 1.02250171, + "balance_loss_mlp": 1.0227145, + "epoch": 0.8729295054862468, + "flos": 21728613744000.0, + "grad_norm": 2.178123597468187, + "language_loss": 0.7427327, + "learning_rate": 1.6692926068811275e-07, + "loss": 0.76340878, + "num_input_tokens_seen": 313067715, + "step": 14519, + "time_per_iteration": 2.7758917808532715 + }, + { + "auxiliary_loss_clip": 0.01053108, + "auxiliary_loss_mlp": 0.01028503, + "balance_loss_clip": 1.02427244, + "balance_loss_mlp": 1.01673067, + "epoch": 0.8729896287389147, + "flos": 17673436926720.0, + "grad_norm": 2.4549868869873452, + "language_loss": 0.76802456, + "learning_rate": 1.6677352882608142e-07, + "loss": 0.78884071, + "num_input_tokens_seen": 313082305, + "step": 14520, + "time_per_iteration": 2.6127378940582275 + }, + { + "auxiliary_loss_clip": 0.01043026, + "auxiliary_loss_mlp": 0.01033158, + "balance_loss_clip": 1.02504206, + "balance_loss_mlp": 1.0217917, + "epoch": 0.8730497519915827, + "flos": 24571804832640.0, + "grad_norm": 1.6638625370952396, + "language_loss": 0.82069993, + "learning_rate": 1.666178664801816e-07, + "loss": 0.84146178, + "num_input_tokens_seen": 313101190, + "step": 14521, + "time_per_iteration": 2.7272708415985107 + }, + { + "auxiliary_loss_clip": 0.01055259, + "auxiliary_loss_mlp": 0.01031225, + "balance_loss_clip": 1.02710986, + "balance_loss_mlp": 1.02004957, + "epoch": 0.8731098752442508, + "flos": 13443734903040.0, + "grad_norm": 1.9887530583750446, + "language_loss": 0.76082373, + "learning_rate": 1.6646227365631616e-07, + "loss": 0.78168851, + "num_input_tokens_seen": 313118965, + "step": 14522, + "time_per_iteration": 2.6375091075897217 + }, + { + "auxiliary_loss_clip": 0.01046782, + "auxiliary_loss_mlp": 0.00747499, + "balance_loss_clip": 1.02186739, + "balance_loss_mlp": 1.00036931, + "epoch": 0.8731699984969187, + "flos": 23474446381440.0, + "grad_norm": 1.741544244780014, + "language_loss": 0.75381088, + "learning_rate": 1.66306750360385e-07, + "loss": 0.77175373, + "num_input_tokens_seen": 313139280, + "step": 14523, + "time_per_iteration": 2.6641666889190674 + }, + { + "auxiliary_loss_clip": 0.01049867, + "auxiliary_loss_mlp": 0.01029057, + "balance_loss_clip": 1.02342105, + "balance_loss_mlp": 1.01896596, + "epoch": 0.8732301217495867, + "flos": 17712651600000.0, + "grad_norm": 2.2858340887120976, + "language_loss": 0.78565431, + "learning_rate": 1.6615129659828542e-07, + "loss": 0.80644357, + "num_input_tokens_seen": 313156655, + "step": 14524, + "time_per_iteration": 2.698004961013794 + }, + { + "auxiliary_loss_clip": 0.01043753, + "auxiliary_loss_mlp": 0.0102837, + "balance_loss_clip": 1.02677608, + "balance_loss_mlp": 1.01878524, + "epoch": 0.8732902450022546, + "flos": 22054359208320.0, + "grad_norm": 2.135874839137038, + "language_loss": 0.7717154, + "learning_rate": 1.6599591237591272e-07, + "loss": 0.7924366, + "num_input_tokens_seen": 313174050, + "step": 14525, + "time_per_iteration": 2.7817251682281494 + }, + { + "auxiliary_loss_clip": 0.0098294, + "auxiliary_loss_mlp": 0.01031207, + "balance_loss_clip": 1.02398252, + "balance_loss_mlp": 1.02046013, + "epoch": 0.8733503682549226, + "flos": 22272983337600.0, + "grad_norm": 1.6257154223228811, + "language_loss": 0.68718255, + "learning_rate": 1.6584059769915902e-07, + "loss": 0.70732397, + "num_input_tokens_seen": 313192765, + "step": 14526, + "time_per_iteration": 3.104628801345825 + }, + { + "auxiliary_loss_clip": 0.01027315, + "auxiliary_loss_mlp": 0.01033971, + "balance_loss_clip": 1.02687407, + "balance_loss_mlp": 1.0222888, + "epoch": 0.8734104915075905, + "flos": 23364344217600.0, + "grad_norm": 2.0959949600230896, + "language_loss": 0.61175632, + "learning_rate": 1.6568535257391326e-07, + "loss": 0.63236916, + "num_input_tokens_seen": 313210925, + "step": 14527, + "time_per_iteration": 5.480044603347778 + }, + { + "auxiliary_loss_clip": 0.01060227, + "auxiliary_loss_mlp": 0.01033499, + "balance_loss_clip": 1.02881217, + "balance_loss_mlp": 1.02071333, + "epoch": 0.8734706147602586, + "flos": 17712292464000.0, + "grad_norm": 1.84585148915745, + "language_loss": 0.65588623, + "learning_rate": 1.6553017700606265e-07, + "loss": 0.6768235, + "num_input_tokens_seen": 313228250, + "step": 14528, + "time_per_iteration": 4.3904173374176025 + }, + { + "auxiliary_loss_clip": 0.01033764, + "auxiliary_loss_mlp": 0.01027524, + "balance_loss_clip": 1.02597225, + "balance_loss_mlp": 1.0175283, + "epoch": 0.8735307380129265, + "flos": 22049367217920.0, + "grad_norm": 22.467117049023585, + "language_loss": 0.89641047, + "learning_rate": 1.6537507100149205e-07, + "loss": 0.91702342, + "num_input_tokens_seen": 313247880, + "step": 14529, + "time_per_iteration": 2.749845266342163 + }, + { + "auxiliary_loss_clip": 0.01038423, + "auxiliary_loss_mlp": 0.01022048, + "balance_loss_clip": 1.0229125, + "balance_loss_mlp": 1.01174235, + "epoch": 0.8735908612655945, + "flos": 25338425829120.0, + "grad_norm": 3.5443900270273834, + "language_loss": 0.85013789, + "learning_rate": 1.6522003456608258e-07, + "loss": 0.87074256, + "num_input_tokens_seen": 313266790, + "step": 14530, + "time_per_iteration": 2.6935627460479736 + }, + { + "auxiliary_loss_clip": 0.01036985, + "auxiliary_loss_mlp": 0.01028229, + "balance_loss_clip": 1.0229032, + "balance_loss_mlp": 1.01865613, + "epoch": 0.8736509845182624, + "flos": 21540908246400.0, + "grad_norm": 1.9194731944499712, + "language_loss": 0.74267852, + "learning_rate": 1.650650677057128e-07, + "loss": 0.7633307, + "num_input_tokens_seen": 313286805, + "step": 14531, + "time_per_iteration": 2.672373056411743 + }, + { + "auxiliary_loss_clip": 0.01048575, + "auxiliary_loss_mlp": 0.0102479, + "balance_loss_clip": 1.02278423, + "balance_loss_mlp": 1.01468062, + "epoch": 0.8737111077709304, + "flos": 22017227523840.0, + "grad_norm": 1.9415632535165521, + "language_loss": 0.61619437, + "learning_rate": 1.6491017042625966e-07, + "loss": 0.63692802, + "num_input_tokens_seen": 313305415, + "step": 14532, + "time_per_iteration": 2.63456654548645 + }, + { + "auxiliary_loss_clip": 0.00997705, + "auxiliary_loss_mlp": 0.01000421, + "balance_loss_clip": 1.0021832, + "balance_loss_mlp": 0.99947929, + "epoch": 0.8737712310235983, + "flos": 70066315912320.0, + "grad_norm": 0.825126483404969, + "language_loss": 0.5869683, + "learning_rate": 1.6475534273359704e-07, + "loss": 0.60694957, + "num_input_tokens_seen": 313369940, + "step": 14533, + "time_per_iteration": 3.326561450958252 + }, + { + "auxiliary_loss_clip": 0.01032684, + "auxiliary_loss_mlp": 0.01028392, + "balance_loss_clip": 1.02271414, + "balance_loss_mlp": 1.01805663, + "epoch": 0.8738313542762663, + "flos": 28658331244800.0, + "grad_norm": 1.498914413479442, + "language_loss": 0.76876169, + "learning_rate": 1.646005846335954e-07, + "loss": 0.78937244, + "num_input_tokens_seen": 313390965, + "step": 14534, + "time_per_iteration": 2.6939218044281006 + }, + { + "auxiliary_loss_clip": 0.01031833, + "auxiliary_loss_mlp": 0.01030756, + "balance_loss_clip": 1.02087545, + "balance_loss_mlp": 1.02015853, + "epoch": 0.8738914775289344, + "flos": 22346384780160.0, + "grad_norm": 1.6855140387118415, + "language_loss": 0.74941158, + "learning_rate": 1.6444589613212357e-07, + "loss": 0.77003741, + "num_input_tokens_seen": 313409680, + "step": 14535, + "time_per_iteration": 2.6446375846862793 + }, + { + "auxiliary_loss_clip": 0.01060767, + "auxiliary_loss_mlp": 0.01026294, + "balance_loss_clip": 1.0237546, + "balance_loss_mlp": 1.0156666, + "epoch": 0.8739516007816023, + "flos": 31759648444800.0, + "grad_norm": 1.8004258704427605, + "language_loss": 0.74869156, + "learning_rate": 1.64291277235048e-07, + "loss": 0.76956224, + "num_input_tokens_seen": 313431335, + "step": 14536, + "time_per_iteration": 2.6280508041381836 + }, + { + "auxiliary_loss_clip": 0.01041556, + "auxiliary_loss_mlp": 0.01027784, + "balance_loss_clip": 1.0237968, + "balance_loss_mlp": 1.0181582, + "epoch": 0.8740117240342703, + "flos": 21211715076480.0, + "grad_norm": 2.0000132685661236, + "language_loss": 0.64068818, + "learning_rate": 1.641367279482304e-07, + "loss": 0.66138154, + "num_input_tokens_seen": 313449225, + "step": 14537, + "time_per_iteration": 2.652806520462036 + }, + { + "auxiliary_loss_clip": 0.01047098, + "auxiliary_loss_mlp": 0.01024953, + "balance_loss_clip": 1.02265608, + "balance_loss_mlp": 1.01399767, + "epoch": 0.8740718472869382, + "flos": 25186666867200.0, + "grad_norm": 1.6569358926617535, + "language_loss": 0.57494938, + "learning_rate": 1.6398224827753216e-07, + "loss": 0.59566993, + "num_input_tokens_seen": 313467715, + "step": 14538, + "time_per_iteration": 2.6373794078826904 + }, + { + "auxiliary_loss_clip": 0.01050136, + "auxiliary_loss_mlp": 0.01026853, + "balance_loss_clip": 1.02508652, + "balance_loss_mlp": 1.01665485, + "epoch": 0.8741319705396062, + "flos": 19500931134720.0, + "grad_norm": 2.0566381918624312, + "language_loss": 0.68701947, + "learning_rate": 1.6382783822881142e-07, + "loss": 0.70778942, + "num_input_tokens_seen": 313486805, + "step": 14539, + "time_per_iteration": 2.7352921962738037 + }, + { + "auxiliary_loss_clip": 0.01052504, + "auxiliary_loss_mlp": 0.01028333, + "balance_loss_clip": 1.02338159, + "balance_loss_mlp": 1.01750851, + "epoch": 0.8741920937922741, + "flos": 14100900180480.0, + "grad_norm": 2.2921265142047034, + "language_loss": 0.74495685, + "learning_rate": 1.6367349780792262e-07, + "loss": 0.76576531, + "num_input_tokens_seen": 313504880, + "step": 14540, + "time_per_iteration": 2.75109601020813 + }, + { + "auxiliary_loss_clip": 0.01034981, + "auxiliary_loss_mlp": 0.01035201, + "balance_loss_clip": 1.02225089, + "balance_loss_mlp": 1.02330947, + "epoch": 0.8742522170449422, + "flos": 27709858667520.0, + "grad_norm": 2.189195095189585, + "language_loss": 0.79040778, + "learning_rate": 1.635192270207193e-07, + "loss": 0.81110954, + "num_input_tokens_seen": 313524995, + "step": 14541, + "time_per_iteration": 2.71650767326355 + }, + { + "auxiliary_loss_clip": 0.01019033, + "auxiliary_loss_mlp": 0.01030477, + "balance_loss_clip": 1.02179027, + "balance_loss_mlp": 1.01789463, + "epoch": 0.8743123402976101, + "flos": 21142587352320.0, + "grad_norm": 2.125336770573566, + "language_loss": 0.66672212, + "learning_rate": 1.6336502587305035e-07, + "loss": 0.68721718, + "num_input_tokens_seen": 313541740, + "step": 14542, + "time_per_iteration": 2.754347085952759 + }, + { + "auxiliary_loss_clip": 0.01006348, + "auxiliary_loss_mlp": 0.01000651, + "balance_loss_clip": 1.00131512, + "balance_loss_mlp": 0.99972755, + "epoch": 0.8743724635502781, + "flos": 60870024351360.0, + "grad_norm": 0.7936008654805893, + "language_loss": 0.54457939, + "learning_rate": 1.632108943707642e-07, + "loss": 0.56464934, + "num_input_tokens_seen": 313593445, + "step": 14543, + "time_per_iteration": 3.0012309551239014 + }, + { + "auxiliary_loss_clip": 0.01035906, + "auxiliary_loss_mlp": 0.01032437, + "balance_loss_clip": 1.02562237, + "balance_loss_mlp": 1.02117765, + "epoch": 0.874432586802946, + "flos": 28109292883200.0, + "grad_norm": 2.337174371322107, + "language_loss": 0.6980989, + "learning_rate": 1.6305683251970458e-07, + "loss": 0.71878231, + "num_input_tokens_seen": 313615640, + "step": 14544, + "time_per_iteration": 2.735496759414673 + }, + { + "auxiliary_loss_clip": 0.01020837, + "auxiliary_loss_mlp": 0.01024674, + "balance_loss_clip": 1.02531648, + "balance_loss_mlp": 1.01536989, + "epoch": 0.874492710055614, + "flos": 23550289948800.0, + "grad_norm": 1.577037511632568, + "language_loss": 0.75912166, + "learning_rate": 1.62902840325714e-07, + "loss": 0.77957672, + "num_input_tokens_seen": 313635550, + "step": 14545, + "time_per_iteration": 2.8405399322509766 + }, + { + "auxiliary_loss_clip": 0.01048018, + "auxiliary_loss_mlp": 0.00747814, + "balance_loss_clip": 1.02268791, + "balance_loss_mlp": 1.00046253, + "epoch": 0.8745528333082819, + "flos": 40915647924480.0, + "grad_norm": 1.625688651250409, + "language_loss": 0.66074705, + "learning_rate": 1.6274891779463217e-07, + "loss": 0.67870539, + "num_input_tokens_seen": 313659275, + "step": 14546, + "time_per_iteration": 2.9230105876922607 + }, + { + "auxiliary_loss_clip": 0.01061283, + "auxiliary_loss_mlp": 0.01028578, + "balance_loss_clip": 1.02465606, + "balance_loss_mlp": 1.01824236, + "epoch": 0.87461295656095, + "flos": 23622901292160.0, + "grad_norm": 1.5688546576925262, + "language_loss": 0.72798681, + "learning_rate": 1.6259506493229536e-07, + "loss": 0.74888539, + "num_input_tokens_seen": 313680595, + "step": 14547, + "time_per_iteration": 4.754034757614136 + }, + { + "auxiliary_loss_clip": 0.01067502, + "auxiliary_loss_mlp": 0.01035221, + "balance_loss_clip": 1.02604449, + "balance_loss_mlp": 1.02346063, + "epoch": 0.874673079813618, + "flos": 38794116983040.0, + "grad_norm": 2.248576388420027, + "language_loss": 0.69415116, + "learning_rate": 1.6244128174453752e-07, + "loss": 0.71517837, + "num_input_tokens_seen": 313699730, + "step": 14548, + "time_per_iteration": 2.8047983646392822 + }, + { + "auxiliary_loss_clip": 0.01044381, + "auxiliary_loss_mlp": 0.01030891, + "balance_loss_clip": 1.02507746, + "balance_loss_mlp": 1.02017426, + "epoch": 0.8747332030662859, + "flos": 23696159080320.0, + "grad_norm": 2.059023751542488, + "language_loss": 0.70557868, + "learning_rate": 1.6228756823719093e-07, + "loss": 0.72633141, + "num_input_tokens_seen": 313720090, + "step": 14549, + "time_per_iteration": 2.83951473236084 + }, + { + "auxiliary_loss_clip": 0.01049597, + "auxiliary_loss_mlp": 0.00747756, + "balance_loss_clip": 1.02325344, + "balance_loss_mlp": 1.00041997, + "epoch": 0.8747933263189539, + "flos": 24462456854400.0, + "grad_norm": 2.198338388746011, + "language_loss": 0.83959377, + "learning_rate": 1.6213392441608352e-07, + "loss": 0.85756731, + "num_input_tokens_seen": 313736795, + "step": 14550, + "time_per_iteration": 2.795212507247925 + }, + { + "auxiliary_loss_clip": 0.01055079, + "auxiliary_loss_mlp": 0.01037947, + "balance_loss_clip": 1.02638102, + "balance_loss_mlp": 1.02743316, + "epoch": 0.8748534495716218, + "flos": 13809161917440.0, + "grad_norm": 1.642003259583093, + "language_loss": 0.71592742, + "learning_rate": 1.6198035028704183e-07, + "loss": 0.73685771, + "num_input_tokens_seen": 313754820, + "step": 14551, + "time_per_iteration": 3.0388009548187256 + }, + { + "auxiliary_loss_clip": 0.01052001, + "auxiliary_loss_mlp": 0.00747556, + "balance_loss_clip": 1.02532804, + "balance_loss_mlp": 1.00037813, + "epoch": 0.8749135728242898, + "flos": 29862092759040.0, + "grad_norm": 1.922837020461569, + "language_loss": 0.6412009, + "learning_rate": 1.6182684585588934e-07, + "loss": 0.65919638, + "num_input_tokens_seen": 313775830, + "step": 14552, + "time_per_iteration": 2.839277982711792 + }, + { + "auxiliary_loss_clip": 0.01031321, + "auxiliary_loss_mlp": 0.01025556, + "balance_loss_clip": 1.02305603, + "balance_loss_mlp": 1.01378405, + "epoch": 0.8749736960769577, + "flos": 24133479166080.0, + "grad_norm": 1.7354917508942214, + "language_loss": 0.79746956, + "learning_rate": 1.616734111284479e-07, + "loss": 0.81803834, + "num_input_tokens_seen": 313795745, + "step": 14553, + "time_per_iteration": 2.841949224472046 + }, + { + "auxiliary_loss_clip": 0.01043818, + "auxiliary_loss_mlp": 0.01032149, + "balance_loss_clip": 1.02213466, + "balance_loss_mlp": 1.02138448, + "epoch": 0.8750338193296258, + "flos": 17202540602880.0, + "grad_norm": 1.8818983571979175, + "language_loss": 0.70246112, + "learning_rate": 1.6152004611053416e-07, + "loss": 0.72322083, + "num_input_tokens_seen": 313813895, + "step": 14554, + "time_per_iteration": 2.5613672733306885 + }, + { + "auxiliary_loss_clip": 0.01045745, + "auxiliary_loss_mlp": 0.00747638, + "balance_loss_clip": 1.02702117, + "balance_loss_mlp": 1.00039887, + "epoch": 0.8750939425822937, + "flos": 23733218937600.0, + "grad_norm": 1.6750520604533756, + "language_loss": 0.83844888, + "learning_rate": 1.6136675080796457e-07, + "loss": 0.85638273, + "num_input_tokens_seen": 313834225, + "step": 14555, + "time_per_iteration": 2.8148767948150635 + }, + { + "auxiliary_loss_clip": 0.01051348, + "auxiliary_loss_mlp": 0.01031658, + "balance_loss_clip": 1.02421641, + "balance_loss_mlp": 1.02061915, + "epoch": 0.8751540658349617, + "flos": 26541684552960.0, + "grad_norm": 1.73357834750636, + "language_loss": 0.70754385, + "learning_rate": 1.6121352522655252e-07, + "loss": 0.72837389, + "num_input_tokens_seen": 313854430, + "step": 14556, + "time_per_iteration": 2.8450474739074707 + }, + { + "auxiliary_loss_clip": 0.0104449, + "auxiliary_loss_mlp": 0.01031325, + "balance_loss_clip": 1.02490497, + "balance_loss_mlp": 1.01908803, + "epoch": 0.8752141890876296, + "flos": 19386806647680.0, + "grad_norm": 1.8423080489128527, + "language_loss": 0.76624352, + "learning_rate": 1.6106036937210732e-07, + "loss": 0.78700173, + "num_input_tokens_seen": 313871600, + "step": 14557, + "time_per_iteration": 2.6533796787261963 + }, + { + "auxiliary_loss_clip": 0.01025671, + "auxiliary_loss_mlp": 0.01033384, + "balance_loss_clip": 1.02458167, + "balance_loss_mlp": 1.022089, + "epoch": 0.8752743123402976, + "flos": 25374408278400.0, + "grad_norm": 1.9378279241026228, + "language_loss": 0.82803595, + "learning_rate": 1.6090728325043767e-07, + "loss": 0.84862649, + "num_input_tokens_seen": 313891570, + "step": 14558, + "time_per_iteration": 2.6672935485839844 + }, + { + "auxiliary_loss_clip": 0.01005716, + "auxiliary_loss_mlp": 0.01001591, + "balance_loss_clip": 1.00087702, + "balance_loss_mlp": 1.00061917, + "epoch": 0.8753344355929655, + "flos": 59952398578560.0, + "grad_norm": 0.7946817472948202, + "language_loss": 0.56111324, + "learning_rate": 1.6075426686734784e-07, + "loss": 0.58118629, + "num_input_tokens_seen": 313951290, + "step": 14559, + "time_per_iteration": 3.151642322540283 + }, + { + "auxiliary_loss_clip": 0.01052223, + "auxiliary_loss_mlp": 0.0103283, + "balance_loss_clip": 1.02529371, + "balance_loss_mlp": 1.02281678, + "epoch": 0.8753945588456336, + "flos": 17894646835200.0, + "grad_norm": 2.996153326713021, + "language_loss": 0.65908593, + "learning_rate": 1.606013202286407e-07, + "loss": 0.67993647, + "num_input_tokens_seen": 313968645, + "step": 14560, + "time_per_iteration": 2.6500802040100098 + }, + { + "auxiliary_loss_clip": 0.01060743, + "auxiliary_loss_mlp": 0.01024411, + "balance_loss_clip": 1.02431381, + "balance_loss_mlp": 1.01449275, + "epoch": 0.8754546820983016, + "flos": 30914885410560.0, + "grad_norm": 1.8091640489797667, + "language_loss": 0.78820324, + "learning_rate": 1.6044844334011541e-07, + "loss": 0.80905479, + "num_input_tokens_seen": 313987580, + "step": 14561, + "time_per_iteration": 4.362871885299683 + }, + { + "auxiliary_loss_clip": 0.01064391, + "auxiliary_loss_mlp": 0.01028112, + "balance_loss_clip": 1.0253613, + "balance_loss_mlp": 1.01679289, + "epoch": 0.8755148053509695, + "flos": 20631075724800.0, + "grad_norm": 2.0615511211016533, + "language_loss": 0.77549607, + "learning_rate": 1.6029563620756982e-07, + "loss": 0.79642111, + "num_input_tokens_seen": 314004460, + "step": 14562, + "time_per_iteration": 2.6571621894836426 + }, + { + "auxiliary_loss_clip": 0.01057061, + "auxiliary_loss_mlp": 0.01025009, + "balance_loss_clip": 1.02285314, + "balance_loss_mlp": 1.01550198, + "epoch": 0.8755749286036375, + "flos": 34969739005440.0, + "grad_norm": 2.052826893419966, + "language_loss": 0.7183162, + "learning_rate": 1.601428988367981e-07, + "loss": 0.73913693, + "num_input_tokens_seen": 314026855, + "step": 14563, + "time_per_iteration": 2.752154588699341 + }, + { + "auxiliary_loss_clip": 0.01065733, + "auxiliary_loss_mlp": 0.01030805, + "balance_loss_clip": 1.02753198, + "balance_loss_mlp": 1.01991546, + "epoch": 0.8756350518563054, + "flos": 18186456925440.0, + "grad_norm": 2.3057456459645285, + "language_loss": 0.6504243, + "learning_rate": 1.5999023123359235e-07, + "loss": 0.67138964, + "num_input_tokens_seen": 314042830, + "step": 14564, + "time_per_iteration": 2.62141489982605 + }, + { + "auxiliary_loss_clip": 0.0105079, + "auxiliary_loss_mlp": 0.01030815, + "balance_loss_clip": 1.02393734, + "balance_loss_mlp": 1.0207957, + "epoch": 0.8756951751089734, + "flos": 20084012611200.0, + "grad_norm": 1.5486699493412743, + "language_loss": 0.70520604, + "learning_rate": 1.598376334037408e-07, + "loss": 0.72602212, + "num_input_tokens_seen": 314062225, + "step": 14565, + "time_per_iteration": 2.665827512741089 + }, + { + "auxiliary_loss_clip": 0.01045811, + "auxiliary_loss_mlp": 0.01029114, + "balance_loss_clip": 1.02486956, + "balance_loss_mlp": 1.01770532, + "epoch": 0.8757552983616413, + "flos": 27525241739520.0, + "grad_norm": 1.507491963144005, + "language_loss": 0.77578008, + "learning_rate": 1.5968510535303102e-07, + "loss": 0.79652929, + "num_input_tokens_seen": 314082325, + "step": 14566, + "time_per_iteration": 2.6511433124542236 + }, + { + "auxiliary_loss_clip": 0.01045379, + "auxiliary_loss_mlp": 0.0103337, + "balance_loss_clip": 1.02803278, + "balance_loss_mlp": 1.02293921, + "epoch": 0.8758154216143094, + "flos": 18073014796800.0, + "grad_norm": 1.6935168534894876, + "language_loss": 0.71006656, + "learning_rate": 1.5953264708724624e-07, + "loss": 0.73085403, + "num_input_tokens_seen": 314100310, + "step": 14567, + "time_per_iteration": 2.6943297386169434 + }, + { + "auxiliary_loss_clip": 0.01043372, + "auxiliary_loss_mlp": 0.0074775, + "balance_loss_clip": 1.02517366, + "balance_loss_mlp": 1.00043547, + "epoch": 0.8758755448669773, + "flos": 25045681985280.0, + "grad_norm": 1.710115490181833, + "language_loss": 0.74016011, + "learning_rate": 1.5938025861216776e-07, + "loss": 0.75807124, + "num_input_tokens_seen": 314121330, + "step": 14568, + "time_per_iteration": 2.691189765930176 + }, + { + "auxiliary_loss_clip": 0.01024769, + "auxiliary_loss_mlp": 0.01027567, + "balance_loss_clip": 1.02314115, + "balance_loss_mlp": 1.01748157, + "epoch": 0.8759356681196453, + "flos": 22856818999680.0, + "grad_norm": 2.058726001920138, + "language_loss": 0.86555851, + "learning_rate": 1.5922793993357475e-07, + "loss": 0.88608181, + "num_input_tokens_seen": 314139875, + "step": 14569, + "time_per_iteration": 2.7365689277648926 + }, + { + "auxiliary_loss_clip": 0.01033707, + "auxiliary_loss_mlp": 0.01027263, + "balance_loss_clip": 1.02569127, + "balance_loss_mlp": 1.01757741, + "epoch": 0.8759957913723132, + "flos": 21032521102080.0, + "grad_norm": 1.9263581063314807, + "language_loss": 0.74017417, + "learning_rate": 1.5907569105724284e-07, + "loss": 0.76078391, + "num_input_tokens_seen": 314157850, + "step": 14570, + "time_per_iteration": 2.687796115875244 + }, + { + "auxiliary_loss_clip": 0.010542, + "auxiliary_loss_mlp": 0.00747786, + "balance_loss_clip": 1.024966, + "balance_loss_mlp": 1.00049365, + "epoch": 0.8760559146249812, + "flos": 20010467514240.0, + "grad_norm": 1.5859634468334256, + "language_loss": 0.6800254, + "learning_rate": 1.5892351198894472e-07, + "loss": 0.69804525, + "num_input_tokens_seen": 314176720, + "step": 14571, + "time_per_iteration": 2.6872966289520264 + }, + { + "auxiliary_loss_clip": 0.01031885, + "auxiliary_loss_mlp": 0.01028179, + "balance_loss_clip": 1.02519906, + "balance_loss_mlp": 1.0180583, + "epoch": 0.8761160378776491, + "flos": 19974161842560.0, + "grad_norm": 2.4432815872253997, + "language_loss": 0.62684894, + "learning_rate": 1.5877140273445156e-07, + "loss": 0.64744955, + "num_input_tokens_seen": 314196645, + "step": 14572, + "time_per_iteration": 2.6934831142425537 + }, + { + "auxiliary_loss_clip": 0.01050409, + "auxiliary_loss_mlp": 0.01024522, + "balance_loss_clip": 1.02475512, + "balance_loss_mlp": 1.01513982, + "epoch": 0.8761761611303172, + "flos": 28804415857920.0, + "grad_norm": 1.8128775003182032, + "language_loss": 0.73759973, + "learning_rate": 1.5861936329953162e-07, + "loss": 0.758349, + "num_input_tokens_seen": 314217430, + "step": 14573, + "time_per_iteration": 2.678208827972412 + }, + { + "auxiliary_loss_clip": 0.01020986, + "auxiliary_loss_mlp": 0.00747437, + "balance_loss_clip": 1.02519739, + "balance_loss_mlp": 1.00035477, + "epoch": 0.8762362843829851, + "flos": 18332505624960.0, + "grad_norm": 1.9296498089217273, + "language_loss": 0.73045683, + "learning_rate": 1.5846739368994966e-07, + "loss": 0.74814105, + "num_input_tokens_seen": 314235310, + "step": 14574, + "time_per_iteration": 2.781801223754883 + }, + { + "auxiliary_loss_clip": 0.01050241, + "auxiliary_loss_mlp": 0.01029322, + "balance_loss_clip": 1.02384531, + "balance_loss_mlp": 1.01907003, + "epoch": 0.8762964076356531, + "flos": 15779149378560.0, + "grad_norm": 1.659810478420213, + "language_loss": 0.75742602, + "learning_rate": 1.5831549391146903e-07, + "loss": 0.77822167, + "num_input_tokens_seen": 314252355, + "step": 14575, + "time_per_iteration": 5.820122241973877 + }, + { + "auxiliary_loss_clip": 0.01035017, + "auxiliary_loss_mlp": 0.0103069, + "balance_loss_clip": 1.02458072, + "balance_loss_mlp": 1.02036071, + "epoch": 0.8763565308883211, + "flos": 33176754789120.0, + "grad_norm": 1.768561254610911, + "language_loss": 0.66961682, + "learning_rate": 1.5816366396984916e-07, + "loss": 0.69027388, + "num_input_tokens_seen": 314272755, + "step": 14576, + "time_per_iteration": 2.758100748062134 + }, + { + "auxiliary_loss_clip": 0.01031911, + "auxiliary_loss_mlp": 0.01026733, + "balance_loss_clip": 1.02178872, + "balance_loss_mlp": 1.01677954, + "epoch": 0.876416654140989, + "flos": 15888102307200.0, + "grad_norm": 1.782011559916398, + "language_loss": 0.66732001, + "learning_rate": 1.5801190387084806e-07, + "loss": 0.68790644, + "num_input_tokens_seen": 314291365, + "step": 14577, + "time_per_iteration": 2.626394748687744 + }, + { + "auxiliary_loss_clip": 0.01052235, + "auxiliary_loss_mlp": 0.01028372, + "balance_loss_clip": 1.02574611, + "balance_loss_mlp": 1.01783991, + "epoch": 0.876476777393657, + "flos": 25885237547520.0, + "grad_norm": 2.511050690493999, + "language_loss": 0.71388453, + "learning_rate": 1.5786021362021962e-07, + "loss": 0.73469055, + "num_input_tokens_seen": 314310075, + "step": 14578, + "time_per_iteration": 2.715972423553467 + }, + { + "auxiliary_loss_clip": 0.01061822, + "auxiliary_loss_mlp": 0.01033297, + "balance_loss_clip": 1.02427673, + "balance_loss_mlp": 1.02241921, + "epoch": 0.876536900646325, + "flos": 13589675861760.0, + "grad_norm": 1.844288264376638, + "language_loss": 0.71440446, + "learning_rate": 1.5770859322371676e-07, + "loss": 0.73535568, + "num_input_tokens_seen": 314325695, + "step": 14579, + "time_per_iteration": 2.5488436222076416 + }, + { + "auxiliary_loss_clip": 0.01028604, + "auxiliary_loss_mlp": 0.01035561, + "balance_loss_clip": 1.02156723, + "balance_loss_mlp": 1.02430212, + "epoch": 0.876597023898993, + "flos": 12203344494720.0, + "grad_norm": 1.756512581498279, + "language_loss": 0.70153499, + "learning_rate": 1.5755704268708912e-07, + "loss": 0.72217655, + "num_input_tokens_seen": 314343605, + "step": 14580, + "time_per_iteration": 2.622199058532715 + }, + { + "auxiliary_loss_clip": 0.010617, + "auxiliary_loss_mlp": 0.0074766, + "balance_loss_clip": 1.02627313, + "balance_loss_mlp": 1.00046635, + "epoch": 0.8766571471516609, + "flos": 25336773803520.0, + "grad_norm": 1.5508473886707461, + "language_loss": 0.65347862, + "learning_rate": 1.5740556201608256e-07, + "loss": 0.67157221, + "num_input_tokens_seen": 314364275, + "step": 14581, + "time_per_iteration": 2.6404991149902344 + }, + { + "auxiliary_loss_clip": 0.01039888, + "auxiliary_loss_mlp": 0.01028285, + "balance_loss_clip": 1.02486062, + "balance_loss_mlp": 1.01870024, + "epoch": 0.8767172704043289, + "flos": 30113287545600.0, + "grad_norm": 1.4373755802705839, + "language_loss": 0.73607802, + "learning_rate": 1.572541512164416e-07, + "loss": 0.75675976, + "num_input_tokens_seen": 314385140, + "step": 14582, + "time_per_iteration": 2.7053122520446777 + }, + { + "auxiliary_loss_clip": 0.0106057, + "auxiliary_loss_mlp": 0.00747643, + "balance_loss_clip": 1.02338088, + "balance_loss_mlp": 1.00034451, + "epoch": 0.8767773936569968, + "flos": 19281157770240.0, + "grad_norm": 4.573410246973873, + "language_loss": 0.66879231, + "learning_rate": 1.5710281029390826e-07, + "loss": 0.68687445, + "num_input_tokens_seen": 314403715, + "step": 14583, + "time_per_iteration": 2.5390305519104004 + }, + { + "auxiliary_loss_clip": 0.01053571, + "auxiliary_loss_mlp": 0.00747651, + "balance_loss_clip": 1.02505946, + "balance_loss_mlp": 1.00040042, + "epoch": 0.8768375169096648, + "flos": 21247230648960.0, + "grad_norm": 3.2253085060816074, + "language_loss": 0.7908116, + "learning_rate": 1.5695153925422067e-07, + "loss": 0.80882382, + "num_input_tokens_seen": 314421880, + "step": 14584, + "time_per_iteration": 2.6706433296203613 + }, + { + "auxiliary_loss_clip": 0.01032243, + "auxiliary_loss_mlp": 0.01029146, + "balance_loss_clip": 1.02401733, + "balance_loss_mlp": 1.01895308, + "epoch": 0.8768976401623327, + "flos": 23295539715840.0, + "grad_norm": 1.5001813436595246, + "language_loss": 0.72296208, + "learning_rate": 1.5680033810311555e-07, + "loss": 0.74357593, + "num_input_tokens_seen": 314441585, + "step": 14585, + "time_per_iteration": 2.717041254043579 + }, + { + "auxiliary_loss_clip": 0.01039436, + "auxiliary_loss_mlp": 0.01026331, + "balance_loss_clip": 1.02309489, + "balance_loss_mlp": 1.01572776, + "epoch": 0.8769577634150008, + "flos": 21361247395200.0, + "grad_norm": 1.9018219946974895, + "language_loss": 0.74258006, + "learning_rate": 1.5664920684632654e-07, + "loss": 0.76323771, + "num_input_tokens_seen": 314459020, + "step": 14586, + "time_per_iteration": 2.645033359527588 + }, + { + "auxiliary_loss_clip": 0.01060093, + "auxiliary_loss_mlp": 0.01028691, + "balance_loss_clip": 1.02343726, + "balance_loss_mlp": 1.01808131, + "epoch": 0.8770178866676687, + "flos": 23514056104320.0, + "grad_norm": 1.7193525257353766, + "language_loss": 0.78657979, + "learning_rate": 1.564981454895844e-07, + "loss": 0.80746758, + "num_input_tokens_seen": 314478935, + "step": 14587, + "time_per_iteration": 2.578047752380371 + }, + { + "auxiliary_loss_clip": 0.01052725, + "auxiliary_loss_mlp": 0.01026018, + "balance_loss_clip": 1.02594185, + "balance_loss_mlp": 1.01416886, + "epoch": 0.8770780099203367, + "flos": 19719052473600.0, + "grad_norm": 1.6469018650189806, + "language_loss": 0.73739243, + "learning_rate": 1.5634715403861697e-07, + "loss": 0.7581799, + "num_input_tokens_seen": 314497635, + "step": 14588, + "time_per_iteration": 2.6728053092956543 + }, + { + "auxiliary_loss_clip": 0.0101187, + "auxiliary_loss_mlp": 0.00747719, + "balance_loss_clip": 1.02459848, + "balance_loss_mlp": 1.00039864, + "epoch": 0.8771381331730047, + "flos": 21395901041280.0, + "grad_norm": 1.6236957123608013, + "language_loss": 0.66545677, + "learning_rate": 1.5619623249915016e-07, + "loss": 0.68305266, + "num_input_tokens_seen": 314515445, + "step": 14589, + "time_per_iteration": 2.8745059967041016 + }, + { + "auxiliary_loss_clip": 0.01052789, + "auxiliary_loss_mlp": 0.01032138, + "balance_loss_clip": 1.02551508, + "balance_loss_mlp": 1.02179027, + "epoch": 0.8771982564256726, + "flos": 20261770041600.0, + "grad_norm": 2.1656580881609453, + "language_loss": 0.70642602, + "learning_rate": 1.5604538087690732e-07, + "loss": 0.72727525, + "num_input_tokens_seen": 314533040, + "step": 14590, + "time_per_iteration": 2.7279579639434814 + }, + { + "auxiliary_loss_clip": 0.01037498, + "auxiliary_loss_mlp": 0.01041223, + "balance_loss_clip": 1.02379084, + "balance_loss_mlp": 1.02929068, + "epoch": 0.8772583796783406, + "flos": 12489372495360.0, + "grad_norm": 2.0408197028427963, + "language_loss": 0.74556744, + "learning_rate": 1.558945991776086e-07, + "loss": 0.76635468, + "num_input_tokens_seen": 314548280, + "step": 14591, + "time_per_iteration": 2.692384719848633 + }, + { + "auxiliary_loss_clip": 0.01058541, + "auxiliary_loss_mlp": 0.01022629, + "balance_loss_clip": 1.02403879, + "balance_loss_mlp": 1.01305699, + "epoch": 0.8773185029310085, + "flos": 15921103927680.0, + "grad_norm": 1.739789939270134, + "language_loss": 0.80200875, + "learning_rate": 1.5574388740697096e-07, + "loss": 0.82282054, + "num_input_tokens_seen": 314565345, + "step": 14592, + "time_per_iteration": 2.5324623584747314 + }, + { + "auxiliary_loss_clip": 0.01059026, + "auxiliary_loss_mlp": 0.0102631, + "balance_loss_clip": 1.02442336, + "balance_loss_mlp": 1.01696444, + "epoch": 0.8773786261836766, + "flos": 21504530747520.0, + "grad_norm": 1.547542043836973, + "language_loss": 0.82537544, + "learning_rate": 1.5559324557071052e-07, + "loss": 0.84622878, + "num_input_tokens_seen": 314584190, + "step": 14593, + "time_per_iteration": 2.5546212196350098 + }, + { + "auxiliary_loss_clip": 0.01041839, + "auxiliary_loss_mlp": 0.01023824, + "balance_loss_clip": 1.02233422, + "balance_loss_mlp": 1.01387012, + "epoch": 0.8774387494363445, + "flos": 26761493831040.0, + "grad_norm": 2.489099170727329, + "language_loss": 0.75941247, + "learning_rate": 1.5544267367453845e-07, + "loss": 0.78006911, + "num_input_tokens_seen": 314605625, + "step": 14594, + "time_per_iteration": 2.6503593921661377 + }, + { + "auxiliary_loss_clip": 0.01008344, + "auxiliary_loss_mlp": 0.01031915, + "balance_loss_clip": 1.01931477, + "balance_loss_mlp": 1.01937389, + "epoch": 0.8774988726890125, + "flos": 18478841633280.0, + "grad_norm": 2.4057933476331574, + "language_loss": 0.77819765, + "learning_rate": 1.552921717241651e-07, + "loss": 0.7986002, + "num_input_tokens_seen": 314622630, + "step": 14595, + "time_per_iteration": 4.585644721984863 + }, + { + "auxiliary_loss_clip": 0.01032312, + "auxiliary_loss_mlp": 0.01032046, + "balance_loss_clip": 1.02461958, + "balance_loss_mlp": 1.02112067, + "epoch": 0.8775589959416804, + "flos": 24426366664320.0, + "grad_norm": 1.4956523042147378, + "language_loss": 0.70731449, + "learning_rate": 1.5514173972529743e-07, + "loss": 0.72795808, + "num_input_tokens_seen": 314642460, + "step": 14596, + "time_per_iteration": 2.8119313716888428 + }, + { + "auxiliary_loss_clip": 0.01032211, + "auxiliary_loss_mlp": 0.01023341, + "balance_loss_clip": 1.02604175, + "balance_loss_mlp": 1.01351833, + "epoch": 0.8776191191943484, + "flos": 23440151871360.0, + "grad_norm": 1.6722149831615902, + "language_loss": 0.85623646, + "learning_rate": 1.5499137768364067e-07, + "loss": 0.87679207, + "num_input_tokens_seen": 314659875, + "step": 14597, + "time_per_iteration": 2.739168167114258 + }, + { + "auxiliary_loss_clip": 0.01052921, + "auxiliary_loss_mlp": 0.01027966, + "balance_loss_clip": 1.02604651, + "balance_loss_mlp": 1.01770854, + "epoch": 0.8776792424470163, + "flos": 26830872950400.0, + "grad_norm": 1.671199927956012, + "language_loss": 0.72849345, + "learning_rate": 1.5484108560489494e-07, + "loss": 0.74930227, + "num_input_tokens_seen": 314680260, + "step": 14598, + "time_per_iteration": 2.5958826541900635 + }, + { + "auxiliary_loss_clip": 0.01045306, + "auxiliary_loss_mlp": 0.00747553, + "balance_loss_clip": 1.02396119, + "balance_loss_mlp": 1.00033343, + "epoch": 0.8777393656996844, + "flos": 15626169354240.0, + "grad_norm": 1.9965136665862007, + "language_loss": 0.77470154, + "learning_rate": 1.5469086349476036e-07, + "loss": 0.79263014, + "num_input_tokens_seen": 314696260, + "step": 14599, + "time_per_iteration": 2.624993085861206 + }, + { + "auxiliary_loss_clip": 0.01030211, + "auxiliary_loss_mlp": 0.01028646, + "balance_loss_clip": 1.02316236, + "balance_loss_mlp": 1.01841187, + "epoch": 0.8777994889523523, + "flos": 18879999701760.0, + "grad_norm": 2.030739633812526, + "language_loss": 0.67686486, + "learning_rate": 1.545407113589332e-07, + "loss": 0.69745338, + "num_input_tokens_seen": 314714215, + "step": 14600, + "time_per_iteration": 2.7005062103271484 + }, + { + "auxiliary_loss_clip": 0.01052024, + "auxiliary_loss_mlp": 0.0103512, + "balance_loss_clip": 1.02404666, + "balance_loss_mlp": 1.02454627, + "epoch": 0.8778596122050203, + "flos": 48826516400640.0, + "grad_norm": 17.211780757216665, + "language_loss": 0.69281554, + "learning_rate": 1.543906292031072e-07, + "loss": 0.71368694, + "num_input_tokens_seen": 314735700, + "step": 14601, + "time_per_iteration": 2.8447422981262207 + }, + { + "auxiliary_loss_clip": 0.01057006, + "auxiliary_loss_mlp": 0.01029544, + "balance_loss_clip": 1.02628541, + "balance_loss_mlp": 1.01897013, + "epoch": 0.8779197354576883, + "flos": 25660184883840.0, + "grad_norm": 1.9632325175370042, + "language_loss": 0.73218739, + "learning_rate": 1.542406170329733e-07, + "loss": 0.75305283, + "num_input_tokens_seen": 314753335, + "step": 14602, + "time_per_iteration": 2.6861774921417236 + }, + { + "auxiliary_loss_clip": 0.01060068, + "auxiliary_loss_mlp": 0.01031603, + "balance_loss_clip": 1.02401102, + "balance_loss_mlp": 1.02194691, + "epoch": 0.8779798587103562, + "flos": 18843227153280.0, + "grad_norm": 1.8564086752546032, + "language_loss": 0.70910692, + "learning_rate": 1.5409067485422056e-07, + "loss": 0.73002368, + "num_input_tokens_seen": 314770800, + "step": 14603, + "time_per_iteration": 2.5781335830688477 + }, + { + "auxiliary_loss_clip": 0.00988287, + "auxiliary_loss_mlp": 0.01004222, + "balance_loss_clip": 1.00298858, + "balance_loss_mlp": 1.00335217, + "epoch": 0.8780399819630242, + "flos": 68613119377920.0, + "grad_norm": 0.7324217248639915, + "language_loss": 0.54136163, + "learning_rate": 1.539408026725344e-07, + "loss": 0.56128675, + "num_input_tokens_seen": 314837275, + "step": 14604, + "time_per_iteration": 3.2482786178588867 + }, + { + "auxiliary_loss_clip": 0.00978442, + "auxiliary_loss_mlp": 0.0101567, + "balance_loss_clip": 1.00262785, + "balance_loss_mlp": 1.01466298, + "epoch": 0.8781001052156922, + "flos": 65734807766400.0, + "grad_norm": 0.7200680811135265, + "language_loss": 0.59234691, + "learning_rate": 1.537910004935976e-07, + "loss": 0.612288, + "num_input_tokens_seen": 314902220, + "step": 14605, + "time_per_iteration": 3.2699882984161377 + }, + { + "auxiliary_loss_clip": 0.01025626, + "auxiliary_loss_mlp": 0.01031286, + "balance_loss_clip": 1.02655268, + "balance_loss_mlp": 1.02019334, + "epoch": 0.8781602284683602, + "flos": 22049654526720.0, + "grad_norm": 1.5867827007304605, + "language_loss": 0.84997845, + "learning_rate": 1.536412683230912e-07, + "loss": 0.87054753, + "num_input_tokens_seen": 314921645, + "step": 14606, + "time_per_iteration": 2.8375189304351807 + }, + { + "auxiliary_loss_clip": 0.01064521, + "auxiliary_loss_mlp": 0.01027514, + "balance_loss_clip": 1.02656424, + "balance_loss_mlp": 1.01599848, + "epoch": 0.8782203517210281, + "flos": 17562939713280.0, + "grad_norm": 1.7355926026234234, + "language_loss": 0.70299399, + "learning_rate": 1.534916061666931e-07, + "loss": 0.72391433, + "num_input_tokens_seen": 314939390, + "step": 14607, + "time_per_iteration": 2.835674285888672 + }, + { + "auxiliary_loss_clip": 0.01039685, + "auxiliary_loss_mlp": 0.01031686, + "balance_loss_clip": 1.02331543, + "balance_loss_mlp": 1.0223043, + "epoch": 0.8782804749736961, + "flos": 25520421064320.0, + "grad_norm": 1.9101270137707378, + "language_loss": 0.71652389, + "learning_rate": 1.533420140300785e-07, + "loss": 0.73723757, + "num_input_tokens_seen": 314959205, + "step": 14608, + "time_per_iteration": 4.377465009689331 + }, + { + "auxiliary_loss_clip": 0.01049219, + "auxiliary_loss_mlp": 0.01032351, + "balance_loss_clip": 1.02308917, + "balance_loss_mlp": 1.02162838, + "epoch": 0.878340598226364, + "flos": 21798747048960.0, + "grad_norm": 1.9088250719202087, + "language_loss": 0.87280154, + "learning_rate": 1.5319249191891936e-07, + "loss": 0.89361727, + "num_input_tokens_seen": 314977485, + "step": 14609, + "time_per_iteration": 2.783325433731079 + }, + { + "auxiliary_loss_clip": 0.01017832, + "auxiliary_loss_mlp": 0.01026514, + "balance_loss_clip": 1.02726912, + "balance_loss_mlp": 1.01623845, + "epoch": 0.878400721479032, + "flos": 21102403011840.0, + "grad_norm": 1.6110121630029857, + "language_loss": 0.70232117, + "learning_rate": 1.5304303983888643e-07, + "loss": 0.72276467, + "num_input_tokens_seen": 314997830, + "step": 14610, + "time_per_iteration": 2.8992302417755127 + }, + { + "auxiliary_loss_clip": 0.01051775, + "auxiliary_loss_mlp": 0.00747657, + "balance_loss_clip": 1.02665997, + "balance_loss_mlp": 1.00036609, + "epoch": 0.8784608447316999, + "flos": 20923532259840.0, + "grad_norm": 2.1971266088356485, + "language_loss": 0.80525076, + "learning_rate": 1.5289365779564612e-07, + "loss": 0.82324511, + "num_input_tokens_seen": 315016480, + "step": 14611, + "time_per_iteration": 2.766601085662842 + }, + { + "auxiliary_loss_clip": 0.01062811, + "auxiliary_loss_mlp": 0.01029429, + "balance_loss_clip": 1.02486801, + "balance_loss_mlp": 1.0190165, + "epoch": 0.878520967984368, + "flos": 23330660238720.0, + "grad_norm": 1.5385478558728798, + "language_loss": 0.76684785, + "learning_rate": 1.5274434579486338e-07, + "loss": 0.78777021, + "num_input_tokens_seen": 315036135, + "step": 14612, + "time_per_iteration": 2.6527907848358154 + }, + { + "auxiliary_loss_clip": 0.01011548, + "auxiliary_loss_mlp": 0.010312, + "balance_loss_clip": 1.02391315, + "balance_loss_mlp": 1.02148461, + "epoch": 0.8785810912370359, + "flos": 25518984520320.0, + "grad_norm": 1.3139988510691532, + "language_loss": 0.72416478, + "learning_rate": 1.525951038422002e-07, + "loss": 0.74459231, + "num_input_tokens_seen": 315057995, + "step": 14613, + "time_per_iteration": 2.7629497051239014 + }, + { + "auxiliary_loss_clip": 0.00985949, + "auxiliary_loss_mlp": 0.01000541, + "balance_loss_clip": 1.009202, + "balance_loss_mlp": 0.99973631, + "epoch": 0.8786412144897039, + "flos": 61841047691520.0, + "grad_norm": 1.2106260517860248, + "language_loss": 0.6458391, + "learning_rate": 1.5244593194331667e-07, + "loss": 0.66570401, + "num_input_tokens_seen": 315104010, + "step": 14614, + "time_per_iteration": 3.082832098007202 + }, + { + "auxiliary_loss_clip": 0.01006048, + "auxiliary_loss_mlp": 0.01001667, + "balance_loss_clip": 1.00103915, + "balance_loss_mlp": 1.00084424, + "epoch": 0.8787013377423719, + "flos": 70989364638720.0, + "grad_norm": 0.6588424515278646, + "language_loss": 0.5863384, + "learning_rate": 1.5229683010386762e-07, + "loss": 0.60641563, + "num_input_tokens_seen": 315174550, + "step": 14615, + "time_per_iteration": 3.271393060684204 + }, + { + "auxiliary_loss_clip": 0.0102154, + "auxiliary_loss_mlp": 0.01025103, + "balance_loss_clip": 1.02353668, + "balance_loss_mlp": 1.01497042, + "epoch": 0.8787614609950398, + "flos": 17347404153600.0, + "grad_norm": 1.828002708934499, + "language_loss": 0.72941625, + "learning_rate": 1.5214779832950807e-07, + "loss": 0.7498827, + "num_input_tokens_seen": 315191825, + "step": 14616, + "time_per_iteration": 2.8757894039154053 + }, + { + "auxiliary_loss_clip": 0.01006625, + "auxiliary_loss_mlp": 0.0099936, + "balance_loss_clip": 1.00157857, + "balance_loss_mlp": 0.99845421, + "epoch": 0.8788215842477078, + "flos": 72511401588480.0, + "grad_norm": 0.8437839215375823, + "language_loss": 0.58025879, + "learning_rate": 1.5199883662588953e-07, + "loss": 0.60031867, + "num_input_tokens_seen": 315255075, + "step": 14617, + "time_per_iteration": 3.3549201488494873 + }, + { + "auxiliary_loss_clip": 0.01034547, + "auxiliary_loss_mlp": 0.01031038, + "balance_loss_clip": 1.02289653, + "balance_loss_mlp": 1.02001119, + "epoch": 0.8788817075003758, + "flos": 24827452905600.0, + "grad_norm": 1.6504800140004945, + "language_loss": 0.83621454, + "learning_rate": 1.5184994499865987e-07, + "loss": 0.85687035, + "num_input_tokens_seen": 315273995, + "step": 14618, + "time_per_iteration": 2.889724016189575 + }, + { + "auxiliary_loss_clip": 0.01038561, + "auxiliary_loss_mlp": 0.01028174, + "balance_loss_clip": 1.02454233, + "balance_loss_mlp": 1.01843476, + "epoch": 0.8789418307530438, + "flos": 22638769488000.0, + "grad_norm": 1.5694856288090913, + "language_loss": 0.69242805, + "learning_rate": 1.5170112345346598e-07, + "loss": 0.71309543, + "num_input_tokens_seen": 315294485, + "step": 14619, + "time_per_iteration": 2.7661612033843994 + }, + { + "auxiliary_loss_clip": 0.0102263, + "auxiliary_loss_mlp": 0.01034364, + "balance_loss_clip": 1.02402067, + "balance_loss_mlp": 1.02412963, + "epoch": 0.8790019540057117, + "flos": 19785738072960.0, + "grad_norm": 1.7635032499830001, + "language_loss": 0.77973187, + "learning_rate": 1.5155237199595016e-07, + "loss": 0.80030179, + "num_input_tokens_seen": 315310420, + "step": 14620, + "time_per_iteration": 2.8748672008514404 + }, + { + "auxiliary_loss_clip": 0.01047082, + "auxiliary_loss_mlp": 0.01028904, + "balance_loss_clip": 1.02807319, + "balance_loss_mlp": 1.01747155, + "epoch": 0.8790620772583797, + "flos": 20229774001920.0, + "grad_norm": 2.1952048487097024, + "language_loss": 0.78901196, + "learning_rate": 1.514036906317542e-07, + "loss": 0.80977178, + "num_input_tokens_seen": 315330110, + "step": 14621, + "time_per_iteration": 2.745915651321411 + }, + { + "auxiliary_loss_clip": 0.01043662, + "auxiliary_loss_mlp": 0.0103115, + "balance_loss_clip": 1.02450633, + "balance_loss_mlp": 1.02062941, + "epoch": 0.8791222005110476, + "flos": 24130785646080.0, + "grad_norm": 1.8779734851291494, + "language_loss": 0.66709471, + "learning_rate": 1.5125507936651506e-07, + "loss": 0.68784285, + "num_input_tokens_seen": 315350080, + "step": 14622, + "time_per_iteration": 4.330897331237793 + }, + { + "auxiliary_loss_clip": 0.01042406, + "auxiliary_loss_mlp": 0.01031666, + "balance_loss_clip": 1.025653, + "balance_loss_mlp": 1.02125883, + "epoch": 0.8791823237637156, + "flos": 21614201948160.0, + "grad_norm": 1.8772348103811396, + "language_loss": 0.72980505, + "learning_rate": 1.511065382058687e-07, + "loss": 0.7505458, + "num_input_tokens_seen": 315366360, + "step": 14623, + "time_per_iteration": 4.397304534912109 + }, + { + "auxiliary_loss_clip": 0.01013699, + "auxiliary_loss_mlp": 0.01027264, + "balance_loss_clip": 1.0216192, + "balance_loss_mlp": 1.01694012, + "epoch": 0.8792424470163835, + "flos": 24243401761920.0, + "grad_norm": 1.6010383068933114, + "language_loss": 0.78353512, + "learning_rate": 1.5095806715544801e-07, + "loss": 0.80394477, + "num_input_tokens_seen": 315385890, + "step": 14624, + "time_per_iteration": 2.7657008171081543 + }, + { + "auxiliary_loss_clip": 0.01050532, + "auxiliary_loss_mlp": 0.01034409, + "balance_loss_clip": 1.0228498, + "balance_loss_mlp": 1.02269053, + "epoch": 0.8793025702690516, + "flos": 24893204751360.0, + "grad_norm": 1.9032653321369395, + "language_loss": 0.79605103, + "learning_rate": 1.5080966622088265e-07, + "loss": 0.81690043, + "num_input_tokens_seen": 315403400, + "step": 14625, + "time_per_iteration": 2.6886215209960938 + }, + { + "auxiliary_loss_clip": 0.01041059, + "auxiliary_loss_mlp": 0.01036866, + "balance_loss_clip": 1.02550626, + "balance_loss_mlp": 1.02679276, + "epoch": 0.8793626935217195, + "flos": 25373115388800.0, + "grad_norm": 1.4958360424084634, + "language_loss": 0.74286968, + "learning_rate": 1.5066133540779967e-07, + "loss": 0.76364887, + "num_input_tokens_seen": 315423670, + "step": 14626, + "time_per_iteration": 2.7081143856048584 + }, + { + "auxiliary_loss_clip": 0.01052039, + "auxiliary_loss_mlp": 0.01030841, + "balance_loss_clip": 1.02416229, + "balance_loss_mlp": 1.02013564, + "epoch": 0.8794228167743875, + "flos": 34678000742400.0, + "grad_norm": 1.846906552246741, + "language_loss": 0.708812, + "learning_rate": 1.505130747218246e-07, + "loss": 0.72964084, + "num_input_tokens_seen": 315446265, + "step": 14627, + "time_per_iteration": 2.713066577911377 + }, + { + "auxiliary_loss_clip": 0.01038771, + "auxiliary_loss_mlp": 0.01024268, + "balance_loss_clip": 1.0298816, + "balance_loss_mlp": 1.01334834, + "epoch": 0.8794829400270555, + "flos": 19464014931840.0, + "grad_norm": 1.7723123868319623, + "language_loss": 0.72217697, + "learning_rate": 1.5036488416857873e-07, + "loss": 0.74280739, + "num_input_tokens_seen": 315464655, + "step": 14628, + "time_per_iteration": 2.686274290084839 + }, + { + "auxiliary_loss_clip": 0.01028537, + "auxiliary_loss_mlp": 0.01029833, + "balance_loss_clip": 1.02265763, + "balance_loss_mlp": 1.01870453, + "epoch": 0.8795430632797234, + "flos": 15231403906560.0, + "grad_norm": 2.990833415883632, + "language_loss": 0.69043726, + "learning_rate": 1.5021676375368175e-07, + "loss": 0.71102095, + "num_input_tokens_seen": 315481090, + "step": 14629, + "time_per_iteration": 2.6558761596679688 + }, + { + "auxiliary_loss_clip": 0.01038049, + "auxiliary_loss_mlp": 0.0103001, + "balance_loss_clip": 1.02254891, + "balance_loss_mlp": 1.02099788, + "epoch": 0.8796031865323914, + "flos": 27744727795200.0, + "grad_norm": 1.6391656247394164, + "language_loss": 0.68690372, + "learning_rate": 1.5006871348275053e-07, + "loss": 0.70758426, + "num_input_tokens_seen": 315502010, + "step": 14630, + "time_per_iteration": 2.6643762588500977 + }, + { + "auxiliary_loss_clip": 0.01038723, + "auxiliary_loss_mlp": 0.01030931, + "balance_loss_clip": 1.02315986, + "balance_loss_mlp": 1.01995754, + "epoch": 0.8796633097850594, + "flos": 31285412156160.0, + "grad_norm": 1.5523810709606485, + "language_loss": 0.73858398, + "learning_rate": 1.499207333613999e-07, + "loss": 0.75928056, + "num_input_tokens_seen": 315523040, + "step": 14631, + "time_per_iteration": 2.6926796436309814 + }, + { + "auxiliary_loss_clip": 0.01038783, + "auxiliary_loss_mlp": 0.00747463, + "balance_loss_clip": 1.02308404, + "balance_loss_mlp": 1.00036371, + "epoch": 0.8797234330377274, + "flos": 24243150366720.0, + "grad_norm": 1.9212928216259095, + "language_loss": 0.6951865, + "learning_rate": 1.4977282339523954e-07, + "loss": 0.71304893, + "num_input_tokens_seen": 315541865, + "step": 14632, + "time_per_iteration": 2.644519805908203 + }, + { + "auxiliary_loss_clip": 0.010427, + "auxiliary_loss_mlp": 0.01028183, + "balance_loss_clip": 1.02582395, + "balance_loss_mlp": 1.01893842, + "epoch": 0.8797835562903953, + "flos": 24167414540160.0, + "grad_norm": 1.8718314790938575, + "language_loss": 0.65376151, + "learning_rate": 1.4962498358987929e-07, + "loss": 0.67447037, + "num_input_tokens_seen": 315561470, + "step": 14633, + "time_per_iteration": 2.6844239234924316 + }, + { + "auxiliary_loss_clip": 0.01026605, + "auxiliary_loss_mlp": 0.01029603, + "balance_loss_clip": 1.02230215, + "balance_loss_mlp": 1.01938713, + "epoch": 0.8798436795430633, + "flos": 19284677303040.0, + "grad_norm": 1.4082523758124985, + "language_loss": 0.84291601, + "learning_rate": 1.4947721395092528e-07, + "loss": 0.86347812, + "num_input_tokens_seen": 315583140, + "step": 14634, + "time_per_iteration": 2.722062349319458 + }, + { + "auxiliary_loss_clip": 0.01040567, + "auxiliary_loss_mlp": 0.00747647, + "balance_loss_clip": 1.02417672, + "balance_loss_mlp": 1.00042009, + "epoch": 0.8799038027957312, + "flos": 28179390274560.0, + "grad_norm": 2.939705974696814, + "language_loss": 0.80007744, + "learning_rate": 1.4932951448398056e-07, + "loss": 0.81795955, + "num_input_tokens_seen": 315601935, + "step": 14635, + "time_per_iteration": 2.7172141075134277 + }, + { + "auxiliary_loss_clip": 0.0103326, + "auxiliary_loss_mlp": 0.01022199, + "balance_loss_clip": 1.02430296, + "balance_loss_mlp": 1.01179767, + "epoch": 0.8799639260483992, + "flos": 24644703484800.0, + "grad_norm": 1.8477336508694908, + "language_loss": 0.65658689, + "learning_rate": 1.4918188519464648e-07, + "loss": 0.67714149, + "num_input_tokens_seen": 315619995, + "step": 14636, + "time_per_iteration": 2.650460958480835 + }, + { + "auxiliary_loss_clip": 0.01035277, + "auxiliary_loss_mlp": 0.01031872, + "balance_loss_clip": 1.02332163, + "balance_loss_mlp": 1.02008843, + "epoch": 0.8800240493010671, + "flos": 22200479735040.0, + "grad_norm": 1.4826956811146248, + "language_loss": 0.70497632, + "learning_rate": 1.4903432608852074e-07, + "loss": 0.72564781, + "num_input_tokens_seen": 315637895, + "step": 14637, + "time_per_iteration": 2.690974712371826 + }, + { + "auxiliary_loss_clip": 0.01042954, + "auxiliary_loss_mlp": 0.01026785, + "balance_loss_clip": 1.02680922, + "balance_loss_mlp": 1.01656854, + "epoch": 0.8800841725537352, + "flos": 14246086953600.0, + "grad_norm": 1.8958367967957805, + "language_loss": 0.66185808, + "learning_rate": 1.4888683717119843e-07, + "loss": 0.6825555, + "num_input_tokens_seen": 315655520, + "step": 14638, + "time_per_iteration": 2.698699712753296 + }, + { + "auxiliary_loss_clip": 0.0105122, + "auxiliary_loss_mlp": 0.01027219, + "balance_loss_clip": 1.02410126, + "balance_loss_mlp": 1.01712203, + "epoch": 0.8801442958064031, + "flos": 37415794348800.0, + "grad_norm": 2.619973072068571, + "language_loss": 0.58049458, + "learning_rate": 1.4873941844827286e-07, + "loss": 0.60127902, + "num_input_tokens_seen": 315678955, + "step": 14639, + "time_per_iteration": 2.7691750526428223 + }, + { + "auxiliary_loss_clip": 0.01032255, + "auxiliary_loss_mlp": 0.01031054, + "balance_loss_clip": 1.0235697, + "balance_loss_mlp": 1.02030158, + "epoch": 0.8802044190590711, + "flos": 25047334010880.0, + "grad_norm": 1.7925926681966877, + "language_loss": 0.74669653, + "learning_rate": 1.4859206992533402e-07, + "loss": 0.76732963, + "num_input_tokens_seen": 315700360, + "step": 14640, + "time_per_iteration": 2.693129301071167 + }, + { + "auxiliary_loss_clip": 0.01035926, + "auxiliary_loss_mlp": 0.01037445, + "balance_loss_clip": 1.02267504, + "balance_loss_mlp": 1.02634072, + "epoch": 0.8802645423117391, + "flos": 24133874215680.0, + "grad_norm": 1.8880850473215152, + "language_loss": 0.69782382, + "learning_rate": 1.4844479160796985e-07, + "loss": 0.71855754, + "num_input_tokens_seen": 315719270, + "step": 14641, + "time_per_iteration": 2.6775646209716797 + }, + { + "auxiliary_loss_clip": 0.01052737, + "auxiliary_loss_mlp": 0.01026771, + "balance_loss_clip": 1.02459157, + "balance_loss_mlp": 1.01569676, + "epoch": 0.880324665564407, + "flos": 17931203902080.0, + "grad_norm": 2.208112177582034, + "language_loss": 0.85402524, + "learning_rate": 1.4829758350176457e-07, + "loss": 0.87482035, + "num_input_tokens_seen": 315737425, + "step": 14642, + "time_per_iteration": 4.4710094928741455 + }, + { + "auxiliary_loss_clip": 0.01031712, + "auxiliary_loss_mlp": 0.0103183, + "balance_loss_clip": 1.03218269, + "balance_loss_mlp": 1.02010608, + "epoch": 0.880384788817075, + "flos": 21287630471040.0, + "grad_norm": 1.7110063700803244, + "language_loss": 0.78830063, + "learning_rate": 1.4815044561230038e-07, + "loss": 0.80893606, + "num_input_tokens_seen": 315755725, + "step": 14643, + "time_per_iteration": 2.7469282150268555 + }, + { + "auxiliary_loss_clip": 0.01045545, + "auxiliary_loss_mlp": 0.0102242, + "balance_loss_clip": 1.02147889, + "balance_loss_mlp": 1.01296711, + "epoch": 0.880444912069743, + "flos": 12458489777280.0, + "grad_norm": 1.6531544654548906, + "language_loss": 0.72798371, + "learning_rate": 1.4800337794515705e-07, + "loss": 0.74866337, + "num_input_tokens_seen": 315773835, + "step": 14644, + "time_per_iteration": 2.6508584022521973 + }, + { + "auxiliary_loss_clip": 0.01063657, + "auxiliary_loss_mlp": 0.00747725, + "balance_loss_clip": 1.02535605, + "balance_loss_mlp": 1.00042295, + "epoch": 0.880505035322411, + "flos": 13625945619840.0, + "grad_norm": 2.1105129984362483, + "language_loss": 0.79448426, + "learning_rate": 1.47856380505911e-07, + "loss": 0.81259811, + "num_input_tokens_seen": 315790615, + "step": 14645, + "time_per_iteration": 2.555767774581909 + }, + { + "auxiliary_loss_clip": 0.01045366, + "auxiliary_loss_mlp": 0.0103379, + "balance_loss_clip": 1.0217942, + "balance_loss_mlp": 1.02329409, + "epoch": 0.8805651585750789, + "flos": 23183067254400.0, + "grad_norm": 1.87207728247864, + "language_loss": 0.64382052, + "learning_rate": 1.477094533001364e-07, + "loss": 0.66461205, + "num_input_tokens_seen": 315811010, + "step": 14646, + "time_per_iteration": 2.6970303058624268 + }, + { + "auxiliary_loss_clip": 0.0103761, + "auxiliary_loss_mlp": 0.01029332, + "balance_loss_clip": 1.02703416, + "balance_loss_mlp": 1.01784575, + "epoch": 0.8806252818277469, + "flos": 14903000835840.0, + "grad_norm": 1.9336550400328847, + "language_loss": 0.76695287, + "learning_rate": 1.475625963334055e-07, + "loss": 0.78762227, + "num_input_tokens_seen": 315828130, + "step": 14647, + "time_per_iteration": 2.7053370475769043 + }, + { + "auxiliary_loss_clip": 0.01059533, + "auxiliary_loss_mlp": 0.01026127, + "balance_loss_clip": 1.02467048, + "balance_loss_mlp": 1.01666164, + "epoch": 0.8806854050804148, + "flos": 17639178330240.0, + "grad_norm": 1.9683061286103154, + "language_loss": 0.74927139, + "learning_rate": 1.4741580961128652e-07, + "loss": 0.77012795, + "num_input_tokens_seen": 315844900, + "step": 14648, + "time_per_iteration": 2.6508264541625977 + }, + { + "auxiliary_loss_clip": 0.01040416, + "auxiliary_loss_mlp": 0.01026192, + "balance_loss_clip": 1.02247262, + "balance_loss_mlp": 1.01660717, + "epoch": 0.8807455283330828, + "flos": 25332392344320.0, + "grad_norm": 1.6768278268144599, + "language_loss": 0.65388548, + "learning_rate": 1.4726909313934522e-07, + "loss": 0.67455155, + "num_input_tokens_seen": 315863745, + "step": 14649, + "time_per_iteration": 2.7098312377929688 + }, + { + "auxiliary_loss_clip": 0.01032554, + "auxiliary_loss_mlp": 0.01027751, + "balance_loss_clip": 1.02546787, + "balance_loss_mlp": 1.01665854, + "epoch": 0.8808056515857507, + "flos": 25265168040960.0, + "grad_norm": 1.2771299320620486, + "language_loss": 0.6250779, + "learning_rate": 1.4712244692314578e-07, + "loss": 0.6456809, + "num_input_tokens_seen": 315885765, + "step": 14650, + "time_per_iteration": 2.8240082263946533 + }, + { + "auxiliary_loss_clip": 0.01034063, + "auxiliary_loss_mlp": 0.01026282, + "balance_loss_clip": 1.02280593, + "balance_loss_mlp": 1.01651835, + "epoch": 0.8808657748384188, + "flos": 26578852151040.0, + "grad_norm": 1.3977306355385304, + "language_loss": 0.72757411, + "learning_rate": 1.4697587096824914e-07, + "loss": 0.74817759, + "num_input_tokens_seen": 315907340, + "step": 14651, + "time_per_iteration": 2.687993049621582 + }, + { + "auxiliary_loss_clip": 0.01054155, + "auxiliary_loss_mlp": 0.01031082, + "balance_loss_clip": 1.02537608, + "balance_loss_mlp": 1.01972151, + "epoch": 0.8809258980910867, + "flos": 18661231918080.0, + "grad_norm": 2.0798296950136708, + "language_loss": 0.72092056, + "learning_rate": 1.4682936528021284e-07, + "loss": 0.74177289, + "num_input_tokens_seen": 315924935, + "step": 14652, + "time_per_iteration": 2.6234095096588135 + }, + { + "auxiliary_loss_clip": 0.01039455, + "auxiliary_loss_mlp": 0.01028309, + "balance_loss_clip": 1.02194405, + "balance_loss_mlp": 1.01797342, + "epoch": 0.8809860213437547, + "flos": 19792274348160.0, + "grad_norm": 1.833589113889334, + "language_loss": 0.74504054, + "learning_rate": 1.4668292986459286e-07, + "loss": 0.76571822, + "num_input_tokens_seen": 315943165, + "step": 14653, + "time_per_iteration": 2.5913937091827393 + }, + { + "auxiliary_loss_clip": 0.01065006, + "auxiliary_loss_mlp": 0.01030972, + "balance_loss_clip": 1.02593374, + "balance_loss_mlp": 1.02003491, + "epoch": 0.8810461445964227, + "flos": 17894467267200.0, + "grad_norm": 1.8124827809227662, + "language_loss": 0.70935601, + "learning_rate": 1.465365647269421e-07, + "loss": 0.7303158, + "num_input_tokens_seen": 315961340, + "step": 14654, + "time_per_iteration": 2.57250714302063 + }, + { + "auxiliary_loss_clip": 0.01021471, + "auxiliary_loss_mlp": 0.01032642, + "balance_loss_clip": 1.02437997, + "balance_loss_mlp": 1.02197254, + "epoch": 0.8811062678490906, + "flos": 29163917128320.0, + "grad_norm": 1.7880688043568596, + "language_loss": 0.71679944, + "learning_rate": 1.4639026987281012e-07, + "loss": 0.73734057, + "num_input_tokens_seen": 315981335, + "step": 14655, + "time_per_iteration": 2.7715559005737305 + }, + { + "auxiliary_loss_clip": 0.01016145, + "auxiliary_loss_mlp": 0.01034822, + "balance_loss_clip": 1.02318048, + "balance_loss_mlp": 1.02362275, + "epoch": 0.8811663911017587, + "flos": 20338834671360.0, + "grad_norm": 1.8402748983772796, + "language_loss": 0.81398112, + "learning_rate": 1.462440453077449e-07, + "loss": 0.83449078, + "num_input_tokens_seen": 316001325, + "step": 14656, + "time_per_iteration": 4.516520738601685 + }, + { + "auxiliary_loss_clip": 0.01044265, + "auxiliary_loss_mlp": 0.01026056, + "balance_loss_clip": 1.02646446, + "balance_loss_mlp": 1.01604867, + "epoch": 0.8812265143544266, + "flos": 25885704424320.0, + "grad_norm": 1.8294716206811683, + "language_loss": 0.68304771, + "learning_rate": 1.460978910372914e-07, + "loss": 0.70375097, + "num_input_tokens_seen": 316022540, + "step": 14657, + "time_per_iteration": 2.7672290802001953 + }, + { + "auxiliary_loss_clip": 0.01045164, + "auxiliary_loss_mlp": 0.01033518, + "balance_loss_clip": 1.02643704, + "balance_loss_mlp": 1.02308106, + "epoch": 0.8812866376070946, + "flos": 27195509865600.0, + "grad_norm": 1.8766698268220996, + "language_loss": 0.8395685, + "learning_rate": 1.4595180706699207e-07, + "loss": 0.86035538, + "num_input_tokens_seen": 316037735, + "step": 14658, + "time_per_iteration": 2.716696262359619 + }, + { + "auxiliary_loss_clip": 0.01046459, + "auxiliary_loss_mlp": 0.01031499, + "balance_loss_clip": 1.02616441, + "balance_loss_mlp": 1.02018023, + "epoch": 0.8813467608597625, + "flos": 23807194997760.0, + "grad_norm": 1.7751703227225375, + "language_loss": 0.77285457, + "learning_rate": 1.4580579340238554e-07, + "loss": 0.79363412, + "num_input_tokens_seen": 316058105, + "step": 14659, + "time_per_iteration": 2.7156803607940674 + }, + { + "auxiliary_loss_clip": 0.01041296, + "auxiliary_loss_mlp": 0.01028803, + "balance_loss_clip": 1.02433717, + "balance_loss_mlp": 1.01831305, + "epoch": 0.8814068841124305, + "flos": 21105455667840.0, + "grad_norm": 2.1184471449524813, + "language_loss": 0.6067754, + "learning_rate": 1.4565985004900894e-07, + "loss": 0.62747633, + "num_input_tokens_seen": 316074415, + "step": 14660, + "time_per_iteration": 2.756843090057373 + }, + { + "auxiliary_loss_clip": 0.01047751, + "auxiliary_loss_mlp": 0.01035471, + "balance_loss_clip": 1.02797651, + "balance_loss_mlp": 1.02379477, + "epoch": 0.8814670073650984, + "flos": 24716991605760.0, + "grad_norm": 1.7859814281712711, + "language_loss": 0.77545846, + "learning_rate": 1.455139770123972e-07, + "loss": 0.79629064, + "num_input_tokens_seen": 316094405, + "step": 14661, + "time_per_iteration": 2.7230849266052246 + }, + { + "auxiliary_loss_clip": 0.01028425, + "auxiliary_loss_mlp": 0.01040121, + "balance_loss_clip": 1.02823472, + "balance_loss_mlp": 1.02898133, + "epoch": 0.8815271306177664, + "flos": 22966274718720.0, + "grad_norm": 1.5235471247562107, + "language_loss": 0.77140003, + "learning_rate": 1.45368174298081e-07, + "loss": 0.79208547, + "num_input_tokens_seen": 316113390, + "step": 14662, + "time_per_iteration": 2.792545795440674 + }, + { + "auxiliary_loss_clip": 0.01014198, + "auxiliary_loss_mlp": 0.01024664, + "balance_loss_clip": 1.02135611, + "balance_loss_mlp": 1.01509762, + "epoch": 0.8815872538704344, + "flos": 19460064435840.0, + "grad_norm": 1.8975916669480812, + "language_loss": 0.73766494, + "learning_rate": 1.4522244191158929e-07, + "loss": 0.7580536, + "num_input_tokens_seen": 316131085, + "step": 14663, + "time_per_iteration": 2.723365306854248 + }, + { + "auxiliary_loss_clip": 0.01053696, + "auxiliary_loss_mlp": 0.00747609, + "balance_loss_clip": 1.02557659, + "balance_loss_mlp": 1.00046837, + "epoch": 0.8816473771231024, + "flos": 32156604622080.0, + "grad_norm": 1.562611945221028, + "language_loss": 0.69764996, + "learning_rate": 1.450767798584489e-07, + "loss": 0.71566296, + "num_input_tokens_seen": 316151440, + "step": 14664, + "time_per_iteration": 2.717536449432373 + }, + { + "auxiliary_loss_clip": 0.00992681, + "auxiliary_loss_mlp": 0.01036178, + "balance_loss_clip": 1.02112627, + "balance_loss_mlp": 1.02596176, + "epoch": 0.8817075003757703, + "flos": 19682279925120.0, + "grad_norm": 1.449670400620959, + "language_loss": 0.81018591, + "learning_rate": 1.449311881441828e-07, + "loss": 0.8304745, + "num_input_tokens_seen": 316170750, + "step": 14665, + "time_per_iteration": 3.019991874694824 + }, + { + "auxiliary_loss_clip": 0.0104034, + "auxiliary_loss_mlp": 0.01031402, + "balance_loss_clip": 1.02474141, + "balance_loss_mlp": 1.02124524, + "epoch": 0.8817676236284383, + "flos": 15668616251520.0, + "grad_norm": 2.0399569623397853, + "language_loss": 0.57864761, + "learning_rate": 1.447856667743117e-07, + "loss": 0.59936506, + "num_input_tokens_seen": 316187265, + "step": 14666, + "time_per_iteration": 3.2910947799682617 + }, + { + "auxiliary_loss_clip": 0.01055624, + "auxiliary_loss_mlp": 0.01031928, + "balance_loss_clip": 1.02774096, + "balance_loss_mlp": 1.02049565, + "epoch": 0.8818277468811063, + "flos": 17895185539200.0, + "grad_norm": 1.9688982480214139, + "language_loss": 0.837789, + "learning_rate": 1.4464021575435403e-07, + "loss": 0.85866463, + "num_input_tokens_seen": 316206555, + "step": 14667, + "time_per_iteration": 2.588752031326294 + }, + { + "auxiliary_loss_clip": 0.01061048, + "auxiliary_loss_mlp": 0.01030251, + "balance_loss_clip": 1.02425182, + "balance_loss_mlp": 1.01939154, + "epoch": 0.8818878701337742, + "flos": 18770508069120.0, + "grad_norm": 1.7982902954217106, + "language_loss": 0.62058055, + "learning_rate": 1.4449483508982563e-07, + "loss": 0.64149356, + "num_input_tokens_seen": 316225210, + "step": 14668, + "time_per_iteration": 2.547447681427002 + }, + { + "auxiliary_loss_clip": 0.01049011, + "auxiliary_loss_mlp": 0.01025422, + "balance_loss_clip": 1.02378511, + "balance_loss_mlp": 1.01657629, + "epoch": 0.8819479933864423, + "flos": 17712292464000.0, + "grad_norm": 2.2796450774543495, + "language_loss": 0.57432258, + "learning_rate": 1.4434952478623918e-07, + "loss": 0.59506691, + "num_input_tokens_seen": 316242685, + "step": 14669, + "time_per_iteration": 4.210909843444824 + }, + { + "auxiliary_loss_clip": 0.01060376, + "auxiliary_loss_mlp": 0.01030492, + "balance_loss_clip": 1.02390492, + "balance_loss_mlp": 1.02038348, + "epoch": 0.8820081166391102, + "flos": 11728749070080.0, + "grad_norm": 1.9032682704862627, + "language_loss": 0.71509433, + "learning_rate": 1.442042848491043e-07, + "loss": 0.73600304, + "num_input_tokens_seen": 316260935, + "step": 14670, + "time_per_iteration": 4.180176019668579 + }, + { + "auxiliary_loss_clip": 0.0104152, + "auxiliary_loss_mlp": 0.01030304, + "balance_loss_clip": 1.02133119, + "balance_loss_mlp": 1.01986682, + "epoch": 0.8820682398917782, + "flos": 27490372611840.0, + "grad_norm": 2.3677293571705493, + "language_loss": 0.74046266, + "learning_rate": 1.44059115283929e-07, + "loss": 0.76118082, + "num_input_tokens_seen": 316281190, + "step": 14671, + "time_per_iteration": 2.6491057872772217 + }, + { + "auxiliary_loss_clip": 0.01041011, + "auxiliary_loss_mlp": 0.0102604, + "balance_loss_clip": 1.02243209, + "balance_loss_mlp": 1.0155195, + "epoch": 0.8821283631444461, + "flos": 16873850223360.0, + "grad_norm": 2.2013577279151075, + "language_loss": 0.84445679, + "learning_rate": 1.43914016096218e-07, + "loss": 0.86512733, + "num_input_tokens_seen": 316297115, + "step": 14672, + "time_per_iteration": 2.6749043464660645 + }, + { + "auxiliary_loss_clip": 0.01025713, + "auxiliary_loss_mlp": 0.01029619, + "balance_loss_clip": 1.0224154, + "balance_loss_mlp": 1.01905751, + "epoch": 0.8821884863971141, + "flos": 24280964409600.0, + "grad_norm": 2.42469389196792, + "language_loss": 0.72602129, + "learning_rate": 1.4376898729147336e-07, + "loss": 0.74657458, + "num_input_tokens_seen": 316318235, + "step": 14673, + "time_per_iteration": 2.7756540775299072 + }, + { + "auxiliary_loss_clip": 0.00987266, + "auxiliary_loss_mlp": 0.01011456, + "balance_loss_clip": 1.00176191, + "balance_loss_mlp": 1.01053762, + "epoch": 0.882248609649782, + "flos": 59432342492160.0, + "grad_norm": 0.8177381820249265, + "language_loss": 0.4946216, + "learning_rate": 1.4362402887519487e-07, + "loss": 0.51460886, + "num_input_tokens_seen": 316384705, + "step": 14674, + "time_per_iteration": 3.352430582046509 + }, + { + "auxiliary_loss_clip": 0.01035713, + "auxiliary_loss_mlp": 0.00747625, + "balance_loss_clip": 1.02269685, + "balance_loss_mlp": 1.00038826, + "epoch": 0.88230873290245, + "flos": 19937784343680.0, + "grad_norm": 2.056089343222039, + "language_loss": 0.76598537, + "learning_rate": 1.4347914085287971e-07, + "loss": 0.78381872, + "num_input_tokens_seen": 316401165, + "step": 14675, + "time_per_iteration": 2.6591148376464844 + }, + { + "auxiliary_loss_clip": 0.01040099, + "auxiliary_loss_mlp": 0.0103129, + "balance_loss_clip": 1.02444601, + "balance_loss_mlp": 1.02097249, + "epoch": 0.882368856155118, + "flos": 16362769559040.0, + "grad_norm": 1.69811521905194, + "language_loss": 0.79607427, + "learning_rate": 1.4333432323002105e-07, + "loss": 0.81678826, + "num_input_tokens_seen": 316418780, + "step": 14676, + "time_per_iteration": 2.6642420291900635 + }, + { + "auxiliary_loss_clip": 0.00981841, + "auxiliary_loss_mlp": 0.01002239, + "balance_loss_clip": 1.00484192, + "balance_loss_mlp": 1.00114822, + "epoch": 0.882428979407786, + "flos": 70594563277440.0, + "grad_norm": 0.7133179120924795, + "language_loss": 0.54752409, + "learning_rate": 1.431895760121109e-07, + "loss": 0.56736493, + "num_input_tokens_seen": 316482030, + "step": 14677, + "time_per_iteration": 3.42979097366333 + }, + { + "auxiliary_loss_clip": 0.01059024, + "auxiliary_loss_mlp": 0.01022394, + "balance_loss_clip": 1.02391446, + "balance_loss_mlp": 1.01227903, + "epoch": 0.8824891026604539, + "flos": 18150294908160.0, + "grad_norm": 2.170283726997348, + "language_loss": 0.64921701, + "learning_rate": 1.4304489920463847e-07, + "loss": 0.67003119, + "num_input_tokens_seen": 316499175, + "step": 14678, + "time_per_iteration": 2.586503028869629 + }, + { + "auxiliary_loss_clip": 0.01037618, + "auxiliary_loss_mlp": 0.01029795, + "balance_loss_clip": 1.02221441, + "balance_loss_mlp": 1.01901841, + "epoch": 0.8825492259131219, + "flos": 27232713377280.0, + "grad_norm": 2.352661392075301, + "language_loss": 0.71175957, + "learning_rate": 1.4290029281308936e-07, + "loss": 0.73243368, + "num_input_tokens_seen": 316519495, + "step": 14679, + "time_per_iteration": 2.7177090644836426 + }, + { + "auxiliary_loss_clip": 0.01038176, + "auxiliary_loss_mlp": 0.01026639, + "balance_loss_clip": 1.02361417, + "balance_loss_mlp": 1.01772833, + "epoch": 0.8826093491657898, + "flos": 22274419881600.0, + "grad_norm": 1.9403906165669744, + "language_loss": 0.64045966, + "learning_rate": 1.4275575684294694e-07, + "loss": 0.66110778, + "num_input_tokens_seen": 316538180, + "step": 14680, + "time_per_iteration": 2.6504766941070557 + }, + { + "auxiliary_loss_clip": 0.01061275, + "auxiliary_loss_mlp": 0.01028396, + "balance_loss_clip": 1.02474809, + "balance_loss_mlp": 1.01741052, + "epoch": 0.8826694724184578, + "flos": 14204753377920.0, + "grad_norm": 2.3309976120430553, + "language_loss": 0.77281761, + "learning_rate": 1.4261129129969328e-07, + "loss": 0.79371428, + "num_input_tokens_seen": 316551750, + "step": 14681, + "time_per_iteration": 2.541548013687134 + }, + { + "auxiliary_loss_clip": 0.01038034, + "auxiliary_loss_mlp": 0.01029449, + "balance_loss_clip": 1.02302706, + "balance_loss_mlp": 1.0183866, + "epoch": 0.8827295956711259, + "flos": 20631686256000.0, + "grad_norm": 2.9165698281054997, + "language_loss": 0.72705001, + "learning_rate": 1.424668961888047e-07, + "loss": 0.74772483, + "num_input_tokens_seen": 316570680, + "step": 14682, + "time_per_iteration": 2.6778948307037354 + }, + { + "auxiliary_loss_clip": 0.01029703, + "auxiliary_loss_mlp": 0.0102519, + "balance_loss_clip": 1.02907801, + "balance_loss_mlp": 1.01384139, + "epoch": 0.8827897189237938, + "flos": 18513064316160.0, + "grad_norm": 1.7675043861377702, + "language_loss": 0.74694872, + "learning_rate": 1.4232257151575765e-07, + "loss": 0.76749766, + "num_input_tokens_seen": 316588635, + "step": 14683, + "time_per_iteration": 2.8037939071655273 + }, + { + "auxiliary_loss_clip": 0.01028089, + "auxiliary_loss_mlp": 0.01028233, + "balance_loss_clip": 1.02304506, + "balance_loss_mlp": 1.01773024, + "epoch": 0.8828498421764618, + "flos": 22747399194240.0, + "grad_norm": 1.7228389617363349, + "language_loss": 0.66001368, + "learning_rate": 1.4217831728602492e-07, + "loss": 0.68057692, + "num_input_tokens_seen": 316607550, + "step": 14684, + "time_per_iteration": 2.707082748413086 + }, + { + "auxiliary_loss_clip": 0.01049691, + "auxiliary_loss_mlp": 0.01027662, + "balance_loss_clip": 1.0240128, + "balance_loss_mlp": 1.01779771, + "epoch": 0.8829099654291297, + "flos": 15012384727680.0, + "grad_norm": 1.9354306781377248, + "language_loss": 0.69322884, + "learning_rate": 1.4203413350507677e-07, + "loss": 0.71400237, + "num_input_tokens_seen": 316624460, + "step": 14685, + "time_per_iteration": 2.631654977798462 + }, + { + "auxiliary_loss_clip": 0.01010752, + "auxiliary_loss_mlp": 0.01031084, + "balance_loss_clip": 1.02492881, + "balance_loss_mlp": 1.01906729, + "epoch": 0.8829700886817977, + "flos": 16720546976640.0, + "grad_norm": 1.7763819455420802, + "language_loss": 0.74203825, + "learning_rate": 1.418900201783806e-07, + "loss": 0.7624566, + "num_input_tokens_seen": 316640765, + "step": 14686, + "time_per_iteration": 2.8755202293395996 + }, + { + "auxiliary_loss_clip": 0.01011263, + "auxiliary_loss_mlp": 0.01025789, + "balance_loss_clip": 1.02139819, + "balance_loss_mlp": 1.01524472, + "epoch": 0.8830302119344656, + "flos": 15263256291840.0, + "grad_norm": 1.8872499374778162, + "language_loss": 0.62337232, + "learning_rate": 1.417459773114007e-07, + "loss": 0.64374286, + "num_input_tokens_seen": 316656120, + "step": 14687, + "time_per_iteration": 2.8098387718200684 + }, + { + "auxiliary_loss_clip": 0.01054839, + "auxiliary_loss_mlp": 0.01033216, + "balance_loss_clip": 1.02635288, + "balance_loss_mlp": 1.0229342, + "epoch": 0.8830903351871336, + "flos": 28617751854720.0, + "grad_norm": 2.0199322693137125, + "language_loss": 0.68717277, + "learning_rate": 1.4160200490959984e-07, + "loss": 0.70805329, + "num_input_tokens_seen": 316676095, + "step": 14688, + "time_per_iteration": 4.462230205535889 + }, + { + "auxiliary_loss_clip": 0.01048794, + "auxiliary_loss_mlp": 0.01024531, + "balance_loss_clip": 1.0240345, + "balance_loss_mlp": 1.01436806, + "epoch": 0.8831504584398016, + "flos": 28001632844160.0, + "grad_norm": 1.6700176586071367, + "language_loss": 0.67090654, + "learning_rate": 1.4145810297843697e-07, + "loss": 0.69163978, + "num_input_tokens_seen": 316696235, + "step": 14689, + "time_per_iteration": 2.734381675720215 + }, + { + "auxiliary_loss_clip": 0.01034224, + "auxiliary_loss_mlp": 0.0102586, + "balance_loss_clip": 1.02631783, + "balance_loss_mlp": 1.01598918, + "epoch": 0.8832105816924696, + "flos": 26579642250240.0, + "grad_norm": 1.334480696231071, + "language_loss": 0.74634093, + "learning_rate": 1.4131427152336905e-07, + "loss": 0.76694179, + "num_input_tokens_seen": 316719680, + "step": 14690, + "time_per_iteration": 2.8378074169158936 + }, + { + "auxiliary_loss_clip": 0.01034835, + "auxiliary_loss_mlp": 0.01030991, + "balance_loss_clip": 1.022084, + "balance_loss_mlp": 1.02005374, + "epoch": 0.8832707049451375, + "flos": 24898771359360.0, + "grad_norm": 1.4035083582075008, + "language_loss": 0.72818106, + "learning_rate": 1.4117051054985018e-07, + "loss": 0.74883938, + "num_input_tokens_seen": 316739830, + "step": 14691, + "time_per_iteration": 2.7635128498077393 + }, + { + "auxiliary_loss_clip": 0.01035862, + "auxiliary_loss_mlp": 0.01023525, + "balance_loss_clip": 1.02612281, + "balance_loss_mlp": 1.01263511, + "epoch": 0.8833308281978055, + "flos": 15451141357440.0, + "grad_norm": 1.6876517271632625, + "language_loss": 0.52453363, + "learning_rate": 1.4102682006333243e-07, + "loss": 0.54512751, + "num_input_tokens_seen": 316758105, + "step": 14692, + "time_per_iteration": 2.740262508392334 + }, + { + "auxiliary_loss_clip": 0.01034061, + "auxiliary_loss_mlp": 0.01031782, + "balance_loss_clip": 1.02631831, + "balance_loss_mlp": 1.02148211, + "epoch": 0.8833909514504734, + "flos": 20301523418880.0, + "grad_norm": 1.798889074111789, + "language_loss": 0.6041435, + "learning_rate": 1.4088320006926346e-07, + "loss": 0.62480199, + "num_input_tokens_seen": 316777455, + "step": 14693, + "time_per_iteration": 2.68514347076416 + }, + { + "auxiliary_loss_clip": 0.01059944, + "auxiliary_loss_mlp": 0.01023132, + "balance_loss_clip": 1.02562046, + "balance_loss_mlp": 1.01358986, + "epoch": 0.8834510747031414, + "flos": 20374027021440.0, + "grad_norm": 1.488613354860105, + "language_loss": 0.75242627, + "learning_rate": 1.407396505730898e-07, + "loss": 0.77325702, + "num_input_tokens_seen": 316796300, + "step": 14694, + "time_per_iteration": 2.56644606590271 + }, + { + "auxiliary_loss_clip": 0.01043261, + "auxiliary_loss_mlp": 0.01025512, + "balance_loss_clip": 1.02280188, + "balance_loss_mlp": 1.01549208, + "epoch": 0.8835111979558095, + "flos": 29752026508800.0, + "grad_norm": 1.9440581761917475, + "language_loss": 0.72808075, + "learning_rate": 1.4059617158025527e-07, + "loss": 0.74876851, + "num_input_tokens_seen": 316819090, + "step": 14695, + "time_per_iteration": 2.6295623779296875 + }, + { + "auxiliary_loss_clip": 0.01047823, + "auxiliary_loss_mlp": 0.01021614, + "balance_loss_clip": 1.02451253, + "balance_loss_mlp": 1.01236343, + "epoch": 0.8835713212084774, + "flos": 24134556574080.0, + "grad_norm": 1.5933306922211552, + "language_loss": 0.79977667, + "learning_rate": 1.404527630961998e-07, + "loss": 0.82047105, + "num_input_tokens_seen": 316839250, + "step": 14696, + "time_per_iteration": 2.6957550048828125 + }, + { + "auxiliary_loss_clip": 0.01021807, + "auxiliary_loss_mlp": 0.01028227, + "balance_loss_clip": 1.02464974, + "balance_loss_mlp": 1.01870835, + "epoch": 0.8836314444611454, + "flos": 27672331933440.0, + "grad_norm": 1.3940948821667505, + "language_loss": 0.74812293, + "learning_rate": 1.4030942512636236e-07, + "loss": 0.76862329, + "num_input_tokens_seen": 316861315, + "step": 14697, + "time_per_iteration": 2.8070991039276123 + }, + { + "auxiliary_loss_clip": 0.01041841, + "auxiliary_loss_mlp": 0.01029866, + "balance_loss_clip": 1.02452278, + "balance_loss_mlp": 1.01972723, + "epoch": 0.8836915677138133, + "flos": 16836969934080.0, + "grad_norm": 1.8204553845780538, + "language_loss": 0.71824902, + "learning_rate": 1.401661576761779e-07, + "loss": 0.73896611, + "num_input_tokens_seen": 316879325, + "step": 14698, + "time_per_iteration": 2.658463478088379 + }, + { + "auxiliary_loss_clip": 0.00996576, + "auxiliary_loss_mlp": 0.0100263, + "balance_loss_clip": 1.00166893, + "balance_loss_mlp": 1.00167608, + "epoch": 0.8837516909664813, + "flos": 69310540823040.0, + "grad_norm": 0.8089777021245397, + "language_loss": 0.5375706, + "learning_rate": 1.4002296075107856e-07, + "loss": 0.55756271, + "num_input_tokens_seen": 316936425, + "step": 14699, + "time_per_iteration": 3.395376205444336 + }, + { + "auxiliary_loss_clip": 0.01039138, + "auxiliary_loss_mlp": 0.0102574, + "balance_loss_clip": 1.02373409, + "balance_loss_mlp": 1.01493931, + "epoch": 0.8838118142191492, + "flos": 21324726241920.0, + "grad_norm": 1.6465972115028258, + "language_loss": 0.76807863, + "learning_rate": 1.3987983435649508e-07, + "loss": 0.7887274, + "num_input_tokens_seen": 316956360, + "step": 14700, + "time_per_iteration": 2.759037971496582 + }, + { + "auxiliary_loss_clip": 0.01031217, + "auxiliary_loss_mlp": 0.01027148, + "balance_loss_clip": 1.02536058, + "balance_loss_mlp": 1.0172832, + "epoch": 0.8838719374718172, + "flos": 21470559459840.0, + "grad_norm": 2.4971027432639743, + "language_loss": 0.72877127, + "learning_rate": 1.3973677849785494e-07, + "loss": 0.74935496, + "num_input_tokens_seen": 316975295, + "step": 14701, + "time_per_iteration": 2.8009097576141357 + }, + { + "auxiliary_loss_clip": 0.0103696, + "auxiliary_loss_mlp": 0.01031398, + "balance_loss_clip": 1.02311921, + "balance_loss_mlp": 1.01937616, + "epoch": 0.8839320607244852, + "flos": 26468929555200.0, + "grad_norm": 1.6599722798052279, + "language_loss": 0.70990133, + "learning_rate": 1.3959379318058262e-07, + "loss": 0.73058498, + "num_input_tokens_seen": 316994520, + "step": 14702, + "time_per_iteration": 2.72894024848938 + }, + { + "auxiliary_loss_clip": 0.01023001, + "auxiliary_loss_mlp": 0.0103599, + "balance_loss_clip": 1.02282357, + "balance_loss_mlp": 1.02393794, + "epoch": 0.8839921839771532, + "flos": 45222270923520.0, + "grad_norm": 1.6618952552989554, + "language_loss": 0.71564531, + "learning_rate": 1.3945087841010006e-07, + "loss": 0.73623526, + "num_input_tokens_seen": 317018095, + "step": 14703, + "time_per_iteration": 4.645068407058716 + }, + { + "auxiliary_loss_clip": 0.01022033, + "auxiliary_loss_mlp": 0.01024046, + "balance_loss_clip": 1.02593553, + "balance_loss_mlp": 1.01486731, + "epoch": 0.8840523072298211, + "flos": 20006876154240.0, + "grad_norm": 1.7253270408414412, + "language_loss": 0.66754639, + "learning_rate": 1.3930803419182645e-07, + "loss": 0.68800724, + "num_input_tokens_seen": 317035755, + "step": 14704, + "time_per_iteration": 2.7738828659057617 + }, + { + "auxiliary_loss_clip": 0.01039161, + "auxiliary_loss_mlp": 0.01023976, + "balance_loss_clip": 1.02212048, + "balance_loss_mlp": 1.0146271, + "epoch": 0.8841124304824891, + "flos": 24426007528320.0, + "grad_norm": 1.5642472140398174, + "language_loss": 0.70494509, + "learning_rate": 1.3916526053117905e-07, + "loss": 0.7255764, + "num_input_tokens_seen": 317055765, + "step": 14705, + "time_per_iteration": 2.706601858139038 + }, + { + "auxiliary_loss_clip": 0.01042055, + "auxiliary_loss_mlp": 0.01028505, + "balance_loss_clip": 1.02575946, + "balance_loss_mlp": 1.01942706, + "epoch": 0.884172553735157, + "flos": 31284622056960.0, + "grad_norm": 1.3943531869880674, + "language_loss": 0.70863163, + "learning_rate": 1.3902255743357104e-07, + "loss": 0.72933728, + "num_input_tokens_seen": 317077955, + "step": 14706, + "time_per_iteration": 2.745323896408081 + }, + { + "auxiliary_loss_clip": 0.01051887, + "auxiliary_loss_mlp": 0.01026845, + "balance_loss_clip": 1.02460015, + "balance_loss_mlp": 1.01695013, + "epoch": 0.884232676987825, + "flos": 21391160446080.0, + "grad_norm": 1.7629754094798638, + "language_loss": 0.74421215, + "learning_rate": 1.3887992490441413e-07, + "loss": 0.76499945, + "num_input_tokens_seen": 317095825, + "step": 14707, + "time_per_iteration": 2.5908892154693604 + }, + { + "auxiliary_loss_clip": 0.00979461, + "auxiliary_loss_mlp": 0.01005854, + "balance_loss_clip": 1.00365114, + "balance_loss_mlp": 1.00492465, + "epoch": 0.8842928002404931, + "flos": 57911451799680.0, + "grad_norm": 0.7977510200811091, + "language_loss": 0.60367894, + "learning_rate": 1.387373629491173e-07, + "loss": 0.62353206, + "num_input_tokens_seen": 317152875, + "step": 14708, + "time_per_iteration": 3.1271181106567383 + }, + { + "auxiliary_loss_clip": 0.01030962, + "auxiliary_loss_mlp": 0.0103167, + "balance_loss_clip": 1.02158701, + "balance_loss_mlp": 1.02239537, + "epoch": 0.884352923493161, + "flos": 41463896186880.0, + "grad_norm": 1.6384621197545906, + "language_loss": 0.66731, + "learning_rate": 1.3859487157308625e-07, + "loss": 0.68793631, + "num_input_tokens_seen": 317176725, + "step": 14709, + "time_per_iteration": 2.829517364501953 + }, + { + "auxiliary_loss_clip": 0.01045375, + "auxiliary_loss_mlp": 0.01031876, + "balance_loss_clip": 1.02523327, + "balance_loss_mlp": 1.02044415, + "epoch": 0.884413046745829, + "flos": 46541234332800.0, + "grad_norm": 1.4303219728483232, + "language_loss": 0.62451935, + "learning_rate": 1.3845245078172373e-07, + "loss": 0.64529181, + "num_input_tokens_seen": 317206880, + "step": 14710, + "time_per_iteration": 2.8958537578582764 + }, + { + "auxiliary_loss_clip": 0.01029749, + "auxiliary_loss_mlp": 0.01021282, + "balance_loss_clip": 1.02378631, + "balance_loss_mlp": 1.01194787, + "epoch": 0.8844731699984969, + "flos": 19135324552320.0, + "grad_norm": 2.4785348299816854, + "language_loss": 0.63753283, + "learning_rate": 1.38310100580431e-07, + "loss": 0.65804315, + "num_input_tokens_seen": 317224135, + "step": 14711, + "time_per_iteration": 2.651902198791504 + }, + { + "auxiliary_loss_clip": 0.01029045, + "auxiliary_loss_mlp": 0.01029976, + "balance_loss_clip": 1.0227648, + "balance_loss_mlp": 1.01910996, + "epoch": 0.8845332932511649, + "flos": 23260634674560.0, + "grad_norm": 1.8815862007194872, + "language_loss": 0.76399273, + "learning_rate": 1.38167820974606e-07, + "loss": 0.78458285, + "num_input_tokens_seen": 317244505, + "step": 14712, + "time_per_iteration": 2.755500555038452 + }, + { + "auxiliary_loss_clip": 0.01000219, + "auxiliary_loss_mlp": 0.01023996, + "balance_loss_clip": 1.02008867, + "balance_loss_mlp": 1.01340985, + "epoch": 0.8845934165038328, + "flos": 17564591738880.0, + "grad_norm": 2.180081623330147, + "language_loss": 0.81323898, + "learning_rate": 1.3802561196964368e-07, + "loss": 0.83348107, + "num_input_tokens_seen": 317257830, + "step": 14713, + "time_per_iteration": 2.7397725582122803 + }, + { + "auxiliary_loss_clip": 0.01042245, + "auxiliary_loss_mlp": 0.01027804, + "balance_loss_clip": 1.02481413, + "balance_loss_mlp": 1.01678896, + "epoch": 0.8846535397565009, + "flos": 27485739757440.0, + "grad_norm": 1.5336961095253778, + "language_loss": 0.55679011, + "learning_rate": 1.3788347357093688e-07, + "loss": 0.57749069, + "num_input_tokens_seen": 317278430, + "step": 14714, + "time_per_iteration": 2.7089309692382812 + }, + { + "auxiliary_loss_clip": 0.01011014, + "auxiliary_loss_mlp": 0.01035295, + "balance_loss_clip": 1.02673626, + "balance_loss_mlp": 1.02375555, + "epoch": 0.8847136630091688, + "flos": 28761430256640.0, + "grad_norm": 1.7637989359626065, + "language_loss": 0.73678839, + "learning_rate": 1.377414057838755e-07, + "loss": 0.7572515, + "num_input_tokens_seen": 317295970, + "step": 14715, + "time_per_iteration": 2.8474409580230713 + }, + { + "auxiliary_loss_clip": 0.01050611, + "auxiliary_loss_mlp": 0.0102991, + "balance_loss_clip": 1.02392769, + "balance_loss_mlp": 1.02017653, + "epoch": 0.8847737862618368, + "flos": 23476924419840.0, + "grad_norm": 1.7215297494868864, + "language_loss": 0.75225747, + "learning_rate": 1.375994086138461e-07, + "loss": 0.77306271, + "num_input_tokens_seen": 317316185, + "step": 14716, + "time_per_iteration": 4.320281505584717 + }, + { + "auxiliary_loss_clip": 0.01027452, + "auxiliary_loss_mlp": 0.01033701, + "balance_loss_clip": 1.02401733, + "balance_loss_mlp": 1.02378821, + "epoch": 0.8848339095145047, + "flos": 18660872782080.0, + "grad_norm": 2.242273398353935, + "language_loss": 0.70884448, + "learning_rate": 1.3745748206623397e-07, + "loss": 0.72945595, + "num_input_tokens_seen": 317333275, + "step": 14717, + "time_per_iteration": 4.332103252410889 + }, + { + "auxiliary_loss_clip": 0.0104653, + "auxiliary_loss_mlp": 0.01025174, + "balance_loss_clip": 1.02330995, + "balance_loss_mlp": 1.01575053, + "epoch": 0.8848940327671727, + "flos": 32270298145920.0, + "grad_norm": 1.9378073128677817, + "language_loss": 0.74306172, + "learning_rate": 1.373156261464208e-07, + "loss": 0.76377869, + "num_input_tokens_seen": 317351245, + "step": 14718, + "time_per_iteration": 2.7048885822296143 + }, + { + "auxiliary_loss_clip": 0.0101565, + "auxiliary_loss_mlp": 0.01027297, + "balance_loss_clip": 1.0253756, + "balance_loss_mlp": 1.01625228, + "epoch": 0.8849541560198406, + "flos": 24021832717440.0, + "grad_norm": 1.8662571767628555, + "language_loss": 0.78625679, + "learning_rate": 1.3717384085978602e-07, + "loss": 0.80668616, + "num_input_tokens_seen": 317370740, + "step": 14719, + "time_per_iteration": 2.882565498352051 + }, + { + "auxiliary_loss_clip": 0.01061657, + "auxiliary_loss_mlp": 0.0102589, + "balance_loss_clip": 1.02493799, + "balance_loss_mlp": 1.0154413, + "epoch": 0.8850142792725086, + "flos": 16873060124160.0, + "grad_norm": 1.6641636313682806, + "language_loss": 0.72422928, + "learning_rate": 1.3703212621170579e-07, + "loss": 0.74510473, + "num_input_tokens_seen": 317388370, + "step": 14720, + "time_per_iteration": 2.6257641315460205 + }, + { + "auxiliary_loss_clip": 0.01042951, + "auxiliary_loss_mlp": 0.01026882, + "balance_loss_clip": 1.02461326, + "balance_loss_mlp": 1.01617742, + "epoch": 0.8850744025251767, + "flos": 24024059360640.0, + "grad_norm": 1.9524119406853333, + "language_loss": 0.82599628, + "learning_rate": 1.3689048220755383e-07, + "loss": 0.84669459, + "num_input_tokens_seen": 317407390, + "step": 14721, + "time_per_iteration": 2.7051398754119873 + }, + { + "auxiliary_loss_clip": 0.01038419, + "auxiliary_loss_mlp": 0.01031272, + "balance_loss_clip": 1.0216887, + "balance_loss_mlp": 1.02027464, + "epoch": 0.8851345257778446, + "flos": 47955575329920.0, + "grad_norm": 1.7156230517804214, + "language_loss": 0.62353742, + "learning_rate": 1.3674890885270186e-07, + "loss": 0.64423436, + "num_input_tokens_seen": 317430825, + "step": 14722, + "time_per_iteration": 2.92584490776062 + }, + { + "auxiliary_loss_clip": 0.01052374, + "auxiliary_loss_mlp": 0.01024738, + "balance_loss_clip": 1.02460802, + "balance_loss_mlp": 1.01443195, + "epoch": 0.8851946490305126, + "flos": 36611000173440.0, + "grad_norm": 1.9267383140434349, + "language_loss": 0.6877259, + "learning_rate": 1.3660740615251754e-07, + "loss": 0.70849699, + "num_input_tokens_seen": 317451905, + "step": 14723, + "time_per_iteration": 2.7470219135284424 + }, + { + "auxiliary_loss_clip": 0.01023657, + "auxiliary_loss_mlp": 0.01031314, + "balance_loss_clip": 1.02126813, + "balance_loss_mlp": 1.02090728, + "epoch": 0.8852547722831805, + "flos": 21544248211200.0, + "grad_norm": 1.6206212380600775, + "language_loss": 0.77934074, + "learning_rate": 1.3646597411236703e-07, + "loss": 0.7998904, + "num_input_tokens_seen": 317470030, + "step": 14724, + "time_per_iteration": 2.743112087249756 + }, + { + "auxiliary_loss_clip": 0.00996339, + "auxiliary_loss_mlp": 0.01000018, + "balance_loss_clip": 1.00155473, + "balance_loss_mlp": 0.99905819, + "epoch": 0.8853148955358485, + "flos": 63059246472960.0, + "grad_norm": 0.8072340632179058, + "language_loss": 0.5896309, + "learning_rate": 1.363246127376143e-07, + "loss": 0.60959446, + "num_input_tokens_seen": 317527460, + "step": 14725, + "time_per_iteration": 3.086089849472046 + }, + { + "auxiliary_loss_clip": 0.01038932, + "auxiliary_loss_mlp": 0.00747818, + "balance_loss_clip": 1.02360988, + "balance_loss_mlp": 1.00043499, + "epoch": 0.8853750187885164, + "flos": 18149828031360.0, + "grad_norm": 1.9943123638468305, + "language_loss": 0.68825114, + "learning_rate": 1.3618332203361837e-07, + "loss": 0.70611864, + "num_input_tokens_seen": 317544070, + "step": 14726, + "time_per_iteration": 2.671433210372925 + }, + { + "auxiliary_loss_clip": 0.0104655, + "auxiliary_loss_mlp": 0.00747555, + "balance_loss_clip": 1.02375865, + "balance_loss_mlp": 1.00038218, + "epoch": 0.8854351420411845, + "flos": 39570542392320.0, + "grad_norm": 1.2021063866593187, + "language_loss": 0.69643831, + "learning_rate": 1.3604210200573785e-07, + "loss": 0.71437937, + "num_input_tokens_seen": 317570275, + "step": 14727, + "time_per_iteration": 2.79634952545166 + }, + { + "auxiliary_loss_clip": 0.01044548, + "auxiliary_loss_mlp": 0.01032872, + "balance_loss_clip": 1.02761006, + "balance_loss_mlp": 1.02210808, + "epoch": 0.8854952652938524, + "flos": 23769309127680.0, + "grad_norm": 1.5170620948258642, + "language_loss": 0.69927377, + "learning_rate": 1.3590095265932733e-07, + "loss": 0.72004795, + "num_input_tokens_seen": 317590160, + "step": 14728, + "time_per_iteration": 2.6750078201293945 + }, + { + "auxiliary_loss_clip": 0.01031912, + "auxiliary_loss_mlp": 0.01027326, + "balance_loss_clip": 1.024575, + "balance_loss_mlp": 1.01736593, + "epoch": 0.8855553885465204, + "flos": 18290310122880.0, + "grad_norm": 2.2289977487237276, + "language_loss": 0.66641283, + "learning_rate": 1.3575987399973987e-07, + "loss": 0.68700522, + "num_input_tokens_seen": 317608340, + "step": 14729, + "time_per_iteration": 2.6588518619537354 + }, + { + "auxiliary_loss_clip": 0.01033152, + "auxiliary_loss_mlp": 0.01027945, + "balance_loss_clip": 1.02585459, + "balance_loss_mlp": 1.01816952, + "epoch": 0.8856155117991883, + "flos": 36867402432000.0, + "grad_norm": 1.682275416703746, + "language_loss": 0.62962329, + "learning_rate": 1.3561886603232453e-07, + "loss": 0.65023422, + "num_input_tokens_seen": 317629910, + "step": 14730, + "time_per_iteration": 2.7791154384613037 + }, + { + "auxiliary_loss_clip": 0.01031993, + "auxiliary_loss_mlp": 0.01027918, + "balance_loss_clip": 1.02491188, + "balance_loss_mlp": 1.01827407, + "epoch": 0.8856756350518563, + "flos": 22163886754560.0, + "grad_norm": 1.6279553711021688, + "language_loss": 0.79474914, + "learning_rate": 1.3547792876242904e-07, + "loss": 0.81534827, + "num_input_tokens_seen": 317650265, + "step": 14731, + "time_per_iteration": 2.7261343002319336 + }, + { + "auxiliary_loss_clip": 0.01025868, + "auxiliary_loss_mlp": 0.01031522, + "balance_loss_clip": 1.02037323, + "balance_loss_mlp": 1.02131176, + "epoch": 0.8857357583045242, + "flos": 20740962407040.0, + "grad_norm": 1.5563998768589407, + "language_loss": 0.83177149, + "learning_rate": 1.3533706219539708e-07, + "loss": 0.85234541, + "num_input_tokens_seen": 317669045, + "step": 14732, + "time_per_iteration": 2.695039749145508 + }, + { + "auxiliary_loss_clip": 0.00987208, + "auxiliary_loss_mlp": 0.01000805, + "balance_loss_clip": 1.0019927, + "balance_loss_mlp": 0.99995232, + "epoch": 0.8857958815571922, + "flos": 69892329409920.0, + "grad_norm": 1.4760671259538627, + "language_loss": 0.59945148, + "learning_rate": 1.3519626633657045e-07, + "loss": 0.6193316, + "num_input_tokens_seen": 317728065, + "step": 14733, + "time_per_iteration": 3.2185091972351074 + }, + { + "auxiliary_loss_clip": 0.01062748, + "auxiliary_loss_mlp": 0.00747558, + "balance_loss_clip": 1.0260464, + "balance_loss_mlp": 1.00036895, + "epoch": 0.8858560048098603, + "flos": 15121948187520.0, + "grad_norm": 1.8332459956857485, + "language_loss": 0.67171293, + "learning_rate": 1.3505554119128838e-07, + "loss": 0.689816, + "num_input_tokens_seen": 317746120, + "step": 14734, + "time_per_iteration": 2.5859122276306152 + }, + { + "auxiliary_loss_clip": 0.01037005, + "auxiliary_loss_mlp": 0.01032001, + "balance_loss_clip": 1.02502644, + "balance_loss_mlp": 1.02186179, + "epoch": 0.8859161280625282, + "flos": 16611019430400.0, + "grad_norm": 2.0974273381444024, + "language_loss": 0.75426483, + "learning_rate": 1.3491488676488682e-07, + "loss": 0.77495492, + "num_input_tokens_seen": 317762280, + "step": 14735, + "time_per_iteration": 4.542329549789429 + }, + { + "auxiliary_loss_clip": 0.01022969, + "auxiliary_loss_mlp": 0.01030878, + "balance_loss_clip": 1.02243924, + "balance_loss_mlp": 1.0199281, + "epoch": 0.8859762513151962, + "flos": 18694484933760.0, + "grad_norm": 2.160476314975451, + "language_loss": 0.7062403, + "learning_rate": 1.3477430306270066e-07, + "loss": 0.72677881, + "num_input_tokens_seen": 317780615, + "step": 14736, + "time_per_iteration": 2.8469295501708984 + }, + { + "auxiliary_loss_clip": 0.01038575, + "auxiliary_loss_mlp": 0.01030174, + "balance_loss_clip": 1.02802694, + "balance_loss_mlp": 1.01921916, + "epoch": 0.8860363745678641, + "flos": 19536877670400.0, + "grad_norm": 1.7064571276227234, + "language_loss": 0.84183103, + "learning_rate": 1.3463379009005892e-07, + "loss": 0.86251855, + "num_input_tokens_seen": 317798830, + "step": 14737, + "time_per_iteration": 2.729799747467041 + }, + { + "auxiliary_loss_clip": 0.01035802, + "auxiliary_loss_mlp": 0.01032051, + "balance_loss_clip": 1.02505231, + "balance_loss_mlp": 1.02013636, + "epoch": 0.8860964978205321, + "flos": 35954912304000.0, + "grad_norm": 2.2424472586256146, + "language_loss": 0.68034595, + "learning_rate": 1.3449334785229093e-07, + "loss": 0.70102453, + "num_input_tokens_seen": 317819235, + "step": 14738, + "time_per_iteration": 2.8031020164489746 + }, + { + "auxiliary_loss_clip": 0.01053353, + "auxiliary_loss_mlp": 0.01028502, + "balance_loss_clip": 1.02351201, + "balance_loss_mlp": 1.01790404, + "epoch": 0.8861566210732, + "flos": 21212577002880.0, + "grad_norm": 1.9072662244680403, + "language_loss": 0.75237465, + "learning_rate": 1.343529763547222e-07, + "loss": 0.77319312, + "num_input_tokens_seen": 317836785, + "step": 14739, + "time_per_iteration": 2.6010584831237793 + }, + { + "auxiliary_loss_clip": 0.01049979, + "auxiliary_loss_mlp": 0.01030568, + "balance_loss_clip": 1.02468252, + "balance_loss_mlp": 1.02104306, + "epoch": 0.886216744325868, + "flos": 14609071843200.0, + "grad_norm": 1.8170176811416285, + "language_loss": 0.87182355, + "learning_rate": 1.3421267560267559e-07, + "loss": 0.89262903, + "num_input_tokens_seen": 317854225, + "step": 14740, + "time_per_iteration": 2.5943243503570557 + }, + { + "auxiliary_loss_clip": 0.01014141, + "auxiliary_loss_mlp": 0.01030713, + "balance_loss_clip": 1.02248037, + "balance_loss_mlp": 1.0202229, + "epoch": 0.886276867578536, + "flos": 26651643062400.0, + "grad_norm": 1.6864469903409975, + "language_loss": 0.63191849, + "learning_rate": 1.34072445601471e-07, + "loss": 0.65236712, + "num_input_tokens_seen": 317874865, + "step": 14741, + "time_per_iteration": 2.774467945098877 + }, + { + "auxiliary_loss_clip": 0.01061261, + "auxiliary_loss_mlp": 0.01027298, + "balance_loss_clip": 1.02476656, + "balance_loss_mlp": 1.01737428, + "epoch": 0.886336990831204, + "flos": 16764071281920.0, + "grad_norm": 2.016047023265142, + "language_loss": 0.7264995, + "learning_rate": 1.3393228635642717e-07, + "loss": 0.74738508, + "num_input_tokens_seen": 317892830, + "step": 14742, + "time_per_iteration": 2.5516433715820312 + }, + { + "auxiliary_loss_clip": 0.01047176, + "auxiliary_loss_mlp": 0.00747614, + "balance_loss_clip": 1.02355981, + "balance_loss_mlp": 1.00043797, + "epoch": 0.8863971140838719, + "flos": 25265275781760.0, + "grad_norm": 1.8938090051353222, + "language_loss": 0.59456205, + "learning_rate": 1.3379219787285733e-07, + "loss": 0.61250997, + "num_input_tokens_seen": 317911780, + "step": 14743, + "time_per_iteration": 2.6845035552978516 + }, + { + "auxiliary_loss_clip": 0.01034578, + "auxiliary_loss_mlp": 0.01030714, + "balance_loss_clip": 1.02422023, + "balance_loss_mlp": 1.01872766, + "epoch": 0.8864572373365399, + "flos": 23404313076480.0, + "grad_norm": 1.825808338113292, + "language_loss": 0.6038186, + "learning_rate": 1.3365218015607437e-07, + "loss": 0.62447149, + "num_input_tokens_seen": 317932855, + "step": 14744, + "time_per_iteration": 2.777498245239258 + }, + { + "auxiliary_loss_clip": 0.01044726, + "auxiliary_loss_mlp": 0.00747685, + "balance_loss_clip": 1.02512181, + "balance_loss_mlp": 1.00035477, + "epoch": 0.8865173605892078, + "flos": 18548759456640.0, + "grad_norm": 1.5191979556037574, + "language_loss": 0.76776016, + "learning_rate": 1.3351223321138762e-07, + "loss": 0.78568435, + "num_input_tokens_seen": 317952090, + "step": 14745, + "time_per_iteration": 2.6641509532928467 + }, + { + "auxiliary_loss_clip": 0.01061191, + "auxiliary_loss_mlp": 0.00747589, + "balance_loss_clip": 1.02525926, + "balance_loss_mlp": 1.00040913, + "epoch": 0.8865774838418758, + "flos": 19025868833280.0, + "grad_norm": 1.7001619649384214, + "language_loss": 0.77137136, + "learning_rate": 1.3337235704410454e-07, + "loss": 0.78945923, + "num_input_tokens_seen": 317970370, + "step": 14746, + "time_per_iteration": 2.5816893577575684 + }, + { + "auxiliary_loss_clip": 0.01047941, + "auxiliary_loss_mlp": 0.01032138, + "balance_loss_clip": 1.0272131, + "balance_loss_mlp": 1.02116537, + "epoch": 0.8866376070945439, + "flos": 22163168482560.0, + "grad_norm": 2.072614950175118, + "language_loss": 0.76539624, + "learning_rate": 1.3323255165952873e-07, + "loss": 0.78619707, + "num_input_tokens_seen": 317989125, + "step": 14747, + "time_per_iteration": 2.6625168323516846 + }, + { + "auxiliary_loss_clip": 0.01037826, + "auxiliary_loss_mlp": 0.00747527, + "balance_loss_clip": 1.02267325, + "balance_loss_mlp": 1.00040579, + "epoch": 0.8866977303472118, + "flos": 20704261685760.0, + "grad_norm": 1.923719294964884, + "language_loss": 0.82682371, + "learning_rate": 1.3309281706296127e-07, + "loss": 0.84467721, + "num_input_tokens_seen": 318007820, + "step": 14748, + "time_per_iteration": 2.6937575340270996 + }, + { + "auxiliary_loss_clip": 0.01053293, + "auxiliary_loss_mlp": 0.01031571, + "balance_loss_clip": 1.02593422, + "balance_loss_mlp": 1.02070522, + "epoch": 0.8867578535998798, + "flos": 48794448533760.0, + "grad_norm": 3.0062440245974438, + "language_loss": 0.77442265, + "learning_rate": 1.3295315325970148e-07, + "loss": 0.79527134, + "num_input_tokens_seen": 318030435, + "step": 14749, + "time_per_iteration": 2.871284008026123 + }, + { + "auxiliary_loss_clip": 0.01005147, + "auxiliary_loss_mlp": 0.00747755, + "balance_loss_clip": 1.02474773, + "balance_loss_mlp": 1.00047684, + "epoch": 0.8868179768525477, + "flos": 21105312013440.0, + "grad_norm": 1.9020572328754766, + "language_loss": 0.69699144, + "learning_rate": 1.328135602550451e-07, + "loss": 0.71452045, + "num_input_tokens_seen": 318049465, + "step": 14750, + "time_per_iteration": 2.7941629886627197 + }, + { + "auxiliary_loss_clip": 0.01050418, + "auxiliary_loss_mlp": 0.01025787, + "balance_loss_clip": 1.02433515, + "balance_loss_mlp": 1.01647043, + "epoch": 0.8868781001052157, + "flos": 21830922656640.0, + "grad_norm": 3.11467780698153, + "language_loss": 0.59418285, + "learning_rate": 1.3267403805428546e-07, + "loss": 0.61494493, + "num_input_tokens_seen": 318067760, + "step": 14751, + "time_per_iteration": 4.298797607421875 + }, + { + "auxiliary_loss_clip": 0.01062357, + "auxiliary_loss_mlp": 0.01029133, + "balance_loss_clip": 1.02546811, + "balance_loss_mlp": 1.01843977, + "epoch": 0.8869382233578836, + "flos": 13516418073600.0, + "grad_norm": 2.5009099586322847, + "language_loss": 0.81715643, + "learning_rate": 1.3253458666271344e-07, + "loss": 0.83807135, + "num_input_tokens_seen": 318082785, + "step": 14752, + "time_per_iteration": 2.5301222801208496 + }, + { + "auxiliary_loss_clip": 0.01045107, + "auxiliary_loss_mlp": 0.01030058, + "balance_loss_clip": 1.02602053, + "balance_loss_mlp": 1.01904333, + "epoch": 0.8869983466105517, + "flos": 22704988210560.0, + "grad_norm": 1.831314869255316, + "language_loss": 0.79836565, + "learning_rate": 1.3239520608561793e-07, + "loss": 0.81911731, + "num_input_tokens_seen": 318101925, + "step": 14753, + "time_per_iteration": 2.717719554901123 + }, + { + "auxiliary_loss_clip": 0.01059775, + "auxiliary_loss_mlp": 0.01027947, + "balance_loss_clip": 1.02331257, + "balance_loss_mlp": 1.01795113, + "epoch": 0.8870584698632196, + "flos": 15340751884800.0, + "grad_norm": 2.938399878457521, + "language_loss": 0.65263647, + "learning_rate": 1.3225589632828248e-07, + "loss": 0.67351371, + "num_input_tokens_seen": 318119945, + "step": 14754, + "time_per_iteration": 2.5532467365264893 + }, + { + "auxiliary_loss_clip": 0.0106472, + "auxiliary_loss_mlp": 0.0102925, + "balance_loss_clip": 1.02629364, + "balance_loss_mlp": 1.01874197, + "epoch": 0.8871185931158876, + "flos": 26615624699520.0, + "grad_norm": 7.518447268493488, + "language_loss": 0.74490172, + "learning_rate": 1.3211665739599065e-07, + "loss": 0.76584136, + "num_input_tokens_seen": 318139685, + "step": 14755, + "time_per_iteration": 2.609039068222046 + }, + { + "auxiliary_loss_clip": 0.01033374, + "auxiliary_loss_mlp": 0.01027511, + "balance_loss_clip": 1.02042627, + "balance_loss_mlp": 1.01609683, + "epoch": 0.8871787163685555, + "flos": 21799034357760.0, + "grad_norm": 2.6773099154160307, + "language_loss": 0.77847224, + "learning_rate": 1.3197748929402262e-07, + "loss": 0.79908109, + "num_input_tokens_seen": 318160375, + "step": 14756, + "time_per_iteration": 2.657670497894287 + }, + { + "auxiliary_loss_clip": 0.01043322, + "auxiliary_loss_mlp": 0.01029439, + "balance_loss_clip": 1.02551293, + "balance_loss_mlp": 1.01899648, + "epoch": 0.8872388396212235, + "flos": 14902964922240.0, + "grad_norm": 5.463781512886187, + "language_loss": 0.76544958, + "learning_rate": 1.3183839202765535e-07, + "loss": 0.78617716, + "num_input_tokens_seen": 318177995, + "step": 14757, + "time_per_iteration": 2.6753270626068115 + }, + { + "auxiliary_loss_clip": 0.00999329, + "auxiliary_loss_mlp": 0.01030767, + "balance_loss_clip": 1.02047563, + "balance_loss_mlp": 1.0202527, + "epoch": 0.8872989628738914, + "flos": 26432157006720.0, + "grad_norm": 1.8281952888949933, + "language_loss": 0.67689264, + "learning_rate": 1.316993656021632e-07, + "loss": 0.69719356, + "num_input_tokens_seen": 318197030, + "step": 14758, + "time_per_iteration": 2.8164613246917725 + }, + { + "auxiliary_loss_clip": 0.01062316, + "auxiliary_loss_mlp": 0.01030917, + "balance_loss_clip": 1.02510738, + "balance_loss_mlp": 1.01990247, + "epoch": 0.8873590861265594, + "flos": 48142562555520.0, + "grad_norm": 1.957952526475135, + "language_loss": 0.69047433, + "learning_rate": 1.3156041002281915e-07, + "loss": 0.71140671, + "num_input_tokens_seen": 318221780, + "step": 14759, + "time_per_iteration": 2.886037588119507 + }, + { + "auxiliary_loss_clip": 0.01059346, + "auxiliary_loss_mlp": 0.01029057, + "balance_loss_clip": 1.02309608, + "balance_loss_mlp": 1.01900744, + "epoch": 0.8874192093792275, + "flos": 18332972501760.0, + "grad_norm": 1.6637001087861987, + "language_loss": 0.74420488, + "learning_rate": 1.3142152529489092e-07, + "loss": 0.76508892, + "num_input_tokens_seen": 318239710, + "step": 14760, + "time_per_iteration": 2.5046706199645996 + }, + { + "auxiliary_loss_clip": 0.01045294, + "auxiliary_loss_mlp": 0.01028731, + "balance_loss_clip": 1.02577066, + "balance_loss_mlp": 1.0177995, + "epoch": 0.8874793326318954, + "flos": 17894215872000.0, + "grad_norm": 3.043975280622343, + "language_loss": 0.75824988, + "learning_rate": 1.3128271142364565e-07, + "loss": 0.77899009, + "num_input_tokens_seen": 318257425, + "step": 14761, + "time_per_iteration": 2.805400848388672 + }, + { + "auxiliary_loss_clip": 0.01063289, + "auxiliary_loss_mlp": 0.01032412, + "balance_loss_clip": 1.02510595, + "balance_loss_mlp": 1.02199876, + "epoch": 0.8875394558845634, + "flos": 31102231772160.0, + "grad_norm": 1.7036884234361338, + "language_loss": 0.61517954, + "learning_rate": 1.3114396841434717e-07, + "loss": 0.63613659, + "num_input_tokens_seen": 318278485, + "step": 14762, + "time_per_iteration": 2.6273064613342285 + }, + { + "auxiliary_loss_clip": 0.01045183, + "auxiliary_loss_mlp": 0.01027266, + "balance_loss_clip": 1.02224684, + "balance_loss_mlp": 1.01632214, + "epoch": 0.8875995791372313, + "flos": 21142048648320.0, + "grad_norm": 1.9017456979791756, + "language_loss": 0.64208734, + "learning_rate": 1.3100529627225697e-07, + "loss": 0.66281182, + "num_input_tokens_seen": 318297560, + "step": 14763, + "time_per_iteration": 4.2743895053863525 + }, + { + "auxiliary_loss_clip": 0.01043454, + "auxiliary_loss_mlp": 0.00747614, + "balance_loss_clip": 1.02504575, + "balance_loss_mlp": 1.00041115, + "epoch": 0.8876597023898993, + "flos": 17455136019840.0, + "grad_norm": 2.1395329823121356, + "language_loss": 0.71220946, + "learning_rate": 1.3086669500263335e-07, + "loss": 0.73012018, + "num_input_tokens_seen": 318313060, + "step": 14764, + "time_per_iteration": 2.6916680335998535 + }, + { + "auxiliary_loss_clip": 0.01064166, + "auxiliary_loss_mlp": 0.01033606, + "balance_loss_clip": 1.0250082, + "balance_loss_mlp": 1.02301419, + "epoch": 0.8877198256425672, + "flos": 22707933125760.0, + "grad_norm": 2.0074416983523617, + "language_loss": 0.65851641, + "learning_rate": 1.3072816461073166e-07, + "loss": 0.67949414, + "num_input_tokens_seen": 318332030, + "step": 14765, + "time_per_iteration": 4.26338267326355 + }, + { + "auxiliary_loss_clip": 0.01028611, + "auxiliary_loss_mlp": 0.01020262, + "balance_loss_clip": 1.02397549, + "balance_loss_mlp": 1.01086879, + "epoch": 0.8877799488952353, + "flos": 24535104111360.0, + "grad_norm": 1.5522048364308234, + "language_loss": 0.76583415, + "learning_rate": 1.3058970510180568e-07, + "loss": 0.78632289, + "num_input_tokens_seen": 318351090, + "step": 14766, + "time_per_iteration": 2.7744314670562744 + }, + { + "auxiliary_loss_clip": 0.01030391, + "auxiliary_loss_mlp": 0.01028172, + "balance_loss_clip": 1.02158713, + "balance_loss_mlp": 1.01811671, + "epoch": 0.8878400721479032, + "flos": 20959191486720.0, + "grad_norm": 1.8332300748070895, + "language_loss": 0.73028243, + "learning_rate": 1.3045131648110496e-07, + "loss": 0.75086808, + "num_input_tokens_seen": 318372000, + "step": 14767, + "time_per_iteration": 2.642103433609009 + }, + { + "auxiliary_loss_clip": 0.01058782, + "auxiliary_loss_mlp": 0.01025919, + "balance_loss_clip": 1.02422023, + "balance_loss_mlp": 1.01630497, + "epoch": 0.8879001954005712, + "flos": 25295260659840.0, + "grad_norm": 2.2552369941162307, + "language_loss": 0.707524, + "learning_rate": 1.303129987538778e-07, + "loss": 0.72837102, + "num_input_tokens_seen": 318391530, + "step": 14768, + "time_per_iteration": 2.5848796367645264 + }, + { + "auxiliary_loss_clip": 0.01045819, + "auxiliary_loss_mlp": 0.0102845, + "balance_loss_clip": 1.02384531, + "balance_loss_mlp": 1.01853144, + "epoch": 0.8879603186532391, + "flos": 23185329811200.0, + "grad_norm": 1.6839328586007072, + "language_loss": 0.70127928, + "learning_rate": 1.3017475192536932e-07, + "loss": 0.72202194, + "num_input_tokens_seen": 318410690, + "step": 14769, + "time_per_iteration": 2.727231025695801 + }, + { + "auxiliary_loss_clip": 0.01033894, + "auxiliary_loss_mlp": 0.01030053, + "balance_loss_clip": 1.02320981, + "balance_loss_mlp": 1.02019441, + "epoch": 0.8880204419059071, + "flos": 13655427707520.0, + "grad_norm": 2.3376459565035246, + "language_loss": 0.66959071, + "learning_rate": 1.3003657600082174e-07, + "loss": 0.69023019, + "num_input_tokens_seen": 318427380, + "step": 14770, + "time_per_iteration": 2.6826071739196777 + }, + { + "auxiliary_loss_clip": 0.01047965, + "auxiliary_loss_mlp": 0.01027505, + "balance_loss_clip": 1.02381587, + "balance_loss_mlp": 1.01757455, + "epoch": 0.888080565158575, + "flos": 20631865824000.0, + "grad_norm": 1.87525428618315, + "language_loss": 0.65217286, + "learning_rate": 1.2989847098547424e-07, + "loss": 0.67292762, + "num_input_tokens_seen": 318448530, + "step": 14771, + "time_per_iteration": 2.6276679039001465 + }, + { + "auxiliary_loss_clip": 0.01036625, + "auxiliary_loss_mlp": 0.01024495, + "balance_loss_clip": 1.02235627, + "balance_loss_mlp": 1.01447511, + "epoch": 0.888140688411243, + "flos": 28620014411520.0, + "grad_norm": 1.813752355471052, + "language_loss": 0.82509416, + "learning_rate": 1.2976043688456396e-07, + "loss": 0.84570539, + "num_input_tokens_seen": 318468655, + "step": 14772, + "time_per_iteration": 2.636730432510376 + }, + { + "auxiliary_loss_clip": 0.01032651, + "auxiliary_loss_mlp": 0.01023164, + "balance_loss_clip": 1.02110624, + "balance_loss_mlp": 1.01418149, + "epoch": 0.8882008116639111, + "flos": 25520241496320.0, + "grad_norm": 1.476014715695351, + "language_loss": 0.76319814, + "learning_rate": 1.296224737033258e-07, + "loss": 0.78375632, + "num_input_tokens_seen": 318488740, + "step": 14773, + "time_per_iteration": 2.67130970954895 + }, + { + "auxiliary_loss_clip": 0.01037862, + "auxiliary_loss_mlp": 0.01025801, + "balance_loss_clip": 1.02294064, + "balance_loss_mlp": 1.01643085, + "epoch": 0.888260934916579, + "flos": 27673696650240.0, + "grad_norm": 1.8496548491187002, + "language_loss": 0.74649549, + "learning_rate": 1.294845814469907e-07, + "loss": 0.7671321, + "num_input_tokens_seen": 318508810, + "step": 14774, + "time_per_iteration": 2.7561964988708496 + }, + { + "auxiliary_loss_clip": 0.01023161, + "auxiliary_loss_mlp": 0.0074752, + "balance_loss_clip": 1.02528763, + "balance_loss_mlp": 1.00041485, + "epoch": 0.888321058169247, + "flos": 21611077464960.0, + "grad_norm": 2.334000464685014, + "language_loss": 0.71356213, + "learning_rate": 1.2934676012078783e-07, + "loss": 0.73126894, + "num_input_tokens_seen": 318526860, + "step": 14775, + "time_per_iteration": 2.7206943035125732 + }, + { + "auxiliary_loss_clip": 0.01060933, + "auxiliary_loss_mlp": 0.01027461, + "balance_loss_clip": 1.02474976, + "balance_loss_mlp": 1.0177213, + "epoch": 0.8883811814219149, + "flos": 18149109759360.0, + "grad_norm": 1.6275439271766865, + "language_loss": 0.79812288, + "learning_rate": 1.292090097299432e-07, + "loss": 0.8190068, + "num_input_tokens_seen": 318545180, + "step": 14776, + "time_per_iteration": 2.5513339042663574 + }, + { + "auxiliary_loss_clip": 0.01055355, + "auxiliary_loss_mlp": 0.01030509, + "balance_loss_clip": 1.02441049, + "balance_loss_mlp": 1.01949382, + "epoch": 0.8884413046745829, + "flos": 28324648874880.0, + "grad_norm": 1.7172750527166665, + "language_loss": 0.69301707, + "learning_rate": 1.290713302796802e-07, + "loss": 0.71387571, + "num_input_tokens_seen": 318564350, + "step": 14777, + "time_per_iteration": 2.6828973293304443 + }, + { + "auxiliary_loss_clip": 0.0104345, + "auxiliary_loss_mlp": 0.01031275, + "balance_loss_clip": 1.02058315, + "balance_loss_mlp": 1.02102864, + "epoch": 0.8885014279272508, + "flos": 15158756649600.0, + "grad_norm": 1.8326287853598475, + "language_loss": 0.70399773, + "learning_rate": 1.2893372177522e-07, + "loss": 0.72474498, + "num_input_tokens_seen": 318582275, + "step": 14778, + "time_per_iteration": 2.588994264602661 + }, + { + "auxiliary_loss_clip": 0.01060894, + "auxiliary_loss_mlp": 0.01028518, + "balance_loss_clip": 1.0245651, + "balance_loss_mlp": 1.01849258, + "epoch": 0.8885615511799189, + "flos": 19099593498240.0, + "grad_norm": 1.6123770624473963, + "language_loss": 0.77393621, + "learning_rate": 1.287961842217804e-07, + "loss": 0.79483032, + "num_input_tokens_seen": 318601230, + "step": 14779, + "time_per_iteration": 2.586651086807251 + }, + { + "auxiliary_loss_clip": 0.00991171, + "auxiliary_loss_mlp": 0.01000605, + "balance_loss_clip": 1.00513363, + "balance_loss_mlp": 0.99960941, + "epoch": 0.8886216744325868, + "flos": 51186567605760.0, + "grad_norm": 0.8766025697397989, + "language_loss": 0.56771731, + "learning_rate": 1.2865871762457747e-07, + "loss": 0.58763504, + "num_input_tokens_seen": 318645595, + "step": 14780, + "time_per_iteration": 3.0189380645751953 + }, + { + "auxiliary_loss_clip": 0.01006479, + "auxiliary_loss_mlp": 0.01000798, + "balance_loss_clip": 1.00177431, + "balance_loss_mlp": 0.99993342, + "epoch": 0.8886817976852548, + "flos": 61612981263360.0, + "grad_norm": 0.8057882360739413, + "language_loss": 0.62432778, + "learning_rate": 1.2852132198882326e-07, + "loss": 0.64440048, + "num_input_tokens_seen": 318707850, + "step": 14781, + "time_per_iteration": 3.283642292022705 + }, + { + "auxiliary_loss_clip": 0.00941191, + "auxiliary_loss_mlp": 0.01002817, + "balance_loss_clip": 1.00594401, + "balance_loss_mlp": 1.00182176, + "epoch": 0.8887419209379227, + "flos": 60646946935680.0, + "grad_norm": 0.7828194608529316, + "language_loss": 0.58146513, + "learning_rate": 1.2838399731972805e-07, + "loss": 0.6009053, + "num_input_tokens_seen": 318764915, + "step": 14782, + "time_per_iteration": 5.227778434753418 + }, + { + "auxiliary_loss_clip": 0.01061509, + "auxiliary_loss_mlp": 0.01028348, + "balance_loss_clip": 1.02654481, + "balance_loss_mlp": 1.01877522, + "epoch": 0.8888020441905907, + "flos": 29205861235200.0, + "grad_norm": 1.4934807487114767, + "language_loss": 0.65536171, + "learning_rate": 1.2824674362249922e-07, + "loss": 0.67626029, + "num_input_tokens_seen": 318785660, + "step": 14783, + "time_per_iteration": 2.9983620643615723 + }, + { + "auxiliary_loss_clip": 0.01064346, + "auxiliary_loss_mlp": 0.01031037, + "balance_loss_clip": 1.02578163, + "balance_loss_mlp": 1.02017105, + "epoch": 0.8888621674432586, + "flos": 22162701605760.0, + "grad_norm": 1.4013765078296974, + "language_loss": 0.77831948, + "learning_rate": 1.281095609023415e-07, + "loss": 0.79927325, + "num_input_tokens_seen": 318806080, + "step": 14784, + "time_per_iteration": 2.6105568408966064 + }, + { + "auxiliary_loss_clip": 0.01033506, + "auxiliary_loss_mlp": 0.01031812, + "balance_loss_clip": 1.02275503, + "balance_loss_mlp": 1.02085674, + "epoch": 0.8889222906959267, + "flos": 27672834723840.0, + "grad_norm": 2.138005051456918, + "language_loss": 0.60593522, + "learning_rate": 1.279724491644565e-07, + "loss": 0.62658834, + "num_input_tokens_seen": 318826445, + "step": 14785, + "time_per_iteration": 2.925516366958618 + }, + { + "auxiliary_loss_clip": 0.01026349, + "auxiliary_loss_mlp": 0.0103058, + "balance_loss_clip": 1.02251697, + "balance_loss_mlp": 1.02005339, + "epoch": 0.8889824139485947, + "flos": 14168627274240.0, + "grad_norm": 1.7748149592976634, + "language_loss": 0.64690822, + "learning_rate": 1.278354084140445e-07, + "loss": 0.66747749, + "num_input_tokens_seen": 318843915, + "step": 14786, + "time_per_iteration": 2.7342047691345215 + }, + { + "auxiliary_loss_clip": 0.01034018, + "auxiliary_loss_mlp": 0.00747752, + "balance_loss_clip": 1.02437961, + "balance_loss_mlp": 1.000386, + "epoch": 0.8890425372012626, + "flos": 12853003829760.0, + "grad_norm": 2.6862755477745734, + "language_loss": 0.85505378, + "learning_rate": 1.276984386563009e-07, + "loss": 0.87287146, + "num_input_tokens_seen": 318859670, + "step": 14787, + "time_per_iteration": 2.678642988204956 + }, + { + "auxiliary_loss_clip": 0.01040715, + "auxiliary_loss_mlp": 0.01028676, + "balance_loss_clip": 1.02465796, + "balance_loss_mlp": 1.0181675, + "epoch": 0.8891026604539306, + "flos": 21689291329920.0, + "grad_norm": 2.41989865522504, + "language_loss": 0.70880347, + "learning_rate": 1.2756153989642027e-07, + "loss": 0.72949737, + "num_input_tokens_seen": 318877855, + "step": 14788, + "time_per_iteration": 2.6963672637939453 + }, + { + "auxiliary_loss_clip": 0.01059735, + "auxiliary_loss_mlp": 0.01028161, + "balance_loss_clip": 1.02508748, + "balance_loss_mlp": 1.0182308, + "epoch": 0.8891627837065985, + "flos": 21871430219520.0, + "grad_norm": 1.5778565672666103, + "language_loss": 0.70077735, + "learning_rate": 1.274247121395935e-07, + "loss": 0.72165632, + "num_input_tokens_seen": 318896045, + "step": 14789, + "time_per_iteration": 2.6102914810180664 + }, + { + "auxiliary_loss_clip": 0.01052807, + "auxiliary_loss_mlp": 0.01022657, + "balance_loss_clip": 1.02572942, + "balance_loss_mlp": 1.01260185, + "epoch": 0.8892229069592665, + "flos": 21580230660480.0, + "grad_norm": 1.5306654425220825, + "language_loss": 0.70363152, + "learning_rate": 1.2728795539100956e-07, + "loss": 0.72438616, + "num_input_tokens_seen": 318915515, + "step": 14790, + "time_per_iteration": 2.6813299655914307 + }, + { + "auxiliary_loss_clip": 0.01044791, + "auxiliary_loss_mlp": 0.01024017, + "balance_loss_clip": 1.02708626, + "balance_loss_mlp": 1.01435542, + "epoch": 0.8892830302119344, + "flos": 23075981832960.0, + "grad_norm": 1.6743437057451407, + "language_loss": 0.73025715, + "learning_rate": 1.2715126965585387e-07, + "loss": 0.75094527, + "num_input_tokens_seen": 318934305, + "step": 14791, + "time_per_iteration": 2.694579839706421 + }, + { + "auxiliary_loss_clip": 0.01026465, + "auxiliary_loss_mlp": 0.01033057, + "balance_loss_clip": 1.02408242, + "balance_loss_mlp": 1.02287626, + "epoch": 0.8893431534646025, + "flos": 23072139077760.0, + "grad_norm": 1.849045870271446, + "language_loss": 0.74019772, + "learning_rate": 1.2701465493931008e-07, + "loss": 0.76079297, + "num_input_tokens_seen": 318953880, + "step": 14792, + "time_per_iteration": 2.689805746078491 + }, + { + "auxiliary_loss_clip": 0.00992885, + "auxiliary_loss_mlp": 0.01033686, + "balance_loss_clip": 1.02101243, + "balance_loss_mlp": 1.0217886, + "epoch": 0.8894032767172704, + "flos": 22454978572800.0, + "grad_norm": 2.0810619546723688, + "language_loss": 0.66697669, + "learning_rate": 1.2687811124655801e-07, + "loss": 0.68724239, + "num_input_tokens_seen": 318971395, + "step": 14793, + "time_per_iteration": 2.8682899475097656 + }, + { + "auxiliary_loss_clip": 0.0103421, + "auxiliary_loss_mlp": 0.01027927, + "balance_loss_clip": 1.02548683, + "balance_loss_mlp": 1.01703703, + "epoch": 0.8894633999699384, + "flos": 25338246261120.0, + "grad_norm": 1.6751858198755527, + "language_loss": 0.71197051, + "learning_rate": 1.2674163858277552e-07, + "loss": 0.73259187, + "num_input_tokens_seen": 318990580, + "step": 14794, + "time_per_iteration": 2.7426960468292236 + }, + { + "auxiliary_loss_clip": 0.01056085, + "auxiliary_loss_mlp": 0.01030246, + "balance_loss_clip": 1.0263176, + "balance_loss_mlp": 1.01935601, + "epoch": 0.8895235232226063, + "flos": 20994096528000.0, + "grad_norm": 1.5839239861887644, + "language_loss": 0.74925709, + "learning_rate": 1.2660523695313785e-07, + "loss": 0.77012038, + "num_input_tokens_seen": 319010040, + "step": 14795, + "time_per_iteration": 2.6170592308044434 + }, + { + "auxiliary_loss_clip": 0.00978672, + "auxiliary_loss_mlp": 0.01003723, + "balance_loss_clip": 1.00297487, + "balance_loss_mlp": 1.00287664, + "epoch": 0.8895836464752743, + "flos": 69732956764800.0, + "grad_norm": 0.7693893095026533, + "language_loss": 0.56117904, + "learning_rate": 1.2646890636281727e-07, + "loss": 0.58100301, + "num_input_tokens_seen": 319063860, + "step": 14796, + "time_per_iteration": 3.1909778118133545 + }, + { + "auxiliary_loss_clip": 0.01064506, + "auxiliary_loss_mlp": 0.01027837, + "balance_loss_clip": 1.0259614, + "balance_loss_mlp": 1.01619625, + "epoch": 0.8896437697279422, + "flos": 23221815050880.0, + "grad_norm": 1.678018913479077, + "language_loss": 0.70308781, + "learning_rate": 1.263326468169843e-07, + "loss": 0.72401118, + "num_input_tokens_seen": 319082335, + "step": 14797, + "time_per_iteration": 2.6154699325561523 + }, + { + "auxiliary_loss_clip": 0.00993269, + "auxiliary_loss_mlp": 0.01003567, + "balance_loss_clip": 1.00679302, + "balance_loss_mlp": 1.00266671, + "epoch": 0.8897038929806103, + "flos": 70752711882240.0, + "grad_norm": 0.7508800987383399, + "language_loss": 0.58073908, + "learning_rate": 1.2619645832080417e-07, + "loss": 0.60070747, + "num_input_tokens_seen": 319147075, + "step": 14798, + "time_per_iteration": 5.451514005661011 + }, + { + "auxiliary_loss_clip": 0.01049158, + "auxiliary_loss_mlp": 0.01026969, + "balance_loss_clip": 1.02300525, + "balance_loss_mlp": 1.01602542, + "epoch": 0.8897640162332782, + "flos": 19245103493760.0, + "grad_norm": 1.492762684158092, + "language_loss": 0.79068428, + "learning_rate": 1.2606034087944251e-07, + "loss": 0.81144553, + "num_input_tokens_seen": 319166630, + "step": 14799, + "time_per_iteration": 2.671201467514038 + }, + { + "auxiliary_loss_clip": 0.0099787, + "auxiliary_loss_mlp": 0.01001061, + "balance_loss_clip": 1.00249887, + "balance_loss_mlp": 1.0002861, + "epoch": 0.8898241394859462, + "flos": 41356275039360.0, + "grad_norm": 0.888479992434313, + "language_loss": 0.58097994, + "learning_rate": 1.2592429449806053e-07, + "loss": 0.60096931, + "num_input_tokens_seen": 319221865, + "step": 14800, + "time_per_iteration": 3.0966193675994873 + }, + { + "auxiliary_loss_clip": 0.01051917, + "auxiliary_loss_mlp": 0.01030158, + "balance_loss_clip": 1.02577746, + "balance_loss_mlp": 1.02027512, + "epoch": 0.8898842627386142, + "flos": 18986295024000.0, + "grad_norm": 1.6255992108068373, + "language_loss": 0.66393805, + "learning_rate": 1.2578831918181698e-07, + "loss": 0.68475878, + "num_input_tokens_seen": 319240710, + "step": 14801, + "time_per_iteration": 2.6302359104156494 + }, + { + "auxiliary_loss_clip": 0.01021543, + "auxiliary_loss_mlp": 0.01033973, + "balance_loss_clip": 1.02331758, + "balance_loss_mlp": 1.02145004, + "epoch": 0.8899443859912821, + "flos": 13217173868160.0, + "grad_norm": 2.843345609987986, + "language_loss": 0.75274938, + "learning_rate": 1.256524149358682e-07, + "loss": 0.77330452, + "num_input_tokens_seen": 319256495, + "step": 14802, + "time_per_iteration": 2.627919912338257 + }, + { + "auxiliary_loss_clip": 0.0105239, + "auxiliary_loss_mlp": 0.0102642, + "balance_loss_clip": 1.02670741, + "balance_loss_mlp": 1.01693702, + "epoch": 0.8900045092439501, + "flos": 22674680110080.0, + "grad_norm": 1.8163683823190113, + "language_loss": 0.73173285, + "learning_rate": 1.2551658176536805e-07, + "loss": 0.75252092, + "num_input_tokens_seen": 319273620, + "step": 14803, + "time_per_iteration": 2.6431028842926025 + }, + { + "auxiliary_loss_clip": 0.01030281, + "auxiliary_loss_mlp": 0.01029668, + "balance_loss_clip": 1.02266169, + "balance_loss_mlp": 1.01907659, + "epoch": 0.890064632496618, + "flos": 21141617685120.0, + "grad_norm": 1.7778268160671793, + "language_loss": 0.71846718, + "learning_rate": 1.2538081967546664e-07, + "loss": 0.73906672, + "num_input_tokens_seen": 319291720, + "step": 14804, + "time_per_iteration": 2.702775001525879 + }, + { + "auxiliary_loss_clip": 0.01051469, + "auxiliary_loss_mlp": 0.01029641, + "balance_loss_clip": 1.02424562, + "balance_loss_mlp": 1.01930547, + "epoch": 0.8901247557492861, + "flos": 23397058529280.0, + "grad_norm": 1.614547395614066, + "language_loss": 0.81458753, + "learning_rate": 1.252451286713123e-07, + "loss": 0.83539867, + "num_input_tokens_seen": 319310380, + "step": 14805, + "time_per_iteration": 2.6211442947387695 + }, + { + "auxiliary_loss_clip": 0.0105422, + "auxiliary_loss_mlp": 0.0102781, + "balance_loss_clip": 1.0254395, + "balance_loss_mlp": 1.01687264, + "epoch": 0.890184879001954, + "flos": 29169591477120.0, + "grad_norm": 1.8090095668721147, + "language_loss": 0.66821146, + "learning_rate": 1.251095087580505e-07, + "loss": 0.68903172, + "num_input_tokens_seen": 319331765, + "step": 14806, + "time_per_iteration": 2.619899272918701 + }, + { + "auxiliary_loss_clip": 0.01036359, + "auxiliary_loss_mlp": 0.0102826, + "balance_loss_clip": 1.02258265, + "balance_loss_mlp": 1.01741815, + "epoch": 0.890245002254622, + "flos": 14427830793600.0, + "grad_norm": 1.9104575326813114, + "language_loss": 0.6734848, + "learning_rate": 1.2497395994082438e-07, + "loss": 0.69413102, + "num_input_tokens_seen": 319349135, + "step": 14807, + "time_per_iteration": 2.7260804176330566 + }, + { + "auxiliary_loss_clip": 0.01040185, + "auxiliary_loss_mlp": 0.01023951, + "balance_loss_clip": 1.02442837, + "balance_loss_mlp": 1.01426578, + "epoch": 0.8903051255072899, + "flos": 22382187661440.0, + "grad_norm": 1.785852530168052, + "language_loss": 0.75473589, + "learning_rate": 1.248384822247732e-07, + "loss": 0.77537727, + "num_input_tokens_seen": 319368410, + "step": 14808, + "time_per_iteration": 2.577144145965576 + }, + { + "auxiliary_loss_clip": 0.01032215, + "auxiliary_loss_mlp": 0.01029164, + "balance_loss_clip": 1.02405977, + "balance_loss_mlp": 1.01969886, + "epoch": 0.8903652487599579, + "flos": 20777375819520.0, + "grad_norm": 1.7228510205829273, + "language_loss": 0.81152773, + "learning_rate": 1.2470307561503513e-07, + "loss": 0.83214158, + "num_input_tokens_seen": 319387535, + "step": 14809, + "time_per_iteration": 2.593259334564209 + }, + { + "auxiliary_loss_clip": 0.01051186, + "auxiliary_loss_mlp": 0.01025248, + "balance_loss_clip": 1.02354789, + "balance_loss_mlp": 1.01568174, + "epoch": 0.8904253720126258, + "flos": 24424499157120.0, + "grad_norm": 1.79216086004993, + "language_loss": 0.68371785, + "learning_rate": 1.2456774011674442e-07, + "loss": 0.7044822, + "num_input_tokens_seen": 319407210, + "step": 14810, + "time_per_iteration": 4.294346332550049 + }, + { + "auxiliary_loss_clip": 0.01028479, + "auxiliary_loss_mlp": 0.01026274, + "balance_loss_clip": 1.02256918, + "balance_loss_mlp": 1.01505089, + "epoch": 0.8904854952652939, + "flos": 19463871277440.0, + "grad_norm": 4.81098604465085, + "language_loss": 0.70168507, + "learning_rate": 1.2443247573503257e-07, + "loss": 0.7222327, + "num_input_tokens_seen": 319425340, + "step": 14811, + "time_per_iteration": 2.964583158493042 + }, + { + "auxiliary_loss_clip": 0.01026854, + "auxiliary_loss_mlp": 0.00747642, + "balance_loss_clip": 1.02280819, + "balance_loss_mlp": 1.00040007, + "epoch": 0.8905456185179618, + "flos": 50800741666560.0, + "grad_norm": 1.9730638040139017, + "language_loss": 0.6591711, + "learning_rate": 1.2429728247502924e-07, + "loss": 0.67691606, + "num_input_tokens_seen": 319448150, + "step": 14812, + "time_per_iteration": 4.673667669296265 + }, + { + "auxiliary_loss_clip": 0.01024106, + "auxiliary_loss_mlp": 0.0102865, + "balance_loss_clip": 1.02636528, + "balance_loss_mlp": 1.01876795, + "epoch": 0.8906057417706298, + "flos": 17784867893760.0, + "grad_norm": 1.6983205870232503, + "language_loss": 0.68290269, + "learning_rate": 1.24162160341861e-07, + "loss": 0.70343024, + "num_input_tokens_seen": 319466115, + "step": 14813, + "time_per_iteration": 2.8835771083831787 + }, + { + "auxiliary_loss_clip": 0.01032313, + "auxiliary_loss_mlp": 0.01036542, + "balance_loss_clip": 1.02075028, + "balance_loss_mlp": 1.02256441, + "epoch": 0.8906658650232978, + "flos": 21944867575680.0, + "grad_norm": 2.189681214396781, + "language_loss": 0.75409973, + "learning_rate": 1.2402710934065198e-07, + "loss": 0.77478832, + "num_input_tokens_seen": 319485255, + "step": 14814, + "time_per_iteration": 2.8535187244415283 + }, + { + "auxiliary_loss_clip": 0.01054886, + "auxiliary_loss_mlp": 0.01028547, + "balance_loss_clip": 1.02558732, + "balance_loss_mlp": 1.01738286, + "epoch": 0.8907259882759657, + "flos": 21287810039040.0, + "grad_norm": 2.0271393359006624, + "language_loss": 0.74136931, + "learning_rate": 1.2389212947652229e-07, + "loss": 0.76220363, + "num_input_tokens_seen": 319501800, + "step": 14815, + "time_per_iteration": 2.722810983657837 + }, + { + "auxiliary_loss_clip": 0.01019522, + "auxiliary_loss_mlp": 0.01027832, + "balance_loss_clip": 1.02063048, + "balance_loss_mlp": 1.01758599, + "epoch": 0.8907861115286337, + "flos": 20120426023680.0, + "grad_norm": 1.8115537407235598, + "language_loss": 0.75173777, + "learning_rate": 1.237572207545914e-07, + "loss": 0.77221131, + "num_input_tokens_seen": 319520415, + "step": 14816, + "time_per_iteration": 2.720956563949585 + }, + { + "auxiliary_loss_clip": 0.01042294, + "auxiliary_loss_mlp": 0.01025235, + "balance_loss_clip": 1.0242461, + "balance_loss_mlp": 1.01483369, + "epoch": 0.8908462347813016, + "flos": 20084156265600.0, + "grad_norm": 1.8021205579689674, + "language_loss": 0.77534008, + "learning_rate": 1.2362238317997476e-07, + "loss": 0.79601538, + "num_input_tokens_seen": 319538410, + "step": 14817, + "time_per_iteration": 2.7129740715026855 + }, + { + "auxiliary_loss_clip": 0.00978444, + "auxiliary_loss_mlp": 0.01003223, + "balance_loss_clip": 1.00289822, + "balance_loss_mlp": 1.00229919, + "epoch": 0.8909063580339697, + "flos": 65503649790720.0, + "grad_norm": 0.7968963355721855, + "language_loss": 0.56575572, + "learning_rate": 1.2348761675778517e-07, + "loss": 0.58557236, + "num_input_tokens_seen": 319602565, + "step": 14818, + "time_per_iteration": 3.3023531436920166 + }, + { + "auxiliary_loss_clip": 0.01023045, + "auxiliary_loss_mlp": 0.01032365, + "balance_loss_clip": 1.02565122, + "balance_loss_mlp": 1.02235782, + "epoch": 0.8909664812866376, + "flos": 29863062426240.0, + "grad_norm": 1.8074651602882654, + "language_loss": 0.64581299, + "learning_rate": 1.2335292149313325e-07, + "loss": 0.66636711, + "num_input_tokens_seen": 319624645, + "step": 14819, + "time_per_iteration": 2.80096173286438 + }, + { + "auxiliary_loss_clip": 0.01052023, + "auxiliary_loss_mlp": 0.01027909, + "balance_loss_clip": 1.0242455, + "balance_loss_mlp": 1.01695395, + "epoch": 0.8910266045393056, + "flos": 25447127362560.0, + "grad_norm": 1.9680296172474436, + "language_loss": 0.78149772, + "learning_rate": 1.2321829739112731e-07, + "loss": 0.80229706, + "num_input_tokens_seen": 319644040, + "step": 14820, + "time_per_iteration": 2.6512162685394287 + }, + { + "auxiliary_loss_clip": 0.01021702, + "auxiliary_loss_mlp": 0.00747455, + "balance_loss_clip": 1.02298641, + "balance_loss_mlp": 1.00041413, + "epoch": 0.8910867277919735, + "flos": 24499121662080.0, + "grad_norm": 2.010312155420503, + "language_loss": 0.76537979, + "learning_rate": 1.2308374445687087e-07, + "loss": 0.78307128, + "num_input_tokens_seen": 319663930, + "step": 14821, + "time_per_iteration": 2.8149263858795166 + }, + { + "auxiliary_loss_clip": 0.00999588, + "auxiliary_loss_mlp": 0.00746476, + "balance_loss_clip": 1.00374913, + "balance_loss_mlp": 1.00043488, + "epoch": 0.8911468510446415, + "flos": 60688136856960.0, + "grad_norm": 0.7954925045768843, + "language_loss": 0.5929476, + "learning_rate": 1.2294926269546712e-07, + "loss": 0.61040831, + "num_input_tokens_seen": 319721245, + "step": 14822, + "time_per_iteration": 3.1691999435424805 + }, + { + "auxiliary_loss_clip": 0.0105325, + "auxiliary_loss_mlp": 0.01029407, + "balance_loss_clip": 1.02564859, + "balance_loss_mlp": 1.01864815, + "epoch": 0.8912069742973094, + "flos": 25337492075520.0, + "grad_norm": 2.666451758090687, + "language_loss": 0.68930697, + "learning_rate": 1.2281485211201515e-07, + "loss": 0.71013355, + "num_input_tokens_seen": 319741200, + "step": 14823, + "time_per_iteration": 2.7174901962280273 + }, + { + "auxiliary_loss_clip": 0.01042523, + "auxiliary_loss_mlp": 0.01030034, + "balance_loss_clip": 1.0226903, + "balance_loss_mlp": 1.01913881, + "epoch": 0.8912670975499775, + "flos": 18223516782720.0, + "grad_norm": 1.502799777415011, + "language_loss": 0.69103324, + "learning_rate": 1.2268051271161262e-07, + "loss": 0.71175873, + "num_input_tokens_seen": 319759265, + "step": 14824, + "time_per_iteration": 2.6068453788757324 + }, + { + "auxiliary_loss_clip": 0.0101726, + "auxiliary_loss_mlp": 0.01030479, + "balance_loss_clip": 1.02313173, + "balance_loss_mlp": 1.01821208, + "epoch": 0.8913272208026454, + "flos": 26504481041280.0, + "grad_norm": 1.7690518039743244, + "language_loss": 0.70567191, + "learning_rate": 1.2254624449935303e-07, + "loss": 0.72614926, + "num_input_tokens_seen": 319777560, + "step": 14825, + "time_per_iteration": 2.9096617698669434 + }, + { + "auxiliary_loss_clip": 0.01038337, + "auxiliary_loss_mlp": 0.01029762, + "balance_loss_clip": 1.02265573, + "balance_loss_mlp": 1.01921177, + "epoch": 0.8913873440553134, + "flos": 18802324540800.0, + "grad_norm": 1.9380884191976617, + "language_loss": 0.70852774, + "learning_rate": 1.2241204748032786e-07, + "loss": 0.72920877, + "num_input_tokens_seen": 319794125, + "step": 14826, + "time_per_iteration": 2.7011971473693848 + }, + { + "auxiliary_loss_clip": 0.01051475, + "auxiliary_loss_mlp": 0.01024615, + "balance_loss_clip": 1.02510166, + "balance_loss_mlp": 1.01461911, + "epoch": 0.8914474673079814, + "flos": 20884892204160.0, + "grad_norm": 1.9432218890166848, + "language_loss": 0.74824941, + "learning_rate": 1.2227792165962615e-07, + "loss": 0.76901031, + "num_input_tokens_seen": 319810310, + "step": 14827, + "time_per_iteration": 2.6669857501983643 + }, + { + "auxiliary_loss_clip": 0.01053924, + "auxiliary_loss_mlp": 0.0102547, + "balance_loss_clip": 1.02534902, + "balance_loss_mlp": 1.0150032, + "epoch": 0.8915075905606493, + "flos": 20952439729920.0, + "grad_norm": 1.7784067315925627, + "language_loss": 0.78116649, + "learning_rate": 1.221438670423336e-07, + "loss": 0.80196041, + "num_input_tokens_seen": 319828505, + "step": 14828, + "time_per_iteration": 2.761544704437256 + }, + { + "auxiliary_loss_clip": 0.01020693, + "auxiliary_loss_mlp": 0.01030064, + "balance_loss_clip": 1.02285528, + "balance_loss_mlp": 1.01938927, + "epoch": 0.8915677138133173, + "flos": 23076305055360.0, + "grad_norm": 1.6307047693839873, + "language_loss": 0.75093931, + "learning_rate": 1.2200988363353392e-07, + "loss": 0.77144688, + "num_input_tokens_seen": 319848680, + "step": 14829, + "time_per_iteration": 4.486780405044556 + }, + { + "auxiliary_loss_clip": 0.010611, + "auxiliary_loss_mlp": 0.01030389, + "balance_loss_clip": 1.02417564, + "balance_loss_mlp": 1.02089417, + "epoch": 0.8916278370659853, + "flos": 23440259612160.0, + "grad_norm": 2.273220526157808, + "language_loss": 0.84351557, + "learning_rate": 1.2187597143830773e-07, + "loss": 0.86443049, + "num_input_tokens_seen": 319868835, + "step": 14830, + "time_per_iteration": 2.627046585083008 + }, + { + "auxiliary_loss_clip": 0.01049589, + "auxiliary_loss_mlp": 0.01024165, + "balance_loss_clip": 1.02448416, + "balance_loss_mlp": 1.01465225, + "epoch": 0.8916879603186533, + "flos": 25160488830720.0, + "grad_norm": 1.4784664140875396, + "language_loss": 0.75023293, + "learning_rate": 1.2174213046173299e-07, + "loss": 0.77097046, + "num_input_tokens_seen": 319891585, + "step": 14831, + "time_per_iteration": 2.7126152515411377 + }, + { + "auxiliary_loss_clip": 0.01053035, + "auxiliary_loss_mlp": 0.01024679, + "balance_loss_clip": 1.02508974, + "balance_loss_mlp": 1.01421833, + "epoch": 0.8917480835713212, + "flos": 20229845829120.0, + "grad_norm": 1.66791523796936, + "language_loss": 0.73119646, + "learning_rate": 1.216083607088847e-07, + "loss": 0.75197363, + "num_input_tokens_seen": 319910315, + "step": 14832, + "time_per_iteration": 2.785872220993042 + }, + { + "auxiliary_loss_clip": 0.00996447, + "auxiliary_loss_mlp": 0.00747742, + "balance_loss_clip": 1.02080464, + "balance_loss_mlp": 1.00038671, + "epoch": 0.8918082068239892, + "flos": 26101922342400.0, + "grad_norm": 1.679121805023297, + "language_loss": 0.67097265, + "learning_rate": 1.214746621848355e-07, + "loss": 0.68841457, + "num_input_tokens_seen": 319932275, + "step": 14833, + "time_per_iteration": 2.930368661880493 + }, + { + "auxiliary_loss_clip": 0.01056301, + "auxiliary_loss_mlp": 0.0102958, + "balance_loss_clip": 1.02630293, + "balance_loss_mlp": 1.01849961, + "epoch": 0.8918683300766571, + "flos": 24831439315200.0, + "grad_norm": 1.945088179208591, + "language_loss": 0.73644233, + "learning_rate": 1.2134103489465575e-07, + "loss": 0.75730115, + "num_input_tokens_seen": 319955335, + "step": 14834, + "time_per_iteration": 2.7261202335357666 + }, + { + "auxiliary_loss_clip": 0.01014643, + "auxiliary_loss_mlp": 0.01029265, + "balance_loss_clip": 1.02224874, + "balance_loss_mlp": 1.01863766, + "epoch": 0.8919284533293251, + "flos": 22305158945280.0, + "grad_norm": 2.137355201641259, + "language_loss": 0.79281545, + "learning_rate": 1.2120747884341188e-07, + "loss": 0.81325454, + "num_input_tokens_seen": 319973990, + "step": 14835, + "time_per_iteration": 2.7329561710357666 + }, + { + "auxiliary_loss_clip": 0.01058061, + "auxiliary_loss_mlp": 0.01025421, + "balance_loss_clip": 1.02297246, + "balance_loss_mlp": 1.0159142, + "epoch": 0.891988576581993, + "flos": 30373532559360.0, + "grad_norm": 1.7421763698342105, + "language_loss": 0.74211031, + "learning_rate": 1.210739940361689e-07, + "loss": 0.76294518, + "num_input_tokens_seen": 319995555, + "step": 14836, + "time_per_iteration": 2.6916205883026123 + }, + { + "auxiliary_loss_clip": 0.01041245, + "auxiliary_loss_mlp": 0.01028536, + "balance_loss_clip": 1.02376032, + "balance_loss_mlp": 1.01820624, + "epoch": 0.8920486998346611, + "flos": 15552947479680.0, + "grad_norm": 2.098685030404925, + "language_loss": 0.683716, + "learning_rate": 1.2094058047798838e-07, + "loss": 0.70441383, + "num_input_tokens_seen": 320012385, + "step": 14837, + "time_per_iteration": 2.6556262969970703 + }, + { + "auxiliary_loss_clip": 0.01002494, + "auxiliary_loss_mlp": 0.01031391, + "balance_loss_clip": 1.0221138, + "balance_loss_mlp": 1.02000642, + "epoch": 0.892108823087329, + "flos": 21214983214080.0, + "grad_norm": 1.7584410948250908, + "language_loss": 0.67876118, + "learning_rate": 1.2080723817392913e-07, + "loss": 0.69910002, + "num_input_tokens_seen": 320032390, + "step": 14838, + "time_per_iteration": 2.7830536365509033 + }, + { + "auxiliary_loss_clip": 0.01049525, + "auxiliary_loss_mlp": 0.0102644, + "balance_loss_clip": 1.02326858, + "balance_loss_mlp": 1.01547265, + "epoch": 0.892168946339997, + "flos": 21978982517760.0, + "grad_norm": 1.823228001809728, + "language_loss": 0.76149857, + "learning_rate": 1.2067396712904777e-07, + "loss": 0.78225815, + "num_input_tokens_seen": 320052885, + "step": 14839, + "time_per_iteration": 2.6369094848632812 + }, + { + "auxiliary_loss_clip": 0.00978211, + "auxiliary_loss_mlp": 0.00746541, + "balance_loss_clip": 1.00244212, + "balance_loss_mlp": 1.00048816, + "epoch": 0.892229069592665, + "flos": 67475289277440.0, + "grad_norm": 0.6815922716707238, + "language_loss": 0.49366653, + "learning_rate": 1.205407673483978e-07, + "loss": 0.51091409, + "num_input_tokens_seen": 320113685, + "step": 14840, + "time_per_iteration": 3.370373487472534 + }, + { + "auxiliary_loss_clip": 0.0106562, + "auxiliary_loss_mlp": 0.01030327, + "balance_loss_clip": 1.02446151, + "balance_loss_mlp": 1.01825702, + "epoch": 0.8922891928453329, + "flos": 19459561645440.0, + "grad_norm": 2.2342172560896683, + "language_loss": 0.64147919, + "learning_rate": 1.2040763883703074e-07, + "loss": 0.66243863, + "num_input_tokens_seen": 320130810, + "step": 14841, + "time_per_iteration": 2.625929594039917 + }, + { + "auxiliary_loss_clip": 0.01028256, + "auxiliary_loss_mlp": 0.00747571, + "balance_loss_clip": 1.02359557, + "balance_loss_mlp": 1.0004077, + "epoch": 0.8923493160980009, + "flos": 23367396873600.0, + "grad_norm": 1.5368493889674928, + "language_loss": 0.68222821, + "learning_rate": 1.2027458159999438e-07, + "loss": 0.69998646, + "num_input_tokens_seen": 320152170, + "step": 14842, + "time_per_iteration": 2.8341822624206543 + }, + { + "auxiliary_loss_clip": 0.01061443, + "auxiliary_loss_mlp": 0.01030813, + "balance_loss_clip": 1.02556872, + "balance_loss_mlp": 1.02155614, + "epoch": 0.8924094393506689, + "flos": 26177047637760.0, + "grad_norm": 2.0382202938589344, + "language_loss": 0.79956281, + "learning_rate": 1.2014159564233373e-07, + "loss": 0.82048535, + "num_input_tokens_seen": 320172360, + "step": 14843, + "time_per_iteration": 2.7067832946777344 + }, + { + "auxiliary_loss_clip": 0.01036781, + "auxiliary_loss_mlp": 0.01030847, + "balance_loss_clip": 1.02301824, + "balance_loss_mlp": 1.01934302, + "epoch": 0.8924695626033369, + "flos": 22018520413440.0, + "grad_norm": 2.7299953768177074, + "language_loss": 0.69113892, + "learning_rate": 1.2000868096909257e-07, + "loss": 0.71181518, + "num_input_tokens_seen": 320192130, + "step": 14844, + "time_per_iteration": 2.6298398971557617 + }, + { + "auxiliary_loss_clip": 0.01011302, + "auxiliary_loss_mlp": 0.01029013, + "balance_loss_clip": 1.02046108, + "balance_loss_mlp": 1.01840377, + "epoch": 0.8925296858560048, + "flos": 14793940166400.0, + "grad_norm": 2.0363113973395985, + "language_loss": 0.916278, + "learning_rate": 1.1987583758531038e-07, + "loss": 0.93668115, + "num_input_tokens_seen": 320207760, + "step": 14845, + "time_per_iteration": 4.520017862319946 + }, + { + "auxiliary_loss_clip": 0.01041448, + "auxiliary_loss_mlp": 0.010265, + "balance_loss_clip": 1.02244973, + "balance_loss_mlp": 1.01666534, + "epoch": 0.8925898091086728, + "flos": 22346636175360.0, + "grad_norm": 2.056301274671664, + "language_loss": 0.72384059, + "learning_rate": 1.1974306549602476e-07, + "loss": 0.74452001, + "num_input_tokens_seen": 320225325, + "step": 14846, + "time_per_iteration": 2.702558755874634 + }, + { + "auxiliary_loss_clip": 0.01035394, + "auxiliary_loss_mlp": 0.01031839, + "balance_loss_clip": 1.02559137, + "balance_loss_mlp": 1.02138388, + "epoch": 0.8926499323613407, + "flos": 45806322067200.0, + "grad_norm": 1.6846269353616368, + "language_loss": 0.56834793, + "learning_rate": 1.1961036470627094e-07, + "loss": 0.58902019, + "num_input_tokens_seen": 320247645, + "step": 14847, + "time_per_iteration": 2.995703935623169 + }, + { + "auxiliary_loss_clip": 0.0102344, + "auxiliary_loss_mlp": 0.01027004, + "balance_loss_clip": 1.02414227, + "balance_loss_mlp": 1.01711547, + "epoch": 0.8927100556140087, + "flos": 22127042378880.0, + "grad_norm": 2.4260084517603278, + "language_loss": 0.76591879, + "learning_rate": 1.1947773522108052e-07, + "loss": 0.78642321, + "num_input_tokens_seen": 320266005, + "step": 14848, + "time_per_iteration": 2.742312431335449 + }, + { + "auxiliary_loss_clip": 0.00993204, + "auxiliary_loss_mlp": 0.01038276, + "balance_loss_clip": 1.01901472, + "balance_loss_mlp": 1.02621841, + "epoch": 0.8927701788666766, + "flos": 28330143655680.0, + "grad_norm": 2.356218265809475, + "language_loss": 0.69296908, + "learning_rate": 1.1934517704548251e-07, + "loss": 0.7132839, + "num_input_tokens_seen": 320285555, + "step": 14849, + "time_per_iteration": 2.8790111541748047 + }, + { + "auxiliary_loss_clip": 0.01053795, + "auxiliary_loss_mlp": 0.01030055, + "balance_loss_clip": 1.02642715, + "balance_loss_mlp": 1.02012491, + "epoch": 0.8928303021193447, + "flos": 25294973351040.0, + "grad_norm": 1.7253892372508532, + "language_loss": 0.81004262, + "learning_rate": 1.1921269018450364e-07, + "loss": 0.83088112, + "num_input_tokens_seen": 320305395, + "step": 14850, + "time_per_iteration": 2.6823651790618896 + }, + { + "auxiliary_loss_clip": 0.01039248, + "auxiliary_loss_mlp": 0.01033412, + "balance_loss_clip": 1.02329195, + "balance_loss_mlp": 1.02304661, + "epoch": 0.8928904253720126, + "flos": 22236713579520.0, + "grad_norm": 1.5003064557528691, + "language_loss": 0.75020671, + "learning_rate": 1.1908027464316872e-07, + "loss": 0.77093327, + "num_input_tokens_seen": 320324220, + "step": 14851, + "time_per_iteration": 2.7238311767578125 + }, + { + "auxiliary_loss_clip": 0.01039431, + "auxiliary_loss_mlp": 0.0103136, + "balance_loss_clip": 1.02367544, + "balance_loss_mlp": 1.02093554, + "epoch": 0.8929505486246806, + "flos": 27092374940160.0, + "grad_norm": 1.8148212821459229, + "language_loss": 0.78532696, + "learning_rate": 1.1894793042649775e-07, + "loss": 0.80603492, + "num_input_tokens_seen": 320347195, + "step": 14852, + "time_per_iteration": 2.7432689666748047 + }, + { + "auxiliary_loss_clip": 0.01053231, + "auxiliary_loss_mlp": 0.01029901, + "balance_loss_clip": 1.02791035, + "balance_loss_mlp": 1.0197866, + "epoch": 0.8930106718773486, + "flos": 23039352938880.0, + "grad_norm": 1.417730246861122, + "language_loss": 0.69167483, + "learning_rate": 1.1881565753951006e-07, + "loss": 0.71250618, + "num_input_tokens_seen": 320366850, + "step": 14853, + "time_per_iteration": 2.7097010612487793 + }, + { + "auxiliary_loss_clip": 0.01023839, + "auxiliary_loss_mlp": 0.0102933, + "balance_loss_clip": 1.02655792, + "balance_loss_mlp": 1.01879787, + "epoch": 0.8930707951300165, + "flos": 35626652887680.0, + "grad_norm": 1.5114344967605922, + "language_loss": 0.67389917, + "learning_rate": 1.1868345598722118e-07, + "loss": 0.69443089, + "num_input_tokens_seen": 320388895, + "step": 14854, + "time_per_iteration": 2.9147746562957764 + }, + { + "auxiliary_loss_clip": 0.01032133, + "auxiliary_loss_mlp": 0.01030752, + "balance_loss_clip": 1.02194214, + "balance_loss_mlp": 1.0198679, + "epoch": 0.8931309183826845, + "flos": 23039891642880.0, + "grad_norm": 1.4560264103440244, + "language_loss": 0.74895525, + "learning_rate": 1.1855132577464399e-07, + "loss": 0.76958412, + "num_input_tokens_seen": 320408520, + "step": 14855, + "time_per_iteration": 2.899794816970825 + }, + { + "auxiliary_loss_clip": 0.01040609, + "auxiliary_loss_mlp": 0.01029768, + "balance_loss_clip": 1.02422428, + "balance_loss_mlp": 1.01977277, + "epoch": 0.8931910416353525, + "flos": 26504624695680.0, + "grad_norm": 1.8247955945137853, + "language_loss": 0.64194632, + "learning_rate": 1.1841926690678893e-07, + "loss": 0.66265005, + "num_input_tokens_seen": 320427400, + "step": 14856, + "time_per_iteration": 2.7686874866485596 + }, + { + "auxiliary_loss_clip": 0.01061829, + "auxiliary_loss_mlp": 0.01025238, + "balance_loss_clip": 1.02458811, + "balance_loss_mlp": 1.0155642, + "epoch": 0.8932511648880205, + "flos": 24973609345920.0, + "grad_norm": 1.5903967721311314, + "language_loss": 0.66258079, + "learning_rate": 1.1828727938866378e-07, + "loss": 0.68345147, + "num_input_tokens_seen": 320447570, + "step": 14857, + "time_per_iteration": 2.754502296447754 + }, + { + "auxiliary_loss_clip": 0.01020792, + "auxiliary_loss_mlp": 0.01033027, + "balance_loss_clip": 1.02761924, + "balance_loss_mlp": 1.02201843, + "epoch": 0.8933112881406884, + "flos": 24460733001600.0, + "grad_norm": 2.174330536198474, + "language_loss": 0.75236499, + "learning_rate": 1.1815536322527408e-07, + "loss": 0.7729032, + "num_input_tokens_seen": 320464405, + "step": 14858, + "time_per_iteration": 4.500789403915405 + }, + { + "auxiliary_loss_clip": 0.01050318, + "auxiliary_loss_mlp": 0.01026901, + "balance_loss_clip": 1.02418673, + "balance_loss_mlp": 1.01615405, + "epoch": 0.8933714113933564, + "flos": 28293083798400.0, + "grad_norm": 1.7610790121012756, + "language_loss": 0.69472909, + "learning_rate": 1.1802351842162139e-07, + "loss": 0.71550131, + "num_input_tokens_seen": 320485525, + "step": 14859, + "time_per_iteration": 2.7027480602264404 + }, + { + "auxiliary_loss_clip": 0.0101606, + "auxiliary_loss_mlp": 0.0102564, + "balance_loss_clip": 1.02256489, + "balance_loss_mlp": 1.01657462, + "epoch": 0.8934315346460243, + "flos": 21434864319360.0, + "grad_norm": 1.5130836891508008, + "language_loss": 0.75622278, + "learning_rate": 1.1789174498270526e-07, + "loss": 0.77663976, + "num_input_tokens_seen": 320506725, + "step": 14860, + "time_per_iteration": 2.867671012878418 + }, + { + "auxiliary_loss_clip": 0.0103645, + "auxiliary_loss_mlp": 0.01026114, + "balance_loss_clip": 1.02314091, + "balance_loss_mlp": 1.01427627, + "epoch": 0.8934916578986923, + "flos": 23769596436480.0, + "grad_norm": 1.694375691238827, + "language_loss": 0.57552737, + "learning_rate": 1.1776004291352303e-07, + "loss": 0.59615302, + "num_input_tokens_seen": 320525425, + "step": 14861, + "time_per_iteration": 2.6580114364624023 + }, + { + "auxiliary_loss_clip": 0.01035693, + "auxiliary_loss_mlp": 0.0102649, + "balance_loss_clip": 1.02223849, + "balance_loss_mlp": 1.01638091, + "epoch": 0.8935517811513602, + "flos": 18916161719040.0, + "grad_norm": 1.9368348274073, + "language_loss": 0.6342473, + "learning_rate": 1.176284122190685e-07, + "loss": 0.65486908, + "num_input_tokens_seen": 320543010, + "step": 14862, + "time_per_iteration": 2.687924861907959 + }, + { + "auxiliary_loss_clip": 0.01048196, + "auxiliary_loss_mlp": 0.01024484, + "balance_loss_clip": 1.0235405, + "balance_loss_mlp": 1.01419008, + "epoch": 0.8936119044040283, + "flos": 24061370613120.0, + "grad_norm": 1.696033987525599, + "language_loss": 0.7815876, + "learning_rate": 1.1749685290433298e-07, + "loss": 0.8023144, + "num_input_tokens_seen": 320562180, + "step": 14863, + "time_per_iteration": 2.640909194946289 + }, + { + "auxiliary_loss_clip": 0.01034236, + "auxiliary_loss_mlp": 0.01033729, + "balance_loss_clip": 1.02224755, + "balance_loss_mlp": 1.02329779, + "epoch": 0.8936720276566962, + "flos": 21324079797120.0, + "grad_norm": 1.717203744947241, + "language_loss": 0.70918441, + "learning_rate": 1.1736536497430627e-07, + "loss": 0.729864, + "num_input_tokens_seen": 320580395, + "step": 14864, + "time_per_iteration": 2.669487714767456 + }, + { + "auxiliary_loss_clip": 0.01050323, + "auxiliary_loss_mlp": 0.01035235, + "balance_loss_clip": 1.02388477, + "balance_loss_mlp": 1.02433288, + "epoch": 0.8937321509093642, + "flos": 18406122549120.0, + "grad_norm": 2.281076122093368, + "language_loss": 0.75881726, + "learning_rate": 1.1723394843397283e-07, + "loss": 0.77967286, + "num_input_tokens_seen": 320599505, + "step": 14865, + "time_per_iteration": 2.6048502922058105 + }, + { + "auxiliary_loss_clip": 0.01031078, + "auxiliary_loss_mlp": 0.01029661, + "balance_loss_clip": 1.02492619, + "balance_loss_mlp": 1.02042222, + "epoch": 0.8937922741620322, + "flos": 22054754257920.0, + "grad_norm": 1.5587807172908312, + "language_loss": 0.7187233, + "learning_rate": 1.1710260328831668e-07, + "loss": 0.73933071, + "num_input_tokens_seen": 320619825, + "step": 14866, + "time_per_iteration": 2.7865235805511475 + }, + { + "auxiliary_loss_clip": 0.01054204, + "auxiliary_loss_mlp": 0.01026266, + "balance_loss_clip": 1.02591145, + "balance_loss_mlp": 1.01469064, + "epoch": 0.8938523974147001, + "flos": 25664386775040.0, + "grad_norm": 2.8869909202825603, + "language_loss": 0.84102237, + "learning_rate": 1.1697132954231869e-07, + "loss": 0.86182714, + "num_input_tokens_seen": 320638515, + "step": 14867, + "time_per_iteration": 2.6773178577423096 + }, + { + "auxiliary_loss_clip": 0.01051082, + "auxiliary_loss_mlp": 0.01024335, + "balance_loss_clip": 1.02365911, + "balance_loss_mlp": 1.01540601, + "epoch": 0.8939125206673681, + "flos": 25742852035200.0, + "grad_norm": 1.6137232542356208, + "language_loss": 0.80222517, + "learning_rate": 1.168401272009567e-07, + "loss": 0.82297933, + "num_input_tokens_seen": 320659430, + "step": 14868, + "time_per_iteration": 2.6580891609191895 + }, + { + "auxiliary_loss_clip": 0.01034296, + "auxiliary_loss_mlp": 0.01026539, + "balance_loss_clip": 1.0246911, + "balance_loss_mlp": 1.01567316, + "epoch": 0.8939726439200361, + "flos": 27344503480320.0, + "grad_norm": 1.6615772684426517, + "language_loss": 0.77316785, + "learning_rate": 1.167089962692056e-07, + "loss": 0.79377621, + "num_input_tokens_seen": 320679295, + "step": 14869, + "time_per_iteration": 2.7614002227783203 + }, + { + "auxiliary_loss_clip": 0.01050716, + "auxiliary_loss_mlp": 0.00747533, + "balance_loss_clip": 1.02446127, + "balance_loss_mlp": 1.00035346, + "epoch": 0.8940327671727041, + "flos": 20338834671360.0, + "grad_norm": 1.6994260867460815, + "language_loss": 0.65427953, + "learning_rate": 1.1657793675203853e-07, + "loss": 0.67226201, + "num_input_tokens_seen": 320697535, + "step": 14870, + "time_per_iteration": 2.6889872550964355 + }, + { + "auxiliary_loss_clip": 0.009663, + "auxiliary_loss_mlp": 0.01001777, + "balance_loss_clip": 1.00138497, + "balance_loss_mlp": 1.00075185, + "epoch": 0.894092890425372, + "flos": 58410573235200.0, + "grad_norm": 0.7914428953056133, + "language_loss": 0.55979508, + "learning_rate": 1.1644694865442461e-07, + "loss": 0.57947582, + "num_input_tokens_seen": 320758635, + "step": 14871, + "time_per_iteration": 3.3834571838378906 + }, + { + "auxiliary_loss_clip": 0.01051254, + "auxiliary_loss_mlp": 0.01031093, + "balance_loss_clip": 1.02561939, + "balance_loss_mlp": 1.02175856, + "epoch": 0.89415301367804, + "flos": 19829657427840.0, + "grad_norm": 2.8237657907133187, + "language_loss": 0.76770341, + "learning_rate": 1.16316031981331e-07, + "loss": 0.78852689, + "num_input_tokens_seen": 320777175, + "step": 14872, + "time_per_iteration": 2.6675846576690674 + }, + { + "auxiliary_loss_clip": 0.01047435, + "auxiliary_loss_mlp": 0.01026633, + "balance_loss_clip": 1.02376771, + "balance_loss_mlp": 1.01772177, + "epoch": 0.8942131369307079, + "flos": 25775781828480.0, + "grad_norm": 1.6376184001753498, + "language_loss": 0.66871083, + "learning_rate": 1.1618518673772215e-07, + "loss": 0.68945152, + "num_input_tokens_seen": 320797670, + "step": 14873, + "time_per_iteration": 2.6801865100860596 + }, + { + "auxiliary_loss_clip": 0.01059378, + "auxiliary_loss_mlp": 0.01032002, + "balance_loss_clip": 1.02423334, + "balance_loss_mlp": 1.02143407, + "epoch": 0.8942732601833759, + "flos": 23149024139520.0, + "grad_norm": 1.6249736659768579, + "language_loss": 0.59650385, + "learning_rate": 1.1605441292856033e-07, + "loss": 0.61741763, + "num_input_tokens_seen": 320817410, + "step": 14874, + "time_per_iteration": 2.638486385345459 + }, + { + "auxiliary_loss_clip": 0.01037206, + "auxiliary_loss_mlp": 0.01028986, + "balance_loss_clip": 1.02834725, + "balance_loss_mlp": 1.01812625, + "epoch": 0.8943333834360438, + "flos": 27855548231040.0, + "grad_norm": 2.39787755114138, + "language_loss": 0.75473332, + "learning_rate": 1.1592371055880356e-07, + "loss": 0.77539527, + "num_input_tokens_seen": 320836745, + "step": 14875, + "time_per_iteration": 2.7541110515594482 + }, + { + "auxiliary_loss_clip": 0.01030515, + "auxiliary_loss_mlp": 0.01032586, + "balance_loss_clip": 1.02333593, + "balance_loss_mlp": 1.02010489, + "epoch": 0.8943935066887119, + "flos": 22163958581760.0, + "grad_norm": 1.8654077507810884, + "language_loss": 0.77447164, + "learning_rate": 1.1579307963340857e-07, + "loss": 0.79510266, + "num_input_tokens_seen": 320853305, + "step": 14876, + "time_per_iteration": 4.462343692779541 + }, + { + "auxiliary_loss_clip": 0.01050878, + "auxiliary_loss_mlp": 0.01026035, + "balance_loss_clip": 1.02453792, + "balance_loss_mlp": 1.01649225, + "epoch": 0.8944536299413798, + "flos": 21470056669440.0, + "grad_norm": 2.433951252793721, + "language_loss": 0.78755015, + "learning_rate": 1.156625201573287e-07, + "loss": 0.80831921, + "num_input_tokens_seen": 320872885, + "step": 14877, + "time_per_iteration": 2.645378589630127 + }, + { + "auxiliary_loss_clip": 0.01009824, + "auxiliary_loss_mlp": 0.01031178, + "balance_loss_clip": 1.02088726, + "balance_loss_mlp": 1.01925707, + "epoch": 0.8945137531940478, + "flos": 17748777703680.0, + "grad_norm": 1.8873810062363436, + "language_loss": 0.75473613, + "learning_rate": 1.155320321355151e-07, + "loss": 0.77514613, + "num_input_tokens_seen": 320889755, + "step": 14878, + "time_per_iteration": 2.7797279357910156 + }, + { + "auxiliary_loss_clip": 0.01045227, + "auxiliary_loss_mlp": 0.01027137, + "balance_loss_clip": 1.0227263, + "balance_loss_mlp": 1.01486433, + "epoch": 0.8945738764467158, + "flos": 21142264129920.0, + "grad_norm": 3.9709762102209782, + "language_loss": 0.76076114, + "learning_rate": 1.1540161557291539e-07, + "loss": 0.78148484, + "num_input_tokens_seen": 320907860, + "step": 14879, + "time_per_iteration": 2.6684324741363525 + }, + { + "auxiliary_loss_clip": 0.01033106, + "auxiliary_loss_mlp": 0.0102438, + "balance_loss_clip": 1.02551341, + "balance_loss_mlp": 1.01446795, + "epoch": 0.8946339996993837, + "flos": 14903000835840.0, + "grad_norm": 1.8717436668154246, + "language_loss": 0.74505126, + "learning_rate": 1.1527127047447538e-07, + "loss": 0.76562607, + "num_input_tokens_seen": 320925825, + "step": 14880, + "time_per_iteration": 2.882385730743408 + }, + { + "auxiliary_loss_clip": 0.01043678, + "auxiliary_loss_mlp": 0.01026079, + "balance_loss_clip": 1.02292895, + "balance_loss_mlp": 1.01499891, + "epoch": 0.8946941229520518, + "flos": 27382173868800.0, + "grad_norm": 2.2578919888848246, + "language_loss": 0.83072782, + "learning_rate": 1.1514099684513822e-07, + "loss": 0.85142541, + "num_input_tokens_seen": 320946165, + "step": 14881, + "time_per_iteration": 2.6834774017333984 + }, + { + "auxiliary_loss_clip": 0.01025585, + "auxiliary_loss_mlp": 0.00747682, + "balance_loss_clip": 1.02362514, + "balance_loss_mlp": 1.00041211, + "epoch": 0.8947542462047197, + "flos": 31796277338880.0, + "grad_norm": 1.7891071170978428, + "language_loss": 0.674541, + "learning_rate": 1.1501079468984287e-07, + "loss": 0.69227368, + "num_input_tokens_seen": 320969330, + "step": 14882, + "time_per_iteration": 2.979376792907715 + }, + { + "auxiliary_loss_clip": 0.01032665, + "auxiliary_loss_mlp": 0.01029672, + "balance_loss_clip": 1.02153993, + "balance_loss_mlp": 1.01775086, + "epoch": 0.8948143694573877, + "flos": 20883599314560.0, + "grad_norm": 2.2983465372077774, + "language_loss": 0.75011301, + "learning_rate": 1.1488066401352691e-07, + "loss": 0.77073634, + "num_input_tokens_seen": 320985055, + "step": 14883, + "time_per_iteration": 2.7148942947387695 + }, + { + "auxiliary_loss_clip": 0.0103877, + "auxiliary_loss_mlp": 0.01028793, + "balance_loss_clip": 1.02414632, + "balance_loss_mlp": 1.01876187, + "epoch": 0.8948744927100556, + "flos": 28215552291840.0, + "grad_norm": 1.6209012935147307, + "language_loss": 0.72321141, + "learning_rate": 1.147506048211253e-07, + "loss": 0.74388707, + "num_input_tokens_seen": 321004720, + "step": 14884, + "time_per_iteration": 2.9814393520355225 + }, + { + "auxiliary_loss_clip": 0.01029365, + "auxiliary_loss_mlp": 0.01025792, + "balance_loss_clip": 1.02072608, + "balance_loss_mlp": 1.01616013, + "epoch": 0.8949346159627236, + "flos": 21902672073600.0, + "grad_norm": 1.7233996792684514, + "language_loss": 0.75035024, + "learning_rate": 1.1462061711756987e-07, + "loss": 0.77090186, + "num_input_tokens_seen": 321022350, + "step": 14885, + "time_per_iteration": 2.686622381210327 + }, + { + "auxiliary_loss_clip": 0.01043008, + "auxiliary_loss_mlp": 0.01029823, + "balance_loss_clip": 1.02576053, + "balance_loss_mlp": 1.01902258, + "epoch": 0.8949947392153915, + "flos": 21359128492800.0, + "grad_norm": 1.7674001133441954, + "language_loss": 0.81731898, + "learning_rate": 1.1449070090778911e-07, + "loss": 0.83804727, + "num_input_tokens_seen": 321040450, + "step": 14886, + "time_per_iteration": 2.7462635040283203 + }, + { + "auxiliary_loss_clip": 0.01008068, + "auxiliary_loss_mlp": 0.01025315, + "balance_loss_clip": 1.02351928, + "balance_loss_mlp": 1.01501536, + "epoch": 0.8950548624680595, + "flos": 52445342799360.0, + "grad_norm": 1.5301928020611102, + "language_loss": 0.63584375, + "learning_rate": 1.1436085619671043e-07, + "loss": 0.65617758, + "num_input_tokens_seen": 321063970, + "step": 14887, + "time_per_iteration": 3.063089370727539 + }, + { + "auxiliary_loss_clip": 0.01036705, + "auxiliary_loss_mlp": 0.01029064, + "balance_loss_clip": 1.02183509, + "balance_loss_mlp": 1.01815605, + "epoch": 0.8951149857207275, + "flos": 20121323863680.0, + "grad_norm": 2.0781618224510505, + "language_loss": 0.60852778, + "learning_rate": 1.1423108298925698e-07, + "loss": 0.6291855, + "num_input_tokens_seen": 321083840, + "step": 14888, + "time_per_iteration": 2.757948160171509 + }, + { + "auxiliary_loss_clip": 0.0106314, + "auxiliary_loss_mlp": 0.01024817, + "balance_loss_clip": 1.02511084, + "balance_loss_mlp": 1.01516676, + "epoch": 0.8951751089733955, + "flos": 29862631463040.0, + "grad_norm": 1.7139065817560533, + "language_loss": 0.69866806, + "learning_rate": 1.1410138129034952e-07, + "loss": 0.71954763, + "num_input_tokens_seen": 321104165, + "step": 14889, + "time_per_iteration": 2.68597149848938 + }, + { + "auxiliary_loss_clip": 0.01054906, + "auxiliary_loss_mlp": 0.00747656, + "balance_loss_clip": 1.02595353, + "balance_loss_mlp": 1.00035632, + "epoch": 0.8952352322260634, + "flos": 15262789415040.0, + "grad_norm": 2.3189746022711137, + "language_loss": 0.71074188, + "learning_rate": 1.1397175110490676e-07, + "loss": 0.72876751, + "num_input_tokens_seen": 321117290, + "step": 14890, + "time_per_iteration": 2.6012632846832275 + }, + { + "auxiliary_loss_clip": 0.00978783, + "auxiliary_loss_mlp": 0.00747662, + "balance_loss_clip": 1.01791286, + "balance_loss_mlp": 1.00038195, + "epoch": 0.8952953554787314, + "flos": 26798338206720.0, + "grad_norm": 1.5979020811271247, + "language_loss": 0.75828999, + "learning_rate": 1.1384219243784454e-07, + "loss": 0.77555448, + "num_input_tokens_seen": 321137115, + "step": 14891, + "time_per_iteration": 3.0231399536132812 + }, + { + "auxiliary_loss_clip": 0.01010282, + "auxiliary_loss_mlp": 0.01028471, + "balance_loss_clip": 1.02283418, + "balance_loss_mlp": 1.01792073, + "epoch": 0.8953554787313994, + "flos": 14137205852160.0, + "grad_norm": 2.473436771113569, + "language_loss": 0.76454169, + "learning_rate": 1.1371270529407517e-07, + "loss": 0.78492922, + "num_input_tokens_seen": 321154490, + "step": 14892, + "time_per_iteration": 2.836325168609619 + }, + { + "auxiliary_loss_clip": 0.0105548, + "auxiliary_loss_mlp": 0.01027538, + "balance_loss_clip": 1.02902651, + "balance_loss_mlp": 1.01756012, + "epoch": 0.8954156019840673, + "flos": 25703314139520.0, + "grad_norm": 1.2868315589548465, + "language_loss": 0.81569767, + "learning_rate": 1.1358328967850895e-07, + "loss": 0.83652782, + "num_input_tokens_seen": 321175625, + "step": 14893, + "time_per_iteration": 4.429448843002319 + }, + { + "auxiliary_loss_clip": 0.01011245, + "auxiliary_loss_mlp": 0.01028844, + "balance_loss_clip": 1.02061415, + "balance_loss_mlp": 1.01890159, + "epoch": 0.8954757252367354, + "flos": 21907987286400.0, + "grad_norm": 1.796875046065781, + "language_loss": 0.74742115, + "learning_rate": 1.1345394559605348e-07, + "loss": 0.76782203, + "num_input_tokens_seen": 321193895, + "step": 14894, + "time_per_iteration": 2.836534023284912 + }, + { + "auxiliary_loss_clip": 0.01055166, + "auxiliary_loss_mlp": 0.01029233, + "balance_loss_clip": 1.02699232, + "balance_loss_mlp": 1.0185163, + "epoch": 0.8955358484894033, + "flos": 12970396454400.0, + "grad_norm": 1.7691343130977222, + "language_loss": 0.66823614, + "learning_rate": 1.1332467305161352e-07, + "loss": 0.68908012, + "num_input_tokens_seen": 321211610, + "step": 14895, + "time_per_iteration": 2.664858102798462 + }, + { + "auxiliary_loss_clip": 0.01055599, + "auxiliary_loss_mlp": 0.01027428, + "balance_loss_clip": 1.02654672, + "balance_loss_mlp": 1.01659226, + "epoch": 0.8955959717420713, + "flos": 17273966797440.0, + "grad_norm": 1.590270613559903, + "language_loss": 0.67297971, + "learning_rate": 1.1319547205009094e-07, + "loss": 0.69380999, + "num_input_tokens_seen": 321229805, + "step": 14896, + "time_per_iteration": 2.7187838554382324 + }, + { + "auxiliary_loss_clip": 0.01051978, + "auxiliary_loss_mlp": 0.01031401, + "balance_loss_clip": 1.02480221, + "balance_loss_mlp": 1.0211432, + "epoch": 0.8956560949947392, + "flos": 14793868339200.0, + "grad_norm": 2.020662896289093, + "language_loss": 0.75434256, + "learning_rate": 1.1306634259638492e-07, + "loss": 0.77517635, + "num_input_tokens_seen": 321247165, + "step": 14897, + "time_per_iteration": 2.62561297416687 + }, + { + "auxiliary_loss_clip": 0.00968074, + "auxiliary_loss_mlp": 0.00746382, + "balance_loss_clip": 1.00186086, + "balance_loss_mlp": 1.00029218, + "epoch": 0.8957162182474072, + "flos": 63607817957760.0, + "grad_norm": 0.7417398262910818, + "language_loss": 0.5535661, + "learning_rate": 1.129372846953931e-07, + "loss": 0.57071066, + "num_input_tokens_seen": 321308425, + "step": 14898, + "time_per_iteration": 3.515565872192383 + }, + { + "auxiliary_loss_clip": 0.01063754, + "auxiliary_loss_mlp": 0.0074771, + "balance_loss_clip": 1.02564716, + "balance_loss_mlp": 1.00034201, + "epoch": 0.8957763415000751, + "flos": 25009843190400.0, + "grad_norm": 1.4764080942706517, + "language_loss": 0.70287317, + "learning_rate": 1.12808298352008e-07, + "loss": 0.7209878, + "num_input_tokens_seen": 321329295, + "step": 14899, + "time_per_iteration": 2.603043794631958 + }, + { + "auxiliary_loss_clip": 0.0101604, + "auxiliary_loss_mlp": 0.01030898, + "balance_loss_clip": 1.02806723, + "balance_loss_mlp": 1.01870906, + "epoch": 0.8958364647527431, + "flos": 19828615933440.0, + "grad_norm": 1.6071113662007723, + "language_loss": 0.74087811, + "learning_rate": 1.1267938357112106e-07, + "loss": 0.76134747, + "num_input_tokens_seen": 321347580, + "step": 14900, + "time_per_iteration": 2.8128502368927 + }, + { + "auxiliary_loss_clip": 0.0097002, + "auxiliary_loss_mlp": 0.01002473, + "balance_loss_clip": 1.00356019, + "balance_loss_mlp": 1.00162697, + "epoch": 0.895896588005411, + "flos": 65537190115200.0, + "grad_norm": 0.7693264314175057, + "language_loss": 0.6183331, + "learning_rate": 1.1255054035762124e-07, + "loss": 0.63805807, + "num_input_tokens_seen": 321407820, + "step": 14901, + "time_per_iteration": 3.292362928390503 + }, + { + "auxiliary_loss_clip": 0.01050309, + "auxiliary_loss_mlp": 0.0102753, + "balance_loss_clip": 1.0235728, + "balance_loss_mlp": 1.01747477, + "epoch": 0.8959567112580791, + "flos": 25591021246080.0, + "grad_norm": 1.7021059815785158, + "language_loss": 0.70516747, + "learning_rate": 1.1242176871639441e-07, + "loss": 0.72594583, + "num_input_tokens_seen": 321426745, + "step": 14902, + "time_per_iteration": 2.6853818893432617 + }, + { + "auxiliary_loss_clip": 0.01040589, + "auxiliary_loss_mlp": 0.01024889, + "balance_loss_clip": 1.02499652, + "balance_loss_mlp": 1.01568007, + "epoch": 0.896016834510747, + "flos": 24201780877440.0, + "grad_norm": 1.6770654099952254, + "language_loss": 0.77963817, + "learning_rate": 1.1229306865232313e-07, + "loss": 0.80029297, + "num_input_tokens_seen": 321446165, + "step": 14903, + "time_per_iteration": 2.750311851501465 + }, + { + "auxiliary_loss_clip": 0.01036213, + "auxiliary_loss_mlp": 0.01029961, + "balance_loss_clip": 1.02351999, + "balance_loss_mlp": 1.01864219, + "epoch": 0.896076957763415, + "flos": 23075945919360.0, + "grad_norm": 1.9660535246859463, + "language_loss": 0.73285484, + "learning_rate": 1.121644401702877e-07, + "loss": 0.75351655, + "num_input_tokens_seen": 321465285, + "step": 14904, + "time_per_iteration": 2.8478140830993652 + }, + { + "auxiliary_loss_clip": 0.01052923, + "auxiliary_loss_mlp": 0.01023473, + "balance_loss_clip": 1.0248903, + "balance_loss_mlp": 1.0124284, + "epoch": 0.8961370810160829, + "flos": 22236605838720.0, + "grad_norm": 2.246297952236275, + "language_loss": 0.7502563, + "learning_rate": 1.12035883275166e-07, + "loss": 0.77102023, + "num_input_tokens_seen": 321483670, + "step": 14905, + "time_per_iteration": 5.925400018692017 + }, + { + "auxiliary_loss_clip": 0.01050741, + "auxiliary_loss_mlp": 0.01027695, + "balance_loss_clip": 1.02422953, + "balance_loss_mlp": 1.01762807, + "epoch": 0.8961972042687509, + "flos": 23072318645760.0, + "grad_norm": 4.094058754843928, + "language_loss": 0.76269138, + "learning_rate": 1.1190739797183279e-07, + "loss": 0.78347576, + "num_input_tokens_seen": 321501190, + "step": 14906, + "time_per_iteration": 2.653123617172241 + }, + { + "auxiliary_loss_clip": 0.01053103, + "auxiliary_loss_mlp": 0.01030808, + "balance_loss_clip": 1.02622497, + "balance_loss_mlp": 1.02058554, + "epoch": 0.896257327521419, + "flos": 18185882307840.0, + "grad_norm": 1.7892829070728593, + "language_loss": 0.74430776, + "learning_rate": 1.1177898426515996e-07, + "loss": 0.76514685, + "num_input_tokens_seen": 321518540, + "step": 14907, + "time_per_iteration": 2.5904901027679443 + }, + { + "auxiliary_loss_clip": 0.0105221, + "auxiliary_loss_mlp": 0.01033568, + "balance_loss_clip": 1.02600169, + "balance_loss_mlp": 1.02359009, + "epoch": 0.8963174507740869, + "flos": 17895472848000.0, + "grad_norm": 1.6013703480413544, + "language_loss": 0.83017427, + "learning_rate": 1.1165064216001785e-07, + "loss": 0.85103202, + "num_input_tokens_seen": 321536555, + "step": 14908, + "time_per_iteration": 2.6423301696777344 + }, + { + "auxiliary_loss_clip": 0.01043074, + "auxiliary_loss_mlp": 0.01030049, + "balance_loss_clip": 1.02443194, + "balance_loss_mlp": 1.01855147, + "epoch": 0.8963775740267549, + "flos": 21032269706880.0, + "grad_norm": 1.5966147697870612, + "language_loss": 0.70418149, + "learning_rate": 1.1152237166127232e-07, + "loss": 0.72491276, + "num_input_tokens_seen": 321557655, + "step": 14909, + "time_per_iteration": 2.672006607055664 + }, + { + "auxiliary_loss_clip": 0.01035302, + "auxiliary_loss_mlp": 0.01031428, + "balance_loss_clip": 1.02717769, + "balance_loss_mlp": 1.02085984, + "epoch": 0.8964376972794228, + "flos": 23179619548800.0, + "grad_norm": 2.328059164910668, + "language_loss": 0.71819657, + "learning_rate": 1.113941727737877e-07, + "loss": 0.73886383, + "num_input_tokens_seen": 321576160, + "step": 14910, + "time_per_iteration": 2.8298656940460205 + }, + { + "auxiliary_loss_clip": 0.01050181, + "auxiliary_loss_mlp": 0.01022971, + "balance_loss_clip": 1.02390909, + "balance_loss_mlp": 1.01300538, + "epoch": 0.8964978205320908, + "flos": 24972998814720.0, + "grad_norm": 5.2157013513366115, + "language_loss": 0.63700044, + "learning_rate": 1.1126604550242502e-07, + "loss": 0.65773201, + "num_input_tokens_seen": 321596205, + "step": 14911, + "time_per_iteration": 2.7545149326324463 + }, + { + "auxiliary_loss_clip": 0.01031083, + "auxiliary_loss_mlp": 0.00747777, + "balance_loss_clip": 1.02423728, + "balance_loss_mlp": 1.00044227, + "epoch": 0.8965579437847587, + "flos": 19172025273600.0, + "grad_norm": 1.723005164578223, + "language_loss": 0.75099367, + "learning_rate": 1.111379898520437e-07, + "loss": 0.76878226, + "num_input_tokens_seen": 321614800, + "step": 14912, + "time_per_iteration": 2.809727430343628 + }, + { + "auxiliary_loss_clip": 0.01033904, + "auxiliary_loss_mlp": 0.01031893, + "balance_loss_clip": 1.02213573, + "balance_loss_mlp": 1.02046657, + "epoch": 0.8966180670374267, + "flos": 24276690691200.0, + "grad_norm": 1.8686416076872872, + "language_loss": 0.81639028, + "learning_rate": 1.1101000582749876e-07, + "loss": 0.83704823, + "num_input_tokens_seen": 321633445, + "step": 14913, + "time_per_iteration": 2.943466901779175 + }, + { + "auxiliary_loss_clip": 0.01054925, + "auxiliary_loss_mlp": 0.0103026, + "balance_loss_clip": 1.02564967, + "balance_loss_mlp": 1.01862526, + "epoch": 0.8966781902900947, + "flos": 13553190622080.0, + "grad_norm": 2.117244352029133, + "language_loss": 0.60662782, + "learning_rate": 1.1088209343364407e-07, + "loss": 0.62747961, + "num_input_tokens_seen": 321650890, + "step": 14914, + "time_per_iteration": 2.5986952781677246 + }, + { + "auxiliary_loss_clip": 0.0098914, + "auxiliary_loss_mlp": 0.01000894, + "balance_loss_clip": 1.00302184, + "balance_loss_mlp": 0.99995273, + "epoch": 0.8967383135427627, + "flos": 65066114223360.0, + "grad_norm": 0.7103122420897913, + "language_loss": 0.55078995, + "learning_rate": 1.1075425267532956e-07, + "loss": 0.57069027, + "num_input_tokens_seen": 321710960, + "step": 14915, + "time_per_iteration": 3.377896785736084 + }, + { + "auxiliary_loss_clip": 0.01029599, + "auxiliary_loss_mlp": 0.01026052, + "balance_loss_clip": 1.02308559, + "balance_loss_mlp": 1.01656246, + "epoch": 0.8967984367954306, + "flos": 29713027317120.0, + "grad_norm": 1.4916375626650868, + "language_loss": 0.71429896, + "learning_rate": 1.1062648355740289e-07, + "loss": 0.73485547, + "num_input_tokens_seen": 321733290, + "step": 14916, + "time_per_iteration": 2.793346881866455 + }, + { + "auxiliary_loss_clip": 0.0104032, + "auxiliary_loss_mlp": 0.01024867, + "balance_loss_clip": 1.02338934, + "balance_loss_mlp": 1.01528871, + "epoch": 0.8968585600480986, + "flos": 25702488126720.0, + "grad_norm": 1.8989744905394783, + "language_loss": 0.777224, + "learning_rate": 1.1049878608470931e-07, + "loss": 0.79787588, + "num_input_tokens_seen": 321753120, + "step": 14917, + "time_per_iteration": 2.7499611377716064 + }, + { + "auxiliary_loss_clip": 0.01055823, + "auxiliary_loss_mlp": 0.01034221, + "balance_loss_clip": 1.02690673, + "balance_loss_mlp": 1.02241313, + "epoch": 0.8969186833007665, + "flos": 30044698525440.0, + "grad_norm": 2.5923074947868194, + "language_loss": 0.6851173, + "learning_rate": 1.1037116026209137e-07, + "loss": 0.70601773, + "num_input_tokens_seen": 321772840, + "step": 14918, + "time_per_iteration": 2.712606191635132 + }, + { + "auxiliary_loss_clip": 0.0102217, + "auxiliary_loss_mlp": 0.01028356, + "balance_loss_clip": 1.02478898, + "balance_loss_mlp": 1.01858056, + "epoch": 0.8969788065534345, + "flos": 22818143030400.0, + "grad_norm": 1.711438357823705, + "language_loss": 0.83444935, + "learning_rate": 1.102436060943881e-07, + "loss": 0.8549546, + "num_input_tokens_seen": 321791020, + "step": 14919, + "time_per_iteration": 2.818162202835083 + }, + { + "auxiliary_loss_clip": 0.01063125, + "auxiliary_loss_mlp": 0.00747825, + "balance_loss_clip": 1.02501512, + "balance_loss_mlp": 1.00041449, + "epoch": 0.8970389298061026, + "flos": 13261488272640.0, + "grad_norm": 2.139966560615291, + "language_loss": 0.72085005, + "learning_rate": 1.1011612358643696e-07, + "loss": 0.73895955, + "num_input_tokens_seen": 321810075, + "step": 14920, + "time_per_iteration": 2.6106326580047607 + }, + { + "auxiliary_loss_clip": 0.01045988, + "auxiliary_loss_mlp": 0.01026817, + "balance_loss_clip": 1.02163756, + "balance_loss_mlp": 1.01571298, + "epoch": 0.8970990530587705, + "flos": 10266071345280.0, + "grad_norm": 2.44170947210127, + "language_loss": 0.91316611, + "learning_rate": 1.0998871274307164e-07, + "loss": 0.93389422, + "num_input_tokens_seen": 321822635, + "step": 14921, + "time_per_iteration": 2.680072069168091 + }, + { + "auxiliary_loss_clip": 0.01001703, + "auxiliary_loss_mlp": 0.010302, + "balance_loss_clip": 1.02040541, + "balance_loss_mlp": 1.01864266, + "epoch": 0.8971591763114385, + "flos": 20302708567680.0, + "grad_norm": 1.9643690985314124, + "language_loss": 0.73908579, + "learning_rate": 1.0986137356912384e-07, + "loss": 0.75940484, + "num_input_tokens_seen": 321841130, + "step": 14922, + "time_per_iteration": 4.489915132522583 + }, + { + "auxiliary_loss_clip": 0.01012484, + "auxiliary_loss_mlp": 0.01030266, + "balance_loss_clip": 1.02039778, + "balance_loss_mlp": 1.01867294, + "epoch": 0.8972192995641064, + "flos": 23257043314560.0, + "grad_norm": 2.002008240861199, + "language_loss": 0.70522678, + "learning_rate": 1.097341060694219e-07, + "loss": 0.72565424, + "num_input_tokens_seen": 321859855, + "step": 14923, + "time_per_iteration": 2.764066219329834 + }, + { + "auxiliary_loss_clip": 0.01041974, + "auxiliary_loss_mlp": 0.01027165, + "balance_loss_clip": 1.02464724, + "balance_loss_mlp": 1.01577485, + "epoch": 0.8972794228167744, + "flos": 18369601395840.0, + "grad_norm": 3.0547228570324907, + "language_loss": 0.70708156, + "learning_rate": 1.0960691024879221e-07, + "loss": 0.72777295, + "num_input_tokens_seen": 321877990, + "step": 14924, + "time_per_iteration": 2.6398537158966064 + }, + { + "auxiliary_loss_clip": 0.01044928, + "auxiliary_loss_mlp": 0.0103014, + "balance_loss_clip": 1.02179778, + "balance_loss_mlp": 1.02051401, + "epoch": 0.8973395460694423, + "flos": 23952058548480.0, + "grad_norm": 2.8734902931007986, + "language_loss": 0.72038811, + "learning_rate": 1.0947978611205844e-07, + "loss": 0.74113882, + "num_input_tokens_seen": 321898120, + "step": 14925, + "time_per_iteration": 2.6836347579956055 + }, + { + "auxiliary_loss_clip": 0.01038142, + "auxiliary_loss_mlp": 0.00747739, + "balance_loss_clip": 1.02377033, + "balance_loss_mlp": 1.0004313, + "epoch": 0.8973996693221103, + "flos": 24970843998720.0, + "grad_norm": 1.602445989491078, + "language_loss": 0.8247593, + "learning_rate": 1.0935273366404008e-07, + "loss": 0.84261811, + "num_input_tokens_seen": 321918140, + "step": 14926, + "time_per_iteration": 2.70400071144104 + }, + { + "auxiliary_loss_clip": 0.01007008, + "auxiliary_loss_mlp": 0.01030461, + "balance_loss_clip": 1.02160037, + "balance_loss_mlp": 1.02039337, + "epoch": 0.8974597925747783, + "flos": 25738937452800.0, + "grad_norm": 4.408229691623543, + "language_loss": 0.7881484, + "learning_rate": 1.092257529095555e-07, + "loss": 0.80852306, + "num_input_tokens_seen": 321938580, + "step": 14927, + "time_per_iteration": 2.7701961994171143 + }, + { + "auxiliary_loss_clip": 0.01033589, + "auxiliary_loss_mlp": 0.01024463, + "balance_loss_clip": 1.02252316, + "balance_loss_mlp": 1.0147475, + "epoch": 0.8975199158274463, + "flos": 38071918131840.0, + "grad_norm": 1.5676899703100313, + "language_loss": 0.66454983, + "learning_rate": 1.0909884385341994e-07, + "loss": 0.68513036, + "num_input_tokens_seen": 321961135, + "step": 14928, + "time_per_iteration": 2.76552152633667 + }, + { + "auxiliary_loss_clip": 0.01043667, + "auxiliary_loss_mlp": 0.01038028, + "balance_loss_clip": 1.0257858, + "balance_loss_mlp": 1.0246532, + "epoch": 0.8975800390801142, + "flos": 25411683617280.0, + "grad_norm": 1.8257302008711895, + "language_loss": 0.70691502, + "learning_rate": 1.0897200650044602e-07, + "loss": 0.72773194, + "num_input_tokens_seen": 321980945, + "step": 14929, + "time_per_iteration": 0.029966354370117188 + }, + { + "auxiliary_loss_clip": 0.01044976, + "auxiliary_loss_mlp": 0.01031627, + "balance_loss_clip": 1.02717888, + "balance_loss_mlp": 1.02181029, + "epoch": 0.8976401623327822, + "flos": 21759604202880.0, + "grad_norm": 1.8127433133876025, + "language_loss": 0.68183124, + "learning_rate": 1.0884524085544256e-07, + "loss": 0.70259732, + "num_input_tokens_seen": 322000350, + "step": 14930, + "time_per_iteration": 2.645052671432495 + }, + { + "auxiliary_loss_clip": 0.01037954, + "auxiliary_loss_mlp": 0.01030023, + "balance_loss_clip": 1.02275848, + "balance_loss_mlp": 1.01943159, + "epoch": 0.8977002855854501, + "flos": 13845323934720.0, + "grad_norm": 1.6999774487150605, + "language_loss": 0.75045192, + "learning_rate": 1.0871854692321769e-07, + "loss": 0.77113163, + "num_input_tokens_seen": 322018980, + "step": 14931, + "time_per_iteration": 2.7792000770568848 + }, + { + "auxiliary_loss_clip": 0.01052875, + "auxiliary_loss_mlp": 0.01024896, + "balance_loss_clip": 1.0262351, + "balance_loss_mlp": 1.01578248, + "epoch": 0.8977604088381181, + "flos": 19427529692160.0, + "grad_norm": 1.6229034345187816, + "language_loss": 0.63162291, + "learning_rate": 1.0859192470857492e-07, + "loss": 0.65240061, + "num_input_tokens_seen": 322037675, + "step": 14932, + "time_per_iteration": 2.626688241958618 + }, + { + "auxiliary_loss_clip": 0.01049653, + "auxiliary_loss_mlp": 0.01025623, + "balance_loss_clip": 1.02526021, + "balance_loss_mlp": 1.01634836, + "epoch": 0.8978205320907862, + "flos": 22742083981440.0, + "grad_norm": 1.881688401339466, + "language_loss": 0.7163049, + "learning_rate": 1.0846537421631552e-07, + "loss": 0.73705763, + "num_input_tokens_seen": 322055130, + "step": 14933, + "time_per_iteration": 2.65567684173584 + }, + { + "auxiliary_loss_clip": 0.01012969, + "auxiliary_loss_mlp": 0.01026512, + "balance_loss_clip": 1.02096641, + "balance_loss_mlp": 1.01577771, + "epoch": 0.8978806553434541, + "flos": 21360529123200.0, + "grad_norm": 2.573077258075743, + "language_loss": 0.74867201, + "learning_rate": 1.0833889545123898e-07, + "loss": 0.76906681, + "num_input_tokens_seen": 322074850, + "step": 14934, + "time_per_iteration": 2.826303720474243 + }, + { + "auxiliary_loss_clip": 0.01014479, + "auxiliary_loss_mlp": 0.01039254, + "balance_loss_clip": 1.01932204, + "balance_loss_mlp": 1.02705312, + "epoch": 0.8979407785961221, + "flos": 20924178704640.0, + "grad_norm": 1.7573747808425944, + "language_loss": 0.60240877, + "learning_rate": 1.0821248841814123e-07, + "loss": 0.62294608, + "num_input_tokens_seen": 322093315, + "step": 14935, + "time_per_iteration": 2.9080231189727783 + }, + { + "auxiliary_loss_clip": 0.01029362, + "auxiliary_loss_mlp": 0.01026041, + "balance_loss_clip": 1.02372158, + "balance_loss_mlp": 1.01580071, + "epoch": 0.89800090184879, + "flos": 25228934196480.0, + "grad_norm": 2.463360811022019, + "language_loss": 0.76863563, + "learning_rate": 1.0808615312181512e-07, + "loss": 0.78918964, + "num_input_tokens_seen": 322112555, + "step": 14936, + "time_per_iteration": 2.7703254222869873 + }, + { + "auxiliary_loss_clip": 0.01037037, + "auxiliary_loss_mlp": 0.01031588, + "balance_loss_clip": 1.02293873, + "balance_loss_mlp": 1.0211755, + "epoch": 0.898061025101458, + "flos": 22562674525440.0, + "grad_norm": 2.055090404806331, + "language_loss": 0.74159706, + "learning_rate": 1.0795988956705193e-07, + "loss": 0.76228327, + "num_input_tokens_seen": 322130440, + "step": 14937, + "time_per_iteration": 2.8137128353118896 + }, + { + "auxiliary_loss_clip": 0.00980404, + "auxiliary_loss_mlp": 0.01002609, + "balance_loss_clip": 1.00400901, + "balance_loss_mlp": 1.00175059, + "epoch": 0.8981211483541259, + "flos": 56192551384320.0, + "grad_norm": 0.8426584924738793, + "language_loss": 0.63522363, + "learning_rate": 1.0783369775863915e-07, + "loss": 0.65505373, + "num_input_tokens_seen": 322187295, + "step": 14938, + "time_per_iteration": 3.2160580158233643 + }, + { + "auxiliary_loss_clip": 0.01041448, + "auxiliary_loss_mlp": 0.0102684, + "balance_loss_clip": 1.02499163, + "balance_loss_mlp": 1.01644516, + "epoch": 0.898181271606794, + "flos": 16392718523520.0, + "grad_norm": 2.7531697795849976, + "language_loss": 0.80595225, + "learning_rate": 1.0770757770136251e-07, + "loss": 0.82663512, + "num_input_tokens_seen": 322202965, + "step": 14939, + "time_per_iteration": 2.6527023315429688 + }, + { + "auxiliary_loss_clip": 0.0097698, + "auxiliary_loss_mlp": 0.01003266, + "balance_loss_clip": 1.00205302, + "balance_loss_mlp": 1.00247908, + "epoch": 0.8982413948594619, + "flos": 63440259989760.0, + "grad_norm": 0.7153273096399585, + "language_loss": 0.52869129, + "learning_rate": 1.0758152940000375e-07, + "loss": 0.54849374, + "num_input_tokens_seen": 322269490, + "step": 14940, + "time_per_iteration": 5.235373020172119 + }, + { + "auxiliary_loss_clip": 0.01062293, + "auxiliary_loss_mlp": 0.01029632, + "balance_loss_clip": 1.02484894, + "balance_loss_mlp": 1.01800346, + "epoch": 0.8983015181121299, + "flos": 21835340029440.0, + "grad_norm": 9.569093915951106, + "language_loss": 0.77745748, + "learning_rate": 1.0745555285934327e-07, + "loss": 0.79837668, + "num_input_tokens_seen": 322288060, + "step": 14941, + "time_per_iteration": 2.726698160171509 + }, + { + "auxiliary_loss_clip": 0.01050534, + "auxiliary_loss_mlp": 0.01031141, + "balance_loss_clip": 1.0238204, + "balance_loss_mlp": 1.02035904, + "epoch": 0.8983616413647978, + "flos": 28949961767040.0, + "grad_norm": 2.415407874981587, + "language_loss": 0.73502779, + "learning_rate": 1.0732964808415834e-07, + "loss": 0.75584447, + "num_input_tokens_seen": 322307930, + "step": 14942, + "time_per_iteration": 2.8326361179351807 + }, + { + "auxiliary_loss_clip": 0.01038792, + "auxiliary_loss_mlp": 0.01030527, + "balance_loss_clip": 1.02303946, + "balance_loss_mlp": 1.01970255, + "epoch": 0.8984217646174658, + "flos": 17785083375360.0, + "grad_norm": 2.505151213112191, + "language_loss": 0.80092919, + "learning_rate": 1.0720381507922205e-07, + "loss": 0.82162237, + "num_input_tokens_seen": 322326155, + "step": 14943, + "time_per_iteration": 2.8196115493774414 + }, + { + "auxiliary_loss_clip": 0.01044167, + "auxiliary_loss_mlp": 0.01027158, + "balance_loss_clip": 1.02500319, + "balance_loss_mlp": 1.0163877, + "epoch": 0.8984818878701337, + "flos": 23404528558080.0, + "grad_norm": 2.43838050792995, + "language_loss": 0.71373355, + "learning_rate": 1.0707805384930701e-07, + "loss": 0.73444676, + "num_input_tokens_seen": 322345850, + "step": 14944, + "time_per_iteration": 2.7021002769470215 + }, + { + "auxiliary_loss_clip": 0.01016191, + "auxiliary_loss_mlp": 0.01033014, + "balance_loss_clip": 1.02045894, + "balance_loss_mlp": 1.02103376, + "epoch": 0.8985420111228017, + "flos": 22346061557760.0, + "grad_norm": 2.04861086748157, + "language_loss": 0.75870311, + "learning_rate": 1.0695236439918187e-07, + "loss": 0.77919519, + "num_input_tokens_seen": 322364715, + "step": 14945, + "time_per_iteration": 2.815150260925293 + }, + { + "auxiliary_loss_clip": 0.01066488, + "auxiliary_loss_mlp": 0.01031432, + "balance_loss_clip": 1.02578163, + "balance_loss_mlp": 1.02033925, + "epoch": 0.8986021343754698, + "flos": 21392776558080.0, + "grad_norm": 1.8174892171971664, + "language_loss": 0.73240662, + "learning_rate": 1.0682674673361302e-07, + "loss": 0.75338578, + "num_input_tokens_seen": 322383570, + "step": 14946, + "time_per_iteration": 2.6229288578033447 + }, + { + "auxiliary_loss_clip": 0.01013102, + "auxiliary_loss_mlp": 0.01027018, + "balance_loss_clip": 1.0217737, + "balance_loss_mlp": 1.01602721, + "epoch": 0.8986622576281377, + "flos": 21325372686720.0, + "grad_norm": 2.158417526046985, + "language_loss": 0.64166135, + "learning_rate": 1.0670120085736334e-07, + "loss": 0.66206253, + "num_input_tokens_seen": 322401375, + "step": 14947, + "time_per_iteration": 2.7531113624572754 + }, + { + "auxiliary_loss_clip": 0.01040761, + "auxiliary_loss_mlp": 0.01032433, + "balance_loss_clip": 1.02437401, + "balance_loss_mlp": 1.02261591, + "epoch": 0.8987223808808057, + "flos": 23988292392960.0, + "grad_norm": 1.7965898362355421, + "language_loss": 0.69994938, + "learning_rate": 1.0657572677519411e-07, + "loss": 0.72068131, + "num_input_tokens_seen": 322421890, + "step": 14948, + "time_per_iteration": 2.65869140625 + }, + { + "auxiliary_loss_clip": 0.01031166, + "auxiliary_loss_mlp": 0.01025261, + "balance_loss_clip": 1.02375436, + "balance_loss_mlp": 1.01500893, + "epoch": 0.8987825041334736, + "flos": 41500956044160.0, + "grad_norm": 1.8371424084300199, + "language_loss": 0.74370384, + "learning_rate": 1.0645032449186309e-07, + "loss": 0.7642681, + "num_input_tokens_seen": 322445730, + "step": 14949, + "time_per_iteration": 2.89540696144104 + }, + { + "auxiliary_loss_clip": 0.01024194, + "auxiliary_loss_mlp": 0.01034608, + "balance_loss_clip": 1.02503133, + "balance_loss_mlp": 1.02260375, + "epoch": 0.8988426273861416, + "flos": 27564276844800.0, + "grad_norm": 1.7289518702868585, + "language_loss": 0.75694674, + "learning_rate": 1.0632499401212513e-07, + "loss": 0.77753478, + "num_input_tokens_seen": 322464595, + "step": 14950, + "time_per_iteration": 2.7931017875671387 + }, + { + "auxiliary_loss_clip": 0.01032919, + "auxiliary_loss_mlp": 0.01024613, + "balance_loss_clip": 1.02265096, + "balance_loss_mlp": 1.01458764, + "epoch": 0.8989027506388095, + "flos": 17092653920640.0, + "grad_norm": 1.6217880889210372, + "language_loss": 0.66507268, + "learning_rate": 1.0619973534073334e-07, + "loss": 0.68564796, + "num_input_tokens_seen": 322483305, + "step": 14951, + "time_per_iteration": 2.6687257289886475 + }, + { + "auxiliary_loss_clip": 0.0106289, + "auxiliary_loss_mlp": 0.01025794, + "balance_loss_clip": 1.02366757, + "balance_loss_mlp": 1.01603055, + "epoch": 0.8989628738914776, + "flos": 20555124416640.0, + "grad_norm": 1.9308025763953813, + "language_loss": 0.74005985, + "learning_rate": 1.0607454848243769e-07, + "loss": 0.76094669, + "num_input_tokens_seen": 322501905, + "step": 14952, + "time_per_iteration": 5.878637790679932 + }, + { + "auxiliary_loss_clip": 0.01063341, + "auxiliary_loss_mlp": 0.01031385, + "balance_loss_clip": 1.02629304, + "balance_loss_mlp": 1.021276, + "epoch": 0.8990229971441455, + "flos": 16251087196800.0, + "grad_norm": 2.4791298085260776, + "language_loss": 0.56933272, + "learning_rate": 1.0594943344198481e-07, + "loss": 0.59027994, + "num_input_tokens_seen": 322518135, + "step": 14953, + "time_per_iteration": 2.6165685653686523 + }, + { + "auxiliary_loss_clip": 0.01036432, + "auxiliary_loss_mlp": 0.01032114, + "balance_loss_clip": 1.02230155, + "balance_loss_mlp": 1.02155805, + "epoch": 0.8990831203968135, + "flos": 21981316901760.0, + "grad_norm": 3.9297089880604714, + "language_loss": 0.82121348, + "learning_rate": 1.0582439022411915e-07, + "loss": 0.84189892, + "num_input_tokens_seen": 322537905, + "step": 14954, + "time_per_iteration": 2.71596622467041 + }, + { + "auxiliary_loss_clip": 0.01061272, + "auxiliary_loss_mlp": 0.01031642, + "balance_loss_clip": 1.02511859, + "balance_loss_mlp": 1.02115762, + "epoch": 0.8991432436494814, + "flos": 27447171528960.0, + "grad_norm": 1.9291595389305407, + "language_loss": 0.60017896, + "learning_rate": 1.0569941883358224e-07, + "loss": 0.62110817, + "num_input_tokens_seen": 322557945, + "step": 14955, + "time_per_iteration": 2.7012827396392822 + }, + { + "auxiliary_loss_clip": 0.01052007, + "auxiliary_loss_mlp": 0.01028516, + "balance_loss_clip": 1.02590585, + "balance_loss_mlp": 1.01890182, + "epoch": 0.8992033669021494, + "flos": 21579835610880.0, + "grad_norm": 2.033177722983806, + "language_loss": 0.54866117, + "learning_rate": 1.0557451927511341e-07, + "loss": 0.56946641, + "num_input_tokens_seen": 322575765, + "step": 14956, + "time_per_iteration": 2.8108489513397217 + }, + { + "auxiliary_loss_clip": 0.01017714, + "auxiliary_loss_mlp": 0.01028974, + "balance_loss_clip": 1.02388394, + "balance_loss_mlp": 1.01845932, + "epoch": 0.8992634901548173, + "flos": 28584211530240.0, + "grad_norm": 2.0398713603079672, + "language_loss": 0.80145693, + "learning_rate": 1.0544969155344863e-07, + "loss": 0.82192385, + "num_input_tokens_seen": 322595665, + "step": 14957, + "time_per_iteration": 2.8337321281433105 + }, + { + "auxiliary_loss_clip": 0.01065363, + "auxiliary_loss_mlp": 0.01029253, + "balance_loss_clip": 1.02639234, + "balance_loss_mlp": 1.01833391, + "epoch": 0.8993236134074853, + "flos": 19867435557120.0, + "grad_norm": 1.7459477300481536, + "language_loss": 0.78914356, + "learning_rate": 1.0532493567332123e-07, + "loss": 0.81008971, + "num_input_tokens_seen": 322614755, + "step": 14958, + "time_per_iteration": 2.6699583530426025 + }, + { + "auxiliary_loss_clip": 0.00993566, + "auxiliary_loss_mlp": 0.0102736, + "balance_loss_clip": 1.02630925, + "balance_loss_mlp": 1.01759064, + "epoch": 0.8993837366601534, + "flos": 19390649402880.0, + "grad_norm": 1.3983375787867223, + "language_loss": 0.74961263, + "learning_rate": 1.0520025163946277e-07, + "loss": 0.76982188, + "num_input_tokens_seen": 322633425, + "step": 14959, + "time_per_iteration": 2.8672916889190674 + }, + { + "auxiliary_loss_clip": 0.0105856, + "auxiliary_loss_mlp": 0.01030242, + "balance_loss_clip": 1.02321458, + "balance_loss_mlp": 1.02022886, + "epoch": 0.8994438599128213, + "flos": 18551740285440.0, + "grad_norm": 2.1493449046649227, + "language_loss": 0.68256861, + "learning_rate": 1.0507563945660015e-07, + "loss": 0.70345664, + "num_input_tokens_seen": 322652065, + "step": 14960, + "time_per_iteration": 2.6382205486297607 + }, + { + "auxiliary_loss_clip": 0.01040879, + "auxiliary_loss_mlp": 0.01024208, + "balance_loss_clip": 1.02479517, + "balance_loss_mlp": 1.01449287, + "epoch": 0.8995039831654893, + "flos": 24427587726720.0, + "grad_norm": 1.529272121073032, + "language_loss": 0.65827453, + "learning_rate": 1.049510991294591e-07, + "loss": 0.67892545, + "num_input_tokens_seen": 322673275, + "step": 14961, + "time_per_iteration": 2.7227494716644287 + }, + { + "auxiliary_loss_clip": 0.01038883, + "auxiliary_loss_mlp": 0.01025921, + "balance_loss_clip": 1.02288318, + "balance_loss_mlp": 1.01638389, + "epoch": 0.8995641064181572, + "flos": 21251324799360.0, + "grad_norm": 1.4677446195300015, + "language_loss": 0.82941663, + "learning_rate": 1.0482663066276254e-07, + "loss": 0.85006464, + "num_input_tokens_seen": 322693375, + "step": 14962, + "time_per_iteration": 2.9006106853485107 + }, + { + "auxiliary_loss_clip": 0.01044937, + "auxiliary_loss_mlp": 0.01027879, + "balance_loss_clip": 1.02581429, + "balance_loss_mlp": 1.01633346, + "epoch": 0.8996242296708252, + "flos": 23513661054720.0, + "grad_norm": 1.8064112426443255, + "language_loss": 0.76178652, + "learning_rate": 1.047022340612298e-07, + "loss": 0.78251469, + "num_input_tokens_seen": 322712615, + "step": 14963, + "time_per_iteration": 2.6557631492614746 + }, + { + "auxiliary_loss_clip": 0.00964379, + "auxiliary_loss_mlp": 0.01005587, + "balance_loss_clip": 1.00712359, + "balance_loss_mlp": 1.00465703, + "epoch": 0.8996843529234931, + "flos": 62403230430720.0, + "grad_norm": 0.7835642940407141, + "language_loss": 0.57581776, + "learning_rate": 1.0457790932957867e-07, + "loss": 0.5955174, + "num_input_tokens_seen": 322766855, + "step": 14964, + "time_per_iteration": 3.1958415508270264 + }, + { + "auxiliary_loss_clip": 0.01056719, + "auxiliary_loss_mlp": 0.01030186, + "balance_loss_clip": 1.02717471, + "balance_loss_mlp": 1.01896214, + "epoch": 0.8997444761761612, + "flos": 24236829573120.0, + "grad_norm": 4.186055021897664, + "language_loss": 0.67808831, + "learning_rate": 1.0445365647252269e-07, + "loss": 0.69895732, + "num_input_tokens_seen": 322781130, + "step": 14965, + "time_per_iteration": 2.6232364177703857 + }, + { + "auxiliary_loss_clip": 0.01063289, + "auxiliary_loss_mlp": 0.01029669, + "balance_loss_clip": 1.02518129, + "balance_loss_mlp": 1.01961362, + "epoch": 0.8998045994288291, + "flos": 21361103740800.0, + "grad_norm": 2.480925608696234, + "language_loss": 0.71984053, + "learning_rate": 1.0432947549477433e-07, + "loss": 0.7407701, + "num_input_tokens_seen": 322800310, + "step": 14966, + "time_per_iteration": 2.6311941146850586 + }, + { + "auxiliary_loss_clip": 0.0102817, + "auxiliary_loss_mlp": 0.01030457, + "balance_loss_clip": 1.02350676, + "balance_loss_mlp": 1.01865554, + "epoch": 0.8998647226814971, + "flos": 28986159697920.0, + "grad_norm": 2.286210241522267, + "language_loss": 0.72931623, + "learning_rate": 1.0420536640104205e-07, + "loss": 0.74990249, + "num_input_tokens_seen": 322820955, + "step": 14967, + "time_per_iteration": 2.7745444774627686 + }, + { + "auxiliary_loss_clip": 0.01024658, + "auxiliary_loss_mlp": 0.0074767, + "balance_loss_clip": 1.02522469, + "balance_loss_mlp": 1.00037396, + "epoch": 0.899924845934165, + "flos": 13625909706240.0, + "grad_norm": 1.9626417754158316, + "language_loss": 0.72228444, + "learning_rate": 1.040813291960323e-07, + "loss": 0.7400077, + "num_input_tokens_seen": 322838780, + "step": 14968, + "time_per_iteration": 2.7907049655914307 + }, + { + "auxiliary_loss_clip": 0.01052434, + "auxiliary_loss_mlp": 0.010313, + "balance_loss_clip": 1.02522802, + "balance_loss_mlp": 1.02083373, + "epoch": 0.899984969186833, + "flos": 20882629647360.0, + "grad_norm": 2.1825960457973106, + "language_loss": 0.71246707, + "learning_rate": 1.0395736388444864e-07, + "loss": 0.73330438, + "num_input_tokens_seen": 322856710, + "step": 14969, + "time_per_iteration": 2.6841561794281006 + }, + { + "auxiliary_loss_clip": 0.01063214, + "auxiliary_loss_mlp": 0.01028333, + "balance_loss_clip": 1.02610958, + "balance_loss_mlp": 1.01790857, + "epoch": 0.9000450924395009, + "flos": 20921808407040.0, + "grad_norm": 3.784359534758786, + "language_loss": 0.75969821, + "learning_rate": 1.0383347047099201e-07, + "loss": 0.78061366, + "num_input_tokens_seen": 322876070, + "step": 14970, + "time_per_iteration": 4.448772430419922 + }, + { + "auxiliary_loss_clip": 0.01053655, + "auxiliary_loss_mlp": 0.01028056, + "balance_loss_clip": 1.02480304, + "balance_loss_mlp": 1.01867998, + "epoch": 0.900105215692169, + "flos": 17165049782400.0, + "grad_norm": 1.6521220269071597, + "language_loss": 0.72873139, + "learning_rate": 1.0370964896035972e-07, + "loss": 0.74954855, + "num_input_tokens_seen": 322895095, + "step": 14971, + "time_per_iteration": 2.6360244750976562 + }, + { + "auxiliary_loss_clip": 0.01031716, + "auxiliary_loss_mlp": 0.01026855, + "balance_loss_clip": 1.02387321, + "balance_loss_mlp": 1.01590586, + "epoch": 0.900165338944837, + "flos": 19931930426880.0, + "grad_norm": 2.4563932378197197, + "language_loss": 0.81638277, + "learning_rate": 1.035858993572476e-07, + "loss": 0.83696842, + "num_input_tokens_seen": 322911845, + "step": 14972, + "time_per_iteration": 2.772900104522705 + }, + { + "auxiliary_loss_clip": 0.01029725, + "auxiliary_loss_mlp": 0.01030921, + "balance_loss_clip": 1.02251244, + "balance_loss_mlp": 1.02025199, + "epoch": 0.9002254621975049, + "flos": 16107085572480.0, + "grad_norm": 1.7887142953312947, + "language_loss": 0.81488228, + "learning_rate": 1.0346222166634855e-07, + "loss": 0.83548874, + "num_input_tokens_seen": 322928170, + "step": 14973, + "time_per_iteration": 2.603769063949585 + }, + { + "auxiliary_loss_clip": 0.01061253, + "auxiliary_loss_mlp": 0.01033264, + "balance_loss_clip": 1.02450109, + "balance_loss_mlp": 1.02243352, + "epoch": 0.9002855854501729, + "flos": 28476120528000.0, + "grad_norm": 1.8774153751373537, + "language_loss": 0.58286422, + "learning_rate": 1.0333861589235193e-07, + "loss": 0.60380936, + "num_input_tokens_seen": 322948165, + "step": 14974, + "time_per_iteration": 2.6378896236419678 + }, + { + "auxiliary_loss_clip": 0.01065458, + "auxiliary_loss_mlp": 0.01031404, + "balance_loss_clip": 1.02764666, + "balance_loss_mlp": 1.02111602, + "epoch": 0.9003457087028408, + "flos": 25630307746560.0, + "grad_norm": 1.921701238337706, + "language_loss": 0.63431823, + "learning_rate": 1.0321508203994489e-07, + "loss": 0.65528691, + "num_input_tokens_seen": 322968880, + "step": 14975, + "time_per_iteration": 2.6174869537353516 + }, + { + "auxiliary_loss_clip": 0.01051596, + "auxiliary_loss_mlp": 0.01029146, + "balance_loss_clip": 1.02464175, + "balance_loss_mlp": 1.01870918, + "epoch": 0.9004058319555088, + "flos": 24389414547840.0, + "grad_norm": 1.6136514099595083, + "language_loss": 0.72926939, + "learning_rate": 1.0309162011381257e-07, + "loss": 0.75007677, + "num_input_tokens_seen": 322989395, + "step": 14976, + "time_per_iteration": 2.722485065460205 + }, + { + "auxiliary_loss_clip": 0.0104495, + "auxiliary_loss_mlp": 0.01029699, + "balance_loss_clip": 1.02578568, + "balance_loss_mlp": 1.01960802, + "epoch": 0.9004659552081767, + "flos": 29059345658880.0, + "grad_norm": 1.7576958841368944, + "language_loss": 0.69176406, + "learning_rate": 1.0296823011863565e-07, + "loss": 0.71251059, + "num_input_tokens_seen": 323009060, + "step": 14977, + "time_per_iteration": 2.6656675338745117 + }, + { + "auxiliary_loss_clip": 0.01040214, + "auxiliary_loss_mlp": 0.00747768, + "balance_loss_clip": 1.02450752, + "balance_loss_mlp": 1.00038457, + "epoch": 0.9005260784608448, + "flos": 16763855800320.0, + "grad_norm": 2.1710764678249443, + "language_loss": 0.65520191, + "learning_rate": 1.0284491205909351e-07, + "loss": 0.67308176, + "num_input_tokens_seen": 323027530, + "step": 14978, + "time_per_iteration": 2.696812391281128 + }, + { + "auxiliary_loss_clip": 0.01030063, + "auxiliary_loss_mlp": 0.01031986, + "balance_loss_clip": 1.02474535, + "balance_loss_mlp": 1.02025533, + "epoch": 0.9005862017135127, + "flos": 20376002269440.0, + "grad_norm": 1.6949020574416935, + "language_loss": 0.79243481, + "learning_rate": 1.0272166593986286e-07, + "loss": 0.81305528, + "num_input_tokens_seen": 323045370, + "step": 14979, + "time_per_iteration": 2.687997579574585 + }, + { + "auxiliary_loss_clip": 0.00987308, + "auxiliary_loss_mlp": 0.01001894, + "balance_loss_clip": 1.00203252, + "balance_loss_mlp": 1.00105917, + "epoch": 0.9006463249661807, + "flos": 67580255796480.0, + "grad_norm": 0.7224951364624288, + "language_loss": 0.53616631, + "learning_rate": 1.0259849176561642e-07, + "loss": 0.55605841, + "num_input_tokens_seen": 323105660, + "step": 14980, + "time_per_iteration": 3.340693473815918 + }, + { + "auxiliary_loss_clip": 0.01055446, + "auxiliary_loss_mlp": 0.01030529, + "balance_loss_clip": 1.02648234, + "balance_loss_mlp": 1.01938891, + "epoch": 0.9007064482188486, + "flos": 28293335193600.0, + "grad_norm": 1.6426572474817298, + "language_loss": 0.8245399, + "learning_rate": 1.0247538954102553e-07, + "loss": 0.84539962, + "num_input_tokens_seen": 323126365, + "step": 14981, + "time_per_iteration": 2.714707136154175 + }, + { + "auxiliary_loss_clip": 0.01021505, + "auxiliary_loss_mlp": 0.01029965, + "balance_loss_clip": 1.02417886, + "balance_loss_mlp": 1.01977277, + "epoch": 0.9007665714715166, + "flos": 21616320850560.0, + "grad_norm": 1.6988417438872134, + "language_loss": 0.81876391, + "learning_rate": 1.0235235927075758e-07, + "loss": 0.83927864, + "num_input_tokens_seen": 323145655, + "step": 14982, + "time_per_iteration": 2.7814791202545166 + }, + { + "auxiliary_loss_clip": 0.01027088, + "auxiliary_loss_mlp": 0.01030905, + "balance_loss_clip": 1.02136421, + "balance_loss_mlp": 1.02028346, + "epoch": 0.9008266947241845, + "flos": 26541864120960.0, + "grad_norm": 2.2945473769819045, + "language_loss": 0.71826202, + "learning_rate": 1.0222940095947885e-07, + "loss": 0.73884195, + "num_input_tokens_seen": 323164540, + "step": 14983, + "time_per_iteration": 2.7377583980560303 + }, + { + "auxiliary_loss_clip": 0.01051398, + "auxiliary_loss_mlp": 0.01022855, + "balance_loss_clip": 1.02589178, + "balance_loss_mlp": 1.01354504, + "epoch": 0.9008868179768525, + "flos": 23110527738240.0, + "grad_norm": 1.3179837768825948, + "language_loss": 0.74996775, + "learning_rate": 1.0210651461185115e-07, + "loss": 0.77071029, + "num_input_tokens_seen": 323186960, + "step": 14984, + "time_per_iteration": 2.6779990196228027 + }, + { + "auxiliary_loss_clip": 0.01059741, + "auxiliary_loss_mlp": 0.01027237, + "balance_loss_clip": 1.02415025, + "balance_loss_mlp": 1.0168184, + "epoch": 0.9009469412295206, + "flos": 19060809788160.0, + "grad_norm": 1.4486474436646912, + "language_loss": 0.70511991, + "learning_rate": 1.0198370023253456e-07, + "loss": 0.7259897, + "num_input_tokens_seen": 323206135, + "step": 14985, + "time_per_iteration": 2.559643030166626 + }, + { + "auxiliary_loss_clip": 0.01043198, + "auxiliary_loss_mlp": 0.01031516, + "balance_loss_clip": 1.02441454, + "balance_loss_mlp": 1.02074599, + "epoch": 0.9010070644821885, + "flos": 23222281927680.0, + "grad_norm": 1.7821085551173166, + "language_loss": 0.70561397, + "learning_rate": 1.0186095782618643e-07, + "loss": 0.7263611, + "num_input_tokens_seen": 323225980, + "step": 14986, + "time_per_iteration": 4.372863054275513 + }, + { + "auxiliary_loss_clip": 0.01047765, + "auxiliary_loss_mlp": 0.01030544, + "balance_loss_clip": 1.02246523, + "balance_loss_mlp": 1.02011895, + "epoch": 0.9010671877348565, + "flos": 17384823146880.0, + "grad_norm": 1.8397130526199237, + "language_loss": 0.76856363, + "learning_rate": 1.0173828739746104e-07, + "loss": 0.78934669, + "num_input_tokens_seen": 323243700, + "step": 14987, + "time_per_iteration": 2.590691089630127 + }, + { + "auxiliary_loss_clip": 0.01052048, + "auxiliary_loss_mlp": 0.01028276, + "balance_loss_clip": 1.02522159, + "balance_loss_mlp": 1.01775002, + "epoch": 0.9011273109875244, + "flos": 21908166854400.0, + "grad_norm": 1.808463521475361, + "language_loss": 0.73841965, + "learning_rate": 1.0161568895100981e-07, + "loss": 0.75922287, + "num_input_tokens_seen": 323261535, + "step": 14988, + "time_per_iteration": 2.654848575592041 + }, + { + "auxiliary_loss_clip": 0.01047155, + "auxiliary_loss_mlp": 0.0102714, + "balance_loss_clip": 1.0278331, + "balance_loss_mlp": 1.01603019, + "epoch": 0.9011874342401924, + "flos": 24060831909120.0, + "grad_norm": 1.8394939364288687, + "language_loss": 0.69473636, + "learning_rate": 1.0149316249148188e-07, + "loss": 0.71547931, + "num_input_tokens_seen": 323281855, + "step": 14989, + "time_per_iteration": 2.718017339706421 + }, + { + "auxiliary_loss_clip": 0.01061934, + "auxiliary_loss_mlp": 0.01026849, + "balance_loss_clip": 1.02539158, + "balance_loss_mlp": 1.01660895, + "epoch": 0.9012475574928603, + "flos": 16758791982720.0, + "grad_norm": 1.9489367692693937, + "language_loss": 0.79924041, + "learning_rate": 1.0137070802352376e-07, + "loss": 0.8201282, + "num_input_tokens_seen": 323299505, + "step": 14990, + "time_per_iteration": 2.552335500717163 + }, + { + "auxiliary_loss_clip": 0.0102657, + "auxiliary_loss_mlp": 0.01028857, + "balance_loss_clip": 1.02593482, + "balance_loss_mlp": 1.01812184, + "epoch": 0.9013076807455284, + "flos": 19971109186560.0, + "grad_norm": 2.509976573439703, + "language_loss": 0.78139126, + "learning_rate": 1.0124832555177842e-07, + "loss": 0.80194557, + "num_input_tokens_seen": 323318365, + "step": 14991, + "time_per_iteration": 2.791898250579834 + }, + { + "auxiliary_loss_clip": 0.00981856, + "auxiliary_loss_mlp": 0.0074665, + "balance_loss_clip": 1.00583696, + "balance_loss_mlp": 1.00062549, + "epoch": 0.9013678039981963, + "flos": 65180274624000.0, + "grad_norm": 0.7754662773745523, + "language_loss": 0.60243654, + "learning_rate": 1.0112601508088726e-07, + "loss": 0.61972159, + "num_input_tokens_seen": 323371835, + "step": 14992, + "time_per_iteration": 3.153668165206909 + }, + { + "auxiliary_loss_clip": 0.0105156, + "auxiliary_loss_mlp": 0.01026674, + "balance_loss_clip": 1.02404773, + "balance_loss_mlp": 1.01573038, + "epoch": 0.9014279272508643, + "flos": 20521224956160.0, + "grad_norm": 2.1464798684902386, + "language_loss": 0.83010501, + "learning_rate": 1.0100377661548764e-07, + "loss": 0.8508873, + "num_input_tokens_seen": 323388495, + "step": 14993, + "time_per_iteration": 2.6777021884918213 + }, + { + "auxiliary_loss_clip": 0.01061813, + "auxiliary_loss_mlp": 0.01032891, + "balance_loss_clip": 1.02473879, + "balance_loss_mlp": 1.02204943, + "epoch": 0.9014880505035322, + "flos": 17309051406720.0, + "grad_norm": 1.9461597640905546, + "language_loss": 0.731507, + "learning_rate": 1.0088161016021502e-07, + "loss": 0.75245404, + "num_input_tokens_seen": 323405280, + "step": 14994, + "time_per_iteration": 2.604355812072754 + }, + { + "auxiliary_loss_clip": 0.01048903, + "auxiliary_loss_mlp": 0.01025837, + "balance_loss_clip": 1.02397752, + "balance_loss_mlp": 1.01653266, + "epoch": 0.9015481737562002, + "flos": 28402862739840.0, + "grad_norm": 1.732497329955113, + "language_loss": 0.64654362, + "learning_rate": 1.0075951571970187e-07, + "loss": 0.66729099, + "num_input_tokens_seen": 323425310, + "step": 14995, + "time_per_iteration": 2.6922168731689453 + }, + { + "auxiliary_loss_clip": 0.0101675, + "auxiliary_loss_mlp": 0.01033978, + "balance_loss_clip": 1.01913106, + "balance_loss_mlp": 1.0225575, + "epoch": 0.9016082970088681, + "flos": 29752672953600.0, + "grad_norm": 3.33801579938564, + "language_loss": 0.66657573, + "learning_rate": 1.0063749329857873e-07, + "loss": 0.68708301, + "num_input_tokens_seen": 323447805, + "step": 14996, + "time_per_iteration": 2.8048255443573 + }, + { + "auxiliary_loss_clip": 0.01048745, + "auxiliary_loss_mlp": 0.01028281, + "balance_loss_clip": 1.02288783, + "balance_loss_mlp": 1.01885176, + "epoch": 0.9016684202615362, + "flos": 23513230091520.0, + "grad_norm": 1.6606626821440202, + "language_loss": 0.65883708, + "learning_rate": 1.0051554290147168e-07, + "loss": 0.67960739, + "num_input_tokens_seen": 323467150, + "step": 14997, + "time_per_iteration": 2.675041675567627 + }, + { + "auxiliary_loss_clip": 0.01030622, + "auxiliary_loss_mlp": 0.01031395, + "balance_loss_clip": 1.02085459, + "balance_loss_mlp": 1.02068985, + "epoch": 0.9017285435142042, + "flos": 16979247705600.0, + "grad_norm": 1.6997241009889423, + "language_loss": 0.77077949, + "learning_rate": 1.0039366453300613e-07, + "loss": 0.79139972, + "num_input_tokens_seen": 323484250, + "step": 14998, + "time_per_iteration": 2.643406629562378 + }, + { + "auxiliary_loss_clip": 0.01061085, + "auxiliary_loss_mlp": 0.01025926, + "balance_loss_clip": 1.02377737, + "balance_loss_mlp": 1.0155549, + "epoch": 0.9017886667668721, + "flos": 21393351175680.0, + "grad_norm": 1.7306372461987436, + "language_loss": 0.75024176, + "learning_rate": 1.0027185819780281e-07, + "loss": 0.77111185, + "num_input_tokens_seen": 323502910, + "step": 14999, + "time_per_iteration": 4.2373857498168945 + }, + { + "auxiliary_loss_clip": 0.01006817, + "auxiliary_loss_mlp": 0.01028146, + "balance_loss_clip": 1.02680385, + "balance_loss_mlp": 1.0171845, + "epoch": 0.9018487900195401, + "flos": 20996574566400.0, + "grad_norm": 2.0599262852840856, + "language_loss": 0.75825685, + "learning_rate": 1.0015012390048117e-07, + "loss": 0.77860647, + "num_input_tokens_seen": 323521820, + "step": 15000, + "time_per_iteration": 4.553350448608398 + }, + { + "auxiliary_loss_clip": 0.01051486, + "auxiliary_loss_mlp": 0.01026327, + "balance_loss_clip": 1.02531993, + "balance_loss_mlp": 1.01661134, + "epoch": 0.901908913272208, + "flos": 53358443458560.0, + "grad_norm": 2.0578319468303095, + "language_loss": 0.80735666, + "learning_rate": 1.0002846164565704e-07, + "loss": 0.82813478, + "num_input_tokens_seen": 323543200, + "step": 15001, + "time_per_iteration": 2.896362543106079 + }, + { + "auxiliary_loss_clip": 0.01031372, + "auxiliary_loss_mlp": 0.01027175, + "balance_loss_clip": 1.02535796, + "balance_loss_mlp": 1.01784062, + "epoch": 0.901969036524876, + "flos": 22089838867200.0, + "grad_norm": 1.4024555165615422, + "language_loss": 0.78449106, + "learning_rate": 9.990687143794407e-08, + "loss": 0.8050766, + "num_input_tokens_seen": 323563075, + "step": 15002, + "time_per_iteration": 2.727729320526123 + }, + { + "auxiliary_loss_clip": 0.01037044, + "auxiliary_loss_mlp": 0.01034066, + "balance_loss_clip": 1.0243423, + "balance_loss_mlp": 1.02142382, + "epoch": 0.9020291597775439, + "flos": 23835025059840.0, + "grad_norm": 1.9876260997162971, + "language_loss": 0.68119377, + "learning_rate": 9.978535328195347e-08, + "loss": 0.70190489, + "num_input_tokens_seen": 323579065, + "step": 15003, + "time_per_iteration": 2.6186554431915283 + }, + { + "auxiliary_loss_clip": 0.01036274, + "auxiliary_loss_mlp": 0.01035295, + "balance_loss_clip": 1.0224762, + "balance_loss_mlp": 1.0239706, + "epoch": 0.902089283030212, + "flos": 18326005263360.0, + "grad_norm": 1.7847712267836149, + "language_loss": 0.85941553, + "learning_rate": 9.9663907182292e-08, + "loss": 0.88013124, + "num_input_tokens_seen": 323594835, + "step": 15004, + "time_per_iteration": 2.6058285236358643 + }, + { + "auxiliary_loss_clip": 0.0102747, + "auxiliary_loss_mlp": 0.01031477, + "balance_loss_clip": 1.02363944, + "balance_loss_mlp": 1.02074862, + "epoch": 0.9021494062828799, + "flos": 24170359455360.0, + "grad_norm": 2.6153909594949134, + "language_loss": 0.72837692, + "learning_rate": 9.954253314356575e-08, + "loss": 0.74896634, + "num_input_tokens_seen": 323611475, + "step": 15005, + "time_per_iteration": 2.754046678543091 + }, + { + "auxiliary_loss_clip": 0.01051469, + "auxiliary_loss_mlp": 0.01026781, + "balance_loss_clip": 1.02204776, + "balance_loss_mlp": 1.01583195, + "epoch": 0.9022095295355479, + "flos": 21616859554560.0, + "grad_norm": 1.8871442092041846, + "language_loss": 0.71003771, + "learning_rate": 9.942123117037748e-08, + "loss": 0.73082018, + "num_input_tokens_seen": 323629730, + "step": 15006, + "time_per_iteration": 2.6672091484069824 + }, + { + "auxiliary_loss_clip": 0.01038385, + "auxiliary_loss_mlp": 0.01024525, + "balance_loss_clip": 1.02261806, + "balance_loss_mlp": 1.01455903, + "epoch": 0.9022696527882158, + "flos": 18726229578240.0, + "grad_norm": 1.7734171343558065, + "language_loss": 0.84038979, + "learning_rate": 9.930000126732618e-08, + "loss": 0.8610189, + "num_input_tokens_seen": 323646000, + "step": 15007, + "time_per_iteration": 2.5763800144195557 + }, + { + "auxiliary_loss_clip": 0.01030443, + "auxiliary_loss_mlp": 0.01025834, + "balance_loss_clip": 1.02147079, + "balance_loss_mlp": 1.01515329, + "epoch": 0.9023297760408838, + "flos": 26761206522240.0, + "grad_norm": 1.4725465413564345, + "language_loss": 0.78516257, + "learning_rate": 9.917884343900928e-08, + "loss": 0.80572534, + "num_input_tokens_seen": 323667250, + "step": 15008, + "time_per_iteration": 2.6860594749450684 + }, + { + "auxiliary_loss_clip": 0.0102531, + "auxiliary_loss_mlp": 0.01026011, + "balance_loss_clip": 1.0247674, + "balance_loss_mlp": 1.01573551, + "epoch": 0.9023898992935517, + "flos": 20522553759360.0, + "grad_norm": 1.9010258658916859, + "language_loss": 0.7328375, + "learning_rate": 9.905775769002156e-08, + "loss": 0.75335073, + "num_input_tokens_seen": 323687150, + "step": 15009, + "time_per_iteration": 2.6773452758789062 + }, + { + "auxiliary_loss_clip": 0.01060312, + "auxiliary_loss_mlp": 0.01029997, + "balance_loss_clip": 1.02358699, + "balance_loss_mlp": 1.01956332, + "epoch": 0.9024500225462198, + "flos": 17456644391040.0, + "grad_norm": 1.7187313482102136, + "language_loss": 0.73215824, + "learning_rate": 9.893674402495399e-08, + "loss": 0.75306135, + "num_input_tokens_seen": 323703660, + "step": 15010, + "time_per_iteration": 2.5711095333099365 + }, + { + "auxiliary_loss_clip": 0.01043624, + "auxiliary_loss_mlp": 0.01028231, + "balance_loss_clip": 1.02616668, + "balance_loss_mlp": 1.01726949, + "epoch": 0.9025101457988878, + "flos": 20813609664000.0, + "grad_norm": 1.979953827327476, + "language_loss": 0.74393666, + "learning_rate": 9.881580244839538e-08, + "loss": 0.76465523, + "num_input_tokens_seen": 323722060, + "step": 15011, + "time_per_iteration": 2.7736639976501465 + }, + { + "auxiliary_loss_clip": 0.01051593, + "auxiliary_loss_mlp": 0.01029756, + "balance_loss_clip": 1.02427769, + "balance_loss_mlp": 1.01896715, + "epoch": 0.9025702690515557, + "flos": 19026371623680.0, + "grad_norm": 4.2197411082632765, + "language_loss": 0.7313695, + "learning_rate": 9.869493296493204e-08, + "loss": 0.75218296, + "num_input_tokens_seen": 323740645, + "step": 15012, + "time_per_iteration": 2.6039741039276123 + }, + { + "auxiliary_loss_clip": 0.01032585, + "auxiliary_loss_mlp": 0.01031992, + "balance_loss_clip": 1.02531004, + "balance_loss_mlp": 1.02202642, + "epoch": 0.9026303923042237, + "flos": 19682818629120.0, + "grad_norm": 1.5964662798570006, + "language_loss": 0.69287181, + "learning_rate": 9.857413557914763e-08, + "loss": 0.71351761, + "num_input_tokens_seen": 323758905, + "step": 15013, + "time_per_iteration": 2.7433021068573 + }, + { + "auxiliary_loss_clip": 0.01050205, + "auxiliary_loss_mlp": 0.01029457, + "balance_loss_clip": 1.02485108, + "balance_loss_mlp": 1.01985478, + "epoch": 0.9026905155568916, + "flos": 24608110504320.0, + "grad_norm": 1.3574584707009087, + "language_loss": 0.73127806, + "learning_rate": 9.845341029562249e-08, + "loss": 0.75207472, + "num_input_tokens_seen": 323780595, + "step": 15014, + "time_per_iteration": 2.6454060077667236 + }, + { + "auxiliary_loss_clip": 0.01061225, + "auxiliary_loss_mlp": 0.01028098, + "balance_loss_clip": 1.02461004, + "balance_loss_mlp": 1.0177927, + "epoch": 0.9027506388095596, + "flos": 20521799573760.0, + "grad_norm": 4.7970079007819, + "language_loss": 0.72097838, + "learning_rate": 9.833275711893474e-08, + "loss": 0.7418716, + "num_input_tokens_seen": 323798160, + "step": 15015, + "time_per_iteration": 2.604318618774414 + }, + { + "auxiliary_loss_clip": 0.01038988, + "auxiliary_loss_mlp": 0.01030618, + "balance_loss_clip": 1.02344036, + "balance_loss_mlp": 1.02024066, + "epoch": 0.9028107620622275, + "flos": 22784494965120.0, + "grad_norm": 1.7819558130611375, + "language_loss": 0.6874299, + "learning_rate": 9.821217605365895e-08, + "loss": 0.70812595, + "num_input_tokens_seen": 323816810, + "step": 15016, + "time_per_iteration": 2.6910011768341064 + }, + { + "auxiliary_loss_clip": 0.0105964, + "auxiliary_loss_mlp": 0.01026617, + "balance_loss_clip": 1.02400565, + "balance_loss_mlp": 1.01753879, + "epoch": 0.9028708853148956, + "flos": 25410534382080.0, + "grad_norm": 2.5561776476314964, + "language_loss": 0.70637667, + "learning_rate": 9.809166710436855e-08, + "loss": 0.72723919, + "num_input_tokens_seen": 323836900, + "step": 15017, + "time_per_iteration": 4.397871732711792 + }, + { + "auxiliary_loss_clip": 0.01044272, + "auxiliary_loss_mlp": 0.01029386, + "balance_loss_clip": 1.02787352, + "balance_loss_mlp": 1.01945615, + "epoch": 0.9029310085675635, + "flos": 21871322478720.0, + "grad_norm": 2.014707720605099, + "language_loss": 0.69363058, + "learning_rate": 9.797123027563237e-08, + "loss": 0.71436715, + "num_input_tokens_seen": 323855325, + "step": 15018, + "time_per_iteration": 2.6981265544891357 + }, + { + "auxiliary_loss_clip": 0.01052485, + "auxiliary_loss_mlp": 0.01027508, + "balance_loss_clip": 1.0257107, + "balance_loss_mlp": 1.01651692, + "epoch": 0.9029911318202315, + "flos": 26214394803840.0, + "grad_norm": 1.714887782954769, + "language_loss": 0.69102198, + "learning_rate": 9.785086557201782e-08, + "loss": 0.71182191, + "num_input_tokens_seen": 323875650, + "step": 15019, + "time_per_iteration": 2.701007604598999 + }, + { + "auxiliary_loss_clip": 0.01059092, + "auxiliary_loss_mlp": 0.01027228, + "balance_loss_clip": 1.02397537, + "balance_loss_mlp": 1.01730943, + "epoch": 0.9030512550728994, + "flos": 15961360095360.0, + "grad_norm": 2.0872569759513553, + "language_loss": 0.71896541, + "learning_rate": 9.773057299808951e-08, + "loss": 0.73982859, + "num_input_tokens_seen": 323892920, + "step": 15020, + "time_per_iteration": 2.561403751373291 + }, + { + "auxiliary_loss_clip": 0.01045598, + "auxiliary_loss_mlp": 0.01030533, + "balance_loss_clip": 1.0227325, + "balance_loss_mlp": 1.01969695, + "epoch": 0.9031113783255674, + "flos": 23987610034560.0, + "grad_norm": 1.5714706046406945, + "language_loss": 0.74420768, + "learning_rate": 9.7610352558408e-08, + "loss": 0.76496899, + "num_input_tokens_seen": 323913835, + "step": 15021, + "time_per_iteration": 2.7771193981170654 + }, + { + "auxiliary_loss_clip": 0.01065399, + "auxiliary_loss_mlp": 0.01026061, + "balance_loss_clip": 1.02597177, + "balance_loss_mlp": 1.01506424, + "epoch": 0.9031715015782353, + "flos": 22237216369920.0, + "grad_norm": 1.9586955568516953, + "language_loss": 0.72863507, + "learning_rate": 9.749020425753251e-08, + "loss": 0.74954969, + "num_input_tokens_seen": 323933440, + "step": 15022, + "time_per_iteration": 2.7999343872070312 + }, + { + "auxiliary_loss_clip": 0.01025111, + "auxiliary_loss_mlp": 0.01025861, + "balance_loss_clip": 1.02291763, + "balance_loss_mlp": 1.0158236, + "epoch": 0.9032316248309034, + "flos": 26323168164480.0, + "grad_norm": 2.515441861968261, + "language_loss": 0.72356892, + "learning_rate": 9.737012810001943e-08, + "loss": 0.7440787, + "num_input_tokens_seen": 323954090, + "step": 15023, + "time_per_iteration": 2.76374888420105 + }, + { + "auxiliary_loss_clip": 0.01051486, + "auxiliary_loss_mlp": 0.01027121, + "balance_loss_clip": 1.02538311, + "balance_loss_mlp": 1.0170542, + "epoch": 0.9032917480835713, + "flos": 22636686499200.0, + "grad_norm": 2.0169478008639725, + "language_loss": 0.8262502, + "learning_rate": 9.725012409042155e-08, + "loss": 0.8470363, + "num_input_tokens_seen": 323974040, + "step": 15024, + "time_per_iteration": 2.6221678256988525 + }, + { + "auxiliary_loss_clip": 0.01052588, + "auxiliary_loss_mlp": 0.01026616, + "balance_loss_clip": 1.0245111, + "balance_loss_mlp": 1.01631629, + "epoch": 0.9033518713362393, + "flos": 23878764846720.0, + "grad_norm": 1.5961187831858272, + "language_loss": 0.69560993, + "learning_rate": 9.713019223328966e-08, + "loss": 0.71640199, + "num_input_tokens_seen": 323996125, + "step": 15025, + "time_per_iteration": 2.6267597675323486 + }, + { + "auxiliary_loss_clip": 0.01028991, + "auxiliary_loss_mlp": 0.01029843, + "balance_loss_clip": 1.02325046, + "balance_loss_mlp": 1.0199548, + "epoch": 0.9034119945889073, + "flos": 26905279973760.0, + "grad_norm": 1.6662951976914997, + "language_loss": 0.77116621, + "learning_rate": 9.70103325331717e-08, + "loss": 0.7917546, + "num_input_tokens_seen": 324017645, + "step": 15026, + "time_per_iteration": 2.7173783779144287 + }, + { + "auxiliary_loss_clip": 0.01051594, + "auxiliary_loss_mlp": 0.01025394, + "balance_loss_clip": 1.02507257, + "balance_loss_mlp": 1.01587522, + "epoch": 0.9034721178415752, + "flos": 20850166730880.0, + "grad_norm": 2.126175992142638, + "language_loss": 0.68611783, + "learning_rate": 9.68905449946129e-08, + "loss": 0.70688772, + "num_input_tokens_seen": 324036875, + "step": 15027, + "time_per_iteration": 2.6108508110046387 + }, + { + "auxiliary_loss_clip": 0.01006257, + "auxiliary_loss_mlp": 0.01032556, + "balance_loss_clip": 1.02151692, + "balance_loss_mlp": 1.02107596, + "epoch": 0.9035322410942432, + "flos": 22234307368320.0, + "grad_norm": 1.554966738183035, + "language_loss": 0.76001441, + "learning_rate": 9.677082962215477e-08, + "loss": 0.78040254, + "num_input_tokens_seen": 324057045, + "step": 15028, + "time_per_iteration": 2.759634017944336 + }, + { + "auxiliary_loss_clip": 0.01010161, + "auxiliary_loss_mlp": 0.01032888, + "balance_loss_clip": 1.02251625, + "balance_loss_mlp": 1.02221298, + "epoch": 0.9035923643469111, + "flos": 25923410726400.0, + "grad_norm": 1.7428018896902187, + "language_loss": 0.69119406, + "learning_rate": 9.665118642033765e-08, + "loss": 0.7116245, + "num_input_tokens_seen": 324079735, + "step": 15029, + "time_per_iteration": 2.8315553665161133 + }, + { + "auxiliary_loss_clip": 0.01055639, + "auxiliary_loss_mlp": 0.01029722, + "balance_loss_clip": 1.02671218, + "balance_loss_mlp": 1.01846242, + "epoch": 0.9036524875995792, + "flos": 20339804338560.0, + "grad_norm": 2.149832530286969, + "language_loss": 0.73707503, + "learning_rate": 9.653161539369858e-08, + "loss": 0.75792861, + "num_input_tokens_seen": 324097785, + "step": 15030, + "time_per_iteration": 2.632308006286621 + }, + { + "auxiliary_loss_clip": 0.0105498, + "auxiliary_loss_mlp": 0.01027306, + "balance_loss_clip": 1.02632475, + "balance_loss_mlp": 1.0168035, + "epoch": 0.9037126108522471, + "flos": 40114624677120.0, + "grad_norm": 1.9509808056190088, + "language_loss": 0.68122971, + "learning_rate": 9.641211654677151e-08, + "loss": 0.70205259, + "num_input_tokens_seen": 324121625, + "step": 15031, + "time_per_iteration": 2.8309640884399414 + }, + { + "auxiliary_loss_clip": 0.01040255, + "auxiliary_loss_mlp": 0.01023665, + "balance_loss_clip": 1.02456987, + "balance_loss_mlp": 1.01399767, + "epoch": 0.9037727341049151, + "flos": 23332024955520.0, + "grad_norm": 1.5805542394799414, + "language_loss": 0.76457, + "learning_rate": 9.629268988408723e-08, + "loss": 0.78520924, + "num_input_tokens_seen": 324142535, + "step": 15032, + "time_per_iteration": 2.7202999591827393 + }, + { + "auxiliary_loss_clip": 0.01061592, + "auxiliary_loss_mlp": 0.01029656, + "balance_loss_clip": 1.02433848, + "balance_loss_mlp": 1.01926732, + "epoch": 0.903832857357583, + "flos": 12822659815680.0, + "grad_norm": 1.9264225563001913, + "language_loss": 0.75161672, + "learning_rate": 9.617333541017502e-08, + "loss": 0.77252924, + "num_input_tokens_seen": 324159610, + "step": 15033, + "time_per_iteration": 2.6424806118011475 + }, + { + "auxiliary_loss_clip": 0.01022719, + "auxiliary_loss_mlp": 0.01031763, + "balance_loss_clip": 1.02158844, + "balance_loss_mlp": 1.02026486, + "epoch": 0.903892980610251, + "flos": 25703026830720.0, + "grad_norm": 1.8246600848297139, + "language_loss": 0.74112964, + "learning_rate": 9.605405312956105e-08, + "loss": 0.7616744, + "num_input_tokens_seen": 324182510, + "step": 15034, + "time_per_iteration": 4.521989345550537 + }, + { + "auxiliary_loss_clip": 0.010266, + "auxiliary_loss_mlp": 0.01030852, + "balance_loss_clip": 1.02367926, + "balance_loss_mlp": 1.02023673, + "epoch": 0.9039531038629189, + "flos": 14684089397760.0, + "grad_norm": 1.5846713358651316, + "language_loss": 0.6337539, + "learning_rate": 9.593484304676791e-08, + "loss": 0.65432847, + "num_input_tokens_seen": 324200555, + "step": 15035, + "time_per_iteration": 2.6898281574249268 + }, + { + "auxiliary_loss_clip": 0.01062945, + "auxiliary_loss_mlp": 0.01027002, + "balance_loss_clip": 1.0260098, + "balance_loss_mlp": 1.01556993, + "epoch": 0.904013227115587, + "flos": 24024921287040.0, + "grad_norm": 3.0848028196843984, + "language_loss": 0.62951422, + "learning_rate": 9.581570516631643e-08, + "loss": 0.65041375, + "num_input_tokens_seen": 324220255, + "step": 15036, + "time_per_iteration": 2.7829630374908447 + }, + { + "auxiliary_loss_clip": 0.01011528, + "auxiliary_loss_mlp": 0.01025099, + "balance_loss_clip": 1.02439809, + "balance_loss_mlp": 1.0153296, + "epoch": 0.9040733503682549, + "flos": 22856459863680.0, + "grad_norm": 1.5779362514580908, + "language_loss": 0.82202971, + "learning_rate": 9.569663949272455e-08, + "loss": 0.84239602, + "num_input_tokens_seen": 324237855, + "step": 15037, + "time_per_iteration": 2.8286850452423096 + }, + { + "auxiliary_loss_clip": 0.01063092, + "auxiliary_loss_mlp": 0.01026257, + "balance_loss_clip": 1.02510834, + "balance_loss_mlp": 1.0157007, + "epoch": 0.9041334736209229, + "flos": 19974951941760.0, + "grad_norm": 2.2013009058223196, + "language_loss": 0.67459559, + "learning_rate": 9.557764603050667e-08, + "loss": 0.69548905, + "num_input_tokens_seen": 324257050, + "step": 15038, + "time_per_iteration": 2.7551321983337402 + }, + { + "auxiliary_loss_clip": 0.01038728, + "auxiliary_loss_mlp": 0.01031252, + "balance_loss_clip": 1.02265811, + "balance_loss_mlp": 1.02070808, + "epoch": 0.9041935968735909, + "flos": 17530548624000.0, + "grad_norm": 2.03294744483898, + "language_loss": 0.75131357, + "learning_rate": 9.545872478417494e-08, + "loss": 0.77201343, + "num_input_tokens_seen": 324275510, + "step": 15039, + "time_per_iteration": 2.6994478702545166 + }, + { + "auxiliary_loss_clip": 0.01039459, + "auxiliary_loss_mlp": 0.0102769, + "balance_loss_clip": 1.02369118, + "balance_loss_mlp": 1.01817155, + "epoch": 0.9042537201262588, + "flos": 22780149419520.0, + "grad_norm": 1.543288691013273, + "language_loss": 0.7012769, + "learning_rate": 9.533987575823977e-08, + "loss": 0.72194839, + "num_input_tokens_seen": 324295150, + "step": 15040, + "time_per_iteration": 2.7858493328094482 + }, + { + "auxiliary_loss_clip": 0.01027186, + "auxiliary_loss_mlp": 0.01026304, + "balance_loss_clip": 1.02240729, + "balance_loss_mlp": 1.01623034, + "epoch": 0.9043138433789268, + "flos": 20595416497920.0, + "grad_norm": 1.9692170319297027, + "language_loss": 0.67562789, + "learning_rate": 9.522109895720709e-08, + "loss": 0.6961627, + "num_input_tokens_seen": 324313855, + "step": 15041, + "time_per_iteration": 2.7988948822021484 + }, + { + "auxiliary_loss_clip": 0.01051509, + "auxiliary_loss_mlp": 0.01028303, + "balance_loss_clip": 1.02465296, + "balance_loss_mlp": 1.01770508, + "epoch": 0.9043739666315948, + "flos": 32962978995840.0, + "grad_norm": 2.591642750543204, + "language_loss": 0.57467407, + "learning_rate": 9.510239438558155e-08, + "loss": 0.59547222, + "num_input_tokens_seen": 324338465, + "step": 15042, + "time_per_iteration": 2.7627248764038086 + }, + { + "auxiliary_loss_clip": 0.00987663, + "auxiliary_loss_mlp": 0.00746471, + "balance_loss_clip": 1.00250387, + "balance_loss_mlp": 1.00038397, + "epoch": 0.9044340898842628, + "flos": 67296418525440.0, + "grad_norm": 0.7853199249721378, + "language_loss": 0.56981087, + "learning_rate": 9.498376204786351e-08, + "loss": 0.58715224, + "num_input_tokens_seen": 324398740, + "step": 15043, + "time_per_iteration": 3.2763278484344482 + }, + { + "auxiliary_loss_clip": 0.01041471, + "auxiliary_loss_mlp": 0.01027723, + "balance_loss_clip": 1.02396393, + "balance_loss_mlp": 1.01698256, + "epoch": 0.9044942131369307, + "flos": 17713154390400.0, + "grad_norm": 1.7172093631006544, + "language_loss": 0.70107377, + "learning_rate": 9.486520194855274e-08, + "loss": 0.72176576, + "num_input_tokens_seen": 324417335, + "step": 15044, + "time_per_iteration": 3.021160125732422 + }, + { + "auxiliary_loss_clip": 0.01045533, + "auxiliary_loss_mlp": 0.01033785, + "balance_loss_clip": 1.02673209, + "balance_loss_mlp": 1.02247167, + "epoch": 0.9045543363895987, + "flos": 17820563034240.0, + "grad_norm": 2.6292740429110943, + "language_loss": 0.69098085, + "learning_rate": 9.474671409214407e-08, + "loss": 0.71177399, + "num_input_tokens_seen": 324433240, + "step": 15045, + "time_per_iteration": 2.754255533218384 + }, + { + "auxiliary_loss_clip": 0.01030758, + "auxiliary_loss_mlp": 0.01031531, + "balance_loss_clip": 1.02443087, + "balance_loss_mlp": 1.02040935, + "epoch": 0.9046144596422666, + "flos": 21872723109120.0, + "grad_norm": 2.0158316366518365, + "language_loss": 0.65926826, + "learning_rate": 9.462829848313081e-08, + "loss": 0.67989117, + "num_input_tokens_seen": 324452675, + "step": 15046, + "time_per_iteration": 2.6748251914978027 + }, + { + "auxiliary_loss_clip": 0.01031215, + "auxiliary_loss_mlp": 0.01032018, + "balance_loss_clip": 1.02367139, + "balance_loss_mlp": 1.02168822, + "epoch": 0.9046745828949346, + "flos": 17672646827520.0, + "grad_norm": 2.1447715152945084, + "language_loss": 0.61750478, + "learning_rate": 9.450995512600379e-08, + "loss": 0.6381371, + "num_input_tokens_seen": 324467865, + "step": 15047, + "time_per_iteration": 4.584944725036621 + }, + { + "auxiliary_loss_clip": 0.01062641, + "auxiliary_loss_mlp": 0.00747616, + "balance_loss_clip": 1.0265708, + "balance_loss_mlp": 1.00041628, + "epoch": 0.9047347061476025, + "flos": 25702559953920.0, + "grad_norm": 1.8589431783807062, + "language_loss": 0.71299291, + "learning_rate": 9.439168402525032e-08, + "loss": 0.73109543, + "num_input_tokens_seen": 324490430, + "step": 15048, + "time_per_iteration": 4.2350311279296875 + }, + { + "auxiliary_loss_clip": 0.01050363, + "auxiliary_loss_mlp": 0.01026492, + "balance_loss_clip": 1.02202296, + "balance_loss_mlp": 1.01546502, + "epoch": 0.9047948294002706, + "flos": 15158146118400.0, + "grad_norm": 2.5287568049113385, + "language_loss": 0.74923855, + "learning_rate": 9.427348518535483e-08, + "loss": 0.77000713, + "num_input_tokens_seen": 324506620, + "step": 15049, + "time_per_iteration": 2.6023874282836914 + }, + { + "auxiliary_loss_clip": 0.01052063, + "auxiliary_loss_mlp": 0.01028048, + "balance_loss_clip": 1.02666724, + "balance_loss_mlp": 1.01783168, + "epoch": 0.9048549526529385, + "flos": 21872292145920.0, + "grad_norm": 1.8492295672536347, + "language_loss": 0.7551688, + "learning_rate": 9.415535861079993e-08, + "loss": 0.77596986, + "num_input_tokens_seen": 324525505, + "step": 15050, + "time_per_iteration": 2.6392931938171387 + }, + { + "auxiliary_loss_clip": 0.01061663, + "auxiliary_loss_mlp": 0.00747618, + "balance_loss_clip": 1.02485061, + "balance_loss_mlp": 1.00041091, + "epoch": 0.9049150759056065, + "flos": 23546626761600.0, + "grad_norm": 1.654512878718559, + "language_loss": 0.81708699, + "learning_rate": 9.403730430606472e-08, + "loss": 0.83517981, + "num_input_tokens_seen": 324544415, + "step": 15051, + "time_per_iteration": 2.6642568111419678 + }, + { + "auxiliary_loss_clip": 0.01051373, + "auxiliary_loss_mlp": 0.01027472, + "balance_loss_clip": 1.02517557, + "balance_loss_mlp": 1.01795328, + "epoch": 0.9049751991582745, + "flos": 19645902426240.0, + "grad_norm": 2.0939840801816585, + "language_loss": 0.88985705, + "learning_rate": 9.391932227562582e-08, + "loss": 0.91064548, + "num_input_tokens_seen": 324562555, + "step": 15052, + "time_per_iteration": 2.716334819793701 + }, + { + "auxiliary_loss_clip": 0.01052184, + "auxiliary_loss_mlp": 0.01032211, + "balance_loss_clip": 1.02504015, + "balance_loss_mlp": 1.02173197, + "epoch": 0.9050353224109424, + "flos": 15596220389760.0, + "grad_norm": 2.15414953228031, + "language_loss": 0.7701422, + "learning_rate": 9.380141252395724e-08, + "loss": 0.79098612, + "num_input_tokens_seen": 324580865, + "step": 15053, + "time_per_iteration": 2.757648468017578 + }, + { + "auxiliary_loss_clip": 0.01049484, + "auxiliary_loss_mlp": 0.01034368, + "balance_loss_clip": 1.02370453, + "balance_loss_mlp": 1.02420509, + "epoch": 0.9050954456636104, + "flos": 28183592165760.0, + "grad_norm": 1.4428929450932397, + "language_loss": 0.72576493, + "learning_rate": 9.368357505553049e-08, + "loss": 0.74660337, + "num_input_tokens_seen": 324600665, + "step": 15054, + "time_per_iteration": 2.885680913925171 + }, + { + "auxiliary_loss_clip": 0.01006573, + "auxiliary_loss_mlp": 0.01030677, + "balance_loss_clip": 1.01966214, + "balance_loss_mlp": 1.01987052, + "epoch": 0.9051555689162784, + "flos": 25731611078400.0, + "grad_norm": 1.9667427311223367, + "language_loss": 0.83339274, + "learning_rate": 9.356580987481333e-08, + "loss": 0.85376525, + "num_input_tokens_seen": 324618145, + "step": 15055, + "time_per_iteration": 2.913789749145508 + }, + { + "auxiliary_loss_clip": 0.01051223, + "auxiliary_loss_mlp": 0.0103171, + "balance_loss_clip": 1.02585864, + "balance_loss_mlp": 1.02127349, + "epoch": 0.9052156921689464, + "flos": 23257258796160.0, + "grad_norm": 1.8569934728000241, + "language_loss": 0.84883654, + "learning_rate": 9.344811698627176e-08, + "loss": 0.86966586, + "num_input_tokens_seen": 324638165, + "step": 15056, + "time_per_iteration": 2.6342124938964844 + }, + { + "auxiliary_loss_clip": 0.0103155, + "auxiliary_loss_mlp": 0.01027283, + "balance_loss_clip": 1.02363443, + "balance_loss_mlp": 1.01745999, + "epoch": 0.9052758154216143, + "flos": 29564285097600.0, + "grad_norm": 1.73294811171556, + "language_loss": 0.7145583, + "learning_rate": 9.333049639436863e-08, + "loss": 0.73514664, + "num_input_tokens_seen": 324658560, + "step": 15057, + "time_per_iteration": 2.7344250679016113 + }, + { + "auxiliary_loss_clip": 0.01046842, + "auxiliary_loss_mlp": 0.01026849, + "balance_loss_clip": 1.02280927, + "balance_loss_mlp": 1.0169189, + "epoch": 0.9053359386742823, + "flos": 22127688823680.0, + "grad_norm": 1.6025192308022824, + "language_loss": 0.80673963, + "learning_rate": 9.321294810356418e-08, + "loss": 0.82747656, + "num_input_tokens_seen": 324679185, + "step": 15058, + "time_per_iteration": 2.6657583713531494 + }, + { + "auxiliary_loss_clip": 0.00996699, + "auxiliary_loss_mlp": 0.01001001, + "balance_loss_clip": 1.00197124, + "balance_loss_mlp": 1.00012445, + "epoch": 0.9053960619269502, + "flos": 67090112760960.0, + "grad_norm": 0.6697045862262182, + "language_loss": 0.51398093, + "learning_rate": 9.309547211831592e-08, + "loss": 0.53395796, + "num_input_tokens_seen": 324744830, + "step": 15059, + "time_per_iteration": 3.358219861984253 + }, + { + "auxiliary_loss_clip": 0.01027319, + "auxiliary_loss_mlp": 0.010266, + "balance_loss_clip": 1.02872396, + "balance_loss_mlp": 1.01630092, + "epoch": 0.9054561851796182, + "flos": 15815419136640.0, + "grad_norm": 1.8647465773358358, + "language_loss": 0.66908991, + "learning_rate": 9.297806844307831e-08, + "loss": 0.6896292, + "num_input_tokens_seen": 324762905, + "step": 15060, + "time_per_iteration": 2.884075880050659 + }, + { + "auxiliary_loss_clip": 0.01043417, + "auxiliary_loss_mlp": 0.01029404, + "balance_loss_clip": 1.02511024, + "balance_loss_mlp": 1.01877642, + "epoch": 0.9055163084322861, + "flos": 17566997950080.0, + "grad_norm": 2.0863555867037022, + "language_loss": 0.63864493, + "learning_rate": 9.286073708230357e-08, + "loss": 0.65937316, + "num_input_tokens_seen": 324781905, + "step": 15061, + "time_per_iteration": 2.7323124408721924 + }, + { + "auxiliary_loss_clip": 0.01035976, + "auxiliary_loss_mlp": 0.01033453, + "balance_loss_clip": 1.02382827, + "balance_loss_mlp": 1.02259934, + "epoch": 0.9055764316849542, + "flos": 17639573379840.0, + "grad_norm": 1.8460000428047414, + "language_loss": 0.71686381, + "learning_rate": 9.274347804044058e-08, + "loss": 0.73755813, + "num_input_tokens_seen": 324799260, + "step": 15062, + "time_per_iteration": 2.878058910369873 + }, + { + "auxiliary_loss_clip": 0.01060265, + "auxiliary_loss_mlp": 0.01030565, + "balance_loss_clip": 1.0241456, + "balance_loss_mlp": 1.02071285, + "epoch": 0.9056365549376221, + "flos": 20120856986880.0, + "grad_norm": 2.2187030831650083, + "language_loss": 0.70870197, + "learning_rate": 9.2626291321936e-08, + "loss": 0.72961032, + "num_input_tokens_seen": 324817800, + "step": 15063, + "time_per_iteration": 2.644888401031494 + }, + { + "auxiliary_loss_clip": 0.01020244, + "auxiliary_loss_mlp": 0.01026438, + "balance_loss_clip": 1.02286696, + "balance_loss_mlp": 1.01670432, + "epoch": 0.9056966781902901, + "flos": 27598786836480.0, + "grad_norm": 1.5600401633890468, + "language_loss": 0.72442567, + "learning_rate": 9.250917693123406e-08, + "loss": 0.74489248, + "num_input_tokens_seen": 324838445, + "step": 15064, + "time_per_iteration": 4.484559774398804 + }, + { + "auxiliary_loss_clip": 0.01053564, + "auxiliary_loss_mlp": 0.01027339, + "balance_loss_clip": 1.02479458, + "balance_loss_mlp": 1.01729012, + "epoch": 0.9057568014429581, + "flos": 25920106675200.0, + "grad_norm": 1.7788107095613368, + "language_loss": 0.69646788, + "learning_rate": 9.23921348727752e-08, + "loss": 0.71727693, + "num_input_tokens_seen": 324859895, + "step": 15065, + "time_per_iteration": 2.7170374393463135 + }, + { + "auxiliary_loss_clip": 0.0102736, + "auxiliary_loss_mlp": 0.0103339, + "balance_loss_clip": 1.02340949, + "balance_loss_mlp": 1.02359104, + "epoch": 0.905816924695626, + "flos": 22930364096640.0, + "grad_norm": 1.606340569494364, + "language_loss": 0.63176572, + "learning_rate": 9.227516515099743e-08, + "loss": 0.65237319, + "num_input_tokens_seen": 324879580, + "step": 15066, + "time_per_iteration": 2.699868679046631 + }, + { + "auxiliary_loss_clip": 0.00985717, + "auxiliary_loss_mlp": 0.01029263, + "balance_loss_clip": 1.01890659, + "balance_loss_mlp": 1.01766396, + "epoch": 0.905877047948294, + "flos": 22157422306560.0, + "grad_norm": 1.9671780043143772, + "language_loss": 0.80131984, + "learning_rate": 9.215826777033675e-08, + "loss": 0.82146966, + "num_input_tokens_seen": 324898950, + "step": 15067, + "time_per_iteration": 2.9359896183013916 + }, + { + "auxiliary_loss_clip": 0.01044498, + "auxiliary_loss_mlp": 0.01031045, + "balance_loss_clip": 1.02580953, + "balance_loss_mlp": 1.02005959, + "epoch": 0.905937171200962, + "flos": 15304805349120.0, + "grad_norm": 1.6127457112979249, + "language_loss": 0.69982046, + "learning_rate": 9.204144273522563e-08, + "loss": 0.72057593, + "num_input_tokens_seen": 324917455, + "step": 15068, + "time_per_iteration": 2.75345516204834 + }, + { + "auxiliary_loss_clip": 0.01058307, + "auxiliary_loss_mlp": 0.0102542, + "balance_loss_clip": 1.0230062, + "balance_loss_mlp": 1.01546049, + "epoch": 0.90599729445363, + "flos": 19462973437440.0, + "grad_norm": 1.8277613767148972, + "language_loss": 0.85457915, + "learning_rate": 9.19246900500943e-08, + "loss": 0.8754164, + "num_input_tokens_seen": 324934495, + "step": 15069, + "time_per_iteration": 2.6643247604370117 + }, + { + "auxiliary_loss_clip": 0.01051841, + "auxiliary_loss_mlp": 0.01029413, + "balance_loss_clip": 1.02388954, + "balance_loss_mlp": 1.01789784, + "epoch": 0.9060574177062979, + "flos": 23732967542400.0, + "grad_norm": 2.5349305158077695, + "language_loss": 0.59319806, + "learning_rate": 9.180800971936987e-08, + "loss": 0.61401063, + "num_input_tokens_seen": 324953230, + "step": 15070, + "time_per_iteration": 2.6390371322631836 + }, + { + "auxiliary_loss_clip": 0.01033637, + "auxiliary_loss_mlp": 0.01021232, + "balance_loss_clip": 1.02485752, + "balance_loss_mlp": 1.01012778, + "epoch": 0.9061175409589659, + "flos": 17311134395520.0, + "grad_norm": 1.9545482828858107, + "language_loss": 0.8164497, + "learning_rate": 9.169140174747724e-08, + "loss": 0.83699834, + "num_input_tokens_seen": 324969880, + "step": 15071, + "time_per_iteration": 2.6805551052093506 + }, + { + "auxiliary_loss_clip": 0.01065149, + "auxiliary_loss_mlp": 0.01037206, + "balance_loss_clip": 1.02546453, + "balance_loss_mlp": 1.02589893, + "epoch": 0.9061776642116338, + "flos": 17778439359360.0, + "grad_norm": 1.7538516092619127, + "language_loss": 0.61646318, + "learning_rate": 9.157486613883758e-08, + "loss": 0.63748682, + "num_input_tokens_seen": 324987005, + "step": 15072, + "time_per_iteration": 2.578376293182373 + }, + { + "auxiliary_loss_clip": 0.0103956, + "auxiliary_loss_mlp": 0.01029497, + "balance_loss_clip": 1.02250195, + "balance_loss_mlp": 1.01923871, + "epoch": 0.9062377874643018, + "flos": 42777688037760.0, + "grad_norm": 1.8863774064894776, + "language_loss": 0.73065221, + "learning_rate": 9.145840289787021e-08, + "loss": 0.75134271, + "num_input_tokens_seen": 325010700, + "step": 15073, + "time_per_iteration": 2.8535022735595703 + }, + { + "auxiliary_loss_clip": 0.01049452, + "auxiliary_loss_mlp": 0.01024775, + "balance_loss_clip": 1.02462053, + "balance_loss_mlp": 1.01524401, + "epoch": 0.9062979107169697, + "flos": 16361620323840.0, + "grad_norm": 1.9914954169868977, + "language_loss": 0.80786967, + "learning_rate": 9.134201202899161e-08, + "loss": 0.82861191, + "num_input_tokens_seen": 325028760, + "step": 15074, + "time_per_iteration": 2.6489272117614746 + }, + { + "auxiliary_loss_clip": 0.00961401, + "auxiliary_loss_mlp": 0.00746573, + "balance_loss_clip": 1.00534081, + "balance_loss_mlp": 1.00044775, + "epoch": 0.9063580339696378, + "flos": 69313988528640.0, + "grad_norm": 0.737443348738882, + "language_loss": 0.5236882, + "learning_rate": 9.122569353661513e-08, + "loss": 0.54076791, + "num_input_tokens_seen": 325093545, + "step": 15075, + "time_per_iteration": 3.5017831325531006 + }, + { + "auxiliary_loss_clip": 0.00979834, + "auxiliary_loss_mlp": 0.01002436, + "balance_loss_clip": 1.00344515, + "balance_loss_mlp": 1.00155985, + "epoch": 0.9064181572223057, + "flos": 58794747148800.0, + "grad_norm": 1.0102097014470441, + "language_loss": 0.62130737, + "learning_rate": 9.11094474251517e-08, + "loss": 0.64113009, + "num_input_tokens_seen": 325152295, + "step": 15076, + "time_per_iteration": 3.507546901702881 + }, + { + "auxiliary_loss_clip": 0.01050738, + "auxiliary_loss_mlp": 0.01032505, + "balance_loss_clip": 1.02424407, + "balance_loss_mlp": 1.02218735, + "epoch": 0.9064782804749737, + "flos": 21762692772480.0, + "grad_norm": 1.6969124531231417, + "language_loss": 0.82380092, + "learning_rate": 9.09932736990091e-08, + "loss": 0.84463334, + "num_input_tokens_seen": 325169705, + "step": 15077, + "time_per_iteration": 2.632159948348999 + }, + { + "auxiliary_loss_clip": 0.01035082, + "auxiliary_loss_mlp": 0.00747471, + "balance_loss_clip": 1.02123451, + "balance_loss_mlp": 1.00036573, + "epoch": 0.9065384037276417, + "flos": 21397373498880.0, + "grad_norm": 1.498928610284819, + "language_loss": 0.83804399, + "learning_rate": 9.08771723625934e-08, + "loss": 0.85586953, + "num_input_tokens_seen": 325189175, + "step": 15078, + "time_per_iteration": 2.705134391784668 + }, + { + "auxiliary_loss_clip": 0.01049466, + "auxiliary_loss_mlp": 0.00747524, + "balance_loss_clip": 1.02491188, + "balance_loss_mlp": 1.00031829, + "epoch": 0.9065985269803096, + "flos": 38283646849920.0, + "grad_norm": 1.9434593237736053, + "language_loss": 0.65361154, + "learning_rate": 9.076114342030617e-08, + "loss": 0.67158151, + "num_input_tokens_seen": 325211020, + "step": 15079, + "time_per_iteration": 2.7668087482452393 + }, + { + "auxiliary_loss_clip": 0.00987597, + "auxiliary_loss_mlp": 0.0102447, + "balance_loss_clip": 1.02150893, + "balance_loss_mlp": 1.01415205, + "epoch": 0.9066586502329776, + "flos": 44818562989440.0, + "grad_norm": 1.8622243716285785, + "language_loss": 0.71124703, + "learning_rate": 9.064518687654765e-08, + "loss": 0.73136771, + "num_input_tokens_seen": 325236970, + "step": 15080, + "time_per_iteration": 3.044719696044922 + }, + { + "auxiliary_loss_clip": 0.01055357, + "auxiliary_loss_mlp": 0.01028308, + "balance_loss_clip": 1.02732778, + "balance_loss_mlp": 1.01681662, + "epoch": 0.9067187734856456, + "flos": 18623992492800.0, + "grad_norm": 2.2862858829108754, + "language_loss": 0.71490234, + "learning_rate": 9.052930273571547e-08, + "loss": 0.73573899, + "num_input_tokens_seen": 325252670, + "step": 15081, + "time_per_iteration": 4.547373294830322 + }, + { + "auxiliary_loss_clip": 0.01043309, + "auxiliary_loss_mlp": 0.01030168, + "balance_loss_clip": 1.02621269, + "balance_loss_mlp": 1.0199461, + "epoch": 0.9067788967383136, + "flos": 22747578762240.0, + "grad_norm": 1.9343324997853706, + "language_loss": 0.74563777, + "learning_rate": 9.04134910022032e-08, + "loss": 0.76637256, + "num_input_tokens_seen": 325273860, + "step": 15082, + "time_per_iteration": 2.6970577239990234 + }, + { + "auxiliary_loss_clip": 0.01030156, + "auxiliary_loss_mlp": 0.01027534, + "balance_loss_clip": 1.02491224, + "balance_loss_mlp": 1.01819968, + "epoch": 0.9068390199909815, + "flos": 27670787648640.0, + "grad_norm": 1.7608762940410527, + "language_loss": 0.78109348, + "learning_rate": 9.029775168040266e-08, + "loss": 0.80167031, + "num_input_tokens_seen": 325294140, + "step": 15083, + "time_per_iteration": 2.7253293991088867 + }, + { + "auxiliary_loss_clip": 0.01040221, + "auxiliary_loss_mlp": 0.00747401, + "balance_loss_clip": 1.02550507, + "balance_loss_mlp": 1.00033367, + "epoch": 0.9068991432436495, + "flos": 24244012293120.0, + "grad_norm": 1.6311115700217638, + "language_loss": 0.68847549, + "learning_rate": 9.01820847747028e-08, + "loss": 0.70635176, + "num_input_tokens_seen": 325313130, + "step": 15084, + "time_per_iteration": 2.680227041244507 + }, + { + "auxiliary_loss_clip": 0.01062569, + "auxiliary_loss_mlp": 0.01027309, + "balance_loss_clip": 1.02639294, + "balance_loss_mlp": 1.01708126, + "epoch": 0.9069592664963174, + "flos": 28033305661440.0, + "grad_norm": 1.6963577265334369, + "language_loss": 0.66981518, + "learning_rate": 9.006649028948965e-08, + "loss": 0.690714, + "num_input_tokens_seen": 325334880, + "step": 15085, + "time_per_iteration": 2.6319291591644287 + }, + { + "auxiliary_loss_clip": 0.00981964, + "auxiliary_loss_mlp": 0.01002956, + "balance_loss_clip": 1.00532293, + "balance_loss_mlp": 1.00160277, + "epoch": 0.9070193897489854, + "flos": 68778414789120.0, + "grad_norm": 0.7684888365736259, + "language_loss": 0.61304343, + "learning_rate": 8.995096822914638e-08, + "loss": 0.63289261, + "num_input_tokens_seen": 325394175, + "step": 15086, + "time_per_iteration": 3.3664114475250244 + }, + { + "auxiliary_loss_clip": 0.01045678, + "auxiliary_loss_mlp": 0.01032625, + "balance_loss_clip": 1.02319455, + "balance_loss_mlp": 1.02123404, + "epoch": 0.9070795130016533, + "flos": 23441624328960.0, + "grad_norm": 1.5152115488058844, + "language_loss": 0.72135758, + "learning_rate": 8.983551859805416e-08, + "loss": 0.74214065, + "num_input_tokens_seen": 325415020, + "step": 15087, + "time_per_iteration": 2.712318181991577 + }, + { + "auxiliary_loss_clip": 0.01040721, + "auxiliary_loss_mlp": 0.01025854, + "balance_loss_clip": 1.02425551, + "balance_loss_mlp": 1.0156256, + "epoch": 0.9071396362543214, + "flos": 18916413114240.0, + "grad_norm": 2.079572194790902, + "language_loss": 0.76601171, + "learning_rate": 8.972014140059058e-08, + "loss": 0.78667754, + "num_input_tokens_seen": 325433595, + "step": 15088, + "time_per_iteration": 2.6595146656036377 + }, + { + "auxiliary_loss_clip": 0.01031175, + "auxiliary_loss_mlp": 0.01026884, + "balance_loss_clip": 1.02186298, + "balance_loss_mlp": 1.01684022, + "epoch": 0.9071997595069893, + "flos": 25228646887680.0, + "grad_norm": 2.064541359439486, + "language_loss": 0.73464763, + "learning_rate": 8.960483664113038e-08, + "loss": 0.75522816, + "num_input_tokens_seen": 325451605, + "step": 15089, + "time_per_iteration": 2.656245470046997 + }, + { + "auxiliary_loss_clip": 0.01057413, + "auxiliary_loss_mlp": 0.01028899, + "balance_loss_clip": 1.02345133, + "balance_loss_mlp": 1.01964879, + "epoch": 0.9072598827596573, + "flos": 24346608514560.0, + "grad_norm": 1.949648171074959, + "language_loss": 0.75242293, + "learning_rate": 8.948960432404628e-08, + "loss": 0.7732861, + "num_input_tokens_seen": 325470645, + "step": 15090, + "time_per_iteration": 2.5945165157318115 + }, + { + "auxiliary_loss_clip": 0.01033545, + "auxiliary_loss_mlp": 0.01028588, + "balance_loss_clip": 1.02358985, + "balance_loss_mlp": 1.01755476, + "epoch": 0.9073200060123253, + "flos": 22674967418880.0, + "grad_norm": 2.085406956009227, + "language_loss": 0.77877098, + "learning_rate": 8.93744444537079e-08, + "loss": 0.79939234, + "num_input_tokens_seen": 325488070, + "step": 15091, + "time_per_iteration": 2.69290828704834 + }, + { + "auxiliary_loss_clip": 0.01034424, + "auxiliary_loss_mlp": 0.01025681, + "balance_loss_clip": 1.02217531, + "balance_loss_mlp": 1.01636517, + "epoch": 0.9073801292649932, + "flos": 23695476721920.0, + "grad_norm": 1.5078632437412998, + "language_loss": 0.86126113, + "learning_rate": 8.925935703448217e-08, + "loss": 0.88186216, + "num_input_tokens_seen": 325509285, + "step": 15092, + "time_per_iteration": 2.8010358810424805 + }, + { + "auxiliary_loss_clip": 0.01042136, + "auxiliary_loss_mlp": 0.01029085, + "balance_loss_clip": 1.02687037, + "balance_loss_mlp": 1.01874948, + "epoch": 0.9074402525176612, + "flos": 25375413859200.0, + "grad_norm": 2.494503035800966, + "language_loss": 0.7886197, + "learning_rate": 8.914434207073296e-08, + "loss": 0.80933189, + "num_input_tokens_seen": 325529360, + "step": 15093, + "time_per_iteration": 2.7270467281341553 + }, + { + "auxiliary_loss_clip": 0.00997539, + "auxiliary_loss_mlp": 0.01000743, + "balance_loss_clip": 1.00219011, + "balance_loss_mlp": 0.99984854, + "epoch": 0.9075003757703292, + "flos": 67649024384640.0, + "grad_norm": 0.7403426005489692, + "language_loss": 0.57031882, + "learning_rate": 8.902939956682188e-08, + "loss": 0.59030163, + "num_input_tokens_seen": 325583565, + "step": 15094, + "time_per_iteration": 5.0019307136535645 + }, + { + "auxiliary_loss_clip": 0.01050921, + "auxiliary_loss_mlp": 0.01031478, + "balance_loss_clip": 1.0241704, + "balance_loss_mlp": 1.02021313, + "epoch": 0.9075604990229972, + "flos": 22453649769600.0, + "grad_norm": 7.6244000365565, + "language_loss": 0.71565181, + "learning_rate": 8.891452952710742e-08, + "loss": 0.73647577, + "num_input_tokens_seen": 325603690, + "step": 15095, + "time_per_iteration": 4.403621673583984 + }, + { + "auxiliary_loss_clip": 0.01024194, + "auxiliary_loss_mlp": 0.01032564, + "balance_loss_clip": 1.02368891, + "balance_loss_mlp": 1.02179384, + "epoch": 0.9076206222756651, + "flos": 19536662188800.0, + "grad_norm": 1.5930539039284057, + "language_loss": 0.73889947, + "learning_rate": 8.879973195594526e-08, + "loss": 0.75946712, + "num_input_tokens_seen": 325622255, + "step": 15096, + "time_per_iteration": 2.778799295425415 + }, + { + "auxiliary_loss_clip": 0.01061801, + "auxiliary_loss_mlp": 0.01035892, + "balance_loss_clip": 1.0244081, + "balance_loss_mlp": 1.02407837, + "epoch": 0.9076807455283331, + "flos": 30116914819200.0, + "grad_norm": 2.424409399267532, + "language_loss": 0.5747475, + "learning_rate": 8.868500685768898e-08, + "loss": 0.59572446, + "num_input_tokens_seen": 325640165, + "step": 15097, + "time_per_iteration": 2.6643834114074707 + }, + { + "auxiliary_loss_clip": 0.01039407, + "auxiliary_loss_mlp": 0.01022655, + "balance_loss_clip": 1.022259, + "balance_loss_mlp": 1.0132432, + "epoch": 0.907740868781001, + "flos": 18697537589760.0, + "grad_norm": 2.3221741082469745, + "language_loss": 0.7962929, + "learning_rate": 8.857035423668935e-08, + "loss": 0.81691349, + "num_input_tokens_seen": 325659455, + "step": 15098, + "time_per_iteration": 2.559014320373535 + }, + { + "auxiliary_loss_clip": 0.01020685, + "auxiliary_loss_mlp": 0.00747682, + "balance_loss_clip": 1.02384388, + "balance_loss_mlp": 1.00037265, + "epoch": 0.907800992033669, + "flos": 22638805401600.0, + "grad_norm": 2.0729246617324013, + "language_loss": 0.66439486, + "learning_rate": 8.845577409729266e-08, + "loss": 0.68207848, + "num_input_tokens_seen": 325678095, + "step": 15099, + "time_per_iteration": 2.769787549972534 + }, + { + "auxiliary_loss_clip": 0.01041667, + "auxiliary_loss_mlp": 0.01031338, + "balance_loss_clip": 1.02405453, + "balance_loss_mlp": 1.02057981, + "epoch": 0.907861115286337, + "flos": 21287666384640.0, + "grad_norm": 2.257177751065333, + "language_loss": 0.69919527, + "learning_rate": 8.834126644384477e-08, + "loss": 0.71992534, + "num_input_tokens_seen": 325695825, + "step": 15100, + "time_per_iteration": 2.679152011871338 + }, + { + "auxiliary_loss_clip": 0.00997879, + "auxiliary_loss_mlp": 0.01002775, + "balance_loss_clip": 1.00235486, + "balance_loss_mlp": 1.00201464, + "epoch": 0.907921238539005, + "flos": 69739493040000.0, + "grad_norm": 0.6253862396380353, + "language_loss": 0.53500259, + "learning_rate": 8.822683128068775e-08, + "loss": 0.55500913, + "num_input_tokens_seen": 325764515, + "step": 15101, + "time_per_iteration": 3.2067620754241943 + }, + { + "auxiliary_loss_clip": 0.01025159, + "auxiliary_loss_mlp": 0.01026942, + "balance_loss_clip": 1.02228439, + "balance_loss_mlp": 1.01645172, + "epoch": 0.9079813617916729, + "flos": 23477391296640.0, + "grad_norm": 1.7223980681833608, + "language_loss": 0.67912316, + "learning_rate": 8.811246861216081e-08, + "loss": 0.69964409, + "num_input_tokens_seen": 325783235, + "step": 15102, + "time_per_iteration": 2.7072315216064453 + }, + { + "auxiliary_loss_clip": 0.0104507, + "auxiliary_loss_mlp": 0.01028064, + "balance_loss_clip": 1.02402902, + "balance_loss_mlp": 1.01816404, + "epoch": 0.9080414850443409, + "flos": 22929933133440.0, + "grad_norm": 1.859045532311165, + "language_loss": 0.79244846, + "learning_rate": 8.799817844260049e-08, + "loss": 0.81317973, + "num_input_tokens_seen": 325800195, + "step": 15103, + "time_per_iteration": 2.716463327407837 + }, + { + "auxiliary_loss_clip": 0.01034242, + "auxiliary_loss_mlp": 0.01030834, + "balance_loss_clip": 1.0228194, + "balance_loss_mlp": 1.020123, + "epoch": 0.9081016082970089, + "flos": 26177083551360.0, + "grad_norm": 6.29810289887937, + "language_loss": 0.7149986, + "learning_rate": 8.78839607763413e-08, + "loss": 0.73564935, + "num_input_tokens_seen": 325820215, + "step": 15104, + "time_per_iteration": 2.8863847255706787 + }, + { + "auxiliary_loss_clip": 0.0103675, + "auxiliary_loss_mlp": 0.01023663, + "balance_loss_clip": 1.02260232, + "balance_loss_mlp": 1.01456785, + "epoch": 0.9081617315496768, + "flos": 24462169545600.0, + "grad_norm": 1.758857707526129, + "language_loss": 0.77329433, + "learning_rate": 8.77698156177138e-08, + "loss": 0.79389846, + "num_input_tokens_seen": 325838415, + "step": 15105, + "time_per_iteration": 2.7300732135772705 + }, + { + "auxiliary_loss_clip": 0.01061746, + "auxiliary_loss_mlp": 0.00747656, + "balance_loss_clip": 1.02400422, + "balance_loss_mlp": 1.00041664, + "epoch": 0.9082218548023449, + "flos": 24746868743040.0, + "grad_norm": 1.9694898761325292, + "language_loss": 0.73803592, + "learning_rate": 8.765574297104628e-08, + "loss": 0.75612998, + "num_input_tokens_seen": 325855580, + "step": 15106, + "time_per_iteration": 2.6995716094970703 + }, + { + "auxiliary_loss_clip": 0.01004722, + "auxiliary_loss_mlp": 0.01029354, + "balance_loss_clip": 1.01810002, + "balance_loss_mlp": 1.01812482, + "epoch": 0.9082819780550128, + "flos": 24421302846720.0, + "grad_norm": 2.6094922966971223, + "language_loss": 0.80469978, + "learning_rate": 8.754174284066462e-08, + "loss": 0.82504064, + "num_input_tokens_seen": 325874890, + "step": 15107, + "time_per_iteration": 2.8076541423797607 + }, + { + "auxiliary_loss_clip": 0.00988819, + "auxiliary_loss_mlp": 0.0100123, + "balance_loss_clip": 1.00339341, + "balance_loss_mlp": 1.00022221, + "epoch": 0.9083421013076808, + "flos": 59609704872960.0, + "grad_norm": 0.8171059211416, + "language_loss": 0.59697914, + "learning_rate": 8.742781523089205e-08, + "loss": 0.61687958, + "num_input_tokens_seen": 325935835, + "step": 15108, + "time_per_iteration": 3.2267935276031494 + }, + { + "auxiliary_loss_clip": 0.01040456, + "auxiliary_loss_mlp": 0.0102458, + "balance_loss_clip": 1.02323997, + "balance_loss_mlp": 1.01447666, + "epoch": 0.9084022245603487, + "flos": 33620216100480.0, + "grad_norm": 1.6697668028839476, + "language_loss": 0.73933923, + "learning_rate": 8.73139601460482e-08, + "loss": 0.75998962, + "num_input_tokens_seen": 325958035, + "step": 15109, + "time_per_iteration": 2.736114740371704 + }, + { + "auxiliary_loss_clip": 0.01029962, + "auxiliary_loss_mlp": 0.0102714, + "balance_loss_clip": 1.0235889, + "balance_loss_mlp": 1.01756191, + "epoch": 0.9084623478130167, + "flos": 24971705925120.0, + "grad_norm": 1.9692388185670189, + "language_loss": 0.7128852, + "learning_rate": 8.720017759045073e-08, + "loss": 0.73345625, + "num_input_tokens_seen": 325979870, + "step": 15110, + "time_per_iteration": 2.722019672393799 + }, + { + "auxiliary_loss_clip": 0.0102848, + "auxiliary_loss_mlp": 0.01029552, + "balance_loss_clip": 1.01933622, + "balance_loss_mlp": 1.01901388, + "epoch": 0.9085224710656846, + "flos": 31461804869760.0, + "grad_norm": 1.8397992358901565, + "language_loss": 0.6909526, + "learning_rate": 8.708646756841421e-08, + "loss": 0.71153295, + "num_input_tokens_seen": 325998245, + "step": 15111, + "time_per_iteration": 4.500600576400757 + }, + { + "auxiliary_loss_clip": 0.00975183, + "auxiliary_loss_mlp": 0.01001074, + "balance_loss_clip": 1.00217128, + "balance_loss_mlp": 1.00031745, + "epoch": 0.9085825943183526, + "flos": 64917012867840.0, + "grad_norm": 0.6949972037484395, + "language_loss": 0.51698905, + "learning_rate": 8.697283008425026e-08, + "loss": 0.53675163, + "num_input_tokens_seen": 326061770, + "step": 15112, + "time_per_iteration": 3.3169355392456055 + }, + { + "auxiliary_loss_clip": 0.0104968, + "auxiliary_loss_mlp": 0.01028047, + "balance_loss_clip": 1.02294922, + "balance_loss_mlp": 1.01756835, + "epoch": 0.9086427175710206, + "flos": 18953221576320.0, + "grad_norm": 2.047074710217517, + "language_loss": 0.69902658, + "learning_rate": 8.685926514226837e-08, + "loss": 0.71980381, + "num_input_tokens_seen": 326080945, + "step": 15113, + "time_per_iteration": 2.684065103530884 + }, + { + "auxiliary_loss_clip": 0.01052568, + "auxiliary_loss_mlp": 0.01029703, + "balance_loss_clip": 1.02528477, + "balance_loss_mlp": 1.01962447, + "epoch": 0.9087028408236886, + "flos": 34014873807360.0, + "grad_norm": 2.072876729903773, + "language_loss": 0.79446572, + "learning_rate": 8.674577274677508e-08, + "loss": 0.81528842, + "num_input_tokens_seen": 326100630, + "step": 15114, + "time_per_iteration": 2.752896785736084 + }, + { + "auxiliary_loss_clip": 0.01025679, + "auxiliary_loss_mlp": 0.01029923, + "balance_loss_clip": 1.02568793, + "balance_loss_mlp": 1.0186522, + "epoch": 0.9087629640763565, + "flos": 21944580266880.0, + "grad_norm": 1.9798165752621995, + "language_loss": 0.69718385, + "learning_rate": 8.663235290207405e-08, + "loss": 0.71773994, + "num_input_tokens_seen": 326120145, + "step": 15115, + "time_per_iteration": 2.7905657291412354 + }, + { + "auxiliary_loss_clip": 0.01033634, + "auxiliary_loss_mlp": 0.01027672, + "balance_loss_clip": 1.02540493, + "balance_loss_mlp": 1.01675284, + "epoch": 0.9088230873290245, + "flos": 21762908254080.0, + "grad_norm": 1.862323011244157, + "language_loss": 0.65872824, + "learning_rate": 8.651900561246561e-08, + "loss": 0.67934132, + "num_input_tokens_seen": 326140715, + "step": 15116, + "time_per_iteration": 2.8342082500457764 + }, + { + "auxiliary_loss_clip": 0.01062191, + "auxiliary_loss_mlp": 0.01030629, + "balance_loss_clip": 1.02654266, + "balance_loss_mlp": 1.02040696, + "epoch": 0.9088832105816925, + "flos": 21541267382400.0, + "grad_norm": 1.65544552928033, + "language_loss": 0.69759631, + "learning_rate": 8.640573088224812e-08, + "loss": 0.71852446, + "num_input_tokens_seen": 326159130, + "step": 15117, + "time_per_iteration": 2.639420509338379 + }, + { + "auxiliary_loss_clip": 0.01031371, + "auxiliary_loss_mlp": 0.01024038, + "balance_loss_clip": 1.02526009, + "balance_loss_mlp": 1.0140717, + "epoch": 0.9089433338343604, + "flos": 25996704428160.0, + "grad_norm": 1.4649871249661164, + "language_loss": 0.74300689, + "learning_rate": 8.629252871571745e-08, + "loss": 0.76356089, + "num_input_tokens_seen": 326181375, + "step": 15118, + "time_per_iteration": 2.786710739135742 + }, + { + "auxiliary_loss_clip": 0.01040352, + "auxiliary_loss_mlp": 0.01035703, + "balance_loss_clip": 1.02320135, + "balance_loss_mlp": 1.02375269, + "epoch": 0.9090034570870285, + "flos": 21178426147200.0, + "grad_norm": 2.54548845453292, + "language_loss": 0.73402297, + "learning_rate": 8.617939911716554e-08, + "loss": 0.75478351, + "num_input_tokens_seen": 326199740, + "step": 15119, + "time_per_iteration": 2.700410842895508 + }, + { + "auxiliary_loss_clip": 0.0102606, + "auxiliary_loss_mlp": 0.01034696, + "balance_loss_clip": 1.02337432, + "balance_loss_mlp": 1.02246487, + "epoch": 0.9090635803396964, + "flos": 16141811045760.0, + "grad_norm": 2.6649164105349104, + "language_loss": 0.71568692, + "learning_rate": 8.60663420908827e-08, + "loss": 0.73629451, + "num_input_tokens_seen": 326214350, + "step": 15120, + "time_per_iteration": 2.6874330043792725 + }, + { + "auxiliary_loss_clip": 0.01061797, + "auxiliary_loss_mlp": 0.00747644, + "balance_loss_clip": 1.02444303, + "balance_loss_mlp": 1.00048316, + "epoch": 0.9091237035923644, + "flos": 20591537829120.0, + "grad_norm": 2.3262746136000145, + "language_loss": 0.65406156, + "learning_rate": 8.595335764115596e-08, + "loss": 0.67215598, + "num_input_tokens_seen": 326234580, + "step": 15121, + "time_per_iteration": 2.7229509353637695 + }, + { + "auxiliary_loss_clip": 0.01050923, + "auxiliary_loss_mlp": 0.01030051, + "balance_loss_clip": 1.02390742, + "balance_loss_mlp": 1.01911318, + "epoch": 0.9091838268450323, + "flos": 52227760164480.0, + "grad_norm": 1.7116263378769734, + "language_loss": 0.70173311, + "learning_rate": 8.58404457722699e-08, + "loss": 0.72254282, + "num_input_tokens_seen": 326259080, + "step": 15122, + "time_per_iteration": 2.910817861557007 + }, + { + "auxiliary_loss_clip": 0.01011804, + "auxiliary_loss_mlp": 0.01029471, + "balance_loss_clip": 1.02156997, + "balance_loss_mlp": 1.01932585, + "epoch": 0.9092439500977003, + "flos": 20559613616640.0, + "grad_norm": 1.2404789138016357, + "language_loss": 0.7462588, + "learning_rate": 8.572760648850575e-08, + "loss": 0.76667154, + "num_input_tokens_seen": 326280175, + "step": 15123, + "time_per_iteration": 2.7444450855255127 + }, + { + "auxiliary_loss_clip": 0.01051891, + "auxiliary_loss_mlp": 0.01028931, + "balance_loss_clip": 1.02624154, + "balance_loss_mlp": 1.01883376, + "epoch": 0.9093040733503682, + "flos": 28617859595520.0, + "grad_norm": 1.918472158950973, + "language_loss": 0.7543925, + "learning_rate": 8.561483979414253e-08, + "loss": 0.77520072, + "num_input_tokens_seen": 326297990, + "step": 15124, + "time_per_iteration": 2.6815149784088135 + }, + { + "auxiliary_loss_clip": 0.01042627, + "auxiliary_loss_mlp": 0.01030602, + "balance_loss_clip": 1.02238286, + "balance_loss_mlp": 1.0197773, + "epoch": 0.9093641966030362, + "flos": 23440187784960.0, + "grad_norm": 2.1416308000153537, + "language_loss": 0.72471106, + "learning_rate": 8.55021456934566e-08, + "loss": 0.74544334, + "num_input_tokens_seen": 326316735, + "step": 15125, + "time_per_iteration": 2.671856641769409 + }, + { + "auxiliary_loss_clip": 0.01026419, + "auxiliary_loss_mlp": 0.01034031, + "balance_loss_clip": 1.02471507, + "balance_loss_mlp": 1.02342117, + "epoch": 0.9094243198557042, + "flos": 16800197385600.0, + "grad_norm": 1.665270576518133, + "language_loss": 0.79108381, + "learning_rate": 8.538952419072143e-08, + "loss": 0.8116883, + "num_input_tokens_seen": 326334370, + "step": 15126, + "time_per_iteration": 2.687546968460083 + }, + { + "auxiliary_loss_clip": 0.01023867, + "auxiliary_loss_mlp": 0.01033797, + "balance_loss_clip": 1.02547967, + "balance_loss_mlp": 1.02306247, + "epoch": 0.9094844431083722, + "flos": 24273278899200.0, + "grad_norm": 1.528036504652238, + "language_loss": 0.75460196, + "learning_rate": 8.527697529020694e-08, + "loss": 0.77517861, + "num_input_tokens_seen": 326353435, + "step": 15127, + "time_per_iteration": 2.7009153366088867 + }, + { + "auxiliary_loss_clip": 0.00983954, + "auxiliary_loss_mlp": 0.0103156, + "balance_loss_clip": 1.02096498, + "balance_loss_mlp": 1.02091444, + "epoch": 0.9095445663610401, + "flos": 21944652094080.0, + "grad_norm": 2.015460362021263, + "language_loss": 0.62772357, + "learning_rate": 8.516449899618173e-08, + "loss": 0.64787871, + "num_input_tokens_seen": 326371810, + "step": 15128, + "time_per_iteration": 2.7337820529937744 + }, + { + "auxiliary_loss_clip": 0.0102898, + "auxiliary_loss_mlp": 0.01023609, + "balance_loss_clip": 1.023332, + "balance_loss_mlp": 1.01395321, + "epoch": 0.9096046896137081, + "flos": 19792848965760.0, + "grad_norm": 1.731327952268771, + "language_loss": 0.76796764, + "learning_rate": 8.505209531291013e-08, + "loss": 0.78849351, + "num_input_tokens_seen": 326391380, + "step": 15129, + "time_per_iteration": 4.468201160430908 + }, + { + "auxiliary_loss_clip": 0.01046654, + "auxiliary_loss_mlp": 0.01027191, + "balance_loss_clip": 1.02273893, + "balance_loss_mlp": 1.01716518, + "epoch": 0.909664812866376, + "flos": 22638087129600.0, + "grad_norm": 1.7870362000484759, + "language_loss": 0.83270454, + "learning_rate": 8.49397642446552e-08, + "loss": 0.85344297, + "num_input_tokens_seen": 326408800, + "step": 15130, + "time_per_iteration": 2.5824153423309326 + }, + { + "auxiliary_loss_clip": 0.01042508, + "auxiliary_loss_mlp": 0.01029225, + "balance_loss_clip": 1.02515793, + "balance_loss_mlp": 1.01836479, + "epoch": 0.909724936119044, + "flos": 39852153020160.0, + "grad_norm": 1.710285516899743, + "language_loss": 0.75157917, + "learning_rate": 8.482750579567644e-08, + "loss": 0.77229649, + "num_input_tokens_seen": 326431565, + "step": 15131, + "time_per_iteration": 2.8128011226654053 + }, + { + "auxiliary_loss_clip": 0.01034077, + "auxiliary_loss_mlp": 0.01031124, + "balance_loss_clip": 1.02338934, + "balance_loss_mlp": 1.02009153, + "epoch": 0.9097850593717121, + "flos": 35071616954880.0, + "grad_norm": 1.862008981625942, + "language_loss": 0.59785604, + "learning_rate": 8.471531997023085e-08, + "loss": 0.61850804, + "num_input_tokens_seen": 326451715, + "step": 15132, + "time_per_iteration": 2.8576772212982178 + }, + { + "auxiliary_loss_clip": 0.01024967, + "auxiliary_loss_mlp": 0.01029682, + "balance_loss_clip": 1.02627587, + "balance_loss_mlp": 1.02008545, + "epoch": 0.90984518262438, + "flos": 23367468700800.0, + "grad_norm": 2.2440526738550024, + "language_loss": 0.82675385, + "learning_rate": 8.460320677257193e-08, + "loss": 0.84730041, + "num_input_tokens_seen": 326470855, + "step": 15133, + "time_per_iteration": 2.7070891857147217 + }, + { + "auxiliary_loss_clip": 0.01037023, + "auxiliary_loss_mlp": 0.01029147, + "balance_loss_clip": 1.02216339, + "balance_loss_mlp": 1.01886535, + "epoch": 0.909905305877048, + "flos": 27523302405120.0, + "grad_norm": 1.7430767163755272, + "language_loss": 0.74262458, + "learning_rate": 8.449116620695118e-08, + "loss": 0.76328623, + "num_input_tokens_seen": 326490480, + "step": 15134, + "time_per_iteration": 2.723803997039795 + }, + { + "auxiliary_loss_clip": 0.0103763, + "auxiliary_loss_mlp": 0.01032155, + "balance_loss_clip": 1.02629256, + "balance_loss_mlp": 1.02165246, + "epoch": 0.9099654291297159, + "flos": 24347865490560.0, + "grad_norm": 1.5211148983650975, + "language_loss": 0.72648799, + "learning_rate": 8.437919827761786e-08, + "loss": 0.74718589, + "num_input_tokens_seen": 326509445, + "step": 15135, + "time_per_iteration": 2.739412784576416 + }, + { + "auxiliary_loss_clip": 0.01051206, + "auxiliary_loss_mlp": 0.01030068, + "balance_loss_clip": 1.02540851, + "balance_loss_mlp": 1.02012062, + "epoch": 0.9100255523823839, + "flos": 21215234609280.0, + "grad_norm": 1.7061087279103313, + "language_loss": 0.69931197, + "learning_rate": 8.426730298881702e-08, + "loss": 0.72012472, + "num_input_tokens_seen": 326528380, + "step": 15136, + "time_per_iteration": 2.6401736736297607 + }, + { + "auxiliary_loss_clip": 0.00966289, + "auxiliary_loss_mlp": 0.01001051, + "balance_loss_clip": 1.00283194, + "balance_loss_mlp": 1.00009692, + "epoch": 0.9100856756350518, + "flos": 46052276446080.0, + "grad_norm": 0.8276984957385215, + "language_loss": 0.59300554, + "learning_rate": 8.415548034479214e-08, + "loss": 0.61267889, + "num_input_tokens_seen": 326576940, + "step": 15137, + "time_per_iteration": 3.017714738845825 + }, + { + "auxiliary_loss_clip": 0.0105172, + "auxiliary_loss_mlp": 0.01033672, + "balance_loss_clip": 1.02486205, + "balance_loss_mlp": 1.02324128, + "epoch": 0.9101457988877198, + "flos": 20229917656320.0, + "grad_norm": 1.7205902645961533, + "language_loss": 0.82400131, + "learning_rate": 8.40437303497834e-08, + "loss": 0.84485519, + "num_input_tokens_seen": 326596100, + "step": 15138, + "time_per_iteration": 2.6394927501678467 + }, + { + "auxiliary_loss_clip": 0.01050174, + "auxiliary_loss_mlp": 0.01021498, + "balance_loss_clip": 1.02552676, + "balance_loss_mlp": 1.0119251, + "epoch": 0.9102059221403878, + "flos": 26615157822720.0, + "grad_norm": 1.4418268594786607, + "language_loss": 0.81357735, + "learning_rate": 8.39320530080283e-08, + "loss": 0.83429408, + "num_input_tokens_seen": 326615700, + "step": 15139, + "time_per_iteration": 2.701207399368286 + }, + { + "auxiliary_loss_clip": 0.01032981, + "auxiliary_loss_mlp": 0.01030476, + "balance_loss_clip": 1.02615929, + "balance_loss_mlp": 1.02066493, + "epoch": 0.9102660453930558, + "flos": 21908561904000.0, + "grad_norm": 2.3989608718348685, + "language_loss": 0.77335167, + "learning_rate": 8.382044832376167e-08, + "loss": 0.79398626, + "num_input_tokens_seen": 326635905, + "step": 15140, + "time_per_iteration": 2.732891082763672 + }, + { + "auxiliary_loss_clip": 0.01059929, + "auxiliary_loss_mlp": 0.01028069, + "balance_loss_clip": 1.02369606, + "balance_loss_mlp": 1.01822209, + "epoch": 0.9103261686457237, + "flos": 36176660916480.0, + "grad_norm": 1.7121049107531496, + "language_loss": 0.66647339, + "learning_rate": 8.370891630121569e-08, + "loss": 0.68735337, + "num_input_tokens_seen": 326661855, + "step": 15141, + "time_per_iteration": 4.389519929885864 + }, + { + "auxiliary_loss_clip": 0.01050688, + "auxiliary_loss_mlp": 0.01031003, + "balance_loss_clip": 1.02352643, + "balance_loss_mlp": 1.02070332, + "epoch": 0.9103862918983917, + "flos": 23878549365120.0, + "grad_norm": 1.7833774558545832, + "language_loss": 0.74918664, + "learning_rate": 8.359745694462005e-08, + "loss": 0.77000356, + "num_input_tokens_seen": 326679320, + "step": 15142, + "time_per_iteration": 4.2982518672943115 + }, + { + "auxiliary_loss_clip": 0.01022274, + "auxiliary_loss_mlp": 0.01031555, + "balance_loss_clip": 1.02057278, + "balance_loss_mlp": 1.02119565, + "epoch": 0.9104464151510596, + "flos": 14939521989120.0, + "grad_norm": 1.9634859901131931, + "language_loss": 0.64781916, + "learning_rate": 8.348607025820076e-08, + "loss": 0.66835743, + "num_input_tokens_seen": 326698110, + "step": 15143, + "time_per_iteration": 2.888667345046997 + }, + { + "auxiliary_loss_clip": 0.01062621, + "auxiliary_loss_mlp": 0.01029974, + "balance_loss_clip": 1.02386475, + "balance_loss_mlp": 1.01903069, + "epoch": 0.9105065384037276, + "flos": 33655803500160.0, + "grad_norm": 2.04611943263143, + "language_loss": 0.60980082, + "learning_rate": 8.337475624618152e-08, + "loss": 0.63072681, + "num_input_tokens_seen": 326718370, + "step": 15144, + "time_per_iteration": 2.7856392860412598 + }, + { + "auxiliary_loss_clip": 0.01017315, + "auxiliary_loss_mlp": 0.0102627, + "balance_loss_clip": 1.01995373, + "balance_loss_mlp": 1.01634622, + "epoch": 0.9105666616563957, + "flos": 24316695463680.0, + "grad_norm": 1.6058125192200101, + "language_loss": 0.70798367, + "learning_rate": 8.326351491278382e-08, + "loss": 0.72841954, + "num_input_tokens_seen": 326738445, + "step": 15145, + "time_per_iteration": 2.8416848182678223 + }, + { + "auxiliary_loss_clip": 0.01004378, + "auxiliary_loss_mlp": 0.01027559, + "balance_loss_clip": 1.02153254, + "balance_loss_mlp": 1.01791537, + "epoch": 0.9106267849090636, + "flos": 29971692132480.0, + "grad_norm": 1.7818883261337033, + "language_loss": 0.70677185, + "learning_rate": 8.315234626222545e-08, + "loss": 0.72709119, + "num_input_tokens_seen": 326758855, + "step": 15146, + "time_per_iteration": 2.8457529544830322 + }, + { + "auxiliary_loss_clip": 0.0104152, + "auxiliary_loss_mlp": 0.01028715, + "balance_loss_clip": 1.02466357, + "balance_loss_mlp": 1.01911855, + "epoch": 0.9106869081617316, + "flos": 25337743470720.0, + "grad_norm": 2.192029491126413, + "language_loss": 0.72886163, + "learning_rate": 8.304125029872233e-08, + "loss": 0.74956393, + "num_input_tokens_seen": 326777140, + "step": 15147, + "time_per_iteration": 2.7000327110290527 + }, + { + "auxiliary_loss_clip": 0.01035846, + "auxiliary_loss_mlp": 0.01025874, + "balance_loss_clip": 1.02576673, + "balance_loss_mlp": 1.01562166, + "epoch": 0.9107470314143995, + "flos": 18187031543040.0, + "grad_norm": 1.87224489828462, + "language_loss": 0.79912889, + "learning_rate": 8.293022702648711e-08, + "loss": 0.81974614, + "num_input_tokens_seen": 326794070, + "step": 15148, + "time_per_iteration": 2.669084310531616 + }, + { + "auxiliary_loss_clip": 0.01033176, + "auxiliary_loss_mlp": 0.01031342, + "balance_loss_clip": 1.02472019, + "balance_loss_mlp": 1.02089942, + "epoch": 0.9108071546670675, + "flos": 23550828652800.0, + "grad_norm": 1.5971740508350463, + "language_loss": 0.68035924, + "learning_rate": 8.281927644972996e-08, + "loss": 0.70100439, + "num_input_tokens_seen": 326814695, + "step": 15149, + "time_per_iteration": 2.702772617340088 + }, + { + "auxiliary_loss_clip": 0.01061027, + "auxiliary_loss_mlp": 0.0102652, + "balance_loss_clip": 1.02536798, + "balance_loss_mlp": 1.01597619, + "epoch": 0.9108672779197354, + "flos": 25630307746560.0, + "grad_norm": 1.577681774033717, + "language_loss": 0.63453811, + "learning_rate": 8.270839857265776e-08, + "loss": 0.65541363, + "num_input_tokens_seen": 326835295, + "step": 15150, + "time_per_iteration": 2.622828960418701 + }, + { + "auxiliary_loss_clip": 0.01020619, + "auxiliary_loss_mlp": 0.01028145, + "balance_loss_clip": 1.02320611, + "balance_loss_mlp": 1.01773167, + "epoch": 0.9109274011724035, + "flos": 22339094319360.0, + "grad_norm": 2.1969163570577463, + "language_loss": 0.72575611, + "learning_rate": 8.259759339947514e-08, + "loss": 0.74624372, + "num_input_tokens_seen": 326853350, + "step": 15151, + "time_per_iteration": 2.782289743423462 + }, + { + "auxiliary_loss_clip": 0.01050279, + "auxiliary_loss_mlp": 0.01023276, + "balance_loss_clip": 1.02365148, + "balance_loss_mlp": 1.01322663, + "epoch": 0.9109875244250714, + "flos": 26688200129280.0, + "grad_norm": 1.638297792518358, + "language_loss": 0.64150667, + "learning_rate": 8.248686093438429e-08, + "loss": 0.66224229, + "num_input_tokens_seen": 326873425, + "step": 15152, + "time_per_iteration": 2.6752233505249023 + }, + { + "auxiliary_loss_clip": 0.01042469, + "auxiliary_loss_mlp": 0.00747568, + "balance_loss_clip": 1.02467084, + "balance_loss_mlp": 1.00039494, + "epoch": 0.9110476476777394, + "flos": 22930112701440.0, + "grad_norm": 2.4254337577518745, + "language_loss": 0.73650211, + "learning_rate": 8.23762011815834e-08, + "loss": 0.75440246, + "num_input_tokens_seen": 326893455, + "step": 15153, + "time_per_iteration": 2.778273344039917 + }, + { + "auxiliary_loss_clip": 0.01015129, + "auxiliary_loss_mlp": 0.01038976, + "balance_loss_clip": 1.02002943, + "balance_loss_mlp": 1.02707338, + "epoch": 0.9111077709304073, + "flos": 13472857854720.0, + "grad_norm": 1.9997975912817942, + "language_loss": 0.72242177, + "learning_rate": 8.226561414526956e-08, + "loss": 0.74296284, + "num_input_tokens_seen": 326910210, + "step": 15154, + "time_per_iteration": 2.6488499641418457 + }, + { + "auxiliary_loss_clip": 0.01044652, + "auxiliary_loss_mlp": 0.01029939, + "balance_loss_clip": 1.0275749, + "balance_loss_mlp": 1.0201993, + "epoch": 0.9111678941830753, + "flos": 20850561780480.0, + "grad_norm": 2.047448932459831, + "language_loss": 0.82752454, + "learning_rate": 8.215509982963564e-08, + "loss": 0.84827048, + "num_input_tokens_seen": 326929350, + "step": 15155, + "time_per_iteration": 2.688138008117676 + }, + { + "auxiliary_loss_clip": 0.01053797, + "auxiliary_loss_mlp": 0.01024715, + "balance_loss_clip": 1.0267452, + "balance_loss_mlp": 1.0140698, + "epoch": 0.9112280174357432, + "flos": 19682244011520.0, + "grad_norm": 1.979562597087053, + "language_loss": 0.59733713, + "learning_rate": 8.204465823887252e-08, + "loss": 0.61812228, + "num_input_tokens_seen": 326949060, + "step": 15156, + "time_per_iteration": 2.640857458114624 + }, + { + "auxiliary_loss_clip": 0.01051911, + "auxiliary_loss_mlp": 0.0102582, + "balance_loss_clip": 1.02323925, + "balance_loss_mlp": 1.01466203, + "epoch": 0.9112881406884112, + "flos": 25447163276160.0, + "grad_norm": 2.9115922555315366, + "language_loss": 0.74211371, + "learning_rate": 8.193428937716796e-08, + "loss": 0.76289105, + "num_input_tokens_seen": 326968950, + "step": 15157, + "time_per_iteration": 2.783308506011963 + }, + { + "auxiliary_loss_clip": 0.01013942, + "auxiliary_loss_mlp": 0.01033985, + "balance_loss_clip": 1.02054524, + "balance_loss_mlp": 1.02443051, + "epoch": 0.9113482639410793, + "flos": 33066975847680.0, + "grad_norm": 1.6127640070220786, + "language_loss": 0.5942024, + "learning_rate": 8.182399324870747e-08, + "loss": 0.61468172, + "num_input_tokens_seen": 326989455, + "step": 15158, + "time_per_iteration": 4.769587278366089 + }, + { + "auxiliary_loss_clip": 0.01014028, + "auxiliary_loss_mlp": 0.0103189, + "balance_loss_clip": 1.0263437, + "balance_loss_mlp": 1.02201915, + "epoch": 0.9114083871937472, + "flos": 21835591424640.0, + "grad_norm": 1.8156566619974712, + "language_loss": 0.67667067, + "learning_rate": 8.171376985767375e-08, + "loss": 0.69712985, + "num_input_tokens_seen": 327009640, + "step": 15159, + "time_per_iteration": 3.0465290546417236 + }, + { + "auxiliary_loss_clip": 0.0104195, + "auxiliary_loss_mlp": 0.01022653, + "balance_loss_clip": 1.02455759, + "balance_loss_mlp": 1.01235294, + "epoch": 0.9114685104464152, + "flos": 27088999061760.0, + "grad_norm": 2.7761445986863067, + "language_loss": 0.78660893, + "learning_rate": 8.160361920824588e-08, + "loss": 0.80725503, + "num_input_tokens_seen": 327027690, + "step": 15160, + "time_per_iteration": 2.7805044651031494 + }, + { + "auxiliary_loss_clip": 0.0106339, + "auxiliary_loss_mlp": 0.01026334, + "balance_loss_clip": 1.02614188, + "balance_loss_mlp": 1.01487172, + "epoch": 0.9115286336990831, + "flos": 17967042696960.0, + "grad_norm": 1.6317915598496187, + "language_loss": 0.68984073, + "learning_rate": 8.149354130460073e-08, + "loss": 0.71073794, + "num_input_tokens_seen": 327045915, + "step": 15161, + "time_per_iteration": 2.5127804279327393 + }, + { + "auxiliary_loss_clip": 0.01016757, + "auxiliary_loss_mlp": 0.01031912, + "balance_loss_clip": 1.02333355, + "balance_loss_mlp": 1.02018213, + "epoch": 0.9115887569517511, + "flos": 22929861306240.0, + "grad_norm": 1.694293183685141, + "language_loss": 0.76389259, + "learning_rate": 8.138353615091321e-08, + "loss": 0.7843793, + "num_input_tokens_seen": 327066355, + "step": 15162, + "time_per_iteration": 2.792837381362915 + }, + { + "auxiliary_loss_clip": 0.01034517, + "auxiliary_loss_mlp": 0.01031006, + "balance_loss_clip": 1.0254513, + "balance_loss_mlp": 1.02089667, + "epoch": 0.911648880204419, + "flos": 23988436047360.0, + "grad_norm": 1.8533915665581486, + "language_loss": 0.66953218, + "learning_rate": 8.127360375135395e-08, + "loss": 0.69018739, + "num_input_tokens_seen": 327086735, + "step": 15163, + "time_per_iteration": 2.654726505279541 + }, + { + "auxiliary_loss_clip": 0.01019122, + "auxiliary_loss_mlp": 0.01028906, + "balance_loss_clip": 1.02289367, + "balance_loss_mlp": 1.01808763, + "epoch": 0.911709003457087, + "flos": 17055306754560.0, + "grad_norm": 2.6424683939052955, + "language_loss": 0.70868105, + "learning_rate": 8.116374411009186e-08, + "loss": 0.72916126, + "num_input_tokens_seen": 327104035, + "step": 15164, + "time_per_iteration": 2.703460693359375 + }, + { + "auxiliary_loss_clip": 0.01061645, + "auxiliary_loss_mlp": 0.01029887, + "balance_loss_clip": 1.02680421, + "balance_loss_mlp": 1.0198977, + "epoch": 0.911769126709755, + "flos": 21653344794240.0, + "grad_norm": 1.6196925401407942, + "language_loss": 0.76156431, + "learning_rate": 8.105395723129315e-08, + "loss": 0.78247964, + "num_input_tokens_seen": 327124370, + "step": 15165, + "time_per_iteration": 2.6119399070739746 + }, + { + "auxiliary_loss_clip": 0.01044499, + "auxiliary_loss_mlp": 0.01032156, + "balance_loss_clip": 1.02370453, + "balance_loss_mlp": 1.02127767, + "epoch": 0.911829249962423, + "flos": 24790321221120.0, + "grad_norm": 2.1500310557789666, + "language_loss": 0.72260308, + "learning_rate": 8.094424311912074e-08, + "loss": 0.74336958, + "num_input_tokens_seen": 327140915, + "step": 15166, + "time_per_iteration": 2.619028329849243 + }, + { + "auxiliary_loss_clip": 0.01011242, + "auxiliary_loss_mlp": 0.01035333, + "balance_loss_clip": 1.02235198, + "balance_loss_mlp": 1.02434778, + "epoch": 0.9118893732150909, + "flos": 20959406968320.0, + "grad_norm": 1.6940822389729073, + "language_loss": 0.73084408, + "learning_rate": 8.083460177773482e-08, + "loss": 0.75130975, + "num_input_tokens_seen": 327158940, + "step": 15167, + "time_per_iteration": 2.691795825958252 + }, + { + "auxiliary_loss_clip": 0.00989805, + "auxiliary_loss_mlp": 0.01004178, + "balance_loss_clip": 1.00362754, + "balance_loss_mlp": 1.00332594, + "epoch": 0.9119494964677589, + "flos": 67917385872000.0, + "grad_norm": 0.7763851796237768, + "language_loss": 0.65546834, + "learning_rate": 8.072503321129298e-08, + "loss": 0.67540818, + "num_input_tokens_seen": 327217450, + "step": 15168, + "time_per_iteration": 3.244508743286133 + }, + { + "auxiliary_loss_clip": 0.01040191, + "auxiliary_loss_mlp": 0.01025446, + "balance_loss_clip": 1.0243485, + "balance_loss_mlp": 1.01569438, + "epoch": 0.9120096197204268, + "flos": 18551524803840.0, + "grad_norm": 1.9313520539236673, + "language_loss": 0.77976066, + "learning_rate": 8.061553742395033e-08, + "loss": 0.80041707, + "num_input_tokens_seen": 327233905, + "step": 15169, + "time_per_iteration": 2.5941402912139893 + }, + { + "auxiliary_loss_clip": 0.01051241, + "auxiliary_loss_mlp": 0.01027919, + "balance_loss_clip": 1.02478445, + "balance_loss_mlp": 1.01834083, + "epoch": 0.9120697429730948, + "flos": 19025725178880.0, + "grad_norm": 1.6474677770739579, + "language_loss": 0.8244853, + "learning_rate": 8.05061144198591e-08, + "loss": 0.84527695, + "num_input_tokens_seen": 327252430, + "step": 15170, + "time_per_iteration": 2.597698926925659 + }, + { + "auxiliary_loss_clip": 0.01053911, + "auxiliary_loss_mlp": 0.01028589, + "balance_loss_clip": 1.02592349, + "balance_loss_mlp": 1.01783609, + "epoch": 0.9121298662257629, + "flos": 17163685065600.0, + "grad_norm": 2.021083020497968, + "language_loss": 0.772802, + "learning_rate": 8.039676420316799e-08, + "loss": 0.7936269, + "num_input_tokens_seen": 327269215, + "step": 15171, + "time_per_iteration": 2.601003885269165 + }, + { + "auxiliary_loss_clip": 0.00984796, + "auxiliary_loss_mlp": 0.01032509, + "balance_loss_clip": 1.02102864, + "balance_loss_mlp": 1.02120197, + "epoch": 0.9121899894784308, + "flos": 19682710888320.0, + "grad_norm": 1.317830505532027, + "language_loss": 0.66936892, + "learning_rate": 8.02874867780241e-08, + "loss": 0.68954194, + "num_input_tokens_seen": 327290320, + "step": 15172, + "time_per_iteration": 3.0345895290374756 + }, + { + "auxiliary_loss_clip": 0.01037449, + "auxiliary_loss_mlp": 0.01030887, + "balance_loss_clip": 1.02544534, + "balance_loss_mlp": 1.02006245, + "epoch": 0.9122501127310988, + "flos": 22235743912320.0, + "grad_norm": 1.6868830648444013, + "language_loss": 0.74745226, + "learning_rate": 8.017828214857103e-08, + "loss": 0.76813561, + "num_input_tokens_seen": 327310150, + "step": 15173, + "time_per_iteration": 2.8597710132598877 + }, + { + "auxiliary_loss_clip": 0.01048463, + "auxiliary_loss_mlp": 0.01027615, + "balance_loss_clip": 1.02785659, + "balance_loss_mlp": 1.0151931, + "epoch": 0.9123102359837667, + "flos": 15957122290560.0, + "grad_norm": 2.0601053639493467, + "language_loss": 0.66148412, + "learning_rate": 8.00691503189499e-08, + "loss": 0.6822449, + "num_input_tokens_seen": 327326660, + "step": 15174, + "time_per_iteration": 2.718860149383545 + }, + { + "auxiliary_loss_clip": 0.01047404, + "auxiliary_loss_mlp": 0.01029847, + "balance_loss_clip": 1.02451932, + "balance_loss_mlp": 1.0184443, + "epoch": 0.9123703592364347, + "flos": 25155784149120.0, + "grad_norm": 1.8050271913139588, + "language_loss": 0.75075561, + "learning_rate": 7.996009129329894e-08, + "loss": 0.77152812, + "num_input_tokens_seen": 327346700, + "step": 15175, + "time_per_iteration": 2.6471829414367676 + }, + { + "auxiliary_loss_clip": 0.00996131, + "auxiliary_loss_mlp": 0.01003775, + "balance_loss_clip": 1.00117588, + "balance_loss_mlp": 1.00300586, + "epoch": 0.9124304824891026, + "flos": 60801650812800.0, + "grad_norm": 0.9738821860423548, + "language_loss": 0.58564377, + "learning_rate": 7.985110507575421e-08, + "loss": 0.6056428, + "num_input_tokens_seen": 327403050, + "step": 15176, + "time_per_iteration": 5.19498085975647 + }, + { + "auxiliary_loss_clip": 0.01032211, + "auxiliary_loss_mlp": 0.01032717, + "balance_loss_clip": 1.02185416, + "balance_loss_mlp": 1.02190518, + "epoch": 0.9124906057417707, + "flos": 18150941352960.0, + "grad_norm": 2.0598128830804234, + "language_loss": 0.65207261, + "learning_rate": 7.97421916704475e-08, + "loss": 0.67272192, + "num_input_tokens_seen": 327422225, + "step": 15177, + "time_per_iteration": 2.753530502319336 + }, + { + "auxiliary_loss_clip": 0.01035507, + "auxiliary_loss_mlp": 0.01024869, + "balance_loss_clip": 1.02330756, + "balance_loss_mlp": 1.01447392, + "epoch": 0.9125507289944386, + "flos": 11686769049600.0, + "grad_norm": 1.9308665517612367, + "language_loss": 0.81297386, + "learning_rate": 7.963335108150926e-08, + "loss": 0.83357763, + "num_input_tokens_seen": 327437025, + "step": 15178, + "time_per_iteration": 2.641502857208252 + }, + { + "auxiliary_loss_clip": 0.01001127, + "auxiliary_loss_mlp": 0.01031961, + "balance_loss_clip": 1.02016902, + "balance_loss_mlp": 1.02001619, + "epoch": 0.9126108522471066, + "flos": 17748813617280.0, + "grad_norm": 2.2805621391072664, + "language_loss": 0.78707135, + "learning_rate": 7.952458331306711e-08, + "loss": 0.80740219, + "num_input_tokens_seen": 327453915, + "step": 15179, + "time_per_iteration": 2.7965753078460693 + }, + { + "auxiliary_loss_clip": 0.01039367, + "auxiliary_loss_mlp": 0.01028578, + "balance_loss_clip": 1.02366102, + "balance_loss_mlp": 1.01887465, + "epoch": 0.9126709754997745, + "flos": 27635738952960.0, + "grad_norm": 1.623934682483322, + "language_loss": 0.68016458, + "learning_rate": 7.941588836924507e-08, + "loss": 0.70084405, + "num_input_tokens_seen": 327474415, + "step": 15180, + "time_per_iteration": 2.790430784225464 + }, + { + "auxiliary_loss_clip": 0.01048664, + "auxiliary_loss_mlp": 0.01024208, + "balance_loss_clip": 1.02224898, + "balance_loss_mlp": 1.01475489, + "epoch": 0.9127310987524425, + "flos": 15924982596480.0, + "grad_norm": 3.243429788547794, + "language_loss": 0.75011241, + "learning_rate": 7.930726625416495e-08, + "loss": 0.77084112, + "num_input_tokens_seen": 327492750, + "step": 15181, + "time_per_iteration": 2.7248125076293945 + }, + { + "auxiliary_loss_clip": 0.01066049, + "auxiliary_loss_mlp": 0.01027902, + "balance_loss_clip": 1.02687895, + "balance_loss_mlp": 1.01758492, + "epoch": 0.9127912220051104, + "flos": 21536885923200.0, + "grad_norm": 1.8301802332310482, + "language_loss": 0.74749458, + "learning_rate": 7.919871697194614e-08, + "loss": 0.76843405, + "num_input_tokens_seen": 327509470, + "step": 15182, + "time_per_iteration": 2.554111957550049 + }, + { + "auxiliary_loss_clip": 0.01063184, + "auxiliary_loss_mlp": 0.0102659, + "balance_loss_clip": 1.02433109, + "balance_loss_mlp": 1.01567054, + "epoch": 0.9128513452577784, + "flos": 24063561342720.0, + "grad_norm": 3.2196680387759806, + "language_loss": 0.76441479, + "learning_rate": 7.909024052670421e-08, + "loss": 0.78531253, + "num_input_tokens_seen": 327530520, + "step": 15183, + "time_per_iteration": 2.6300392150878906 + }, + { + "auxiliary_loss_clip": 0.01055796, + "auxiliary_loss_mlp": 0.01029385, + "balance_loss_clip": 1.02681112, + "balance_loss_mlp": 1.01924038, + "epoch": 0.9129114685104465, + "flos": 16216469464320.0, + "grad_norm": 2.1748187351842767, + "language_loss": 0.76498467, + "learning_rate": 7.898183692255256e-08, + "loss": 0.78583646, + "num_input_tokens_seen": 327546960, + "step": 15184, + "time_per_iteration": 2.6340653896331787 + }, + { + "auxiliary_loss_clip": 0.0104488, + "auxiliary_loss_mlp": 0.01029938, + "balance_loss_clip": 1.02539003, + "balance_loss_mlp": 1.02004361, + "epoch": 0.9129715917631144, + "flos": 19384364522880.0, + "grad_norm": 2.390329602455932, + "language_loss": 0.74078125, + "learning_rate": 7.887350616360233e-08, + "loss": 0.76152945, + "num_input_tokens_seen": 327564830, + "step": 15185, + "time_per_iteration": 2.585355281829834 + }, + { + "auxiliary_loss_clip": 0.01042591, + "auxiliary_loss_mlp": 0.01029314, + "balance_loss_clip": 1.02508092, + "balance_loss_mlp": 1.01901984, + "epoch": 0.9130317150157824, + "flos": 20590460421120.0, + "grad_norm": 1.8775624956864676, + "language_loss": 0.68273342, + "learning_rate": 7.876524825396158e-08, + "loss": 0.70345247, + "num_input_tokens_seen": 327583675, + "step": 15186, + "time_per_iteration": 2.641517400741577 + }, + { + "auxiliary_loss_clip": 0.01040571, + "auxiliary_loss_mlp": 0.01035629, + "balance_loss_clip": 1.02433276, + "balance_loss_mlp": 1.02309978, + "epoch": 0.9130918382684503, + "flos": 20189230525440.0, + "grad_norm": 2.088634305329377, + "language_loss": 0.77479875, + "learning_rate": 7.865706319773502e-08, + "loss": 0.79556072, + "num_input_tokens_seen": 327602280, + "step": 15187, + "time_per_iteration": 4.256535291671753 + }, + { + "auxiliary_loss_clip": 0.01062251, + "auxiliary_loss_mlp": 0.00747713, + "balance_loss_clip": 1.02506924, + "balance_loss_mlp": 1.00039101, + "epoch": 0.9131519615211183, + "flos": 25556870390400.0, + "grad_norm": 2.022772496293726, + "language_loss": 0.65791488, + "learning_rate": 7.854895099902515e-08, + "loss": 0.67601448, + "num_input_tokens_seen": 327623515, + "step": 15188, + "time_per_iteration": 2.5863959789276123 + }, + { + "auxiliary_loss_clip": 0.00990644, + "auxiliary_loss_mlp": 0.01033561, + "balance_loss_clip": 1.01969826, + "balance_loss_mlp": 1.02238488, + "epoch": 0.9132120847737862, + "flos": 17931563038080.0, + "grad_norm": 1.8292881807362165, + "language_loss": 0.76224661, + "learning_rate": 7.844091166193157e-08, + "loss": 0.7824887, + "num_input_tokens_seen": 327642875, + "step": 15189, + "time_per_iteration": 4.512547969818115 + }, + { + "auxiliary_loss_clip": 0.01049679, + "auxiliary_loss_mlp": 0.01026736, + "balance_loss_clip": 1.02395844, + "balance_loss_mlp": 1.01741934, + "epoch": 0.9132722080264543, + "flos": 20047635112320.0, + "grad_norm": 1.7326075119046755, + "language_loss": 0.75669271, + "learning_rate": 7.8332945190551e-08, + "loss": 0.77745688, + "num_input_tokens_seen": 327662450, + "step": 15190, + "time_per_iteration": 2.587076425552368 + }, + { + "auxiliary_loss_clip": 0.00997675, + "auxiliary_loss_mlp": 0.0100182, + "balance_loss_clip": 1.00259185, + "balance_loss_mlp": 1.0009259, + "epoch": 0.9133323312791222, + "flos": 70439967141120.0, + "grad_norm": 0.6992661371487846, + "language_loss": 0.57303339, + "learning_rate": 7.822505158897797e-08, + "loss": 0.59302831, + "num_input_tokens_seen": 327723845, + "step": 15191, + "time_per_iteration": 3.2215518951416016 + }, + { + "auxiliary_loss_clip": 0.01063826, + "auxiliary_loss_mlp": 0.01029478, + "balance_loss_clip": 1.02575493, + "balance_loss_mlp": 1.01831985, + "epoch": 0.9133924545317902, + "flos": 25483792170240.0, + "grad_norm": 2.3542299882520177, + "language_loss": 0.74235308, + "learning_rate": 7.81172308613034e-08, + "loss": 0.76328611, + "num_input_tokens_seen": 327742590, + "step": 15192, + "time_per_iteration": 2.5935022830963135 + }, + { + "auxiliary_loss_clip": 0.01050316, + "auxiliary_loss_mlp": 0.01022941, + "balance_loss_clip": 1.02565575, + "balance_loss_mlp": 1.01284957, + "epoch": 0.9134525777844581, + "flos": 39930690107520.0, + "grad_norm": 2.0269731664355457, + "language_loss": 0.6915921, + "learning_rate": 7.800948301161647e-08, + "loss": 0.71232474, + "num_input_tokens_seen": 327764350, + "step": 15193, + "time_per_iteration": 2.7927699089050293 + }, + { + "auxiliary_loss_clip": 0.01048681, + "auxiliary_loss_mlp": 0.01030938, + "balance_loss_clip": 1.02426267, + "balance_loss_mlp": 1.02118635, + "epoch": 0.9135127010371261, + "flos": 20886723797760.0, + "grad_norm": 1.4817493543419538, + "language_loss": 0.73097932, + "learning_rate": 7.790180804400215e-08, + "loss": 0.7517755, + "num_input_tokens_seen": 327783120, + "step": 15194, + "time_per_iteration": 2.679888963699341 + }, + { + "auxiliary_loss_clip": 0.01017569, + "auxiliary_loss_mlp": 0.01031654, + "balance_loss_clip": 1.02257526, + "balance_loss_mlp": 1.01882672, + "epoch": 0.913572824289794, + "flos": 20813250528000.0, + "grad_norm": 1.8012314949089128, + "language_loss": 0.61672068, + "learning_rate": 7.779420596254383e-08, + "loss": 0.63721293, + "num_input_tokens_seen": 327801960, + "step": 15195, + "time_per_iteration": 2.772707939147949 + }, + { + "auxiliary_loss_clip": 0.01053259, + "auxiliary_loss_mlp": 0.01028877, + "balance_loss_clip": 1.02520025, + "balance_loss_mlp": 1.01830888, + "epoch": 0.913632947542462, + "flos": 25703278225920.0, + "grad_norm": 1.4537954487239566, + "language_loss": 0.71273732, + "learning_rate": 7.768667677132201e-08, + "loss": 0.73355871, + "num_input_tokens_seen": 327823795, + "step": 15196, + "time_per_iteration": 2.695077896118164 + }, + { + "auxiliary_loss_clip": 0.01040536, + "auxiliary_loss_mlp": 0.01033296, + "balance_loss_clip": 1.02465189, + "balance_loss_mlp": 1.02315116, + "epoch": 0.9136930707951301, + "flos": 26286216048000.0, + "grad_norm": 2.649107394646201, + "language_loss": 0.71613544, + "learning_rate": 7.757922047441411e-08, + "loss": 0.73687375, + "num_input_tokens_seen": 327845175, + "step": 15197, + "time_per_iteration": 2.7053658962249756 + }, + { + "auxiliary_loss_clip": 0.01040052, + "auxiliary_loss_mlp": 0.01026683, + "balance_loss_clip": 1.02266192, + "balance_loss_mlp": 1.01636553, + "epoch": 0.913753194047798, + "flos": 22091885942400.0, + "grad_norm": 1.948135864032872, + "language_loss": 0.77843976, + "learning_rate": 7.747183707589489e-08, + "loss": 0.79910707, + "num_input_tokens_seen": 327863150, + "step": 15198, + "time_per_iteration": 2.7057807445526123 + }, + { + "auxiliary_loss_clip": 0.01049408, + "auxiliary_loss_mlp": 0.01027064, + "balance_loss_clip": 1.02353311, + "balance_loss_mlp": 1.01708627, + "epoch": 0.913813317300466, + "flos": 23587206151680.0, + "grad_norm": 1.3734413525496223, + "language_loss": 0.6789422, + "learning_rate": 7.736452657983616e-08, + "loss": 0.69970697, + "num_input_tokens_seen": 327883445, + "step": 15199, + "time_per_iteration": 2.6408586502075195 + }, + { + "auxiliary_loss_clip": 0.01050121, + "auxiliary_loss_mlp": 0.00747567, + "balance_loss_clip": 1.02400243, + "balance_loss_mlp": 1.00040841, + "epoch": 0.9138734405531339, + "flos": 28876452583680.0, + "grad_norm": 1.4952737045407352, + "language_loss": 0.67665857, + "learning_rate": 7.725728899030714e-08, + "loss": 0.69463545, + "num_input_tokens_seen": 327905745, + "step": 15200, + "time_per_iteration": 2.685042142868042 + }, + { + "auxiliary_loss_clip": 0.0105123, + "auxiliary_loss_mlp": 0.01028595, + "balance_loss_clip": 1.02636886, + "balance_loss_mlp": 1.01917171, + "epoch": 0.9139335638058019, + "flos": 22821087945600.0, + "grad_norm": 1.573812899417761, + "language_loss": 0.7129516, + "learning_rate": 7.715012431137435e-08, + "loss": 0.73374987, + "num_input_tokens_seen": 327925435, + "step": 15201, + "time_per_iteration": 2.611462116241455 + }, + { + "auxiliary_loss_clip": 0.0104849, + "auxiliary_loss_mlp": 0.01023339, + "balance_loss_clip": 1.02248383, + "balance_loss_mlp": 1.01435089, + "epoch": 0.9139936870584698, + "flos": 18004174381440.0, + "grad_norm": 1.8023242739367717, + "language_loss": 0.70994258, + "learning_rate": 7.704303254710165e-08, + "loss": 0.73066086, + "num_input_tokens_seen": 327944145, + "step": 15202, + "time_per_iteration": 2.6461918354034424 + }, + { + "auxiliary_loss_clip": 0.01061069, + "auxiliary_loss_mlp": 0.0102958, + "balance_loss_clip": 1.02405405, + "balance_loss_mlp": 1.01872015, + "epoch": 0.9140538103111379, + "flos": 15813767111040.0, + "grad_norm": 1.8852283528573126, + "language_loss": 0.66736048, + "learning_rate": 7.693601370155001e-08, + "loss": 0.68826699, + "num_input_tokens_seen": 327960565, + "step": 15203, + "time_per_iteration": 2.610687017440796 + }, + { + "auxiliary_loss_clip": 0.01050766, + "auxiliary_loss_mlp": 0.01027688, + "balance_loss_clip": 1.02424574, + "balance_loss_mlp": 1.01648188, + "epoch": 0.9141139335638058, + "flos": 23987035416960.0, + "grad_norm": 1.6460850902792392, + "language_loss": 0.68698227, + "learning_rate": 7.682906777877751e-08, + "loss": 0.70776683, + "num_input_tokens_seen": 327981180, + "step": 15204, + "time_per_iteration": 2.659477472305298 + }, + { + "auxiliary_loss_clip": 0.01048519, + "auxiliary_loss_mlp": 0.01023439, + "balance_loss_clip": 1.02174187, + "balance_loss_mlp": 1.01272202, + "epoch": 0.9141740568164738, + "flos": 24024418496640.0, + "grad_norm": 2.440142679438116, + "language_loss": 0.59604043, + "learning_rate": 7.672219478283915e-08, + "loss": 0.61676002, + "num_input_tokens_seen": 328001500, + "step": 15205, + "time_per_iteration": 4.562456846237183 + }, + { + "auxiliary_loss_clip": 0.01019358, + "auxiliary_loss_mlp": 0.01028766, + "balance_loss_clip": 1.02294326, + "balance_loss_mlp": 1.0182693, + "epoch": 0.9142341800691417, + "flos": 27018291139200.0, + "grad_norm": 1.6348417342913977, + "language_loss": 0.81360793, + "learning_rate": 7.661539471778811e-08, + "loss": 0.83408916, + "num_input_tokens_seen": 328023025, + "step": 15206, + "time_per_iteration": 2.7945172786712646 + }, + { + "auxiliary_loss_clip": 0.01016818, + "auxiliary_loss_mlp": 0.01024852, + "balance_loss_clip": 1.02142584, + "balance_loss_mlp": 1.01419449, + "epoch": 0.9142943033218097, + "flos": 20412487509120.0, + "grad_norm": 3.543647836155908, + "language_loss": 0.73539197, + "learning_rate": 7.650866758767382e-08, + "loss": 0.75580859, + "num_input_tokens_seen": 328041410, + "step": 15207, + "time_per_iteration": 2.6933183670043945 + }, + { + "auxiliary_loss_clip": 0.01026733, + "auxiliary_loss_mlp": 0.0103475, + "balance_loss_clip": 1.02821469, + "balance_loss_mlp": 1.02387214, + "epoch": 0.9143544265744776, + "flos": 19755322231680.0, + "grad_norm": 1.8772815776678908, + "language_loss": 0.72948194, + "learning_rate": 7.640201339654373e-08, + "loss": 0.7500968, + "num_input_tokens_seen": 328060495, + "step": 15208, + "time_per_iteration": 2.7427899837493896 + }, + { + "auxiliary_loss_clip": 0.01042197, + "auxiliary_loss_mlp": 0.0102309, + "balance_loss_clip": 1.02466631, + "balance_loss_mlp": 1.01336861, + "epoch": 0.9144145498271457, + "flos": 17165444832000.0, + "grad_norm": 2.3976398306031013, + "language_loss": 0.86213559, + "learning_rate": 7.629543214844237e-08, + "loss": 0.88278848, + "num_input_tokens_seen": 328076905, + "step": 15209, + "time_per_iteration": 2.587345600128174 + }, + { + "auxiliary_loss_clip": 0.01047547, + "auxiliary_loss_mlp": 0.01030643, + "balance_loss_clip": 1.0293386, + "balance_loss_mlp": 1.02095151, + "epoch": 0.9144746730798137, + "flos": 23726072131200.0, + "grad_norm": 1.5708749596251643, + "language_loss": 0.75438786, + "learning_rate": 7.618892384741093e-08, + "loss": 0.77516973, + "num_input_tokens_seen": 328096960, + "step": 15210, + "time_per_iteration": 2.791879653930664 + }, + { + "auxiliary_loss_clip": 0.01036805, + "auxiliary_loss_mlp": 0.01030285, + "balance_loss_clip": 1.02093601, + "balance_loss_mlp": 1.02006841, + "epoch": 0.9145347963324816, + "flos": 25847854467840.0, + "grad_norm": 1.8259407943617736, + "language_loss": 0.78086364, + "learning_rate": 7.6082488497488e-08, + "loss": 0.80153453, + "num_input_tokens_seen": 328115445, + "step": 15211, + "time_per_iteration": 2.736194133758545 + }, + { + "auxiliary_loss_clip": 0.0105328, + "auxiliary_loss_mlp": 0.01025803, + "balance_loss_clip": 1.02608418, + "balance_loss_mlp": 1.01604593, + "epoch": 0.9145949195851496, + "flos": 19242769109760.0, + "grad_norm": 1.7588975401252, + "language_loss": 0.82980752, + "learning_rate": 7.597612610270986e-08, + "loss": 0.85059834, + "num_input_tokens_seen": 328133965, + "step": 15212, + "time_per_iteration": 2.643005847930908 + }, + { + "auxiliary_loss_clip": 0.01050719, + "auxiliary_loss_mlp": 0.01025647, + "balance_loss_clip": 1.02538264, + "balance_loss_mlp": 1.01625919, + "epoch": 0.9146550428378175, + "flos": 18296379521280.0, + "grad_norm": 1.7865148460833524, + "language_loss": 0.83781695, + "learning_rate": 7.586983666711022e-08, + "loss": 0.85858059, + "num_input_tokens_seen": 328151520, + "step": 15213, + "time_per_iteration": 2.6267929077148438 + }, + { + "auxiliary_loss_clip": 0.01043034, + "auxiliary_loss_mlp": 0.0102559, + "balance_loss_clip": 1.02477527, + "balance_loss_mlp": 1.01568389, + "epoch": 0.9147151660904855, + "flos": 20084264006400.0, + "grad_norm": 1.6675777263898417, + "language_loss": 0.7055949, + "learning_rate": 7.576362019471894e-08, + "loss": 0.72628111, + "num_input_tokens_seen": 328171275, + "step": 15214, + "time_per_iteration": 2.662587881088257 + }, + { + "auxiliary_loss_clip": 0.01055604, + "auxiliary_loss_mlp": 0.01031268, + "balance_loss_clip": 1.02654314, + "balance_loss_mlp": 1.0201633, + "epoch": 0.9147752893431534, + "flos": 24389127239040.0, + "grad_norm": 1.5994941747591418, + "language_loss": 0.62512696, + "learning_rate": 7.565747668956413e-08, + "loss": 0.64599568, + "num_input_tokens_seen": 328192115, + "step": 15215, + "time_per_iteration": 2.7257094383239746 + }, + { + "auxiliary_loss_clip": 0.01043141, + "auxiliary_loss_mlp": 0.01026258, + "balance_loss_clip": 1.03103995, + "balance_loss_mlp": 1.01527917, + "epoch": 0.9148354125958215, + "flos": 18150402648960.0, + "grad_norm": 2.119775088365902, + "language_loss": 0.76338905, + "learning_rate": 7.555140615567058e-08, + "loss": 0.78408301, + "num_input_tokens_seen": 328208990, + "step": 15216, + "time_per_iteration": 2.8515753746032715 + }, + { + "auxiliary_loss_clip": 0.01035561, + "auxiliary_loss_mlp": 0.01035397, + "balance_loss_clip": 1.02353239, + "balance_loss_mlp": 1.02363145, + "epoch": 0.9148955358484894, + "flos": 23367540528000.0, + "grad_norm": 2.325069825412257, + "language_loss": 0.68383944, + "learning_rate": 7.544540859706062e-08, + "loss": 0.70454901, + "num_input_tokens_seen": 328227840, + "step": 15217, + "time_per_iteration": 2.8227431774139404 + }, + { + "auxiliary_loss_clip": 0.01049183, + "auxiliary_loss_mlp": 0.01027389, + "balance_loss_clip": 1.02410173, + "balance_loss_mlp": 1.01741171, + "epoch": 0.9149556591011574, + "flos": 18076498416000.0, + "grad_norm": 1.7274917159140075, + "language_loss": 0.79979426, + "learning_rate": 7.533948401775347e-08, + "loss": 0.82055998, + "num_input_tokens_seen": 328246250, + "step": 15218, + "time_per_iteration": 2.749162435531616 + }, + { + "auxiliary_loss_clip": 0.00982123, + "auxiliary_loss_mlp": 0.01002682, + "balance_loss_clip": 1.00624204, + "balance_loss_mlp": 1.00181222, + "epoch": 0.9150157823538253, + "flos": 54586374825600.0, + "grad_norm": 0.8498940906165368, + "language_loss": 0.59226942, + "learning_rate": 7.523363242176595e-08, + "loss": 0.61211741, + "num_input_tokens_seen": 328303625, + "step": 15219, + "time_per_iteration": 3.3512649536132812 + }, + { + "auxiliary_loss_clip": 0.0104785, + "auxiliary_loss_mlp": 0.01032752, + "balance_loss_clip": 1.02302408, + "balance_loss_mlp": 1.02221942, + "epoch": 0.9150759056064933, + "flos": 17893102550400.0, + "grad_norm": 1.871842986148632, + "language_loss": 0.78697956, + "learning_rate": 7.512785381311216e-08, + "loss": 0.80778557, + "num_input_tokens_seen": 328322135, + "step": 15220, + "time_per_iteration": 2.7279059886932373 + }, + { + "auxiliary_loss_clip": 0.01017649, + "auxiliary_loss_mlp": 0.01039568, + "balance_loss_clip": 1.0225842, + "balance_loss_mlp": 1.02656817, + "epoch": 0.9151360288591612, + "flos": 18073517587200.0, + "grad_norm": 2.498114326403547, + "language_loss": 0.65772587, + "learning_rate": 7.50221481958031e-08, + "loss": 0.67829806, + "num_input_tokens_seen": 328340750, + "step": 15221, + "time_per_iteration": 2.6866652965545654 + }, + { + "auxiliary_loss_clip": 0.01042326, + "auxiliary_loss_mlp": 0.01027316, + "balance_loss_clip": 1.02459538, + "balance_loss_mlp": 1.01760077, + "epoch": 0.9151961521118293, + "flos": 19354523299200.0, + "grad_norm": 1.7929729557716383, + "language_loss": 0.8441515, + "learning_rate": 7.491651557384692e-08, + "loss": 0.8648479, + "num_input_tokens_seen": 328359995, + "step": 15222, + "time_per_iteration": 2.7127461433410645 + }, + { + "auxiliary_loss_clip": 0.00994551, + "auxiliary_loss_mlp": 0.00999101, + "balance_loss_clip": 1.00881648, + "balance_loss_mlp": 0.99817717, + "epoch": 0.9152562753644973, + "flos": 72146621018880.0, + "grad_norm": 0.7276817536572502, + "language_loss": 0.49611205, + "learning_rate": 7.481095595124953e-08, + "loss": 0.51604855, + "num_input_tokens_seen": 328426865, + "step": 15223, + "time_per_iteration": 5.108011484146118 + }, + { + "auxiliary_loss_clip": 0.01035034, + "auxiliary_loss_mlp": 0.01038774, + "balance_loss_clip": 1.02562702, + "balance_loss_mlp": 1.02756262, + "epoch": 0.9153163986171652, + "flos": 20777016683520.0, + "grad_norm": 1.8331913247547107, + "language_loss": 0.71952891, + "learning_rate": 7.470546933201349e-08, + "loss": 0.74026692, + "num_input_tokens_seen": 328445970, + "step": 15224, + "time_per_iteration": 2.8559489250183105 + }, + { + "auxiliary_loss_clip": 0.01050632, + "auxiliary_loss_mlp": 0.01025004, + "balance_loss_clip": 1.02475286, + "balance_loss_mlp": 1.01447225, + "epoch": 0.9153765218698332, + "flos": 23040107124480.0, + "grad_norm": 1.8962179858127741, + "language_loss": 0.80869126, + "learning_rate": 7.460005572013895e-08, + "loss": 0.82944763, + "num_input_tokens_seen": 328464585, + "step": 15225, + "time_per_iteration": 2.7401418685913086 + }, + { + "auxiliary_loss_clip": 0.01059812, + "auxiliary_loss_mlp": 0.01022793, + "balance_loss_clip": 1.02306843, + "balance_loss_mlp": 1.0131489, + "epoch": 0.9154366451225011, + "flos": 28990900293120.0, + "grad_norm": 1.4404016507991522, + "language_loss": 0.71330869, + "learning_rate": 7.44947151196238e-08, + "loss": 0.73413479, + "num_input_tokens_seen": 328490155, + "step": 15226, + "time_per_iteration": 2.6837246417999268 + }, + { + "auxiliary_loss_clip": 0.00991256, + "auxiliary_loss_mlp": 0.01030046, + "balance_loss_clip": 1.0226624, + "balance_loss_mlp": 1.01931691, + "epoch": 0.9154967683751691, + "flos": 22309504490880.0, + "grad_norm": 2.1592370606445166, + "language_loss": 0.74657917, + "learning_rate": 7.43894475344613e-08, + "loss": 0.76679218, + "num_input_tokens_seen": 328508275, + "step": 15227, + "time_per_iteration": 3.064589023590088 + }, + { + "auxiliary_loss_clip": 0.01039116, + "auxiliary_loss_mlp": 0.01028848, + "balance_loss_clip": 1.02400041, + "balance_loss_mlp": 1.01882243, + "epoch": 0.915556891627837, + "flos": 24571481610240.0, + "grad_norm": 1.5451523686452389, + "language_loss": 0.74069071, + "learning_rate": 7.428425296864404e-08, + "loss": 0.7613703, + "num_input_tokens_seen": 328529425, + "step": 15228, + "time_per_iteration": 3.2721569538116455 + }, + { + "auxiliary_loss_clip": 0.01028486, + "auxiliary_loss_mlp": 0.01031302, + "balance_loss_clip": 1.0220542, + "balance_loss_mlp": 1.02116299, + "epoch": 0.9156170148805051, + "flos": 22164676853760.0, + "grad_norm": 1.507791875332794, + "language_loss": 0.71986377, + "learning_rate": 7.417913142616106e-08, + "loss": 0.74046171, + "num_input_tokens_seen": 328550200, + "step": 15229, + "time_per_iteration": 2.740914821624756 + }, + { + "auxiliary_loss_clip": 0.01065004, + "auxiliary_loss_mlp": 0.01032576, + "balance_loss_clip": 1.02731955, + "balance_loss_mlp": 1.02169192, + "epoch": 0.915677138133173, + "flos": 20920659171840.0, + "grad_norm": 1.6805929071877814, + "language_loss": 0.83181882, + "learning_rate": 7.407408291099848e-08, + "loss": 0.85279465, + "num_input_tokens_seen": 328568540, + "step": 15230, + "time_per_iteration": 2.5895798206329346 + }, + { + "auxiliary_loss_clip": 0.01017516, + "auxiliary_loss_mlp": 0.01025177, + "balance_loss_clip": 1.02387047, + "balance_loss_mlp": 1.01521063, + "epoch": 0.915737261385841, + "flos": 24345136056960.0, + "grad_norm": 1.8324510146743902, + "language_loss": 0.83449304, + "learning_rate": 7.396910742713957e-08, + "loss": 0.85491997, + "num_input_tokens_seen": 328587300, + "step": 15231, + "time_per_iteration": 2.8422629833221436 + }, + { + "auxiliary_loss_clip": 0.01043195, + "auxiliary_loss_mlp": 0.0102438, + "balance_loss_clip": 1.02066708, + "balance_loss_mlp": 1.01412845, + "epoch": 0.9157973846385089, + "flos": 26761386090240.0, + "grad_norm": 1.5401042303898766, + "language_loss": 0.72303343, + "learning_rate": 7.386420497856516e-08, + "loss": 0.74370921, + "num_input_tokens_seen": 328610055, + "step": 15232, + "time_per_iteration": 2.691859722137451 + }, + { + "auxiliary_loss_clip": 0.01063287, + "auxiliary_loss_mlp": 0.01029859, + "balance_loss_clip": 1.02537751, + "balance_loss_mlp": 1.01935124, + "epoch": 0.9158575078911769, + "flos": 18478733892480.0, + "grad_norm": 2.115345844365963, + "language_loss": 0.67494988, + "learning_rate": 7.375937556925338e-08, + "loss": 0.69588137, + "num_input_tokens_seen": 328626815, + "step": 15233, + "time_per_iteration": 2.6184749603271484 + }, + { + "auxiliary_loss_clip": 0.01044963, + "auxiliary_loss_mlp": 0.01031961, + "balance_loss_clip": 1.02728415, + "balance_loss_mlp": 1.02092814, + "epoch": 0.9159176311438448, + "flos": 21798926616960.0, + "grad_norm": 1.9244103902345595, + "language_loss": 0.69806302, + "learning_rate": 7.365461920317861e-08, + "loss": 0.71883225, + "num_input_tokens_seen": 328643995, + "step": 15234, + "time_per_iteration": 4.52862811088562 + }, + { + "auxiliary_loss_clip": 0.01042503, + "auxiliary_loss_mlp": 0.01030011, + "balance_loss_clip": 1.02517223, + "balance_loss_mlp": 1.01950908, + "epoch": 0.9159777543965129, + "flos": 24783749032320.0, + "grad_norm": 1.7518053742466007, + "language_loss": 0.88324434, + "learning_rate": 7.354993588431391e-08, + "loss": 0.90396953, + "num_input_tokens_seen": 328659565, + "step": 15235, + "time_per_iteration": 2.6894311904907227 + }, + { + "auxiliary_loss_clip": 0.00999706, + "auxiliary_loss_mlp": 0.01034949, + "balance_loss_clip": 1.02187097, + "balance_loss_mlp": 1.02224743, + "epoch": 0.9160378776491809, + "flos": 26868758820480.0, + "grad_norm": 1.6029096958454383, + "language_loss": 0.77000833, + "learning_rate": 7.344532561662853e-08, + "loss": 0.79035485, + "num_input_tokens_seen": 328679045, + "step": 15236, + "time_per_iteration": 4.47669529914856 + }, + { + "auxiliary_loss_clip": 0.00966988, + "auxiliary_loss_mlp": 0.01003567, + "balance_loss_clip": 1.01031649, + "balance_loss_mlp": 1.00270319, + "epoch": 0.9160980009018488, + "flos": 70578222589440.0, + "grad_norm": 0.6774775031470399, + "language_loss": 0.62221789, + "learning_rate": 7.334078840409019e-08, + "loss": 0.64192355, + "num_input_tokens_seen": 328744565, + "step": 15237, + "time_per_iteration": 3.5486135482788086 + }, + { + "auxiliary_loss_clip": 0.01063788, + "auxiliary_loss_mlp": 0.0074748, + "balance_loss_clip": 1.02510977, + "balance_loss_mlp": 1.0003804, + "epoch": 0.9161581241545168, + "flos": 16289332202880.0, + "grad_norm": 2.7340539212195254, + "language_loss": 0.7490226, + "learning_rate": 7.323632425066151e-08, + "loss": 0.76713526, + "num_input_tokens_seen": 328762455, + "step": 15238, + "time_per_iteration": 3.005110263824463 + }, + { + "auxiliary_loss_clip": 0.01062368, + "auxiliary_loss_mlp": 0.01025652, + "balance_loss_clip": 1.02517796, + "balance_loss_mlp": 1.01536393, + "epoch": 0.9162182474071847, + "flos": 18438154502400.0, + "grad_norm": 1.8458788297231616, + "language_loss": 0.74791777, + "learning_rate": 7.313193316030464e-08, + "loss": 0.76879799, + "num_input_tokens_seen": 328780320, + "step": 15239, + "time_per_iteration": 2.5804805755615234 + }, + { + "auxiliary_loss_clip": 0.01030516, + "auxiliary_loss_mlp": 0.01029203, + "balance_loss_clip": 1.02330852, + "balance_loss_mlp": 1.01864147, + "epoch": 0.9162783706598527, + "flos": 19167248764800.0, + "grad_norm": 2.176753376180919, + "language_loss": 0.63298345, + "learning_rate": 7.302761513697819e-08, + "loss": 0.65358055, + "num_input_tokens_seen": 328797570, + "step": 15240, + "time_per_iteration": 2.6618847846984863 + }, + { + "auxiliary_loss_clip": 0.01039354, + "auxiliary_loss_mlp": 0.0074746, + "balance_loss_clip": 1.02480733, + "balance_loss_mlp": 1.00036919, + "epoch": 0.9163384939125206, + "flos": 20412990299520.0, + "grad_norm": 1.782713251107469, + "language_loss": 0.76249015, + "learning_rate": 7.292337018463746e-08, + "loss": 0.78035825, + "num_input_tokens_seen": 328814075, + "step": 15241, + "time_per_iteration": 2.6624438762664795 + }, + { + "auxiliary_loss_clip": 0.010569, + "auxiliary_loss_mlp": 0.01029072, + "balance_loss_clip": 1.02569675, + "balance_loss_mlp": 1.0169307, + "epoch": 0.9163986171651887, + "flos": 19645902426240.0, + "grad_norm": 2.182751684325131, + "language_loss": 0.67580664, + "learning_rate": 7.281919830723549e-08, + "loss": 0.69666636, + "num_input_tokens_seen": 328831990, + "step": 15242, + "time_per_iteration": 3.020141124725342 + }, + { + "auxiliary_loss_clip": 0.0104823, + "auxiliary_loss_mlp": 0.01028914, + "balance_loss_clip": 1.02246988, + "balance_loss_mlp": 1.01800084, + "epoch": 0.9164587404178566, + "flos": 12823054865280.0, + "grad_norm": 1.8385156377294212, + "language_loss": 0.80649924, + "learning_rate": 7.271509950872334e-08, + "loss": 0.82727075, + "num_input_tokens_seen": 328849105, + "step": 15243, + "time_per_iteration": 2.622504711151123 + }, + { + "auxiliary_loss_clip": 0.01033625, + "auxiliary_loss_mlp": 0.01025796, + "balance_loss_clip": 1.02065051, + "balance_loss_mlp": 1.01485872, + "epoch": 0.9165188636705246, + "flos": 22309396750080.0, + "grad_norm": 1.707056861767999, + "language_loss": 0.81941128, + "learning_rate": 7.261107379304721e-08, + "loss": 0.84000546, + "num_input_tokens_seen": 328866810, + "step": 15244, + "time_per_iteration": 2.9203760623931885 + }, + { + "auxiliary_loss_clip": 0.01065392, + "auxiliary_loss_mlp": 0.01031363, + "balance_loss_clip": 1.02566469, + "balance_loss_mlp": 1.02037215, + "epoch": 0.9165789869231925, + "flos": 18223337214720.0, + "grad_norm": 2.3526224871685035, + "language_loss": 0.72828388, + "learning_rate": 7.250712116415214e-08, + "loss": 0.74925143, + "num_input_tokens_seen": 328885325, + "step": 15245, + "time_per_iteration": 2.7102439403533936 + }, + { + "auxiliary_loss_clip": 0.01042347, + "auxiliary_loss_mlp": 0.01027615, + "balance_loss_clip": 1.02488625, + "balance_loss_mlp": 1.01758397, + "epoch": 0.9166391101758605, + "flos": 13691553811200.0, + "grad_norm": 1.6715428137497448, + "language_loss": 0.74708766, + "learning_rate": 7.240324162598033e-08, + "loss": 0.76778728, + "num_input_tokens_seen": 328902655, + "step": 15246, + "time_per_iteration": 2.663421154022217 + }, + { + "auxiliary_loss_clip": 0.01033688, + "auxiliary_loss_mlp": 0.01026607, + "balance_loss_clip": 1.02303231, + "balance_loss_mlp": 1.01556873, + "epoch": 0.9166992334285284, + "flos": 17346793622400.0, + "grad_norm": 2.0203887414605575, + "language_loss": 0.75251126, + "learning_rate": 7.229943518247106e-08, + "loss": 0.7731142, + "num_input_tokens_seen": 328918440, + "step": 15247, + "time_per_iteration": 2.707456350326538 + }, + { + "auxiliary_loss_clip": 0.01055239, + "auxiliary_loss_mlp": 0.01028119, + "balance_loss_clip": 1.02685285, + "balance_loss_mlp": 1.01645422, + "epoch": 0.9167593566811965, + "flos": 23731135948800.0, + "grad_norm": 1.67614590668299, + "language_loss": 0.75698543, + "learning_rate": 7.219570183756052e-08, + "loss": 0.77781898, + "num_input_tokens_seen": 328938055, + "step": 15248, + "time_per_iteration": 2.679805278778076 + }, + { + "auxiliary_loss_clip": 0.01048424, + "auxiliary_loss_mlp": 0.01035296, + "balance_loss_clip": 1.02240932, + "balance_loss_mlp": 1.02382207, + "epoch": 0.9168194799338644, + "flos": 27818201064960.0, + "grad_norm": 2.244201344474767, + "language_loss": 0.72645795, + "learning_rate": 7.209204159518178e-08, + "loss": 0.74729514, + "num_input_tokens_seen": 328957895, + "step": 15249, + "time_per_iteration": 2.7432446479797363 + }, + { + "auxiliary_loss_clip": 0.01016093, + "auxiliary_loss_mlp": 0.01027588, + "balance_loss_clip": 1.02279902, + "balance_loss_mlp": 1.01631713, + "epoch": 0.9168796031865324, + "flos": 21717552355200.0, + "grad_norm": 2.079534638264103, + "language_loss": 0.76070273, + "learning_rate": 7.198845445926616e-08, + "loss": 0.78113949, + "num_input_tokens_seen": 328971365, + "step": 15250, + "time_per_iteration": 2.8058621883392334 + }, + { + "auxiliary_loss_clip": 0.01020158, + "auxiliary_loss_mlp": 0.01025583, + "balance_loss_clip": 1.02190053, + "balance_loss_mlp": 1.01501477, + "epoch": 0.9169397264392004, + "flos": 23404420817280.0, + "grad_norm": 1.8664991095522279, + "language_loss": 0.75704598, + "learning_rate": 7.188494043374138e-08, + "loss": 0.77750343, + "num_input_tokens_seen": 328990830, + "step": 15251, + "time_per_iteration": 2.7894837856292725 + }, + { + "auxiliary_loss_clip": 0.01037761, + "auxiliary_loss_mlp": 0.01032102, + "balance_loss_clip": 1.02545834, + "balance_loss_mlp": 1.01988292, + "epoch": 0.9169998496918683, + "flos": 23950981140480.0, + "grad_norm": 2.317296767432572, + "language_loss": 0.80205214, + "learning_rate": 7.178149952253298e-08, + "loss": 0.82275081, + "num_input_tokens_seen": 329008345, + "step": 15252, + "time_per_iteration": 2.7217907905578613 + }, + { + "auxiliary_loss_clip": 0.01061498, + "auxiliary_loss_mlp": 0.01031781, + "balance_loss_clip": 1.02336311, + "balance_loss_mlp": 1.02151716, + "epoch": 0.9170599729445363, + "flos": 18332469711360.0, + "grad_norm": 1.6440337015951036, + "language_loss": 0.77054846, + "learning_rate": 7.167813172956316e-08, + "loss": 0.79148126, + "num_input_tokens_seen": 329027440, + "step": 15253, + "time_per_iteration": 5.150804042816162 + }, + { + "auxiliary_loss_clip": 0.01054571, + "auxiliary_loss_mlp": 0.0102514, + "balance_loss_clip": 1.02628243, + "balance_loss_mlp": 1.01483464, + "epoch": 0.9171200961972042, + "flos": 22674859678080.0, + "grad_norm": 2.9347976996734206, + "language_loss": 0.73033369, + "learning_rate": 7.157483705875256e-08, + "loss": 0.75113082, + "num_input_tokens_seen": 329046445, + "step": 15254, + "time_per_iteration": 2.779448986053467 + }, + { + "auxiliary_loss_clip": 0.01032212, + "auxiliary_loss_mlp": 0.0102563, + "balance_loss_clip": 1.02515888, + "balance_loss_mlp": 1.01555085, + "epoch": 0.9171802194498723, + "flos": 26719298328960.0, + "grad_norm": 1.7457174620239522, + "language_loss": 0.79305655, + "learning_rate": 7.14716155140167e-08, + "loss": 0.81363499, + "num_input_tokens_seen": 329065555, + "step": 15255, + "time_per_iteration": 2.848825693130493 + }, + { + "auxiliary_loss_clip": 0.01054322, + "auxiliary_loss_mlp": 0.01029164, + "balance_loss_clip": 1.02546716, + "balance_loss_mlp": 1.01859629, + "epoch": 0.9172403427025402, + "flos": 37889240538240.0, + "grad_norm": 1.7973200994735639, + "language_loss": 0.68462205, + "learning_rate": 7.136846709927047e-08, + "loss": 0.70545691, + "num_input_tokens_seen": 329087515, + "step": 15256, + "time_per_iteration": 2.9073195457458496 + }, + { + "auxiliary_loss_clip": 0.01044128, + "auxiliary_loss_mlp": 0.01034154, + "balance_loss_clip": 1.02256763, + "balance_loss_mlp": 1.02302539, + "epoch": 0.9173004659552082, + "flos": 17055163100160.0, + "grad_norm": 2.310178965063389, + "language_loss": 0.83879864, + "learning_rate": 7.126539181842561e-08, + "loss": 0.85958147, + "num_input_tokens_seen": 329106820, + "step": 15257, + "time_per_iteration": 2.666153907775879 + }, + { + "auxiliary_loss_clip": 0.01031927, + "auxiliary_loss_mlp": 0.01030345, + "balance_loss_clip": 1.02025855, + "balance_loss_mlp": 1.02065277, + "epoch": 0.9173605892078761, + "flos": 22201593056640.0, + "grad_norm": 1.6138086176701734, + "language_loss": 0.77430069, + "learning_rate": 7.116238967539012e-08, + "loss": 0.79492342, + "num_input_tokens_seen": 329126515, + "step": 15258, + "time_per_iteration": 2.6800363063812256 + }, + { + "auxiliary_loss_clip": 0.01046539, + "auxiliary_loss_mlp": 0.01031737, + "balance_loss_clip": 1.02667117, + "balance_loss_mlp": 1.02100253, + "epoch": 0.9174207124605441, + "flos": 16507776764160.0, + "grad_norm": 1.8574191923007821, + "language_loss": 0.78624737, + "learning_rate": 7.105946067406999e-08, + "loss": 0.8070302, + "num_input_tokens_seen": 329142660, + "step": 15259, + "time_per_iteration": 2.63702654838562 + }, + { + "auxiliary_loss_clip": 0.01015088, + "auxiliary_loss_mlp": 0.0103231, + "balance_loss_clip": 1.02183533, + "balance_loss_mlp": 1.02280354, + "epoch": 0.917480835713212, + "flos": 24535606901760.0, + "grad_norm": 1.550711124755336, + "language_loss": 0.7610184, + "learning_rate": 7.095660481836895e-08, + "loss": 0.78149235, + "num_input_tokens_seen": 329162575, + "step": 15260, + "time_per_iteration": 2.7838170528411865 + }, + { + "auxiliary_loss_clip": 0.01020481, + "auxiliary_loss_mlp": 0.01027295, + "balance_loss_clip": 1.02327108, + "balance_loss_mlp": 1.01691175, + "epoch": 0.9175409589658801, + "flos": 20880726226560.0, + "grad_norm": 1.5173482859432115, + "language_loss": 0.60963529, + "learning_rate": 7.085382211218637e-08, + "loss": 0.63011307, + "num_input_tokens_seen": 329182090, + "step": 15261, + "time_per_iteration": 2.7883098125457764 + }, + { + "auxiliary_loss_clip": 0.01035404, + "auxiliary_loss_mlp": 0.01028487, + "balance_loss_clip": 1.02187967, + "balance_loss_mlp": 1.018682, + "epoch": 0.917601082218548, + "flos": 14276035918080.0, + "grad_norm": 1.7749533776605284, + "language_loss": 0.73828173, + "learning_rate": 7.075111255942002e-08, + "loss": 0.75892067, + "num_input_tokens_seen": 329196535, + "step": 15262, + "time_per_iteration": 2.6164417266845703 + }, + { + "auxiliary_loss_clip": 0.01061977, + "auxiliary_loss_mlp": 0.01030248, + "balance_loss_clip": 1.02333915, + "balance_loss_mlp": 1.01955462, + "epoch": 0.917661205471216, + "flos": 19099234362240.0, + "grad_norm": 1.7247347028487607, + "language_loss": 0.77504343, + "learning_rate": 7.064847616396496e-08, + "loss": 0.79596567, + "num_input_tokens_seen": 329215135, + "step": 15263, + "time_per_iteration": 2.596015214920044 + }, + { + "auxiliary_loss_clip": 0.01063381, + "auxiliary_loss_mlp": 0.0102882, + "balance_loss_clip": 1.02446222, + "balance_loss_mlp": 1.0183121, + "epoch": 0.917721328723884, + "flos": 21106568989440.0, + "grad_norm": 1.860660206373528, + "language_loss": 0.75909358, + "learning_rate": 7.054591292971324e-08, + "loss": 0.78001559, + "num_input_tokens_seen": 329235150, + "step": 15264, + "time_per_iteration": 2.665703773498535 + }, + { + "auxiliary_loss_clip": 0.01041257, + "auxiliary_loss_mlp": 0.01030579, + "balance_loss_clip": 1.02480841, + "balance_loss_mlp": 1.02061915, + "epoch": 0.9177814519765519, + "flos": 21943215550080.0, + "grad_norm": 1.9685727083865352, + "language_loss": 0.83405668, + "learning_rate": 7.044342286055394e-08, + "loss": 0.85477495, + "num_input_tokens_seen": 329254365, + "step": 15265, + "time_per_iteration": 2.8547937870025635 + }, + { + "auxiliary_loss_clip": 0.01066352, + "auxiliary_loss_mlp": 0.01034714, + "balance_loss_clip": 1.02644515, + "balance_loss_mlp": 1.02295423, + "epoch": 0.9178415752292199, + "flos": 24205982768640.0, + "grad_norm": 1.645328115388597, + "language_loss": 0.73179281, + "learning_rate": 7.034100596037306e-08, + "loss": 0.75280344, + "num_input_tokens_seen": 329274385, + "step": 15266, + "time_per_iteration": 2.653564929962158 + }, + { + "auxiliary_loss_clip": 0.01061334, + "auxiliary_loss_mlp": 0.01027833, + "balance_loss_clip": 1.02473629, + "balance_loss_mlp": 1.01806426, + "epoch": 0.9179016984818879, + "flos": 20042068504320.0, + "grad_norm": 1.5410790690732137, + "language_loss": 0.77952826, + "learning_rate": 7.023866223305486e-08, + "loss": 0.80041993, + "num_input_tokens_seen": 329292160, + "step": 15267, + "time_per_iteration": 2.77227783203125 + }, + { + "auxiliary_loss_clip": 0.00999537, + "auxiliary_loss_mlp": 0.00746613, + "balance_loss_clip": 1.00394154, + "balance_loss_mlp": 1.00057721, + "epoch": 0.9179618217345559, + "flos": 65555901100800.0, + "grad_norm": 0.7374173006688751, + "language_loss": 0.56248897, + "learning_rate": 7.013639168247975e-08, + "loss": 0.57995045, + "num_input_tokens_seen": 329351870, + "step": 15268, + "time_per_iteration": 3.275315761566162 + }, + { + "auxiliary_loss_clip": 0.01064553, + "auxiliary_loss_mlp": 0.0074749, + "balance_loss_clip": 1.02601314, + "balance_loss_mlp": 1.00042272, + "epoch": 0.9180219449872238, + "flos": 21324618501120.0, + "grad_norm": 1.7215154507925698, + "language_loss": 0.76216894, + "learning_rate": 7.0034194312526e-08, + "loss": 0.78028941, + "num_input_tokens_seen": 329370930, + "step": 15269, + "time_per_iteration": 2.698707342147827 + }, + { + "auxiliary_loss_clip": 0.01029546, + "auxiliary_loss_mlp": 0.01030799, + "balance_loss_clip": 1.02177811, + "balance_loss_mlp": 1.02012932, + "epoch": 0.9180820682398918, + "flos": 41060008684800.0, + "grad_norm": 4.157179188468113, + "language_loss": 0.72585177, + "learning_rate": 6.993207012706936e-08, + "loss": 0.74645531, + "num_input_tokens_seen": 329391275, + "step": 15270, + "time_per_iteration": 2.8501522541046143 + }, + { + "auxiliary_loss_clip": 0.01059888, + "auxiliary_loss_mlp": 0.01026186, + "balance_loss_clip": 1.02351177, + "balance_loss_mlp": 1.01591599, + "epoch": 0.9181421914925597, + "flos": 28072915384320.0, + "grad_norm": 1.4854215432979643, + "language_loss": 0.79623055, + "learning_rate": 6.98300191299821e-08, + "loss": 0.81709135, + "num_input_tokens_seen": 329412775, + "step": 15271, + "time_per_iteration": 4.377681255340576 + }, + { + "auxiliary_loss_clip": 0.01019414, + "auxiliary_loss_mlp": 0.01030725, + "balance_loss_clip": 1.02017868, + "balance_loss_mlp": 1.01978779, + "epoch": 0.9182023147452277, + "flos": 29169411909120.0, + "grad_norm": 1.8500706194162582, + "language_loss": 0.72529697, + "learning_rate": 6.972804132513355e-08, + "loss": 0.74579835, + "num_input_tokens_seen": 329432440, + "step": 15272, + "time_per_iteration": 2.7298390865325928 + }, + { + "auxiliary_loss_clip": 0.01031307, + "auxiliary_loss_mlp": 0.01027841, + "balance_loss_clip": 1.02348542, + "balance_loss_mlp": 1.01854253, + "epoch": 0.9182624379978956, + "flos": 24060831909120.0, + "grad_norm": 1.8922644783012939, + "language_loss": 0.72851193, + "learning_rate": 6.962613671639105e-08, + "loss": 0.74910343, + "num_input_tokens_seen": 329450605, + "step": 15273, + "time_per_iteration": 2.7845797538757324 + }, + { + "auxiliary_loss_clip": 0.01025625, + "auxiliary_loss_mlp": 0.01024719, + "balance_loss_clip": 1.02215767, + "balance_loss_mlp": 1.01559913, + "epoch": 0.9183225612505637, + "flos": 23293528554240.0, + "grad_norm": 1.488369072087557, + "language_loss": 0.74184924, + "learning_rate": 6.952430530761933e-08, + "loss": 0.76235271, + "num_input_tokens_seen": 329470550, + "step": 15274, + "time_per_iteration": 2.7813456058502197 + }, + { + "auxiliary_loss_clip": 0.01045898, + "auxiliary_loss_mlp": 0.01033604, + "balance_loss_clip": 1.02135611, + "balance_loss_mlp": 1.02375102, + "epoch": 0.9183826845032316, + "flos": 19609237618560.0, + "grad_norm": 1.4062276778306562, + "language_loss": 0.68732536, + "learning_rate": 6.942254710267902e-08, + "loss": 0.70812035, + "num_input_tokens_seen": 329489765, + "step": 15275, + "time_per_iteration": 2.748852491378784 + }, + { + "auxiliary_loss_clip": 0.01049576, + "auxiliary_loss_mlp": 0.01029585, + "balance_loss_clip": 1.0231055, + "balance_loss_mlp": 1.01958895, + "epoch": 0.9184428077558996, + "flos": 18479057114880.0, + "grad_norm": 1.9976895693712098, + "language_loss": 0.72405708, + "learning_rate": 6.932086210542953e-08, + "loss": 0.74484867, + "num_input_tokens_seen": 329507040, + "step": 15276, + "time_per_iteration": 2.6371569633483887 + }, + { + "auxiliary_loss_clip": 0.01040345, + "auxiliary_loss_mlp": 0.01028456, + "balance_loss_clip": 1.02386963, + "balance_loss_mlp": 1.01854324, + "epoch": 0.9185029310085676, + "flos": 20741034234240.0, + "grad_norm": 1.5855629517090348, + "language_loss": 0.734496, + "learning_rate": 6.921925031972642e-08, + "loss": 0.75518394, + "num_input_tokens_seen": 329525540, + "step": 15277, + "time_per_iteration": 2.705552101135254 + }, + { + "auxiliary_loss_clip": 0.00978525, + "auxiliary_loss_mlp": 0.01009227, + "balance_loss_clip": 1.00286508, + "balance_loss_mlp": 1.00824344, + "epoch": 0.9185630542612355, + "flos": 68209231875840.0, + "grad_norm": 0.7123555274682539, + "language_loss": 0.59243423, + "learning_rate": 6.91177117494226e-08, + "loss": 0.61231178, + "num_input_tokens_seen": 329592905, + "step": 15278, + "time_per_iteration": 3.537891149520874 + }, + { + "auxiliary_loss_clip": 0.01025517, + "auxiliary_loss_mlp": 0.01021596, + "balance_loss_clip": 1.02112186, + "balance_loss_mlp": 1.01284623, + "epoch": 0.9186231775139035, + "flos": 12239470598400.0, + "grad_norm": 1.5577285815883204, + "language_loss": 0.64300036, + "learning_rate": 6.901624639836879e-08, + "loss": 0.66347152, + "num_input_tokens_seen": 329610150, + "step": 15279, + "time_per_iteration": 2.786600351333618 + }, + { + "auxiliary_loss_clip": 0.01006531, + "auxiliary_loss_mlp": 0.00746509, + "balance_loss_clip": 1.00150323, + "balance_loss_mlp": 1.00043726, + "epoch": 0.9186833007665715, + "flos": 63939237770880.0, + "grad_norm": 0.8433425371674257, + "language_loss": 0.60195589, + "learning_rate": 6.891485427041211e-08, + "loss": 0.61948633, + "num_input_tokens_seen": 329673650, + "step": 15280, + "time_per_iteration": 3.171227216720581 + }, + { + "auxiliary_loss_clip": 0.010424, + "auxiliary_loss_mlp": 0.01031442, + "balance_loss_clip": 1.02401817, + "balance_loss_mlp": 1.0208149, + "epoch": 0.9187434240192395, + "flos": 19974700546560.0, + "grad_norm": 1.7692524315218283, + "language_loss": 0.69726056, + "learning_rate": 6.881353536939815e-08, + "loss": 0.71799898, + "num_input_tokens_seen": 329692520, + "step": 15281, + "time_per_iteration": 4.408520936965942 + }, + { + "auxiliary_loss_clip": 0.01042451, + "auxiliary_loss_mlp": 0.01026486, + "balance_loss_clip": 1.02498484, + "balance_loss_mlp": 1.01522708, + "epoch": 0.9188035472719074, + "flos": 25227820874880.0, + "grad_norm": 1.6361116153657205, + "language_loss": 0.84802639, + "learning_rate": 6.871228969916831e-08, + "loss": 0.86871576, + "num_input_tokens_seen": 329713750, + "step": 15282, + "time_per_iteration": 2.7670111656188965 + }, + { + "auxiliary_loss_clip": 0.01037568, + "auxiliary_loss_mlp": 0.01030576, + "balance_loss_clip": 1.02261996, + "balance_loss_mlp": 1.02010393, + "epoch": 0.9188636705245754, + "flos": 18405547931520.0, + "grad_norm": 1.9868644845225103, + "language_loss": 0.5992924, + "learning_rate": 6.861111726356194e-08, + "loss": 0.6199739, + "num_input_tokens_seen": 329730960, + "step": 15283, + "time_per_iteration": 2.618873357772827 + }, + { + "auxiliary_loss_clip": 0.01056007, + "auxiliary_loss_mlp": 0.0074765, + "balance_loss_clip": 1.02641964, + "balance_loss_mlp": 1.00036621, + "epoch": 0.9189237937772433, + "flos": 23769129559680.0, + "grad_norm": 1.474481763536535, + "language_loss": 0.64931691, + "learning_rate": 6.851001806641554e-08, + "loss": 0.66735339, + "num_input_tokens_seen": 329750975, + "step": 15284, + "time_per_iteration": 4.232694149017334 + }, + { + "auxiliary_loss_clip": 0.01060558, + "auxiliary_loss_mlp": 0.01029897, + "balance_loss_clip": 1.0239296, + "balance_loss_mlp": 1.01900697, + "epoch": 0.9189839170299113, + "flos": 21214624078080.0, + "grad_norm": 2.6415806591259052, + "language_loss": 0.7352128, + "learning_rate": 6.840899211156292e-08, + "loss": 0.75611734, + "num_input_tokens_seen": 329769645, + "step": 15285, + "time_per_iteration": 2.6003832817077637 + }, + { + "auxiliary_loss_clip": 0.0106052, + "auxiliary_loss_mlp": 0.01028548, + "balance_loss_clip": 1.02416611, + "balance_loss_mlp": 1.01790857, + "epoch": 0.9190440402825792, + "flos": 16727370560640.0, + "grad_norm": 1.7811907863514216, + "language_loss": 0.71627975, + "learning_rate": 6.830803940283458e-08, + "loss": 0.7371704, + "num_input_tokens_seen": 329788185, + "step": 15286, + "time_per_iteration": 2.568686008453369 + }, + { + "auxiliary_loss_clip": 0.01062936, + "auxiliary_loss_mlp": 0.01027356, + "balance_loss_clip": 1.02535892, + "balance_loss_mlp": 1.01655507, + "epoch": 0.9191041635352473, + "flos": 23441193365760.0, + "grad_norm": 2.4031998661872565, + "language_loss": 0.73582649, + "learning_rate": 6.820715994405945e-08, + "loss": 0.75672936, + "num_input_tokens_seen": 329806780, + "step": 15287, + "time_per_iteration": 2.600628137588501 + }, + { + "auxiliary_loss_clip": 0.01064558, + "auxiliary_loss_mlp": 0.01027305, + "balance_loss_clip": 1.02690172, + "balance_loss_mlp": 1.01623082, + "epoch": 0.9191642867879152, + "flos": 18807532012800.0, + "grad_norm": 2.095747490056827, + "language_loss": 0.65316844, + "learning_rate": 6.810635373906226e-08, + "loss": 0.67408705, + "num_input_tokens_seen": 329826350, + "step": 15288, + "time_per_iteration": 2.70373797416687 + }, + { + "auxiliary_loss_clip": 0.0106656, + "auxiliary_loss_mlp": 0.01031319, + "balance_loss_clip": 1.028543, + "balance_loss_mlp": 1.02109718, + "epoch": 0.9192244100405832, + "flos": 32160950167680.0, + "grad_norm": 2.038501393798214, + "language_loss": 0.70971274, + "learning_rate": 6.800562079166549e-08, + "loss": 0.73069155, + "num_input_tokens_seen": 329846160, + "step": 15289, + "time_per_iteration": 2.683497428894043 + }, + { + "auxiliary_loss_clip": 0.01018582, + "auxiliary_loss_mlp": 0.01031323, + "balance_loss_clip": 1.02226806, + "balance_loss_mlp": 1.0198487, + "epoch": 0.9192845332932512, + "flos": 16357669827840.0, + "grad_norm": 2.102037910326136, + "language_loss": 0.74748456, + "learning_rate": 6.790496110568921e-08, + "loss": 0.76798368, + "num_input_tokens_seen": 329862020, + "step": 15290, + "time_per_iteration": 2.672583818435669 + }, + { + "auxiliary_loss_clip": 0.01022436, + "auxiliary_loss_mlp": 0.0102437, + "balance_loss_clip": 1.02521706, + "balance_loss_mlp": 1.01460028, + "epoch": 0.9193446565459191, + "flos": 26614475464320.0, + "grad_norm": 1.8937904955083087, + "language_loss": 0.71905941, + "learning_rate": 6.78043746849506e-08, + "loss": 0.73952746, + "num_input_tokens_seen": 329880185, + "step": 15291, + "time_per_iteration": 2.8308298587799072 + }, + { + "auxiliary_loss_clip": 0.01035902, + "auxiliary_loss_mlp": 0.01024585, + "balance_loss_clip": 1.02263474, + "balance_loss_mlp": 1.01460099, + "epoch": 0.9194047797985871, + "flos": 22492182084480.0, + "grad_norm": 1.799854746189516, + "language_loss": 0.71106821, + "learning_rate": 6.770386153326346e-08, + "loss": 0.73167312, + "num_input_tokens_seen": 329900255, + "step": 15292, + "time_per_iteration": 2.7786192893981934 + }, + { + "auxiliary_loss_clip": 0.01043167, + "auxiliary_loss_mlp": 0.01026519, + "balance_loss_clip": 1.02519369, + "balance_loss_mlp": 1.01621914, + "epoch": 0.9194649030512551, + "flos": 25078791346560.0, + "grad_norm": 2.6640996704071425, + "language_loss": 0.72927552, + "learning_rate": 6.760342165443988e-08, + "loss": 0.7499724, + "num_input_tokens_seen": 329919095, + "step": 15293, + "time_per_iteration": 2.6760456562042236 + }, + { + "auxiliary_loss_clip": 0.01060505, + "auxiliary_loss_mlp": 0.01023414, + "balance_loss_clip": 1.0245651, + "balance_loss_mlp": 1.01335323, + "epoch": 0.9195250263039231, + "flos": 11911139354880.0, + "grad_norm": 1.8616389611339796, + "language_loss": 0.77725017, + "learning_rate": 6.750305505228837e-08, + "loss": 0.79808939, + "num_input_tokens_seen": 329936505, + "step": 15294, + "time_per_iteration": 2.6240367889404297 + }, + { + "auxiliary_loss_clip": 0.01038761, + "auxiliary_loss_mlp": 0.01030883, + "balance_loss_clip": 1.0234859, + "balance_loss_mlp": 1.0191468, + "epoch": 0.919585149556591, + "flos": 21834154880640.0, + "grad_norm": 1.5476187903174636, + "language_loss": 0.77285826, + "learning_rate": 6.74027617306141e-08, + "loss": 0.79355472, + "num_input_tokens_seen": 329956795, + "step": 15295, + "time_per_iteration": 2.701754093170166 + }, + { + "auxiliary_loss_clip": 0.0105997, + "auxiliary_loss_mlp": 0.01023069, + "balance_loss_clip": 1.02481961, + "balance_loss_mlp": 1.01428974, + "epoch": 0.919645272809259, + "flos": 28184059042560.0, + "grad_norm": 2.153190236386243, + "language_loss": 0.71411854, + "learning_rate": 6.730254169322114e-08, + "loss": 0.73494887, + "num_input_tokens_seen": 329977195, + "step": 15296, + "time_per_iteration": 2.6534953117370605 + }, + { + "auxiliary_loss_clip": 0.01063495, + "auxiliary_loss_mlp": 0.01035661, + "balance_loss_clip": 1.02638054, + "balance_loss_mlp": 1.02518284, + "epoch": 0.9197053960619269, + "flos": 18332828847360.0, + "grad_norm": 2.0154733132633607, + "language_loss": 0.7507636, + "learning_rate": 6.720239494390912e-08, + "loss": 0.7717551, + "num_input_tokens_seen": 329992095, + "step": 15297, + "time_per_iteration": 2.644843101501465 + }, + { + "auxiliary_loss_clip": 0.01052402, + "auxiliary_loss_mlp": 0.0074772, + "balance_loss_clip": 1.02529848, + "balance_loss_mlp": 1.00041842, + "epoch": 0.9197655193145949, + "flos": 28183448511360.0, + "grad_norm": 1.6651226863081898, + "language_loss": 0.73774737, + "learning_rate": 6.710232148647676e-08, + "loss": 0.75574857, + "num_input_tokens_seen": 330011490, + "step": 15298, + "time_per_iteration": 2.6684048175811768 + }, + { + "auxiliary_loss_clip": 0.01043567, + "auxiliary_loss_mlp": 0.0103152, + "balance_loss_clip": 1.02632928, + "balance_loss_mlp": 1.020612, + "epoch": 0.9198256425672628, + "flos": 17306321973120.0, + "grad_norm": 2.231887582517929, + "language_loss": 0.79300129, + "learning_rate": 6.70023213247175e-08, + "loss": 0.81375217, + "num_input_tokens_seen": 330027885, + "step": 15299, + "time_per_iteration": 2.678133010864258 + }, + { + "auxiliary_loss_clip": 0.01033502, + "auxiliary_loss_mlp": 0.01021848, + "balance_loss_clip": 1.02505672, + "balance_loss_mlp": 1.01206088, + "epoch": 0.9198857658199309, + "flos": 17858520731520.0, + "grad_norm": 2.0635212165662438, + "language_loss": 0.63853526, + "learning_rate": 6.690239446242385e-08, + "loss": 0.65908873, + "num_input_tokens_seen": 330046230, + "step": 15300, + "time_per_iteration": 4.591938734054565 + }, + { + "auxiliary_loss_clip": 0.01038915, + "auxiliary_loss_mlp": 0.00747387, + "balance_loss_clip": 1.02525222, + "balance_loss_mlp": 1.00032306, + "epoch": 0.9199458890725988, + "flos": 22127545169280.0, + "grad_norm": 1.9361752544485014, + "language_loss": 0.69649601, + "learning_rate": 6.680254090338545e-08, + "loss": 0.71435905, + "num_input_tokens_seen": 330065535, + "step": 15301, + "time_per_iteration": 2.6234278678894043 + }, + { + "auxiliary_loss_clip": 0.01047977, + "auxiliary_loss_mlp": 0.01035786, + "balance_loss_clip": 1.02369714, + "balance_loss_mlp": 1.02298903, + "epoch": 0.9200060123252668, + "flos": 16034043265920.0, + "grad_norm": 1.8076599851835216, + "language_loss": 0.70781136, + "learning_rate": 6.670276065138814e-08, + "loss": 0.72864902, + "num_input_tokens_seen": 330082920, + "step": 15302, + "time_per_iteration": 2.649214506149292 + }, + { + "auxiliary_loss_clip": 0.01061318, + "auxiliary_loss_mlp": 0.0102965, + "balance_loss_clip": 1.02400279, + "balance_loss_mlp": 1.01970184, + "epoch": 0.9200661355779348, + "flos": 26864521015680.0, + "grad_norm": 1.9717907865125683, + "language_loss": 0.76300728, + "learning_rate": 6.660305371021579e-08, + "loss": 0.78391695, + "num_input_tokens_seen": 330101165, + "step": 15303, + "time_per_iteration": 2.6827189922332764 + }, + { + "auxiliary_loss_clip": 0.01041189, + "auxiliary_loss_mlp": 0.01031932, + "balance_loss_clip": 1.02482688, + "balance_loss_mlp": 1.02172756, + "epoch": 0.9201262588306027, + "flos": 12786749193600.0, + "grad_norm": 3.954867324563947, + "language_loss": 0.87771297, + "learning_rate": 6.650342008365006e-08, + "loss": 0.89844418, + "num_input_tokens_seen": 330118775, + "step": 15304, + "time_per_iteration": 2.664344310760498 + }, + { + "auxiliary_loss_clip": 0.01004381, + "auxiliary_loss_mlp": 0.01033628, + "balance_loss_clip": 1.02236891, + "balance_loss_mlp": 1.02024722, + "epoch": 0.9201863820832707, + "flos": 20631614428800.0, + "grad_norm": 2.8842669975899993, + "language_loss": 0.77176607, + "learning_rate": 6.64038597754677e-08, + "loss": 0.79214621, + "num_input_tokens_seen": 330135570, + "step": 15305, + "time_per_iteration": 2.773378849029541 + }, + { + "auxiliary_loss_clip": 0.01042018, + "auxiliary_loss_mlp": 0.01029176, + "balance_loss_clip": 1.02304137, + "balance_loss_mlp": 1.01882231, + "epoch": 0.9202465053359387, + "flos": 26395815421440.0, + "grad_norm": 2.319343156314655, + "language_loss": 0.81612021, + "learning_rate": 6.630437278944501e-08, + "loss": 0.83683217, + "num_input_tokens_seen": 330152840, + "step": 15306, + "time_per_iteration": 2.695679187774658 + }, + { + "auxiliary_loss_clip": 0.01027723, + "auxiliary_loss_mlp": 0.01027928, + "balance_loss_clip": 1.02357769, + "balance_loss_mlp": 1.01864147, + "epoch": 0.9203066285886067, + "flos": 10488179093760.0, + "grad_norm": 1.9139337835853045, + "language_loss": 0.72002602, + "learning_rate": 6.62049591293541e-08, + "loss": 0.74058259, + "num_input_tokens_seen": 330168605, + "step": 15307, + "time_per_iteration": 2.6875741481781006 + }, + { + "auxiliary_loss_clip": 0.01055617, + "auxiliary_loss_mlp": 0.01027891, + "balance_loss_clip": 1.02600384, + "balance_loss_mlp": 1.0171082, + "epoch": 0.9203667518412746, + "flos": 19390721230080.0, + "grad_norm": 1.8598298905199993, + "language_loss": 0.78792405, + "learning_rate": 6.610561879896526e-08, + "loss": 0.80875909, + "num_input_tokens_seen": 330186160, + "step": 15308, + "time_per_iteration": 2.6092700958251953 + }, + { + "auxiliary_loss_clip": 0.01038117, + "auxiliary_loss_mlp": 0.01027309, + "balance_loss_clip": 1.02226162, + "balance_loss_mlp": 1.01691437, + "epoch": 0.9204268750939426, + "flos": 15924982596480.0, + "grad_norm": 1.8494436035499222, + "language_loss": 0.77673197, + "learning_rate": 6.600635180204484e-08, + "loss": 0.79738623, + "num_input_tokens_seen": 330201780, + "step": 15309, + "time_per_iteration": 2.720721960067749 + }, + { + "auxiliary_loss_clip": 0.01003203, + "auxiliary_loss_mlp": 0.01025419, + "balance_loss_clip": 1.02002931, + "balance_loss_mlp": 1.01463079, + "epoch": 0.9204869983466105, + "flos": 16471758401280.0, + "grad_norm": 1.7704978428031093, + "language_loss": 0.66415453, + "learning_rate": 6.590715814235781e-08, + "loss": 0.68444073, + "num_input_tokens_seen": 330219165, + "step": 15310, + "time_per_iteration": 2.6910948753356934 + }, + { + "auxiliary_loss_clip": 0.0100539, + "auxiliary_loss_mlp": 0.01029024, + "balance_loss_clip": 1.02151382, + "balance_loss_mlp": 1.01800895, + "epoch": 0.9205471215992785, + "flos": 21539220307200.0, + "grad_norm": 1.775524040872597, + "language_loss": 0.66491842, + "learning_rate": 6.580803782366495e-08, + "loss": 0.6852625, + "num_input_tokens_seen": 330238975, + "step": 15311, + "time_per_iteration": 2.896355152130127 + }, + { + "auxiliary_loss_clip": 0.01052726, + "auxiliary_loss_mlp": 0.01030882, + "balance_loss_clip": 1.02478933, + "balance_loss_mlp": 1.02055883, + "epoch": 0.9206072448519464, + "flos": 25005892694400.0, + "grad_norm": 1.7267018628558444, + "language_loss": 0.75883448, + "learning_rate": 6.570899084972503e-08, + "loss": 0.77967054, + "num_input_tokens_seen": 330259755, + "step": 15312, + "time_per_iteration": 2.672276735305786 + }, + { + "auxiliary_loss_clip": 0.01050325, + "auxiliary_loss_mlp": 0.01029773, + "balance_loss_clip": 1.02492857, + "balance_loss_mlp": 1.01984906, + "epoch": 0.9206673681046145, + "flos": 20522661500160.0, + "grad_norm": 1.8715088069934942, + "language_loss": 0.79653275, + "learning_rate": 6.561001722429394e-08, + "loss": 0.8173337, + "num_input_tokens_seen": 330277660, + "step": 15313, + "time_per_iteration": 2.6855521202087402 + }, + { + "auxiliary_loss_clip": 0.01051582, + "auxiliary_loss_mlp": 0.01029195, + "balance_loss_clip": 1.02386796, + "balance_loss_mlp": 1.01893103, + "epoch": 0.9207274913572824, + "flos": 20883455660160.0, + "grad_norm": 1.944362044448653, + "language_loss": 0.7827574, + "learning_rate": 6.55111169511251e-08, + "loss": 0.8035652, + "num_input_tokens_seen": 330295455, + "step": 15314, + "time_per_iteration": 2.6731982231140137 + }, + { + "auxiliary_loss_clip": 0.01046425, + "auxiliary_loss_mlp": 0.01032806, + "balance_loss_clip": 1.02654135, + "balance_loss_mlp": 1.02099216, + "epoch": 0.9207876146099504, + "flos": 22708256348160.0, + "grad_norm": 2.237561649424735, + "language_loss": 0.78913796, + "learning_rate": 6.541229003396864e-08, + "loss": 0.80993032, + "num_input_tokens_seen": 330315310, + "step": 15315, + "time_per_iteration": 2.8205838203430176 + }, + { + "auxiliary_loss_clip": 0.01045413, + "auxiliary_loss_mlp": 0.01031253, + "balance_loss_clip": 1.0254209, + "balance_loss_mlp": 1.02029157, + "epoch": 0.9208477378626184, + "flos": 18507354053760.0, + "grad_norm": 7.555478343239604, + "language_loss": 0.76047021, + "learning_rate": 6.531353647657156e-08, + "loss": 0.78123689, + "num_input_tokens_seen": 330333260, + "step": 15316, + "time_per_iteration": 2.837667942047119 + }, + { + "auxiliary_loss_clip": 0.01061713, + "auxiliary_loss_mlp": 0.01030811, + "balance_loss_clip": 1.02402937, + "balance_loss_mlp": 1.01993966, + "epoch": 0.9209078611152863, + "flos": 22999635475200.0, + "grad_norm": 1.8846202877252372, + "language_loss": 0.69206774, + "learning_rate": 6.521485628267931e-08, + "loss": 0.71299303, + "num_input_tokens_seen": 330352465, + "step": 15317, + "time_per_iteration": 2.630612373352051 + }, + { + "auxiliary_loss_clip": 0.0105262, + "auxiliary_loss_mlp": 0.01028937, + "balance_loss_clip": 1.02551746, + "balance_loss_mlp": 1.01844716, + "epoch": 0.9209679843679544, + "flos": 24061514267520.0, + "grad_norm": 1.7699528963285158, + "language_loss": 0.83544719, + "learning_rate": 6.511624945603378e-08, + "loss": 0.85626274, + "num_input_tokens_seen": 330372685, + "step": 15318, + "time_per_iteration": 4.3817832469940186 + }, + { + "auxiliary_loss_clip": 0.01042305, + "auxiliary_loss_mlp": 0.01031702, + "balance_loss_clip": 1.02534819, + "balance_loss_mlp": 1.02134323, + "epoch": 0.9210281076206223, + "flos": 13553370190080.0, + "grad_norm": 2.155536330065068, + "language_loss": 0.859056, + "learning_rate": 6.501771600037354e-08, + "loss": 0.87979615, + "num_input_tokens_seen": 330388860, + "step": 15319, + "time_per_iteration": 2.5515329837799072 + }, + { + "auxiliary_loss_clip": 0.01005564, + "auxiliary_loss_mlp": 0.01002607, + "balance_loss_clip": 1.00053358, + "balance_loss_mlp": 1.0017848, + "epoch": 0.9210882308732903, + "flos": 71426289674880.0, + "grad_norm": 0.7725847854079223, + "language_loss": 0.56166065, + "learning_rate": 6.491925591943559e-08, + "loss": 0.58174241, + "num_input_tokens_seen": 330448735, + "step": 15320, + "time_per_iteration": 3.1999154090881348 + }, + { + "auxiliary_loss_clip": 0.01019717, + "auxiliary_loss_mlp": 0.01038975, + "balance_loss_clip": 1.02506471, + "balance_loss_mlp": 1.02623701, + "epoch": 0.9211483541259582, + "flos": 18509113820160.0, + "grad_norm": 2.035177185583856, + "language_loss": 0.64072728, + "learning_rate": 6.482086921695384e-08, + "loss": 0.66131413, + "num_input_tokens_seen": 330465600, + "step": 15321, + "time_per_iteration": 2.6984164714813232 + }, + { + "auxiliary_loss_clip": 0.01026142, + "auxiliary_loss_mlp": 0.01026994, + "balance_loss_clip": 1.02305758, + "balance_loss_mlp": 1.01693904, + "epoch": 0.9212084773786262, + "flos": 23258228463360.0, + "grad_norm": 1.4031422036496612, + "language_loss": 0.7156623, + "learning_rate": 6.47225558966582e-08, + "loss": 0.73619366, + "num_input_tokens_seen": 330485770, + "step": 15322, + "time_per_iteration": 2.890596389770508 + }, + { + "auxiliary_loss_clip": 0.01022621, + "auxiliary_loss_mlp": 0.01028245, + "balance_loss_clip": 1.02495003, + "balance_loss_mlp": 1.01865494, + "epoch": 0.9212686006312941, + "flos": 16289511770880.0, + "grad_norm": 1.7016529341114552, + "language_loss": 0.70231259, + "learning_rate": 6.462431596227725e-08, + "loss": 0.7228213, + "num_input_tokens_seen": 330504255, + "step": 15323, + "time_per_iteration": 2.7017362117767334 + }, + { + "auxiliary_loss_clip": 0.01035334, + "auxiliary_loss_mlp": 0.01031856, + "balance_loss_clip": 1.02225137, + "balance_loss_mlp": 1.01972055, + "epoch": 0.9213287238839621, + "flos": 19785773986560.0, + "grad_norm": 1.7712072247352986, + "language_loss": 0.74620461, + "learning_rate": 6.452614941753597e-08, + "loss": 0.76687658, + "num_input_tokens_seen": 330520705, + "step": 15324, + "time_per_iteration": 2.69333815574646 + }, + { + "auxiliary_loss_clip": 0.01053258, + "auxiliary_loss_mlp": 0.01035792, + "balance_loss_clip": 1.02550793, + "balance_loss_mlp": 1.02511692, + "epoch": 0.92138884713663, + "flos": 21030402199680.0, + "grad_norm": 2.5365896813740245, + "language_loss": 0.71543932, + "learning_rate": 6.442805626615744e-08, + "loss": 0.73632985, + "num_input_tokens_seen": 330539245, + "step": 15325, + "time_per_iteration": 2.6672892570495605 + }, + { + "auxiliary_loss_clip": 0.01042989, + "auxiliary_loss_mlp": 0.01028443, + "balance_loss_clip": 1.02594554, + "balance_loss_mlp": 1.01806593, + "epoch": 0.9214489703892981, + "flos": 28587264186240.0, + "grad_norm": 1.505304296191775, + "language_loss": 0.78443247, + "learning_rate": 6.433003651186109e-08, + "loss": 0.80514675, + "num_input_tokens_seen": 330561815, + "step": 15326, + "time_per_iteration": 2.74981951713562 + }, + { + "auxiliary_loss_clip": 0.0105361, + "auxiliary_loss_mlp": 0.01033155, + "balance_loss_clip": 1.02535856, + "balance_loss_mlp": 1.02248585, + "epoch": 0.921509093641966, + "flos": 16361476669440.0, + "grad_norm": 2.698063662202882, + "language_loss": 0.71622086, + "learning_rate": 6.42320901583635e-08, + "loss": 0.7370885, + "num_input_tokens_seen": 330579760, + "step": 15327, + "time_per_iteration": 2.64286470413208 + }, + { + "auxiliary_loss_clip": 0.01057381, + "auxiliary_loss_mlp": 0.01039489, + "balance_loss_clip": 1.02813482, + "balance_loss_mlp": 1.02790761, + "epoch": 0.921569216894634, + "flos": 26830837036800.0, + "grad_norm": 1.795174834830277, + "language_loss": 0.77920836, + "learning_rate": 6.413421720937906e-08, + "loss": 0.8001771, + "num_input_tokens_seen": 330598545, + "step": 15328, + "time_per_iteration": 2.6729907989501953 + }, + { + "auxiliary_loss_clip": 0.01044142, + "auxiliary_loss_mlp": 0.0102663, + "balance_loss_clip": 1.02675104, + "balance_loss_mlp": 1.01670623, + "epoch": 0.921629340147302, + "flos": 24645134448000.0, + "grad_norm": 2.778357707996684, + "language_loss": 0.71349305, + "learning_rate": 6.4036417668619e-08, + "loss": 0.73420072, + "num_input_tokens_seen": 330616700, + "step": 15329, + "time_per_iteration": 4.29954195022583 + }, + { + "auxiliary_loss_clip": 0.01050383, + "auxiliary_loss_mlp": 0.01022554, + "balance_loss_clip": 1.02392042, + "balance_loss_mlp": 1.01317191, + "epoch": 0.9216894633999699, + "flos": 15086504442240.0, + "grad_norm": 2.042115861974427, + "language_loss": 0.86915326, + "learning_rate": 6.393869153979192e-08, + "loss": 0.88988268, + "num_input_tokens_seen": 330633355, + "step": 15330, + "time_per_iteration": 2.6064627170562744 + }, + { + "auxiliary_loss_clip": 0.01021133, + "auxiliary_loss_mlp": 0.01028228, + "balance_loss_clip": 1.02006412, + "balance_loss_mlp": 1.01774979, + "epoch": 0.921749586652638, + "flos": 19204524103680.0, + "grad_norm": 2.2286202290377686, + "language_loss": 0.75612009, + "learning_rate": 6.384103882660397e-08, + "loss": 0.77661371, + "num_input_tokens_seen": 330651470, + "step": 15331, + "time_per_iteration": 4.350120306015015 + }, + { + "auxiliary_loss_clip": 0.01051942, + "auxiliary_loss_mlp": 0.01022851, + "balance_loss_clip": 1.02464283, + "balance_loss_mlp": 1.0131768, + "epoch": 0.9218097099053059, + "flos": 20522446018560.0, + "grad_norm": 1.7304389778365472, + "language_loss": 0.75337565, + "learning_rate": 6.374345953275794e-08, + "loss": 0.77412355, + "num_input_tokens_seen": 330669170, + "step": 15332, + "time_per_iteration": 2.7400312423706055 + }, + { + "auxiliary_loss_clip": 0.01008302, + "auxiliary_loss_mlp": 0.01030239, + "balance_loss_clip": 1.02109814, + "balance_loss_mlp": 1.02033257, + "epoch": 0.9218698331579739, + "flos": 17348625216000.0, + "grad_norm": 2.113223421937637, + "language_loss": 0.74943411, + "learning_rate": 6.364595366195358e-08, + "loss": 0.7698195, + "num_input_tokens_seen": 330686635, + "step": 15333, + "time_per_iteration": 2.709763288497925 + }, + { + "auxiliary_loss_clip": 0.00999208, + "auxiliary_loss_mlp": 0.01000991, + "balance_loss_clip": 1.00358224, + "balance_loss_mlp": 0.99995977, + "epoch": 0.9219299564106418, + "flos": 61958332575360.0, + "grad_norm": 0.8394485577031559, + "language_loss": 0.52917922, + "learning_rate": 6.354852121788879e-08, + "loss": 0.54918122, + "num_input_tokens_seen": 330749160, + "step": 15334, + "time_per_iteration": 3.2096190452575684 + }, + { + "auxiliary_loss_clip": 0.01038262, + "auxiliary_loss_mlp": 0.01028587, + "balance_loss_clip": 1.02444828, + "balance_loss_mlp": 1.0187577, + "epoch": 0.9219900796633098, + "flos": 15701761526400.0, + "grad_norm": 1.7488147654172492, + "language_loss": 0.62565708, + "learning_rate": 6.345116220425839e-08, + "loss": 0.64632559, + "num_input_tokens_seen": 330766840, + "step": 15335, + "time_per_iteration": 2.6816303730010986 + }, + { + "auxiliary_loss_clip": 0.01001122, + "auxiliary_loss_mlp": 0.01029271, + "balance_loss_clip": 1.01933742, + "balance_loss_mlp": 1.0184052, + "epoch": 0.9220502029159777, + "flos": 24932670819840.0, + "grad_norm": 1.6019502016483576, + "language_loss": 0.71503019, + "learning_rate": 6.335387662475366e-08, + "loss": 0.73533416, + "num_input_tokens_seen": 330785585, + "step": 15336, + "time_per_iteration": 2.8795926570892334 + }, + { + "auxiliary_loss_clip": 0.01038286, + "auxiliary_loss_mlp": 0.0102961, + "balance_loss_clip": 1.02336919, + "balance_loss_mlp": 1.02048492, + "epoch": 0.9221103261686457, + "flos": 15667215621120.0, + "grad_norm": 1.8911261203361527, + "language_loss": 0.71651208, + "learning_rate": 6.325666448306433e-08, + "loss": 0.73719102, + "num_input_tokens_seen": 330800750, + "step": 15337, + "time_per_iteration": 2.8274130821228027 + }, + { + "auxiliary_loss_clip": 0.00998731, + "auxiliary_loss_mlp": 0.00999379, + "balance_loss_clip": 1.00279725, + "balance_loss_mlp": 0.99847281, + "epoch": 0.9221704494213137, + "flos": 67516299630720.0, + "grad_norm": 0.8849244913707791, + "language_loss": 0.65315878, + "learning_rate": 6.31595257828763e-08, + "loss": 0.67313987, + "num_input_tokens_seen": 330863640, + "step": 15338, + "time_per_iteration": 3.2057321071624756 + }, + { + "auxiliary_loss_clip": 0.0105593, + "auxiliary_loss_mlp": 0.01028821, + "balance_loss_clip": 1.02702975, + "balance_loss_mlp": 1.01771033, + "epoch": 0.9222305726739817, + "flos": 30226945155840.0, + "grad_norm": 3.0531208590236947, + "language_loss": 0.66912663, + "learning_rate": 6.306246052787289e-08, + "loss": 0.68997419, + "num_input_tokens_seen": 330884675, + "step": 15339, + "time_per_iteration": 2.716050863265991 + }, + { + "auxiliary_loss_clip": 0.01063328, + "auxiliary_loss_mlp": 0.01027018, + "balance_loss_clip": 1.02614498, + "balance_loss_mlp": 1.01699281, + "epoch": 0.9222906959266496, + "flos": 25337204766720.0, + "grad_norm": 1.831920909265448, + "language_loss": 0.71726686, + "learning_rate": 6.296546872173513e-08, + "loss": 0.73817033, + "num_input_tokens_seen": 330904125, + "step": 15340, + "time_per_iteration": 2.6460561752319336 + }, + { + "auxiliary_loss_clip": 0.01026956, + "auxiliary_loss_mlp": 0.01029649, + "balance_loss_clip": 1.02198458, + "balance_loss_mlp": 1.01946306, + "epoch": 0.9223508191793176, + "flos": 27599864244480.0, + "grad_norm": 1.5284762474464961, + "language_loss": 0.70057052, + "learning_rate": 6.286855036814098e-08, + "loss": 0.72113657, + "num_input_tokens_seen": 330925140, + "step": 15341, + "time_per_iteration": 2.7492005825042725 + }, + { + "auxiliary_loss_clip": 0.010197, + "auxiliary_loss_mlp": 0.01026367, + "balance_loss_clip": 1.02507985, + "balance_loss_mlp": 1.01728892, + "epoch": 0.9224109424319856, + "flos": 27307587277440.0, + "grad_norm": 1.5131937925930312, + "language_loss": 0.67275226, + "learning_rate": 6.277170547076571e-08, + "loss": 0.69321293, + "num_input_tokens_seen": 330946625, + "step": 15342, + "time_per_iteration": 2.7817721366882324 + }, + { + "auxiliary_loss_clip": 0.01025842, + "auxiliary_loss_mlp": 0.01032552, + "balance_loss_clip": 1.02748036, + "balance_loss_mlp": 1.02266419, + "epoch": 0.9224710656846535, + "flos": 48208314401280.0, + "grad_norm": 1.9652035461027026, + "language_loss": 0.69557077, + "learning_rate": 6.26749340332815e-08, + "loss": 0.71615469, + "num_input_tokens_seen": 330967795, + "step": 15343, + "time_per_iteration": 2.9645869731903076 + }, + { + "auxiliary_loss_clip": 0.00989644, + "auxiliary_loss_mlp": 0.01000257, + "balance_loss_clip": 1.00368881, + "balance_loss_mlp": 0.99951476, + "epoch": 0.9225311889373216, + "flos": 66722171794560.0, + "grad_norm": 0.7311276220919685, + "language_loss": 0.52052116, + "learning_rate": 6.257823605935786e-08, + "loss": 0.54042017, + "num_input_tokens_seen": 331040850, + "step": 15344, + "time_per_iteration": 3.411154270172119 + }, + { + "auxiliary_loss_clip": 0.01056712, + "auxiliary_loss_mlp": 0.01028974, + "balance_loss_clip": 1.02343631, + "balance_loss_mlp": 1.01980674, + "epoch": 0.9225913121899895, + "flos": 22271295398400.0, + "grad_norm": 1.7061329181797624, + "language_loss": 0.70517516, + "learning_rate": 6.248161155266162e-08, + "loss": 0.72603208, + "num_input_tokens_seen": 331060595, + "step": 15345, + "time_per_iteration": 2.5739970207214355 + }, + { + "auxiliary_loss_clip": 0.01041093, + "auxiliary_loss_mlp": 0.01032724, + "balance_loss_clip": 1.02342546, + "balance_loss_mlp": 1.02260911, + "epoch": 0.9226514354426575, + "flos": 20082719721600.0, + "grad_norm": 1.5979549145652758, + "language_loss": 0.7763024, + "learning_rate": 6.238506051685677e-08, + "loss": 0.79704058, + "num_input_tokens_seen": 331080195, + "step": 15346, + "time_per_iteration": 2.7359459400177 + }, + { + "auxiliary_loss_clip": 0.01046827, + "auxiliary_loss_mlp": 0.01033206, + "balance_loss_clip": 1.02690065, + "balance_loss_mlp": 1.02186918, + "epoch": 0.9227115586953254, + "flos": 16070851728000.0, + "grad_norm": 2.000886026778612, + "language_loss": 0.76358622, + "learning_rate": 6.228858295560457e-08, + "loss": 0.78438652, + "num_input_tokens_seen": 331097645, + "step": 15347, + "time_per_iteration": 2.5972604751586914 + }, + { + "auxiliary_loss_clip": 0.01049019, + "auxiliary_loss_mlp": 0.01027335, + "balance_loss_clip": 1.02546406, + "balance_loss_mlp": 1.01832843, + "epoch": 0.9227716819479934, + "flos": 20446027833600.0, + "grad_norm": 1.4546266295328985, + "language_loss": 0.76820922, + "learning_rate": 6.219217887256367e-08, + "loss": 0.7889728, + "num_input_tokens_seen": 331116830, + "step": 15348, + "time_per_iteration": 4.538194179534912 + }, + { + "auxiliary_loss_clip": 0.01041094, + "auxiliary_loss_mlp": 0.0103093, + "balance_loss_clip": 1.02343166, + "balance_loss_mlp": 1.0199331, + "epoch": 0.9228318052006613, + "flos": 25007401065600.0, + "grad_norm": 1.6297557762948138, + "language_loss": 0.67581964, + "learning_rate": 6.209584827138959e-08, + "loss": 0.69653988, + "num_input_tokens_seen": 331137235, + "step": 15349, + "time_per_iteration": 2.7562241554260254 + }, + { + "auxiliary_loss_clip": 0.01017791, + "auxiliary_loss_mlp": 0.01023789, + "balance_loss_clip": 1.02279341, + "balance_loss_mlp": 1.01362085, + "epoch": 0.9228919284533293, + "flos": 12677257560960.0, + "grad_norm": 2.3247966985465944, + "language_loss": 0.87288558, + "learning_rate": 6.199959115573495e-08, + "loss": 0.89330137, + "num_input_tokens_seen": 331153155, + "step": 15350, + "time_per_iteration": 2.7145495414733887 + }, + { + "auxiliary_loss_clip": 0.0098892, + "auxiliary_loss_mlp": 0.01001274, + "balance_loss_clip": 1.00315332, + "balance_loss_mlp": 1.00049877, + "epoch": 0.9229520517059973, + "flos": 69986162712960.0, + "grad_norm": 0.766820780908874, + "language_loss": 0.60372812, + "learning_rate": 6.190340752924994e-08, + "loss": 0.62363005, + "num_input_tokens_seen": 331214895, + "step": 15351, + "time_per_iteration": 3.227008104324341 + }, + { + "auxiliary_loss_clip": 0.01040905, + "auxiliary_loss_mlp": 0.01027487, + "balance_loss_clip": 1.02274704, + "balance_loss_mlp": 1.01766372, + "epoch": 0.9230121749586653, + "flos": 14793832425600.0, + "grad_norm": 2.5953800004259717, + "language_loss": 0.77545685, + "learning_rate": 6.180729739558233e-08, + "loss": 0.79614079, + "num_input_tokens_seen": 331232185, + "step": 15352, + "time_per_iteration": 2.7215209007263184 + }, + { + "auxiliary_loss_clip": 0.01026318, + "auxiliary_loss_mlp": 0.01041267, + "balance_loss_clip": 1.02147341, + "balance_loss_mlp": 1.02873254, + "epoch": 0.9230722982113332, + "flos": 22967208472320.0, + "grad_norm": 1.98589038974172, + "language_loss": 0.59941554, + "learning_rate": 6.171126075837585e-08, + "loss": 0.62009144, + "num_input_tokens_seen": 331251065, + "step": 15353, + "time_per_iteration": 2.8076016902923584 + }, + { + "auxiliary_loss_clip": 0.01034098, + "auxiliary_loss_mlp": 0.01028264, + "balance_loss_clip": 1.02239466, + "balance_loss_mlp": 1.01836944, + "epoch": 0.9231324214640012, + "flos": 18551452976640.0, + "grad_norm": 1.685056777507483, + "language_loss": 0.74790144, + "learning_rate": 6.161529762127293e-08, + "loss": 0.76852512, + "num_input_tokens_seen": 331269110, + "step": 15354, + "time_per_iteration": 2.7644145488739014 + }, + { + "auxiliary_loss_clip": 0.01064754, + "auxiliary_loss_mlp": 0.01031274, + "balance_loss_clip": 1.02513885, + "balance_loss_mlp": 1.01971054, + "epoch": 0.9231925447166691, + "flos": 22082727974400.0, + "grad_norm": 1.838957140832521, + "language_loss": 0.64861691, + "learning_rate": 6.1519407987912e-08, + "loss": 0.66957724, + "num_input_tokens_seen": 331286555, + "step": 15355, + "time_per_iteration": 2.626829147338867 + }, + { + "auxiliary_loss_clip": 0.01035015, + "auxiliary_loss_mlp": 0.01036881, + "balance_loss_clip": 1.02317715, + "balance_loss_mlp": 1.02571726, + "epoch": 0.9232526679693371, + "flos": 26541145848960.0, + "grad_norm": 1.4756620407723167, + "language_loss": 0.74233752, + "learning_rate": 6.142359186192947e-08, + "loss": 0.76305652, + "num_input_tokens_seen": 331307660, + "step": 15356, + "time_per_iteration": 2.8117575645446777 + }, + { + "auxiliary_loss_clip": 0.01035714, + "auxiliary_loss_mlp": 0.01026058, + "balance_loss_clip": 1.02289462, + "balance_loss_mlp": 1.01542497, + "epoch": 0.9233127912220052, + "flos": 14756664827520.0, + "grad_norm": 1.7369453211793764, + "language_loss": 0.61250663, + "learning_rate": 6.132784924695844e-08, + "loss": 0.63312429, + "num_input_tokens_seen": 331324885, + "step": 15357, + "time_per_iteration": 2.669219732284546 + }, + { + "auxiliary_loss_clip": 0.01035056, + "auxiliary_loss_mlp": 0.01031398, + "balance_loss_clip": 1.0243156, + "balance_loss_mlp": 1.02039528, + "epoch": 0.9233729144746731, + "flos": 25261792162560.0, + "grad_norm": 1.6651908516846945, + "language_loss": 0.70134568, + "learning_rate": 6.123218014662956e-08, + "loss": 0.7220102, + "num_input_tokens_seen": 331345885, + "step": 15358, + "time_per_iteration": 2.8131821155548096 + }, + { + "auxiliary_loss_clip": 0.01062341, + "auxiliary_loss_mlp": 0.01030902, + "balance_loss_clip": 1.02522099, + "balance_loss_mlp": 1.02076316, + "epoch": 0.9234330377273411, + "flos": 27849837968640.0, + "grad_norm": 2.07877419377843, + "language_loss": 0.7324723, + "learning_rate": 6.113658456457104e-08, + "loss": 0.75340474, + "num_input_tokens_seen": 331364320, + "step": 15359, + "time_per_iteration": 2.63859486579895 + }, + { + "auxiliary_loss_clip": 0.01005103, + "auxiliary_loss_mlp": 0.01030993, + "balance_loss_clip": 1.02593684, + "balance_loss_mlp": 1.0209552, + "epoch": 0.923493160980009, + "flos": 24608361899520.0, + "grad_norm": 1.7981635983913684, + "language_loss": 0.65071154, + "learning_rate": 6.104106250440732e-08, + "loss": 0.67107248, + "num_input_tokens_seen": 331384135, + "step": 15360, + "time_per_iteration": 2.923997402191162 + }, + { + "auxiliary_loss_clip": 0.00997001, + "auxiliary_loss_mlp": 0.007465, + "balance_loss_clip": 1.00165582, + "balance_loss_mlp": 1.00039899, + "epoch": 0.923553284232677, + "flos": 67700916558720.0, + "grad_norm": 0.7775563521442852, + "language_loss": 0.5512414, + "learning_rate": 6.094561396976083e-08, + "loss": 0.56867647, + "num_input_tokens_seen": 331440645, + "step": 15361, + "time_per_iteration": 3.586014986038208 + }, + { + "auxiliary_loss_clip": 0.01025486, + "auxiliary_loss_mlp": 0.01027045, + "balance_loss_clip": 1.02115893, + "balance_loss_mlp": 1.01580954, + "epoch": 0.9236134074853449, + "flos": 18807244704000.0, + "grad_norm": 1.6429524272710498, + "language_loss": 0.69832504, + "learning_rate": 6.085023896425112e-08, + "loss": 0.71885037, + "num_input_tokens_seen": 331459580, + "step": 15362, + "time_per_iteration": 2.774296998977661 + }, + { + "auxiliary_loss_clip": 0.0104534, + "auxiliary_loss_mlp": 0.01028353, + "balance_loss_clip": 1.02291203, + "balance_loss_mlp": 1.01590133, + "epoch": 0.923673530738013, + "flos": 27782362270080.0, + "grad_norm": 2.8048465746765627, + "language_loss": 0.75769752, + "learning_rate": 6.075493749149463e-08, + "loss": 0.77843446, + "num_input_tokens_seen": 331481560, + "step": 15363, + "time_per_iteration": 2.6631600856781006 + }, + { + "auxiliary_loss_clip": 0.0106233, + "auxiliary_loss_mlp": 0.01028144, + "balance_loss_clip": 1.02516353, + "balance_loss_mlp": 1.01811266, + "epoch": 0.9237336539906809, + "flos": 26797117144320.0, + "grad_norm": 1.869698753157936, + "language_loss": 0.83476782, + "learning_rate": 6.065970955510514e-08, + "loss": 0.85567254, + "num_input_tokens_seen": 331499090, + "step": 15364, + "time_per_iteration": 2.6909759044647217 + }, + { + "auxiliary_loss_clip": 0.01030389, + "auxiliary_loss_mlp": 0.01026494, + "balance_loss_clip": 1.02442205, + "balance_loss_mlp": 1.01712441, + "epoch": 0.9237937772433489, + "flos": 23587708942080.0, + "grad_norm": 1.5613024904317034, + "language_loss": 0.68073952, + "learning_rate": 6.056455515869419e-08, + "loss": 0.70130837, + "num_input_tokens_seen": 331519420, + "step": 15365, + "time_per_iteration": 4.729899168014526 + }, + { + "auxiliary_loss_clip": 0.01061776, + "auxiliary_loss_mlp": 0.01025117, + "balance_loss_clip": 1.02437019, + "balance_loss_mlp": 1.01488256, + "epoch": 0.9238539004960168, + "flos": 26140562398080.0, + "grad_norm": 2.125954008936762, + "language_loss": 0.62878799, + "learning_rate": 6.046947430586913e-08, + "loss": 0.64965689, + "num_input_tokens_seen": 331538720, + "step": 15366, + "time_per_iteration": 2.643273115158081 + }, + { + "auxiliary_loss_clip": 0.01033251, + "auxiliary_loss_mlp": 0.01027148, + "balance_loss_clip": 1.02609801, + "balance_loss_mlp": 1.01634753, + "epoch": 0.9239140237486848, + "flos": 21068000760960.0, + "grad_norm": 1.4221664256869146, + "language_loss": 0.74511361, + "learning_rate": 6.037446700023619e-08, + "loss": 0.76571757, + "num_input_tokens_seen": 331558505, + "step": 15367, + "time_per_iteration": 2.7634291648864746 + }, + { + "auxiliary_loss_clip": 0.01040036, + "auxiliary_loss_mlp": 0.00747447, + "balance_loss_clip": 1.02502298, + "balance_loss_mlp": 1.00037718, + "epoch": 0.9239741470013527, + "flos": 24607930936320.0, + "grad_norm": 2.460534883250827, + "language_loss": 0.64875674, + "learning_rate": 6.027953324539759e-08, + "loss": 0.66663152, + "num_input_tokens_seen": 331578440, + "step": 15368, + "time_per_iteration": 2.972182512283325 + }, + { + "auxiliary_loss_clip": 0.01052618, + "auxiliary_loss_mlp": 0.01030671, + "balance_loss_clip": 1.02435398, + "balance_loss_mlp": 1.01983452, + "epoch": 0.9240342702540207, + "flos": 24718248581760.0, + "grad_norm": 1.708510446196811, + "language_loss": 0.7463221, + "learning_rate": 6.018467304495401e-08, + "loss": 0.76715499, + "num_input_tokens_seen": 331598945, + "step": 15369, + "time_per_iteration": 2.6568503379821777 + }, + { + "auxiliary_loss_clip": 0.01057583, + "auxiliary_loss_mlp": 0.01035762, + "balance_loss_clip": 1.02773428, + "balance_loss_mlp": 1.02390671, + "epoch": 0.9240943935066888, + "flos": 20849987162880.0, + "grad_norm": 1.8856967165781056, + "language_loss": 0.76064831, + "learning_rate": 6.008988640250145e-08, + "loss": 0.78158176, + "num_input_tokens_seen": 331616700, + "step": 15370, + "time_per_iteration": 2.702261209487915 + }, + { + "auxiliary_loss_clip": 0.01061976, + "auxiliary_loss_mlp": 0.01029586, + "balance_loss_clip": 1.02476859, + "balance_loss_mlp": 1.01912558, + "epoch": 0.9241545167593567, + "flos": 24462313200000.0, + "grad_norm": 2.4023134692090506, + "language_loss": 0.67375982, + "learning_rate": 5.999517332163528e-08, + "loss": 0.69467545, + "num_input_tokens_seen": 331635625, + "step": 15371, + "time_per_iteration": 2.669323444366455 + }, + { + "auxiliary_loss_clip": 0.00987887, + "auxiliary_loss_mlp": 0.01000509, + "balance_loss_clip": 1.0025022, + "balance_loss_mlp": 0.99967498, + "epoch": 0.9242146400120247, + "flos": 61827259847040.0, + "grad_norm": 0.7282948411122974, + "language_loss": 0.5774098, + "learning_rate": 5.99005338059464e-08, + "loss": 0.59729379, + "num_input_tokens_seen": 331698595, + "step": 15372, + "time_per_iteration": 3.2136952877044678 + }, + { + "auxiliary_loss_clip": 0.01062532, + "auxiliary_loss_mlp": 0.0102842, + "balance_loss_clip": 1.02713227, + "balance_loss_mlp": 1.01921141, + "epoch": 0.9242747632646926, + "flos": 22048397550720.0, + "grad_norm": 1.786951576501965, + "language_loss": 0.69627261, + "learning_rate": 5.98059678590237e-08, + "loss": 0.71718216, + "num_input_tokens_seen": 331717975, + "step": 15373, + "time_per_iteration": 2.6426937580108643 + }, + { + "auxiliary_loss_clip": 0.01047073, + "auxiliary_loss_mlp": 0.01033354, + "balance_loss_clip": 1.02463078, + "balance_loss_mlp": 1.02280951, + "epoch": 0.9243348865173606, + "flos": 18478338842880.0, + "grad_norm": 2.292456282179126, + "language_loss": 0.75637853, + "learning_rate": 5.971147548445299e-08, + "loss": 0.77718276, + "num_input_tokens_seen": 331737220, + "step": 15374, + "time_per_iteration": 2.731816291809082 + }, + { + "auxiliary_loss_clip": 0.01023982, + "auxiliary_loss_mlp": 0.01031263, + "balance_loss_clip": 1.02356791, + "balance_loss_mlp": 1.02080858, + "epoch": 0.9243950097700285, + "flos": 23258767167360.0, + "grad_norm": 1.582252168384595, + "language_loss": 0.64661008, + "learning_rate": 5.961705668581784e-08, + "loss": 0.66716254, + "num_input_tokens_seen": 331757300, + "step": 15375, + "time_per_iteration": 2.7187983989715576 + }, + { + "auxiliary_loss_clip": 0.01043606, + "auxiliary_loss_mlp": 0.01028979, + "balance_loss_clip": 1.02745819, + "balance_loss_mlp": 1.01832151, + "epoch": 0.9244551330226966, + "flos": 29749081593600.0, + "grad_norm": 2.2725150908925467, + "language_loss": 0.66584027, + "learning_rate": 5.952271146669829e-08, + "loss": 0.68656611, + "num_input_tokens_seen": 331776995, + "step": 15376, + "time_per_iteration": 4.5945868492126465 + }, + { + "auxiliary_loss_clip": 0.01005496, + "auxiliary_loss_mlp": 0.00999953, + "balance_loss_clip": 1.00045753, + "balance_loss_mlp": 0.99914199, + "epoch": 0.9245152562753645, + "flos": 68864960609280.0, + "grad_norm": 0.6540049759241426, + "language_loss": 0.61163938, + "learning_rate": 5.94284398306717e-08, + "loss": 0.63169384, + "num_input_tokens_seen": 331845015, + "step": 15377, + "time_per_iteration": 3.2310993671417236 + }, + { + "auxiliary_loss_clip": 0.01022376, + "auxiliary_loss_mlp": 0.01030875, + "balance_loss_clip": 1.02201867, + "balance_loss_mlp": 1.02040863, + "epoch": 0.9245753795280325, + "flos": 21579260993280.0, + "grad_norm": 1.7820153711808995, + "language_loss": 0.7404238, + "learning_rate": 5.933424178131341e-08, + "loss": 0.76095629, + "num_input_tokens_seen": 331862795, + "step": 15378, + "time_per_iteration": 2.669947624206543 + }, + { + "auxiliary_loss_clip": 0.01064033, + "auxiliary_loss_mlp": 0.0102814, + "balance_loss_clip": 1.0262202, + "balance_loss_mlp": 1.01743507, + "epoch": 0.9246355027807004, + "flos": 34496077334400.0, + "grad_norm": 1.873302678293214, + "language_loss": 0.62354612, + "learning_rate": 5.924011732219503e-08, + "loss": 0.64446783, + "num_input_tokens_seen": 331882535, + "step": 15379, + "time_per_iteration": 4.427612543106079 + }, + { + "auxiliary_loss_clip": 0.00983966, + "auxiliary_loss_mlp": 0.01029718, + "balance_loss_clip": 1.02134287, + "balance_loss_mlp": 1.01800573, + "epoch": 0.9246956260333684, + "flos": 15953854152960.0, + "grad_norm": 2.1808458213887105, + "language_loss": 0.84080654, + "learning_rate": 5.914606645688591e-08, + "loss": 0.86094338, + "num_input_tokens_seen": 331899335, + "step": 15380, + "time_per_iteration": 2.7685999870300293 + }, + { + "auxiliary_loss_clip": 0.01063395, + "auxiliary_loss_mlp": 0.01028135, + "balance_loss_clip": 1.0245018, + "balance_loss_mlp": 1.01726937, + "epoch": 0.9247557492860363, + "flos": 23368366540800.0, + "grad_norm": 1.5022158002135453, + "language_loss": 0.73489165, + "learning_rate": 5.905208918895233e-08, + "loss": 0.75580692, + "num_input_tokens_seen": 331919030, + "step": 15381, + "time_per_iteration": 2.680961847305298 + }, + { + "auxiliary_loss_clip": 0.01055392, + "auxiliary_loss_mlp": 0.01031797, + "balance_loss_clip": 1.02821732, + "balance_loss_mlp": 1.02149737, + "epoch": 0.9248158725387043, + "flos": 23039855729280.0, + "grad_norm": 1.7687283309207984, + "language_loss": 0.78464824, + "learning_rate": 5.8958185521958524e-08, + "loss": 0.80552018, + "num_input_tokens_seen": 331936465, + "step": 15382, + "time_per_iteration": 2.593425989151001 + }, + { + "auxiliary_loss_clip": 0.01042317, + "auxiliary_loss_mlp": 0.01032789, + "balance_loss_clip": 1.02461171, + "balance_loss_mlp": 1.02223349, + "epoch": 0.9248759957913724, + "flos": 22522418357760.0, + "grad_norm": 1.611936523973065, + "language_loss": 0.74967051, + "learning_rate": 5.886435545946455e-08, + "loss": 0.77042162, + "num_input_tokens_seen": 331954625, + "step": 15383, + "time_per_iteration": 2.6347153186798096 + }, + { + "auxiliary_loss_clip": 0.01040807, + "auxiliary_loss_mlp": 0.01024937, + "balance_loss_clip": 1.02393091, + "balance_loss_mlp": 1.01526296, + "epoch": 0.9249361190440403, + "flos": 25447271016960.0, + "grad_norm": 1.51655587858945, + "language_loss": 0.75580001, + "learning_rate": 5.8770599005028456e-08, + "loss": 0.77645743, + "num_input_tokens_seen": 331975865, + "step": 15384, + "time_per_iteration": 2.6756160259246826 + }, + { + "auxiliary_loss_clip": 0.01042887, + "auxiliary_loss_mlp": 0.01029336, + "balance_loss_clip": 1.02620161, + "balance_loss_mlp": 1.01854157, + "epoch": 0.9249962422967083, + "flos": 12378623886720.0, + "grad_norm": 1.9591441120460673, + "language_loss": 0.66641259, + "learning_rate": 5.8676916162206045e-08, + "loss": 0.68713474, + "num_input_tokens_seen": 331992760, + "step": 15385, + "time_per_iteration": 2.6873526573181152 + }, + { + "auxiliary_loss_clip": 0.01060188, + "auxiliary_loss_mlp": 0.01031568, + "balance_loss_clip": 1.02376306, + "balance_loss_mlp": 1.02159023, + "epoch": 0.9250563655493762, + "flos": 22929430343040.0, + "grad_norm": 2.0874970510617876, + "language_loss": 0.80491209, + "learning_rate": 5.85833069345496e-08, + "loss": 0.82582963, + "num_input_tokens_seen": 332011890, + "step": 15386, + "time_per_iteration": 2.583634853363037 + }, + { + "auxiliary_loss_clip": 0.01047594, + "auxiliary_loss_mlp": 0.01035983, + "balance_loss_clip": 1.02306831, + "balance_loss_mlp": 1.02519441, + "epoch": 0.9251164888020442, + "flos": 18478662065280.0, + "grad_norm": 1.729471097345806, + "language_loss": 0.74937379, + "learning_rate": 5.8489771325608504e-08, + "loss": 0.77020955, + "num_input_tokens_seen": 332029485, + "step": 15387, + "time_per_iteration": 2.6870789527893066 + }, + { + "auxiliary_loss_clip": 0.01049808, + "auxiliary_loss_mlp": 0.01030411, + "balance_loss_clip": 1.02432013, + "balance_loss_mlp": 1.02101099, + "epoch": 0.9251766120547121, + "flos": 33037062796800.0, + "grad_norm": 1.3037582309879925, + "language_loss": 0.69878459, + "learning_rate": 5.839630933893014e-08, + "loss": 0.71958679, + "num_input_tokens_seen": 332052970, + "step": 15388, + "time_per_iteration": 2.730501413345337 + }, + { + "auxiliary_loss_clip": 0.01053044, + "auxiliary_loss_mlp": 0.01026839, + "balance_loss_clip": 1.0251658, + "balance_loss_mlp": 1.01643193, + "epoch": 0.9252367353073802, + "flos": 24387906176640.0, + "grad_norm": 1.8322461758891486, + "language_loss": 0.81937236, + "learning_rate": 5.8302920978058115e-08, + "loss": 0.84017122, + "num_input_tokens_seen": 332070395, + "step": 15389, + "time_per_iteration": 2.689119577407837 + }, + { + "auxiliary_loss_clip": 0.01053287, + "auxiliary_loss_mlp": 0.01030614, + "balance_loss_clip": 1.02485466, + "balance_loss_mlp": 1.01891398, + "epoch": 0.9252968585600481, + "flos": 18916844077440.0, + "grad_norm": 1.73178287834595, + "language_loss": 0.79343623, + "learning_rate": 5.820960624653381e-08, + "loss": 0.81427526, + "num_input_tokens_seen": 332090185, + "step": 15390, + "time_per_iteration": 2.661916971206665 + }, + { + "auxiliary_loss_clip": 0.01028186, + "auxiliary_loss_mlp": 0.01034686, + "balance_loss_clip": 1.02414846, + "balance_loss_mlp": 1.02365947, + "epoch": 0.9253569818127161, + "flos": 21725345606400.0, + "grad_norm": 2.3619282468866487, + "language_loss": 0.75445652, + "learning_rate": 5.811636514789597e-08, + "loss": 0.77508521, + "num_input_tokens_seen": 332109050, + "step": 15391, + "time_per_iteration": 2.7591514587402344 + }, + { + "auxiliary_loss_clip": 0.01038602, + "auxiliary_loss_mlp": 0.01030506, + "balance_loss_clip": 1.02300858, + "balance_loss_mlp": 1.01869845, + "epoch": 0.925417105065384, + "flos": 34240357434240.0, + "grad_norm": 2.380447128666937, + "language_loss": 0.52407104, + "learning_rate": 5.80231976856802e-08, + "loss": 0.54476213, + "num_input_tokens_seen": 332131180, + "step": 15392, + "time_per_iteration": 2.895925283432007 + }, + { + "auxiliary_loss_clip": 0.01060175, + "auxiliary_loss_mlp": 0.01026202, + "balance_loss_clip": 1.02310371, + "balance_loss_mlp": 1.01688564, + "epoch": 0.925477228318052, + "flos": 25959536830080.0, + "grad_norm": 1.8610210582711175, + "language_loss": 0.77169698, + "learning_rate": 5.7930103863419454e-08, + "loss": 0.79256076, + "num_input_tokens_seen": 332149555, + "step": 15393, + "time_per_iteration": 2.837021827697754 + }, + { + "auxiliary_loss_clip": 0.01036712, + "auxiliary_loss_mlp": 0.01027854, + "balance_loss_clip": 1.02323961, + "balance_loss_mlp": 1.01723826, + "epoch": 0.9255373515707199, + "flos": 11838240702720.0, + "grad_norm": 1.7192290740676615, + "language_loss": 0.69351196, + "learning_rate": 5.783708368464357e-08, + "loss": 0.71415764, + "num_input_tokens_seen": 332165830, + "step": 15394, + "time_per_iteration": 2.821507215499878 + }, + { + "auxiliary_loss_clip": 0.01061967, + "auxiliary_loss_mlp": 0.0102429, + "balance_loss_clip": 1.02548456, + "balance_loss_mlp": 1.01406741, + "epoch": 0.925597474823388, + "flos": 21434325615360.0, + "grad_norm": 1.6174587906684437, + "language_loss": 0.73205125, + "learning_rate": 5.7744137152879956e-08, + "loss": 0.75291383, + "num_input_tokens_seen": 332185130, + "step": 15395, + "time_per_iteration": 4.625601530075073 + }, + { + "auxiliary_loss_clip": 0.0101603, + "auxiliary_loss_mlp": 0.01029112, + "balance_loss_clip": 1.02125359, + "balance_loss_mlp": 1.02025533, + "epoch": 0.925657598076056, + "flos": 22857573185280.0, + "grad_norm": 2.6972184061112543, + "language_loss": 0.7170825, + "learning_rate": 5.7651264271653785e-08, + "loss": 0.73753393, + "num_input_tokens_seen": 332203695, + "step": 15396, + "time_per_iteration": 2.82305645942688 + }, + { + "auxiliary_loss_clip": 0.01060943, + "auxiliary_loss_mlp": 0.01026123, + "balance_loss_clip": 1.02470756, + "balance_loss_mlp": 1.01557863, + "epoch": 0.9257177213287239, + "flos": 25704032411520.0, + "grad_norm": 1.5449333598999597, + "language_loss": 0.87430149, + "learning_rate": 5.755846504448603e-08, + "loss": 0.89517218, + "num_input_tokens_seen": 332224850, + "step": 15397, + "time_per_iteration": 2.6430704593658447 + }, + { + "auxiliary_loss_clip": 0.01005844, + "auxiliary_loss_mlp": 0.0100347, + "balance_loss_clip": 1.00080609, + "balance_loss_mlp": 1.00257587, + "epoch": 0.9257778445813919, + "flos": 59592933221760.0, + "grad_norm": 0.7986009643540907, + "language_loss": 0.55158961, + "learning_rate": 5.746573947489586e-08, + "loss": 0.57168269, + "num_input_tokens_seen": 332278085, + "step": 15398, + "time_per_iteration": 3.0640692710876465 + }, + { + "auxiliary_loss_clip": 0.01041205, + "auxiliary_loss_mlp": 0.01031797, + "balance_loss_clip": 1.02379322, + "balance_loss_mlp": 1.01959014, + "epoch": 0.9258379678340598, + "flos": 27709427704320.0, + "grad_norm": 2.0399457883321563, + "language_loss": 0.76142472, + "learning_rate": 5.7373087566400025e-08, + "loss": 0.78215474, + "num_input_tokens_seen": 332297875, + "step": 15399, + "time_per_iteration": 2.7411301136016846 + }, + { + "auxiliary_loss_clip": 0.01030407, + "auxiliary_loss_mlp": 0.01025445, + "balance_loss_clip": 1.02053118, + "balance_loss_mlp": 1.01587296, + "epoch": 0.9258980910867278, + "flos": 24863543095680.0, + "grad_norm": 1.4685298885463356, + "language_loss": 0.78109545, + "learning_rate": 5.7280509322510826e-08, + "loss": 0.80165398, + "num_input_tokens_seen": 332318500, + "step": 15400, + "time_per_iteration": 2.712615966796875 + }, + { + "auxiliary_loss_clip": 0.00995857, + "auxiliary_loss_mlp": 0.01000247, + "balance_loss_clip": 1.00080395, + "balance_loss_mlp": 0.99936515, + "epoch": 0.9259582143393957, + "flos": 63134587249920.0, + "grad_norm": 0.7337855362941027, + "language_loss": 0.51318151, + "learning_rate": 5.718800474673946e-08, + "loss": 0.53314257, + "num_input_tokens_seen": 332381980, + "step": 15401, + "time_per_iteration": 3.169386625289917 + }, + { + "auxiliary_loss_clip": 0.01048649, + "auxiliary_loss_mlp": 0.01032329, + "balance_loss_clip": 1.0244379, + "balance_loss_mlp": 1.02269673, + "epoch": 0.9260183375920638, + "flos": 24127122458880.0, + "grad_norm": 1.8185691031451237, + "language_loss": 0.82343107, + "learning_rate": 5.709557384259378e-08, + "loss": 0.8442409, + "num_input_tokens_seen": 332399510, + "step": 15402, + "time_per_iteration": 2.655576467514038 + }, + { + "auxiliary_loss_clip": 0.01006, + "auxiliary_loss_mlp": 0.01000302, + "balance_loss_clip": 1.00109875, + "balance_loss_mlp": 0.99950331, + "epoch": 0.9260784608447317, + "flos": 63042872849280.0, + "grad_norm": 0.7276275156324726, + "language_loss": 0.51097792, + "learning_rate": 5.700321661357876e-08, + "loss": 0.53104103, + "num_input_tokens_seen": 332459130, + "step": 15403, + "time_per_iteration": 3.2815678119659424 + }, + { + "auxiliary_loss_clip": 0.00986875, + "auxiliary_loss_mlp": 0.01001165, + "balance_loss_clip": 1.00181031, + "balance_loss_mlp": 1.00042582, + "epoch": 0.9261385840973997, + "flos": 70585979927040.0, + "grad_norm": 0.6868497920692994, + "language_loss": 0.58783817, + "learning_rate": 5.69109330631965e-08, + "loss": 0.60771859, + "num_input_tokens_seen": 332526555, + "step": 15404, + "time_per_iteration": 3.279109477996826 + }, + { + "auxiliary_loss_clip": 0.01041907, + "auxiliary_loss_mlp": 0.0103076, + "balance_loss_clip": 1.0239011, + "balance_loss_mlp": 1.01956034, + "epoch": 0.9261987073500676, + "flos": 20229917656320.0, + "grad_norm": 6.079371067984138, + "language_loss": 0.71581757, + "learning_rate": 5.681872319494596e-08, + "loss": 0.73654425, + "num_input_tokens_seen": 332544005, + "step": 15405, + "time_per_iteration": 2.6330089569091797 + }, + { + "auxiliary_loss_clip": 0.01021964, + "auxiliary_loss_mlp": 0.01039555, + "balance_loss_clip": 1.02379298, + "balance_loss_mlp": 1.02768779, + "epoch": 0.9262588306027356, + "flos": 20954163582720.0, + "grad_norm": 1.736471800783704, + "language_loss": 0.68699157, + "learning_rate": 5.672658701232458e-08, + "loss": 0.70760679, + "num_input_tokens_seen": 332563070, + "step": 15406, + "time_per_iteration": 2.7073984146118164 + }, + { + "auxiliary_loss_clip": 0.01012256, + "auxiliary_loss_mlp": 0.01035844, + "balance_loss_clip": 1.02096915, + "balance_loss_mlp": 1.02323127, + "epoch": 0.9263189538554035, + "flos": 22158679282560.0, + "grad_norm": 2.277694765055834, + "language_loss": 0.76349986, + "learning_rate": 5.663452451882555e-08, + "loss": 0.78398085, + "num_input_tokens_seen": 332579620, + "step": 15407, + "time_per_iteration": 2.780308246612549 + }, + { + "auxiliary_loss_clip": 0.01028909, + "auxiliary_loss_mlp": 0.01036784, + "balance_loss_clip": 1.02128422, + "balance_loss_mlp": 1.02523303, + "epoch": 0.9263790771080715, + "flos": 18187211111040.0, + "grad_norm": 2.6774375261005594, + "language_loss": 0.72548431, + "learning_rate": 5.6542535717940096e-08, + "loss": 0.74614125, + "num_input_tokens_seen": 332597795, + "step": 15408, + "time_per_iteration": 2.720360279083252 + }, + { + "auxiliary_loss_clip": 0.01028386, + "auxiliary_loss_mlp": 0.01028788, + "balance_loss_clip": 1.02219009, + "balance_loss_mlp": 1.01954305, + "epoch": 0.9264392003607396, + "flos": 48178545004800.0, + "grad_norm": 1.7885971969998127, + "language_loss": 0.68378377, + "learning_rate": 5.645062061315675e-08, + "loss": 0.70435548, + "num_input_tokens_seen": 332620375, + "step": 15409, + "time_per_iteration": 3.0086894035339355 + }, + { + "auxiliary_loss_clip": 0.01032751, + "auxiliary_loss_mlp": 0.01029411, + "balance_loss_clip": 1.02630007, + "balance_loss_mlp": 1.01843739, + "epoch": 0.9264993236134075, + "flos": 26389458714240.0, + "grad_norm": 1.7358011189177376, + "language_loss": 0.75661016, + "learning_rate": 5.6358779207960506e-08, + "loss": 0.77723181, + "num_input_tokens_seen": 332639510, + "step": 15410, + "time_per_iteration": 2.7607481479644775 + }, + { + "auxiliary_loss_clip": 0.01024327, + "auxiliary_loss_mlp": 0.01026728, + "balance_loss_clip": 1.02611709, + "balance_loss_mlp": 1.0167737, + "epoch": 0.9265594468660755, + "flos": 20920084554240.0, + "grad_norm": 1.4875807516996853, + "language_loss": 0.82126534, + "learning_rate": 5.6267011505833905e-08, + "loss": 0.84177589, + "num_input_tokens_seen": 332658350, + "step": 15411, + "time_per_iteration": 2.8391506671905518 + }, + { + "auxiliary_loss_clip": 0.01037288, + "auxiliary_loss_mlp": 0.01032468, + "balance_loss_clip": 1.02626491, + "balance_loss_mlp": 1.02231693, + "epoch": 0.9266195701187434, + "flos": 17525017929600.0, + "grad_norm": 1.6306524431229918, + "language_loss": 0.75168455, + "learning_rate": 5.617531751025728e-08, + "loss": 0.77238214, + "num_input_tokens_seen": 332676715, + "step": 15412, + "time_per_iteration": 4.365439176559448 + }, + { + "auxiliary_loss_clip": 0.01061062, + "auxiliary_loss_mlp": 0.01024652, + "balance_loss_clip": 1.02449012, + "balance_loss_mlp": 1.0146687, + "epoch": 0.9266796933714114, + "flos": 33688733293440.0, + "grad_norm": 1.8900121497566231, + "language_loss": 0.67024553, + "learning_rate": 5.6083697224707406e-08, + "loss": 0.69110262, + "num_input_tokens_seen": 332701470, + "step": 15413, + "time_per_iteration": 2.685835361480713 + }, + { + "auxiliary_loss_clip": 0.0100455, + "auxiliary_loss_mlp": 0.01028273, + "balance_loss_clip": 1.02303028, + "balance_loss_mlp": 1.01716864, + "epoch": 0.9267398166240793, + "flos": 18916520855040.0, + "grad_norm": 1.7336161425728691, + "language_loss": 0.75907427, + "learning_rate": 5.5992150652658167e-08, + "loss": 0.77940255, + "num_input_tokens_seen": 332719060, + "step": 15414, + "time_per_iteration": 2.8928043842315674 + }, + { + "auxiliary_loss_clip": 0.01054402, + "auxiliary_loss_mlp": 0.0102919, + "balance_loss_clip": 1.02718151, + "balance_loss_mlp": 1.01943886, + "epoch": 0.9267999398767474, + "flos": 20478957626880.0, + "grad_norm": 2.5374761355468793, + "language_loss": 0.81211501, + "learning_rate": 5.59006777975819e-08, + "loss": 0.83295089, + "num_input_tokens_seen": 332736345, + "step": 15415, + "time_per_iteration": 2.6890125274658203 + }, + { + "auxiliary_loss_clip": 0.010365, + "auxiliary_loss_mlp": 0.01027205, + "balance_loss_clip": 1.02083683, + "balance_loss_mlp": 1.01641667, + "epoch": 0.9268600631294153, + "flos": 24789351553920.0, + "grad_norm": 1.3593649621923223, + "language_loss": 0.54310179, + "learning_rate": 5.580927866294671e-08, + "loss": 0.56373882, + "num_input_tokens_seen": 332756270, + "step": 15416, + "time_per_iteration": 2.7393531799316406 + }, + { + "auxiliary_loss_clip": 0.01022324, + "auxiliary_loss_mlp": 0.01031246, + "balance_loss_clip": 1.0222528, + "balance_loss_mlp": 1.02076793, + "epoch": 0.9269201863820833, + "flos": 18697178453760.0, + "grad_norm": 1.6316940871167787, + "language_loss": 0.71870744, + "learning_rate": 5.571795325221807e-08, + "loss": 0.73924309, + "num_input_tokens_seen": 332775185, + "step": 15417, + "time_per_iteration": 2.7022316455841064 + }, + { + "auxiliary_loss_clip": 0.01053447, + "auxiliary_loss_mlp": 0.01027997, + "balance_loss_clip": 1.02678657, + "balance_loss_mlp": 1.0174768, + "epoch": 0.9269803096347512, + "flos": 20923999136640.0, + "grad_norm": 2.0700511717615893, + "language_loss": 0.7549752, + "learning_rate": 5.5626701568859624e-08, + "loss": 0.77578968, + "num_input_tokens_seen": 332794320, + "step": 15418, + "time_per_iteration": 2.7688748836517334 + }, + { + "auxiliary_loss_clip": 0.01049484, + "auxiliary_loss_mlp": 0.01026703, + "balance_loss_clip": 1.0234127, + "balance_loss_mlp": 1.01635575, + "epoch": 0.9270404328874192, + "flos": 28002710252160.0, + "grad_norm": 1.4859105083225779, + "language_loss": 0.76290888, + "learning_rate": 5.553552361633174e-08, + "loss": 0.78367078, + "num_input_tokens_seen": 332818095, + "step": 15419, + "time_per_iteration": 2.6843161582946777 + }, + { + "auxiliary_loss_clip": 0.01059218, + "auxiliary_loss_mlp": 0.01027863, + "balance_loss_clip": 1.02431202, + "balance_loss_mlp": 1.01845765, + "epoch": 0.9271005561400871, + "flos": 25889870401920.0, + "grad_norm": 1.8011645243176875, + "language_loss": 0.76260489, + "learning_rate": 5.5444419398091636e-08, + "loss": 0.78347576, + "num_input_tokens_seen": 332839860, + "step": 15420, + "time_per_iteration": 2.6223573684692383 + }, + { + "auxiliary_loss_clip": 0.01052798, + "auxiliary_loss_mlp": 0.01025807, + "balance_loss_clip": 1.02398455, + "balance_loss_mlp": 1.01535785, + "epoch": 0.9271606793927551, + "flos": 27053914452480.0, + "grad_norm": 1.8317146760473368, + "language_loss": 0.76596671, + "learning_rate": 5.535338891759389e-08, + "loss": 0.78675276, + "num_input_tokens_seen": 332861155, + "step": 15421, + "time_per_iteration": 2.615938663482666 + }, + { + "auxiliary_loss_clip": 0.01040168, + "auxiliary_loss_mlp": 0.01031288, + "balance_loss_clip": 1.02467966, + "balance_loss_mlp": 1.02088118, + "epoch": 0.9272208026454232, + "flos": 26209869690240.0, + "grad_norm": 2.3551887869216093, + "language_loss": 0.7275517, + "learning_rate": 5.526243217829041e-08, + "loss": 0.74826628, + "num_input_tokens_seen": 332881110, + "step": 15422, + "time_per_iteration": 2.733854055404663 + }, + { + "auxiliary_loss_clip": 0.01050104, + "auxiliary_loss_mlp": 0.01036018, + "balance_loss_clip": 1.02346361, + "balance_loss_mlp": 1.0250206, + "epoch": 0.9272809258980911, + "flos": 12458453863680.0, + "grad_norm": 1.8798258815108029, + "language_loss": 0.77368033, + "learning_rate": 5.517154918363065e-08, + "loss": 0.79454148, + "num_input_tokens_seen": 332899350, + "step": 15423, + "time_per_iteration": 4.27144718170166 + }, + { + "auxiliary_loss_clip": 0.01053813, + "auxiliary_loss_mlp": 0.01029154, + "balance_loss_clip": 1.0254128, + "balance_loss_mlp": 1.0185926, + "epoch": 0.9273410491507591, + "flos": 22856890826880.0, + "grad_norm": 2.9790708082568624, + "language_loss": 0.75125396, + "learning_rate": 5.508073993706053e-08, + "loss": 0.77208364, + "num_input_tokens_seen": 332918105, + "step": 15424, + "time_per_iteration": 2.5746989250183105 + }, + { + "auxiliary_loss_clip": 0.00997567, + "auxiliary_loss_mlp": 0.01000223, + "balance_loss_clip": 1.00218678, + "balance_loss_mlp": 0.99937069, + "epoch": 0.927401172403427, + "flos": 47665384329600.0, + "grad_norm": 0.7854816094235284, + "language_loss": 0.60623109, + "learning_rate": 5.499000444202351e-08, + "loss": 0.62620902, + "num_input_tokens_seen": 332969490, + "step": 15425, + "time_per_iteration": 4.528989315032959 + }, + { + "auxiliary_loss_clip": 0.01042099, + "auxiliary_loss_mlp": 0.00747528, + "balance_loss_clip": 1.02507341, + "balance_loss_mlp": 1.00038016, + "epoch": 0.927461295656095, + "flos": 29972374490880.0, + "grad_norm": 1.4363551034342517, + "language_loss": 0.70433402, + "learning_rate": 5.489934270196106e-08, + "loss": 0.72223032, + "num_input_tokens_seen": 332988805, + "step": 15426, + "time_per_iteration": 2.796233654022217 + }, + { + "auxiliary_loss_clip": 0.01041064, + "auxiliary_loss_mlp": 0.01025955, + "balance_loss_clip": 1.0244869, + "balance_loss_mlp": 1.01622152, + "epoch": 0.9275214189087629, + "flos": 20375427651840.0, + "grad_norm": 1.8570840942530025, + "language_loss": 0.82801449, + "learning_rate": 5.480875472030977e-08, + "loss": 0.84868473, + "num_input_tokens_seen": 333007960, + "step": 15427, + "time_per_iteration": 2.6609435081481934 + }, + { + "auxiliary_loss_clip": 0.01027106, + "auxiliary_loss_mlp": 0.0103191, + "balance_loss_clip": 1.02415657, + "balance_loss_mlp": 1.0214138, + "epoch": 0.927581542161431, + "flos": 22383193242240.0, + "grad_norm": 1.4830078426131612, + "language_loss": 0.77022749, + "learning_rate": 5.471824050050555e-08, + "loss": 0.79081768, + "num_input_tokens_seen": 333026035, + "step": 15428, + "time_per_iteration": 2.7125115394592285 + }, + { + "auxiliary_loss_clip": 0.01022396, + "auxiliary_loss_mlp": 0.01033035, + "balance_loss_clip": 1.02146864, + "balance_loss_mlp": 1.02203774, + "epoch": 0.9276416654140989, + "flos": 23952453598080.0, + "grad_norm": 1.8889818042563784, + "language_loss": 0.74188554, + "learning_rate": 5.4627800045980555e-08, + "loss": 0.76243985, + "num_input_tokens_seen": 333045590, + "step": 15429, + "time_per_iteration": 2.7181336879730225 + }, + { + "auxiliary_loss_clip": 0.01023453, + "auxiliary_loss_mlp": 0.01031367, + "balance_loss_clip": 1.02078962, + "balance_loss_mlp": 1.02159762, + "epoch": 0.9277017886667669, + "flos": 13917719796480.0, + "grad_norm": 1.9687616726285768, + "language_loss": 0.75197321, + "learning_rate": 5.45374333601647e-08, + "loss": 0.77252144, + "num_input_tokens_seen": 333063355, + "step": 15430, + "time_per_iteration": 2.7758095264434814 + }, + { + "auxiliary_loss_clip": 0.01047897, + "auxiliary_loss_mlp": 0.01031171, + "balance_loss_clip": 1.02197111, + "balance_loss_mlp": 1.02035904, + "epoch": 0.9277619119194348, + "flos": 35666478092160.0, + "grad_norm": 1.5844740185596524, + "language_loss": 0.76310652, + "learning_rate": 5.444714044648391e-08, + "loss": 0.78389716, + "num_input_tokens_seen": 333088045, + "step": 15431, + "time_per_iteration": 2.75498104095459 + }, + { + "auxiliary_loss_clip": 0.01049506, + "auxiliary_loss_mlp": 0.01027492, + "balance_loss_clip": 1.02415752, + "balance_loss_mlp": 1.01776421, + "epoch": 0.9278220351721028, + "flos": 23841238112640.0, + "grad_norm": 1.6088305455386898, + "language_loss": 0.70689201, + "learning_rate": 5.4356921308363e-08, + "loss": 0.72766197, + "num_input_tokens_seen": 333108005, + "step": 15432, + "time_per_iteration": 2.625133752822876 + }, + { + "auxiliary_loss_clip": 0.01023507, + "auxiliary_loss_mlp": 0.01032166, + "balance_loss_clip": 1.02502394, + "balance_loss_mlp": 1.02190757, + "epoch": 0.9278821584247707, + "flos": 15228135768960.0, + "grad_norm": 7.628330136876883, + "language_loss": 0.82288957, + "learning_rate": 5.4266775949222354e-08, + "loss": 0.84344631, + "num_input_tokens_seen": 333124335, + "step": 15433, + "time_per_iteration": 2.784536838531494 + }, + { + "auxiliary_loss_clip": 0.01060022, + "auxiliary_loss_mlp": 0.01024576, + "balance_loss_clip": 1.02561021, + "balance_loss_mlp": 1.01543868, + "epoch": 0.9279422816774388, + "flos": 24681404206080.0, + "grad_norm": 2.0373486555504994, + "language_loss": 0.66612202, + "learning_rate": 5.417670437248056e-08, + "loss": 0.68696803, + "num_input_tokens_seen": 333143995, + "step": 15434, + "time_per_iteration": 2.764997959136963 + }, + { + "auxiliary_loss_clip": 0.01032294, + "auxiliary_loss_mlp": 0.01023794, + "balance_loss_clip": 1.02207088, + "balance_loss_mlp": 1.01407266, + "epoch": 0.9280024049301068, + "flos": 19169188099200.0, + "grad_norm": 1.6979546001366548, + "language_loss": 0.68725157, + "learning_rate": 5.40867065815529e-08, + "loss": 0.70781243, + "num_input_tokens_seen": 333162805, + "step": 15435, + "time_per_iteration": 2.649247884750366 + }, + { + "auxiliary_loss_clip": 0.01063492, + "auxiliary_loss_mlp": 0.01028969, + "balance_loss_clip": 1.02580166, + "balance_loss_mlp": 1.01846099, + "epoch": 0.9280625281827747, + "flos": 11393701983360.0, + "grad_norm": 4.593002300015284, + "language_loss": 0.71778405, + "learning_rate": 5.399678257985263e-08, + "loss": 0.73870867, + "num_input_tokens_seen": 333175770, + "step": 15436, + "time_per_iteration": 2.5256218910217285 + }, + { + "auxiliary_loss_clip": 0.01041505, + "auxiliary_loss_mlp": 0.01025113, + "balance_loss_clip": 1.02432704, + "balance_loss_mlp": 1.01471221, + "epoch": 0.9281226514354427, + "flos": 24785616539520.0, + "grad_norm": 1.9283873305807655, + "language_loss": 0.67153394, + "learning_rate": 5.390693237078925e-08, + "loss": 0.69220006, + "num_input_tokens_seen": 333194775, + "step": 15437, + "time_per_iteration": 2.640706777572632 + }, + { + "auxiliary_loss_clip": 0.0105544, + "auxiliary_loss_mlp": 0.01031525, + "balance_loss_clip": 1.02572882, + "balance_loss_mlp": 1.02013481, + "epoch": 0.9281827746881106, + "flos": 15083128563840.0, + "grad_norm": 2.077116277098942, + "language_loss": 0.71213645, + "learning_rate": 5.3817155957770254e-08, + "loss": 0.73300612, + "num_input_tokens_seen": 333208920, + "step": 15438, + "time_per_iteration": 2.5672383308410645 + }, + { + "auxiliary_loss_clip": 0.01061752, + "auxiliary_loss_mlp": 0.01025606, + "balance_loss_clip": 1.02488232, + "balance_loss_mlp": 1.01523483, + "epoch": 0.9282428979407786, + "flos": 24135059364480.0, + "grad_norm": 1.6471938038733167, + "language_loss": 0.64979792, + "learning_rate": 5.3727453344199366e-08, + "loss": 0.67067158, + "num_input_tokens_seen": 333229350, + "step": 15439, + "time_per_iteration": 2.5745131969451904 + }, + { + "auxiliary_loss_clip": 0.01038647, + "auxiliary_loss_mlp": 0.01028751, + "balance_loss_clip": 1.02287459, + "balance_loss_mlp": 1.01860595, + "epoch": 0.9283030211934465, + "flos": 24823215100800.0, + "grad_norm": 2.550290120446864, + "language_loss": 0.70206714, + "learning_rate": 5.363782453347876e-08, + "loss": 0.72274113, + "num_input_tokens_seen": 333246125, + "step": 15440, + "time_per_iteration": 2.6790530681610107 + }, + { + "auxiliary_loss_clip": 0.0102617, + "auxiliary_loss_mlp": 0.00747657, + "balance_loss_clip": 1.02347946, + "balance_loss_mlp": 1.00035357, + "epoch": 0.9283631444461146, + "flos": 23981037845760.0, + "grad_norm": 1.6280393626940666, + "language_loss": 0.76814324, + "learning_rate": 5.354826952900682e-08, + "loss": 0.78588146, + "num_input_tokens_seen": 333263685, + "step": 15441, + "time_per_iteration": 2.7253620624542236 + }, + { + "auxiliary_loss_clip": 0.01042854, + "auxiliary_loss_mlp": 0.01025583, + "balance_loss_clip": 1.02291787, + "balance_loss_mlp": 1.01703572, + "epoch": 0.9284232676987825, + "flos": 22784530878720.0, + "grad_norm": 1.6056071987045077, + "language_loss": 0.64370567, + "learning_rate": 5.345878833417949e-08, + "loss": 0.66439009, + "num_input_tokens_seen": 333282435, + "step": 15442, + "time_per_iteration": 4.460875034332275 + }, + { + "auxiliary_loss_clip": 0.01018118, + "auxiliary_loss_mlp": 0.01039828, + "balance_loss_clip": 1.02291775, + "balance_loss_mlp": 1.02782369, + "epoch": 0.9284833909514505, + "flos": 19500500171520.0, + "grad_norm": 2.0289947500173646, + "language_loss": 0.80983442, + "learning_rate": 5.3369380952390295e-08, + "loss": 0.83041382, + "num_input_tokens_seen": 333300400, + "step": 15443, + "time_per_iteration": 2.793196439743042 + }, + { + "auxiliary_loss_clip": 0.01051008, + "auxiliary_loss_mlp": 0.00747617, + "balance_loss_clip": 1.02425957, + "balance_loss_mlp": 1.00041151, + "epoch": 0.9285435142041184, + "flos": 23185976256000.0, + "grad_norm": 1.9707243848039484, + "language_loss": 0.65671104, + "learning_rate": 5.328004738702896e-08, + "loss": 0.67469728, + "num_input_tokens_seen": 333318980, + "step": 15444, + "time_per_iteration": 2.781294107437134 + }, + { + "auxiliary_loss_clip": 0.01028372, + "auxiliary_loss_mlp": 0.01029394, + "balance_loss_clip": 1.02366889, + "balance_loss_mlp": 1.0189333, + "epoch": 0.9286036374567864, + "flos": 17675519915520.0, + "grad_norm": 2.019574363033036, + "language_loss": 0.73201692, + "learning_rate": 5.3190787641483215e-08, + "loss": 0.75259453, + "num_input_tokens_seen": 333334135, + "step": 15445, + "time_per_iteration": 2.897702217102051 + }, + { + "auxiliary_loss_clip": 0.01043879, + "auxiliary_loss_mlp": 0.010308, + "balance_loss_clip": 1.02356005, + "balance_loss_mlp": 1.0196954, + "epoch": 0.9286637607094543, + "flos": 20886687884160.0, + "grad_norm": 2.8743592681056613, + "language_loss": 0.71399587, + "learning_rate": 5.3101601719138135e-08, + "loss": 0.73474264, + "num_input_tokens_seen": 333353325, + "step": 15446, + "time_per_iteration": 2.754582166671753 + }, + { + "auxiliary_loss_clip": 0.01008902, + "auxiliary_loss_mlp": 0.01027656, + "balance_loss_clip": 1.02470565, + "balance_loss_mlp": 1.01664662, + "epoch": 0.9287238839621224, + "flos": 19026012487680.0, + "grad_norm": 1.716221484694846, + "language_loss": 0.69379675, + "learning_rate": 5.301248962337523e-08, + "loss": 0.71416235, + "num_input_tokens_seen": 333371110, + "step": 15447, + "time_per_iteration": 2.8309414386749268 + }, + { + "auxiliary_loss_clip": 0.01056252, + "auxiliary_loss_mlp": 0.01026221, + "balance_loss_clip": 1.02338994, + "balance_loss_mlp": 1.01716757, + "epoch": 0.9287840072147904, + "flos": 20557027837440.0, + "grad_norm": 1.5778584346960032, + "language_loss": 0.72338504, + "learning_rate": 5.292345135757403e-08, + "loss": 0.74420977, + "num_input_tokens_seen": 333391420, + "step": 15448, + "time_per_iteration": 2.6110565662384033 + }, + { + "auxiliary_loss_clip": 0.01061177, + "auxiliary_loss_mlp": 0.01026276, + "balance_loss_clip": 1.02438593, + "balance_loss_mlp": 1.0150708, + "epoch": 0.9288441304674583, + "flos": 21250822008960.0, + "grad_norm": 2.0104166358909485, + "language_loss": 0.74232638, + "learning_rate": 5.283448692511072e-08, + "loss": 0.76320088, + "num_input_tokens_seen": 333410365, + "step": 15449, + "time_per_iteration": 2.643012523651123 + }, + { + "auxiliary_loss_clip": 0.01061626, + "auxiliary_loss_mlp": 0.00747579, + "balance_loss_clip": 1.02409649, + "balance_loss_mlp": 1.00040412, + "epoch": 0.9289042537201263, + "flos": 27669853895040.0, + "grad_norm": 2.22748364288272, + "language_loss": 0.6777544, + "learning_rate": 5.27455963293586e-08, + "loss": 0.69584638, + "num_input_tokens_seen": 333430000, + "step": 15450, + "time_per_iteration": 2.6462652683258057 + }, + { + "auxiliary_loss_clip": 0.01030587, + "auxiliary_loss_mlp": 0.01021926, + "balance_loss_clip": 1.02340865, + "balance_loss_mlp": 1.01154315, + "epoch": 0.9289643769727942, + "flos": 19317750750720.0, + "grad_norm": 1.9611096645575352, + "language_loss": 0.71926391, + "learning_rate": 5.265677957368875e-08, + "loss": 0.73978901, + "num_input_tokens_seen": 333445800, + "step": 15451, + "time_per_iteration": 2.8211770057678223 + }, + { + "auxiliary_loss_clip": 0.01034796, + "auxiliary_loss_mlp": 0.0103888, + "balance_loss_clip": 1.02194738, + "balance_loss_mlp": 1.02723289, + "epoch": 0.9290245002254622, + "flos": 14058058233600.0, + "grad_norm": 1.9911600584567517, + "language_loss": 0.73355997, + "learning_rate": 5.25680366614687e-08, + "loss": 0.75429678, + "num_input_tokens_seen": 333461550, + "step": 15452, + "time_per_iteration": 2.7595391273498535 + }, + { + "auxiliary_loss_clip": 0.01044543, + "auxiliary_loss_mlp": 0.01025794, + "balance_loss_clip": 1.02733111, + "balance_loss_mlp": 1.0151782, + "epoch": 0.9290846234781301, + "flos": 20047132321920.0, + "grad_norm": 2.2254050290070793, + "language_loss": 0.7423476, + "learning_rate": 5.2479367596064196e-08, + "loss": 0.76305103, + "num_input_tokens_seen": 333478835, + "step": 15453, + "time_per_iteration": 2.6986303329467773 + }, + { + "auxiliary_loss_clip": 0.00967077, + "auxiliary_loss_mlp": 0.01001155, + "balance_loss_clip": 1.00374293, + "balance_loss_mlp": 1.00034428, + "epoch": 0.9291447467307982, + "flos": 61227514460160.0, + "grad_norm": 0.8263394597013837, + "language_loss": 0.60677332, + "learning_rate": 5.2390772380837226e-08, + "loss": 0.62645566, + "num_input_tokens_seen": 333535250, + "step": 15454, + "time_per_iteration": 3.1973752975463867 + }, + { + "auxiliary_loss_clip": 0.01035275, + "auxiliary_loss_mlp": 0.0103198, + "balance_loss_clip": 1.02148449, + "balance_loss_mlp": 1.02060699, + "epoch": 0.9292048699834661, + "flos": 20553328736640.0, + "grad_norm": 1.696076583711069, + "language_loss": 0.69171244, + "learning_rate": 5.230225101914709e-08, + "loss": 0.712385, + "num_input_tokens_seen": 333553805, + "step": 15455, + "time_per_iteration": 2.721614122390747 + }, + { + "auxiliary_loss_clip": 0.01019692, + "auxiliary_loss_mlp": 0.0102866, + "balance_loss_clip": 1.02472723, + "balance_loss_mlp": 1.01759136, + "epoch": 0.9292649932361341, + "flos": 23623655477760.0, + "grad_norm": 1.7605646094616332, + "language_loss": 0.64626253, + "learning_rate": 5.22138035143509e-08, + "loss": 0.66674614, + "num_input_tokens_seen": 333572800, + "step": 15456, + "time_per_iteration": 2.7662508487701416 + }, + { + "auxiliary_loss_clip": 0.01016005, + "auxiliary_loss_mlp": 0.01029103, + "balance_loss_clip": 1.02227306, + "balance_loss_mlp": 1.01802826, + "epoch": 0.929325116488802, + "flos": 15009942602880.0, + "grad_norm": 1.787681240167295, + "language_loss": 0.68418992, + "learning_rate": 5.2125429869802615e-08, + "loss": 0.70464098, + "num_input_tokens_seen": 333588520, + "step": 15457, + "time_per_iteration": 2.7589454650878906 + }, + { + "auxiliary_loss_clip": 0.01041569, + "auxiliary_loss_mlp": 0.01025075, + "balance_loss_clip": 1.02317917, + "balance_loss_mlp": 1.01529384, + "epoch": 0.92938523974147, + "flos": 17967365919360.0, + "grad_norm": 2.304471837390313, + "language_loss": 0.80622876, + "learning_rate": 5.203713008885291e-08, + "loss": 0.82689524, + "num_input_tokens_seen": 333603435, + "step": 15458, + "time_per_iteration": 2.7674121856689453 + }, + { + "auxiliary_loss_clip": 0.01053808, + "auxiliary_loss_mlp": 0.01030829, + "balance_loss_clip": 1.02536857, + "balance_loss_mlp": 1.02029657, + "epoch": 0.9294453629941379, + "flos": 23003047267200.0, + "grad_norm": 1.571932973212281, + "language_loss": 0.72131717, + "learning_rate": 5.194890417485065e-08, + "loss": 0.74216354, + "num_input_tokens_seen": 333623305, + "step": 15459, + "time_per_iteration": 2.8524999618530273 + }, + { + "auxiliary_loss_clip": 0.01025021, + "auxiliary_loss_mlp": 0.01031546, + "balance_loss_clip": 1.02256513, + "balance_loss_mlp": 1.02061439, + "epoch": 0.929505486246806, + "flos": 17055234927360.0, + "grad_norm": 2.1295946855571772, + "language_loss": 0.59096068, + "learning_rate": 5.1860752131141384e-08, + "loss": 0.61152637, + "num_input_tokens_seen": 333641205, + "step": 15460, + "time_per_iteration": 4.476568222045898 + }, + { + "auxiliary_loss_clip": 0.01012531, + "auxiliary_loss_mlp": 0.01031662, + "balance_loss_clip": 1.02137804, + "balance_loss_mlp": 1.0202837, + "epoch": 0.9295656094994739, + "flos": 27340409329920.0, + "grad_norm": 1.8159006265178903, + "language_loss": 0.8062396, + "learning_rate": 5.177267396106733e-08, + "loss": 0.82668155, + "num_input_tokens_seen": 333659615, + "step": 15461, + "time_per_iteration": 2.8057708740234375 + }, + { + "auxiliary_loss_clip": 0.01038331, + "auxiliary_loss_mlp": 0.01022933, + "balance_loss_clip": 1.02314758, + "balance_loss_mlp": 1.01292515, + "epoch": 0.9296257327521419, + "flos": 21470954509440.0, + "grad_norm": 2.31624083881523, + "language_loss": 0.78473204, + "learning_rate": 5.168466966796869e-08, + "loss": 0.8053447, + "num_input_tokens_seen": 333678985, + "step": 15462, + "time_per_iteration": 2.634737253189087 + }, + { + "auxiliary_loss_clip": 0.01014557, + "auxiliary_loss_mlp": 0.01022857, + "balance_loss_clip": 1.01850677, + "balance_loss_mlp": 1.01180625, + "epoch": 0.9296858560048099, + "flos": 16362661818240.0, + "grad_norm": 1.9633908414151051, + "language_loss": 0.62595725, + "learning_rate": 5.159673925518282e-08, + "loss": 0.64633131, + "num_input_tokens_seen": 333696410, + "step": 15463, + "time_per_iteration": 2.6630187034606934 + }, + { + "auxiliary_loss_clip": 0.01032203, + "auxiliary_loss_mlp": 0.01026556, + "balance_loss_clip": 1.01989412, + "balance_loss_mlp": 1.0168941, + "epoch": 0.9297459792574778, + "flos": 29858609139840.0, + "grad_norm": 1.394071350022646, + "language_loss": 0.70874763, + "learning_rate": 5.15088827260437e-08, + "loss": 0.72933519, + "num_input_tokens_seen": 333716615, + "step": 15464, + "time_per_iteration": 2.7412965297698975 + }, + { + "auxiliary_loss_clip": 0.01032411, + "auxiliary_loss_mlp": 0.01028386, + "balance_loss_clip": 1.02154016, + "balance_loss_mlp": 1.01797271, + "epoch": 0.9298061025101458, + "flos": 15924838942080.0, + "grad_norm": 1.8042646497881984, + "language_loss": 0.77407897, + "learning_rate": 5.1421100083883115e-08, + "loss": 0.79468691, + "num_input_tokens_seen": 333732800, + "step": 15465, + "time_per_iteration": 2.5840420722961426 + }, + { + "auxiliary_loss_clip": 0.00960536, + "auxiliary_loss_mlp": 0.01000416, + "balance_loss_clip": 1.0039084, + "balance_loss_mlp": 0.99953407, + "epoch": 0.9298662257628137, + "flos": 64096994304000.0, + "grad_norm": 0.711162089840346, + "language_loss": 0.56492001, + "learning_rate": 5.133339133202952e-08, + "loss": 0.58452952, + "num_input_tokens_seen": 333799300, + "step": 15466, + "time_per_iteration": 3.5527162551879883 + }, + { + "auxiliary_loss_clip": 0.01038727, + "auxiliary_loss_mlp": 0.01036442, + "balance_loss_clip": 1.02189958, + "balance_loss_mlp": 1.02481854, + "epoch": 0.9299263490154818, + "flos": 24280210224000.0, + "grad_norm": 2.008611151534319, + "language_loss": 0.72716969, + "learning_rate": 5.1245756473809355e-08, + "loss": 0.74792147, + "num_input_tokens_seen": 333820360, + "step": 15467, + "time_per_iteration": 3.0411911010742188 + }, + { + "auxiliary_loss_clip": 0.01029252, + "auxiliary_loss_mlp": 0.01030406, + "balance_loss_clip": 1.02268445, + "balance_loss_mlp": 1.01949883, + "epoch": 0.9299864722681497, + "flos": 23294354567040.0, + "grad_norm": 1.63288613196522, + "language_loss": 0.71951902, + "learning_rate": 5.1158195512545076e-08, + "loss": 0.74011564, + "num_input_tokens_seen": 333840415, + "step": 15468, + "time_per_iteration": 3.178584337234497 + }, + { + "auxiliary_loss_clip": 0.01046026, + "auxiliary_loss_mlp": 0.01035621, + "balance_loss_clip": 1.02228546, + "balance_loss_mlp": 1.02340198, + "epoch": 0.9300465955208177, + "flos": 21395972868480.0, + "grad_norm": 1.7681708787568104, + "language_loss": 0.75500965, + "learning_rate": 5.107070845155737e-08, + "loss": 0.7758261, + "num_input_tokens_seen": 333859910, + "step": 15469, + "time_per_iteration": 2.643968343734741 + }, + { + "auxiliary_loss_clip": 0.01042106, + "auxiliary_loss_mlp": 0.01031542, + "balance_loss_clip": 1.025841, + "balance_loss_mlp": 1.0212481, + "epoch": 0.9301067187734856, + "flos": 24571445696640.0, + "grad_norm": 1.9741038569022202, + "language_loss": 0.75502288, + "learning_rate": 5.098329529416379e-08, + "loss": 0.77575934, + "num_input_tokens_seen": 333880495, + "step": 15470, + "time_per_iteration": 2.713879346847534 + }, + { + "auxiliary_loss_clip": 0.0102412, + "auxiliary_loss_mlp": 0.01028221, + "balance_loss_clip": 1.02529359, + "balance_loss_mlp": 1.01871967, + "epoch": 0.9301668420261536, + "flos": 22196960202240.0, + "grad_norm": 1.6778985801288817, + "language_loss": 0.7471149, + "learning_rate": 5.089595604367902e-08, + "loss": 0.76763833, + "num_input_tokens_seen": 333897640, + "step": 15471, + "time_per_iteration": 4.511587381362915 + }, + { + "auxiliary_loss_clip": 0.01053473, + "auxiliary_loss_mlp": 0.01025104, + "balance_loss_clip": 1.02559114, + "balance_loss_mlp": 1.01436305, + "epoch": 0.9302269652788215, + "flos": 17747628468480.0, + "grad_norm": 2.1096969932853327, + "language_loss": 0.69333088, + "learning_rate": 5.080869070341487e-08, + "loss": 0.71411669, + "num_input_tokens_seen": 333913670, + "step": 15472, + "time_per_iteration": 4.329259634017944 + }, + { + "auxiliary_loss_clip": 0.01032275, + "auxiliary_loss_mlp": 0.01025956, + "balance_loss_clip": 1.02239895, + "balance_loss_mlp": 1.01646066, + "epoch": 0.9302870885314896, + "flos": 19390793057280.0, + "grad_norm": 1.8530761658183506, + "language_loss": 0.88408136, + "learning_rate": 5.0721499276680233e-08, + "loss": 0.90466368, + "num_input_tokens_seen": 333934105, + "step": 15473, + "time_per_iteration": 2.8579070568084717 + }, + { + "auxiliary_loss_clip": 0.01046146, + "auxiliary_loss_mlp": 0.01031224, + "balance_loss_clip": 1.02602279, + "balance_loss_mlp": 1.01947618, + "epoch": 0.9303472117841575, + "flos": 21760286561280.0, + "grad_norm": 1.8698402537163763, + "language_loss": 0.63936597, + "learning_rate": 5.063438176678203e-08, + "loss": 0.66013968, + "num_input_tokens_seen": 333953635, + "step": 15474, + "time_per_iteration": 2.668720245361328 + }, + { + "auxiliary_loss_clip": 0.0106134, + "auxiliary_loss_mlp": 0.01028618, + "balance_loss_clip": 1.02518094, + "balance_loss_mlp": 1.01856852, + "epoch": 0.9304073350368255, + "flos": 19609740408960.0, + "grad_norm": 1.7652362006015452, + "language_loss": 0.75125277, + "learning_rate": 5.054733817702339e-08, + "loss": 0.77215242, + "num_input_tokens_seen": 333971825, + "step": 15475, + "time_per_iteration": 2.5274405479431152 + }, + { + "auxiliary_loss_clip": 0.01044883, + "auxiliary_loss_mlp": 0.01027217, + "balance_loss_clip": 1.02178633, + "balance_loss_mlp": 1.01711369, + "epoch": 0.9304674582894935, + "flos": 30441582875520.0, + "grad_norm": 2.0336959275565927, + "language_loss": 0.66409296, + "learning_rate": 5.0460368510704786e-08, + "loss": 0.68481392, + "num_input_tokens_seen": 333990120, + "step": 15476, + "time_per_iteration": 2.6677188873291016 + }, + { + "auxiliary_loss_clip": 0.01023707, + "auxiliary_loss_mlp": 0.01030746, + "balance_loss_clip": 1.02537203, + "balance_loss_mlp": 1.02007663, + "epoch": 0.9305275815421614, + "flos": 17785693906560.0, + "grad_norm": 9.133971785504238, + "language_loss": 0.68623435, + "learning_rate": 5.0373472771124914e-08, + "loss": 0.70677888, + "num_input_tokens_seen": 334007970, + "step": 15477, + "time_per_iteration": 2.7433855533599854 + }, + { + "auxiliary_loss_clip": 0.01041679, + "auxiliary_loss_mlp": 0.0102521, + "balance_loss_clip": 1.02598405, + "balance_loss_mlp": 1.01594734, + "epoch": 0.9305877047948294, + "flos": 25298456970240.0, + "grad_norm": 1.8835097348994434, + "language_loss": 0.5880571, + "learning_rate": 5.0286650961578027e-08, + "loss": 0.60872602, + "num_input_tokens_seen": 334027120, + "step": 15478, + "time_per_iteration": 2.7289295196533203 + }, + { + "auxiliary_loss_clip": 0.01047863, + "auxiliary_loss_mlp": 0.01028974, + "balance_loss_clip": 1.02743149, + "balance_loss_mlp": 1.01729703, + "epoch": 0.9306478280474973, + "flos": 16977236544000.0, + "grad_norm": 2.549539609365022, + "language_loss": 0.78697282, + "learning_rate": 5.01999030853566e-08, + "loss": 0.80774117, + "num_input_tokens_seen": 334042785, + "step": 15479, + "time_per_iteration": 2.5905845165252686 + }, + { + "auxiliary_loss_clip": 0.01060435, + "auxiliary_loss_mlp": 0.01026664, + "balance_loss_clip": 1.02392411, + "balance_loss_mlp": 1.01696062, + "epoch": 0.9307079513001654, + "flos": 35663353608960.0, + "grad_norm": 1.6358412245036127, + "language_loss": 0.68860197, + "learning_rate": 5.0113229145750445e-08, + "loss": 0.70947301, + "num_input_tokens_seen": 334063480, + "step": 15480, + "time_per_iteration": 2.646472454071045 + }, + { + "auxiliary_loss_clip": 0.01061302, + "auxiliary_loss_mlp": 0.01029257, + "balance_loss_clip": 1.02447343, + "balance_loss_mlp": 1.01896918, + "epoch": 0.9307680745528333, + "flos": 19208151377280.0, + "grad_norm": 1.7338195822002462, + "language_loss": 0.67553151, + "learning_rate": 5.002662914604583e-08, + "loss": 0.69643712, + "num_input_tokens_seen": 334082005, + "step": 15481, + "time_per_iteration": 2.626338005065918 + }, + { + "auxiliary_loss_clip": 0.01032437, + "auxiliary_loss_mlp": 0.01027481, + "balance_loss_clip": 1.02070045, + "balance_loss_mlp": 1.01702034, + "epoch": 0.9308281978055013, + "flos": 19062641381760.0, + "grad_norm": 1.816950105833098, + "language_loss": 0.74418885, + "learning_rate": 4.994010308952701e-08, + "loss": 0.76478803, + "num_input_tokens_seen": 334101375, + "step": 15482, + "time_per_iteration": 2.6254639625549316 + }, + { + "auxiliary_loss_clip": 0.0104874, + "auxiliary_loss_mlp": 0.01024431, + "balance_loss_clip": 1.02288461, + "balance_loss_mlp": 1.01474559, + "epoch": 0.9308883210581692, + "flos": 20521548178560.0, + "grad_norm": 2.700156470665717, + "language_loss": 0.79612923, + "learning_rate": 4.985365097947469e-08, + "loss": 0.81686091, + "num_input_tokens_seen": 334119460, + "step": 15483, + "time_per_iteration": 2.625304937362671 + }, + { + "auxiliary_loss_clip": 0.01044478, + "auxiliary_loss_mlp": 0.01029606, + "balance_loss_clip": 1.02692819, + "balance_loss_mlp": 1.01903188, + "epoch": 0.9309484443108372, + "flos": 13001422826880.0, + "grad_norm": 2.056592484585102, + "language_loss": 0.74614459, + "learning_rate": 4.976727281916782e-08, + "loss": 0.7668854, + "num_input_tokens_seen": 334136065, + "step": 15484, + "time_per_iteration": 2.664320468902588 + }, + { + "auxiliary_loss_clip": 0.01044615, + "auxiliary_loss_mlp": 0.01031101, + "balance_loss_clip": 1.02614832, + "balance_loss_mlp": 1.02071786, + "epoch": 0.9310085675635051, + "flos": 12567765928320.0, + "grad_norm": 2.622555936990627, + "language_loss": 0.76141226, + "learning_rate": 4.968096861188087e-08, + "loss": 0.7821694, + "num_input_tokens_seen": 334153690, + "step": 15485, + "time_per_iteration": 2.6047070026397705 + }, + { + "auxiliary_loss_clip": 0.01012656, + "auxiliary_loss_mlp": 0.01029511, + "balance_loss_clip": 1.02133191, + "balance_loss_mlp": 1.01785779, + "epoch": 0.9310686908161732, + "flos": 23477570864640.0, + "grad_norm": 1.8464635846677644, + "language_loss": 0.78344995, + "learning_rate": 4.959473836088723e-08, + "loss": 0.80387163, + "num_input_tokens_seen": 334171880, + "step": 15486, + "time_per_iteration": 2.7299344539642334 + }, + { + "auxiliary_loss_clip": 0.01034928, + "auxiliary_loss_mlp": 0.01029884, + "balance_loss_clip": 1.02676582, + "balance_loss_mlp": 1.01894629, + "epoch": 0.9311288140688411, + "flos": 24170287628160.0, + "grad_norm": 5.402939218900678, + "language_loss": 0.76757073, + "learning_rate": 4.950858206945674e-08, + "loss": 0.78821886, + "num_input_tokens_seen": 334190005, + "step": 15487, + "time_per_iteration": 2.7052459716796875 + }, + { + "auxiliary_loss_clip": 0.01034865, + "auxiliary_loss_mlp": 0.01026523, + "balance_loss_clip": 1.02682543, + "balance_loss_mlp": 1.01598501, + "epoch": 0.9311889373215091, + "flos": 35590203561600.0, + "grad_norm": 2.106561045598465, + "language_loss": 0.67128271, + "learning_rate": 4.942249974085633e-08, + "loss": 0.69189662, + "num_input_tokens_seen": 334209545, + "step": 15488, + "time_per_iteration": 2.8809657096862793 + }, + { + "auxiliary_loss_clip": 0.01040297, + "auxiliary_loss_mlp": 0.0102962, + "balance_loss_clip": 1.02541232, + "balance_loss_mlp": 1.01914787, + "epoch": 0.9312490605741771, + "flos": 20230528187520.0, + "grad_norm": 1.8540762641170998, + "language_loss": 0.74887669, + "learning_rate": 4.933649137834983e-08, + "loss": 0.76957583, + "num_input_tokens_seen": 334228900, + "step": 15489, + "time_per_iteration": 2.729665517807007 + }, + { + "auxiliary_loss_clip": 0.01062883, + "auxiliary_loss_mlp": 0.01029603, + "balance_loss_clip": 1.02502561, + "balance_loss_mlp": 1.01923192, + "epoch": 0.931309183826845, + "flos": 13950577762560.0, + "grad_norm": 3.0017676419199595, + "language_loss": 0.80922556, + "learning_rate": 4.925055698519931e-08, + "loss": 0.83015037, + "num_input_tokens_seen": 334245500, + "step": 15490, + "time_per_iteration": 4.363323450088501 + }, + { + "auxiliary_loss_clip": 0.01022943, + "auxiliary_loss_mlp": 0.010279, + "balance_loss_clip": 1.02536857, + "balance_loss_mlp": 1.01700997, + "epoch": 0.931369307079513, + "flos": 20156731695360.0, + "grad_norm": 2.3189716636163205, + "language_loss": 0.7192198, + "learning_rate": 4.9164696564663264e-08, + "loss": 0.73972821, + "num_input_tokens_seen": 334264370, + "step": 15491, + "time_per_iteration": 2.817436695098877 + }, + { + "auxiliary_loss_clip": 0.0103715, + "auxiliary_loss_mlp": 0.00747462, + "balance_loss_clip": 1.02258778, + "balance_loss_mlp": 1.00035715, + "epoch": 0.931429430332181, + "flos": 25338569483520.0, + "grad_norm": 1.9085676106067535, + "language_loss": 0.74401283, + "learning_rate": 4.9078910119997096e-08, + "loss": 0.76185894, + "num_input_tokens_seen": 334283905, + "step": 15492, + "time_per_iteration": 2.724979877471924 + }, + { + "auxiliary_loss_clip": 0.00996718, + "auxiliary_loss_mlp": 0.01002657, + "balance_loss_clip": 1.00113046, + "balance_loss_mlp": 1.00183427, + "epoch": 0.931489553584849, + "flos": 71226193985280.0, + "grad_norm": 0.708622560522646, + "language_loss": 0.53404212, + "learning_rate": 4.899319765445442e-08, + "loss": 0.55403584, + "num_input_tokens_seen": 334339925, + "step": 15493, + "time_per_iteration": 3.1339426040649414 + }, + { + "auxiliary_loss_clip": 0.01048935, + "auxiliary_loss_mlp": 0.01029988, + "balance_loss_clip": 1.02367783, + "balance_loss_mlp": 1.02022481, + "epoch": 0.9315496768375169, + "flos": 14643653662080.0, + "grad_norm": 1.7731363947466094, + "language_loss": 0.70764577, + "learning_rate": 4.890755917128531e-08, + "loss": 0.72843504, + "num_input_tokens_seen": 334357225, + "step": 15494, + "time_per_iteration": 2.5797982215881348 + }, + { + "auxiliary_loss_clip": 0.01044829, + "auxiliary_loss_mlp": 0.01026763, + "balance_loss_clip": 1.02493811, + "balance_loss_mlp": 1.01651049, + "epoch": 0.9316098000901849, + "flos": 28329928174080.0, + "grad_norm": 1.6274405814543405, + "language_loss": 0.68299544, + "learning_rate": 4.882199467373671e-08, + "loss": 0.70371139, + "num_input_tokens_seen": 334375945, + "step": 15495, + "time_per_iteration": 2.7166879177093506 + }, + { + "auxiliary_loss_clip": 0.01058221, + "auxiliary_loss_mlp": 0.01028648, + "balance_loss_clip": 1.02228451, + "balance_loss_mlp": 1.01932645, + "epoch": 0.9316699233428528, + "flos": 28512677594880.0, + "grad_norm": 2.030521458085875, + "language_loss": 0.61604851, + "learning_rate": 4.8736504165053815e-08, + "loss": 0.63691723, + "num_input_tokens_seen": 334395310, + "step": 15496, + "time_per_iteration": 2.580207109451294 + }, + { + "auxiliary_loss_clip": 0.01052392, + "auxiliary_loss_mlp": 0.01032709, + "balance_loss_clip": 1.02443266, + "balance_loss_mlp": 1.02224827, + "epoch": 0.9317300465955208, + "flos": 33693402061440.0, + "grad_norm": 1.5791610945786754, + "language_loss": 0.773444, + "learning_rate": 4.865108764847825e-08, + "loss": 0.79429501, + "num_input_tokens_seen": 334416965, + "step": 15497, + "time_per_iteration": 2.791142225265503 + }, + { + "auxiliary_loss_clip": 0.01055977, + "auxiliary_loss_mlp": 0.00747506, + "balance_loss_clip": 1.0265758, + "balance_loss_mlp": 1.00031853, + "epoch": 0.9317901698481887, + "flos": 23658237296640.0, + "grad_norm": 2.8314657349634764, + "language_loss": 0.66698062, + "learning_rate": 4.856574512724898e-08, + "loss": 0.68501544, + "num_input_tokens_seen": 334435620, + "step": 15498, + "time_per_iteration": 2.644897937774658 + }, + { + "auxiliary_loss_clip": 0.01043235, + "auxiliary_loss_mlp": 0.01032125, + "balance_loss_clip": 1.02549648, + "balance_loss_mlp": 1.02157497, + "epoch": 0.9318502931008568, + "flos": 20960017499520.0, + "grad_norm": 1.5354022469854798, + "language_loss": 0.79796529, + "learning_rate": 4.8480476604602305e-08, + "loss": 0.81871885, + "num_input_tokens_seen": 334456210, + "step": 15499, + "time_per_iteration": 2.6742286682128906 + }, + { + "auxiliary_loss_clip": 0.01020518, + "auxiliary_loss_mlp": 0.01033323, + "balance_loss_clip": 1.02476585, + "balance_loss_mlp": 1.02209902, + "epoch": 0.9319104163535247, + "flos": 23441049711360.0, + "grad_norm": 1.9362692870190579, + "language_loss": 0.77038813, + "learning_rate": 4.8395282083771196e-08, + "loss": 0.79092652, + "num_input_tokens_seen": 334475485, + "step": 15500, + "time_per_iteration": 2.836369752883911 + }, + { + "auxiliary_loss_clip": 0.01030819, + "auxiliary_loss_mlp": 0.01024451, + "balance_loss_clip": 1.02370834, + "balance_loss_mlp": 1.0147295, + "epoch": 0.9319705396061927, + "flos": 22347426274560.0, + "grad_norm": 1.7635209840274872, + "language_loss": 0.72329515, + "learning_rate": 4.8310161567987064e-08, + "loss": 0.74384785, + "num_input_tokens_seen": 334494740, + "step": 15501, + "time_per_iteration": 2.896552324295044 + }, + { + "auxiliary_loss_clip": 0.01063472, + "auxiliary_loss_mlp": 0.01031375, + "balance_loss_clip": 1.02564716, + "balance_loss_mlp": 1.0207113, + "epoch": 0.9320306628588607, + "flos": 20993557824000.0, + "grad_norm": 1.7949381929308295, + "language_loss": 0.66398001, + "learning_rate": 4.822511506047666e-08, + "loss": 0.68492848, + "num_input_tokens_seen": 334511910, + "step": 15502, + "time_per_iteration": 2.5809569358825684 + }, + { + "auxiliary_loss_clip": 0.01053998, + "auxiliary_loss_mlp": 0.00747798, + "balance_loss_clip": 1.0254755, + "balance_loss_mlp": 1.00049019, + "epoch": 0.9320907861115286, + "flos": 24538300421760.0, + "grad_norm": 1.7150873066509003, + "language_loss": 0.65557301, + "learning_rate": 4.814014256446586e-08, + "loss": 0.6735909, + "num_input_tokens_seen": 334533150, + "step": 15503, + "time_per_iteration": 2.677076578140259 + }, + { + "auxiliary_loss_clip": 0.01021183, + "auxiliary_loss_mlp": 0.01030103, + "balance_loss_clip": 1.0205394, + "balance_loss_mlp": 1.01883137, + "epoch": 0.9321509093641966, + "flos": 19785414850560.0, + "grad_norm": 1.6222177336666115, + "language_loss": 0.7522248, + "learning_rate": 4.805524408317652e-08, + "loss": 0.77273762, + "num_input_tokens_seen": 334550940, + "step": 15504, + "time_per_iteration": 2.6939878463745117 + }, + { + "auxiliary_loss_clip": 0.0105316, + "auxiliary_loss_mlp": 0.00747611, + "balance_loss_clip": 1.02506638, + "balance_loss_mlp": 1.00037909, + "epoch": 0.9322110326168646, + "flos": 24972675592320.0, + "grad_norm": 2.0107806282675043, + "language_loss": 0.70753562, + "learning_rate": 4.797041961982762e-08, + "loss": 0.72554338, + "num_input_tokens_seen": 334570935, + "step": 15505, + "time_per_iteration": 2.6624197959899902 + }, + { + "auxiliary_loss_clip": 0.01041513, + "auxiliary_loss_mlp": 0.01028895, + "balance_loss_clip": 1.02382112, + "balance_loss_mlp": 1.01803517, + "epoch": 0.9322711558695326, + "flos": 16143642639360.0, + "grad_norm": 2.3189449723274325, + "language_loss": 0.75477272, + "learning_rate": 4.788566917763614e-08, + "loss": 0.77547681, + "num_input_tokens_seen": 334589315, + "step": 15506, + "time_per_iteration": 2.6021339893341064 + }, + { + "auxiliary_loss_clip": 0.01024698, + "auxiliary_loss_mlp": 0.01023531, + "balance_loss_clip": 1.02408171, + "balance_loss_mlp": 1.01369011, + "epoch": 0.9323312791222005, + "flos": 23732428838400.0, + "grad_norm": 1.9845717143029815, + "language_loss": 0.83154738, + "learning_rate": 4.780099275981597e-08, + "loss": 0.85202968, + "num_input_tokens_seen": 334608990, + "step": 15507, + "time_per_iteration": 4.58607292175293 + }, + { + "auxiliary_loss_clip": 0.01063078, + "auxiliary_loss_mlp": 0.01026857, + "balance_loss_clip": 1.02519691, + "balance_loss_mlp": 1.01658678, + "epoch": 0.9323914023748685, + "flos": 20777914523520.0, + "grad_norm": 1.7058167599765368, + "language_loss": 0.68162578, + "learning_rate": 4.771639036957742e-08, + "loss": 0.70252514, + "num_input_tokens_seen": 334628655, + "step": 15508, + "time_per_iteration": 2.650810956954956 + }, + { + "auxiliary_loss_clip": 0.01028196, + "auxiliary_loss_mlp": 0.01027188, + "balance_loss_clip": 1.02327693, + "balance_loss_mlp": 1.01702499, + "epoch": 0.9324515256275364, + "flos": 23915178259200.0, + "grad_norm": 1.6198967551596586, + "language_loss": 0.72508669, + "learning_rate": 4.7631862010129033e-08, + "loss": 0.74564052, + "num_input_tokens_seen": 334648295, + "step": 15509, + "time_per_iteration": 2.704813241958618 + }, + { + "auxiliary_loss_clip": 0.01051948, + "auxiliary_loss_mlp": 0.01027881, + "balance_loss_clip": 1.02470183, + "balance_loss_mlp": 1.01765275, + "epoch": 0.9325116488802044, + "flos": 18005215875840.0, + "grad_norm": 1.934454881220635, + "language_loss": 0.74661303, + "learning_rate": 4.754740768467624e-08, + "loss": 0.76741129, + "num_input_tokens_seen": 334666280, + "step": 15510, + "time_per_iteration": 2.624842405319214 + }, + { + "auxiliary_loss_clip": 0.01049511, + "auxiliary_loss_mlp": 0.01025102, + "balance_loss_clip": 1.02221692, + "balance_loss_mlp": 1.0148859, + "epoch": 0.9325717721328723, + "flos": 29021603443200.0, + "grad_norm": 2.0650447960408727, + "language_loss": 0.7031427, + "learning_rate": 4.746302739642161e-08, + "loss": 0.72388887, + "num_input_tokens_seen": 334688830, + "step": 15511, + "time_per_iteration": 2.7093238830566406 + }, + { + "auxiliary_loss_clip": 0.0103361, + "auxiliary_loss_mlp": 0.01031178, + "balance_loss_clip": 1.02253997, + "balance_loss_mlp": 1.02084851, + "epoch": 0.9326318953855404, + "flos": 21646341642240.0, + "grad_norm": 4.701762062079118, + "language_loss": 0.78178096, + "learning_rate": 4.737872114856412e-08, + "loss": 0.80242884, + "num_input_tokens_seen": 334705205, + "step": 15512, + "time_per_iteration": 2.6460366249084473 + }, + { + "auxiliary_loss_clip": 0.01060256, + "auxiliary_loss_mlp": 0.01026167, + "balance_loss_clip": 1.02406645, + "balance_loss_mlp": 1.01582527, + "epoch": 0.9326920186382083, + "flos": 26065724411520.0, + "grad_norm": 1.6535531318704422, + "language_loss": 0.80339682, + "learning_rate": 4.7294488944301436e-08, + "loss": 0.82426107, + "num_input_tokens_seen": 334723830, + "step": 15513, + "time_per_iteration": 2.5784542560577393 + }, + { + "auxiliary_loss_clip": 0.01037116, + "auxiliary_loss_mlp": 0.01028585, + "balance_loss_clip": 1.02545357, + "balance_loss_mlp": 1.01705754, + "epoch": 0.9327521418908763, + "flos": 12057116227200.0, + "grad_norm": 1.9437804935373801, + "language_loss": 0.8010385, + "learning_rate": 4.721033078682768e-08, + "loss": 0.82169557, + "num_input_tokens_seen": 334740825, + "step": 15514, + "time_per_iteration": 2.542294502258301 + }, + { + "auxiliary_loss_clip": 0.01032197, + "auxiliary_loss_mlp": 0.01034743, + "balance_loss_clip": 1.02436996, + "balance_loss_mlp": 1.02483678, + "epoch": 0.9328122651435443, + "flos": 43834395271680.0, + "grad_norm": 1.661794782613842, + "language_loss": 0.71585673, + "learning_rate": 4.7126246679333626e-08, + "loss": 0.73652613, + "num_input_tokens_seen": 334765825, + "step": 15515, + "time_per_iteration": 2.7332613468170166 + }, + { + "auxiliary_loss_clip": 0.01047535, + "auxiliary_loss_mlp": 0.01032178, + "balance_loss_clip": 1.02684426, + "balance_loss_mlp": 1.02084076, + "epoch": 0.9328723883962122, + "flos": 15194954580480.0, + "grad_norm": 2.6861040122609783, + "language_loss": 0.8076635, + "learning_rate": 4.704223662500806e-08, + "loss": 0.82846069, + "num_input_tokens_seen": 334782680, + "step": 15516, + "time_per_iteration": 2.7594006061553955 + }, + { + "auxiliary_loss_clip": 0.01020972, + "auxiliary_loss_mlp": 0.0103445, + "balance_loss_clip": 1.02172971, + "balance_loss_mlp": 1.0225414, + "epoch": 0.9329325116488802, + "flos": 20261770041600.0, + "grad_norm": 1.989014700851724, + "language_loss": 0.81133443, + "learning_rate": 4.695830062703643e-08, + "loss": 0.83188868, + "num_input_tokens_seen": 334800160, + "step": 15517, + "time_per_iteration": 2.805591344833374 + }, + { + "auxiliary_loss_clip": 0.01042665, + "auxiliary_loss_mlp": 0.01029821, + "balance_loss_clip": 1.02461326, + "balance_loss_mlp": 1.01856732, + "epoch": 0.9329926349015482, + "flos": 13115008609920.0, + "grad_norm": 1.9917542531706947, + "language_loss": 0.74675417, + "learning_rate": 4.687443868860219e-08, + "loss": 0.767479, + "num_input_tokens_seen": 334815840, + "step": 15518, + "time_per_iteration": 4.54601526260376 + }, + { + "auxiliary_loss_clip": 0.01035852, + "auxiliary_loss_mlp": 0.01032989, + "balance_loss_clip": 1.02232778, + "balance_loss_mlp": 1.02256441, + "epoch": 0.9330527581542162, + "flos": 23040250778880.0, + "grad_norm": 2.128922632768536, + "language_loss": 0.75866216, + "learning_rate": 4.679065081288458e-08, + "loss": 0.77935052, + "num_input_tokens_seen": 334834735, + "step": 15519, + "time_per_iteration": 4.291054010391235 + }, + { + "auxiliary_loss_clip": 0.01007388, + "auxiliary_loss_mlp": 0.01029168, + "balance_loss_clip": 1.02125573, + "balance_loss_mlp": 1.01801062, + "epoch": 0.9331128814068841, + "flos": 15559627409280.0, + "grad_norm": 2.1363270047149983, + "language_loss": 0.83314705, + "learning_rate": 4.6706937003061275e-08, + "loss": 0.85351264, + "num_input_tokens_seen": 334853490, + "step": 15520, + "time_per_iteration": 2.8327479362487793 + }, + { + "auxiliary_loss_clip": 0.01048099, + "auxiliary_loss_mlp": 0.01026446, + "balance_loss_clip": 1.02313161, + "balance_loss_mlp": 1.01654577, + "epoch": 0.9331730046595521, + "flos": 22271762275200.0, + "grad_norm": 1.644766930664792, + "language_loss": 0.76415181, + "learning_rate": 4.6623297262306846e-08, + "loss": 0.78489733, + "num_input_tokens_seen": 334873675, + "step": 15521, + "time_per_iteration": 2.6531386375427246 + }, + { + "auxiliary_loss_clip": 0.01052535, + "auxiliary_loss_mlp": 0.01027597, + "balance_loss_clip": 1.02505505, + "balance_loss_mlp": 1.01763153, + "epoch": 0.93323312791222, + "flos": 15777641007360.0, + "grad_norm": 1.735988830991831, + "language_loss": 0.7767756, + "learning_rate": 4.6539731593792545e-08, + "loss": 0.7975769, + "num_input_tokens_seen": 334890970, + "step": 15522, + "time_per_iteration": 2.6027915477752686 + }, + { + "auxiliary_loss_clip": 0.01027048, + "auxiliary_loss_mlp": 0.00747649, + "balance_loss_clip": 1.02197576, + "balance_loss_mlp": 1.00036097, + "epoch": 0.933293251164888, + "flos": 22010978557440.0, + "grad_norm": 2.393528322200381, + "language_loss": 0.62835771, + "learning_rate": 4.6456240000687373e-08, + "loss": 0.64610469, + "num_input_tokens_seen": 334906635, + "step": 15523, + "time_per_iteration": 2.6924147605895996 + }, + { + "auxiliary_loss_clip": 0.01038432, + "auxiliary_loss_mlp": 0.01029621, + "balance_loss_clip": 1.02383995, + "balance_loss_mlp": 1.01959538, + "epoch": 0.933353374417556, + "flos": 26031358074240.0, + "grad_norm": 2.013964738548211, + "language_loss": 0.68234265, + "learning_rate": 4.63728224861577e-08, + "loss": 0.7030232, + "num_input_tokens_seen": 334926230, + "step": 15524, + "time_per_iteration": 2.73128604888916 + }, + { + "auxiliary_loss_clip": 0.01019976, + "auxiliary_loss_mlp": 0.01029709, + "balance_loss_clip": 1.02326524, + "balance_loss_mlp": 1.01891422, + "epoch": 0.933413497670224, + "flos": 24900100162560.0, + "grad_norm": 1.6239398018494122, + "language_loss": 0.73740733, + "learning_rate": 4.628947905336589e-08, + "loss": 0.75790417, + "num_input_tokens_seen": 334946680, + "step": 15525, + "time_per_iteration": 2.801069974899292 + }, + { + "auxiliary_loss_clip": 0.01014621, + "auxiliary_loss_mlp": 0.01033372, + "balance_loss_clip": 1.02381718, + "balance_loss_mlp": 1.02280974, + "epoch": 0.9334736209228919, + "flos": 23688689051520.0, + "grad_norm": 1.6715627059889246, + "language_loss": 0.83648098, + "learning_rate": 4.6206209705473175e-08, + "loss": 0.85696089, + "num_input_tokens_seen": 334964785, + "step": 15526, + "time_per_iteration": 2.8379054069519043 + }, + { + "auxiliary_loss_clip": 0.01018002, + "auxiliary_loss_mlp": 0.01029579, + "balance_loss_clip": 1.02429152, + "balance_loss_mlp": 1.01885641, + "epoch": 0.9335337441755599, + "flos": 15377344865280.0, + "grad_norm": 1.8361952954811651, + "language_loss": 0.69217545, + "learning_rate": 4.61230144456366e-08, + "loss": 0.71265125, + "num_input_tokens_seen": 334982400, + "step": 15527, + "time_per_iteration": 2.716033935546875 + }, + { + "auxiliary_loss_clip": 0.01062983, + "auxiliary_loss_mlp": 0.01026862, + "balance_loss_clip": 1.02491999, + "balance_loss_mlp": 1.01524544, + "epoch": 0.9335938674282279, + "flos": 16106726436480.0, + "grad_norm": 1.9977090847526955, + "language_loss": 0.65815455, + "learning_rate": 4.603989327701141e-08, + "loss": 0.67905307, + "num_input_tokens_seen": 334999685, + "step": 15528, + "time_per_iteration": 2.672682762145996 + }, + { + "auxiliary_loss_clip": 0.01063013, + "auxiliary_loss_mlp": 0.01029872, + "balance_loss_clip": 1.02428246, + "balance_loss_mlp": 1.01898825, + "epoch": 0.9336539906808958, + "flos": 18952898353920.0, + "grad_norm": 1.7944768462919578, + "language_loss": 0.75101709, + "learning_rate": 4.5956846202748867e-08, + "loss": 0.77194595, + "num_input_tokens_seen": 335019160, + "step": 15529, + "time_per_iteration": 2.5994532108306885 + }, + { + "auxiliary_loss_clip": 0.01014606, + "auxiliary_loss_mlp": 0.01026646, + "balance_loss_clip": 1.02107668, + "balance_loss_mlp": 1.01706195, + "epoch": 0.9337141139335638, + "flos": 18109104986880.0, + "grad_norm": 1.717709194148374, + "language_loss": 0.629094, + "learning_rate": 4.5873873225998674e-08, + "loss": 0.64950651, + "num_input_tokens_seen": 335037350, + "step": 15530, + "time_per_iteration": 2.7146224975585938 + }, + { + "auxiliary_loss_clip": 0.0104194, + "auxiliary_loss_mlp": 0.01028949, + "balance_loss_clip": 1.02555025, + "balance_loss_mlp": 1.0191977, + "epoch": 0.9337742371862318, + "flos": 17345716214400.0, + "grad_norm": 1.7731138020436426, + "language_loss": 0.72468573, + "learning_rate": 4.5790974349907194e-08, + "loss": 0.74539465, + "num_input_tokens_seen": 335056060, + "step": 15531, + "time_per_iteration": 2.7282819747924805 + }, + { + "auxiliary_loss_clip": 0.01037905, + "auxiliary_loss_mlp": 0.01028871, + "balance_loss_clip": 1.02305818, + "balance_loss_mlp": 1.01802933, + "epoch": 0.9338343604388998, + "flos": 29058986522880.0, + "grad_norm": 2.005549280838796, + "language_loss": 0.71002012, + "learning_rate": 4.5708149577617925e-08, + "loss": 0.73068786, + "num_input_tokens_seen": 335075410, + "step": 15532, + "time_per_iteration": 2.7696022987365723 + }, + { + "auxiliary_loss_clip": 0.01063133, + "auxiliary_loss_mlp": 0.00747608, + "balance_loss_clip": 1.02569103, + "balance_loss_mlp": 1.00039434, + "epoch": 0.9338944836915677, + "flos": 18660908695680.0, + "grad_norm": 1.5014468093114959, + "language_loss": 0.73500085, + "learning_rate": 4.5625398912271016e-08, + "loss": 0.75310826, + "num_input_tokens_seen": 335095190, + "step": 15533, + "time_per_iteration": 2.672241687774658 + }, + { + "auxiliary_loss_clip": 0.01026698, + "auxiliary_loss_mlp": 0.01023494, + "balance_loss_clip": 1.02233243, + "balance_loss_mlp": 1.01395679, + "epoch": 0.9339546069442357, + "flos": 16617735273600.0, + "grad_norm": 1.7502854779076773, + "language_loss": 0.79821211, + "learning_rate": 4.554272235700507e-08, + "loss": 0.81871402, + "num_input_tokens_seen": 335113825, + "step": 15534, + "time_per_iteration": 2.7350664138793945 + }, + { + "auxiliary_loss_clip": 0.01057691, + "auxiliary_loss_mlp": 0.01025744, + "balance_loss_clip": 1.02466953, + "balance_loss_mlp": 1.01665449, + "epoch": 0.9340147301969036, + "flos": 23693106424320.0, + "grad_norm": 2.7233849396405163, + "language_loss": 0.74310297, + "learning_rate": 4.546011991495513e-08, + "loss": 0.76393729, + "num_input_tokens_seen": 335136425, + "step": 15535, + "time_per_iteration": 2.746101140975952 + }, + { + "auxiliary_loss_clip": 0.01054529, + "auxiliary_loss_mlp": 0.01027228, + "balance_loss_clip": 1.02696157, + "balance_loss_mlp": 1.01652932, + "epoch": 0.9340748534495716, + "flos": 28654452576000.0, + "grad_norm": 1.7932959725355393, + "language_loss": 0.77507061, + "learning_rate": 4.537759158925292e-08, + "loss": 0.79588819, + "num_input_tokens_seen": 335157925, + "step": 15536, + "time_per_iteration": 2.8482160568237305 + }, + { + "auxiliary_loss_clip": 0.01031028, + "auxiliary_loss_mlp": 0.0102349, + "balance_loss_clip": 1.02369428, + "balance_loss_mlp": 1.01363742, + "epoch": 0.9341349767022396, + "flos": 24899633285760.0, + "grad_norm": 1.9704546738385018, + "language_loss": 0.80711699, + "learning_rate": 4.5295137383028593e-08, + "loss": 0.82766223, + "num_input_tokens_seen": 335177840, + "step": 15537, + "time_per_iteration": 4.531624794006348 + }, + { + "auxiliary_loss_clip": 0.01045304, + "auxiliary_loss_mlp": 0.0102935, + "balance_loss_clip": 1.0265224, + "balance_loss_mlp": 1.01931822, + "epoch": 0.9341950999549076, + "flos": 29059525226880.0, + "grad_norm": 1.8289140931847605, + "language_loss": 0.78012168, + "learning_rate": 4.5212757299408764e-08, + "loss": 0.80086815, + "num_input_tokens_seen": 335199470, + "step": 15538, + "time_per_iteration": 2.732107162475586 + }, + { + "auxiliary_loss_clip": 0.01041344, + "auxiliary_loss_mlp": 0.01025168, + "balance_loss_clip": 1.02505469, + "balance_loss_mlp": 1.01499343, + "epoch": 0.9342552232075755, + "flos": 23587062497280.0, + "grad_norm": 1.6801337496433109, + "language_loss": 0.73599041, + "learning_rate": 4.513045134151672e-08, + "loss": 0.75665551, + "num_input_tokens_seen": 335218885, + "step": 15539, + "time_per_iteration": 2.6980345249176025 + }, + { + "auxiliary_loss_clip": 0.01016929, + "auxiliary_loss_mlp": 0.01024617, + "balance_loss_clip": 1.02710593, + "balance_loss_mlp": 1.01519346, + "epoch": 0.9343153464602435, + "flos": 36721389646080.0, + "grad_norm": 2.5278356547096728, + "language_loss": 0.64648312, + "learning_rate": 4.504821951247373e-08, + "loss": 0.66689855, + "num_input_tokens_seen": 335239485, + "step": 15540, + "time_per_iteration": 2.8848414421081543 + }, + { + "auxiliary_loss_clip": 0.01050736, + "auxiliary_loss_mlp": 0.01027319, + "balance_loss_clip": 1.02388334, + "balance_loss_mlp": 1.01762164, + "epoch": 0.9343754697129115, + "flos": 22236498097920.0, + "grad_norm": 1.6868954482758276, + "language_loss": 0.76654565, + "learning_rate": 4.496606181539864e-08, + "loss": 0.78732622, + "num_input_tokens_seen": 335258355, + "step": 15541, + "time_per_iteration": 2.628316879272461 + }, + { + "auxiliary_loss_clip": 0.01055764, + "auxiliary_loss_mlp": 0.0102726, + "balance_loss_clip": 1.02819407, + "balance_loss_mlp": 1.016644, + "epoch": 0.9344355929655794, + "flos": 29710333797120.0, + "grad_norm": 2.313202446435932, + "language_loss": 0.66672999, + "learning_rate": 4.4883978253406066e-08, + "loss": 0.6875602, + "num_input_tokens_seen": 335276835, + "step": 15542, + "time_per_iteration": 2.70155930519104 + }, + { + "auxiliary_loss_clip": 0.01024445, + "auxiliary_loss_mlp": 0.01029505, + "balance_loss_clip": 1.02359009, + "balance_loss_mlp": 1.01868093, + "epoch": 0.9344957162182475, + "flos": 18880394751360.0, + "grad_norm": 1.664513591604751, + "language_loss": 0.69485551, + "learning_rate": 4.480196882960907e-08, + "loss": 0.71539497, + "num_input_tokens_seen": 335296220, + "step": 15543, + "time_per_iteration": 2.6116700172424316 + }, + { + "auxiliary_loss_clip": 0.0105006, + "auxiliary_loss_mlp": 0.01031937, + "balance_loss_clip": 1.02238083, + "balance_loss_mlp": 1.02056396, + "epoch": 0.9345558394709154, + "flos": 27417761268480.0, + "grad_norm": 1.815151203556593, + "language_loss": 0.69600582, + "learning_rate": 4.4720033547117394e-08, + "loss": 0.71682578, + "num_input_tokens_seen": 335316335, + "step": 15544, + "time_per_iteration": 2.696415901184082 + }, + { + "auxiliary_loss_clip": 0.01046174, + "auxiliary_loss_mlp": 0.01037041, + "balance_loss_clip": 1.02335727, + "balance_loss_mlp": 1.02566886, + "epoch": 0.9346159627235834, + "flos": 20741285629440.0, + "grad_norm": 1.6412502857025237, + "language_loss": 0.77143669, + "learning_rate": 4.463817240903789e-08, + "loss": 0.79226887, + "num_input_tokens_seen": 335335545, + "step": 15545, + "time_per_iteration": 2.7143094539642334 + }, + { + "auxiliary_loss_clip": 0.01053244, + "auxiliary_loss_mlp": 0.01026026, + "balance_loss_clip": 1.02578056, + "balance_loss_mlp": 1.01645398, + "epoch": 0.9346760859762513, + "flos": 21069221823360.0, + "grad_norm": 1.6023142107092663, + "language_loss": 0.68953174, + "learning_rate": 4.455638541847495e-08, + "loss": 0.71032447, + "num_input_tokens_seen": 335355350, + "step": 15546, + "time_per_iteration": 2.6369547843933105 + }, + { + "auxiliary_loss_clip": 0.01018086, + "auxiliary_loss_mlp": 0.01026418, + "balance_loss_clip": 1.02043724, + "balance_loss_mlp": 1.01670837, + "epoch": 0.9347362092289193, + "flos": 29204927481600.0, + "grad_norm": 1.8845828763825918, + "language_loss": 0.82416087, + "learning_rate": 4.447467257852966e-08, + "loss": 0.84460592, + "num_input_tokens_seen": 335375160, + "step": 15547, + "time_per_iteration": 2.739564895629883 + }, + { + "auxiliary_loss_clip": 0.0104289, + "auxiliary_loss_mlp": 0.01032236, + "balance_loss_clip": 1.02199221, + "balance_loss_mlp": 1.02169156, + "epoch": 0.9347963324815872, + "flos": 19427350124160.0, + "grad_norm": 2.059846648276398, + "language_loss": 0.83516437, + "learning_rate": 4.439303389230087e-08, + "loss": 0.85591567, + "num_input_tokens_seen": 335394080, + "step": 15548, + "time_per_iteration": 2.6604864597320557 + }, + { + "auxiliary_loss_clip": 0.01054324, + "auxiliary_loss_mlp": 0.01030918, + "balance_loss_clip": 1.02488232, + "balance_loss_mlp": 1.01922953, + "epoch": 0.9348564557342552, + "flos": 36901840596480.0, + "grad_norm": 1.6610743445986407, + "language_loss": 0.65422249, + "learning_rate": 4.4311469362884326e-08, + "loss": 0.67507482, + "num_input_tokens_seen": 335414230, + "step": 15549, + "time_per_iteration": 2.728935956954956 + }, + { + "auxiliary_loss_clip": 0.01053812, + "auxiliary_loss_mlp": 0.01029074, + "balance_loss_clip": 1.02576959, + "balance_loss_mlp": 1.01832104, + "epoch": 0.9349165789869232, + "flos": 21690117342720.0, + "grad_norm": 2.3852583234320246, + "language_loss": 0.8023895, + "learning_rate": 4.4229978993372665e-08, + "loss": 0.82321835, + "num_input_tokens_seen": 335432890, + "step": 15550, + "time_per_iteration": 2.625164031982422 + }, + { + "auxiliary_loss_clip": 0.01053507, + "auxiliary_loss_mlp": 0.01026425, + "balance_loss_clip": 1.02660835, + "balance_loss_mlp": 1.0165906, + "epoch": 0.9349767022395912, + "flos": 18844053166080.0, + "grad_norm": 1.5820244511421389, + "language_loss": 0.75667781, + "learning_rate": 4.4148562786856524e-08, + "loss": 0.77747715, + "num_input_tokens_seen": 335452085, + "step": 15551, + "time_per_iteration": 2.6507530212402344 + }, + { + "auxiliary_loss_clip": 0.01003557, + "auxiliary_loss_mlp": 0.01029813, + "balance_loss_clip": 1.02208316, + "balance_loss_mlp": 1.02100396, + "epoch": 0.9350368254922591, + "flos": 24973429777920.0, + "grad_norm": 1.6126295427649147, + "language_loss": 0.7339921, + "learning_rate": 4.406722074642255e-08, + "loss": 0.75432575, + "num_input_tokens_seen": 335472130, + "step": 15552, + "time_per_iteration": 2.7920169830322266 + }, + { + "auxiliary_loss_clip": 0.01013925, + "auxiliary_loss_mlp": 0.01035343, + "balance_loss_clip": 1.02157819, + "balance_loss_mlp": 1.02451277, + "epoch": 0.9350969487449271, + "flos": 23070594792960.0, + "grad_norm": 1.7224991259186735, + "language_loss": 0.77354193, + "learning_rate": 4.3985952875155386e-08, + "loss": 0.7940346, + "num_input_tokens_seen": 335489970, + "step": 15553, + "time_per_iteration": 2.7647225856781006 + }, + { + "auxiliary_loss_clip": 0.01028374, + "auxiliary_loss_mlp": 0.01037981, + "balance_loss_clip": 1.02242589, + "balance_loss_mlp": 1.0267694, + "epoch": 0.9351570719975951, + "flos": 18625177641600.0, + "grad_norm": 1.5675131203349013, + "language_loss": 0.78458279, + "learning_rate": 4.390475917613723e-08, + "loss": 0.80524635, + "num_input_tokens_seen": 335509125, + "step": 15554, + "time_per_iteration": 2.7426748275756836 + }, + { + "auxiliary_loss_clip": 0.01030721, + "auxiliary_loss_mlp": 0.01026583, + "balance_loss_clip": 1.02066028, + "balance_loss_mlp": 1.01723075, + "epoch": 0.935217195250263, + "flos": 15888353702400.0, + "grad_norm": 1.5062011442432823, + "language_loss": 0.6929605, + "learning_rate": 4.382363965244695e-08, + "loss": 0.71353352, + "num_input_tokens_seen": 335525620, + "step": 15555, + "time_per_iteration": 4.44148063659668 + }, + { + "auxiliary_loss_clip": 0.00973766, + "auxiliary_loss_mlp": 0.01041463, + "balance_loss_clip": 1.01962256, + "balance_loss_mlp": 1.029459, + "epoch": 0.935277318502931, + "flos": 24390312387840.0, + "grad_norm": 1.6106753176276971, + "language_loss": 0.7542299, + "learning_rate": 4.374259430715965e-08, + "loss": 0.77438217, + "num_input_tokens_seen": 335547565, + "step": 15556, + "time_per_iteration": 2.966339111328125 + }, + { + "auxiliary_loss_clip": 0.0104016, + "auxiliary_loss_mlp": 0.01028329, + "balance_loss_clip": 1.02332091, + "balance_loss_mlp": 1.01848245, + "epoch": 0.935337441755599, + "flos": 27600259294080.0, + "grad_norm": 1.7986762011018431, + "language_loss": 0.72102606, + "learning_rate": 4.366162314334953e-08, + "loss": 0.74171102, + "num_input_tokens_seen": 335570285, + "step": 15557, + "time_per_iteration": 3.0070712566375732 + }, + { + "auxiliary_loss_clip": 0.01063035, + "auxiliary_loss_mlp": 0.01030982, + "balance_loss_clip": 1.02536142, + "balance_loss_mlp": 1.02037847, + "epoch": 0.935397565008267, + "flos": 20482872209280.0, + "grad_norm": 1.5635393050745159, + "language_loss": 0.63229322, + "learning_rate": 4.358072616408681e-08, + "loss": 0.65323341, + "num_input_tokens_seen": 335588600, + "step": 15558, + "time_per_iteration": 2.5717897415161133 + }, + { + "auxiliary_loss_clip": 0.01041188, + "auxiliary_loss_mlp": 0.01027323, + "balance_loss_clip": 1.02430582, + "balance_loss_mlp": 1.01640964, + "epoch": 0.9354576882609349, + "flos": 23654394541440.0, + "grad_norm": 1.825423037798088, + "language_loss": 0.72912836, + "learning_rate": 4.34999033724388e-08, + "loss": 0.74981344, + "num_input_tokens_seen": 335606235, + "step": 15559, + "time_per_iteration": 2.6379354000091553 + }, + { + "auxiliary_loss_clip": 0.01003326, + "auxiliary_loss_mlp": 0.00747552, + "balance_loss_clip": 1.02024615, + "balance_loss_mlp": 1.00034809, + "epoch": 0.9355178115136029, + "flos": 36684904406400.0, + "grad_norm": 1.7140681418785069, + "language_loss": 0.63636255, + "learning_rate": 4.341915477147062e-08, + "loss": 0.65387142, + "num_input_tokens_seen": 335628240, + "step": 15560, + "time_per_iteration": 2.859941005706787 + }, + { + "auxiliary_loss_clip": 0.01003745, + "auxiliary_loss_mlp": 0.01033449, + "balance_loss_clip": 1.0248518, + "balance_loss_mlp": 1.02114105, + "epoch": 0.9355779347662708, + "flos": 14460401450880.0, + "grad_norm": 2.1688206774799093, + "language_loss": 0.64129865, + "learning_rate": 4.3338480364244034e-08, + "loss": 0.66167057, + "num_input_tokens_seen": 335643755, + "step": 15561, + "time_per_iteration": 2.755575180053711 + }, + { + "auxiliary_loss_clip": 0.01061905, + "auxiliary_loss_mlp": 0.01031948, + "balance_loss_clip": 1.02582073, + "balance_loss_mlp": 1.02160072, + "epoch": 0.9356380580189388, + "flos": 23185976256000.0, + "grad_norm": 1.6943255335414826, + "language_loss": 0.75322139, + "learning_rate": 4.325788015381859e-08, + "loss": 0.77415991, + "num_input_tokens_seen": 335665160, + "step": 15562, + "time_per_iteration": 2.583889961242676 + }, + { + "auxiliary_loss_clip": 0.00996311, + "auxiliary_loss_mlp": 0.01000178, + "balance_loss_clip": 1.00159013, + "balance_loss_mlp": 0.9993372, + "epoch": 0.9356981812716068, + "flos": 67471626090240.0, + "grad_norm": 1.067145634279502, + "language_loss": 0.62414277, + "learning_rate": 4.31773541432503e-08, + "loss": 0.64410764, + "num_input_tokens_seen": 335715240, + "step": 15563, + "time_per_iteration": 3.0534255504608154 + }, + { + "auxiliary_loss_clip": 0.01011288, + "auxiliary_loss_mlp": 0.01031122, + "balance_loss_clip": 1.02332568, + "balance_loss_mlp": 1.02143669, + "epoch": 0.9357583045242748, + "flos": 24681619687680.0, + "grad_norm": 1.668178287030258, + "language_loss": 0.78635389, + "learning_rate": 4.3096902335592714e-08, + "loss": 0.80677801, + "num_input_tokens_seen": 335734970, + "step": 15564, + "time_per_iteration": 2.78355073928833 + }, + { + "auxiliary_loss_clip": 0.01062302, + "auxiliary_loss_mlp": 0.01027742, + "balance_loss_clip": 1.02439511, + "balance_loss_mlp": 1.01666737, + "epoch": 0.9358184277769427, + "flos": 19463727623040.0, + "grad_norm": 1.98367532990988, + "language_loss": 0.78209448, + "learning_rate": 4.301652473389694e-08, + "loss": 0.80299497, + "num_input_tokens_seen": 335753435, + "step": 15565, + "time_per_iteration": 4.265955209732056 + }, + { + "auxiliary_loss_clip": 0.01047056, + "auxiliary_loss_mlp": 0.01024352, + "balance_loss_clip": 1.02162254, + "balance_loss_mlp": 1.01467824, + "epoch": 0.9358785510296107, + "flos": 18916987731840.0, + "grad_norm": 1.9571930896802956, + "language_loss": 0.72399527, + "learning_rate": 4.2936221341210774e-08, + "loss": 0.74470937, + "num_input_tokens_seen": 335772105, + "step": 15566, + "time_per_iteration": 4.472599983215332 + }, + { + "auxiliary_loss_clip": 0.0102236, + "auxiliary_loss_mlp": 0.00747585, + "balance_loss_clip": 1.02163684, + "balance_loss_mlp": 1.00036263, + "epoch": 0.9359386742822787, + "flos": 23441265192960.0, + "grad_norm": 3.002462079467363, + "language_loss": 0.68002999, + "learning_rate": 4.285599216057889e-08, + "loss": 0.69772947, + "num_input_tokens_seen": 335789125, + "step": 15567, + "time_per_iteration": 2.755453586578369 + }, + { + "auxiliary_loss_clip": 0.01034088, + "auxiliary_loss_mlp": 0.01029222, + "balance_loss_clip": 1.02381563, + "balance_loss_mlp": 1.0182786, + "epoch": 0.9359987975349466, + "flos": 32744067557760.0, + "grad_norm": 1.8488132755417446, + "language_loss": 0.61498398, + "learning_rate": 4.277583719504418e-08, + "loss": 0.63561708, + "num_input_tokens_seen": 335810995, + "step": 15568, + "time_per_iteration": 2.7802727222442627 + }, + { + "auxiliary_loss_clip": 0.01033033, + "auxiliary_loss_mlp": 0.01033671, + "balance_loss_clip": 1.02020621, + "balance_loss_mlp": 1.02328813, + "epoch": 0.9360589207876147, + "flos": 22819651401600.0, + "grad_norm": 1.8623229583012328, + "language_loss": 0.78846955, + "learning_rate": 4.269575644764556e-08, + "loss": 0.80913663, + "num_input_tokens_seen": 335830580, + "step": 15569, + "time_per_iteration": 2.7853097915649414 + }, + { + "auxiliary_loss_clip": 0.0104282, + "auxiliary_loss_mlp": 0.01028886, + "balance_loss_clip": 1.02520418, + "balance_loss_mlp": 1.01793635, + "epoch": 0.9361190440402826, + "flos": 20885251340160.0, + "grad_norm": 2.257642106300605, + "language_loss": 0.69370753, + "learning_rate": 4.261574992142014e-08, + "loss": 0.71442461, + "num_input_tokens_seen": 335846515, + "step": 15570, + "time_per_iteration": 2.690659761428833 + }, + { + "auxiliary_loss_clip": 0.01055972, + "auxiliary_loss_mlp": 0.01028672, + "balance_loss_clip": 1.02760625, + "balance_loss_mlp": 1.01821697, + "epoch": 0.9361791672929506, + "flos": 19317822577920.0, + "grad_norm": 1.8377768255419695, + "language_loss": 0.79033411, + "learning_rate": 4.2535817619401726e-08, + "loss": 0.81118047, + "num_input_tokens_seen": 335863350, + "step": 15571, + "time_per_iteration": 2.60455322265625 + }, + { + "auxiliary_loss_clip": 0.01029078, + "auxiliary_loss_mlp": 0.01029325, + "balance_loss_clip": 1.02283764, + "balance_loss_mlp": 1.01916802, + "epoch": 0.9362392905456185, + "flos": 15158182032000.0, + "grad_norm": 2.145518804719659, + "language_loss": 0.77549368, + "learning_rate": 4.2455959544621224e-08, + "loss": 0.79607773, + "num_input_tokens_seen": 335880510, + "step": 15572, + "time_per_iteration": 2.75312876701355 + }, + { + "auxiliary_loss_clip": 0.01039324, + "auxiliary_loss_mlp": 0.01033731, + "balance_loss_clip": 1.02411008, + "balance_loss_mlp": 1.02382493, + "epoch": 0.9362994137982865, + "flos": 22085888371200.0, + "grad_norm": 1.760217879869163, + "language_loss": 0.77826595, + "learning_rate": 4.237617570010688e-08, + "loss": 0.79899645, + "num_input_tokens_seen": 335899440, + "step": 15573, + "time_per_iteration": 2.76279354095459 + }, + { + "auxiliary_loss_clip": 0.01022022, + "auxiliary_loss_mlp": 0.01028792, + "balance_loss_clip": 1.02105653, + "balance_loss_mlp": 1.01887345, + "epoch": 0.9363595370509544, + "flos": 23512260424320.0, + "grad_norm": 1.5325753411930978, + "language_loss": 0.74253488, + "learning_rate": 4.2296466088884044e-08, + "loss": 0.76304305, + "num_input_tokens_seen": 335919540, + "step": 15574, + "time_per_iteration": 2.7244715690612793 + }, + { + "auxiliary_loss_clip": 0.01012819, + "auxiliary_loss_mlp": 0.01030072, + "balance_loss_clip": 1.02290201, + "balance_loss_mlp": 1.01933718, + "epoch": 0.9364196603036224, + "flos": 27123473139840.0, + "grad_norm": 1.8074560725528497, + "language_loss": 0.68099242, + "learning_rate": 4.221683071397564e-08, + "loss": 0.70142132, + "num_input_tokens_seen": 335939665, + "step": 15575, + "time_per_iteration": 2.723355293273926 + }, + { + "auxiliary_loss_clip": 0.01034124, + "auxiliary_loss_mlp": 0.01032002, + "balance_loss_clip": 1.02315319, + "balance_loss_mlp": 1.02127886, + "epoch": 0.9364797835562904, + "flos": 18479057114880.0, + "grad_norm": 1.5763440087383112, + "language_loss": 0.65023327, + "learning_rate": 4.2137269578401026e-08, + "loss": 0.6708945, + "num_input_tokens_seen": 335958580, + "step": 15576, + "time_per_iteration": 2.67379093170166 + }, + { + "auxiliary_loss_clip": 0.0104503, + "auxiliary_loss_mlp": 0.01025644, + "balance_loss_clip": 1.02149057, + "balance_loss_mlp": 1.01386595, + "epoch": 0.9365399068089584, + "flos": 13005552890880.0, + "grad_norm": 2.28458722062135, + "language_loss": 0.75627899, + "learning_rate": 4.2057782685177566e-08, + "loss": 0.77698576, + "num_input_tokens_seen": 335974965, + "step": 15577, + "time_per_iteration": 2.589118719100952 + }, + { + "auxiliary_loss_clip": 0.01015919, + "auxiliary_loss_mlp": 0.01026187, + "balance_loss_clip": 1.02061462, + "balance_loss_mlp": 1.0156492, + "epoch": 0.9366000300616263, + "flos": 25666433850240.0, + "grad_norm": 2.655107079072335, + "language_loss": 0.51513112, + "learning_rate": 4.1978370037318855e-08, + "loss": 0.53555214, + "num_input_tokens_seen": 335996575, + "step": 15578, + "time_per_iteration": 2.808688163757324 + }, + { + "auxiliary_loss_clip": 0.0099572, + "auxiliary_loss_mlp": 0.01029538, + "balance_loss_clip": 1.01871061, + "balance_loss_mlp": 1.01923203, + "epoch": 0.9366601533142943, + "flos": 21433355948160.0, + "grad_norm": 1.5277071927372496, + "language_loss": 0.70853698, + "learning_rate": 4.189903163783692e-08, + "loss": 0.72878951, + "num_input_tokens_seen": 336017265, + "step": 15579, + "time_per_iteration": 2.789597272872925 + }, + { + "auxiliary_loss_clip": 0.01033725, + "auxiliary_loss_mlp": 0.01025829, + "balance_loss_clip": 1.02111089, + "balance_loss_mlp": 1.01597643, + "epoch": 0.9367202765669622, + "flos": 24093222998400.0, + "grad_norm": 1.9671086330241419, + "language_loss": 0.76717377, + "learning_rate": 4.181976748973959e-08, + "loss": 0.78776932, + "num_input_tokens_seen": 336035905, + "step": 15580, + "time_per_iteration": 2.6611220836639404 + }, + { + "auxiliary_loss_clip": 0.010476, + "auxiliary_loss_mlp": 0.01030727, + "balance_loss_clip": 1.023821, + "balance_loss_mlp": 1.01914024, + "epoch": 0.9367803998196302, + "flos": 20888842700160.0, + "grad_norm": 1.9426445716353855, + "language_loss": 0.66127896, + "learning_rate": 4.1740577596033114e-08, + "loss": 0.68206215, + "num_input_tokens_seen": 336055585, + "step": 15581, + "time_per_iteration": 2.678872585296631 + }, + { + "auxiliary_loss_clip": 0.01051537, + "auxiliary_loss_mlp": 0.01024876, + "balance_loss_clip": 1.02480876, + "balance_loss_mlp": 1.01434374, + "epoch": 0.9368405230722983, + "flos": 22564362464640.0, + "grad_norm": 1.5549814510515738, + "language_loss": 0.76926023, + "learning_rate": 4.166146195972042e-08, + "loss": 0.79002428, + "num_input_tokens_seen": 336076695, + "step": 15582, + "time_per_iteration": 2.857823610305786 + }, + { + "auxiliary_loss_clip": 0.0099035, + "auxiliary_loss_mlp": 0.01027898, + "balance_loss_clip": 1.02116418, + "balance_loss_mlp": 1.0173775, + "epoch": 0.9369006463249662, + "flos": 18880215183360.0, + "grad_norm": 1.6484878818622162, + "language_loss": 0.73648483, + "learning_rate": 4.1582420583800905e-08, + "loss": 0.75666732, + "num_input_tokens_seen": 336094740, + "step": 15583, + "time_per_iteration": 2.8309075832366943 + }, + { + "auxiliary_loss_clip": 0.01065297, + "auxiliary_loss_mlp": 0.01029231, + "balance_loss_clip": 1.02646518, + "balance_loss_mlp": 1.01822793, + "epoch": 0.9369607695776342, + "flos": 26432516142720.0, + "grad_norm": 2.742502998821468, + "language_loss": 0.8428123, + "learning_rate": 4.1503453471272376e-08, + "loss": 0.86375749, + "num_input_tokens_seen": 336113985, + "step": 15584, + "time_per_iteration": 4.357911825180054 + }, + { + "auxiliary_loss_clip": 0.01056292, + "auxiliary_loss_mlp": 0.00747689, + "balance_loss_clip": 1.02610254, + "balance_loss_mlp": 1.00039446, + "epoch": 0.9370208928303021, + "flos": 39567346081920.0, + "grad_norm": 1.596076306116999, + "language_loss": 0.72380197, + "learning_rate": 4.1424560625129334e-08, + "loss": 0.74184173, + "num_input_tokens_seen": 336136395, + "step": 15585, + "time_per_iteration": 2.811978578567505 + }, + { + "auxiliary_loss_clip": 0.01029047, + "auxiliary_loss_mlp": 0.01021742, + "balance_loss_clip": 1.02286458, + "balance_loss_mlp": 1.01266456, + "epoch": 0.9370810160829701, + "flos": 22963114321920.0, + "grad_norm": 1.8612382091427078, + "language_loss": 0.8033393, + "learning_rate": 4.134574204836316e-08, + "loss": 0.82384717, + "num_input_tokens_seen": 336156345, + "step": 15586, + "time_per_iteration": 2.7605340480804443 + }, + { + "auxiliary_loss_clip": 0.01030356, + "auxiliary_loss_mlp": 0.01033663, + "balance_loss_clip": 1.0239563, + "balance_loss_mlp": 1.02278519, + "epoch": 0.937141139335638, + "flos": 23075048079360.0, + "grad_norm": 1.579971197654062, + "language_loss": 0.76799309, + "learning_rate": 4.126699774396258e-08, + "loss": 0.78863335, + "num_input_tokens_seen": 336176760, + "step": 15587, + "time_per_iteration": 2.8290746212005615 + }, + { + "auxiliary_loss_clip": 0.01043493, + "auxiliary_loss_mlp": 0.01031233, + "balance_loss_clip": 1.02443683, + "balance_loss_mlp": 1.02042055, + "epoch": 0.937201262588306, + "flos": 16356664247040.0, + "grad_norm": 1.9674988889951732, + "language_loss": 0.87962675, + "learning_rate": 4.118832771491387e-08, + "loss": 0.900374, + "num_input_tokens_seen": 336193285, + "step": 15588, + "time_per_iteration": 2.6896426677703857 + }, + { + "auxiliary_loss_clip": 0.01058988, + "auxiliary_loss_mlp": 0.00747606, + "balance_loss_clip": 1.02421737, + "balance_loss_mlp": 1.00037956, + "epoch": 0.937261385840974, + "flos": 20194078861440.0, + "grad_norm": 1.7144936003308349, + "language_loss": 0.78246248, + "learning_rate": 4.11097319642002e-08, + "loss": 0.80052841, + "num_input_tokens_seen": 336211425, + "step": 15589, + "time_per_iteration": 2.666841983795166 + }, + { + "auxiliary_loss_clip": 0.0105845, + "auxiliary_loss_mlp": 0.01030614, + "balance_loss_clip": 1.02401459, + "balance_loss_mlp": 1.02033854, + "epoch": 0.937321509093642, + "flos": 18295948558080.0, + "grad_norm": 1.970274320216797, + "language_loss": 0.78206718, + "learning_rate": 4.103121049480163e-08, + "loss": 0.80295783, + "num_input_tokens_seen": 336230205, + "step": 15590, + "time_per_iteration": 2.62182354927063 + }, + { + "auxiliary_loss_clip": 0.010271, + "auxiliary_loss_mlp": 0.01034024, + "balance_loss_clip": 1.02178264, + "balance_loss_mlp": 1.02214479, + "epoch": 0.9373816323463099, + "flos": 25884662929920.0, + "grad_norm": 1.7083072268633155, + "language_loss": 0.71617174, + "learning_rate": 4.095276330969577e-08, + "loss": 0.73678297, + "num_input_tokens_seen": 336252440, + "step": 15591, + "time_per_iteration": 2.6757142543792725 + }, + { + "auxiliary_loss_clip": 0.0105579, + "auxiliary_loss_mlp": 0.00747724, + "balance_loss_clip": 1.02588892, + "balance_loss_mlp": 1.00038719, + "epoch": 0.9374417555989779, + "flos": 27198849830400.0, + "grad_norm": 2.680790590521957, + "language_loss": 0.53627813, + "learning_rate": 4.0874390411857804e-08, + "loss": 0.5543133, + "num_input_tokens_seen": 336273845, + "step": 15592, + "time_per_iteration": 2.691995859146118 + }, + { + "auxiliary_loss_clip": 0.01051783, + "auxiliary_loss_mlp": 0.01024387, + "balance_loss_clip": 1.02613163, + "balance_loss_mlp": 1.01445103, + "epoch": 0.9375018788516458, + "flos": 23621249266560.0, + "grad_norm": 1.5491308982071434, + "language_loss": 0.67334831, + "learning_rate": 4.0796091804259136e-08, + "loss": 0.69411004, + "num_input_tokens_seen": 336292790, + "step": 15593, + "time_per_iteration": 2.664931058883667 + }, + { + "auxiliary_loss_clip": 0.01040871, + "auxiliary_loss_mlp": 0.01026455, + "balance_loss_clip": 1.02322853, + "balance_loss_mlp": 1.01635218, + "epoch": 0.9375620021043138, + "flos": 22678774260480.0, + "grad_norm": 1.7782269065148808, + "language_loss": 0.74072725, + "learning_rate": 4.0717867489868715e-08, + "loss": 0.76140058, + "num_input_tokens_seen": 336312600, + "step": 15594, + "time_per_iteration": 2.698944091796875 + }, + { + "auxiliary_loss_clip": 0.0104847, + "auxiliary_loss_mlp": 0.01026761, + "balance_loss_clip": 1.0230906, + "balance_loss_mlp": 1.01749873, + "epoch": 0.9376221253569819, + "flos": 27560254521600.0, + "grad_norm": 1.5015516450794753, + "language_loss": 0.73629284, + "learning_rate": 4.063971747165351e-08, + "loss": 0.75704521, + "num_input_tokens_seen": 336332770, + "step": 15595, + "time_per_iteration": 2.7435860633850098 + }, + { + "auxiliary_loss_clip": 0.01033132, + "auxiliary_loss_mlp": 0.01027067, + "balance_loss_clip": 1.02361608, + "balance_loss_mlp": 1.01726174, + "epoch": 0.9376822486096498, + "flos": 24129887806080.0, + "grad_norm": 1.8096725101997804, + "language_loss": 0.76487523, + "learning_rate": 4.056164175257626e-08, + "loss": 0.78547728, + "num_input_tokens_seen": 336351445, + "step": 15596, + "time_per_iteration": 2.630645275115967 + }, + { + "auxiliary_loss_clip": 0.01033361, + "auxiliary_loss_mlp": 0.01026664, + "balance_loss_clip": 1.02447248, + "balance_loss_mlp": 1.01675737, + "epoch": 0.9377423718623178, + "flos": 22784028088320.0, + "grad_norm": 1.9930406358681128, + "language_loss": 0.78705013, + "learning_rate": 4.0483640335597926e-08, + "loss": 0.80765033, + "num_input_tokens_seen": 336368690, + "step": 15597, + "time_per_iteration": 2.6178267002105713 + }, + { + "auxiliary_loss_clip": 0.01063943, + "auxiliary_loss_mlp": 0.01027336, + "balance_loss_clip": 1.02490687, + "balance_loss_mlp": 1.01685095, + "epoch": 0.9378024951149857, + "flos": 19168900790400.0, + "grad_norm": 1.602619571572336, + "language_loss": 0.81463587, + "learning_rate": 4.0405713223676363e-08, + "loss": 0.83554864, + "num_input_tokens_seen": 336388165, + "step": 15598, + "time_per_iteration": 2.5567872524261475 + }, + { + "auxiliary_loss_clip": 0.01035383, + "auxiliary_loss_mlp": 0.01026693, + "balance_loss_clip": 1.0245626, + "balance_loss_mlp": 1.01581514, + "epoch": 0.9378626183676537, + "flos": 23505508667520.0, + "grad_norm": 1.8859752002938999, + "language_loss": 0.62641382, + "learning_rate": 4.0327860419766994e-08, + "loss": 0.64703453, + "num_input_tokens_seen": 336406475, + "step": 15599, + "time_per_iteration": 2.7254562377929688 + }, + { + "auxiliary_loss_clip": 0.01024679, + "auxiliary_loss_mlp": 0.01030057, + "balance_loss_clip": 1.02265167, + "balance_loss_mlp": 1.01926875, + "epoch": 0.9379227416203216, + "flos": 18405655672320.0, + "grad_norm": 1.6943230787325863, + "language_loss": 0.73237979, + "learning_rate": 4.0250081926821e-08, + "loss": 0.75292718, + "num_input_tokens_seen": 336424690, + "step": 15600, + "time_per_iteration": 2.78999662399292 + }, + { + "auxiliary_loss_clip": 0.01042559, + "auxiliary_loss_mlp": 0.01027671, + "balance_loss_clip": 1.0253408, + "balance_loss_mlp": 1.01788378, + "epoch": 0.9379828648729897, + "flos": 17821855923840.0, + "grad_norm": 1.8942854467106023, + "language_loss": 0.69359815, + "learning_rate": 4.0172377747788474e-08, + "loss": 0.71430045, + "num_input_tokens_seen": 336443055, + "step": 15601, + "time_per_iteration": 2.6148126125335693 + }, + { + "auxiliary_loss_clip": 0.00998563, + "auxiliary_loss_mlp": 0.01000375, + "balance_loss_clip": 1.00314736, + "balance_loss_mlp": 0.99951094, + "epoch": 0.9380429881256576, + "flos": 68024399466240.0, + "grad_norm": 0.7558526420661397, + "language_loss": 0.58128953, + "learning_rate": 4.009474788561573e-08, + "loss": 0.6012789, + "num_input_tokens_seen": 336510190, + "step": 15602, + "time_per_iteration": 5.149209499359131 + }, + { + "auxiliary_loss_clip": 0.00988932, + "auxiliary_loss_mlp": 0.01033453, + "balance_loss_clip": 1.02228713, + "balance_loss_mlp": 1.02205062, + "epoch": 0.9381031113783256, + "flos": 20776980769920.0, + "grad_norm": 2.1717557558181704, + "language_loss": 0.72247481, + "learning_rate": 4.001719234324663e-08, + "loss": 0.74269867, + "num_input_tokens_seen": 336529250, + "step": 15603, + "time_per_iteration": 2.7804784774780273 + }, + { + "auxiliary_loss_clip": 0.01054676, + "auxiliary_loss_mlp": 0.01027076, + "balance_loss_clip": 1.02209234, + "balance_loss_mlp": 1.01812911, + "epoch": 0.9381632346309935, + "flos": 19025078734080.0, + "grad_norm": 1.8746941133591584, + "language_loss": 0.76055306, + "learning_rate": 3.993971112362171e-08, + "loss": 0.78137052, + "num_input_tokens_seen": 336548530, + "step": 15604, + "time_per_iteration": 2.547543525695801 + }, + { + "auxiliary_loss_clip": 0.01034024, + "auxiliary_loss_mlp": 0.01032261, + "balance_loss_clip": 1.0211494, + "balance_loss_mlp": 1.01974416, + "epoch": 0.9382233578836615, + "flos": 23513840622720.0, + "grad_norm": 1.9864205963478545, + "language_loss": 0.65555495, + "learning_rate": 3.9862304229679734e-08, + "loss": 0.67621779, + "num_input_tokens_seen": 336568510, + "step": 15605, + "time_per_iteration": 2.700284719467163 + }, + { + "auxiliary_loss_clip": 0.01035191, + "auxiliary_loss_mlp": 0.0074779, + "balance_loss_clip": 1.02539277, + "balance_loss_mlp": 1.00039959, + "epoch": 0.9382834811363294, + "flos": 43067882016000.0, + "grad_norm": 1.9637150648496868, + "language_loss": 0.67218745, + "learning_rate": 3.9784971664355683e-08, + "loss": 0.69001734, + "num_input_tokens_seen": 336592020, + "step": 15606, + "time_per_iteration": 2.9237606525421143 + }, + { + "auxiliary_loss_clip": 0.01046589, + "auxiliary_loss_mlp": 0.01024173, + "balance_loss_clip": 1.02234793, + "balance_loss_mlp": 1.01463604, + "epoch": 0.9383436043889974, + "flos": 16436242828800.0, + "grad_norm": 1.8143779558853907, + "language_loss": 0.77301568, + "learning_rate": 3.970771343058166e-08, + "loss": 0.79372334, + "num_input_tokens_seen": 336610010, + "step": 15607, + "time_per_iteration": 2.723109006881714 + }, + { + "auxiliary_loss_clip": 0.01053022, + "auxiliary_loss_mlp": 0.01025343, + "balance_loss_clip": 1.02456403, + "balance_loss_mlp": 1.01515698, + "epoch": 0.9384037276416655, + "flos": 20740603271040.0, + "grad_norm": 1.7347859231782121, + "language_loss": 0.8256439, + "learning_rate": 3.963052953128776e-08, + "loss": 0.84642756, + "num_input_tokens_seen": 336628520, + "step": 15608, + "time_per_iteration": 2.7275238037109375 + }, + { + "auxiliary_loss_clip": 0.01054085, + "auxiliary_loss_mlp": 0.01029212, + "balance_loss_clip": 1.02681804, + "balance_loss_mlp": 1.01858425, + "epoch": 0.9384638508943334, + "flos": 19062677295360.0, + "grad_norm": 2.181055854066296, + "language_loss": 0.69207942, + "learning_rate": 3.9553419969400536e-08, + "loss": 0.71291244, + "num_input_tokens_seen": 336647365, + "step": 15609, + "time_per_iteration": 2.7459826469421387 + }, + { + "auxiliary_loss_clip": 0.01034623, + "auxiliary_loss_mlp": 0.01028724, + "balance_loss_clip": 1.02441025, + "balance_loss_mlp": 1.01760805, + "epoch": 0.9385239741470014, + "flos": 23404887694080.0, + "grad_norm": 1.8075934653232026, + "language_loss": 0.75333226, + "learning_rate": 3.9476384747844316e-08, + "loss": 0.77396572, + "num_input_tokens_seen": 336667165, + "step": 15610, + "time_per_iteration": 2.7934741973876953 + }, + { + "auxiliary_loss_clip": 0.01007441, + "auxiliary_loss_mlp": 0.01024878, + "balance_loss_clip": 1.02662659, + "balance_loss_mlp": 1.01492989, + "epoch": 0.9385840973996693, + "flos": 12824742804480.0, + "grad_norm": 2.0748497257266116, + "language_loss": 0.7523635, + "learning_rate": 3.939942386953987e-08, + "loss": 0.77268666, + "num_input_tokens_seen": 336684130, + "step": 15611, + "time_per_iteration": 4.467010021209717 + }, + { + "auxiliary_loss_clip": 0.01033731, + "auxiliary_loss_mlp": 0.0102816, + "balance_loss_clip": 1.02766085, + "balance_loss_mlp": 1.01808703, + "epoch": 0.9386442206523373, + "flos": 15486980152320.0, + "grad_norm": 2.155544482036362, + "language_loss": 0.65839189, + "learning_rate": 3.9322537337405756e-08, + "loss": 0.67901087, + "num_input_tokens_seen": 336701520, + "step": 15612, + "time_per_iteration": 2.8791580200195312 + }, + { + "auxiliary_loss_clip": 0.01048892, + "auxiliary_loss_mlp": 0.01026049, + "balance_loss_clip": 1.02368283, + "balance_loss_mlp": 1.01641059, + "epoch": 0.9387043439050052, + "flos": 21178821196800.0, + "grad_norm": 1.6821207570035517, + "language_loss": 0.56856126, + "learning_rate": 3.924572515435742e-08, + "loss": 0.58931065, + "num_input_tokens_seen": 336720675, + "step": 15613, + "time_per_iteration": 4.276970148086548 + }, + { + "auxiliary_loss_clip": 0.0104041, + "auxiliary_loss_mlp": 0.01027989, + "balance_loss_clip": 1.02324653, + "balance_loss_mlp": 1.01795721, + "epoch": 0.9387644671576733, + "flos": 27668273696640.0, + "grad_norm": 2.3362130054871217, + "language_loss": 0.71010816, + "learning_rate": 3.916898732330764e-08, + "loss": 0.73079216, + "num_input_tokens_seen": 336741005, + "step": 15614, + "time_per_iteration": 2.6449694633483887 + }, + { + "auxiliary_loss_clip": 0.01055088, + "auxiliary_loss_mlp": 0.01027041, + "balance_loss_clip": 1.02612615, + "balance_loss_mlp": 1.01602006, + "epoch": 0.9388245904103412, + "flos": 18836331742080.0, + "grad_norm": 1.7820679789797231, + "language_loss": 0.81142128, + "learning_rate": 3.9092323847166544e-08, + "loss": 0.83224261, + "num_input_tokens_seen": 336757990, + "step": 15615, + "time_per_iteration": 2.6034438610076904 + }, + { + "auxiliary_loss_clip": 0.01033837, + "auxiliary_loss_mlp": 0.01027988, + "balance_loss_clip": 1.02222204, + "balance_loss_mlp": 1.01745594, + "epoch": 0.9388847136630092, + "flos": 25483828083840.0, + "grad_norm": 2.90532946408121, + "language_loss": 0.7171039, + "learning_rate": 3.901573472884134e-08, + "loss": 0.7377221, + "num_input_tokens_seen": 336777705, + "step": 15616, + "time_per_iteration": 2.658568859100342 + }, + { + "auxiliary_loss_clip": 0.01064103, + "auxiliary_loss_mlp": 0.01026889, + "balance_loss_clip": 1.02685285, + "balance_loss_mlp": 1.01662552, + "epoch": 0.9389448369156771, + "flos": 18734992496640.0, + "grad_norm": 2.3051863083472726, + "language_loss": 0.66077244, + "learning_rate": 3.89392199712355e-08, + "loss": 0.68168235, + "num_input_tokens_seen": 336798275, + "step": 15617, + "time_per_iteration": 2.5678510665893555 + }, + { + "auxiliary_loss_clip": 0.0105534, + "auxiliary_loss_mlp": 0.01031728, + "balance_loss_clip": 1.02640164, + "balance_loss_mlp": 1.02029586, + "epoch": 0.9390049601683451, + "flos": 21717839664000.0, + "grad_norm": 2.2205411363295426, + "language_loss": 0.73483026, + "learning_rate": 3.886277957725092e-08, + "loss": 0.75570089, + "num_input_tokens_seen": 336813835, + "step": 15618, + "time_per_iteration": 2.638737678527832 + }, + { + "auxiliary_loss_clip": 0.01065253, + "auxiliary_loss_mlp": 0.01030694, + "balance_loss_clip": 1.02589607, + "balance_loss_mlp": 1.01896405, + "epoch": 0.939065083421013, + "flos": 19391224020480.0, + "grad_norm": 1.9348389357197175, + "language_loss": 0.69985974, + "learning_rate": 3.878641354978662e-08, + "loss": 0.72081923, + "num_input_tokens_seen": 336832210, + "step": 15619, + "time_per_iteration": 2.6548027992248535 + }, + { + "auxiliary_loss_clip": 0.01034295, + "auxiliary_loss_mlp": 0.0103133, + "balance_loss_clip": 1.02139938, + "balance_loss_mlp": 1.02049375, + "epoch": 0.939125206673681, + "flos": 24681511946880.0, + "grad_norm": 2.1288614400581056, + "language_loss": 0.7747708, + "learning_rate": 3.8710121891737834e-08, + "loss": 0.79542702, + "num_input_tokens_seen": 336851380, + "step": 15620, + "time_per_iteration": 2.729064702987671 + }, + { + "auxiliary_loss_clip": 0.01048305, + "auxiliary_loss_mlp": 0.01024892, + "balance_loss_clip": 1.02337551, + "balance_loss_mlp": 1.01467013, + "epoch": 0.9391853299263491, + "flos": 16325961096960.0, + "grad_norm": 3.1706516339489483, + "language_loss": 0.73722899, + "learning_rate": 3.8633904605998025e-08, + "loss": 0.75796103, + "num_input_tokens_seen": 336868525, + "step": 15621, + "time_per_iteration": 2.602656602859497 + }, + { + "auxiliary_loss_clip": 0.01027384, + "auxiliary_loss_mlp": 0.01029572, + "balance_loss_clip": 1.02199328, + "balance_loss_mlp": 1.01868248, + "epoch": 0.939245453179017, + "flos": 11655778590720.0, + "grad_norm": 1.9681244522238397, + "language_loss": 0.66051674, + "learning_rate": 3.855776169545688e-08, + "loss": 0.6810863, + "num_input_tokens_seen": 336886200, + "step": 15622, + "time_per_iteration": 2.790558338165283 + }, + { + "auxiliary_loss_clip": 0.01031485, + "auxiliary_loss_mlp": 0.01033111, + "balance_loss_clip": 1.02052557, + "balance_loss_mlp": 1.02178597, + "epoch": 0.939305576431685, + "flos": 23148700917120.0, + "grad_norm": 1.5747002486723756, + "language_loss": 0.71819443, + "learning_rate": 3.848169316300209e-08, + "loss": 0.73884034, + "num_input_tokens_seen": 336905815, + "step": 15623, + "time_per_iteration": 2.7753798961639404 + }, + { + "auxiliary_loss_clip": 0.01056122, + "auxiliary_loss_mlp": 0.0102653, + "balance_loss_clip": 1.02714026, + "balance_loss_mlp": 1.01606917, + "epoch": 0.9393656996843529, + "flos": 33287790706560.0, + "grad_norm": 1.85457430028854, + "language_loss": 0.72452205, + "learning_rate": 3.84056990115178e-08, + "loss": 0.74534857, + "num_input_tokens_seen": 336928460, + "step": 15624, + "time_per_iteration": 2.713637351989746 + }, + { + "auxiliary_loss_clip": 0.01025414, + "auxiliary_loss_mlp": 0.01027367, + "balance_loss_clip": 1.0215863, + "balance_loss_mlp": 1.01686442, + "epoch": 0.9394258229370209, + "flos": 21689434984320.0, + "grad_norm": 2.2821266169108694, + "language_loss": 0.89319015, + "learning_rate": 3.832977924388614e-08, + "loss": 0.91371787, + "num_input_tokens_seen": 336948320, + "step": 15625, + "time_per_iteration": 2.70477294921875 + }, + { + "auxiliary_loss_clip": 0.01050923, + "auxiliary_loss_mlp": 0.01029319, + "balance_loss_clip": 1.0244174, + "balance_loss_mlp": 1.01856589, + "epoch": 0.9394859461896888, + "flos": 23874203819520.0, + "grad_norm": 1.7931844929088914, + "language_loss": 0.83506531, + "learning_rate": 3.825393386298592e-08, + "loss": 0.85586774, + "num_input_tokens_seen": 336967670, + "step": 15626, + "time_per_iteration": 2.7089684009552 + }, + { + "auxiliary_loss_clip": 0.00989139, + "auxiliary_loss_mlp": 0.01001374, + "balance_loss_clip": 1.00278854, + "balance_loss_mlp": 1.00049162, + "epoch": 0.9395460694423569, + "flos": 61566116993280.0, + "grad_norm": 0.7758076685570088, + "language_loss": 0.56107819, + "learning_rate": 3.8178162871693284e-08, + "loss": 0.58098328, + "num_input_tokens_seen": 337028395, + "step": 15627, + "time_per_iteration": 3.2614104747772217 + }, + { + "auxiliary_loss_clip": 0.01009788, + "auxiliary_loss_mlp": 0.01034346, + "balance_loss_clip": 1.02133775, + "balance_loss_mlp": 1.02312255, + "epoch": 0.9396061926950248, + "flos": 20995712640000.0, + "grad_norm": 1.4776441647625034, + "language_loss": 0.70189309, + "learning_rate": 3.810246627288105e-08, + "loss": 0.72233438, + "num_input_tokens_seen": 337048150, + "step": 15628, + "time_per_iteration": 2.8760952949523926 + }, + { + "auxiliary_loss_clip": 0.01051275, + "auxiliary_loss_mlp": 0.01029113, + "balance_loss_clip": 1.02498889, + "balance_loss_mlp": 1.01902246, + "epoch": 0.9396663159476928, + "flos": 27487786832640.0, + "grad_norm": 1.4708263606579386, + "language_loss": 0.75500417, + "learning_rate": 3.8026844069420025e-08, + "loss": 0.77580798, + "num_input_tokens_seen": 337069315, + "step": 15629, + "time_per_iteration": 2.7201390266418457 + }, + { + "auxiliary_loss_clip": 0.00998413, + "auxiliary_loss_mlp": 0.01031549, + "balance_loss_clip": 1.02011919, + "balance_loss_mlp": 1.02115989, + "epoch": 0.9397264392003607, + "flos": 19427457864960.0, + "grad_norm": 2.359601720360879, + "language_loss": 0.74147886, + "learning_rate": 3.795129626417748e-08, + "loss": 0.76177847, + "num_input_tokens_seen": 337087765, + "step": 15630, + "time_per_iteration": 2.7627499103546143 + }, + { + "auxiliary_loss_clip": 0.01032575, + "auxiliary_loss_mlp": 0.01031255, + "balance_loss_clip": 1.02421808, + "balance_loss_mlp": 1.02117038, + "epoch": 0.9397865624530287, + "flos": 18004820826240.0, + "grad_norm": 1.8510033796931418, + "language_loss": 0.69575781, + "learning_rate": 3.787582286001845e-08, + "loss": 0.71639609, + "num_input_tokens_seen": 337106265, + "step": 15631, + "time_per_iteration": 4.53400182723999 + }, + { + "auxiliary_loss_clip": 0.010112, + "auxiliary_loss_mlp": 0.01034973, + "balance_loss_clip": 1.02136266, + "balance_loss_mlp": 1.02476287, + "epoch": 0.9398466857056966, + "flos": 22564613859840.0, + "grad_norm": 1.6441563333412639, + "language_loss": 0.75216854, + "learning_rate": 3.7800423859805086e-08, + "loss": 0.77263021, + "num_input_tokens_seen": 337126090, + "step": 15632, + "time_per_iteration": 2.8226535320281982 + }, + { + "auxiliary_loss_clip": 0.01053304, + "auxiliary_loss_mlp": 0.01030078, + "balance_loss_clip": 1.0246973, + "balance_loss_mlp": 1.01843131, + "epoch": 0.9399068089583646, + "flos": 24535678728960.0, + "grad_norm": 5.632478095964012, + "language_loss": 0.74235839, + "learning_rate": 3.772509926639622e-08, + "loss": 0.76319224, + "num_input_tokens_seen": 337145655, + "step": 15633, + "time_per_iteration": 2.656630516052246 + }, + { + "auxiliary_loss_clip": 0.01064575, + "auxiliary_loss_mlp": 0.01029782, + "balance_loss_clip": 1.02548337, + "balance_loss_mlp": 1.01864147, + "epoch": 0.9399669322110327, + "flos": 25630343660160.0, + "grad_norm": 1.7782902462010775, + "language_loss": 0.72632921, + "learning_rate": 3.764984908264823e-08, + "loss": 0.74727279, + "num_input_tokens_seen": 337164805, + "step": 15634, + "time_per_iteration": 2.642298936843872 + }, + { + "auxiliary_loss_clip": 0.01049331, + "auxiliary_loss_mlp": 0.01028144, + "balance_loss_clip": 1.02222967, + "balance_loss_mlp": 1.0171473, + "epoch": 0.9400270554637006, + "flos": 17089385783040.0, + "grad_norm": 1.5983851041097716, + "language_loss": 0.68861693, + "learning_rate": 3.75746733114144e-08, + "loss": 0.70939171, + "num_input_tokens_seen": 337182280, + "step": 15635, + "time_per_iteration": 2.58640456199646 + }, + { + "auxiliary_loss_clip": 0.01021984, + "auxiliary_loss_mlp": 0.01025464, + "balance_loss_clip": 1.02674699, + "balance_loss_mlp": 1.01551056, + "epoch": 0.9400871787163686, + "flos": 22055113393920.0, + "grad_norm": 1.5596672850104816, + "language_loss": 0.74506265, + "learning_rate": 3.7499571955545985e-08, + "loss": 0.76553714, + "num_input_tokens_seen": 337203495, + "step": 15636, + "time_per_iteration": 2.7452211380004883 + }, + { + "auxiliary_loss_clip": 0.01054329, + "auxiliary_loss_mlp": 0.01029959, + "balance_loss_clip": 1.02615929, + "balance_loss_mlp": 1.01985633, + "epoch": 0.9401473019690365, + "flos": 16982767238400.0, + "grad_norm": 2.071566996030656, + "language_loss": 0.83484036, + "learning_rate": 3.7424545017890054e-08, + "loss": 0.85568321, + "num_input_tokens_seen": 337220435, + "step": 15637, + "time_per_iteration": 2.569021463394165 + }, + { + "auxiliary_loss_clip": 0.01026066, + "auxiliary_loss_mlp": 0.01030614, + "balance_loss_clip": 1.0262506, + "balance_loss_mlp": 1.02008748, + "epoch": 0.9402074252217045, + "flos": 19681956702720.0, + "grad_norm": 2.2053570168403738, + "language_loss": 0.68819308, + "learning_rate": 3.7349592501292325e-08, + "loss": 0.7087599, + "num_input_tokens_seen": 337238095, + "step": 15638, + "time_per_iteration": 2.729933023452759 + }, + { + "auxiliary_loss_clip": 0.0104809, + "auxiliary_loss_mlp": 0.01029141, + "balance_loss_clip": 1.02398992, + "balance_loss_mlp": 1.02001595, + "epoch": 0.9402675484743724, + "flos": 24754302858240.0, + "grad_norm": 1.8495091957772398, + "language_loss": 0.8482573, + "learning_rate": 3.727471440859498e-08, + "loss": 0.86902958, + "num_input_tokens_seen": 337256645, + "step": 15639, + "time_per_iteration": 2.803678035736084 + }, + { + "auxiliary_loss_clip": 0.01036679, + "auxiliary_loss_mlp": 0.00747606, + "balance_loss_clip": 1.02233148, + "balance_loss_mlp": 1.00035548, + "epoch": 0.9403276717270405, + "flos": 25558630156800.0, + "grad_norm": 1.6079735724842998, + "language_loss": 0.78201801, + "learning_rate": 3.719991074263662e-08, + "loss": 0.79986084, + "num_input_tokens_seen": 337278360, + "step": 15640, + "time_per_iteration": 2.6847689151763916 + }, + { + "auxiliary_loss_clip": 0.01053099, + "auxiliary_loss_mlp": 0.01033588, + "balance_loss_clip": 1.0242486, + "balance_loss_mlp": 1.02298987, + "epoch": 0.9403877949797084, + "flos": 26689852154880.0, + "grad_norm": 1.5186770994654213, + "language_loss": 0.74099892, + "learning_rate": 3.7125181506254544e-08, + "loss": 0.76186585, + "num_input_tokens_seen": 337302480, + "step": 15641, + "time_per_iteration": 2.696976900100708 + }, + { + "auxiliary_loss_clip": 0.01054529, + "auxiliary_loss_mlp": 0.01029547, + "balance_loss_clip": 1.0244813, + "balance_loss_mlp": 1.01720881, + "epoch": 0.9404479182323764, + "flos": 15011666455680.0, + "grad_norm": 2.3661959807368875, + "language_loss": 0.82427198, + "learning_rate": 3.7050526702282256e-08, + "loss": 0.84511268, + "num_input_tokens_seen": 337316600, + "step": 15642, + "time_per_iteration": 2.561032295227051 + }, + { + "auxiliary_loss_clip": 0.0104879, + "auxiliary_loss_mlp": 0.01028172, + "balance_loss_clip": 1.02312541, + "balance_loss_mlp": 1.01848078, + "epoch": 0.9405080414850443, + "flos": 24973573432320.0, + "grad_norm": 1.8266552006730643, + "language_loss": 0.68283987, + "learning_rate": 3.697594633355084e-08, + "loss": 0.70360947, + "num_input_tokens_seen": 337336895, + "step": 15643, + "time_per_iteration": 2.6962974071502686 + }, + { + "auxiliary_loss_clip": 0.01049271, + "auxiliary_loss_mlp": 0.01035884, + "balance_loss_clip": 1.02457011, + "balance_loss_mlp": 1.02434468, + "epoch": 0.9405681647377123, + "flos": 20844743777280.0, + "grad_norm": 1.7854818636082082, + "language_loss": 0.76771015, + "learning_rate": 3.6901440402888226e-08, + "loss": 0.7885617, + "num_input_tokens_seen": 337355105, + "step": 15644, + "time_per_iteration": 2.655701160430908 + }, + { + "auxiliary_loss_clip": 0.01041711, + "auxiliary_loss_mlp": 0.0102778, + "balance_loss_clip": 1.02261138, + "balance_loss_mlp": 1.0180645, + "epoch": 0.9406282879903802, + "flos": 23805578885760.0, + "grad_norm": 1.8123971406956447, + "language_loss": 0.67627263, + "learning_rate": 3.682700891311974e-08, + "loss": 0.69696748, + "num_input_tokens_seen": 337374905, + "step": 15645, + "time_per_iteration": 2.647632122039795 + }, + { + "auxiliary_loss_clip": 0.01030737, + "auxiliary_loss_mlp": 0.00747528, + "balance_loss_clip": 1.02453947, + "balance_loss_mlp": 1.00030494, + "epoch": 0.9406884112430483, + "flos": 27674953626240.0, + "grad_norm": 1.9492216767173067, + "language_loss": 0.70592368, + "learning_rate": 3.6752651867067774e-08, + "loss": 0.72370636, + "num_input_tokens_seen": 337397130, + "step": 15646, + "time_per_iteration": 2.6689963340759277 + }, + { + "auxiliary_loss_clip": 0.01041787, + "auxiliary_loss_mlp": 0.01028228, + "balance_loss_clip": 1.02135372, + "balance_loss_mlp": 1.01809502, + "epoch": 0.9407485344957163, + "flos": 23075048079360.0, + "grad_norm": 1.7946861246027555, + "language_loss": 0.74072939, + "learning_rate": 3.667836926755208e-08, + "loss": 0.76142955, + "num_input_tokens_seen": 337418660, + "step": 15647, + "time_per_iteration": 2.6972744464874268 + }, + { + "auxiliary_loss_clip": 0.00989128, + "auxiliary_loss_mlp": 0.01001554, + "balance_loss_clip": 1.00303674, + "balance_loss_mlp": 1.00070143, + "epoch": 0.9408086577483842, + "flos": 71014034304000.0, + "grad_norm": 0.9954185071404342, + "language_loss": 0.63540542, + "learning_rate": 3.660416111738907e-08, + "loss": 0.6553123, + "num_input_tokens_seen": 337478055, + "step": 15648, + "time_per_iteration": 3.3589751720428467 + }, + { + "auxiliary_loss_clip": 0.01059601, + "auxiliary_loss_mlp": 0.0102749, + "balance_loss_clip": 1.02504075, + "balance_loss_mlp": 1.0183351, + "epoch": 0.9408687810010522, + "flos": 23730956380800.0, + "grad_norm": 1.530637250898182, + "language_loss": 0.6649338, + "learning_rate": 3.653002741939337e-08, + "loss": 0.68580467, + "num_input_tokens_seen": 337499405, + "step": 15649, + "time_per_iteration": 2.6665923595428467 + }, + { + "auxiliary_loss_clip": 0.01027472, + "auxiliary_loss_mlp": 0.01027076, + "balance_loss_clip": 1.02176714, + "balance_loss_mlp": 1.0170331, + "epoch": 0.9409289042537201, + "flos": 18369314087040.0, + "grad_norm": 3.2684293447057597, + "language_loss": 0.77378696, + "learning_rate": 3.645596817637586e-08, + "loss": 0.7943325, + "num_input_tokens_seen": 337517195, + "step": 15650, + "time_per_iteration": 4.798877716064453 + }, + { + "auxiliary_loss_clip": 0.01020263, + "auxiliary_loss_mlp": 0.01024944, + "balance_loss_clip": 1.02554154, + "balance_loss_mlp": 1.01504993, + "epoch": 0.9409890275063881, + "flos": 23878333883520.0, + "grad_norm": 1.6540235587870968, + "language_loss": 0.74595582, + "learning_rate": 3.638198339114451e-08, + "loss": 0.76640785, + "num_input_tokens_seen": 337535245, + "step": 15651, + "time_per_iteration": 2.8327739238739014 + }, + { + "auxiliary_loss_clip": 0.01060965, + "auxiliary_loss_mlp": 0.01027694, + "balance_loss_clip": 1.02395916, + "balance_loss_mlp": 1.017102, + "epoch": 0.941049150759056, + "flos": 16545088016640.0, + "grad_norm": 1.7590059551563495, + "language_loss": 0.7244671, + "learning_rate": 3.630807306650507e-08, + "loss": 0.74535364, + "num_input_tokens_seen": 337553040, + "step": 15652, + "time_per_iteration": 2.7302615642547607 + }, + { + "auxiliary_loss_clip": 0.01030752, + "auxiliary_loss_mlp": 0.0103251, + "balance_loss_clip": 1.02457273, + "balance_loss_mlp": 1.02119696, + "epoch": 0.9411092740117241, + "flos": 25118401069440.0, + "grad_norm": 1.6394484506656026, + "language_loss": 0.66820335, + "learning_rate": 3.6234237205260645e-08, + "loss": 0.68883598, + "num_input_tokens_seen": 337574580, + "step": 15653, + "time_per_iteration": 2.9405455589294434 + }, + { + "auxiliary_loss_clip": 0.01063243, + "auxiliary_loss_mlp": 0.01032237, + "balance_loss_clip": 1.02579951, + "balance_loss_mlp": 1.02164531, + "epoch": 0.941169397264392, + "flos": 21142264129920.0, + "grad_norm": 2.897785573281038, + "language_loss": 0.77575916, + "learning_rate": 3.6160475810210536e-08, + "loss": 0.79671395, + "num_input_tokens_seen": 337593010, + "step": 15654, + "time_per_iteration": 2.703381299972534 + }, + { + "auxiliary_loss_clip": 0.01056821, + "auxiliary_loss_mlp": 0.01024013, + "balance_loss_clip": 1.02673674, + "balance_loss_mlp": 1.01364207, + "epoch": 0.94122952051706, + "flos": 38508914995200.0, + "grad_norm": 1.5851853617833036, + "language_loss": 0.70066643, + "learning_rate": 3.6086788884152065e-08, + "loss": 0.72147477, + "num_input_tokens_seen": 337616170, + "step": 15655, + "time_per_iteration": 2.9603753089904785 + }, + { + "auxiliary_loss_clip": 0.0106215, + "auxiliary_loss_mlp": 0.01029406, + "balance_loss_clip": 1.02522612, + "balance_loss_mlp": 1.01846886, + "epoch": 0.9412896437697279, + "flos": 18369206346240.0, + "grad_norm": 2.3297106598168895, + "language_loss": 0.71959794, + "learning_rate": 3.601317642987944e-08, + "loss": 0.74051356, + "num_input_tokens_seen": 337635215, + "step": 15656, + "time_per_iteration": 2.624408483505249 + }, + { + "auxiliary_loss_clip": 0.01024017, + "auxiliary_loss_mlp": 0.01024149, + "balance_loss_clip": 1.022488, + "balance_loss_mlp": 1.01426673, + "epoch": 0.9413497670223959, + "flos": 25884950238720.0, + "grad_norm": 2.7470550940452116, + "language_loss": 0.77581668, + "learning_rate": 3.593963845018377e-08, + "loss": 0.79629838, + "num_input_tokens_seen": 337654195, + "step": 15657, + "time_per_iteration": 2.6798338890075684 + }, + { + "auxiliary_loss_clip": 0.0103069, + "auxiliary_loss_mlp": 0.01026917, + "balance_loss_clip": 1.02317953, + "balance_loss_mlp": 1.01588368, + "epoch": 0.9414098902750638, + "flos": 16618309891200.0, + "grad_norm": 2.0467779952936533, + "language_loss": 0.84280491, + "learning_rate": 3.586617494785371e-08, + "loss": 0.86338097, + "num_input_tokens_seen": 337671810, + "step": 15658, + "time_per_iteration": 2.6390435695648193 + }, + { + "auxiliary_loss_clip": 0.0106744, + "auxiliary_loss_mlp": 0.01030483, + "balance_loss_clip": 1.0268172, + "balance_loss_mlp": 1.0183537, + "epoch": 0.9414700135277319, + "flos": 18625033987200.0, + "grad_norm": 1.7782827794740277, + "language_loss": 0.70545352, + "learning_rate": 3.5792785925675254e-08, + "loss": 0.7264328, + "num_input_tokens_seen": 337689410, + "step": 15659, + "time_per_iteration": 4.1578209400177 + }, + { + "auxiliary_loss_clip": 0.01040897, + "auxiliary_loss_mlp": 0.01036334, + "balance_loss_clip": 1.02416754, + "balance_loss_mlp": 1.02646363, + "epoch": 0.9415301367803999, + "flos": 26280146649600.0, + "grad_norm": 1.747179839946936, + "language_loss": 0.79352701, + "learning_rate": 3.571947138643172e-08, + "loss": 0.81429929, + "num_input_tokens_seen": 337709950, + "step": 15660, + "time_per_iteration": 4.349665880203247 + }, + { + "auxiliary_loss_clip": 0.01027448, + "auxiliary_loss_mlp": 0.01026478, + "balance_loss_clip": 1.02285457, + "balance_loss_mlp": 1.01714993, + "epoch": 0.9415902600330678, + "flos": 23261388860160.0, + "grad_norm": 1.4948500377818201, + "language_loss": 0.67922819, + "learning_rate": 3.564623133290201e-08, + "loss": 0.69976741, + "num_input_tokens_seen": 337731320, + "step": 15661, + "time_per_iteration": 2.755131244659424 + }, + { + "auxiliary_loss_clip": 0.01044297, + "auxiliary_loss_mlp": 0.01029379, + "balance_loss_clip": 1.02224278, + "balance_loss_mlp": 1.01930559, + "epoch": 0.9416503832857358, + "flos": 14719138093440.0, + "grad_norm": 2.1601173556731923, + "language_loss": 0.66053843, + "learning_rate": 3.557306576786434e-08, + "loss": 0.68127513, + "num_input_tokens_seen": 337747720, + "step": 15662, + "time_per_iteration": 2.547257423400879 + }, + { + "auxiliary_loss_clip": 0.00987956, + "auxiliary_loss_mlp": 0.01001539, + "balance_loss_clip": 1.00266862, + "balance_loss_mlp": 1.00077629, + "epoch": 0.9417105065384037, + "flos": 70312698276480.0, + "grad_norm": 0.7732691347319465, + "language_loss": 0.59262884, + "learning_rate": 3.5499974694092935e-08, + "loss": 0.61252379, + "num_input_tokens_seen": 337806930, + "step": 15663, + "time_per_iteration": 3.3222014904022217 + }, + { + "auxiliary_loss_clip": 0.010566, + "auxiliary_loss_mlp": 0.01031665, + "balance_loss_clip": 1.02600336, + "balance_loss_mlp": 1.02072716, + "epoch": 0.9417706297910717, + "flos": 34057895322240.0, + "grad_norm": 1.963204816603673, + "language_loss": 0.67028105, + "learning_rate": 3.542695811435914e-08, + "loss": 0.69116366, + "num_input_tokens_seen": 337828100, + "step": 15664, + "time_per_iteration": 2.7179672718048096 + }, + { + "auxiliary_loss_clip": 0.01043039, + "auxiliary_loss_mlp": 0.01026947, + "balance_loss_clip": 1.02627504, + "balance_loss_mlp": 1.01717162, + "epoch": 0.9418307530437396, + "flos": 16471614746880.0, + "grad_norm": 1.9461019057513012, + "language_loss": 0.72961771, + "learning_rate": 3.535401603143207e-08, + "loss": 0.75031757, + "num_input_tokens_seen": 337844805, + "step": 15665, + "time_per_iteration": 2.729199171066284 + }, + { + "auxiliary_loss_clip": 0.0106068, + "auxiliary_loss_mlp": 0.01029723, + "balance_loss_clip": 1.02583444, + "balance_loss_mlp": 1.0195123, + "epoch": 0.9418908762964077, + "flos": 11253543114240.0, + "grad_norm": 1.8012218127431623, + "language_loss": 0.63894928, + "learning_rate": 3.528114844807773e-08, + "loss": 0.65985334, + "num_input_tokens_seen": 337860490, + "step": 15666, + "time_per_iteration": 2.5264217853546143 + }, + { + "auxiliary_loss_clip": 0.01032218, + "auxiliary_loss_mlp": 0.01029325, + "balance_loss_clip": 1.0242734, + "balance_loss_mlp": 1.01879859, + "epoch": 0.9419509995490756, + "flos": 18438836860800.0, + "grad_norm": 1.6629138513887485, + "language_loss": 0.78677344, + "learning_rate": 3.520835536705902e-08, + "loss": 0.80738878, + "num_input_tokens_seen": 337878360, + "step": 15667, + "time_per_iteration": 2.689464807510376 + }, + { + "auxiliary_loss_clip": 0.01059337, + "auxiliary_loss_mlp": 0.01024586, + "balance_loss_clip": 1.02428293, + "balance_loss_mlp": 1.01528132, + "epoch": 0.9420111228017436, + "flos": 20737945664640.0, + "grad_norm": 1.6628184421786318, + "language_loss": 0.75203407, + "learning_rate": 3.5135636791136404e-08, + "loss": 0.77287328, + "num_input_tokens_seen": 337895635, + "step": 15668, + "time_per_iteration": 2.6102232933044434 + }, + { + "auxiliary_loss_clip": 0.01014506, + "auxiliary_loss_mlp": 0.0102727, + "balance_loss_clip": 1.02384067, + "balance_loss_mlp": 1.01665425, + "epoch": 0.9420712460544115, + "flos": 21141940907520.0, + "grad_norm": 2.271737406066768, + "language_loss": 0.58725756, + "learning_rate": 3.506299272306723e-08, + "loss": 0.60767531, + "num_input_tokens_seen": 337913940, + "step": 15669, + "time_per_iteration": 2.8520286083221436 + }, + { + "auxiliary_loss_clip": 0.01021573, + "auxiliary_loss_mlp": 0.01024979, + "balance_loss_clip": 1.02155447, + "balance_loss_mlp": 1.01510286, + "epoch": 0.9421313693070795, + "flos": 15851760721920.0, + "grad_norm": 1.5243128819303016, + "language_loss": 0.7666558, + "learning_rate": 3.4990423165606406e-08, + "loss": 0.78712142, + "num_input_tokens_seen": 337932015, + "step": 15670, + "time_per_iteration": 2.6930205821990967 + }, + { + "auxiliary_loss_clip": 0.01062732, + "auxiliary_loss_mlp": 0.01031447, + "balance_loss_clip": 1.02557921, + "balance_loss_mlp": 1.02105165, + "epoch": 0.9421914925597474, + "flos": 32415915882240.0, + "grad_norm": 1.909322848940883, + "language_loss": 0.65423357, + "learning_rate": 3.491792812150574e-08, + "loss": 0.67517537, + "num_input_tokens_seen": 337953345, + "step": 15671, + "time_per_iteration": 2.6526119709014893 + }, + { + "auxiliary_loss_clip": 0.01035488, + "auxiliary_loss_mlp": 0.01030852, + "balance_loss_clip": 1.02365398, + "balance_loss_mlp": 1.02046299, + "epoch": 0.9422516158124155, + "flos": 19718513769600.0, + "grad_norm": 2.1169255799960958, + "language_loss": 0.79303741, + "learning_rate": 3.48455075935139e-08, + "loss": 0.8137008, + "num_input_tokens_seen": 337973685, + "step": 15672, + "time_per_iteration": 2.7333850860595703 + }, + { + "auxiliary_loss_clip": 0.01027725, + "auxiliary_loss_mlp": 0.01037156, + "balance_loss_clip": 1.02383113, + "balance_loss_mlp": 1.02494311, + "epoch": 0.9423117390650835, + "flos": 16253277926400.0, + "grad_norm": 2.042732634331616, + "language_loss": 0.73578614, + "learning_rate": 3.47731615843776e-08, + "loss": 0.75643498, + "num_input_tokens_seen": 337989175, + "step": 15673, + "time_per_iteration": 2.6810576915740967 + }, + { + "auxiliary_loss_clip": 0.01040969, + "auxiliary_loss_mlp": 0.01027984, + "balance_loss_clip": 1.0207535, + "balance_loss_mlp": 1.01670694, + "epoch": 0.9423718623177514, + "flos": 31796564647680.0, + "grad_norm": 1.525958115218196, + "language_loss": 0.70211375, + "learning_rate": 3.470089009683974e-08, + "loss": 0.72280324, + "num_input_tokens_seen": 338011800, + "step": 15674, + "time_per_iteration": 2.6692280769348145 + }, + { + "auxiliary_loss_clip": 0.01061166, + "auxiliary_loss_mlp": 0.01022725, + "balance_loss_clip": 1.02449131, + "balance_loss_mlp": 1.01278853, + "epoch": 0.9424319855704194, + "flos": 23331809473920.0, + "grad_norm": 1.786385389233632, + "language_loss": 0.81532168, + "learning_rate": 3.462869313364125e-08, + "loss": 0.8361606, + "num_input_tokens_seen": 338032120, + "step": 15675, + "time_per_iteration": 2.7447025775909424 + }, + { + "auxiliary_loss_clip": 0.01037378, + "auxiliary_loss_mlp": 0.01026762, + "balance_loss_clip": 1.02376091, + "balance_loss_mlp": 1.01660538, + "epoch": 0.9424921088230873, + "flos": 20777627214720.0, + "grad_norm": 1.5869498652327922, + "language_loss": 0.62363476, + "learning_rate": 3.4556570697519494e-08, + "loss": 0.6442762, + "num_input_tokens_seen": 338051880, + "step": 15676, + "time_per_iteration": 2.7536203861236572 + }, + { + "auxiliary_loss_clip": 0.01048028, + "auxiliary_loss_mlp": 0.01032618, + "balance_loss_clip": 1.02937162, + "balance_loss_mlp": 1.0223068, + "epoch": 0.9425522320757553, + "flos": 19026658932480.0, + "grad_norm": 2.417984846832723, + "language_loss": 0.67156136, + "learning_rate": 3.448452279120984e-08, + "loss": 0.69236785, + "num_input_tokens_seen": 338069665, + "step": 15677, + "time_per_iteration": 2.6906325817108154 + }, + { + "auxiliary_loss_clip": 0.01024258, + "auxiliary_loss_mlp": 0.01031301, + "balance_loss_clip": 1.02096653, + "balance_loss_mlp": 1.01955283, + "epoch": 0.9426123553284232, + "flos": 25155353185920.0, + "grad_norm": 1.7691920069325615, + "language_loss": 0.64078152, + "learning_rate": 3.441254941744387e-08, + "loss": 0.66133708, + "num_input_tokens_seen": 338090490, + "step": 15678, + "time_per_iteration": 2.6994268894195557 + }, + { + "auxiliary_loss_clip": 0.01033743, + "auxiliary_loss_mlp": 0.01024028, + "balance_loss_clip": 1.02617443, + "balance_loss_mlp": 1.01402676, + "epoch": 0.9426724785810913, + "flos": 21179359900800.0, + "grad_norm": 1.5115665659502067, + "language_loss": 0.74199921, + "learning_rate": 3.434065057895097e-08, + "loss": 0.76257694, + "num_input_tokens_seen": 338109825, + "step": 15679, + "time_per_iteration": 4.451513051986694 + }, + { + "auxiliary_loss_clip": 0.01045518, + "auxiliary_loss_mlp": 0.01028091, + "balance_loss_clip": 1.02593613, + "balance_loss_mlp": 1.01752925, + "epoch": 0.9427326018337592, + "flos": 14756916222720.0, + "grad_norm": 2.249166082744624, + "language_loss": 0.77649295, + "learning_rate": 3.426882627845762e-08, + "loss": 0.79722905, + "num_input_tokens_seen": 338125790, + "step": 15680, + "time_per_iteration": 2.640434503555298 + }, + { + "auxiliary_loss_clip": 0.01051498, + "auxiliary_loss_mlp": 0.01031411, + "balance_loss_clip": 1.02430868, + "balance_loss_mlp": 1.02121234, + "epoch": 0.9427927250864272, + "flos": 20923640000640.0, + "grad_norm": 2.328760216668091, + "language_loss": 0.75148511, + "learning_rate": 3.419707651868742e-08, + "loss": 0.77231419, + "num_input_tokens_seen": 338145610, + "step": 15681, + "time_per_iteration": 2.653021812438965 + }, + { + "auxiliary_loss_clip": 0.01033472, + "auxiliary_loss_mlp": 0.0103091, + "balance_loss_clip": 1.02406597, + "balance_loss_mlp": 1.02009726, + "epoch": 0.9428528483390951, + "flos": 19752520970880.0, + "grad_norm": 1.8708607984311012, + "language_loss": 0.65719497, + "learning_rate": 3.412540130236086e-08, + "loss": 0.6778388, + "num_input_tokens_seen": 338165960, + "step": 15682, + "time_per_iteration": 2.661060333251953 + }, + { + "auxiliary_loss_clip": 0.01028791, + "auxiliary_loss_mlp": 0.01028673, + "balance_loss_clip": 1.02290297, + "balance_loss_mlp": 1.01834941, + "epoch": 0.9429129715917631, + "flos": 24534996370560.0, + "grad_norm": 1.815804002976912, + "language_loss": 0.76862741, + "learning_rate": 3.405380063219665e-08, + "loss": 0.78920203, + "num_input_tokens_seen": 338187215, + "step": 15683, + "time_per_iteration": 2.727137565612793 + }, + { + "auxiliary_loss_clip": 0.01052584, + "auxiliary_loss_mlp": 0.01038764, + "balance_loss_clip": 1.02527368, + "balance_loss_mlp": 1.02725434, + "epoch": 0.942973094844431, + "flos": 17959824063360.0, + "grad_norm": 3.027373497110162, + "language_loss": 0.75040174, + "learning_rate": 3.398227451090885e-08, + "loss": 0.77131522, + "num_input_tokens_seen": 338201825, + "step": 15684, + "time_per_iteration": 2.574786424636841 + }, + { + "auxiliary_loss_clip": 0.01058299, + "auxiliary_loss_mlp": 0.01023562, + "balance_loss_clip": 1.0235976, + "balance_loss_mlp": 1.0141201, + "epoch": 0.9430332180970991, + "flos": 26137689310080.0, + "grad_norm": 1.7085861622824021, + "language_loss": 0.77140129, + "learning_rate": 3.391082294121017e-08, + "loss": 0.79221988, + "num_input_tokens_seen": 338220865, + "step": 15685, + "time_per_iteration": 2.5993480682373047 + }, + { + "auxiliary_loss_clip": 0.0104773, + "auxiliary_loss_mlp": 0.01025275, + "balance_loss_clip": 1.02266502, + "balance_loss_mlp": 1.01559556, + "epoch": 0.943093341349767, + "flos": 23951376190080.0, + "grad_norm": 1.8788843796919763, + "language_loss": 0.7556572, + "learning_rate": 3.383944592581023e-08, + "loss": 0.77638716, + "num_input_tokens_seen": 338240160, + "step": 15686, + "time_per_iteration": 2.6148221492767334 + }, + { + "auxiliary_loss_clip": 0.01052785, + "auxiliary_loss_mlp": 0.0102872, + "balance_loss_clip": 1.02395546, + "balance_loss_mlp": 1.0176754, + "epoch": 0.943153464602435, + "flos": 17968407413760.0, + "grad_norm": 2.094887471157417, + "language_loss": 0.8053565, + "learning_rate": 3.376814346741575e-08, + "loss": 0.82617152, + "num_input_tokens_seen": 338259305, + "step": 15687, + "time_per_iteration": 2.5600533485412598 + }, + { + "auxiliary_loss_clip": 0.01045125, + "auxiliary_loss_mlp": 0.01034286, + "balance_loss_clip": 1.02535725, + "balance_loss_mlp": 1.02259755, + "epoch": 0.943213587855103, + "flos": 14501519544960.0, + "grad_norm": 2.1257121465859865, + "language_loss": 0.75650245, + "learning_rate": 3.369691556873011e-08, + "loss": 0.7772966, + "num_input_tokens_seen": 338274950, + "step": 15688, + "time_per_iteration": 2.748929738998413 + }, + { + "auxiliary_loss_clip": 0.01033225, + "auxiliary_loss_mlp": 0.01023343, + "balance_loss_clip": 1.02317226, + "balance_loss_mlp": 1.01307321, + "epoch": 0.9432737111077709, + "flos": 28986411093120.0, + "grad_norm": 1.7324275766716497, + "language_loss": 0.68274969, + "learning_rate": 3.3625762232454504e-08, + "loss": 0.70331538, + "num_input_tokens_seen": 338295585, + "step": 15689, + "time_per_iteration": 2.7174413204193115 + }, + { + "auxiliary_loss_clip": 0.0104558, + "auxiliary_loss_mlp": 0.01029145, + "balance_loss_clip": 1.02280068, + "balance_loss_mlp": 1.02043676, + "epoch": 0.9433338343604389, + "flos": 21609066303360.0, + "grad_norm": 2.094888046078375, + "language_loss": 0.80308229, + "learning_rate": 3.35546834612872e-08, + "loss": 0.82382953, + "num_input_tokens_seen": 338314555, + "step": 15690, + "time_per_iteration": 2.6483235359191895 + }, + { + "auxiliary_loss_clip": 0.01051802, + "auxiliary_loss_mlp": 0.01026034, + "balance_loss_clip": 1.02538478, + "balance_loss_mlp": 1.01606202, + "epoch": 0.9433939576131068, + "flos": 33182285483520.0, + "grad_norm": 1.7637788274448836, + "language_loss": 0.60591161, + "learning_rate": 3.348367925792317e-08, + "loss": 0.62668997, + "num_input_tokens_seen": 338336260, + "step": 15691, + "time_per_iteration": 2.7303507328033447 + }, + { + "auxiliary_loss_clip": 0.01020082, + "auxiliary_loss_mlp": 0.0102572, + "balance_loss_clip": 1.02333522, + "balance_loss_mlp": 1.0141865, + "epoch": 0.9434540808657749, + "flos": 20486391742080.0, + "grad_norm": 1.6445711092506288, + "language_loss": 0.66526443, + "learning_rate": 3.341274962505514e-08, + "loss": 0.68572247, + "num_input_tokens_seen": 338354680, + "step": 15692, + "time_per_iteration": 2.758284330368042 + }, + { + "auxiliary_loss_clip": 0.01053322, + "auxiliary_loss_mlp": 0.01031769, + "balance_loss_clip": 1.02573466, + "balance_loss_mlp": 1.02163005, + "epoch": 0.9435142041184428, + "flos": 21542955321600.0, + "grad_norm": 2.3676579637162387, + "language_loss": 0.74440932, + "learning_rate": 3.334189456537251e-08, + "loss": 0.76526028, + "num_input_tokens_seen": 338372490, + "step": 15693, + "time_per_iteration": 2.6501593589782715 + }, + { + "auxiliary_loss_clip": 0.01027796, + "auxiliary_loss_mlp": 0.01031854, + "balance_loss_clip": 1.02397728, + "balance_loss_mlp": 1.02002823, + "epoch": 0.9435743273711108, + "flos": 25009089004800.0, + "grad_norm": 2.0460267793816764, + "language_loss": 0.73493707, + "learning_rate": 3.327111408156291e-08, + "loss": 0.75553364, + "num_input_tokens_seen": 338390870, + "step": 15694, + "time_per_iteration": 2.759401559829712 + }, + { + "auxiliary_loss_clip": 0.0096496, + "auxiliary_loss_mlp": 0.00999313, + "balance_loss_clip": 1.00153828, + "balance_loss_mlp": 0.99847853, + "epoch": 0.9436344506237787, + "flos": 60158707320960.0, + "grad_norm": 0.6855758987683208, + "language_loss": 0.50608695, + "learning_rate": 3.3200408176309316e-08, + "loss": 0.52572966, + "num_input_tokens_seen": 338453075, + "step": 15695, + "time_per_iteration": 3.3636767864227295 + }, + { + "auxiliary_loss_clip": 0.01028348, + "auxiliary_loss_mlp": 0.01028392, + "balance_loss_clip": 1.01990008, + "balance_loss_mlp": 1.01861119, + "epoch": 0.9436945738764467, + "flos": 22237252283520.0, + "grad_norm": 1.74310089081726, + "language_loss": 0.64549088, + "learning_rate": 3.312977685229335e-08, + "loss": 0.66605824, + "num_input_tokens_seen": 338471770, + "step": 15696, + "time_per_iteration": 2.629075288772583 + }, + { + "auxiliary_loss_clip": 0.01050686, + "auxiliary_loss_mlp": 0.01024487, + "balance_loss_clip": 1.02403975, + "balance_loss_mlp": 1.01489663, + "epoch": 0.9437546971291146, + "flos": 25045179194880.0, + "grad_norm": 1.5447469245018386, + "language_loss": 0.66049361, + "learning_rate": 3.305922011219353e-08, + "loss": 0.68124533, + "num_input_tokens_seen": 338492190, + "step": 15697, + "time_per_iteration": 4.420735836029053 + }, + { + "auxiliary_loss_clip": 0.00979014, + "auxiliary_loss_mlp": 0.01012204, + "balance_loss_clip": 1.00329232, + "balance_loss_mlp": 1.01137531, + "epoch": 0.9438148203817827, + "flos": 56790788400000.0, + "grad_norm": 0.8506942530599897, + "language_loss": 0.63189983, + "learning_rate": 3.298873795868506e-08, + "loss": 0.65181208, + "num_input_tokens_seen": 338552560, + "step": 15698, + "time_per_iteration": 3.177380323410034 + }, + { + "auxiliary_loss_clip": 0.01044241, + "auxiliary_loss_mlp": 0.01035297, + "balance_loss_clip": 1.02538872, + "balance_loss_mlp": 1.02341807, + "epoch": 0.9438749436344506, + "flos": 22346384780160.0, + "grad_norm": 1.700472161022312, + "language_loss": 0.6931268, + "learning_rate": 3.291833039444092e-08, + "loss": 0.71392214, + "num_input_tokens_seen": 338571770, + "step": 15699, + "time_per_iteration": 2.6995575428009033 + }, + { + "auxiliary_loss_clip": 0.01023836, + "auxiliary_loss_mlp": 0.01026102, + "balance_loss_clip": 1.022434, + "balance_loss_mlp": 1.01629114, + "epoch": 0.9439350668871186, + "flos": 13370800337280.0, + "grad_norm": 1.912027463499425, + "language_loss": 0.74649262, + "learning_rate": 3.2847997422130734e-08, + "loss": 0.76699197, + "num_input_tokens_seen": 338587310, + "step": 15700, + "time_per_iteration": 2.7817225456237793 + }, + { + "auxiliary_loss_clip": 0.00982019, + "auxiliary_loss_mlp": 0.01030904, + "balance_loss_clip": 1.02036989, + "balance_loss_mlp": 1.02104509, + "epoch": 0.9439951901397866, + "flos": 17785334770560.0, + "grad_norm": 1.5727200227235394, + "language_loss": 0.70387876, + "learning_rate": 3.2777739044421495e-08, + "loss": 0.72400796, + "num_input_tokens_seen": 338606235, + "step": 15701, + "time_per_iteration": 2.785524606704712 + }, + { + "auxiliary_loss_clip": 0.01021114, + "auxiliary_loss_mlp": 0.01025127, + "balance_loss_clip": 1.02304339, + "balance_loss_mlp": 1.0146966, + "epoch": 0.9440553133924545, + "flos": 18879568738560.0, + "grad_norm": 1.849793808688669, + "language_loss": 0.77693856, + "learning_rate": 3.2707555263977505e-08, + "loss": 0.79740095, + "num_input_tokens_seen": 338624090, + "step": 15702, + "time_per_iteration": 2.6401681900024414 + }, + { + "auxiliary_loss_clip": 0.01045525, + "auxiliary_loss_mlp": 0.01038699, + "balance_loss_clip": 1.02246869, + "balance_loss_mlp": 1.02734995, + "epoch": 0.9441154366451225, + "flos": 19572967860480.0, + "grad_norm": 1.8275157225053207, + "language_loss": 0.66429663, + "learning_rate": 3.2637446083460194e-08, + "loss": 0.68513888, + "num_input_tokens_seen": 338643695, + "step": 15703, + "time_per_iteration": 2.512721061706543 + }, + { + "auxiliary_loss_clip": 0.01054841, + "auxiliary_loss_mlp": 0.01026159, + "balance_loss_clip": 1.02658331, + "balance_loss_mlp": 1.0152092, + "epoch": 0.9441755598977905, + "flos": 30294995472000.0, + "grad_norm": 1.7269765207718633, + "language_loss": 0.73343712, + "learning_rate": 3.256741150552833e-08, + "loss": 0.75424707, + "num_input_tokens_seen": 338664725, + "step": 15704, + "time_per_iteration": 2.564218282699585 + }, + { + "auxiliary_loss_clip": 0.0105057, + "auxiliary_loss_mlp": 0.0103465, + "balance_loss_clip": 1.02418399, + "balance_loss_mlp": 1.02399278, + "epoch": 0.9442356831504585, + "flos": 20667884186880.0, + "grad_norm": 1.7620721996107933, + "language_loss": 0.7459107, + "learning_rate": 3.2497451532837336e-08, + "loss": 0.76676291, + "num_input_tokens_seen": 338683990, + "step": 15705, + "time_per_iteration": 2.4933409690856934 + }, + { + "auxiliary_loss_clip": 0.01043632, + "auxiliary_loss_mlp": 0.01034161, + "balance_loss_clip": 1.02601707, + "balance_loss_mlp": 1.024863, + "epoch": 0.9442958064031264, + "flos": 16107265140480.0, + "grad_norm": 1.7713421030143512, + "language_loss": 0.77129459, + "learning_rate": 3.2427566168039986e-08, + "loss": 0.79207253, + "num_input_tokens_seen": 338702025, + "step": 15706, + "time_per_iteration": 2.5415148735046387 + }, + { + "auxiliary_loss_clip": 0.01051555, + "auxiliary_loss_mlp": 0.01024731, + "balance_loss_clip": 1.02572727, + "balance_loss_mlp": 1.01506901, + "epoch": 0.9443559296557944, + "flos": 20447392550400.0, + "grad_norm": 1.6294372270598887, + "language_loss": 0.69224524, + "learning_rate": 3.23577554137866e-08, + "loss": 0.71300811, + "num_input_tokens_seen": 338720920, + "step": 15707, + "time_per_iteration": 4.1963348388671875 + }, + { + "auxiliary_loss_clip": 0.0105482, + "auxiliary_loss_mlp": 0.01023392, + "balance_loss_clip": 1.02101183, + "balance_loss_mlp": 1.01462364, + "epoch": 0.9444160529084623, + "flos": 21610897896960.0, + "grad_norm": 2.038514284263327, + "language_loss": 0.69166291, + "learning_rate": 3.22880192727244e-08, + "loss": 0.71244496, + "num_input_tokens_seen": 338739590, + "step": 15708, + "time_per_iteration": 4.0103185176849365 + }, + { + "auxiliary_loss_clip": 0.01050851, + "auxiliary_loss_mlp": 0.01027736, + "balance_loss_clip": 1.02494586, + "balance_loss_mlp": 1.01788902, + "epoch": 0.9444761761611303, + "flos": 18441781776000.0, + "grad_norm": 13.059080642898513, + "language_loss": 0.70747948, + "learning_rate": 3.221835774749748e-08, + "loss": 0.7282654, + "num_input_tokens_seen": 338757240, + "step": 15709, + "time_per_iteration": 2.547379493713379 + }, + { + "auxiliary_loss_clip": 0.01026682, + "auxiliary_loss_mlp": 0.01026463, + "balance_loss_clip": 1.02897382, + "balance_loss_mlp": 1.01646698, + "epoch": 0.9445362994137982, + "flos": 20957144411520.0, + "grad_norm": 2.000366318024109, + "language_loss": 0.84286946, + "learning_rate": 3.214877084074774e-08, + "loss": 0.86340088, + "num_input_tokens_seen": 338773750, + "step": 15710, + "time_per_iteration": 2.6274304389953613 + }, + { + "auxiliary_loss_clip": 0.01038298, + "auxiliary_loss_mlp": 0.01032044, + "balance_loss_clip": 1.02813005, + "balance_loss_mlp": 1.02077913, + "epoch": 0.9445964226664663, + "flos": 20303283185280.0, + "grad_norm": 1.5610754844604862, + "language_loss": 0.71393824, + "learning_rate": 3.2079258555113956e-08, + "loss": 0.73464167, + "num_input_tokens_seen": 338792115, + "step": 15711, + "time_per_iteration": 2.607046365737915 + }, + { + "auxiliary_loss_clip": 0.01048856, + "auxiliary_loss_mlp": 0.01028077, + "balance_loss_clip": 1.02512968, + "balance_loss_mlp": 1.01738381, + "epoch": 0.9446565459191342, + "flos": 26396030903040.0, + "grad_norm": 1.7477147114653344, + "language_loss": 0.69538689, + "learning_rate": 3.200982089323179e-08, + "loss": 0.71615624, + "num_input_tokens_seen": 338812480, + "step": 15712, + "time_per_iteration": 2.5723538398742676 + }, + { + "auxiliary_loss_clip": 0.01055088, + "auxiliary_loss_mlp": 0.01036997, + "balance_loss_clip": 1.02668357, + "balance_loss_mlp": 1.02595806, + "epoch": 0.9447166691718022, + "flos": 16544764794240.0, + "grad_norm": 2.0418797351351503, + "language_loss": 0.70617908, + "learning_rate": 3.1940457857734246e-08, + "loss": 0.7270999, + "num_input_tokens_seen": 338829105, + "step": 15713, + "time_per_iteration": 2.5180983543395996 + }, + { + "auxiliary_loss_clip": 0.01036686, + "auxiliary_loss_mlp": 0.01030563, + "balance_loss_clip": 1.02173889, + "balance_loss_mlp": 1.01971483, + "epoch": 0.9447767924244702, + "flos": 29164635400320.0, + "grad_norm": 3.501939193439773, + "language_loss": 0.76556587, + "learning_rate": 3.187116945125212e-08, + "loss": 0.78623837, + "num_input_tokens_seen": 338850670, + "step": 15714, + "time_per_iteration": 2.642786979675293 + }, + { + "auxiliary_loss_clip": 0.01039011, + "auxiliary_loss_mlp": 0.0102655, + "balance_loss_clip": 1.02895927, + "balance_loss_mlp": 1.01628613, + "epoch": 0.9448369156771381, + "flos": 19274908803840.0, + "grad_norm": 3.544052434082875, + "language_loss": 0.67331612, + "learning_rate": 3.1801955676412194e-08, + "loss": 0.69397169, + "num_input_tokens_seen": 338867795, + "step": 15715, + "time_per_iteration": 2.6045968532562256 + }, + { + "auxiliary_loss_clip": 0.01033166, + "auxiliary_loss_mlp": 0.01027826, + "balance_loss_clip": 1.02566516, + "balance_loss_mlp": 1.01697218, + "epoch": 0.9448970389298061, + "flos": 23841166285440.0, + "grad_norm": 1.6309157632421427, + "language_loss": 0.74797893, + "learning_rate": 3.173281653583948e-08, + "loss": 0.7685889, + "num_input_tokens_seen": 338887205, + "step": 15716, + "time_per_iteration": 2.640490770339966 + }, + { + "auxiliary_loss_clip": 0.01049202, + "auxiliary_loss_mlp": 0.0102794, + "balance_loss_clip": 1.02994478, + "balance_loss_mlp": 1.01696086, + "epoch": 0.944957162182474, + "flos": 22382259488640.0, + "grad_norm": 1.8516593081774415, + "language_loss": 0.62442601, + "learning_rate": 3.166375203215565e-08, + "loss": 0.64519745, + "num_input_tokens_seen": 338906130, + "step": 15717, + "time_per_iteration": 2.567951202392578 + }, + { + "auxiliary_loss_clip": 0.01055717, + "auxiliary_loss_mlp": 0.01028738, + "balance_loss_clip": 1.02862287, + "balance_loss_mlp": 1.01833081, + "epoch": 0.9450172854351421, + "flos": 17383889393280.0, + "grad_norm": 1.8570658262543922, + "language_loss": 0.79220319, + "learning_rate": 3.1594762167979514e-08, + "loss": 0.81304777, + "num_input_tokens_seen": 338923045, + "step": 15718, + "time_per_iteration": 2.5436503887176514 + }, + { + "auxiliary_loss_clip": 0.00998115, + "auxiliary_loss_mlp": 0.01001738, + "balance_loss_clip": 1.00260091, + "balance_loss_mlp": 1.00095737, + "epoch": 0.94507740868781, + "flos": 68466352406400.0, + "grad_norm": 0.7087663902430482, + "language_loss": 0.57803547, + "learning_rate": 3.152584694592719e-08, + "loss": 0.59803402, + "num_input_tokens_seen": 338987545, + "step": 15719, + "time_per_iteration": 3.1554343700408936 + }, + { + "auxiliary_loss_clip": 0.01018768, + "auxiliary_loss_mlp": 0.00747621, + "balance_loss_clip": 1.02308547, + "balance_loss_mlp": 1.00035655, + "epoch": 0.945137531940478, + "flos": 21142479611520.0, + "grad_norm": 1.7994002285023052, + "language_loss": 0.75856423, + "learning_rate": 3.145700636861193e-08, + "loss": 0.77622819, + "num_input_tokens_seen": 339007830, + "step": 15720, + "time_per_iteration": 2.808455467224121 + }, + { + "auxiliary_loss_clip": 0.01051673, + "auxiliary_loss_mlp": 0.01027177, + "balance_loss_clip": 1.02507448, + "balance_loss_mlp": 1.01789093, + "epoch": 0.9451976551931459, + "flos": 24533918962560.0, + "grad_norm": 1.785558406716844, + "language_loss": 0.72900248, + "learning_rate": 3.138824043864452e-08, + "loss": 0.74979097, + "num_input_tokens_seen": 339028980, + "step": 15721, + "time_per_iteration": 2.8292925357818604 + }, + { + "auxiliary_loss_clip": 0.01014448, + "auxiliary_loss_mlp": 0.01029771, + "balance_loss_clip": 1.02211261, + "balance_loss_mlp": 1.01880395, + "epoch": 0.9452577784458139, + "flos": 23440582834560.0, + "grad_norm": 1.9039455549015138, + "language_loss": 0.85217357, + "learning_rate": 3.131954915863244e-08, + "loss": 0.87261581, + "num_input_tokens_seen": 339047950, + "step": 15722, + "time_per_iteration": 2.777208089828491 + }, + { + "auxiliary_loss_clip": 0.00988116, + "auxiliary_loss_mlp": 0.01000757, + "balance_loss_clip": 1.00344181, + "balance_loss_mlp": 0.99989843, + "epoch": 0.9453179016984818, + "flos": 52017686449920.0, + "grad_norm": 0.9011663320440656, + "language_loss": 0.64542145, + "learning_rate": 3.125093253118005e-08, + "loss": 0.66531014, + "num_input_tokens_seen": 339104535, + "step": 15723, + "time_per_iteration": 3.1629912853240967 + }, + { + "auxiliary_loss_clip": 0.01033213, + "auxiliary_loss_mlp": 0.01027423, + "balance_loss_clip": 1.02575243, + "balance_loss_mlp": 1.01643753, + "epoch": 0.9453780249511499, + "flos": 13473001509120.0, + "grad_norm": 1.9059501299718893, + "language_loss": 0.72562051, + "learning_rate": 3.1182390558889715e-08, + "loss": 0.74622685, + "num_input_tokens_seen": 339122050, + "step": 15724, + "time_per_iteration": 2.6468393802642822 + }, + { + "auxiliary_loss_clip": 0.01028365, + "auxiliary_loss_mlp": 0.01026102, + "balance_loss_clip": 1.02269959, + "balance_loss_mlp": 1.01632667, + "epoch": 0.9454381482038178, + "flos": 23258515772160.0, + "grad_norm": 2.12486870332839, + "language_loss": 0.84856462, + "learning_rate": 3.111392324436024e-08, + "loss": 0.86910927, + "num_input_tokens_seen": 339138940, + "step": 15725, + "time_per_iteration": 2.6509287357330322 + }, + { + "auxiliary_loss_clip": 0.01047663, + "auxiliary_loss_mlp": 0.01025355, + "balance_loss_clip": 1.02846813, + "balance_loss_mlp": 1.01543665, + "epoch": 0.9454982714564858, + "flos": 19496621502720.0, + "grad_norm": 1.6606184659733034, + "language_loss": 0.71108115, + "learning_rate": 3.104553059018822e-08, + "loss": 0.73181129, + "num_input_tokens_seen": 339158245, + "step": 15726, + "time_per_iteration": 4.488880634307861 + }, + { + "auxiliary_loss_clip": 0.01035346, + "auxiliary_loss_mlp": 0.01028983, + "balance_loss_clip": 1.02378249, + "balance_loss_mlp": 1.01774192, + "epoch": 0.9455583947091538, + "flos": 23258120722560.0, + "grad_norm": 1.6703949838315293, + "language_loss": 0.6074301, + "learning_rate": 3.097721259896735e-08, + "loss": 0.62807345, + "num_input_tokens_seen": 339178200, + "step": 15727, + "time_per_iteration": 2.9350473880767822 + }, + { + "auxiliary_loss_clip": 0.01046822, + "auxiliary_loss_mlp": 0.01028604, + "balance_loss_clip": 1.02130699, + "balance_loss_mlp": 1.01890588, + "epoch": 0.9456185179618217, + "flos": 17673041877120.0, + "grad_norm": 1.5916214614695323, + "language_loss": 0.81412315, + "learning_rate": 3.0908969273287566e-08, + "loss": 0.83487737, + "num_input_tokens_seen": 339193950, + "step": 15728, + "time_per_iteration": 2.76572322845459 + }, + { + "auxiliary_loss_clip": 0.00963225, + "auxiliary_loss_mlp": 0.01005761, + "balance_loss_clip": 1.00711441, + "balance_loss_mlp": 1.00481343, + "epoch": 0.9456786412144897, + "flos": 61415040389760.0, + "grad_norm": 0.7311338498789364, + "language_loss": 0.59135586, + "learning_rate": 3.08408006157368e-08, + "loss": 0.61104572, + "num_input_tokens_seen": 339252330, + "step": 15729, + "time_per_iteration": 3.2930915355682373 + }, + { + "auxiliary_loss_clip": 0.01059589, + "auxiliary_loss_mlp": 0.01020898, + "balance_loss_clip": 1.02402616, + "balance_loss_mlp": 1.01045489, + "epoch": 0.9457387644671577, + "flos": 18588369179520.0, + "grad_norm": 1.9637822683700918, + "language_loss": 0.76317215, + "learning_rate": 3.077270662890052e-08, + "loss": 0.78397703, + "num_input_tokens_seen": 339270325, + "step": 15730, + "time_per_iteration": 2.652076005935669 + }, + { + "auxiliary_loss_clip": 0.01036676, + "auxiliary_loss_mlp": 0.01028058, + "balance_loss_clip": 1.02681518, + "balance_loss_mlp": 1.01716805, + "epoch": 0.9457988877198257, + "flos": 21108544237440.0, + "grad_norm": 1.4433533533875367, + "language_loss": 0.62453127, + "learning_rate": 3.070468731536047e-08, + "loss": 0.64517856, + "num_input_tokens_seen": 339291980, + "step": 15731, + "time_per_iteration": 2.8009161949157715 + }, + { + "auxiliary_loss_clip": 0.01052734, + "auxiliary_loss_mlp": 0.01026274, + "balance_loss_clip": 1.02373338, + "balance_loss_mlp": 1.01530087, + "epoch": 0.9458590109724936, + "flos": 26688379697280.0, + "grad_norm": 2.002744144056041, + "language_loss": 0.64136076, + "learning_rate": 3.063674267769589e-08, + "loss": 0.66215086, + "num_input_tokens_seen": 339311795, + "step": 15732, + "time_per_iteration": 2.787966012954712 + }, + { + "auxiliary_loss_clip": 0.01056125, + "auxiliary_loss_mlp": 0.01027469, + "balance_loss_clip": 1.02666414, + "balance_loss_mlp": 1.01606059, + "epoch": 0.9459191342251616, + "flos": 18661591054080.0, + "grad_norm": 1.76070180009861, + "language_loss": 0.83760864, + "learning_rate": 3.056887271848363e-08, + "loss": 0.85844457, + "num_input_tokens_seen": 339327745, + "step": 15733, + "time_per_iteration": 2.7147462368011475 + }, + { + "auxiliary_loss_clip": 0.01047823, + "auxiliary_loss_mlp": 0.01024877, + "balance_loss_clip": 1.02320874, + "balance_loss_mlp": 1.01559067, + "epoch": 0.9459792574778295, + "flos": 23398459159680.0, + "grad_norm": 1.5432730337506606, + "language_loss": 0.72003806, + "learning_rate": 3.0501077440297173e-08, + "loss": 0.74076509, + "num_input_tokens_seen": 339346445, + "step": 15734, + "time_per_iteration": 2.740130662918091 + }, + { + "auxiliary_loss_clip": 0.01047187, + "auxiliary_loss_mlp": 0.01027575, + "balance_loss_clip": 1.0228225, + "balance_loss_mlp": 1.01908159, + "epoch": 0.9460393807304975, + "flos": 24392969994240.0, + "grad_norm": 1.7268536661555505, + "language_loss": 0.86696959, + "learning_rate": 3.043335684570692e-08, + "loss": 0.88771719, + "num_input_tokens_seen": 339367945, + "step": 15735, + "time_per_iteration": 2.6817522048950195 + }, + { + "auxiliary_loss_clip": 0.01042419, + "auxiliary_loss_mlp": 0.0102705, + "balance_loss_clip": 1.02464938, + "balance_loss_mlp": 1.01714969, + "epoch": 0.9460995039831654, + "flos": 21939408708480.0, + "grad_norm": 1.9884798582410539, + "language_loss": 0.67566687, + "learning_rate": 3.036571093728102e-08, + "loss": 0.69636154, + "num_input_tokens_seen": 339386060, + "step": 15736, + "time_per_iteration": 2.843201160430908 + }, + { + "auxiliary_loss_clip": 0.00972762, + "auxiliary_loss_mlp": 0.01000287, + "balance_loss_clip": 1.00528872, + "balance_loss_mlp": 0.99938124, + "epoch": 0.9461596272358335, + "flos": 70322466775680.0, + "grad_norm": 0.8692192542604945, + "language_loss": 0.65302473, + "learning_rate": 3.029813971758499e-08, + "loss": 0.67275518, + "num_input_tokens_seen": 339446695, + "step": 15737, + "time_per_iteration": 3.2828707695007324 + }, + { + "auxiliary_loss_clip": 0.00999884, + "auxiliary_loss_mlp": 0.01000768, + "balance_loss_clip": 1.00426733, + "balance_loss_mlp": 0.99974883, + "epoch": 0.9462197504885014, + "flos": 58591242645120.0, + "grad_norm": 0.8351415999644085, + "language_loss": 0.5876832, + "learning_rate": 3.0230643189181225e-08, + "loss": 0.60768974, + "num_input_tokens_seen": 339510080, + "step": 15738, + "time_per_iteration": 3.259296178817749 + }, + { + "auxiliary_loss_clip": 0.01047, + "auxiliary_loss_mlp": 0.01029467, + "balance_loss_clip": 1.02215052, + "balance_loss_mlp": 1.02015662, + "epoch": 0.9462798737411694, + "flos": 23433759250560.0, + "grad_norm": 1.9036264253786286, + "language_loss": 0.71508908, + "learning_rate": 3.016322135462834e-08, + "loss": 0.73585367, + "num_input_tokens_seen": 339529335, + "step": 15739, + "time_per_iteration": 2.7478036880493164 + }, + { + "auxiliary_loss_clip": 0.01046585, + "auxiliary_loss_mlp": 0.01031429, + "balance_loss_clip": 1.02235448, + "balance_loss_mlp": 1.02076578, + "epoch": 0.9463399969938374, + "flos": 25046077034880.0, + "grad_norm": 9.225717866921881, + "language_loss": 0.6383003, + "learning_rate": 3.009587421648363e-08, + "loss": 0.65908051, + "num_input_tokens_seen": 339548820, + "step": 15740, + "time_per_iteration": 2.6559391021728516 + }, + { + "auxiliary_loss_clip": 0.01040805, + "auxiliary_loss_mlp": 0.01027702, + "balance_loss_clip": 1.02555859, + "balance_loss_mlp": 1.0180465, + "epoch": 0.9464001202465053, + "flos": 24352606085760.0, + "grad_norm": 1.6417381565955536, + "language_loss": 0.66235542, + "learning_rate": 3.0028601777301045e-08, + "loss": 0.6830405, + "num_input_tokens_seen": 339566775, + "step": 15741, + "time_per_iteration": 2.6508891582489014 + }, + { + "auxiliary_loss_clip": 0.01051553, + "auxiliary_loss_mlp": 0.01025457, + "balance_loss_clip": 1.02517593, + "balance_loss_mlp": 1.01553249, + "epoch": 0.9464602434991733, + "flos": 17165444832000.0, + "grad_norm": 2.1223051424723933, + "language_loss": 0.76069528, + "learning_rate": 2.9961404039630987e-08, + "loss": 0.78146535, + "num_input_tokens_seen": 339581905, + "step": 15742, + "time_per_iteration": 2.7499866485595703 + }, + { + "auxiliary_loss_clip": 0.01045749, + "auxiliary_loss_mlp": 0.01027273, + "balance_loss_clip": 1.02246189, + "balance_loss_mlp": 1.01755702, + "epoch": 0.9465203667518413, + "flos": 19938107566080.0, + "grad_norm": 1.7829644474717314, + "language_loss": 0.72450304, + "learning_rate": 2.989428100602187e-08, + "loss": 0.7452333, + "num_input_tokens_seen": 339599870, + "step": 15743, + "time_per_iteration": 4.365837812423706 + }, + { + "auxiliary_loss_clip": 0.01036721, + "auxiliary_loss_mlp": 0.01031243, + "balance_loss_clip": 1.02752423, + "balance_loss_mlp": 1.02054989, + "epoch": 0.9465804900045093, + "flos": 20120318282880.0, + "grad_norm": 1.6943739150359098, + "language_loss": 0.79415953, + "learning_rate": 2.982723267901943e-08, + "loss": 0.81483918, + "num_input_tokens_seen": 339620250, + "step": 15744, + "time_per_iteration": 2.7434349060058594 + }, + { + "auxiliary_loss_clip": 0.01045428, + "auxiliary_loss_mlp": 0.01033901, + "balance_loss_clip": 1.02616453, + "balance_loss_mlp": 1.02300513, + "epoch": 0.9466406132571772, + "flos": 23911622812800.0, + "grad_norm": 1.7954360088408259, + "language_loss": 0.78500152, + "learning_rate": 2.9760259061165417e-08, + "loss": 0.80579484, + "num_input_tokens_seen": 339639900, + "step": 15745, + "time_per_iteration": 2.8505561351776123 + }, + { + "auxiliary_loss_clip": 0.01036012, + "auxiliary_loss_mlp": 0.01029861, + "balance_loss_clip": 1.02315307, + "balance_loss_mlp": 1.01945949, + "epoch": 0.9467007365098452, + "flos": 19933223316480.0, + "grad_norm": 1.400318494959485, + "language_loss": 0.69916987, + "learning_rate": 2.9693360155000014e-08, + "loss": 0.71982861, + "num_input_tokens_seen": 339658970, + "step": 15746, + "time_per_iteration": 2.716458797454834 + }, + { + "auxiliary_loss_clip": 0.01044377, + "auxiliary_loss_mlp": 0.01026045, + "balance_loss_clip": 1.02650118, + "balance_loss_mlp": 1.01522112, + "epoch": 0.9467608597625131, + "flos": 19310496203520.0, + "grad_norm": 2.058026276255394, + "language_loss": 0.56042111, + "learning_rate": 2.962653596305964e-08, + "loss": 0.58112538, + "num_input_tokens_seen": 339675600, + "step": 15747, + "time_per_iteration": 2.654627561569214 + }, + { + "auxiliary_loss_clip": 0.00950468, + "auxiliary_loss_mlp": 0.01012046, + "balance_loss_clip": 1.00459599, + "balance_loss_mlp": 1.01118171, + "epoch": 0.9468209830151811, + "flos": 69630252802560.0, + "grad_norm": 0.6602687503527379, + "language_loss": 0.53302151, + "learning_rate": 2.955978648787871e-08, + "loss": 0.55264664, + "num_input_tokens_seen": 339744505, + "step": 15748, + "time_per_iteration": 3.611210346221924 + }, + { + "auxiliary_loss_clip": 0.01039949, + "auxiliary_loss_mlp": 0.01038028, + "balance_loss_clip": 1.02374434, + "balance_loss_mlp": 1.02780545, + "epoch": 0.946881106267849, + "flos": 27016639113600.0, + "grad_norm": 1.7036714047670471, + "language_loss": 0.66397655, + "learning_rate": 2.9493111731988096e-08, + "loss": 0.68475634, + "num_input_tokens_seen": 339765810, + "step": 15749, + "time_per_iteration": 2.782545566558838 + }, + { + "auxiliary_loss_clip": 0.01028759, + "auxiliary_loss_mlp": 0.01027355, + "balance_loss_clip": 1.0214957, + "balance_loss_mlp": 1.01511848, + "epoch": 0.9469412295205171, + "flos": 20190092451840.0, + "grad_norm": 2.0290156551677145, + "language_loss": 0.76276338, + "learning_rate": 2.942651169791621e-08, + "loss": 0.7833246, + "num_input_tokens_seen": 339784125, + "step": 15750, + "time_per_iteration": 2.7638115882873535 + }, + { + "auxiliary_loss_clip": 0.01051901, + "auxiliary_loss_mlp": 0.01026843, + "balance_loss_clip": 1.02526999, + "balance_loss_mlp": 1.01682949, + "epoch": 0.947001352773185, + "flos": 21324905809920.0, + "grad_norm": 1.648858048934579, + "language_loss": 0.67704713, + "learning_rate": 2.9359986388188372e-08, + "loss": 0.69783461, + "num_input_tokens_seen": 339803450, + "step": 15751, + "time_per_iteration": 2.699305772781372 + }, + { + "auxiliary_loss_clip": 0.0102698, + "auxiliary_loss_mlp": 0.01024387, + "balance_loss_clip": 1.02209949, + "balance_loss_mlp": 1.01492739, + "epoch": 0.947061476025853, + "flos": 21944041562880.0, + "grad_norm": 1.5268267742280683, + "language_loss": 0.65455902, + "learning_rate": 2.929353580532723e-08, + "loss": 0.67507273, + "num_input_tokens_seen": 339823215, + "step": 15752, + "time_per_iteration": 2.8064157962799072 + }, + { + "auxiliary_loss_clip": 0.01045298, + "auxiliary_loss_mlp": 0.01031164, + "balance_loss_clip": 1.02226949, + "balance_loss_mlp": 1.01999402, + "epoch": 0.947121599278521, + "flos": 21394715892480.0, + "grad_norm": 2.274192636639651, + "language_loss": 0.71704245, + "learning_rate": 2.9227159951852764e-08, + "loss": 0.73780704, + "num_input_tokens_seen": 339842230, + "step": 15753, + "time_per_iteration": 2.715811252593994 + }, + { + "auxiliary_loss_clip": 0.01063988, + "auxiliary_loss_mlp": 0.01029679, + "balance_loss_clip": 1.02503479, + "balance_loss_mlp": 1.01757908, + "epoch": 0.9471817225311889, + "flos": 23075730437760.0, + "grad_norm": 2.5523404722565366, + "language_loss": 0.69791645, + "learning_rate": 2.9160858830281855e-08, + "loss": 0.71885312, + "num_input_tokens_seen": 339861640, + "step": 15754, + "time_per_iteration": 4.302027940750122 + }, + { + "auxiliary_loss_clip": 0.01063472, + "auxiliary_loss_mlp": 0.01031377, + "balance_loss_clip": 1.02402925, + "balance_loss_mlp": 1.02079725, + "epoch": 0.947241845783857, + "flos": 11910744305280.0, + "grad_norm": 2.0831115045427904, + "language_loss": 0.79056561, + "learning_rate": 2.9094632443129153e-08, + "loss": 0.81151414, + "num_input_tokens_seen": 339878210, + "step": 15755, + "time_per_iteration": 4.182845115661621 + }, + { + "auxiliary_loss_clip": 0.01028435, + "auxiliary_loss_mlp": 0.01034884, + "balance_loss_clip": 1.02368736, + "balance_loss_mlp": 1.02128851, + "epoch": 0.9473019690365249, + "flos": 20740675098240.0, + "grad_norm": 2.3035600690869584, + "language_loss": 0.75080401, + "learning_rate": 2.9028480792904876e-08, + "loss": 0.77143717, + "num_input_tokens_seen": 339894255, + "step": 15756, + "time_per_iteration": 2.6541874408721924 + }, + { + "auxiliary_loss_clip": 0.01031775, + "auxiliary_loss_mlp": 0.01027601, + "balance_loss_clip": 1.02005124, + "balance_loss_mlp": 1.0173254, + "epoch": 0.9473620922891929, + "flos": 17639896602240.0, + "grad_norm": 1.970524799425657, + "language_loss": 0.74836552, + "learning_rate": 2.8962403882118347e-08, + "loss": 0.76895928, + "num_input_tokens_seen": 339912425, + "step": 15757, + "time_per_iteration": 2.6176114082336426 + }, + { + "auxiliary_loss_clip": 0.01045084, + "auxiliary_loss_mlp": 0.0103226, + "balance_loss_clip": 1.02477431, + "balance_loss_mlp": 1.02110815, + "epoch": 0.9474222155418608, + "flos": 23550002640000.0, + "grad_norm": 2.3756227684739297, + "language_loss": 0.79259074, + "learning_rate": 2.889640171327512e-08, + "loss": 0.81336415, + "num_input_tokens_seen": 339929635, + "step": 15758, + "time_per_iteration": 2.665451765060425 + }, + { + "auxiliary_loss_clip": 0.01027295, + "auxiliary_loss_mlp": 0.00747625, + "balance_loss_clip": 1.02242136, + "balance_loss_mlp": 1.00035882, + "epoch": 0.9474823387945288, + "flos": 27089753247360.0, + "grad_norm": 2.2646197618300747, + "language_loss": 0.71948469, + "learning_rate": 2.8830474288877638e-08, + "loss": 0.73723388, + "num_input_tokens_seen": 339951200, + "step": 15759, + "time_per_iteration": 2.6706597805023193 + }, + { + "auxiliary_loss_clip": 0.01049985, + "auxiliary_loss_mlp": 0.01025399, + "balance_loss_clip": 1.02591419, + "balance_loss_mlp": 1.01657176, + "epoch": 0.9475424620471967, + "flos": 22966526113920.0, + "grad_norm": 1.9143899507605666, + "language_loss": 0.75588107, + "learning_rate": 2.8764621611426344e-08, + "loss": 0.77663487, + "num_input_tokens_seen": 339971820, + "step": 15760, + "time_per_iteration": 2.630347490310669 + }, + { + "auxiliary_loss_clip": 0.01062135, + "auxiliary_loss_mlp": 0.00747638, + "balance_loss_clip": 1.02531719, + "balance_loss_mlp": 1.00041831, + "epoch": 0.9476025852998647, + "flos": 20047671025920.0, + "grad_norm": 1.747427222062295, + "language_loss": 0.73232615, + "learning_rate": 2.8698843683418128e-08, + "loss": 0.75042385, + "num_input_tokens_seen": 339989420, + "step": 15761, + "time_per_iteration": 2.5638186931610107 + }, + { + "auxiliary_loss_clip": 0.01043536, + "auxiliary_loss_mlp": 0.01035578, + "balance_loss_clip": 1.02793455, + "balance_loss_mlp": 1.02598763, + "epoch": 0.9476627085525327, + "flos": 14975468524800.0, + "grad_norm": 2.0125046255264047, + "language_loss": 0.71455944, + "learning_rate": 2.863314050734722e-08, + "loss": 0.73535061, + "num_input_tokens_seen": 340006690, + "step": 15762, + "time_per_iteration": 2.626493453979492 + }, + { + "auxiliary_loss_clip": 0.01063988, + "auxiliary_loss_mlp": 0.01032662, + "balance_loss_clip": 1.02409494, + "balance_loss_mlp": 1.02074707, + "epoch": 0.9477228318052007, + "flos": 18697788984960.0, + "grad_norm": 1.8049938708469129, + "language_loss": 0.67272139, + "learning_rate": 2.856751208570518e-08, + "loss": 0.69368792, + "num_input_tokens_seen": 340025480, + "step": 15763, + "time_per_iteration": 2.5742478370666504 + }, + { + "auxiliary_loss_clip": 0.0106117, + "auxiliary_loss_mlp": 0.01028598, + "balance_loss_clip": 1.02374804, + "balance_loss_mlp": 1.01868033, + "epoch": 0.9477829550578686, + "flos": 23875065745920.0, + "grad_norm": 1.8536071033477994, + "language_loss": 0.69952488, + "learning_rate": 2.8501958420980466e-08, + "loss": 0.72042257, + "num_input_tokens_seen": 340043785, + "step": 15764, + "time_per_iteration": 2.6412839889526367 + }, + { + "auxiliary_loss_clip": 0.01049791, + "auxiliary_loss_mlp": 0.00747421, + "balance_loss_clip": 1.02579761, + "balance_loss_mlp": 1.00030065, + "epoch": 0.9478430783105366, + "flos": 22562890007040.0, + "grad_norm": 1.5549382439970119, + "language_loss": 0.70815939, + "learning_rate": 2.8436479515659306e-08, + "loss": 0.72613156, + "num_input_tokens_seen": 340064360, + "step": 15765, + "time_per_iteration": 2.7845299243927 + }, + { + "auxiliary_loss_clip": 0.00984715, + "auxiliary_loss_mlp": 0.0099951, + "balance_loss_clip": 1.00167358, + "balance_loss_mlp": 0.99868107, + "epoch": 0.9479032015632046, + "flos": 60857885554560.0, + "grad_norm": 1.358478397566655, + "language_loss": 0.59075874, + "learning_rate": 2.8371075372224384e-08, + "loss": 0.61060101, + "num_input_tokens_seen": 340114425, + "step": 15766, + "time_per_iteration": 3.0002434253692627 + }, + { + "auxiliary_loss_clip": 0.0100723, + "auxiliary_loss_mlp": 0.01037994, + "balance_loss_clip": 1.02242208, + "balance_loss_mlp": 1.02721786, + "epoch": 0.9479633248158725, + "flos": 14683873916160.0, + "grad_norm": 1.9156068620453774, + "language_loss": 0.74096751, + "learning_rate": 2.8305745993155938e-08, + "loss": 0.76141977, + "num_input_tokens_seen": 340132200, + "step": 15767, + "time_per_iteration": 2.788808822631836 + }, + { + "auxiliary_loss_clip": 0.01035499, + "auxiliary_loss_mlp": 0.01030565, + "balance_loss_clip": 1.02529562, + "balance_loss_mlp": 1.01898408, + "epoch": 0.9480234480685406, + "flos": 20333878594560.0, + "grad_norm": 2.2349357117208224, + "language_loss": 0.7334457, + "learning_rate": 2.8240491380931096e-08, + "loss": 0.75410628, + "num_input_tokens_seen": 340149175, + "step": 15768, + "time_per_iteration": 2.778756618499756 + }, + { + "auxiliary_loss_clip": 0.00969702, + "auxiliary_loss_mlp": 0.01003097, + "balance_loss_clip": 1.00423813, + "balance_loss_mlp": 1.00217295, + "epoch": 0.9480835713212085, + "flos": 70293092428800.0, + "grad_norm": 0.738818903966609, + "language_loss": 0.55323696, + "learning_rate": 2.8175311538024326e-08, + "loss": 0.57296491, + "num_input_tokens_seen": 340208155, + "step": 15769, + "time_per_iteration": 3.3291749954223633 + }, + { + "auxiliary_loss_clip": 0.01013295, + "auxiliary_loss_mlp": 0.01027449, + "balance_loss_clip": 1.0228833, + "balance_loss_mlp": 1.01716733, + "epoch": 0.9481436945738765, + "flos": 25449749055360.0, + "grad_norm": 1.281655441414364, + "language_loss": 0.77121615, + "learning_rate": 2.8110206466907428e-08, + "loss": 0.79162359, + "num_input_tokens_seen": 340229275, + "step": 15770, + "time_per_iteration": 2.839815378189087 + }, + { + "auxiliary_loss_clip": 0.01046693, + "auxiliary_loss_mlp": 0.01033131, + "balance_loss_clip": 1.0278573, + "balance_loss_mlp": 1.02161503, + "epoch": 0.9482038178265444, + "flos": 26979902478720.0, + "grad_norm": 1.796854489699979, + "language_loss": 0.80262268, + "learning_rate": 2.8045176170049313e-08, + "loss": 0.82342088, + "num_input_tokens_seen": 340248920, + "step": 15771, + "time_per_iteration": 2.837589740753174 + }, + { + "auxiliary_loss_clip": 0.01028081, + "auxiliary_loss_mlp": 0.01027771, + "balance_loss_clip": 1.02230084, + "balance_loss_mlp": 1.01738143, + "epoch": 0.9482639410792124, + "flos": 17785442511360.0, + "grad_norm": 2.7207396556296217, + "language_loss": 0.69864875, + "learning_rate": 2.7980220649915566e-08, + "loss": 0.71920729, + "num_input_tokens_seen": 340266775, + "step": 15772, + "time_per_iteration": 2.9479594230651855 + }, + { + "auxiliary_loss_clip": 0.01043098, + "auxiliary_loss_mlp": 0.01031914, + "balance_loss_clip": 1.02291942, + "balance_loss_mlp": 1.02104855, + "epoch": 0.9483240643318803, + "flos": 20996682307200.0, + "grad_norm": 1.4968429591258483, + "language_loss": 0.73737079, + "learning_rate": 2.7915339908969327e-08, + "loss": 0.75812095, + "num_input_tokens_seen": 340285295, + "step": 15773, + "time_per_iteration": 2.778047561645508 + }, + { + "auxiliary_loss_clip": 0.01032449, + "auxiliary_loss_mlp": 0.01031347, + "balance_loss_clip": 1.02348733, + "balance_loss_mlp": 1.02041531, + "epoch": 0.9483841875845483, + "flos": 20083294339200.0, + "grad_norm": 2.305846310040382, + "language_loss": 0.62751979, + "learning_rate": 2.7850533949671072e-08, + "loss": 0.64815772, + "num_input_tokens_seen": 340304265, + "step": 15774, + "time_per_iteration": 4.455244541168213 + }, + { + "auxiliary_loss_clip": 0.01061859, + "auxiliary_loss_mlp": 0.01030997, + "balance_loss_clip": 1.02451634, + "balance_loss_mlp": 1.02032816, + "epoch": 0.9484443108372163, + "flos": 20813645577600.0, + "grad_norm": 1.7672802424632805, + "language_loss": 0.59252405, + "learning_rate": 2.7785802774478396e-08, + "loss": 0.61345255, + "num_input_tokens_seen": 340323690, + "step": 15775, + "time_per_iteration": 2.6248092651367188 + }, + { + "auxiliary_loss_clip": 0.01042755, + "auxiliary_loss_mlp": 0.01027154, + "balance_loss_clip": 1.02488101, + "balance_loss_mlp": 1.01597798, + "epoch": 0.9485044340898843, + "flos": 36429184506240.0, + "grad_norm": 1.6391360924436968, + "language_loss": 0.61752313, + "learning_rate": 2.772114638584555e-08, + "loss": 0.63822216, + "num_input_tokens_seen": 340345830, + "step": 15776, + "time_per_iteration": 2.7976152896881104 + }, + { + "auxiliary_loss_clip": 0.01036297, + "auxiliary_loss_mlp": 0.01027851, + "balance_loss_clip": 1.02203751, + "balance_loss_mlp": 1.01681805, + "epoch": 0.9485645573425522, + "flos": 22602535643520.0, + "grad_norm": 1.7609909124036696, + "language_loss": 0.73421115, + "learning_rate": 2.765656478622458e-08, + "loss": 0.75485265, + "num_input_tokens_seen": 340365910, + "step": 15777, + "time_per_iteration": 2.754560708999634 + }, + { + "auxiliary_loss_clip": 0.01058751, + "auxiliary_loss_mlp": 0.01035004, + "balance_loss_clip": 1.02749753, + "balance_loss_mlp": 1.02356648, + "epoch": 0.9486246805952202, + "flos": 22017766227840.0, + "grad_norm": 2.216033010670804, + "language_loss": 0.72225481, + "learning_rate": 2.759205797806441e-08, + "loss": 0.74319232, + "num_input_tokens_seen": 340383935, + "step": 15778, + "time_per_iteration": 2.68835711479187 + }, + { + "auxiliary_loss_clip": 0.01050405, + "auxiliary_loss_mlp": 0.00747516, + "balance_loss_clip": 1.02635121, + "balance_loss_mlp": 1.00035489, + "epoch": 0.9486848038478882, + "flos": 16508674604160.0, + "grad_norm": 1.8612386801749161, + "language_loss": 0.7016809, + "learning_rate": 2.7527625963810865e-08, + "loss": 0.71966004, + "num_input_tokens_seen": 340402760, + "step": 15779, + "time_per_iteration": 2.604991912841797 + }, + { + "auxiliary_loss_clip": 0.01061031, + "auxiliary_loss_mlp": 0.01029514, + "balance_loss_clip": 1.02431762, + "balance_loss_mlp": 1.01842737, + "epoch": 0.9487449271005561, + "flos": 19244385221760.0, + "grad_norm": 1.9661984326484974, + "language_loss": 0.78059876, + "learning_rate": 2.7463268745907542e-08, + "loss": 0.80150425, + "num_input_tokens_seen": 340422105, + "step": 15780, + "time_per_iteration": 2.5609371662139893 + }, + { + "auxiliary_loss_clip": 0.01044413, + "auxiliary_loss_mlp": 0.00747651, + "balance_loss_clip": 1.02713633, + "balance_loss_mlp": 1.00037122, + "epoch": 0.9488050503532242, + "flos": 21762692772480.0, + "grad_norm": 1.6093023757292988, + "language_loss": 0.66369593, + "learning_rate": 2.7398986326794494e-08, + "loss": 0.68161654, + "num_input_tokens_seen": 340441160, + "step": 15781, + "time_per_iteration": 2.6745123863220215 + }, + { + "auxiliary_loss_clip": 0.01061156, + "auxiliary_loss_mlp": 0.01030646, + "balance_loss_clip": 1.02464211, + "balance_loss_mlp": 1.02007854, + "epoch": 0.9488651736058921, + "flos": 18368919037440.0, + "grad_norm": 2.2009676149008426, + "language_loss": 0.79768789, + "learning_rate": 2.733477870890999e-08, + "loss": 0.81860596, + "num_input_tokens_seen": 340458200, + "step": 15782, + "time_per_iteration": 2.528062343597412 + }, + { + "auxiliary_loss_clip": 0.00997388, + "auxiliary_loss_mlp": 0.01000998, + "balance_loss_clip": 1.00236714, + "balance_loss_mlp": 1.000193, + "epoch": 0.9489252968585601, + "flos": 70084057230720.0, + "grad_norm": 0.7177526024796499, + "language_loss": 0.59813815, + "learning_rate": 2.7270645894688082e-08, + "loss": 0.61812204, + "num_input_tokens_seen": 340526420, + "step": 15783, + "time_per_iteration": 3.2855067253112793 + }, + { + "auxiliary_loss_clip": 0.01051987, + "auxiliary_loss_mlp": 0.01031783, + "balance_loss_clip": 1.02462697, + "balance_loss_mlp": 1.0212028, + "epoch": 0.948985420111228, + "flos": 27855440490240.0, + "grad_norm": 1.73722842587947, + "language_loss": 0.73917603, + "learning_rate": 2.720658788656105e-08, + "loss": 0.7600137, + "num_input_tokens_seen": 340546325, + "step": 15784, + "time_per_iteration": 2.6211957931518555 + }, + { + "auxiliary_loss_clip": 0.01014896, + "auxiliary_loss_mlp": 0.01029172, + "balance_loss_clip": 1.02315903, + "balance_loss_mlp": 1.01765013, + "epoch": 0.949045543363896, + "flos": 24316049018880.0, + "grad_norm": 1.9522432250348876, + "language_loss": 0.6976881, + "learning_rate": 2.714260468695806e-08, + "loss": 0.7181288, + "num_input_tokens_seen": 340565145, + "step": 15785, + "time_per_iteration": 2.809389114379883 + }, + { + "auxiliary_loss_clip": 0.01062431, + "auxiliary_loss_mlp": 0.01024226, + "balance_loss_clip": 1.02478909, + "balance_loss_mlp": 1.01409316, + "epoch": 0.9491056666165639, + "flos": 24241677909120.0, + "grad_norm": 2.2968918945907877, + "language_loss": 0.75749266, + "learning_rate": 2.707869629830495e-08, + "loss": 0.77835923, + "num_input_tokens_seen": 340585465, + "step": 15786, + "time_per_iteration": 2.5860397815704346 + }, + { + "auxiliary_loss_clip": 0.01033065, + "auxiliary_loss_mlp": 0.01027799, + "balance_loss_clip": 1.0259726, + "balance_loss_mlp": 1.01825058, + "epoch": 0.949165789869232, + "flos": 24531261356160.0, + "grad_norm": 1.796077728702329, + "language_loss": 0.78785884, + "learning_rate": 2.7014862723025335e-08, + "loss": 0.80846751, + "num_input_tokens_seen": 340606010, + "step": 15787, + "time_per_iteration": 2.7519822120666504 + }, + { + "auxiliary_loss_clip": 0.01052779, + "auxiliary_loss_mlp": 0.01025835, + "balance_loss_clip": 1.02725542, + "balance_loss_mlp": 1.01598215, + "epoch": 0.9492259131218999, + "flos": 22235348862720.0, + "grad_norm": 1.5209497580145657, + "language_loss": 0.76409578, + "learning_rate": 2.6951103963540388e-08, + "loss": 0.78488195, + "num_input_tokens_seen": 340626135, + "step": 15788, + "time_per_iteration": 2.6595542430877686 + }, + { + "auxiliary_loss_clip": 0.01052965, + "auxiliary_loss_mlp": 0.01030804, + "balance_loss_clip": 1.02501488, + "balance_loss_mlp": 1.01921666, + "epoch": 0.9492860363745679, + "flos": 22966310632320.0, + "grad_norm": 2.008423545606366, + "language_loss": 0.71519297, + "learning_rate": 2.6887420022266848e-08, + "loss": 0.7360307, + "num_input_tokens_seen": 340644870, + "step": 15789, + "time_per_iteration": 2.6381213665008545 + }, + { + "auxiliary_loss_clip": 0.01015925, + "auxiliary_loss_mlp": 0.01026671, + "balance_loss_clip": 1.022789, + "balance_loss_mlp": 1.01559615, + "epoch": 0.9493461596272358, + "flos": 18370283754240.0, + "grad_norm": 1.720101249480595, + "language_loss": 0.7278434, + "learning_rate": 2.682381090161989e-08, + "loss": 0.74826932, + "num_input_tokens_seen": 340663695, + "step": 15790, + "time_per_iteration": 2.738755702972412 + }, + { + "auxiliary_loss_clip": 0.01025214, + "auxiliary_loss_mlp": 0.01029595, + "balance_loss_clip": 1.02206349, + "balance_loss_mlp": 1.01797843, + "epoch": 0.9494062828799038, + "flos": 20011724490240.0, + "grad_norm": 1.7446434446344805, + "language_loss": 0.77469385, + "learning_rate": 2.6760276604012033e-08, + "loss": 0.79524195, + "num_input_tokens_seen": 340682970, + "step": 15791, + "time_per_iteration": 4.452749967575073 + }, + { + "auxiliary_loss_clip": 0.01053731, + "auxiliary_loss_mlp": 0.01028045, + "balance_loss_clip": 1.02459896, + "balance_loss_mlp": 1.0173279, + "epoch": 0.9494664061325718, + "flos": 27228583313280.0, + "grad_norm": 2.128250140151558, + "language_loss": 0.73768276, + "learning_rate": 2.6696817131852234e-08, + "loss": 0.75850052, + "num_input_tokens_seen": 340702275, + "step": 15792, + "time_per_iteration": 2.7431399822235107 + }, + { + "auxiliary_loss_clip": 0.01048296, + "auxiliary_loss_mlp": 0.01030771, + "balance_loss_clip": 1.02265489, + "balance_loss_mlp": 1.02071524, + "epoch": 0.9495265293852397, + "flos": 18369816877440.0, + "grad_norm": 3.5582907887092223, + "language_loss": 0.77996314, + "learning_rate": 2.663343248754679e-08, + "loss": 0.80075377, + "num_input_tokens_seen": 340719060, + "step": 15793, + "time_per_iteration": 2.5953683853149414 + }, + { + "auxiliary_loss_clip": 0.01035669, + "auxiliary_loss_mlp": 0.01025916, + "balance_loss_clip": 1.02344823, + "balance_loss_mlp": 1.0156405, + "epoch": 0.9495866526379078, + "flos": 23075766351360.0, + "grad_norm": 1.8143083115225223, + "language_loss": 0.77399838, + "learning_rate": 2.6570122673499562e-08, + "loss": 0.7946142, + "num_input_tokens_seen": 340737815, + "step": 15794, + "time_per_iteration": 2.6722323894500732 + }, + { + "auxiliary_loss_clip": 0.0103266, + "auxiliary_loss_mlp": 0.00747726, + "balance_loss_clip": 1.02403522, + "balance_loss_mlp": 1.00043297, + "epoch": 0.9496467758905757, + "flos": 17529902179200.0, + "grad_norm": 1.835103672718816, + "language_loss": 0.61019588, + "learning_rate": 2.650688769211107e-08, + "loss": 0.62799972, + "num_input_tokens_seen": 340756035, + "step": 15795, + "time_per_iteration": 2.6965584754943848 + }, + { + "auxiliary_loss_clip": 0.01050447, + "auxiliary_loss_mlp": 0.01030314, + "balance_loss_clip": 1.02465224, + "balance_loss_mlp": 1.01975799, + "epoch": 0.9497068991432437, + "flos": 24133910129280.0, + "grad_norm": 1.6173005374471205, + "language_loss": 0.79386818, + "learning_rate": 2.644372754577895e-08, + "loss": 0.81467569, + "num_input_tokens_seen": 340775620, + "step": 15796, + "time_per_iteration": 2.6327593326568604 + }, + { + "auxiliary_loss_clip": 0.01053684, + "auxiliary_loss_mlp": 0.01026989, + "balance_loss_clip": 1.02581644, + "balance_loss_mlp": 1.01599193, + "epoch": 0.9497670223959116, + "flos": 20303319098880.0, + "grad_norm": 2.4025478917784038, + "language_loss": 0.7546705, + "learning_rate": 2.6380642236898398e-08, + "loss": 0.77547729, + "num_input_tokens_seen": 340794510, + "step": 15797, + "time_per_iteration": 2.622922420501709 + }, + { + "auxiliary_loss_clip": 0.01029166, + "auxiliary_loss_mlp": 0.00747651, + "balance_loss_clip": 1.0249753, + "balance_loss_mlp": 1.00036812, + "epoch": 0.9498271456485796, + "flos": 13698916099200.0, + "grad_norm": 2.7171182648774015, + "language_loss": 0.65733397, + "learning_rate": 2.6317631767861727e-08, + "loss": 0.67510217, + "num_input_tokens_seen": 340812955, + "step": 15798, + "time_per_iteration": 2.723052740097046 + }, + { + "auxiliary_loss_clip": 0.01053265, + "auxiliary_loss_mlp": 0.01027686, + "balance_loss_clip": 1.02583623, + "balance_loss_mlp": 1.01721311, + "epoch": 0.9498872689012475, + "flos": 20814004713600.0, + "grad_norm": 2.443849214058833, + "language_loss": 0.77200621, + "learning_rate": 2.6254696141058575e-08, + "loss": 0.79281574, + "num_input_tokens_seen": 340829200, + "step": 15799, + "time_per_iteration": 2.6081581115722656 + }, + { + "auxiliary_loss_clip": 0.01044226, + "auxiliary_loss_mlp": 0.01031563, + "balance_loss_clip": 1.02289331, + "balance_loss_mlp": 1.02124572, + "epoch": 0.9499473921539155, + "flos": 21032700670080.0, + "grad_norm": 4.029488837351426, + "language_loss": 0.70897007, + "learning_rate": 2.6191835358874814e-08, + "loss": 0.72972798, + "num_input_tokens_seen": 340848035, + "step": 15800, + "time_per_iteration": 2.619145393371582 + }, + { + "auxiliary_loss_clip": 0.01033233, + "auxiliary_loss_mlp": 0.01027381, + "balance_loss_clip": 1.02066231, + "balance_loss_mlp": 1.01693225, + "epoch": 0.9500075154065835, + "flos": 20998693468800.0, + "grad_norm": 1.56327426044139, + "language_loss": 0.71952242, + "learning_rate": 2.6129049423694315e-08, + "loss": 0.74012858, + "num_input_tokens_seen": 340870025, + "step": 15801, + "time_per_iteration": 4.350564956665039 + }, + { + "auxiliary_loss_clip": 0.01050718, + "auxiliary_loss_mlp": 0.01030108, + "balance_loss_clip": 1.02471662, + "balance_loss_mlp": 1.01987362, + "epoch": 0.9500676386592515, + "flos": 25121956515840.0, + "grad_norm": 1.658170924004151, + "language_loss": 0.80872834, + "learning_rate": 2.6066338337898508e-08, + "loss": 0.82953656, + "num_input_tokens_seen": 340892290, + "step": 15802, + "time_per_iteration": 2.6363401412963867 + }, + { + "auxiliary_loss_clip": 0.01065031, + "auxiliary_loss_mlp": 0.01028789, + "balance_loss_clip": 1.02703047, + "balance_loss_mlp": 1.01816177, + "epoch": 0.9501277619119194, + "flos": 27523625627520.0, + "grad_norm": 2.436472680691168, + "language_loss": 0.67970741, + "learning_rate": 2.60037021038646e-08, + "loss": 0.70064557, + "num_input_tokens_seen": 340912260, + "step": 15803, + "time_per_iteration": 4.337688684463501 + }, + { + "auxiliary_loss_clip": 0.01035068, + "auxiliary_loss_mlp": 0.01030551, + "balance_loss_clip": 1.02302241, + "balance_loss_mlp": 1.02010202, + "epoch": 0.9501878851645874, + "flos": 20813968800000.0, + "grad_norm": 1.6707823003524043, + "language_loss": 0.76122022, + "learning_rate": 2.5941140723968247e-08, + "loss": 0.78187644, + "num_input_tokens_seen": 340928930, + "step": 15804, + "time_per_iteration": 2.705129861831665 + }, + { + "auxiliary_loss_clip": 0.01046901, + "auxiliary_loss_mlp": 0.01029352, + "balance_loss_clip": 1.02538598, + "balance_loss_mlp": 1.01804531, + "epoch": 0.9502480084172553, + "flos": 18369385914240.0, + "grad_norm": 1.6183633000299078, + "language_loss": 0.73088473, + "learning_rate": 2.5878654200581775e-08, + "loss": 0.75164723, + "num_input_tokens_seen": 340946615, + "step": 15805, + "time_per_iteration": 2.6526882648468018 + }, + { + "auxiliary_loss_clip": 0.01036795, + "auxiliary_loss_mlp": 0.01032319, + "balance_loss_clip": 1.02659798, + "balance_loss_mlp": 1.02193594, + "epoch": 0.9503081316699233, + "flos": 23549607590400.0, + "grad_norm": 1.5339183750878134, + "language_loss": 0.8028872, + "learning_rate": 2.5816242536074618e-08, + "loss": 0.8235783, + "num_input_tokens_seen": 340967545, + "step": 15806, + "time_per_iteration": 2.717679977416992 + }, + { + "auxiliary_loss_clip": 0.0102554, + "auxiliary_loss_mlp": 0.0103051, + "balance_loss_clip": 1.02361226, + "balance_loss_mlp": 1.01956713, + "epoch": 0.9503682549225914, + "flos": 18040444139520.0, + "grad_norm": 2.160866749404478, + "language_loss": 0.82509232, + "learning_rate": 2.5753905732813108e-08, + "loss": 0.84565282, + "num_input_tokens_seen": 340984955, + "step": 15807, + "time_per_iteration": 2.6644792556762695 + }, + { + "auxiliary_loss_clip": 0.01041954, + "auxiliary_loss_mlp": 0.01028945, + "balance_loss_clip": 1.02219343, + "balance_loss_mlp": 1.01871133, + "epoch": 0.9504283781752593, + "flos": 25886135387520.0, + "grad_norm": 1.5760762898049963, + "language_loss": 0.71493149, + "learning_rate": 2.5691643793161355e-08, + "loss": 0.73564053, + "num_input_tokens_seen": 341007300, + "step": 15808, + "time_per_iteration": 2.7065277099609375 + }, + { + "auxiliary_loss_clip": 0.01052854, + "auxiliary_loss_mlp": 0.01029192, + "balance_loss_clip": 1.02549028, + "balance_loss_mlp": 1.01848722, + "epoch": 0.9504885014279273, + "flos": 22124025636480.0, + "grad_norm": 1.4120150769112525, + "language_loss": 0.70062101, + "learning_rate": 2.562945671948058e-08, + "loss": 0.72144139, + "num_input_tokens_seen": 341026695, + "step": 15809, + "time_per_iteration": 2.684647798538208 + }, + { + "auxiliary_loss_clip": 0.01036586, + "auxiliary_loss_mlp": 0.01023337, + "balance_loss_clip": 1.02364409, + "balance_loss_mlp": 1.01303184, + "epoch": 0.9505486246805952, + "flos": 21615961714560.0, + "grad_norm": 1.6949644933211396, + "language_loss": 0.75667512, + "learning_rate": 2.5567344514128452e-08, + "loss": 0.77727437, + "num_input_tokens_seen": 341047080, + "step": 15810, + "time_per_iteration": 2.7624218463897705 + }, + { + "auxiliary_loss_clip": 0.01029179, + "auxiliary_loss_mlp": 0.01041097, + "balance_loss_clip": 1.02287555, + "balance_loss_mlp": 1.0302552, + "epoch": 0.9506087479332632, + "flos": 22528236360960.0, + "grad_norm": 1.456147651285901, + "language_loss": 0.79961586, + "learning_rate": 2.5505307179460643e-08, + "loss": 0.82031858, + "num_input_tokens_seen": 341067310, + "step": 15811, + "time_per_iteration": 2.7600765228271484 + }, + { + "auxiliary_loss_clip": 0.01034032, + "auxiliary_loss_mlp": 0.01028819, + "balance_loss_clip": 1.02266204, + "balance_loss_mlp": 1.01808453, + "epoch": 0.9506688711859311, + "flos": 27527360641920.0, + "grad_norm": 1.9721123149912776, + "language_loss": 0.69967997, + "learning_rate": 2.5443344717829495e-08, + "loss": 0.72030854, + "num_input_tokens_seen": 341085110, + "step": 15812, + "time_per_iteration": 2.7285585403442383 + }, + { + "auxiliary_loss_clip": 0.01024415, + "auxiliary_loss_mlp": 0.01032758, + "balance_loss_clip": 1.02190971, + "balance_loss_mlp": 1.02090311, + "epoch": 0.9507289944385992, + "flos": 19865783531520.0, + "grad_norm": 1.5156255036241795, + "language_loss": 0.65348375, + "learning_rate": 2.538145713158446e-08, + "loss": 0.67405546, + "num_input_tokens_seen": 341103190, + "step": 15813, + "time_per_iteration": 2.7498598098754883 + }, + { + "auxiliary_loss_clip": 0.01054722, + "auxiliary_loss_mlp": 0.01034744, + "balance_loss_clip": 1.0260309, + "balance_loss_mlp": 1.02384806, + "epoch": 0.9507891176912671, + "flos": 25193274969600.0, + "grad_norm": 1.4188412120495566, + "language_loss": 0.70468777, + "learning_rate": 2.5319644423072327e-08, + "loss": 0.72558236, + "num_input_tokens_seen": 341125695, + "step": 15814, + "time_per_iteration": 2.6800622940063477 + }, + { + "auxiliary_loss_clip": 0.01050702, + "auxiliary_loss_mlp": 0.01026664, + "balance_loss_clip": 1.02493143, + "balance_loss_mlp": 1.0169549, + "epoch": 0.9508492409439351, + "flos": 24899561458560.0, + "grad_norm": 2.4900574244092843, + "language_loss": 0.63475645, + "learning_rate": 2.5257906594637445e-08, + "loss": 0.65553015, + "num_input_tokens_seen": 341143930, + "step": 15815, + "time_per_iteration": 2.67874813079834 + }, + { + "auxiliary_loss_clip": 0.01039705, + "auxiliary_loss_mlp": 0.01025696, + "balance_loss_clip": 1.02327728, + "balance_loss_mlp": 1.0159502, + "epoch": 0.950909364196603, + "flos": 29784094375680.0, + "grad_norm": 13.15460278974792, + "language_loss": 0.58592033, + "learning_rate": 2.519624364862061e-08, + "loss": 0.6065743, + "num_input_tokens_seen": 341164280, + "step": 15816, + "time_per_iteration": 2.780980110168457 + }, + { + "auxiliary_loss_clip": 0.01059736, + "auxiliary_loss_mlp": 0.01034764, + "balance_loss_clip": 1.02384722, + "balance_loss_mlp": 1.02472043, + "epoch": 0.950969487449271, + "flos": 24717781704960.0, + "grad_norm": 1.3666405694191721, + "language_loss": 0.73659664, + "learning_rate": 2.513465558735994e-08, + "loss": 0.75754166, + "num_input_tokens_seen": 341183670, + "step": 15817, + "time_per_iteration": 2.618558168411255 + }, + { + "auxiliary_loss_clip": 0.01043011, + "auxiliary_loss_mlp": 0.01030867, + "balance_loss_clip": 1.02487123, + "balance_loss_mlp": 1.01891041, + "epoch": 0.9510296107019389, + "flos": 13699167494400.0, + "grad_norm": 1.5950477209352556, + "language_loss": 0.60314649, + "learning_rate": 2.5073142413190918e-08, + "loss": 0.62388527, + "num_input_tokens_seen": 341201900, + "step": 15818, + "time_per_iteration": 2.6205060482025146 + }, + { + "auxiliary_loss_clip": 0.01065122, + "auxiliary_loss_mlp": 0.01032517, + "balance_loss_clip": 1.02729523, + "balance_loss_mlp": 1.02162123, + "epoch": 0.9510897339546069, + "flos": 17311852667520.0, + "grad_norm": 1.7832718277341144, + "language_loss": 0.69651115, + "learning_rate": 2.5011704128446552e-08, + "loss": 0.71748745, + "num_input_tokens_seen": 341218340, + "step": 15819, + "time_per_iteration": 2.545745849609375 + }, + { + "auxiliary_loss_clip": 0.01025057, + "auxiliary_loss_mlp": 0.01025851, + "balance_loss_clip": 1.02582288, + "balance_loss_mlp": 1.01535439, + "epoch": 0.951149857207275, + "flos": 14793940166400.0, + "grad_norm": 2.1996012936892813, + "language_loss": 0.74129897, + "learning_rate": 2.49503407354561e-08, + "loss": 0.76180804, + "num_input_tokens_seen": 341235885, + "step": 15820, + "time_per_iteration": 2.688779592514038 + }, + { + "auxiliary_loss_clip": 0.01043032, + "auxiliary_loss_mlp": 0.01030851, + "balance_loss_clip": 1.02501225, + "balance_loss_mlp": 1.02022338, + "epoch": 0.9512099804599429, + "flos": 19391152193280.0, + "grad_norm": 2.044419724179257, + "language_loss": 0.78174305, + "learning_rate": 2.4889052236546804e-08, + "loss": 0.80248183, + "num_input_tokens_seen": 341255280, + "step": 15821, + "time_per_iteration": 2.6361467838287354 + }, + { + "auxiliary_loss_clip": 0.01027512, + "auxiliary_loss_mlp": 0.01029016, + "balance_loss_clip": 1.02258837, + "balance_loss_mlp": 1.01810265, + "epoch": 0.9512701037126109, + "flos": 36757874885760.0, + "grad_norm": 1.3709462399431505, + "language_loss": 0.70988655, + "learning_rate": 2.4827838634042586e-08, + "loss": 0.73045182, + "num_input_tokens_seen": 341279055, + "step": 15822, + "time_per_iteration": 4.855983734130859 + }, + { + "auxiliary_loss_clip": 0.01052175, + "auxiliary_loss_mlp": 0.0103223, + "balance_loss_clip": 1.02541804, + "balance_loss_mlp": 1.02245533, + "epoch": 0.9513302269652788, + "flos": 22638266697600.0, + "grad_norm": 1.5918261703456666, + "language_loss": 0.66157246, + "learning_rate": 2.47666999302647e-08, + "loss": 0.6824165, + "num_input_tokens_seen": 341298560, + "step": 15823, + "time_per_iteration": 2.6171910762786865 + }, + { + "auxiliary_loss_clip": 0.01042008, + "auxiliary_loss_mlp": 0.01028145, + "balance_loss_clip": 1.02300572, + "balance_loss_mlp": 1.01829803, + "epoch": 0.9513903502179468, + "flos": 22893232412160.0, + "grad_norm": 1.89142376636757, + "language_loss": 0.76854807, + "learning_rate": 2.4705636127531292e-08, + "loss": 0.7892496, + "num_input_tokens_seen": 341316650, + "step": 15824, + "time_per_iteration": 2.6114039421081543 + }, + { + "auxiliary_loss_clip": 0.01063821, + "auxiliary_loss_mlp": 0.01026235, + "balance_loss_clip": 1.02439284, + "balance_loss_mlp": 1.01519036, + "epoch": 0.9514504734706147, + "flos": 27928626451200.0, + "grad_norm": 2.3205778440768934, + "language_loss": 0.73568571, + "learning_rate": 2.4644647228158065e-08, + "loss": 0.75658619, + "num_input_tokens_seen": 341336185, + "step": 15825, + "time_per_iteration": 2.654513359069824 + }, + { + "auxiliary_loss_clip": 0.00998195, + "auxiliary_loss_mlp": 0.01001973, + "balance_loss_clip": 1.00241089, + "balance_loss_mlp": 1.00106132, + "epoch": 0.9515105967232828, + "flos": 67366767312000.0, + "grad_norm": 0.8274113053089763, + "language_loss": 0.53405875, + "learning_rate": 2.458373323445806e-08, + "loss": 0.5540604, + "num_input_tokens_seen": 341395795, + "step": 15826, + "time_per_iteration": 3.1527044773101807 + }, + { + "auxiliary_loss_clip": 0.01040036, + "auxiliary_loss_mlp": 0.01034088, + "balance_loss_clip": 1.0241909, + "balance_loss_mlp": 1.02382398, + "epoch": 0.9515707199759507, + "flos": 25846525664640.0, + "grad_norm": 3.044039820158247, + "language_loss": 0.72569442, + "learning_rate": 2.452289414874076e-08, + "loss": 0.74643564, + "num_input_tokens_seen": 341415675, + "step": 15827, + "time_per_iteration": 2.758812665939331 + }, + { + "auxiliary_loss_clip": 0.01043637, + "auxiliary_loss_mlp": 0.01027654, + "balance_loss_clip": 1.02565694, + "balance_loss_mlp": 1.01694942, + "epoch": 0.9516308432286187, + "flos": 21828983322240.0, + "grad_norm": 2.1924120207643836, + "language_loss": 0.74301934, + "learning_rate": 2.4462129973313207e-08, + "loss": 0.76373231, + "num_input_tokens_seen": 341432990, + "step": 15828, + "time_per_iteration": 2.6886088848114014 + }, + { + "auxiliary_loss_clip": 0.01021001, + "auxiliary_loss_mlp": 0.01030778, + "balance_loss_clip": 1.02437687, + "balance_loss_mlp": 1.02138972, + "epoch": 0.9516909664812866, + "flos": 27269593666560.0, + "grad_norm": 1.6567801504074136, + "language_loss": 0.73075604, + "learning_rate": 2.440144071047978e-08, + "loss": 0.75127387, + "num_input_tokens_seen": 341454100, + "step": 15829, + "time_per_iteration": 2.8195407390594482 + }, + { + "auxiliary_loss_clip": 0.01045693, + "auxiliary_loss_mlp": 0.01029664, + "balance_loss_clip": 1.02327871, + "balance_loss_mlp": 1.01879191, + "epoch": 0.9517510897339546, + "flos": 21215342350080.0, + "grad_norm": 1.6549060726364373, + "language_loss": 0.61385411, + "learning_rate": 2.4340826362541533e-08, + "loss": 0.63460767, + "num_input_tokens_seen": 341472955, + "step": 15830, + "time_per_iteration": 2.6364355087280273 + }, + { + "auxiliary_loss_clip": 0.01056061, + "auxiliary_loss_mlp": 0.01029551, + "balance_loss_clip": 1.02747464, + "balance_loss_mlp": 1.01774299, + "epoch": 0.9518112129866225, + "flos": 18733986915840.0, + "grad_norm": 1.7733339341276635, + "language_loss": 0.72917354, + "learning_rate": 2.428028693179729e-08, + "loss": 0.75002968, + "num_input_tokens_seen": 341490165, + "step": 15831, + "time_per_iteration": 2.6628031730651855 + }, + { + "auxiliary_loss_clip": 0.01005223, + "auxiliary_loss_mlp": 0.01022484, + "balance_loss_clip": 1.02063799, + "balance_loss_mlp": 1.01311398, + "epoch": 0.9518713362392905, + "flos": 16763676232320.0, + "grad_norm": 1.536391532205783, + "language_loss": 0.65313339, + "learning_rate": 2.4219822420542545e-08, + "loss": 0.67341042, + "num_input_tokens_seen": 341508055, + "step": 15832, + "time_per_iteration": 2.7084407806396484 + }, + { + "auxiliary_loss_clip": 0.01046092, + "auxiliary_loss_mlp": 0.01033738, + "balance_loss_clip": 1.02563381, + "balance_loss_mlp": 1.02275848, + "epoch": 0.9519314594919586, + "flos": 15230649720960.0, + "grad_norm": 1.8148393917630508, + "language_loss": 0.78314209, + "learning_rate": 2.4159432831070135e-08, + "loss": 0.80394036, + "num_input_tokens_seen": 341526155, + "step": 15833, + "time_per_iteration": 2.637911558151245 + }, + { + "auxiliary_loss_clip": 0.01023076, + "auxiliary_loss_mlp": 0.01028908, + "balance_loss_clip": 1.02299774, + "balance_loss_mlp": 1.01907372, + "epoch": 0.9519915827446265, + "flos": 19352943100800.0, + "grad_norm": 2.0284899739366633, + "language_loss": 0.74836445, + "learning_rate": 2.4099118165670007e-08, + "loss": 0.76888424, + "num_input_tokens_seen": 341540450, + "step": 15834, + "time_per_iteration": 2.6190595626831055 + }, + { + "auxiliary_loss_clip": 0.01058016, + "auxiliary_loss_mlp": 0.01031493, + "balance_loss_clip": 1.02786386, + "balance_loss_mlp": 1.01964331, + "epoch": 0.9520517059972945, + "flos": 22266303408000.0, + "grad_norm": 1.978845093473577, + "language_loss": 0.76241499, + "learning_rate": 2.4038878426629216e-08, + "loss": 0.78331012, + "num_input_tokens_seen": 341557865, + "step": 15835, + "time_per_iteration": 2.6467435359954834 + }, + { + "auxiliary_loss_clip": 0.01038729, + "auxiliary_loss_mlp": 0.01030649, + "balance_loss_clip": 1.02283025, + "balance_loss_mlp": 1.01985455, + "epoch": 0.9521118292499624, + "flos": 14862313704960.0, + "grad_norm": 1.9426378942220826, + "language_loss": 0.66364217, + "learning_rate": 2.397871361623238e-08, + "loss": 0.68433595, + "num_input_tokens_seen": 341573890, + "step": 15836, + "time_per_iteration": 2.665658950805664 + }, + { + "auxiliary_loss_clip": 0.01030039, + "auxiliary_loss_mlp": 0.0102731, + "balance_loss_clip": 1.02441788, + "balance_loss_mlp": 1.01702809, + "epoch": 0.9521719525026304, + "flos": 23508812718720.0, + "grad_norm": 1.8651446831865361, + "language_loss": 0.7046876, + "learning_rate": 2.391862373676057e-08, + "loss": 0.72526109, + "num_input_tokens_seen": 341593770, + "step": 15837, + "time_per_iteration": 2.8215208053588867 + }, + { + "auxiliary_loss_clip": 0.01064088, + "auxiliary_loss_mlp": 0.01032252, + "balance_loss_clip": 1.02555132, + "balance_loss_mlp": 1.02038443, + "epoch": 0.9522320757552983, + "flos": 19714922409600.0, + "grad_norm": 1.8975260470562096, + "language_loss": 0.73681808, + "learning_rate": 2.3858608790492617e-08, + "loss": 0.75778145, + "num_input_tokens_seen": 341612065, + "step": 15838, + "time_per_iteration": 4.2915709018707275 + }, + { + "auxiliary_loss_clip": 0.01028086, + "auxiliary_loss_mlp": 0.01025865, + "balance_loss_clip": 1.02398992, + "balance_loss_mlp": 1.01579762, + "epoch": 0.9522921990079664, + "flos": 25921291824000.0, + "grad_norm": 1.8126821387597194, + "language_loss": 0.77776611, + "learning_rate": 2.379866877970449e-08, + "loss": 0.79830557, + "num_input_tokens_seen": 341631365, + "step": 15839, + "time_per_iteration": 2.7124204635620117 + }, + { + "auxiliary_loss_clip": 0.0103461, + "auxiliary_loss_mlp": 0.01028642, + "balance_loss_clip": 1.02556109, + "balance_loss_mlp": 1.01834857, + "epoch": 0.9523523222606343, + "flos": 19208115463680.0, + "grad_norm": 1.4786364453705736, + "language_loss": 0.80344021, + "learning_rate": 2.3738803706668585e-08, + "loss": 0.82407272, + "num_input_tokens_seen": 341650300, + "step": 15840, + "time_per_iteration": 2.687710762023926 + }, + { + "auxiliary_loss_clip": 0.01036991, + "auxiliary_loss_mlp": 0.01026083, + "balance_loss_clip": 1.0232116, + "balance_loss_mlp": 1.01720762, + "epoch": 0.9524124455133023, + "flos": 20921269703040.0, + "grad_norm": 1.8655339110962872, + "language_loss": 0.72959709, + "learning_rate": 2.3679013573655314e-08, + "loss": 0.75022787, + "num_input_tokens_seen": 341667680, + "step": 15841, + "time_per_iteration": 2.6450352668762207 + }, + { + "auxiliary_loss_clip": 0.01036862, + "auxiliary_loss_mlp": 0.01024218, + "balance_loss_clip": 1.02320623, + "balance_loss_mlp": 1.01467514, + "epoch": 0.9524725687659702, + "flos": 18843550375680.0, + "grad_norm": 1.6970819187042427, + "language_loss": 0.78976369, + "learning_rate": 2.3619298382931972e-08, + "loss": 0.8103745, + "num_input_tokens_seen": 341685760, + "step": 15842, + "time_per_iteration": 2.697587728500366 + }, + { + "auxiliary_loss_clip": 0.01042388, + "auxiliary_loss_mlp": 0.01027665, + "balance_loss_clip": 1.02599907, + "balance_loss_mlp": 1.01718688, + "epoch": 0.9525326920186382, + "flos": 22674680110080.0, + "grad_norm": 1.5882943440379818, + "language_loss": 0.72077942, + "learning_rate": 2.3559658136762973e-08, + "loss": 0.74147993, + "num_input_tokens_seen": 341705300, + "step": 15843, + "time_per_iteration": 2.7457640171051025 + }, + { + "auxiliary_loss_clip": 0.01036449, + "auxiliary_loss_mlp": 0.00747605, + "balance_loss_clip": 1.02549958, + "balance_loss_mlp": 1.00039792, + "epoch": 0.9525928152713061, + "flos": 22086642556800.0, + "grad_norm": 2.177709256965526, + "language_loss": 0.78064108, + "learning_rate": 2.3500092837409612e-08, + "loss": 0.79848158, + "num_input_tokens_seen": 341724565, + "step": 15844, + "time_per_iteration": 2.6785483360290527 + }, + { + "auxiliary_loss_clip": 0.010265, + "auxiliary_loss_mlp": 0.01030073, + "balance_loss_clip": 1.02340269, + "balance_loss_mlp": 1.01765692, + "epoch": 0.9526529385239741, + "flos": 20704728562560.0, + "grad_norm": 2.006923958879668, + "language_loss": 0.70517504, + "learning_rate": 2.3440602487130977e-08, + "loss": 0.72574073, + "num_input_tokens_seen": 341743605, + "step": 15845, + "time_per_iteration": 2.7652199268341064 + }, + { + "auxiliary_loss_clip": 0.01034206, + "auxiliary_loss_mlp": 0.01029705, + "balance_loss_clip": 1.02501869, + "balance_loss_mlp": 1.01948905, + "epoch": 0.9527130617766422, + "flos": 23368043318400.0, + "grad_norm": 1.4922459386362514, + "language_loss": 0.75633478, + "learning_rate": 2.338118708818282e-08, + "loss": 0.77697384, + "num_input_tokens_seen": 341763475, + "step": 15846, + "time_per_iteration": 2.743049144744873 + }, + { + "auxiliary_loss_clip": 0.01034159, + "auxiliary_loss_mlp": 0.01022119, + "balance_loss_clip": 1.02582073, + "balance_loss_mlp": 1.01202166, + "epoch": 0.9527731850293101, + "flos": 18985935888000.0, + "grad_norm": 1.6142655086848745, + "language_loss": 0.78202009, + "learning_rate": 2.3321846642817998e-08, + "loss": 0.80258292, + "num_input_tokens_seen": 341781265, + "step": 15847, + "time_per_iteration": 2.7228665351867676 + }, + { + "auxiliary_loss_clip": 0.0102729, + "auxiliary_loss_mlp": 0.01033462, + "balance_loss_clip": 1.02443671, + "balance_loss_mlp": 1.02355015, + "epoch": 0.9528333082819781, + "flos": 19318038059520.0, + "grad_norm": 1.515581585126324, + "language_loss": 0.78158718, + "learning_rate": 2.326258115328672e-08, + "loss": 0.80219471, + "num_input_tokens_seen": 341798825, + "step": 15848, + "time_per_iteration": 4.33902907371521 + }, + { + "auxiliary_loss_clip": 0.01040618, + "auxiliary_loss_mlp": 0.01035241, + "balance_loss_clip": 1.02436924, + "balance_loss_mlp": 1.02401161, + "epoch": 0.952893431534646, + "flos": 23951340276480.0, + "grad_norm": 2.3002204711677052, + "language_loss": 0.72220492, + "learning_rate": 2.320339062183674e-08, + "loss": 0.74296355, + "num_input_tokens_seen": 341819480, + "step": 15849, + "time_per_iteration": 2.77815318107605 + }, + { + "auxiliary_loss_clip": 0.01057327, + "auxiliary_loss_mlp": 0.01034179, + "balance_loss_clip": 1.02695775, + "balance_loss_mlp": 1.0232352, + "epoch": 0.952953554787314, + "flos": 21030545854080.0, + "grad_norm": 1.4982163372849255, + "language_loss": 0.75231135, + "learning_rate": 2.314427505071226e-08, + "loss": 0.77322644, + "num_input_tokens_seen": 341838035, + "step": 15850, + "time_per_iteration": 4.246366024017334 + }, + { + "auxiliary_loss_clip": 0.01037154, + "auxiliary_loss_mlp": 0.01025491, + "balance_loss_clip": 1.02228928, + "balance_loss_mlp": 1.01530504, + "epoch": 0.9530136780399819, + "flos": 22382870019840.0, + "grad_norm": 2.244066676977746, + "language_loss": 0.72736543, + "learning_rate": 2.308523444215482e-08, + "loss": 0.74799192, + "num_input_tokens_seen": 341855895, + "step": 15851, + "time_per_iteration": 2.8123342990875244 + }, + { + "auxiliary_loss_clip": 0.01041027, + "auxiliary_loss_mlp": 0.0102312, + "balance_loss_clip": 1.0244472, + "balance_loss_mlp": 1.01341665, + "epoch": 0.95307380129265, + "flos": 22159613036160.0, + "grad_norm": 1.8371024502991777, + "language_loss": 0.79446256, + "learning_rate": 2.3026268798403525e-08, + "loss": 0.81510401, + "num_input_tokens_seen": 341875240, + "step": 15852, + "time_per_iteration": 2.867400884628296 + }, + { + "auxiliary_loss_clip": 0.01049604, + "auxiliary_loss_mlp": 0.01028863, + "balance_loss_clip": 1.0234319, + "balance_loss_mlp": 1.01814032, + "epoch": 0.9531339245453179, + "flos": 44022747214080.0, + "grad_norm": 1.4093353309311005, + "language_loss": 0.5995087, + "learning_rate": 2.2967378121694138e-08, + "loss": 0.62029338, + "num_input_tokens_seen": 341901020, + "step": 15853, + "time_per_iteration": 2.845102548599243 + }, + { + "auxiliary_loss_clip": 0.01039734, + "auxiliary_loss_mlp": 0.01027692, + "balance_loss_clip": 1.02402759, + "balance_loss_mlp": 1.01803005, + "epoch": 0.9531940477979859, + "flos": 20266690204800.0, + "grad_norm": 1.666223210776338, + "language_loss": 0.72547036, + "learning_rate": 2.290856241425998e-08, + "loss": 0.74614465, + "num_input_tokens_seen": 341919365, + "step": 15854, + "time_per_iteration": 2.6328439712524414 + }, + { + "auxiliary_loss_clip": 0.0104059, + "auxiliary_loss_mlp": 0.01028833, + "balance_loss_clip": 1.02294636, + "balance_loss_mlp": 1.01865816, + "epoch": 0.9532541710506538, + "flos": 25335732309120.0, + "grad_norm": 2.082383357216158, + "language_loss": 0.67234707, + "learning_rate": 2.284982167833127e-08, + "loss": 0.69304132, + "num_input_tokens_seen": 341939985, + "step": 15855, + "time_per_iteration": 2.686720132827759 + }, + { + "auxiliary_loss_clip": 0.01061043, + "auxiliary_loss_mlp": 0.01027485, + "balance_loss_clip": 1.02387798, + "balance_loss_mlp": 1.01731622, + "epoch": 0.9533142943033218, + "flos": 26469288691200.0, + "grad_norm": 2.4584026061780504, + "language_loss": 0.76602232, + "learning_rate": 2.279115591613556e-08, + "loss": 0.78690755, + "num_input_tokens_seen": 341959255, + "step": 15856, + "time_per_iteration": 2.6164021492004395 + }, + { + "auxiliary_loss_clip": 0.01030669, + "auxiliary_loss_mlp": 0.01028637, + "balance_loss_clip": 1.02018237, + "balance_loss_mlp": 1.01871896, + "epoch": 0.9533744175559897, + "flos": 23656944407040.0, + "grad_norm": 1.806115743616457, + "language_loss": 0.77719593, + "learning_rate": 2.2732565129897075e-08, + "loss": 0.79778898, + "num_input_tokens_seen": 341977205, + "step": 15857, + "time_per_iteration": 2.641324281692505 + }, + { + "auxiliary_loss_clip": 0.00997958, + "auxiliary_loss_mlp": 0.01000715, + "balance_loss_clip": 1.00259709, + "balance_loss_mlp": 0.99990445, + "epoch": 0.9534345408086577, + "flos": 61052055500160.0, + "grad_norm": 0.7113136641560329, + "language_loss": 0.62652767, + "learning_rate": 2.267404932183803e-08, + "loss": 0.64651442, + "num_input_tokens_seen": 342038545, + "step": 15858, + "time_per_iteration": 3.2478506565093994 + }, + { + "auxiliary_loss_clip": 0.0101273, + "auxiliary_loss_mlp": 0.01028934, + "balance_loss_clip": 1.02243733, + "balance_loss_mlp": 1.01895595, + "epoch": 0.9534946640613258, + "flos": 18951677291520.0, + "grad_norm": 1.5302674105361749, + "language_loss": 0.56790382, + "learning_rate": 2.2615608494177097e-08, + "loss": 0.58832043, + "num_input_tokens_seen": 342058195, + "step": 15859, + "time_per_iteration": 2.7164793014526367 + }, + { + "auxiliary_loss_clip": 0.01059107, + "auxiliary_loss_mlp": 0.01024832, + "balance_loss_clip": 1.02418089, + "balance_loss_mlp": 1.0157069, + "epoch": 0.9535547873139937, + "flos": 16654292340480.0, + "grad_norm": 2.242503595139865, + "language_loss": 0.81660938, + "learning_rate": 2.2557242649130504e-08, + "loss": 0.83744878, + "num_input_tokens_seen": 342075025, + "step": 15860, + "time_per_iteration": 2.8294990062713623 + }, + { + "auxiliary_loss_clip": 0.01018892, + "auxiliary_loss_mlp": 0.00747574, + "balance_loss_clip": 1.02191448, + "balance_loss_mlp": 1.00040889, + "epoch": 0.9536149105666617, + "flos": 20667776446080.0, + "grad_norm": 1.809248299735166, + "language_loss": 0.66949493, + "learning_rate": 2.249895178891159e-08, + "loss": 0.6871596, + "num_input_tokens_seen": 342094595, + "step": 15861, + "time_per_iteration": 2.731158971786499 + }, + { + "auxiliary_loss_clip": 0.01050944, + "auxiliary_loss_mlp": 0.01032088, + "balance_loss_clip": 1.02432823, + "balance_loss_mlp": 1.02131724, + "epoch": 0.9536750338193296, + "flos": 30700499086080.0, + "grad_norm": 1.6560873911676544, + "language_loss": 0.65636301, + "learning_rate": 2.244073591573037e-08, + "loss": 0.67719334, + "num_input_tokens_seen": 342115970, + "step": 15862, + "time_per_iteration": 2.660900831222534 + }, + { + "auxiliary_loss_clip": 0.01021031, + "auxiliary_loss_mlp": 0.01026621, + "balance_loss_clip": 1.02216947, + "balance_loss_mlp": 1.01691186, + "epoch": 0.9537351570719976, + "flos": 20405484357120.0, + "grad_norm": 1.8735435982261834, + "language_loss": 0.67657423, + "learning_rate": 2.238259503179485e-08, + "loss": 0.69705075, + "num_input_tokens_seen": 342134080, + "step": 15863, + "time_per_iteration": 2.6947309970855713 + }, + { + "auxiliary_loss_clip": 0.01038193, + "auxiliary_loss_mlp": 0.01023108, + "balance_loss_clip": 1.02239549, + "balance_loss_mlp": 1.01280189, + "epoch": 0.9537952803246655, + "flos": 29929245235200.0, + "grad_norm": 2.120717158967481, + "language_loss": 0.78361273, + "learning_rate": 2.2324529139309267e-08, + "loss": 0.80422574, + "num_input_tokens_seen": 342154725, + "step": 15864, + "time_per_iteration": 2.7865827083587646 + }, + { + "auxiliary_loss_clip": 0.01031295, + "auxiliary_loss_mlp": 0.0102653, + "balance_loss_clip": 1.02463388, + "balance_loss_mlp": 1.01698184, + "epoch": 0.9538554035773336, + "flos": 20521404524160.0, + "grad_norm": 2.991753312232448, + "language_loss": 0.59701955, + "learning_rate": 2.226653824047586e-08, + "loss": 0.61759782, + "num_input_tokens_seen": 342172275, + "step": 15865, + "time_per_iteration": 2.7606377601623535 + }, + { + "auxiliary_loss_clip": 0.01026559, + "auxiliary_loss_mlp": 0.01033178, + "balance_loss_clip": 1.02392852, + "balance_loss_mlp": 1.0213939, + "epoch": 0.9539155268300015, + "flos": 18406517598720.0, + "grad_norm": 1.8513703788763816, + "language_loss": 0.69818664, + "learning_rate": 2.2208622337493765e-08, + "loss": 0.71878397, + "num_input_tokens_seen": 342190880, + "step": 15866, + "time_per_iteration": 2.9776406288146973 + }, + { + "auxiliary_loss_clip": 0.01035449, + "auxiliary_loss_mlp": 0.01032755, + "balance_loss_clip": 1.02175438, + "balance_loss_mlp": 1.02084565, + "epoch": 0.9539756500826695, + "flos": 26213281482240.0, + "grad_norm": 2.5605310450528753, + "language_loss": 0.84995055, + "learning_rate": 2.215078143255855e-08, + "loss": 0.87063265, + "num_input_tokens_seen": 342208165, + "step": 15867, + "time_per_iteration": 2.729125738143921 + }, + { + "auxiliary_loss_clip": 0.00999802, + "auxiliary_loss_mlp": 0.01002581, + "balance_loss_clip": 1.0042696, + "balance_loss_mlp": 1.00154996, + "epoch": 0.9540357733353374, + "flos": 68289097766400.0, + "grad_norm": 0.7672738646266872, + "language_loss": 0.61851245, + "learning_rate": 2.2093015527864024e-08, + "loss": 0.63853627, + "num_input_tokens_seen": 342277110, + "step": 15868, + "time_per_iteration": 3.3997559547424316 + }, + { + "auxiliary_loss_clip": 0.01033903, + "auxiliary_loss_mlp": 0.01024256, + "balance_loss_clip": 1.02634358, + "balance_loss_mlp": 1.01421273, + "epoch": 0.9540958965880054, + "flos": 21288276915840.0, + "grad_norm": 1.878903463897115, + "language_loss": 0.59831417, + "learning_rate": 2.2035324625600425e-08, + "loss": 0.61889577, + "num_input_tokens_seen": 342294695, + "step": 15869, + "time_per_iteration": 4.468047857284546 + }, + { + "auxiliary_loss_clip": 0.0101857, + "auxiliary_loss_mlp": 0.00747693, + "balance_loss_clip": 1.02106595, + "balance_loss_mlp": 1.00039959, + "epoch": 0.9541560198406733, + "flos": 19751407649280.0, + "grad_norm": 1.9903388903230173, + "language_loss": 0.71321261, + "learning_rate": 2.197770872795579e-08, + "loss": 0.73087525, + "num_input_tokens_seen": 342314970, + "step": 15870, + "time_per_iteration": 2.822650909423828 + }, + { + "auxiliary_loss_clip": 0.01029207, + "auxiliary_loss_mlp": 0.01026365, + "balance_loss_clip": 1.02374625, + "balance_loss_mlp": 1.01624393, + "epoch": 0.9542161430933414, + "flos": 24715626888960.0, + "grad_norm": 2.1697708764376773, + "language_loss": 0.76753789, + "learning_rate": 2.1920167837114368e-08, + "loss": 0.78809363, + "num_input_tokens_seen": 342334255, + "step": 15871, + "time_per_iteration": 2.7775142192840576 + }, + { + "auxiliary_loss_clip": 0.01047568, + "auxiliary_loss_mlp": 0.01026782, + "balance_loss_clip": 1.02305365, + "balance_loss_mlp": 1.01583815, + "epoch": 0.9542762663460094, + "flos": 31065818359680.0, + "grad_norm": 1.9459128078132606, + "language_loss": 0.58318341, + "learning_rate": 2.1862701955258634e-08, + "loss": 0.6039269, + "num_input_tokens_seen": 342354730, + "step": 15872, + "time_per_iteration": 2.7068448066711426 + }, + { + "auxiliary_loss_clip": 0.01038243, + "auxiliary_loss_mlp": 0.01028765, + "balance_loss_clip": 1.02379954, + "balance_loss_mlp": 1.01684403, + "epoch": 0.9543363895986773, + "flos": 20776729374720.0, + "grad_norm": 1.612654327302576, + "language_loss": 0.74653625, + "learning_rate": 2.1805311084567514e-08, + "loss": 0.76720631, + "num_input_tokens_seen": 342374565, + "step": 15873, + "time_per_iteration": 2.6382546424865723 + }, + { + "auxiliary_loss_clip": 0.0106348, + "auxiliary_loss_mlp": 0.01032936, + "balance_loss_clip": 1.02610707, + "balance_loss_mlp": 1.02168226, + "epoch": 0.9543965128513453, + "flos": 24462744163200.0, + "grad_norm": 1.8606579118875282, + "language_loss": 0.62387067, + "learning_rate": 2.1747995227217265e-08, + "loss": 0.64483488, + "num_input_tokens_seen": 342394590, + "step": 15874, + "time_per_iteration": 2.6783716678619385 + }, + { + "auxiliary_loss_clip": 0.0103897, + "auxiliary_loss_mlp": 0.01032151, + "balance_loss_clip": 1.02341759, + "balance_loss_mlp": 1.0219171, + "epoch": 0.9544566361040132, + "flos": 15261532439040.0, + "grad_norm": 1.9045418279192146, + "language_loss": 0.89455104, + "learning_rate": 2.169075438538104e-08, + "loss": 0.91526222, + "num_input_tokens_seen": 342410445, + "step": 15875, + "time_per_iteration": 2.7183430194854736 + }, + { + "auxiliary_loss_clip": 0.01066, + "auxiliary_loss_mlp": 0.01031349, + "balance_loss_clip": 1.0264082, + "balance_loss_mlp": 1.01983333, + "epoch": 0.9545167593566812, + "flos": 25918777872000.0, + "grad_norm": 1.5370757807890545, + "language_loss": 0.67893732, + "learning_rate": 2.1633588561229765e-08, + "loss": 0.69991088, + "num_input_tokens_seen": 342430970, + "step": 15876, + "time_per_iteration": 2.6218817234039307 + }, + { + "auxiliary_loss_clip": 0.01051847, + "auxiliary_loss_mlp": 0.01031003, + "balance_loss_clip": 1.02357662, + "balance_loss_mlp": 1.01975548, + "epoch": 0.9545768826093491, + "flos": 25628188844160.0, + "grad_norm": 1.9107833556577294, + "language_loss": 0.6922732, + "learning_rate": 2.1576497756931267e-08, + "loss": 0.71310174, + "num_input_tokens_seen": 342449505, + "step": 15877, + "time_per_iteration": 2.6972239017486572 + }, + { + "auxiliary_loss_clip": 0.01024712, + "auxiliary_loss_mlp": 0.01029359, + "balance_loss_clip": 1.02288508, + "balance_loss_mlp": 1.01803374, + "epoch": 0.9546370058620172, + "flos": 22491499726080.0, + "grad_norm": 2.240496336311471, + "language_loss": 0.70850903, + "learning_rate": 2.1519481974650035e-08, + "loss": 0.72904968, + "num_input_tokens_seen": 342470390, + "step": 15878, + "time_per_iteration": 2.728855848312378 + }, + { + "auxiliary_loss_clip": 0.01059212, + "auxiliary_loss_mlp": 0.01023916, + "balance_loss_clip": 1.02348351, + "balance_loss_mlp": 1.01392639, + "epoch": 0.9546971291146851, + "flos": 24609582961920.0, + "grad_norm": 1.4182945318132352, + "language_loss": 0.68115842, + "learning_rate": 2.1462541216548335e-08, + "loss": 0.70198971, + "num_input_tokens_seen": 342492560, + "step": 15879, + "time_per_iteration": 2.6415436267852783 + }, + { + "auxiliary_loss_clip": 0.01026236, + "auxiliary_loss_mlp": 0.007475, + "balance_loss_clip": 1.02222276, + "balance_loss_mlp": 1.00031352, + "epoch": 0.9547572523673531, + "flos": 28657756627200.0, + "grad_norm": 2.0085766850729683, + "language_loss": 0.85321367, + "learning_rate": 2.1405675484785334e-08, + "loss": 0.87095094, + "num_input_tokens_seen": 342512315, + "step": 15880, + "time_per_iteration": 2.779374837875366 + }, + { + "auxiliary_loss_clip": 0.01003772, + "auxiliary_loss_mlp": 0.01031985, + "balance_loss_clip": 1.02343643, + "balance_loss_mlp": 1.0209043, + "epoch": 0.954817375620021, + "flos": 33802606385280.0, + "grad_norm": 1.8440138702835582, + "language_loss": 0.71792459, + "learning_rate": 2.134888478151753e-08, + "loss": 0.7382822, + "num_input_tokens_seen": 342533060, + "step": 15881, + "time_per_iteration": 2.8654074668884277 + }, + { + "auxiliary_loss_clip": 0.01052203, + "auxiliary_loss_mlp": 0.0103144, + "balance_loss_clip": 1.0260309, + "balance_loss_mlp": 1.02056265, + "epoch": 0.954877498872689, + "flos": 14428225843200.0, + "grad_norm": 1.8464114908635654, + "language_loss": 0.71256316, + "learning_rate": 2.1292169108898083e-08, + "loss": 0.73339957, + "num_input_tokens_seen": 342550830, + "step": 15882, + "time_per_iteration": 2.770157814025879 + }, + { + "auxiliary_loss_clip": 0.01040692, + "auxiliary_loss_mlp": 0.01030092, + "balance_loss_clip": 1.0240593, + "balance_loss_mlp": 1.02023292, + "epoch": 0.9549376221253569, + "flos": 59269447336320.0, + "grad_norm": 1.9888864818774623, + "language_loss": 0.6602056, + "learning_rate": 2.1235528469078168e-08, + "loss": 0.68091345, + "num_input_tokens_seen": 342575070, + "step": 15883, + "time_per_iteration": 3.0426714420318604 + }, + { + "auxiliary_loss_clip": 0.01054419, + "auxiliary_loss_mlp": 0.01029516, + "balance_loss_clip": 1.02623558, + "balance_loss_mlp": 1.018525, + "epoch": 0.954997745378025, + "flos": 17274397760640.0, + "grad_norm": 2.1141314563496763, + "language_loss": 0.77810401, + "learning_rate": 2.1178962864205175e-08, + "loss": 0.79894334, + "num_input_tokens_seen": 342592215, + "step": 15884, + "time_per_iteration": 2.6828572750091553 + }, + { + "auxiliary_loss_clip": 0.01062903, + "auxiliary_loss_mlp": 0.01026384, + "balance_loss_clip": 1.024827, + "balance_loss_mlp": 1.01566064, + "epoch": 0.955057868630693, + "flos": 13006378903680.0, + "grad_norm": 1.6674303590701434, + "language_loss": 0.7777698, + "learning_rate": 2.1122472296424054e-08, + "loss": 0.79866266, + "num_input_tokens_seen": 342610030, + "step": 15885, + "time_per_iteration": 4.516565322875977 + }, + { + "auxiliary_loss_clip": 0.01062006, + "auxiliary_loss_mlp": 0.01030346, + "balance_loss_clip": 1.02419138, + "balance_loss_mlp": 1.02038002, + "epoch": 0.9551179918833609, + "flos": 22637692080000.0, + "grad_norm": 1.7127988867720985, + "language_loss": 0.70176297, + "learning_rate": 2.1066056767877317e-08, + "loss": 0.72268653, + "num_input_tokens_seen": 342626475, + "step": 15886, + "time_per_iteration": 2.606358051300049 + }, + { + "auxiliary_loss_clip": 0.01035816, + "auxiliary_loss_mlp": 0.01031355, + "balance_loss_clip": 1.02566278, + "balance_loss_mlp": 1.01922512, + "epoch": 0.9551781151360289, + "flos": 21542811667200.0, + "grad_norm": 1.618940865185337, + "language_loss": 0.72550571, + "learning_rate": 2.1009716280703916e-08, + "loss": 0.74617738, + "num_input_tokens_seen": 342646645, + "step": 15887, + "time_per_iteration": 2.742866277694702 + }, + { + "auxiliary_loss_clip": 0.01036767, + "auxiliary_loss_mlp": 0.01028443, + "balance_loss_clip": 1.02208459, + "balance_loss_mlp": 1.01888287, + "epoch": 0.9552382383886968, + "flos": 20702250524160.0, + "grad_norm": 2.1042456455527714, + "language_loss": 0.56807446, + "learning_rate": 2.0953450837040364e-08, + "loss": 0.58872652, + "num_input_tokens_seen": 342663615, + "step": 15888, + "time_per_iteration": 2.7181527614593506 + }, + { + "auxiliary_loss_clip": 0.00996775, + "auxiliary_loss_mlp": 0.01000729, + "balance_loss_clip": 1.00139594, + "balance_loss_mlp": 0.99995387, + "epoch": 0.9552983616413648, + "flos": 67769792887680.0, + "grad_norm": 0.711715130460261, + "language_loss": 0.57878542, + "learning_rate": 2.0897260439020514e-08, + "loss": 0.59876049, + "num_input_tokens_seen": 342728275, + "step": 15889, + "time_per_iteration": 3.3519790172576904 + }, + { + "auxiliary_loss_clip": 0.01061323, + "auxiliary_loss_mlp": 0.0102582, + "balance_loss_clip": 1.02332044, + "balance_loss_mlp": 1.01516259, + "epoch": 0.9553584848940327, + "flos": 21579979265280.0, + "grad_norm": 1.7429706175986133, + "language_loss": 0.6716516, + "learning_rate": 2.084114508877466e-08, + "loss": 0.692523, + "num_input_tokens_seen": 342748860, + "step": 15890, + "time_per_iteration": 2.6201517581939697 + }, + { + "auxiliary_loss_clip": 0.01062483, + "auxiliary_loss_mlp": 0.01029889, + "balance_loss_clip": 1.0258956, + "balance_loss_mlp": 1.01989329, + "epoch": 0.9554186081467008, + "flos": 24208173498240.0, + "grad_norm": 1.958905428794899, + "language_loss": 0.73879325, + "learning_rate": 2.0785104788430874e-08, + "loss": 0.75971699, + "num_input_tokens_seen": 342769705, + "step": 15891, + "time_per_iteration": 2.774785041809082 + }, + { + "auxiliary_loss_clip": 0.01027485, + "auxiliary_loss_mlp": 0.01027007, + "balance_loss_clip": 1.02361107, + "balance_loss_mlp": 1.0180192, + "epoch": 0.9554787313993687, + "flos": 16251554073600.0, + "grad_norm": 1.832397693466897, + "language_loss": 0.77955061, + "learning_rate": 2.072913954011435e-08, + "loss": 0.80009556, + "num_input_tokens_seen": 342787000, + "step": 15892, + "time_per_iteration": 2.7525174617767334 + }, + { + "auxiliary_loss_clip": 0.01060632, + "auxiliary_loss_mlp": 0.01030593, + "balance_loss_clip": 1.02406991, + "balance_loss_mlp": 1.01981628, + "epoch": 0.9555388546520367, + "flos": 23404133508480.0, + "grad_norm": 1.4570846258976244, + "language_loss": 0.69772673, + "learning_rate": 2.0673249345947386e-08, + "loss": 0.71863896, + "num_input_tokens_seen": 342807795, + "step": 15893, + "time_per_iteration": 2.608405828475952 + }, + { + "auxiliary_loss_clip": 0.01045024, + "auxiliary_loss_mlp": 0.00747549, + "balance_loss_clip": 1.02773833, + "balance_loss_mlp": 1.00033522, + "epoch": 0.9555989779047046, + "flos": 14794047907200.0, + "grad_norm": 2.6761152054882618, + "language_loss": 0.65453285, + "learning_rate": 2.0617434208048955e-08, + "loss": 0.67245853, + "num_input_tokens_seen": 342825490, + "step": 15894, + "time_per_iteration": 2.778120994567871 + }, + { + "auxiliary_loss_clip": 0.01052904, + "auxiliary_loss_mlp": 0.01030474, + "balance_loss_clip": 1.02502322, + "balance_loss_mlp": 1.01954246, + "epoch": 0.9556591011573726, + "flos": 22236749493120.0, + "grad_norm": 1.8815761330451122, + "language_loss": 0.82031333, + "learning_rate": 2.056169412853581e-08, + "loss": 0.84114707, + "num_input_tokens_seen": 342844965, + "step": 15895, + "time_per_iteration": 4.304355144500732 + }, + { + "auxiliary_loss_clip": 0.01042765, + "auxiliary_loss_mlp": 0.01030142, + "balance_loss_clip": 1.02527404, + "balance_loss_mlp": 1.01960957, + "epoch": 0.9557192244100405, + "flos": 27855296835840.0, + "grad_norm": 1.4407553901537167, + "language_loss": 0.71946937, + "learning_rate": 2.0506029109521593e-08, + "loss": 0.74019843, + "num_input_tokens_seen": 342865915, + "step": 15896, + "time_per_iteration": 4.5468590259552 + }, + { + "auxiliary_loss_clip": 0.01060175, + "auxiliary_loss_mlp": 0.01030034, + "balance_loss_clip": 1.02405429, + "balance_loss_mlp": 1.01932311, + "epoch": 0.9557793476627086, + "flos": 17602800831360.0, + "grad_norm": 1.907801762644475, + "language_loss": 0.79093975, + "learning_rate": 2.045043915311706e-08, + "loss": 0.81184185, + "num_input_tokens_seen": 342884000, + "step": 15897, + "time_per_iteration": 2.5818498134613037 + }, + { + "auxiliary_loss_clip": 0.01034634, + "auxiliary_loss_mlp": 0.01028056, + "balance_loss_clip": 1.02052474, + "balance_loss_mlp": 1.01705337, + "epoch": 0.9558394709153766, + "flos": 23875496709120.0, + "grad_norm": 1.6919099954159817, + "language_loss": 0.72937119, + "learning_rate": 2.03949242614303e-08, + "loss": 0.74999809, + "num_input_tokens_seen": 342903095, + "step": 15898, + "time_per_iteration": 2.6108250617980957 + }, + { + "auxiliary_loss_clip": 0.00981189, + "auxiliary_loss_mlp": 0.01007221, + "balance_loss_clip": 1.00497043, + "balance_loss_mlp": 1.00641656, + "epoch": 0.9558995941680445, + "flos": 53682001171200.0, + "grad_norm": 0.8819092485266644, + "language_loss": 0.52322447, + "learning_rate": 2.033948443656652e-08, + "loss": 0.54310858, + "num_input_tokens_seen": 342958155, + "step": 15899, + "time_per_iteration": 3.2560200691223145 + }, + { + "auxiliary_loss_clip": 0.01054411, + "auxiliary_loss_mlp": 0.0103162, + "balance_loss_clip": 1.02484822, + "balance_loss_mlp": 1.01992548, + "epoch": 0.9559597174207125, + "flos": 13764488376960.0, + "grad_norm": 2.6682615430186054, + "language_loss": 0.68907779, + "learning_rate": 2.028411968062782e-08, + "loss": 0.70993805, + "num_input_tokens_seen": 342972500, + "step": 15900, + "time_per_iteration": 2.7950503826141357 + }, + { + "auxiliary_loss_clip": 0.01051336, + "auxiliary_loss_mlp": 0.0074769, + "balance_loss_clip": 1.02389741, + "balance_loss_mlp": 1.00046802, + "epoch": 0.9560198406733804, + "flos": 19936347799680.0, + "grad_norm": 6.891197015507298, + "language_loss": 0.83473945, + "learning_rate": 2.0228829995713627e-08, + "loss": 0.85272968, + "num_input_tokens_seen": 342989035, + "step": 15901, + "time_per_iteration": 2.9820258617401123 + }, + { + "auxiliary_loss_clip": 0.00978775, + "auxiliary_loss_mlp": 0.01002074, + "balance_loss_clip": 1.00267959, + "balance_loss_mlp": 1.00109625, + "epoch": 0.9560799639260484, + "flos": 57289550699520.0, + "grad_norm": 0.740878377953256, + "language_loss": 0.54315174, + "learning_rate": 2.0173615383920485e-08, + "loss": 0.56296021, + "num_input_tokens_seen": 343051675, + "step": 15902, + "time_per_iteration": 3.456669330596924 + }, + { + "auxiliary_loss_clip": 0.0104226, + "auxiliary_loss_mlp": 0.01026296, + "balance_loss_clip": 1.02713072, + "balance_loss_mlp": 1.01752794, + "epoch": 0.9561400871787163, + "flos": 18917167299840.0, + "grad_norm": 1.5791213162030169, + "language_loss": 0.85645592, + "learning_rate": 2.01184758473425e-08, + "loss": 0.87714148, + "num_input_tokens_seen": 343068895, + "step": 15903, + "time_per_iteration": 2.839792013168335 + }, + { + "auxiliary_loss_clip": 0.01034028, + "auxiliary_loss_mlp": 0.00747567, + "balance_loss_clip": 1.02259791, + "balance_loss_mlp": 1.00040913, + "epoch": 0.9562002104313844, + "flos": 18038576632320.0, + "grad_norm": 3.454223790343244, + "language_loss": 0.80390006, + "learning_rate": 2.0063411388070217e-08, + "loss": 0.82171595, + "num_input_tokens_seen": 343087115, + "step": 15904, + "time_per_iteration": 2.739440441131592 + }, + { + "auxiliary_loss_clip": 0.01048712, + "auxiliary_loss_mlp": 0.01027843, + "balance_loss_clip": 1.02298856, + "balance_loss_mlp": 1.01697159, + "epoch": 0.9562603336840523, + "flos": 24717673964160.0, + "grad_norm": 2.0681204907758355, + "language_loss": 0.59942782, + "learning_rate": 2.0008422008191972e-08, + "loss": 0.62019336, + "num_input_tokens_seen": 343105575, + "step": 15905, + "time_per_iteration": 2.6698367595672607 + }, + { + "auxiliary_loss_clip": 0.01051039, + "auxiliary_loss_mlp": 0.01026114, + "balance_loss_clip": 1.0243752, + "balance_loss_mlp": 1.01574862, + "epoch": 0.9563204569367203, + "flos": 21177205084800.0, + "grad_norm": 1.877025139185009, + "language_loss": 0.70844209, + "learning_rate": 1.995350770979254e-08, + "loss": 0.7292136, + "num_input_tokens_seen": 343123025, + "step": 15906, + "time_per_iteration": 2.713587999343872 + }, + { + "auxiliary_loss_clip": 0.01008488, + "auxiliary_loss_mlp": 0.01029306, + "balance_loss_clip": 1.02240443, + "balance_loss_mlp": 1.0182668, + "epoch": 0.9563805801893882, + "flos": 20229738088320.0, + "grad_norm": 1.6676427241092278, + "language_loss": 0.71027195, + "learning_rate": 1.9898668494954473e-08, + "loss": 0.73064995, + "num_input_tokens_seen": 343141625, + "step": 15907, + "time_per_iteration": 2.693669319152832 + }, + { + "auxiliary_loss_clip": 0.01019364, + "auxiliary_loss_mlp": 0.01027189, + "balance_loss_clip": 1.02254498, + "balance_loss_mlp": 1.01693654, + "epoch": 0.9564407034420562, + "flos": 25411001258880.0, + "grad_norm": 1.9081599395988083, + "language_loss": 0.69978106, + "learning_rate": 1.9843904365757447e-08, + "loss": 0.72024655, + "num_input_tokens_seen": 343161300, + "step": 15908, + "time_per_iteration": 2.7733352184295654 + }, + { + "auxiliary_loss_clip": 0.01041825, + "auxiliary_loss_mlp": 0.00747619, + "balance_loss_clip": 1.02553248, + "balance_loss_mlp": 1.00037611, + "epoch": 0.9565008266947241, + "flos": 18623884752000.0, + "grad_norm": 1.9434498577369912, + "language_loss": 0.8300494, + "learning_rate": 1.978921532427802e-08, + "loss": 0.84794384, + "num_input_tokens_seen": 343177815, + "step": 15909, + "time_per_iteration": 2.7863545417785645 + }, + { + "auxiliary_loss_clip": 0.01051802, + "auxiliary_loss_mlp": 0.01028572, + "balance_loss_clip": 1.02435589, + "balance_loss_mlp": 1.01861191, + "epoch": 0.9565609499473922, + "flos": 24862142465280.0, + "grad_norm": 1.9028633948091318, + "language_loss": 0.6740188, + "learning_rate": 1.9734601372590086e-08, + "loss": 0.69482249, + "num_input_tokens_seen": 343198140, + "step": 15910, + "time_per_iteration": 2.634542465209961 + }, + { + "auxiliary_loss_clip": 0.01053787, + "auxiliary_loss_mlp": 0.01031491, + "balance_loss_clip": 1.02553165, + "balance_loss_mlp": 1.02103019, + "epoch": 0.9566210732000601, + "flos": 21798459740160.0, + "grad_norm": 1.7983948182168803, + "language_loss": 0.74407554, + "learning_rate": 1.968006251276444e-08, + "loss": 0.76492834, + "num_input_tokens_seen": 343218280, + "step": 15911, + "time_per_iteration": 2.5903947353363037 + }, + { + "auxiliary_loss_clip": 0.01052234, + "auxiliary_loss_mlp": 0.01025887, + "balance_loss_clip": 1.02429867, + "balance_loss_mlp": 1.01585007, + "epoch": 0.9566811964527281, + "flos": 18697609416960.0, + "grad_norm": 2.2694939350749523, + "language_loss": 0.69383466, + "learning_rate": 1.9625598746869198e-08, + "loss": 0.71461582, + "num_input_tokens_seen": 343236850, + "step": 15912, + "time_per_iteration": 2.666901111602783 + }, + { + "auxiliary_loss_clip": 0.01036086, + "auxiliary_loss_mlp": 0.01033521, + "balance_loss_clip": 1.02159882, + "balance_loss_mlp": 1.02271461, + "epoch": 0.9567413197053961, + "flos": 13000632727680.0, + "grad_norm": 2.3907736721157313, + "language_loss": 0.71989024, + "learning_rate": 1.95712100769696e-08, + "loss": 0.74058628, + "num_input_tokens_seen": 343253065, + "step": 15913, + "time_per_iteration": 2.711228609085083 + }, + { + "auxiliary_loss_clip": 0.00994273, + "auxiliary_loss_mlp": 0.01026775, + "balance_loss_clip": 1.02428401, + "balance_loss_mlp": 1.0172503, + "epoch": 0.956801442958064, + "flos": 19719267955200.0, + "grad_norm": 2.1762446863316938, + "language_loss": 0.73430169, + "learning_rate": 1.9516896505128444e-08, + "loss": 0.75451219, + "num_input_tokens_seen": 343270330, + "step": 15914, + "time_per_iteration": 2.8561851978302 + }, + { + "auxiliary_loss_clip": 0.0106142, + "auxiliary_loss_mlp": 0.01026526, + "balance_loss_clip": 1.02525806, + "balance_loss_mlp": 1.016083, + "epoch": 0.956861566210732, + "flos": 18222834424320.0, + "grad_norm": 2.982626398118441, + "language_loss": 0.67335159, + "learning_rate": 1.9462658033404965e-08, + "loss": 0.69423103, + "num_input_tokens_seen": 343289625, + "step": 15915, + "time_per_iteration": 2.586690902709961 + }, + { + "auxiliary_loss_clip": 0.01050493, + "auxiliary_loss_mlp": 0.01023877, + "balance_loss_clip": 1.02467918, + "balance_loss_mlp": 1.01397085, + "epoch": 0.9569216894634, + "flos": 22196960202240.0, + "grad_norm": 1.7089515932970625, + "language_loss": 0.64147878, + "learning_rate": 1.9408494663855967e-08, + "loss": 0.6622225, + "num_input_tokens_seen": 343309200, + "step": 15916, + "time_per_iteration": 2.6029677391052246 + }, + { + "auxiliary_loss_clip": 0.01056395, + "auxiliary_loss_mlp": 0.01025718, + "balance_loss_clip": 1.02267528, + "balance_loss_mlp": 1.01615763, + "epoch": 0.956981812716068, + "flos": 21689291329920.0, + "grad_norm": 1.8392443441788848, + "language_loss": 0.80829006, + "learning_rate": 1.935440639853536e-08, + "loss": 0.82911122, + "num_input_tokens_seen": 343326270, + "step": 15917, + "time_per_iteration": 4.334550857543945 + }, + { + "auxiliary_loss_clip": 0.01032701, + "auxiliary_loss_mlp": 0.0103071, + "balance_loss_clip": 1.02296352, + "balance_loss_mlp": 1.01992774, + "epoch": 0.9570419359687359, + "flos": 13990905757440.0, + "grad_norm": 2.5181120296285795, + "language_loss": 0.72944438, + "learning_rate": 1.9300393239494172e-08, + "loss": 0.75007844, + "num_input_tokens_seen": 343344430, + "step": 15918, + "time_per_iteration": 2.607247829437256 + }, + { + "auxiliary_loss_clip": 0.00980149, + "auxiliary_loss_mlp": 0.01001562, + "balance_loss_clip": 1.0040431, + "balance_loss_mlp": 1.00073934, + "epoch": 0.9571020592214039, + "flos": 65196938534400.0, + "grad_norm": 0.6347052903368032, + "language_loss": 0.53132319, + "learning_rate": 1.924645518878032e-08, + "loss": 0.55114031, + "num_input_tokens_seen": 343416155, + "step": 15919, + "time_per_iteration": 3.351120710372925 + }, + { + "auxiliary_loss_clip": 0.01057392, + "auxiliary_loss_mlp": 0.01032623, + "balance_loss_clip": 1.027807, + "balance_loss_mlp": 1.02143526, + "epoch": 0.9571621824740718, + "flos": 17384068961280.0, + "grad_norm": 2.977913734360458, + "language_loss": 0.75395817, + "learning_rate": 1.919259224843972e-08, + "loss": 0.77485836, + "num_input_tokens_seen": 343431715, + "step": 15920, + "time_per_iteration": 2.6366028785705566 + }, + { + "auxiliary_loss_clip": 0.01037379, + "auxiliary_loss_mlp": 0.0103154, + "balance_loss_clip": 1.02703512, + "balance_loss_mlp": 1.02010822, + "epoch": 0.9572223057267398, + "flos": 14538184352640.0, + "grad_norm": 2.1884886089332505, + "language_loss": 0.79208332, + "learning_rate": 1.9138804420514298e-08, + "loss": 0.81277251, + "num_input_tokens_seen": 343450425, + "step": 15921, + "time_per_iteration": 2.6981418132781982 + }, + { + "auxiliary_loss_clip": 0.01053431, + "auxiliary_loss_mlp": 0.01027624, + "balance_loss_clip": 1.02328181, + "balance_loss_mlp": 1.01630473, + "epoch": 0.9572824289794077, + "flos": 33947793158400.0, + "grad_norm": 2.2278337741948224, + "language_loss": 0.51127702, + "learning_rate": 1.9085091707044197e-08, + "loss": 0.53208756, + "num_input_tokens_seen": 343470445, + "step": 15922, + "time_per_iteration": 2.838737726211548 + }, + { + "auxiliary_loss_clip": 0.01008581, + "auxiliary_loss_mlp": 0.01033047, + "balance_loss_clip": 1.01984906, + "balance_loss_mlp": 1.02085221, + "epoch": 0.9573425522320758, + "flos": 18694915896960.0, + "grad_norm": 2.009605122779364, + "language_loss": 0.83747333, + "learning_rate": 1.903145411006557e-08, + "loss": 0.85788965, + "num_input_tokens_seen": 343485200, + "step": 15923, + "time_per_iteration": 2.7996103763580322 + }, + { + "auxiliary_loss_clip": 0.01034239, + "auxiliary_loss_mlp": 0.01028059, + "balance_loss_clip": 1.0217104, + "balance_loss_mlp": 1.01846266, + "epoch": 0.9574026754847437, + "flos": 28510307297280.0, + "grad_norm": 1.5852074394498785, + "language_loss": 0.75320536, + "learning_rate": 1.8977891631613008e-08, + "loss": 0.77382839, + "num_input_tokens_seen": 343505080, + "step": 15924, + "time_per_iteration": 2.784579277038574 + }, + { + "auxiliary_loss_clip": 0.01041551, + "auxiliary_loss_mlp": 0.01029126, + "balance_loss_clip": 1.02389085, + "balance_loss_mlp": 1.01856983, + "epoch": 0.9574627987374117, + "flos": 24352390604160.0, + "grad_norm": 2.137082134837963, + "language_loss": 0.86036527, + "learning_rate": 1.892440427371711e-08, + "loss": 0.88107204, + "num_input_tokens_seen": 343523995, + "step": 15925, + "time_per_iteration": 2.840867280960083 + }, + { + "auxiliary_loss_clip": 0.01032672, + "auxiliary_loss_mlp": 0.01027302, + "balance_loss_clip": 1.02558064, + "balance_loss_mlp": 1.01628113, + "epoch": 0.9575229219900797, + "flos": 23510680225920.0, + "grad_norm": 1.7272693014699143, + "language_loss": 0.75207853, + "learning_rate": 1.8870992038406474e-08, + "loss": 0.7726782, + "num_input_tokens_seen": 343542015, + "step": 15926, + "time_per_iteration": 2.8057658672332764 + }, + { + "auxiliary_loss_clip": 0.01043165, + "auxiliary_loss_mlp": 0.01025096, + "balance_loss_clip": 1.02603269, + "balance_loss_mlp": 1.01566672, + "epoch": 0.9575830452427476, + "flos": 22674823764480.0, + "grad_norm": 1.6481214675349474, + "language_loss": 0.77795887, + "learning_rate": 1.8817654927706373e-08, + "loss": 0.79864144, + "num_input_tokens_seen": 343561680, + "step": 15927, + "time_per_iteration": 2.7393343448638916 + }, + { + "auxiliary_loss_clip": 0.01020568, + "auxiliary_loss_mlp": 0.01032221, + "balance_loss_clip": 1.0239507, + "balance_loss_mlp": 1.02046144, + "epoch": 0.9576431684954156, + "flos": 30485250835200.0, + "grad_norm": 1.6968300848933418, + "language_loss": 0.68462992, + "learning_rate": 1.8764392943639183e-08, + "loss": 0.70515782, + "num_input_tokens_seen": 343585290, + "step": 15928, + "time_per_iteration": 2.8612523078918457 + }, + { + "auxiliary_loss_clip": 0.01043983, + "auxiliary_loss_mlp": 0.01028484, + "balance_loss_clip": 1.02642632, + "balance_loss_mlp": 1.0178442, + "epoch": 0.9577032917480836, + "flos": 21687387909120.0, + "grad_norm": 2.2368704027119906, + "language_loss": 0.82003307, + "learning_rate": 1.871120608822485e-08, + "loss": 0.84075773, + "num_input_tokens_seen": 343604045, + "step": 15929, + "time_per_iteration": 2.640963077545166 + }, + { + "auxiliary_loss_clip": 0.01028249, + "auxiliary_loss_mlp": 0.01038765, + "balance_loss_clip": 1.0235666, + "balance_loss_mlp": 1.02707624, + "epoch": 0.9577634150007516, + "flos": 29023147728000.0, + "grad_norm": 1.6957361560936053, + "language_loss": 0.72394377, + "learning_rate": 1.8658094363480202e-08, + "loss": 0.74461389, + "num_input_tokens_seen": 343626595, + "step": 15930, + "time_per_iteration": 2.807727575302124 + }, + { + "auxiliary_loss_clip": 0.00990664, + "auxiliary_loss_mlp": 0.01029539, + "balance_loss_clip": 1.01991117, + "balance_loss_mlp": 1.01873875, + "epoch": 0.9578235382534195, + "flos": 19282235178240.0, + "grad_norm": 1.4708543110537267, + "language_loss": 0.62435985, + "learning_rate": 1.8605057771419185e-08, + "loss": 0.64456189, + "num_input_tokens_seen": 343646195, + "step": 15931, + "time_per_iteration": 2.7377231121063232 + }, + { + "auxiliary_loss_clip": 0.01059206, + "auxiliary_loss_mlp": 0.01025451, + "balance_loss_clip": 1.0240109, + "balance_loss_mlp": 1.01622438, + "epoch": 0.9578836615060875, + "flos": 13699275235200.0, + "grad_norm": 1.7468765316436234, + "language_loss": 0.68854773, + "learning_rate": 1.8552096314052633e-08, + "loss": 0.70939434, + "num_input_tokens_seen": 343663665, + "step": 15932, + "time_per_iteration": 4.262119770050049 + }, + { + "auxiliary_loss_clip": 0.01033649, + "auxiliary_loss_mlp": 0.01032814, + "balance_loss_clip": 1.02442884, + "balance_loss_mlp": 1.02142978, + "epoch": 0.9579437847587554, + "flos": 17054516655360.0, + "grad_norm": 1.6931101334241956, + "language_loss": 0.75387818, + "learning_rate": 1.849920999338961e-08, + "loss": 0.77454281, + "num_input_tokens_seen": 343682145, + "step": 15933, + "time_per_iteration": 2.683237314224243 + }, + { + "auxiliary_loss_clip": 0.00973707, + "auxiliary_loss_mlp": 0.01001502, + "balance_loss_clip": 1.00775313, + "balance_loss_mlp": 1.00054216, + "epoch": 0.9580039080114234, + "flos": 60570887886720.0, + "grad_norm": 0.7356129325356816, + "language_loss": 0.57356316, + "learning_rate": 1.8446398811434948e-08, + "loss": 0.59331518, + "num_input_tokens_seen": 343744685, + "step": 15934, + "time_per_iteration": 3.4297103881835938 + }, + { + "auxiliary_loss_clip": 0.01006172, + "auxiliary_loss_mlp": 0.00746574, + "balance_loss_clip": 1.00124025, + "balance_loss_mlp": 1.0004977, + "epoch": 0.9580640312640913, + "flos": 66235365745920.0, + "grad_norm": 0.9153542079698638, + "language_loss": 0.65994012, + "learning_rate": 1.8393662770191277e-08, + "loss": 0.67746758, + "num_input_tokens_seen": 343801835, + "step": 15935, + "time_per_iteration": 3.1061460971832275 + }, + { + "auxiliary_loss_clip": 0.00993131, + "auxiliary_loss_mlp": 0.01001257, + "balance_loss_clip": 1.00664425, + "balance_loss_mlp": 1.00006449, + "epoch": 0.9581241545167594, + "flos": 62218002971520.0, + "grad_norm": 0.7860731189361518, + "language_loss": 0.57099491, + "learning_rate": 1.8341001871658546e-08, + "loss": 0.59093881, + "num_input_tokens_seen": 343861515, + "step": 15936, + "time_per_iteration": 3.222799777984619 + }, + { + "auxiliary_loss_clip": 0.01007036, + "auxiliary_loss_mlp": 0.01029007, + "balance_loss_clip": 1.02269888, + "balance_loss_mlp": 1.0182662, + "epoch": 0.9581842777694273, + "flos": 23768088065280.0, + "grad_norm": 2.16916617649837, + "language_loss": 0.78018832, + "learning_rate": 1.8288416117833825e-08, + "loss": 0.80054873, + "num_input_tokens_seen": 343881240, + "step": 15937, + "time_per_iteration": 2.883852243423462 + }, + { + "auxiliary_loss_clip": 0.0104705, + "auxiliary_loss_mlp": 0.0102583, + "balance_loss_clip": 1.0238905, + "balance_loss_mlp": 1.0147438, + "epoch": 0.9582444010220953, + "flos": 21213079793280.0, + "grad_norm": 2.018143920881168, + "language_loss": 0.6838491, + "learning_rate": 1.8235905510710636e-08, + "loss": 0.70457792, + "num_input_tokens_seen": 343900885, + "step": 15938, + "time_per_iteration": 2.785978317260742 + }, + { + "auxiliary_loss_clip": 0.01033814, + "auxiliary_loss_mlp": 0.01031302, + "balance_loss_clip": 1.02500772, + "balance_loss_mlp": 1.02088308, + "epoch": 0.9583045242747633, + "flos": 23805147922560.0, + "grad_norm": 2.50205210099349, + "language_loss": 0.65689933, + "learning_rate": 1.8183470052280712e-08, + "loss": 0.67755049, + "num_input_tokens_seen": 343918460, + "step": 15939, + "time_per_iteration": 2.9194252490997314 + }, + { + "auxiliary_loss_clip": 0.01032297, + "auxiliary_loss_mlp": 0.01034879, + "balance_loss_clip": 1.02214682, + "balance_loss_mlp": 1.02444232, + "epoch": 0.9583646475274312, + "flos": 24131468004480.0, + "grad_norm": 1.5150640268856177, + "language_loss": 0.73514462, + "learning_rate": 1.8131109744532025e-08, + "loss": 0.7558164, + "num_input_tokens_seen": 343938030, + "step": 15940, + "time_per_iteration": 2.7020959854125977 + }, + { + "auxiliary_loss_clip": 0.01062398, + "auxiliary_loss_mlp": 0.01030847, + "balance_loss_clip": 1.02518082, + "balance_loss_mlp": 1.01933765, + "epoch": 0.9584247707800992, + "flos": 20886651970560.0, + "grad_norm": 2.6392343636131934, + "language_loss": 0.72756529, + "learning_rate": 1.8078824589450535e-08, + "loss": 0.74849772, + "num_input_tokens_seen": 343956635, + "step": 15941, + "time_per_iteration": 2.6264452934265137 + }, + { + "auxiliary_loss_clip": 0.01041867, + "auxiliary_loss_mlp": 0.01033024, + "balance_loss_clip": 1.0246278, + "balance_loss_mlp": 1.02290857, + "epoch": 0.9584848940327672, + "flos": 26067591918720.0, + "grad_norm": 1.9888937117349512, + "language_loss": 0.7111882, + "learning_rate": 1.8026614589018442e-08, + "loss": 0.73193717, + "num_input_tokens_seen": 343976625, + "step": 15942, + "time_per_iteration": 2.7280144691467285 + }, + { + "auxiliary_loss_clip": 0.01062954, + "auxiliary_loss_mlp": 0.01031687, + "balance_loss_clip": 1.02511239, + "balance_loss_mlp": 1.02048767, + "epoch": 0.9585450172854352, + "flos": 34492988764800.0, + "grad_norm": 1.6098729580645923, + "language_loss": 0.71895063, + "learning_rate": 1.797447974521571e-08, + "loss": 0.73989707, + "num_input_tokens_seen": 343997790, + "step": 15943, + "time_per_iteration": 4.385866165161133 + }, + { + "auxiliary_loss_clip": 0.01054041, + "auxiliary_loss_mlp": 0.0103406, + "balance_loss_clip": 1.02514327, + "balance_loss_mlp": 1.02275252, + "epoch": 0.9586051405381031, + "flos": 23110743219840.0, + "grad_norm": 1.619475843740096, + "language_loss": 0.68421918, + "learning_rate": 1.792242006001965e-08, + "loss": 0.70510018, + "num_input_tokens_seen": 344016935, + "step": 15944, + "time_per_iteration": 4.300639629364014 + }, + { + "auxiliary_loss_clip": 0.01061004, + "auxiliary_loss_mlp": 0.01029802, + "balance_loss_clip": 1.02406693, + "balance_loss_mlp": 1.01920986, + "epoch": 0.9586652637907711, + "flos": 19603994232960.0, + "grad_norm": 1.6155244555261652, + "language_loss": 0.65898037, + "learning_rate": 1.7870435535403795e-08, + "loss": 0.67988837, + "num_input_tokens_seen": 344035590, + "step": 15945, + "time_per_iteration": 2.6505446434020996 + }, + { + "auxiliary_loss_clip": 0.00942812, + "auxiliary_loss_mlp": 0.01005657, + "balance_loss_clip": 1.00876355, + "balance_loss_mlp": 1.00487638, + "epoch": 0.958725387043439, + "flos": 72073327317120.0, + "grad_norm": 0.7411904837824425, + "language_loss": 0.6191318, + "learning_rate": 1.7818526173339678e-08, + "loss": 0.6386165, + "num_input_tokens_seen": 344100845, + "step": 15946, + "time_per_iteration": 3.7570202350616455 + }, + { + "auxiliary_loss_clip": 0.01061959, + "auxiliary_loss_mlp": 0.01029238, + "balance_loss_clip": 1.02606118, + "balance_loss_mlp": 1.01921248, + "epoch": 0.958785510296107, + "flos": 28911932242560.0, + "grad_norm": 3.5672522984809603, + "language_loss": 0.74975061, + "learning_rate": 1.7766691975795723e-08, + "loss": 0.77066255, + "num_input_tokens_seen": 344121780, + "step": 15947, + "time_per_iteration": 3.6732969284057617 + }, + { + "auxiliary_loss_clip": 0.01035495, + "auxiliary_loss_mlp": 0.0102587, + "balance_loss_clip": 1.02271163, + "balance_loss_mlp": 1.01616049, + "epoch": 0.958845633548775, + "flos": 18477189607680.0, + "grad_norm": 2.13912332277631, + "language_loss": 0.69492006, + "learning_rate": 1.771493294473747e-08, + "loss": 0.71553367, + "num_input_tokens_seen": 344140150, + "step": 15948, + "time_per_iteration": 2.739652156829834 + }, + { + "auxiliary_loss_clip": 0.01012433, + "auxiliary_loss_mlp": 0.01028728, + "balance_loss_clip": 1.02376676, + "balance_loss_mlp": 1.01893544, + "epoch": 0.958905756801443, + "flos": 24206916522240.0, + "grad_norm": 1.8682722135826926, + "language_loss": 0.78918397, + "learning_rate": 1.7663249082127574e-08, + "loss": 0.80959558, + "num_input_tokens_seen": 344158200, + "step": 15949, + "time_per_iteration": 2.9061267375946045 + }, + { + "auxiliary_loss_clip": 0.01063512, + "auxiliary_loss_mlp": 0.01030202, + "balance_loss_clip": 1.02645695, + "balance_loss_mlp": 1.01913333, + "epoch": 0.9589658800541109, + "flos": 25007939769600.0, + "grad_norm": 2.2377474381988405, + "language_loss": 0.68143177, + "learning_rate": 1.761164038992602e-08, + "loss": 0.70236892, + "num_input_tokens_seen": 344174720, + "step": 15950, + "time_per_iteration": 2.6097280979156494 + }, + { + "auxiliary_loss_clip": 0.01042964, + "auxiliary_loss_mlp": 0.01030183, + "balance_loss_clip": 1.02536952, + "balance_loss_mlp": 1.02093863, + "epoch": 0.9590260033067789, + "flos": 23514558894720.0, + "grad_norm": 2.994257186068954, + "language_loss": 0.86202693, + "learning_rate": 1.7560106870089687e-08, + "loss": 0.88275838, + "num_input_tokens_seen": 344192580, + "step": 15951, + "time_per_iteration": 2.6262047290802 + }, + { + "auxiliary_loss_clip": 0.01037697, + "auxiliary_loss_mlp": 0.01037027, + "balance_loss_clip": 1.02452075, + "balance_loss_mlp": 1.02570796, + "epoch": 0.9590861265594469, + "flos": 25520349237120.0, + "grad_norm": 2.808859011315436, + "language_loss": 0.80023772, + "learning_rate": 1.7508648524572568e-08, + "loss": 0.82098496, + "num_input_tokens_seen": 344210345, + "step": 15952, + "time_per_iteration": 2.6337764263153076 + }, + { + "auxiliary_loss_clip": 0.01053208, + "auxiliary_loss_mlp": 0.01028099, + "balance_loss_clip": 1.02568507, + "balance_loss_mlp": 1.01703084, + "epoch": 0.9591462498121148, + "flos": 21179323987200.0, + "grad_norm": 1.7079364471528418, + "language_loss": 0.69446468, + "learning_rate": 1.7457265355326434e-08, + "loss": 0.71527779, + "num_input_tokens_seen": 344229540, + "step": 15953, + "time_per_iteration": 2.6146535873413086 + }, + { + "auxiliary_loss_clip": 0.01007334, + "auxiliary_loss_mlp": 0.01031698, + "balance_loss_clip": 1.02353811, + "balance_loss_mlp": 1.01966441, + "epoch": 0.9592063730647828, + "flos": 21723047136000.0, + "grad_norm": 3.385792455510539, + "language_loss": 0.58623761, + "learning_rate": 1.7405957364299285e-08, + "loss": 0.60662794, + "num_input_tokens_seen": 344247830, + "step": 15954, + "time_per_iteration": 2.7855067253112793 + }, + { + "auxiliary_loss_clip": 0.01052598, + "auxiliary_loss_mlp": 0.01034186, + "balance_loss_clip": 1.02450478, + "balance_loss_mlp": 1.02289104, + "epoch": 0.9592664963174508, + "flos": 29891395278720.0, + "grad_norm": 2.152816835274365, + "language_loss": 0.73622084, + "learning_rate": 1.7354724553437117e-08, + "loss": 0.75708866, + "num_input_tokens_seen": 344267760, + "step": 15955, + "time_per_iteration": 2.6604273319244385 + }, + { + "auxiliary_loss_clip": 0.01043518, + "auxiliary_loss_mlp": 0.01029805, + "balance_loss_clip": 1.02552843, + "balance_loss_mlp": 1.01852143, + "epoch": 0.9593266195701188, + "flos": 17999613354240.0, + "grad_norm": 1.733823041742509, + "language_loss": 0.6223098, + "learning_rate": 1.7303566924682378e-08, + "loss": 0.64304298, + "num_input_tokens_seen": 344284905, + "step": 15956, + "time_per_iteration": 2.6192989349365234 + }, + { + "auxiliary_loss_clip": 0.0103432, + "auxiliary_loss_mlp": 0.01028189, + "balance_loss_clip": 1.02704453, + "balance_loss_mlp": 1.01740098, + "epoch": 0.9593867428227867, + "flos": 18838271076480.0, + "grad_norm": 1.9034513534016255, + "language_loss": 0.5984875, + "learning_rate": 1.725248447997507e-08, + "loss": 0.61911255, + "num_input_tokens_seen": 344302025, + "step": 15957, + "time_per_iteration": 2.6897621154785156 + }, + { + "auxiliary_loss_clip": 0.01024913, + "auxiliary_loss_mlp": 0.01033364, + "balance_loss_clip": 1.02405357, + "balance_loss_mlp": 1.02245092, + "epoch": 0.9594468660754547, + "flos": 29567050444800.0, + "grad_norm": 2.3115634758065777, + "language_loss": 0.74413812, + "learning_rate": 1.7201477221252314e-08, + "loss": 0.76472092, + "num_input_tokens_seen": 344321935, + "step": 15958, + "time_per_iteration": 2.9815962314605713 + }, + { + "auxiliary_loss_clip": 0.0103928, + "auxiliary_loss_mlp": 0.00747501, + "balance_loss_clip": 1.02279162, + "balance_loss_mlp": 1.00031972, + "epoch": 0.9595069893281226, + "flos": 20703256104960.0, + "grad_norm": 3.006172117236047, + "language_loss": 0.74357581, + "learning_rate": 1.7150545150448116e-08, + "loss": 0.76144361, + "num_input_tokens_seen": 344340405, + "step": 15959, + "time_per_iteration": 2.7903506755828857 + }, + { + "auxiliary_loss_clip": 0.01045212, + "auxiliary_loss_mlp": 0.01027332, + "balance_loss_clip": 1.02399659, + "balance_loss_mlp": 1.01650167, + "epoch": 0.9595671125807906, + "flos": 22453613856000.0, + "grad_norm": 2.421948156189964, + "language_loss": 0.64888811, + "learning_rate": 1.7099688269493816e-08, + "loss": 0.6696136, + "num_input_tokens_seen": 344359925, + "step": 15960, + "time_per_iteration": 2.7159812450408936 + }, + { + "auxiliary_loss_clip": 0.01061188, + "auxiliary_loss_mlp": 0.01030177, + "balance_loss_clip": 1.02546167, + "balance_loss_mlp": 1.0197705, + "epoch": 0.9596272358334585, + "flos": 23915214172800.0, + "grad_norm": 1.954445746155753, + "language_loss": 0.77730978, + "learning_rate": 1.7048906580318544e-08, + "loss": 0.79822338, + "num_input_tokens_seen": 344379100, + "step": 15961, + "time_per_iteration": 2.6650846004486084 + }, + { + "auxiliary_loss_clip": 0.0101398, + "auxiliary_loss_mlp": 0.01024352, + "balance_loss_clip": 1.02533948, + "balance_loss_mlp": 1.01476121, + "epoch": 0.9596873590861266, + "flos": 17672539086720.0, + "grad_norm": 1.9786176588495226, + "language_loss": 0.76114029, + "learning_rate": 1.699820008484698e-08, + "loss": 0.78152359, + "num_input_tokens_seen": 344396895, + "step": 15962, + "time_per_iteration": 2.7892186641693115 + }, + { + "auxiliary_loss_clip": 0.01047064, + "auxiliary_loss_mlp": 0.01031302, + "balance_loss_clip": 1.0272882, + "balance_loss_mlp": 1.02017939, + "epoch": 0.9597474823387945, + "flos": 25808532053760.0, + "grad_norm": 2.0197403924399553, + "language_loss": 0.71589833, + "learning_rate": 1.6947568785002698e-08, + "loss": 0.736682, + "num_input_tokens_seen": 344415115, + "step": 15963, + "time_per_iteration": 2.649223804473877 + }, + { + "auxiliary_loss_clip": 0.01042525, + "auxiliary_loss_mlp": 0.01031358, + "balance_loss_clip": 1.02735329, + "balance_loss_mlp": 1.02163029, + "epoch": 0.9598076055914625, + "flos": 23768519028480.0, + "grad_norm": 1.6040595036736114, + "language_loss": 0.74443555, + "learning_rate": 1.689701268270527e-08, + "loss": 0.76517433, + "num_input_tokens_seen": 344435185, + "step": 15964, + "time_per_iteration": 4.407239198684692 + }, + { + "auxiliary_loss_clip": 0.0096745, + "auxiliary_loss_mlp": 0.01001057, + "balance_loss_clip": 1.00415635, + "balance_loss_mlp": 1.00018072, + "epoch": 0.9598677288441305, + "flos": 56515962464640.0, + "grad_norm": 0.8961392921732196, + "language_loss": 0.57613504, + "learning_rate": 1.684653177987161e-08, + "loss": 0.59582007, + "num_input_tokens_seen": 344488950, + "step": 15965, + "time_per_iteration": 3.2413392066955566 + }, + { + "auxiliary_loss_clip": 0.01063152, + "auxiliary_loss_mlp": 0.01028912, + "balance_loss_clip": 1.02539814, + "balance_loss_mlp": 1.01904726, + "epoch": 0.9599278520967984, + "flos": 22997480659200.0, + "grad_norm": 1.6684375124879702, + "language_loss": 0.78406125, + "learning_rate": 1.6796126078416627e-08, + "loss": 0.80498195, + "num_input_tokens_seen": 344506740, + "step": 15966, + "time_per_iteration": 2.5543296337127686 + }, + { + "auxiliary_loss_clip": 0.01034035, + "auxiliary_loss_mlp": 0.01024496, + "balance_loss_clip": 1.02092528, + "balance_loss_mlp": 1.01416695, + "epoch": 0.9599879753494664, + "flos": 23039676161280.0, + "grad_norm": 1.7964136845428031, + "language_loss": 0.79581553, + "learning_rate": 1.674579558025102e-08, + "loss": 0.81640077, + "num_input_tokens_seen": 344526670, + "step": 15967, + "time_per_iteration": 2.6229376792907715 + }, + { + "auxiliary_loss_clip": 0.01007377, + "auxiliary_loss_mlp": 0.01028973, + "balance_loss_clip": 1.0221349, + "balance_loss_mlp": 1.01702869, + "epoch": 0.9600480986021344, + "flos": 16392287560320.0, + "grad_norm": 2.5834228081497614, + "language_loss": 0.80861592, + "learning_rate": 1.669554028728348e-08, + "loss": 0.82897943, + "num_input_tokens_seen": 344541995, + "step": 15968, + "time_per_iteration": 2.7084569931030273 + }, + { + "auxiliary_loss_clip": 0.01015876, + "auxiliary_loss_mlp": 0.01036826, + "balance_loss_clip": 1.02354956, + "balance_loss_mlp": 1.02452326, + "epoch": 0.9601082218548024, + "flos": 24276439296000.0, + "grad_norm": 2.3714448118687335, + "language_loss": 0.6717509, + "learning_rate": 1.6645360201420044e-08, + "loss": 0.69227791, + "num_input_tokens_seen": 344559980, + "step": 15969, + "time_per_iteration": 2.848404884338379 + }, + { + "auxiliary_loss_clip": 0.0104671, + "auxiliary_loss_mlp": 0.01033088, + "balance_loss_clip": 1.02301526, + "balance_loss_mlp": 1.0235033, + "epoch": 0.9601683451074703, + "flos": 19609991804160.0, + "grad_norm": 2.6542605842201863, + "language_loss": 0.79793656, + "learning_rate": 1.6595255324563186e-08, + "loss": 0.81873453, + "num_input_tokens_seen": 344577765, + "step": 15970, + "time_per_iteration": 2.7496817111968994 + }, + { + "auxiliary_loss_clip": 0.01049563, + "auxiliary_loss_mlp": 0.01029468, + "balance_loss_clip": 1.0248313, + "balance_loss_mlp": 1.01932323, + "epoch": 0.9602284683601383, + "flos": 26651104358400.0, + "grad_norm": 1.5275721949591052, + "language_loss": 0.77116781, + "learning_rate": 1.654522565861316e-08, + "loss": 0.79195809, + "num_input_tokens_seen": 344597650, + "step": 15971, + "time_per_iteration": 2.7657663822174072 + }, + { + "auxiliary_loss_clip": 0.01044044, + "auxiliary_loss_mlp": 0.01027108, + "balance_loss_clip": 1.02398801, + "balance_loss_mlp": 1.01626599, + "epoch": 0.9602885916128062, + "flos": 15554096714880.0, + "grad_norm": 1.7453837055072503, + "language_loss": 0.67044461, + "learning_rate": 1.64952712054669e-08, + "loss": 0.69115615, + "num_input_tokens_seen": 344613580, + "step": 15972, + "time_per_iteration": 2.6008806228637695 + }, + { + "auxiliary_loss_clip": 0.01049804, + "auxiliary_loss_mlp": 0.00747623, + "balance_loss_clip": 1.02377844, + "balance_loss_mlp": 1.00036561, + "epoch": 0.9603487148654742, + "flos": 16502353810560.0, + "grad_norm": 2.082394855809421, + "language_loss": 0.75933194, + "learning_rate": 1.644539196701844e-08, + "loss": 0.77730626, + "num_input_tokens_seen": 344626910, + "step": 15973, + "time_per_iteration": 2.6071267127990723 + }, + { + "auxiliary_loss_clip": 0.01021743, + "auxiliary_loss_mlp": 0.01039179, + "balance_loss_clip": 1.02772939, + "balance_loss_mlp": 1.02682924, + "epoch": 0.9604088381181421, + "flos": 20845354308480.0, + "grad_norm": 2.7083526542975855, + "language_loss": 0.68890691, + "learning_rate": 1.639558794515983e-08, + "loss": 0.70951611, + "num_input_tokens_seen": 344644330, + "step": 15974, + "time_per_iteration": 2.735719919204712 + }, + { + "auxiliary_loss_clip": 0.01051563, + "auxiliary_loss_mlp": 0.01025743, + "balance_loss_clip": 1.02340102, + "balance_loss_mlp": 1.0148356, + "epoch": 0.9604689613708102, + "flos": 19683105937920.0, + "grad_norm": 1.8245930003603552, + "language_loss": 0.6806674, + "learning_rate": 1.6345859141779105e-08, + "loss": 0.70144045, + "num_input_tokens_seen": 344663910, + "step": 15975, + "time_per_iteration": 2.6084787845611572 + }, + { + "auxiliary_loss_clip": 0.0105982, + "auxiliary_loss_mlp": 0.01023614, + "balance_loss_clip": 1.0252037, + "balance_loss_mlp": 1.01354051, + "epoch": 0.9605290846234781, + "flos": 24097568544000.0, + "grad_norm": 2.2679649063584497, + "language_loss": 0.55340958, + "learning_rate": 1.6296205558762322e-08, + "loss": 0.5742439, + "num_input_tokens_seen": 344682320, + "step": 15976, + "time_per_iteration": 2.5634796619415283 + }, + { + "auxiliary_loss_clip": 0.01032681, + "auxiliary_loss_mlp": 0.01023614, + "balance_loss_clip": 1.02168131, + "balance_loss_mlp": 1.01395202, + "epoch": 0.9605892078761461, + "flos": 27122575299840.0, + "grad_norm": 1.7805128727166826, + "language_loss": 0.68349779, + "learning_rate": 1.624662719799219e-08, + "loss": 0.70406073, + "num_input_tokens_seen": 344701355, + "step": 15977, + "time_per_iteration": 2.8322834968566895 + }, + { + "auxiliary_loss_clip": 0.0104999, + "auxiliary_loss_mlp": 0.01030542, + "balance_loss_clip": 1.02361214, + "balance_loss_mlp": 1.02041531, + "epoch": 0.9606493311288141, + "flos": 14136918543360.0, + "grad_norm": 1.7966873521137385, + "language_loss": 0.82348329, + "learning_rate": 1.6197124061348766e-08, + "loss": 0.84428871, + "num_input_tokens_seen": 344717980, + "step": 15978, + "time_per_iteration": 2.624070882797241 + }, + { + "auxiliary_loss_clip": 0.01052585, + "auxiliary_loss_mlp": 0.01028228, + "balance_loss_clip": 1.02331007, + "balance_loss_mlp": 1.01752889, + "epoch": 0.960709454381482, + "flos": 15813336147840.0, + "grad_norm": 2.7254571746719516, + "language_loss": 0.83653426, + "learning_rate": 1.614769615070921e-08, + "loss": 0.85734236, + "num_input_tokens_seen": 344733480, + "step": 15979, + "time_per_iteration": 2.6846022605895996 + }, + { + "auxiliary_loss_clip": 0.01062929, + "auxiliary_loss_mlp": 0.01033022, + "balance_loss_clip": 1.02518487, + "balance_loss_mlp": 1.02330089, + "epoch": 0.96076957763415, + "flos": 22565403959040.0, + "grad_norm": 1.5534059698300045, + "language_loss": 0.79928142, + "learning_rate": 1.6098343467947805e-08, + "loss": 0.82024097, + "num_input_tokens_seen": 344752130, + "step": 15980, + "time_per_iteration": 4.253259897232056 + }, + { + "auxiliary_loss_clip": 0.0105228, + "auxiliary_loss_mlp": 0.01023979, + "balance_loss_clip": 1.02439499, + "balance_loss_mlp": 1.01395321, + "epoch": 0.960829700886818, + "flos": 24681260551680.0, + "grad_norm": 1.723782206571809, + "language_loss": 0.68609911, + "learning_rate": 1.6049066014935942e-08, + "loss": 0.70686173, + "num_input_tokens_seen": 344771195, + "step": 15981, + "time_per_iteration": 2.7188832759857178 + }, + { + "auxiliary_loss_clip": 0.01049725, + "auxiliary_loss_mlp": 0.00747609, + "balance_loss_clip": 1.02426815, + "balance_loss_mlp": 1.00040722, + "epoch": 0.960889824139486, + "flos": 26542223256960.0, + "grad_norm": 1.4610871867467525, + "language_loss": 0.69633484, + "learning_rate": 1.5999863793542344e-08, + "loss": 0.71430814, + "num_input_tokens_seen": 344793150, + "step": 15982, + "time_per_iteration": 2.784315347671509 + }, + { + "auxiliary_loss_clip": 0.0098059, + "auxiliary_loss_mlp": 0.01001572, + "balance_loss_clip": 1.00451827, + "balance_loss_mlp": 1.00061882, + "epoch": 0.9609499473921539, + "flos": 71114942586240.0, + "grad_norm": 0.6716853732856185, + "language_loss": 0.53265929, + "learning_rate": 1.595073680563286e-08, + "loss": 0.55248094, + "num_input_tokens_seen": 344852855, + "step": 15983, + "time_per_iteration": 3.4021267890930176 + }, + { + "auxiliary_loss_clip": 0.01063097, + "auxiliary_loss_mlp": 0.01028844, + "balance_loss_clip": 1.02617729, + "balance_loss_mlp": 1.01831162, + "epoch": 0.9610100706448219, + "flos": 20552466810240.0, + "grad_norm": 2.256461361771799, + "language_loss": 0.67511153, + "learning_rate": 1.5901685053070212e-08, + "loss": 0.69603086, + "num_input_tokens_seen": 344869830, + "step": 15984, + "time_per_iteration": 2.6374664306640625 + }, + { + "auxiliary_loss_clip": 0.01032409, + "auxiliary_loss_mlp": 0.01029119, + "balance_loss_clip": 1.02723837, + "balance_loss_mlp": 1.01994634, + "epoch": 0.9610701938974898, + "flos": 14064199459200.0, + "grad_norm": 1.5090491916107427, + "language_loss": 0.67296672, + "learning_rate": 1.5852708537714477e-08, + "loss": 0.693582, + "num_input_tokens_seen": 344888905, + "step": 15985, + "time_per_iteration": 2.7924346923828125 + }, + { + "auxiliary_loss_clip": 0.01064015, + "auxiliary_loss_mlp": 0.01026657, + "balance_loss_clip": 1.02587926, + "balance_loss_mlp": 1.01647651, + "epoch": 0.9611303171501578, + "flos": 20229989483520.0, + "grad_norm": 7.7820638919421405, + "language_loss": 0.78791928, + "learning_rate": 1.580380726142283e-08, + "loss": 0.80882597, + "num_input_tokens_seen": 344907160, + "step": 15986, + "time_per_iteration": 2.621814250946045 + }, + { + "auxiliary_loss_clip": 0.01016351, + "auxiliary_loss_mlp": 0.0102845, + "balance_loss_clip": 1.02717412, + "balance_loss_mlp": 1.01702428, + "epoch": 0.9611904404028258, + "flos": 20951075013120.0, + "grad_norm": 3.4310797491687484, + "language_loss": 0.63963622, + "learning_rate": 1.5754981226049792e-08, + "loss": 0.66008431, + "num_input_tokens_seen": 344922400, + "step": 15987, + "time_per_iteration": 2.719938039779663 + }, + { + "auxiliary_loss_clip": 0.01059691, + "auxiliary_loss_mlp": 0.01025977, + "balance_loss_clip": 1.02506828, + "balance_loss_mlp": 1.01688147, + "epoch": 0.9612505636554938, + "flos": 24827740214400.0, + "grad_norm": 1.8014031124538765, + "language_loss": 0.66635501, + "learning_rate": 1.5706230433446544e-08, + "loss": 0.68721175, + "num_input_tokens_seen": 344941910, + "step": 15988, + "time_per_iteration": 2.594604253768921 + }, + { + "auxiliary_loss_clip": 0.01051348, + "auxiliary_loss_mlp": 0.01037302, + "balance_loss_clip": 1.02427554, + "balance_loss_mlp": 1.02769971, + "epoch": 0.9613106869081617, + "flos": 17164977955200.0, + "grad_norm": 1.8788687872701786, + "language_loss": 0.7456066, + "learning_rate": 1.5657554885462055e-08, + "loss": 0.76649308, + "num_input_tokens_seen": 344960020, + "step": 15989, + "time_per_iteration": 2.568822145462036 + }, + { + "auxiliary_loss_clip": 0.00987775, + "auxiliary_loss_mlp": 0.01009509, + "balance_loss_clip": 1.00250363, + "balance_loss_mlp": 1.00867486, + "epoch": 0.9613708101608297, + "flos": 61563818522880.0, + "grad_norm": 0.8404238832005877, + "language_loss": 0.63147426, + "learning_rate": 1.5608954583941737e-08, + "loss": 0.65144712, + "num_input_tokens_seen": 345018290, + "step": 15990, + "time_per_iteration": 4.796757221221924 + }, + { + "auxiliary_loss_clip": 0.0105233, + "auxiliary_loss_mlp": 0.01032207, + "balance_loss_clip": 1.0247227, + "balance_loss_mlp": 1.02210975, + "epoch": 0.9614309334134977, + "flos": 27417904922880.0, + "grad_norm": 1.867121714966085, + "language_loss": 0.77459162, + "learning_rate": 1.5560429530729003e-08, + "loss": 0.79543698, + "num_input_tokens_seen": 345040235, + "step": 15991, + "time_per_iteration": 4.317039251327515 + }, + { + "auxiliary_loss_clip": 0.01066134, + "auxiliary_loss_mlp": 0.01029093, + "balance_loss_clip": 1.02497363, + "balance_loss_mlp": 1.01779222, + "epoch": 0.9614910566661656, + "flos": 22819148611200.0, + "grad_norm": 2.9937100248003365, + "language_loss": 0.84791374, + "learning_rate": 1.5511979727663493e-08, + "loss": 0.86886597, + "num_input_tokens_seen": 345054540, + "step": 15992, + "time_per_iteration": 2.5745480060577393 + }, + { + "auxiliary_loss_clip": 0.01040613, + "auxiliary_loss_mlp": 0.01027658, + "balance_loss_clip": 1.02338457, + "balance_loss_mlp": 1.01686406, + "epoch": 0.9615511799188337, + "flos": 20667812359680.0, + "grad_norm": 3.360019886485675, + "language_loss": 0.72429574, + "learning_rate": 1.5463605176582406e-08, + "loss": 0.74497843, + "num_input_tokens_seen": 345074035, + "step": 15993, + "time_per_iteration": 2.6263909339904785 + }, + { + "auxiliary_loss_clip": 0.01024138, + "auxiliary_loss_mlp": 0.01028629, + "balance_loss_clip": 1.02446413, + "balance_loss_mlp": 1.01813269, + "epoch": 0.9616113031715016, + "flos": 33149212035840.0, + "grad_norm": 1.3562322725374814, + "language_loss": 0.68035495, + "learning_rate": 1.5415305879320716e-08, + "loss": 0.70088261, + "num_input_tokens_seen": 345099270, + "step": 15994, + "time_per_iteration": 2.774031400680542 + }, + { + "auxiliary_loss_clip": 0.01024381, + "auxiliary_loss_mlp": 0.01026944, + "balance_loss_clip": 1.02547133, + "balance_loss_mlp": 1.01648951, + "epoch": 0.9616714264241696, + "flos": 25009807276800.0, + "grad_norm": 1.76991909608619, + "language_loss": 0.84762406, + "learning_rate": 1.5367081837709183e-08, + "loss": 0.86813736, + "num_input_tokens_seen": 345116975, + "step": 15995, + "time_per_iteration": 2.7617578506469727 + }, + { + "auxiliary_loss_clip": 0.01054851, + "auxiliary_loss_mlp": 0.01034054, + "balance_loss_clip": 1.02631974, + "balance_loss_mlp": 1.02279496, + "epoch": 0.9617315496768375, + "flos": 13547480359680.0, + "grad_norm": 1.9206133547213942, + "language_loss": 0.75947952, + "learning_rate": 1.5318933053576788e-08, + "loss": 0.78036857, + "num_input_tokens_seen": 345133645, + "step": 15996, + "time_per_iteration": 2.7009987831115723 + }, + { + "auxiliary_loss_clip": 0.01040883, + "auxiliary_loss_mlp": 0.01028395, + "balance_loss_clip": 1.02371836, + "balance_loss_mlp": 1.01789284, + "epoch": 0.9617916729295055, + "flos": 11254512781440.0, + "grad_norm": 2.000723798384592, + "language_loss": 0.77091169, + "learning_rate": 1.52708595287494e-08, + "loss": 0.7916044, + "num_input_tokens_seen": 345150740, + "step": 15997, + "time_per_iteration": 2.8111674785614014 + }, + { + "auxiliary_loss_clip": 0.0105816, + "auxiliary_loss_mlp": 0.0074744, + "balance_loss_clip": 1.02366674, + "balance_loss_mlp": 1.00038099, + "epoch": 0.9618517961821734, + "flos": 22819723228800.0, + "grad_norm": 1.514227665168492, + "language_loss": 0.67327029, + "learning_rate": 1.522286126505001e-08, + "loss": 0.69132626, + "num_input_tokens_seen": 345170365, + "step": 15998, + "time_per_iteration": 2.6517436504364014 + }, + { + "auxiliary_loss_clip": 0.01028281, + "auxiliary_loss_mlp": 0.01028381, + "balance_loss_clip": 1.01986206, + "balance_loss_mlp": 1.01694274, + "epoch": 0.9619119194348414, + "flos": 16617340224000.0, + "grad_norm": 2.1906073269767483, + "language_loss": 0.72685552, + "learning_rate": 1.5174938264298498e-08, + "loss": 0.7474221, + "num_input_tokens_seen": 345188930, + "step": 15999, + "time_per_iteration": 2.676119804382324 + }, + { + "auxiliary_loss_clip": 0.01032913, + "auxiliary_loss_mlp": 0.01028533, + "balance_loss_clip": 1.0211693, + "balance_loss_mlp": 1.01882291, + "epoch": 0.9619720426875094, + "flos": 24535140024960.0, + "grad_norm": 2.430366657159648, + "language_loss": 0.65328419, + "learning_rate": 1.5127090528312514e-08, + "loss": 0.6738987, + "num_input_tokens_seen": 345209615, + "step": 16000, + "time_per_iteration": 2.775123119354248 + }, + { + "auxiliary_loss_clip": 0.01026591, + "auxiliary_loss_mlp": 0.01026165, + "balance_loss_clip": 1.02198696, + "balance_loss_mlp": 1.01531148, + "epoch": 0.9620321659401774, + "flos": 20632224960000.0, + "grad_norm": 1.850489655894483, + "language_loss": 0.75490665, + "learning_rate": 1.5079318058905723e-08, + "loss": 0.77543426, + "num_input_tokens_seen": 345229175, + "step": 16001, + "time_per_iteration": 2.82307505607605 + }, + { + "auxiliary_loss_clip": 0.01050273, + "auxiliary_loss_mlp": 0.01026743, + "balance_loss_clip": 1.02357137, + "balance_loss_mlp": 1.0160917, + "epoch": 0.9620922891928453, + "flos": 18515290959360.0, + "grad_norm": 1.5257148032327705, + "language_loss": 0.6830377, + "learning_rate": 1.5031620857890447e-08, + "loss": 0.70380783, + "num_input_tokens_seen": 345247815, + "step": 16002, + "time_per_iteration": 2.6376988887786865 + }, + { + "auxiliary_loss_clip": 0.01052106, + "auxiliary_loss_mlp": 0.01025231, + "balance_loss_clip": 1.02560186, + "balance_loss_mlp": 1.01507473, + "epoch": 0.9621524124455133, + "flos": 28767391914240.0, + "grad_norm": 1.203206992275754, + "language_loss": 0.64572603, + "learning_rate": 1.4983998927074804e-08, + "loss": 0.66649938, + "num_input_tokens_seen": 345269935, + "step": 16003, + "time_per_iteration": 2.9144723415374756 + }, + { + "auxiliary_loss_clip": 0.01013756, + "auxiliary_loss_mlp": 0.0103419, + "balance_loss_clip": 1.02505827, + "balance_loss_mlp": 1.02410436, + "epoch": 0.9622125356981813, + "flos": 19098875226240.0, + "grad_norm": 1.776935089860914, + "language_loss": 0.76012641, + "learning_rate": 1.493645226826512e-08, + "loss": 0.78060585, + "num_input_tokens_seen": 345288310, + "step": 16004, + "time_per_iteration": 2.866239547729492 + }, + { + "auxiliary_loss_clip": 0.01051378, + "auxiliary_loss_mlp": 0.01026168, + "balance_loss_clip": 1.02529478, + "balance_loss_mlp": 1.01596379, + "epoch": 0.9622726589508492, + "flos": 20302816308480.0, + "grad_norm": 1.9006373533471075, + "language_loss": 0.7958256, + "learning_rate": 1.4888980883263958e-08, + "loss": 0.81660104, + "num_input_tokens_seen": 345306615, + "step": 16005, + "time_per_iteration": 2.614046573638916 + }, + { + "auxiliary_loss_clip": 0.01048641, + "auxiliary_loss_mlp": 0.01025865, + "balance_loss_clip": 1.02322996, + "balance_loss_mlp": 1.01630425, + "epoch": 0.9623327822035173, + "flos": 54929750889600.0, + "grad_norm": 2.1896580471574243, + "language_loss": 0.67596245, + "learning_rate": 1.4841584773871652e-08, + "loss": 0.69670749, + "num_input_tokens_seen": 345331935, + "step": 16006, + "time_per_iteration": 2.9228484630584717 + }, + { + "auxiliary_loss_clip": 0.01033293, + "auxiliary_loss_mlp": 0.01028953, + "balance_loss_clip": 1.02265251, + "balance_loss_mlp": 1.01909459, + "epoch": 0.9623929054561852, + "flos": 21759029585280.0, + "grad_norm": 1.5562315884744962, + "language_loss": 0.78175247, + "learning_rate": 1.479426394188521e-08, + "loss": 0.80237496, + "num_input_tokens_seen": 345351510, + "step": 16007, + "time_per_iteration": 2.668961763381958 + }, + { + "auxiliary_loss_clip": 0.01063959, + "auxiliary_loss_mlp": 0.0102859, + "balance_loss_clip": 1.02683794, + "balance_loss_mlp": 1.01802206, + "epoch": 0.9624530287088532, + "flos": 17931563038080.0, + "grad_norm": 2.1805611002609977, + "language_loss": 0.67536134, + "learning_rate": 1.4747018389099198e-08, + "loss": 0.69628686, + "num_input_tokens_seen": 345367750, + "step": 16008, + "time_per_iteration": 2.5336296558380127 + }, + { + "auxiliary_loss_clip": 0.01046918, + "auxiliary_loss_mlp": 0.01029603, + "balance_loss_clip": 1.02708864, + "balance_loss_mlp": 1.01833725, + "epoch": 0.9625131519615211, + "flos": 23253739263360.0, + "grad_norm": 2.336406795152175, + "language_loss": 0.73146474, + "learning_rate": 1.469984811730529e-08, + "loss": 0.75223005, + "num_input_tokens_seen": 345384790, + "step": 16009, + "time_per_iteration": 2.6558027267456055 + }, + { + "auxiliary_loss_clip": 0.01047834, + "auxiliary_loss_mlp": 0.01029094, + "balance_loss_clip": 1.0222671, + "balance_loss_mlp": 1.01875269, + "epoch": 0.9625732752141891, + "flos": 18916628595840.0, + "grad_norm": 1.6982845845984404, + "language_loss": 0.75355548, + "learning_rate": 1.4652753128292061e-08, + "loss": 0.77432477, + "num_input_tokens_seen": 345403390, + "step": 16010, + "time_per_iteration": 4.414773225784302 + }, + { + "auxiliary_loss_clip": 0.01057956, + "auxiliary_loss_mlp": 0.01033267, + "balance_loss_clip": 1.02687359, + "balance_loss_mlp": 1.01995778, + "epoch": 0.962633398466857, + "flos": 16252918790400.0, + "grad_norm": 1.6996445734533931, + "language_loss": 0.69686162, + "learning_rate": 1.4605733423845635e-08, + "loss": 0.71777391, + "num_input_tokens_seen": 345418685, + "step": 16011, + "time_per_iteration": 2.6687636375427246 + }, + { + "auxiliary_loss_clip": 0.01051582, + "auxiliary_loss_mlp": 0.0102854, + "balance_loss_clip": 1.02564955, + "balance_loss_mlp": 1.01891947, + "epoch": 0.962693521719525, + "flos": 54197424403200.0, + "grad_norm": 1.792772799338497, + "language_loss": 0.68455958, + "learning_rate": 1.4558789005748585e-08, + "loss": 0.70536083, + "num_input_tokens_seen": 345442380, + "step": 16012, + "time_per_iteration": 2.887669801712036 + }, + { + "auxiliary_loss_clip": 0.01039036, + "auxiliary_loss_mlp": 0.01035012, + "balance_loss_clip": 1.02205706, + "balance_loss_mlp": 1.02297187, + "epoch": 0.962753644972193, + "flos": 33105795471360.0, + "grad_norm": 1.8565633695113923, + "language_loss": 0.72415745, + "learning_rate": 1.4511919875781264e-08, + "loss": 0.74489784, + "num_input_tokens_seen": 345463815, + "step": 16013, + "time_per_iteration": 2.738020658493042 + }, + { + "auxiliary_loss_clip": 0.01032661, + "auxiliary_loss_mlp": 0.0102853, + "balance_loss_clip": 1.02403486, + "balance_loss_mlp": 1.01741433, + "epoch": 0.962813768224861, + "flos": 42230660837760.0, + "grad_norm": 2.3180269059877316, + "language_loss": 0.6279068, + "learning_rate": 1.4465126035720698e-08, + "loss": 0.64851868, + "num_input_tokens_seen": 345484525, + "step": 16014, + "time_per_iteration": 2.857748031616211 + }, + { + "auxiliary_loss_clip": 0.010381, + "auxiliary_loss_mlp": 0.01024538, + "balance_loss_clip": 1.02437544, + "balance_loss_mlp": 1.01586545, + "epoch": 0.9628738914775289, + "flos": 43944677003520.0, + "grad_norm": 1.5475470235497908, + "language_loss": 0.71842867, + "learning_rate": 1.4418407487341688e-08, + "loss": 0.73905504, + "num_input_tokens_seen": 345508295, + "step": 16015, + "time_per_iteration": 2.8902101516723633 + }, + { + "auxiliary_loss_clip": 0.01021489, + "auxiliary_loss_mlp": 0.01025079, + "balance_loss_clip": 1.02037024, + "balance_loss_mlp": 1.01480329, + "epoch": 0.9629340147301969, + "flos": 15596184476160.0, + "grad_norm": 2.694516581521183, + "language_loss": 0.77024978, + "learning_rate": 1.4371764232415707e-08, + "loss": 0.79071546, + "num_input_tokens_seen": 345525155, + "step": 16016, + "time_per_iteration": 2.802718162536621 + }, + { + "auxiliary_loss_clip": 0.01005959, + "auxiliary_loss_mlp": 0.01001203, + "balance_loss_clip": 1.00074792, + "balance_loss_mlp": 1.00042772, + "epoch": 0.9629941379828649, + "flos": 62951011816320.0, + "grad_norm": 0.8120080897326181, + "language_loss": 0.63135076, + "learning_rate": 1.4325196272711337e-08, + "loss": 0.65142238, + "num_input_tokens_seen": 345578905, + "step": 16017, + "time_per_iteration": 3.11551833152771 + }, + { + "auxiliary_loss_clip": 0.01045575, + "auxiliary_loss_mlp": 0.01025768, + "balance_loss_clip": 1.02630854, + "balance_loss_mlp": 1.01563525, + "epoch": 0.9630542612355328, + "flos": 29899116702720.0, + "grad_norm": 1.8001954426071802, + "language_loss": 0.66298378, + "learning_rate": 1.4278703609994502e-08, + "loss": 0.68369722, + "num_input_tokens_seen": 345598965, + "step": 16018, + "time_per_iteration": 2.7960712909698486 + }, + { + "auxiliary_loss_clip": 0.01012393, + "auxiliary_loss_mlp": 0.01030268, + "balance_loss_clip": 1.02412271, + "balance_loss_mlp": 1.02023065, + "epoch": 0.9631143844882009, + "flos": 17894575008000.0, + "grad_norm": 1.7488323990466192, + "language_loss": 0.79426098, + "learning_rate": 1.4232286246028457e-08, + "loss": 0.81468761, + "num_input_tokens_seen": 345617945, + "step": 16019, + "time_per_iteration": 2.826554298400879 + }, + { + "auxiliary_loss_clip": 0.01023157, + "auxiliary_loss_mlp": 0.01025962, + "balance_loss_clip": 1.02192819, + "balance_loss_mlp": 1.01695013, + "epoch": 0.9631745077408688, + "flos": 26139161767680.0, + "grad_norm": 2.2822333401390154, + "language_loss": 0.7161634, + "learning_rate": 1.4185944182572907e-08, + "loss": 0.73665464, + "num_input_tokens_seen": 345637920, + "step": 16020, + "time_per_iteration": 2.882262945175171 + }, + { + "auxiliary_loss_clip": 0.01042393, + "auxiliary_loss_mlp": 0.0102238, + "balance_loss_clip": 1.02600348, + "balance_loss_mlp": 1.01308775, + "epoch": 0.9632346309935368, + "flos": 24973645259520.0, + "grad_norm": 1.64185375330509, + "language_loss": 0.769665, + "learning_rate": 1.4139677421385331e-08, + "loss": 0.79031277, + "num_input_tokens_seen": 345656195, + "step": 16021, + "time_per_iteration": 2.782792806625366 + }, + { + "auxiliary_loss_clip": 0.01031063, + "auxiliary_loss_mlp": 0.0102575, + "balance_loss_clip": 1.02318025, + "balance_loss_mlp": 1.01325703, + "epoch": 0.9632947542462047, + "flos": 23617226943360.0, + "grad_norm": 1.9984001793383708, + "language_loss": 0.64833474, + "learning_rate": 1.4093485964220331e-08, + "loss": 0.66890287, + "num_input_tokens_seen": 345676700, + "step": 16022, + "time_per_iteration": 2.77781081199646 + }, + { + "auxiliary_loss_clip": 0.01029317, + "auxiliary_loss_mlp": 0.01030595, + "balance_loss_clip": 1.02026045, + "balance_loss_mlp": 1.02039099, + "epoch": 0.9633548774988727, + "flos": 26395599939840.0, + "grad_norm": 2.1049891190353813, + "language_loss": 0.73251843, + "learning_rate": 1.4047369812829168e-08, + "loss": 0.75311756, + "num_input_tokens_seen": 345696725, + "step": 16023, + "time_per_iteration": 2.830500602722168 + }, + { + "auxiliary_loss_clip": 0.01045101, + "auxiliary_loss_mlp": 0.01027237, + "balance_loss_clip": 1.02287579, + "balance_loss_mlp": 1.01647234, + "epoch": 0.9634150007515406, + "flos": 23767728929280.0, + "grad_norm": 1.447033207831492, + "language_loss": 0.8173337, + "learning_rate": 1.4001328968960891e-08, + "loss": 0.83805704, + "num_input_tokens_seen": 345716245, + "step": 16024, + "time_per_iteration": 2.7790985107421875 + }, + { + "auxiliary_loss_clip": 0.01054365, + "auxiliary_loss_mlp": 0.0102914, + "balance_loss_clip": 1.02468491, + "balance_loss_mlp": 1.018435, + "epoch": 0.9634751240042086, + "flos": 24135346673280.0, + "grad_norm": 2.088987646086256, + "language_loss": 0.81479591, + "learning_rate": 1.3955363434361212e-08, + "loss": 0.83563095, + "num_input_tokens_seen": 345739060, + "step": 16025, + "time_per_iteration": 2.7397964000701904 + }, + { + "auxiliary_loss_clip": 0.01052887, + "auxiliary_loss_mlp": 0.01024456, + "balance_loss_clip": 1.02473974, + "balance_loss_mlp": 1.01405537, + "epoch": 0.9635352472568766, + "flos": 24349086552960.0, + "grad_norm": 1.6771694426499193, + "language_loss": 0.75986582, + "learning_rate": 1.3909473210773181e-08, + "loss": 0.78063929, + "num_input_tokens_seen": 345758325, + "step": 16026, + "time_per_iteration": 2.7414300441741943 + }, + { + "auxiliary_loss_clip": 0.01028777, + "auxiliary_loss_mlp": 0.0074776, + "balance_loss_clip": 1.02407849, + "balance_loss_mlp": 1.00042403, + "epoch": 0.9635953705095446, + "flos": 23984772860160.0, + "grad_norm": 1.7911475427147214, + "language_loss": 0.63168299, + "learning_rate": 1.3863658299936965e-08, + "loss": 0.64944839, + "num_input_tokens_seen": 345778530, + "step": 16027, + "time_per_iteration": 2.7976672649383545 + }, + { + "auxiliary_loss_clip": 0.01048856, + "auxiliary_loss_mlp": 0.01026833, + "balance_loss_clip": 1.02386379, + "balance_loss_mlp": 1.01586568, + "epoch": 0.9636554937622125, + "flos": 19828436365440.0, + "grad_norm": 1.8020571045878337, + "language_loss": 0.87279558, + "learning_rate": 1.3817918703589837e-08, + "loss": 0.89355242, + "num_input_tokens_seen": 345796535, + "step": 16028, + "time_per_iteration": 4.401047229766846 + }, + { + "auxiliary_loss_clip": 0.00982412, + "auxiliary_loss_mlp": 0.0100283, + "balance_loss_clip": 1.01373029, + "balance_loss_mlp": 1.00101781, + "epoch": 0.9637156170148805, + "flos": 67435499986560.0, + "grad_norm": 1.2610778647226548, + "language_loss": 0.5320031, + "learning_rate": 1.3772254423466412e-08, + "loss": 0.5518555, + "num_input_tokens_seen": 345859700, + "step": 16029, + "time_per_iteration": 3.2786660194396973 + }, + { + "auxiliary_loss_clip": 0.01062111, + "auxiliary_loss_mlp": 0.01028536, + "balance_loss_clip": 1.02502966, + "balance_loss_mlp": 1.01830244, + "epoch": 0.9637757402675484, + "flos": 20300912887680.0, + "grad_norm": 1.4348996984376852, + "language_loss": 0.74154711, + "learning_rate": 1.372666546129797e-08, + "loss": 0.76245356, + "num_input_tokens_seen": 345878760, + "step": 16030, + "time_per_iteration": 2.567440986633301 + }, + { + "auxiliary_loss_clip": 0.01034828, + "auxiliary_loss_mlp": 0.01026745, + "balance_loss_clip": 1.02338576, + "balance_loss_mlp": 1.01679134, + "epoch": 0.9638358635202164, + "flos": 27234544970880.0, + "grad_norm": 1.866162614307304, + "language_loss": 0.66153991, + "learning_rate": 1.3681151818813575e-08, + "loss": 0.68215567, + "num_input_tokens_seen": 345900445, + "step": 16031, + "time_per_iteration": 2.6917715072631836 + }, + { + "auxiliary_loss_clip": 0.00996395, + "auxiliary_loss_mlp": 0.0074652, + "balance_loss_clip": 1.00146782, + "balance_loss_mlp": 1.00045872, + "epoch": 0.9638959867728845, + "flos": 70288998278400.0, + "grad_norm": 0.8384347677351328, + "language_loss": 0.60766995, + "learning_rate": 1.3635713497738955e-08, + "loss": 0.62509912, + "num_input_tokens_seen": 345961020, + "step": 16032, + "time_per_iteration": 3.2055535316467285 + }, + { + "auxiliary_loss_clip": 0.01047926, + "auxiliary_loss_mlp": 0.01029848, + "balance_loss_clip": 1.02415657, + "balance_loss_mlp": 1.02078843, + "epoch": 0.9639561100255524, + "flos": 25407517639680.0, + "grad_norm": 1.7960984413550625, + "language_loss": 0.66172868, + "learning_rate": 1.3590350499796954e-08, + "loss": 0.68250644, + "num_input_tokens_seen": 345980210, + "step": 16033, + "time_per_iteration": 2.6250345706939697 + }, + { + "auxiliary_loss_clip": 0.01005372, + "auxiliary_loss_mlp": 0.01026536, + "balance_loss_clip": 1.02389133, + "balance_loss_mlp": 1.01668882, + "epoch": 0.9640162332782204, + "flos": 18113881495680.0, + "grad_norm": 1.5943280943652576, + "language_loss": 0.65256023, + "learning_rate": 1.3545062826707976e-08, + "loss": 0.67287928, + "num_input_tokens_seen": 345998280, + "step": 16034, + "time_per_iteration": 2.841188430786133 + }, + { + "auxiliary_loss_clip": 0.01019557, + "auxiliary_loss_mlp": 0.01030632, + "balance_loss_clip": 1.02195799, + "balance_loss_mlp": 1.01949227, + "epoch": 0.9640763565308883, + "flos": 23440295525760.0, + "grad_norm": 2.392075409559422, + "language_loss": 0.73962104, + "learning_rate": 1.3499850480189313e-08, + "loss": 0.7601229, + "num_input_tokens_seen": 346015545, + "step": 16035, + "time_per_iteration": 2.755448579788208 + }, + { + "auxiliary_loss_clip": 0.01063334, + "auxiliary_loss_mlp": 0.0102838, + "balance_loss_clip": 1.02686989, + "balance_loss_mlp": 1.01762128, + "epoch": 0.9641364797835563, + "flos": 22419355259520.0, + "grad_norm": 2.0216127537542428, + "language_loss": 0.81825066, + "learning_rate": 1.3454713461955591e-08, + "loss": 0.83916783, + "num_input_tokens_seen": 346034055, + "step": 16036, + "time_per_iteration": 2.702792167663574 + }, + { + "auxiliary_loss_clip": 0.01032319, + "auxiliary_loss_mlp": 0.01029418, + "balance_loss_clip": 1.02357769, + "balance_loss_mlp": 1.01877272, + "epoch": 0.9641966030362242, + "flos": 30622357048320.0, + "grad_norm": 1.726949876570117, + "language_loss": 0.69714725, + "learning_rate": 1.340965177371789e-08, + "loss": 0.71776456, + "num_input_tokens_seen": 346054130, + "step": 16037, + "time_per_iteration": 2.7852115631103516 + }, + { + "auxiliary_loss_clip": 0.0106229, + "auxiliary_loss_mlp": 0.01025932, + "balance_loss_clip": 1.02504659, + "balance_loss_mlp": 1.01603711, + "epoch": 0.9642567262888923, + "flos": 20953122088320.0, + "grad_norm": 1.9128567710387696, + "language_loss": 0.62944663, + "learning_rate": 1.3364665417185506e-08, + "loss": 0.65032881, + "num_input_tokens_seen": 346072990, + "step": 16038, + "time_per_iteration": 4.409843444824219 + }, + { + "auxiliary_loss_clip": 0.01027881, + "auxiliary_loss_mlp": 0.0074769, + "balance_loss_clip": 1.02358282, + "balance_loss_mlp": 1.00047815, + "epoch": 0.9643168495415602, + "flos": 22639415932800.0, + "grad_norm": 2.162944621043543, + "language_loss": 0.71234775, + "learning_rate": 1.3319754394064187e-08, + "loss": 0.73010349, + "num_input_tokens_seen": 346093745, + "step": 16039, + "time_per_iteration": 4.4167211055755615 + }, + { + "auxiliary_loss_clip": 0.01020302, + "auxiliary_loss_mlp": 0.01028989, + "balance_loss_clip": 1.02345395, + "balance_loss_mlp": 1.01821828, + "epoch": 0.9643769727942282, + "flos": 20266259241600.0, + "grad_norm": 2.0472634090175696, + "language_loss": 0.72941995, + "learning_rate": 1.327491870605657e-08, + "loss": 0.74991286, + "num_input_tokens_seen": 346110115, + "step": 16040, + "time_per_iteration": 2.755614995956421 + }, + { + "auxiliary_loss_clip": 0.010523, + "auxiliary_loss_mlp": 0.01028768, + "balance_loss_clip": 1.02432525, + "balance_loss_mlp": 1.01762795, + "epoch": 0.9644370960468961, + "flos": 13881845088000.0, + "grad_norm": 2.0211541087267753, + "language_loss": 0.7328487, + "learning_rate": 1.3230158354863296e-08, + "loss": 0.75365937, + "num_input_tokens_seen": 346127165, + "step": 16041, + "time_per_iteration": 2.6904871463775635 + }, + { + "auxiliary_loss_clip": 0.01035021, + "auxiliary_loss_mlp": 0.01028527, + "balance_loss_clip": 1.02240062, + "balance_loss_mlp": 1.01911569, + "epoch": 0.9644972192995641, + "flos": 17238199829760.0, + "grad_norm": 1.7478566523785937, + "language_loss": 0.72073174, + "learning_rate": 1.3185473342181674e-08, + "loss": 0.74136716, + "num_input_tokens_seen": 346145950, + "step": 16042, + "time_per_iteration": 2.7160842418670654 + }, + { + "auxiliary_loss_clip": 0.01025906, + "auxiliary_loss_mlp": 0.01029483, + "balance_loss_clip": 1.02321684, + "balance_loss_mlp": 1.01890361, + "epoch": 0.964557342552232, + "flos": 23840340272640.0, + "grad_norm": 1.6185285655938948, + "language_loss": 0.81128806, + "learning_rate": 1.3140863669705683e-08, + "loss": 0.83184195, + "num_input_tokens_seen": 346165005, + "step": 16043, + "time_per_iteration": 2.729720115661621 + }, + { + "auxiliary_loss_clip": 0.01035735, + "auxiliary_loss_mlp": 0.01028454, + "balance_loss_clip": 1.02324939, + "balance_loss_mlp": 1.01824951, + "epoch": 0.9646174658049, + "flos": 21653129312640.0, + "grad_norm": 1.6982329709111845, + "language_loss": 0.71708637, + "learning_rate": 1.3096329339127522e-08, + "loss": 0.73772824, + "num_input_tokens_seen": 346185095, + "step": 16044, + "time_per_iteration": 2.725844144821167 + }, + { + "auxiliary_loss_clip": 0.01033616, + "auxiliary_loss_mlp": 0.01025066, + "balance_loss_clip": 1.02107525, + "balance_loss_mlp": 1.015172, + "epoch": 0.9646775890575681, + "flos": 17129570123520.0, + "grad_norm": 1.8038580383449636, + "language_loss": 0.69784045, + "learning_rate": 1.3051870352135397e-08, + "loss": 0.71842724, + "num_input_tokens_seen": 346202580, + "step": 16045, + "time_per_iteration": 2.671297311782837 + }, + { + "auxiliary_loss_clip": 0.00997032, + "auxiliary_loss_mlp": 0.01034521, + "balance_loss_clip": 1.02245498, + "balance_loss_mlp": 1.02308857, + "epoch": 0.964737712310236, + "flos": 13005732458880.0, + "grad_norm": 3.1259984479798364, + "language_loss": 0.75190556, + "learning_rate": 1.3007486710415737e-08, + "loss": 0.77222109, + "num_input_tokens_seen": 346219395, + "step": 16046, + "time_per_iteration": 2.762701988220215 + }, + { + "auxiliary_loss_clip": 0.01055622, + "auxiliary_loss_mlp": 0.01031739, + "balance_loss_clip": 1.02613866, + "balance_loss_mlp": 1.02033067, + "epoch": 0.964797835562904, + "flos": 24279240556800.0, + "grad_norm": 1.8743000155834577, + "language_loss": 0.62506098, + "learning_rate": 1.2963178415651199e-08, + "loss": 0.64593458, + "num_input_tokens_seen": 346239715, + "step": 16047, + "time_per_iteration": 2.6975746154785156 + }, + { + "auxiliary_loss_clip": 0.01044606, + "auxiliary_loss_mlp": 0.01029117, + "balance_loss_clip": 1.0273602, + "balance_loss_mlp": 1.01884103, + "epoch": 0.9648579588155719, + "flos": 20522697413760.0, + "grad_norm": 1.9172835619248607, + "language_loss": 0.69225371, + "learning_rate": 1.2918945469521992e-08, + "loss": 0.71299094, + "num_input_tokens_seen": 346258500, + "step": 16048, + "time_per_iteration": 2.6355106830596924 + }, + { + "auxiliary_loss_clip": 0.01052852, + "auxiliary_loss_mlp": 0.01026264, + "balance_loss_clip": 1.02481794, + "balance_loss_mlp": 1.01578581, + "epoch": 0.9649180820682399, + "flos": 32154844855680.0, + "grad_norm": 1.9093723908479785, + "language_loss": 0.63749361, + "learning_rate": 1.2874787873705662e-08, + "loss": 0.65828478, + "num_input_tokens_seen": 346279110, + "step": 16049, + "time_per_iteration": 2.7009451389312744 + }, + { + "auxiliary_loss_clip": 0.01051622, + "auxiliary_loss_mlp": 0.01026363, + "balance_loss_clip": 1.02484179, + "balance_loss_mlp": 1.01649833, + "epoch": 0.9649782053209078, + "flos": 20522589672960.0, + "grad_norm": 1.5134826369716947, + "language_loss": 0.70916057, + "learning_rate": 1.2830705629876427e-08, + "loss": 0.72994035, + "num_input_tokens_seen": 346297860, + "step": 16050, + "time_per_iteration": 2.6908342838287354 + }, + { + "auxiliary_loss_clip": 0.0104885, + "auxiliary_loss_mlp": 0.01036142, + "balance_loss_clip": 1.02190602, + "balance_loss_mlp": 1.02406597, + "epoch": 0.9650383285735759, + "flos": 43067953843200.0, + "grad_norm": 1.894329527901869, + "language_loss": 0.69661063, + "learning_rate": 1.278669873970606e-08, + "loss": 0.71746051, + "num_input_tokens_seen": 346319860, + "step": 16051, + "time_per_iteration": 2.7807679176330566 + }, + { + "auxiliary_loss_clip": 0.0099665, + "auxiliary_loss_mlp": 0.0100033, + "balance_loss_clip": 1.00143099, + "balance_loss_mlp": 0.99946016, + "epoch": 0.9650984518262438, + "flos": 61748255882880.0, + "grad_norm": 0.8406292247228658, + "language_loss": 0.59186089, + "learning_rate": 1.2742767204863004e-08, + "loss": 0.61183071, + "num_input_tokens_seen": 346379025, + "step": 16052, + "time_per_iteration": 3.219054698944092 + }, + { + "auxiliary_loss_clip": 0.01058176, + "auxiliary_loss_mlp": 0.01024774, + "balance_loss_clip": 1.0233897, + "balance_loss_mlp": 1.01491523, + "epoch": 0.9651585750789118, + "flos": 29789337761280.0, + "grad_norm": 1.5598347536709194, + "language_loss": 0.74477577, + "learning_rate": 1.2698911027013482e-08, + "loss": 0.76560533, + "num_input_tokens_seen": 346402250, + "step": 16053, + "time_per_iteration": 2.856126070022583 + }, + { + "auxiliary_loss_clip": 0.01035911, + "auxiliary_loss_mlp": 0.01031066, + "balance_loss_clip": 1.02400196, + "balance_loss_mlp": 1.02037871, + "epoch": 0.9652186983315797, + "flos": 16873060124160.0, + "grad_norm": 1.945505750304777, + "language_loss": 0.68687904, + "learning_rate": 1.2655130207820386e-08, + "loss": 0.70754886, + "num_input_tokens_seen": 346419555, + "step": 16054, + "time_per_iteration": 2.756265163421631 + }, + { + "auxiliary_loss_clip": 0.01053193, + "auxiliary_loss_mlp": 0.00747688, + "balance_loss_clip": 1.0272162, + "balance_loss_mlp": 1.00041187, + "epoch": 0.9652788215842477, + "flos": 31649761762560.0, + "grad_norm": 1.449965275256783, + "language_loss": 0.62343866, + "learning_rate": 1.2611424748943944e-08, + "loss": 0.64144748, + "num_input_tokens_seen": 346441245, + "step": 16055, + "time_per_iteration": 2.842571258544922 + }, + { + "auxiliary_loss_clip": 0.01029905, + "auxiliary_loss_mlp": 0.010321, + "balance_loss_clip": 1.02498317, + "balance_loss_mlp": 1.02135944, + "epoch": 0.9653389448369156, + "flos": 24754266944640.0, + "grad_norm": 1.9030475479351645, + "language_loss": 0.76573151, + "learning_rate": 1.2567794652041719e-08, + "loss": 0.78635156, + "num_input_tokens_seen": 346460065, + "step": 16056, + "time_per_iteration": 2.738051176071167 + }, + { + "auxiliary_loss_clip": 0.01036363, + "auxiliary_loss_mlp": 0.01028447, + "balance_loss_clip": 1.02343154, + "balance_loss_mlp": 1.01821923, + "epoch": 0.9653990680895836, + "flos": 20297249700480.0, + "grad_norm": 1.4840200511469874, + "language_loss": 0.71325505, + "learning_rate": 1.2524239918767498e-08, + "loss": 0.73390317, + "num_input_tokens_seen": 346478005, + "step": 16057, + "time_per_iteration": 2.711469888687134 + }, + { + "auxiliary_loss_clip": 0.01059737, + "auxiliary_loss_mlp": 0.01030021, + "balance_loss_clip": 1.02445817, + "balance_loss_mlp": 1.0201025, + "epoch": 0.9654591913422517, + "flos": 22528775064960.0, + "grad_norm": 2.2285104682191816, + "language_loss": 0.71827996, + "learning_rate": 1.2480760550773295e-08, + "loss": 0.73917758, + "num_input_tokens_seen": 346497575, + "step": 16058, + "time_per_iteration": 4.438020467758179 + }, + { + "auxiliary_loss_clip": 0.01052143, + "auxiliary_loss_mlp": 0.01033952, + "balance_loss_clip": 1.02449751, + "balance_loss_mlp": 1.02382565, + "epoch": 0.9655193145949196, + "flos": 26763002202240.0, + "grad_norm": 1.4322540938251347, + "language_loss": 0.74372482, + "learning_rate": 1.2437356549708011e-08, + "loss": 0.76458579, + "num_input_tokens_seen": 346520000, + "step": 16059, + "time_per_iteration": 2.6837384700775146 + }, + { + "auxiliary_loss_clip": 0.01041898, + "auxiliary_loss_mlp": 0.01029166, + "balance_loss_clip": 1.02389717, + "balance_loss_mlp": 1.01875901, + "epoch": 0.9655794378475876, + "flos": 41970703132800.0, + "grad_norm": 4.567970954119142, + "language_loss": 0.73571593, + "learning_rate": 1.239402791721722e-08, + "loss": 0.75642657, + "num_input_tokens_seen": 346541605, + "step": 16060, + "time_per_iteration": 2.8278160095214844 + }, + { + "auxiliary_loss_clip": 0.01039455, + "auxiliary_loss_mlp": 0.01026168, + "balance_loss_clip": 1.0245657, + "balance_loss_mlp": 1.01727557, + "epoch": 0.9656395611002555, + "flos": 27709427704320.0, + "grad_norm": 1.5932240981680417, + "language_loss": 0.76770711, + "learning_rate": 1.2350774654944273e-08, + "loss": 0.7883634, + "num_input_tokens_seen": 346560955, + "step": 16061, + "time_per_iteration": 2.8518707752227783 + }, + { + "auxiliary_loss_clip": 0.00987369, + "auxiliary_loss_mlp": 0.01003153, + "balance_loss_clip": 1.00204968, + "balance_loss_mlp": 1.0023303, + "epoch": 0.9656996843529235, + "flos": 68968562411520.0, + "grad_norm": 0.7538167563217105, + "language_loss": 0.64186442, + "learning_rate": 1.2307596764528749e-08, + "loss": 0.66176963, + "num_input_tokens_seen": 346621615, + "step": 16062, + "time_per_iteration": 3.3538312911987305 + }, + { + "auxiliary_loss_clip": 0.01008896, + "auxiliary_loss_mlp": 0.01025169, + "balance_loss_clip": 1.0195502, + "balance_loss_mlp": 1.01525688, + "epoch": 0.9657598076055914, + "flos": 20631327120000.0, + "grad_norm": 2.138359023761701, + "language_loss": 0.92992342, + "learning_rate": 1.226449424760867e-08, + "loss": 0.95026404, + "num_input_tokens_seen": 346637460, + "step": 16063, + "time_per_iteration": 2.739396572113037 + }, + { + "auxiliary_loss_clip": 0.01050378, + "auxiliary_loss_mlp": 0.0102979, + "balance_loss_clip": 1.02386081, + "balance_loss_mlp": 1.0194788, + "epoch": 0.9658199308582595, + "flos": 20448577699200.0, + "grad_norm": 1.8322909192090697, + "language_loss": 0.82152367, + "learning_rate": 1.2221467105818062e-08, + "loss": 0.84232533, + "num_input_tokens_seen": 346655625, + "step": 16064, + "time_per_iteration": 2.642340898513794 + }, + { + "auxiliary_loss_clip": 0.01047259, + "auxiliary_loss_mlp": 0.00747432, + "balance_loss_clip": 1.02538943, + "balance_loss_mlp": 1.0003711, + "epoch": 0.9658800541109274, + "flos": 24718033100160.0, + "grad_norm": 1.4842784373814977, + "language_loss": 0.8430016, + "learning_rate": 1.2178515340788731e-08, + "loss": 0.86094856, + "num_input_tokens_seen": 346675220, + "step": 16065, + "time_per_iteration": 2.6911346912384033 + }, + { + "auxiliary_loss_clip": 0.01038519, + "auxiliary_loss_mlp": 0.01029885, + "balance_loss_clip": 1.02285314, + "balance_loss_mlp": 1.01933467, + "epoch": 0.9659401773635954, + "flos": 21610035970560.0, + "grad_norm": 1.5800617992460315, + "language_loss": 0.67286175, + "learning_rate": 1.2135638954149151e-08, + "loss": 0.69354576, + "num_input_tokens_seen": 346694710, + "step": 16066, + "time_per_iteration": 2.657750129699707 + }, + { + "auxiliary_loss_clip": 0.01061581, + "auxiliary_loss_mlp": 0.01022066, + "balance_loss_clip": 1.02459502, + "balance_loss_mlp": 1.01213026, + "epoch": 0.9660003006162633, + "flos": 20301200196480.0, + "grad_norm": 1.824670613735363, + "language_loss": 0.81898099, + "learning_rate": 1.209283794752558e-08, + "loss": 0.8398174, + "num_input_tokens_seen": 346712645, + "step": 16067, + "time_per_iteration": 2.7541046142578125 + }, + { + "auxiliary_loss_clip": 0.01042927, + "auxiliary_loss_mlp": 0.01023323, + "balance_loss_clip": 1.02670598, + "balance_loss_mlp": 1.01316071, + "epoch": 0.9660604238689313, + "flos": 24461954064000.0, + "grad_norm": 2.2095392717922184, + "language_loss": 0.69260639, + "learning_rate": 1.2050112322540496e-08, + "loss": 0.71326894, + "num_input_tokens_seen": 346732375, + "step": 16068, + "time_per_iteration": 2.743223190307617 + }, + { + "auxiliary_loss_clip": 0.0103912, + "auxiliary_loss_mlp": 0.01028417, + "balance_loss_clip": 1.02118742, + "balance_loss_mlp": 1.0194999, + "epoch": 0.9661205471215992, + "flos": 19864023765120.0, + "grad_norm": 1.7315897133217604, + "language_loss": 0.6789937, + "learning_rate": 1.20074620808146e-08, + "loss": 0.699669, + "num_input_tokens_seen": 346750430, + "step": 16069, + "time_per_iteration": 2.7306134700775146 + }, + { + "auxiliary_loss_clip": 0.01043701, + "auxiliary_loss_mlp": 0.01025736, + "balance_loss_clip": 1.02561188, + "balance_loss_mlp": 1.01591301, + "epoch": 0.9661806703742672, + "flos": 20557889763840.0, + "grad_norm": 1.9602855547041953, + "language_loss": 0.89056849, + "learning_rate": 1.1964887223964826e-08, + "loss": 0.91126287, + "num_input_tokens_seen": 346768455, + "step": 16070, + "time_per_iteration": 2.779540777206421 + }, + { + "auxiliary_loss_clip": 0.01064642, + "auxiliary_loss_mlp": 0.0102903, + "balance_loss_clip": 1.02704751, + "balance_loss_mlp": 1.01830077, + "epoch": 0.9662407936269353, + "flos": 21430949736960.0, + "grad_norm": 1.7579989316986382, + "language_loss": 0.77235615, + "learning_rate": 1.1922387753605878e-08, + "loss": 0.79329288, + "num_input_tokens_seen": 346786530, + "step": 16071, + "time_per_iteration": 2.672685384750366 + }, + { + "auxiliary_loss_clip": 0.01032914, + "auxiliary_loss_mlp": 0.01028213, + "balance_loss_clip": 1.02072561, + "balance_loss_mlp": 1.01623893, + "epoch": 0.9663009168796032, + "flos": 14902893095040.0, + "grad_norm": 1.7257580771439842, + "language_loss": 0.65861678, + "learning_rate": 1.1879963671349137e-08, + "loss": 0.67922801, + "num_input_tokens_seen": 346804635, + "step": 16072, + "time_per_iteration": 2.778078317642212 + }, + { + "auxiliary_loss_clip": 0.01052819, + "auxiliary_loss_mlp": 0.01024623, + "balance_loss_clip": 1.0254457, + "balance_loss_mlp": 1.01468658, + "epoch": 0.9663610401322712, + "flos": 24310877460480.0, + "grad_norm": 1.630836707458602, + "language_loss": 0.77199578, + "learning_rate": 1.1837614978803534e-08, + "loss": 0.79277027, + "num_input_tokens_seen": 346823070, + "step": 16073, + "time_per_iteration": 2.7655954360961914 + }, + { + "auxiliary_loss_clip": 0.01066052, + "auxiliary_loss_mlp": 0.0103329, + "balance_loss_clip": 1.02655292, + "balance_loss_mlp": 1.0223465, + "epoch": 0.9664211633849391, + "flos": 17637849527040.0, + "grad_norm": 2.1396976775109446, + "language_loss": 0.75879407, + "learning_rate": 1.1795341677574677e-08, + "loss": 0.77978742, + "num_input_tokens_seen": 346841180, + "step": 16074, + "time_per_iteration": 2.6445114612579346 + }, + { + "auxiliary_loss_clip": 0.01044883, + "auxiliary_loss_mlp": 0.01030933, + "balance_loss_clip": 1.0256145, + "balance_loss_mlp": 1.0203054, + "epoch": 0.9664812866376071, + "flos": 29789409588480.0, + "grad_norm": 1.5197563682822801, + "language_loss": 0.75809944, + "learning_rate": 1.1753143769265728e-08, + "loss": 0.77885759, + "num_input_tokens_seen": 346864250, + "step": 16075, + "time_per_iteration": 2.747239589691162 + }, + { + "auxiliary_loss_clip": 0.01032779, + "auxiliary_loss_mlp": 0.01029978, + "balance_loss_clip": 1.02534997, + "balance_loss_mlp": 1.01951694, + "epoch": 0.966541409890275, + "flos": 14282320798080.0, + "grad_norm": 1.8093571460494924, + "language_loss": 0.78673369, + "learning_rate": 1.171102125547696e-08, + "loss": 0.80736125, + "num_input_tokens_seen": 346881955, + "step": 16076, + "time_per_iteration": 4.584211826324463 + }, + { + "auxiliary_loss_clip": 0.01047447, + "auxiliary_loss_mlp": 0.01037007, + "balance_loss_clip": 1.02809381, + "balance_loss_mlp": 1.02543807, + "epoch": 0.9666015331429431, + "flos": 19860432405120.0, + "grad_norm": 1.6586935598769381, + "language_loss": 0.72371757, + "learning_rate": 1.166897413780532e-08, + "loss": 0.74456215, + "num_input_tokens_seen": 346900445, + "step": 16077, + "time_per_iteration": 2.697976589202881 + }, + { + "auxiliary_loss_clip": 0.01044424, + "auxiliary_loss_mlp": 0.01027937, + "balance_loss_clip": 1.02237201, + "balance_loss_mlp": 1.01723766, + "epoch": 0.966661656395611, + "flos": 27125951178240.0, + "grad_norm": 1.7775577638818107, + "language_loss": 0.5968557, + "learning_rate": 1.1627002417845533e-08, + "loss": 0.61757934, + "num_input_tokens_seen": 346920135, + "step": 16078, + "time_per_iteration": 2.661233425140381 + }, + { + "auxiliary_loss_clip": 0.01052798, + "auxiliary_loss_mlp": 0.01030798, + "balance_loss_clip": 1.02457786, + "balance_loss_mlp": 1.02007473, + "epoch": 0.966721779648279, + "flos": 21508229848320.0, + "grad_norm": 1.846090252325684, + "language_loss": 0.72188056, + "learning_rate": 1.158510609718899e-08, + "loss": 0.74271649, + "num_input_tokens_seen": 346940450, + "step": 16079, + "time_per_iteration": 2.7029809951782227 + }, + { + "auxiliary_loss_clip": 0.01049357, + "auxiliary_loss_mlp": 0.01025333, + "balance_loss_clip": 1.02461183, + "balance_loss_mlp": 1.01570678, + "epoch": 0.9667819029009469, + "flos": 23878118401920.0, + "grad_norm": 1.6770356225929637, + "language_loss": 0.72375488, + "learning_rate": 1.1543285177424644e-08, + "loss": 0.74450183, + "num_input_tokens_seen": 346960935, + "step": 16080, + "time_per_iteration": 2.771461248397827 + }, + { + "auxiliary_loss_clip": 0.01027823, + "auxiliary_loss_mlp": 0.01030652, + "balance_loss_clip": 1.02187848, + "balance_loss_mlp": 1.0194881, + "epoch": 0.9668420261536149, + "flos": 21507224267520.0, + "grad_norm": 1.853058504651927, + "language_loss": 0.73827785, + "learning_rate": 1.1501539660138115e-08, + "loss": 0.75886261, + "num_input_tokens_seen": 346980100, + "step": 16081, + "time_per_iteration": 2.972888946533203 + }, + { + "auxiliary_loss_clip": 0.01033127, + "auxiliary_loss_mlp": 0.01025481, + "balance_loss_clip": 1.02216816, + "balance_loss_mlp": 1.01482975, + "epoch": 0.9669021494062828, + "flos": 26687266375680.0, + "grad_norm": 1.8099648185232249, + "language_loss": 0.67788577, + "learning_rate": 1.145986954691236e-08, + "loss": 0.69847178, + "num_input_tokens_seen": 347001250, + "step": 16082, + "time_per_iteration": 2.8491456508636475 + }, + { + "auxiliary_loss_clip": 0.01017515, + "auxiliary_loss_mlp": 0.01032826, + "balance_loss_clip": 1.02152801, + "balance_loss_mlp": 1.02127445, + "epoch": 0.9669622726589508, + "flos": 29825032901760.0, + "grad_norm": 1.5996138883194149, + "language_loss": 0.76752126, + "learning_rate": 1.141827483932789e-08, + "loss": 0.78802466, + "num_input_tokens_seen": 347022975, + "step": 16083, + "time_per_iteration": 2.822366237640381 + }, + { + "auxiliary_loss_clip": 0.01017914, + "auxiliary_loss_mlp": 0.01029143, + "balance_loss_clip": 1.02317548, + "balance_loss_mlp": 1.01872396, + "epoch": 0.9670223959116189, + "flos": 22922499018240.0, + "grad_norm": 1.9154081184506067, + "language_loss": 0.79281694, + "learning_rate": 1.1376755538961669e-08, + "loss": 0.8132875, + "num_input_tokens_seen": 347038780, + "step": 16084, + "time_per_iteration": 2.9031283855438232 + }, + { + "auxiliary_loss_clip": 0.01053292, + "auxiliary_loss_mlp": 0.01025605, + "balance_loss_clip": 1.02371335, + "balance_loss_mlp": 1.01454866, + "epoch": 0.9670825191642868, + "flos": 18624495283200.0, + "grad_norm": 2.0562390627822476, + "language_loss": 0.67254335, + "learning_rate": 1.1335311647387991e-08, + "loss": 0.69333231, + "num_input_tokens_seen": 347056705, + "step": 16085, + "time_per_iteration": 4.32056188583374 + }, + { + "auxiliary_loss_clip": 0.01043582, + "auxiliary_loss_mlp": 0.01027351, + "balance_loss_clip": 1.02446282, + "balance_loss_mlp": 1.01629424, + "epoch": 0.9671426424169548, + "flos": 24497936513280.0, + "grad_norm": 3.051338844577573, + "language_loss": 0.68465775, + "learning_rate": 1.1293943166178709e-08, + "loss": 0.70536715, + "num_input_tokens_seen": 347075710, + "step": 16086, + "time_per_iteration": 2.818007707595825 + }, + { + "auxiliary_loss_clip": 0.01046523, + "auxiliary_loss_mlp": 0.01029606, + "balance_loss_clip": 1.02269042, + "balance_loss_mlp": 1.01886487, + "epoch": 0.9672027656696227, + "flos": 20371189847040.0, + "grad_norm": 2.1535422180306925, + "language_loss": 0.78510612, + "learning_rate": 1.125265009690235e-08, + "loss": 0.80586743, + "num_input_tokens_seen": 347092325, + "step": 16087, + "time_per_iteration": 4.2293901443481445 + }, + { + "auxiliary_loss_clip": 0.01030603, + "auxiliary_loss_mlp": 0.01024076, + "balance_loss_clip": 1.02142262, + "balance_loss_mlp": 1.01397288, + "epoch": 0.9672628889222907, + "flos": 18880179269760.0, + "grad_norm": 1.888616089464616, + "language_loss": 0.71306193, + "learning_rate": 1.1211432441124769e-08, + "loss": 0.73360872, + "num_input_tokens_seen": 347110595, + "step": 16088, + "time_per_iteration": 2.6423420906066895 + }, + { + "auxiliary_loss_clip": 0.01059004, + "auxiliary_loss_mlp": 0.0074759, + "balance_loss_clip": 1.02438188, + "balance_loss_mlp": 1.00032473, + "epoch": 0.9673230121749586, + "flos": 28695247447680.0, + "grad_norm": 1.9522093967268153, + "language_loss": 0.70437038, + "learning_rate": 1.117029020040916e-08, + "loss": 0.72243631, + "num_input_tokens_seen": 347131625, + "step": 16089, + "time_per_iteration": 2.6630165576934814 + }, + { + "auxiliary_loss_clip": 0.01064021, + "auxiliary_loss_mlp": 0.01032036, + "balance_loss_clip": 1.02574122, + "balance_loss_mlp": 1.0216713, + "epoch": 0.9673831354276267, + "flos": 20484452407680.0, + "grad_norm": 2.459574148458249, + "language_loss": 0.74912584, + "learning_rate": 1.1129223376315167e-08, + "loss": 0.77008641, + "num_input_tokens_seen": 347147910, + "step": 16090, + "time_per_iteration": 2.638523817062378 + }, + { + "auxiliary_loss_clip": 0.01043707, + "auxiliary_loss_mlp": 0.01029802, + "balance_loss_clip": 1.02485979, + "balance_loss_mlp": 1.01938868, + "epoch": 0.9674432586802946, + "flos": 26797548107520.0, + "grad_norm": 1.6837508353127213, + "language_loss": 0.68863392, + "learning_rate": 1.1088231970400653e-08, + "loss": 0.70936894, + "num_input_tokens_seen": 347168805, + "step": 16091, + "time_per_iteration": 2.746245861053467 + }, + { + "auxiliary_loss_clip": 0.01061751, + "auxiliary_loss_mlp": 0.01026767, + "balance_loss_clip": 1.02471006, + "balance_loss_mlp": 1.0159781, + "epoch": 0.9675033819329626, + "flos": 22310941034880.0, + "grad_norm": 1.7551622511296225, + "language_loss": 0.76837486, + "learning_rate": 1.1047315984219484e-08, + "loss": 0.78926003, + "num_input_tokens_seen": 347189455, + "step": 16092, + "time_per_iteration": 2.6573901176452637 + }, + { + "auxiliary_loss_clip": 0.01061403, + "auxiliary_loss_mlp": 0.01026198, + "balance_loss_clip": 1.02494931, + "balance_loss_mlp": 1.01678658, + "epoch": 0.9675635051856305, + "flos": 12675713276160.0, + "grad_norm": 2.849393417911346, + "language_loss": 0.76321483, + "learning_rate": 1.1006475419323313e-08, + "loss": 0.78409088, + "num_input_tokens_seen": 347206030, + "step": 16093, + "time_per_iteration": 2.650221109390259 + }, + { + "auxiliary_loss_clip": 0.01041369, + "auxiliary_loss_mlp": 0.01028575, + "balance_loss_clip": 1.02474713, + "balance_loss_mlp": 1.01729774, + "epoch": 0.9676236284382985, + "flos": 24608469640320.0, + "grad_norm": 1.4967797297471126, + "language_loss": 0.69018769, + "learning_rate": 1.096571027726112e-08, + "loss": 0.71088719, + "num_input_tokens_seen": 347226250, + "step": 16094, + "time_per_iteration": 2.799835681915283 + }, + { + "auxiliary_loss_clip": 0.01053451, + "auxiliary_loss_mlp": 0.01025198, + "balance_loss_clip": 1.02530062, + "balance_loss_mlp": 1.01527405, + "epoch": 0.9676837516909664, + "flos": 23367145478400.0, + "grad_norm": 1.482796892357651, + "language_loss": 0.76049209, + "learning_rate": 1.0925020559578557e-08, + "loss": 0.78127855, + "num_input_tokens_seen": 347247350, + "step": 16095, + "time_per_iteration": 2.7109227180480957 + }, + { + "auxiliary_loss_clip": 0.01066344, + "auxiliary_loss_mlp": 0.01034341, + "balance_loss_clip": 1.02754486, + "balance_loss_mlp": 1.02341568, + "epoch": 0.9677438749436345, + "flos": 20486894532480.0, + "grad_norm": 2.0267506902215047, + "language_loss": 0.70727396, + "learning_rate": 1.0884406267818392e-08, + "loss": 0.72828078, + "num_input_tokens_seen": 347266870, + "step": 16096, + "time_per_iteration": 2.6885061264038086 + }, + { + "auxiliary_loss_clip": 0.01041398, + "auxiliary_loss_mlp": 0.01026108, + "balance_loss_clip": 1.02396941, + "balance_loss_mlp": 1.01593935, + "epoch": 0.9678039981963025, + "flos": 47555889719040.0, + "grad_norm": 2.2848591231437534, + "language_loss": 0.71902227, + "learning_rate": 1.0843867403520946e-08, + "loss": 0.73969734, + "num_input_tokens_seen": 347290120, + "step": 16097, + "time_per_iteration": 2.9350244998931885 + }, + { + "auxiliary_loss_clip": 0.01061043, + "auxiliary_loss_mlp": 0.01030142, + "balance_loss_clip": 1.02474737, + "balance_loss_mlp": 1.01980639, + "epoch": 0.9678641214489704, + "flos": 25040474513280.0, + "grad_norm": 2.1240866272774412, + "language_loss": 0.78094029, + "learning_rate": 1.0803403968223434e-08, + "loss": 0.80185223, + "num_input_tokens_seen": 347308785, + "step": 16098, + "time_per_iteration": 2.692939281463623 + }, + { + "auxiliary_loss_clip": 0.01033694, + "auxiliary_loss_mlp": 0.01025882, + "balance_loss_clip": 1.0256474, + "balance_loss_mlp": 1.01643443, + "epoch": 0.9679242447016384, + "flos": 19240937516160.0, + "grad_norm": 1.7132217833508883, + "language_loss": 0.90349138, + "learning_rate": 1.0763015963459965e-08, + "loss": 0.92408711, + "num_input_tokens_seen": 347326375, + "step": 16099, + "time_per_iteration": 2.6469013690948486 + }, + { + "auxiliary_loss_clip": 0.01052392, + "auxiliary_loss_mlp": 0.01031909, + "balance_loss_clip": 1.024261, + "balance_loss_mlp": 1.02102494, + "epoch": 0.9679843679543063, + "flos": 33254681345280.0, + "grad_norm": 1.6658280976406905, + "language_loss": 0.66217625, + "learning_rate": 1.0722703390762643e-08, + "loss": 0.68301922, + "num_input_tokens_seen": 347348250, + "step": 16100, + "time_per_iteration": 2.801016092300415 + }, + { + "auxiliary_loss_clip": 0.01035414, + "auxiliary_loss_mlp": 0.01029689, + "balance_loss_clip": 1.02805781, + "balance_loss_mlp": 1.01904404, + "epoch": 0.9680444912069743, + "flos": 22783633038720.0, + "grad_norm": 1.6439611630527513, + "language_loss": 0.73590839, + "learning_rate": 1.0682466251659584e-08, + "loss": 0.75655943, + "num_input_tokens_seen": 347367400, + "step": 16101, + "time_per_iteration": 2.807626485824585 + }, + { + "auxiliary_loss_clip": 0.01041055, + "auxiliary_loss_mlp": 0.01028426, + "balance_loss_clip": 1.02426636, + "balance_loss_mlp": 1.01776242, + "epoch": 0.9681046144596422, + "flos": 24024095274240.0, + "grad_norm": 3.3520645597882113, + "language_loss": 0.73735452, + "learning_rate": 1.0642304547676672e-08, + "loss": 0.75804931, + "num_input_tokens_seen": 347387600, + "step": 16102, + "time_per_iteration": 2.769341230392456 + }, + { + "auxiliary_loss_clip": 0.01028845, + "auxiliary_loss_mlp": 0.01034359, + "balance_loss_clip": 1.0276475, + "balance_loss_mlp": 1.02262926, + "epoch": 0.9681647377123103, + "flos": 23441013797760.0, + "grad_norm": 1.7761604892415315, + "language_loss": 0.77503681, + "learning_rate": 1.0602218280337139e-08, + "loss": 0.7956689, + "num_input_tokens_seen": 347406915, + "step": 16103, + "time_per_iteration": 2.8706395626068115 + }, + { + "auxiliary_loss_clip": 0.01036121, + "auxiliary_loss_mlp": 0.01027963, + "balance_loss_clip": 1.02114332, + "balance_loss_mlp": 1.01824403, + "epoch": 0.9682248609649782, + "flos": 22675075159680.0, + "grad_norm": 2.0037122485771524, + "language_loss": 0.80308735, + "learning_rate": 1.0562207451160655e-08, + "loss": 0.82372808, + "num_input_tokens_seen": 347425140, + "step": 16104, + "time_per_iteration": 2.6439249515533447 + }, + { + "auxiliary_loss_clip": 0.01039428, + "auxiliary_loss_mlp": 0.01030928, + "balance_loss_clip": 1.0200603, + "balance_loss_mlp": 1.02173126, + "epoch": 0.9682849842176462, + "flos": 24428413739520.0, + "grad_norm": 1.423826905568692, + "language_loss": 0.77559984, + "learning_rate": 1.0522272061664672e-08, + "loss": 0.79630339, + "num_input_tokens_seen": 347446350, + "step": 16105, + "time_per_iteration": 4.467924356460571 + }, + { + "auxiliary_loss_clip": 0.00974711, + "auxiliary_loss_mlp": 0.01000333, + "balance_loss_clip": 1.0012598, + "balance_loss_mlp": 0.99946833, + "epoch": 0.9683451074703141, + "flos": 59995132784640.0, + "grad_norm": 0.820264877843422, + "language_loss": 0.56746531, + "learning_rate": 1.0482412113363536e-08, + "loss": 0.58721572, + "num_input_tokens_seen": 347510135, + "step": 16106, + "time_per_iteration": 3.3758046627044678 + }, + { + "auxiliary_loss_clip": 0.00984719, + "auxiliary_loss_mlp": 0.01001669, + "balance_loss_clip": 1.00817275, + "balance_loss_mlp": 1.00073934, + "epoch": 0.9684052307229821, + "flos": 52696145514240.0, + "grad_norm": 0.8753294074528016, + "language_loss": 0.61589754, + "learning_rate": 1.0442627607768707e-08, + "loss": 0.63576138, + "num_input_tokens_seen": 347562505, + "step": 16107, + "time_per_iteration": 3.154038429260254 + }, + { + "auxiliary_loss_clip": 0.01051619, + "auxiliary_loss_mlp": 0.01034752, + "balance_loss_clip": 1.02514422, + "balance_loss_mlp": 1.02313495, + "epoch": 0.96846535397565, + "flos": 22783848520320.0, + "grad_norm": 2.476370647255276, + "language_loss": 0.73856163, + "learning_rate": 1.040291854638875e-08, + "loss": 0.7594254, + "num_input_tokens_seen": 347579150, + "step": 16108, + "time_per_iteration": 2.6601498126983643 + }, + { + "auxiliary_loss_clip": 0.01044529, + "auxiliary_loss_mlp": 0.01026404, + "balance_loss_clip": 1.02316773, + "balance_loss_mlp": 1.01540089, + "epoch": 0.968525477228318, + "flos": 23323980309120.0, + "grad_norm": 4.4153843786268006, + "language_loss": 0.57232225, + "learning_rate": 1.0363284930729576e-08, + "loss": 0.59303159, + "num_input_tokens_seen": 347596705, + "step": 16109, + "time_per_iteration": 2.5866622924804688 + }, + { + "auxiliary_loss_clip": 0.00997403, + "auxiliary_loss_mlp": 0.01005654, + "balance_loss_clip": 1.00192714, + "balance_loss_mlp": 1.00480163, + "epoch": 0.9685856004809861, + "flos": 67882947707520.0, + "grad_norm": 0.6904406744663413, + "language_loss": 0.54280746, + "learning_rate": 1.0323726762294205e-08, + "loss": 0.56283802, + "num_input_tokens_seen": 347661870, + "step": 16110, + "time_per_iteration": 3.1788625717163086 + }, + { + "auxiliary_loss_clip": 0.0099789, + "auxiliary_loss_mlp": 0.01036505, + "balance_loss_clip": 1.02130306, + "balance_loss_mlp": 1.02407789, + "epoch": 0.968645723733654, + "flos": 33947900899200.0, + "grad_norm": 1.4267323669802776, + "language_loss": 0.62726742, + "learning_rate": 1.0284244042582325e-08, + "loss": 0.64761138, + "num_input_tokens_seen": 347684295, + "step": 16111, + "time_per_iteration": 2.87558650970459 + }, + { + "auxiliary_loss_clip": 0.01037806, + "auxiliary_loss_mlp": 0.01024691, + "balance_loss_clip": 1.02223086, + "balance_loss_mlp": 1.01587534, + "epoch": 0.968705846986322, + "flos": 18551488890240.0, + "grad_norm": 2.0784873431365156, + "language_loss": 0.74585694, + "learning_rate": 1.024483677309118e-08, + "loss": 0.76648194, + "num_input_tokens_seen": 347702585, + "step": 16112, + "time_per_iteration": 2.829143762588501 + }, + { + "auxiliary_loss_clip": 0.01051902, + "auxiliary_loss_mlp": 0.01026187, + "balance_loss_clip": 1.02553344, + "balance_loss_mlp": 1.0164063, + "epoch": 0.9687659702389899, + "flos": 17420913336960.0, + "grad_norm": 2.0814454743692843, + "language_loss": 0.66754615, + "learning_rate": 1.020550495531558e-08, + "loss": 0.68832701, + "num_input_tokens_seen": 347721810, + "step": 16113, + "time_per_iteration": 2.8140673637390137 + }, + { + "auxiliary_loss_clip": 0.00997369, + "auxiliary_loss_mlp": 0.01004424, + "balance_loss_clip": 1.00246787, + "balance_loss_mlp": 1.00349414, + "epoch": 0.9688260934916579, + "flos": 62047176865920.0, + "grad_norm": 0.6964016965819917, + "language_loss": 0.56588173, + "learning_rate": 1.0166248590746329e-08, + "loss": 0.58589965, + "num_input_tokens_seen": 347782330, + "step": 16114, + "time_per_iteration": 3.1928281784057617 + }, + { + "auxiliary_loss_clip": 0.01035241, + "auxiliary_loss_mlp": 0.0103329, + "balance_loss_clip": 1.02320659, + "balance_loss_mlp": 1.02225137, + "epoch": 0.9688862167443258, + "flos": 15076520461440.0, + "grad_norm": 1.9185791163363723, + "language_loss": 0.82535148, + "learning_rate": 1.0127067680872458e-08, + "loss": 0.84603679, + "num_input_tokens_seen": 347794835, + "step": 16115, + "time_per_iteration": 2.7922656536102295 + }, + { + "auxiliary_loss_clip": 0.01047921, + "auxiliary_loss_mlp": 0.01023892, + "balance_loss_clip": 1.0248723, + "balance_loss_mlp": 1.01476634, + "epoch": 0.9689463399969939, + "flos": 19938215306880.0, + "grad_norm": 1.669228141849784, + "language_loss": 0.72076702, + "learning_rate": 1.0087962227179448e-08, + "loss": 0.74148518, + "num_input_tokens_seen": 347814320, + "step": 16116, + "time_per_iteration": 2.7060611248016357 + }, + { + "auxiliary_loss_clip": 0.01026824, + "auxiliary_loss_mlp": 0.0103249, + "balance_loss_clip": 1.02487969, + "balance_loss_mlp": 1.02126002, + "epoch": 0.9690064632496618, + "flos": 19573039687680.0, + "grad_norm": 2.107155409788758, + "language_loss": 0.76000845, + "learning_rate": 1.0048932231150553e-08, + "loss": 0.78060162, + "num_input_tokens_seen": 347832125, + "step": 16117, + "time_per_iteration": 2.896765947341919 + }, + { + "auxiliary_loss_clip": 0.01062414, + "auxiliary_loss_mlp": 0.01028034, + "balance_loss_clip": 1.02482843, + "balance_loss_mlp": 1.01721001, + "epoch": 0.9690665865023298, + "flos": 21872292145920.0, + "grad_norm": 2.4280448701443684, + "language_loss": 0.77347016, + "learning_rate": 1.000997769426548e-08, + "loss": 0.79437459, + "num_input_tokens_seen": 347850765, + "step": 16118, + "time_per_iteration": 2.706895112991333 + }, + { + "auxiliary_loss_clip": 0.01036046, + "auxiliary_loss_mlp": 0.0074753, + "balance_loss_clip": 1.02245259, + "balance_loss_mlp": 1.00040412, + "epoch": 0.9691267097549977, + "flos": 20994491577600.0, + "grad_norm": 1.644236385402325, + "language_loss": 0.78166175, + "learning_rate": 9.971098618001272e-09, + "loss": 0.79949749, + "num_input_tokens_seen": 347870125, + "step": 16119, + "time_per_iteration": 2.7609686851501465 + }, + { + "auxiliary_loss_clip": 0.01011645, + "auxiliary_loss_mlp": 0.01029907, + "balance_loss_clip": 1.02058482, + "balance_loss_mlp": 1.01947045, + "epoch": 0.9691868330076657, + "flos": 24279132816000.0, + "grad_norm": 1.5065261575388769, + "language_loss": 0.75636649, + "learning_rate": 9.932295003832747e-09, + "loss": 0.77678204, + "num_input_tokens_seen": 347890615, + "step": 16120, + "time_per_iteration": 2.8706493377685547 + }, + { + "auxiliary_loss_clip": 0.01051343, + "auxiliary_loss_mlp": 0.01029213, + "balance_loss_clip": 1.02501106, + "balance_loss_mlp": 1.01959872, + "epoch": 0.9692469562603336, + "flos": 17675699483520.0, + "grad_norm": 3.9381537122240755, + "language_loss": 0.69682443, + "learning_rate": 9.89356685323095e-09, + "loss": 0.71763003, + "num_input_tokens_seen": 347908685, + "step": 16121, + "time_per_iteration": 2.697150468826294 + }, + { + "auxiliary_loss_clip": 0.01050082, + "auxiliary_loss_mlp": 0.01026968, + "balance_loss_clip": 1.02384186, + "balance_loss_mlp": 1.01692462, + "epoch": 0.9693070795130017, + "flos": 26834392483200.0, + "grad_norm": 3.197328097111792, + "language_loss": 0.68733466, + "learning_rate": 9.854914167664486e-09, + "loss": 0.70810521, + "num_input_tokens_seen": 347926385, + "step": 16122, + "time_per_iteration": 2.6907408237457275 + }, + { + "auxiliary_loss_clip": 0.01017366, + "auxiliary_loss_mlp": 0.01027884, + "balance_loss_clip": 1.01983595, + "balance_loss_mlp": 1.01725698, + "epoch": 0.9693672027656697, + "flos": 18077288515200.0, + "grad_norm": 1.6501723726419144, + "language_loss": 0.75387621, + "learning_rate": 9.81633694859907e-09, + "loss": 0.77432871, + "num_input_tokens_seen": 347945290, + "step": 16123, + "time_per_iteration": 4.533437490463257 + }, + { + "auxiliary_loss_clip": 0.01026255, + "auxiliary_loss_mlp": 0.01034406, + "balance_loss_clip": 1.02284193, + "balance_loss_mlp": 1.02286053, + "epoch": 0.9694273260183376, + "flos": 21763015994880.0, + "grad_norm": 1.5396617958752974, + "language_loss": 0.74975812, + "learning_rate": 9.777835197497753e-09, + "loss": 0.77036476, + "num_input_tokens_seen": 347966330, + "step": 16124, + "time_per_iteration": 2.6929831504821777 + }, + { + "auxiliary_loss_clip": 0.01052375, + "auxiliary_loss_mlp": 0.01030424, + "balance_loss_clip": 1.02472281, + "balance_loss_mlp": 1.02017772, + "epoch": 0.9694874492710056, + "flos": 24426115269120.0, + "grad_norm": 2.830165923067552, + "language_loss": 0.73635614, + "learning_rate": 9.739408915820258e-09, + "loss": 0.75718415, + "num_input_tokens_seen": 347982590, + "step": 16125, + "time_per_iteration": 2.6661601066589355 + }, + { + "auxiliary_loss_clip": 0.00996874, + "auxiliary_loss_mlp": 0.01001411, + "balance_loss_clip": 1.00151265, + "balance_loss_mlp": 1.00054669, + "epoch": 0.9695475725236735, + "flos": 67650748237440.0, + "grad_norm": 0.8709608791908362, + "language_loss": 0.61490709, + "learning_rate": 9.70105810502364e-09, + "loss": 0.63488996, + "num_input_tokens_seen": 348043310, + "step": 16126, + "time_per_iteration": 3.1235134601593018 + }, + { + "auxiliary_loss_clip": 0.01052011, + "auxiliary_loss_mlp": 0.01032749, + "balance_loss_clip": 1.02629423, + "balance_loss_mlp": 1.02284849, + "epoch": 0.9696076957763415, + "flos": 19129326981120.0, + "grad_norm": 1.60419249606102, + "language_loss": 0.74873209, + "learning_rate": 9.662782766562738e-09, + "loss": 0.76957977, + "num_input_tokens_seen": 348062200, + "step": 16127, + "time_per_iteration": 2.679645299911499 + }, + { + "auxiliary_loss_clip": 0.01020083, + "auxiliary_loss_mlp": 0.01029989, + "balance_loss_clip": 1.02290142, + "balance_loss_mlp": 1.01903343, + "epoch": 0.9696678190290094, + "flos": 15486836497920.0, + "grad_norm": 1.5835045582056393, + "language_loss": 0.68921274, + "learning_rate": 9.62458290188839e-09, + "loss": 0.70971346, + "num_input_tokens_seen": 348080685, + "step": 16128, + "time_per_iteration": 2.7565815448760986 + }, + { + "auxiliary_loss_clip": 0.01030757, + "auxiliary_loss_mlp": 0.01033093, + "balance_loss_clip": 1.02455866, + "balance_loss_mlp": 1.02231622, + "epoch": 0.9697279422816775, + "flos": 36208692869760.0, + "grad_norm": 1.53906707109548, + "language_loss": 0.65438783, + "learning_rate": 9.586458512449213e-09, + "loss": 0.67502642, + "num_input_tokens_seen": 348102500, + "step": 16129, + "time_per_iteration": 2.862940549850464 + }, + { + "auxiliary_loss_clip": 0.01037407, + "auxiliary_loss_mlp": 0.01026835, + "balance_loss_clip": 1.02667332, + "balance_loss_mlp": 1.01590919, + "epoch": 0.9697880655343454, + "flos": 25484007651840.0, + "grad_norm": 2.2213997504464147, + "language_loss": 0.63326007, + "learning_rate": 9.548409599691166e-09, + "loss": 0.65390247, + "num_input_tokens_seen": 348122515, + "step": 16130, + "time_per_iteration": 2.856705904006958 + }, + { + "auxiliary_loss_clip": 0.01049787, + "auxiliary_loss_mlp": 0.01027983, + "balance_loss_clip": 1.02320099, + "balance_loss_mlp": 1.0173378, + "epoch": 0.9698481887870134, + "flos": 15333533251200.0, + "grad_norm": 2.435211709959644, + "language_loss": 0.70328319, + "learning_rate": 9.510436165056867e-09, + "loss": 0.72406089, + "num_input_tokens_seen": 348138775, + "step": 16131, + "time_per_iteration": 2.6160929203033447 + }, + { + "auxiliary_loss_clip": 0.01062972, + "auxiliary_loss_mlp": 0.00747611, + "balance_loss_clip": 1.02452397, + "balance_loss_mlp": 1.00040889, + "epoch": 0.9699083120396813, + "flos": 21982250655360.0, + "grad_norm": 2.269710458629869, + "language_loss": 0.76823139, + "learning_rate": 9.472538209986058e-09, + "loss": 0.78633714, + "num_input_tokens_seen": 348157115, + "step": 16132, + "time_per_iteration": 2.6396355628967285 + }, + { + "auxiliary_loss_clip": 0.01029645, + "auxiliary_loss_mlp": 0.0103083, + "balance_loss_clip": 1.02303767, + "balance_loss_mlp": 1.01955831, + "epoch": 0.9699684352923493, + "flos": 15664055224320.0, + "grad_norm": 3.56227871981195, + "language_loss": 0.78953373, + "learning_rate": 9.434715735916477e-09, + "loss": 0.81013852, + "num_input_tokens_seen": 348173035, + "step": 16133, + "time_per_iteration": 4.2567877769470215 + }, + { + "auxiliary_loss_clip": 0.01039402, + "auxiliary_loss_mlp": 0.01026714, + "balance_loss_clip": 1.0244354, + "balance_loss_mlp": 1.01724827, + "epoch": 0.9700285585450172, + "flos": 21908382336000.0, + "grad_norm": 1.7647927323930606, + "language_loss": 0.64560819, + "learning_rate": 9.396968744281863e-09, + "loss": 0.6662693, + "num_input_tokens_seen": 348192960, + "step": 16134, + "time_per_iteration": 4.323192834854126 + }, + { + "auxiliary_loss_clip": 0.01036676, + "auxiliary_loss_mlp": 0.01029751, + "balance_loss_clip": 1.0224936, + "balance_loss_mlp": 1.01883125, + "epoch": 0.9700886817976853, + "flos": 23914890950400.0, + "grad_norm": 2.2135047094637343, + "language_loss": 0.80728793, + "learning_rate": 9.359297236513519e-09, + "loss": 0.82795215, + "num_input_tokens_seen": 348212805, + "step": 16135, + "time_per_iteration": 2.6831605434417725 + }, + { + "auxiliary_loss_clip": 0.01053957, + "auxiliary_loss_mlp": 0.01029382, + "balance_loss_clip": 1.02505064, + "balance_loss_mlp": 1.01833105, + "epoch": 0.9701488050503532, + "flos": 25447845634560.0, + "grad_norm": 1.7780229354576127, + "language_loss": 0.73077834, + "learning_rate": 9.321701214040079e-09, + "loss": 0.75161171, + "num_input_tokens_seen": 348232900, + "step": 16136, + "time_per_iteration": 2.6841161251068115 + }, + { + "auxiliary_loss_clip": 0.01061992, + "auxiliary_loss_mlp": 0.01025492, + "balance_loss_clip": 1.02628851, + "balance_loss_mlp": 1.01613402, + "epoch": 0.9702089283030212, + "flos": 20590855470720.0, + "grad_norm": 1.4753938301656515, + "language_loss": 0.7632156, + "learning_rate": 9.28418067828729e-09, + "loss": 0.78409052, + "num_input_tokens_seen": 348253065, + "step": 16137, + "time_per_iteration": 2.75956654548645 + }, + { + "auxiliary_loss_clip": 0.00974393, + "auxiliary_loss_mlp": 0.01003348, + "balance_loss_clip": 1.00709844, + "balance_loss_mlp": 1.00207853, + "epoch": 0.9702690515556892, + "flos": 70651516291200.0, + "grad_norm": 0.9512702373248645, + "language_loss": 0.54920256, + "learning_rate": 9.246735630678015e-09, + "loss": 0.56897998, + "num_input_tokens_seen": 348316075, + "step": 16138, + "time_per_iteration": 3.4436521530151367 + }, + { + "auxiliary_loss_clip": 0.01040649, + "auxiliary_loss_mlp": 0.01026166, + "balance_loss_clip": 1.02351117, + "balance_loss_mlp": 1.01642036, + "epoch": 0.9703291748083571, + "flos": 35881439034240.0, + "grad_norm": 1.8518134777006738, + "language_loss": 0.70545715, + "learning_rate": 9.209366072632007e-09, + "loss": 0.7261253, + "num_input_tokens_seen": 348337605, + "step": 16139, + "time_per_iteration": 2.810272455215454 + }, + { + "auxiliary_loss_clip": 0.01055059, + "auxiliary_loss_mlp": 0.01030847, + "balance_loss_clip": 1.02743137, + "balance_loss_mlp": 1.02004075, + "epoch": 0.9703892980610251, + "flos": 24316479982080.0, + "grad_norm": 1.6018793597512422, + "language_loss": 0.72328234, + "learning_rate": 9.172072005566134e-09, + "loss": 0.7441414, + "num_input_tokens_seen": 348359430, + "step": 16140, + "time_per_iteration": 2.6627724170684814 + }, + { + "auxiliary_loss_clip": 0.01056507, + "auxiliary_loss_mlp": 0.00747628, + "balance_loss_clip": 1.02624774, + "balance_loss_mlp": 1.00041342, + "epoch": 0.970449421313693, + "flos": 18003743418240.0, + "grad_norm": 2.5534209383287667, + "language_loss": 0.68211472, + "learning_rate": 9.13485343089504e-09, + "loss": 0.70015603, + "num_input_tokens_seen": 348377890, + "step": 16141, + "time_per_iteration": 2.604827642440796 + }, + { + "auxiliary_loss_clip": 0.01049686, + "auxiliary_loss_mlp": 0.01027425, + "balance_loss_clip": 1.02423537, + "balance_loss_mlp": 1.01717329, + "epoch": 0.9705095445663611, + "flos": 25337994865920.0, + "grad_norm": 1.9548378744082082, + "language_loss": 0.6858899, + "learning_rate": 9.097710350029597e-09, + "loss": 0.70666099, + "num_input_tokens_seen": 348396550, + "step": 16142, + "time_per_iteration": 2.6640682220458984 + }, + { + "auxiliary_loss_clip": 0.01013511, + "auxiliary_loss_mlp": 0.01027312, + "balance_loss_clip": 1.02317667, + "balance_loss_mlp": 1.01658881, + "epoch": 0.970569667819029, + "flos": 26833602384000.0, + "grad_norm": 1.8758499914207927, + "language_loss": 0.56136221, + "learning_rate": 9.060642764378457e-09, + "loss": 0.58177042, + "num_input_tokens_seen": 348417120, + "step": 16143, + "time_per_iteration": 2.9450981616973877 + }, + { + "auxiliary_loss_clip": 0.01051811, + "auxiliary_loss_mlp": 0.01026514, + "balance_loss_clip": 1.02482224, + "balance_loss_mlp": 1.0167985, + "epoch": 0.970629791071697, + "flos": 25848644567040.0, + "grad_norm": 2.5030341253561614, + "language_loss": 0.68056953, + "learning_rate": 9.023650675347382e-09, + "loss": 0.70135278, + "num_input_tokens_seen": 348437750, + "step": 16144, + "time_per_iteration": 2.8909852504730225 + }, + { + "auxiliary_loss_clip": 0.01052597, + "auxiliary_loss_mlp": 0.01034383, + "balance_loss_clip": 1.02594161, + "balance_loss_mlp": 1.02416086, + "epoch": 0.9706899143243649, + "flos": 36540184510080.0, + "grad_norm": 2.8388651944411945, + "language_loss": 0.7202217, + "learning_rate": 8.986734084339253e-09, + "loss": 0.74109155, + "num_input_tokens_seen": 348460935, + "step": 16145, + "time_per_iteration": 2.7492258548736572 + }, + { + "auxiliary_loss_clip": 0.01042457, + "auxiliary_loss_mlp": 0.01026675, + "balance_loss_clip": 1.02383983, + "balance_loss_mlp": 1.01559472, + "epoch": 0.9707500375770329, + "flos": 12268234414080.0, + "grad_norm": 2.60433436812975, + "language_loss": 0.79216862, + "learning_rate": 8.949892992753395e-09, + "loss": 0.81285989, + "num_input_tokens_seen": 348474480, + "step": 16146, + "time_per_iteration": 2.641674280166626 + }, + { + "auxiliary_loss_clip": 0.00979706, + "auxiliary_loss_mlp": 0.01000888, + "balance_loss_clip": 1.00453258, + "balance_loss_mlp": 1.00004184, + "epoch": 0.9708101608297008, + "flos": 60853040196480.0, + "grad_norm": 0.7651055672648785, + "language_loss": 0.54611689, + "learning_rate": 8.91312740198713e-09, + "loss": 0.5659228, + "num_input_tokens_seen": 348541220, + "step": 16147, + "time_per_iteration": 3.3446290493011475 + }, + { + "auxiliary_loss_clip": 0.01022077, + "auxiliary_loss_mlp": 0.00747737, + "balance_loss_clip": 1.0213927, + "balance_loss_mlp": 1.00044549, + "epoch": 0.9708702840823689, + "flos": 27124766029440.0, + "grad_norm": 5.3130002949657165, + "language_loss": 0.61033052, + "learning_rate": 8.876437313434682e-09, + "loss": 0.62802863, + "num_input_tokens_seen": 348559230, + "step": 16148, + "time_per_iteration": 2.8709728717803955 + }, + { + "auxiliary_loss_clip": 0.01032284, + "auxiliary_loss_mlp": 0.01035384, + "balance_loss_clip": 1.02506816, + "balance_loss_mlp": 1.02494764, + "epoch": 0.9709304073350368, + "flos": 20777699041920.0, + "grad_norm": 1.7444911676538013, + "language_loss": 0.73480964, + "learning_rate": 8.839822728487155e-09, + "loss": 0.75548631, + "num_input_tokens_seen": 348577850, + "step": 16149, + "time_per_iteration": 2.9465115070343018 + }, + { + "auxiliary_loss_clip": 0.01050785, + "auxiliary_loss_mlp": 0.01032919, + "balance_loss_clip": 1.02329552, + "balance_loss_mlp": 1.02256513, + "epoch": 0.9709905305877048, + "flos": 41934541115520.0, + "grad_norm": 2.1851738196198838, + "language_loss": 0.75219393, + "learning_rate": 8.803283648533222e-09, + "loss": 0.773031, + "num_input_tokens_seen": 348598345, + "step": 16150, + "time_per_iteration": 3.0586798191070557 + }, + { + "auxiliary_loss_clip": 0.01048851, + "auxiliary_loss_mlp": 0.01029337, + "balance_loss_clip": 1.02758849, + "balance_loss_mlp": 1.01674271, + "epoch": 0.9710506538403728, + "flos": 17165588486400.0, + "grad_norm": 2.377823277389918, + "language_loss": 0.73567474, + "learning_rate": 8.766820074958214e-09, + "loss": 0.75645661, + "num_input_tokens_seen": 348616300, + "step": 16151, + "time_per_iteration": 2.901980400085449 + }, + { + "auxiliary_loss_clip": 0.01051941, + "auxiliary_loss_mlp": 0.01024834, + "balance_loss_clip": 1.02562118, + "balance_loss_mlp": 1.01452255, + "epoch": 0.9711107770930407, + "flos": 21173470070400.0, + "grad_norm": 1.986046622986649, + "language_loss": 0.74603194, + "learning_rate": 8.730432009145027e-09, + "loss": 0.76679969, + "num_input_tokens_seen": 348633845, + "step": 16152, + "time_per_iteration": 2.893265724182129 + }, + { + "auxiliary_loss_clip": 0.01034142, + "auxiliary_loss_mlp": 0.01029885, + "balance_loss_clip": 1.02646339, + "balance_loss_mlp": 1.01951349, + "epoch": 0.9711709003457087, + "flos": 22237072715520.0, + "grad_norm": 2.070554113791124, + "language_loss": 0.67340422, + "learning_rate": 8.694119452473448e-09, + "loss": 0.69404447, + "num_input_tokens_seen": 348653070, + "step": 16153, + "time_per_iteration": 4.7530577182769775 + }, + { + "auxiliary_loss_clip": 0.01004016, + "auxiliary_loss_mlp": 0.01025434, + "balance_loss_clip": 1.02396059, + "balance_loss_mlp": 1.01602209, + "epoch": 0.9712310235983767, + "flos": 26213856099840.0, + "grad_norm": 1.9500299894302946, + "language_loss": 0.70867687, + "learning_rate": 8.65788240632037e-09, + "loss": 0.72897136, + "num_input_tokens_seen": 348672145, + "step": 16154, + "time_per_iteration": 2.9585235118865967 + }, + { + "auxiliary_loss_clip": 0.01020204, + "auxiliary_loss_mlp": 0.01030778, + "balance_loss_clip": 1.0293026, + "balance_loss_mlp": 1.01879144, + "epoch": 0.9712911468510447, + "flos": 20668171495680.0, + "grad_norm": 1.8929745341762478, + "language_loss": 0.80709863, + "learning_rate": 8.621720872059812e-09, + "loss": 0.82760847, + "num_input_tokens_seen": 348690615, + "step": 16155, + "time_per_iteration": 2.839778423309326 + }, + { + "auxiliary_loss_clip": 0.0105571, + "auxiliary_loss_mlp": 0.00747764, + "balance_loss_clip": 1.02614748, + "balance_loss_mlp": 1.00036895, + "epoch": 0.9713512701037126, + "flos": 13552903313280.0, + "grad_norm": 1.936612721891097, + "language_loss": 0.67548442, + "learning_rate": 8.58563485106334e-09, + "loss": 0.69351912, + "num_input_tokens_seen": 348708665, + "step": 16156, + "time_per_iteration": 2.691438674926758 + }, + { + "auxiliary_loss_clip": 0.0105206, + "auxiliary_loss_mlp": 0.01030021, + "balance_loss_clip": 1.02373838, + "balance_loss_mlp": 1.01965606, + "epoch": 0.9714113933563806, + "flos": 25848752307840.0, + "grad_norm": 2.973155233188768, + "language_loss": 0.91483229, + "learning_rate": 8.54962434469919e-09, + "loss": 0.93565315, + "num_input_tokens_seen": 348726105, + "step": 16157, + "time_per_iteration": 2.724496841430664 + }, + { + "auxiliary_loss_clip": 0.01024624, + "auxiliary_loss_mlp": 0.0074761, + "balance_loss_clip": 1.02451336, + "balance_loss_mlp": 1.00044894, + "epoch": 0.9714715166090485, + "flos": 12743081233920.0, + "grad_norm": 2.098091338962076, + "language_loss": 0.72433484, + "learning_rate": 8.513689354332721e-09, + "loss": 0.74205714, + "num_input_tokens_seen": 348743360, + "step": 16158, + "time_per_iteration": 2.750577449798584 + }, + { + "auxiliary_loss_clip": 0.01014313, + "auxiliary_loss_mlp": 0.01034357, + "balance_loss_clip": 1.02348065, + "balance_loss_mlp": 1.02371192, + "epoch": 0.9715316398617165, + "flos": 18405547931520.0, + "grad_norm": 2.1748463197366545, + "language_loss": 0.59852082, + "learning_rate": 8.477829881326836e-09, + "loss": 0.61900753, + "num_input_tokens_seen": 348759045, + "step": 16159, + "time_per_iteration": 2.709702491760254 + }, + { + "auxiliary_loss_clip": 0.01056831, + "auxiliary_loss_mlp": 0.0102574, + "balance_loss_clip": 1.02340531, + "balance_loss_mlp": 1.01673388, + "epoch": 0.9715917631143844, + "flos": 28913799749760.0, + "grad_norm": 1.635311469966319, + "language_loss": 0.78534377, + "learning_rate": 8.44204592704112e-09, + "loss": 0.80616939, + "num_input_tokens_seen": 348779910, + "step": 16160, + "time_per_iteration": 2.6871180534362793 + }, + { + "auxiliary_loss_clip": 0.01005942, + "auxiliary_loss_mlp": 0.01000387, + "balance_loss_clip": 1.00092173, + "balance_loss_mlp": 0.99960023, + "epoch": 0.9716518863670525, + "flos": 65939712900480.0, + "grad_norm": 0.7674786393294567, + "language_loss": 0.54308194, + "learning_rate": 8.406337492832704e-09, + "loss": 0.56314522, + "num_input_tokens_seen": 348838995, + "step": 16161, + "time_per_iteration": 3.259709596633911 + }, + { + "auxiliary_loss_clip": 0.01049727, + "auxiliary_loss_mlp": 0.00747364, + "balance_loss_clip": 1.02517939, + "balance_loss_mlp": 1.00038421, + "epoch": 0.9717120096197204, + "flos": 17712759340800.0, + "grad_norm": 1.8976995270526078, + "language_loss": 0.71632582, + "learning_rate": 8.3707045800554e-09, + "loss": 0.7342968, + "num_input_tokens_seen": 348858090, + "step": 16162, + "time_per_iteration": 2.7136778831481934 + }, + { + "auxiliary_loss_clip": 0.01025871, + "auxiliary_loss_mlp": 0.01026443, + "balance_loss_clip": 1.02163184, + "balance_loss_mlp": 1.01588702, + "epoch": 0.9717721328723884, + "flos": 24463426521600.0, + "grad_norm": 1.7297173702424242, + "language_loss": 0.78688347, + "learning_rate": 8.335147190060787e-09, + "loss": 0.80740666, + "num_input_tokens_seen": 348877885, + "step": 16163, + "time_per_iteration": 2.7044589519500732 + }, + { + "auxiliary_loss_clip": 0.0104268, + "auxiliary_loss_mlp": 0.01026345, + "balance_loss_clip": 1.02567828, + "balance_loss_mlp": 1.01664138, + "epoch": 0.9718322561250564, + "flos": 20776477979520.0, + "grad_norm": 1.827601897463615, + "language_loss": 0.72955883, + "learning_rate": 8.299665324196903e-09, + "loss": 0.75024903, + "num_input_tokens_seen": 348897720, + "step": 16164, + "time_per_iteration": 2.6525540351867676 + }, + { + "auxiliary_loss_clip": 0.00998344, + "auxiliary_loss_mlp": 0.01035835, + "balance_loss_clip": 1.02006769, + "balance_loss_mlp": 1.02336538, + "epoch": 0.9718923793777243, + "flos": 19025904746880.0, + "grad_norm": 2.6174920926347567, + "language_loss": 0.83748466, + "learning_rate": 8.264258983809114e-09, + "loss": 0.85782647, + "num_input_tokens_seen": 348915410, + "step": 16165, + "time_per_iteration": 2.9014785289764404 + }, + { + "auxiliary_loss_clip": 0.01025601, + "auxiliary_loss_mlp": 0.01025484, + "balance_loss_clip": 1.0235343, + "balance_loss_mlp": 1.0164535, + "epoch": 0.9719525026303923, + "flos": 21871717528320.0, + "grad_norm": 1.511458913663408, + "language_loss": 0.79014713, + "learning_rate": 8.228928170240345e-09, + "loss": 0.81065798, + "num_input_tokens_seen": 348934335, + "step": 16166, + "time_per_iteration": 2.6815197467803955 + }, + { + "auxiliary_loss_clip": 0.0104524, + "auxiliary_loss_mlp": 0.01025962, + "balance_loss_clip": 1.02695334, + "balance_loss_mlp": 1.01549578, + "epoch": 0.9720126258830603, + "flos": 14429303251200.0, + "grad_norm": 1.7534029600691914, + "language_loss": 0.70716584, + "learning_rate": 8.193672884830195e-09, + "loss": 0.72787786, + "num_input_tokens_seen": 348952405, + "step": 16167, + "time_per_iteration": 2.6034724712371826 + }, + { + "auxiliary_loss_clip": 0.0104333, + "auxiliary_loss_mlp": 0.01028553, + "balance_loss_clip": 1.0282774, + "balance_loss_mlp": 1.01868248, + "epoch": 0.9720727491357283, + "flos": 26251167352320.0, + "grad_norm": 1.8476545633352779, + "language_loss": 0.75524831, + "learning_rate": 8.158493128915812e-09, + "loss": 0.77596718, + "num_input_tokens_seen": 348973580, + "step": 16168, + "time_per_iteration": 2.7463157176971436 + }, + { + "auxiliary_loss_clip": 0.01007003, + "auxiliary_loss_mlp": 0.0104026, + "balance_loss_clip": 1.02066076, + "balance_loss_mlp": 1.02730227, + "epoch": 0.9721328723883962, + "flos": 22674105492480.0, + "grad_norm": 4.264135874687857, + "language_loss": 0.72799176, + "learning_rate": 8.123388903830797e-09, + "loss": 0.74846441, + "num_input_tokens_seen": 348992035, + "step": 16169, + "time_per_iteration": 2.7371184825897217 + }, + { + "auxiliary_loss_clip": 0.0102614, + "auxiliary_loss_mlp": 0.010324, + "balance_loss_clip": 1.02136171, + "balance_loss_mlp": 1.02082443, + "epoch": 0.9721929956410642, + "flos": 28074172360320.0, + "grad_norm": 2.3930040459223396, + "language_loss": 0.57935333, + "learning_rate": 8.088360210906309e-09, + "loss": 0.59993875, + "num_input_tokens_seen": 349013160, + "step": 16170, + "time_per_iteration": 2.7873377799987793 + }, + { + "auxiliary_loss_clip": 0.01031256, + "auxiliary_loss_mlp": 0.01027596, + "balance_loss_clip": 1.02396464, + "balance_loss_mlp": 1.01679528, + "epoch": 0.9722531188937321, + "flos": 20996251344000.0, + "grad_norm": 1.847891856228623, + "language_loss": 0.71917963, + "learning_rate": 8.053407051471062e-09, + "loss": 0.73976815, + "num_input_tokens_seen": 349033485, + "step": 16171, + "time_per_iteration": 4.421235799789429 + }, + { + "auxiliary_loss_clip": 0.01024376, + "auxiliary_loss_mlp": 0.01029039, + "balance_loss_clip": 1.02216578, + "balance_loss_mlp": 1.01857209, + "epoch": 0.9723132421464001, + "flos": 16070600332800.0, + "grad_norm": 1.6403459923770285, + "language_loss": 0.68158823, + "learning_rate": 8.018529426850218e-09, + "loss": 0.70212239, + "num_input_tokens_seen": 349051705, + "step": 16172, + "time_per_iteration": 2.7222065925598145 + }, + { + "auxiliary_loss_clip": 0.01043778, + "auxiliary_loss_mlp": 0.0102579, + "balance_loss_clip": 1.02231944, + "balance_loss_mlp": 1.01551962, + "epoch": 0.972373365399068, + "flos": 27745769289600.0, + "grad_norm": 2.071181892459489, + "language_loss": 0.86135399, + "learning_rate": 7.983727338366274e-09, + "loss": 0.88204968, + "num_input_tokens_seen": 349070825, + "step": 16173, + "time_per_iteration": 2.6335651874542236 + }, + { + "auxiliary_loss_clip": 0.01008072, + "auxiliary_loss_mlp": 0.01033528, + "balance_loss_clip": 1.02066278, + "balance_loss_mlp": 1.02086794, + "epoch": 0.9724334886517361, + "flos": 23002939526400.0, + "grad_norm": 1.8787614823023455, + "language_loss": 0.6428982, + "learning_rate": 7.949000787339289e-09, + "loss": 0.66331422, + "num_input_tokens_seen": 349089730, + "step": 16174, + "time_per_iteration": 2.7427868843078613 + }, + { + "auxiliary_loss_clip": 0.01048541, + "auxiliary_loss_mlp": 0.01024009, + "balance_loss_clip": 1.02381015, + "balance_loss_mlp": 1.01419854, + "epoch": 0.972493611904404, + "flos": 25447055535360.0, + "grad_norm": 1.5267384943215645, + "language_loss": 0.77997434, + "learning_rate": 7.914349775085538e-09, + "loss": 0.80069983, + "num_input_tokens_seen": 349111315, + "step": 16175, + "time_per_iteration": 2.7264697551727295 + }, + { + "auxiliary_loss_clip": 0.0105137, + "auxiliary_loss_mlp": 0.01029011, + "balance_loss_clip": 1.02364254, + "balance_loss_mlp": 1.01766181, + "epoch": 0.972553735157072, + "flos": 16983054547200.0, + "grad_norm": 2.1904439493743033, + "language_loss": 0.56714946, + "learning_rate": 7.879774302919307e-09, + "loss": 0.58795327, + "num_input_tokens_seen": 349129495, + "step": 16176, + "time_per_iteration": 2.602010488510132 + }, + { + "auxiliary_loss_clip": 0.01041793, + "auxiliary_loss_mlp": 0.01026767, + "balance_loss_clip": 1.02537584, + "balance_loss_mlp": 1.01739752, + "epoch": 0.97261385840974, + "flos": 26104651776000.0, + "grad_norm": 2.0983098689037947, + "language_loss": 0.72843087, + "learning_rate": 7.845274372151545e-09, + "loss": 0.74911654, + "num_input_tokens_seen": 349148850, + "step": 16177, + "time_per_iteration": 2.7604846954345703 + }, + { + "auxiliary_loss_clip": 0.01042632, + "auxiliary_loss_mlp": 0.01027752, + "balance_loss_clip": 1.0242523, + "balance_loss_mlp": 1.01747644, + "epoch": 0.9726739816624079, + "flos": 25447881548160.0, + "grad_norm": 1.942519704159412, + "language_loss": 0.68689406, + "learning_rate": 7.810849984090984e-09, + "loss": 0.70759785, + "num_input_tokens_seen": 349167620, + "step": 16178, + "time_per_iteration": 2.8397228717803955 + }, + { + "auxiliary_loss_clip": 0.01006105, + "auxiliary_loss_mlp": 0.01029109, + "balance_loss_clip": 1.02047527, + "balance_loss_mlp": 1.01843953, + "epoch": 0.972734104915076, + "flos": 29014923513600.0, + "grad_norm": 10.310069587699509, + "language_loss": 0.67414939, + "learning_rate": 7.776501140042358e-09, + "loss": 0.69450152, + "num_input_tokens_seen": 349185845, + "step": 16179, + "time_per_iteration": 4.437688589096069 + }, + { + "auxiliary_loss_clip": 0.01040616, + "auxiliary_loss_mlp": 0.00747569, + "balance_loss_clip": 1.02574158, + "balance_loss_mlp": 1.00035667, + "epoch": 0.9727942281677439, + "flos": 23437637919360.0, + "grad_norm": 1.7526113014327669, + "language_loss": 0.77045417, + "learning_rate": 7.742227841308624e-09, + "loss": 0.78833598, + "num_input_tokens_seen": 349204525, + "step": 16180, + "time_per_iteration": 2.764340877532959 + }, + { + "auxiliary_loss_clip": 0.01053979, + "auxiliary_loss_mlp": 0.01028916, + "balance_loss_clip": 1.02472246, + "balance_loss_mlp": 1.01810968, + "epoch": 0.9728543514204119, + "flos": 31724599749120.0, + "grad_norm": 1.5311831581321054, + "language_loss": 0.76665485, + "learning_rate": 7.708030089189188e-09, + "loss": 0.78748381, + "num_input_tokens_seen": 349228075, + "step": 16181, + "time_per_iteration": 4.3092567920684814 + }, + { + "auxiliary_loss_clip": 0.01060657, + "auxiliary_loss_mlp": 0.01031344, + "balance_loss_clip": 1.02429569, + "balance_loss_mlp": 1.02134871, + "epoch": 0.9729144746730798, + "flos": 16289368116480.0, + "grad_norm": 1.4317823873617985, + "language_loss": 0.63327181, + "learning_rate": 7.67390788498079e-09, + "loss": 0.65419173, + "num_input_tokens_seen": 349246990, + "step": 16182, + "time_per_iteration": 2.6022021770477295 + }, + { + "auxiliary_loss_clip": 0.00997059, + "auxiliary_loss_mlp": 0.01036884, + "balance_loss_clip": 1.03095186, + "balance_loss_mlp": 1.02520776, + "epoch": 0.9729745979257478, + "flos": 25041408266880.0, + "grad_norm": 1.8795166970602675, + "language_loss": 0.62218767, + "learning_rate": 7.639861229977507e-09, + "loss": 0.6425271, + "num_input_tokens_seen": 349265890, + "step": 16183, + "time_per_iteration": 3.196559429168701 + }, + { + "auxiliary_loss_clip": 0.01040008, + "auxiliary_loss_mlp": 0.01029884, + "balance_loss_clip": 1.02423286, + "balance_loss_mlp": 1.01920271, + "epoch": 0.9730347211784157, + "flos": 22638733574400.0, + "grad_norm": 1.8801805840173553, + "language_loss": 0.78260922, + "learning_rate": 7.605890125470527e-09, + "loss": 0.80330807, + "num_input_tokens_seen": 349285275, + "step": 16184, + "time_per_iteration": 3.016228675842285 + }, + { + "auxiliary_loss_clip": 0.01017224, + "auxiliary_loss_mlp": 0.01026774, + "balance_loss_clip": 1.01827371, + "balance_loss_mlp": 1.01646197, + "epoch": 0.9730948444310837, + "flos": 10998613313280.0, + "grad_norm": 2.2194177697977473, + "language_loss": 0.79704547, + "learning_rate": 7.571994572747709e-09, + "loss": 0.81748545, + "num_input_tokens_seen": 349301515, + "step": 16185, + "time_per_iteration": 2.663299798965454 + }, + { + "auxiliary_loss_clip": 0.01028777, + "auxiliary_loss_mlp": 0.01026412, + "balance_loss_clip": 1.0223757, + "balance_loss_mlp": 1.0165894, + "epoch": 0.9731549676837516, + "flos": 16799479113600.0, + "grad_norm": 1.9522574268201673, + "language_loss": 0.77930653, + "learning_rate": 7.538174573094469e-09, + "loss": 0.79985845, + "num_input_tokens_seen": 349319590, + "step": 16186, + "time_per_iteration": 2.781857967376709 + }, + { + "auxiliary_loss_clip": 0.010421, + "auxiliary_loss_mlp": 0.01025227, + "balance_loss_clip": 1.02549934, + "balance_loss_mlp": 1.01489747, + "epoch": 0.9732150909364197, + "flos": 21141761339520.0, + "grad_norm": 1.698462323576636, + "language_loss": 0.65459526, + "learning_rate": 7.504430127793337e-09, + "loss": 0.67526853, + "num_input_tokens_seen": 349339230, + "step": 16187, + "time_per_iteration": 2.7537460327148438 + }, + { + "auxiliary_loss_clip": 0.01032354, + "auxiliary_loss_mlp": 0.0103134, + "balance_loss_clip": 1.02100217, + "balance_loss_mlp": 1.02034903, + "epoch": 0.9732752141890876, + "flos": 33727337435520.0, + "grad_norm": 1.5858656205065256, + "language_loss": 0.80245435, + "learning_rate": 7.47076123812418e-09, + "loss": 0.82309133, + "num_input_tokens_seen": 349361155, + "step": 16188, + "time_per_iteration": 2.8899481296539307 + }, + { + "auxiliary_loss_clip": 0.01024926, + "auxiliary_loss_mlp": 0.01023953, + "balance_loss_clip": 1.02195358, + "balance_loss_mlp": 1.01517892, + "epoch": 0.9733353374417556, + "flos": 23404384903680.0, + "grad_norm": 1.7913242233850102, + "language_loss": 0.78303909, + "learning_rate": 7.437167905363084e-09, + "loss": 0.80352789, + "num_input_tokens_seen": 349379335, + "step": 16189, + "time_per_iteration": 2.820425271987915 + }, + { + "auxiliary_loss_clip": 0.0104284, + "auxiliary_loss_mlp": 0.01026938, + "balance_loss_clip": 1.02197826, + "balance_loss_mlp": 1.01638246, + "epoch": 0.9733954606944236, + "flos": 39165792963840.0, + "grad_norm": 1.708233834750507, + "language_loss": 0.51061833, + "learning_rate": 7.403650130784367e-09, + "loss": 0.53131604, + "num_input_tokens_seen": 349401575, + "step": 16190, + "time_per_iteration": 2.7550506591796875 + }, + { + "auxiliary_loss_clip": 0.01052094, + "auxiliary_loss_mlp": 0.01027922, + "balance_loss_clip": 1.02516055, + "balance_loss_mlp": 1.01759815, + "epoch": 0.9734555839470915, + "flos": 21981819692160.0, + "grad_norm": 1.6075147932363705, + "language_loss": 0.80989784, + "learning_rate": 7.3702079156590105e-09, + "loss": 0.83069795, + "num_input_tokens_seen": 349420650, + "step": 16191, + "time_per_iteration": 2.6260032653808594 + }, + { + "auxiliary_loss_clip": 0.01043235, + "auxiliary_loss_mlp": 0.01027416, + "balance_loss_clip": 1.02152503, + "balance_loss_mlp": 1.01785481, + "epoch": 0.9735157071997596, + "flos": 16575539771520.0, + "grad_norm": 1.6587318263849888, + "language_loss": 0.82621324, + "learning_rate": 7.336841261255111e-09, + "loss": 0.84691978, + "num_input_tokens_seen": 349436830, + "step": 16192, + "time_per_iteration": 2.6909523010253906 + }, + { + "auxiliary_loss_clip": 0.0100903, + "auxiliary_loss_mlp": 0.01032649, + "balance_loss_clip": 1.02719176, + "balance_loss_mlp": 1.02148509, + "epoch": 0.9735758304524275, + "flos": 20223237726720.0, + "grad_norm": 2.619235616892809, + "language_loss": 0.74788851, + "learning_rate": 7.303550168837658e-09, + "loss": 0.7683053, + "num_input_tokens_seen": 349454325, + "step": 16193, + "time_per_iteration": 2.7860379219055176 + }, + { + "auxiliary_loss_clip": 0.01039521, + "auxiliary_loss_mlp": 0.01025986, + "balance_loss_clip": 1.02405, + "balance_loss_mlp": 1.01678872, + "epoch": 0.9736359537050955, + "flos": 23653353047040.0, + "grad_norm": 1.6948199679136076, + "language_loss": 0.85232985, + "learning_rate": 7.270334639669417e-09, + "loss": 0.87298489, + "num_input_tokens_seen": 349470230, + "step": 16194, + "time_per_iteration": 2.785231590270996 + }, + { + "auxiliary_loss_clip": 0.01023386, + "auxiliary_loss_mlp": 0.01029535, + "balance_loss_clip": 1.02237892, + "balance_loss_mlp": 1.0193727, + "epoch": 0.9736960769577634, + "flos": 15560202026880.0, + "grad_norm": 2.0795720567039977, + "language_loss": 0.75985825, + "learning_rate": 7.237194675009828e-09, + "loss": 0.78038752, + "num_input_tokens_seen": 349486250, + "step": 16195, + "time_per_iteration": 2.7881057262420654 + }, + { + "auxiliary_loss_clip": 0.00982791, + "auxiliary_loss_mlp": 0.01003033, + "balance_loss_clip": 1.00648642, + "balance_loss_mlp": 1.00206113, + "epoch": 0.9737562002104314, + "flos": 65351783088000.0, + "grad_norm": 0.7070821333028684, + "language_loss": 0.52477837, + "learning_rate": 7.204130276115439e-09, + "loss": 0.54463661, + "num_input_tokens_seen": 349545865, + "step": 16196, + "time_per_iteration": 3.2177791595458984 + }, + { + "auxiliary_loss_clip": 0.01041257, + "auxiliary_loss_mlp": 0.01027865, + "balance_loss_clip": 1.02523685, + "balance_loss_mlp": 1.01788759, + "epoch": 0.9738163234630993, + "flos": 27196730928000.0, + "grad_norm": 1.52211717748429, + "language_loss": 0.76173151, + "learning_rate": 7.171141444240136e-09, + "loss": 0.78242272, + "num_input_tokens_seen": 349566080, + "step": 16197, + "time_per_iteration": 2.706847667694092 + }, + { + "auxiliary_loss_clip": 0.01063722, + "auxiliary_loss_mlp": 0.01023084, + "balance_loss_clip": 1.02476048, + "balance_loss_mlp": 1.01297498, + "epoch": 0.9738764467157673, + "flos": 21069365477760.0, + "grad_norm": 1.7864060488804478, + "language_loss": 0.6777963, + "learning_rate": 7.13822818063492e-09, + "loss": 0.69866437, + "num_input_tokens_seen": 349585665, + "step": 16198, + "time_per_iteration": 2.620354413986206 + }, + { + "auxiliary_loss_clip": 0.01061026, + "auxiliary_loss_mlp": 0.01026623, + "balance_loss_clip": 1.0237391, + "balance_loss_mlp": 1.0161382, + "epoch": 0.9739365699684353, + "flos": 21361211481600.0, + "grad_norm": 1.8620879331021647, + "language_loss": 0.7806437, + "learning_rate": 7.10539048654768e-09, + "loss": 0.80152023, + "num_input_tokens_seen": 349605125, + "step": 16199, + "time_per_iteration": 2.598949432373047 + }, + { + "auxiliary_loss_clip": 0.01044541, + "auxiliary_loss_mlp": 0.01031343, + "balance_loss_clip": 1.02656078, + "balance_loss_mlp": 1.02051234, + "epoch": 0.9739966932211033, + "flos": 21902061542400.0, + "grad_norm": 1.8009878109531292, + "language_loss": 0.79412496, + "learning_rate": 7.072628363223865e-09, + "loss": 0.81488371, + "num_input_tokens_seen": 349623360, + "step": 16200, + "time_per_iteration": 4.396477699279785 + }, + { + "auxiliary_loss_clip": 0.01018506, + "auxiliary_loss_mlp": 0.01033989, + "balance_loss_clip": 1.02745557, + "balance_loss_mlp": 1.02296185, + "epoch": 0.9740568164737712, + "flos": 24827345164800.0, + "grad_norm": 2.0228799507183832, + "language_loss": 0.68787867, + "learning_rate": 7.039941811905592e-09, + "loss": 0.70840359, + "num_input_tokens_seen": 349644390, + "step": 16201, + "time_per_iteration": 2.9177658557891846 + }, + { + "auxiliary_loss_clip": 0.01031801, + "auxiliary_loss_mlp": 0.01026604, + "balance_loss_clip": 1.02332163, + "balance_loss_mlp": 1.01632845, + "epoch": 0.9741169397264392, + "flos": 23623583650560.0, + "grad_norm": 1.3544889286394688, + "language_loss": 0.72755861, + "learning_rate": 7.0073308338325364e-09, + "loss": 0.74814272, + "num_input_tokens_seen": 349663200, + "step": 16202, + "time_per_iteration": 3.16424298286438 + }, + { + "auxiliary_loss_clip": 0.01043431, + "auxiliary_loss_mlp": 0.01028377, + "balance_loss_clip": 1.02549922, + "balance_loss_mlp": 1.01779079, + "epoch": 0.9741770629791072, + "flos": 18841144164480.0, + "grad_norm": 2.6363405513333955, + "language_loss": 0.72967446, + "learning_rate": 6.974795430241265e-09, + "loss": 0.75039256, + "num_input_tokens_seen": 349681975, + "step": 16203, + "time_per_iteration": 2.6760454177856445 + }, + { + "auxiliary_loss_clip": 0.01061948, + "auxiliary_loss_mlp": 0.01029177, + "balance_loss_clip": 1.02524424, + "balance_loss_mlp": 1.01904464, + "epoch": 0.9742371862317751, + "flos": 22346241125760.0, + "grad_norm": 1.7103116346049583, + "language_loss": 0.77469611, + "learning_rate": 6.942335602365235e-09, + "loss": 0.79560745, + "num_input_tokens_seen": 349701185, + "step": 16204, + "time_per_iteration": 2.6104230880737305 + }, + { + "auxiliary_loss_clip": 0.01045036, + "auxiliary_loss_mlp": 0.01031643, + "balance_loss_clip": 1.02600694, + "balance_loss_mlp": 1.0210278, + "epoch": 0.9742973094844432, + "flos": 21762764599680.0, + "grad_norm": 1.7192497066199073, + "language_loss": 0.79412627, + "learning_rate": 6.909951351435905e-09, + "loss": 0.81489301, + "num_input_tokens_seen": 349720360, + "step": 16205, + "time_per_iteration": 2.6865100860595703 + }, + { + "auxiliary_loss_clip": 0.01061688, + "auxiliary_loss_mlp": 0.01029522, + "balance_loss_clip": 1.02574909, + "balance_loss_mlp": 1.01935959, + "epoch": 0.9743574327371111, + "flos": 26248725227520.0, + "grad_norm": 1.4786808606403126, + "language_loss": 0.74565262, + "learning_rate": 6.87764267868074e-09, + "loss": 0.76656473, + "num_input_tokens_seen": 349741040, + "step": 16206, + "time_per_iteration": 2.596356153488159 + }, + { + "auxiliary_loss_clip": 0.0101195, + "auxiliary_loss_mlp": 0.01030321, + "balance_loss_clip": 1.02597368, + "balance_loss_mlp": 1.01952648, + "epoch": 0.9744175559897791, + "flos": 12349321367040.0, + "grad_norm": 2.2354963873394014, + "language_loss": 0.83985746, + "learning_rate": 6.8454095853252015e-09, + "loss": 0.86028016, + "num_input_tokens_seen": 349758895, + "step": 16207, + "time_per_iteration": 2.8791322708129883 + }, + { + "auxiliary_loss_clip": 0.01050761, + "auxiliary_loss_mlp": 0.01028364, + "balance_loss_clip": 1.02433217, + "balance_loss_mlp": 1.01873767, + "epoch": 0.974477679242447, + "flos": 28397834835840.0, + "grad_norm": 2.0829411983997357, + "language_loss": 0.70962292, + "learning_rate": 6.813252072591425e-09, + "loss": 0.73041415, + "num_input_tokens_seen": 349779740, + "step": 16208, + "time_per_iteration": 2.6908421516418457 + }, + { + "auxiliary_loss_clip": 0.01023302, + "auxiliary_loss_mlp": 0.01024472, + "balance_loss_clip": 1.02156329, + "balance_loss_mlp": 1.01582944, + "epoch": 0.974537802495115, + "flos": 17785370684160.0, + "grad_norm": 1.7418999943290014, + "language_loss": 0.77444249, + "learning_rate": 6.781170141698878e-09, + "loss": 0.79492021, + "num_input_tokens_seen": 349796820, + "step": 16209, + "time_per_iteration": 2.7832818031311035 + }, + { + "auxiliary_loss_clip": 0.0102514, + "auxiliary_loss_mlp": 0.00747655, + "balance_loss_clip": 1.02183795, + "balance_loss_mlp": 1.00042069, + "epoch": 0.9745979257477829, + "flos": 23842315520640.0, + "grad_norm": 1.7907267783024694, + "language_loss": 0.79182184, + "learning_rate": 6.749163793864144e-09, + "loss": 0.80954981, + "num_input_tokens_seen": 349816550, + "step": 16210, + "time_per_iteration": 2.8190925121307373 + }, + { + "auxiliary_loss_clip": 0.01033072, + "auxiliary_loss_mlp": 0.01029924, + "balance_loss_clip": 1.02191973, + "balance_loss_mlp": 1.01930797, + "epoch": 0.9746580490004509, + "flos": 27016172236800.0, + "grad_norm": 1.9684791810581603, + "language_loss": 0.78306639, + "learning_rate": 6.7172330303009176e-09, + "loss": 0.80369639, + "num_input_tokens_seen": 349834350, + "step": 16211, + "time_per_iteration": 2.7441353797912598 + }, + { + "auxiliary_loss_clip": 0.01029685, + "auxiliary_loss_mlp": 0.01028833, + "balance_loss_clip": 1.0227046, + "balance_loss_mlp": 1.01771033, + "epoch": 0.9747181722531189, + "flos": 19792022952960.0, + "grad_norm": 2.838631399807202, + "language_loss": 0.77989662, + "learning_rate": 6.685377852219787e-09, + "loss": 0.8004818, + "num_input_tokens_seen": 349853460, + "step": 16212, + "time_per_iteration": 2.7773571014404297 + }, + { + "auxiliary_loss_clip": 0.01032069, + "auxiliary_loss_mlp": 0.01031003, + "balance_loss_clip": 1.02194047, + "balance_loss_mlp": 1.0207448, + "epoch": 0.9747782955057869, + "flos": 31430598929280.0, + "grad_norm": 1.4448482943976317, + "language_loss": 0.80031341, + "learning_rate": 6.653598260829118e-09, + "loss": 0.82094419, + "num_input_tokens_seen": 349874830, + "step": 16213, + "time_per_iteration": 2.7600669860839844 + }, + { + "auxiliary_loss_clip": 0.01020762, + "auxiliary_loss_mlp": 0.01025534, + "balance_loss_clip": 1.02241516, + "balance_loss_mlp": 1.0153296, + "epoch": 0.9748384187584548, + "flos": 15961288268160.0, + "grad_norm": 2.9496055370331384, + "language_loss": 0.66209638, + "learning_rate": 6.6218942573335044e-09, + "loss": 0.68255937, + "num_input_tokens_seen": 349893690, + "step": 16214, + "time_per_iteration": 2.8160412311553955 + }, + { + "auxiliary_loss_clip": 0.01049939, + "auxiliary_loss_mlp": 0.01030839, + "balance_loss_clip": 1.02964807, + "balance_loss_mlp": 1.01949, + "epoch": 0.9748985420111228, + "flos": 20558715776640.0, + "grad_norm": 1.7654085971084381, + "language_loss": 0.74125576, + "learning_rate": 6.5902658429355386e-09, + "loss": 0.76206356, + "num_input_tokens_seen": 349912480, + "step": 16215, + "time_per_iteration": 2.8016884326934814 + }, + { + "auxiliary_loss_clip": 0.01017676, + "auxiliary_loss_mlp": 0.01028406, + "balance_loss_clip": 1.02355075, + "balance_loss_mlp": 1.01834476, + "epoch": 0.9749586652637908, + "flos": 36721605127680.0, + "grad_norm": 1.6839196142130792, + "language_loss": 0.67133558, + "learning_rate": 6.558713018834483e-09, + "loss": 0.69179642, + "num_input_tokens_seen": 349932470, + "step": 16216, + "time_per_iteration": 2.9010934829711914 + }, + { + "auxiliary_loss_clip": 0.01003787, + "auxiliary_loss_mlp": 0.01029315, + "balance_loss_clip": 1.01994717, + "balance_loss_mlp": 1.01824069, + "epoch": 0.9750187885164587, + "flos": 10999223844480.0, + "grad_norm": 1.7319859640774162, + "language_loss": 0.71797764, + "learning_rate": 6.527235786226937e-09, + "loss": 0.73830861, + "num_input_tokens_seen": 349949060, + "step": 16217, + "time_per_iteration": 2.7710177898406982 + }, + { + "auxiliary_loss_clip": 0.01033486, + "auxiliary_loss_mlp": 0.010271, + "balance_loss_clip": 1.02574134, + "balance_loss_mlp": 1.01674092, + "epoch": 0.9750789117691268, + "flos": 25739512070400.0, + "grad_norm": 1.627322277172809, + "language_loss": 0.78358924, + "learning_rate": 6.495834146306167e-09, + "loss": 0.80419505, + "num_input_tokens_seen": 349968010, + "step": 16218, + "time_per_iteration": 4.460792541503906 + }, + { + "auxiliary_loss_clip": 0.01039949, + "auxiliary_loss_mlp": 0.01029013, + "balance_loss_clip": 1.02437735, + "balance_loss_mlp": 1.01843381, + "epoch": 0.9751390350217947, + "flos": 13333955961600.0, + "grad_norm": 2.39194186290498, + "language_loss": 0.77938479, + "learning_rate": 6.464508100263222e-09, + "loss": 0.80007446, + "num_input_tokens_seen": 349985270, + "step": 16219, + "time_per_iteration": 2.6653318405151367 + }, + { + "auxiliary_loss_clip": 0.01044258, + "auxiliary_loss_mlp": 0.0102674, + "balance_loss_clip": 1.02569604, + "balance_loss_mlp": 1.01676226, + "epoch": 0.9751991582744627, + "flos": 22820621068800.0, + "grad_norm": 1.7003666832928945, + "language_loss": 0.80961871, + "learning_rate": 6.433257649285817e-09, + "loss": 0.8303287, + "num_input_tokens_seen": 350003935, + "step": 16220, + "time_per_iteration": 2.69830584526062 + }, + { + "auxiliary_loss_clip": 0.01060014, + "auxiliary_loss_mlp": 0.01027512, + "balance_loss_clip": 1.0234108, + "balance_loss_mlp": 1.01758146, + "epoch": 0.9752592815271306, + "flos": 19646189735040.0, + "grad_norm": 1.8368765638789737, + "language_loss": 0.75432408, + "learning_rate": 6.402082794559227e-09, + "loss": 0.77519935, + "num_input_tokens_seen": 350023595, + "step": 16221, + "time_per_iteration": 2.57995343208313 + }, + { + "auxiliary_loss_clip": 0.01027116, + "auxiliary_loss_mlp": 0.01029244, + "balance_loss_clip": 1.02202725, + "balance_loss_mlp": 1.01941538, + "epoch": 0.9753194047797986, + "flos": 26690462686080.0, + "grad_norm": 1.5714123834868372, + "language_loss": 0.66341352, + "learning_rate": 6.370983537265395e-09, + "loss": 0.68397713, + "num_input_tokens_seen": 350045920, + "step": 16222, + "time_per_iteration": 2.8960859775543213 + }, + { + "auxiliary_loss_clip": 0.0105013, + "auxiliary_loss_mlp": 0.01024943, + "balance_loss_clip": 1.02453375, + "balance_loss_mlp": 1.0151912, + "epoch": 0.9753795280324665, + "flos": 23221779137280.0, + "grad_norm": 1.6822802065105245, + "language_loss": 0.88281274, + "learning_rate": 6.3399598785836004e-09, + "loss": 0.9035635, + "num_input_tokens_seen": 350063925, + "step": 16223, + "time_per_iteration": 2.677725076675415 + }, + { + "auxiliary_loss_clip": 0.01006615, + "auxiliary_loss_mlp": 0.01028807, + "balance_loss_clip": 1.02239692, + "balance_loss_mlp": 1.01894271, + "epoch": 0.9754396512851345, + "flos": 19463835363840.0, + "grad_norm": 1.9282165193353005, + "language_loss": 0.74651414, + "learning_rate": 6.309011819690457e-09, + "loss": 0.76686835, + "num_input_tokens_seen": 350080900, + "step": 16224, + "time_per_iteration": 2.7348520755767822 + }, + { + "auxiliary_loss_clip": 0.00989407, + "auxiliary_loss_mlp": 0.01000493, + "balance_loss_clip": 1.00336027, + "balance_loss_mlp": 0.99953318, + "epoch": 0.9754997745378025, + "flos": 68459313340800.0, + "grad_norm": 0.8104946057971131, + "language_loss": 0.59144962, + "learning_rate": 6.278139361759249e-09, + "loss": 0.61134863, + "num_input_tokens_seen": 350144550, + "step": 16225, + "time_per_iteration": 3.2764084339141846 + }, + { + "auxiliary_loss_clip": 0.01030891, + "auxiliary_loss_mlp": 0.00747484, + "balance_loss_clip": 1.02460682, + "balance_loss_mlp": 1.00037909, + "epoch": 0.9755598977904705, + "flos": 26395168976640.0, + "grad_norm": 1.7083181954501905, + "language_loss": 0.68986404, + "learning_rate": 6.247342505960818e-09, + "loss": 0.7076478, + "num_input_tokens_seen": 350164050, + "step": 16226, + "time_per_iteration": 4.655322074890137 + }, + { + "auxiliary_loss_clip": 0.01046208, + "auxiliary_loss_mlp": 0.010346, + "balance_loss_clip": 1.02395058, + "balance_loss_mlp": 1.02378774, + "epoch": 0.9756200210431384, + "flos": 16617663446400.0, + "grad_norm": 1.5894912507013967, + "language_loss": 0.82864755, + "learning_rate": 6.216621253462894e-09, + "loss": 0.84945565, + "num_input_tokens_seen": 350181350, + "step": 16227, + "time_per_iteration": 2.702413320541382 + }, + { + "auxiliary_loss_clip": 0.01060154, + "auxiliary_loss_mlp": 0.01023215, + "balance_loss_clip": 1.02425337, + "balance_loss_mlp": 1.01342797, + "epoch": 0.9756801442958064, + "flos": 23623044946560.0, + "grad_norm": 1.8638878407509811, + "language_loss": 0.77582216, + "learning_rate": 6.185975605430549e-09, + "loss": 0.79665589, + "num_input_tokens_seen": 350199765, + "step": 16228, + "time_per_iteration": 4.240791320800781 + }, + { + "auxiliary_loss_clip": 0.00998962, + "auxiliary_loss_mlp": 0.01002751, + "balance_loss_clip": 1.00346792, + "balance_loss_mlp": 1.00191677, + "epoch": 0.9757402675484744, + "flos": 61625799440640.0, + "grad_norm": 0.8408002785337756, + "language_loss": 0.55878222, + "learning_rate": 6.155405563025962e-09, + "loss": 0.57879937, + "num_input_tokens_seen": 350256420, + "step": 16229, + "time_per_iteration": 3.1948070526123047 + }, + { + "auxiliary_loss_clip": 0.01049564, + "auxiliary_loss_mlp": 0.01027986, + "balance_loss_clip": 1.02360511, + "balance_loss_mlp": 1.0173521, + "epoch": 0.9758003908011423, + "flos": 24058964401920.0, + "grad_norm": 1.731447987961269, + "language_loss": 0.75337648, + "learning_rate": 6.124911127407984e-09, + "loss": 0.77415198, + "num_input_tokens_seen": 350276270, + "step": 16230, + "time_per_iteration": 2.759936571121216 + }, + { + "auxiliary_loss_clip": 0.01038717, + "auxiliary_loss_mlp": 0.01026273, + "balance_loss_clip": 1.02480078, + "balance_loss_mlp": 1.01692677, + "epoch": 0.9758605140538104, + "flos": 17493093717120.0, + "grad_norm": 2.2128129578760998, + "language_loss": 0.72598433, + "learning_rate": 6.094492299733245e-09, + "loss": 0.74663424, + "num_input_tokens_seen": 350295000, + "step": 16231, + "time_per_iteration": 2.7122623920440674 + }, + { + "auxiliary_loss_clip": 0.01043735, + "auxiliary_loss_mlp": 0.01031073, + "balance_loss_clip": 1.02590549, + "balance_loss_mlp": 1.02030218, + "epoch": 0.9759206373064783, + "flos": 24826950115200.0, + "grad_norm": 1.7637181157796264, + "language_loss": 0.76468801, + "learning_rate": 6.064149081155267e-09, + "loss": 0.78543615, + "num_input_tokens_seen": 350314980, + "step": 16232, + "time_per_iteration": 2.910675048828125 + }, + { + "auxiliary_loss_clip": 0.00990803, + "auxiliary_loss_mlp": 0.01001408, + "balance_loss_clip": 1.00451326, + "balance_loss_mlp": 1.00040686, + "epoch": 0.9759807605591463, + "flos": 68161182456960.0, + "grad_norm": 0.7874846253270917, + "language_loss": 0.53845119, + "learning_rate": 6.033881472824465e-09, + "loss": 0.55837333, + "num_input_tokens_seen": 350371985, + "step": 16233, + "time_per_iteration": 3.1038033962249756 + }, + { + "auxiliary_loss_clip": 0.01063039, + "auxiliary_loss_mlp": 0.01029345, + "balance_loss_clip": 1.02464139, + "balance_loss_mlp": 1.01873553, + "epoch": 0.9760408838118142, + "flos": 18989239939200.0, + "grad_norm": 2.240416641813113, + "language_loss": 0.71481872, + "learning_rate": 6.003689475888807e-09, + "loss": 0.73574257, + "num_input_tokens_seen": 350390590, + "step": 16234, + "time_per_iteration": 2.536214828491211 + }, + { + "auxiliary_loss_clip": 0.01052858, + "auxiliary_loss_mlp": 0.01028558, + "balance_loss_clip": 1.02439213, + "balance_loss_mlp": 1.01765037, + "epoch": 0.9761010070644822, + "flos": 17125978763520.0, + "grad_norm": 2.3474439625458903, + "language_loss": 0.78852773, + "learning_rate": 5.973573091493156e-09, + "loss": 0.80934185, + "num_input_tokens_seen": 350403770, + "step": 16235, + "time_per_iteration": 2.592756748199463 + }, + { + "auxiliary_loss_clip": 0.01045587, + "auxiliary_loss_mlp": 0.01031204, + "balance_loss_clip": 1.02566051, + "balance_loss_mlp": 1.0193429, + "epoch": 0.9761611303171501, + "flos": 22052599441920.0, + "grad_norm": 1.8229251760714102, + "language_loss": 0.76906037, + "learning_rate": 5.943532320779265e-09, + "loss": 0.78982824, + "num_input_tokens_seen": 350421870, + "step": 16236, + "time_per_iteration": 2.6243484020233154 + }, + { + "auxiliary_loss_clip": 0.01049267, + "auxiliary_loss_mlp": 0.01023645, + "balance_loss_clip": 1.02351558, + "balance_loss_mlp": 1.0140965, + "epoch": 0.9762212535698181, + "flos": 21757521214080.0, + "grad_norm": 1.6582076789331217, + "language_loss": 0.75688648, + "learning_rate": 5.913567164886446e-09, + "loss": 0.77761561, + "num_input_tokens_seen": 350440025, + "step": 16237, + "time_per_iteration": 2.6505000591278076 + }, + { + "auxiliary_loss_clip": 0.01015659, + "auxiliary_loss_mlp": 0.01029997, + "balance_loss_clip": 1.02209902, + "balance_loss_mlp": 1.01800489, + "epoch": 0.9762813768224861, + "flos": 25921615046400.0, + "grad_norm": 1.6825870816090056, + "language_loss": 0.73053831, + "learning_rate": 5.8836776249509e-09, + "loss": 0.75099492, + "num_input_tokens_seen": 350459435, + "step": 16238, + "time_per_iteration": 2.785144567489624 + }, + { + "auxiliary_loss_clip": 0.01043187, + "auxiliary_loss_mlp": 0.00747671, + "balance_loss_clip": 1.02543354, + "balance_loss_mlp": 1.00039268, + "epoch": 0.9763415000751541, + "flos": 24051853509120.0, + "grad_norm": 2.892344437449435, + "language_loss": 0.84105849, + "learning_rate": 5.8538637021063875e-09, + "loss": 0.85896707, + "num_input_tokens_seen": 350472655, + "step": 16239, + "time_per_iteration": 2.6712334156036377 + }, + { + "auxiliary_loss_clip": 0.01017672, + "auxiliary_loss_mlp": 0.01031642, + "balance_loss_clip": 1.02151287, + "balance_loss_mlp": 1.01907754, + "epoch": 0.976401623327822, + "flos": 17018677860480.0, + "grad_norm": 2.678451429879656, + "language_loss": 0.60143626, + "learning_rate": 5.824125397483115e-09, + "loss": 0.62192941, + "num_input_tokens_seen": 350488160, + "step": 16240, + "time_per_iteration": 2.687074661254883 + }, + { + "auxiliary_loss_clip": 0.01023286, + "auxiliary_loss_mlp": 0.0103063, + "balance_loss_clip": 1.02335882, + "balance_loss_mlp": 1.02039027, + "epoch": 0.97646174658049, + "flos": 16106941918080.0, + "grad_norm": 3.3541329482846356, + "language_loss": 0.82551003, + "learning_rate": 5.7944627122088474e-09, + "loss": 0.84604925, + "num_input_tokens_seen": 350506065, + "step": 16241, + "time_per_iteration": 2.7641072273254395 + }, + { + "auxiliary_loss_clip": 0.01023289, + "auxiliary_loss_mlp": 0.01034064, + "balance_loss_clip": 1.02354622, + "balance_loss_mlp": 1.02399731, + "epoch": 0.9765218698331579, + "flos": 21252725429760.0, + "grad_norm": 3.353776206514129, + "language_loss": 0.83352107, + "learning_rate": 5.764875647408463e-09, + "loss": 0.85409456, + "num_input_tokens_seen": 350524495, + "step": 16242, + "time_per_iteration": 2.709930419921875 + }, + { + "auxiliary_loss_clip": 0.0105287, + "auxiliary_loss_mlp": 0.01027236, + "balance_loss_clip": 1.02544451, + "balance_loss_mlp": 1.01713324, + "epoch": 0.9765819930858259, + "flos": 18588045957120.0, + "grad_norm": 1.5098998203881786, + "language_loss": 0.75325346, + "learning_rate": 5.7353642042037294e-09, + "loss": 0.77405453, + "num_input_tokens_seen": 350544185, + "step": 16243, + "time_per_iteration": 2.6141464710235596 + }, + { + "auxiliary_loss_clip": 0.01047346, + "auxiliary_loss_mlp": 0.01035049, + "balance_loss_clip": 1.02281141, + "balance_loss_mlp": 1.02375984, + "epoch": 0.976642116338494, + "flos": 20266833859200.0, + "grad_norm": 2.3441731712039195, + "language_loss": 0.70289356, + "learning_rate": 5.705928383713754e-09, + "loss": 0.72371745, + "num_input_tokens_seen": 350562675, + "step": 16244, + "time_per_iteration": 2.621182918548584 + }, + { + "auxiliary_loss_clip": 0.0104432, + "auxiliary_loss_mlp": 0.01028279, + "balance_loss_clip": 1.02667665, + "balance_loss_mlp": 1.01713312, + "epoch": 0.9767022395911619, + "flos": 25550477769600.0, + "grad_norm": 1.8347590708135821, + "language_loss": 0.83659613, + "learning_rate": 5.676568187055197e-09, + "loss": 0.85732216, + "num_input_tokens_seen": 350581535, + "step": 16245, + "time_per_iteration": 2.7748472690582275 + }, + { + "auxiliary_loss_clip": 0.01014668, + "auxiliary_loss_mlp": 0.01025123, + "balance_loss_clip": 1.0232048, + "balance_loss_mlp": 1.01494229, + "epoch": 0.9767623628438299, + "flos": 21762656858880.0, + "grad_norm": 1.4119434964066677, + "language_loss": 0.78721976, + "learning_rate": 5.647283615340726e-09, + "loss": 0.80761766, + "num_input_tokens_seen": 350601615, + "step": 16246, + "time_per_iteration": 2.79685115814209 + }, + { + "auxiliary_loss_clip": 0.01056431, + "auxiliary_loss_mlp": 0.01026391, + "balance_loss_clip": 1.02343512, + "balance_loss_mlp": 1.01748598, + "epoch": 0.9768224860964978, + "flos": 15851114277120.0, + "grad_norm": 1.5753725239391851, + "language_loss": 0.74068284, + "learning_rate": 5.6180746696812275e-09, + "loss": 0.76151109, + "num_input_tokens_seen": 350619580, + "step": 16247, + "time_per_iteration": 4.372573375701904 + }, + { + "auxiliary_loss_clip": 0.01000215, + "auxiliary_loss_mlp": 0.01030219, + "balance_loss_clip": 1.0208087, + "balance_loss_mlp": 1.01940727, + "epoch": 0.9768826093491658, + "flos": 25151151294720.0, + "grad_norm": 1.6959428386314253, + "language_loss": 0.79749298, + "learning_rate": 5.58894135118404e-09, + "loss": 0.8177973, + "num_input_tokens_seen": 350640015, + "step": 16248, + "time_per_iteration": 2.8218510150909424 + }, + { + "auxiliary_loss_clip": 0.01011605, + "auxiliary_loss_mlp": 0.01043905, + "balance_loss_clip": 1.02625537, + "balance_loss_mlp": 1.03092265, + "epoch": 0.9769427326018337, + "flos": 22967028904320.0, + "grad_norm": 1.7537163404219682, + "language_loss": 0.79079795, + "learning_rate": 5.559883660954278e-09, + "loss": 0.81135309, + "num_input_tokens_seen": 350659155, + "step": 16249, + "time_per_iteration": 2.840792417526245 + }, + { + "auxiliary_loss_clip": 0.01049685, + "auxiliary_loss_mlp": 0.0103092, + "balance_loss_clip": 1.02511239, + "balance_loss_mlp": 1.02040541, + "epoch": 0.9770028558545018, + "flos": 15264297786240.0, + "grad_norm": 2.1181684186411354, + "language_loss": 0.66677505, + "learning_rate": 5.530901600093507e-09, + "loss": 0.68758106, + "num_input_tokens_seen": 350676615, + "step": 16250, + "time_per_iteration": 2.691372871398926 + }, + { + "auxiliary_loss_clip": 0.01006081, + "auxiliary_loss_mlp": 0.01001032, + "balance_loss_clip": 1.00095439, + "balance_loss_mlp": 1.00013185, + "epoch": 0.9770629791071697, + "flos": 71450348808960.0, + "grad_norm": 0.7880979447126966, + "language_loss": 0.59875917, + "learning_rate": 5.501995169700846e-09, + "loss": 0.6188302, + "num_input_tokens_seen": 350736805, + "step": 16251, + "time_per_iteration": 3.2406911849975586 + }, + { + "auxiliary_loss_clip": 0.01045317, + "auxiliary_loss_mlp": 0.0102616, + "balance_loss_clip": 1.02365577, + "balance_loss_mlp": 1.01555061, + "epoch": 0.9771231023598377, + "flos": 22412854897920.0, + "grad_norm": 1.901071932636367, + "language_loss": 0.7858398, + "learning_rate": 5.473164370872307e-09, + "loss": 0.80655456, + "num_input_tokens_seen": 350753600, + "step": 16252, + "time_per_iteration": 2.6942310333251953 + }, + { + "auxiliary_loss_clip": 0.01042676, + "auxiliary_loss_mlp": 0.01029167, + "balance_loss_clip": 1.02284908, + "balance_loss_mlp": 1.01843166, + "epoch": 0.9771832256125056, + "flos": 19025940660480.0, + "grad_norm": 2.8815715053420576, + "language_loss": 0.64296812, + "learning_rate": 5.444409204701461e-09, + "loss": 0.66368651, + "num_input_tokens_seen": 350771225, + "step": 16253, + "time_per_iteration": 2.7307565212249756 + }, + { + "auxiliary_loss_clip": 0.01055324, + "auxiliary_loss_mlp": 0.01028435, + "balance_loss_clip": 1.02624798, + "balance_loss_mlp": 1.0161562, + "epoch": 0.9772433488651736, + "flos": 17822143232640.0, + "grad_norm": 2.293003910608915, + "language_loss": 0.76639301, + "learning_rate": 5.415729672278324e-09, + "loss": 0.78723061, + "num_input_tokens_seen": 350789100, + "step": 16254, + "time_per_iteration": 2.642648220062256 + }, + { + "auxiliary_loss_clip": 0.01053752, + "auxiliary_loss_mlp": 0.01030632, + "balance_loss_clip": 1.02480316, + "balance_loss_mlp": 1.01968312, + "epoch": 0.9773034721178415, + "flos": 37629785623680.0, + "grad_norm": 1.7878613361773594, + "language_loss": 0.63702989, + "learning_rate": 5.387125774690471e-09, + "loss": 0.65787375, + "num_input_tokens_seen": 350811085, + "step": 16255, + "time_per_iteration": 2.718696117401123 + }, + { + "auxiliary_loss_clip": 0.01034108, + "auxiliary_loss_mlp": 0.00747659, + "balance_loss_clip": 1.0243485, + "balance_loss_mlp": 1.00039053, + "epoch": 0.9773635953705095, + "flos": 20302457172480.0, + "grad_norm": 1.7074279340245841, + "language_loss": 0.75869608, + "learning_rate": 5.358597513023033e-09, + "loss": 0.7765137, + "num_input_tokens_seen": 350831065, + "step": 16256, + "time_per_iteration": 2.6884162425994873 + }, + { + "auxiliary_loss_clip": 0.01061945, + "auxiliary_loss_mlp": 0.01034249, + "balance_loss_clip": 1.02646828, + "balance_loss_mlp": 1.02327633, + "epoch": 0.9774237186231776, + "flos": 22309253095680.0, + "grad_norm": 2.076536604966455, + "language_loss": 0.78468561, + "learning_rate": 5.330144888357369e-09, + "loss": 0.80564761, + "num_input_tokens_seen": 350849675, + "step": 16257, + "time_per_iteration": 2.5756382942199707 + }, + { + "auxiliary_loss_clip": 0.01051686, + "auxiliary_loss_mlp": 0.0102936, + "balance_loss_clip": 1.02511311, + "balance_loss_mlp": 1.01879811, + "epoch": 0.9774838418758455, + "flos": 24204905360640.0, + "grad_norm": 1.6976861193829291, + "language_loss": 0.74944425, + "learning_rate": 5.301767901772391e-09, + "loss": 0.77025473, + "num_input_tokens_seen": 350868955, + "step": 16258, + "time_per_iteration": 2.6497576236724854 + }, + { + "auxiliary_loss_clip": 0.00998331, + "auxiliary_loss_mlp": 0.01000414, + "balance_loss_clip": 1.0030272, + "balance_loss_mlp": 0.99962139, + "epoch": 0.9775439651285135, + "flos": 66357139829760.0, + "grad_norm": 0.6947166272995265, + "language_loss": 0.59842682, + "learning_rate": 5.273466554344353e-09, + "loss": 0.61841428, + "num_input_tokens_seen": 350935110, + "step": 16259, + "time_per_iteration": 3.2853779792785645 + }, + { + "auxiliary_loss_clip": 0.0104644, + "auxiliary_loss_mlp": 0.01031647, + "balance_loss_clip": 1.02604651, + "balance_loss_mlp": 1.0208106, + "epoch": 0.9776040883811814, + "flos": 22601565976320.0, + "grad_norm": 1.5821137932321414, + "language_loss": 0.73647839, + "learning_rate": 5.2452408471461705e-09, + "loss": 0.75725925, + "num_input_tokens_seen": 350953220, + "step": 16260, + "time_per_iteration": 2.7139053344726562 + }, + { + "auxiliary_loss_clip": 0.01053202, + "auxiliary_loss_mlp": 0.01030175, + "balance_loss_clip": 1.0251826, + "balance_loss_mlp": 1.01925516, + "epoch": 0.9776642116338494, + "flos": 18442176825600.0, + "grad_norm": 1.8335957735621677, + "language_loss": 0.79495335, + "learning_rate": 5.2170907812485456e-09, + "loss": 0.81578708, + "num_input_tokens_seen": 350971915, + "step": 16261, + "time_per_iteration": 2.5993263721466064 + }, + { + "auxiliary_loss_clip": 0.01051367, + "auxiliary_loss_mlp": 0.01021095, + "balance_loss_clip": 1.02412879, + "balance_loss_mlp": 1.01105797, + "epoch": 0.9777243348865173, + "flos": 22638446265600.0, + "grad_norm": 2.710108042323779, + "language_loss": 0.74173069, + "learning_rate": 5.189016357718845e-09, + "loss": 0.76245534, + "num_input_tokens_seen": 350990470, + "step": 16262, + "time_per_iteration": 2.676331043243408 + }, + { + "auxiliary_loss_clip": 0.0105202, + "auxiliary_loss_mlp": 0.01027998, + "balance_loss_clip": 1.02416813, + "balance_loss_mlp": 1.01656568, + "epoch": 0.9777844581391854, + "flos": 31321394605440.0, + "grad_norm": 2.14220312573777, + "language_loss": 0.70130968, + "learning_rate": 5.16101757762133e-09, + "loss": 0.72210979, + "num_input_tokens_seen": 351010755, + "step": 16263, + "time_per_iteration": 2.7580583095550537 + }, + { + "auxiliary_loss_clip": 0.01052089, + "auxiliary_loss_mlp": 0.01028687, + "balance_loss_clip": 1.02489352, + "balance_loss_mlp": 1.01891208, + "epoch": 0.9778445813918533, + "flos": 23039101543680.0, + "grad_norm": 1.5286717534665428, + "language_loss": 0.6592778, + "learning_rate": 5.133094442018038e-09, + "loss": 0.6800856, + "num_input_tokens_seen": 351029965, + "step": 16264, + "time_per_iteration": 2.660271644592285 + }, + { + "auxiliary_loss_clip": 0.0102755, + "auxiliary_loss_mlp": 0.01033187, + "balance_loss_clip": 1.02759337, + "balance_loss_mlp": 1.02133727, + "epoch": 0.9779047046445213, + "flos": 17566351505280.0, + "grad_norm": 1.7690631975807385, + "language_loss": 0.73129344, + "learning_rate": 5.105246951967679e-09, + "loss": 0.75190085, + "num_input_tokens_seen": 351046205, + "step": 16265, + "time_per_iteration": 4.4214489459991455 + }, + { + "auxiliary_loss_clip": 0.01051537, + "auxiliary_loss_mlp": 0.01028564, + "balance_loss_clip": 1.02487457, + "balance_loss_mlp": 1.01832974, + "epoch": 0.9779648278971892, + "flos": 20741141975040.0, + "grad_norm": 1.7812919332165935, + "language_loss": 0.68877208, + "learning_rate": 5.077475108526297e-09, + "loss": 0.70957315, + "num_input_tokens_seen": 351065390, + "step": 16266, + "time_per_iteration": 2.673013687133789 + }, + { + "auxiliary_loss_clip": 0.0101465, + "auxiliary_loss_mlp": 0.01028024, + "balance_loss_clip": 1.02227485, + "balance_loss_mlp": 1.01847494, + "epoch": 0.9780249511498572, + "flos": 21026954494080.0, + "grad_norm": 1.6723064652828907, + "language_loss": 0.86796474, + "learning_rate": 5.049778912747049e-09, + "loss": 0.88839144, + "num_input_tokens_seen": 351084355, + "step": 16267, + "time_per_iteration": 2.718257188796997 + }, + { + "auxiliary_loss_clip": 0.00998397, + "auxiliary_loss_mlp": 0.01031055, + "balance_loss_clip": 1.02225232, + "balance_loss_mlp": 1.01991463, + "epoch": 0.9780850744025251, + "flos": 30774223751040.0, + "grad_norm": 1.9086207526640937, + "language_loss": 0.69593525, + "learning_rate": 5.022158365679985e-09, + "loss": 0.71622974, + "num_input_tokens_seen": 351105870, + "step": 16268, + "time_per_iteration": 2.883723735809326 + }, + { + "auxiliary_loss_clip": 0.01039468, + "auxiliary_loss_mlp": 0.01025061, + "balance_loss_clip": 1.02358651, + "balance_loss_mlp": 1.01480341, + "epoch": 0.9781451976551931, + "flos": 20302995876480.0, + "grad_norm": 1.4893433164819503, + "language_loss": 0.7360518, + "learning_rate": 4.994613468372711e-09, + "loss": 0.75669706, + "num_input_tokens_seen": 351124760, + "step": 16269, + "time_per_iteration": 2.7779057025909424 + }, + { + "auxiliary_loss_clip": 0.01035063, + "auxiliary_loss_mlp": 0.01030065, + "balance_loss_clip": 1.02343106, + "balance_loss_mlp": 1.01812041, + "epoch": 0.9782053209078612, + "flos": 24316479982080.0, + "grad_norm": 1.6473593006738145, + "language_loss": 0.7047416, + "learning_rate": 4.967144221869501e-09, + "loss": 0.72539288, + "num_input_tokens_seen": 351142820, + "step": 16270, + "time_per_iteration": 2.74960994720459 + }, + { + "auxiliary_loss_clip": 0.01063308, + "auxiliary_loss_mlp": 0.01030668, + "balance_loss_clip": 1.02550352, + "balance_loss_mlp": 1.02038074, + "epoch": 0.9782654441605291, + "flos": 32489425065600.0, + "grad_norm": 1.7249310280604953, + "language_loss": 0.63821822, + "learning_rate": 4.939750627212191e-09, + "loss": 0.65915799, + "num_input_tokens_seen": 351164805, + "step": 16271, + "time_per_iteration": 2.6562650203704834 + }, + { + "auxiliary_loss_clip": 0.0104061, + "auxiliary_loss_mlp": 0.01031415, + "balance_loss_clip": 1.02542353, + "balance_loss_mlp": 1.02060854, + "epoch": 0.9783255674131971, + "flos": 26979076465920.0, + "grad_norm": 1.4232985246620393, + "language_loss": 0.70221567, + "learning_rate": 4.912432685439505e-09, + "loss": 0.72293597, + "num_input_tokens_seen": 351187005, + "step": 16272, + "time_per_iteration": 2.7783310413360596 + }, + { + "auxiliary_loss_clip": 0.01016024, + "auxiliary_loss_mlp": 0.01030855, + "balance_loss_clip": 1.02787566, + "balance_loss_mlp": 1.01961911, + "epoch": 0.978385690665865, + "flos": 23112251591040.0, + "grad_norm": 1.712306614677791, + "language_loss": 0.66761982, + "learning_rate": 4.88519039758728e-09, + "loss": 0.68808854, + "num_input_tokens_seen": 351208450, + "step": 16273, + "time_per_iteration": 2.8014841079711914 + }, + { + "auxiliary_loss_clip": 0.01031843, + "auxiliary_loss_mlp": 0.01022788, + "balance_loss_clip": 1.02420962, + "balance_loss_mlp": 1.011922, + "epoch": 0.978445813918533, + "flos": 25409672455680.0, + "grad_norm": 2.029348543716854, + "language_loss": 0.74275357, + "learning_rate": 4.85802376468869e-09, + "loss": 0.76329982, + "num_input_tokens_seen": 351229585, + "step": 16274, + "time_per_iteration": 4.609980344772339 + }, + { + "auxiliary_loss_clip": 0.01035848, + "auxiliary_loss_mlp": 0.01028249, + "balance_loss_clip": 1.02381539, + "balance_loss_mlp": 1.01818824, + "epoch": 0.9785059371712009, + "flos": 23550218121600.0, + "grad_norm": 1.595904596602063, + "language_loss": 0.7788983, + "learning_rate": 4.830932787773579e-09, + "loss": 0.79953933, + "num_input_tokens_seen": 351249525, + "step": 16275, + "time_per_iteration": 4.476066827774048 + }, + { + "auxiliary_loss_clip": 0.00991264, + "auxiliary_loss_mlp": 0.01024925, + "balance_loss_clip": 1.02315903, + "balance_loss_mlp": 1.01379704, + "epoch": 0.978566060423869, + "flos": 34351177870080.0, + "grad_norm": 2.6252030833229325, + "language_loss": 0.70964634, + "learning_rate": 4.803917467869567e-09, + "loss": 0.72980827, + "num_input_tokens_seen": 351272530, + "step": 16276, + "time_per_iteration": 2.97724986076355 + }, + { + "auxiliary_loss_clip": 0.01030745, + "auxiliary_loss_mlp": 0.01025845, + "balance_loss_clip": 1.02189326, + "balance_loss_mlp": 1.01624322, + "epoch": 0.9786261836765369, + "flos": 11618862387840.0, + "grad_norm": 1.7490835959915236, + "language_loss": 0.85416335, + "learning_rate": 4.776977806000726e-09, + "loss": 0.87472928, + "num_input_tokens_seen": 351288530, + "step": 16277, + "time_per_iteration": 2.951396942138672 + }, + { + "auxiliary_loss_clip": 0.01042682, + "auxiliary_loss_mlp": 0.01025303, + "balance_loss_clip": 1.02288294, + "balance_loss_mlp": 1.0144068, + "epoch": 0.9786863069292049, + "flos": 17420949250560.0, + "grad_norm": 2.025223267484331, + "language_loss": 0.7085979, + "learning_rate": 4.7501138031891264e-09, + "loss": 0.72927773, + "num_input_tokens_seen": 351305890, + "step": 16278, + "time_per_iteration": 2.6420705318450928 + }, + { + "auxiliary_loss_clip": 0.01040695, + "auxiliary_loss_mlp": 0.01025444, + "balance_loss_clip": 1.02290821, + "balance_loss_mlp": 1.01463199, + "epoch": 0.9787464301818728, + "flos": 20844923345280.0, + "grad_norm": 1.8472313047514006, + "language_loss": 0.84401679, + "learning_rate": 4.723325460453065e-09, + "loss": 0.86467814, + "num_input_tokens_seen": 351325010, + "step": 16279, + "time_per_iteration": 2.5909438133239746 + }, + { + "auxiliary_loss_clip": 0.01050328, + "auxiliary_loss_mlp": 0.01028607, + "balance_loss_clip": 1.0240345, + "balance_loss_mlp": 1.01790845, + "epoch": 0.9788065534345408, + "flos": 18222942165120.0, + "grad_norm": 1.8826475028682823, + "language_loss": 0.79266655, + "learning_rate": 4.696612778808395e-09, + "loss": 0.81345594, + "num_input_tokens_seen": 351343060, + "step": 16280, + "time_per_iteration": 2.712012767791748 + }, + { + "auxiliary_loss_clip": 0.01019007, + "auxiliary_loss_mlp": 0.01031615, + "balance_loss_clip": 1.02336478, + "balance_loss_mlp": 1.02219796, + "epoch": 0.9788666766872087, + "flos": 21578219498880.0, + "grad_norm": 1.728876273443307, + "language_loss": 0.7958442, + "learning_rate": 4.669975759268085e-09, + "loss": 0.8163504, + "num_input_tokens_seen": 351363260, + "step": 16281, + "time_per_iteration": 2.669806957244873 + }, + { + "auxiliary_loss_clip": 0.01054421, + "auxiliary_loss_mlp": 0.01029258, + "balance_loss_clip": 1.02696502, + "balance_loss_mlp": 1.01823092, + "epoch": 0.9789267999398767, + "flos": 24900495212160.0, + "grad_norm": 1.6150334122498091, + "language_loss": 0.80337906, + "learning_rate": 4.643414402842216e-09, + "loss": 0.82421589, + "num_input_tokens_seen": 351382610, + "step": 16282, + "time_per_iteration": 2.7779204845428467 + }, + { + "auxiliary_loss_clip": 0.0104261, + "auxiliary_loss_mlp": 0.01034455, + "balance_loss_clip": 1.02465248, + "balance_loss_mlp": 1.02426267, + "epoch": 0.9789869231925448, + "flos": 19573111514880.0, + "grad_norm": 1.8978998878102507, + "language_loss": 0.83094037, + "learning_rate": 4.616928710538204e-09, + "loss": 0.85171098, + "num_input_tokens_seen": 351401075, + "step": 16283, + "time_per_iteration": 2.9315907955169678 + }, + { + "auxiliary_loss_clip": 0.01052204, + "auxiliary_loss_mlp": 0.0102975, + "balance_loss_clip": 1.02560651, + "balance_loss_mlp": 1.01931894, + "epoch": 0.9790470464452127, + "flos": 16796641939200.0, + "grad_norm": 2.108190464767395, + "language_loss": 0.72203583, + "learning_rate": 4.590518683360134e-09, + "loss": 0.74285531, + "num_input_tokens_seen": 351419275, + "step": 16284, + "time_per_iteration": 2.684938430786133 + }, + { + "auxiliary_loss_clip": 0.01047895, + "auxiliary_loss_mlp": 0.01030256, + "balance_loss_clip": 1.02339113, + "balance_loss_mlp": 1.02056992, + "epoch": 0.9791071696978807, + "flos": 18369350000640.0, + "grad_norm": 2.026246867488626, + "language_loss": 0.64356554, + "learning_rate": 4.56418432230965e-09, + "loss": 0.66434705, + "num_input_tokens_seen": 351437375, + "step": 16285, + "time_per_iteration": 2.865950107574463 + }, + { + "auxiliary_loss_clip": 0.01043102, + "auxiliary_loss_mlp": 0.01028044, + "balance_loss_clip": 1.02587259, + "balance_loss_mlp": 1.01758385, + "epoch": 0.9791672929505486, + "flos": 24170323541760.0, + "grad_norm": 1.8472657489349762, + "language_loss": 0.70593238, + "learning_rate": 4.537925628385286e-09, + "loss": 0.72664392, + "num_input_tokens_seen": 351457810, + "step": 16286, + "time_per_iteration": 2.7579505443573 + }, + { + "auxiliary_loss_clip": 0.01051431, + "auxiliary_loss_mlp": 0.01027157, + "balance_loss_clip": 1.0256958, + "balance_loss_mlp": 1.01691663, + "epoch": 0.9792274162032166, + "flos": 24354114456960.0, + "grad_norm": 2.1112048150202427, + "language_loss": 0.58643973, + "learning_rate": 4.511742602582691e-09, + "loss": 0.60722566, + "num_input_tokens_seen": 351478825, + "step": 16287, + "time_per_iteration": 2.6830763816833496 + }, + { + "auxiliary_loss_clip": 0.01052413, + "auxiliary_loss_mlp": 0.01031168, + "balance_loss_clip": 1.0260601, + "balance_loss_mlp": 1.02088642, + "epoch": 0.9792875394558845, + "flos": 26395779507840.0, + "grad_norm": 2.603880756841279, + "language_loss": 0.81761223, + "learning_rate": 4.485635245894626e-09, + "loss": 0.83844805, + "num_input_tokens_seen": 351498785, + "step": 16288, + "time_per_iteration": 2.686497926712036 + }, + { + "auxiliary_loss_clip": 0.01036269, + "auxiliary_loss_mlp": 0.00747687, + "balance_loss_clip": 1.02297401, + "balance_loss_mlp": 1.00036001, + "epoch": 0.9793476627085526, + "flos": 28148004766080.0, + "grad_norm": 1.505540273421019, + "language_loss": 0.719504, + "learning_rate": 4.459603559311631e-09, + "loss": 0.73734355, + "num_input_tokens_seen": 351520235, + "step": 16289, + "time_per_iteration": 2.71645450592041 + }, + { + "auxiliary_loss_clip": 0.01032978, + "auxiliary_loss_mlp": 0.01031406, + "balance_loss_clip": 1.02583957, + "balance_loss_mlp": 1.02039146, + "epoch": 0.9794077859612205, + "flos": 16763927627520.0, + "grad_norm": 8.518234475411965, + "language_loss": 0.75183195, + "learning_rate": 4.43364754382003e-09, + "loss": 0.77247578, + "num_input_tokens_seen": 351538900, + "step": 16290, + "time_per_iteration": 2.738107919692993 + }, + { + "auxiliary_loss_clip": 0.01052124, + "auxiliary_loss_mlp": 0.01029623, + "balance_loss_clip": 1.02406001, + "balance_loss_mlp": 1.01846528, + "epoch": 0.9794679092138885, + "flos": 19280834547840.0, + "grad_norm": 1.5700101140556748, + "language_loss": 0.6722846, + "learning_rate": 4.4077672004048105e-09, + "loss": 0.69310206, + "num_input_tokens_seen": 351558715, + "step": 16291, + "time_per_iteration": 2.698275089263916 + }, + { + "auxiliary_loss_clip": 0.01064304, + "auxiliary_loss_mlp": 0.00747674, + "balance_loss_clip": 1.02504754, + "balance_loss_mlp": 1.00037026, + "epoch": 0.9795280324665564, + "flos": 32156640535680.0, + "grad_norm": 1.651785865258889, + "language_loss": 0.62830305, + "learning_rate": 4.3819625300467456e-09, + "loss": 0.64642286, + "num_input_tokens_seen": 351578450, + "step": 16292, + "time_per_iteration": 2.6623620986938477 + }, + { + "auxiliary_loss_clip": 0.01031817, + "auxiliary_loss_mlp": 0.01029741, + "balance_loss_clip": 1.02520823, + "balance_loss_mlp": 1.01973319, + "epoch": 0.9795881557192244, + "flos": 19060953442560.0, + "grad_norm": 2.11369915294967, + "language_loss": 0.73534727, + "learning_rate": 4.356233533724829e-09, + "loss": 0.75596285, + "num_input_tokens_seen": 351597195, + "step": 16293, + "time_per_iteration": 4.441349506378174 + }, + { + "auxiliary_loss_clip": 0.01052075, + "auxiliary_loss_mlp": 0.01025987, + "balance_loss_clip": 1.02418065, + "balance_loss_mlp": 1.01547885, + "epoch": 0.9796482789718923, + "flos": 28329928174080.0, + "grad_norm": 1.8213571770195576, + "language_loss": 0.83500469, + "learning_rate": 4.330580212414503e-09, + "loss": 0.85578525, + "num_input_tokens_seen": 351617460, + "step": 16294, + "time_per_iteration": 2.647599697113037 + }, + { + "auxiliary_loss_clip": 0.01025056, + "auxiliary_loss_mlp": 0.01031063, + "balance_loss_clip": 1.02181864, + "balance_loss_mlp": 1.02191925, + "epoch": 0.9797084022245603, + "flos": 17967976450560.0, + "grad_norm": 1.836756951501514, + "language_loss": 0.71744174, + "learning_rate": 4.305002567088767e-09, + "loss": 0.73800296, + "num_input_tokens_seen": 351635900, + "step": 16295, + "time_per_iteration": 2.7049920558929443 + }, + { + "auxiliary_loss_clip": 0.01055449, + "auxiliary_loss_mlp": 0.01033653, + "balance_loss_clip": 1.02586031, + "balance_loss_mlp": 1.02269197, + "epoch": 0.9797685254772284, + "flos": 20266726118400.0, + "grad_norm": 1.8223964294156525, + "language_loss": 0.80640578, + "learning_rate": 4.2795005987170674e-09, + "loss": 0.82729679, + "num_input_tokens_seen": 351655400, + "step": 16296, + "time_per_iteration": 2.8116259574890137 + }, + { + "auxiliary_loss_clip": 0.01034944, + "auxiliary_loss_mlp": 0.01030801, + "balance_loss_clip": 1.02311087, + "balance_loss_mlp": 1.02041793, + "epoch": 0.9798286487298963, + "flos": 26907147480960.0, + "grad_norm": 1.6885816717548496, + "language_loss": 0.7564733, + "learning_rate": 4.254074308266853e-09, + "loss": 0.77713072, + "num_input_tokens_seen": 351675505, + "step": 16297, + "time_per_iteration": 2.728823184967041 + }, + { + "auxiliary_loss_clip": 0.01048377, + "auxiliary_loss_mlp": 0.0103089, + "balance_loss_clip": 1.02297997, + "balance_loss_mlp": 1.02062047, + "epoch": 0.9798887719825643, + "flos": 27161071701120.0, + "grad_norm": 1.5686841376256782, + "language_loss": 0.78498095, + "learning_rate": 4.228723696702019e-09, + "loss": 0.80577362, + "num_input_tokens_seen": 351697920, + "step": 16298, + "time_per_iteration": 2.7415802478790283 + }, + { + "auxiliary_loss_clip": 0.01047721, + "auxiliary_loss_mlp": 0.01024981, + "balance_loss_clip": 1.02335477, + "balance_loss_mlp": 1.01546764, + "epoch": 0.9799488952352322, + "flos": 20668422890880.0, + "grad_norm": 2.1662398179862854, + "language_loss": 0.72705662, + "learning_rate": 4.203448764984019e-09, + "loss": 0.74778354, + "num_input_tokens_seen": 351717615, + "step": 16299, + "time_per_iteration": 2.6566274166107178 + }, + { + "auxiliary_loss_clip": 0.01036959, + "auxiliary_loss_mlp": 0.01025611, + "balance_loss_clip": 1.02317178, + "balance_loss_mlp": 1.01466203, + "epoch": 0.9800090184879002, + "flos": 21981209160960.0, + "grad_norm": 2.0056927284282717, + "language_loss": 0.89355123, + "learning_rate": 4.178249514071419e-09, + "loss": 0.91417694, + "num_input_tokens_seen": 351735260, + "step": 16300, + "time_per_iteration": 2.7259583473205566 + }, + { + "auxiliary_loss_clip": 0.01051658, + "auxiliary_loss_mlp": 0.01028257, + "balance_loss_clip": 1.02356172, + "balance_loss_mlp": 1.01736164, + "epoch": 0.9800691417405681, + "flos": 21288420570240.0, + "grad_norm": 2.913632204284217, + "language_loss": 0.78159112, + "learning_rate": 4.1531259449194555e-09, + "loss": 0.80239034, + "num_input_tokens_seen": 351755800, + "step": 16301, + "time_per_iteration": 2.6165246963500977 + }, + { + "auxiliary_loss_clip": 0.01042875, + "auxiliary_loss_mlp": 0.01032732, + "balance_loss_clip": 1.02499461, + "balance_loss_mlp": 1.02223015, + "epoch": 0.9801292649932362, + "flos": 18439878355200.0, + "grad_norm": 1.976984606471965, + "language_loss": 0.75481141, + "learning_rate": 4.128078058480921e-09, + "loss": 0.77556753, + "num_input_tokens_seen": 351774790, + "step": 16302, + "time_per_iteration": 2.7227225303649902 + }, + { + "auxiliary_loss_clip": 0.01032861, + "auxiliary_loss_mlp": 0.01027579, + "balance_loss_clip": 1.02388096, + "balance_loss_mlp": 1.01692736, + "epoch": 0.9801893882459041, + "flos": 25046364343680.0, + "grad_norm": 1.8218105141501757, + "language_loss": 0.79513836, + "learning_rate": 4.103105855705724e-09, + "loss": 0.81574273, + "num_input_tokens_seen": 351792855, + "step": 16303, + "time_per_iteration": 2.8665528297424316 + }, + { + "auxiliary_loss_clip": 0.01019103, + "auxiliary_loss_mlp": 0.01034138, + "balance_loss_clip": 1.02213192, + "balance_loss_mlp": 1.02236044, + "epoch": 0.9802495114985721, + "flos": 18511484117760.0, + "grad_norm": 1.9789947835490658, + "language_loss": 0.82588059, + "learning_rate": 4.078209337540883e-09, + "loss": 0.84641302, + "num_input_tokens_seen": 351811450, + "step": 16304, + "time_per_iteration": 2.7312610149383545 + }, + { + "auxiliary_loss_clip": 0.01027766, + "auxiliary_loss_mlp": 0.01022055, + "balance_loss_clip": 1.0241667, + "balance_loss_mlp": 1.01307261, + "epoch": 0.98030963475124, + "flos": 21469841187840.0, + "grad_norm": 1.7887465989964337, + "language_loss": 0.70257103, + "learning_rate": 4.053388504930089e-09, + "loss": 0.72306925, + "num_input_tokens_seen": 351831960, + "step": 16305, + "time_per_iteration": 2.7038938999176025 + }, + { + "auxiliary_loss_clip": 0.01038737, + "auxiliary_loss_mlp": 0.01040292, + "balance_loss_clip": 1.02549791, + "balance_loss_mlp": 1.02813876, + "epoch": 0.980369758003908, + "flos": 20412272027520.0, + "grad_norm": 1.8038919909860913, + "language_loss": 0.71965849, + "learning_rate": 4.028643358815032e-09, + "loss": 0.74044883, + "num_input_tokens_seen": 351851585, + "step": 16306, + "time_per_iteration": 2.843808650970459 + }, + { + "auxiliary_loss_clip": 0.01033619, + "auxiliary_loss_mlp": 0.01031386, + "balance_loss_clip": 1.02219021, + "balance_loss_mlp": 1.02171195, + "epoch": 0.9804298812565759, + "flos": 23399177431680.0, + "grad_norm": 6.274454611669473, + "language_loss": 0.73463839, + "learning_rate": 4.00397390013385e-09, + "loss": 0.75528836, + "num_input_tokens_seen": 351871085, + "step": 16307, + "time_per_iteration": 2.780820846557617 + }, + { + "auxiliary_loss_clip": 0.01034577, + "auxiliary_loss_mlp": 0.01026974, + "balance_loss_clip": 1.02886665, + "balance_loss_mlp": 1.01822376, + "epoch": 0.980490004509244, + "flos": 23292666627840.0, + "grad_norm": 1.5112499289883687, + "language_loss": 0.74504173, + "learning_rate": 3.979380129822018e-09, + "loss": 0.76565725, + "num_input_tokens_seen": 351891775, + "step": 16308, + "time_per_iteration": 2.781412124633789 + }, + { + "auxiliary_loss_clip": 0.00987608, + "auxiliary_loss_mlp": 0.01000775, + "balance_loss_clip": 1.00249553, + "balance_loss_mlp": 0.99992222, + "epoch": 0.980550127761912, + "flos": 56051027798400.0, + "grad_norm": 0.7567809263597806, + "language_loss": 0.57819861, + "learning_rate": 3.954862048811902e-09, + "loss": 0.59808242, + "num_input_tokens_seen": 351946770, + "step": 16309, + "time_per_iteration": 3.203108787536621 + }, + { + "auxiliary_loss_clip": 0.010226, + "auxiliary_loss_mlp": 0.01028525, + "balance_loss_clip": 1.02414715, + "balance_loss_mlp": 1.01798666, + "epoch": 0.9806102510145799, + "flos": 25333290184320.0, + "grad_norm": 1.708018917795123, + "language_loss": 0.66344881, + "learning_rate": 3.930419658033646e-09, + "loss": 0.68396002, + "num_input_tokens_seen": 351966155, + "step": 16310, + "time_per_iteration": 2.81408953666687 + }, + { + "auxiliary_loss_clip": 0.00990402, + "auxiliary_loss_mlp": 0.01001485, + "balance_loss_clip": 1.00436974, + "balance_loss_mlp": 1.00065601, + "epoch": 0.9806703742672479, + "flos": 67274837429760.0, + "grad_norm": 3.0651987388398454, + "language_loss": 0.54530221, + "learning_rate": 3.906052958413841e-09, + "loss": 0.56522107, + "num_input_tokens_seen": 352031655, + "step": 16311, + "time_per_iteration": 3.3804283142089844 + }, + { + "auxiliary_loss_clip": 0.01048841, + "auxiliary_loss_mlp": 0.01023007, + "balance_loss_clip": 1.02404463, + "balance_loss_mlp": 1.01300538, + "epoch": 0.9807304975199158, + "flos": 25228970110080.0, + "grad_norm": 2.4069258433739456, + "language_loss": 0.80433553, + "learning_rate": 3.881761950876638e-09, + "loss": 0.82505399, + "num_input_tokens_seen": 352051920, + "step": 16312, + "time_per_iteration": 2.732771873474121 + }, + { + "auxiliary_loss_clip": 0.01037658, + "auxiliary_loss_mlp": 0.01026779, + "balance_loss_clip": 1.02324867, + "balance_loss_mlp": 1.01692653, + "epoch": 0.9807906207725838, + "flos": 17456392995840.0, + "grad_norm": 3.3559133024004137, + "language_loss": 0.6322062, + "learning_rate": 3.8575466363430785e-09, + "loss": 0.65285063, + "num_input_tokens_seen": 352069315, + "step": 16313, + "time_per_iteration": 4.409528970718384 + }, + { + "auxiliary_loss_clip": 0.01051884, + "auxiliary_loss_mlp": 0.01028693, + "balance_loss_clip": 1.025231, + "balance_loss_mlp": 1.0185008, + "epoch": 0.9808507440252517, + "flos": 21032413361280.0, + "grad_norm": 4.981442317770143, + "language_loss": 0.72852349, + "learning_rate": 3.833407015731316e-09, + "loss": 0.74932927, + "num_input_tokens_seen": 352089480, + "step": 16314, + "time_per_iteration": 2.7138113975524902 + }, + { + "auxiliary_loss_clip": 0.00979529, + "auxiliary_loss_mlp": 0.01001805, + "balance_loss_clip": 1.00422931, + "balance_loss_mlp": 1.00094652, + "epoch": 0.9809108672779198, + "flos": 64044491598720.0, + "grad_norm": 0.6904373891823694, + "language_loss": 0.51683998, + "learning_rate": 3.80934308995684e-09, + "loss": 0.53665334, + "num_input_tokens_seen": 352150000, + "step": 16315, + "time_per_iteration": 3.2544472217559814 + }, + { + "auxiliary_loss_clip": 0.01049685, + "auxiliary_loss_mlp": 0.01027338, + "balance_loss_clip": 1.02343929, + "balance_loss_mlp": 1.01745582, + "epoch": 0.9809709905305877, + "flos": 22780616296320.0, + "grad_norm": 1.2965544390164305, + "language_loss": 0.69586062, + "learning_rate": 3.785354859932033e-09, + "loss": 0.71663082, + "num_input_tokens_seen": 352170990, + "step": 16316, + "time_per_iteration": 2.7434003353118896 + }, + { + "auxiliary_loss_clip": 0.01061717, + "auxiliary_loss_mlp": 0.01027333, + "balance_loss_clip": 1.02424443, + "balance_loss_mlp": 1.01714706, + "epoch": 0.9810311137832557, + "flos": 37013415217920.0, + "grad_norm": 1.7072069491980146, + "language_loss": 0.55062735, + "learning_rate": 3.76144232656661e-09, + "loss": 0.57151783, + "num_input_tokens_seen": 352195335, + "step": 16317, + "time_per_iteration": 2.7386088371276855 + }, + { + "auxiliary_loss_clip": 0.01000968, + "auxiliary_loss_mlp": 0.01030189, + "balance_loss_clip": 1.01907563, + "balance_loss_mlp": 1.01951957, + "epoch": 0.9810912370359236, + "flos": 18916305373440.0, + "grad_norm": 1.56983383307671, + "language_loss": 0.7282064, + "learning_rate": 3.737605490767404e-09, + "loss": 0.74851805, + "num_input_tokens_seen": 352214170, + "step": 16318, + "time_per_iteration": 2.822632312774658 + }, + { + "auxiliary_loss_clip": 0.01040022, + "auxiliary_loss_mlp": 0.01026283, + "balance_loss_clip": 1.02409542, + "balance_loss_mlp": 1.01668072, + "epoch": 0.9811513602885916, + "flos": 18441602208000.0, + "grad_norm": 2.20651407312768, + "language_loss": 0.82455945, + "learning_rate": 3.7138443534383555e-09, + "loss": 0.84522253, + "num_input_tokens_seen": 352231470, + "step": 16319, + "time_per_iteration": 2.7005889415740967 + }, + { + "auxiliary_loss_clip": 0.00996201, + "auxiliary_loss_mlp": 0.01008433, + "balance_loss_clip": 1.00102925, + "balance_loss_mlp": 1.00746179, + "epoch": 0.9812114835412595, + "flos": 68058945371520.0, + "grad_norm": 0.7326372436902263, + "language_loss": 0.5359112, + "learning_rate": 3.6901589154803014e-09, + "loss": 0.55595756, + "num_input_tokens_seen": 352291770, + "step": 16320, + "time_per_iteration": 4.9190685749053955 + }, + { + "auxiliary_loss_clip": 0.01017085, + "auxiliary_loss_mlp": 0.01030524, + "balance_loss_clip": 1.02262473, + "balance_loss_mlp": 1.01983118, + "epoch": 0.9812716067939276, + "flos": 25373007648000.0, + "grad_norm": 1.497834585357082, + "language_loss": 0.73024833, + "learning_rate": 3.6665491777914116e-09, + "loss": 0.75072443, + "num_input_tokens_seen": 352310735, + "step": 16321, + "time_per_iteration": 2.875882148742676 + }, + { + "auxiliary_loss_clip": 0.01039064, + "auxiliary_loss_mlp": 0.01029797, + "balance_loss_clip": 1.02553082, + "balance_loss_mlp": 1.01956296, + "epoch": 0.9813317300465956, + "flos": 22856818999680.0, + "grad_norm": 1.9456852057903895, + "language_loss": 0.78428072, + "learning_rate": 3.6430151412669698e-09, + "loss": 0.80496931, + "num_input_tokens_seen": 352329545, + "step": 16322, + "time_per_iteration": 4.4093523025512695 + }, + { + "auxiliary_loss_clip": 0.01050294, + "auxiliary_loss_mlp": 0.01031854, + "balance_loss_clip": 1.0243578, + "balance_loss_mlp": 1.0213573, + "epoch": 0.9813918532992635, + "flos": 23586954756480.0, + "grad_norm": 1.5111769009104168, + "language_loss": 0.8068707, + "learning_rate": 3.619556806799595e-09, + "loss": 0.82769215, + "num_input_tokens_seen": 352352080, + "step": 16323, + "time_per_iteration": 2.724270820617676 + }, + { + "auxiliary_loss_clip": 0.01062709, + "auxiliary_loss_mlp": 0.01030328, + "balance_loss_clip": 1.02473009, + "balance_loss_mlp": 1.02041531, + "epoch": 0.9814519765519315, + "flos": 19606328616960.0, + "grad_norm": 2.460518174124119, + "language_loss": 0.84446526, + "learning_rate": 3.596174175278799e-09, + "loss": 0.86539561, + "num_input_tokens_seen": 352366455, + "step": 16324, + "time_per_iteration": 2.6203761100769043 + }, + { + "auxiliary_loss_clip": 0.01042838, + "auxiliary_loss_mlp": 0.01026627, + "balance_loss_clip": 1.02564216, + "balance_loss_mlp": 1.01570106, + "epoch": 0.9815120998045994, + "flos": 33946284787200.0, + "grad_norm": 1.5280778124286167, + "language_loss": 0.74553668, + "learning_rate": 3.5728672475909827e-09, + "loss": 0.7662313, + "num_input_tokens_seen": 352386090, + "step": 16325, + "time_per_iteration": 2.7644436359405518 + }, + { + "auxiliary_loss_clip": 0.01022097, + "auxiliary_loss_mlp": 0.01030317, + "balance_loss_clip": 1.02634287, + "balance_loss_mlp": 1.02102518, + "epoch": 0.9815722230572674, + "flos": 20850023076480.0, + "grad_norm": 1.7221625305091417, + "language_loss": 0.76875722, + "learning_rate": 3.5496360246201063e-09, + "loss": 0.78928137, + "num_input_tokens_seen": 352404000, + "step": 16326, + "time_per_iteration": 2.987917423248291 + }, + { + "auxiliary_loss_clip": 0.01025902, + "auxiliary_loss_mlp": 0.01026428, + "balance_loss_clip": 1.02265882, + "balance_loss_mlp": 1.01577115, + "epoch": 0.9816323463099353, + "flos": 22894525301760.0, + "grad_norm": 3.0595434284413434, + "language_loss": 0.66657686, + "learning_rate": 3.5264805072470205e-09, + "loss": 0.68710017, + "num_input_tokens_seen": 352423540, + "step": 16327, + "time_per_iteration": 2.9494707584381104 + }, + { + "auxiliary_loss_clip": 0.01052681, + "auxiliary_loss_mlp": 0.01034804, + "balance_loss_clip": 1.02417135, + "balance_loss_mlp": 1.02370572, + "epoch": 0.9816924695626034, + "flos": 31539444117120.0, + "grad_norm": 1.4306217116133426, + "language_loss": 0.73772979, + "learning_rate": 3.5034006963501337e-09, + "loss": 0.75860465, + "num_input_tokens_seen": 352445530, + "step": 16328, + "time_per_iteration": 2.7300913333892822 + }, + { + "auxiliary_loss_clip": 0.01041556, + "auxiliary_loss_mlp": 0.01037707, + "balance_loss_clip": 1.02372265, + "balance_loss_mlp": 1.02582169, + "epoch": 0.9817525928152713, + "flos": 21506901045120.0, + "grad_norm": 6.849161523654544, + "language_loss": 0.81157863, + "learning_rate": 3.4803965928040802e-09, + "loss": 0.83237123, + "num_input_tokens_seen": 352466325, + "step": 16329, + "time_per_iteration": 2.947740316390991 + }, + { + "auxiliary_loss_clip": 0.01062485, + "auxiliary_loss_mlp": 0.01027242, + "balance_loss_clip": 1.02384174, + "balance_loss_mlp": 1.01621556, + "epoch": 0.9818127160679393, + "flos": 25550513683200.0, + "grad_norm": 1.8623350403483994, + "language_loss": 0.76191616, + "learning_rate": 3.4574681974817168e-09, + "loss": 0.78281343, + "num_input_tokens_seen": 352485505, + "step": 16330, + "time_per_iteration": 2.6205148696899414 + }, + { + "auxiliary_loss_clip": 0.01067926, + "auxiliary_loss_mlp": 0.01029916, + "balance_loss_clip": 1.02630496, + "balance_loss_mlp": 1.01725554, + "epoch": 0.9818728393206072, + "flos": 28803661672320.0, + "grad_norm": 3.464643550050292, + "language_loss": 0.66434395, + "learning_rate": 3.434615511252126e-09, + "loss": 0.6853224, + "num_input_tokens_seen": 352505360, + "step": 16331, + "time_per_iteration": 2.723644971847534 + }, + { + "auxiliary_loss_clip": 0.01048529, + "auxiliary_loss_mlp": 0.01027838, + "balance_loss_clip": 1.02304184, + "balance_loss_mlp": 1.01818204, + "epoch": 0.9819329625732752, + "flos": 23222246014080.0, + "grad_norm": 3.340003253078826, + "language_loss": 0.7331937, + "learning_rate": 3.411838534981948e-09, + "loss": 0.75395733, + "num_input_tokens_seen": 352524035, + "step": 16332, + "time_per_iteration": 2.668025255203247 + }, + { + "auxiliary_loss_clip": 0.01050963, + "auxiliary_loss_mlp": 0.01025717, + "balance_loss_clip": 1.02534604, + "balance_loss_mlp": 1.01586461, + "epoch": 0.9819930858259431, + "flos": 17530440883200.0, + "grad_norm": 1.97419671702213, + "language_loss": 0.76572824, + "learning_rate": 3.389137269534936e-09, + "loss": 0.78649497, + "num_input_tokens_seen": 352543210, + "step": 16333, + "time_per_iteration": 2.6581737995147705 + }, + { + "auxiliary_loss_clip": 0.01051674, + "auxiliary_loss_mlp": 0.0074758, + "balance_loss_clip": 1.02516198, + "balance_loss_mlp": 1.00031853, + "epoch": 0.9820532090786112, + "flos": 12529915971840.0, + "grad_norm": 2.0661647993858923, + "language_loss": 0.7299614, + "learning_rate": 3.366511715771958e-09, + "loss": 0.74795389, + "num_input_tokens_seen": 352559770, + "step": 16334, + "time_per_iteration": 2.6833138465881348 + }, + { + "auxiliary_loss_clip": 0.01025211, + "auxiliary_loss_mlp": 0.01032845, + "balance_loss_clip": 1.02545643, + "balance_loss_mlp": 1.02221131, + "epoch": 0.9821133323312792, + "flos": 18840174497280.0, + "grad_norm": 1.9403702171301722, + "language_loss": 0.78490138, + "learning_rate": 3.3439618745509934e-09, + "loss": 0.80548191, + "num_input_tokens_seen": 352577690, + "step": 16335, + "time_per_iteration": 2.7856788635253906 + }, + { + "auxiliary_loss_clip": 0.010377, + "auxiliary_loss_mlp": 0.01033759, + "balance_loss_clip": 1.0226922, + "balance_loss_mlp": 1.02109337, + "epoch": 0.9821734555839471, + "flos": 34824013528320.0, + "grad_norm": 2.203388130689989, + "language_loss": 0.64264745, + "learning_rate": 3.3214877467271362e-09, + "loss": 0.66336203, + "num_input_tokens_seen": 352598850, + "step": 16336, + "time_per_iteration": 2.805588722229004 + }, + { + "auxiliary_loss_clip": 0.01037758, + "auxiliary_loss_mlp": 0.01033553, + "balance_loss_clip": 1.0263927, + "balance_loss_mlp": 1.02139378, + "epoch": 0.9822335788366151, + "flos": 17128169493120.0, + "grad_norm": 1.9705958548653253, + "language_loss": 0.72766978, + "learning_rate": 3.299089333152372e-09, + "loss": 0.74838293, + "num_input_tokens_seen": 352616130, + "step": 16337, + "time_per_iteration": 2.704378128051758 + }, + { + "auxiliary_loss_clip": 0.01044933, + "auxiliary_loss_mlp": 0.01027841, + "balance_loss_clip": 1.02470183, + "balance_loss_mlp": 1.01687336, + "epoch": 0.982293702089283, + "flos": 20813250528000.0, + "grad_norm": 1.600621560336819, + "language_loss": 0.73028815, + "learning_rate": 3.2767666346764645e-09, + "loss": 0.7510159, + "num_input_tokens_seen": 352636885, + "step": 16338, + "time_per_iteration": 2.584787607192993 + }, + { + "auxiliary_loss_clip": 0.00992575, + "auxiliary_loss_mlp": 0.01031407, + "balance_loss_clip": 1.01968312, + "balance_loss_mlp": 1.02055931, + "epoch": 0.982353825341951, + "flos": 24680829588480.0, + "grad_norm": 1.9734033146668626, + "language_loss": 0.81339335, + "learning_rate": 3.2545196521454045e-09, + "loss": 0.83363318, + "num_input_tokens_seen": 352657905, + "step": 16339, + "time_per_iteration": 2.9153831005096436 + }, + { + "auxiliary_loss_clip": 0.01006969, + "auxiliary_loss_mlp": 0.01033434, + "balance_loss_clip": 1.01887965, + "balance_loss_mlp": 1.02289021, + "epoch": 0.982413948594619, + "flos": 20850489953280.0, + "grad_norm": 1.7146592745540195, + "language_loss": 0.62424052, + "learning_rate": 3.232348386403405e-09, + "loss": 0.6446445, + "num_input_tokens_seen": 352676320, + "step": 16340, + "time_per_iteration": 4.557361602783203 + }, + { + "auxiliary_loss_clip": 0.01065855, + "auxiliary_loss_mlp": 0.01030347, + "balance_loss_clip": 1.0269388, + "balance_loss_mlp": 1.01978469, + "epoch": 0.982474071847287, + "flos": 15377380778880.0, + "grad_norm": 2.070101648735831, + "language_loss": 0.85932374, + "learning_rate": 3.2102528382904613e-09, + "loss": 0.88028568, + "num_input_tokens_seen": 352692665, + "step": 16341, + "time_per_iteration": 2.573251485824585 + }, + { + "auxiliary_loss_clip": 0.01033411, + "auxiliary_loss_mlp": 0.01026196, + "balance_loss_clip": 1.02252603, + "balance_loss_mlp": 1.01586676, + "epoch": 0.9825341950999549, + "flos": 23774732081280.0, + "grad_norm": 1.4234348272495083, + "language_loss": 0.67159539, + "learning_rate": 3.188233008645014e-09, + "loss": 0.69219148, + "num_input_tokens_seen": 352716130, + "step": 16342, + "time_per_iteration": 2.865595579147339 + }, + { + "auxiliary_loss_clip": 0.01062399, + "auxiliary_loss_mlp": 0.01024892, + "balance_loss_clip": 1.02485919, + "balance_loss_mlp": 1.01477158, + "epoch": 0.9825943183526229, + "flos": 22746285872640.0, + "grad_norm": 1.540692474659, + "language_loss": 0.77631223, + "learning_rate": 3.16628889830195e-09, + "loss": 0.79718518, + "num_input_tokens_seen": 352734705, + "step": 16343, + "time_per_iteration": 2.5973052978515625 + }, + { + "auxiliary_loss_clip": 0.01029919, + "auxiliary_loss_mlp": 0.01026942, + "balance_loss_clip": 1.02401114, + "balance_loss_mlp": 1.01787043, + "epoch": 0.9826544416052908, + "flos": 27709966408320.0, + "grad_norm": 1.4516601655408752, + "language_loss": 0.75536406, + "learning_rate": 3.1444205080932707e-09, + "loss": 0.77593267, + "num_input_tokens_seen": 352756225, + "step": 16344, + "time_per_iteration": 2.785421848297119 + }, + { + "auxiliary_loss_clip": 0.01033521, + "auxiliary_loss_mlp": 0.01030773, + "balance_loss_clip": 1.02159834, + "balance_loss_mlp": 1.01946044, + "epoch": 0.9827145648579588, + "flos": 26941657472640.0, + "grad_norm": 1.8152369794111645, + "language_loss": 0.66695929, + "learning_rate": 3.122627838848313e-09, + "loss": 0.68760222, + "num_input_tokens_seen": 352776210, + "step": 16345, + "time_per_iteration": 2.6941938400268555 + }, + { + "auxiliary_loss_clip": 0.01049676, + "auxiliary_loss_mlp": 0.01022878, + "balance_loss_clip": 1.02522242, + "balance_loss_mlp": 1.01383567, + "epoch": 0.9827746881106267, + "flos": 21866545969920.0, + "grad_norm": 1.4474959129057718, + "language_loss": 0.79371494, + "learning_rate": 3.1009108913933045e-09, + "loss": 0.81444043, + "num_input_tokens_seen": 352795455, + "step": 16346, + "time_per_iteration": 2.786900520324707 + }, + { + "auxiliary_loss_clip": 0.01054855, + "auxiliary_loss_mlp": 0.01029618, + "balance_loss_clip": 1.02438354, + "balance_loss_mlp": 1.01833463, + "epoch": 0.9828348113632948, + "flos": 20850777262080.0, + "grad_norm": 2.128203264017552, + "language_loss": 0.75217772, + "learning_rate": 3.079269666552031e-09, + "loss": 0.77302241, + "num_input_tokens_seen": 352812895, + "step": 16347, + "time_per_iteration": 2.700883626937866 + }, + { + "auxiliary_loss_clip": 0.00986467, + "auxiliary_loss_mlp": 0.0103141, + "balance_loss_clip": 1.01891243, + "balance_loss_mlp": 1.02127743, + "epoch": 0.9828949346159628, + "flos": 34569227381760.0, + "grad_norm": 1.9744122756896123, + "language_loss": 0.66908574, + "learning_rate": 3.0577041651449474e-09, + "loss": 0.68926454, + "num_input_tokens_seen": 352835470, + "step": 16348, + "time_per_iteration": 2.9796369075775146 + }, + { + "auxiliary_loss_clip": 0.01042954, + "auxiliary_loss_mlp": 0.01027545, + "balance_loss_clip": 1.02443528, + "balance_loss_mlp": 1.01729929, + "epoch": 0.9829550578686307, + "flos": 24457464864000.0, + "grad_norm": 1.8090213993197797, + "language_loss": 0.69157779, + "learning_rate": 3.0362143879898437e-09, + "loss": 0.71228284, + "num_input_tokens_seen": 352854295, + "step": 16349, + "time_per_iteration": 2.889816999435425 + }, + { + "auxiliary_loss_clip": 0.01028738, + "auxiliary_loss_mlp": 0.01027004, + "balance_loss_clip": 1.02344131, + "balance_loss_mlp": 1.01785505, + "epoch": 0.9830151811212987, + "flos": 16910084067840.0, + "grad_norm": 2.647878628185799, + "language_loss": 0.76073229, + "learning_rate": 3.0148003359014018e-09, + "loss": 0.7812897, + "num_input_tokens_seen": 352869695, + "step": 16350, + "time_per_iteration": 2.632830858230591 + }, + { + "auxiliary_loss_clip": 0.01030754, + "auxiliary_loss_mlp": 0.01027037, + "balance_loss_clip": 1.02435493, + "balance_loss_mlp": 1.01589656, + "epoch": 0.9830753043739666, + "flos": 21288312829440.0, + "grad_norm": 1.950608727223221, + "language_loss": 0.84417331, + "learning_rate": 2.9934620096920826e-09, + "loss": 0.86475122, + "num_input_tokens_seen": 352887430, + "step": 16351, + "time_per_iteration": 2.702939510345459 + }, + { + "auxiliary_loss_clip": 0.01038246, + "auxiliary_loss_mlp": 0.01023687, + "balance_loss_clip": 1.02967215, + "balance_loss_mlp": 1.01335192, + "epoch": 0.9831354276266346, + "flos": 31723522341120.0, + "grad_norm": 1.6329785616202996, + "language_loss": 0.68679643, + "learning_rate": 2.972199410170795e-09, + "loss": 0.70741582, + "num_input_tokens_seen": 352907555, + "step": 16352, + "time_per_iteration": 2.748469352722168 + }, + { + "auxiliary_loss_clip": 0.01039484, + "auxiliary_loss_mlp": 0.00747527, + "balance_loss_clip": 1.02334595, + "balance_loss_mlp": 1.00032282, + "epoch": 0.9831955508793025, + "flos": 21619050284160.0, + "grad_norm": 1.4010941053744554, + "language_loss": 0.66284239, + "learning_rate": 2.951012538143782e-09, + "loss": 0.68071252, + "num_input_tokens_seen": 352928670, + "step": 16353, + "time_per_iteration": 2.622748374938965 + }, + { + "auxiliary_loss_clip": 0.01037228, + "auxiliary_loss_mlp": 0.01030129, + "balance_loss_clip": 1.02227688, + "balance_loss_mlp": 1.02051497, + "epoch": 0.9832556741319706, + "flos": 22968214053120.0, + "grad_norm": 1.8880814885816082, + "language_loss": 0.74580228, + "learning_rate": 2.9299013944144025e-09, + "loss": 0.76647592, + "num_input_tokens_seen": 352948345, + "step": 16354, + "time_per_iteration": 2.673164129257202 + }, + { + "auxiliary_loss_clip": 0.01052928, + "auxiliary_loss_mlp": 0.01026155, + "balance_loss_clip": 1.02547073, + "balance_loss_mlp": 1.01553965, + "epoch": 0.9833157973846385, + "flos": 21323900229120.0, + "grad_norm": 1.8712076133822406, + "language_loss": 0.77198493, + "learning_rate": 2.9088659797835702e-09, + "loss": 0.79277575, + "num_input_tokens_seen": 352967250, + "step": 16355, + "time_per_iteration": 2.6395082473754883 + }, + { + "auxiliary_loss_clip": 0.01049222, + "auxiliary_loss_mlp": 0.01026637, + "balance_loss_clip": 1.02402425, + "balance_loss_mlp": 1.0168916, + "epoch": 0.9833759206373065, + "flos": 21068719032960.0, + "grad_norm": 1.9629144886337042, + "language_loss": 0.73259473, + "learning_rate": 2.8879062950484256e-09, + "loss": 0.75335336, + "num_input_tokens_seen": 352984725, + "step": 16356, + "time_per_iteration": 2.569056749343872 + }, + { + "auxiliary_loss_clip": 0.01037138, + "auxiliary_loss_mlp": 0.01028425, + "balance_loss_clip": 1.02339125, + "balance_loss_mlp": 1.01773143, + "epoch": 0.9834360438899744, + "flos": 18697322108160.0, + "grad_norm": 1.6189630854287935, + "language_loss": 0.7562263, + "learning_rate": 2.8670223410041104e-09, + "loss": 0.77688193, + "num_input_tokens_seen": 353003480, + "step": 16357, + "time_per_iteration": 2.6508307456970215 + }, + { + "auxiliary_loss_clip": 0.01041786, + "auxiliary_loss_mlp": 0.01022397, + "balance_loss_clip": 1.02448082, + "balance_loss_mlp": 1.01194203, + "epoch": 0.9834961671426424, + "flos": 21105240186240.0, + "grad_norm": 3.9761498194241267, + "language_loss": 0.80376863, + "learning_rate": 2.846214118442436e-09, + "loss": 0.82441044, + "num_input_tokens_seen": 353021425, + "step": 16358, + "time_per_iteration": 2.6300137042999268 + }, + { + "auxiliary_loss_clip": 0.01047225, + "auxiliary_loss_mlp": 0.01023463, + "balance_loss_clip": 1.02194524, + "balance_loss_mlp": 1.01390886, + "epoch": 0.9835562903953103, + "flos": 26687625511680.0, + "grad_norm": 2.069997825367689, + "language_loss": 0.67941165, + "learning_rate": 2.8254816281523263e-09, + "loss": 0.70011854, + "num_input_tokens_seen": 353039870, + "step": 16359, + "time_per_iteration": 2.6641666889190674 + }, + { + "auxiliary_loss_clip": 0.01058806, + "auxiliary_loss_mlp": 0.01028912, + "balance_loss_clip": 1.02406764, + "balance_loss_mlp": 1.01982212, + "epoch": 0.9836164136479784, + "flos": 22090162089600.0, + "grad_norm": 1.9675423515254067, + "language_loss": 0.69878531, + "learning_rate": 2.804824870920264e-09, + "loss": 0.71966243, + "num_input_tokens_seen": 353059750, + "step": 16360, + "time_per_iteration": 4.274181365966797 + }, + { + "auxiliary_loss_clip": 0.01052797, + "auxiliary_loss_mlp": 0.01030045, + "balance_loss_clip": 1.0242877, + "balance_loss_mlp": 1.01946509, + "epoch": 0.9836765369006463, + "flos": 23878405710720.0, + "grad_norm": 1.7587764628138205, + "language_loss": 0.83895332, + "learning_rate": 2.7842438475293996e-09, + "loss": 0.85978174, + "num_input_tokens_seen": 353079940, + "step": 16361, + "time_per_iteration": 2.7171683311462402 + }, + { + "auxiliary_loss_clip": 0.01060528, + "auxiliary_loss_mlp": 0.01026054, + "balance_loss_clip": 1.02466738, + "balance_loss_mlp": 1.01596332, + "epoch": 0.9837366601533143, + "flos": 25845017293440.0, + "grad_norm": 2.0210771557232508, + "language_loss": 0.75823104, + "learning_rate": 2.76373855876022e-09, + "loss": 0.77909684, + "num_input_tokens_seen": 353099990, + "step": 16362, + "time_per_iteration": 2.635047435760498 + }, + { + "auxiliary_loss_clip": 0.01062942, + "auxiliary_loss_mlp": 0.01029473, + "balance_loss_clip": 1.02550805, + "balance_loss_mlp": 1.01917899, + "epoch": 0.9837967834059823, + "flos": 21358015171200.0, + "grad_norm": 2.598516329300069, + "language_loss": 0.71149558, + "learning_rate": 2.7433090053901043e-09, + "loss": 0.73241973, + "num_input_tokens_seen": 353118710, + "step": 16363, + "time_per_iteration": 2.589897632598877 + }, + { + "auxiliary_loss_clip": 0.01037554, + "auxiliary_loss_mlp": 0.01025884, + "balance_loss_clip": 1.02300453, + "balance_loss_mlp": 1.01670456, + "epoch": 0.9838569066586502, + "flos": 18515793749760.0, + "grad_norm": 1.661015390810598, + "language_loss": 0.6300711, + "learning_rate": 2.7229551881937653e-09, + "loss": 0.65070546, + "num_input_tokens_seen": 353136415, + "step": 16364, + "time_per_iteration": 2.7010245323181152 + }, + { + "auxiliary_loss_clip": 0.01030141, + "auxiliary_loss_mlp": 0.01029666, + "balance_loss_clip": 1.03169179, + "balance_loss_mlp": 1.02060592, + "epoch": 0.9839170299113182, + "flos": 22452392793600.0, + "grad_norm": 1.6698344339423343, + "language_loss": 0.75104856, + "learning_rate": 2.702677107943252e-09, + "loss": 0.77164668, + "num_input_tokens_seen": 353154650, + "step": 16365, + "time_per_iteration": 2.777146816253662 + }, + { + "auxiliary_loss_clip": 0.01032244, + "auxiliary_loss_mlp": 0.01023535, + "balance_loss_clip": 1.02537942, + "balance_loss_mlp": 1.01247799, + "epoch": 0.9839771531639862, + "flos": 27892320779520.0, + "grad_norm": 1.6427732664378496, + "language_loss": 0.76136184, + "learning_rate": 2.6824747654072832e-09, + "loss": 0.78191966, + "num_input_tokens_seen": 353174065, + "step": 16366, + "time_per_iteration": 2.7865278720855713 + }, + { + "auxiliary_loss_clip": 0.0105792, + "auxiliary_loss_mlp": 0.01022886, + "balance_loss_clip": 1.02257156, + "balance_loss_mlp": 1.01335573, + "epoch": 0.9840372764166542, + "flos": 28214510797440.0, + "grad_norm": 1.7024524496647206, + "language_loss": 0.76956737, + "learning_rate": 2.662348161352357e-09, + "loss": 0.79037541, + "num_input_tokens_seen": 353193560, + "step": 16367, + "time_per_iteration": 2.668210983276367 + }, + { + "auxiliary_loss_clip": 0.01043346, + "auxiliary_loss_mlp": 0.01030789, + "balance_loss_clip": 1.02711368, + "balance_loss_mlp": 1.02021492, + "epoch": 0.9840973996693221, + "flos": 23403989854080.0, + "grad_norm": 1.43559580066184, + "language_loss": 0.61544943, + "learning_rate": 2.642297296540974e-09, + "loss": 0.63619077, + "num_input_tokens_seen": 353213525, + "step": 16368, + "time_per_iteration": 4.37973427772522 + }, + { + "auxiliary_loss_clip": 0.01046796, + "auxiliary_loss_mlp": 0.01028199, + "balance_loss_clip": 1.02305114, + "balance_loss_mlp": 1.01942515, + "epoch": 0.9841575229219901, + "flos": 21395865127680.0, + "grad_norm": 1.4435047756610355, + "language_loss": 0.65301514, + "learning_rate": 2.6223221717340816e-09, + "loss": 0.67376512, + "num_input_tokens_seen": 353234000, + "step": 16369, + "time_per_iteration": 4.350262880325317 + }, + { + "auxiliary_loss_clip": 0.01051644, + "auxiliary_loss_mlp": 0.00747551, + "balance_loss_clip": 1.02431464, + "balance_loss_mlp": 1.00037968, + "epoch": 0.984217646174658, + "flos": 24464072966400.0, + "grad_norm": 1.8238536871743904, + "language_loss": 0.68660235, + "learning_rate": 2.6024227876886295e-09, + "loss": 0.70459431, + "num_input_tokens_seen": 353254940, + "step": 16370, + "time_per_iteration": 2.723057985305786 + }, + { + "auxiliary_loss_clip": 0.01062839, + "auxiliary_loss_mlp": 0.01027771, + "balance_loss_clip": 1.02485228, + "balance_loss_mlp": 1.01675034, + "epoch": 0.984277769427326, + "flos": 16435057680000.0, + "grad_norm": 1.7793855984881979, + "language_loss": 0.7305038, + "learning_rate": 2.582599145159792e-09, + "loss": 0.75140989, + "num_input_tokens_seen": 353272590, + "step": 16371, + "time_per_iteration": 2.612396001815796 + }, + { + "auxiliary_loss_clip": 0.00997912, + "auxiliary_loss_mlp": 0.01000966, + "balance_loss_clip": 1.0027082, + "balance_loss_mlp": 1.00008965, + "epoch": 0.9843378926799939, + "flos": 64530615288960.0, + "grad_norm": 0.7723409384122919, + "language_loss": 0.65184462, + "learning_rate": 2.562851244898745e-09, + "loss": 0.6718334, + "num_input_tokens_seen": 353334380, + "step": 16372, + "time_per_iteration": 3.210855484008789 + }, + { + "auxiliary_loss_clip": 0.01049485, + "auxiliary_loss_mlp": 0.01027517, + "balance_loss_clip": 1.02342772, + "balance_loss_mlp": 1.01749122, + "epoch": 0.984398015932662, + "flos": 17382811985280.0, + "grad_norm": 1.6148937337529201, + "language_loss": 0.70521969, + "learning_rate": 2.5431790876544456e-09, + "loss": 0.7259897, + "num_input_tokens_seen": 353351640, + "step": 16373, + "time_per_iteration": 2.5774307250976562 + }, + { + "auxiliary_loss_clip": 0.0106036, + "auxiliary_loss_mlp": 0.01027544, + "balance_loss_clip": 1.02441812, + "balance_loss_mlp": 1.01752424, + "epoch": 0.9844581391853299, + "flos": 23879088069120.0, + "grad_norm": 1.7060074917699752, + "language_loss": 0.81205821, + "learning_rate": 2.523582674173186e-09, + "loss": 0.83293724, + "num_input_tokens_seen": 353372555, + "step": 16374, + "time_per_iteration": 2.591228485107422 + }, + { + "auxiliary_loss_clip": 0.01029349, + "auxiliary_loss_mlp": 0.01030716, + "balance_loss_clip": 1.02987254, + "balance_loss_mlp": 1.02082121, + "epoch": 0.9845182624379979, + "flos": 19865352568320.0, + "grad_norm": 1.8830532985908743, + "language_loss": 0.69326949, + "learning_rate": 2.504062005197927e-09, + "loss": 0.71387005, + "num_input_tokens_seen": 353391385, + "step": 16375, + "time_per_iteration": 2.8479652404785156 + }, + { + "auxiliary_loss_clip": 0.0103395, + "auxiliary_loss_mlp": 0.01032195, + "balance_loss_clip": 1.02115047, + "balance_loss_mlp": 1.01988697, + "epoch": 0.9845783856906659, + "flos": 28254659224320.0, + "grad_norm": 1.7665347524462975, + "language_loss": 0.81053591, + "learning_rate": 2.484617081468521e-09, + "loss": 0.83119738, + "num_input_tokens_seen": 353411630, + "step": 16376, + "time_per_iteration": 2.8136000633239746 + }, + { + "auxiliary_loss_clip": 0.01059149, + "auxiliary_loss_mlp": 0.0103026, + "balance_loss_clip": 1.02399957, + "balance_loss_mlp": 1.02006817, + "epoch": 0.9846385089433338, + "flos": 28328383889280.0, + "grad_norm": 1.498911176586862, + "language_loss": 0.62198317, + "learning_rate": 2.4652479037228224e-09, + "loss": 0.64287734, + "num_input_tokens_seen": 353432895, + "step": 16377, + "time_per_iteration": 2.6128365993499756 + }, + { + "auxiliary_loss_clip": 0.01035573, + "auxiliary_loss_mlp": 0.0103215, + "balance_loss_clip": 1.02642107, + "balance_loss_mlp": 1.02129602, + "epoch": 0.9846986321960018, + "flos": 24316767290880.0, + "grad_norm": 3.3822866662810784, + "language_loss": 0.72749019, + "learning_rate": 2.445954472695133e-09, + "loss": 0.74816746, + "num_input_tokens_seen": 353454195, + "step": 16378, + "time_per_iteration": 2.8088788986206055 + }, + { + "auxiliary_loss_clip": 0.0106123, + "auxiliary_loss_mlp": 0.01029913, + "balance_loss_clip": 1.02433622, + "balance_loss_mlp": 1.0200839, + "epoch": 0.9847587554486698, + "flos": 27271999877760.0, + "grad_norm": 1.5229121897420588, + "language_loss": 0.70974696, + "learning_rate": 2.426736789116868e-09, + "loss": 0.73065841, + "num_input_tokens_seen": 353475125, + "step": 16379, + "time_per_iteration": 2.5943145751953125 + }, + { + "auxiliary_loss_clip": 0.01032125, + "auxiliary_loss_mlp": 0.01028237, + "balance_loss_clip": 1.02441776, + "balance_loss_mlp": 1.01787162, + "epoch": 0.9848188787013378, + "flos": 16542717719040.0, + "grad_norm": 2.190693838293775, + "language_loss": 0.6813466, + "learning_rate": 2.407594853716999e-09, + "loss": 0.70195019, + "num_input_tokens_seen": 353493265, + "step": 16380, + "time_per_iteration": 2.7271029949188232 + }, + { + "auxiliary_loss_clip": 0.01027442, + "auxiliary_loss_mlp": 0.01032218, + "balance_loss_clip": 1.02317142, + "balance_loss_mlp": 1.02126312, + "epoch": 0.9848790019540057, + "flos": 20193647898240.0, + "grad_norm": 1.808057236514275, + "language_loss": 0.7876991, + "learning_rate": 2.38852866722139e-09, + "loss": 0.80829573, + "num_input_tokens_seen": 353511650, + "step": 16381, + "time_per_iteration": 2.725555181503296 + }, + { + "auxiliary_loss_clip": 0.01053498, + "auxiliary_loss_mlp": 0.01026973, + "balance_loss_clip": 1.02580118, + "balance_loss_mlp": 1.01640534, + "epoch": 0.9849391252066737, + "flos": 28259723041920.0, + "grad_norm": 1.3887273486895524, + "language_loss": 0.82360303, + "learning_rate": 2.3695382303527965e-09, + "loss": 0.84440774, + "num_input_tokens_seen": 353534035, + "step": 16382, + "time_per_iteration": 2.838791608810425 + }, + { + "auxiliary_loss_clip": 0.01029117, + "auxiliary_loss_mlp": 0.01031592, + "balance_loss_clip": 1.02016234, + "balance_loss_mlp": 1.01985621, + "epoch": 0.9849992484593416, + "flos": 22454942659200.0, + "grad_norm": 1.8689857038202822, + "language_loss": 0.74330276, + "learning_rate": 2.3506235438315316e-09, + "loss": 0.76390994, + "num_input_tokens_seen": 353549950, + "step": 16383, + "time_per_iteration": 2.733607530593872 + }, + { + "auxiliary_loss_clip": 0.01023733, + "auxiliary_loss_mlp": 0.01029795, + "balance_loss_clip": 1.02715588, + "balance_loss_mlp": 1.01924539, + "epoch": 0.9850593717120096, + "flos": 34497190656000.0, + "grad_norm": 1.6966734930197451, + "language_loss": 0.662938, + "learning_rate": 2.3317846083750203e-09, + "loss": 0.68347335, + "num_input_tokens_seen": 353573745, + "step": 16384, + "time_per_iteration": 2.9537558555603027 + }, + { + "auxiliary_loss_clip": 0.01045706, + "auxiliary_loss_mlp": 0.01034215, + "balance_loss_clip": 1.0263617, + "balance_loss_mlp": 1.02254486, + "epoch": 0.9851194949646775, + "flos": 38837282152320.0, + "grad_norm": 1.6665648549701075, + "language_loss": 0.70421922, + "learning_rate": 2.313021424697359e-09, + "loss": 0.7250185, + "num_input_tokens_seen": 353595335, + "step": 16385, + "time_per_iteration": 2.7894399166107178 + }, + { + "auxiliary_loss_clip": 0.01045355, + "auxiliary_loss_mlp": 0.01028887, + "balance_loss_clip": 1.02783942, + "balance_loss_mlp": 1.01894534, + "epoch": 0.9851796182173456, + "flos": 17712436118400.0, + "grad_norm": 6.531904705237546, + "language_loss": 0.80923206, + "learning_rate": 2.294333993509978e-09, + "loss": 0.82997441, + "num_input_tokens_seen": 353614270, + "step": 16386, + "time_per_iteration": 2.7088239192962646 + }, + { + "auxiliary_loss_clip": 0.01030089, + "auxiliary_loss_mlp": 0.0103226, + "balance_loss_clip": 1.02350378, + "balance_loss_mlp": 1.02143002, + "epoch": 0.9852397414700135, + "flos": 27454318335360.0, + "grad_norm": 1.9762330498717746, + "language_loss": 0.67967045, + "learning_rate": 2.2757223155216442e-09, + "loss": 0.7002939, + "num_input_tokens_seen": 353634900, + "step": 16387, + "time_per_iteration": 4.705344915390015 + }, + { + "auxiliary_loss_clip": 0.0104696, + "auxiliary_loss_mlp": 0.0074743, + "balance_loss_clip": 1.02311349, + "balance_loss_mlp": 1.00033545, + "epoch": 0.9852998647226815, + "flos": 18296702743680.0, + "grad_norm": 2.1479264370525404, + "language_loss": 0.73651928, + "learning_rate": 2.257186391438237e-09, + "loss": 0.75446314, + "num_input_tokens_seen": 353652890, + "step": 16388, + "time_per_iteration": 2.7109010219573975 + }, + { + "auxiliary_loss_clip": 0.01049692, + "auxiliary_loss_mlp": 0.01028086, + "balance_loss_clip": 1.02365375, + "balance_loss_mlp": 1.017923, + "epoch": 0.9853599879753495, + "flos": 19642562461440.0, + "grad_norm": 4.040417627239467, + "language_loss": 0.82040596, + "learning_rate": 2.238726221962528e-09, + "loss": 0.84118378, + "num_input_tokens_seen": 353671295, + "step": 16389, + "time_per_iteration": 2.6164331436157227 + }, + { + "auxiliary_loss_clip": 0.01037339, + "auxiliary_loss_mlp": 0.00747563, + "balance_loss_clip": 1.02288866, + "balance_loss_mlp": 1.00037193, + "epoch": 0.9854201112280174, + "flos": 23841956384640.0, + "grad_norm": 1.9200037355169175, + "language_loss": 0.66738713, + "learning_rate": 2.2203418077946234e-09, + "loss": 0.68523622, + "num_input_tokens_seen": 353690560, + "step": 16390, + "time_per_iteration": 2.680962562561035 + }, + { + "auxiliary_loss_clip": 0.01021745, + "auxiliary_loss_mlp": 0.0103135, + "balance_loss_clip": 1.02340949, + "balance_loss_mlp": 1.02067447, + "epoch": 0.9854802344806854, + "flos": 30080573233920.0, + "grad_norm": 2.03448262994677, + "language_loss": 0.77330893, + "learning_rate": 2.2020331496312994e-09, + "loss": 0.79383993, + "num_input_tokens_seen": 353710660, + "step": 16391, + "time_per_iteration": 2.7380197048187256 + }, + { + "auxiliary_loss_clip": 0.01021137, + "auxiliary_loss_mlp": 0.0074745, + "balance_loss_clip": 1.0219152, + "balance_loss_mlp": 1.00033176, + "epoch": 0.9855403577333534, + "flos": 21907412668800.0, + "grad_norm": 2.934719997972267, + "language_loss": 0.68063641, + "learning_rate": 2.1838002481673333e-09, + "loss": 0.6983223, + "num_input_tokens_seen": 353730440, + "step": 16392, + "time_per_iteration": 2.8268280029296875 + }, + { + "auxiliary_loss_clip": 0.0102857, + "auxiliary_loss_mlp": 0.01026336, + "balance_loss_clip": 1.0223248, + "balance_loss_mlp": 1.01519632, + "epoch": 0.9856004809860214, + "flos": 15413794191360.0, + "grad_norm": 1.9595140097143675, + "language_loss": 0.56135786, + "learning_rate": 2.1656431040937286e-09, + "loss": 0.58190697, + "num_input_tokens_seen": 353748360, + "step": 16393, + "time_per_iteration": 2.749781608581543 + }, + { + "auxiliary_loss_clip": 0.01021726, + "auxiliary_loss_mlp": 0.01029509, + "balance_loss_clip": 1.02069795, + "balance_loss_mlp": 1.01758265, + "epoch": 0.9856606042386893, + "flos": 13653201064320.0, + "grad_norm": 2.727397103367875, + "language_loss": 0.79059279, + "learning_rate": 2.1475617180990444e-09, + "loss": 0.81110513, + "num_input_tokens_seen": 353760880, + "step": 16394, + "time_per_iteration": 2.811363458633423 + }, + { + "auxiliary_loss_clip": 0.01046986, + "auxiliary_loss_mlp": 0.01033152, + "balance_loss_clip": 1.02286577, + "balance_loss_mlp": 1.02199364, + "epoch": 0.9857207274913573, + "flos": 23479151063040.0, + "grad_norm": 1.5047746910787025, + "language_loss": 0.76406431, + "learning_rate": 2.129556090869178e-09, + "loss": 0.78486568, + "num_input_tokens_seen": 353782255, + "step": 16395, + "time_per_iteration": 2.7548022270202637 + }, + { + "auxiliary_loss_clip": 0.0104684, + "auxiliary_loss_mlp": 0.01025931, + "balance_loss_clip": 1.02272511, + "balance_loss_mlp": 1.01570261, + "epoch": 0.9857808507440252, + "flos": 21065486808960.0, + "grad_norm": 2.4745288938239622, + "language_loss": 0.75163275, + "learning_rate": 2.1116262230866933e-09, + "loss": 0.77236044, + "num_input_tokens_seen": 353803580, + "step": 16396, + "time_per_iteration": 2.872343063354492 + }, + { + "auxiliary_loss_clip": 0.01023635, + "auxiliary_loss_mlp": 0.01021921, + "balance_loss_clip": 1.02402651, + "balance_loss_mlp": 1.01193678, + "epoch": 0.9858409739966932, + "flos": 25301365971840.0, + "grad_norm": 1.5579041653143948, + "language_loss": 0.70856178, + "learning_rate": 2.0937721154317133e-09, + "loss": 0.72901726, + "num_input_tokens_seen": 353824200, + "step": 16397, + "time_per_iteration": 2.8772692680358887 + }, + { + "auxiliary_loss_clip": 0.0103613, + "auxiliary_loss_mlp": 0.01025409, + "balance_loss_clip": 1.0243578, + "balance_loss_mlp": 1.01613474, + "epoch": 0.9859010972493611, + "flos": 20558751690240.0, + "grad_norm": 1.7057216406208575, + "language_loss": 0.71186554, + "learning_rate": 2.0759937685810304e-09, + "loss": 0.732481, + "num_input_tokens_seen": 353843350, + "step": 16398, + "time_per_iteration": 2.679029941558838 + }, + { + "auxiliary_loss_clip": 0.0102723, + "auxiliary_loss_mlp": 0.01022615, + "balance_loss_clip": 1.0233475, + "balance_loss_mlp": 1.01311386, + "epoch": 0.9859612205020292, + "flos": 24754985216640.0, + "grad_norm": 1.409197818368163, + "language_loss": 0.74210072, + "learning_rate": 2.058291183208771e-09, + "loss": 0.76259911, + "num_input_tokens_seen": 353864520, + "step": 16399, + "time_per_iteration": 2.7367889881134033 + }, + { + "auxiliary_loss_clip": 0.01061908, + "auxiliary_loss_mlp": 0.0102593, + "balance_loss_clip": 1.02427578, + "balance_loss_mlp": 1.01568389, + "epoch": 0.9860213437546971, + "flos": 21105850717440.0, + "grad_norm": 2.7124466704109564, + "language_loss": 0.57347333, + "learning_rate": 2.0406643599863993e-09, + "loss": 0.59435171, + "num_input_tokens_seen": 353882240, + "step": 16400, + "time_per_iteration": 2.650343179702759 + }, + { + "auxiliary_loss_clip": 0.01045264, + "auxiliary_loss_mlp": 0.01028476, + "balance_loss_clip": 1.02446938, + "balance_loss_mlp": 1.01732373, + "epoch": 0.9860814670073651, + "flos": 19136078737920.0, + "grad_norm": 1.632877835150563, + "language_loss": 0.80304408, + "learning_rate": 2.023113299582491e-09, + "loss": 0.82378149, + "num_input_tokens_seen": 353901590, + "step": 16401, + "time_per_iteration": 2.6268701553344727 + }, + { + "auxiliary_loss_clip": 0.01051331, + "auxiliary_loss_mlp": 0.01028848, + "balance_loss_clip": 1.02494025, + "balance_loss_mlp": 1.01772523, + "epoch": 0.9861415902600331, + "flos": 17237050594560.0, + "grad_norm": 1.7913023398148442, + "language_loss": 0.78043616, + "learning_rate": 2.005638002662069e-09, + "loss": 0.80123794, + "num_input_tokens_seen": 353918785, + "step": 16402, + "time_per_iteration": 2.612450361251831 + }, + { + "auxiliary_loss_clip": 0.01053623, + "auxiliary_loss_mlp": 0.01031466, + "balance_loss_clip": 1.02504563, + "balance_loss_mlp": 1.02130342, + "epoch": 0.986201713512701, + "flos": 27782577751680.0, + "grad_norm": 1.6132776569440435, + "language_loss": 0.69953698, + "learning_rate": 1.9882384698881596e-09, + "loss": 0.72038788, + "num_input_tokens_seen": 353940390, + "step": 16403, + "time_per_iteration": 2.7016353607177734 + }, + { + "auxiliary_loss_clip": 0.01043211, + "auxiliary_loss_mlp": 0.0102776, + "balance_loss_clip": 1.02094388, + "balance_loss_mlp": 1.01757967, + "epoch": 0.986261836765369, + "flos": 28730403884160.0, + "grad_norm": 1.8819827218045675, + "language_loss": 0.74480778, + "learning_rate": 1.9709147019204566e-09, + "loss": 0.76551747, + "num_input_tokens_seen": 353962180, + "step": 16404, + "time_per_iteration": 2.671273946762085 + }, + { + "auxiliary_loss_clip": 0.01047522, + "auxiliary_loss_mlp": 0.00747709, + "balance_loss_clip": 1.02288127, + "balance_loss_mlp": 1.00039363, + "epoch": 0.986321960018037, + "flos": 34313471568000.0, + "grad_norm": 1.6170419309403343, + "language_loss": 0.69628823, + "learning_rate": 1.953666699415768e-09, + "loss": 0.71424055, + "num_input_tokens_seen": 353984305, + "step": 16405, + "time_per_iteration": 2.7084619998931885 + }, + { + "auxiliary_loss_clip": 0.01041074, + "auxiliary_loss_mlp": 0.01027142, + "balance_loss_clip": 1.02461302, + "balance_loss_mlp": 1.01772451, + "epoch": 0.986382083270705, + "flos": 25189755436800.0, + "grad_norm": 1.8111937239947247, + "language_loss": 0.6942414, + "learning_rate": 1.93649446302846e-09, + "loss": 0.7149235, + "num_input_tokens_seen": 354004495, + "step": 16406, + "time_per_iteration": 2.7562365531921387 + }, + { + "auxiliary_loss_clip": 0.01009203, + "auxiliary_loss_mlp": 0.010273, + "balance_loss_clip": 1.02532041, + "balance_loss_mlp": 1.01689267, + "epoch": 0.9864422065233729, + "flos": 11025904671360.0, + "grad_norm": 2.504311394644653, + "language_loss": 0.75126469, + "learning_rate": 1.9193979934095663e-09, + "loss": 0.77162975, + "num_input_tokens_seen": 354015985, + "step": 16407, + "time_per_iteration": 4.54134464263916 + }, + { + "auxiliary_loss_clip": 0.0103936, + "auxiliary_loss_mlp": 0.01030181, + "balance_loss_clip": 1.02366567, + "balance_loss_mlp": 1.02054894, + "epoch": 0.9865023297760409, + "flos": 16545590807040.0, + "grad_norm": 1.8014672646057697, + "language_loss": 0.77292752, + "learning_rate": 1.9023772912072357e-09, + "loss": 0.79362297, + "num_input_tokens_seen": 354033260, + "step": 16408, + "time_per_iteration": 2.7260618209838867 + }, + { + "auxiliary_loss_clip": 0.01054638, + "auxiliary_loss_mlp": 0.01028214, + "balance_loss_clip": 1.0252533, + "balance_loss_mlp": 1.01679969, + "epoch": 0.9865624530287088, + "flos": 18880179269760.0, + "grad_norm": 1.6666466902781127, + "language_loss": 0.67916143, + "learning_rate": 1.8854323570669515e-09, + "loss": 0.69998991, + "num_input_tokens_seen": 354052825, + "step": 16409, + "time_per_iteration": 2.597313642501831 + }, + { + "auxiliary_loss_clip": 0.00989278, + "auxiliary_loss_mlp": 0.01000632, + "balance_loss_clip": 1.00311518, + "balance_loss_mlp": 0.99978524, + "epoch": 0.9866225762813768, + "flos": 68887798680960.0, + "grad_norm": 0.8088323299621831, + "language_loss": 0.61098081, + "learning_rate": 1.8685631916313118e-09, + "loss": 0.63087994, + "num_input_tokens_seen": 354113920, + "step": 16410, + "time_per_iteration": 3.265265464782715 + }, + { + "auxiliary_loss_clip": 0.01051752, + "auxiliary_loss_mlp": 0.01028288, + "balance_loss_clip": 1.02418065, + "balance_loss_mlp": 1.01808929, + "epoch": 0.9866826995340447, + "flos": 29023111814400.0, + "grad_norm": 1.9503772077569972, + "language_loss": 0.66035432, + "learning_rate": 1.8517697955400258e-09, + "loss": 0.68115473, + "num_input_tokens_seen": 354134210, + "step": 16411, + "time_per_iteration": 2.6374242305755615 + }, + { + "auxiliary_loss_clip": 0.01006001, + "auxiliary_loss_mlp": 0.01000099, + "balance_loss_clip": 1.00087368, + "balance_loss_mlp": 0.99930066, + "epoch": 0.9867428227867128, + "flos": 65376814867200.0, + "grad_norm": 0.724238418097676, + "language_loss": 0.56174225, + "learning_rate": 1.8350521694299182e-09, + "loss": 0.58180326, + "num_input_tokens_seen": 354198010, + "step": 16412, + "time_per_iteration": 3.224252939224243 + }, + { + "auxiliary_loss_clip": 0.01032187, + "auxiliary_loss_mlp": 0.01028575, + "balance_loss_clip": 1.02388644, + "balance_loss_mlp": 1.01729774, + "epoch": 0.9868029460393807, + "flos": 26506312634880.0, + "grad_norm": 1.8484213936056557, + "language_loss": 0.73075962, + "learning_rate": 1.818410313934926e-09, + "loss": 0.75136721, + "num_input_tokens_seen": 354220000, + "step": 16413, + "time_per_iteration": 2.701664924621582 + }, + { + "auxiliary_loss_clip": 0.01023415, + "auxiliary_loss_mlp": 0.01028452, + "balance_loss_clip": 1.024194, + "balance_loss_mlp": 1.01787806, + "epoch": 0.9868630692920487, + "flos": 22967280299520.0, + "grad_norm": 1.4088938412974283, + "language_loss": 0.71319973, + "learning_rate": 1.8018442296858782e-09, + "loss": 0.7337184, + "num_input_tokens_seen": 354240910, + "step": 16414, + "time_per_iteration": 2.858898401260376 + }, + { + "auxiliary_loss_clip": 0.01042657, + "auxiliary_loss_mlp": 0.01033884, + "balance_loss_clip": 1.02328897, + "balance_loss_mlp": 1.02359676, + "epoch": 0.9869231925447167, + "flos": 19828687760640.0, + "grad_norm": 1.7613059499294421, + "language_loss": 0.70121324, + "learning_rate": 1.7853539173111608e-09, + "loss": 0.72197866, + "num_input_tokens_seen": 354259430, + "step": 16415, + "time_per_iteration": 4.304932117462158 + }, + { + "auxiliary_loss_clip": 0.0102057, + "auxiliary_loss_mlp": 0.01027474, + "balance_loss_clip": 1.02198792, + "balance_loss_mlp": 1.01803863, + "epoch": 0.9869833157973846, + "flos": 20195228096640.0, + "grad_norm": 1.6911497116820418, + "language_loss": 0.75587785, + "learning_rate": 1.7689393774362737e-09, + "loss": 0.77635831, + "num_input_tokens_seen": 354279490, + "step": 16416, + "time_per_iteration": 4.30546236038208 + }, + { + "auxiliary_loss_clip": 0.01041206, + "auxiliary_loss_mlp": 0.01026711, + "balance_loss_clip": 1.02559161, + "balance_loss_mlp": 1.01657832, + "epoch": 0.9870434390500527, + "flos": 16099507802880.0, + "grad_norm": 2.520659024087148, + "language_loss": 0.7046749, + "learning_rate": 1.7526006106833858e-09, + "loss": 0.72535402, + "num_input_tokens_seen": 354295080, + "step": 16417, + "time_per_iteration": 2.708286762237549 + }, + { + "auxiliary_loss_clip": 0.01045791, + "auxiliary_loss_mlp": 0.01032073, + "balance_loss_clip": 1.02662039, + "balance_loss_mlp": 1.02092707, + "epoch": 0.9871035623027206, + "flos": 21760753438080.0, + "grad_norm": 1.5390025258822182, + "language_loss": 0.70699954, + "learning_rate": 1.7363376176720013e-09, + "loss": 0.7277782, + "num_input_tokens_seen": 354314610, + "step": 16418, + "time_per_iteration": 2.749532699584961 + }, + { + "auxiliary_loss_clip": 0.01006054, + "auxiliary_loss_mlp": 0.00999701, + "balance_loss_clip": 1.00102544, + "balance_loss_mlp": 0.99886668, + "epoch": 0.9871636855553886, + "flos": 70219583245440.0, + "grad_norm": 0.6562475100780222, + "language_loss": 0.53714699, + "learning_rate": 1.7201503990189603e-09, + "loss": 0.55720448, + "num_input_tokens_seen": 354383115, + "step": 16419, + "time_per_iteration": 3.3157732486724854 + }, + { + "auxiliary_loss_clip": 0.01041216, + "auxiliary_loss_mlp": 0.0103507, + "balance_loss_clip": 1.02245975, + "balance_loss_mlp": 1.02364373, + "epoch": 0.9872238088080565, + "flos": 25045825639680.0, + "grad_norm": 1.905836103941338, + "language_loss": 0.78197992, + "learning_rate": 1.7040389553382162e-09, + "loss": 0.80274284, + "num_input_tokens_seen": 354403115, + "step": 16420, + "time_per_iteration": 2.753282308578491 + }, + { + "auxiliary_loss_clip": 0.01037067, + "auxiliary_loss_mlp": 0.010247, + "balance_loss_clip": 1.02926111, + "balance_loss_mlp": 1.01392949, + "epoch": 0.9872839320607245, + "flos": 19465846525440.0, + "grad_norm": 1.4974365341251699, + "language_loss": 0.71036392, + "learning_rate": 1.6880032872403916e-09, + "loss": 0.73098159, + "num_input_tokens_seen": 354424520, + "step": 16421, + "time_per_iteration": 2.746541976928711 + }, + { + "auxiliary_loss_clip": 0.01054214, + "auxiliary_loss_mlp": 0.01031818, + "balance_loss_clip": 1.0258441, + "balance_loss_mlp": 1.02038574, + "epoch": 0.9873440553133924, + "flos": 26942914448640.0, + "grad_norm": 2.0420043381699737, + "language_loss": 0.81791568, + "learning_rate": 1.6720433953338886e-09, + "loss": 0.83877605, + "num_input_tokens_seen": 354444800, + "step": 16422, + "time_per_iteration": 2.7574775218963623 + }, + { + "auxiliary_loss_clip": 0.01026858, + "auxiliary_loss_mlp": 0.01025072, + "balance_loss_clip": 1.02255118, + "balance_loss_mlp": 1.01518953, + "epoch": 0.9874041785660604, + "flos": 19062210418560.0, + "grad_norm": 1.7542895927957556, + "language_loss": 0.85855126, + "learning_rate": 1.656159280223779e-09, + "loss": 0.87907052, + "num_input_tokens_seen": 354464590, + "step": 16423, + "time_per_iteration": 2.724579095840454 + }, + { + "auxiliary_loss_clip": 0.01054953, + "auxiliary_loss_mlp": 0.01026338, + "balance_loss_clip": 1.02657878, + "balance_loss_mlp": 1.01583564, + "epoch": 0.9874643018187284, + "flos": 21105814803840.0, + "grad_norm": 1.8652918498421254, + "language_loss": 0.70508981, + "learning_rate": 1.6403509425122475e-09, + "loss": 0.72590268, + "num_input_tokens_seen": 354484145, + "step": 16424, + "time_per_iteration": 2.6901559829711914 + }, + { + "auxiliary_loss_clip": 0.01050566, + "auxiliary_loss_mlp": 0.00747559, + "balance_loss_clip": 1.02323794, + "balance_loss_mlp": 1.00033951, + "epoch": 0.9875244250713964, + "flos": 24426043441920.0, + "grad_norm": 2.179370255434449, + "language_loss": 0.80621654, + "learning_rate": 1.6246183827990366e-09, + "loss": 0.82419777, + "num_input_tokens_seen": 354502475, + "step": 16425, + "time_per_iteration": 2.6361207962036133 + }, + { + "auxiliary_loss_clip": 0.01006297, + "auxiliary_loss_mlp": 0.0102939, + "balance_loss_clip": 1.02017856, + "balance_loss_mlp": 1.01795149, + "epoch": 0.9875845483240643, + "flos": 25117610970240.0, + "grad_norm": 3.344588730520328, + "language_loss": 0.80092937, + "learning_rate": 1.6089616016803364e-09, + "loss": 0.8212862, + "num_input_tokens_seen": 354521855, + "step": 16426, + "time_per_iteration": 2.8018932342529297 + }, + { + "auxiliary_loss_clip": 0.01055383, + "auxiliary_loss_mlp": 0.01028504, + "balance_loss_clip": 1.02775347, + "balance_loss_mlp": 1.01808476, + "epoch": 0.9876446715767323, + "flos": 16581788737920.0, + "grad_norm": 1.8802172295225454, + "language_loss": 0.84463048, + "learning_rate": 1.593380599750338e-09, + "loss": 0.86546934, + "num_input_tokens_seen": 354539535, + "step": 16427, + "time_per_iteration": 2.6613359451293945 + }, + { + "auxiliary_loss_clip": 0.01060562, + "auxiliary_loss_mlp": 0.01028194, + "balance_loss_clip": 1.02486587, + "balance_loss_mlp": 1.01832342, + "epoch": 0.9877047948294003, + "flos": 21616141282560.0, + "grad_norm": 2.032182083019046, + "language_loss": 0.70484161, + "learning_rate": 1.577875377599458e-09, + "loss": 0.72572917, + "num_input_tokens_seen": 354557430, + "step": 16428, + "time_per_iteration": 2.569922685623169 + }, + { + "auxiliary_loss_clip": 0.01024835, + "auxiliary_loss_mlp": 0.01032236, + "balance_loss_clip": 1.02256489, + "balance_loss_mlp": 1.02231157, + "epoch": 0.9877649180820682, + "flos": 21178497974400.0, + "grad_norm": 2.2888317851625817, + "language_loss": 0.79900622, + "learning_rate": 1.5624459358158926e-09, + "loss": 0.81957698, + "num_input_tokens_seen": 354574735, + "step": 16429, + "time_per_iteration": 2.7135744094848633 + }, + { + "auxiliary_loss_clip": 0.01060679, + "auxiliary_loss_mlp": 0.0102778, + "balance_loss_clip": 1.02495313, + "balance_loss_mlp": 1.01801097, + "epoch": 0.9878250413347363, + "flos": 39749233576320.0, + "grad_norm": 1.749348798160359, + "language_loss": 0.62231028, + "learning_rate": 1.5470922749845073e-09, + "loss": 0.64319491, + "num_input_tokens_seen": 354597050, + "step": 16430, + "time_per_iteration": 2.7135941982269287 + }, + { + "auxiliary_loss_clip": 0.01062845, + "auxiliary_loss_mlp": 0.01032115, + "balance_loss_clip": 1.0255965, + "balance_loss_mlp": 1.02189326, + "epoch": 0.9878851645874042, + "flos": 29425634599680.0, + "grad_norm": 1.2946989624697487, + "language_loss": 0.72924668, + "learning_rate": 1.531814395687725e-09, + "loss": 0.75019628, + "num_input_tokens_seen": 354619095, + "step": 16431, + "time_per_iteration": 2.7591257095336914 + }, + { + "auxiliary_loss_clip": 0.01061678, + "auxiliary_loss_mlp": 0.01030606, + "balance_loss_clip": 1.02519739, + "balance_loss_mlp": 1.0201453, + "epoch": 0.9879452878400722, + "flos": 15806261168640.0, + "grad_norm": 2.306920934622221, + "language_loss": 0.80949253, + "learning_rate": 1.5166122985048602e-09, + "loss": 0.83041537, + "num_input_tokens_seen": 354633790, + "step": 16432, + "time_per_iteration": 2.550248146057129 + }, + { + "auxiliary_loss_clip": 0.01047697, + "auxiliary_loss_mlp": 0.01025284, + "balance_loss_clip": 1.02319574, + "balance_loss_mlp": 1.01630116, + "epoch": 0.9880054110927401, + "flos": 22233912318720.0, + "grad_norm": 1.444327114713957, + "language_loss": 0.80418289, + "learning_rate": 1.5014859840123405e-09, + "loss": 0.82491267, + "num_input_tokens_seen": 354653180, + "step": 16433, + "time_per_iteration": 2.6105566024780273 + }, + { + "auxiliary_loss_clip": 0.01060211, + "auxiliary_loss_mlp": 0.01032937, + "balance_loss_clip": 1.02507997, + "balance_loss_mlp": 1.02265489, + "epoch": 0.9880655343454081, + "flos": 28763836467840.0, + "grad_norm": 2.198058212536506, + "language_loss": 0.65385389, + "learning_rate": 1.4864354527837075e-09, + "loss": 0.67478538, + "num_input_tokens_seen": 354669900, + "step": 16434, + "time_per_iteration": 2.685896635055542 + }, + { + "auxiliary_loss_clip": 0.01050267, + "auxiliary_loss_mlp": 0.01028821, + "balance_loss_clip": 1.02306509, + "balance_loss_mlp": 1.01827741, + "epoch": 0.988125657598076, + "flos": 32853379622400.0, + "grad_norm": 1.6902469357705, + "language_loss": 0.6935761, + "learning_rate": 1.4714607053896154e-09, + "loss": 0.71436703, + "num_input_tokens_seen": 354693165, + "step": 16435, + "time_per_iteration": 4.547488689422607 + }, + { + "auxiliary_loss_clip": 0.01016102, + "auxiliary_loss_mlp": 0.01030008, + "balance_loss_clip": 1.02615857, + "balance_loss_mlp": 1.01898682, + "epoch": 0.988185780850744, + "flos": 19390685316480.0, + "grad_norm": 1.6424422702238672, + "language_loss": 0.75493503, + "learning_rate": 1.4565617423980548e-09, + "loss": 0.77539617, + "num_input_tokens_seen": 354711915, + "step": 16436, + "time_per_iteration": 2.7304935455322266 + }, + { + "auxiliary_loss_clip": 0.01033469, + "auxiliary_loss_mlp": 0.01027028, + "balance_loss_clip": 1.02282977, + "balance_loss_mlp": 1.01620984, + "epoch": 0.988245904103412, + "flos": 22528415928960.0, + "grad_norm": 2.2071179893328496, + "language_loss": 0.74602097, + "learning_rate": 1.4417385643741286e-09, + "loss": 0.76662588, + "num_input_tokens_seen": 354729135, + "step": 16437, + "time_per_iteration": 2.679095506668091 + }, + { + "auxiliary_loss_clip": 0.01027477, + "auxiliary_loss_mlp": 0.01031654, + "balance_loss_clip": 1.0228014, + "balance_loss_mlp": 1.02173018, + "epoch": 0.98830602735608, + "flos": 28659193171200.0, + "grad_norm": 1.4152125546006415, + "language_loss": 0.60103261, + "learning_rate": 1.4269911718796103e-09, + "loss": 0.62162387, + "num_input_tokens_seen": 354752530, + "step": 16438, + "time_per_iteration": 2.80964994430542 + }, + { + "auxiliary_loss_clip": 0.01032179, + "auxiliary_loss_mlp": 0.01026865, + "balance_loss_clip": 1.02188849, + "balance_loss_mlp": 1.01596367, + "epoch": 0.9883661506087479, + "flos": 20996035862400.0, + "grad_norm": 1.807975287709362, + "language_loss": 0.71706569, + "learning_rate": 1.4123195654738295e-09, + "loss": 0.73765618, + "num_input_tokens_seen": 354771135, + "step": 16439, + "time_per_iteration": 2.6444995403289795 + }, + { + "auxiliary_loss_clip": 0.01050039, + "auxiliary_loss_mlp": 0.01028478, + "balance_loss_clip": 1.02383423, + "balance_loss_mlp": 1.01815414, + "epoch": 0.9884262738614159, + "flos": 32706109860480.0, + "grad_norm": 1.5341440709636482, + "language_loss": 0.60008383, + "learning_rate": 1.3977237457134528e-09, + "loss": 0.62086904, + "num_input_tokens_seen": 354791800, + "step": 16440, + "time_per_iteration": 2.722442626953125 + }, + { + "auxiliary_loss_clip": 0.01061528, + "auxiliary_loss_mlp": 0.01029233, + "balance_loss_clip": 1.02390897, + "balance_loss_mlp": 1.01910079, + "epoch": 0.9884863971140839, + "flos": 17564699479680.0, + "grad_norm": 2.381430945762584, + "language_loss": 0.76029789, + "learning_rate": 1.3832037131513707e-09, + "loss": 0.78120553, + "num_input_tokens_seen": 354809200, + "step": 16441, + "time_per_iteration": 2.541253089904785 + }, + { + "auxiliary_loss_clip": 0.01037187, + "auxiliary_loss_mlp": 0.01026089, + "balance_loss_clip": 1.02231693, + "balance_loss_mlp": 1.01533675, + "epoch": 0.9885465203667518, + "flos": 40552519380480.0, + "grad_norm": 2.262521755012829, + "language_loss": 0.67751515, + "learning_rate": 1.3687594683386982e-09, + "loss": 0.69814789, + "num_input_tokens_seen": 354829945, + "step": 16442, + "time_per_iteration": 2.8275699615478516 + }, + { + "auxiliary_loss_clip": 0.01049883, + "auxiliary_loss_mlp": 0.01025841, + "balance_loss_clip": 1.02368379, + "balance_loss_mlp": 1.01567817, + "epoch": 0.9886066436194199, + "flos": 13807976768640.0, + "grad_norm": 2.1157946698693406, + "language_loss": 0.74429423, + "learning_rate": 1.3543910118227753e-09, + "loss": 0.76505142, + "num_input_tokens_seen": 354845055, + "step": 16443, + "time_per_iteration": 2.6538009643554688 + }, + { + "auxiliary_loss_clip": 0.01042434, + "auxiliary_loss_mlp": 0.01031185, + "balance_loss_clip": 1.0243386, + "balance_loss_mlp": 1.02008641, + "epoch": 0.9886667668720878, + "flos": 23325129544320.0, + "grad_norm": 2.0310505365931264, + "language_loss": 0.73558223, + "learning_rate": 1.3400983441487213e-09, + "loss": 0.75631845, + "num_input_tokens_seen": 354864680, + "step": 16444, + "time_per_iteration": 2.759000301361084 + }, + { + "auxiliary_loss_clip": 0.01017362, + "auxiliary_loss_mlp": 0.01037488, + "balance_loss_clip": 1.02594471, + "balance_loss_mlp": 1.02610326, + "epoch": 0.9887268901247558, + "flos": 22706029704960.0, + "grad_norm": 1.9213743141679307, + "language_loss": 0.690552, + "learning_rate": 1.325881465858547e-09, + "loss": 0.71110058, + "num_input_tokens_seen": 354885685, + "step": 16445, + "time_per_iteration": 2.837056875228882 + }, + { + "auxiliary_loss_clip": 0.0105539, + "auxiliary_loss_mlp": 0.01026191, + "balance_loss_clip": 1.02638876, + "balance_loss_mlp": 1.01535451, + "epoch": 0.9887870133774237, + "flos": 13041283944960.0, + "grad_norm": 3.8889734866831462, + "language_loss": 0.60390306, + "learning_rate": 1.311740377491155e-09, + "loss": 0.62471879, + "num_input_tokens_seen": 354901505, + "step": 16446, + "time_per_iteration": 2.579183578491211 + }, + { + "auxiliary_loss_clip": 0.01033302, + "auxiliary_loss_mlp": 0.01028354, + "balance_loss_clip": 1.02407622, + "balance_loss_mlp": 1.01893079, + "epoch": 0.9888471366300917, + "flos": 15158864390400.0, + "grad_norm": 5.481414402984646, + "language_loss": 0.70646483, + "learning_rate": 1.297675079582783e-09, + "loss": 0.72708136, + "num_input_tokens_seen": 354920060, + "step": 16447, + "time_per_iteration": 2.721036195755005 + }, + { + "auxiliary_loss_clip": 0.01059393, + "auxiliary_loss_mlp": 0.00747633, + "balance_loss_clip": 1.02376664, + "balance_loss_mlp": 1.00036025, + "epoch": 0.9889072598827596, + "flos": 25118796119040.0, + "grad_norm": 1.7220719847668167, + "language_loss": 0.83721942, + "learning_rate": 1.2836855726667818e-09, + "loss": 0.8552897, + "num_input_tokens_seen": 354938690, + "step": 16448, + "time_per_iteration": 2.5693814754486084 + }, + { + "auxiliary_loss_clip": 0.0104916, + "auxiliary_loss_mlp": 0.01026298, + "balance_loss_clip": 1.02384257, + "balance_loss_mlp": 1.01713729, + "epoch": 0.9889673831354276, + "flos": 16728663450240.0, + "grad_norm": 1.5098049807949419, + "language_loss": 0.69769406, + "learning_rate": 1.26977185727406e-09, + "loss": 0.71844864, + "num_input_tokens_seen": 354956955, + "step": 16449, + "time_per_iteration": 2.6512460708618164 + }, + { + "auxiliary_loss_clip": 0.01052637, + "auxiliary_loss_mlp": 0.01028484, + "balance_loss_clip": 1.02478004, + "balance_loss_mlp": 1.0182085, + "epoch": 0.9890275063880956, + "flos": 35585175657600.0, + "grad_norm": 3.943684212148369, + "language_loss": 0.7380985, + "learning_rate": 1.25593393393153e-09, + "loss": 0.75890976, + "num_input_tokens_seen": 354976800, + "step": 16450, + "time_per_iteration": 2.7270517349243164 + }, + { + "auxiliary_loss_clip": 0.01061922, + "auxiliary_loss_mlp": 0.01031042, + "balance_loss_clip": 1.02350879, + "balance_loss_mlp": 1.02066493, + "epoch": 0.9890876296407636, + "flos": 18952359649920.0, + "grad_norm": 1.7717518759061768, + "language_loss": 0.79417688, + "learning_rate": 1.242171803164549e-09, + "loss": 0.81510651, + "num_input_tokens_seen": 354996625, + "step": 16451, + "time_per_iteration": 2.5033373832702637 + }, + { + "auxiliary_loss_clip": 0.01026216, + "auxiliary_loss_mlp": 0.01033318, + "balance_loss_clip": 1.0223968, + "balance_loss_mlp": 1.0219276, + "epoch": 0.9891477528934315, + "flos": 23769309127680.0, + "grad_norm": 1.9594116701957895, + "language_loss": 0.70017517, + "learning_rate": 1.2284854654946996e-09, + "loss": 0.72077048, + "num_input_tokens_seen": 355014535, + "step": 16452, + "time_per_iteration": 2.7274441719055176 + }, + { + "auxiliary_loss_clip": 0.01060817, + "auxiliary_loss_mlp": 0.01022365, + "balance_loss_clip": 1.02614391, + "balance_loss_mlp": 1.01331949, + "epoch": 0.9892078761460995, + "flos": 20772922533120.0, + "grad_norm": 1.6730433118472485, + "language_loss": 0.74023193, + "learning_rate": 1.2148749214409004e-09, + "loss": 0.76106369, + "num_input_tokens_seen": 355033280, + "step": 16453, + "time_per_iteration": 2.571578025817871 + }, + { + "auxiliary_loss_clip": 0.01024236, + "auxiliary_loss_mlp": 0.01032958, + "balance_loss_clip": 1.02550936, + "balance_loss_mlp": 1.02377248, + "epoch": 0.9892679993987675, + "flos": 23367827836800.0, + "grad_norm": 2.158854850235848, + "language_loss": 0.6958853, + "learning_rate": 1.2013401715191828e-09, + "loss": 0.71645725, + "num_input_tokens_seen": 355053320, + "step": 16454, + "time_per_iteration": 4.477778911590576 + }, + { + "auxiliary_loss_clip": 0.01033343, + "auxiliary_loss_mlp": 0.01028872, + "balance_loss_clip": 1.02290428, + "balance_loss_mlp": 1.01903713, + "epoch": 0.9893281226514354, + "flos": 22705419173760.0, + "grad_norm": 2.681104115081898, + "language_loss": 0.75827539, + "learning_rate": 1.1878812162433583e-09, + "loss": 0.77889752, + "num_input_tokens_seen": 355070230, + "step": 16455, + "time_per_iteration": 2.6161017417907715 + }, + { + "auxiliary_loss_clip": 0.01041752, + "auxiliary_loss_mlp": 0.01024371, + "balance_loss_clip": 1.026196, + "balance_loss_mlp": 1.01425028, + "epoch": 0.9893882459041035, + "flos": 21796664060160.0, + "grad_norm": 1.5687033630470024, + "language_loss": 0.65447164, + "learning_rate": 1.1744980561230188e-09, + "loss": 0.67513287, + "num_input_tokens_seen": 355090125, + "step": 16456, + "time_per_iteration": 2.77913761138916 + }, + { + "auxiliary_loss_clip": 0.01052787, + "auxiliary_loss_mlp": 0.01026514, + "balance_loss_clip": 1.02608144, + "balance_loss_mlp": 1.01657772, + "epoch": 0.9894483691567714, + "flos": 18113773754880.0, + "grad_norm": 1.6838894930256214, + "language_loss": 0.73673469, + "learning_rate": 1.161190691666203e-09, + "loss": 0.75752771, + "num_input_tokens_seen": 355107890, + "step": 16457, + "time_per_iteration": 2.622783660888672 + }, + { + "auxiliary_loss_clip": 0.01062172, + "auxiliary_loss_mlp": 0.01024811, + "balance_loss_clip": 1.02588439, + "balance_loss_mlp": 1.01463044, + "epoch": 0.9895084924094394, + "flos": 31211615664000.0, + "grad_norm": 2.315426712275177, + "language_loss": 0.68962157, + "learning_rate": 1.1479591233773954e-09, + "loss": 0.71049136, + "num_input_tokens_seen": 355126340, + "step": 16458, + "time_per_iteration": 2.6398963928222656 + }, + { + "auxiliary_loss_clip": 0.01048758, + "auxiliary_loss_mlp": 0.01029984, + "balance_loss_clip": 1.0237844, + "balance_loss_mlp": 1.01986301, + "epoch": 0.9895686156621073, + "flos": 19678042120320.0, + "grad_norm": 1.9149559376482754, + "language_loss": 0.79292411, + "learning_rate": 1.1348033517581956e-09, + "loss": 0.81371152, + "num_input_tokens_seen": 355144025, + "step": 16459, + "time_per_iteration": 2.6563189029693604 + }, + { + "auxiliary_loss_clip": 0.01041162, + "auxiliary_loss_mlp": 0.01031979, + "balance_loss_clip": 1.02371979, + "balance_loss_mlp": 1.02170336, + "epoch": 0.9896287389147753, + "flos": 23581675457280.0, + "grad_norm": 2.5648957376959594, + "language_loss": 0.70806956, + "learning_rate": 1.1217233773075373e-09, + "loss": 0.72880101, + "num_input_tokens_seen": 355163125, + "step": 16460, + "time_per_iteration": 2.7804713249206543 + }, + { + "auxiliary_loss_clip": 0.01041428, + "auxiliary_loss_mlp": 0.01026634, + "balance_loss_clip": 1.02391601, + "balance_loss_mlp": 1.01619184, + "epoch": 0.9896888621674432, + "flos": 29605331364480.0, + "grad_norm": 1.7594058041924228, + "language_loss": 0.87547946, + "learning_rate": 1.1087192005214685e-09, + "loss": 0.89616007, + "num_input_tokens_seen": 355184060, + "step": 16461, + "time_per_iteration": 2.7755279541015625 + }, + { + "auxiliary_loss_clip": 0.01050428, + "auxiliary_loss_mlp": 0.01026918, + "balance_loss_clip": 1.02411103, + "balance_loss_mlp": 1.01646388, + "epoch": 0.9897489854201112, + "flos": 23695045758720.0, + "grad_norm": 1.715612503390224, + "language_loss": 0.62978834, + "learning_rate": 1.09579082189315e-09, + "loss": 0.65056181, + "num_input_tokens_seen": 355204505, + "step": 16462, + "time_per_iteration": 4.347283840179443 + }, + { + "auxiliary_loss_clip": 0.01056023, + "auxiliary_loss_mlp": 0.01026115, + "balance_loss_clip": 1.02846158, + "balance_loss_mlp": 1.0163995, + "epoch": 0.9898091086727792, + "flos": 13225146687360.0, + "grad_norm": 1.8434415498227155, + "language_loss": 0.72931278, + "learning_rate": 1.0829382419126343e-09, + "loss": 0.75013417, + "num_input_tokens_seen": 355223055, + "step": 16463, + "time_per_iteration": 2.6341986656188965 + }, + { + "auxiliary_loss_clip": 0.01052412, + "auxiliary_loss_mlp": 0.01025028, + "balance_loss_clip": 1.02540231, + "balance_loss_mlp": 1.01401305, + "epoch": 0.9898692319254472, + "flos": 22930400010240.0, + "grad_norm": 1.8031420746836009, + "language_loss": 0.6976617, + "learning_rate": 1.0701614610675314e-09, + "loss": 0.71843612, + "num_input_tokens_seen": 355242000, + "step": 16464, + "time_per_iteration": 4.32135796546936 + }, + { + "auxiliary_loss_clip": 0.01034828, + "auxiliary_loss_mlp": 0.01027633, + "balance_loss_clip": 1.02547383, + "balance_loss_mlp": 1.01677322, + "epoch": 0.9899293551781151, + "flos": 12458346122880.0, + "grad_norm": 2.0949815625457338, + "language_loss": 0.73599064, + "learning_rate": 1.0574604798421204e-09, + "loss": 0.75661528, + "num_input_tokens_seen": 355260175, + "step": 16465, + "time_per_iteration": 2.68477463722229 + }, + { + "auxiliary_loss_clip": 0.01059614, + "auxiliary_loss_mlp": 0.01030524, + "balance_loss_clip": 1.02365041, + "balance_loss_mlp": 1.02123153, + "epoch": 0.9899894784307831, + "flos": 26871129118080.0, + "grad_norm": 1.6689015852580282, + "language_loss": 0.86323297, + "learning_rate": 1.0448352987182386e-09, + "loss": 0.88413435, + "num_input_tokens_seen": 355281930, + "step": 16466, + "time_per_iteration": 2.9135754108428955 + }, + { + "auxiliary_loss_clip": 0.01023508, + "auxiliary_loss_mlp": 0.0102602, + "balance_loss_clip": 1.02441931, + "balance_loss_mlp": 1.01594687, + "epoch": 0.990049601683451, + "flos": 21542093395200.0, + "grad_norm": 1.623580151774922, + "language_loss": 0.71767771, + "learning_rate": 1.0322859181743915e-09, + "loss": 0.73817301, + "num_input_tokens_seen": 355301555, + "step": 16467, + "time_per_iteration": 2.7110824584960938 + }, + { + "auxiliary_loss_clip": 0.01030151, + "auxiliary_loss_mlp": 0.01029538, + "balance_loss_clip": 1.02073073, + "balance_loss_mlp": 1.0187974, + "epoch": 0.990109724936119, + "flos": 28771809287040.0, + "grad_norm": 1.5436490811266814, + "language_loss": 0.65118122, + "learning_rate": 1.019812338686643e-09, + "loss": 0.67177814, + "num_input_tokens_seen": 355324925, + "step": 16468, + "time_per_iteration": 2.7561042308807373 + }, + { + "auxiliary_loss_clip": 0.010349, + "auxiliary_loss_mlp": 0.01030881, + "balance_loss_clip": 1.02566266, + "balance_loss_mlp": 1.0207895, + "epoch": 0.9901698481887871, + "flos": 29274270687360.0, + "grad_norm": 1.6919426995076134, + "language_loss": 0.61858857, + "learning_rate": 1.0074145607281704e-09, + "loss": 0.63924634, + "num_input_tokens_seen": 355343875, + "step": 16469, + "time_per_iteration": 2.8301384449005127 + }, + { + "auxiliary_loss_clip": 0.01025658, + "auxiliary_loss_mlp": 0.01026252, + "balance_loss_clip": 1.02213013, + "balance_loss_mlp": 1.01546371, + "epoch": 0.990229971441455, + "flos": 15959025711360.0, + "grad_norm": 3.0368582105255677, + "language_loss": 0.70834345, + "learning_rate": 9.950925847685976e-10, + "loss": 0.72886252, + "num_input_tokens_seen": 355358835, + "step": 16470, + "time_per_iteration": 2.689389705657959 + }, + { + "auxiliary_loss_clip": 0.00996118, + "auxiliary_loss_mlp": 0.0100119, + "balance_loss_clip": 1.00137568, + "balance_loss_mlp": 1.00033212, + "epoch": 0.990290094694123, + "flos": 69780287911680.0, + "grad_norm": 0.6606167566508655, + "language_loss": 0.55515534, + "learning_rate": 9.828464112755509e-10, + "loss": 0.57512838, + "num_input_tokens_seen": 355431225, + "step": 16471, + "time_per_iteration": 3.386013984680176 + }, + { + "auxiliary_loss_clip": 0.01044072, + "auxiliary_loss_mlp": 0.01030038, + "balance_loss_clip": 1.0263046, + "balance_loss_mlp": 1.019804, + "epoch": 0.9903502179467909, + "flos": 16252451913600.0, + "grad_norm": 2.9534942154969253, + "language_loss": 0.83842832, + "learning_rate": 9.706760407131032e-10, + "loss": 0.85916936, + "num_input_tokens_seen": 355448250, + "step": 16472, + "time_per_iteration": 2.6603307723999023 + }, + { + "auxiliary_loss_clip": 0.01050881, + "auxiliary_loss_mlp": 0.01025586, + "balance_loss_clip": 1.02529168, + "balance_loss_mlp": 1.01559615, + "epoch": 0.9904103411994589, + "flos": 21688393489920.0, + "grad_norm": 1.8114330767123699, + "language_loss": 0.85538018, + "learning_rate": 9.585814735431075e-10, + "loss": 0.87614489, + "num_input_tokens_seen": 355467040, + "step": 16473, + "time_per_iteration": 2.635822057723999 + }, + { + "auxiliary_loss_clip": 0.01059702, + "auxiliary_loss_mlp": 0.01028523, + "balance_loss_clip": 1.02393401, + "balance_loss_mlp": 1.01897418, + "epoch": 0.9904704644521268, + "flos": 25739440243200.0, + "grad_norm": 1.7028699191252925, + "language_loss": 0.84481013, + "learning_rate": 9.465627102240859e-10, + "loss": 0.86569244, + "num_input_tokens_seen": 355487825, + "step": 16474, + "time_per_iteration": 2.5629491806030273 + }, + { + "auxiliary_loss_clip": 0.01037394, + "auxiliary_loss_mlp": 0.01033296, + "balance_loss_clip": 1.02127671, + "balance_loss_mlp": 1.0233717, + "epoch": 0.9905305877047949, + "flos": 21908346422400.0, + "grad_norm": 1.7547177051731853, + "language_loss": 0.76269126, + "learning_rate": 9.346197512116738e-10, + "loss": 0.78339815, + "num_input_tokens_seen": 355507445, + "step": 16475, + "time_per_iteration": 2.5540995597839355 + }, + { + "auxiliary_loss_clip": 0.01024855, + "auxiliary_loss_mlp": 0.01034167, + "balance_loss_clip": 1.02181649, + "balance_loss_mlp": 1.0224669, + "epoch": 0.9905907109574628, + "flos": 21392417422080.0, + "grad_norm": 1.57339716753056, + "language_loss": 0.75647473, + "learning_rate": 9.227525969588423e-10, + "loss": 0.77706492, + "num_input_tokens_seen": 355527205, + "step": 16476, + "time_per_iteration": 2.6193759441375732 + }, + { + "auxiliary_loss_clip": 0.01053328, + "auxiliary_loss_mlp": 0.00747626, + "balance_loss_clip": 1.02350783, + "balance_loss_mlp": 1.00037646, + "epoch": 0.9906508342101308, + "flos": 20521620005760.0, + "grad_norm": 2.004308442605115, + "language_loss": 0.67551708, + "learning_rate": 9.109612479154538e-10, + "loss": 0.69352663, + "num_input_tokens_seen": 355544740, + "step": 16477, + "time_per_iteration": 2.6112122535705566 + }, + { + "auxiliary_loss_clip": 0.01046029, + "auxiliary_loss_mlp": 0.01028713, + "balance_loss_clip": 1.02615929, + "balance_loss_mlp": 1.0178709, + "epoch": 0.9907109574627987, + "flos": 21361211481600.0, + "grad_norm": 6.513357465380215, + "language_loss": 0.71891487, + "learning_rate": 8.992457045289282e-10, + "loss": 0.73966229, + "num_input_tokens_seen": 355564385, + "step": 16478, + "time_per_iteration": 2.7388315200805664 + }, + { + "auxiliary_loss_clip": 0.0106254, + "auxiliary_loss_mlp": 0.01034112, + "balance_loss_clip": 1.02536082, + "balance_loss_mlp": 1.02264988, + "epoch": 0.9907710807154667, + "flos": 17338605321600.0, + "grad_norm": 2.211243423349487, + "language_loss": 0.80818379, + "learning_rate": 8.876059672433545e-10, + "loss": 0.82915032, + "num_input_tokens_seen": 355579260, + "step": 16479, + "time_per_iteration": 2.6373307704925537 + }, + { + "auxiliary_loss_clip": 0.01052706, + "auxiliary_loss_mlp": 0.01030562, + "balance_loss_clip": 1.02505279, + "balance_loss_mlp": 1.02034009, + "epoch": 0.9908312039681346, + "flos": 28621881918720.0, + "grad_norm": 1.6372018052920259, + "language_loss": 0.66007149, + "learning_rate": 8.760420364999355e-10, + "loss": 0.68090421, + "num_input_tokens_seen": 355599790, + "step": 16480, + "time_per_iteration": 2.749112129211426 + }, + { + "auxiliary_loss_clip": 0.01048644, + "auxiliary_loss_mlp": 0.01023392, + "balance_loss_clip": 1.02351761, + "balance_loss_mlp": 1.01420093, + "epoch": 0.9908913272208026, + "flos": 35770654512000.0, + "grad_norm": 1.8971658841304782, + "language_loss": 0.7231828, + "learning_rate": 8.645539127374313e-10, + "loss": 0.74390316, + "num_input_tokens_seen": 355620925, + "step": 16481, + "time_per_iteration": 2.7000210285186768 + }, + { + "auxiliary_loss_clip": 0.010481, + "auxiliary_loss_mlp": 0.01022164, + "balance_loss_clip": 1.02352929, + "balance_loss_mlp": 1.01284146, + "epoch": 0.9909514504734707, + "flos": 19902196944000.0, + "grad_norm": 2.283602890000601, + "language_loss": 0.77275956, + "learning_rate": 8.531415963912713e-10, + "loss": 0.79346216, + "num_input_tokens_seen": 355639165, + "step": 16482, + "time_per_iteration": 4.383305311203003 + }, + { + "auxiliary_loss_clip": 0.0105213, + "auxiliary_loss_mlp": 0.01026845, + "balance_loss_clip": 1.02486026, + "balance_loss_mlp": 1.01649213, + "epoch": 0.9910115737261386, + "flos": 20004793165440.0, + "grad_norm": 1.8596625960638684, + "language_loss": 0.75367147, + "learning_rate": 8.418050878944427e-10, + "loss": 0.77446115, + "num_input_tokens_seen": 355657320, + "step": 16483, + "time_per_iteration": 2.7681684494018555 + }, + { + "auxiliary_loss_clip": 0.00997274, + "auxiliary_loss_mlp": 0.01003096, + "balance_loss_clip": 1.00172925, + "balance_loss_mlp": 1.00232112, + "epoch": 0.9910716969788066, + "flos": 70688432494080.0, + "grad_norm": 0.6768506526418318, + "language_loss": 0.53644431, + "learning_rate": 8.305443876768237e-10, + "loss": 0.55644798, + "num_input_tokens_seen": 355726370, + "step": 16484, + "time_per_iteration": 3.2997922897338867 + }, + { + "auxiliary_loss_clip": 0.01058664, + "auxiliary_loss_mlp": 0.01026979, + "balance_loss_clip": 1.02398384, + "balance_loss_mlp": 1.01706111, + "epoch": 0.9911318202314745, + "flos": 21434038306560.0, + "grad_norm": 1.6806443161675544, + "language_loss": 0.81645232, + "learning_rate": 8.19359496165184e-10, + "loss": 0.83730876, + "num_input_tokens_seen": 355745840, + "step": 16485, + "time_per_iteration": 2.592813491821289 + }, + { + "auxiliary_loss_clip": 0.01018579, + "auxiliary_loss_mlp": 0.01030284, + "balance_loss_clip": 1.02168834, + "balance_loss_mlp": 1.01896536, + "epoch": 0.9911919434841425, + "flos": 19826820253440.0, + "grad_norm": 1.7569189420185949, + "language_loss": 0.81589466, + "learning_rate": 8.082504137836288e-10, + "loss": 0.83638328, + "num_input_tokens_seen": 355763385, + "step": 16486, + "time_per_iteration": 2.646170139312744 + }, + { + "auxiliary_loss_clip": 0.01054417, + "auxiliary_loss_mlp": 0.01027109, + "balance_loss_clip": 1.02640939, + "balance_loss_mlp": 1.01710176, + "epoch": 0.9912520667368104, + "flos": 41719364691840.0, + "grad_norm": 1.3034300298700219, + "language_loss": 0.66234136, + "learning_rate": 7.972171409538209e-10, + "loss": 0.68315661, + "num_input_tokens_seen": 355786075, + "step": 16487, + "time_per_iteration": 2.7817068099975586 + }, + { + "auxiliary_loss_clip": 0.01048096, + "auxiliary_loss_mlp": 0.00747643, + "balance_loss_clip": 1.02347553, + "balance_loss_mlp": 1.00042486, + "epoch": 0.9913121899894785, + "flos": 23769668263680.0, + "grad_norm": 1.568521522624832, + "language_loss": 0.76755267, + "learning_rate": 7.862596780936481e-10, + "loss": 0.78551012, + "num_input_tokens_seen": 355806295, + "step": 16488, + "time_per_iteration": 2.8058953285217285 + }, + { + "auxiliary_loss_clip": 0.01035244, + "auxiliary_loss_mlp": 0.01025668, + "balance_loss_clip": 1.02517188, + "balance_loss_mlp": 1.01501656, + "epoch": 0.9913723132421464, + "flos": 23769668263680.0, + "grad_norm": 2.709722968027424, + "language_loss": 0.68516409, + "learning_rate": 7.753780256190001e-10, + "loss": 0.70577317, + "num_input_tokens_seen": 355825730, + "step": 16489, + "time_per_iteration": 2.703809976577759 + }, + { + "auxiliary_loss_clip": 0.00967032, + "auxiliary_loss_mlp": 0.01000412, + "balance_loss_clip": 1.003263, + "balance_loss_mlp": 0.99949378, + "epoch": 0.9914324364948144, + "flos": 71267419820160.0, + "grad_norm": 0.6080907922678426, + "language_loss": 0.52581096, + "learning_rate": 7.645721839424357e-10, + "loss": 0.54548538, + "num_input_tokens_seen": 355891545, + "step": 16490, + "time_per_iteration": 3.333549737930298 + }, + { + "auxiliary_loss_clip": 0.01031665, + "auxiliary_loss_mlp": 0.0103213, + "balance_loss_clip": 1.02419639, + "balance_loss_mlp": 1.02059686, + "epoch": 0.9914925597474823, + "flos": 23695440808320.0, + "grad_norm": 1.6230373534773375, + "language_loss": 0.75473928, + "learning_rate": 7.538421534734052e-10, + "loss": 0.77537715, + "num_input_tokens_seen": 355909920, + "step": 16491, + "time_per_iteration": 2.695585250854492 + }, + { + "auxiliary_loss_clip": 0.01027629, + "auxiliary_loss_mlp": 0.0103017, + "balance_loss_clip": 1.02779746, + "balance_loss_mlp": 1.01865411, + "epoch": 0.9915526830001503, + "flos": 13433822749440.0, + "grad_norm": 2.5620470737592047, + "language_loss": 0.70249593, + "learning_rate": 7.431879346191383e-10, + "loss": 0.7230739, + "num_input_tokens_seen": 355923130, + "step": 16492, + "time_per_iteration": 2.7488491535186768 + }, + { + "auxiliary_loss_clip": 0.01025181, + "auxiliary_loss_mlp": 0.01031318, + "balance_loss_clip": 1.02242351, + "balance_loss_mlp": 1.02021992, + "epoch": 0.9916128062528182, + "flos": 20740962407040.0, + "grad_norm": 1.7047137264628343, + "language_loss": 0.68298829, + "learning_rate": 7.326095277837563e-10, + "loss": 0.7035532, + "num_input_tokens_seen": 355941960, + "step": 16493, + "time_per_iteration": 2.6957054138183594 + }, + { + "auxiliary_loss_clip": 0.01045574, + "auxiliary_loss_mlp": 0.01034381, + "balance_loss_clip": 1.02637672, + "balance_loss_mlp": 1.0232234, + "epoch": 0.9916729295054862, + "flos": 22487082353280.0, + "grad_norm": 1.6788269629503199, + "language_loss": 0.71161473, + "learning_rate": 7.221069333678276e-10, + "loss": 0.73241431, + "num_input_tokens_seen": 355961640, + "step": 16494, + "time_per_iteration": 2.7607743740081787 + }, + { + "auxiliary_loss_clip": 0.01051274, + "auxiliary_loss_mlp": 0.01030149, + "balance_loss_clip": 1.024405, + "balance_loss_mlp": 1.01898563, + "epoch": 0.9917330527581543, + "flos": 14792467708800.0, + "grad_norm": 2.5960709561406707, + "language_loss": 0.68274039, + "learning_rate": 7.116801517701443e-10, + "loss": 0.70355463, + "num_input_tokens_seen": 355977980, + "step": 16495, + "time_per_iteration": 2.5835347175598145 + }, + { + "auxiliary_loss_clip": 0.00987729, + "auxiliary_loss_mlp": 0.01004417, + "balance_loss_clip": 1.00214708, + "balance_loss_mlp": 1.00346899, + "epoch": 0.9917931760108222, + "flos": 59191595585280.0, + "grad_norm": 0.7278797547633709, + "language_loss": 0.53467822, + "learning_rate": 7.013291833859458e-10, + "loss": 0.55459964, + "num_input_tokens_seen": 356042900, + "step": 16496, + "time_per_iteration": 3.3250133991241455 + }, + { + "auxiliary_loss_clip": 0.01037109, + "auxiliary_loss_mlp": 0.00747698, + "balance_loss_clip": 1.0231216, + "balance_loss_mlp": 1.00036657, + "epoch": 0.9918532992634902, + "flos": 26761637485440.0, + "grad_norm": 1.9579433659292484, + "language_loss": 0.71413362, + "learning_rate": 6.91054028607585e-10, + "loss": 0.73198164, + "num_input_tokens_seen": 356063000, + "step": 16497, + "time_per_iteration": 2.7008163928985596 + }, + { + "auxiliary_loss_clip": 0.01034639, + "auxiliary_loss_mlp": 0.0103425, + "balance_loss_clip": 1.02397478, + "balance_loss_mlp": 1.02287769, + "epoch": 0.9919134225161581, + "flos": 14975719920000.0, + "grad_norm": 1.9099835627842325, + "language_loss": 0.82013315, + "learning_rate": 6.808546878249721e-10, + "loss": 0.84082204, + "num_input_tokens_seen": 356078130, + "step": 16498, + "time_per_iteration": 2.7078254222869873 + }, + { + "auxiliary_loss_clip": 0.01036473, + "auxiliary_loss_mlp": 0.0103597, + "balance_loss_clip": 1.02713037, + "balance_loss_mlp": 1.0251931, + "epoch": 0.9919735457688261, + "flos": 27818201064960.0, + "grad_norm": 1.4813434844386344, + "language_loss": 0.67732775, + "learning_rate": 6.707311614246869e-10, + "loss": 0.69805217, + "num_input_tokens_seen": 356101655, + "step": 16499, + "time_per_iteration": 2.7506723403930664 + }, + { + "auxiliary_loss_clip": 0.01063322, + "auxiliary_loss_mlp": 0.0102715, + "balance_loss_clip": 1.02621436, + "balance_loss_mlp": 1.01706445, + "epoch": 0.992033669021494, + "flos": 22562782266240.0, + "grad_norm": 1.9205482988039662, + "language_loss": 0.82073665, + "learning_rate": 6.606834497904223e-10, + "loss": 0.84164131, + "num_input_tokens_seen": 356121425, + "step": 16500, + "time_per_iteration": 2.603651762008667 + }, + { + "auxiliary_loss_clip": 0.01031207, + "auxiliary_loss_mlp": 0.01027016, + "balance_loss_clip": 1.02346134, + "balance_loss_mlp": 1.01622188, + "epoch": 0.9920937922741621, + "flos": 25374587846400.0, + "grad_norm": 2.0327348142182924, + "language_loss": 0.82130814, + "learning_rate": 6.507115533036511e-10, + "loss": 0.84189034, + "num_input_tokens_seen": 356140710, + "step": 16501, + "time_per_iteration": 4.4952661991119385 + }, + { + "auxiliary_loss_clip": 0.0105194, + "auxiliary_loss_mlp": 0.01023667, + "balance_loss_clip": 1.02401686, + "balance_loss_mlp": 1.01359987, + "epoch": 0.99215391552683, + "flos": 22054466949120.0, + "grad_norm": 2.207144508141291, + "language_loss": 0.76941651, + "learning_rate": 6.408154723420711e-10, + "loss": 0.79017258, + "num_input_tokens_seen": 356159835, + "step": 16502, + "time_per_iteration": 2.603175401687622 + }, + { + "auxiliary_loss_clip": 0.01043687, + "auxiliary_loss_mlp": 0.01026112, + "balance_loss_clip": 1.02494931, + "balance_loss_mlp": 1.01537132, + "epoch": 0.992214038779498, + "flos": 15413937845760.0, + "grad_norm": 2.387424846682763, + "language_loss": 0.71752113, + "learning_rate": 6.309952072811597e-10, + "loss": 0.73821914, + "num_input_tokens_seen": 356177555, + "step": 16503, + "time_per_iteration": 2.600536584854126 + }, + { + "auxiliary_loss_clip": 0.00995889, + "auxiliary_loss_mlp": 0.01012303, + "balance_loss_clip": 1.00074577, + "balance_loss_mlp": 1.01137304, + "epoch": 0.9922741620321659, + "flos": 62014498467840.0, + "grad_norm": 0.6369624895468725, + "language_loss": 0.55125594, + "learning_rate": 6.212507584932858e-10, + "loss": 0.57133782, + "num_input_tokens_seen": 356244975, + "step": 16504, + "time_per_iteration": 3.3508384227752686 + }, + { + "auxiliary_loss_clip": 0.01030612, + "auxiliary_loss_mlp": 0.01025071, + "balance_loss_clip": 1.02514434, + "balance_loss_mlp": 1.01541495, + "epoch": 0.9923342852848339, + "flos": 17165480745600.0, + "grad_norm": 1.8196373407732664, + "language_loss": 0.69371986, + "learning_rate": 6.115821263481536e-10, + "loss": 0.71427667, + "num_input_tokens_seen": 356262605, + "step": 16505, + "time_per_iteration": 2.748563528060913 + }, + { + "auxiliary_loss_clip": 0.01031928, + "auxiliary_loss_mlp": 0.01028585, + "balance_loss_clip": 1.02397096, + "balance_loss_mlp": 1.01644993, + "epoch": 0.9923944085375018, + "flos": 23183210908800.0, + "grad_norm": 2.039743016666148, + "language_loss": 0.65318108, + "learning_rate": 6.019893112119146e-10, + "loss": 0.67378616, + "num_input_tokens_seen": 356278935, + "step": 16506, + "time_per_iteration": 2.665562391281128 + }, + { + "auxiliary_loss_clip": 0.01005768, + "auxiliary_loss_mlp": 0.01025013, + "balance_loss_clip": 1.02325428, + "balance_loss_mlp": 1.01442707, + "epoch": 0.9924545317901698, + "flos": 20813861059200.0, + "grad_norm": 2.178819613032347, + "language_loss": 0.62802213, + "learning_rate": 5.924723134487219e-10, + "loss": 0.64832985, + "num_input_tokens_seen": 356295675, + "step": 16507, + "time_per_iteration": 2.8222570419311523 + }, + { + "auxiliary_loss_clip": 0.0106189, + "auxiliary_loss_mlp": 0.01030958, + "balance_loss_clip": 1.02455282, + "balance_loss_mlp": 1.02002048, + "epoch": 0.9925146550428379, + "flos": 20083437993600.0, + "grad_norm": 3.026485766973584, + "language_loss": 0.72657847, + "learning_rate": 5.830311334193983e-10, + "loss": 0.74750698, + "num_input_tokens_seen": 356312885, + "step": 16508, + "time_per_iteration": 2.6830263137817383 + }, + { + "auxiliary_loss_clip": 0.01063235, + "auxiliary_loss_mlp": 0.01028425, + "balance_loss_clip": 1.02568603, + "balance_loss_mlp": 1.01763105, + "epoch": 0.9925747782955058, + "flos": 24973717086720.0, + "grad_norm": 1.657269494398965, + "language_loss": 0.7019105, + "learning_rate": 5.736657714818793e-10, + "loss": 0.72282708, + "num_input_tokens_seen": 356334070, + "step": 16509, + "time_per_iteration": 2.7111542224884033 + }, + { + "auxiliary_loss_clip": 0.01052223, + "auxiliary_loss_mlp": 0.01027972, + "balance_loss_clip": 1.02470016, + "balance_loss_mlp": 1.01759481, + "epoch": 0.9926349015481738, + "flos": 60472526492160.0, + "grad_norm": 1.9771478195423056, + "language_loss": 0.68237615, + "learning_rate": 5.643762279912146e-10, + "loss": 0.70317805, + "num_input_tokens_seen": 356359410, + "step": 16510, + "time_per_iteration": 4.699380397796631 + }, + { + "auxiliary_loss_clip": 0.01028823, + "auxiliary_loss_mlp": 0.01034451, + "balance_loss_clip": 1.02351654, + "balance_loss_mlp": 1.02342451, + "epoch": 0.9926950248008417, + "flos": 20741716592640.0, + "grad_norm": 2.0293628495948384, + "language_loss": 0.81289226, + "learning_rate": 5.551625032997886e-10, + "loss": 0.83352506, + "num_input_tokens_seen": 356378345, + "step": 16511, + "time_per_iteration": 4.458330869674683 + }, + { + "auxiliary_loss_clip": 0.01021731, + "auxiliary_loss_mlp": 0.01029054, + "balance_loss_clip": 1.02169728, + "balance_loss_mlp": 1.01920176, + "epoch": 0.9927551480535097, + "flos": 24352965221760.0, + "grad_norm": 1.855352247341762, + "language_loss": 0.91517937, + "learning_rate": 5.460245977570998e-10, + "loss": 0.93568724, + "num_input_tokens_seen": 356397345, + "step": 16512, + "time_per_iteration": 2.826789140701294 + }, + { + "auxiliary_loss_clip": 0.00977615, + "auxiliary_loss_mlp": 0.01002715, + "balance_loss_clip": 1.00234306, + "balance_loss_mlp": 1.00191581, + "epoch": 0.9928152713061776, + "flos": 71275572207360.0, + "grad_norm": 0.698128982475976, + "language_loss": 0.55219793, + "learning_rate": 5.369625117095378e-10, + "loss": 0.57200134, + "num_input_tokens_seen": 356459160, + "step": 16513, + "time_per_iteration": 3.3776416778564453 + }, + { + "auxiliary_loss_clip": 0.01040576, + "auxiliary_loss_mlp": 0.01029871, + "balance_loss_clip": 1.02399397, + "balance_loss_mlp": 1.01910663, + "epoch": 0.9928753945588457, + "flos": 57809499045120.0, + "grad_norm": 1.4327253265168605, + "language_loss": 0.64937747, + "learning_rate": 5.279762455006054e-10, + "loss": 0.67008197, + "num_input_tokens_seen": 356486405, + "step": 16514, + "time_per_iteration": 2.9981491565704346 + }, + { + "auxiliary_loss_clip": 0.01026497, + "auxiliary_loss_mlp": 0.0102778, + "balance_loss_clip": 1.02143586, + "balance_loss_mlp": 1.01674676, + "epoch": 0.9929355178115136, + "flos": 19568981450880.0, + "grad_norm": 2.8849909402698457, + "language_loss": 0.73212993, + "learning_rate": 5.190657994713632e-10, + "loss": 0.75267267, + "num_input_tokens_seen": 356502905, + "step": 16515, + "time_per_iteration": 2.7939276695251465 + }, + { + "auxiliary_loss_clip": 0.01023364, + "auxiliary_loss_mlp": 0.01030023, + "balance_loss_clip": 1.02290821, + "balance_loss_mlp": 1.01860273, + "epoch": 0.9929956410641816, + "flos": 22964658606720.0, + "grad_norm": 1.668136899371335, + "language_loss": 0.77047276, + "learning_rate": 5.102311739593191e-10, + "loss": 0.79100657, + "num_input_tokens_seen": 356523830, + "step": 16516, + "time_per_iteration": 2.707958936691284 + }, + { + "auxiliary_loss_clip": 0.01030039, + "auxiliary_loss_mlp": 0.01026175, + "balance_loss_clip": 1.02300143, + "balance_loss_mlp": 1.017097, + "epoch": 0.9930557643168495, + "flos": 22566409539840.0, + "grad_norm": 3.552716062806885, + "language_loss": 0.78057843, + "learning_rate": 5.014723692997602e-10, + "loss": 0.80114061, + "num_input_tokens_seen": 356543965, + "step": 16517, + "time_per_iteration": 2.751220464706421 + }, + { + "auxiliary_loss_clip": 0.01056909, + "auxiliary_loss_mlp": 0.01035163, + "balance_loss_clip": 1.0278163, + "balance_loss_mlp": 1.02294946, + "epoch": 0.9931158875695175, + "flos": 17201032231680.0, + "grad_norm": 2.2084882180864964, + "language_loss": 0.67656314, + "learning_rate": 4.927893858248655e-10, + "loss": 0.6974839, + "num_input_tokens_seen": 356561530, + "step": 16518, + "time_per_iteration": 2.687664031982422 + }, + { + "auxiliary_loss_clip": 0.00990968, + "auxiliary_loss_mlp": 0.01000397, + "balance_loss_clip": 1.00434184, + "balance_loss_mlp": 0.99944973, + "epoch": 0.9931760108221854, + "flos": 63711204278400.0, + "grad_norm": 0.7334482655197148, + "language_loss": 0.53467393, + "learning_rate": 4.84182223863483e-10, + "loss": 0.5545876, + "num_input_tokens_seen": 356616845, + "step": 16519, + "time_per_iteration": 3.1056177616119385 + }, + { + "auxiliary_loss_clip": 0.01016416, + "auxiliary_loss_mlp": 0.01030741, + "balance_loss_clip": 1.02075267, + "balance_loss_mlp": 1.01920128, + "epoch": 0.9932361340748534, + "flos": 15304805349120.0, + "grad_norm": 1.9368164301501403, + "language_loss": 0.60097963, + "learning_rate": 4.756508837426842e-10, + "loss": 0.6214512, + "num_input_tokens_seen": 356633560, + "step": 16520, + "time_per_iteration": 2.692115068435669 + }, + { + "auxiliary_loss_clip": 0.0102816, + "auxiliary_loss_mlp": 0.01028769, + "balance_loss_clip": 1.022856, + "balance_loss_mlp": 1.01846313, + "epoch": 0.9932962573275215, + "flos": 36064906727040.0, + "grad_norm": 1.760266435827202, + "language_loss": 0.61900938, + "learning_rate": 4.671953657853223e-10, + "loss": 0.6395787, + "num_input_tokens_seen": 356657600, + "step": 16521, + "time_per_iteration": 2.8893327713012695 + }, + { + "auxiliary_loss_clip": 0.01047905, + "auxiliary_loss_mlp": 0.01030639, + "balance_loss_clip": 1.02811742, + "balance_loss_mlp": 1.01951098, + "epoch": 0.9933563805801894, + "flos": 21470523546240.0, + "grad_norm": 1.8303379905273702, + "language_loss": 0.7447058, + "learning_rate": 4.5881567031225145e-10, + "loss": 0.76549131, + "num_input_tokens_seen": 356675880, + "step": 16522, + "time_per_iteration": 2.6463286876678467 + }, + { + "auxiliary_loss_clip": 0.01028004, + "auxiliary_loss_mlp": 0.01026115, + "balance_loss_clip": 1.02403808, + "balance_loss_mlp": 1.01635134, + "epoch": 0.9934165038328574, + "flos": 23986532626560.0, + "grad_norm": 1.706101262797577, + "language_loss": 0.73093998, + "learning_rate": 4.5051179764143964e-10, + "loss": 0.75148118, + "num_input_tokens_seen": 356696000, + "step": 16523, + "time_per_iteration": 2.6972951889038086 + }, + { + "auxiliary_loss_clip": 0.0103893, + "auxiliary_loss_mlp": 0.00747589, + "balance_loss_clip": 1.02299285, + "balance_loss_mlp": 1.00038767, + "epoch": 0.9934766270855253, + "flos": 21907807718400.0, + "grad_norm": 1.641434179925596, + "language_loss": 0.71431178, + "learning_rate": 4.422837480875241e-10, + "loss": 0.73217696, + "num_input_tokens_seen": 356716845, + "step": 16524, + "time_per_iteration": 2.6987380981445312 + }, + { + "auxiliary_loss_clip": 0.01030703, + "auxiliary_loss_mlp": 0.01028623, + "balance_loss_clip": 1.02409625, + "balance_loss_mlp": 1.0185492, + "epoch": 0.9935367503381933, + "flos": 17129139160320.0, + "grad_norm": 1.8700493088153252, + "language_loss": 0.79591799, + "learning_rate": 4.341315219624775e-10, + "loss": 0.81651127, + "num_input_tokens_seen": 356732100, + "step": 16525, + "time_per_iteration": 2.6501669883728027 + }, + { + "auxiliary_loss_clip": 0.01022194, + "auxiliary_loss_mlp": 0.01026387, + "balance_loss_clip": 1.02411497, + "balance_loss_mlp": 1.01574159, + "epoch": 0.9935968735908612, + "flos": 22346241125760.0, + "grad_norm": 1.5716992253008693, + "language_loss": 0.75254619, + "learning_rate": 4.2605511957582995e-10, + "loss": 0.77303195, + "num_input_tokens_seen": 356751480, + "step": 16526, + "time_per_iteration": 2.818056583404541 + }, + { + "auxiliary_loss_clip": 0.01058215, + "auxiliary_loss_mlp": 0.00747588, + "balance_loss_clip": 1.02340174, + "balance_loss_mlp": 1.00032425, + "epoch": 0.9936569968435293, + "flos": 29460539640960.0, + "grad_norm": 3.9581928893184735, + "language_loss": 0.72577995, + "learning_rate": 4.180545412333369e-10, + "loss": 0.74383801, + "num_input_tokens_seen": 356772650, + "step": 16527, + "time_per_iteration": 2.6539154052734375 + }, + { + "auxiliary_loss_clip": 0.01043328, + "auxiliary_loss_mlp": 0.01025824, + "balance_loss_clip": 1.02508855, + "balance_loss_mlp": 1.01550674, + "epoch": 0.9937171200961972, + "flos": 16544046522240.0, + "grad_norm": 2.046525948999796, + "language_loss": 0.76254648, + "learning_rate": 4.1012978723875547e-10, + "loss": 0.78323793, + "num_input_tokens_seen": 356788510, + "step": 16528, + "time_per_iteration": 2.8033664226531982 + }, + { + "auxiliary_loss_clip": 0.01040039, + "auxiliary_loss_mlp": 0.01027659, + "balance_loss_clip": 1.02216446, + "balance_loss_mlp": 1.01673985, + "epoch": 0.9937772433488652, + "flos": 24390276474240.0, + "grad_norm": 11.660205660433757, + "language_loss": 0.67960882, + "learning_rate": 4.022808578922898e-10, + "loss": 0.70028579, + "num_input_tokens_seen": 356809115, + "step": 16529, + "time_per_iteration": 4.518170118331909 + }, + { + "auxiliary_loss_clip": 0.0105531, + "auxiliary_loss_mlp": 0.01033243, + "balance_loss_clip": 1.02553511, + "balance_loss_mlp": 1.02066672, + "epoch": 0.9938373666015331, + "flos": 15669909141120.0, + "grad_norm": 2.573289280703622, + "language_loss": 0.65647471, + "learning_rate": 3.9450775349170186e-10, + "loss": 0.67736024, + "num_input_tokens_seen": 356826410, + "step": 16530, + "time_per_iteration": 2.6951050758361816 + }, + { + "auxiliary_loss_clip": 0.01044609, + "auxiliary_loss_mlp": 0.01024754, + "balance_loss_clip": 1.02317548, + "balance_loss_mlp": 1.01449609, + "epoch": 0.9938974898542011, + "flos": 19496190539520.0, + "grad_norm": 2.1630800099580068, + "language_loss": 0.71397662, + "learning_rate": 3.8681047433186676e-10, + "loss": 0.73467022, + "num_input_tokens_seen": 356844990, + "step": 16531, + "time_per_iteration": 2.768113136291504 + }, + { + "auxiliary_loss_clip": 0.01046581, + "auxiliary_loss_mlp": 0.01028283, + "balance_loss_clip": 1.02467489, + "balance_loss_mlp": 1.01732183, + "epoch": 0.993957613106869, + "flos": 26906896085760.0, + "grad_norm": 1.37553575856133, + "language_loss": 0.73941636, + "learning_rate": 3.791890207045512e-10, + "loss": 0.76016504, + "num_input_tokens_seen": 356866530, + "step": 16532, + "time_per_iteration": 2.9096057415008545 + }, + { + "auxiliary_loss_clip": 0.0101076, + "auxiliary_loss_mlp": 0.01028737, + "balance_loss_clip": 1.02079606, + "balance_loss_mlp": 1.0194447, + "epoch": 0.994017736359537, + "flos": 14939593816320.0, + "grad_norm": 1.5873754947994938, + "language_loss": 0.70635003, + "learning_rate": 3.7164339289885717e-10, + "loss": 0.72674501, + "num_input_tokens_seen": 356884660, + "step": 16533, + "time_per_iteration": 2.932136058807373 + }, + { + "auxiliary_loss_clip": 0.01054335, + "auxiliary_loss_mlp": 0.01025507, + "balance_loss_clip": 1.02560651, + "balance_loss_mlp": 1.01465297, + "epoch": 0.9940778596122051, + "flos": 15377883569280.0, + "grad_norm": 2.007861550683006, + "language_loss": 0.84759855, + "learning_rate": 3.641735912007782e-10, + "loss": 0.868397, + "num_input_tokens_seen": 356900895, + "step": 16534, + "time_per_iteration": 2.743427038192749 + }, + { + "auxiliary_loss_clip": 0.01023821, + "auxiliary_loss_mlp": 0.01024884, + "balance_loss_clip": 1.02286303, + "balance_loss_mlp": 1.0154134, + "epoch": 0.994137982864873, + "flos": 25228108183680.0, + "grad_norm": 1.3993746587611335, + "language_loss": 0.65929163, + "learning_rate": 3.567796158934211e-10, + "loss": 0.6797787, + "num_input_tokens_seen": 356920985, + "step": 16535, + "time_per_iteration": 2.7971456050872803 + }, + { + "auxiliary_loss_clip": 0.01031103, + "auxiliary_loss_mlp": 0.01024496, + "balance_loss_clip": 1.02575016, + "balance_loss_mlp": 1.01524544, + "epoch": 0.994198106117541, + "flos": 18442140912000.0, + "grad_norm": 1.5065918383843182, + "language_loss": 0.64761496, + "learning_rate": 3.4946146725767235e-10, + "loss": 0.66817093, + "num_input_tokens_seen": 356939800, + "step": 16536, + "time_per_iteration": 2.841869831085205 + }, + { + "auxiliary_loss_clip": 0.01014252, + "auxiliary_loss_mlp": 0.01031759, + "balance_loss_clip": 1.02116299, + "balance_loss_mlp": 1.01967692, + "epoch": 0.9942582293702089, + "flos": 16654112772480.0, + "grad_norm": 1.848343711020192, + "language_loss": 0.7836476, + "learning_rate": 3.4221914557064357e-10, + "loss": 0.80410779, + "num_input_tokens_seen": 356957780, + "step": 16537, + "time_per_iteration": 2.703991413116455 + }, + { + "auxiliary_loss_clip": 0.01058578, + "auxiliary_loss_mlp": 0.01031297, + "balance_loss_clip": 1.02728415, + "balance_loss_mlp": 1.02004385, + "epoch": 0.9943183526228769, + "flos": 21944580266880.0, + "grad_norm": 1.7795427986702559, + "language_loss": 0.69112992, + "learning_rate": 3.35052651107004e-10, + "loss": 0.71202862, + "num_input_tokens_seen": 356979185, + "step": 16538, + "time_per_iteration": 2.599931240081787 + }, + { + "auxiliary_loss_clip": 0.01021953, + "auxiliary_loss_mlp": 0.01030365, + "balance_loss_clip": 1.02158725, + "balance_loss_mlp": 1.01990485, + "epoch": 0.9943784758755448, + "flos": 23842566915840.0, + "grad_norm": 1.9615151240482198, + "language_loss": 0.74868822, + "learning_rate": 3.2796198413853614e-10, + "loss": 0.76921141, + "num_input_tokens_seen": 356997735, + "step": 16539, + "time_per_iteration": 2.7287991046905518 + }, + { + "auxiliary_loss_clip": 0.01018465, + "auxiliary_loss_mlp": 0.01030818, + "balance_loss_clip": 1.02311206, + "balance_loss_mlp": 1.0199461, + "epoch": 0.9944385991282129, + "flos": 21469984842240.0, + "grad_norm": 2.554688605411198, + "language_loss": 0.70752919, + "learning_rate": 3.209471449341361e-10, + "loss": 0.7280221, + "num_input_tokens_seen": 357015660, + "step": 16540, + "time_per_iteration": 2.690979480743408 + }, + { + "auxiliary_loss_clip": 0.01048672, + "auxiliary_loss_mlp": 0.01024203, + "balance_loss_clip": 1.02307546, + "balance_loss_mlp": 1.01522708, + "epoch": 0.9944987223808808, + "flos": 22927024131840.0, + "grad_norm": 1.9524206072716084, + "language_loss": 0.74790806, + "learning_rate": 3.140081337600353e-10, + "loss": 0.76863676, + "num_input_tokens_seen": 357034800, + "step": 16541, + "time_per_iteration": 2.5861120223999023 + }, + { + "auxiliary_loss_clip": 0.01032541, + "auxiliary_loss_mlp": 0.01033187, + "balance_loss_clip": 1.02258182, + "balance_loss_mlp": 1.02267325, + "epoch": 0.9945588456335488, + "flos": 22383013674240.0, + "grad_norm": 1.9546709525765713, + "language_loss": 0.76410186, + "learning_rate": 3.0714495087891255e-10, + "loss": 0.78475916, + "num_input_tokens_seen": 357053785, + "step": 16542, + "time_per_iteration": 2.6722240447998047 + }, + { + "auxiliary_loss_clip": 0.01054878, + "auxiliary_loss_mlp": 0.01026871, + "balance_loss_clip": 1.02564085, + "balance_loss_mlp": 1.01551652, + "epoch": 0.9946189688862167, + "flos": 21397517153280.0, + "grad_norm": 2.2433179564525543, + "language_loss": 0.7405709, + "learning_rate": 3.0035759655122615e-10, + "loss": 0.76138836, + "num_input_tokens_seen": 357072025, + "step": 16543, + "time_per_iteration": 2.604684829711914 + }, + { + "auxiliary_loss_clip": 0.01044946, + "auxiliary_loss_mlp": 0.01027864, + "balance_loss_clip": 1.0235076, + "balance_loss_mlp": 1.01697457, + "epoch": 0.9946790921388847, + "flos": 12416545670400.0, + "grad_norm": 2.208246408250098, + "language_loss": 0.82117897, + "learning_rate": 2.9364607103454785e-10, + "loss": 0.84190708, + "num_input_tokens_seen": 357086960, + "step": 16544, + "time_per_iteration": 2.5981853008270264 + }, + { + "auxiliary_loss_clip": 0.01062066, + "auxiliary_loss_mlp": 0.01026844, + "balance_loss_clip": 1.02518463, + "balance_loss_mlp": 1.01622224, + "epoch": 0.9947392153915526, + "flos": 19058295836160.0, + "grad_norm": 1.8272297453767172, + "language_loss": 0.78270507, + "learning_rate": 2.870103745831187e-10, + "loss": 0.80359411, + "num_input_tokens_seen": 357105095, + "step": 16545, + "time_per_iteration": 2.6693108081817627 + }, + { + "auxiliary_loss_clip": 0.01027523, + "auxiliary_loss_mlp": 0.01027009, + "balance_loss_clip": 1.02274787, + "balance_loss_mlp": 1.01680422, + "epoch": 0.9947993386442207, + "flos": 27308808339840.0, + "grad_norm": 1.6796005337993936, + "language_loss": 0.72468191, + "learning_rate": 2.8045050744873733e-10, + "loss": 0.74522722, + "num_input_tokens_seen": 357125065, + "step": 16546, + "time_per_iteration": 2.916019916534424 + }, + { + "auxiliary_loss_clip": 0.01046026, + "auxiliary_loss_mlp": 0.01030376, + "balance_loss_clip": 1.02254343, + "balance_loss_mlp": 1.02073789, + "epoch": 0.9948594618968887, + "flos": 20806498771200.0, + "grad_norm": 1.9565861615106863, + "language_loss": 0.77467346, + "learning_rate": 2.739664698798716e-10, + "loss": 0.79543757, + "num_input_tokens_seen": 357141600, + "step": 16547, + "time_per_iteration": 2.8063313961029053 + }, + { + "auxiliary_loss_clip": 0.01039306, + "auxiliary_loss_mlp": 0.01025708, + "balance_loss_clip": 1.02321196, + "balance_loss_mlp": 1.01602817, + "epoch": 0.9949195851495566, + "flos": 23292953936640.0, + "grad_norm": 1.9606718621889163, + "language_loss": 0.70279598, + "learning_rate": 2.67558262122769e-10, + "loss": 0.72344607, + "num_input_tokens_seen": 357157880, + "step": 16548, + "time_per_iteration": 2.7120163440704346 + }, + { + "auxiliary_loss_clip": 0.01047006, + "auxiliary_loss_mlp": 0.01028957, + "balance_loss_clip": 1.02448273, + "balance_loss_mlp": 1.01859188, + "epoch": 0.9949797084022246, + "flos": 18515470527360.0, + "grad_norm": 1.6871860154365619, + "language_loss": 0.756302, + "learning_rate": 2.6122588442012427e-10, + "loss": 0.7770617, + "num_input_tokens_seen": 357176705, + "step": 16549, + "time_per_iteration": 4.396971225738525 + }, + { + "auxiliary_loss_clip": 0.01034892, + "auxiliary_loss_mlp": 0.01027038, + "balance_loss_clip": 1.02489364, + "balance_loss_mlp": 1.01551032, + "epoch": 0.9950398316548925, + "flos": 30407719328640.0, + "grad_norm": 1.5475170599782384, + "language_loss": 0.74254751, + "learning_rate": 2.5496933701241177e-10, + "loss": 0.7631669, + "num_input_tokens_seen": 357197630, + "step": 16550, + "time_per_iteration": 2.7546133995056152 + }, + { + "auxiliary_loss_clip": 0.01011092, + "auxiliary_loss_mlp": 0.00747543, + "balance_loss_clip": 1.02041483, + "balance_loss_mlp": 1.00038695, + "epoch": 0.9950999549075605, + "flos": 19900868140800.0, + "grad_norm": 1.830336346123699, + "language_loss": 0.77760267, + "learning_rate": 2.4878862013655297e-10, + "loss": 0.79518902, + "num_input_tokens_seen": 357215445, + "step": 16551, + "time_per_iteration": 3.1862618923187256 + }, + { + "auxiliary_loss_clip": 0.01046143, + "auxiliary_loss_mlp": 0.01026061, + "balance_loss_clip": 1.02363372, + "balance_loss_mlp": 1.01719761, + "epoch": 0.9951600781602284, + "flos": 17603555016960.0, + "grad_norm": 1.6061192044721069, + "language_loss": 0.66478842, + "learning_rate": 2.426837340270271e-10, + "loss": 0.68551046, + "num_input_tokens_seen": 357234285, + "step": 16552, + "time_per_iteration": 2.6796793937683105 + }, + { + "auxiliary_loss_clip": 0.01063124, + "auxiliary_loss_mlp": 0.01024782, + "balance_loss_clip": 1.02553344, + "balance_loss_mlp": 1.01455975, + "epoch": 0.9952202014128965, + "flos": 28950715952640.0, + "grad_norm": 1.3322713741109375, + "language_loss": 0.8139385, + "learning_rate": 2.3665467891520465e-10, + "loss": 0.83481759, + "num_input_tokens_seen": 357257565, + "step": 16553, + "time_per_iteration": 2.669508695602417 + }, + { + "auxiliary_loss_clip": 0.00996229, + "auxiliary_loss_mlp": 0.01001552, + "balance_loss_clip": 1.0014317, + "balance_loss_mlp": 1.00062776, + "epoch": 0.9952803246655644, + "flos": 70810386145920.0, + "grad_norm": 0.7166685555050625, + "language_loss": 0.57374132, + "learning_rate": 2.3070145503001348e-10, + "loss": 0.59371907, + "num_input_tokens_seen": 357320205, + "step": 16554, + "time_per_iteration": 3.3229026794433594 + }, + { + "auxiliary_loss_clip": 0.01043609, + "auxiliary_loss_mlp": 0.01029627, + "balance_loss_clip": 1.024575, + "balance_loss_mlp": 1.01941633, + "epoch": 0.9953404479182324, + "flos": 21799070271360.0, + "grad_norm": 1.7276821695807663, + "language_loss": 0.77007562, + "learning_rate": 2.24824062597051e-10, + "loss": 0.79080802, + "num_input_tokens_seen": 357340695, + "step": 16555, + "time_per_iteration": 2.729417562484741 + }, + { + "auxiliary_loss_clip": 0.01026559, + "auxiliary_loss_mlp": 0.01028067, + "balance_loss_clip": 1.02188993, + "balance_loss_mlp": 1.01726675, + "epoch": 0.9954005711709003, + "flos": 21937397546880.0, + "grad_norm": 2.0595507209897064, + "language_loss": 0.86375195, + "learning_rate": 2.1902250183902793e-10, + "loss": 0.88429821, + "num_input_tokens_seen": 357357505, + "step": 16556, + "time_per_iteration": 2.8267245292663574 + }, + { + "auxiliary_loss_clip": 0.01017512, + "auxiliary_loss_mlp": 0.01027995, + "balance_loss_clip": 1.02334249, + "balance_loss_mlp": 1.01751089, + "epoch": 0.9954606944235683, + "flos": 19354559212800.0, + "grad_norm": 1.6679180293182563, + "language_loss": 0.73212242, + "learning_rate": 2.132967729762125e-10, + "loss": 0.75257748, + "num_input_tokens_seen": 357375395, + "step": 16557, + "time_per_iteration": 4.34111213684082 + }, + { + "auxiliary_loss_clip": 0.01050574, + "auxiliary_loss_mlp": 0.01029071, + "balance_loss_clip": 1.0248282, + "balance_loss_mlp": 1.01937389, + "epoch": 0.9955208176762362, + "flos": 30518611591680.0, + "grad_norm": 3.351961294656511, + "language_loss": 0.76776123, + "learning_rate": 2.0764687622554233e-10, + "loss": 0.78855771, + "num_input_tokens_seen": 357397375, + "step": 16558, + "time_per_iteration": 4.32559061050415 + }, + { + "auxiliary_loss_clip": 0.01037309, + "auxiliary_loss_mlp": 0.01029299, + "balance_loss_clip": 1.02160561, + "balance_loss_mlp": 1.0184983, + "epoch": 0.9955809409289043, + "flos": 30008249199360.0, + "grad_norm": 2.062153748193651, + "language_loss": 0.63668066, + "learning_rate": 2.0207281180129044e-10, + "loss": 0.65734673, + "num_input_tokens_seen": 357418880, + "step": 16559, + "time_per_iteration": 2.7731680870056152 + }, + { + "auxiliary_loss_clip": 0.01044716, + "auxiliary_loss_mlp": 0.01029448, + "balance_loss_clip": 1.02251649, + "balance_loss_mlp": 1.01883268, + "epoch": 0.9956410641815723, + "flos": 21543278544000.0, + "grad_norm": 2.027750337393336, + "language_loss": 0.74134827, + "learning_rate": 1.965745799148433e-10, + "loss": 0.76208985, + "num_input_tokens_seen": 357438310, + "step": 16560, + "time_per_iteration": 2.577418327331543 + }, + { + "auxiliary_loss_clip": 0.01003612, + "auxiliary_loss_mlp": 0.01027009, + "balance_loss_clip": 1.02339911, + "balance_loss_mlp": 1.0169239, + "epoch": 0.9957011874342402, + "flos": 21689470897920.0, + "grad_norm": 1.7217738711843291, + "language_loss": 0.78673053, + "learning_rate": 1.9115218077470073e-10, + "loss": 0.80703676, + "num_input_tokens_seen": 357457155, + "step": 16561, + "time_per_iteration": 2.78776216506958 + }, + { + "auxiliary_loss_clip": 0.01059263, + "auxiliary_loss_mlp": 0.01026871, + "balance_loss_clip": 1.02491975, + "balance_loss_mlp": 1.01745343, + "epoch": 0.9957613106869082, + "flos": 17702667619200.0, + "grad_norm": 2.466050923641435, + "language_loss": 0.65781748, + "learning_rate": 1.8580561458647614e-10, + "loss": 0.67867887, + "num_input_tokens_seen": 357468060, + "step": 16562, + "time_per_iteration": 2.5531771183013916 + }, + { + "auxiliary_loss_clip": 0.01045097, + "auxiliary_loss_mlp": 0.00747623, + "balance_loss_clip": 1.02601457, + "balance_loss_mlp": 1.00038278, + "epoch": 0.9958214339395761, + "flos": 30555994671360.0, + "grad_norm": 1.7128598963582786, + "language_loss": 0.64730257, + "learning_rate": 1.805348815528962e-10, + "loss": 0.6652298, + "num_input_tokens_seen": 357489665, + "step": 16563, + "time_per_iteration": 2.7809956073760986 + }, + { + "auxiliary_loss_clip": 0.01039469, + "auxiliary_loss_mlp": 0.0102932, + "balance_loss_clip": 1.02408874, + "balance_loss_mlp": 1.01887715, + "epoch": 0.9958815571922441, + "flos": 24169174306560.0, + "grad_norm": 1.7020338213429265, + "language_loss": 0.64788258, + "learning_rate": 1.7533998187380105e-10, + "loss": 0.66857046, + "num_input_tokens_seen": 357511975, + "step": 16564, + "time_per_iteration": 2.6643290519714355 + }, + { + "auxiliary_loss_clip": 0.01038516, + "auxiliary_loss_mlp": 0.00747491, + "balance_loss_clip": 1.0233438, + "balance_loss_mlp": 1.00036514, + "epoch": 0.995941680444912, + "flos": 15487016065920.0, + "grad_norm": 1.7991541063292924, + "language_loss": 0.73993254, + "learning_rate": 1.7022091574636633e-10, + "loss": 0.75779259, + "num_input_tokens_seen": 357529345, + "step": 16565, + "time_per_iteration": 2.614030361175537 + }, + { + "auxiliary_loss_clip": 0.0104077, + "auxiliary_loss_mlp": 0.01026327, + "balance_loss_clip": 1.02267885, + "balance_loss_mlp": 1.01705229, + "epoch": 0.9960018036975801, + "flos": 18621227145600.0, + "grad_norm": 1.7956128249642793, + "language_loss": 0.79083878, + "learning_rate": 1.6517768336443694e-10, + "loss": 0.81150973, + "num_input_tokens_seen": 357547615, + "step": 16566, + "time_per_iteration": 2.808542251586914 + }, + { + "auxiliary_loss_clip": 0.01018869, + "auxiliary_loss_mlp": 0.00747708, + "balance_loss_clip": 1.02391768, + "balance_loss_mlp": 1.00044155, + "epoch": 0.996061926950248, + "flos": 20084120352000.0, + "grad_norm": 2.4170557346740034, + "language_loss": 0.70946658, + "learning_rate": 1.6021028491941535e-10, + "loss": 0.72713226, + "num_input_tokens_seen": 357567380, + "step": 16567, + "time_per_iteration": 2.8363969326019287 + }, + { + "auxiliary_loss_clip": 0.01048619, + "auxiliary_loss_mlp": 0.0103107, + "balance_loss_clip": 1.02330101, + "balance_loss_mlp": 1.01971483, + "epoch": 0.996122050202916, + "flos": 24347829576960.0, + "grad_norm": 2.066098178611725, + "language_loss": 0.79074901, + "learning_rate": 1.5531872059959538e-10, + "loss": 0.81154597, + "num_input_tokens_seen": 357586435, + "step": 16568, + "time_per_iteration": 2.6904234886169434 + }, + { + "auxiliary_loss_clip": 0.01047581, + "auxiliary_loss_mlp": 0.0102548, + "balance_loss_clip": 1.02387905, + "balance_loss_mlp": 1.01656866, + "epoch": 0.9961821734555839, + "flos": 24199302839040.0, + "grad_norm": 1.6684279656668335, + "language_loss": 0.82074523, + "learning_rate": 1.5050299059060634e-10, + "loss": 0.84147584, + "num_input_tokens_seen": 357604720, + "step": 16569, + "time_per_iteration": 2.692412853240967 + }, + { + "auxiliary_loss_clip": 0.01022981, + "auxiliary_loss_mlp": 0.0074756, + "balance_loss_clip": 1.02386117, + "balance_loss_mlp": 1.00033796, + "epoch": 0.9962422967082519, + "flos": 22633741584000.0, + "grad_norm": 2.6412355799179483, + "language_loss": 0.7039277, + "learning_rate": 1.457630950747468e-10, + "loss": 0.72163308, + "num_input_tokens_seen": 357622345, + "step": 16570, + "time_per_iteration": 2.7051455974578857 + }, + { + "auxiliary_loss_clip": 0.01033276, + "auxiliary_loss_mlp": 0.01026247, + "balance_loss_clip": 1.02551341, + "balance_loss_mlp": 1.01601243, + "epoch": 0.9963024199609198, + "flos": 26396030903040.0, + "grad_norm": 1.5766961116800833, + "language_loss": 0.75166571, + "learning_rate": 1.4109903423209502e-10, + "loss": 0.7722609, + "num_input_tokens_seen": 357642710, + "step": 16571, + "time_per_iteration": 2.7891716957092285 + }, + { + "auxiliary_loss_clip": 0.01039891, + "auxiliary_loss_mlp": 0.01029626, + "balance_loss_clip": 1.02300417, + "balance_loss_mlp": 1.01932001, + "epoch": 0.9963625432135879, + "flos": 16581537342720.0, + "grad_norm": 1.9312016482427945, + "language_loss": 0.80004489, + "learning_rate": 1.3651080823939843e-10, + "loss": 0.8207401, + "num_input_tokens_seen": 357659870, + "step": 16572, + "time_per_iteration": 2.6911568641662598 + }, + { + "auxiliary_loss_clip": 0.01040467, + "auxiliary_loss_mlp": 0.01030447, + "balance_loss_clip": 1.02446604, + "balance_loss_mlp": 1.02024281, + "epoch": 0.9964226664662559, + "flos": 26468534505600.0, + "grad_norm": 1.7707236031848623, + "language_loss": 0.70453572, + "learning_rate": 1.3199841727074e-10, + "loss": 0.72524482, + "num_input_tokens_seen": 357677075, + "step": 16573, + "time_per_iteration": 2.735079050064087 + }, + { + "auxiliary_loss_clip": 0.01043409, + "auxiliary_loss_mlp": 0.01030918, + "balance_loss_clip": 1.02478933, + "balance_loss_mlp": 1.02008808, + "epoch": 0.9964827897189238, + "flos": 27448320764160.0, + "grad_norm": 1.7074247183025988, + "language_loss": 0.63244665, + "learning_rate": 1.275618614968721e-10, + "loss": 0.65318996, + "num_input_tokens_seen": 357696715, + "step": 16574, + "time_per_iteration": 2.759567975997925 + }, + { + "auxiliary_loss_clip": 0.01037683, + "auxiliary_loss_mlp": 0.01031544, + "balance_loss_clip": 1.02769661, + "balance_loss_mlp": 1.01997423, + "epoch": 0.9965429129715918, + "flos": 11721566350080.0, + "grad_norm": 4.190383129798035, + "language_loss": 0.76088011, + "learning_rate": 1.2320114108654856e-10, + "loss": 0.7815724, + "num_input_tokens_seen": 357712345, + "step": 16575, + "time_per_iteration": 2.792025566101074 + }, + { + "auxiliary_loss_clip": 0.01043268, + "auxiliary_loss_mlp": 0.01027949, + "balance_loss_clip": 1.02553964, + "balance_loss_mlp": 1.01752424, + "epoch": 0.9966030362242597, + "flos": 19756004590080.0, + "grad_norm": 1.997937466166208, + "language_loss": 0.70584291, + "learning_rate": 1.1891625620474855e-10, + "loss": 0.72655511, + "num_input_tokens_seen": 357731815, + "step": 16576, + "time_per_iteration": 2.725867748260498 + }, + { + "auxiliary_loss_clip": 0.01050753, + "auxiliary_loss_mlp": 0.01024445, + "balance_loss_clip": 1.02433455, + "balance_loss_mlp": 1.01448488, + "epoch": 0.9966631594769277, + "flos": 23915178259200.0, + "grad_norm": 1.8068613394238497, + "language_loss": 0.72043675, + "learning_rate": 1.1470720701400871e-10, + "loss": 0.7411887, + "num_input_tokens_seen": 357751640, + "step": 16577, + "time_per_iteration": 4.514354705810547 + }, + { + "auxiliary_loss_clip": 0.01040898, + "auxiliary_loss_mlp": 0.01034363, + "balance_loss_clip": 1.02388072, + "balance_loss_mlp": 1.02408767, + "epoch": 0.9967232827295956, + "flos": 15559591495680.0, + "grad_norm": 2.2200580090812925, + "language_loss": 0.78827786, + "learning_rate": 1.1057399367397912e-10, + "loss": 0.80903041, + "num_input_tokens_seen": 357769850, + "step": 16578, + "time_per_iteration": 2.636047840118408 + }, + { + "auxiliary_loss_clip": 0.01023652, + "auxiliary_loss_mlp": 0.00747558, + "balance_loss_clip": 1.02599382, + "balance_loss_mlp": 1.00037265, + "epoch": 0.9967834059822637, + "flos": 20813035046400.0, + "grad_norm": 1.6836481174595053, + "language_loss": 0.76380706, + "learning_rate": 1.0651661634142328e-10, + "loss": 0.78151906, + "num_input_tokens_seen": 357789550, + "step": 16579, + "time_per_iteration": 2.722175359725952 + }, + { + "auxiliary_loss_clip": 0.01044321, + "auxiliary_loss_mlp": 0.01033477, + "balance_loss_clip": 1.02594757, + "balance_loss_mlp": 1.02182388, + "epoch": 0.9968435292349316, + "flos": 36719234830080.0, + "grad_norm": 2.1980855701072035, + "language_loss": 0.69719416, + "learning_rate": 1.0253507516999604e-10, + "loss": 0.71797216, + "num_input_tokens_seen": 357809525, + "step": 16580, + "time_per_iteration": 2.8044750690460205 + }, + { + "auxiliary_loss_clip": 0.01014296, + "auxiliary_loss_mlp": 0.01025148, + "balance_loss_clip": 1.02196598, + "balance_loss_mlp": 1.01525378, + "epoch": 0.9969036524875996, + "flos": 26760919213440.0, + "grad_norm": 1.7511917138514697, + "language_loss": 0.79602313, + "learning_rate": 9.862937031113184e-11, + "loss": 0.81641757, + "num_input_tokens_seen": 357829795, + "step": 16581, + "time_per_iteration": 2.7313592433929443 + }, + { + "auxiliary_loss_clip": 0.01040832, + "auxiliary_loss_mlp": 0.01025431, + "balance_loss_clip": 1.02610552, + "balance_loss_mlp": 1.01589465, + "epoch": 0.9969637757402675, + "flos": 24827237424000.0, + "grad_norm": 1.8682499871513405, + "language_loss": 0.80294001, + "learning_rate": 9.479950191249031e-11, + "loss": 0.82360268, + "num_input_tokens_seen": 357851655, + "step": 16582, + "time_per_iteration": 2.679831027984619 + }, + { + "auxiliary_loss_clip": 0.01047217, + "auxiliary_loss_mlp": 0.01025105, + "balance_loss_clip": 1.02268553, + "balance_loss_mlp": 1.01482284, + "epoch": 0.9970238989929355, + "flos": 23038742407680.0, + "grad_norm": 1.5440086797495143, + "language_loss": 0.60281861, + "learning_rate": 9.104547011951069e-11, + "loss": 0.62354177, + "num_input_tokens_seen": 357871205, + "step": 16583, + "time_per_iteration": 2.6754467487335205 + }, + { + "auxiliary_loss_clip": 0.01043539, + "auxiliary_loss_mlp": 0.01031258, + "balance_loss_clip": 1.02537155, + "balance_loss_mlp": 1.02089858, + "epoch": 0.9970840222456034, + "flos": 25298816106240.0, + "grad_norm": 1.651458390742081, + "language_loss": 0.77774215, + "learning_rate": 8.736727507452357e-11, + "loss": 0.79849005, + "num_input_tokens_seen": 357892145, + "step": 16584, + "time_per_iteration": 2.729207754135132 + }, + { + "auxiliary_loss_clip": 0.01037898, + "auxiliary_loss_mlp": 0.01028352, + "balance_loss_clip": 1.02313352, + "balance_loss_mlp": 1.01894617, + "epoch": 0.9971441454982715, + "flos": 21615602578560.0, + "grad_norm": 1.5220254489308458, + "language_loss": 0.69432145, + "learning_rate": 8.376491691697297e-11, + "loss": 0.71498394, + "num_input_tokens_seen": 357911205, + "step": 16585, + "time_per_iteration": 2.639967441558838 + }, + { + "auxiliary_loss_clip": 0.01061182, + "auxiliary_loss_mlp": 0.01027429, + "balance_loss_clip": 1.02556407, + "balance_loss_mlp": 1.01686668, + "epoch": 0.9972042687509394, + "flos": 14975612179200.0, + "grad_norm": 2.511634783503747, + "language_loss": 0.81049335, + "learning_rate": 8.023839578363834e-11, + "loss": 0.83137941, + "num_input_tokens_seen": 357928190, + "step": 16586, + "time_per_iteration": 2.743565320968628 + }, + { + "auxiliary_loss_clip": 0.01035255, + "auxiliary_loss_mlp": 0.01033708, + "balance_loss_clip": 1.02094173, + "balance_loss_mlp": 1.02407551, + "epoch": 0.9972643920036074, + "flos": 25806664546560.0, + "grad_norm": 1.5673022240141499, + "language_loss": 0.77425432, + "learning_rate": 7.678771180796851e-11, + "loss": 0.79494393, + "num_input_tokens_seen": 357946985, + "step": 16587, + "time_per_iteration": 2.785714626312256 + }, + { + "auxiliary_loss_clip": 0.01034449, + "auxiliary_loss_mlp": 0.01030869, + "balance_loss_clip": 1.02489972, + "balance_loss_mlp": 1.0207665, + "epoch": 0.9973245152562754, + "flos": 23326242865920.0, + "grad_norm": 2.2725464363467305, + "language_loss": 0.73002023, + "learning_rate": 7.341286512074773e-11, + "loss": 0.75067341, + "num_input_tokens_seen": 357966720, + "step": 16588, + "time_per_iteration": 2.8390979766845703 + }, + { + "auxiliary_loss_clip": 0.01064248, + "auxiliary_loss_mlp": 0.01025316, + "balance_loss_clip": 1.02495193, + "balance_loss_mlp": 1.01440859, + "epoch": 0.9973846385089433, + "flos": 12166212810240.0, + "grad_norm": 2.386667057809984, + "language_loss": 0.82473302, + "learning_rate": 7.011385585031781e-11, + "loss": 0.84562862, + "num_input_tokens_seen": 357981375, + "step": 16589, + "time_per_iteration": 2.5360968112945557 + }, + { + "auxiliary_loss_clip": 0.01053162, + "auxiliary_loss_mlp": 0.01031234, + "balance_loss_clip": 1.02489638, + "balance_loss_mlp": 1.01894331, + "epoch": 0.9974447617616113, + "flos": 20045157073920.0, + "grad_norm": 2.311863079972314, + "language_loss": 0.70164323, + "learning_rate": 6.689068412168986e-11, + "loss": 0.72248727, + "num_input_tokens_seen": 358000290, + "step": 16590, + "time_per_iteration": 2.6440656185150146 + }, + { + "auxiliary_loss_clip": 0.01038214, + "auxiliary_loss_mlp": 0.01027115, + "balance_loss_clip": 1.02400446, + "balance_loss_mlp": 1.01605797, + "epoch": 0.9975048850142793, + "flos": 32014614159360.0, + "grad_norm": 1.6300824027883278, + "language_loss": 0.6383146, + "learning_rate": 6.374335005676634e-11, + "loss": 0.65896791, + "num_input_tokens_seen": 358022075, + "step": 16591, + "time_per_iteration": 2.799506664276123 + }, + { + "auxiliary_loss_clip": 0.01041127, + "auxiliary_loss_mlp": 0.01025327, + "balance_loss_clip": 1.02391934, + "balance_loss_mlp": 1.01551628, + "epoch": 0.9975650082669473, + "flos": 36933728895360.0, + "grad_norm": 1.4761431920339503, + "language_loss": 0.72897363, + "learning_rate": 6.067185377522933e-11, + "loss": 0.74963814, + "num_input_tokens_seen": 358043940, + "step": 16592, + "time_per_iteration": 2.7491159439086914 + }, + { + "auxiliary_loss_clip": 0.01041969, + "auxiliary_loss_mlp": 0.01029874, + "balance_loss_clip": 1.02453566, + "balance_loss_mlp": 1.0189662, + "epoch": 0.9976251315196152, + "flos": 16472117537280.0, + "grad_norm": 4.017082592729483, + "language_loss": 0.84964812, + "learning_rate": 5.767619539343016e-11, + "loss": 0.87036657, + "num_input_tokens_seen": 358062720, + "step": 16593, + "time_per_iteration": 2.6333606243133545 + }, + { + "auxiliary_loss_clip": 0.01058921, + "auxiliary_loss_mlp": 0.00747505, + "balance_loss_clip": 1.02475071, + "balance_loss_mlp": 1.00038099, + "epoch": 0.9976852547722832, + "flos": 19646836179840.0, + "grad_norm": 1.7662221876760473, + "language_loss": 0.69581461, + "learning_rate": 5.4756375024833656e-11, + "loss": 0.71387893, + "num_input_tokens_seen": 358081560, + "step": 16594, + "time_per_iteration": 2.56149959564209 + }, + { + "auxiliary_loss_clip": 0.01024679, + "auxiliary_loss_mlp": 0.01023582, + "balance_loss_clip": 1.02542484, + "balance_loss_mlp": 1.0128715, + "epoch": 0.9977453780249511, + "flos": 20448434044800.0, + "grad_norm": 1.933312077815996, + "language_loss": 0.72825742, + "learning_rate": 5.1912392780462113e-11, + "loss": 0.74874008, + "num_input_tokens_seen": 358099065, + "step": 16595, + "time_per_iteration": 2.775113582611084 + }, + { + "auxiliary_loss_clip": 0.00984645, + "auxiliary_loss_mlp": 0.01002016, + "balance_loss_clip": 1.00162864, + "balance_loss_mlp": 1.00112236, + "epoch": 0.9978055012776191, + "flos": 65455097581440.0, + "grad_norm": 0.7829883146717308, + "language_loss": 0.60300767, + "learning_rate": 4.9144248768007156e-11, + "loss": 0.62287426, + "num_input_tokens_seen": 358156095, + "step": 16596, + "time_per_iteration": 4.769975423812866 + }, + { + "auxiliary_loss_clip": 0.01050238, + "auxiliary_loss_mlp": 0.01031417, + "balance_loss_clip": 1.02445853, + "balance_loss_mlp": 1.0210582, + "epoch": 0.997865624530287, + "flos": 20631506688000.0, + "grad_norm": 2.129019839503141, + "language_loss": 0.77792203, + "learning_rate": 4.645194309227385e-11, + "loss": 0.7987386, + "num_input_tokens_seen": 358175230, + "step": 16597, + "time_per_iteration": 2.6339521408081055 + }, + { + "auxiliary_loss_clip": 0.01049059, + "auxiliary_loss_mlp": 0.01028851, + "balance_loss_clip": 1.0231142, + "balance_loss_mlp": 1.01814628, + "epoch": 0.9979257477829551, + "flos": 29387102284800.0, + "grad_norm": 1.8325238782322952, + "language_loss": 0.82209539, + "learning_rate": 4.383547585562475e-11, + "loss": 0.84287453, + "num_input_tokens_seen": 358197075, + "step": 16598, + "time_per_iteration": 2.676191568374634 + }, + { + "auxiliary_loss_clip": 0.01034633, + "auxiliary_loss_mlp": 0.01034328, + "balance_loss_clip": 1.02366924, + "balance_loss_mlp": 1.0229733, + "epoch": 0.997985871035623, + "flos": 22635070387200.0, + "grad_norm": 2.024843462045602, + "language_loss": 0.64653617, + "learning_rate": 4.129484715709175e-11, + "loss": 0.66722578, + "num_input_tokens_seen": 358215925, + "step": 16599, + "time_per_iteration": 2.7329459190368652 + }, + { + "auxiliary_loss_clip": 0.00987322, + "auxiliary_loss_mlp": 0.01002795, + "balance_loss_clip": 1.00224745, + "balance_loss_mlp": 1.00194252, + "epoch": 0.998045994288291, + "flos": 61806968663040.0, + "grad_norm": 0.8545579289320789, + "language_loss": 0.62313664, + "learning_rate": 3.8830057093264256e-11, + "loss": 0.6430378, + "num_input_tokens_seen": 358269035, + "step": 16600, + "time_per_iteration": 3.1431941986083984 + }, + { + "auxiliary_loss_clip": 0.01039096, + "auxiliary_loss_mlp": 0.01025866, + "balance_loss_clip": 1.02407849, + "balance_loss_mlp": 1.01713347, + "epoch": 0.998106117540959, + "flos": 19245534456960.0, + "grad_norm": 1.7454206288494625, + "language_loss": 0.78529251, + "learning_rate": 3.644110575717896e-11, + "loss": 0.80594206, + "num_input_tokens_seen": 358287680, + "step": 16601, + "time_per_iteration": 2.6309733390808105 + }, + { + "auxiliary_loss_clip": 0.01029155, + "auxiliary_loss_mlp": 0.01029257, + "balance_loss_clip": 1.02250314, + "balance_loss_mlp": 1.01867688, + "epoch": 0.9981662407936269, + "flos": 21106209853440.0, + "grad_norm": 1.9668768476499177, + "language_loss": 0.82358122, + "learning_rate": 3.412799323987414e-11, + "loss": 0.84416533, + "num_input_tokens_seen": 358304080, + "step": 16602, + "time_per_iteration": 2.7998898029327393 + }, + { + "auxiliary_loss_clip": 0.01034635, + "auxiliary_loss_mlp": 0.01031558, + "balance_loss_clip": 1.02735615, + "balance_loss_mlp": 1.02083504, + "epoch": 0.998226364046295, + "flos": 24316839118080.0, + "grad_norm": 2.2221906750517255, + "language_loss": 0.62382382, + "learning_rate": 3.189071962883538e-11, + "loss": 0.64448571, + "num_input_tokens_seen": 358323670, + "step": 16603, + "time_per_iteration": 2.7652766704559326 + }, + { + "auxiliary_loss_clip": 0.01036044, + "auxiliary_loss_mlp": 0.01028397, + "balance_loss_clip": 1.02193999, + "balance_loss_mlp": 1.01788259, + "epoch": 0.9982864872989629, + "flos": 23836389776640.0, + "grad_norm": 1.8557110104420935, + "language_loss": 0.70759815, + "learning_rate": 2.972928500866168e-11, + "loss": 0.72824258, + "num_input_tokens_seen": 358341980, + "step": 16604, + "time_per_iteration": 2.6515512466430664 + }, + { + "auxiliary_loss_clip": 0.01061377, + "auxiliary_loss_mlp": 0.01023955, + "balance_loss_clip": 1.02483916, + "balance_loss_mlp": 1.01351237, + "epoch": 0.9983466105516309, + "flos": 18333116156160.0, + "grad_norm": 11.70278290506814, + "language_loss": 0.64776075, + "learning_rate": 2.7643689461953613e-11, + "loss": 0.66861403, + "num_input_tokens_seen": 358360400, + "step": 16605, + "time_per_iteration": 5.751803398132324 + }, + { + "auxiliary_loss_clip": 0.01028592, + "auxiliary_loss_mlp": 0.01025251, + "balance_loss_clip": 1.02392864, + "balance_loss_mlp": 1.01575041, + "epoch": 0.9984067338042988, + "flos": 17236763285760.0, + "grad_norm": 1.7469499013320184, + "language_loss": 0.71273577, + "learning_rate": 2.5633933067092938e-11, + "loss": 0.73327422, + "num_input_tokens_seen": 358378990, + "step": 16606, + "time_per_iteration": 2.6240313053131104 + }, + { + "auxiliary_loss_clip": 0.01052655, + "auxiliary_loss_mlp": 0.00747646, + "balance_loss_clip": 1.02492595, + "balance_loss_mlp": 1.00039411, + "epoch": 0.9984668570569668, + "flos": 20667884186880.0, + "grad_norm": 1.9820806694137607, + "language_loss": 0.81835496, + "learning_rate": 2.370001590090709e-11, + "loss": 0.83635795, + "num_input_tokens_seen": 358395970, + "step": 16607, + "time_per_iteration": 2.6166718006134033 + }, + { + "auxiliary_loss_clip": 0.01028833, + "auxiliary_loss_mlp": 0.01029463, + "balance_loss_clip": 1.02164698, + "balance_loss_mlp": 1.01855516, + "epoch": 0.9985269803096347, + "flos": 30262532555520.0, + "grad_norm": 1.5350159718525727, + "language_loss": 0.66655737, + "learning_rate": 2.184193803622669e-11, + "loss": 0.68714029, + "num_input_tokens_seen": 358417355, + "step": 16608, + "time_per_iteration": 2.788703680038452 + }, + { + "auxiliary_loss_clip": 0.01013949, + "auxiliary_loss_mlp": 0.01029833, + "balance_loss_clip": 1.02491951, + "balance_loss_mlp": 1.01909852, + "epoch": 0.9985871035623027, + "flos": 10560970005120.0, + "grad_norm": 9.786657170067457, + "language_loss": 0.80440986, + "learning_rate": 2.0059699543883978e-11, + "loss": 0.8248477, + "num_input_tokens_seen": 358434345, + "step": 16609, + "time_per_iteration": 2.750261068344116 + }, + { + "auxiliary_loss_clip": 0.0104056, + "auxiliary_loss_mlp": 0.0102998, + "balance_loss_clip": 1.02278244, + "balance_loss_mlp": 1.01962066, + "epoch": 0.9986472268149706, + "flos": 16873455173760.0, + "grad_norm": 1.5190157998744078, + "language_loss": 0.62956452, + "learning_rate": 1.8353300491158462e-11, + "loss": 0.65026993, + "num_input_tokens_seen": 358452870, + "step": 16610, + "time_per_iteration": 2.737729787826538 + }, + { + "auxiliary_loss_clip": 0.01041378, + "auxiliary_loss_mlp": 0.01029626, + "balance_loss_clip": 1.02331817, + "balance_loss_mlp": 1.02010727, + "epoch": 0.9987073500676387, + "flos": 22054538776320.0, + "grad_norm": 5.924552761592003, + "language_loss": 0.67223227, + "learning_rate": 1.672274094288717e-11, + "loss": 0.69294232, + "num_input_tokens_seen": 358472210, + "step": 16611, + "time_per_iteration": 2.655857563018799 + }, + { + "auxiliary_loss_clip": 0.01024163, + "auxiliary_loss_mlp": 0.01030352, + "balance_loss_clip": 1.02400231, + "balance_loss_mlp": 1.01893735, + "epoch": 0.9987674733203066, + "flos": 30482880537600.0, + "grad_norm": 1.70159051852431, + "language_loss": 0.69696069, + "learning_rate": 1.5168020961020544e-11, + "loss": 0.71750587, + "num_input_tokens_seen": 358493840, + "step": 16612, + "time_per_iteration": 2.7865395545959473 + }, + { + "auxiliary_loss_clip": 0.01038072, + "auxiliary_loss_mlp": 0.01027068, + "balance_loss_clip": 1.02366328, + "balance_loss_mlp": 1.01766241, + "epoch": 0.9988275965729746, + "flos": 27745230585600.0, + "grad_norm": 1.5671135317806564, + "language_loss": 0.74256599, + "learning_rate": 1.3689140604400407e-11, + "loss": 0.76321745, + "num_input_tokens_seen": 358515060, + "step": 16613, + "time_per_iteration": 2.7623836994171143 + }, + { + "auxiliary_loss_clip": 0.01027039, + "auxiliary_loss_mlp": 0.00747651, + "balance_loss_clip": 1.02442539, + "balance_loss_mlp": 1.00037956, + "epoch": 0.9988877198256426, + "flos": 17524191916800.0, + "grad_norm": 2.00326275468181, + "language_loss": 0.73576158, + "learning_rate": 1.2286099928981996e-11, + "loss": 0.75350845, + "num_input_tokens_seen": 358528200, + "step": 16614, + "time_per_iteration": 2.7004003524780273 + }, + { + "auxiliary_loss_clip": 0.0104602, + "auxiliary_loss_mlp": 0.01027473, + "balance_loss_clip": 1.02458572, + "balance_loss_mlp": 1.01771533, + "epoch": 0.9989478430783105, + "flos": 20996502739200.0, + "grad_norm": 1.8398649950772745, + "language_loss": 0.72734487, + "learning_rate": 1.0958898988278065e-11, + "loss": 0.74807978, + "num_input_tokens_seen": 358548360, + "step": 16615, + "time_per_iteration": 2.598911762237549 + }, + { + "auxiliary_loss_clip": 0.01065487, + "auxiliary_loss_mlp": 0.00747588, + "balance_loss_clip": 1.02681482, + "balance_loss_mlp": 1.00045276, + "epoch": 0.9990079663309785, + "flos": 13370620769280.0, + "grad_norm": 1.9949535660354, + "language_loss": 0.77338636, + "learning_rate": 9.70753783247069e-12, + "loss": 0.79151714, + "num_input_tokens_seen": 358566270, + "step": 16616, + "time_per_iteration": 2.535712480545044 + }, + { + "auxiliary_loss_clip": 0.01034272, + "auxiliary_loss_mlp": 0.0102554, + "balance_loss_clip": 1.0250932, + "balance_loss_mlp": 1.01527619, + "epoch": 0.9990680895836465, + "flos": 17310236555520.0, + "grad_norm": 2.1137429434108346, + "language_loss": 0.82757652, + "learning_rate": 8.532016508855378e-12, + "loss": 0.84817469, + "num_input_tokens_seen": 358584710, + "step": 16617, + "time_per_iteration": 2.587275981903076 + }, + { + "auxiliary_loss_clip": 0.01041661, + "auxiliary_loss_mlp": 0.01024234, + "balance_loss_clip": 1.02387416, + "balance_loss_mlp": 1.01409531, + "epoch": 0.9991282128363145, + "flos": 24207993930240.0, + "grad_norm": 1.8051320940177855, + "language_loss": 0.79083687, + "learning_rate": 7.43233506206309e-12, + "loss": 0.81149578, + "num_input_tokens_seen": 358606750, + "step": 16618, + "time_per_iteration": 2.6781511306762695 + }, + { + "auxiliary_loss_clip": 0.01059974, + "auxiliary_loss_mlp": 0.01025903, + "balance_loss_clip": 1.02438426, + "balance_loss_mlp": 1.01577652, + "epoch": 0.9991883360889824, + "flos": 21175301664000.0, + "grad_norm": 1.6852061372550453, + "language_loss": 0.74775499, + "learning_rate": 6.408493534060255e-12, + "loss": 0.76861376, + "num_input_tokens_seen": 358624675, + "step": 16619, + "time_per_iteration": 2.5408153533935547 + }, + { + "auxiliary_loss_clip": 0.01046844, + "auxiliary_loss_mlp": 0.01024125, + "balance_loss_clip": 1.02298772, + "balance_loss_mlp": 1.01490426, + "epoch": 0.9992484593416504, + "flos": 19901155449600.0, + "grad_norm": 2.1152158033829482, + "language_loss": 0.86674619, + "learning_rate": 5.460491963260594e-12, + "loss": 0.88745582, + "num_input_tokens_seen": 358640715, + "step": 16620, + "time_per_iteration": 2.6361985206604004 + }, + { + "auxiliary_loss_clip": 0.01022061, + "auxiliary_loss_mlp": 0.010245, + "balance_loss_clip": 1.02041698, + "balance_loss_mlp": 1.01478398, + "epoch": 0.9993085825943183, + "flos": 24857832833280.0, + "grad_norm": 1.8912704096119488, + "language_loss": 0.72488642, + "learning_rate": 4.58833038607942e-12, + "loss": 0.74535203, + "num_input_tokens_seen": 358659630, + "step": 16621, + "time_per_iteration": 2.7343242168426514 + }, + { + "auxiliary_loss_clip": 0.00979014, + "auxiliary_loss_mlp": 0.01001429, + "balance_loss_clip": 1.00387478, + "balance_loss_mlp": 1.00064802, + "epoch": 0.9993687058469863, + "flos": 71284478780160.0, + "grad_norm": 0.7410212140768038, + "language_loss": 0.5656535, + "learning_rate": 3.79200883515729e-12, + "loss": 0.58545792, + "num_input_tokens_seen": 358727840, + "step": 16622, + "time_per_iteration": 3.4488048553466797 + }, + { + "auxiliary_loss_clip": 0.01022634, + "auxiliary_loss_mlp": 0.01028514, + "balance_loss_clip": 1.02169371, + "balance_loss_mlp": 1.01783848, + "epoch": 0.9994288290996542, + "flos": 12199573566720.0, + "grad_norm": 2.035314665598921, + "language_loss": 0.70947224, + "learning_rate": 3.071527340914315e-12, + "loss": 0.72998369, + "num_input_tokens_seen": 358744125, + "step": 16623, + "time_per_iteration": 2.7044498920440674 + }, + { + "auxiliary_loss_clip": 0.01023964, + "auxiliary_loss_mlp": 0.01027977, + "balance_loss_clip": 1.0247848, + "balance_loss_mlp": 1.016819, + "epoch": 0.9994889523523223, + "flos": 17889942153600.0, + "grad_norm": 1.7691289786670643, + "language_loss": 0.74302697, + "learning_rate": 2.4268859304399368e-12, + "loss": 0.76354635, + "num_input_tokens_seen": 358761420, + "step": 16624, + "time_per_iteration": 4.461879014968872 + }, + { + "auxiliary_loss_clip": 0.01024562, + "auxiliary_loss_mlp": 0.01024306, + "balance_loss_clip": 1.02451003, + "balance_loss_mlp": 1.01426303, + "epoch": 0.9995490756049902, + "flos": 26578888064640.0, + "grad_norm": 1.4929120889603875, + "language_loss": 0.73605752, + "learning_rate": 1.8580846286031514e-12, + "loss": 0.75654614, + "num_input_tokens_seen": 358782600, + "step": 16625, + "time_per_iteration": 2.701786756515503 + }, + { + "auxiliary_loss_clip": 0.01051812, + "auxiliary_loss_mlp": 0.01031365, + "balance_loss_clip": 1.02593124, + "balance_loss_mlp": 1.02153051, + "epoch": 0.9996091988576582, + "flos": 22200048771840.0, + "grad_norm": 2.0544528375753055, + "language_loss": 0.76461673, + "learning_rate": 1.3651234567202408e-12, + "loss": 0.78544849, + "num_input_tokens_seen": 358801220, + "step": 16626, + "time_per_iteration": 2.7255027294158936 + }, + { + "auxiliary_loss_clip": 0.01059855, + "auxiliary_loss_mlp": 0.01029192, + "balance_loss_clip": 1.02430189, + "balance_loss_mlp": 1.01929736, + "epoch": 0.9996693221103262, + "flos": 27373195468800.0, + "grad_norm": 1.7367640997318998, + "language_loss": 0.82496655, + "learning_rate": 9.480024334429515e-13, + "loss": 0.84585702, + "num_input_tokens_seen": 358819190, + "step": 16627, + "time_per_iteration": 2.6860249042510986 + }, + { + "auxiliary_loss_clip": 0.01045605, + "auxiliary_loss_mlp": 0.01034129, + "balance_loss_clip": 1.0256238, + "balance_loss_mlp": 1.02314973, + "epoch": 0.9997294453629941, + "flos": 26870410846080.0, + "grad_norm": 2.064855295656456, + "language_loss": 0.70912015, + "learning_rate": 6.067215747584952e-13, + "loss": 0.72991753, + "num_input_tokens_seen": 358839850, + "step": 16628, + "time_per_iteration": 2.7769665718078613 + }, + { + "auxiliary_loss_clip": 0.01050724, + "auxiliary_loss_mlp": 0.01026201, + "balance_loss_clip": 1.02391243, + "balance_loss_mlp": 1.01609814, + "epoch": 0.9997895686156621, + "flos": 23476996247040.0, + "grad_norm": 1.4575316209615732, + "language_loss": 0.75174475, + "learning_rate": 3.4128089332341456e-13, + "loss": 0.77251399, + "num_input_tokens_seen": 358859805, + "step": 16629, + "time_per_iteration": 2.725741147994995 + }, + { + "auxiliary_loss_clip": 0.01042628, + "auxiliary_loss_mlp": 0.0103103, + "balance_loss_clip": 1.02432144, + "balance_loss_mlp": 1.02051604, + "epoch": 0.9998496918683301, + "flos": 20224961579520.0, + "grad_norm": 1.718028521902689, + "language_loss": 0.6033057, + "learning_rate": 1.5168039935176126e-13, + "loss": 0.62404227, + "num_input_tokens_seen": 358877900, + "step": 16630, + "time_per_iteration": 2.7564780712127686 + }, + { + "auxiliary_loss_clip": 0.01017566, + "auxiliary_loss_mlp": 0.01026203, + "balance_loss_clip": 1.02272987, + "balance_loss_mlp": 1.01605773, + "epoch": 0.9999098151209981, + "flos": 21652913831040.0, + "grad_norm": 2.00318576807805, + "language_loss": 0.60250056, + "learning_rate": 3.792010017100722e-14, + "loss": 0.62293822, + "num_input_tokens_seen": 358897285, + "step": 16631, + "time_per_iteration": 2.867750644683838 + }, + { + "auxiliary_loss_clip": 0.01004787, + "auxiliary_loss_mlp": 0.00747623, + "balance_loss_clip": 1.02351224, + "balance_loss_mlp": 1.00037193, + "epoch": 0.999969938373666, + "flos": 11544599018880.0, + "grad_norm": 1.7810768141300723, + "language_loss": 0.72415024, + "learning_rate": 0.0, + "loss": 0.74167436, + "num_input_tokens_seen": 358911570, + "step": 16632, + "time_per_iteration": 2.9194304943084717 + }, + { + "epoch": 0.999969938373666, + "num_input_tokens_seen": 358911570, + "step": 16632, + "total_flos": 1.3992169073237033e+18, + "train_loss": 0.7686992226801939, + "train_runtime": 48926.6414, + "train_samples_per_second": 13.598, + "train_steps_per_second": 0.34 + } + ], + "logging_steps": 1.0, + "max_steps": 16632, + "num_input_tokens_seen": 358911570, + "num_train_epochs": 1, + "save_steps": 3328, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": true + }, + "attributes": {} + } + }, + "total_flos": 1.3992169073237033e+18, + "train_batch_size": 5, + "trial_name": null, + "trial_params": null +}